bucket aggregation

Dobromir Popov
2025-08-04 17:28:55 +03:00
parent 504736c0f7
commit de77b0afa8
10 changed files with 2592 additions and 0 deletions

View File: __init__.py

@@ -0,0 +1,15 @@
"""
Data processing and normalization components for the COBY system.
"""
from .data_processor import StandardDataProcessor
from .quality_checker import DataQualityChecker
from .anomaly_detector import AnomalyDetector
from .metrics_calculator import MetricsCalculator

__all__ = [
    'StandardDataProcessor',
    'DataQualityChecker',
    'AnomalyDetector',
    'MetricsCalculator'
]
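
For context, a minimal usage sketch of the surface this package exposes. The top-level package name COBY is an assumption based on the module docstring; only the relative layout is visible in this diff:

# Hypothetical import path; adjust to wherever the processing package lives.
from COBY.processing import StandardDataProcessor

processor = StandardDataProcessor()  # wires up quality checker, anomaly detector, metrics calculator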

View File: anomaly_detector.py

@@ -0,0 +1,329 @@
"""
Anomaly detection for market data.
"""
import statistics
from typing import Dict, List, Union, Optional, Deque
from collections import deque
from datetime import datetime, timedelta

from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class AnomalyDetector:
    """
    Detects anomalies in market data using statistical methods.

    Detects:
    - Price spikes and drops
    - Volume anomalies
    - Spread anomalies
    - Frequency anomalies
    """

    def __init__(self, window_size: int = 100, z_score_threshold: float = 3.0):
        """
        Initialize anomaly detector.

        Args:
            window_size: Size of rolling window for statistics
            z_score_threshold: Z-score threshold for anomaly detection
        """
        self.window_size = window_size
        self.z_score_threshold = z_score_threshold

        # Rolling windows for statistics
        self.price_windows: Dict[str, Deque[float]] = {}
        self.volume_windows: Dict[str, Deque[float]] = {}
        self.spread_windows: Dict[str, Deque[float]] = {}
        self.timestamp_windows: Dict[str, Deque[datetime]] = {}

        logger.info(f"Anomaly detector initialized with window_size={window_size}, threshold={z_score_threshold}")

    def detect_orderbook_anomalies(self, orderbook: OrderBookSnapshot) -> List[str]:
        """
        Detect anomalies in order book data.

        Args:
            orderbook: Order book snapshot to analyze

        Returns:
            List[str]: List of detected anomalies
        """
        anomalies = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        try:
            # Price anomalies
            if orderbook.mid_price:
                price_anomalies = self._detect_price_anomalies(key, orderbook.mid_price)
                anomalies.extend(price_anomalies)

            # Volume anomalies
            total_volume = orderbook.bid_volume + orderbook.ask_volume
            volume_anomalies = self._detect_volume_anomalies(key, total_volume)
            anomalies.extend(volume_anomalies)

            # Spread anomalies
            if orderbook.spread and orderbook.mid_price:
                spread_pct = (orderbook.spread / orderbook.mid_price) * 100
                spread_anomalies = self._detect_spread_anomalies(key, spread_pct)
                anomalies.extend(spread_anomalies)

            # Frequency anomalies
            frequency_anomalies = self._detect_frequency_anomalies(key, orderbook.timestamp)
            anomalies.extend(frequency_anomalies)

            # Update windows
            self._update_windows(key, orderbook)

        except Exception as e:
            logger.error(f"Error detecting order book anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Anomalies detected in {orderbook.symbol}@{orderbook.exchange}: {anomalies}")

        return anomalies

    def detect_trade_anomalies(self, trade: TradeEvent) -> List[str]:
        """
        Detect anomalies in trade data.

        Args:
            trade: Trade event to analyze

        Returns:
            List[str]: List of detected anomalies
        """
        anomalies = []
        key = f"{trade.symbol}_{trade.exchange}_trade"

        try:
            # Price anomalies
            price_anomalies = self._detect_price_anomalies(key, trade.price)
            anomalies.extend(price_anomalies)

            # Volume anomalies
            volume_anomalies = self._detect_volume_anomalies(key, trade.size)
            anomalies.extend(volume_anomalies)

            # Update windows
            self._update_trade_windows(key, trade)

        except Exception as e:
            logger.error(f"Error detecting trade anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Trade anomalies detected in {trade.symbol}@{trade.exchange}: {anomalies}")

        return anomalies

    def _detect_price_anomalies(self, key: str, price: float) -> List[str]:
        """Detect price anomalies using z-score"""
        anomalies = []

        if key not in self.price_windows:
            self.price_windows[key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.price_windows[key]
        if len(window) < 10:  # Need minimum data points
            return anomalies

        try:
            mean_price = statistics.mean(window)
            std_price = statistics.stdev(window)

            if std_price > 0:
                z_score = abs(price - mean_price) / std_price
                if z_score > self.z_score_threshold:
                    direction = "spike" if price > mean_price else "drop"
                    anomalies.append(f"Price {direction}: {price:.6f} (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass  # Not enough data or all values are the same

        return anomalies

    def _detect_volume_anomalies(self, key: str, volume: float) -> List[str]:
        """Detect volume anomalies using z-score"""
        anomalies = []
        volume_key = f"{key}_volume"

        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.volume_windows[volume_key]
        if len(window) < 10:
            return anomalies

        try:
            mean_volume = statistics.mean(window)
            std_volume = statistics.stdev(window)

            if std_volume > 0:
                z_score = abs(volume - mean_volume) / std_volume
                if z_score > self.z_score_threshold:
                    direction = "spike" if volume > mean_volume else "drop"
                    anomalies.append(f"Volume {direction}: {volume:.6f} (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass

        return anomalies

    def _detect_spread_anomalies(self, key: str, spread_pct: float) -> List[str]:
        """Detect spread anomalies using z-score"""
        anomalies = []
        spread_key = f"{key}_spread"

        if spread_key not in self.spread_windows:
            self.spread_windows[spread_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.spread_windows[spread_key]
        if len(window) < 10:
            return anomalies

        try:
            mean_spread = statistics.mean(window)
            std_spread = statistics.stdev(window)

            if std_spread > 0:
                z_score = abs(spread_pct - mean_spread) / std_spread
                if z_score > self.z_score_threshold:
                    direction = "widening" if spread_pct > mean_spread else "tightening"
                    anomalies.append(f"Spread {direction}: {spread_pct:.4f}% (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass

        return anomalies

    def _detect_frequency_anomalies(self, key: str, timestamp: datetime) -> List[str]:
        """Detect frequency anomalies in data updates"""
        anomalies = []
        timestamp_key = f"{key}_timestamp"

        if timestamp_key not in self.timestamp_windows:
            self.timestamp_windows[timestamp_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.timestamp_windows[timestamp_key]
        if len(window) < 5:
            return anomalies

        try:
            # Calculate intervals between updates
            intervals = []
            for i in range(1, len(window)):
                interval = (window[i] - window[i-1]).total_seconds()
                intervals.append(interval)

            if len(intervals) >= 5:
                mean_interval = statistics.mean(intervals)
                std_interval = statistics.stdev(intervals)

                # Check current interval
                current_interval = (timestamp - window[-1]).total_seconds()

                if std_interval > 0:
                    z_score = abs(current_interval - mean_interval) / std_interval
                    if z_score > self.z_score_threshold:
                        if current_interval > mean_interval:
                            anomalies.append(f"Update delay: {current_interval:.1f}s (expected: {mean_interval:.1f}s)")
                        else:
                            anomalies.append(f"Update burst: {current_interval:.1f}s (expected: {mean_interval:.1f}s)")

        except (statistics.StatisticsError, IndexError):
            pass

        return anomalies

    def _update_windows(self, key: str, orderbook: OrderBookSnapshot) -> None:
        """Update rolling windows with new data"""
        # Update price window
        if orderbook.mid_price:
            if key not in self.price_windows:
                self.price_windows[key] = deque(maxlen=self.window_size)
            self.price_windows[key].append(orderbook.mid_price)

        # Update volume window
        total_volume = orderbook.bid_volume + orderbook.ask_volume
        volume_key = f"{key}_volume"
        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
        self.volume_windows[volume_key].append(total_volume)

        # Update spread window
        if orderbook.spread and orderbook.mid_price:
            spread_pct = (orderbook.spread / orderbook.mid_price) * 100
            spread_key = f"{key}_spread"
            if spread_key not in self.spread_windows:
                self.spread_windows[spread_key] = deque(maxlen=self.window_size)
            self.spread_windows[spread_key].append(spread_pct)

        # Update timestamp window
        timestamp_key = f"{key}_timestamp"
        if timestamp_key not in self.timestamp_windows:
            self.timestamp_windows[timestamp_key] = deque(maxlen=self.window_size)
        self.timestamp_windows[timestamp_key].append(orderbook.timestamp)

    def _update_trade_windows(self, key: str, trade: TradeEvent) -> None:
        """Update rolling windows with trade data"""
        # Update price window
        if key not in self.price_windows:
            self.price_windows[key] = deque(maxlen=self.window_size)
        self.price_windows[key].append(trade.price)

        # Update volume window
        volume_key = f"{key}_volume"
        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
        self.volume_windows[volume_key].append(trade.size)

    def get_statistics(self) -> Dict[str, Dict[str, float]]:
        """Get current statistics for all tracked symbols"""
        stats = {}

        for key, window in self.price_windows.items():
            if len(window) >= 2:
                try:
                    stats[key] = {
                        'price_mean': statistics.mean(window),
                        'price_std': statistics.stdev(window),
                        'price_min': min(window),
                        'price_max': max(window),
                        'data_points': len(window)
                    }
                except statistics.StatisticsError:
                    stats[key] = {'error': 'insufficient_data'}

        return stats

    def reset_windows(self, key: Optional[str] = None) -> None:
        """Reset rolling windows for a specific key or all keys"""
        if key:
            # Reset specific key
            self.price_windows.pop(key, None)
            self.volume_windows.pop(f"{key}_volume", None)
            self.spread_windows.pop(f"{key}_spread", None)
            self.timestamp_windows.pop(f"{key}_timestamp", None)
        else:
            # Reset all windows
            self.price_windows.clear()
            self.volume_windows.clear()
            self.spread_windows.clear()
            self.timestamp_windows.clear()

        logger.info(f"Reset anomaly detection windows for {key or 'all keys'}")
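
The detection methods above all share one pattern: keep a rolling deque per key, warm up on at least 10 points, then flag any observation whose z-score exceeds the threshold. A self-contained sketch of that core logic, standard library only, with the 100-point window, 10-point warm-up, and 3.0 threshold mirroring the defaults above:

import statistics
from collections import deque
from typing import Optional

window = deque(maxlen=100)
THRESHOLD = 3.0

def check_price(price: float) -> Optional[str]:
    """Return an anomaly description, or None if the price looks normal."""
    msg = None
    if len(window) >= 10:  # need minimum data points, as above
        mean = statistics.mean(window)
        std = statistics.stdev(window)
        if std > 0 and abs(price - mean) / std > THRESHOLD:
            direction = "spike" if price > mean else "drop"
            msg = f"Price {direction}: {price:.6f} (z-score: {abs(price - mean) / std:.2f})"
    window.append(price)  # the detector also updates its windows after checking
    return msg

for p in [100.0] * 20 + [100.5, 99.8, 150.0]:
    result = check_price(p)
    if result:
        print(result)  # only 150.0 prints: an obvious spike against a ~100.0 series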

View File: data_processor.py

@@ -0,0 +1,378 @@
"""
Main data processor implementation.
"""
from typing import Dict, Union, List, Optional, Any

from ..interfaces.data_processor import DataProcessor
from ..models.core import OrderBookSnapshot, TradeEvent, OrderBookMetrics
from ..utils.logging import get_logger, set_correlation_id
from ..utils.exceptions import ValidationError, ProcessingError
from ..utils.timing import get_current_timestamp
from .quality_checker import DataQualityChecker
from .anomaly_detector import AnomalyDetector
from .metrics_calculator import MetricsCalculator

logger = get_logger(__name__)


class StandardDataProcessor(DataProcessor):
    """
    Standard implementation of data processor interface.

    Provides:
    - Data normalization and validation
    - Quality checking
    - Anomaly detection
    - Metrics calculation
    - Data enrichment
    """

    def __init__(self):
        """Initialize data processor with components"""
        self.quality_checker = DataQualityChecker()
        self.anomaly_detector = AnomalyDetector()
        self.metrics_calculator = MetricsCalculator()

        # Processing statistics
        self.processed_orderbooks = 0
        self.processed_trades = 0
        self.quality_failures = 0
        self.anomalies_detected = 0

        logger.info("Standard data processor initialized")

    def normalize_orderbook(self, raw_data: Dict, exchange: str) -> OrderBookSnapshot:
        """
        Normalize raw order book data to standard format.

        Args:
            raw_data: Raw order book data from exchange
            exchange: Exchange name

        Returns:
            OrderBookSnapshot: Normalized order book data
        """
        try:
            set_correlation_id()

            # This is a generic implementation - specific exchanges would override
            # For now, assume data is already in correct format
            if isinstance(raw_data, OrderBookSnapshot):
                return raw_data

            # If raw_data is a dict, try to construct OrderBookSnapshot
            # This would be customized per exchange
            raise NotImplementedError(
                "normalize_orderbook should be implemented by exchange-specific processors"
            )

        except Exception as e:
            logger.error(f"Error normalizing order book data: {e}")
            raise ProcessingError(f"Normalization failed: {e}", "NORMALIZE_ERROR")

    def normalize_trade(self, raw_data: Dict, exchange: str) -> TradeEvent:
        """
        Normalize raw trade data to standard format.

        Args:
            raw_data: Raw trade data from exchange
            exchange: Exchange name

        Returns:
            TradeEvent: Normalized trade data
        """
        try:
            set_correlation_id()

            # This is a generic implementation - specific exchanges would override
            if isinstance(raw_data, TradeEvent):
                return raw_data

            # If raw_data is a dict, try to construct TradeEvent
            # This would be customized per exchange
            raise NotImplementedError(
                "normalize_trade should be implemented by exchange-specific processors"
            )

        except Exception as e:
            logger.error(f"Error normalizing trade data: {e}")
            raise ProcessingError(f"Normalization failed: {e}", "NORMALIZE_ERROR")

    def validate_data(self, data: Union[OrderBookSnapshot, TradeEvent]) -> bool:
        """
        Validate normalized data for quality and consistency.

        Args:
            data: Normalized data to validate

        Returns:
            bool: True if data is valid, False otherwise
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                quality_score, issues = self.quality_checker.check_orderbook_quality(data)
                self.processed_orderbooks += 1

                if quality_score < 0.5:  # Threshold for acceptable quality
                    self.quality_failures += 1
                    logger.warning(f"Low quality order book data: score={quality_score:.2f}, issues={issues}")
                    return False

                return True

            elif isinstance(data, TradeEvent):
                quality_score, issues = self.quality_checker.check_trade_quality(data)
                self.processed_trades += 1

                if quality_score < 0.5:
                    self.quality_failures += 1
                    logger.warning(f"Low quality trade data: score={quality_score:.2f}, issues={issues}")
                    return False

                return True

            else:
                logger.error(f"Unknown data type for validation: {type(data)}")
                return False

        except Exception as e:
            logger.error(f"Error validating data: {e}")
            return False

    def calculate_metrics(self, orderbook: OrderBookSnapshot) -> OrderBookMetrics:
        """
        Calculate metrics from order book data.

        Args:
            orderbook: Order book snapshot

        Returns:
            OrderBookMetrics: Calculated metrics
        """
        try:
            set_correlation_id()
            return self.metrics_calculator.calculate_orderbook_metrics(orderbook)
        except Exception as e:
            logger.error(f"Error calculating metrics: {e}")
            raise ProcessingError(f"Metrics calculation failed: {e}", "METRICS_ERROR")

    def detect_anomalies(self, data: Union[OrderBookSnapshot, TradeEvent]) -> List[str]:
        """
        Detect anomalies in the data.

        Args:
            data: Data to analyze for anomalies

        Returns:
            List[str]: List of detected anomaly descriptions
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                anomalies = self.anomaly_detector.detect_orderbook_anomalies(data)
            elif isinstance(data, TradeEvent):
                anomalies = self.anomaly_detector.detect_trade_anomalies(data)
            else:
                logger.error(f"Unknown data type for anomaly detection: {type(data)}")
                return ["Unknown data type"]

            if anomalies:
                self.anomalies_detected += len(anomalies)

            return anomalies

        except Exception as e:
            logger.error(f"Error detecting anomalies: {e}")
            return [f"Anomaly detection error: {e}"]

    def filter_data(self, data: Union[OrderBookSnapshot, TradeEvent], criteria: Dict) -> bool:
        """
        Filter data based on criteria.

        Args:
            data: Data to filter
            criteria: Filtering criteria

        Returns:
            bool: True if data passes filter, False otherwise
        """
        try:
            set_correlation_id()

            # Symbol filter
            if 'symbols' in criteria:
                allowed_symbols = criteria['symbols']
                if data.symbol not in allowed_symbols:
                    return False

            # Exchange filter
            if 'exchanges' in criteria:
                allowed_exchanges = criteria['exchanges']
                if data.exchange not in allowed_exchanges:
                    return False

            # Quality filter
            if 'min_quality' in criteria:
                min_quality = criteria['min_quality']
                if isinstance(data, OrderBookSnapshot):
                    quality_score, _ = self.quality_checker.check_orderbook_quality(data)
                elif isinstance(data, TradeEvent):
                    quality_score, _ = self.quality_checker.check_trade_quality(data)
                else:
                    quality_score = 0.0

                if quality_score < min_quality:
                    return False

            # Price range filter
            if 'price_range' in criteria:
                price_range = criteria['price_range']
                min_price, max_price = price_range

                if isinstance(data, OrderBookSnapshot):
                    price = data.mid_price
                elif isinstance(data, TradeEvent):
                    price = data.price
                else:
                    return False

                if price and (price < min_price or price > max_price):
                    return False

            # Volume filter for trades
            if 'min_volume' in criteria and isinstance(data, TradeEvent):
                min_volume = criteria['min_volume']
                if data.size < min_volume:
                    return False

            return True

        except Exception as e:
            logger.error(f"Error filtering data: {e}")
            return False

    def enrich_data(self, data: Union[OrderBookSnapshot, TradeEvent]) -> Dict:
        """
        Enrich data with additional metadata.

        Args:
            data: Data to enrich

        Returns:
            Dict: Enriched data with metadata
        """
        try:
            set_correlation_id()

            enriched = {
                'original_data': data,
                'processing_timestamp': get_current_timestamp(),
                'processor_version': '1.0.0'
            }

            # Add quality metrics
            if isinstance(data, OrderBookSnapshot):
                quality_score, quality_issues = self.quality_checker.check_orderbook_quality(data)
                enriched['quality_score'] = quality_score
                enriched['quality_issues'] = quality_issues

                # Add calculated metrics
                try:
                    metrics = self.calculate_metrics(data)
                    enriched['metrics'] = {
                        'mid_price': metrics.mid_price,
                        'spread': metrics.spread,
                        'spread_percentage': metrics.spread_percentage,
                        'volume_imbalance': metrics.volume_imbalance,
                        'depth_10': metrics.depth_10,
                        'depth_50': metrics.depth_50
                    }
                except Exception as e:
                    enriched['metrics_error'] = str(e)

                # Add liquidity score
                try:
                    liquidity_score = self.metrics_calculator.calculate_liquidity_score(data)
                    enriched['liquidity_score'] = liquidity_score
                except Exception as e:
                    enriched['liquidity_error'] = str(e)

            elif isinstance(data, TradeEvent):
                quality_score, quality_issues = self.quality_checker.check_trade_quality(data)
                enriched['quality_score'] = quality_score
                enriched['quality_issues'] = quality_issues

                # Add trade-specific enrichments
                enriched['trade_value'] = data.price * data.size
                enriched['side_numeric'] = 1 if data.side == 'buy' else -1

            # Add anomaly detection results
            anomalies = self.detect_anomalies(data)
            enriched['anomalies'] = anomalies
            enriched['anomaly_count'] = len(anomalies)

            return enriched

        except Exception as e:
            logger.error(f"Error enriching data: {e}")
            return {
                'original_data': data,
                'enrichment_error': str(e)
            }

    def get_data_quality_score(self, data: Union[OrderBookSnapshot, TradeEvent]) -> float:
        """
        Calculate data quality score.

        Args:
            data: Data to score

        Returns:
            float: Quality score between 0.0 and 1.0
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                quality_score, _ = self.quality_checker.check_orderbook_quality(data)
            elif isinstance(data, TradeEvent):
                quality_score, _ = self.quality_checker.check_trade_quality(data)
            else:
                logger.error(f"Unknown data type for quality scoring: {type(data)}")
                return 0.0

            return quality_score

        except Exception as e:
            logger.error(f"Error calculating quality score: {e}")
            return 0.0

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        return {
            'processed_orderbooks': self.processed_orderbooks,
            'processed_trades': self.processed_trades,
            'quality_failures': self.quality_failures,
            'anomalies_detected': self.anomalies_detected,
            'quality_failure_rate': (
                self.quality_failures / max(1, self.processed_orderbooks + self.processed_trades)
            ),
            'anomaly_rate': (
                self.anomalies_detected / max(1, self.processed_orderbooks + self.processed_trades)
            ),
            'quality_checker_summary': self.quality_checker.get_quality_summary(),
            'anomaly_detector_stats': self.anomaly_detector.get_statistics()
        }

    def reset_stats(self) -> None:
        """Reset processing statistics"""
        self.processed_orderbooks = 0
        self.processed_trades = 0
        self.quality_failures = 0
        self.anomalies_detected = 0
        logger.info("Processing statistics reset")
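
Since normalize_orderbook and normalize_trade intentionally raise until an exchange-specific subclass supplies the field mapping, a hedged sketch of what such a subclass might look like; the OrderBookSnapshot and PriceLevel constructor signatures below are assumptions, since ..models.core is not part of this diff:

class BinanceDataProcessor(StandardDataProcessor):
    """Hypothetical exchange-specific processor."""

    def normalize_orderbook(self, raw_data: Dict, exchange: str) -> OrderBookSnapshot:
        # Illustrative field names ('b'/'a' follow Binance depth payloads);
        # adapt to the real model definitions in ..models.core.
        return OrderBookSnapshot(
            symbol=raw_data['s'],
            exchange=exchange,
            timestamp=get_current_timestamp(),
            bids=[PriceLevel(price=float(p), size=float(q)) for p, q in raw_data['b']],
            asks=[PriceLevel(price=float(p), size=float(q)) for p, q in raw_data['a']],
        )

Downstream, filter criteria are plain dict keys, so a pipeline step might read:

criteria = {
    'symbols': ['BTCUSDT'],
    'exchanges': ['binance'],
    'min_quality': 0.5,
    'price_range': (10_000.0, 200_000.0),
}
if processor.validate_data(orderbook) and processor.filter_data(orderbook, criteria):
    enriched = processor.enrich_data(orderbook)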

View File: metrics_calculator.py

@@ -0,0 +1,275 @@
"""
Metrics calculation for order book analysis.
"""
from typing import Dict, List, Optional

from ..models.core import OrderBookSnapshot, OrderBookMetrics, ImbalanceMetrics
from ..utils.logging import get_logger

logger = get_logger(__name__)


class MetricsCalculator:
    """
    Calculates various metrics from order book data.

    Metrics include:
    - Basic metrics (mid price, spread, volumes)
    - Imbalance metrics
    - Depth metrics
    - Liquidity metrics
    """

    def __init__(self):
        """Initialize metrics calculator"""
        logger.info("Metrics calculator initialized")

    def calculate_orderbook_metrics(self, orderbook: OrderBookSnapshot) -> OrderBookMetrics:
        """
        Calculate comprehensive order book metrics.

        Args:
            orderbook: Order book snapshot

        Returns:
            OrderBookMetrics: Calculated metrics
        """
        try:
            # Basic calculations
            mid_price = self._calculate_mid_price(orderbook)
            spread = self._calculate_spread(orderbook)
            spread_percentage = (spread / mid_price * 100) if mid_price > 0 else 0.0

            # Volume calculations
            bid_volume = sum(level.size for level in orderbook.bids)
            ask_volume = sum(level.size for level in orderbook.asks)

            # Imbalance calculation
            total_volume = bid_volume + ask_volume
            volume_imbalance = ((bid_volume - ask_volume) / total_volume) if total_volume > 0 else 0.0

            # Depth calculations
            depth_10 = self._calculate_depth(orderbook, 10)
            depth_50 = self._calculate_depth(orderbook, 50)

            return OrderBookMetrics(
                symbol=orderbook.symbol,
                exchange=orderbook.exchange,
                timestamp=orderbook.timestamp,
                mid_price=mid_price,
                spread=spread,
                spread_percentage=spread_percentage,
                bid_volume=bid_volume,
                ask_volume=ask_volume,
                volume_imbalance=volume_imbalance,
                depth_10=depth_10,
                depth_50=depth_50
            )

        except Exception as e:
            logger.error(f"Error calculating order book metrics: {e}")
            raise

    def calculate_imbalance_metrics(self, orderbook: OrderBookSnapshot) -> ImbalanceMetrics:
        """
        Calculate order book imbalance metrics.

        Args:
            orderbook: Order book snapshot

        Returns:
            ImbalanceMetrics: Calculated imbalance metrics
        """
        try:
            # Volume imbalance
            bid_volume = sum(level.size for level in orderbook.bids)
            ask_volume = sum(level.size for level in orderbook.asks)
            total_volume = bid_volume + ask_volume
            volume_imbalance = ((bid_volume - ask_volume) / total_volume) if total_volume > 0 else 0.0

            # Price imbalance (weighted by volume)
            price_imbalance = self._calculate_price_imbalance(orderbook)

            # Depth imbalance
            depth_imbalance = self._calculate_depth_imbalance(orderbook)

            # Momentum score (simplified - would need historical data for full implementation)
            momentum_score = volume_imbalance * 0.5 + price_imbalance * 0.3 + depth_imbalance * 0.2

            return ImbalanceMetrics(
                symbol=orderbook.symbol,
                timestamp=orderbook.timestamp,
                volume_imbalance=volume_imbalance,
                price_imbalance=price_imbalance,
                depth_imbalance=depth_imbalance,
                momentum_score=momentum_score
            )

        except Exception as e:
            logger.error(f"Error calculating imbalance metrics: {e}")
            raise

    def _calculate_mid_price(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate mid price"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        best_bid = orderbook.bids[0].price
        best_ask = orderbook.asks[0].price
        return (best_bid + best_ask) / 2.0

    def _calculate_spread(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate bid-ask spread"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        best_bid = orderbook.bids[0].price
        best_ask = orderbook.asks[0].price
        return best_ask - best_bid

    def _calculate_depth(self, orderbook: OrderBookSnapshot, levels: int) -> float:
        """Calculate market depth for specified number of levels"""
        bid_depth = sum(
            level.size for level in orderbook.bids[:levels]
        )
        ask_depth = sum(
            level.size for level in orderbook.asks[:levels]
        )
        return bid_depth + ask_depth

    def _calculate_price_imbalance(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate price-weighted imbalance"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        # Calculate volume-weighted average prices for top levels
        bid_vwap = self._calculate_vwap(orderbook.bids[:5])
        ask_vwap = self._calculate_vwap(orderbook.asks[:5])

        if bid_vwap == 0 or ask_vwap == 0:
            return 0.0

        mid_price = (bid_vwap + ask_vwap) / 2.0

        # Normalize imbalance
        price_imbalance = (bid_vwap - ask_vwap) / mid_price if mid_price > 0 else 0.0
        return max(-1.0, min(1.0, price_imbalance))

    def _calculate_depth_imbalance(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate depth imbalance across multiple levels"""
        levels_to_check = [5, 10, 20]
        imbalances = []

        for levels in levels_to_check:
            bid_depth = sum(level.size for level in orderbook.bids[:levels])
            ask_depth = sum(level.size for level in orderbook.asks[:levels])
            total_depth = bid_depth + ask_depth

            if total_depth > 0:
                imbalance = (bid_depth - ask_depth) / total_depth
                imbalances.append(imbalance)

        # Return weighted average of imbalances
        if imbalances:
            return sum(imbalances) / len(imbalances)

        return 0.0

    def _calculate_vwap(self, levels: List) -> float:
        """Calculate volume-weighted average price for price levels"""
        if not levels:
            return 0.0

        total_volume = sum(level.size for level in levels)
        if total_volume == 0:
            return 0.0

        weighted_sum = sum(level.price * level.size for level in levels)
        return weighted_sum / total_volume

    def calculate_liquidity_score(self, orderbook: OrderBookSnapshot) -> float:
        """
        Calculate liquidity score based on depth and spread.

        Args:
            orderbook: Order book snapshot

        Returns:
            float: Liquidity score (0.0 to 1.0)
        """
        try:
            if not orderbook.bids or not orderbook.asks:
                return 0.0

            # Spread component (lower spread = higher liquidity)
            spread = self._calculate_spread(orderbook)
            mid_price = self._calculate_mid_price(orderbook)

            if mid_price == 0:
                return 0.0

            spread_pct = (spread / mid_price) * 100
            spread_score = max(0.0, 1.0 - (spread_pct / 5.0))  # Normalize to 5% max spread

            # Depth component (higher depth = higher liquidity)
            total_depth = self._calculate_depth(orderbook, 10)
            depth_score = min(1.0, total_depth / 100.0)  # Normalize to 100 units max depth

            # Volume balance component (more balanced = higher liquidity)
            bid_volume = sum(level.size for level in orderbook.bids[:10])
            ask_volume = sum(level.size for level in orderbook.asks[:10])
            total_volume = bid_volume + ask_volume

            if total_volume > 0:
                imbalance = abs(bid_volume - ask_volume) / total_volume
                balance_score = 1.0 - imbalance
            else:
                balance_score = 0.0

            # Weighted combination
            liquidity_score = (spread_score * 0.4 + depth_score * 0.4 + balance_score * 0.2)
            return max(0.0, min(1.0, liquidity_score))

        except Exception as e:
            logger.error(f"Error calculating liquidity score: {e}")
            return 0.0

    def get_market_summary(self, orderbook: OrderBookSnapshot) -> Dict[str, float]:
        """
        Get comprehensive market summary.

        Args:
            orderbook: Order book snapshot

        Returns:
            Dict[str, float]: Market summary metrics
        """
        try:
            metrics = self.calculate_orderbook_metrics(orderbook)
            imbalance = self.calculate_imbalance_metrics(orderbook)
            liquidity = self.calculate_liquidity_score(orderbook)

            return {
                'mid_price': metrics.mid_price,
                'spread': metrics.spread,
                'spread_percentage': metrics.spread_percentage,
                'bid_volume': metrics.bid_volume,
                'ask_volume': metrics.ask_volume,
                'volume_imbalance': metrics.volume_imbalance,
                'depth_10': metrics.depth_10,
                'depth_50': metrics.depth_50,
                'price_imbalance': imbalance.price_imbalance,
                'depth_imbalance': imbalance.depth_imbalance,
                'momentum_score': imbalance.momentum_score,
                'liquidity_score': liquidity
            }

        except Exception as e:
            logger.error(f"Error generating market summary: {e}")
            return {}
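
To make the liquidity weighting concrete, a worked example under the normalizations stated in calculate_liquidity_score (spread capped at 5%, depth at 100 units); the book values are hypothetical:

# Book: mid price 100.0, spread 0.10, top-10 depth of 60 bid + 40 ask units
spread_score  = max(0.0, 1.0 - (0.10 / 100.0 * 100) / 5.0)  # 1 - 0.1/5 = 0.98
depth_score   = min(1.0, (60 + 40) / 100.0)                 # 1.00
balance_score = 1.0 - abs(60 - 40) / (60 + 40)              # 0.80
liquidity     = 0.4 * spread_score + 0.4 * depth_score + 0.2 * balance_score  # 0.952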

View File: quality_checker.py

@@ -0,0 +1,288 @@
"""
Data quality checking and validation for market data.
"""
from typing import Dict, List, Union, Optional, Tuple
from datetime import datetime, timezone

from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.validation import validate_price, validate_volume, validate_symbol
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class DataQualityChecker:
    """
    Comprehensive data quality checker for market data.

    Validates:
    - Data structure integrity
    - Price and volume ranges
    - Timestamp consistency
    - Cross-validation between related data points
    """

    def __init__(self):
        """Initialize quality checker with default thresholds"""
        # Quality thresholds
        self.max_spread_percentage = 10.0  # Maximum spread as % of mid price
        self.max_price_change_percentage = 50.0  # Maximum price change between updates
        self.min_volume_threshold = 0.000001  # Minimum meaningful volume
        self.max_timestamp_drift = 300  # Maximum seconds drift from current time

        # Price history for validation ("symbol_exchange" -> last mid price)
        self.price_history: Dict[str, float] = {}

        logger.info("Data quality checker initialized")

    def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]:
        """
        Check order book data quality.

        Args:
            orderbook: Order book snapshot to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            structure_issues = self._check_orderbook_structure(orderbook)
            issues.extend(structure_issues)
            quality_score -= len(structure_issues) * 0.1

            # Price validation
            price_issues = self._check_orderbook_prices(orderbook)
            issues.extend(price_issues)
            quality_score -= len(price_issues) * 0.15

            # Volume validation
            volume_issues = self._check_orderbook_volumes(orderbook)
            issues.extend(volume_issues)
            quality_score -= len(volume_issues) * 0.1

            # Spread validation
            spread_issues = self._check_orderbook_spread(orderbook)
            issues.extend(spread_issues)
            quality_score -= len(spread_issues) * 0.2

            # Timestamp validation
            timestamp_issues = self._check_timestamp(orderbook.timestamp)
            issues.extend(timestamp_issues)
            quality_score -= len(timestamp_issues) * 0.1

            # Cross-validation with history
            history_issues = self._check_price_history(orderbook)
            issues.extend(history_issues)
            quality_score -= len(history_issues) * 0.15

            # Update price history
            self._update_price_history(orderbook)

        except Exception as e:
            logger.error(f"Error checking order book quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Order book quality issues for {orderbook.symbol}@{orderbook.exchange}: {issues}")

        return quality_score, issues

    def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]:
        """
        Check trade data quality.

        Args:
            trade: Trade event to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            if not validate_symbol(trade.symbol):
                issues.append("Invalid symbol format")

            if not trade.exchange:
                issues.append("Missing exchange")

            if not trade.trade_id:
                issues.append("Missing trade ID")

            # Price validation
            if not validate_price(trade.price):
                issues.append(f"Invalid price: {trade.price}")

            # Volume validation
            if not validate_volume(trade.size):
                issues.append(f"Invalid size: {trade.size}")

            if trade.size < self.min_volume_threshold:
                issues.append(f"Size below threshold: {trade.size}")

            # Side validation
            if trade.side not in ['buy', 'sell']:
                issues.append(f"Invalid side: {trade.side}")

            # Timestamp validation
            timestamp_issues = self._check_timestamp(trade.timestamp)
            issues.extend(timestamp_issues)

            # Calculate quality score
            quality_score -= len(issues) * 0.2

        except Exception as e:
            logger.error(f"Error checking trade quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Trade quality issues for {trade.symbol}@{trade.exchange}: {issues}")

        return quality_score, issues

    def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check basic order book structure"""
        issues = []

        if not validate_symbol(orderbook.symbol):
            issues.append("Invalid symbol format")

        if not orderbook.exchange:
            issues.append("Missing exchange")

        if not orderbook.bids:
            issues.append("No bid levels")

        if not orderbook.asks:
            issues.append("No ask levels")

        return issues

    def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book price validity"""
        issues = []

        # Check bid prices (should be descending)
        for i, bid in enumerate(orderbook.bids):
            if not validate_price(bid.price):
                issues.append(f"Invalid bid price at level {i}: {bid.price}")
            if i > 0 and bid.price >= orderbook.bids[i-1].price:
                issues.append(f"Bid prices not descending at level {i}")

        # Check ask prices (should be ascending)
        for i, ask in enumerate(orderbook.asks):
            if not validate_price(ask.price):
                issues.append(f"Invalid ask price at level {i}: {ask.price}")
            if i > 0 and ask.price <= orderbook.asks[i-1].price:
                issues.append(f"Ask prices not ascending at level {i}")

        # Check bid-ask ordering
        if orderbook.bids and orderbook.asks:
            if orderbook.bids[0].price >= orderbook.asks[0].price:
                issues.append("Best bid >= best ask (crossed book)")

        return issues

    def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book volume validity"""
        issues = []

        # Check bid volumes
        for i, bid in enumerate(orderbook.bids):
            if not validate_volume(bid.size):
                issues.append(f"Invalid bid volume at level {i}: {bid.size}")
            if bid.size < self.min_volume_threshold:
                issues.append(f"Bid volume below threshold at level {i}: {bid.size}")

        # Check ask volumes
        for i, ask in enumerate(orderbook.asks):
            if not validate_volume(ask.size):
                issues.append(f"Invalid ask volume at level {i}: {ask.size}")
            if ask.size < self.min_volume_threshold:
                issues.append(f"Ask volume below threshold at level {i}: {ask.size}")

        return issues

    def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book spread validity"""
        issues = []

        if orderbook.mid_price and orderbook.spread:
            spread_percentage = (orderbook.spread / orderbook.mid_price) * 100

            if spread_percentage > self.max_spread_percentage:
                issues.append(f"Spread too wide: {spread_percentage:.2f}%")

            if spread_percentage < 0:
                issues.append(f"Negative spread: {spread_percentage:.2f}%")

        return issues

    def _check_timestamp(self, timestamp: datetime) -> List[str]:
        """Check timestamp validity"""
        issues = []

        if not timestamp:
            issues.append("Missing timestamp")
            return issues

        # A naive timestamp cannot be compared against the timezone-aware
        # current time, so report it and skip the drift check
        if timestamp.tzinfo is None:
            issues.append("Timestamp missing timezone info")
            return issues

        # Check timestamp drift
        current_time = get_current_timestamp()
        time_diff = abs((timestamp - current_time).total_seconds())

        if time_diff > self.max_timestamp_drift:
            issues.append(f"Timestamp drift too large: {time_diff:.1f}s")

        return issues

    def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check price consistency with history"""
        issues = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        if key in self.price_history and orderbook.mid_price:
            last_price = self.price_history[key]
            price_change = abs(orderbook.mid_price - last_price) / last_price * 100

            if price_change > self.max_price_change_percentage:
                issues.append(f"Large price change: {price_change:.2f}%")

        return issues

    def _update_price_history(self, orderbook: OrderBookSnapshot) -> None:
        """Update price history for future validation"""
        if orderbook.mid_price:
            key = f"{orderbook.symbol}_{orderbook.exchange}"
            self.price_history[key] = orderbook.mid_price

    def get_quality_summary(self) -> Dict[str, Union[int, float]]:
        """Get summary of quality checks performed"""
        return {
            'symbols_tracked': len(self.price_history),
            'max_spread_percentage': self.max_spread_percentage,
            'max_price_change_percentage': self.max_price_change_percentage,
            'min_volume_threshold': self.min_volume_threshold,
            'max_timestamp_drift': self.max_timestamp_drift
        }
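
The scoring here is deduction-based: every detected issue subtracts a fixed penalty from a starting score of 1.0, and the result is clamped to [0.0, 1.0]. For the trade path, where each issue costs 0.2:

issues = ["Missing trade ID", "Invalid side: hold"]          # two hypothetical issues
quality_score = max(0.0, min(1.0, 1.0 - len(issues) * 0.2))  # 0.6
# StandardDataProcessor.validate_data rejects scores below 0.5,
# so one more issue (score 0.4) would count this trade as a quality failure.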