""" Data quality checking and validation for market data. """ from typing import Dict, List, Union, Optional, Tuple from datetime import datetime, timezone from ..models.core import OrderBookSnapshot, TradeEvent from ..utils.logging import get_logger from ..utils.validation import validate_price, validate_volume, validate_symbol from ..utils.timing import get_current_timestamp logger = get_logger(__name__) class DataQualityChecker: """ Comprehensive data quality checker for market data. Validates: - Data structure integrity - Price and volume ranges - Timestamp consistency - Cross-validation between related data points """ def __init__(self): """Initialize quality checker with default thresholds""" # Quality thresholds self.max_spread_percentage = 10.0 # Maximum spread as % of mid price self.max_price_change_percentage = 50.0 # Maximum price change between updates self.min_volume_threshold = 0.000001 # Minimum meaningful volume self.max_timestamp_drift = 300 # Maximum seconds drift from current time # Price history for validation self.price_history: Dict[str, Dict[str, float]] = {} # symbol -> exchange -> last_price logger.info("Data quality checker initialized") def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]: """ Check order book data quality. Args: orderbook: Order book snapshot to validate Returns: Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues """ issues = [] quality_score = 1.0 try: # Basic structure validation structure_issues = self._check_orderbook_structure(orderbook) issues.extend(structure_issues) quality_score -= len(structure_issues) * 0.1 # Price validation price_issues = self._check_orderbook_prices(orderbook) issues.extend(price_issues) quality_score -= len(price_issues) * 0.15 # Volume validation volume_issues = self._check_orderbook_volumes(orderbook) issues.extend(volume_issues) quality_score -= len(volume_issues) * 0.1 # Spread validation spread_issues = self._check_orderbook_spread(orderbook) issues.extend(spread_issues) quality_score -= len(spread_issues) * 0.2 # Timestamp validation timestamp_issues = self._check_timestamp(orderbook.timestamp) issues.extend(timestamp_issues) quality_score -= len(timestamp_issues) * 0.1 # Cross-validation with history history_issues = self._check_price_history(orderbook) issues.extend(history_issues) quality_score -= len(history_issues) * 0.15 # Update price history self._update_price_history(orderbook) except Exception as e: logger.error(f"Error checking order book quality: {e}") issues.append(f"Quality check error: {e}") quality_score = 0.0 # Ensure score is within bounds quality_score = max(0.0, min(1.0, quality_score)) if issues: logger.debug(f"Order book quality issues for {orderbook.symbol}@{orderbook.exchange}: {issues}") return quality_score, issues def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]: """ Check trade data quality. Args: trade: Trade event to validate Returns: Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues """ issues = [] quality_score = 1.0 try: # Basic structure validation if not validate_symbol(trade.symbol): issues.append("Invalid symbol format") if not trade.exchange: issues.append("Missing exchange") if not trade.trade_id: issues.append("Missing trade ID") # Price validation if not validate_price(trade.price): issues.append(f"Invalid price: {trade.price}") # Volume validation if not validate_volume(trade.size): issues.append(f"Invalid size: {trade.size}") if trade.size < self.min_volume_threshold: issues.append(f"Size below threshold: {trade.size}") # Side validation if trade.side not in ['buy', 'sell']: issues.append(f"Invalid side: {trade.side}") # Timestamp validation timestamp_issues = self._check_timestamp(trade.timestamp) issues.extend(timestamp_issues) # Calculate quality score quality_score -= len(issues) * 0.2 except Exception as e: logger.error(f"Error checking trade quality: {e}") issues.append(f"Quality check error: {e}") quality_score = 0.0 # Ensure score is within bounds quality_score = max(0.0, min(1.0, quality_score)) if issues: logger.debug(f"Trade quality issues for {trade.symbol}@{trade.exchange}: {issues}") return quality_score, issues def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]: """Check basic order book structure""" issues = [] if not validate_symbol(orderbook.symbol): issues.append("Invalid symbol format") if not orderbook.exchange: issues.append("Missing exchange") if not orderbook.bids: issues.append("No bid levels") if not orderbook.asks: issues.append("No ask levels") return issues def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]: """Check order book price validity""" issues = [] # Check bid prices (should be descending) for i, bid in enumerate(orderbook.bids): if not validate_price(bid.price): issues.append(f"Invalid bid price at level {i}: {bid.price}") if i > 0 and bid.price >= orderbook.bids[i-1].price: issues.append(f"Bid prices not descending at level {i}") # Check ask prices (should be ascending) for i, ask in enumerate(orderbook.asks): if not validate_price(ask.price): issues.append(f"Invalid ask price at level {i}: {ask.price}") if i > 0 and ask.price <= orderbook.asks[i-1].price: issues.append(f"Ask prices not ascending at level {i}") # Check bid-ask ordering if orderbook.bids and orderbook.asks: if orderbook.bids[0].price >= orderbook.asks[0].price: issues.append("Best bid >= best ask (crossed book)") return issues def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]: """Check order book volume validity""" issues = [] # Check bid volumes for i, bid in enumerate(orderbook.bids): if not validate_volume(bid.size): issues.append(f"Invalid bid volume at level {i}: {bid.size}") if bid.size < self.min_volume_threshold: issues.append(f"Bid volume below threshold at level {i}: {bid.size}") # Check ask volumes for i, ask in enumerate(orderbook.asks): if not validate_volume(ask.size): issues.append(f"Invalid ask volume at level {i}: {ask.size}") if ask.size < self.min_volume_threshold: issues.append(f"Ask volume below threshold at level {i}: {ask.size}") return issues def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]: """Check order book spread validity""" issues = [] if orderbook.mid_price and orderbook.spread: spread_percentage = (orderbook.spread / orderbook.mid_price) * 100 if spread_percentage > self.max_spread_percentage: issues.append(f"Spread too wide: {spread_percentage:.2f}%") if spread_percentage < 0: issues.append(f"Negative spread: {spread_percentage:.2f}%") return issues def _check_timestamp(self, timestamp: datetime) -> List[str]: """Check timestamp validity""" issues = [] if not timestamp: issues.append("Missing timestamp") return issues # Check if timestamp is timezone-aware if timestamp.tzinfo is None: issues.append("Timestamp missing timezone info") # Check timestamp drift current_time = get_current_timestamp() time_diff = abs((timestamp - current_time).total_seconds()) if time_diff > self.max_timestamp_drift: issues.append(f"Timestamp drift too large: {time_diff:.1f}s") return issues def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]: """Check price consistency with history""" issues = [] key = f"{orderbook.symbol}_{orderbook.exchange}" if key in self.price_history and orderbook.mid_price: last_price = self.price_history[key] price_change = abs(orderbook.mid_price - last_price) / last_price * 100 if price_change > self.max_price_change_percentage: issues.append(f"Large price change: {price_change:.2f}%") return issues def _update_price_history(self, orderbook: OrderBookSnapshot) -> None: """Update price history for future validation""" if orderbook.mid_price: key = f"{orderbook.symbol}_{orderbook.exchange}" self.price_history[key] = orderbook.mid_price def get_quality_summary(self) -> Dict[str, int]: """Get summary of quality checks performed""" return { 'symbols_tracked': len(self.price_history), 'max_spread_percentage': self.max_spread_percentage, 'max_price_change_percentage': self.max_price_change_percentage, 'min_volume_threshold': self.min_volume_threshold, 'max_timestamp_drift': self.max_timestamp_drift }