"""
Data Validator for unified storage system.

Validates all incoming data before storage to ensure data integrity.
"""

import logging
import math
from datetime import datetime, timedelta, timezone
from typing import Any, Dict, List, Optional, Tuple

import pandas as pd

from .unified_data_models import OHLCVCandle, OrderBookDataFrame, TradeEvent

logger = logging.getLogger(__name__)


class DataValidator:
    """
    Validates all incoming data before storage.

    Every ``validate_*`` method returns ``(is_valid, error_message)`` and
    never raises: conversion problems are reported through the error message.
    The ``sanitize_*`` methods are best-effort repairs that return a shallow
    copy of their input and log (rather than raise) on failure.
    """

    # Validation thresholds
    MAX_PRICE_CHANGE_PERCENT = 50.0  # 50% max price change per candle
    MIN_PRICE = 0.0001  # Minimum valid price
    MAX_PRICE = 1000000.0  # Maximum valid price
    MIN_VOLUME = 0.0  # Minimum valid volume
    MAX_SPREAD_PERCENT = 10.0  # 10% max spread
    MAX_TIMESTAMP_DRIFT_DAYS = 3650  # Timestamps must be within ~10 years of now

    # Common symbol formats: BTCUSDT, BTC/USDT, BTC-USDT (either letter case).
    _VALID_SYMBOL_CHARS = frozenset(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        'abcdefghijklmnopqrstuvwxyz'
        '0123456789/-'
    )
    # A tuple (not a set) so that unhashable inputs are simply "not found".
    _VALID_TIMEFRAMES = ('1s', '1m', '5m', '15m', '30m', '1h', '4h', '1d', '1w')
    _VALID_SIDES = ('buy', 'sell', 'bid', 'ask')

    @staticmethod
    def validate_ohlcv(candle: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCV candle data.

        Checks, in order: required fields, price bounds, OHLC ordering,
        volume, open-to-close price change, timestamp, symbol and timeframe.

        Args:
            candle: Dictionary with OHLCV data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            required_fields = ('timestamp', 'symbol', 'timeframe',
                               'open_price', 'high_price', 'low_price',
                               'close_price', 'volume')
            for field in required_fields:
                if field not in candle:
                    return False, f"Missing required field: {field}"

            # Extract values
            open_price = float(candle['open_price'])
            high_price = float(candle['high_price'])
            low_price = float(candle['low_price'])
            close_price = float(candle['close_price'])
            volume = float(candle['volume'])

            # Validate price ranges
            for price in (open_price, high_price, low_price, close_price):
                # NaN compares False against every bound, so it would
                # otherwise slip through all of the checks below.
                if math.isnan(price):
                    return False, f"Price is not a number: {price}"
                if price < DataValidator.MIN_PRICE:
                    return False, f"Price below minimum: {price}"
                if price > DataValidator.MAX_PRICE:
                    return False, f"Price above maximum: {price}"

            # Validate OHLC relationships
            if high_price < low_price:
                return False, f"High ({high_price}) < Low ({low_price})"
            if high_price < open_price:
                return False, f"High ({high_price}) < Open ({open_price})"
            if high_price < close_price:
                return False, f"High ({high_price}) < Close ({close_price})"
            if low_price > open_price:
                return False, f"Low ({low_price}) > Open ({open_price})"
            if low_price > close_price:
                return False, f"Low ({low_price}) > Close ({close_price})"

            # Validate volume
            if volume < DataValidator.MIN_VOLUME:
                return False, f"Volume below minimum: {volume}"
            if not math.isfinite(volume):
                # Catches NaN and +inf, which pass the lower-bound check.
                return False, f"Volume is not a finite number: {volume}"

            # Validate price change (prevent extreme outliers)
            if open_price > 0:
                price_change_percent = abs((close_price - open_price) / open_price) * 100
                if price_change_percent > DataValidator.MAX_PRICE_CHANGE_PERCENT:
                    return False, f"Price change too large: {price_change_percent:.2f}%"

            # Validate timestamp
            if not DataValidator._validate_timestamp(candle['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(candle['symbol']):
                return False, f"Invalid symbol: {candle['symbol']}"

            # Validate timeframe
            if not DataValidator._validate_timeframe(candle['timeframe']):
                return False, f"Invalid timeframe: {candle['timeframe']}"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating OHLCV: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_ohlcv_candle(candle: OHLCVCandle) -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCVCandle object.

        Copies the candle's attributes into a dictionary and delegates to
        :meth:`validate_ohlcv`.

        Args:
            candle: OHLCVCandle object

        Returns:
            Tuple of (is_valid, error_message)
        """
        candle_dict = {
            'timestamp': candle.timestamp,
            'symbol': candle.symbol,
            'timeframe': candle.timeframe,
            'open_price': candle.open_price,
            'high_price': candle.high_price,
            'low_price': candle.low_price,
            'close_price': candle.close_price,
            'volume': candle.volume,
        }
        return DataValidator.validate_ohlcv(candle_dict)

    @staticmethod
    def _validate_levels(levels: List[Any], side: str) -> Tuple[List[float], Optional[str]]:
        """
        Validate one side of an order book.

        Args:
            levels: Sequence of ``[price, size, ...]`` entries
            side: ``'bid'`` or ``'ask'``; only used in error messages

        Returns:
            Tuple of (prices, error_message). ``prices`` holds the parsed
            price of every level when ``error_message`` is None.

        Raises:
            ValueError, TypeError: if a price or size cannot be converted to
                float; the calling validator turns these into an error tuple.
        """
        prices: List[float] = []
        for level in levels:
            if not isinstance(level, (list, tuple)) or len(level) < 2:
                return [], f"Invalid {side} format"

            price, size = float(level[0]), float(level[1])

            # Written as a positive range test so that NaN is rejected too.
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return [], f"Invalid {side} price: {price}"
            if not math.isfinite(size) or size <= 0:
                return [], f"Invalid {side} size: {size}"

            prices.append(price)
        return prices, None

    @staticmethod
    def validate_orderbook(orderbook: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book data.

        Requires non-empty ``bids`` and ``asks`` lists of ``[price, size]``
        levels, a best bid strictly below the best ask, and a spread no wider
        than ``MAX_SPREAD_PERCENT`` of the mid price. ``timestamp`` and
        ``symbol`` are validated only when present.

        Args:
            orderbook: Dictionary with order book data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            if 'bids' not in orderbook or 'asks' not in orderbook:
                return False, "Missing bids or asks"

            bids = orderbook['bids']
            asks = orderbook['asks']

            # Check if bids and asks are lists
            if not isinstance(bids, list) or not isinstance(asks, list):
                return False, "Bids and asks must be lists"

            # Check if bids and asks are not empty
            if not bids or not asks:
                return False, "Bids or asks are empty"

            # Validate bid levels, then ask levels
            bid_prices, error = DataValidator._validate_levels(bids, 'bid')
            if error is not None:
                return False, error
            ask_prices, error = DataValidator._validate_levels(asks, 'ask')
            if error is not None:
                return False, error

            # Validate bid/ask relationship
            best_bid = max(bid_prices)
            best_ask = min(ask_prices)
            if best_bid >= best_ask:
                return False, f"Best bid ({best_bid}) >= Best ask ({best_ask})"

            # Validate spread (mid_price is positive: both sides >= MIN_PRICE)
            spread = best_ask - best_bid
            mid_price = (best_bid + best_ask) / 2
            spread_percent = (spread / mid_price) * 100
            if spread_percent > DataValidator.MAX_SPREAD_PERCENT:
                return False, f"Spread too large: {spread_percent:.2f}%"

            # Validate timestamp if present
            if 'timestamp' in orderbook:
                if not DataValidator._validate_timestamp(orderbook['timestamp']):
                    return False, "Invalid timestamp"

            # Validate symbol if present
            if 'symbol' in orderbook:
                if not DataValidator._validate_symbol(orderbook['symbol']):
                    return False, f"Invalid symbol: {orderbook['symbol']}"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating orderbook: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_orderbook_dataframe(orderbook: OrderBookDataFrame) -> Tuple[bool, Optional[str]]:
        """
        Validate OrderBookDataFrame object.

        Copies the object's ``bids``, ``asks``, ``timestamp`` and ``symbol``
        attributes into a dictionary and delegates to
        :meth:`validate_orderbook`.

        Args:
            orderbook: OrderBookDataFrame object

        Returns:
            Tuple of (is_valid, error_message)
        """
        orderbook_dict = {
            'bids': orderbook.bids,
            'asks': orderbook.asks,
            'timestamp': orderbook.timestamp,
            'symbol': orderbook.symbol,
        }
        return DataValidator.validate_orderbook(orderbook_dict)

    @staticmethod
    def validate_trade(trade: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate trade event data.

        Args:
            trade: Dictionary with trade data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            required_fields = ('timestamp', 'symbol', 'price', 'size', 'side', 'trade_id')
            for field in required_fields:
                if field not in trade:
                    return False, f"Missing required field: {field}"

            # Validate price (positive range test so that NaN is rejected)
            price = float(trade['price'])
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return False, f"Invalid price: {price}"

            # Validate size
            size = float(trade['size'])
            if not math.isfinite(size) or size <= 0:
                return False, f"Invalid size: {size}"

            # Validate side; a non-string side is invalid input, not an
            # unexpected internal error.
            side = trade['side']
            if not isinstance(side, str) or side.lower() not in DataValidator._VALID_SIDES:
                return False, f"Invalid side: {trade['side']}"

            # Validate timestamp
            if not DataValidator._validate_timestamp(trade['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(trade['symbol']):
                return False, f"Invalid symbol: {trade['symbol']}"

            # Validate trade_id
            if not trade['trade_id']:
                return False, "Empty trade_id"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating trade: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_trade_event(trade: TradeEvent) -> Tuple[bool, Optional[str]]:
        """
        Validate TradeEvent object.

        Copies the event's attributes into a dictionary and delegates to
        :meth:`validate_trade`.

        Args:
            trade: TradeEvent object

        Returns:
            Tuple of (is_valid, error_message)
        """
        trade_dict = {
            'timestamp': trade.timestamp,
            'symbol': trade.symbol,
            'price': trade.price,
            'size': trade.size,
            'side': trade.side,
            'trade_id': trade.trade_id,
        }
        return DataValidator.validate_trade(trade_dict)

    @staticmethod
    def validate_imbalances(imbalances: Dict[str, float]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book imbalance metrics.

        At least one known imbalance field must be present and every present
        field must be convertible to float. Values outside [-1, 1] (including
        NaN) are logged as warnings but do not fail validation.

        Args:
            imbalances: Dictionary with imbalance metrics

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Expected imbalance fields
            expected_fields = (
                'imbalance_1s', 'imbalance_5s', 'imbalance_15s', 'imbalance_60s',
                'volume_imbalance_1s', 'volume_imbalance_5s',
                'volume_imbalance_15s', 'volume_imbalance_60s',
            )

            # Check if at least some imbalance fields are present
            present_fields = [f for f in expected_fields if f in imbalances]
            if not present_fields:
                return False, "No imbalance fields present"

            # Validate imbalance values (should be between -1 and 1)
            for field in present_fields:
                value = float(imbalances[field])
                # Positive range test so that NaN is reported as well.
                if not (-1.0 <= value <= 1.0):
                    # Don't fail validation, just warn
                    logger.warning("Imbalance %s out of range [-1, 1]: %s", field, value)

            return True, None

        except (ValueError, TypeError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating imbalances: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def _validate_timestamp(timestamp: Any) -> bool:
        """
        Validate timestamp format and value.

        Accepts ``datetime`` objects, Unix timestamps in seconds (int/float)
        and ISO-8601 strings (a trailing ``Z`` is read as UTC). Naive
        datetimes are interpreted as UTC. The timestamp must lie within
        ``MAX_TIMESTAMP_DRIFT_DAYS`` of the current time, past or future.

        Args:
            timestamp: Value to check

        Returns:
            True if the value parses and lies inside the allowed window
        """
        try:
            if isinstance(timestamp, datetime):
                moment = timestamp
            elif isinstance(timestamp, (int, float)):
                # Unix timestamp
                moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            elif isinstance(timestamp, str):
                # ISO format string
                moment = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            else:
                return False

            # Naive and aware datetimes cannot be subtracted; without this a
            # naive timestamp would always be rejected via TypeError.
            if moment.tzinfo is None or moment.utcoffset() is None:
                moment = moment.replace(tzinfo=timezone.utc)

            # Check if timestamp is not too far in the future or past
            now = datetime.now(timezone.utc)
            max_drift = timedelta(days=DataValidator.MAX_TIMESTAMP_DRIFT_DAYS)
            if abs(moment - now) > max_drift:
                logger.warning("Timestamp too far from now: %s", moment)
                return False

            return True

        except Exception as e:
            logger.debug("Timestamp validation error: %s", e)
            return False

    @staticmethod
    def _validate_symbol(symbol: str) -> bool:
        """
        Validate trading symbol format.

        A symbol is a string of 3 to 20 characters drawn from ASCII letters
        (either case), digits, ``/`` and ``-``.
        """
        if not symbol or not isinstance(symbol, str):
            return False

        # Symbol should be non-empty and reasonable length
        if not 3 <= len(symbol) <= 20:
            return False

        # Check the original characters rather than ``symbol.upper()``: some
        # non-ASCII characters upper-case into ASCII letters (e.g. 'ß' -> 'SS').
        return all(ch in DataValidator._VALID_SYMBOL_CHARS for ch in symbol)

    @staticmethod
    def _validate_timeframe(timeframe: str) -> bool:
        """Validate timeframe format against the supported timeframe list."""
        return timeframe in DataValidator._VALID_TIMEFRAMES

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, Optional[str]]:
        """
        Validate pandas DataFrame has required columns and valid data.

        Args:
            df: DataFrame to validate
            required_columns: List of required column names

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check if DataFrame is empty
            if df.empty:
                return False, "DataFrame is empty"

            # Check required columns
            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                return False, f"Missing columns: {missing_columns}"

            # Check for null values in required columns
            null_counts = df[required_columns].isnull().sum()
            if null_counts.any():
                null_cols = null_counts[null_counts > 0].to_dict()
                return False, f"Null values found: {null_cols}"

            return True, None

        except Exception as e:
            logger.error("DataFrame validation error: %s", e)
            return False, f"Validation error: {str(e)}"

    @staticmethod
    def sanitize_ohlcv(candle: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize OHLCV data by fixing common issues.

        Converts price/volume fields to float, widens high/low so they bracket
        whichever of open/close are present, clamps volume at zero and coerces
        ``trade_count`` to int. On any error the partially sanitized copy is
        returned and the error is logged.

        Args:
            candle: OHLCV candle dictionary

        Returns:
            Sanitized candle dictionary (a shallow copy; the input is not
            modified)
        """
        sanitized = candle.copy()

        try:
            # Ensure numeric types
            for field in ('open_price', 'high_price', 'low_price', 'close_price', 'volume'):
                if field in sanitized:
                    sanitized[field] = float(sanitized[field])

            # Fix high/low if needed
            if 'high_price' in sanitized and 'low_price' in sanitized:
                # Only consider prices that are actually present; defaulting a
                # missing open/close to 0 would drag the low down to 0.
                body_prices = [
                    sanitized[key]
                    for key in ('open_price', 'close_price')
                    if key in sanitized
                ]
                # Ensure high >= all prices
                sanitized['high_price'] = max([sanitized['high_price'], *body_prices])
                # Ensure low <= all prices
                sanitized['low_price'] = min([sanitized['low_price'], *body_prices])

            # Ensure non-negative volume (0.0 keeps the field a float)
            if 'volume' in sanitized:
                sanitized['volume'] = max(0.0, sanitized['volume'])

            # Ensure trade_count is integer
            if 'trade_count' in sanitized:
                sanitized['trade_count'] = int(sanitized['trade_count'])

        except Exception as e:
            logger.error("Error sanitizing OHLCV: %s", e)

        return sanitized

    @staticmethod
    def _is_valid_level(level: Any) -> bool:
        """Return True if ``level`` is a ``[price, size, ...]`` entry with finite, positive price and size."""
        if not isinstance(level, (list, tuple)) or len(level) < 2:
            return False
        try:
            price, size = float(level[0]), float(level[1])
        except (TypeError, ValueError):
            # A non-numeric level is dropped instead of aborting sanitization.
            return False
        return math.isfinite(price) and math.isfinite(size) and price > 0 and size > 0

    @staticmethod
    def sanitize_orderbook(orderbook: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize order book data by fixing common issues.

        For each of ``bids`` and ``asks`` that is present: a non-list value is
        replaced by an empty list, invalid levels are removed, and the
        remaining levels are sorted by price (bids descending, asks
        ascending).

        Args:
            orderbook: Order book dictionary

        Returns:
            Sanitized order book dictionary (a shallow copy; the input's
            lists are replaced, not mutated)
        """
        sanitized = orderbook.copy()

        try:
            # Bids are sorted best-first (descending), asks ascending.
            for side, descending in (('bids', True), ('asks', False)):
                if side not in sanitized:
                    continue

                # Ensure bids and asks are lists
                levels = sanitized[side] if isinstance(sanitized[side], list) else []

                # Remove invalid levels
                valid_levels = [
                    level for level in levels
                    if DataValidator._is_valid_level(level)
                ]

                sanitized[side] = sorted(
                    valid_levels,
                    key=lambda level: float(level[0]),
                    reverse=descending,
                )

        except Exception as e:
            logger.error("Error sanitizing orderbook: %s", e)

        return sanitized