Files
gogo2/core/unified_data_validator.py
2025-10-20 11:16:27 +03:00

528 lines
20 KiB
Python

"""
Data Validator for unified storage system.
Validates all incoming data before storage to ensure data integrity.
"""
import logging
import math
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Optional, Any

import pandas as pd

from .unified_data_models import OHLCVCandle, OrderBookDataFrame, TradeEvent
logger = logging.getLogger(__name__)
class DataValidator:
    """
    Validates all incoming data before storage.
    Ensures data integrity and consistency across the system.

    The dict-based ``validate_*`` methods catch their own errors and report
    them through the returned ``(is_valid, error_message)`` tuple. The
    ``sanitize_*`` methods work on a shallow copy and repair what they can
    on a best-effort basis.
    """

    # Validation thresholds
    MAX_PRICE_CHANGE_PERCENT = 50.0  # 50% max price change per candle
    MIN_PRICE = 0.0001  # Minimum valid price
    MAX_PRICE = 1000000.0  # Maximum valid price
    MIN_VOLUME = 0.0  # Minimum valid volume
    MAX_SPREAD_PERCENT = 10.0  # 10% max spread

    # Internal lookup tables (private; hoisted so they are built only once)
    _MAX_TIMESTAMP_DRIFT_DAYS = 3650  # Allow timestamps within ~10 years of now
    _OHLCV_REQUIRED_FIELDS = ('timestamp', 'symbol', 'timeframe', 'open_price',
                              'high_price', 'low_price', 'close_price', 'volume')
    _OHLCV_NUMERIC_FIELDS = ('open_price', 'high_price', 'low_price',
                             'close_price', 'volume')
    _TRADE_REQUIRED_FIELDS = ('timestamp', 'symbol', 'price', 'size', 'side', 'trade_id')
    _VALID_SIDES = ('buy', 'sell', 'bid', 'ask')
    _VALID_TIMEFRAMES = ('1s', '1m', '5m', '15m', '30m', '1h', '4h', '1d', '1w')
    # Common symbol formats: BTCUSDT, BTC/USDT, BTC-USDT
    _VALID_SYMBOL_CHARS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789/-')
    _IMBALANCE_FIELDS = (
        'imbalance_1s', 'imbalance_5s', 'imbalance_15s', 'imbalance_60s',
        'volume_imbalance_1s', 'volume_imbalance_5s',
        'volume_imbalance_15s', 'volume_imbalance_60s',
    )

    @staticmethod
    def validate_ohlcv(candle: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCV candle data.

        Checks, in order: required fields, price bounds (NaN is rejected),
        OHLC relationships, volume, open-to-close change, timestamp, symbol
        and timeframe. The first failing check determines the message.

        Args:
            candle: Dictionary with OHLCV data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            for field in DataValidator._OHLCV_REQUIRED_FIELDS:
                if field not in candle:
                    return False, f"Missing required field: {field}"

            # Extract values
            open_price = float(candle['open_price'])
            high_price = float(candle['high_price'])
            low_price = float(candle['low_price'])
            close_price = float(candle['close_price'])
            volume = float(candle['volume'])

            # Validate price ranges
            for price in (open_price, high_price, low_price, close_price):
                # NaN compares False against every bound, so it has to be
                # rejected explicitly or it would pass all checks below.
                if math.isnan(price):
                    return False, f"Price is not a number: {price}"
                if price < DataValidator.MIN_PRICE:
                    return False, f"Price below minimum: {price}"
                if price > DataValidator.MAX_PRICE:
                    return False, f"Price above maximum: {price}"

            # Validate OHLC relationships
            if high_price < low_price:
                return False, f"High ({high_price}) < Low ({low_price})"
            if high_price < open_price:
                return False, f"High ({high_price}) < Open ({open_price})"
            if high_price < close_price:
                return False, f"High ({high_price}) < Close ({close_price})"
            if low_price > open_price:
                return False, f"Low ({low_price}) > Open ({open_price})"
            if low_price > close_price:
                return False, f"Low ({low_price}) > Close ({close_price})"

            # Validate volume
            if volume < DataValidator.MIN_VOLUME:
                return False, f"Volume below minimum: {volume}"
            if not math.isfinite(volume):
                # NaN and +inf both get past the lower-bound check above.
                return False, f"Volume is not finite: {volume}"

            # Validate price change (prevent extreme outliers)
            if open_price > 0:
                price_change_percent = abs((close_price - open_price) / open_price) * 100
                if price_change_percent > DataValidator.MAX_PRICE_CHANGE_PERCENT:
                    return False, f"Price change too large: {price_change_percent:.2f}%"

            # Validate timestamp
            if not DataValidator._validate_timestamp(candle['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(candle['symbol']):
                return False, f"Invalid symbol: {candle['symbol']}"

            # Validate timeframe
            if not DataValidator._validate_timeframe(candle['timeframe']):
                return False, f"Invalid timeframe: {candle['timeframe']}"

            return True, None
        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating OHLCV: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_ohlcv_candle(candle: "OHLCVCandle") -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCVCandle object.

        Copies the candle's attributes into a dictionary and delegates to
        ``validate_ohlcv``.

        Args:
            candle: OHLCVCandle object

        Returns:
            Tuple of (is_valid, error_message)
        """
        candle_dict = {
            'timestamp': candle.timestamp,
            'symbol': candle.symbol,
            'timeframe': candle.timeframe,
            'open_price': candle.open_price,
            'high_price': candle.high_price,
            'low_price': candle.low_price,
            'close_price': candle.close_price,
            'volume': candle.volume,
        }
        return DataValidator.validate_ohlcv(candle_dict)

    @staticmethod
    def _validate_book_side(levels: List[Any], label: str) -> Optional[str]:
        """
        Validate every level of one order book side.

        Args:
            levels: List of ``[price, size, ...]`` levels
            label: ``'bid'`` or ``'ask'``; only used in error messages

        Returns:
            The error message for the first bad level, or None if all pass.

        Raises:
            ValueError, TypeError: if a price or size cannot be converted
                to float (handled by ``validate_orderbook``).
        """
        for level in levels:
            if not isinstance(level, (list, tuple)) or len(level) < 2:
                return f"Invalid {label} format"
            price, size = float(level[0]), float(level[1])
            # The chained comparison is False for NaN, so NaN is rejected too.
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return f"Invalid {label} price: {price}"
            if not math.isfinite(size) or size <= 0:
                return f"Invalid {label} size: {size}"
        return None

    @staticmethod
    def validate_orderbook(orderbook: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book data.

        Requires non-empty ``bids`` and ``asks`` lists of ``[price, size]``
        levels, an uncrossed book and a spread no larger than
        ``MAX_SPREAD_PERCENT``. ``timestamp`` and ``symbol`` are validated
        only when present.

        Args:
            orderbook: Dictionary with order book data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            if 'bids' not in orderbook or 'asks' not in orderbook:
                return False, "Missing bids or asks"
            bids = orderbook['bids']
            asks = orderbook['asks']

            # Check if bids and asks are lists
            if not isinstance(bids, list) or not isinstance(asks, list):
                return False, "Bids and asks must be lists"

            # Check if bids and asks are not empty
            if not bids or not asks:
                return False, "Bids or asks are empty"

            # Validate bid levels, then ask levels
            for levels, label in ((bids, 'bid'), (asks, 'ask')):
                error = DataValidator._validate_book_side(levels, label)
                if error is not None:
                    return False, error

            # Validate bid/ask relationship
            best_bid = max(float(level[0]) for level in bids)
            best_ask = min(float(level[0]) for level in asks)
            if best_bid >= best_ask:
                return False, f"Best bid ({best_bid}) >= Best ask ({best_ask})"

            # Validate spread (mid_price is positive: all prices >= MIN_PRICE)
            spread = best_ask - best_bid
            mid_price = (best_bid + best_ask) / 2
            spread_percent = (spread / mid_price) * 100
            if spread_percent > DataValidator.MAX_SPREAD_PERCENT:
                return False, f"Spread too large: {spread_percent:.2f}%"

            # Validate timestamp if present
            if 'timestamp' in orderbook:
                if not DataValidator._validate_timestamp(orderbook['timestamp']):
                    return False, "Invalid timestamp"

            # Validate symbol if present
            if 'symbol' in orderbook:
                if not DataValidator._validate_symbol(orderbook['symbol']):
                    return False, f"Invalid symbol: {orderbook['symbol']}"

            return True, None
        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating orderbook: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_orderbook_dataframe(orderbook: "OrderBookDataFrame") -> Tuple[bool, Optional[str]]:
        """
        Validate OrderBookDataFrame object.

        Copies ``bids``, ``asks``, ``timestamp`` and ``symbol`` into a
        dictionary and delegates to ``validate_orderbook``.

        Args:
            orderbook: OrderBookDataFrame object

        Returns:
            Tuple of (is_valid, error_message)
        """
        orderbook_dict = {
            'bids': orderbook.bids,
            'asks': orderbook.asks,
            'timestamp': orderbook.timestamp,
            'symbol': orderbook.symbol,
        }
        return DataValidator.validate_orderbook(orderbook_dict)

    @staticmethod
    def validate_trade(trade: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate trade event data.

        Args:
            trade: Dictionary with trade data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            for field in DataValidator._TRADE_REQUIRED_FIELDS:
                if field not in trade:
                    return False, f"Missing required field: {field}"

            # Validate price (the chained comparison also rejects NaN)
            price = float(trade['price'])
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return False, f"Invalid price: {price}"

            # Validate size
            size = float(trade['size'])
            if not math.isfinite(size) or size <= 0:
                return False, f"Invalid size: {size}"

            # Validate side; a non-string side is an ordinary validation
            # failure, not an unexpected error.
            side = trade['side']
            if not isinstance(side, str) or side.lower() not in DataValidator._VALID_SIDES:
                return False, f"Invalid side: {side}"

            # Validate timestamp
            if not DataValidator._validate_timestamp(trade['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(trade['symbol']):
                return False, f"Invalid symbol: {trade['symbol']}"

            # Validate trade_id
            if not trade['trade_id']:
                return False, "Empty trade_id"

            return True, None
        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating trade: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_trade_event(trade: "TradeEvent") -> Tuple[bool, Optional[str]]:
        """
        Validate TradeEvent object.

        Copies the event's attributes into a dictionary and delegates to
        ``validate_trade``.

        Args:
            trade: TradeEvent object

        Returns:
            Tuple of (is_valid, error_message)
        """
        trade_dict = {
            'timestamp': trade.timestamp,
            'symbol': trade.symbol,
            'price': trade.price,
            'size': trade.size,
            'side': trade.side,
            'trade_id': trade.trade_id,
        }
        return DataValidator.validate_trade(trade_dict)

    @staticmethod
    def validate_imbalances(imbalances: Dict[str, float]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book imbalance metrics.

        At least one known imbalance field must be present and every present
        field must convert to float. Values outside [-1, 1] (including NaN)
        only produce a warning.

        Args:
            imbalances: Dictionary with imbalance metrics

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check if at least some imbalance fields are present
            present_fields = [
                name for name in DataValidator._IMBALANCE_FIELDS if name in imbalances
            ]
            if not present_fields:
                return False, "No imbalance fields present"

            # Validate imbalance values (should be between -1 and 1)
            for name in present_fields:
                value = float(imbalances[name])
                # Negated chained comparison so that NaN is reported as well.
                if not (-1.0 <= value <= 1.0):
                    # Don't fail validation, just warn
                    logger.warning("Imbalance %s out of range [-1, 1]: %s", name, value)

            return True, None
        except (ValueError, TypeError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating imbalances: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def _validate_timestamp(timestamp: Any) -> bool:
        """
        Validate timestamp format and value.

        Accepts a ``datetime``, a Unix timestamp in seconds (int or float)
        or an ISO-8601 string (a trailing ``Z`` is read as UTC). Naive
        datetimes are interpreted as UTC. The value must lie within
        ``_MAX_TIMESTAMP_DRIFT_DAYS`` of the current time.
        """
        try:
            if isinstance(timestamp, bool):
                # bool is a subclass of int but is never a real timestamp.
                return False
            if isinstance(timestamp, datetime):
                moment = timestamp
            elif isinstance(timestamp, (int, float)):
                # Unix timestamp
                moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            elif isinstance(timestamp, str):
                # ISO format string
                moment = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            else:
                return False

            # Subtracting a naive datetime from an aware one raises
            # TypeError, so naive values are pinned to UTC first.
            if moment.tzinfo is None or moment.utcoffset() is None:
                moment = moment.replace(tzinfo=timezone.utc)

            # abs() on the timedelta treats past and future symmetrically.
            drift = abs(moment - datetime.now(timezone.utc))
            if drift.days > DataValidator._MAX_TIMESTAMP_DRIFT_DAYS:
                logger.warning("Timestamp too far from now: %s", moment)
                return False
            return True
        except Exception as e:
            logger.debug("Timestamp validation error: %s", e)
            return False

    @staticmethod
    def _validate_symbol(symbol: str) -> bool:
        """
        Validate trading symbol format.

        A valid symbol is an ASCII string of 3 to 20 characters made of
        letters (any case), digits, ``/`` and ``-``.
        """
        # Type check first so the truthiness test only ever sees a str.
        if not isinstance(symbol, str) or not symbol:
            return False

        # Symbol should be non-empty and reasonable length
        if len(symbol) < 3 or len(symbol) > 20:
            return False

        # upper() can map non-ASCII characters onto ASCII letters, so they
        # are rejected before the whitelist comparison.
        if not symbol.isascii():
            return False

        return all(char in DataValidator._VALID_SYMBOL_CHARS for char in symbol.upper())

    @staticmethod
    def _validate_timeframe(timeframe: str) -> bool:
        """Validate timeframe format (must be one of the supported values)."""
        return timeframe in DataValidator._VALID_TIMEFRAMES

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, Optional[str]]:
        """
        Validate pandas DataFrame has required columns and valid data.

        Args:
            df: DataFrame to validate
            required_columns: List of required column names

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if df is None:
                return False, "DataFrame is None"

            # Check if DataFrame is empty
            if df.empty:
                return False, "DataFrame is empty"

            # A list is required for column selection; a tuple would be
            # treated by pandas as a single (multi-level) key.
            columns = list(required_columns)

            # Check required columns
            missing_columns = [col for col in columns if col not in df.columns]
            if missing_columns:
                return False, f"Missing columns: {missing_columns}"

            # Check for null values in required columns
            null_counts = df[columns].isnull().sum()
            if null_counts.any():
                null_cols = null_counts[null_counts > 0].to_dict()
                return False, f"Null values found: {null_cols}"

            return True, None
        except Exception as e:
            logger.error("DataFrame validation error: %s", e)
            return False, f"Validation error: {str(e)}"

    @staticmethod
    def sanitize_ohlcv(candle: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize OHLCV data by fixing common issues.

        Converts price/volume fields to float, widens high/low so they
        enclose whichever of open/close are present, clamps volume at zero
        and casts ``trade_count`` to int. If a step fails, the error is
        logged and the partially sanitized copy is returned.

        Args:
            candle: OHLCV candle dictionary

        Returns:
            Sanitized candle dictionary
        """
        sanitized = candle.copy()
        try:
            # Ensure numeric types
            for field in DataValidator._OHLCV_NUMERIC_FIELDS:
                if field in sanitized:
                    sanitized[field] = float(sanitized[field])

            # Fix high/low if needed
            if 'high_price' in sanitized and 'low_price' in sanitized:
                # Only clamp against fields that exist: defaulting a missing
                # open/close to 0 would drag low_price down to 0.
                reference_prices = [
                    sanitized[name]
                    for name in ('open_price', 'close_price')
                    if name in sanitized
                ]
                # Ensure high >= all prices
                sanitized['high_price'] = max([sanitized['high_price'], *reference_prices])
                # Ensure low <= all prices
                sanitized['low_price'] = min([sanitized['low_price'], *reference_prices])

            # Ensure non-negative volume
            if 'volume' in sanitized:
                sanitized['volume'] = max(0.0, sanitized['volume'])

            # Ensure trade_count is integer
            if 'trade_count' in sanitized:
                sanitized['trade_count'] = int(sanitized['trade_count'])
        except Exception as e:
            logger.error("Error sanitizing OHLCV: %s", e)
        return sanitized

    @staticmethod
    def _is_positive_level(level: Any) -> bool:
        """Return True if ``level`` is a list/tuple whose price and size parse as floats > 0."""
        if not isinstance(level, (list, tuple)) or len(level) < 2:
            return False
        try:
            return float(level[0]) > 0 and float(level[1]) > 0
        except (TypeError, ValueError):
            # An unparsable level is dropped instead of aborting the whole
            # sanitisation pass.
            return False

    @staticmethod
    def sanitize_orderbook(orderbook: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize order book data by fixing common issues.

        For each of ``bids`` / ``asks`` that is present: a non-list value is
        replaced by an empty list, levels that are malformed, non-numeric or
        not strictly positive are removed, and the rest are sorted by price
        (bids descending, asks ascending).

        Args:
            orderbook: Order book dictionary

        Returns:
            Sanitized order book dictionary
        """
        sanitized = orderbook.copy()
        try:
            for side, descending in (('bids', True), ('asks', False)):
                if side not in sanitized:
                    continue

                # Ensure the side is a list
                levels = sanitized[side]
                if not isinstance(levels, list):
                    levels = []

                # Remove invalid levels
                kept = [level for level in levels if DataValidator._is_positive_level(level)]

                # Sort bids descending, asks ascending
                sanitized[side] = sorted(
                    kept, key=lambda level: float(level[0]), reverse=descending
                )
        except Exception as e:
            logger.error("Error sanitizing orderbook: %s", e)
        return sanitized