"""
Data Validator for unified storage system.

Validates all incoming data before storage to ensure data integrity.
"""

import logging
import math
from datetime import datetime, timezone
from typing import Dict, List, Tuple, Optional, Any

import pandas as pd

from .unified_data_models import OHLCVCandle, OrderBookDataFrame, TradeEvent

# Module-level logger used by the DataValidator methods below.
logger = logging.getLogger(__name__)


class DataValidator:
    """
    Validates all incoming data before storage.

    Ensures data integrity and consistency across the system.

    The dictionary-based ``validate_*`` methods return a tuple of
    ``(is_valid, error_message)`` where ``error_message`` is ``None`` on
    success. The ``sanitize_*`` methods return a best-effort cleaned copy of
    their input and log (rather than raise) when cleaning fails part-way.
    """

    # Validation thresholds
    MAX_PRICE_CHANGE_PERCENT = 50.0  # 50% max price change per candle
    MIN_PRICE = 0.0001  # Minimum valid price
    MAX_PRICE = 1000000.0  # Maximum valid price
    MIN_VOLUME = 0.0  # Minimum valid volume
    MAX_SPREAD_PERCENT = 10.0  # 10% max spread

    # Timestamps further than this from "now" are rejected (3650 days).
    _MAX_TIMESTAMP_DRIFT_SECONDS = 3650 * 24 * 60 * 60

    # Kept as tuples (not sets) so unhashable inputs compare as "not found"
    # instead of raising TypeError.
    _VALID_TIMEFRAMES = ('1s', '1m', '5m', '15m', '30m', '1h', '4h', '1d', '1w')
    _VALID_TRADE_SIDES = ('buy', 'sell', 'bid', 'ask')

    # Common symbol formats: BTCUSDT, BTC/USDT, BTC-USDT
    _SYMBOL_CHARS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789/-')

    _OHLCV_REQUIRED_FIELDS = (
        'timestamp', 'symbol', 'timeframe', 'open_price',
        'high_price', 'low_price', 'close_price', 'volume',
    )
    _OHLCV_NUMERIC_FIELDS = (
        'open_price', 'high_price', 'low_price', 'close_price', 'volume',
    )
    _TRADE_REQUIRED_FIELDS = (
        'timestamp', 'symbol', 'price', 'size', 'side', 'trade_id',
    )
    _IMBALANCE_FIELDS = (
        'imbalance_1s', 'imbalance_5s', 'imbalance_15s', 'imbalance_60s',
        'volume_imbalance_1s', 'volume_imbalance_5s',
        'volume_imbalance_15s', 'volume_imbalance_60s',
    )

    @staticmethod
    def validate_ohlcv(candle: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCV candle data.

        Checks, in order: required fields, price range (NaN is rejected),
        OHLC relationships, volume, open-to-close price change, timestamp,
        symbol and timeframe. The first failing check decides the message.

        Args:
            candle: Dictionary with OHLCV data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            for field in DataValidator._OHLCV_REQUIRED_FIELDS:
                if field not in candle:
                    return False, f"Missing required field: {field}"

            # Extract values
            open_price = float(candle['open_price'])
            high_price = float(candle['high_price'])
            low_price = float(candle['low_price'])
            close_price = float(candle['close_price'])
            volume = float(candle['volume'])

            # Validate price ranges
            for price in (open_price, high_price, low_price, close_price):
                # NaN compares False against everything, so it would slip
                # through the range checks below without this guard.
                if math.isnan(price):
                    return False, f"Price is not a number: {price}"
                if price < DataValidator.MIN_PRICE:
                    return False, f"Price below minimum: {price}"
                if price > DataValidator.MAX_PRICE:
                    return False, f"Price above maximum: {price}"

            # Validate OHLC relationships
            if high_price < low_price:
                return False, f"High ({high_price}) < Low ({low_price})"
            if high_price < open_price:
                return False, f"High ({high_price}) < Open ({open_price})"
            if high_price < close_price:
                return False, f"High ({high_price}) < Close ({close_price})"
            if low_price > open_price:
                return False, f"Low ({low_price}) > Open ({open_price})"
            if low_price > close_price:
                return False, f"Low ({low_price}) > Close ({close_price})"

            # Validate volume (NaN / infinity are never valid volumes)
            if not math.isfinite(volume):
                return False, f"Volume is not finite: {volume}"
            if volume < DataValidator.MIN_VOLUME:
                return False, f"Volume below minimum: {volume}"

            # Validate price change (prevent extreme outliers)
            if open_price > 0:
                price_change_percent = abs((close_price - open_price) / open_price) * 100
                if price_change_percent > DataValidator.MAX_PRICE_CHANGE_PERCENT:
                    return False, f"Price change too large: {price_change_percent:.2f}%"

            # Validate timestamp
            if not DataValidator._validate_timestamp(candle['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(candle['symbol']):
                return False, f"Invalid symbol: {candle['symbol']}"

            # Validate timeframe
            if not DataValidator._validate_timeframe(candle['timeframe']):
                return False, f"Invalid timeframe: {candle['timeframe']}"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating OHLCV: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_ohlcv_candle(candle: "OHLCVCandle") -> Tuple[bool, Optional[str]]:
        """
        Validate OHLCVCandle object.

        Copies the candle's attributes into a dictionary and delegates to
        ``validate_ohlcv``.

        Args:
            candle: OHLCVCandle object

        Returns:
            Tuple of (is_valid, error_message)
        """
        candle_dict = {
            'timestamp': candle.timestamp,
            'symbol': candle.symbol,
            'timeframe': candle.timeframe,
            'open_price': candle.open_price,
            'high_price': candle.high_price,
            'low_price': candle.low_price,
            'close_price': candle.close_price,
            'volume': candle.volume,
        }
        return DataValidator.validate_ohlcv(candle_dict)

    @staticmethod
    def _check_levels(levels: List[Any], side: str) -> Optional[str]:
        """
        Return an error message for the first bad level in *levels*, else None.

        *side* is ``"bid"`` or ``"ask"`` and is only used in the message.
        Conversion errors from ``float()`` propagate to the caller.
        """
        for level in levels:
            if not isinstance(level, (list, tuple)) or len(level) < 2:
                return f"Invalid {side} format"

            price, size = float(level[0]), float(level[1])

            # Written as a chained range test so NaN is rejected too.
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return f"Invalid {side} price: {price}"

            if not (math.isfinite(size) and size > 0):
                return f"Invalid {side} size: {size}"

        return None

    @staticmethod
    def validate_orderbook(orderbook: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book data.

        Requires non-empty ``bids`` and ``asks`` lists of ``[price, size, ...]``
        levels, a best bid strictly below the best ask, and a spread no wider
        than ``MAX_SPREAD_PERCENT`` of the mid price. ``timestamp`` and
        ``symbol`` are validated only when present.

        Args:
            orderbook: Dictionary with order book data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            if 'bids' not in orderbook or 'asks' not in orderbook:
                return False, "Missing bids or asks"

            bids = orderbook['bids']
            asks = orderbook['asks']

            # Check if bids and asks are lists
            if not isinstance(bids, list) or not isinstance(asks, list):
                return False, "Bids and asks must be lists"

            # Check if bids and asks are not empty
            if not bids or not asks:
                return False, "Bids or asks are empty"

            # Validate bid levels, then ask levels
            for levels, side in ((bids, 'bid'), (asks, 'ask')):
                error = DataValidator._check_levels(levels, side)
                if error is not None:
                    return False, error

            # Validate bid/ask relationship
            best_bid = max(float(bid[0]) for bid in bids)
            best_ask = min(float(ask[0]) for ask in asks)

            if best_bid >= best_ask:
                return False, f"Best bid ({best_bid}) >= Best ask ({best_ask})"

            # Validate spread (mid_price > 0 because prices are >= MIN_PRICE)
            spread = best_ask - best_bid
            mid_price = (best_bid + best_ask) / 2
            spread_percent = (spread / mid_price) * 100

            if spread_percent > DataValidator.MAX_SPREAD_PERCENT:
                return False, f"Spread too large: {spread_percent:.2f}%"

            # Validate timestamp if present
            if 'timestamp' in orderbook:
                if not DataValidator._validate_timestamp(orderbook['timestamp']):
                    return False, "Invalid timestamp"

            # Validate symbol if present
            if 'symbol' in orderbook:
                if not DataValidator._validate_symbol(orderbook['symbol']):
                    return False, f"Invalid symbol: {orderbook['symbol']}"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating orderbook: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_orderbook_dataframe(orderbook: "OrderBookDataFrame") -> Tuple[bool, Optional[str]]:
        """
        Validate OrderBookDataFrame object.

        Copies ``bids``, ``asks``, ``timestamp`` and ``symbol`` into a
        dictionary and delegates to ``validate_orderbook``.

        Args:
            orderbook: OrderBookDataFrame object

        Returns:
            Tuple of (is_valid, error_message)
        """
        orderbook_dict = {
            'bids': orderbook.bids,
            'asks': orderbook.asks,
            'timestamp': orderbook.timestamp,
            'symbol': orderbook.symbol,
        }
        return DataValidator.validate_orderbook(orderbook_dict)

    @staticmethod
    def validate_trade(trade: Dict[str, Any]) -> Tuple[bool, Optional[str]]:
        """
        Validate trade event data.

        Checks required fields, price range, positive finite size, side
        (case-insensitive ``buy``/``sell``/``bid``/``ask``), timestamp, symbol
        and a non-empty ``trade_id``.

        Args:
            trade: Dictionary with trade data

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check required fields
            for field in DataValidator._TRADE_REQUIRED_FIELDS:
                if field not in trade:
                    return False, f"Missing required field: {field}"

            # Validate price (chained comparison also rejects NaN)
            price = float(trade['price'])
            if not (DataValidator.MIN_PRICE <= price <= DataValidator.MAX_PRICE):
                return False, f"Invalid price: {price}"

            # Validate size
            size = float(trade['size'])
            if not (math.isfinite(size) and size > 0):
                return False, f"Invalid size: {size}"

            # Validate side; a non-string side is invalid, not an
            # "unexpected error".
            side = trade['side']
            if not isinstance(side, str) or side.lower() not in DataValidator._VALID_TRADE_SIDES:
                return False, f"Invalid side: {side}"

            # Validate timestamp
            if not DataValidator._validate_timestamp(trade['timestamp']):
                return False, "Invalid timestamp"

            # Validate symbol
            if not DataValidator._validate_symbol(trade['symbol']):
                return False, f"Invalid symbol: {trade['symbol']}"

            # Validate trade_id
            if not trade['trade_id']:
                return False, "Empty trade_id"

            return True, None

        except (ValueError, TypeError, KeyError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating trade: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def validate_trade_event(trade: "TradeEvent") -> Tuple[bool, Optional[str]]:
        """
        Validate TradeEvent object.

        Copies the event's attributes into a dictionary and delegates to
        ``validate_trade``.

        Args:
            trade: TradeEvent object

        Returns:
            Tuple of (is_valid, error_message)
        """
        trade_dict = {
            'timestamp': trade.timestamp,
            'symbol': trade.symbol,
            'price': trade.price,
            'size': trade.size,
            'side': trade.side,
            'trade_id': trade.trade_id,
        }
        return DataValidator.validate_trade(trade_dict)

    @staticmethod
    def validate_imbalances(imbalances: Dict[str, float]) -> Tuple[bool, Optional[str]]:
        """
        Validate order book imbalance metrics.

        At least one known imbalance field must be present and every present
        field must convert to ``float``. Values outside ``[-1, 1]`` (including
        NaN) are only logged as warnings; they do not fail validation.

        Args:
            imbalances: Dictionary with imbalance metrics

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            # Check if at least some imbalance fields are present
            present_fields = [
                field for field in DataValidator._IMBALANCE_FIELDS
                if field in imbalances
            ]
            if not present_fields:
                return False, "No imbalance fields present"

            # Validate imbalance values (should be between -1 and 1)
            for field in present_fields:
                value = float(imbalances[field])
                if not (-1.0 <= value <= 1.0):
                    # Don't fail validation, just warn
                    logger.warning("Imbalance %s out of range [-1, 1]: %s", field, value)

            return True, None

        except (ValueError, TypeError) as e:
            return False, f"Validation error: {str(e)}"
        except Exception as e:
            logger.error("Unexpected error validating imbalances: %s", e)
            return False, f"Unexpected error: {str(e)}"

    @staticmethod
    def _validate_timestamp(timestamp: Any) -> bool:
        """
        Validate timestamp format and value.

        Accepts ``datetime`` objects, Unix timestamps in seconds (``int`` /
        ``float``) and ISO-8601 strings (a trailing ``Z`` is accepted). The
        moment must lie within ``_MAX_TIMESTAMP_DRIFT_SECONDS`` of the
        current time. Any other type, or any parsing failure, yields False.
        """
        try:
            if isinstance(timestamp, datetime):
                moment = timestamp
            elif isinstance(timestamp, (int, float)):
                # Unix timestamp
                moment = datetime.fromtimestamp(timestamp, tz=timezone.utc)
            elif isinstance(timestamp, str):
                # ISO format string
                moment = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
            else:
                return False

            # Naive datetimes cannot be subtracted from an aware "now".
            # Treat them as UTC: against a multi-year window the unknown
            # offset (at most a day) is immaterial.
            if moment.tzinfo is None or moment.utcoffset() is None:
                moment = moment.replace(tzinfo=timezone.utc)

            drift_seconds = abs((moment - datetime.now(timezone.utc)).total_seconds())
            if drift_seconds > DataValidator._MAX_TIMESTAMP_DRIFT_SECONDS:
                logger.warning("Timestamp too far from now: %s", moment)
                return False

            return True

        except Exception as e:
            # Deliberately broad: a validator helper must answer True/False
            # for arbitrary input rather than raise.
            logger.debug("Timestamp validation error: %s", e)
            return False

    @staticmethod
    def _validate_symbol(symbol: str) -> bool:
        """
        Validate trading symbol format.

        A valid symbol is a 3-20 character ASCII string made of letters,
        digits, ``/`` and ``-`` (case-insensitive).
        """
        # Type check first so objects with unusual truthiness never reach
        # the emptiness test.
        if not isinstance(symbol, str) or not symbol:
            return False

        # Symbol should be non-empty and reasonable length
        if len(symbol) < 3 or len(symbol) > 20:
            return False

        # Reject non-ASCII up front: str.upper() can map a non-ASCII
        # character onto ASCII letters (e.g. "ß" -> "SS").
        if not symbol.isascii():
            return False

        return all(char in DataValidator._SYMBOL_CHARS for char in symbol.upper())

    @staticmethod
    def _validate_timeframe(timeframe: str) -> bool:
        """Validate timeframe format against the supported timeframe codes."""
        return timeframe in DataValidator._VALID_TIMEFRAMES

    @staticmethod
    def validate_dataframe(df: pd.DataFrame, required_columns: List[str]) -> Tuple[bool, Optional[str]]:
        """
        Validate pandas DataFrame has required columns and valid data.

        Fails when *df* is not a DataFrame, is empty, lacks any required
        column, or holds null values in a required column.

        Args:
            df: DataFrame to validate
            required_columns: List of required column names

        Returns:
            Tuple of (is_valid, error_message)
        """
        try:
            if not isinstance(df, pd.DataFrame):
                return False, "Input is not a DataFrame"

            # Check if DataFrame is empty
            if df.empty:
                return False, "DataFrame is empty"

            # A list is required for column selection below; a tuple would
            # be treated by pandas as a single (multi-level) key.
            required = list(required_columns)

            # Check required columns
            missing_columns = [col for col in required if col not in df.columns]
            if missing_columns:
                return False, f"Missing columns: {missing_columns}"

            # Check for null values in required columns
            null_counts = df[required].isnull().sum()
            if null_counts.any():
                # Plain ints keep the message independent of NumPy's scalar repr.
                null_cols = {
                    col: int(count)
                    for col, count in null_counts[null_counts > 0].items()
                }
                return False, f"Null values found: {null_cols}"

            return True, None

        except Exception as e:
            logger.error("DataFrame validation error: %s", e)
            return False, f"Validation error: {str(e)}"

    @staticmethod
    def sanitize_ohlcv(candle: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize OHLCV data by fixing common issues.

        Works on a shallow copy: converts price/volume fields to ``float``,
        widens high/low so they bracket whichever of open/close are present,
        clamps volume at zero and converts ``trade_count`` to ``int``. If a
        step fails, the error is logged and the partially cleaned copy is
        returned.

        Args:
            candle: OHLCV candle dictionary

        Returns:
            Sanitized candle dictionary
        """
        sanitized = candle.copy()

        try:
            # Ensure numeric types
            for field in DataValidator._OHLCV_NUMERIC_FIELDS:
                if field in sanitized:
                    sanitized[field] = float(sanitized[field])

            # Fix high/low if needed
            if 'high_price' in sanitized and 'low_price' in sanitized:
                # Only prices that are actually present take part; a
                # missing open/close must not drag the low down to zero.
                body_prices = [
                    sanitized[key]
                    for key in ('open_price', 'close_price')
                    if key in sanitized
                ]

                # Ensure high >= all prices
                sanitized['high_price'] = max([sanitized['high_price']] + body_prices)

                # Ensure low <= all prices
                sanitized['low_price'] = min([sanitized['low_price']] + body_prices)

            # Ensure non-negative volume
            if 'volume' in sanitized:
                sanitized['volume'] = max(0.0, sanitized['volume'])

            # Ensure trade_count is integer
            if 'trade_count' in sanitized:
                sanitized['trade_count'] = int(sanitized['trade_count'])

        except Exception as e:
            # Best-effort: return what was cleaned so far.
            logger.error("Error sanitizing OHLCV: %s", e)

        return sanitized

    @staticmethod
    def _is_positive_level(level: Any) -> bool:
        """Return True if *level* is ``[price, size, ...]`` with finite values > 0."""
        if not isinstance(level, (list, tuple)) or len(level) < 2:
            return False
        try:
            price, size = float(level[0]), float(level[1])
        except (TypeError, ValueError, OverflowError):
            # A non-numeric level is just an invalid level; it must not
            # abort sanitization of the remaining levels.
            return False
        return math.isfinite(price) and math.isfinite(size) and price > 0 and size > 0

    @staticmethod
    def sanitize_orderbook(orderbook: Dict[str, Any]) -> Dict[str, Any]:
        """
        Sanitize order book data by fixing common issues.

        Works on a shallow copy: a non-list ``bids``/``asks`` value becomes an
        empty list, levels that are malformed or have a non-positive or
        non-finite price/size are dropped, and the remaining levels are
        sorted (bids by descending price, asks by ascending price).

        Args:
            orderbook: Order book dictionary

        Returns:
            Sanitized order book dictionary
        """
        sanitized = orderbook.copy()

        try:
            for side, descending in (('bids', True), ('asks', False)):
                if side not in sanitized:
                    continue

                # Ensure bids and asks are lists
                levels = sanitized[side]
                if not isinstance(levels, list):
                    levels = []

                # Remove invalid levels
                kept = [
                    level for level in levels
                    if DataValidator._is_positive_level(level)
                ]

                # Sort bids descending, asks ascending
                sanitized[side] = sorted(
                    kept, key=lambda level: float(level[0]), reverse=descending
                )

        except Exception as e:
            # Best-effort: return what was cleaned so far.
            logger.error("Error sanitizing orderbook: %s", e)

        return sanitized