"""
Data quality checking and validation for market data.
"""

from typing import Dict, List, Union, Optional, Tuple
|
|
from datetime import datetime, timezone
|
|
from ..models.core import OrderBookSnapshot, TradeEvent
|
|
from ..utils.logging import get_logger
|
|
from ..utils.validation import validate_price, validate_volume, validate_symbol
|
|
from ..utils.timing import get_current_timestamp
|
|
|
|
# Module-level logger obtained from the project logging helper, keyed by this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
class DataQualityChecker:
    """
    Comprehensive data quality checker for market data.

    Validates:
    - Data structure integrity
    - Price and volume ranges
    - Timestamp consistency
    - Cross-validation between related data points

    Every public check returns a ``(quality_score, issues)`` tuple. The score
    starts at 1.0, is reduced by a fixed penalty per issue found and is
    clamped to the range 0.0-1.0.
    """

    def __init__(self):
        """Initialize quality checker with default thresholds"""
        # Quality thresholds
        self.max_spread_percentage = 10.0  # Maximum spread as % of mid price
        self.max_price_change_percentage = 50.0  # Maximum price change between updates
        self.min_volume_threshold = 0.000001  # Minimum meaningful volume
        self.max_timestamp_drift = 300  # Maximum seconds drift from current time

        # Price history for validation.
        # Flat map: "<symbol>_<exchange>" -> last observed mid price.
        self.price_history: Dict[str, float] = {}

        logger.info("Data quality checker initialized")

    def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]:
        """
        Check order book data quality.

        Runs the structure, price, volume, spread, timestamp and price-history
        checks in that order, then records the snapshot's mid price for the
        next history comparison.

        Args:
            orderbook: Order book snapshot to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues.
            If any check raises, the score is 0.0 and the error text is
            appended to the issues.
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            structure_issues = self._check_orderbook_structure(orderbook)
            issues.extend(structure_issues)
            quality_score -= len(structure_issues) * 0.1

            # Price validation
            price_issues = self._check_orderbook_prices(orderbook)
            issues.extend(price_issues)
            quality_score -= len(price_issues) * 0.15

            # Volume validation
            volume_issues = self._check_orderbook_volumes(orderbook)
            issues.extend(volume_issues)
            quality_score -= len(volume_issues) * 0.1

            # Spread validation
            spread_issues = self._check_orderbook_spread(orderbook)
            issues.extend(spread_issues)
            quality_score -= len(spread_issues) * 0.2

            # Timestamp validation
            timestamp_issues = self._check_timestamp(orderbook.timestamp)
            issues.extend(timestamp_issues)
            quality_score -= len(timestamp_issues) * 0.1

            # Cross-validation with history
            history_issues = self._check_price_history(orderbook)
            issues.extend(history_issues)
            quality_score -= len(history_issues) * 0.15

            # Update price history (after the comparison above, so the
            # snapshot is compared against the previous price, not itself)
            self._update_price_history(orderbook)

        except Exception as e:
            # Best-effort boundary: a failing check must not propagate to the
            # caller; it is reported as a zero-quality result instead.
            logger.error(f"Error checking order book quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Order book quality issues for {orderbook.symbol}@{orderbook.exchange}: {issues}")

        return quality_score, issues

    def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]:
        """
        Check trade data quality.

        Each issue found costs 0.2 of the score.

        Args:
            trade: Trade event to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues.
            If any check raises, the score is 0.0 and the error text is
            appended to the issues.
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            if not validate_symbol(trade.symbol):
                issues.append("Invalid symbol format")

            if not trade.exchange:
                issues.append("Missing exchange")

            if not trade.trade_id:
                issues.append("Missing trade ID")

            # Price validation
            if not validate_price(trade.price):
                issues.append(f"Invalid price: {trade.price}")

            # Volume validation
            if not validate_volume(trade.size):
                issues.append(f"Invalid size: {trade.size}")

            if trade.size < self.min_volume_threshold:
                issues.append(f"Size below threshold: {trade.size}")

            # Side validation
            if trade.side not in ['buy', 'sell']:
                issues.append(f"Invalid side: {trade.side}")

            # Timestamp validation
            timestamp_issues = self._check_timestamp(trade.timestamp)
            issues.extend(timestamp_issues)

            # Calculate quality score
            quality_score -= len(issues) * 0.2

        except Exception as e:
            # Best-effort boundary: report the failure as a zero-quality result.
            logger.error(f"Error checking trade quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Trade quality issues for {trade.symbol}@{trade.exchange}: {issues}")

        return quality_score, issues

    def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check basic order book structure (symbol, exchange, non-empty sides)"""
        issues = []

        if not validate_symbol(orderbook.symbol):
            issues.append("Invalid symbol format")

        if not orderbook.exchange:
            issues.append("Missing exchange")

        if not orderbook.bids:
            issues.append("No bid levels")

        if not orderbook.asks:
            issues.append("No ask levels")

        return issues

    def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book price validity, level ordering and crossed books"""
        issues = []

        # Check bid prices (should be strictly descending)
        for i, bid in enumerate(orderbook.bids):
            if not validate_price(bid.price):
                issues.append(f"Invalid bid price at level {i}: {bid.price}")

            if i > 0 and bid.price >= orderbook.bids[i - 1].price:
                issues.append(f"Bid prices not descending at level {i}")

        # Check ask prices (should be strictly ascending)
        for i, ask in enumerate(orderbook.asks):
            if not validate_price(ask.price):
                issues.append(f"Invalid ask price at level {i}: {ask.price}")

            if i > 0 and ask.price <= orderbook.asks[i - 1].price:
                issues.append(f"Ask prices not ascending at level {i}")

        # Check bid-ask ordering
        if orderbook.bids and orderbook.asks:
            if orderbook.bids[0].price >= orderbook.asks[0].price:
                issues.append("Best bid >= best ask (crossed book)")

        return issues

    def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book volume validity on both sides"""
        issues = []

        # Check bid volumes
        for i, bid in enumerate(orderbook.bids):
            if not validate_volume(bid.size):
                issues.append(f"Invalid bid volume at level {i}: {bid.size}")

            if bid.size < self.min_volume_threshold:
                issues.append(f"Bid volume below threshold at level {i}: {bid.size}")

        # Check ask volumes
        for i, ask in enumerate(orderbook.asks):
            if not validate_volume(ask.size):
                issues.append(f"Invalid ask volume at level {i}: {ask.size}")

            if ask.size < self.min_volume_threshold:
                issues.append(f"Ask volume below threshold at level {i}: {ask.size}")

        return issues

    def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book spread validity

        The spread is expressed as a percentage of the mid price. The check is
        skipped when either ``mid_price`` or ``spread`` is falsy (None or 0).
        """
        issues = []

        if orderbook.mid_price and orderbook.spread:
            spread_percentage = (orderbook.spread / orderbook.mid_price) * 100

            if spread_percentage > self.max_spread_percentage:
                issues.append(f"Spread too wide: {spread_percentage:.2f}%")

            if spread_percentage < 0:
                issues.append(f"Negative spread: {spread_percentage:.2f}%")

        return issues

    @staticmethod
    def _as_utc(value: datetime) -> datetime:
        """Return *value* unchanged if timezone-aware, else tagged as UTC.

        Only used to make two datetimes subtractable; the wall-clock fields
        of a naive value are kept as-is and interpreted as UTC.
        """
        if value.utcoffset() is None:
            return value.replace(tzinfo=timezone.utc)
        return value

    def _check_timestamp(self, timestamp: datetime) -> List[str]:
        """Check timestamp validity

        Reports a missing timestamp, a timestamp without timezone info, and a
        timestamp further than ``max_timestamp_drift`` seconds from the
        current time (in either direction).
        """
        issues = []

        if not timestamp:
            issues.append("Missing timestamp")
            return issues

        # Check if timestamp is timezone-aware
        if timestamp.tzinfo is None:
            issues.append("Timestamp missing timezone info")

        # Check timestamp drift. Both operands are normalized first: Python
        # raises TypeError when subtracting a naive datetime from an aware
        # one, which would turn the specific issue recorded above into a
        # generic check failure in the caller.
        # NOTE(review): naive values are assumed to be UTC for this comparison.
        current_time = get_current_timestamp()
        time_diff = abs(
            (self._as_utc(timestamp) - self._as_utc(current_time)).total_seconds()
        )

        if time_diff > self.max_timestamp_drift:
            issues.append(f"Timestamp drift too large: {time_diff:.1f}s")

        return issues

    def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check price consistency with history

        Compares the snapshot's mid price with the last mid price stored for
        the same symbol/exchange pair. Nothing is reported when there is no
        stored price or the snapshot has no mid price.
        """
        issues = []

        key = f"{orderbook.symbol}_{orderbook.exchange}"

        if key in self.price_history and orderbook.mid_price:
            # Stored prices are never zero: _update_price_history only
            # records truthy mid prices, so the division is safe.
            last_price = self.price_history[key]
            price_change = abs(orderbook.mid_price - last_price) / last_price * 100

            if price_change > self.max_price_change_percentage:
                issues.append(f"Large price change: {price_change:.2f}%")

        return issues

    def _update_price_history(self, orderbook: OrderBookSnapshot) -> None:
        """Update price history for future validation"""
        if orderbook.mid_price:
            key = f"{orderbook.symbol}_{orderbook.exchange}"
            self.price_history[key] = orderbook.mid_price

    def get_quality_summary(self) -> Dict[str, Union[int, float]]:
        """Get the number of tracked symbol/exchange pairs and the active thresholds"""
        return {
            'symbols_tracked': len(self.price_history),
            'max_spread_percentage': self.max_spread_percentage,
            'max_price_change_percentage': self.max_price_change_percentage,
            'min_volume_threshold': self.min_volume_threshold,
            'max_timestamp_drift': self.max_timestamp_drift
        }