Files
gogo2/COBY/processing/quality_checker.py
2025-08-04 17:28:55 +03:00

288 lines
11 KiB
Python

"""
Data quality checking and validation for market data.
"""
from typing import Dict, List, Union, Optional, Tuple
from datetime import datetime, timezone
from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.validation import validate_price, validate_volume, validate_symbol
from ..utils.timing import get_current_timestamp
# Module-level logger obtained from the project's logging helper under this
# module's name; DataQualityChecker reports through it.
logger = get_logger(__name__)
class DataQualityChecker:
    """
    Comprehensive data quality checker for market data.

    Validates:
    - Data structure integrity
    - Price and volume ranges
    - Timestamp consistency
    - Cross-validation between related data points

    The public ``check_*`` methods return a ``(score, issues)`` tuple. The
    score starts at 1.0, is reduced by a fixed penalty per detected issue and
    is finally clamped to the range 0.0-1.0. Any exception raised while
    checking is logged, reported as an issue and forces the score to 0.0.
    """

    def __init__(
        self,
        max_spread_percentage: float = 10.0,
        max_price_change_percentage: float = 50.0,
        min_volume_threshold: float = 0.000001,
        max_timestamp_drift: float = 300,
    ) -> None:
        """
        Initialize quality checker thresholds.

        Args:
            max_spread_percentage: Maximum spread as % of mid price.
            max_price_change_percentage: Maximum mid-price change (in %)
                tolerated between two consecutive updates of the same
                symbol/exchange pair.
            min_volume_threshold: Minimum meaningful volume.
            max_timestamp_drift: Maximum seconds drift from current time.
        """
        # Quality thresholds
        self.max_spread_percentage = max_spread_percentage
        self.max_price_change_percentage = max_price_change_percentage
        self.min_volume_threshold = min_volume_threshold
        self.max_timestamp_drift = max_timestamp_drift
        # Price history for validation: "<symbol>_<exchange>" -> last mid price
        self.price_history: Dict[str, float] = {}
        logger.info("Data quality checker initialized")

    def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]:
        """
        Check order book data quality.

        Runs the structure, price, volume, spread, timestamp and price-history
        checks in that order, then records the snapshot's mid price for the
        next history comparison.

        Args:
            orderbook: Order book snapshot to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues: List[str] = []
        quality_score = 1.0
        try:
            # Basic structure validation
            structure_issues = self._check_orderbook_structure(orderbook)
            issues.extend(structure_issues)
            quality_score -= len(structure_issues) * 0.1
            # Price validation
            price_issues = self._check_orderbook_prices(orderbook)
            issues.extend(price_issues)
            quality_score -= len(price_issues) * 0.15
            # Volume validation
            volume_issues = self._check_orderbook_volumes(orderbook)
            issues.extend(volume_issues)
            quality_score -= len(volume_issues) * 0.1
            # Spread validation
            spread_issues = self._check_orderbook_spread(orderbook)
            issues.extend(spread_issues)
            quality_score -= len(spread_issues) * 0.2
            # Timestamp validation
            timestamp_issues = self._check_timestamp(orderbook.timestamp)
            issues.extend(timestamp_issues)
            quality_score -= len(timestamp_issues) * 0.1
            # Cross-validation with history
            history_issues = self._check_price_history(orderbook)
            issues.extend(history_issues)
            quality_score -= len(history_issues) * 0.15
            # Update price history (only reached when no check raised)
            self._update_price_history(orderbook)
        except Exception as e:
            # Deliberate catch-all boundary: a malformed snapshot must yield a
            # zero score instead of propagating into the caller.
            logger.error(f"Error checking order book quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0
        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))
        if issues:
            logger.debug(f"Order book quality issues for {orderbook.symbol}@{orderbook.exchange}: {issues}")
        return quality_score, issues

    def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]:
        """
        Check trade data quality.

        Every detected issue costs 0.2 of the score.

        Args:
            trade: Trade event to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues: List[str] = []
        quality_score = 1.0
        try:
            # Basic structure validation
            if not validate_symbol(trade.symbol):
                issues.append("Invalid symbol format")
            if not trade.exchange:
                issues.append("Missing exchange")
            if not trade.trade_id:
                issues.append("Missing trade ID")
            # Price validation
            if not validate_price(trade.price):
                issues.append(f"Invalid price: {trade.price}")
            # Volume validation
            if not validate_volume(trade.size):
                issues.append(f"Invalid size: {trade.size}")
            if trade.size < self.min_volume_threshold:
                issues.append(f"Size below threshold: {trade.size}")
            # Side validation
            if trade.side not in ['buy', 'sell']:
                issues.append(f"Invalid side: {trade.side}")
            # Timestamp validation
            timestamp_issues = self._check_timestamp(trade.timestamp)
            issues.extend(timestamp_issues)
            # Calculate quality score
            quality_score -= len(issues) * 0.2
        except Exception as e:
            # Deliberate catch-all boundary, see check_orderbook_quality.
            logger.error(f"Error checking trade quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0
        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))
        if issues:
            logger.debug(f"Trade quality issues for {trade.symbol}@{trade.exchange}: {issues}")
        return quality_score, issues

    def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check basic order book structure (symbol, exchange, both sides present)."""
        issues: List[str] = []
        if not validate_symbol(orderbook.symbol):
            issues.append("Invalid symbol format")
        if not orderbook.exchange:
            issues.append("Missing exchange")
        if not orderbook.bids:
            issues.append("No bid levels")
        if not orderbook.asks:
            issues.append("No ask levels")
        return issues

    def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book price validity, per-side ordering and crossed books."""
        issues: List[str] = []
        # Check bid prices (should be strictly descending)
        for i, bid in enumerate(orderbook.bids):
            if not validate_price(bid.price):
                issues.append(f"Invalid bid price at level {i}: {bid.price}")
            if i > 0 and bid.price >= orderbook.bids[i - 1].price:
                issues.append(f"Bid prices not descending at level {i}")
        # Check ask prices (should be strictly ascending)
        for i, ask in enumerate(orderbook.asks):
            if not validate_price(ask.price):
                issues.append(f"Invalid ask price at level {i}: {ask.price}")
            if i > 0 and ask.price <= orderbook.asks[i - 1].price:
                issues.append(f"Ask prices not ascending at level {i}")
        # Check bid-ask ordering
        if orderbook.bids and orderbook.asks:
            if orderbook.bids[0].price >= orderbook.asks[0].price:
                issues.append("Best bid >= best ask (crossed book)")
        return issues

    def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book volume validity on every bid and ask level."""
        issues: List[str] = []
        # Check bid volumes
        for i, bid in enumerate(orderbook.bids):
            if not validate_volume(bid.size):
                issues.append(f"Invalid bid volume at level {i}: {bid.size}")
            if bid.size < self.min_volume_threshold:
                issues.append(f"Bid volume below threshold at level {i}: {bid.size}")
        # Check ask volumes
        for i, ask in enumerate(orderbook.asks):
            if not validate_volume(ask.size):
                issues.append(f"Invalid ask volume at level {i}: {ask.size}")
            if ask.size < self.min_volume_threshold:
                issues.append(f"Ask volume below threshold at level {i}: {ask.size}")
        return issues

    def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]:
        """
        Check order book spread validity.

        The check is skipped when either ``mid_price`` or ``spread`` is falsy
        (missing or zero).
        """
        issues: List[str] = []
        if orderbook.mid_price and orderbook.spread:
            spread_percentage = (orderbook.spread / orderbook.mid_price) * 100
            if spread_percentage > self.max_spread_percentage:
                issues.append(f"Spread too wide: {spread_percentage:.2f}%")
            if spread_percentage < 0:
                issues.append(f"Negative spread: {spread_percentage:.2f}%")
        return issues

    def _check_timestamp(self, timestamp: datetime) -> List[str]:
        """
        Check timestamp validity.

        Reports a missing timestamp, a missing timezone and a drift from the
        current time larger than ``max_timestamp_drift`` seconds.
        """
        issues: List[str] = []
        if not timestamp:
            issues.append("Missing timestamp")
            return issues
        # Check if timestamp is timezone-aware
        if timestamp.tzinfo is None:
            issues.append("Timestamp missing timezone info")
        # Check timestamp drift
        current_time = get_current_timestamp()
        # Subtracting a naive datetime from an aware one raises TypeError,
        # which previously turned a "missing timezone" finding into a generic
        # quality-check error with score 0.0. Bring both operands into the
        # same frame first.
        # NOTE(review): a naive value is assumed to be expressed in UTC - confirm.
        if timestamp.tzinfo is None and current_time.tzinfo is not None:
            timestamp = timestamp.replace(tzinfo=timezone.utc)
        elif timestamp.tzinfo is not None and current_time.tzinfo is None:
            current_time = current_time.replace(tzinfo=timezone.utc)
        time_diff = abs((timestamp - current_time).total_seconds())
        if time_diff > self.max_timestamp_drift:
            issues.append(f"Timestamp drift too large: {time_diff:.1f}s")
        return issues

    @staticmethod
    def _history_key(orderbook: OrderBookSnapshot) -> str:
        """Build the ``price_history`` key for a snapshot's symbol/exchange pair."""
        return f"{orderbook.symbol}_{orderbook.exchange}"

    def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check price consistency with the last recorded mid price."""
        issues: List[str] = []
        key = self._history_key(orderbook)
        if key in self.price_history and orderbook.mid_price:
            # Stored prices are never zero: _update_price_history only records
            # truthy mid prices, so the division below is safe.
            last_price = self.price_history[key]
            price_change = abs(orderbook.mid_price - last_price) / last_price * 100
            if price_change > self.max_price_change_percentage:
                issues.append(f"Large price change: {price_change:.2f}%")
        return issues

    def _update_price_history(self, orderbook: OrderBookSnapshot) -> None:
        """Update price history for future validation (falsy mid prices are ignored)."""
        if orderbook.mid_price:
            self.price_history[self._history_key(orderbook)] = orderbook.mid_price

    def get_quality_summary(self) -> Dict[str, Union[int, float]]:
        """
        Get a summary of the checker configuration and tracked state.

        Returns:
            Dict with the number of symbol/exchange pairs present in the price
            history and the currently configured thresholds.
        """
        return {
            'symbols_tracked': len(self.price_history),
            'max_spread_percentage': self.max_spread_percentage,
            'max_price_change_percentage': self.max_price_change_percentage,
            'min_volume_threshold': self.min_volume_threshold,
            'max_timestamp_drift': self.max_timestamp_drift,
        }