bucket aggregation
This commit is contained in:
288
COBY/processing/quality_checker.py
Normal file
288
COBY/processing/quality_checker.py
Normal file
@ -0,0 +1,288 @@
|
||||
"""
|
||||
Data quality checking and validation for market data.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Union, Optional, Tuple
|
||||
from datetime import datetime, timezone
|
||||
from ..models.core import OrderBookSnapshot, TradeEvent
|
||||
from ..utils.logging import get_logger
|
||||
from ..utils.validation import validate_price, validate_volume, validate_symbol
|
||||
from ..utils.timing import get_current_timestamp
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
class DataQualityChecker:
    """
    Comprehensive data quality checker for market data.

    Validates:
    - Data structure integrity
    - Price and volume ranges
    - Timestamp consistency
    - Cross-validation between related data points

    Each public ``check_*`` method returns a ``(score, issues)`` tuple where
    ``score`` starts at 1.0, is reduced by a fixed penalty per issue found,
    and is clamped to the range 0.0-1.0.
    """

    def __init__(self):
        """Initialize quality checker with default thresholds"""
        # Quality thresholds
        self.max_spread_percentage = 10.0  # Maximum spread as % of mid price
        self.max_price_change_percentage = 50.0  # Maximum price change between updates
        self.min_volume_threshold = 0.000001  # Minimum meaningful volume
        self.max_timestamp_drift = 300  # Maximum seconds drift from current time

        # Last accepted mid price, keyed by "<symbol>_<exchange>"
        self.price_history: Dict[str, float] = {}

        logger.info("Data quality checker initialized")

    def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]:
        """
        Check order book data quality.

        Runs the structure, price, volume, spread, timestamp and price-history
        checks in that order, subtracting a per-issue penalty for each, and
        then records the snapshot's mid price for the next history check.
        Any exception raised while checking is logged, reported as an issue
        and forces the score to 0.0.

        Args:
            orderbook: Order book snapshot to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            structure_issues = self._check_orderbook_structure(orderbook)
            issues.extend(structure_issues)
            quality_score -= len(structure_issues) * 0.1

            # Price validation
            price_issues = self._check_orderbook_prices(orderbook)
            issues.extend(price_issues)
            quality_score -= len(price_issues) * 0.15

            # Volume validation
            volume_issues = self._check_orderbook_volumes(orderbook)
            issues.extend(volume_issues)
            quality_score -= len(volume_issues) * 0.1

            # Spread validation
            spread_issues = self._check_orderbook_spread(orderbook)
            issues.extend(spread_issues)
            quality_score -= len(spread_issues) * 0.2

            # Timestamp validation
            timestamp_issues = self._check_timestamp(orderbook.timestamp)
            issues.extend(timestamp_issues)
            quality_score -= len(timestamp_issues) * 0.1

            # Cross-validation with history
            history_issues = self._check_price_history(orderbook)
            issues.extend(history_issues)
            quality_score -= len(history_issues) * 0.15

            # Update price history
            self._update_price_history(orderbook)

        except Exception as e:
            logger.error(f"Error checking order book quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            # getattr keeps the log line from raising on the same malformed
            # input that the except branch above has just handled.
            symbol = getattr(orderbook, "symbol", None)
            exchange = getattr(orderbook, "exchange", None)
            logger.debug(f"Order book quality issues for {symbol}@{exchange}: {issues}")

        return quality_score, issues

    def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]:
        """
        Check trade data quality.

        Every issue found costs 0.2 of the score. Any exception raised while
        checking is logged, reported as an issue and forces the score to 0.0.

        Args:
            trade: Trade event to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            if not validate_symbol(trade.symbol):
                issues.append("Invalid symbol format")

            if not trade.exchange:
                issues.append("Missing exchange")

            if not trade.trade_id:
                issues.append("Missing trade ID")

            # Price validation
            if not validate_price(trade.price):
                issues.append(f"Invalid price: {trade.price}")

            # Volume validation
            if not validate_volume(trade.size):
                issues.append(f"Invalid size: {trade.size}")

            if trade.size < self.min_volume_threshold:
                issues.append(f"Size below threshold: {trade.size}")

            # Side validation
            if trade.side not in ('buy', 'sell'):
                issues.append(f"Invalid side: {trade.side}")

            # Timestamp validation
            timestamp_issues = self._check_timestamp(trade.timestamp)
            issues.extend(timestamp_issues)

            # Calculate quality score
            quality_score -= len(issues) * 0.2

        except Exception as e:
            logger.error(f"Error checking trade quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            # getattr keeps the log line from raising on the same malformed
            # input that the except branch above has just handled.
            symbol = getattr(trade, "symbol", None)
            exchange = getattr(trade, "exchange", None)
            logger.debug(f"Trade quality issues for {symbol}@{exchange}: {issues}")

        return quality_score, issues

    def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check basic order book structure (symbol, exchange, non-empty sides)"""
        issues = []

        if not validate_symbol(orderbook.symbol):
            issues.append("Invalid symbol format")

        if not orderbook.exchange:
            issues.append("Missing exchange")

        if not orderbook.bids:
            issues.append("No bid levels")

        if not orderbook.asks:
            issues.append("No ask levels")

        return issues

    def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book price validity, level ordering and crossed book"""
        issues = []

        # Check bid prices (should be strictly descending)
        for i, bid in enumerate(orderbook.bids):
            if not validate_price(bid.price):
                issues.append(f"Invalid bid price at level {i}: {bid.price}")

            if i > 0 and bid.price >= orderbook.bids[i - 1].price:
                issues.append(f"Bid prices not descending at level {i}")

        # Check ask prices (should be strictly ascending)
        for i, ask in enumerate(orderbook.asks):
            if not validate_price(ask.price):
                issues.append(f"Invalid ask price at level {i}: {ask.price}")

            if i > 0 and ask.price <= orderbook.asks[i - 1].price:
                issues.append(f"Ask prices not ascending at level {i}")

        # Check bid-ask ordering
        if orderbook.bids and orderbook.asks:
            if orderbook.bids[0].price >= orderbook.asks[0].price:
                issues.append("Best bid >= best ask (crossed book)")

        return issues

    def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book volume validity on both sides of the book"""
        issues = []

        # Check bid volumes
        for i, bid in enumerate(orderbook.bids):
            if not validate_volume(bid.size):
                issues.append(f"Invalid bid volume at level {i}: {bid.size}")

            if bid.size < self.min_volume_threshold:
                issues.append(f"Bid volume below threshold at level {i}: {bid.size}")

        # Check ask volumes
        for i, ask in enumerate(orderbook.asks):
            if not validate_volume(ask.size):
                issues.append(f"Invalid ask volume at level {i}: {ask.size}")

            if ask.size < self.min_volume_threshold:
                issues.append(f"Ask volume below threshold at level {i}: {ask.size}")

        return issues

    def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]:
        """
        Check order book spread validity.

        The check is skipped when either ``mid_price`` or ``spread`` is falsy
        (``None`` or zero), which also avoids dividing by a zero mid price.
        """
        issues = []

        if orderbook.mid_price and orderbook.spread:
            spread_percentage = (orderbook.spread / orderbook.mid_price) * 100

            if spread_percentage > self.max_spread_percentage:
                issues.append(f"Spread too wide: {spread_percentage:.2f}%")

            if spread_percentage < 0:
                issues.append(f"Negative spread: {spread_percentage:.2f}%")

        return issues

    def _check_timestamp(self, timestamp: datetime) -> List[str]:
        """
        Check timestamp validity.

        A missing timestamp is reported and nothing else is checked. A naive
        timestamp is reported and then treated as UTC so that the drift check
        can still run; subtracting a naive datetime from an aware one would
        otherwise raise ``TypeError`` and abort the whole quality check.
        """
        issues = []

        if not timestamp:
            issues.append("Missing timestamp")
            return issues

        # Check if timestamp is timezone-aware
        if timestamp.tzinfo is None:
            issues.append("Timestamp missing timezone info")
            timestamp = timestamp.replace(tzinfo=timezone.utc)

        # Check timestamp drift
        current_time = get_current_timestamp()
        if current_time.tzinfo is None:
            # NOTE(review): get_current_timestamp() presumably returns an aware
            # UTC datetime; a naive result is treated as UTC - confirm.
            current_time = current_time.replace(tzinfo=timezone.utc)

        time_diff = abs((timestamp - current_time).total_seconds())

        if time_diff > self.max_timestamp_drift:
            issues.append(f"Timestamp drift too large: {time_diff:.1f}s")

        return issues

    @staticmethod
    def _history_key(orderbook: OrderBookSnapshot) -> str:
        """Build the ``price_history`` key for an order book snapshot"""
        return f"{orderbook.symbol}_{orderbook.exchange}"

    def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check price consistency against the last recorded mid price"""
        issues = []

        last_price = self.price_history.get(self._history_key(orderbook))

        # Both values must be truthy: no history (or no current mid price)
        # means nothing to compare, and a zero last price cannot be a divisor.
        if last_price and orderbook.mid_price:
            price_change = abs(orderbook.mid_price - last_price) / last_price * 100

            if price_change > self.max_price_change_percentage:
                issues.append(f"Large price change: {price_change:.2f}%")

        return issues

    def _update_price_history(self, orderbook: OrderBookSnapshot) -> None:
        """Update price history for future validation"""
        if orderbook.mid_price:
            self.price_history[self._history_key(orderbook)] = orderbook.mid_price

    def get_quality_summary(self) -> Dict[str, Union[int, float]]:
        """Get the number of tracked symbol/exchange pairs and the active thresholds"""
        return {
            'symbols_tracked': len(self.price_history),
            'max_spread_percentage': self.max_spread_percentage,
            'max_price_change_percentage': self.max_price_change_percentage,
            'min_volume_threshold': self.min_volume_threshold,
            'max_timestamp_drift': self.max_timestamp_drift,
        }
|
Reference in New Issue
Block a user