bucket aggregation

Dobromir Popov
2025-08-04 17:28:55 +03:00
parent 504736c0f7
commit de77b0afa8
10 changed files with 2592 additions and 0 deletions

View File: __init__.py

@@ -0,0 +1,15 @@
"""
Data processing and normalization components for the COBY system.
"""
from .data_processor import StandardDataProcessor
from .quality_checker import DataQualityChecker
from .anomaly_detector import AnomalyDetector
from .metrics_calculator import MetricsCalculator

__all__ = [
    'StandardDataProcessor',
    'DataQualityChecker',
    'AnomalyDetector',
    'MetricsCalculator'
]
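
For context, a minimal usage sketch of the surface this package exposes. The top-level package name COBY is an assumption based on the module docstring; only the relative layout is visible in this diff:

# Hypothetical import path; adjust to wherever the processing package lives.
from COBY.processing import StandardDataProcessor

processor = StandardDataProcessor()  # wires up quality checker, anomaly detector, metrics calculator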

View File: anomaly_detector.py

@@ -0,0 +1,329 @@
"""
Anomaly detection for market data.
"""
import statistics
from typing import Dict, List, Union, Optional, Deque
from collections import deque
from datetime import datetime, timedelta

from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class AnomalyDetector:
    """
    Detects anomalies in market data using statistical methods.

    Detects:
    - Price spikes and drops
    - Volume anomalies
    - Spread anomalies
    - Frequency anomalies
    """

    def __init__(self, window_size: int = 100, z_score_threshold: float = 3.0):
        """
        Initialize anomaly detector.

        Args:
            window_size: Size of rolling window for statistics
            z_score_threshold: Z-score threshold for anomaly detection
        """
        self.window_size = window_size
        self.z_score_threshold = z_score_threshold

        # Rolling windows for statistics
        self.price_windows: Dict[str, Deque[float]] = {}
        self.volume_windows: Dict[str, Deque[float]] = {}
        self.spread_windows: Dict[str, Deque[float]] = {}
        self.timestamp_windows: Dict[str, Deque[datetime]] = {}

        logger.info(f"Anomaly detector initialized with window_size={window_size}, threshold={z_score_threshold}")

    def detect_orderbook_anomalies(self, orderbook: OrderBookSnapshot) -> List[str]:
        """
        Detect anomalies in order book data.

        Args:
            orderbook: Order book snapshot to analyze

        Returns:
            List[str]: List of detected anomalies
        """
        anomalies = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        try:
            # Price anomalies
            if orderbook.mid_price:
                price_anomalies = self._detect_price_anomalies(key, orderbook.mid_price)
                anomalies.extend(price_anomalies)

            # Volume anomalies
            total_volume = orderbook.bid_volume + orderbook.ask_volume
            volume_anomalies = self._detect_volume_anomalies(key, total_volume)
            anomalies.extend(volume_anomalies)

            # Spread anomalies
            if orderbook.spread and orderbook.mid_price:
                spread_pct = (orderbook.spread / orderbook.mid_price) * 100
                spread_anomalies = self._detect_spread_anomalies(key, spread_pct)
                anomalies.extend(spread_anomalies)

            # Frequency anomalies
            frequency_anomalies = self._detect_frequency_anomalies(key, orderbook.timestamp)
            anomalies.extend(frequency_anomalies)

            # Update windows
            self._update_windows(key, orderbook)

        except Exception as e:
            logger.error(f"Error detecting order book anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Anomalies detected in {orderbook.symbol}@{orderbook.exchange}: {anomalies}")

        return anomalies

    def detect_trade_anomalies(self, trade: TradeEvent) -> List[str]:
        """
        Detect anomalies in trade data.

        Args:
            trade: Trade event to analyze

        Returns:
            List[str]: List of detected anomalies
        """
        anomalies = []
        key = f"{trade.symbol}_{trade.exchange}_trade"

        try:
            # Price anomalies
            price_anomalies = self._detect_price_anomalies(key, trade.price)
            anomalies.extend(price_anomalies)

            # Volume anomalies
            volume_anomalies = self._detect_volume_anomalies(key, trade.size)
            anomalies.extend(volume_anomalies)

            # Update windows
            self._update_trade_windows(key, trade)

        except Exception as e:
            logger.error(f"Error detecting trade anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Trade anomalies detected in {trade.symbol}@{trade.exchange}: {anomalies}")

        return anomalies

    def _detect_price_anomalies(self, key: str, price: float) -> List[str]:
        """Detect price anomalies using z-score"""
        anomalies = []

        if key not in self.price_windows:
            self.price_windows[key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.price_windows[key]
        if len(window) < 10:  # Need minimum data points
            return anomalies

        try:
            mean_price = statistics.mean(window)
            std_price = statistics.stdev(window)

            if std_price > 0:
                z_score = abs(price - mean_price) / std_price
                if z_score > self.z_score_threshold:
                    direction = "spike" if price > mean_price else "drop"
                    anomalies.append(f"Price {direction}: {price:.6f} (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass  # Not enough data or all values are the same

        return anomalies

    def _detect_volume_anomalies(self, key: str, volume: float) -> List[str]:
        """Detect volume anomalies using z-score"""
        anomalies = []
        volume_key = f"{key}_volume"

        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.volume_windows[volume_key]
        if len(window) < 10:
            return anomalies

        try:
            mean_volume = statistics.mean(window)
            std_volume = statistics.stdev(window)

            if std_volume > 0:
                z_score = abs(volume - mean_volume) / std_volume
                if z_score > self.z_score_threshold:
                    direction = "spike" if volume > mean_volume else "drop"
                    anomalies.append(f"Volume {direction}: {volume:.6f} (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass

        return anomalies

    def _detect_spread_anomalies(self, key: str, spread_pct: float) -> List[str]:
        """Detect spread anomalies using z-score"""
        anomalies = []
        spread_key = f"{key}_spread"

        if spread_key not in self.spread_windows:
            self.spread_windows[spread_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.spread_windows[spread_key]
        if len(window) < 10:
            return anomalies

        try:
            mean_spread = statistics.mean(window)
            std_spread = statistics.stdev(window)

            if std_spread > 0:
                z_score = abs(spread_pct - mean_spread) / std_spread
                if z_score > self.z_score_threshold:
                    direction = "widening" if spread_pct > mean_spread else "tightening"
                    anomalies.append(f"Spread {direction}: {spread_pct:.4f}% (z-score: {z_score:.2f})")

        except statistics.StatisticsError:
            pass

        return anomalies

    def _detect_frequency_anomalies(self, key: str, timestamp: datetime) -> List[str]:
        """Detect frequency anomalies in data updates"""
        anomalies = []
        timestamp_key = f"{key}_timestamp"

        if timestamp_key not in self.timestamp_windows:
            self.timestamp_windows[timestamp_key] = deque(maxlen=self.window_size)
            return anomalies

        window = self.timestamp_windows[timestamp_key]
        if len(window) < 5:
            return anomalies

        try:
            # Calculate intervals between updates
            intervals = []
            for i in range(1, len(window)):
                interval = (window[i] - window[i-1]).total_seconds()
                intervals.append(interval)

            if len(intervals) >= 5:
                mean_interval = statistics.mean(intervals)
                std_interval = statistics.stdev(intervals)

                # Check current interval
                current_interval = (timestamp - window[-1]).total_seconds()

                if std_interval > 0:
                    z_score = abs(current_interval - mean_interval) / std_interval
                    if z_score > self.z_score_threshold:
                        if current_interval > mean_interval:
                            anomalies.append(f"Update delay: {current_interval:.1f}s (expected: {mean_interval:.1f}s)")
                        else:
                            anomalies.append(f"Update burst: {current_interval:.1f}s (expected: {mean_interval:.1f}s)")

        except (statistics.StatisticsError, IndexError):
            pass

        return anomalies

    def _update_windows(self, key: str, orderbook: OrderBookSnapshot) -> None:
        """Update rolling windows with new data"""
        # Update price window
        if orderbook.mid_price:
            if key not in self.price_windows:
                self.price_windows[key] = deque(maxlen=self.window_size)
            self.price_windows[key].append(orderbook.mid_price)

        # Update volume window
        total_volume = orderbook.bid_volume + orderbook.ask_volume
        volume_key = f"{key}_volume"
        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
        self.volume_windows[volume_key].append(total_volume)

        # Update spread window
        if orderbook.spread and orderbook.mid_price:
            spread_pct = (orderbook.spread / orderbook.mid_price) * 100
            spread_key = f"{key}_spread"
            if spread_key not in self.spread_windows:
                self.spread_windows[spread_key] = deque(maxlen=self.window_size)
            self.spread_windows[spread_key].append(spread_pct)

        # Update timestamp window
        timestamp_key = f"{key}_timestamp"
        if timestamp_key not in self.timestamp_windows:
            self.timestamp_windows[timestamp_key] = deque(maxlen=self.window_size)
        self.timestamp_windows[timestamp_key].append(orderbook.timestamp)

    def _update_trade_windows(self, key: str, trade: TradeEvent) -> None:
        """Update rolling windows with trade data"""
        # Update price window
        if key not in self.price_windows:
            self.price_windows[key] = deque(maxlen=self.window_size)
        self.price_windows[key].append(trade.price)

        # Update volume window
        volume_key = f"{key}_volume"
        if volume_key not in self.volume_windows:
            self.volume_windows[volume_key] = deque(maxlen=self.window_size)
        self.volume_windows[volume_key].append(trade.size)

    def get_statistics(self) -> Dict[str, Dict[str, float]]:
        """Get current statistics for all tracked symbols"""
        stats = {}

        for key, window in self.price_windows.items():
            if len(window) >= 2:
                try:
                    stats[key] = {
                        'price_mean': statistics.mean(window),
                        'price_std': statistics.stdev(window),
                        'price_min': min(window),
                        'price_max': max(window),
                        'data_points': len(window)
                    }
                except statistics.StatisticsError:
                    stats[key] = {'error': 'insufficient_data'}

        return stats

    def reset_windows(self, key: Optional[str] = None) -> None:
        """Reset rolling windows for a specific key or all keys"""
        if key:
            # Reset specific key
            self.price_windows.pop(key, None)
            self.volume_windows.pop(f"{key}_volume", None)
            self.spread_windows.pop(f"{key}_spread", None)
            self.timestamp_windows.pop(f"{key}_timestamp", None)
        else:
            # Reset all windows
            self.price_windows.clear()
            self.volume_windows.clear()
            self.spread_windows.clear()
            self.timestamp_windows.clear()

        logger.info(f"Reset anomaly detection windows for {key or 'all keys'}")
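
The detection methods above all share one pattern: keep a rolling deque per key, warm up on at least 10 points, then flag any observation whose z-score exceeds the threshold. A self-contained sketch of that core logic, standard library only, with the 100-point window, 10-point warm-up, and 3.0 threshold mirroring the defaults above:

import statistics
from collections import deque
from typing import Optional

window = deque(maxlen=100)
THRESHOLD = 3.0

def check_price(price: float) -> Optional[str]:
    """Return an anomaly description, or None if the price looks normal."""
    msg = None
    if len(window) >= 10:  # need minimum data points, as above
        mean = statistics.mean(window)
        std = statistics.stdev(window)
        if std > 0 and abs(price - mean) / std > THRESHOLD:
            direction = "spike" if price > mean else "drop"
            msg = f"Price {direction}: {price:.6f} (z-score: {abs(price - mean) / std:.2f})"
    window.append(price)  # the detector also updates its windows after checking
    return msg

for p in [100.0] * 20 + [100.5, 99.8, 150.0]:
    result = check_price(p)
    if result:
        print(result)  # only 150.0 prints: an obvious spike against a ~100.0 series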

View File: data_processor.py

@@ -0,0 +1,378 @@
"""
Main data processor implementation.
"""
from typing import Dict, Union, List, Optional, Any

from ..interfaces.data_processor import DataProcessor
from ..models.core import OrderBookSnapshot, TradeEvent, OrderBookMetrics
from ..utils.logging import get_logger, set_correlation_id
from ..utils.exceptions import ValidationError, ProcessingError
from ..utils.timing import get_current_timestamp
from .quality_checker import DataQualityChecker
from .anomaly_detector import AnomalyDetector
from .metrics_calculator import MetricsCalculator

logger = get_logger(__name__)


class StandardDataProcessor(DataProcessor):
    """
    Standard implementation of data processor interface.

    Provides:
    - Data normalization and validation
    - Quality checking
    - Anomaly detection
    - Metrics calculation
    - Data enrichment
    """

    def __init__(self):
        """Initialize data processor with components"""
        self.quality_checker = DataQualityChecker()
        self.anomaly_detector = AnomalyDetector()
        self.metrics_calculator = MetricsCalculator()

        # Processing statistics
        self.processed_orderbooks = 0
        self.processed_trades = 0
        self.quality_failures = 0
        self.anomalies_detected = 0

        logger.info("Standard data processor initialized")

    def normalize_orderbook(self, raw_data: Dict, exchange: str) -> OrderBookSnapshot:
        """
        Normalize raw order book data to standard format.

        Args:
            raw_data: Raw order book data from exchange
            exchange: Exchange name

        Returns:
            OrderBookSnapshot: Normalized order book data
        """
        try:
            set_correlation_id()

            # This is a generic implementation - specific exchanges would override
            # For now, assume data is already in correct format
            if isinstance(raw_data, OrderBookSnapshot):
                return raw_data

            # If raw_data is a dict, try to construct OrderBookSnapshot
            # This would be customized per exchange
            raise NotImplementedError(
                "normalize_orderbook should be implemented by exchange-specific processors"
            )

        except Exception as e:
            logger.error(f"Error normalizing order book data: {e}")
            raise ProcessingError(f"Normalization failed: {e}", "NORMALIZE_ERROR")

    def normalize_trade(self, raw_data: Dict, exchange: str) -> TradeEvent:
        """
        Normalize raw trade data to standard format.

        Args:
            raw_data: Raw trade data from exchange
            exchange: Exchange name

        Returns:
            TradeEvent: Normalized trade data
        """
        try:
            set_correlation_id()

            # This is a generic implementation - specific exchanges would override
            if isinstance(raw_data, TradeEvent):
                return raw_data

            # If raw_data is a dict, try to construct TradeEvent
            # This would be customized per exchange
            raise NotImplementedError(
                "normalize_trade should be implemented by exchange-specific processors"
            )

        except Exception as e:
            logger.error(f"Error normalizing trade data: {e}")
            raise ProcessingError(f"Normalization failed: {e}", "NORMALIZE_ERROR")

    def validate_data(self, data: Union[OrderBookSnapshot, TradeEvent]) -> bool:
        """
        Validate normalized data for quality and consistency.

        Args:
            data: Normalized data to validate

        Returns:
            bool: True if data is valid, False otherwise
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                quality_score, issues = self.quality_checker.check_orderbook_quality(data)
                self.processed_orderbooks += 1

                if quality_score < 0.5:  # Threshold for acceptable quality
                    self.quality_failures += 1
                    logger.warning(f"Low quality order book data: score={quality_score:.2f}, issues={issues}")
                    return False

                return True

            elif isinstance(data, TradeEvent):
                quality_score, issues = self.quality_checker.check_trade_quality(data)
                self.processed_trades += 1

                if quality_score < 0.5:
                    self.quality_failures += 1
                    logger.warning(f"Low quality trade data: score={quality_score:.2f}, issues={issues}")
                    return False

                return True

            else:
                logger.error(f"Unknown data type for validation: {type(data)}")
                return False

        except Exception as e:
            logger.error(f"Error validating data: {e}")
            return False

    def calculate_metrics(self, orderbook: OrderBookSnapshot) -> OrderBookMetrics:
        """
        Calculate metrics from order book data.

        Args:
            orderbook: Order book snapshot

        Returns:
            OrderBookMetrics: Calculated metrics
        """
        try:
            set_correlation_id()
            return self.metrics_calculator.calculate_orderbook_metrics(orderbook)
        except Exception as e:
            logger.error(f"Error calculating metrics: {e}")
            raise ProcessingError(f"Metrics calculation failed: {e}", "METRICS_ERROR")

    def detect_anomalies(self, data: Union[OrderBookSnapshot, TradeEvent]) -> List[str]:
        """
        Detect anomalies in the data.

        Args:
            data: Data to analyze for anomalies

        Returns:
            List[str]: List of detected anomaly descriptions
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                anomalies = self.anomaly_detector.detect_orderbook_anomalies(data)
            elif isinstance(data, TradeEvent):
                anomalies = self.anomaly_detector.detect_trade_anomalies(data)
            else:
                logger.error(f"Unknown data type for anomaly detection: {type(data)}")
                return ["Unknown data type"]

            if anomalies:
                self.anomalies_detected += len(anomalies)

            return anomalies

        except Exception as e:
            logger.error(f"Error detecting anomalies: {e}")
            return [f"Anomaly detection error: {e}"]

    def filter_data(self, data: Union[OrderBookSnapshot, TradeEvent], criteria: Dict) -> bool:
        """
        Filter data based on criteria.

        Args:
            data: Data to filter
            criteria: Filtering criteria

        Returns:
            bool: True if data passes filter, False otherwise
        """
        try:
            set_correlation_id()

            # Symbol filter
            if 'symbols' in criteria:
                allowed_symbols = criteria['symbols']
                if data.symbol not in allowed_symbols:
                    return False

            # Exchange filter
            if 'exchanges' in criteria:
                allowed_exchanges = criteria['exchanges']
                if data.exchange not in allowed_exchanges:
                    return False

            # Quality filter
            if 'min_quality' in criteria:
                min_quality = criteria['min_quality']
                if isinstance(data, OrderBookSnapshot):
                    quality_score, _ = self.quality_checker.check_orderbook_quality(data)
                elif isinstance(data, TradeEvent):
                    quality_score, _ = self.quality_checker.check_trade_quality(data)
                else:
                    quality_score = 0.0

                if quality_score < min_quality:
                    return False

            # Price range filter
            if 'price_range' in criteria:
                price_range = criteria['price_range']
                min_price, max_price = price_range

                if isinstance(data, OrderBookSnapshot):
                    price = data.mid_price
                elif isinstance(data, TradeEvent):
                    price = data.price
                else:
                    return False

                if price and (price < min_price or price > max_price):
                    return False

            # Volume filter for trades
            if 'min_volume' in criteria and isinstance(data, TradeEvent):
                min_volume = criteria['min_volume']
                if data.size < min_volume:
                    return False

            return True

        except Exception as e:
            logger.error(f"Error filtering data: {e}")
            return False

    def enrich_data(self, data: Union[OrderBookSnapshot, TradeEvent]) -> Dict:
        """
        Enrich data with additional metadata.

        Args:
            data: Data to enrich

        Returns:
            Dict: Enriched data with metadata
        """
        try:
            set_correlation_id()

            enriched = {
                'original_data': data,
                'processing_timestamp': get_current_timestamp(),
                'processor_version': '1.0.0'
            }

            # Add quality metrics
            if isinstance(data, OrderBookSnapshot):
                quality_score, quality_issues = self.quality_checker.check_orderbook_quality(data)
                enriched['quality_score'] = quality_score
                enriched['quality_issues'] = quality_issues

                # Add calculated metrics
                try:
                    metrics = self.calculate_metrics(data)
                    enriched['metrics'] = {
                        'mid_price': metrics.mid_price,
                        'spread': metrics.spread,
                        'spread_percentage': metrics.spread_percentage,
                        'volume_imbalance': metrics.volume_imbalance,
                        'depth_10': metrics.depth_10,
                        'depth_50': metrics.depth_50
                    }
                except Exception as e:
                    enriched['metrics_error'] = str(e)

                # Add liquidity score
                try:
                    liquidity_score = self.metrics_calculator.calculate_liquidity_score(data)
                    enriched['liquidity_score'] = liquidity_score
                except Exception as e:
                    enriched['liquidity_error'] = str(e)

            elif isinstance(data, TradeEvent):
                quality_score, quality_issues = self.quality_checker.check_trade_quality(data)
                enriched['quality_score'] = quality_score
                enriched['quality_issues'] = quality_issues

                # Add trade-specific enrichments
                enriched['trade_value'] = data.price * data.size
                enriched['side_numeric'] = 1 if data.side == 'buy' else -1

            # Add anomaly detection results
            anomalies = self.detect_anomalies(data)
            enriched['anomalies'] = anomalies
            enriched['anomaly_count'] = len(anomalies)

            return enriched

        except Exception as e:
            logger.error(f"Error enriching data: {e}")
            return {
                'original_data': data,
                'enrichment_error': str(e)
            }

    def get_data_quality_score(self, data: Union[OrderBookSnapshot, TradeEvent]) -> float:
        """
        Calculate data quality score.

        Args:
            data: Data to score

        Returns:
            float: Quality score between 0.0 and 1.0
        """
        try:
            set_correlation_id()

            if isinstance(data, OrderBookSnapshot):
                quality_score, _ = self.quality_checker.check_orderbook_quality(data)
            elif isinstance(data, TradeEvent):
                quality_score, _ = self.quality_checker.check_trade_quality(data)
            else:
                logger.error(f"Unknown data type for quality scoring: {type(data)}")
                return 0.0

            return quality_score

        except Exception as e:
            logger.error(f"Error calculating quality score: {e}")
            return 0.0

    def get_processing_stats(self) -> Dict[str, Any]:
        """Get processing statistics"""
        return {
            'processed_orderbooks': self.processed_orderbooks,
            'processed_trades': self.processed_trades,
            'quality_failures': self.quality_failures,
            'anomalies_detected': self.anomalies_detected,
            'quality_failure_rate': (
                self.quality_failures / max(1, self.processed_orderbooks + self.processed_trades)
            ),
            'anomaly_rate': (
                self.anomalies_detected / max(1, self.processed_orderbooks + self.processed_trades)
            ),
            'quality_checker_summary': self.quality_checker.get_quality_summary(),
            'anomaly_detector_stats': self.anomaly_detector.get_statistics()
        }

    def reset_stats(self) -> None:
        """Reset processing statistics"""
        self.processed_orderbooks = 0
        self.processed_trades = 0
        self.quality_failures = 0
        self.anomalies_detected = 0
        logger.info("Processing statistics reset")
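
Since normalize_orderbook and normalize_trade intentionally raise until an exchange-specific subclass supplies the field mapping, a hedged sketch of what such a subclass might look like; the OrderBookSnapshot and PriceLevel constructor signatures below are assumptions, since ..models.core is not part of this diff:

class BinanceDataProcessor(StandardDataProcessor):
    """Hypothetical exchange-specific processor."""

    def normalize_orderbook(self, raw_data: Dict, exchange: str) -> OrderBookSnapshot:
        # Illustrative field names ('b'/'a' follow Binance depth payloads);
        # adapt to the real model definitions in ..models.core.
        return OrderBookSnapshot(
            symbol=raw_data['s'],
            exchange=exchange,
            timestamp=get_current_timestamp(),
            bids=[PriceLevel(price=float(p), size=float(q)) for p, q in raw_data['b']],
            asks=[PriceLevel(price=float(p), size=float(q)) for p, q in raw_data['a']],
        )

Downstream, filter criteria are plain dict keys, so a pipeline step might read:

criteria = {
    'symbols': ['BTCUSDT'],
    'exchanges': ['binance'],
    'min_quality': 0.5,
    'price_range': (10_000.0, 200_000.0),
}
if processor.validate_data(orderbook) and processor.filter_data(orderbook, criteria):
    enriched = processor.enrich_data(orderbook)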

View File: metrics_calculator.py

@@ -0,0 +1,275 @@
"""
Metrics calculation for order book analysis.
"""
from typing import Dict, List, Optional

from ..models.core import OrderBookSnapshot, OrderBookMetrics, ImbalanceMetrics
from ..utils.logging import get_logger

logger = get_logger(__name__)


class MetricsCalculator:
    """
    Calculates various metrics from order book data.

    Metrics include:
    - Basic metrics (mid price, spread, volumes)
    - Imbalance metrics
    - Depth metrics
    - Liquidity metrics
    """

    def __init__(self):
        """Initialize metrics calculator"""
        logger.info("Metrics calculator initialized")

    def calculate_orderbook_metrics(self, orderbook: OrderBookSnapshot) -> OrderBookMetrics:
        """
        Calculate comprehensive order book metrics.

        Args:
            orderbook: Order book snapshot

        Returns:
            OrderBookMetrics: Calculated metrics
        """
        try:
            # Basic calculations
            mid_price = self._calculate_mid_price(orderbook)
            spread = self._calculate_spread(orderbook)
            spread_percentage = (spread / mid_price * 100) if mid_price > 0 else 0.0

            # Volume calculations
            bid_volume = sum(level.size for level in orderbook.bids)
            ask_volume = sum(level.size for level in orderbook.asks)

            # Imbalance calculation
            total_volume = bid_volume + ask_volume
            volume_imbalance = ((bid_volume - ask_volume) / total_volume) if total_volume > 0 else 0.0

            # Depth calculations
            depth_10 = self._calculate_depth(orderbook, 10)
            depth_50 = self._calculate_depth(orderbook, 50)

            return OrderBookMetrics(
                symbol=orderbook.symbol,
                exchange=orderbook.exchange,
                timestamp=orderbook.timestamp,
                mid_price=mid_price,
                spread=spread,
                spread_percentage=spread_percentage,
                bid_volume=bid_volume,
                ask_volume=ask_volume,
                volume_imbalance=volume_imbalance,
                depth_10=depth_10,
                depth_50=depth_50
            )

        except Exception as e:
            logger.error(f"Error calculating order book metrics: {e}")
            raise

    def calculate_imbalance_metrics(self, orderbook: OrderBookSnapshot) -> ImbalanceMetrics:
        """
        Calculate order book imbalance metrics.

        Args:
            orderbook: Order book snapshot

        Returns:
            ImbalanceMetrics: Calculated imbalance metrics
        """
        try:
            # Volume imbalance
            bid_volume = sum(level.size for level in orderbook.bids)
            ask_volume = sum(level.size for level in orderbook.asks)
            total_volume = bid_volume + ask_volume
            volume_imbalance = ((bid_volume - ask_volume) / total_volume) if total_volume > 0 else 0.0

            # Price imbalance (weighted by volume)
            price_imbalance = self._calculate_price_imbalance(orderbook)

            # Depth imbalance
            depth_imbalance = self._calculate_depth_imbalance(orderbook)

            # Momentum score (simplified - would need historical data for full implementation)
            momentum_score = volume_imbalance * 0.5 + price_imbalance * 0.3 + depth_imbalance * 0.2

            return ImbalanceMetrics(
                symbol=orderbook.symbol,
                timestamp=orderbook.timestamp,
                volume_imbalance=volume_imbalance,
                price_imbalance=price_imbalance,
                depth_imbalance=depth_imbalance,
                momentum_score=momentum_score
            )

        except Exception as e:
            logger.error(f"Error calculating imbalance metrics: {e}")
            raise

    def _calculate_mid_price(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate mid price"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        best_bid = orderbook.bids[0].price
        best_ask = orderbook.asks[0].price
        return (best_bid + best_ask) / 2.0

    def _calculate_spread(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate bid-ask spread"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        best_bid = orderbook.bids[0].price
        best_ask = orderbook.asks[0].price
        return best_ask - best_bid

    def _calculate_depth(self, orderbook: OrderBookSnapshot, levels: int) -> float:
        """Calculate market depth for specified number of levels"""
        bid_depth = sum(
            level.size for level in orderbook.bids[:levels]
        )
        ask_depth = sum(
            level.size for level in orderbook.asks[:levels]
        )
        return bid_depth + ask_depth

    def _calculate_price_imbalance(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate price-weighted imbalance"""
        if not orderbook.bids or not orderbook.asks:
            return 0.0

        # Calculate volume-weighted average prices for top levels
        bid_vwap = self._calculate_vwap(orderbook.bids[:5])
        ask_vwap = self._calculate_vwap(orderbook.asks[:5])

        if bid_vwap == 0 or ask_vwap == 0:
            return 0.0

        mid_price = (bid_vwap + ask_vwap) / 2.0

        # Normalize imbalance
        price_imbalance = (bid_vwap - ask_vwap) / mid_price if mid_price > 0 else 0.0
        return max(-1.0, min(1.0, price_imbalance))

    def _calculate_depth_imbalance(self, orderbook: OrderBookSnapshot) -> float:
        """Calculate depth imbalance across multiple levels"""
        levels_to_check = [5, 10, 20]
        imbalances = []

        for levels in levels_to_check:
            bid_depth = sum(level.size for level in orderbook.bids[:levels])
            ask_depth = sum(level.size for level in orderbook.asks[:levels])
            total_depth = bid_depth + ask_depth

            if total_depth > 0:
                imbalance = (bid_depth - ask_depth) / total_depth
                imbalances.append(imbalance)

        # Return weighted average of imbalances
        if imbalances:
            return sum(imbalances) / len(imbalances)

        return 0.0

    def _calculate_vwap(self, levels: List) -> float:
        """Calculate volume-weighted average price for price levels"""
        if not levels:
            return 0.0

        total_volume = sum(level.size for level in levels)
        if total_volume == 0:
            return 0.0

        weighted_sum = sum(level.price * level.size for level in levels)
        return weighted_sum / total_volume

    def calculate_liquidity_score(self, orderbook: OrderBookSnapshot) -> float:
        """
        Calculate liquidity score based on depth and spread.

        Args:
            orderbook: Order book snapshot

        Returns:
            float: Liquidity score (0.0 to 1.0)
        """
        try:
            if not orderbook.bids or not orderbook.asks:
                return 0.0

            # Spread component (lower spread = higher liquidity)
            spread = self._calculate_spread(orderbook)
            mid_price = self._calculate_mid_price(orderbook)

            if mid_price == 0:
                return 0.0

            spread_pct = (spread / mid_price) * 100
            spread_score = max(0.0, 1.0 - (spread_pct / 5.0))  # Normalize to 5% max spread

            # Depth component (higher depth = higher liquidity)
            total_depth = self._calculate_depth(orderbook, 10)
            depth_score = min(1.0, total_depth / 100.0)  # Normalize to 100 units max depth

            # Volume balance component (more balanced = higher liquidity)
            bid_volume = sum(level.size for level in orderbook.bids[:10])
            ask_volume = sum(level.size for level in orderbook.asks[:10])
            total_volume = bid_volume + ask_volume

            if total_volume > 0:
                imbalance = abs(bid_volume - ask_volume) / total_volume
                balance_score = 1.0 - imbalance
            else:
                balance_score = 0.0

            # Weighted combination
            liquidity_score = (spread_score * 0.4 + depth_score * 0.4 + balance_score * 0.2)
            return max(0.0, min(1.0, liquidity_score))

        except Exception as e:
            logger.error(f"Error calculating liquidity score: {e}")
            return 0.0

    def get_market_summary(self, orderbook: OrderBookSnapshot) -> Dict[str, float]:
        """
        Get comprehensive market summary.

        Args:
            orderbook: Order book snapshot

        Returns:
            Dict[str, float]: Market summary metrics
        """
        try:
            metrics = self.calculate_orderbook_metrics(orderbook)
            imbalance = self.calculate_imbalance_metrics(orderbook)
            liquidity = self.calculate_liquidity_score(orderbook)

            return {
                'mid_price': metrics.mid_price,
                'spread': metrics.spread,
                'spread_percentage': metrics.spread_percentage,
                'bid_volume': metrics.bid_volume,
                'ask_volume': metrics.ask_volume,
                'volume_imbalance': metrics.volume_imbalance,
                'depth_10': metrics.depth_10,
                'depth_50': metrics.depth_50,
                'price_imbalance': imbalance.price_imbalance,
                'depth_imbalance': imbalance.depth_imbalance,
                'momentum_score': imbalance.momentum_score,
                'liquidity_score': liquidity
            }

        except Exception as e:
            logger.error(f"Error generating market summary: {e}")
            return {}
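
To make the liquidity weighting concrete, a worked example under the normalizations stated in calculate_liquidity_score (spread capped at 5%, depth at 100 units); the book values are hypothetical:

# Book: mid price 100.0, spread 0.10, top-10 depth of 60 bid + 40 ask units
spread_score  = max(0.0, 1.0 - (0.10 / 100.0 * 100) / 5.0)  # 1 - 0.1/5 = 0.98
depth_score   = min(1.0, (60 + 40) / 100.0)                 # 1.00
balance_score = 1.0 - abs(60 - 40) / (60 + 40)              # 0.80
liquidity     = 0.4 * spread_score + 0.4 * depth_score + 0.2 * balance_score  # 0.952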

View File: quality_checker.py

@@ -0,0 +1,288 @@
"""
Data quality checking and validation for market data.
"""
from typing import Dict, List, Union, Optional, Tuple
from datetime import datetime, timezone

from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.validation import validate_price, validate_volume, validate_symbol
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class DataQualityChecker:
    """
    Comprehensive data quality checker for market data.

    Validates:
    - Data structure integrity
    - Price and volume ranges
    - Timestamp consistency
    - Cross-validation between related data points
    """

    def __init__(self):
        """Initialize quality checker with default thresholds"""
        # Quality thresholds
        self.max_spread_percentage = 10.0  # Maximum spread as % of mid price
        self.max_price_change_percentage = 50.0  # Maximum price change between updates
        self.min_volume_threshold = 0.000001  # Minimum meaningful volume
        self.max_timestamp_drift = 300  # Maximum seconds drift from current time

        # Price history for validation ("symbol_exchange" -> last mid price)
        self.price_history: Dict[str, float] = {}

        logger.info("Data quality checker initialized")

    def check_orderbook_quality(self, orderbook: OrderBookSnapshot) -> Tuple[float, List[str]]:
        """
        Check order book data quality.

        Args:
            orderbook: Order book snapshot to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            structure_issues = self._check_orderbook_structure(orderbook)
            issues.extend(structure_issues)
            quality_score -= len(structure_issues) * 0.1

            # Price validation
            price_issues = self._check_orderbook_prices(orderbook)
            issues.extend(price_issues)
            quality_score -= len(price_issues) * 0.15

            # Volume validation
            volume_issues = self._check_orderbook_volumes(orderbook)
            issues.extend(volume_issues)
            quality_score -= len(volume_issues) * 0.1

            # Spread validation
            spread_issues = self._check_orderbook_spread(orderbook)
            issues.extend(spread_issues)
            quality_score -= len(spread_issues) * 0.2

            # Timestamp validation
            timestamp_issues = self._check_timestamp(orderbook.timestamp)
            issues.extend(timestamp_issues)
            quality_score -= len(timestamp_issues) * 0.1

            # Cross-validation with history
            history_issues = self._check_price_history(orderbook)
            issues.extend(history_issues)
            quality_score -= len(history_issues) * 0.15

            # Update price history
            self._update_price_history(orderbook)

        except Exception as e:
            logger.error(f"Error checking order book quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Order book quality issues for {orderbook.symbol}@{orderbook.exchange}: {issues}")

        return quality_score, issues

    def check_trade_quality(self, trade: TradeEvent) -> Tuple[float, List[str]]:
        """
        Check trade data quality.

        Args:
            trade: Trade event to validate

        Returns:
            Tuple[float, List[str]]: Quality score (0.0-1.0) and list of issues
        """
        issues = []
        quality_score = 1.0

        try:
            # Basic structure validation
            if not validate_symbol(trade.symbol):
                issues.append("Invalid symbol format")

            if not trade.exchange:
                issues.append("Missing exchange")

            if not trade.trade_id:
                issues.append("Missing trade ID")

            # Price validation
            if not validate_price(trade.price):
                issues.append(f"Invalid price: {trade.price}")

            # Volume validation
            if not validate_volume(trade.size):
                issues.append(f"Invalid size: {trade.size}")

            if trade.size < self.min_volume_threshold:
                issues.append(f"Size below threshold: {trade.size}")

            # Side validation
            if trade.side not in ['buy', 'sell']:
                issues.append(f"Invalid side: {trade.side}")

            # Timestamp validation
            timestamp_issues = self._check_timestamp(trade.timestamp)
            issues.extend(timestamp_issues)

            # Calculate quality score
            quality_score -= len(issues) * 0.2

        except Exception as e:
            logger.error(f"Error checking trade quality: {e}")
            issues.append(f"Quality check error: {e}")
            quality_score = 0.0

        # Ensure score is within bounds
        quality_score = max(0.0, min(1.0, quality_score))

        if issues:
            logger.debug(f"Trade quality issues for {trade.symbol}@{trade.exchange}: {issues}")

        return quality_score, issues

    def _check_orderbook_structure(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check basic order book structure"""
        issues = []

        if not validate_symbol(orderbook.symbol):
            issues.append("Invalid symbol format")

        if not orderbook.exchange:
            issues.append("Missing exchange")

        if not orderbook.bids:
            issues.append("No bid levels")

        if not orderbook.asks:
            issues.append("No ask levels")

        return issues

    def _check_orderbook_prices(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book price validity"""
        issues = []

        # Check bid prices (should be descending)
        for i, bid in enumerate(orderbook.bids):
            if not validate_price(bid.price):
                issues.append(f"Invalid bid price at level {i}: {bid.price}")
            if i > 0 and bid.price >= orderbook.bids[i-1].price:
                issues.append(f"Bid prices not descending at level {i}")

        # Check ask prices (should be ascending)
        for i, ask in enumerate(orderbook.asks):
            if not validate_price(ask.price):
                issues.append(f"Invalid ask price at level {i}: {ask.price}")
            if i > 0 and ask.price <= orderbook.asks[i-1].price:
                issues.append(f"Ask prices not ascending at level {i}")

        # Check bid-ask ordering
        if orderbook.bids and orderbook.asks:
            if orderbook.bids[0].price >= orderbook.asks[0].price:
                issues.append("Best bid >= best ask (crossed book)")

        return issues

    def _check_orderbook_volumes(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book volume validity"""
        issues = []

        # Check bid volumes
        for i, bid in enumerate(orderbook.bids):
            if not validate_volume(bid.size):
                issues.append(f"Invalid bid volume at level {i}: {bid.size}")
            if bid.size < self.min_volume_threshold:
                issues.append(f"Bid volume below threshold at level {i}: {bid.size}")

        # Check ask volumes
        for i, ask in enumerate(orderbook.asks):
            if not validate_volume(ask.size):
                issues.append(f"Invalid ask volume at level {i}: {ask.size}")
            if ask.size < self.min_volume_threshold:
                issues.append(f"Ask volume below threshold at level {i}: {ask.size}")

        return issues

    def _check_orderbook_spread(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check order book spread validity"""
        issues = []

        if orderbook.mid_price and orderbook.spread:
            spread_percentage = (orderbook.spread / orderbook.mid_price) * 100

            if spread_percentage > self.max_spread_percentage:
                issues.append(f"Spread too wide: {spread_percentage:.2f}%")

            if spread_percentage < 0:
                issues.append(f"Negative spread: {spread_percentage:.2f}%")

        return issues

    def _check_timestamp(self, timestamp: datetime) -> List[str]:
        """Check timestamp validity"""
        issues = []

        if not timestamp:
            issues.append("Missing timestamp")
            return issues

        # A naive timestamp cannot be compared against the timezone-aware
        # current time, so report it and skip the drift check
        if timestamp.tzinfo is None:
            issues.append("Timestamp missing timezone info")
            return issues

        # Check timestamp drift
        current_time = get_current_timestamp()
        time_diff = abs((timestamp - current_time).total_seconds())

        if time_diff > self.max_timestamp_drift:
            issues.append(f"Timestamp drift too large: {time_diff:.1f}s")

        return issues

    def _check_price_history(self, orderbook: OrderBookSnapshot) -> List[str]:
        """Check price consistency with history"""
        issues = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        if key in self.price_history and orderbook.mid_price:
            last_price = self.price_history[key]
            price_change = abs(orderbook.mid_price - last_price) / last_price * 100

            if price_change > self.max_price_change_percentage:
                issues.append(f"Large price change: {price_change:.2f}%")

        return issues

    def _update_price_history(self, orderbook: OrderBookSnapshot) -> None:
        """Update price history for future validation"""
        if orderbook.mid_price:
            key = f"{orderbook.symbol}_{orderbook.exchange}"
            self.price_history[key] = orderbook.mid_price

    def get_quality_summary(self) -> Dict[str, Union[int, float]]:
        """Get summary of quality checks performed"""
        return {
            'symbols_tracked': len(self.price_history),
            'max_spread_percentage': self.max_spread_percentage,
            'max_price_change_percentage': self.max_price_change_percentage,
            'min_volume_threshold': self.min_volume_threshold,
            'max_timestamp_drift': self.max_timestamp_drift
        }
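
The scoring here is deduction-based: every detected issue subtracts a fixed penalty from a starting score of 1.0, and the result is clamped to [0.0, 1.0]. For the trade path, where each issue costs 0.2:

issues = ["Missing trade ID", "Invalid side: hold"]          # two hypothetical issues
quality_score = max(0.0, min(1.0, 1.0 - len(issues) * 0.2))  # 0.6
# StandardDataProcessor.validate_data rejects scores below 0.5,
# so one more issue (score 0.4) would count this trade as a quality failure.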