Files
gogo2/COBY/processing/anomaly_detector.py
2025-08-04 17:28:55 +03:00

329 lines
12 KiB
Python

"""
Anomaly detection for market data.
"""
import statistics
from typing import Dict, List, Union, Optional, Deque
from collections import deque
from datetime import datetime, timedelta
from ..models.core import OrderBookSnapshot, TradeEvent
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
# Module-level logger used by AnomalyDetector for its info, warning and error messages.
logger = get_logger(__name__)
class AnomalyDetector:
    """
    Detects anomalies in market data using statistical methods.

    Detects:
    - Price spikes and drops
    - Volume anomalies
    - Spread anomalies
    - Frequency anomalies

    Every check compares the incoming value against a per-key rolling window
    of previously recorded values and reports it when its z-score exceeds
    ``z_score_threshold``. The public ``detect_*`` methods record the incoming
    value in the windows only after all checks have run without raising.
    """

    # Minimum number of samples a window must hold before a z-score is computed.
    _MIN_SAMPLES = 10
    # Minimum number of update intervals needed for the frequency check
    # (i.e. one more stored timestamp than this).
    _MIN_INTERVALS = 5

    def __init__(self, window_size: int = 100, z_score_threshold: float = 3.0):
        """
        Initialize anomaly detector.

        Args:
            window_size: Size of rolling window for statistics. Windows are
                bounded deques of this length, so a value below 10 means the
                z-score checks never have enough samples to run.
            z_score_threshold: Z-score threshold for anomaly detection
        """
        self.window_size = window_size
        self.z_score_threshold = z_score_threshold

        # Rolling windows for statistics, one bounded deque per key.
        self.price_windows: Dict[str, Deque[float]] = {}
        self.volume_windows: Dict[str, Deque[float]] = {}
        self.spread_windows: Dict[str, Deque[float]] = {}
        self.timestamp_windows: Dict[str, Deque[datetime]] = {}

        logger.info(f"Anomaly detector initialized with window_size={window_size}, threshold={z_score_threshold}")

    def detect_orderbook_anomalies(self, orderbook: OrderBookSnapshot) -> List[str]:
        """
        Detect anomalies in order book data.

        Args:
            orderbook: Order book snapshot to analyze

        Returns:
            List[str]: List of detected anomalies. If any check raises, the
            error is logged and reported as an extra entry instead of
            propagating, and the snapshot is not recorded in the windows.
        """
        anomalies = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        try:
            # Price anomalies
            if orderbook.mid_price:
                price_anomalies = self._detect_price_anomalies(key, orderbook.mid_price)
                anomalies.extend(price_anomalies)

            # Volume anomalies
            total_volume = orderbook.bid_volume + orderbook.ask_volume
            volume_anomalies = self._detect_volume_anomalies(key, total_volume)
            anomalies.extend(volume_anomalies)

            # Spread anomalies (spread expressed as a percentage of mid price)
            if orderbook.spread and orderbook.mid_price:
                spread_pct = (orderbook.spread / orderbook.mid_price) * 100
                spread_anomalies = self._detect_spread_anomalies(key, spread_pct)
                anomalies.extend(spread_anomalies)

            # Frequency anomalies
            frequency_anomalies = self._detect_frequency_anomalies(key, orderbook.timestamp)
            anomalies.extend(frequency_anomalies)

            # Update windows only after the checks, so the snapshot is judged
            # against earlier data rather than against itself.
            self._update_windows(key, orderbook)
        except Exception as e:
            # Deliberate best-effort boundary: detection must not break the caller.
            logger.error(f"Error detecting order book anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Anomalies detected in {orderbook.symbol}@{orderbook.exchange}: {anomalies}")

        return anomalies

    def detect_trade_anomalies(self, trade: TradeEvent) -> List[str]:
        """
        Detect anomalies in trade data.

        Args:
            trade: Trade event to analyze

        Returns:
            List[str]: List of detected anomalies. If any check raises, the
            error is logged and reported as an extra entry instead of
            propagating, and the trade is not recorded in the windows.
        """
        anomalies = []
        # Trades get their own key so they never share windows with order books.
        key = f"{trade.symbol}_{trade.exchange}_trade"

        try:
            # Price anomalies
            price_anomalies = self._detect_price_anomalies(key, trade.price)
            anomalies.extend(price_anomalies)

            # Volume anomalies
            volume_anomalies = self._detect_volume_anomalies(key, trade.size)
            anomalies.extend(volume_anomalies)

            # Update windows
            self._update_trade_windows(key, trade)
        except Exception as e:
            # Deliberate best-effort boundary: detection must not break the caller.
            logger.error(f"Error detecting trade anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Trade anomalies detected in {trade.symbol}@{trade.exchange}: {anomalies}")

        return anomalies

    def _get_window(self, windows: Dict[str, Deque], key: str) -> Deque:
        """Return the rolling window stored under *key*, creating an empty bounded one if absent."""
        window = windows.get(key)
        if window is None:
            window = windows[key] = deque(maxlen=self.window_size)
        return window

    def _find_outlier(
        self,
        samples: Union[Deque[float], List[float]],
        value: float,
        min_samples: int,
    ) -> Optional[tuple]:
        """
        Test *value* against the distribution of *samples* using a z-score.

        Args:
            samples: Previously recorded values
            value: New value to test
            min_samples: Minimum number of samples required to run the test

        Returns:
            ``(mean, z_score)`` when the absolute z-score exceeds
            ``self.z_score_threshold``; ``None`` when the value is not an
            outlier, there are too few samples, or the samples have no spread.
        """
        if len(samples) < min_samples:
            return None

        try:
            mean = statistics.mean(samples)
            std = statistics.stdev(samples)
        except statistics.StatisticsError:
            return None  # Not enough data

        # A zero standard deviation (all samples identical) leaves the z-score undefined.
        if std > 0:
            z_score = abs(value - mean) / std
            if z_score > self.z_score_threshold:
                return mean, z_score
        return None

    def _detect_price_anomalies(self, key: str, price: float) -> List[str]:
        """Detect price anomalies using z-score"""
        window = self._get_window(self.price_windows, key)
        outlier = self._find_outlier(window, price, self._MIN_SAMPLES)
        if outlier is None:
            return []

        mean_price, z_score = outlier
        direction = "spike" if price > mean_price else "drop"
        return [f"Price {direction}: {price:.6f} (z-score: {z_score:.2f})"]

    def _detect_volume_anomalies(self, key: str, volume: float) -> List[str]:
        """Detect volume anomalies using z-score"""
        window = self._get_window(self.volume_windows, f"{key}_volume")
        outlier = self._find_outlier(window, volume, self._MIN_SAMPLES)
        if outlier is None:
            return []

        mean_volume, z_score = outlier
        direction = "spike" if volume > mean_volume else "drop"
        return [f"Volume {direction}: {volume:.6f} (z-score: {z_score:.2f})"]

    def _detect_spread_anomalies(self, key: str, spread_pct: float) -> List[str]:
        """Detect spread anomalies using z-score"""
        window = self._get_window(self.spread_windows, f"{key}_spread")
        outlier = self._find_outlier(window, spread_pct, self._MIN_SAMPLES)
        if outlier is None:
            return []

        mean_spread, z_score = outlier
        direction = "widening" if spread_pct > mean_spread else "tightening"
        return [f"Spread {direction}: {spread_pct:.4f}% (z-score: {z_score:.2f})"]

    def _detect_frequency_anomalies(self, key: str, timestamp: datetime) -> List[str]:
        """
        Detect frequency anomalies in data updates.

        The gap between *timestamp* and the most recently stored timestamp is
        compared against the gaps between consecutive stored timestamps.
        """
        window = self._get_window(self.timestamp_windows, f"{key}_timestamp")

        # N intervals need N + 1 timestamps.
        if len(window) < self._MIN_INTERVALS + 1:
            return []

        # Snapshot to a list: positional access into the middle of a deque is
        # O(n), which would make the pairwise walk quadratic.
        stamps = list(window)
        intervals = [
            (later - earlier).total_seconds()
            for earlier, later in zip(stamps, stamps[1:])
        ]

        current_interval = (timestamp - stamps[-1]).total_seconds()
        outlier = self._find_outlier(intervals, current_interval, self._MIN_INTERVALS)
        if outlier is None:
            return []

        mean_interval, _ = outlier
        if current_interval > mean_interval:
            return [f"Update delay: {current_interval:.1f}s (expected: {mean_interval:.1f}s)"]
        return [f"Update burst: {current_interval:.1f}s (expected: {mean_interval:.1f}s)"]

    def _update_windows(self, key: str, orderbook: OrderBookSnapshot) -> None:
        """Update rolling windows with new data"""
        # Update price window
        if orderbook.mid_price:
            self._get_window(self.price_windows, key).append(orderbook.mid_price)

        # Update volume window
        total_volume = orderbook.bid_volume + orderbook.ask_volume
        self._get_window(self.volume_windows, f"{key}_volume").append(total_volume)

        # Update spread window
        if orderbook.spread and orderbook.mid_price:
            spread_pct = (orderbook.spread / orderbook.mid_price) * 100
            self._get_window(self.spread_windows, f"{key}_spread").append(spread_pct)

        # Update timestamp window
        self._get_window(self.timestamp_windows, f"{key}_timestamp").append(orderbook.timestamp)

    def _update_trade_windows(self, key: str, trade: TradeEvent) -> None:
        """Update rolling windows with trade data"""
        # Update price window
        self._get_window(self.price_windows, key).append(trade.price)

        # Update volume window
        self._get_window(self.volume_windows, f"{key}_volume").append(trade.size)

    def get_statistics(self) -> Dict[str, Dict[str, Union[float, str]]]:
        """
        Get current price statistics for all tracked keys.

        Returns:
            Mapping from window key to its price mean, standard deviation,
            minimum, maximum and sample count. Keys whose price window holds
            fewer than two values are omitted.
        """
        stats = {}

        for key, window in self.price_windows.items():
            # A sample standard deviation needs at least two values.
            if len(window) >= 2:
                try:
                    stats[key] = {
                        'price_mean': statistics.mean(window),
                        'price_std': statistics.stdev(window),
                        'price_min': min(window),
                        'price_max': max(window),
                        'data_points': len(window)
                    }
                except statistics.StatisticsError:
                    stats[key] = {'error': 'insufficient_data'}

        return stats

    def reset_windows(self, key: Optional[str] = None) -> None:
        """
        Reset rolling windows for a specific key or all keys.

        Args:
            key: Window key to reset, as built by the ``detect_*`` methods.
                ``None`` (the default) resets every window.
        """
        # Compare against None explicitly so that an empty-string key is
        # treated as a (non-matching) key instead of wiping every window.
        if key is not None:
            # Reset specific key
            self.price_windows.pop(key, None)
            self.volume_windows.pop(f"{key}_volume", None)
            self.spread_windows.pop(f"{key}_spread", None)
            self.timestamp_windows.pop(f"{key}_timestamp", None)
            target = key
        else:
            # Reset all windows
            self.price_windows.clear()
            self.volume_windows.clear()
            self.spread_windows.clear()
            self.timestamp_windows.clear()
            target = 'all keys'

        logger.info(f"Reset anomaly detection windows for {target}")