"""
Anomaly detection for market data.
"""
|
|
|
|
import statistics
|
|
from typing import Dict, List, Union, Optional, Deque
|
|
from collections import deque
|
|
from datetime import datetime, timedelta
|
|
from ..models.core import OrderBookSnapshot, TradeEvent
|
|
from ..utils.logging import get_logger
|
|
from ..utils.timing import get_current_timestamp
|
|
|
|
# Module-level logger for this module, obtained from the project's get_logger helper.
logger = get_logger(__name__)
|
|
|
|
|
|
class AnomalyDetector:
    """
    Detects anomalies in market data using statistical methods.

    Detects:
    - Price spikes and drops
    - Volume anomalies
    - Spread anomalies
    - Frequency anomalies

    Every metric is kept per key in a rolling window bounded by
    ``window_size``. A new observation is scored against the window as it
    stood *before* the observation was recorded, and is flagged when its
    z-score exceeds ``z_score_threshold``.

    Known limitations, kept on purpose:
    - A window whose standard deviation is zero (all samples identical) never
      flags anything, whatever the new value is.
    - Value detectors need at least 10 samples, so a ``window_size`` below 10
      means they can never fire.
    """

    # Samples a value window must hold before a z-score is computed.
    _MIN_SAMPLES = 10
    # Inter-update intervals required before update frequency is scored
    # (N timestamps yield N - 1 intervals, so this means 6 timestamps).
    _MIN_INTERVALS = 5

    def __init__(self, window_size: int = 100, z_score_threshold: float = 3.0):
        """
        Initialize anomaly detector.

        Args:
            window_size: Size of rolling window for statistics
            z_score_threshold: Z-score threshold for anomaly detection
        """
        self.window_size = window_size
        self.z_score_threshold = z_score_threshold

        # Rolling windows for statistics, keyed by "<symbol>_<exchange>" plus a
        # per-metric suffix ("_volume", "_spread", "_timestamp"); price windows
        # use the bare key.
        self.price_windows: Dict[str, Deque[float]] = {}
        self.volume_windows: Dict[str, Deque[float]] = {}
        self.spread_windows: Dict[str, Deque[float]] = {}
        self.timestamp_windows: Dict[str, Deque[datetime]] = {}

        logger.info(f"Anomaly detector initialized with window_size={window_size}, threshold={z_score_threshold}")

    def detect_orderbook_anomalies(self, orderbook: "OrderBookSnapshot") -> List[str]:
        """
        Detect anomalies in order book data.

        Reads ``symbol``, ``exchange``, ``mid_price``, ``bid_volume``,
        ``ask_volume``, ``spread`` and ``timestamp`` from the snapshot.

        Args:
            orderbook: Order book snapshot to analyze

        Returns:
            List[str]: List of detected anomalies. If any step raises, the
            error is logged and reported as an extra entry instead of
            propagating; in that case the windows are not updated.
        """
        anomalies: List[str] = []
        key = f"{orderbook.symbol}_{orderbook.exchange}"

        try:
            # Price anomalies (skipped when mid_price is missing or zero)
            if orderbook.mid_price:
                anomalies.extend(self._detect_price_anomalies(key, orderbook.mid_price))

            # Volume anomalies
            total_volume = orderbook.bid_volume + orderbook.ask_volume
            anomalies.extend(self._detect_volume_anomalies(key, total_volume))

            # Spread anomalies, expressed as a percentage of the mid price
            if orderbook.spread and orderbook.mid_price:
                spread_pct = (orderbook.spread / orderbook.mid_price) * 100
                anomalies.extend(self._detect_spread_anomalies(key, spread_pct))

            # Frequency anomalies
            anomalies.extend(self._detect_frequency_anomalies(key, orderbook.timestamp))

            # Record the snapshot only after scoring, so it is never compared
            # against a window that already contains it.
            self._update_windows(key, orderbook)

        except Exception as e:
            logger.error(f"Error detecting order book anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Anomalies detected in {orderbook.symbol}@{orderbook.exchange}: {anomalies}")

        return anomalies

    def detect_trade_anomalies(self, trade: "TradeEvent") -> List[str]:
        """
        Detect anomalies in trade data.

        Reads ``symbol``, ``exchange``, ``price`` and ``size`` from the trade.
        Trade windows live under a separate "<symbol>_<exchange>_trade" key,
        so they never mix with order book windows.

        Args:
            trade: Trade event to analyze

        Returns:
            List[str]: List of detected anomalies. If any step raises, the
            error is logged and reported as an extra entry instead of
            propagating; in that case the windows are not updated.
        """
        anomalies: List[str] = []
        key = f"{trade.symbol}_{trade.exchange}_trade"

        try:
            # Price anomalies
            anomalies.extend(self._detect_price_anomalies(key, trade.price))

            # Volume anomalies
            anomalies.extend(self._detect_volume_anomalies(key, trade.size))

            # Record the trade only after it has been scored.
            self._update_trade_windows(key, trade)

        except Exception as e:
            logger.error(f"Error detecting trade anomalies: {e}")
            anomalies.append(f"Anomaly detection error: {e}")

        if anomalies:
            logger.warning(f"Trade anomalies detected in {trade.symbol}@{trade.exchange}: {anomalies}")

        return anomalies

    def _window(self, store: Dict[str, Deque], key: str) -> Deque:
        """Return the rolling window stored under ``key``, creating it on first use."""
        window = store.get(key)
        # Compare against None: an existing but empty deque is falsy.
        if window is None:
            window = store[key] = deque(maxlen=self.window_size)
        return window

    def _z_score(self, window: Deque[float], value: float) -> Optional[tuple]:
        """
        Score ``value`` against the samples in ``window``.

        Returns:
            ``(z_score, mean)`` when the absolute z-score exceeds
            ``z_score_threshold``; ``None`` when the window holds fewer than
            ``_MIN_SAMPLES`` samples, has zero standard deviation, or the
            value is within the threshold.
        """
        if len(window) < self._MIN_SAMPLES:
            return None

        try:
            mean = statistics.mean(window)
            std = statistics.stdev(window)
        except statistics.StatisticsError:
            return None  # Not enough data

        if std > 0:
            z_score = abs(value - mean) / std
            if z_score > self.z_score_threshold:
                return z_score, mean
        return None

    def _detect_price_anomalies(self, key: str, price: float) -> List[str]:
        """Detect price anomalies using z-score"""
        scored = self._z_score(self._window(self.price_windows, key), price)
        if scored is None:
            return []

        z_score, mean_price = scored
        direction = "spike" if price > mean_price else "drop"
        return [f"Price {direction}: {price:.6f} (z-score: {z_score:.2f})"]

    def _detect_volume_anomalies(self, key: str, volume: float) -> List[str]:
        """Detect volume anomalies using z-score"""
        scored = self._z_score(self._window(self.volume_windows, f"{key}_volume"), volume)
        if scored is None:
            return []

        z_score, mean_volume = scored
        direction = "spike" if volume > mean_volume else "drop"
        return [f"Volume {direction}: {volume:.6f} (z-score: {z_score:.2f})"]

    def _detect_spread_anomalies(self, key: str, spread_pct: float) -> List[str]:
        """Detect spread anomalies using z-score"""
        scored = self._z_score(self._window(self.spread_windows, f"{key}_spread"), spread_pct)
        if scored is None:
            return []

        z_score, mean_spread = scored
        direction = "widening" if spread_pct > mean_spread else "tightening"
        return [f"Spread {direction}: {spread_pct:.4f}% (z-score: {z_score:.2f})"]

    def _detect_frequency_anomalies(self, key: str, timestamp: datetime) -> List[str]:
        """
        Detect frequency anomalies in data updates.

        Compares the gap between ``timestamp`` and the most recent recorded
        timestamp with the gaps between the recorded timestamps themselves.
        """
        window = self._window(self.timestamp_windows, f"{key}_timestamp")

        # N timestamps give N - 1 intervals; require _MIN_INTERVALS of them.
        if len(window) <= self._MIN_INTERVALS:
            return []

        # Copy once: positional indexing into a deque is not constant time.
        stamps = list(window)
        intervals = [
            (later - earlier).total_seconds()
            for earlier, later in zip(stamps, stamps[1:])
        ]

        try:
            mean_interval = statistics.mean(intervals)
            std_interval = statistics.stdev(intervals)
        except statistics.StatisticsError:
            return []

        # Gap between the incoming update and the last recorded one.
        current_interval = (timestamp - stamps[-1]).total_seconds()

        if std_interval > 0:
            z_score = abs(current_interval - mean_interval) / std_interval
            if z_score > self.z_score_threshold:
                if current_interval > mean_interval:
                    return [f"Update delay: {current_interval:.1f}s (expected: {mean_interval:.1f}s)"]
                return [f"Update burst: {current_interval:.1f}s (expected: {mean_interval:.1f}s)"]

        return []

    def _update_windows(self, key: str, orderbook: "OrderBookSnapshot") -> None:
        """Update rolling windows with new data"""
        # Price window (skipped when mid_price is missing or zero)
        if orderbook.mid_price:
            self._window(self.price_windows, key).append(orderbook.mid_price)

        # Volume window
        total_volume = orderbook.bid_volume + orderbook.ask_volume
        self._window(self.volume_windows, f"{key}_volume").append(total_volume)

        # Spread window, stored as a percentage of the mid price
        if orderbook.spread and orderbook.mid_price:
            spread_pct = (orderbook.spread / orderbook.mid_price) * 100
            self._window(self.spread_windows, f"{key}_spread").append(spread_pct)

        # Timestamp window
        self._window(self.timestamp_windows, f"{key}_timestamp").append(orderbook.timestamp)

    def _update_trade_windows(self, key: str, trade: "TradeEvent") -> None:
        """Update rolling windows with trade data"""
        self._window(self.price_windows, key).append(trade.price)
        self._window(self.volume_windows, f"{key}_volume").append(trade.size)

    def get_statistics(self) -> Dict[str, Dict[str, float]]:
        """
        Get current statistics for all tracked symbols.

        Only price windows are summarised, and only those holding at least two
        samples (the minimum for a sample standard deviation).
        """
        stats = {}

        for key, window in self.price_windows.items():
            if len(window) < 2:
                continue
            try:
                stats[key] = {
                    'price_mean': statistics.mean(window),
                    'price_std': statistics.stdev(window),
                    'price_min': min(window),
                    'price_max': max(window),
                    'data_points': len(window),
                }
            except statistics.StatisticsError:
                stats[key] = {'error': 'insufficient_data'}

        return stats

    def reset_windows(self, key: Optional[str] = None) -> None:
        """
        Reset rolling windows for a specific key or all keys.

        Args:
            key: Window key to reset. Only windows derived from exactly this
                key are dropped; trade windows are stored under
                "<symbol>_<exchange>_trade" and must be reset with that key.
                When omitted (or empty), every window is cleared.
        """
        if key:
            # Reset specific key
            self.price_windows.pop(key, None)
            self.volume_windows.pop(f"{key}_volume", None)
            self.spread_windows.pop(f"{key}_spread", None)
            self.timestamp_windows.pop(f"{key}_timestamp", None)
        else:
            # Reset all windows
            self.price_windows.clear()
            self.volume_windows.clear()
            self.spread_windows.clear()
            self.timestamp_windows.clear()

        logger.info(f"Reset anomaly detection windows for {key or 'all keys'}")