"""
Unified Data Models for the storage system.

Standardized data structures for all components.
"""

import json
import logging
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)


def _parse_timestamp(value: Any) -> datetime:
    """Return *value* as a ``datetime``.

    A falsy value (missing key, ``None``, empty string) falls back to
    ``datetime.now()``. An existing ``datetime`` is returned unchanged, and
    anything else is handed to ``datetime.fromisoformat``.
    """
    if not value:
        return datetime.now()
    if isinstance(value, datetime):
        return value
    return datetime.fromisoformat(value)


def _frame_to_records(frame: pd.DataFrame) -> List[Dict[str, Any]]:
    """Return the rows of *frame* as a list of dicts (``[]`` when empty)."""
    return frame.to_dict('records') if not frame.empty else []


def _as_price_levels(levels: Any) -> List[Tuple[float, float]]:
    """Return order book levels as a list of tuples.

    JSON has no tuple type, so levels come back as lists after a
    ``to_json``/``json.loads`` round-trip; converting restores the declared
    ``List[Tuple[float, float]]`` shape. ``None`` is treated as no levels.
    """
    return [tuple(level) for level in (levels or [])]


def _optional_float(data: Dict[str, Any], key: str) -> Optional[float]:
    """Return ``float(data[key])``, or ``None`` when the key is absent or None."""
    value = data.get(key)
    return float(value) if value is not None else None


@dataclass
class InferenceDataFrame:
    """
    Complete inference data for a single timestamp.

    Contains all data needed for model inference including multi-timeframe
    OHLCV, order book data, imbalances, and technical indicators.
    """
    symbol: str
    timestamp: datetime

    # Multi-timeframe OHLCV data
    ohlcv_1s: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_5m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_15m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1h: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1d: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Order book data
    orderbook_snapshot: Optional[Dict] = None
    orderbook_1s_agg: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Imbalance metrics (multi-timeframe)
    imbalances: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Technical indicators (pre-calculated from latest candle)
    indicators: Dict[str, float] = field(default_factory=dict)

    # Context window data (±N minutes around timestamp)
    context_data: Optional[pd.DataFrame] = None

    # Metadata
    data_source: str = 'unknown'  # 'cache' or 'database'
    query_latency_ms: float = 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization.

        DataFrames become lists of row dicts (empty frames become ``[]``);
        ``context_data`` becomes ``None`` when it is missing or empty. The
        timestamp is emitted as an ISO-8601 string.
        """
        context = self.context_data
        has_context = context is not None and not context.empty
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s': _frame_to_records(self.ohlcv_1s),
            'ohlcv_1m': _frame_to_records(self.ohlcv_1m),
            'ohlcv_5m': _frame_to_records(self.ohlcv_5m),
            'ohlcv_15m': _frame_to_records(self.ohlcv_15m),
            'ohlcv_1h': _frame_to_records(self.ohlcv_1h),
            'ohlcv_1d': _frame_to_records(self.ohlcv_1d),
            'orderbook_snapshot': self.orderbook_snapshot,
            'orderbook_1s_agg': _frame_to_records(self.orderbook_1s_agg),
            'imbalances': _frame_to_records(self.imbalances),
            'indicators': self.indicators,
            'context_data': context.to_dict('records') if has_context else None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
        }

    def to_json(self) -> str:
        """Convert to JSON string.

        Values the JSON encoder cannot handle natively are stringified via
        ``default=str``.
        """
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'InferenceDataFrame':
        """Create from dictionary.

        ``data['symbol']`` is required (``KeyError`` otherwise). The timestamp
        may be an ISO-8601 string or a ``datetime``; when it is missing or
        falsy the current time is used. Every other key is optional.
        """
        context_rows = data.get('context_data')
        return cls(
            symbol=data['symbol'],
            timestamp=_parse_timestamp(data.get('timestamp')),
            ohlcv_1s=pd.DataFrame(data.get('ohlcv_1s', [])),
            ohlcv_1m=pd.DataFrame(data.get('ohlcv_1m', [])),
            ohlcv_5m=pd.DataFrame(data.get('ohlcv_5m', [])),
            ohlcv_15m=pd.DataFrame(data.get('ohlcv_15m', [])),
            ohlcv_1h=pd.DataFrame(data.get('ohlcv_1h', [])),
            ohlcv_1d=pd.DataFrame(data.get('ohlcv_1d', [])),
            orderbook_snapshot=data.get('orderbook_snapshot'),
            orderbook_1s_agg=pd.DataFrame(data.get('orderbook_1s_agg', [])),
            imbalances=pd.DataFrame(data.get('imbalances', [])),
            indicators=data.get('indicators', {}),
            context_data=pd.DataFrame(context_rows) if context_rows else None,
            data_source=data.get('data_source', 'unknown'),
            query_latency_ms=data.get('query_latency_ms', 0.0),
        )

    def get_latest_price(self) -> Optional[float]:
        """Get the latest close price from 1s data.

        Returns the ``close_price`` of the last row of ``ohlcv_1s``, or
        ``None`` when that frame is empty.
        """
        if self.ohlcv_1s.empty:
            return None
        return float(self.ohlcv_1s.iloc[-1]['close_price'])

    def get_timeframe_data(self, timeframe: str) -> pd.DataFrame:
        """Get OHLCV data for a specific timeframe.

        Recognised keys are '1s', '1m', '5m', '15m', '1h' and '1d'; any other
        key yields a new empty DataFrame.
        """
        timeframe_map = {
            '1s': self.ohlcv_1s,
            '1m': self.ohlcv_1m,
            '5m': self.ohlcv_5m,
            '15m': self.ohlcv_15m,
            '1h': self.ohlcv_1h,
            '1d': self.ohlcv_1d,
        }
        return timeframe_map.get(timeframe, pd.DataFrame())

    def has_complete_data(self) -> bool:
        """Check if all required data is present.

        Required: non-empty 1s and 1m OHLCV, non-empty imbalances, and an
        order book snapshot.
        """
        return (
            not self.ohlcv_1s.empty
            and not self.ohlcv_1m.empty
            and not self.imbalances.empty
            and self.orderbook_snapshot is not None
        )

    def get_data_summary(self) -> Dict[str, Any]:
        """Get summary of available data (row counts, flags and metadata)."""
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s_rows': len(self.ohlcv_1s),
            'ohlcv_1m_rows': len(self.ohlcv_1m),
            'ohlcv_5m_rows': len(self.ohlcv_5m),
            'ohlcv_15m_rows': len(self.ohlcv_15m),
            'ohlcv_1h_rows': len(self.ohlcv_1h),
            'ohlcv_1d_rows': len(self.ohlcv_1d),
            'has_orderbook': self.orderbook_snapshot is not None,
            'orderbook_1s_agg_rows': len(self.orderbook_1s_agg),
            'imbalances_rows': len(self.imbalances),
            'indicators_count': len(self.indicators),
            'has_context_data': self.context_data is not None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
            'is_complete': self.has_complete_data(),
        }


@dataclass
class OrderBookDataFrame:
    """
    Order book data with imbalances and aggregations.

    Contains raw order book, price buckets, and multi-timeframe imbalance
    metrics.
    """
    symbol: str
    timestamp: datetime

    # Raw order book (top levels)
    bids: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)
    asks: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)

    # Aggregated data (price buckets)
    price_buckets: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Multi-timeframe imbalance metrics
    imbalance_1s: float = 0.0
    imbalance_5s: float = 0.0
    imbalance_15s: float = 0.0
    imbalance_60s: float = 0.0

    # Volume-weighted imbalances
    volume_imbalance_1s: float = 0.0
    volume_imbalance_5s: float = 0.0
    volume_imbalance_15s: float = 0.0
    volume_imbalance_60s: float = 0.0

    # Order book statistics
    mid_price: float = 0.0
    spread: float = 0.0
    bid_volume: float = 0.0
    ask_volume: float = 0.0

    # Metadata
    exchange: str = 'binance'
    sequence_id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary for serialization.

        The timestamp is emitted as an ISO-8601 string and ``price_buckets``
        as a list of row dicts (``[]`` when empty).
        """
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'bids': self.bids,
            'asks': self.asks,
            'price_buckets': _frame_to_records(self.price_buckets),
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
            'mid_price': self.mid_price,
            'spread': self.spread,
            'bid_volume': self.bid_volume,
            'ask_volume': self.ask_volume,
            'exchange': self.exchange,
            'sequence_id': self.sequence_id,
        }

    def to_json(self) -> str:
        """Convert to JSON string (non-JSON values are stringified)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OrderBookDataFrame':
        """Create from dictionary.

        ``data['symbol']`` is required. The timestamp may be an ISO-8601
        string or a ``datetime`` (current time when missing or falsy). Bid and
        ask levels are normalised to tuples so that data which passed through
        JSON matches the declared field type.
        """
        return cls(
            symbol=data['symbol'],
            timestamp=_parse_timestamp(data.get('timestamp')),
            bids=_as_price_levels(data.get('bids', [])),
            asks=_as_price_levels(data.get('asks', [])),
            price_buckets=pd.DataFrame(data.get('price_buckets', [])),
            imbalance_1s=data.get('imbalance_1s', 0.0),
            imbalance_5s=data.get('imbalance_5s', 0.0),
            imbalance_15s=data.get('imbalance_15s', 0.0),
            imbalance_60s=data.get('imbalance_60s', 0.0),
            volume_imbalance_1s=data.get('volume_imbalance_1s', 0.0),
            volume_imbalance_5s=data.get('volume_imbalance_5s', 0.0),
            volume_imbalance_15s=data.get('volume_imbalance_15s', 0.0),
            volume_imbalance_60s=data.get('volume_imbalance_60s', 0.0),
            mid_price=data.get('mid_price', 0.0),
            spread=data.get('spread', 0.0),
            bid_volume=data.get('bid_volume', 0.0),
            ask_volume=data.get('ask_volume', 0.0),
            exchange=data.get('exchange', 'binance'),
            sequence_id=data.get('sequence_id'),
        )

    def calculate_statistics(self):
        """Calculate order book statistics from bids and asks.

        ``mid_price`` and ``spread`` are updated only when both sides are
        present and both best prices are positive; otherwise they keep their
        previous values. ``bid_volume`` and ``ask_volume`` are the sums of the
        sizes on each side and are always refreshed, so a one-sided book still
        reports the volume it has.
        """
        if self.bids and self.asks:
            best_bid = self.get_best_bid()[0]
            best_ask = self.get_best_ask()[0]

            # Mid price and spread
            if best_bid > 0 and best_ask > 0:
                self.mid_price = (best_bid + best_ask) / 2
                self.spread = best_ask - best_bid

        # Total volumes (each side is independent of the other)
        self.bid_volume = sum(size for _, size in self.bids)
        self.ask_volume = sum(size for _, size in self.asks)

    def get_best_bid(self) -> Optional[Tuple[float, float]]:
        """Get best bid (highest price), or ``None`` when there are no bids."""
        if not self.bids:
            return None
        return max(self.bids, key=lambda level: level[0])

    def get_best_ask(self) -> Optional[Tuple[float, float]]:
        """Get best ask (lowest price), or ``None`` when there are no asks."""
        if not self.asks:
            return None
        return min(self.asks, key=lambda level: level[0])

    def get_spread_bps(self) -> float:
        """Get spread in basis points.

        Returns 0.0 unless both ``mid_price`` and ``spread`` are positive.
        """
        if self.mid_price > 0 and self.spread > 0:
            return (self.spread / self.mid_price) * 10000
        return 0.0

    def get_imbalance_summary(self) -> Dict[str, float]:
        """Get summary of all imbalance metrics."""
        return {
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
        }

    def is_valid(self) -> bool:
        """Check if order book data is valid.

        Valid means both sides are non-empty and the best bid price is
        strictly below the best ask price.
        """
        if not self.bids or not self.asks:
            return False

        best_bid = self.get_best_bid()
        best_ask = self.get_best_ask()
        if not best_bid or not best_ask:
            return False

        # Bid must be less than ask
        return best_bid[0] < best_ask[0]


@dataclass
class TradeEvent:
    """Individual trade event."""
    symbol: str
    timestamp: datetime
    price: float
    size: float
    side: str  # 'buy' or 'sell'
    trade_id: str
    exchange: str = 'binance'
    is_buyer_maker: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        Uses ``dataclasses.asdict``, so ``timestamp`` stays a ``datetime``
        object rather than being converted to a string.
        """
        return asdict(self)

    def to_json(self) -> str:
        """Convert to JSON string (non-JSON values are stringified)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TradeEvent':
        """Create from dictionary.

        ``symbol``, ``timestamp``, ``price``, ``size``, ``side`` and
        ``trade_id`` are required keys. A string timestamp is parsed with
        ``datetime.fromisoformat``; any other value is used as given.
        """
        raw_ts = data['timestamp']
        return cls(
            symbol=data['symbol'],
            timestamp=datetime.fromisoformat(raw_ts) if isinstance(raw_ts, str) else raw_ts,
            price=float(data['price']),
            size=float(data['size']),
            side=data['side'],
            trade_id=str(data['trade_id']),
            exchange=data.get('exchange', 'binance'),
            is_buyer_maker=data.get('is_buyer_maker', False),
        )


@dataclass
class OHLCVCandle:
    """Single OHLCV candlestick."""
    symbol: str
    timestamp: datetime
    timeframe: str
    open_price: float
    high_price: float
    low_price: float
    close_price: float
    volume: float
    trade_count: int = 0

    # Technical indicators (optional)
    rsi_14: Optional[float] = None
    macd: Optional[float] = None
    macd_signal: Optional[float] = None
    macd_histogram: Optional[float] = None
    bb_upper: Optional[float] = None
    bb_middle: Optional[float] = None
    bb_lower: Optional[float] = None
    ema_12: Optional[float] = None
    ema_26: Optional[float] = None
    sma_20: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary.

        Uses ``dataclasses.asdict``, so ``timestamp`` stays a ``datetime``
        object rather than being converted to a string.
        """
        return asdict(self)

    def to_json(self) -> str:
        """Convert to JSON string (non-JSON values are stringified)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OHLCVCandle':
        """Create from dictionary.

        The symbol, timestamp, timeframe, four prices and volume are required
        keys. A string timestamp is parsed with ``datetime.fromisoformat``;
        any other value is used as given. Indicator keys are optional and
        default to ``None``.
        """
        raw_ts = data['timestamp']
        return cls(
            symbol=data['symbol'],
            timestamp=datetime.fromisoformat(raw_ts) if isinstance(raw_ts, str) else raw_ts,
            timeframe=data['timeframe'],
            open_price=float(data['open_price']),
            high_price=float(data['high_price']),
            low_price=float(data['low_price']),
            close_price=float(data['close_price']),
            volume=float(data['volume']),
            trade_count=int(data.get('trade_count', 0)),
            rsi_14=_optional_float(data, 'rsi_14'),
            macd=_optional_float(data, 'macd'),
            macd_signal=_optional_float(data, 'macd_signal'),
            macd_histogram=_optional_float(data, 'macd_histogram'),
            bb_upper=_optional_float(data, 'bb_upper'),
            bb_middle=_optional_float(data, 'bb_middle'),
            bb_lower=_optional_float(data, 'bb_lower'),
            ema_12=_optional_float(data, 'ema_12'),
            ema_26=_optional_float(data, 'ema_26'),
            sma_20=_optional_float(data, 'sma_20'),
        )

    def is_valid(self) -> bool:
        """Check if candle data is valid.

        Valid means high >= low, high >= open and close, low <= open and
        close, and volume is non-negative.
        """
        # High must be >= low
        if self.high_price < self.low_price:
            return False

        # High must be >= open and close
        if self.high_price < self.open_price or self.high_price < self.close_price:
            return False

        # Low must be <= open and close
        if self.low_price > self.open_price or self.low_price > self.close_price:
            return False

        # Volume must be non-negative
        if self.volume < 0:
            return False

        return True

    def get_price_change(self) -> float:
        """Get price change (close - open)."""
        return self.close_price - self.open_price

    def get_price_change_percent(self) -> float:
        """Get price change percentage (0.0 when the open price is not positive)."""
        if self.open_price > 0:
            return ((self.close_price - self.open_price) / self.open_price) * 100
        return 0.0

    def get_range(self) -> float:
        """Get price range (high - low)."""
        return self.high_price - self.low_price

    def is_bullish(self) -> bool:
        """Check if candle is bullish (close > open)."""
        return self.close_price > self.open_price

    def is_bearish(self) -> bool:
        """Check if candle is bearish (close < open)."""
        return self.close_price < self.open_price