Files
gogo2/core/unified_data_models.py
2025-10-20 11:16:27 +03:00

433 lines
17 KiB
Python

"""
Unified Data Models for the storage system.
Standardized data structures for all components.
"""
import pandas as pd
import numpy as np
from dataclasses import dataclass, field, asdict
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
import json
import logging
logger = logging.getLogger(__name__)
@dataclass
class InferenceDataFrame:
    """
    Complete inference data for a single timestamp.

    Contains all data needed for model inference including multi-timeframe
    OHLCV, order book data, imbalances, and technical indicators, plus
    metadata describing where the data came from.
    """
    symbol: str
    timestamp: datetime

    # Multi-timeframe OHLCV data
    ohlcv_1s: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_5m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_15m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1h: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1d: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Order book data
    orderbook_snapshot: Optional[Dict] = None
    orderbook_1s_agg: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Imbalance metrics (multi-timeframe)
    imbalances: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Technical indicators (pre-calculated from latest candle)
    indicators: Dict[str, float] = field(default_factory=dict)

    # Context window data (±N minutes around timestamp)
    context_data: Optional[pd.DataFrame] = None

    # Metadata
    data_source: str = 'unknown'  # 'cache' or 'database'
    query_latency_ms: float = 0.0

    @staticmethod
    def _frame_records(frame: pd.DataFrame) -> List[Dict[str, Any]]:
        """Return *frame* as a list of row dicts, or ``[]`` when it is empty."""
        return frame.to_dict('records') if not frame.empty else []

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to a dictionary for serialization.

        Every DataFrame field becomes a list of row dicts (``[]`` when the
        frame is empty). ``context_data`` becomes ``None`` when it is missing
        or empty. ``timestamp`` is rendered with ``isoformat()``.
        """
        records = self._frame_records
        context = self.context_data
        has_context = context is not None and not context.empty
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s': records(self.ohlcv_1s),
            'ohlcv_1m': records(self.ohlcv_1m),
            'ohlcv_5m': records(self.ohlcv_5m),
            'ohlcv_15m': records(self.ohlcv_15m),
            'ohlcv_1h': records(self.ohlcv_1h),
            'ohlcv_1d': records(self.ohlcv_1d),
            'orderbook_snapshot': self.orderbook_snapshot,
            'orderbook_1s_agg': records(self.orderbook_1s_agg),
            'imbalances': records(self.imbalances),
            'indicators': self.indicators,
            'context_data': context.to_dict('records') if has_context else None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
        }

    def to_json(self) -> str:
        """
        Convert to a JSON string.

        ``default=str`` stringifies any value the JSON encoder cannot handle
        natively instead of raising.
        """
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'InferenceDataFrame':
        """
        Create an instance from a dictionary such as the one ``to_dict`` emits.

        Args:
            data: Mapping with a required ``'symbol'`` key. ``'timestamp'``
                may be an ISO-8601 string or a ``datetime``; when it is
                missing or falsy the current local time is used. All other
                keys are optional and fall back to the field defaults.

        Raises:
            KeyError: If ``'symbol'`` is missing.
            ValueError: If ``'timestamp'`` is a string that is not ISO-8601.
        """
        raw_timestamp = data.get('timestamp')
        if not raw_timestamp:
            timestamp = datetime.now()
        elif isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            # Already a datetime-like object: accept it unchanged, as the
            # other models in this module do.
            timestamp = raw_timestamp

        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            ohlcv_1s=pd.DataFrame(data.get('ohlcv_1s', [])),
            ohlcv_1m=pd.DataFrame(data.get('ohlcv_1m', [])),
            ohlcv_5m=pd.DataFrame(data.get('ohlcv_5m', [])),
            ohlcv_15m=pd.DataFrame(data.get('ohlcv_15m', [])),
            ohlcv_1h=pd.DataFrame(data.get('ohlcv_1h', [])),
            ohlcv_1d=pd.DataFrame(data.get('ohlcv_1d', [])),
            orderbook_snapshot=data.get('orderbook_snapshot'),
            orderbook_1s_agg=pd.DataFrame(data.get('orderbook_1s_agg', [])),
            imbalances=pd.DataFrame(data.get('imbalances', [])),
            indicators=data.get('indicators', {}),
            context_data=pd.DataFrame(data['context_data']) if data.get('context_data') else None,
            data_source=data.get('data_source', 'unknown'),
            query_latency_ms=data.get('query_latency_ms', 0.0),
        )

    def get_latest_price(self) -> Optional[float]:
        """
        Get the ``close_price`` of the last row of the 1s data.

        Returns:
            The value as a float, or ``None`` when ``ohlcv_1s`` is empty.

        Raises:
            KeyError: If ``ohlcv_1s`` has rows but no ``close_price`` column.
        """
        if self.ohlcv_1s.empty:
            return None
        return float(self.ohlcv_1s.iloc[-1]['close_price'])

    def get_timeframe_data(self, timeframe: str) -> pd.DataFrame:
        """
        Get OHLCV data for a specific timeframe.

        Args:
            timeframe: One of ``'1s'``, ``'1m'``, ``'5m'``, ``'15m'``,
                ``'1h'``, ``'1d'``.

        Returns:
            The matching frame, or a new empty DataFrame for any other value.
        """
        timeframe_map = {
            '1s': self.ohlcv_1s,
            '1m': self.ohlcv_1m,
            '5m': self.ohlcv_5m,
            '15m': self.ohlcv_15m,
            '1h': self.ohlcv_1h,
            '1d': self.ohlcv_1d,
        }
        return timeframe_map.get(timeframe, pd.DataFrame())

    def has_complete_data(self) -> bool:
        """
        Check if all required data is present.

        Required: non-empty 1s and 1m OHLCV, non-empty imbalances, and an
        order book snapshot that is not ``None``.
        """
        return (
            not self.ohlcv_1s.empty and
            not self.ohlcv_1m.empty and
            not self.imbalances.empty and
            self.orderbook_snapshot is not None
        )

    def get_data_summary(self) -> Dict[str, Any]:
        """Get a summary of available data (row counts, flags and metadata)."""
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s_rows': len(self.ohlcv_1s),
            'ohlcv_1m_rows': len(self.ohlcv_1m),
            'ohlcv_5m_rows': len(self.ohlcv_5m),
            'ohlcv_15m_rows': len(self.ohlcv_15m),
            'ohlcv_1h_rows': len(self.ohlcv_1h),
            'ohlcv_1d_rows': len(self.ohlcv_1d),
            'has_orderbook': self.orderbook_snapshot is not None,
            'orderbook_1s_agg_rows': len(self.orderbook_1s_agg),
            'imbalances_rows': len(self.imbalances),
            'indicators_count': len(self.indicators),
            'has_context_data': self.context_data is not None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
            'is_complete': self.has_complete_data(),
        }
@dataclass
class OrderBookDataFrame:
    """
    Order book data with imbalances and aggregations.

    Contains raw order book, price buckets, and multi-timeframe imbalance
    metrics.
    """
    symbol: str
    timestamp: datetime

    # Raw order book (top levels)
    bids: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)
    asks: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)

    # Aggregated data (price buckets)
    price_buckets: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Multi-timeframe imbalance metrics
    imbalance_1s: float = 0.0
    imbalance_5s: float = 0.0
    imbalance_15s: float = 0.0
    imbalance_60s: float = 0.0

    # Volume-weighted imbalances
    volume_imbalance_1s: float = 0.0
    volume_imbalance_5s: float = 0.0
    volume_imbalance_15s: float = 0.0
    volume_imbalance_60s: float = 0.0

    # Order book statistics
    mid_price: float = 0.0
    spread: float = 0.0
    bid_volume: float = 0.0
    ask_volume: float = 0.0

    # Metadata
    exchange: str = 'binance'
    sequence_id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to a dictionary for serialization.

        ``timestamp`` is rendered with ``isoformat()`` and ``price_buckets``
        becomes a list of row dicts (``[]`` when the frame is empty).
        """
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'bids': self.bids,
            'asks': self.asks,
            'price_buckets': self.price_buckets.to_dict('records') if not self.price_buckets.empty else [],
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
            'mid_price': self.mid_price,
            'spread': self.spread,
            'bid_volume': self.bid_volume,
            'ask_volume': self.ask_volume,
            'exchange': self.exchange,
            'sequence_id': self.sequence_id,
        }

    def to_json(self) -> str:
        """
        Convert to a JSON string.

        ``default=str`` stringifies any value the JSON encoder cannot handle
        natively instead of raising.
        """
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OrderBookDataFrame':
        """
        Create an instance from a dictionary such as the one ``to_dict`` emits.

        Args:
            data: Mapping with a required ``'symbol'`` key. ``'timestamp'``
                may be an ISO-8601 string or a ``datetime``; when it is
                missing or falsy the current local time is used. ``'bids'``
                and ``'asks'`` levels are converted to tuples so that data
                decoded from JSON (where tuples arrive as lists) matches the
                declared field type.

        Raises:
            KeyError: If ``'symbol'`` is missing.
            ValueError: If ``'timestamp'`` is a string that is not ISO-8601.
        """
        raw_timestamp = data.get('timestamp')
        if not raw_timestamp:
            timestamp = datetime.now()
        elif isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            # Already a datetime-like object: accept it unchanged, as the
            # other models in this module do.
            timestamp = raw_timestamp

        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            bids=[tuple(level) for level in (data.get('bids') or [])],
            asks=[tuple(level) for level in (data.get('asks') or [])],
            price_buckets=pd.DataFrame(data.get('price_buckets', [])),
            imbalance_1s=data.get('imbalance_1s', 0.0),
            imbalance_5s=data.get('imbalance_5s', 0.0),
            imbalance_15s=data.get('imbalance_15s', 0.0),
            imbalance_60s=data.get('imbalance_60s', 0.0),
            volume_imbalance_1s=data.get('volume_imbalance_1s', 0.0),
            volume_imbalance_5s=data.get('volume_imbalance_5s', 0.0),
            volume_imbalance_15s=data.get('volume_imbalance_15s', 0.0),
            volume_imbalance_60s=data.get('volume_imbalance_60s', 0.0),
            mid_price=data.get('mid_price', 0.0),
            spread=data.get('spread', 0.0),
            bid_volume=data.get('bid_volume', 0.0),
            ask_volume=data.get('ask_volume', 0.0),
            exchange=data.get('exchange', 'binance'),
            sequence_id=data.get('sequence_id'),
        )

    def calculate_statistics(self):
        """
        Calculate order book statistics from bids and asks, in place.

        Does nothing unless both sides are non-empty. ``mid_price`` and
        ``spread`` are only updated when both best prices are positive;
        ``bid_volume`` and ``ask_volume`` are the summed sizes of each side.
        """
        if not (self.bids and self.asks):
            return

        best_bid = self.get_best_bid()[0]
        best_ask = self.get_best_ask()[0]

        # Mid price and spread
        if best_bid > 0 and best_ask > 0:
            self.mid_price = (best_bid + best_ask) / 2
            self.spread = best_ask - best_bid

        # Total volumes
        self.bid_volume = sum(size for _, size in self.bids)
        self.ask_volume = sum(size for _, size in self.asks)

    def get_best_bid(self) -> Optional[Tuple[float, float]]:
        """Get the bid level with the highest price, or ``None`` if no bids."""
        if self.bids:
            return max(self.bids, key=lambda x: x[0])
        return None

    def get_best_ask(self) -> Optional[Tuple[float, float]]:
        """Get the ask level with the lowest price, or ``None`` if no asks."""
        if self.asks:
            return min(self.asks, key=lambda x: x[0])
        return None

    def get_spread_bps(self) -> float:
        """
        Get spread in basis points (``spread / mid_price * 10000``).

        Returns ``0.0`` unless both ``mid_price`` and ``spread`` are positive.
        Uses the stored fields, so call ``calculate_statistics`` first if they
        need to reflect the current bids/asks.
        """
        if self.mid_price > 0 and self.spread > 0:
            return (self.spread / self.mid_price) * 10000
        return 0.0

    def get_imbalance_summary(self) -> Dict[str, float]:
        """Get a summary of all imbalance metrics keyed by field name."""
        return {
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
        }

    def is_valid(self) -> bool:
        """
        Check if order book data is valid.

        Valid means both sides are non-empty and the best bid price is
        strictly below the best ask price.
        """
        if not self.bids or not self.asks:
            return False
        best_bid = self.get_best_bid()
        best_ask = self.get_best_ask()
        if not best_bid or not best_ask:
            return False
        # Bid must be less than ask
        return best_bid[0] < best_ask[0]
@dataclass
class TradeEvent:
    """One trade record for a symbol, with dict/JSON conversion helpers."""
    symbol: str
    timestamp: datetime
    price: float
    size: float
    side: str  # 'buy' or 'sell'
    trade_id: str
    exchange: str = 'binance'
    is_buyer_maker: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return the fields as a plain dict (``timestamp`` stays a datetime)."""
        payload = asdict(self)
        return payload

    def to_json(self) -> str:
        """Serialize to JSON; non-JSON values are rendered with ``str``."""
        payload = self.to_dict()
        return json.dumps(payload, default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TradeEvent':
        """
        Build a trade from a mapping.

        ``timestamp`` may be an ISO-8601 string or an already-parsed value;
        ``price`` and ``size`` are coerced to float and ``trade_id`` to str.
        ``exchange`` and ``is_buyer_maker`` are optional.

        Raises:
            KeyError: If a required key is missing.
        """
        symbol = data['symbol']

        # Only strings need parsing; anything else is passed through as-is.
        when = data['timestamp']
        if isinstance(when, str):
            when = datetime.fromisoformat(when)

        price = float(data['price'])
        size = float(data['size'])
        side = data['side']
        trade_id = str(data['trade_id'])
        exchange = data.get('exchange', 'binance')
        buyer_is_maker = data.get('is_buyer_maker', False)

        return cls(
            symbol=symbol,
            timestamp=when,
            price=price,
            size=size,
            side=side,
            trade_id=trade_id,
            exchange=exchange,
            is_buyer_maker=buyer_is_maker,
        )
@dataclass
class OHLCVCandle:
    """Single OHLCV candlestick with optional technical-indicator values."""
    symbol: str
    timestamp: datetime
    timeframe: str
    open_price: float
    high_price: float
    low_price: float
    close_price: float
    volume: float
    trade_count: int = 0

    # Technical indicators (optional)
    rsi_14: Optional[float] = None
    macd: Optional[float] = None
    macd_signal: Optional[float] = None
    macd_histogram: Optional[float] = None
    bb_upper: Optional[float] = None
    bb_middle: Optional[float] = None
    bb_lower: Optional[float] = None
    ema_12: Optional[float] = None
    ema_26: Optional[float] = None
    sma_20: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary (``timestamp`` stays a datetime)."""
        return asdict(self)

    def to_json(self) -> str:
        """Convert to a JSON string; non-JSON values are rendered with ``str``."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OHLCVCandle':
        """
        Create a candle from a dictionary.

        ``timestamp`` may be an ISO-8601 string or an already-parsed value.
        Prices and volume are coerced to float, ``trade_count`` to int
        (default 0). Indicator keys that are missing or ``None`` stay ``None``;
        otherwise they are coerced to float.

        Raises:
            KeyError: If a required key is missing.
        """
        def optional_float(key: str) -> Optional[float]:
            value = data.get(key)
            return float(value) if value is not None else None

        return cls(
            symbol=data['symbol'],
            timestamp=datetime.fromisoformat(data['timestamp']) if isinstance(data['timestamp'], str) else data['timestamp'],
            timeframe=data['timeframe'],
            open_price=float(data['open_price']),
            high_price=float(data['high_price']),
            low_price=float(data['low_price']),
            close_price=float(data['close_price']),
            volume=float(data['volume']),
            trade_count=int(data.get('trade_count', 0)),
            rsi_14=optional_float('rsi_14'),
            macd=optional_float('macd'),
            macd_signal=optional_float('macd_signal'),
            macd_histogram=optional_float('macd_histogram'),
            bb_upper=optional_float('bb_upper'),
            bb_middle=optional_float('bb_middle'),
            bb_lower=optional_float('bb_lower'),
            ema_12=optional_float('ema_12'),
            ema_26=optional_float('ema_26'),
            sma_20=optional_float('sma_20'),
        )

    def is_valid(self) -> bool:
        """
        Check if candle data is valid.

        Valid means ``low <= open <= high``, ``low <= close <= high`` and
        ``volume >= 0``. The checks are written as positive comparisons so
        that a NaN in any price or in the volume makes the candle invalid
        (every comparison against NaN is false).
        """
        return bool(
            self.low_price <= self.open_price <= self.high_price and
            self.low_price <= self.close_price <= self.high_price and
            self.volume >= 0
        )

    def get_price_change(self) -> float:
        """Get price change (close - open)."""
        return self.close_price - self.open_price

    def get_price_change_percent(self) -> float:
        """Get price change as a percentage of open; 0.0 if open is not positive."""
        if self.open_price > 0:
            return ((self.close_price - self.open_price) / self.open_price) * 100
        return 0.0

    def get_range(self) -> float:
        """Get price range (high - low)."""
        return self.high_price - self.low_price

    def is_bullish(self) -> bool:
        """Check if candle is bullish (close > open)."""
        return self.close_price > self.open_price

    def is_bearish(self) -> bool:
        """Check if candle is bearish (close < open)."""
        return self.close_price < self.open_price