433 lines
17 KiB
Python
433 lines
17 KiB
Python
"""
Unified Data Models for the storage system.

Standardized data structures for all components.
"""
|
|
|
|
import pandas as pd
|
|
import numpy as np
|
|
from dataclasses import dataclass, field, asdict
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Tuple, Any
|
|
import json
|
|
import logging
|
|
|
|
# Module-level logger named after this module (``__name__``).
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
class InferenceDataFrame:
    """
    Complete inference data for a single timestamp.

    Contains all data needed for model inference including multi-timeframe OHLCV,
    order book data, imbalances, and technical indicators.

    All DataFrame fields default to an empty frame, so a freshly constructed
    instance only requires ``symbol`` and ``timestamp``.
    """
    symbol: str
    timestamp: datetime

    # Multi-timeframe OHLCV data
    ohlcv_1s: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_5m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_15m: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1h: pd.DataFrame = field(default_factory=pd.DataFrame)
    ohlcv_1d: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Order book data
    orderbook_snapshot: Optional[Dict] = None
    orderbook_1s_agg: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Imbalance metrics (multi-timeframe)
    imbalances: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Technical indicators (pre-calculated from latest candle)
    indicators: Dict[str, float] = field(default_factory=dict)

    # Context window data (±N minutes around timestamp)
    context_data: Optional[pd.DataFrame] = None

    # Metadata
    data_source: str = 'unknown'  # 'cache' or 'database'
    query_latency_ms: float = 0.0

    @staticmethod
    def _records(frame: pd.DataFrame) -> List[Dict[str, Any]]:
        """Return ``frame`` as a list of row dicts, or ``[]`` when it is empty."""
        return frame.to_dict('records') if not frame.empty else []

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for serialization.

        DataFrames become lists of row dicts (``[]`` when empty), the timestamp
        becomes an ISO-8601 string, and ``context_data`` becomes ``None`` when
        it is missing or empty.
        """
        records = self._records
        context = self.context_data
        has_context = context is not None and not context.empty
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s': records(self.ohlcv_1s),
            'ohlcv_1m': records(self.ohlcv_1m),
            'ohlcv_5m': records(self.ohlcv_5m),
            'ohlcv_15m': records(self.ohlcv_15m),
            'ohlcv_1h': records(self.ohlcv_1h),
            'ohlcv_1d': records(self.ohlcv_1d),
            'orderbook_snapshot': self.orderbook_snapshot,
            'orderbook_1s_agg': records(self.orderbook_1s_agg),
            'imbalances': records(self.imbalances),
            'indicators': self.indicators,
            'context_data': context.to_dict('records') if has_context else None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
        }

    def to_json(self) -> str:
        """Convert to JSON string (non-JSON-native values are stringified)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'InferenceDataFrame':
        """
        Create from dictionary.

        Args:
            data: Mapping in the shape produced by :meth:`to_dict`. Only
                ``symbol`` is required. ``timestamp`` may be an ISO-8601
                string or an existing ``datetime``; when missing or falsy the
                current local time (``datetime.now()``) is used.

        Raises:
            KeyError: If ``symbol`` is absent.
            ValueError: If ``timestamp`` is a string that is not ISO-8601.
        """
        raw_timestamp = data.get('timestamp')
        if not raw_timestamp:
            timestamp = datetime.now()
        elif isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            # Already a datetime-like object; accepted as-is, matching the
            # behaviour of TradeEvent.from_dict / OHLCVCandle.from_dict.
            timestamp = raw_timestamp

        context_rows = data.get('context_data')
        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            ohlcv_1s=pd.DataFrame(data.get('ohlcv_1s', [])),
            ohlcv_1m=pd.DataFrame(data.get('ohlcv_1m', [])),
            ohlcv_5m=pd.DataFrame(data.get('ohlcv_5m', [])),
            ohlcv_15m=pd.DataFrame(data.get('ohlcv_15m', [])),
            ohlcv_1h=pd.DataFrame(data.get('ohlcv_1h', [])),
            ohlcv_1d=pd.DataFrame(data.get('ohlcv_1d', [])),
            orderbook_snapshot=data.get('orderbook_snapshot'),
            orderbook_1s_agg=pd.DataFrame(data.get('orderbook_1s_agg', [])),
            imbalances=pd.DataFrame(data.get('imbalances', [])),
            # ``or {}`` guards against an explicit None, which would otherwise
            # break len(self.indicators) in get_data_summary().
            indicators=data.get('indicators') or {},
            context_data=pd.DataFrame(context_rows) if context_rows else None,
            data_source=data.get('data_source', 'unknown'),
            query_latency_ms=data.get('query_latency_ms', 0.0),
        )

    def get_latest_price(self) -> Optional[float]:
        """Get the ``close_price`` of the last 1s row, or ``None`` if there is no 1s data."""
        if self.ohlcv_1s.empty:
            return None
        return float(self.ohlcv_1s.iloc[-1]['close_price'])

    def get_timeframe_data(self, timeframe: str) -> pd.DataFrame:
        """
        Get OHLCV data for a specific timeframe.

        Args:
            timeframe: One of ``'1s'``, ``'1m'``, ``'5m'``, ``'15m'``, ``'1h'``, ``'1d'``.

        Returns:
            The matching frame, or a new empty DataFrame for an unknown timeframe.
        """
        timeframe_map = {
            '1s': self.ohlcv_1s,
            '1m': self.ohlcv_1m,
            '5m': self.ohlcv_5m,
            '15m': self.ohlcv_15m,
            '1h': self.ohlcv_1h,
            '1d': self.ohlcv_1d,
        }
        return timeframe_map.get(timeframe, pd.DataFrame())

    def has_complete_data(self) -> bool:
        """Check that 1s OHLCV, 1m OHLCV, imbalances and an order book snapshot are all present."""
        return (
            not self.ohlcv_1s.empty
            and not self.ohlcv_1m.empty
            and not self.imbalances.empty
            and self.orderbook_snapshot is not None
        )

    def get_data_summary(self) -> Dict[str, Any]:
        """Get summary of available data (row counts, presence flags and metadata)."""
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'ohlcv_1s_rows': len(self.ohlcv_1s),
            'ohlcv_1m_rows': len(self.ohlcv_1m),
            'ohlcv_5m_rows': len(self.ohlcv_5m),
            'ohlcv_15m_rows': len(self.ohlcv_15m),
            'ohlcv_1h_rows': len(self.ohlcv_1h),
            'ohlcv_1d_rows': len(self.ohlcv_1d),
            'has_orderbook': self.orderbook_snapshot is not None,
            'orderbook_1s_agg_rows': len(self.orderbook_1s_agg),
            'imbalances_rows': len(self.imbalances),
            'indicators_count': len(self.indicators),
            'has_context_data': self.context_data is not None,
            'data_source': self.data_source,
            'query_latency_ms': self.query_latency_ms,
            'is_complete': self.has_complete_data(),
        }
|
|
|
|
|
|
@dataclass
class OrderBookDataFrame:
    """
    Order book data with imbalances and aggregations.

    Contains raw order book, price buckets, and multi-timeframe imbalance metrics.
    """
    symbol: str
    timestamp: datetime

    # Raw order book (top levels)
    bids: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)
    asks: List[Tuple[float, float]] = field(default_factory=list)  # (price, size)

    # Aggregated data (price buckets)
    price_buckets: pd.DataFrame = field(default_factory=pd.DataFrame)

    # Multi-timeframe imbalance metrics
    imbalance_1s: float = 0.0
    imbalance_5s: float = 0.0
    imbalance_15s: float = 0.0
    imbalance_60s: float = 0.0

    # Volume-weighted imbalances
    volume_imbalance_1s: float = 0.0
    volume_imbalance_5s: float = 0.0
    volume_imbalance_15s: float = 0.0
    volume_imbalance_60s: float = 0.0

    # Order book statistics
    mid_price: float = 0.0
    spread: float = 0.0
    bid_volume: float = 0.0
    ask_volume: float = 0.0

    # Metadata
    exchange: str = 'binance'
    sequence_id: Optional[int] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary for serialization.

        The timestamp becomes an ISO-8601 string and ``price_buckets`` becomes
        a list of row dicts (``[]`` when empty); all other fields are passed
        through unchanged.
        """
        buckets = self.price_buckets
        return {
            'symbol': self.symbol,
            'timestamp': self.timestamp.isoformat() if self.timestamp else None,
            'bids': self.bids,
            'asks': self.asks,
            'price_buckets': buckets.to_dict('records') if not buckets.empty else [],
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
            'mid_price': self.mid_price,
            'spread': self.spread,
            'bid_volume': self.bid_volume,
            'ask_volume': self.ask_volume,
            'exchange': self.exchange,
            'sequence_id': self.sequence_id,
        }

    def to_json(self) -> str:
        """Convert to JSON string (non-JSON-native values are stringified)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OrderBookDataFrame':
        """
        Create from dictionary.

        Args:
            data: Mapping in the shape produced by :meth:`to_dict`. Only
                ``symbol`` is required. ``timestamp`` may be an ISO-8601
                string or an existing ``datetime``; when missing or falsy the
                current local time (``datetime.now()``) is used.

        Raises:
            KeyError: If ``symbol`` is absent.
            ValueError: If ``timestamp`` is a string that is not ISO-8601.
        """
        raw_timestamp = data.get('timestamp')
        if not raw_timestamp:
            timestamp = datetime.now()
        elif isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            # Already a datetime-like object; accepted as-is, matching the
            # behaviour of TradeEvent.from_dict / OHLCVCandle.from_dict.
            timestamp = raw_timestamp

        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            bids=data.get('bids', []),
            asks=data.get('asks', []),
            price_buckets=pd.DataFrame(data.get('price_buckets', [])),
            imbalance_1s=data.get('imbalance_1s', 0.0),
            imbalance_5s=data.get('imbalance_5s', 0.0),
            imbalance_15s=data.get('imbalance_15s', 0.0),
            imbalance_60s=data.get('imbalance_60s', 0.0),
            volume_imbalance_1s=data.get('volume_imbalance_1s', 0.0),
            volume_imbalance_5s=data.get('volume_imbalance_5s', 0.0),
            volume_imbalance_15s=data.get('volume_imbalance_15s', 0.0),
            volume_imbalance_60s=data.get('volume_imbalance_60s', 0.0),
            mid_price=data.get('mid_price', 0.0),
            spread=data.get('spread', 0.0),
            bid_volume=data.get('bid_volume', 0.0),
            ask_volume=data.get('ask_volume', 0.0),
            exchange=data.get('exchange', 'binance'),
            sequence_id=data.get('sequence_id'),
        )

    def calculate_statistics(self) -> None:
        """
        Calculate order book statistics from bids and asks.

        Updates ``mid_price``, ``spread``, ``bid_volume`` and ``ask_volume``
        in place. When either side of the book is empty nothing is modified,
        so previously stored statistics are left as they were.
        """
        if not (self.bids and self.asks):
            return

        # Best bid is the highest bid price, best ask the lowest ask price.
        best_bid = max(level[0] for level in self.bids)
        best_ask = min(level[0] for level in self.asks)

        # Mid price and spread are only meaningful for positive prices.
        if best_bid > 0 and best_ask > 0:
            self.mid_price = (best_bid + best_ask) / 2
            self.spread = best_ask - best_bid

        # Total volumes
        self.bid_volume = sum(size for _, size in self.bids)
        self.ask_volume = sum(size for _, size in self.asks)

    def get_best_bid(self) -> Optional[Tuple[float, float]]:
        """Get best bid (highest price) as ``(price, size)``, or ``None`` if there are no bids."""
        if not self.bids:
            return None
        return max(self.bids, key=lambda level: level[0])

    def get_best_ask(self) -> Optional[Tuple[float, float]]:
        """Get best ask (lowest price) as ``(price, size)``, or ``None`` if there are no asks."""
        if not self.asks:
            return None
        return min(self.asks, key=lambda level: level[0])

    def get_spread_bps(self) -> float:
        """
        Get spread in basis points.

        Uses the stored ``spread`` and ``mid_price`` fields (see
        :meth:`calculate_statistics`); returns ``0.0`` unless both are positive.
        """
        if self.mid_price > 0 and self.spread > 0:
            return (self.spread / self.mid_price) * 10000
        return 0.0

    def get_imbalance_summary(self) -> Dict[str, float]:
        """Get summary of all imbalance metrics."""
        return {
            'imbalance_1s': self.imbalance_1s,
            'imbalance_5s': self.imbalance_5s,
            'imbalance_15s': self.imbalance_15s,
            'imbalance_60s': self.imbalance_60s,
            'volume_imbalance_1s': self.volume_imbalance_1s,
            'volume_imbalance_5s': self.volume_imbalance_5s,
            'volume_imbalance_15s': self.volume_imbalance_15s,
            'volume_imbalance_60s': self.volume_imbalance_60s,
        }

    def is_valid(self) -> bool:
        """
        Check if order book data is valid.

        A book is valid when both sides are non-empty and the best bid price
        is strictly below the best ask price (i.e. the book is not crossed).
        """
        if not self.bids or not self.asks:
            return False

        # Bid must be less than ask
        return self.get_best_bid()[0] < self.get_best_ask()[0]
|
|
|
|
|
|
@dataclass
class TradeEvent:
    """Individual trade event."""
    symbol: str
    timestamp: datetime
    price: float
    size: float
    side: str  # 'buy' or 'sell'
    trade_id: str
    exchange: str = 'binance'
    is_buyer_maker: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary.

        Uses ``dataclasses.asdict``, so ``timestamp`` stays a ``datetime``
        object rather than being converted to a string.
        """
        return asdict(self)

    def to_json(self) -> str:
        """Convert to JSON string (the timestamp is stringified via ``str``)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'TradeEvent':
        """
        Create from dictionary.

        Args:
            data: Mapping with required keys ``symbol``, ``timestamp``,
                ``price``, ``size``, ``side`` and ``trade_id``. ``timestamp``
                may be an ISO-8601 string or a ready-made ``datetime``;
                ``price``/``size`` are coerced to ``float`` and ``trade_id``
                to ``str``.

        Raises:
            KeyError: If a required key is absent.
            ValueError: If a numeric or timestamp string cannot be parsed.
        """
        raw_timestamp = data['timestamp']
        if isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            timestamp = raw_timestamp

        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            price=float(data['price']),
            size=float(data['size']),
            side=data['side'],
            trade_id=str(data['trade_id']),
            exchange=data.get('exchange', 'binance'),
            is_buyer_maker=data.get('is_buyer_maker', False),
        )
|
|
|
|
|
|
@dataclass
class OHLCVCandle:
    """Single OHLCV candlestick."""
    symbol: str
    timestamp: datetime
    timeframe: str
    open_price: float
    high_price: float
    low_price: float
    close_price: float
    volume: float
    trade_count: int = 0

    # Technical indicators (optional)
    rsi_14: Optional[float] = None
    macd: Optional[float] = None
    macd_signal: Optional[float] = None
    macd_histogram: Optional[float] = None
    bb_upper: Optional[float] = None
    bb_middle: Optional[float] = None
    bb_lower: Optional[float] = None
    ema_12: Optional[float] = None
    ema_26: Optional[float] = None
    sma_20: Optional[float] = None

    def to_dict(self) -> Dict[str, Any]:
        """
        Convert to dictionary.

        Uses ``dataclasses.asdict``, so ``timestamp`` stays a ``datetime``
        object rather than being converted to a string.
        """
        return asdict(self)

    def to_json(self) -> str:
        """Convert to JSON string (the timestamp is stringified via ``str``)."""
        return json.dumps(self.to_dict(), default=str)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'OHLCVCandle':
        """
        Create from dictionary.

        Args:
            data: Mapping with required keys ``symbol``, ``timestamp``,
                ``timeframe``, ``open_price``, ``high_price``, ``low_price``,
                ``close_price`` and ``volume``. ``timestamp`` may be an
                ISO-8601 string or a ready-made ``datetime``. Indicator keys
                are optional; missing or ``None`` values stay ``None``.

        Raises:
            KeyError: If a required key is absent.
            ValueError: If a numeric or timestamp string cannot be parsed.
        """
        def optional_float(key: str) -> Optional[float]:
            # Missing and explicit-None indicators both map to None.
            value = data.get(key)
            return float(value) if value is not None else None

        raw_timestamp = data['timestamp']
        if isinstance(raw_timestamp, str):
            timestamp = datetime.fromisoformat(raw_timestamp)
        else:
            timestamp = raw_timestamp

        return cls(
            symbol=data['symbol'],
            timestamp=timestamp,
            timeframe=data['timeframe'],
            open_price=float(data['open_price']),
            high_price=float(data['high_price']),
            low_price=float(data['low_price']),
            close_price=float(data['close_price']),
            volume=float(data['volume']),
            trade_count=int(data.get('trade_count', 0)),
            rsi_14=optional_float('rsi_14'),
            macd=optional_float('macd'),
            macd_signal=optional_float('macd_signal'),
            macd_histogram=optional_float('macd_histogram'),
            bb_upper=optional_float('bb_upper'),
            bb_middle=optional_float('bb_middle'),
            bb_lower=optional_float('bb_lower'),
            ema_12=optional_float('ema_12'),
            ema_26=optional_float('ema_26'),
            sma_20=optional_float('sma_20'),
        )

    def is_valid(self) -> bool:
        """
        Check if candle data is valid.

        A candle is valid when all prices and the volume are finite numbers,
        the volume is non-negative, and ``low <= open, close <= high``.
        """
        import math  # function-scope import keeps this block self-contained

        values = (
            self.open_price,
            self.high_price,
            self.low_price,
            self.close_price,
            self.volume,
        )
        # NaN compares False against everything, so without this check a
        # candle full of NaNs would slip through the ordering tests below.
        if not all(math.isfinite(value) for value in values):
            return False

        # Volume must be non-negative
        if self.volume < 0:
            return False

        # low <= min(open, close) <= max(open, close) <= high
        # (which also implies high >= low).
        body_low = min(self.open_price, self.close_price)
        body_high = max(self.open_price, self.close_price)
        return self.low_price <= body_low and body_high <= self.high_price

    def get_price_change(self) -> float:
        """Get price change (close - open)."""
        return self.close_price - self.open_price

    def get_price_change_percent(self) -> float:
        """Get price change as a percentage of the open; ``0.0`` if open is not positive."""
        if self.open_price > 0:
            return ((self.close_price - self.open_price) / self.open_price) * 100
        return 0.0

    def get_range(self) -> float:
        """Get price range (high - low)."""
        return self.high_price - self.low_price

    def is_bullish(self) -> bool:
        """Check if candle is bullish (close > open)."""
        return self.close_price > self.open_price

    def is_bearish(self) -> bool:
        """Check if candle is bearish (close < open)."""
        return self.close_price < self.open_price
|