Files
gogo2/core/unified_model_data_interface.py

414 lines
17 KiB
Python

"""
Unified Model Data Interface
CRITICAL POLICY: NO SYNTHETIC DATA ALLOWED
This module provides a standardized data interface for all models.
NEVER use mock/fake/synthetic data or placeholder values.
If data is unavailable: return None, log errors, raise exceptions.
This interface ensures:
- Consistent data format across all models
- Proper feature engineering and normalization
- Real-time data streaming to models
- No data dumps or unnecessary file I/O
"""
import logging
import numpy as np
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any, Union
from dataclasses import dataclass
logger = logging.getLogger(__name__)
@dataclass
class ModelInputData:
"""Standardized input data structure for all models"""
symbol: str
timestamp: datetime
current_price: float
# Time series data (preserves structure)
candles_1m: Optional[np.ndarray] = None # Shape: (window_size, 5) [OHLCV]
candles_1s: Optional[np.ndarray] = None # Shape: (window_size, 5) [OHLCV]
candles_5m: Optional[np.ndarray] = None # Shape: (window_size, 5) [OHLCV]
# Technical indicators (flattened for models that need it)
technical_indicators: Optional[np.ndarray] = None # Shape: (n_indicators,)
# Market microstructure (for COB models)
order_book_features: Optional[np.ndarray] = None # Shape: (n_features,)
# Market context
volume_profile: Optional[np.ndarray] = None # Shape: (n_levels,)
volatility_regime: float = 0.0
trend_strength: float = 0.0
# Metadata
data_quality_score: float = 1.0
feature_count: int = 0
class UnifiedModelDataInterface:
"""
Unified interface for preparing data for all model types
Features:
- Standardized data format across models
- Preserves time series structure for CNN/Transformer
- Flattened features for DQN/Generic models
- Real-time data streaming
- No unnecessary file I/O or dumps
"""
def __init__(self, data_provider, config):
self.data_provider = data_provider
self.config = config
# Model-specific requirements
self.model_requirements = {
'cnn': {
'input_shape': (60, 5), # 60 candles, 5 features (OHLCV)
'requires_sequence': True,
'normalization': 'pivot_based'
},
'dqn': {
'input_shape': (100,), # 100-dim state vector
'requires_sequence': False,
'normalization': 'min_max'
},
'cob_rl': {
'input_shape': (50,), # 50-dim order book features
'requires_sequence': False,
'normalization': 'z_score'
},
'transformer': {
'input_shape': (150, 5), # 150 candles, 5 features
'requires_sequence': True,
'normalization': 'pivot_based'
},
'generic': {
'input_shape': (200,), # 200-dim feature vector
'requires_sequence': False,
'normalization': 'min_max'
}
}
logger.info("Unified Model Data Interface initialized")
def prepare_model_input(self, symbol: str, model_type: str,
window_size: int = None) -> Optional[ModelInputData]:
"""
Prepare standardized input data for any model type
Args:
symbol: Trading symbol (e.g., 'ETH/USDT')
model_type: Type of model ('cnn', 'dqn', 'cob_rl', 'transformer', 'generic')
window_size: Number of candles to include (model-specific default if None)
Returns:
ModelInputData: Standardized input data structure
"""
try:
if model_type not in self.model_requirements:
logger.error(f"Unknown model type: {model_type}")
return None
requirements = self.model_requirements[model_type]
if window_size is None:
window_size = requirements['input_shape'][0]
# Get current market data
current_price = self.data_provider.get_current_price(symbol)
if current_price is None:
logger.warning(f"No current price available for {symbol}")
return None
# Prepare base data structure
model_input = ModelInputData(
symbol=symbol,
timestamp=datetime.now(),
current_price=current_price
)
# Get time series data based on model requirements
if requirements['requires_sequence']:
model_input = self._add_sequence_data(model_input, symbol, window_size)
else:
model_input = self._add_feature_vector(model_input, symbol, window_size)
# Add model-specific features
if model_type == 'cob_rl':
model_input = self._add_order_book_features(model_input, symbol)
# Apply normalization
model_input = self._apply_normalization(model_input, model_type, requirements)
# Calculate data quality score
model_input.data_quality_score = self._calculate_data_quality(model_input)
model_input.feature_count = self._count_features(model_input)
logger.debug(f"Prepared {model_type} input for {symbol}: {model_input.feature_count} features, quality: {model_input.data_quality_score:.2f}")
return model_input
except Exception as e:
logger.error(f"Error preparing {model_type} input for {symbol}: {e}")
return None
def _add_sequence_data(self, model_input: ModelInputData, symbol: str, window_size: int) -> ModelInputData:
"""Add time series sequence data for CNN/Transformer models"""
try:
# Get 1m candles
candles_1m = self.data_provider.get_historical_data_for_inference(symbol, '1m', limit=window_size + 50)
if candles_1m is not None and not candles_1m.empty:
# Extract OHLCV sequence
ohlcv_data = candles_1m[['open', 'high', 'low', 'close', 'volume']].values
model_input.candles_1m = ohlcv_data[-window_size:] # Last window_size candles
# Get 1s candles for high-frequency models
candles_1s = self.data_provider.get_historical_data_for_inference(symbol, '1s', limit=window_size + 50)
if candles_1s is not None and not candles_1s.empty:
ohlcv_data = candles_1s[['open', 'high', 'low', 'close', 'volume']].values
model_input.candles_1s = ohlcv_data[-window_size:]
# Get 5m candles for longer-term context
candles_5m = self.data_provider.get_historical_data_for_inference(symbol, '5m', limit=window_size + 50)
if candles_5m is not None and not candles_5m.empty:
ohlcv_data = candles_5m[['open', 'high', 'low', 'close', 'volume']].values
model_input.candles_5m = ohlcv_data[-window_size:]
return model_input
except Exception as e:
logger.error(f"Error adding sequence data for {symbol}: {e}")
return model_input
def _add_feature_vector(self, model_input: ModelInputData, symbol: str, window_size: int) -> ModelInputData:
"""Add flattened feature vector for DQN/Generic models"""
try:
# Get feature matrix
feature_matrix = self.data_provider.get_feature_matrix(symbol, window_size=window_size)
if feature_matrix is not None:
# Flatten and limit to expected size
flattened = feature_matrix.flatten()
target_size = 200 # Default for generic models
if len(flattened) > target_size:
flattened = flattened[:target_size]
elif len(flattened) < target_size:
# Pad with zeros
padded = np.zeros(target_size)
padded[:len(flattened)] = flattened
flattened = padded
model_input.technical_indicators = flattened
return model_input
except Exception as e:
logger.error(f"Error adding feature vector for {symbol}: {e}")
return model_input
def _add_order_book_features(self, model_input: ModelInputData, symbol: str) -> ModelInputData:
"""Add order book features for COB models"""
try:
# Get COB state from data provider
cob_state = self.data_provider.get_cob_state(symbol)
if cob_state is not None:
model_input.order_book_features = cob_state
return model_input
except Exception as e:
logger.error(f"Error adding order book features for {symbol}: {e}")
return model_input
def _apply_normalization(self, model_input: ModelInputData, model_type: str, requirements: Dict) -> ModelInputData:
"""Apply model-specific normalization"""
try:
norm_type = requirements['normalization']
if norm_type == 'pivot_based':
# Use pivot-based normalization for price data
if model_input.candles_1m is not None:
model_input.candles_1m = self._normalize_with_pivot_bounds(model_input.candles_1m, model_input.symbol)
if model_input.candles_1s is not None:
model_input.candles_1s = self._normalize_with_pivot_bounds(model_input.candles_1s, model_input.symbol)
if model_input.candles_5m is not None:
model_input.candles_5m = self._normalize_with_pivot_bounds(model_input.candles_5m, model_input.symbol)
elif norm_type == 'min_max':
# Min-max normalization for feature vectors
if model_input.technical_indicators is not None:
model_input.technical_indicators = self._min_max_normalize(model_input.technical_indicators)
elif norm_type == 'z_score':
# Z-score normalization for order book features
if model_input.order_book_features is not None:
model_input.order_book_features = self._z_score_normalize(model_input.order_book_features)
return model_input
except Exception as e:
logger.error(f"Error applying normalization for {model_type}: {e}")
return model_input
def _normalize_with_pivot_bounds(self, candles: np.ndarray, symbol: str) -> np.ndarray:
"""Normalize candles using pivot bounds"""
try:
if symbol not in self.data_provider.pivot_bounds:
# Fallback to simple normalization
return self._min_max_normalize(candles)
bounds = self.data_provider.pivot_bounds[symbol]
price_range = bounds.get_price_range()
volume_range = bounds.volume_max - bounds.volume_min
normalized = candles.copy()
# Normalize price columns (0-3: OHLC)
for i in range(4):
normalized[:, i] = (candles[:, i] - bounds.price_min) / price_range
# Normalize volume column (4)
if volume_range > 0:
normalized[:, 4] = (candles[:, 4] - bounds.volume_min) / volume_range
else:
normalized[:, 4] = 0.5 # Default to middle if no volume range
return normalized
except Exception as e:
logger.error(f"Error normalizing with pivot bounds: {e}")
return self._min_max_normalize(candles)
def _min_max_normalize(self, data: np.ndarray) -> np.ndarray:
"""Min-max normalization"""
try:
data_min = np.min(data)
data_max = np.max(data)
if data_max - data_min == 0:
return np.zeros_like(data)
return (data - data_min) / (data_max - data_min)
except Exception as e:
logger.error(f"Error in min-max normalization: {e}")
return data
def _z_score_normalize(self, data: np.ndarray) -> np.ndarray:
"""Z-score normalization"""
try:
mean = np.mean(data)
std = np.std(data)
if std == 0:
return np.zeros_like(data)
return (data - mean) / std
except Exception as e:
logger.error(f"Error in z-score normalization: {e}")
return data
def _calculate_data_quality(self, model_input: ModelInputData) -> float:
"""Calculate data quality score (0.0 to 1.0)"""
try:
score = 1.0
# Check for missing data
if model_input.candles_1m is None:
score -= 0.3
if model_input.technical_indicators is None:
score -= 0.2
if model_input.order_book_features is None:
score -= 0.1
# Check for NaN values
if model_input.candles_1m is not None and np.isnan(model_input.candles_1m).any():
score -= 0.2
if model_input.technical_indicators is not None and np.isnan(model_input.technical_indicators).any():
score -= 0.2
# Check for zero variance (indicating stale data)
if model_input.candles_1m is not None:
price_variance = np.var(model_input.candles_1m[:, 3]) # Close price variance
if price_variance < 1e-8:
score -= 0.3
return max(0.0, score)
except Exception as e:
logger.error(f"Error calculating data quality: {e}")
return 0.0
def _count_features(self, model_input: ModelInputData) -> int:
"""Count total number of features"""
try:
count = 0
if model_input.candles_1m is not None:
count += model_input.candles_1m.size
if model_input.candles_1s is not None:
count += model_input.candles_1s.size
if model_input.candles_5m is not None:
count += model_input.candles_5m.size
if model_input.technical_indicators is not None:
count += model_input.technical_indicators.size
if model_input.order_book_features is not None:
count += model_input.order_book_features.size
if model_input.volume_profile is not None:
count += model_input.volume_profile.size
return count
except Exception as e:
logger.error(f"Error counting features: {e}")
return 0
def get_model_specific_input(self, model_input: ModelInputData, model_type: str) -> Optional[np.ndarray]:
"""
Extract model-specific input from standardized ModelInputData
Args:
model_input: Standardized input data
model_type: Type of model
Returns:
np.ndarray: Model-specific input data
"""
try:
if model_type == 'cnn':
if model_input.candles_1m is not None:
return model_input.candles_1m
return None
elif model_type == 'dqn':
if model_input.technical_indicators is not None:
return model_input.technical_indicators
return None
elif model_type == 'cob_rl':
if model_input.order_book_features is not None:
return model_input.order_book_features
return None
elif model_type == 'transformer':
if model_input.candles_1m is not None:
return model_input.candles_1m
return None
elif model_type == 'generic':
if model_input.technical_indicators is not None:
return model_input.technical_indicators
return None
else:
logger.error(f"Unknown model type: {model_type}")
return None
except Exception as e:
logger.error(f"Error getting model-specific input for {model_type}: {e}")
return None