414 lines
17 KiB
Python
414 lines
17 KiB
Python
"""
Unified Model Data Interface

CRITICAL POLICY: NO SYNTHETIC DATA ALLOWED
This module provides a standardized data interface for all models.
NEVER use mock/fake/synthetic data or placeholder values.
If data is unavailable: return None, log errors, or raise exceptions.

This interface ensures:
- Consistent data format across all models
- Proper feature engineering and normalization
- Real-time data streaming to models
- No data dumps or unnecessary file I/O
"""
|
|
|
|
import logging
|
|
import numpy as np
|
|
import pandas as pd
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Tuple, Any, Union
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class ModelInputData:
    """Standardized input data structure for all models.

    The three leading fields are required; every other field is optional and
    defaults to "not available" (``None``) or a neutral scalar.
    """

    # Identity of the sample
    symbol: str
    timestamp: datetime
    current_price: float

    # Time series data (preserves structure)
    candles_1m: Optional[np.ndarray] = None  # Shape: (window_size, 5) [OHLCV]
    candles_1s: Optional[np.ndarray] = None  # Shape: (window_size, 5) [OHLCV]
    candles_5m: Optional[np.ndarray] = None  # Shape: (window_size, 5) [OHLCV]

    # Technical indicators (flattened for models that need it)
    technical_indicators: Optional[np.ndarray] = None  # Shape: (n_indicators,)

    # Market microstructure (for COB models)
    order_book_features: Optional[np.ndarray] = None  # Shape: (n_features,)

    # Market context
    volume_profile: Optional[np.ndarray] = None  # Shape: (n_levels,)
    volatility_regime: float = 0.0
    trend_strength: float = 0.0

    # Metadata
    data_quality_score: float = 1.0
    feature_count: int = 0
|
|
|
|
class UnifiedModelDataInterface:
    """
    Unified interface for preparing data for all model types

    Features:
    - Standardized data format across models
    - Preserves time series structure for CNN/Transformer
    - Flattened features for DQN/Generic models
    - Real-time data streaming
    - No unnecessary file I/O or dumps
    """

    # (timeframe, ModelInputData field) pairs, in the order they are fetched
    # and normalized.
    _SEQUENCE_FIELDS = (
        ('1m', 'candles_1m'),
        ('1s', 'candles_1s'),
        ('5m', 'candles_5m'),
    )

    # Columns pulled out of the provider's candle frames, in output order.
    _OHLCV_COLUMNS = ['open', 'high', 'low', 'close', 'volume']

    # Extra candles requested on top of the window when fetching history.
    _HISTORY_MARGIN = 50

    # Length of the flattened feature vector unless a caller overrides it.
    _DEFAULT_VECTOR_SIZE = 200

    # Which ModelInputData field each model type consumes.
    _MODEL_INPUT_FIELD = {
        'cnn': 'candles_1m',
        'dqn': 'technical_indicators',
        'cob_rl': 'order_book_features',
        'transformer': 'candles_1m',
        'generic': 'technical_indicators',
    }

    # Array fields whose element counts add up to ``feature_count``.
    _COUNTED_FIELDS = (
        'candles_1m',
        'candles_1s',
        'candles_5m',
        'technical_indicators',
        'order_book_features',
        'volume_profile',
    )

    def __init__(self, data_provider, config):
        """Store the data provider and config and declare per-model requirements.

        Args:
            data_provider: Object this class queries for prices, candles,
                feature matrices, COB state and pivot bounds.
            config: Configuration object; stored but not read by this class.
        """
        self.data_provider = data_provider
        self.config = config

        # Model-specific requirements
        self.model_requirements = {
            'cnn': {
                'input_shape': (60, 5),  # 60 candles, 5 features (OHLCV)
                'requires_sequence': True,
                'normalization': 'pivot_based',
            },
            'dqn': {
                'input_shape': (100,),  # 100-dim state vector
                'requires_sequence': False,
                'normalization': 'min_max',
            },
            'cob_rl': {
                'input_shape': (50,),  # 50-dim order book features
                'requires_sequence': False,
                'normalization': 'z_score',
            },
            'transformer': {
                'input_shape': (150, 5),  # 150 candles, 5 features
                'requires_sequence': True,
                'normalization': 'pivot_based',
            },
            'generic': {
                'input_shape': (200,),  # 200-dim feature vector
                'requires_sequence': False,
                'normalization': 'min_max',
            },
        }

        logger.info("Unified Model Data Interface initialized")

    def prepare_model_input(self, symbol: str, model_type: str,
                            window_size: Optional[int] = None) -> Optional["ModelInputData"]:
        """
        Prepare standardized input data for any model type

        Args:
            symbol: Trading symbol (e.g., 'ETH/USDT')
            model_type: Type of model ('cnn', 'dqn', 'cob_rl', 'transformer', 'generic')
            window_size: Window passed to the data provider. When None, the
                first entry of the model's ``input_shape`` is used. Must be
                positive.

        Returns:
            ModelInputData: Standardized input data structure, or None when
            the model type is unknown, the window is not positive, no current
            price is available, or an unexpected error occurs.
        """
        try:
            requirements = self.model_requirements.get(model_type)
            if requirements is None:
                logger.error(f"Unknown model type: {model_type}")
                return None

            if window_size is None:
                window_size = requirements['input_shape'][0]

            # A zero window would make ``data[-0:]`` return the entire history,
            # and a negative one would drop the newest candles instead.
            if window_size <= 0:
                logger.error(f"Invalid window size for {symbol}: {window_size}")
                return None

            # Get current market data
            current_price = self.data_provider.get_current_price(symbol)
            if current_price is None:
                logger.warning(f"No current price available for {symbol}")
                return None

            # Prepare base data structure
            model_input = ModelInputData(
                symbol=symbol,
                timestamp=datetime.now(),
                current_price=current_price,
            )

            # Get time series data based on model requirements
            if requirements['requires_sequence']:
                model_input = self._add_sequence_data(model_input, symbol, window_size)
            else:
                # NOTE(review): the vector is built with the default length
                # (200) for every non-sequence model, although 'dqn' declares
                # input_shape (100,). Confirm what the trained models expect
                # before passing requirements['input_shape'][0] here.
                model_input = self._add_feature_vector(model_input, symbol, window_size)

            # Add model-specific features
            if model_type == 'cob_rl':
                model_input = self._add_order_book_features(model_input, symbol)

            # Apply normalization
            model_input = self._apply_normalization(model_input, model_type, requirements)

            # Calculate data quality score
            model_input.data_quality_score = self._calculate_data_quality(model_input)
            model_input.feature_count = self._count_features(model_input)

            logger.debug(f"Prepared {model_type} input for {symbol}: {model_input.feature_count} features, quality: {model_input.data_quality_score:.2f}")
            return model_input

        except Exception as e:
            logger.error(f"Error preparing {model_type} input for {symbol}: {e}", exc_info=True)
            return None

    def _add_sequence_data(self, model_input: "ModelInputData", symbol: str,
                           window_size: int) -> "ModelInputData":
        """Add time series sequence data for CNN/Transformer models.

        Fetches 1m, 1s and 5m candles and stores the last ``window_size`` OHLCV
        rows of each on ``model_input``. Each timeframe is handled
        independently, so a failure on one leaves its field untouched without
        preventing the others from being filled.
        """
        if window_size is None or window_size <= 0:
            # ``[-0:]`` would return the full history rather than nothing.
            logger.error(f"Invalid window size for {symbol}: {window_size}")
            return model_input

        for timeframe, field_name in self._SEQUENCE_FIELDS:
            try:
                frame = self.data_provider.get_historical_data_for_inference(
                    symbol, timeframe, limit=window_size + self._HISTORY_MARGIN
                )
                if frame is None or frame.empty:
                    continue

                # Extract OHLCV sequence and keep the last window_size candles
                ohlcv_data = frame[self._OHLCV_COLUMNS].values
                setattr(model_input, field_name, ohlcv_data[-window_size:])
            except Exception as e:
                logger.error(f"Error adding sequence data for {symbol}: {e}", exc_info=True)

        return model_input

    def _add_feature_vector(self, model_input: "ModelInputData", symbol: str,
                            window_size: int,
                            target_size: int = _DEFAULT_VECTOR_SIZE) -> "ModelInputData":
        """Add flattened feature vector for DQN/Generic models.

        Args:
            model_input: Structure to populate.
            symbol: Trading symbol forwarded to the data provider.
            window_size: Window forwarded to ``get_feature_matrix``.
            target_size: Exact length of the stored vector (default 200).
                Longer matrices are truncated, shorter ones zero-padded.

        Returns:
            The same ``model_input``; ``technical_indicators`` is set only when
            the provider returned a feature matrix.
        """
        try:
            # Get feature matrix
            feature_matrix = self.data_provider.get_feature_matrix(symbol, window_size=window_size)
            if feature_matrix is None:
                return model_input

            flattened = feature_matrix.flatten()
            available = len(flattened)

            if available >= target_size:
                vector = flattened[:target_size]
            else:
                # Pad with zeros so the vector has a fixed length.
                logger.debug(f"Feature vector for {symbol} padded from {available} to {target_size} values")
                vector = np.zeros(target_size)
                vector[:available] = flattened

            model_input.technical_indicators = vector
            return model_input

        except Exception as e:
            logger.error(f"Error adding feature vector for {symbol}: {e}", exc_info=True)
            return model_input

    def _add_order_book_features(self, model_input: "ModelInputData", symbol: str) -> "ModelInputData":
        """Add order book features for COB models.

        Stores the provider's COB state on ``model_input`` when it is not None.
        """
        try:
            # Get COB state from data provider
            cob_state = self.data_provider.get_cob_state(symbol)
            if cob_state is not None:
                model_input.order_book_features = cob_state
            return model_input

        except Exception as e:
            logger.error(f"Error adding order book features for {symbol}: {e}", exc_info=True)
            return model_input

    def _apply_normalization(self, model_input: "ModelInputData", model_type: str,
                             requirements: Dict) -> "ModelInputData":
        """Apply model-specific normalization.

        ``requirements['normalization']`` selects which fields are normalized:
        'pivot_based' -> the candle arrays, 'min_max' -> technical indicators,
        'z_score' -> order book features. Any other value leaves the input
        unchanged.
        """
        try:
            norm_type = requirements['normalization']

            if norm_type == 'pivot_based':
                # Use pivot-based normalization for price data
                for _, field_name in self._SEQUENCE_FIELDS:
                    candles = getattr(model_input, field_name)
                    if candles is not None:
                        setattr(
                            model_input,
                            field_name,
                            self._normalize_with_pivot_bounds(candles, model_input.symbol),
                        )

            elif norm_type == 'min_max':
                # Min-max normalization for feature vectors
                if model_input.technical_indicators is not None:
                    model_input.technical_indicators = self._min_max_normalize(
                        model_input.technical_indicators
                    )

            elif norm_type == 'z_score':
                # Z-score normalization for order book features
                if model_input.order_book_features is not None:
                    model_input.order_book_features = self._z_score_normalize(
                        model_input.order_book_features
                    )

            return model_input

        except Exception as e:
            logger.error(f"Error applying normalization for {model_type}: {e}", exc_info=True)
            return model_input

    def _normalize_with_pivot_bounds(self, candles: np.ndarray, symbol: str) -> np.ndarray:
        """Normalize candles using pivot bounds.

        Columns 0-3 (OHLC) are scaled by the symbol's pivot price range and
        column 4 (volume) by its volume range. Falls back to whole-array
        min-max normalization when the symbol has no pivot bounds, when the
        price range is not a positive number, or when anything raises.
        """
        try:
            pivot_bounds = self.data_provider.pivot_bounds
            if symbol not in pivot_bounds:
                # Fallback to simple normalization
                return self._min_max_normalize(candles)

            bounds = pivot_bounds[symbol]
            price_range = bounds.get_price_range()
            # ``not x > 0`` also rejects NaN; dividing by such a range would
            # silently fill the price columns with inf/NaN.
            if price_range is None or not price_range > 0:
                logger.warning(f"Invalid pivot price range for {symbol}: {price_range}; using min-max normalization")
                return self._min_max_normalize(candles)

            volume_range = bounds.volume_max - bounds.volume_min

            # Work on a floating-point copy: writing scaled values into an
            # integer array would truncate them.
            if np.issubdtype(candles.dtype, np.floating):
                normalized = candles.copy()
            else:
                normalized = candles.astype(float)

            # Normalize price columns (0-3: OHLC)
            normalized[:, :4] = (normalized[:, :4] - bounds.price_min) / price_range

            # Normalize volume column (4)
            if volume_range > 0:
                normalized[:, 4] = (normalized[:, 4] - bounds.volume_min) / volume_range
            else:
                normalized[:, 4] = 0.5  # Default to middle if no volume range

            return normalized

        except Exception as e:
            logger.error(f"Error normalizing with pivot bounds: {e}", exc_info=True)
            return self._min_max_normalize(candles)

    def _min_max_normalize(self, data: np.ndarray) -> np.ndarray:
        """Min-max normalization over the whole array.

        Returns zeros when all values are equal, and the input unchanged when
        the computation raises (for example on an empty array).
        """
        try:
            data_min = np.min(data)
            span = np.max(data) - data_min

            if span == 0:
                return np.zeros_like(data)

            return (data - data_min) / span

        except Exception as e:
            logger.error(f"Error in min-max normalization: {e}", exc_info=True)
            return data

    def _z_score_normalize(self, data: np.ndarray) -> np.ndarray:
        """Z-score normalization over the whole array.

        Returns zeros when the standard deviation is zero, and the input
        unchanged when the computation raises.
        """
        try:
            std = np.std(data)
            if std == 0:
                return np.zeros_like(data)

            return (data - np.mean(data)) / std

        except Exception as e:
            logger.error(f"Error in z-score normalization: {e}", exc_info=True)
            return data

    def _calculate_data_quality(self, model_input: "ModelInputData") -> float:
        """Calculate data quality score (0.0 to 1.0).

        Starts at 1.0 and subtracts penalties for missing fields, NaN values
        and a near-constant close column; the result is floored at 0.0.
        Returns 0.0 if the calculation itself fails.
        """
        # NOTE(review): penalties are applied regardless of model type, so a
        # sequence model is always docked for missing indicators/order book.
        try:
            score = 1.0
            candles = model_input.candles_1m
            indicators = model_input.technical_indicators

            # Check for missing data
            if candles is None:
                score -= 0.3
            if indicators is None:
                score -= 0.2
            if model_input.order_book_features is None:
                score -= 0.1

            # Check for NaN values
            if candles is not None and np.isnan(candles).any():
                score -= 0.2
            if indicators is not None and np.isnan(indicators).any():
                score -= 0.2

            # Check for zero variance (indicating stale data)
            if candles is not None:
                price_variance = np.var(candles[:, 3])  # Close price variance
                if price_variance < 1e-8:
                    score -= 0.3

            return max(0.0, score)

        except Exception as e:
            logger.error(f"Error calculating data quality: {e}", exc_info=True)
            return 0.0

    def _count_features(self, model_input: "ModelInputData") -> int:
        """Count total number of features.

        Sums ``.size`` over every populated array field; returns 0 on error.
        """
        try:
            count = 0
            for field_name in self._COUNTED_FIELDS:
                values = getattr(model_input, field_name)
                if values is not None:
                    count += values.size
            return count

        except Exception as e:
            logger.error(f"Error counting features: {e}", exc_info=True)
            return 0

    def get_model_specific_input(self, model_input: "ModelInputData",
                                 model_type: str) -> Optional[np.ndarray]:
        """
        Extract model-specific input from standardized ModelInputData

        Args:
            model_input: Standardized input data
            model_type: Type of model

        Returns:
            np.ndarray: The field consumed by ``model_type`` (candles_1m for
            cnn/transformer, technical_indicators for dqn/generic,
            order_book_features for cob_rl), or None when that field is not
            populated, the model type is unknown, or an error occurs.
        """
        try:
            field_name = self._MODEL_INPUT_FIELD.get(model_type)
            if field_name is None:
                logger.error(f"Unknown model type: {model_type}")
                return None

            return getattr(model_input, field_name)

        except Exception as e:
            logger.error(f"Error getting model-specific input for {model_type}: {e}", exc_info=True)
            return None
|