data normalizations

This commit is contained in:
Dobromir Popov
2025-09-02 18:51:49 +03:00
parent 1c013f2806
commit 6dcb82c184
4 changed files with 251 additions and 322 deletions

View File

@@ -16,6 +16,7 @@ import logging
import time
import threading
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any, Tuple, Union
from dataclasses import dataclass, field
@@ -2364,12 +2365,262 @@ class TradingOrchestrator:
logger.info("Initializing ExtremaTrainer with historical context...")
self.extrema_trainer.initialize_context_data()
# CRITICAL: Initialize ALL models with historical data
self._initialize_models_with_historical_data(loaded_data)
logger.info(f"🎯 Historical data loading complete: {total_candles} total candles loaded")
logger.info(f"📊 Available datasets: {list(loaded_data.keys())}")
except Exception as e:
logger.error(f"Error in historical data loading: {e}")
def _initialize_models_with_historical_data(self, loaded_data: Dict[str, Any]):
    """Seed every attached model with features derived from historical data.

    ``loaded_data`` is passed to ``_prepare_multi_symbol_features`` and the
    resulting feature mapping is forwarded to the initializer of each model
    attribute that is present on the instance and truthy.  Models that are
    missing or falsy are skipped.  Any exception is logged and swallowed;
    the method returns ``None``.
    """
    # (model attribute, log line announcing the step, initializer method name).
    # Initializers are looked up lazily so a model that is absent never
    # touches its initializer.
    model_steps = (
        ('cnn_model',
         "Initializing CNN with multi-symbol historical features...",
         '_initialize_cnn_with_data'),
        ('rl_agent',
         "Initializing DQN with multi-symbol state vectors...",
         '_initialize_dqn_with_data'),
        ('transformer_model',
         "Initializing Transformer with multi-symbol sequences...",
         '_initialize_transformer_with_data'),
        ('decision_fusion',
         "Initializing Decision Fusion with multi-symbol features...",
         '_initialize_decision_with_data'),
    )
    try:
        logger.info("Initializing models with historical data and multi-symbol support...")
        # One shared feature mapping is built up front and reused by all models.
        prepared = self._prepare_multi_symbol_features(loaded_data)
        for model_attr, announcement, initializer_name in model_steps:
            if not getattr(self, model_attr, None):
                continue
            logger.info(announcement)
            getattr(self, initializer_name)(prepared)
        logger.info("✅ All models initialized with historical multi-symbol data")
    except Exception as e:
        logger.error(f"Error initializing models with historical data: {e}")
def _prepare_multi_symbol_features(self, loaded_data: Dict[str, Any]) -> Dict[str, Any]:
    """Prepare normalized multi-symbol feature matrices.

    Args:
        loaded_data: Mapping of ``"<symbol>_<timeframe>"`` keys (timeframe is
            one of ``1m``, ``1h``, ``1d``) to objects exposing ``.empty``
            (presumably pandas DataFrames - confirm against the loader).

    Returns:
        ``{symbol: {timeframe: normalized_frame_or_None}}`` pre-seeded with
        the ETH/USDT (1m, 1h, 1d) and BTC/USDT (1m) slots.  Entries that are
        ``None``/empty, carry an unrecognized timeframe suffix, or belong to
        a symbol that is not in the template are skipped.  An empty dict is
        returned if an unexpected error occurs.
    """
    known_timeframes = ('1m', '1h', '1d')
    try:
        symbol_features = {
            'ETH/USDT': {'1m': None, '1h': None, '1d': None},
            'BTC/USDT': {'1m': None}
        }
        # Process each symbol's data with symbol-specific normalization
        for data_key, df in loaded_data.items():
            if df is None or df.empty:
                continue
            # Split on the LAST underscore so only a genuine timeframe suffix
            # matches (a substring test would also hit '_1m' mid-key).
            symbol, separator, timeframe = data_key.rpartition('_')
            if not separator or timeframe not in known_timeframes:
                continue
            timeframe_slots = symbol_features.get(symbol)
            if timeframe_slots is None:
                # An unsupported symbol used to raise KeyError here, which the
                # outer handler turned into an empty result for ALL symbols.
                logger.warning(f"Skipping historical data for unsupported symbol key: {data_key}")
                continue
            # Apply symbol-grouped normalization
            normalized_df = self._apply_symbol_grouped_normalization(df, symbol)
            if normalized_df is not None:
                timeframe_slots[timeframe] = normalized_df
                logger.debug(f"Prepared normalized features for {symbol} {timeframe}")
        return symbol_features
    except Exception as e:
        logger.error(f"Error preparing multi-symbol features: {e}")
        return {}
def _apply_symbol_grouped_normalization(self, df: pd.DataFrame, symbol: str,
                                        price_ranges: Optional[Dict[str, Dict[str, float]]] = None) -> pd.DataFrame:
    """Apply symbol-grouped normalization with consistent price ranges across timeframes.

    Works on a copy; ``df`` itself is never modified.

    Args:
        df: Frame that may contain ``open``/``high``/``low``/``close``/``volume``
            columns; columns that are absent are simply skipped.
        symbol: Key used to look up the fixed price range.
        price_ranges: Optional ``{symbol: {'min': x, 'max': y}}`` override.
            Defaults to the built-in ETH/USDT and BTC/USDT ranges.

    Returns:
        The normalized copy with NaN replaced by 0.  For a symbol that has a
        range, price columns are mapped linearly onto [0, 1] and clipped, and
        ``volume`` becomes ``log1p(volume)`` divided by its maximum within this
        frame (so volume is relative to the frame, not to a fixed range).  A
        symbol without a range only gets the NaN fill.  If anything raises,
        the error is logged and the original ``df`` is returned unchanged.
    """
    try:
        df_norm = df.copy()
        if price_ranges is None:
            # Built-in symbol-specific price ranges.
            # NOTE(review): prices outside these bounds are clipped to 0/1;
            # pass ``price_ranges`` if the data covers a wider span.
            price_ranges = {
                'ETH/USDT': {'min': 1000, 'max': 5000},  # ETH price range
                'BTC/USDT': {'min': 90000, 'max': 120000}  # BTC price range
            }
        price_range = price_ranges.get(symbol)
        if price_range is not None:
            range_min = price_range['min']
            range_size = price_range['max'] - range_min
            if range_size <= 0:
                # Would divide by zero or invert the scale; handled by the
                # except branch like any other normalization failure.
                raise ValueError(f"invalid price range for {symbol}: {price_range}")
            # Normalize price columns to [0, 1] range specific to symbol
            for col in ('open', 'high', 'low', 'close'):
                if col in df_norm.columns:
                    scaled = (df_norm[col] - range_min) / range_size
                    df_norm[col] = np.clip(scaled, 0, 1)  # Ensure [0,1] range
            # Normalize volume to [0, 1] using log scale
            if 'volume' in df_norm.columns:
                df_norm['volume'] = np.log1p(df_norm['volume'])
                vol_max = df_norm['volume'].max()
                if vol_max > 0:
                    df_norm['volume'] = df_norm['volume'] / vol_max
            logger.debug(f"Applied symbol-grouped normalization for {symbol}")
        # Fill any NaN values
        df_norm = df_norm.fillna(0)
        return df_norm
    except Exception as e:
        logger.error(f"Error in symbol-grouped normalization for {symbol}: {e}")
        return df
def _initialize_cnn_with_data(self, symbol_features: Dict[str, Any]):
    """Build the flattened OHLCV vector stored under the ``'cnn'`` key.

    For each of ETH/USDT 1m, 1h, 1d and BTC/USDT 1m (in that order) the last
    60 rows of the open/high/low/close/volume columns are flattened and the
    pieces are concatenated.  Datasets that are missing, ``None`` or empty
    are skipped, so the vector length depends on what is available.  When at
    least one dataset contributes, the result is saved in
    ``self.model_historical_features['cnn']``; otherwise nothing is stored.
    Exceptions are logged and swallowed.
    """
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
    candle_window = 60  # number of most recent candles taken per dataset
    # Concatenation order: [ETH_1m, ETH_1h, ETH_1d, BTC_1m]
    sources = [
        ('ETH/USDT', '1m'),
        ('ETH/USDT', '1h'),
        ('ETH/USDT', '1d'),
        ('BTC/USDT', '1m'),
    ]
    try:
        segments = []
        for symbol, timeframe in sources:
            frame = symbol_features.get(symbol, {}).get(timeframe)
            if frame is None or frame.empty:
                continue
            window_values = frame.tail(candle_window)[ohlcv_columns].values
            segments.append(window_values.flatten())
        if not segments:
            return
        full_features = np.concatenate(segments)
        logger.info(f"CNN initialized with {len(full_features)} multi-symbol features")
        # Create the shared store on first use, then record the CNN entry
        if not hasattr(self, 'model_historical_features'):
            self.model_historical_features = {}
        self.model_historical_features['cnn'] = full_features
    except Exception as e:
        logger.error(f"Error initializing CNN with historical data: {e}")
def _initialize_dqn_with_data(self, symbol_features: Dict[str, Any]):
    """Build the 100-element float32 state vector stored under the ``'dqn'`` key.

    For ETH/USDT (1m, 1h, 1d) and BTC/USDT (1m), each available dataset
    contributes three values taken from its last row: close, volume and the
    last relative change of ``close``.  The list is zero-padded (or
    truncated) to 100 entries and any NaN/inf entry is replaced by 0 before
    the vector is saved in ``self.model_historical_features['dqn']``.
    Nothing is stored when no dataset is available.  Exceptions are logged
    and swallowed.

    NOTE(review): missing datasets are skipped rather than zero-filled, so
    later components move toward the front of the vector - confirm that the
    consumer does not rely on fixed positions.
    """
    try:
        # Create comprehensive state vector combining all symbols and timeframes
        state_components = []
        for symbol in ['ETH/USDT', 'BTC/USDT']:
            timeframes = ['1m', '1h', '1d'] if symbol == 'ETH/USDT' else ['1m']
            for timeframe in timeframes:
                data = symbol_features.get(symbol, {}).get(timeframe)
                if data is None or data.empty:
                    continue
                # Extract key features for state
                latest = data.iloc[-1]
                price_change = data['close'].pct_change().iloc[-1] if len(data) > 1 else 0
                state_components.extend([
                    latest['close'],   # Current price
                    latest['volume'],  # Current volume
                    price_change,      # Price change
                ])
        if state_components:
            # Pad or truncate to expected DQN state size
            target_size = 100  # DQN expects 100-dimensional state
            if len(state_components) < target_size:
                state_components.extend([0] * (target_size - len(state_components)))
            else:
                state_components = state_components[:target_size]
            state_vector = np.array(state_components, dtype=np.float32)
            # pct_change divides by the previous close; a previous close of
            # exactly 0 (e.g. a normalized price clipped to 0) yields inf or
            # NaN, which must not reach the model.
            state_vector = np.nan_to_num(state_vector, nan=0.0, posinf=0.0, neginf=0.0)
            logger.info(f"DQN initialized with {len(state_vector)} dimensional multi-symbol state")
            # Store for model access
            if not hasattr(self, 'model_historical_features'):
                self.model_historical_features = {}
            self.model_historical_features['dqn'] = state_vector
    except Exception as e:
        logger.error(f"Error initializing DQN with historical data: {e}")
def _initialize_transformer_with_data(self, symbol_features: Dict[str, Any]):
    """Collect the per-dataset OHLCV sequences stored under ``'transformer'``.

    For each of ETH/USDT 1m, 1h, 1d and BTC/USDT 1m (in that order) the last
    150 rows of the open/high/low/close/volume columns are taken as one 2-D
    array.  Datasets that are missing, ``None`` or empty are skipped.  When
    at least one sequence exists, the list of arrays is saved in
    ``self.model_historical_features['transformer']``; otherwise nothing is
    stored.  Exceptions are logged and swallowed.
    """
    ohlcv_columns = ['open', 'high', 'low', 'close', 'volume']
    sequence_length = 150  # most recent points kept per dataset
    # ETH sequences first, then the BTC sequence
    sources = [
        ('ETH/USDT', '1m'),
        ('ETH/USDT', '1h'),
        ('ETH/USDT', '1d'),
        ('BTC/USDT', '1m'),
    ]
    try:
        sequences = []
        for symbol, timeframe in sources:
            frame = symbol_features.get(symbol, {}).get(timeframe)
            if frame is None or frame.empty:
                continue
            recent_rows = frame.tail(sequence_length)
            sequences.append(recent_rows[ohlcv_columns].values)
        if not sequences:
            return
        logger.info(f"Transformer initialized with {len(sequences)} multi-symbol sequences")
        # Create the shared store on first use, then record the entry
        if not hasattr(self, 'model_historical_features'):
            self.model_historical_features = {}
        self.model_historical_features['transformer'] = sequences
    except Exception as e:
        logger.error(f"Error initializing Transformer with historical data: {e}")
def _initialize_decision_with_data(self, symbol_features: Dict[str, Any]):
    """Summarize every available dataset and store it under the ``'decision'`` key.

    Each non-empty ``symbol_features[symbol][timeframe]`` frame yields an
    entry keyed ``"<symbol>_<timeframe>"`` with:

    * ``latest_price`` - last ``close`` value
    * ``volume`` - last ``volume`` value
    * ``price_change`` - last relative change of ``close`` (0 for a single
      row, and 0.0 when the change is NaN or infinite)
    * ``volatility`` - standard deviation of ``close`` (0 for a single row)

    The mapping is saved in ``self.model_historical_features['decision']``
    when it is non-empty.  Exceptions are logged and swallowed.
    """
    try:
        # Aggregate all available features for decision fusion
        all_features = {}
        for symbol, by_timeframe in symbol_features.items():
            for timeframe, data in by_timeframe.items():
                if data is None or data.empty:
                    continue
                closes = data['close']
                has_history = len(data) > 1
                price_change = closes.pct_change().iloc[-1] if has_history else 0
                # A previous close of exactly 0 (e.g. a normalized price that
                # was clipped to 0) makes pct_change return inf or NaN.
                if not np.isfinite(price_change):
                    price_change = 0.0
                all_features[f"{symbol}_{timeframe}"] = {
                    'latest_price': closes.iloc[-1],
                    'volume': data['volume'].iloc[-1],
                    'price_change': price_change,
                    'volatility': closes.std() if has_history else 0
                }
        if all_features:
            logger.info(f"Decision Fusion initialized with {len(all_features)} symbol-timeframe combinations")
            # Store for model access
            if not hasattr(self, 'model_historical_features'):
                self.model_historical_features = {}
            self.model_historical_features['decision'] = all_features
    except Exception as e:
        logger.error(f"Error initializing Decision Fusion with historical data: {e}")
def get_ohlcv_data(self, symbol: str, timeframe: str, limit: int = 300) -> List:
"""Get OHLCV data for a symbol with specified timeframe and limit."""
try: