This commit is contained in:
Dobromir Popov
2025-05-24 02:15:25 +03:00
parent 6e8ec97539
commit b181d11923
6 changed files with 1117 additions and 254 deletions

View File

@@ -149,41 +149,166 @@ class DataProvider:
return None
def _add_technical_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with the full indicator set.

    The ``high``, ``low``, ``close`` and ``volume`` columns are read; the
    caller's frame is never modified. Trend, momentum, volatility, volume,
    price-action and composite columns are appended, after which remaining
    NaNs are forward-filled, back-filled and finally zero-filled.

    Frames with fewer than 50 rows are handed to ``_add_basic_indicators``.
    Any exception is logged and also falls back to
    ``_add_basic_indicators``.
    """
    try:
        df = df.copy()

        # The longest look-back used below is 50 bars.
        if len(df) < 50:
            logger.warning(f"Insufficient data for comprehensive indicators: {len(df)} rows")
            return self._add_basic_indicators(df)

        close = df['close']
        high = df['high']
        low = df['low']
        volume = df['volume']

        # === TREND INDICATORS ===
        for window in (10, 20, 50):
            df[f'sma_{window}'] = ta.trend.sma_indicator(close, window=window)
        for window in (12, 26, 50):
            df[f'ema_{window}'] = ta.trend.ema_indicator(close, window=window)

        # MACD family
        macd_calc = ta.trend.MACD(close)
        df['macd'] = macd_calc.macd()
        df['macd_signal'] = macd_calc.macd_signal()
        df['macd_histogram'] = macd_calc.macd_diff()

        # NOTE(review): same computation as 'rsi_14' below; presumably kept
        # for consumers of the older column name -- confirm before removing.
        df['rsi'] = ta.momentum.rsi(close, window=14)

        # ADX (Average Directional Index)
        adx_calc = ta.trend.ADXIndicator(high, low, close)
        df['adx'] = adx_calc.adx()
        df['adx_pos'] = adx_calc.adx_pos()
        df['adx_neg'] = adx_calc.adx_neg()

        # Parabolic SAR
        df['psar'] = ta.trend.PSARIndicator(high, low, close).psar()

        # === MOMENTUM INDICATORS ===
        for window in (14, 7, 21):
            df[f'rsi_{window}'] = ta.momentum.rsi(close, window=window)

        # Stochastic Oscillator
        stoch_calc = ta.momentum.StochasticOscillator(high, low, close)
        df['stoch_k'] = stoch_calc.stoch()
        df['stoch_d'] = stoch_calc.stoch_signal()

        # Williams %R
        df['williams_r'] = ta.momentum.williams_r(high, low, close)

        # Ultimate Oscillator
        df['ultimate_osc'] = ta.momentum.ultimate_oscillator(high, low, close)

        # === VOLATILITY INDICATORS ===
        # Bollinger Bands plus derived width and %B
        bands = ta.volatility.BollingerBands(close)
        df['bb_upper'] = bands.bollinger_hband()
        df['bb_lower'] = bands.bollinger_lband()
        df['bb_middle'] = bands.bollinger_mavg()
        band_span = df['bb_upper'] - df['bb_lower']
        df['bb_width'] = band_span / df['bb_middle']
        df['bb_percent'] = (close - df['bb_lower']) / band_span

        # NOTE(review): same computation as 'volume_sma_20' below;
        # presumably a legacy column name -- confirm before removing.
        df['volume_sma'] = volume.rolling(window=20).mean()

        # Average True Range
        df['atr'] = ta.volatility.average_true_range(high, low, close)

        # Keltner Channels
        keltner_calc = ta.volatility.KeltnerChannel(high, low, close)
        df['keltner_upper'] = keltner_calc.keltner_channel_hband()
        df['keltner_lower'] = keltner_calc.keltner_channel_lband()
        df['keltner_middle'] = keltner_calc.keltner_channel_mband()

        # === VOLUME INDICATORS ===
        for window in (10, 20, 50):
            df[f'volume_sma_{window}'] = volume.rolling(window=window).mean()

        df['obv'] = ta.volume.on_balance_volume(close, volume)
        df['vpt'] = ta.volume.volume_price_trend(close, volume)
        df['mfi'] = ta.volume.money_flow_index(high, low, close, volume)
        df['ad_line'] = ta.volume.acc_dist_index(high, low, close, volume)

        # VWAP accumulated from the first row of the frame.
        df['vwap'] = (close * volume).cumsum() / volume.cumsum()

        # === PRICE ACTION INDICATORS ===
        # Where the close sits inside the bar's high-low range.
        df['price_position'] = (close - low) / (high - low)

        # This column is a copy of 'atr', not a per-bar true range.
        df['true_range'] = df['atr']

        # Rate of Change
        df['roc'] = ta.momentum.roc(close, window=10)

        # === CUSTOM INDICATORS ===
        # Trend strength: fraction of four bullish conditions that hold.
        bullish_votes = [
            close > df['sma_20'],
            df['sma_10'] > df['sma_20'],
            df['macd'] > df['macd_signal'],
            df['adx'] > 25,
        ]
        df['trend_strength'] = sum(vote.astype(int) for vote in bullish_votes) / 4.0

        # Momentum composite: mean of three rescaled oscillators.
        rsi_term = df['rsi_14'] / 100
        stoch_term = (df['stoch_k'] + 50) / 100
        williams_term = (df['williams_r'] + 50) / 100
        df['momentum_composite'] = (rsi_term + stoch_term + williams_term) / 3.0

        # Volatility regime: percentile rank of ATR/close within 20 bars.
        relative_atr = df['atr'] / close
        df['volatility_regime'] = relative_atr.rolling(window=20).rank(pct=True)

        # === FILL NaN VALUES ===
        # Forward fill first, then backward fill, then zero fill.
        df = df.ffill().bfill().fillna(0)

        raw_columns = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        indicator_count = sum(1 for col in df.columns if col not in raw_columns)
        logger.debug(f"Added {indicator_count} technical indicators")
        return df

    except Exception as e:
        logger.error(f"Error adding comprehensive technical indicators: {e}")
        # Fallback to basic indicators
        return self._add_basic_indicators(df)
def _add_basic_indicators(self, df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the indicators a short history supports.

    Each indicator is added only when the frame has at least as many rows
    as its gate: 20 rows for ``sma_20`` and ``ema_12``, 14 for ``rsi_14``,
    10 for ``volume_sma_10``. ``price_position`` is always added and
    defaults to 0.5 for bars whose high equals their low.

    Remaining NaNs are forward-filled, then back-filled, then zero-filled,
    the same policy ``_add_technical_indicators`` applies. On any exception
    the error is logged and the frame as it stood at that point is returned.
    """
    try:
        df = df.copy()
        row_count = len(df)

        # Basic moving averages
        if row_count >= 20:
            df['sma_20'] = ta.trend.sma_indicator(df['close'], window=20)
            df['ema_12'] = ta.trend.ema_indicator(df['close'], window=12)

        # Basic RSI
        if row_count >= 14:
            df['rsi_14'] = ta.momentum.rsi(df['close'], window=14)

        # Basic volume indicators
        if row_count >= 10:
            df['volume_sma_10'] = df['volume'].rolling(window=10).mean()

        # Basic price action; a zero-range bar gives 0/0 -> NaN -> 0.5.
        bar_range = df['high'] - df['low']
        df['price_position'] = ((df['close'] - df['low']) / bar_range).fillna(0.5)

        # One fill pass only. A preceding bfill().fillna(0) would zero
        # trailing gaps before the forward fill could carry the last value.
        df = df.ffill().bfill().fillna(0)
        return df

    except Exception as e:
        logger.error(f"Error adding basic indicators: {e}")
        return df
def _load_from_cache(self, symbol: str, timeframe: str) -> Optional[pd.DataFrame]:
@@ -381,37 +506,255 @@ class DataProvider:
def get_feature_matrix(self, symbol: str, timeframes: List[str] = None,
                       window_size: int = 20) -> Optional[np.ndarray]:
    """Build a normalized multi-timeframe feature matrix for ``symbol``.

    Args:
        symbol: Symbol passed through to ``get_latest_candles``.
        timeframes: Timeframes to include; defaults to ``self.timeframes``.
        window_size: Number of most recent rows taken per timeframe.

    Returns:
        np.ndarray of shape (n_channels, window_size, n_features), one
        channel per timeframe that had at least ``window_size`` rows, in
        the order given by ``timeframes``. Timeframes with too little data
        are skipped with a warning, so ``n_channels`` can be smaller than
        ``len(timeframes)``. Features are the alphabetically sorted
        intersection of the columns selected for every usable timeframe.
        ``None`` is returned when no features are shared, no channel could
        be built, channel shapes disagree, or an exception occurs.
    """
    try:
        if timeframes is None:
            timeframes = self.timeframes

        basic_cols = ['open', 'high', 'low', 'close', 'volume']
        excluded_cols = basic_cols + ['timestamp']

        # First pass: fetch each timeframe once and intersect its features.
        timeframe_frames = {}
        common_features = None
        for tf in timeframes:
            logger.debug(f"Processing timeframe {tf} for {symbol}")
            df = self.get_latest_candles(symbol, tf, limit=window_size + 100)

            if df is None or len(df) < window_size:
                logger.warning(f"Insufficient data for {symbol} {tf}: {len(df) if df is not None else 0} rows")
                continue

            # pandas names headerless columns 'Unnamed: N' (capital U), so
            # the prefix test must be case-insensitive to filter them out.
            indicator_cols = [
                col for col in df.columns
                if col not in excluded_cols
                and not str(col).lower().startswith('unnamed')
            ]
            selected = set(self._select_cnn_features(df, basic_cols, indicator_cols))
            timeframe_frames[tf] = df
            common_features = selected if common_features is None else common_features & selected

        if not common_features:
            logger.error(f"No common features found across timeframes for {symbol}")
            return None

        # Sort so the feature axis has a stable, reproducible order.
        common_feature_names = sorted(common_features)
        logger.info(f"Using {len(common_feature_names)} common features: {common_feature_names}")

        # Second pass: normalize the latest window of every usable timeframe.
        feature_channels = []
        for tf in timeframes:
            if tf not in timeframe_frames:
                continue
            df = timeframe_frames[tf]
            try:
                tf_features = self._normalize_features(df[common_feature_names].tail(window_size))
                if tf_features is not None and len(tf_features) == window_size:
                    feature_channels.append(tf_features.values)
                    logger.debug(f"Added {len(common_feature_names)} features for {tf}")
                else:
                    logger.warning(f"Feature normalization failed for {tf}")
            except Exception as e:
                # One bad timeframe should not discard the others.
                logger.error(f"Error processing features for {tf}: {e}")
                continue

        if not feature_channels:
            logger.error(f"No valid feature channels created for {symbol}")
            return None

        # np.stack needs identical shapes; fail with a readable message.
        shapes = [channel.shape for channel in feature_channels]
        if len(set(shapes)) > 1:
            logger.error(f"Shape mismatch in feature channels: {shapes}")
            return None

        feature_matrix = np.stack(feature_channels, axis=0)
        logger.info(f"Created feature matrix for {symbol}: {feature_matrix.shape} "
                    f"({len(feature_channels)} timeframes, {window_size} steps, {len(common_feature_names)} features)")
        return feature_matrix

    except Exception as e:
        logger.error(f"Error creating feature matrix for {symbol}: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return None
def _select_cnn_features(self, df: pd.DataFrame, basic_cols: List[str], indicator_cols: List[str]) -> List[str]:
    """Choose the ordered feature columns fed to the CNN.

    The result is ``basic_cols`` first, then every priority indicator that
    appears in ``indicator_cols`` (in priority order), then at most ten of
    the remaining ``indicator_cols`` in their given order. Names missing
    from ``df.columns`` are dropped at the end.

    On any exception the error is logged and ``basic_cols`` is returned
    unchanged.
    """
    try:
        ranked_indicators = (
            # Trend indicators
            'sma_10', 'sma_20', 'sma_50', 'ema_12', 'ema_26', 'ema_50',
            'macd', 'macd_signal', 'macd_histogram',
            'adx', 'adx_pos', 'adx_neg', 'psar',
            # Momentum indicators
            'rsi_14', 'rsi_7', 'rsi_21',
            'stoch_k', 'stoch_d', 'williams_r', 'ultimate_osc',
            # Volatility indicators
            'bb_upper', 'bb_lower', 'bb_middle', 'bb_width', 'bb_percent',
            'atr', 'keltner_upper', 'keltner_lower', 'keltner_middle',
            # Volume indicators
            'volume_sma_10', 'volume_sma_20', 'obv', 'vpt', 'mfi', 'ad_line', 'vwap',
            # Price action
            'price_position', 'true_range', 'roc',
            # Custom composites
            'trend_strength', 'momentum_composite', 'volatility_regime',
        )

        # Raw OHLCV columns always lead the feature list.
        chosen = list(basic_cols)

        available = set(indicator_cols)
        chosen += [name for name in ranked_indicators if name in available]

        # Cap the unranked extras at ten to limit dimensionality.
        leftovers = [col for col in indicator_cols if col not in chosen]
        chosen += leftovers[:10]

        # Keep only names the frame actually carries.
        final_selected = [col for col in chosen if col in df.columns]

        logger.debug(f"Selected {len(final_selected)} features from {len(df.columns)} available columns")
        return final_selected

    except Exception as e:
        logger.error(f"Error selecting CNN features: {e}")
        return basic_cols  # Fallback to basic OHLCV
def _normalize_features(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
    """Return a copy of ``df`` with every column rescaled for CNN input.

    Scaling depends on the column name:

    * price-level columns (OHLC, moving averages, bands, PSAR, VWAP) are
      divided by the latest close, when that close is positive;
    * ``volume`` is divided by its rolling mean over the last
      ``min(20, len(df))`` rows, when that mean is positive;
    * RSI and stochastic columns are divided by 100;
    * ``williams_r`` is shifted by +100 and divided by 100;
    * MACD columns are divided by the latest ATR when it is positive,
      otherwise by the latest close;
    * columns already meant to be in 0-1 are clipped to that range;
    * ``atr`` and ``true_range`` are divided by the latest close;
    * anything else is z-scored with the rolling mean and std of the last
      ``min(20, len(df))`` rows, or set to 0 when the std is not positive.

    The latest close and ATR are taken from the input frame before any
    column is rewritten, so the result does not depend on column order.
    Infinities and NaNs are replaced with 0 at the end.

    An empty frame is returned as an unchanged copy. On any exception the
    error is logged and the original, un-normalized ``df`` is returned.
    """
    price_cols = {
        'open', 'high', 'low', 'close', 'sma_10', 'sma_20', 'sma_50',
        'ema_12', 'ema_26', 'ema_50', 'bb_upper', 'bb_lower', 'bb_middle',
        'keltner_upper', 'keltner_lower', 'keltner_middle', 'psar', 'vwap',
    }
    percent_cols = {'rsi_14', 'rsi_7', 'rsi_21', 'stoch_k', 'stoch_d'}
    macd_cols = {'macd', 'macd_signal', 'macd_histogram'}
    unit_range_cols = {
        'bb_width', 'bb_percent', 'price_position', 'trend_strength',
        'momentum_composite', 'volatility_regime',
    }
    volatility_cols = {'atr', 'true_range'}

    try:
        df_norm = df.copy()
        if df_norm.empty:
            return df_norm

        # Snapshot the references from the untouched input. Reading them
        # from df_norm inside the loop would pick up already-rescaled values
        # (close becomes 1.0 once 'close' has been processed).
        has_close = 'close' in df.columns
        has_atr = 'atr' in df.columns
        ref_close = df['close'].iloc[-1] if has_close else None
        ref_atr = df['atr'].iloc[-1] if has_atr else None
        window = min(20, len(df_norm))

        for col in df_norm.columns:
            if col in price_cols:
                if has_close and ref_close > 0:
                    df_norm[col] = df_norm[col] / ref_close

            elif col == 'volume':
                volume_mean = df_norm[col].rolling(window=window).mean().iloc[-1]
                if volume_mean > 0:
                    df_norm[col] = df_norm[col] / volume_mean

            elif col in percent_cols:
                # RSI / stochastic: 0-100 -> 0-1
                df_norm[col] = df_norm[col] / 100.0

            elif col == 'williams_r':
                # Williams %R: -100..0 -> 0-1
                df_norm[col] = (df_norm[col] + 100) / 100.0

            elif col in macd_cols:
                if has_atr and ref_atr > 0:
                    df_norm[col] = df_norm[col] / ref_atr
                elif has_close:
                    df_norm[col] = df_norm[col] / ref_close

            elif col in unit_range_cols:
                df_norm[col] = df_norm[col].clip(lower=0, upper=1)

            elif col in volatility_cols:
                if has_close:
                    df_norm[col] = df_norm[col] / ref_close

            else:
                # Other indicators: z-score against the trailing window.
                trailing = df_norm[col].rolling(window=window)
                col_mean = trailing.mean().iloc[-1]
                col_std = trailing.std().iloc[-1]
                if col_std > 0:
                    df_norm[col] = (df_norm[col] - col_mean) / col_std
                else:
                    df_norm[col] = 0

        # Divisions by zero leave infinities; neutralize them and any NaNs.
        df_norm = df_norm.replace([np.inf, -np.inf], 0)
        df_norm = df_norm.fillna(0)
        return df_norm

    except Exception as e:
        logger.error(f"Error normalizing features: {e}")
        return df
def get_multi_symbol_feature_matrix(self, symbols: List[str] = None,
                                    timeframes: List[str] = None,
                                    window_size: int = 20) -> Optional[np.ndarray]:
    """Stack the per-symbol feature matrices into a single array.

    Args:
        symbols: Symbols to include; defaults to ``self.symbols``.
        timeframes: Timeframes to include; defaults to ``self.timeframes``.
        window_size: Rows per timeframe, forwarded to ``get_feature_matrix``.

    Returns:
        np.ndarray of shape (n_symbols, n_timeframes, window_size,
        n_features) built with ``np.stack``. Symbols whose matrix could not
        be created are skipped with a warning, so axis 0 may be shorter
        than ``symbols``. ``None`` is returned when no symbol produced a
        matrix or when an exception occurs, including ``np.stack``
        rejecting matrices of differing shapes.
    """
    try:
        if symbols is None:
            symbols = self.symbols
        if timeframes is None:
            timeframes = self.timeframes

        symbol_matrices = []
        for symbol in symbols:
            symbol_matrix = self.get_feature_matrix(symbol, timeframes, window_size)
            if symbol_matrix is None:
                logger.warning(f"Could not create feature matrix for {symbol}")
                continue
            symbol_matrices.append(symbol_matrix)

        if not symbol_matrices:
            return None

        multi_symbol_matrix = np.stack(symbol_matrices, axis=0)
        logger.info(f"Created multi-symbol feature matrix: {multi_symbol_matrix.shape}")
        return multi_symbol_matrix

    except Exception as e:
        # Do not reference the loop variable here: it is unbound when the
        # failure happens before the first iteration.
        logger.error(f"Error creating multi-symbol feature matrix: {e}")
        return None
def health_check(self) -> Dict[str, Any]: