""" Real-Time RL Training System This module implements continuous learning from live trading decisions. The RL agent learns from every trade signal and position closure to improve decision-making over time. """ import logging import numpy as np import torch import torch.nn as nn import torch.optim as optim from collections import deque from datetime import datetime, timedelta from typing import Dict, List, Optional, Tuple, Any import threading import time import json import os # Import existing DQN agent import sys sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'NN')) from NN.models.dqn_agent import DQNAgent logger = logging.getLogger(__name__) class TradingExperience: """Represents a single trading experience for RL learning""" def __init__(self, pre_trade_state: np.ndarray, action: int, # 0=SELL, 1=HOLD, 2=BUY entry_price: float, exit_price: float, holding_time: float, # seconds pnl: float, fees: float, confidence: float, market_conditions: Dict[str, Any], timestamp: datetime): self.pre_trade_state = pre_trade_state self.action = action self.entry_price = entry_price self.exit_price = exit_price self.holding_time = holding_time self.pnl = pnl self.fees = fees self.confidence = confidence self.market_conditions = market_conditions self.timestamp = timestamp # Calculate reward self.reward = self._calculate_reward() def _calculate_reward(self) -> float: """Calculate reward for this trading experience""" # Net PnL after fees net_pnl = self.pnl - self.fees # Base reward from PnL (normalized by entry price) base_reward = net_pnl / self.entry_price # Time penalty - prefer faster profitable trades time_penalty = 0.0 if self.holding_time > 300: # 5 minutes time_penalty = -0.001 * (self.holding_time / 60) # -0.001 per minute # Confidence bonus - reward high-confidence correct decisions confidence_bonus = 0.0 if net_pnl > 0 and self.confidence > 0.7: confidence_bonus = 0.01 * self.confidence # Volume consideration (prefer trades that move significant amounts) volume_factor = min(abs(base_reward) * 10, 0.05) # Cap at 5% total_reward = base_reward + time_penalty + confidence_bonus # Scale reward to reasonable range return np.tanh(total_reward * 100) * 10 # Scale and bound reward class MarketStateBuilder: """Builds state representations for RL agent from market data""" def __init__(self, state_size: int = 100): self.state_size = state_size self.price_history = deque(maxlen=50) self.volume_history = deque(maxlen=50) self.rsi_history = deque(maxlen=14) self.macd_history = deque(maxlen=26) def update_market_data(self, price: float, volume: float, rsi: float = None, macd: float = None): """Update market data buffers""" self.price_history.append(price) self.volume_history.append(volume) if rsi is not None: self.rsi_history.append(rsi) if macd is not None: self.macd_history.append(macd) def build_state(self, current_position: str = 'NONE', position_pnl: float = 0.0, account_balance: float = 1000.0) -> np.ndarray: """Build state vector for RL agent""" state = np.zeros(self.state_size) try: # Price features (normalized returns) if len(self.price_history) >= 2: prices = np.array(list(self.price_history)) returns = np.diff(prices) / prices[:-1] # Recent returns (last 20) recent_returns = returns[-20:] if len(returns) >= 20 else returns state[:len(recent_returns)] = recent_returns # Price momentum features state[20] = np.mean(returns[-5:]) if len(returns) >= 5 else 0 # 5-bar momentum state[21] = np.mean(returns[-10:]) if len(returns) >= 10 else 0 # 10-bar momentum state[22] = 

            # Volume features
            if len(self.volume_history) >= 2:
                volumes = np.array(list(self.volume_history))
                volume_changes = np.diff(volumes) / volumes[:-1]

                recent_volume_changes = volume_changes[-10:] if len(volume_changes) >= 10 else volume_changes
                state[30:30 + len(recent_volume_changes)] = recent_volume_changes

                # Volume momentum
                state[40] = np.mean(volume_changes[-5:]) if len(volume_changes) >= 5 else 0

            # Technical indicators
            if len(self.rsi_history) >= 1:
                state[50] = (list(self.rsi_history)[-1] - 50) / 50  # Normalized RSI

            if len(self.macd_history) >= 2:
                macd_values = list(self.macd_history)
                state[51] = macd_values[-1] / 100                      # Normalized MACD
                state[52] = (macd_values[-1] - macd_values[-2]) / 100  # MACD change

            # Position information
            position_encoding = {'NONE': 0, 'LONG': 1, 'SHORT': -1}
            state[60] = position_encoding.get(current_position, 0)
            state[61] = position_pnl / 100        # Normalized PnL
            state[62] = account_balance / 1000    # Normalized balance

            # Market regime features
            if len(self.price_history) >= 20:
                prices = np.array(list(self.price_history))

                # Trend strength
                state[70] = (prices[-1] - prices[-20]) / prices[-20]  # 20-bar trend

                # Market volatility regime
                returns = np.diff(prices) / prices[:-1]
                state[71] = np.std(returns[-20:]) if len(returns) >= 20 else 0

                # Support/resistance levels
                high_20 = np.max(prices[-20:])
                low_20 = np.min(prices[-20:])
                current_price = prices[-1]
                state[72] = (current_price - low_20) / (high_20 - low_20) if high_20 != low_20 else 0.5

        except Exception as e:
            logger.error(f"Error building state: {e}")

        return state


class RealTimeRLTrainer:
    """Real-time RL trainer that learns from live trading decisions"""

    def __init__(self, config: Dict[str, Any] = None):
        """Initialize the real-time RL trainer"""
        self.config = config or {}

        # RL Agent configuration
        state_size = self.config.get('state_size', 100)
        action_size = 3  # 0=SELL, 1=HOLD, 2=BUY

        # Initialize RL agent
        self.agent = DQNAgent(
            state_shape=(state_size,),
            n_actions=action_size,
            learning_rate=self.config.get('learning_rate', 0.0001),
            gamma=self.config.get('gamma', 0.95),
            epsilon=self.config.get('epsilon', 0.1),  # Low epsilon for live trading
            epsilon_min=0.05,
            epsilon_decay=0.999,
            buffer_size=self.config.get('buffer_size', 10000),
            batch_size=self.config.get('batch_size', 32)
        )

        # Market state builder
        self.state_builder = MarketStateBuilder(state_size)

        # Training data storage
        self.pending_trades = {}  # symbol -> trade info
        self.completed_experiences = deque(maxlen=1000)
        self.learning_history = []

        # Training controls
        self.training_enabled = self.config.get('training_enabled', True)
        self.min_experiences_for_training = self.config.get('min_experiences', 10)
        self.training_frequency = self.config.get('training_frequency', 5)  # Train every N experiences
        self.experience_count = 0

        # Model saving
        self.model_save_path = self.config.get('model_save_path', 'models/realtime_rl')
        self.save_frequency = self.config.get('save_frequency', 100)  # Save every N experiences

        # Performance tracking
        self.performance_history = []
        self.recent_rewards = deque(maxlen=100)
        self.trade_count = 0
        self.win_count = 0

        # Threading for async training
        self.training_thread = None
        self.training_queue = deque()
        self.training_lock = threading.Lock()

        logger.info("Real-time RL trainer initialized")
        logger.info(f"State size: {state_size}, Action size: {action_size}")
        logger.info(f"Training enabled: {self.training_enabled}")

    def update_market_data(self, symbol: str, price: float, volume: float,
                           rsi: float = None, macd: float = None):
        """Update market data for state building"""
        self.state_builder.update_market_data(price, volume, rsi, macd)

    def record_trade_signal(self, symbol: str, action: str, confidence: float,
                            current_price: float, position_info: Dict[str, Any] = None):
        """Record a trade signal for future learning"""
        try:
            # Build current state
            current_position = 'NONE'
            position_pnl = 0.0
            account_balance = 1000.0

            if position_info:
                current_position = position_info.get('side', 'NONE')
                position_pnl = position_info.get('unrealized_pnl', 0.0)
                account_balance = position_info.get('account_balance', 1000.0)

            state = self.state_builder.build_state(current_position, position_pnl, account_balance)

            # Convert action to numeric
            action_map = {'SELL': 0, 'HOLD': 1, 'BUY': 2}
            action_num = action_map.get(action.upper(), 1)

            # Store pending trade
            trade_info = {
                'pre_trade_state': state.copy(),
                'action': action_num,
                'entry_price': current_price,
                'confidence': confidence,
                'entry_time': datetime.now(),
                'market_conditions': {
                    'volatility': np.std(list(self.state_builder.price_history)[-10:]) if len(self.state_builder.price_history) >= 10 else 0,
                    'trend': state[70] if len(state) > 70 else 0,
                    'volume_trend': state[40] if len(state) > 40 else 0
                }
            }

            if action.upper() in ['BUY', 'SELL']:
                self.pending_trades[symbol] = trade_info
                logger.info(f"Recorded {action} signal for {symbol} at ${current_price:.2f} "
                            f"(confidence: {confidence:.2f})")

        except Exception as e:
            logger.error(f"Error recording trade signal: {e}")

    def record_position_closure(self, symbol: str, exit_price: float,
                                pnl: float, fees: float):
        """Record position closure and create learning experience"""
        try:
            if symbol not in self.pending_trades:
                logger.warning(f"No pending trade found for {symbol}")
                return

            trade_info = self.pending_trades.pop(symbol)

            # Calculate holding time
            holding_time = (datetime.now() - trade_info['entry_time']).total_seconds()

            # Create trading experience
            experience = TradingExperience(
                pre_trade_state=trade_info['pre_trade_state'],
                action=trade_info['action'],
                entry_price=trade_info['entry_price'],
                exit_price=exit_price,
                holding_time=holding_time,
                pnl=pnl,
                fees=fees,
                confidence=trade_info['confidence'],
                market_conditions=trade_info['market_conditions'],
                timestamp=datetime.now()
            )

            # Add to completed experiences
            self.completed_experiences.append(experience)
            self.recent_rewards.append(experience.reward)
            self.experience_count += 1
            self.trade_count += 1

            if experience.reward > 0:
                self.win_count += 1

            # Log the experience
            logger.info(f"Recorded experience: {symbol} PnL=${pnl:.4f} Reward={experience.reward:.4f} "
                        f"(Win rate: {self.win_count / self.trade_count * 100:.1f}%)")

            # Create next state (current market state after trade)
            current_state = self.state_builder.build_state('NONE', 0.0, 1000.0)

            # Store in agent memory for learning
            self.agent.remember(
                state=trade_info['pre_trade_state'],
                action=trade_info['action'],
                reward=experience.reward,
                next_state=current_state,
                done=True  # Each trade is a complete episode
            )

            # Trigger training if conditions are met
            if self.training_enabled:
                self._maybe_train()

            # Save model periodically
            if self.experience_count % self.save_frequency == 0:
                self._save_model()

        except Exception as e:
            logger.error(f"Error recording position closure: {e}")

    def _maybe_train(self):
        """Train the agent if conditions are met"""
        try:
            if (len(self.agent.memory) >= self.min_experiences_for_training and
                    self.experience_count % self.training_frequency == 0):

                # Perform training step
                loss = self.agent.replay()

                if loss is not None:
                    self.learning_history.append({
                        'timestamp': datetime.now().isoformat(),
                        'experience_count': self.experience_count,
                        'loss': loss,
                        'epsilon': self.agent.epsilon,
                        'avg_reward': np.mean(list(self.recent_rewards)) if self.recent_rewards else 0,
                        'win_rate': self.win_count / self.trade_count if self.trade_count > 0 else 0,
                        'memory_size': len(self.agent.memory)
                    })

                    logger.info(f"RL Training: Loss={loss:.4f}, Epsilon={self.agent.epsilon:.3f}, "
                                f"Avg Reward={np.mean(list(self.recent_rewards)):.4f}, "
                                f"Memory Size={len(self.agent.memory)}")

        except Exception as e:
            logger.error(f"Error in training: {e}")

    def get_action_prediction(self, symbol: str, current_position: str = 'NONE',
                              position_pnl: float = 0.0,
                              account_balance: float = 1000.0) -> Tuple[str, float]:
        """Get action prediction from trained RL agent"""
        try:
            # Build current state
            state = self.state_builder.build_state(current_position, position_pnl, account_balance)

            # Get prediction from agent
            with torch.no_grad():
                q_values, _, _, _, _ = self.agent.policy_net(
                    torch.FloatTensor(state).unsqueeze(0).to(self.agent.device)
                )

                # Get action with highest Q-value
                action_idx = q_values.argmax().item()
                confidence = torch.softmax(q_values, dim=1).max().item()

            # Convert to action string
            action_map = {0: 'SELL', 1: 'HOLD', 2: 'BUY'}
            action = action_map[action_idx]

            return action, confidence

        except Exception as e:
            logger.error(f"Error getting action prediction: {e}")
            return 'HOLD', 0.5

    def get_training_stats(self) -> Dict[str, Any]:
        """Get current training statistics"""
        try:
            return {
                'total_experiences': self.experience_count,
                'total_trades': self.trade_count,
                'win_count': self.win_count,
                'win_rate': self.win_count / self.trade_count if self.trade_count > 0 else 0,
                'avg_reward': np.mean(list(self.recent_rewards)) if self.recent_rewards else 0,
                'memory_size': len(self.agent.memory),
                'epsilon': self.agent.epsilon,
                'recent_loss': self.learning_history[-1]['loss'] if self.learning_history else 0,
                'training_enabled': self.training_enabled,
                'pending_trades': len(self.pending_trades)
            }
        except Exception as e:
            logger.error(f"Error getting training stats: {e}")
            return {}

    def _save_model(self):
        """Save the trained model"""
        try:
            os.makedirs(self.model_save_path, exist_ok=True)

            # Save RL agent
            self.agent.save(os.path.join(self.model_save_path, 'rl_agent'))

            # Save training history
            history_path = os.path.join(self.model_save_path, 'training_history.json')
            with open(history_path, 'w') as f:
                json.dump(self.learning_history, f, indent=2)

            # Save performance stats
            stats_path = os.path.join(self.model_save_path, 'performance_stats.json')
            with open(stats_path, 'w') as f:
                json.dump(self.get_training_stats(), f, indent=2)

            logger.info(f"Saved RL model and training data to {self.model_save_path}")

        except Exception as e:
            logger.error(f"Error saving model: {e}")

    def load_model(self):
        """Load a previously saved model"""
        try:
            model_path = os.path.join(self.model_save_path, 'rl_agent')
            if os.path.exists(f"{model_path}_policy_model.pt"):
                self.agent.load(model_path)
                logger.info(f"Loaded RL model from {model_path}")

                # Load training history if available
                history_path = os.path.join(self.model_save_path, 'training_history.json')
                if os.path.exists(history_path):
                    with open(history_path, 'r') as f:
                        self.learning_history = json.load(f)

                return True
            else:
                logger.info("No saved model found, starting with fresh model")
                return False

        except Exception as e:
            logger.error(f"Error loading model: {e}")
            return False

    def enable_training(self, enabled: bool = True):
        """Enable or disable training"""
        self.training_enabled = enabled
logger.info(f"RL training {'enabled' if enabled else 'disabled'}") def reset_performance_stats(self): """Reset performance tracking statistics""" self.trade_count = 0 self.win_count = 0 self.recent_rewards.clear() logger.info("Reset RL performance statistics")