import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List
import os
import sys
import logging
import torch.nn.functional as F
import time

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

# Configure logger
logger = logging.getLogger(__name__)

class DQNAgent:
    """
    Deep Q-Network agent for trading.

    Uses the Enhanced CNN model as the base network, with GPU support for
    improved performance.
    """

    def __init__(self,
                 state_shape: Tuple[int, ...],
                 n_actions: int = 2,
                 learning_rate: float = 0.001,
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.01,
                 epsilon_decay: float = 0.995,
                 buffer_size: int = 10000,
                 batch_size: int = 32,
                 target_update: int = 100,
                 priority_memory: bool = True,
                 device=None):

        # Extract state dimensions
        if isinstance(state_shape, tuple) and len(state_shape) > 1:
            # Multi-dimensional state (like image or sequence)
            self.state_dim = state_shape
        else:
            # 1D state
            if isinstance(state_shape, tuple):
                self.state_dim = state_shape[0]
            else:
                self.state_dim = state_shape

        # Store parameters
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.gamma = 0.99  # Discount factor for TD targets (used in replay; assumed default, not set elsewhere)
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.target_update = target_update

        # Set device for computation (default to GPU if available)
        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device

        # Initialize models with Enhanced CNN architecture for better performance
        from NN.models.enhanced_cnn import EnhancedCNN

        # Use Enhanced CNN for both policy and target networks
        self.policy_net = EnhancedCNN(self.state_dim, self.n_actions)
        self.target_net = EnhancedCNN(self.state_dim, self.n_actions)

        # Initialize the target network with the same weights as the policy network
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Set models to eval mode (important for batch norm, dropout)
        self.target_net.eval()

        # Optimization components
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
        self.criterion = nn.MSELoss()

        # Experience replay memory
        self.memory = []
        self.positive_memory = []  # Special memory for storing good experiences
        self.update_count = 0

        # Extrema detection tracking
        self.last_extrema_pred = {
            'class': 2,  # Default to "neither" (not extrema)
            'confidence': 0.0,
            'raw': None
        }
        self.extrema_memory = []  # Special memory for storing extrema points

        # Price prediction tracking
        self.last_price_pred = {
            'immediate': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            },
            'midterm': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            },
            'longterm': {
                'direction': 1,  # Default to "sideways"
                'confidence': 0.0,
                'change': 0.0
            }
        }

        # Store separate memory for price direction examples
        self.price_movement_memory = []  # For storing examples of clear price movements

        # Performance tracking
        self.losses = []
        self.avg_reward = 0.0
        self.best_reward = -float('inf')
        self.no_improvement_count = 0

        # Confidence tracking
        self.confidence_history = []
        self.avg_confidence = 0.0
        self.max_confidence = 0.0
        self.min_confidence = 1.0

        # Enhanced features from EnhancedDQNAgent
        # Market adaptation capabilities
        self.market_regime_weights = {
            'trending': 1.2,  # Higher confidence in trending markets
            'ranging': 0.8,   # Lower confidence in ranging markets
            'volatile': 0.6   # Much lower confidence in volatile markets
        }

        # Dueling network support (requires enhanced network architecture)
        self.use_dueling = True

        # Prioritized experience replay parameters
        self.use_prioritized_replay = priority_memory
        self.alpha = 0.6  # Priority exponent
        self.beta = 0.4   # Importance sampling exponent
        self.beta_increment = 0.001

        # Double DQN support
        self.use_double_dqn = True

        # Enhanced training features from EnhancedDQNAgent
        self.target_update_freq = target_update  # More descriptive name
        self.training_steps = 0
        self.gradient_clip_norm = 1.0  # Gradient clipping

        # Enhanced statistics tracking
        self.epsilon_history = []
        self.td_errors = []  # Track TD errors for analysis

        # Trade action fee and confidence thresholds
        self.trade_action_fee = 0.0005  # Small fee to discourage unnecessary trading
        self.minimum_action_confidence = 0.3  # Minimum confidence to consider trading (lowered from 0.5)
        self.recent_actions = deque(maxlen=10)
        self.recent_prices = deque(maxlen=20)
        self.recent_rewards = deque(maxlen=100)

        # Violent move detection
        self.price_history = []
        self.volatility_window = 20  # Window size for volatility calculation
        self.volatility_threshold = 0.0015  # Threshold for considering a move "violent"
        self.post_violent_move = False  # Flag for recent violent move
        self.violent_move_cooldown = 0  # Cooldown after violent move

        # Feature integration
        self.last_hidden_features = None  # Store last extracted features
        self.feature_history = []  # Store history of features for analysis

        # Real-time tick features integration
        self.realtime_tick_features = None  # Latest tick features from tick processor
        self.tick_feature_weight = 0.3  # Weight for tick features in decision making

        # Check if mixed precision training should be used
        self.use_mixed_precision = False
        if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
            self.use_mixed_precision = True
            self.scaler = torch.cuda.amp.GradScaler()
            logger.info("Mixed precision training enabled")
        else:
            logger.info("Mixed precision training disabled")

        # Track if we're in training mode
        self.training = True

        # For compatibility with old code
        self.state_size = np.prod(state_shape)
        self.action_size = n_actions
        self.memory_size = buffer_size
        self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3]  # Default timeframes

        logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}")
        logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}")
        logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}")

        # Log model parameters
        total_params = sum(p.numel() for p in self.policy_net.parameters())
        logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters")

        # Position management for 2-action system
        self.current_position = 0.0  # -1 (short), 0 (neutral), 1 (long)
        self.position_entry_price = 0.0
        self.position_entry_time = None

        # Different thresholds for entry vs exit decisions
        self.entry_confidence_threshold = 0.7  # High threshold for new positions
        self.exit_confidence_threshold = 0.3   # Lower threshold for closing positions
        self.uncertainty_threshold = 0.1       # When to stay neutral

    def move_models_to_device(self, device=None):
        """Move models to the specified device (GPU/CPU)"""
        if device is not None:
            self.device = device

        try:
            self.policy_net = self.policy_net.to(self.device)
            self.target_net = self.target_net.to(self.device)
            logger.info(f"Moved models to {self.device}")
            return True
        except Exception as e:
            logger.error(f"Failed to move models to {self.device}: {str(e)}")
            return False

    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Try to extract price change to analyze the experience
        try:
            # Extract price feature from sequence data (if available)
            if len(state.shape) > 1:  # 2D state [timeframes, features]
                current_price = state[-1, -1]  # Last timeframe, last feature
                next_price = next_state[-1, -1]
            else:  # 1D state
                current_price = state[-1]  # Last feature
                next_price = next_state[-1]

            # Calculate price change - avoid division by zero
            if np.isscalar(current_price) and current_price != 0:
                price_change = (next_price - current_price) / current_price
            elif isinstance(current_price, np.ndarray):
                # Handle array case - protect against division by zero
                with np.errstate(divide='ignore', invalid='ignore'):
                    price_change = (next_price - current_price) / current_price
                # Replace infinities and NaNs with zeros
                if isinstance(price_change, np.ndarray):
                    price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0)
                else:
                    price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change
            else:
                price_change = 0.0

            # Check if this is a significant price movement
            if abs(price_change) > 0.002:  # Significant price change
                # Store in price movement memory
                self.price_movement_memory.append(experience)

                # Log significant price movements
                direction = "UP" if price_change > 0 else "DOWN"
                logger.info(f"Stored significant {direction} price movement: {price_change:.4f}")

                # For clear price movements, also duplicate in main memory to learn more
                if abs(price_change) > 0.005:  # Very significant movement
                    for _ in range(2):  # Add 2 extra copies
                        self.memory.append(experience)
        except Exception as e:
            # Skip price movement analysis if it fails
            pass

        # Check if this is an extrema point based on our extrema detection head
        if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
            # Class 0 = bottom, 1 = top, 2 = neither
            # Only consider high confidence predictions
            if self.last_extrema_pred['confidence'] > 0.7:
                self.extrema_memory.append(experience)

                # Log this special experience
                extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
                logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")

                # For tops and bottoms, also duplicate the experience in memory to learn more from it
                for _ in range(2):  # Add 2 extra copies
                    self.memory.append(experience)
        # Explicitly marked extrema points also go to extrema memory
        elif is_extrema:
            self.extrema_memory.append(experience)

        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)

            # For very good rewards, duplicate to learn more from them
            if reward > 0.1:
                for _ in range(min(int(reward * 10), 5)):  # Cap at 5 extra copies for very high rewards
                    self.positive_memory.append(experience)

        # Keep memory size under control
        if len(self.memory) > self.buffer_size:
            # Keep more recent experiences
            self.memory = self.memory[-self.buffer_size:]

        # Keep specialized memories under control too
        if len(self.positive_memory) > self.buffer_size // 4:
            self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
        if len(self.extrema_memory) > self.buffer_size // 4:
            self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
        if len(self.price_movement_memory) > self.buffer_size // 4:
            self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):]

    def act(self, state: np.ndarray, explore=True, current_price=None, market_context=None) -> int:
        """
        Choose action based on current state using the 2-action system with
        intelligent position management.

        Args:
            state: Current market state
            explore: Whether to use epsilon-greedy exploration
            current_price: Current market price for position management
            market_context: Additional market context for decision making

        Returns:
            int: Action (0=SELL, 1=BUY), or None to hold the current position
        """
        # Convert state to tensor
        if isinstance(state, np.ndarray):
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        else:
            state_tensor = state.unsqueeze(0).to(self.device)

        # Get Q-values (the Enhanced CNN also returns auxiliary heads; only Q-values are needed here)
        with torch.no_grad():
            q_values, _, _, _, _ = self.policy_net(state_tensor)
        action_values = q_values.cpu().data.numpy()[0]

        # Calculate confidence scores
        sell_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
        buy_confidence = torch.softmax(q_values, dim=1)[0, 1].item()

        # Determine action based on current position and confidence thresholds
        action = self._determine_action_with_position_management(
            sell_confidence, buy_confidence, current_price, market_context, explore
        )

        # Update tracking
        if current_price:
            self.recent_prices.append(current_price)

        if action is not None:
            self.recent_actions.append(action)
            return action
        else:
            # Return None to indicate HOLD (don't change position)
            return None

    def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float]:
        """Choose action with confidence score adapted to market regime (from Enhanced DQN)"""
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values, _, _, _, _ = self.policy_net(state_tensor)

            # Convert Q-values to probabilities
            action_probs = torch.softmax(q_values, dim=1)
            action = q_values.argmax().item()
            base_confidence = action_probs[0, action].item()

            # Adapt confidence based on market regime
            regime_weight = self.market_regime_weights.get(market_regime, 1.0)
            adapted_confidence = min(base_confidence * regime_weight, 1.0)

        return action, adapted_confidence

    def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
        """
        Determine action based on current position and confidence thresholds.

        This implements intelligent position management where:
        - When neutral: a high-confidence signal is required to enter a position
        - When in a position: a lower-confidence signal is enough to exit
        - Different thresholds are used for entry vs exit
        """
        # Apply epsilon-greedy exploration
        if explore and np.random.random() <= self.epsilon:
            return np.random.choice([0, 1])

        # Get the dominant signal
        dominant_action = 0 if sell_conf > buy_conf else 1
        dominant_confidence = max(sell_conf, buy_conf)

        # Decision logic based on current position
        if self.current_position == 0:  # No position - need high confidence to enter
            if dominant_confidence >= self.entry_confidence_threshold:
                # Strong enough signal to enter position
                if dominant_action == 1:  # BUY signal
                    self.current_position = 1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 1
                else:  # SELL signal
                    self.current_position = -1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 0
            else:
                # Not confident enough to enter position
                return None

        elif self.current_position > 0:  # Long position
            if dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
                # SELL signal with enough confidence to close long position
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 0
            elif dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong SELL signal - close long and enter short
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = -1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 0
            else:
                # Hold the long position
                return None

        elif self.current_position < 0:  # Short position
            if dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
                # BUY signal with enough confidence to close short position
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 1
            elif dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong BUY signal - close short and enter long
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 1
            else:
                # Hold the short position
                return None

        return None

    def replay(self, experiences=None):
        """Train the model using experiences from memory"""
        # Don't train if not in training mode
        if not self.training:
            return 0.0

        # If no experiences provided, sample from memory
        if experiences is None:
            # Skip if memory is too small
            if len(self.memory) < self.batch_size:
                return 0.0

            # Sample random mini-batch from memory
            indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
            experiences = [self.memory[i] for i in indices]

        # Choose appropriate replay method
        if self.use_mixed_precision:
            # Convert experiences to tensors for mixed precision
            states = torch.FloatTensor(np.array([e[0] for e in experiences])).to(self.device)
            actions = torch.LongTensor(np.array([e[1] for e in experiences])).to(self.device)
            rewards = torch.FloatTensor(np.array([e[2] for e in experiences])).to(self.device)
            next_states = torch.FloatTensor(np.array([e[3] for e in experiences])).to(self.device)
            dones = torch.FloatTensor(np.array([e[4] for e in experiences])).to(self.device)

            # Use mixed precision replay
            loss = self._replay_mixed_precision(states, actions, rewards, next_states, dones)
        else:
            # Pass experiences directly to standard replay method
            loss = self._replay_standard(experiences)

        # Store loss for monitoring
        self.losses.append(loss)

        # Track and decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        # Randomly decide if we should train on extrema points from special memory
        if random.random() < 0.3 and len(self.extrema_memory) >= self.batch_size:
            # Train specifically on extrema memory examples
            extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False)
            extrema_batch = [self.extrema_memory[i] for i in extrema_indices]

            # Extract tensors from extrema batch
            extrema_states = torch.FloatTensor(np.array([e[0] for e in extrema_batch])).to(self.device)
            extrema_actions = torch.LongTensor(np.array([e[1] for e in extrema_batch])).to(self.device)
            extrema_rewards = torch.FloatTensor(np.array([e[2] for e in extrema_batch])).to(self.device)
            extrema_next_states = torch.FloatTensor(np.array([e[3] for e in extrema_batch])).to(self.device)
            extrema_dones = torch.FloatTensor(np.array([e[4] for e in extrema_batch])).to(self.device)

            # Use a slightly reduced learning rate for extrema training
            old_lr = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = old_lr * 0.8

            # Train on extrema memory
            if self.use_mixed_precision:
                extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones)
            else:
                extrema_loss = self._replay_standard(extrema_batch)

            # Reset learning rate
            self.optimizer.param_groups[0]['lr'] = old_lr

            # Log extrema loss
            logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}")

        # Randomly train on price movement examples (similar to extrema)
        if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size:
            # Train specifically on price movement memory examples
            price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False)
            price_batch = [self.price_movement_memory[i] for i in price_indices]

            # Extract tensors from price movement batch
            price_states = torch.FloatTensor(np.array([e[0] for e in price_batch])).to(self.device)
            price_actions = torch.LongTensor(np.array([e[1] for e in price_batch])).to(self.device)
            price_rewards = torch.FloatTensor(np.array([e[2] for e in price_batch])).to(self.device)
            price_next_states = torch.FloatTensor(np.array([e[3] for e in price_batch])).to(self.device)
            price_dones = torch.FloatTensor(np.array([e[4] for e in price_batch])).to(self.device)

            # Use a slightly reduced learning rate for price movement training
            old_lr = self.optimizer.param_groups[0]['lr']
            self.optimizer.param_groups[0]['lr'] = old_lr * 0.75

            # Train on price movement memory
            if self.use_mixed_precision:
                price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones)
            else:
                price_loss = self._replay_standard(price_batch)

            # Reset learning rate
            self.optimizer.param_groups[0]['lr'] = old_lr

            # Log price movement loss
            logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}")

        return loss

    def _replay_standard(self, experiences=None):
        """Standard training step without mixed precision"""
        try:
            # Use experiences if provided, otherwise sample from memory
            if experiences is None:
                # If memory is too small, skip training
                if len(self.memory) < self.batch_size:
                    return 0.0

                # Sample random mini-batch from memory
                indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
                batch = [self.memory[i] for i in indices]
                experiences = batch

            # Unpack experiences
            states, actions, rewards, next_states, dones = zip(*experiences)

            # Convert to PyTorch tensors
            states = torch.FloatTensor(np.array(states)).to(self.device)
            actions = torch.LongTensor(np.array(actions)).to(self.device)
            rewards = torch.FloatTensor(np.array(rewards)).to(self.device)
            next_states = torch.FloatTensor(np.array(next_states)).to(self.device)
            dones = torch.FloatTensor(np.array(dones)).to(self.device)

            # Get current Q values
            current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states)
            current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

            # Enhanced Double DQN implementation
            with torch.no_grad():
                if self.use_double_dqn:
                    # Double DQN: Use policy network to select actions, target network to evaluate
                    policy_q_values, _, _, _, _ = self.policy_net(next_states)
                    next_actions = policy_q_values.argmax(1)
                    target_q_values_all, _, _, _, _ = self.target_net(next_states)
                    next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
                else:
                    # Standard DQN: Use target network for both selection and evaluation
                    next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
                    next_q_values = next_q_values.max(1)[0]

            # Check for dimension mismatch between rewards and next_q_values
            if rewards.shape[0] != next_q_values.shape[0]:
                logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                # Use the smaller size to prevent index error
                min_size = min(rewards.shape[0], next_q_values.shape[0])
                rewards = rewards[:min_size]
                dones = dones[:min_size]
                next_q_values = next_q_values[:min_size]
                current_q_values = current_q_values[:min_size]

            # Calculate target Q values
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

            # Compute loss for Q value
            q_loss = self.criterion(current_q_values, target_q_values)

            # Try to compute extrema loss if possible
            try:
                # Get the target classes from extrema predictions
                extrema_targets = torch.argmax(current_extrema_pred, dim=1).long()

                # Compute extrema loss using cross-entropy - this is an auxiliary task
                extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets)

                # Combined loss with emphasis on Q-learning
                total_loss = q_loss + 0.1 * extrema_loss
            except Exception as e:
                logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
                total_loss = q_loss

            # Reset gradients
            self.optimizer.zero_grad()

            # Backward pass
            total_loss.backward()

            # Enhanced gradient clipping with configurable norm
            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.gradient_clip_norm)

            # Update weights
            self.optimizer.step()

            # Enhanced target network update tracking
            self.training_steps += 1
            if self.training_steps % self.target_update_freq == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
                logger.debug(f"Target network updated at step {self.training_steps}")

            # Enhanced statistics tracking
            self.epsilon_history.append(self.epsilon)

            # Calculate and store TD error for analysis
            with torch.no_grad():
                td_error = torch.abs(current_q_values - target_q_values).mean().item()
                self.td_errors.append(td_error)

            # Return loss
            return total_loss.item()

        except Exception as e:
            logger.error(f"Error in replay standard: {str(e)}")
            import traceback
            logger.error(traceback.format_exc())
            return 0.0

    def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
        """Mixed precision training step for better GPU performance"""
        # Check if mixed precision should be explicitly disabled
        if 'DISABLE_MIXED_PRECISION' in os.environ:
            logger.info("Mixed precision explicitly disabled by environment variable")
            # _replay_standard expects a list of experience tuples, so convert the tensors back
            return self._replay_standard(list(zip(
                states.cpu().numpy(), actions.cpu().numpy(), rewards.cpu().numpy(),
                next_states.cpu().numpy(), dones.cpu().numpy()
            )))

        try:
            # Zero gradients
            self.optimizer.zero_grad()

            # Forward pass with amp autocasting
            with torch.cuda.amp.autocast():
                # Get current Q values and extrema predictions
                current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states)
                current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

                # Get next Q values from target network
                with torch.no_grad():
                    next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states)
                    next_q_values = next_q_values.max(1)[0]

                # Check for dimension mismatch and fix it
                if rewards.shape[0] != next_q_values.shape[0]:
                    # Log the shape mismatch for debugging
                    logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                    # Use the smaller size to prevent index errors
                    min_size = min(rewards.shape[0], next_q_values.shape[0])
                    rewards = rewards[:min_size]
                    dones = dones[:min_size]
                    next_q_values = next_q_values[:min_size]
                    current_q_values = current_q_values[:min_size]

                target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

                # Compute Q-value loss (primary task)
                q_loss = nn.MSELoss()(current_q_values, target_q_values)

                # Initialize loss with q_loss
                loss = q_loss

                # Try to extract price from current and next states
                try:
                    # Extract price feature from sequence data (if available)
                    if len(states.shape) == 3:  # [batch, seq, features]
                        current_prices = states[:, -1, -1]  # Last timestep, last feature
                        next_prices = next_states[:, -1, -1]
                    else:  # [batch, features]
                        current_prices = states[:, -1]  # Last feature
                        next_prices = next_states[:, -1]

                    # Calculate price change for different timeframes
                    immediate_changes = (next_prices - current_prices) / current_prices

                    # Get the actual batch size for this calculation
                    actual_batch_size = states.shape[0]

                    # Create price direction labels - simplified for training
                    # 0 = down, 1 = sideways, 2 = up
                    immediate_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1  # Default: sideways
                    midterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1
                    longterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1

                    # Immediate term direction (1s, 1m)
                    immediate_up = (immediate_changes > 0.0005)
                    immediate_down = (immediate_changes < -0.0005)
                    immediate_labels[immediate_up] = 2  # Up
                    immediate_labels[immediate_down] = 0  # Down

                    # For mid and long term, we can only approximate during training
                    # In a real system, we'd need historical data to validate these
                    # Here we'll use the immediate term with increasing thresholds as approximation

                    # Mid-term (1h) - use slightly higher threshold
                    midterm_up = (immediate_changes > 0.001)
                    midterm_down = (immediate_changes < -0.001)
                    midterm_labels[midterm_up] = 2  # Up
                    midterm_labels[midterm_down] = 0  # Down

                    # Long-term (1d) - use even higher threshold
                    longterm_up = (immediate_changes > 0.002)
                    longterm_down = (immediate_changes < -0.002)
                    longterm_labels[longterm_up] = 2  # Up
                    longterm_labels[longterm_down] = 0  # Down

                    # Generate target values for price change regression
                    # For simplicity, we'll use the immediate change and scaled versions for longer timeframes
                    price_value_targets = torch.zeros((actual_batch_size, 4), device=self.device)
                    price_value_targets[:, 0] = immediate_changes
                    price_value_targets[:, 1] = immediate_changes * 2.0  # Approximate 1h change
                    price_value_targets[:, 2] = immediate_changes * 4.0  # Approximate 1d change
                    price_value_targets[:, 3] = immediate_changes * 6.0  # Approximate 1w change

                    # Calculate loss for price direction prediction (classification)
                    if len(current_price_pred['immediate'].shape) > 1 and current_price_pred['immediate'].shape[0] >= actual_batch_size:
                        # Slice predictions to match the adjusted batch size
                        immediate_pred = current_price_pred['immediate'][:actual_batch_size]
                        midterm_pred = current_price_pred['midterm'][:actual_batch_size]
                        longterm_pred = current_price_pred['longterm'][:actual_batch_size]
                        price_values_pred = current_price_pred['values'][:actual_batch_size]

                        # Compute losses for each task
                        immediate_loss = nn.CrossEntropyLoss()(immediate_pred, immediate_labels)
                        midterm_loss = nn.CrossEntropyLoss()(midterm_pred, midterm_labels)
                        longterm_loss = nn.CrossEntropyLoss()(longterm_pred, longterm_labels)

                        # MSE loss for price value regression
                        price_value_loss = nn.MSELoss()(price_values_pred, price_value_targets)

                        # Combine all price prediction losses
                        price_loss = immediate_loss + 0.7 * midterm_loss + 0.5 * longterm_loss + 0.3 * price_value_loss

                        # Create extrema labels (same as before)
                        extrema_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 2  # Default: neither

                        # Identify potential bottoms (significant negative change)
                        bottoms = (immediate_changes < -0.003)
                        extrema_labels[bottoms] = 0

                        # Identify potential tops (significant positive change)
                        tops = (immediate_changes > 0.003)
                        extrema_labels[tops] = 1

                        # Calculate extrema prediction loss
                        if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= actual_batch_size:
                            current_extrema_pred = current_extrema_pred[:actual_batch_size]
                            extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)

                            # Combined loss with all components
                            # Primary task: Q-value learning (RL objective)
                            # Secondary tasks: extrema detection and price prediction (supervised objectives)
                            loss = q_loss + 0.3 * extrema_loss + 0.3 * price_loss

                            # Log loss components occasionally
                            if random.random() < 0.01:  # Log 1% of the time
                                logger.info(
                                    f"Mixed precision losses: Q-loss={q_loss.item():.4f}, "
                                    f"Extrema-loss={extrema_loss.item():.4f}, "
                                    f"Price-loss={price_loss.item():.4f}"
                                )
                except Exception as e:
                    # Fallback if price extraction fails
                    logger.warning(f"Failed to calculate price prediction loss: {str(e)}. Using only Q-value loss.")
                    # Just use Q-value loss
                    loss = q_loss

            # Backward pass with scaled gradients
            self.scaler.scale(loss).backward()

            # Gradient clipping on scaled gradients
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), self.gradient_clip_norm)

            # Update with scaler
            self.scaler.step(self.optimizer)
            self.scaler.update()

            # Update target network if needed
            self.update_count += 1
            if self.update_count % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())

            # Track and decay epsilon
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

            return loss.item()

        except Exception as e:
            logger.error(f"Error in mixed precision training: {str(e)}")
            logger.warning("Falling back to standard precision training")
            # Fall back to standard training (convert tensors back to an experience list)
            experiences = list(zip(
                states.cpu().numpy(), actions.cpu().numpy(), rewards.cpu().numpy(),
                next_states.cpu().numpy(), dones.cpu().numpy()
            ))
            return self._replay_standard(experiences)

    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training function specifically for extrema points

        Args:
            states: Batch of states at extrema points
            actions: Batch of actions
            rewards: Batch of rewards
            next_states: Batch of next states
            dones: Batch of done flags

        Returns:
            float: Training loss
        """
        # Convert to numpy arrays if not already
        if not isinstance(states, np.ndarray):
            states = np.array(states)
        if not isinstance(actions, np.ndarray):
            actions = np.array(actions)
        if not isinstance(rewards, np.ndarray):
            rewards = np.array(rewards)
        if not isinstance(next_states, np.ndarray):
            next_states = np.array(next_states)
        if not isinstance(dones, np.ndarray):
            dones = np.array(dones, dtype=np.float32)

        # Normalize states
        states = np.vstack([self._normalize_state(s) for s in states])
        next_states = np.vstack([self._normalize_state(s) for s in next_states])

        # Convert to torch tensors and move to device
        states_tensor = torch.FloatTensor(states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Choose training method based on precision mode
        if self.use_mixed_precision:
            return self._replay_mixed_precision(
                states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor
            )
        else:
            # _replay_standard expects a list of experience tuples, so zip the arrays back together
            return self._replay_standard(
                list(zip(states, actions, rewards, next_states, dones))
            )

    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
        # Handle NaN and infinite values
        state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)

        # Check if state is 1D array (happens in some environments)
        if len(state.shape) == 1:
            # If 1D, we need to normalize the whole array
            normalized_state = state.copy()

            # Convert any timestamp or non-numeric data to float
            for i in range(len(normalized_state)):
                # Check for timestamp-like objects
                if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i] = float(normalized_state[i].timestamp())
                elif not isinstance(normalized_state[i], (int, float, np.number)):
                    # Set non-numeric data to 0
                    normalized_state[i] = 0.0

            # Ensure all values are float
            normalized_state = normalized_state.astype(np.float32)

            # Simple min-max normalization for 1D state
            state_min = np.min(normalized_state)
            state_max = np.max(normalized_state)
            if state_max > state_min:
                normalized_state = (normalized_state - state_min) / (state_max - state_min)

            return normalized_state

        # Handle 2D arrays
        normalized_state = np.zeros_like(state, dtype=np.float32)

        # Convert any timestamp or non-numeric data to float
        for i in range(state.shape[0]):
            for j in range(state.shape[1]):
                if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i, j] = float(state[i, j].timestamp())
                elif isinstance(state[i, j], (int, float, np.number)):
                    normalized_state[i, j] = state[i, j]
                else:
                    # Set non-numeric data to 0
                    normalized_state[i, j] = 0.0

        # Loop through each timeframe's features in the combined state
        feature_count = state.shape[1] // len(self.timeframes)

        for tf_idx in range(len(self.timeframes)):
            start_idx = tf_idx * feature_count
            end_idx = start_idx + feature_count

            # Extract this timeframe's features
            tf_features = normalized_state[:, start_idx:end_idx]

            # Normalize OHLC data by a reference close price (mean close in the window)
            # This makes price movements relative rather than absolute
            price_idx = 3  # Assuming close price is at index 3
            if price_idx < tf_features.shape[1]:
                reference_price = np.mean(tf_features[:, price_idx])
                if reference_price != 0:
                    # Normalize price-related columns (OHLC)
                    for i in range(4):  # First 4 columns are OHLC
                        if i < tf_features.shape[1]:
                            normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price

            # Normalize volume using mean and std
            vol_idx = 4  # Assuming volume is at index 4
            if vol_idx < tf_features.shape[1]:
                vol_mean = np.mean(tf_features[:, vol_idx])
                vol_std = np.std(tf_features[:, vol_idx])
                if vol_std > 0:
                    normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
                else:
                    normalized_state[:, start_idx + vol_idx] = 0

            # Other features (technical indicators) - normalize with min-max scaling
            for i in range(5, feature_count):
                if i < tf_features.shape[1]:
                    feature_min = np.min(tf_features[:, i])
                    feature_max = np.max(tf_features[:, i])
                    if feature_max > feature_min:
                        normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
                    else:
                        normalized_state[:, start_idx + i] = 0

        return normalized_state

    def update_realtime_tick_features(self, tick_features):
        """Update with real-time tick features from tick processor"""
        try:
            if tick_features is not None:
                self.realtime_tick_features = tick_features

                # Log high-confidence tick features
                if tick_features.get('confidence', 0) > 0.8:
                    logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}")
        except Exception as e:
            logger.error(f"Error updating real-time tick features: {e}")

    def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray:
        """Enhance state with real-time tick features if available"""
        try:
            if self.realtime_tick_features is None:
                return state

            # Extract neural features from tick processor
            neural_features = self.realtime_tick_features.get('neural_features', np.array([]))
            volume_features = self.realtime_tick_features.get('volume_features', np.array([]))
            microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([]))
            confidence = self.realtime_tick_features.get('confidence', 0.0)

            # Combine tick features - make them compact to match state dimensions
            tick_features = np.concatenate([
                neural_features[:3] if len(neural_features) >= 3 else np.zeros(3),                            # Take first 3 neural features
                volume_features[:1] if len(volume_features) >= 1 else np.zeros(1),                            # Take first volume feature
                microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1),            # Take first microstructure feature
            ])

            # Weight the tick features
            weighted_tick_features = tick_features * self.tick_feature_weight

            # Enhance the state by adding tick features to each timeframe
            if len(state.shape) == 1:
                # 1D state - append tick features
                enhanced_state = np.concatenate([state, weighted_tick_features])
            else:
                # 2D state - add tick features to each timeframe row
                num_timeframes, num_features = state.shape

                # Ensure tick features match the number of original features
                if len(weighted_tick_features) != num_features:
                    # Pad or truncate tick features to match state feature dimension
                    if len(weighted_tick_features) < num_features:
                        # Pad with zeros
                        padded_features = np.zeros(num_features)
                        padded_features[:len(weighted_tick_features)] = weighted_tick_features
                        weighted_tick_features = padded_features
                    else:
                        # Truncate to match
                        weighted_tick_features = weighted_tick_features[:num_features]

                # Add tick features to the last row (most recent timeframe)
                enhanced_state = state.copy()
                enhanced_state[-1, :] += weighted_tick_features  # Add to last timeframe

            return enhanced_state

        except Exception as e:
            logger.error(f"Error enhancing state with tick features: {e}")
            return state

    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
        if self.avg_reward == 0:
            self.avg_reward = episode_reward
        else:
            self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward

        # Check if we're making sufficient progress
        if episode_reward > (1 + best_reward_threshold) * self.best_reward:
            self.best_reward = episode_reward
            self.no_improvement_count = 0
            return True  # Improved
        else:
            self.no_improvement_count += 1

            # If no improvement for a while, adjust learning rate
            if self.no_improvement_count >= 10:
                current_lr = self.optimizer.param_groups[0]['lr']
                new_lr = current_lr * 0.5
                if new_lr >= 1e-6:  # Don't reduce below minimum threshold
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
                self.no_improvement_count = 0

            return False  # No improvement

    def save(self, path: str):
        """Save model and agent state"""
        os.makedirs(os.path.dirname(path), exist_ok=True)

        # Save policy network
        self.policy_net.save(f"{path}_policy")

        # Save target network
        self.target_net.save(f"{path}_target")

        # Save agent state
        state = {
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
            'optimizer_state': self.optimizer.state_dict(),
            'best_reward': self.best_reward,
            'avg_reward': self.avg_reward
        }
        torch.save(state, f"{path}_agent_state.pt")
        logger.info(f"Agent state saved to {path}_agent_state.pt")

    def load(self, path: str):
        """Load model and agent state"""
        # Load policy network
        self.policy_net.load(f"{path}_policy")

        # Load target network
        self.target_net.load(f"{path}_target")

        # Load agent state
        try:
            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
            self.epsilon = agent_state['epsilon']
            self.update_count = agent_state['update_count']
            self.losses = agent_state['losses']
            self.optimizer.load_state_dict(agent_state['optimizer_state'])

            # Load additional metrics if they exist
            if 'best_reward' in agent_state:
                self.best_reward = agent_state['best_reward']
            if 'avg_reward' in agent_state:
                self.avg_reward = agent_state['avg_reward']
            logger.info(f"Agent state loaded from {path}_agent_state.pt")
        except FileNotFoundError:
            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")

    def get_position_info(self):
        """Get current position information"""
        return {
            'position': self.current_position,
            'entry_price': self.position_entry_price,
            'entry_time': self.position_entry_time,
            'entry_threshold': self.entry_confidence_threshold,
            'exit_threshold': self.exit_confidence_threshold
        }

    def get_enhanced_training_stats(self):
        """Get enhanced RL training statistics with detailed metrics (from EnhancedDQNAgent)"""
        return {
            'buffer_size': len(self.memory),
            'epsilon': self.epsilon,
            'avg_reward': self.avg_reward,
            'best_reward': self.best_reward,
            'recent_rewards': list(self.recent_rewards) if hasattr(self, 'recent_rewards') else [],
            'no_improvement_count': self.no_improvement_count,
            # Enhanced statistics from EnhancedDQNAgent
            'training_steps': self.training_steps,
            'avg_td_error': np.mean(self.td_errors[-100:]) if self.td_errors else 0.0,
            'recent_losses': self.losses[-10:] if self.losses else [],
            'epsilon_trend': self.epsilon_history[-20:] if self.epsilon_history else [],
            'specialized_buffers': {
                'extrema_memory': len(self.extrema_memory),
                'positive_memory': len(self.positive_memory),
                'price_movement_memory': len(self.price_movement_memory)
            },
            'market_regime_weights': self.market_regime_weights,
            'use_double_dqn': self.use_double_dqn,
            'use_prioritized_replay': self.use_prioritized_replay,
            'gradient_clip_norm': self.gradient_clip_norm,
            'target_update_frequency': self.target_update_freq
        }
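

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): exercises remember/replay/act with random
    # data to show the expected call pattern. The (3, 50) state shape is an assumption for
    # illustration; the real shape must match whatever EnhancedCNN expects in this project,
    # and NN.models.enhanced_cnn must be importable for this block to run.
    logging.basicConfig(level=logging.INFO)

    state_shape = (3, 50)  # Hypothetical: 3 timeframes x 50 features
    agent = DQNAgent(state_shape=state_shape, n_actions=2, batch_size=8)

    # Fill the replay buffer with random transitions
    for _ in range(32):
        s = np.random.randn(*state_shape).astype(np.float32)
        ns = np.random.randn(*state_shape).astype(np.float32)
        a = int(np.random.randint(0, 2))
        r = float(np.random.randn() * 0.01)
        agent.remember(s, a, r, ns, done=False)

    # One training step and one action selection
    loss = agent.replay()
    action = agent.act(np.random.randn(*state_shape).astype(np.float32),
                       explore=True, current_price=100.0)
    print(f"loss={loss:.6f}, action={action}, position={agent.get_position_info()}")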