import torch import torch.nn as nn import torch.optim as optim import numpy as np from collections import deque import random from typing import Tuple, List import os import sys import logging import torch.nn.functional as F # Add parent directory to path sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) # Configure logger logger = logging.getLogger(__name__) class DQNAgent: """ Deep Q-Network agent for trading Uses Enhanced CNN model as the base network with GPU support for improved performance """ def __init__(self, state_shape: Tuple[int, ...], n_actions: int, learning_rate: float = 0.0005, # Reduced learning rate for more stability gamma: float = 0.97, # Slightly reduced discount factor epsilon: float = 1.0, epsilon_min: float = 0.05, # Increased minimum epsilon for more exploration epsilon_decay: float = 0.9975, # Slower decay rate buffer_size: int = 20000, # Increased memory size batch_size: int = 128, # Larger batch size target_update: int = 5, # More frequent target updates device=None): # Device for computations # Extract state dimensions if isinstance(state_shape, tuple) and len(state_shape) > 1: # Multi-dimensional state (like image or sequence) self.state_dim = state_shape else: # 1D state if isinstance(state_shape, tuple): self.state_dim = state_shape[0] else: self.state_dim = state_shape # Store parameters self.n_actions = n_actions self.learning_rate = learning_rate self.gamma = gamma self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.epsilon_start = epsilon # Store initial epsilon value for resets/bumps self.buffer_size = buffer_size self.batch_size = batch_size self.target_update = target_update # Set device for computation (default to GPU if available) if device is None: self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') else: self.device = device # Initialize models with Enhanced CNN architecture for better performance from NN.models.enhanced_cnn import EnhancedCNN # Use Enhanced CNN for both policy and target networks self.policy_net = EnhancedCNN(self.state_dim, self.n_actions) self.target_net = EnhancedCNN(self.state_dim, self.n_actions) # Initialize the target network with the same weights as the policy network self.target_net.load_state_dict(self.policy_net.state_dict()) # Set models to eval mode (important for batch norm, dropout) self.target_net.eval() # Optimization components self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) self.criterion = nn.MSELoss() # Experience replay memory self.memory = [] self.positive_memory = [] # Special memory for storing good experiences self.update_count = 0 # Extrema detection tracking self.last_extrema_pred = { 'class': 2, # Default to "neither" (not extrema) 'confidence': 0.0, 'raw': None } self.extrema_memory = [] # Special memory for storing extrema points # Price prediction tracking self.last_price_pred = { 'immediate': { 'direction': 1, # Default to "sideways" 'confidence': 0.0, 'change': 0.0 }, 'midterm': { 'direction': 1, # Default to "sideways" 'confidence': 0.0, 'change': 0.0 }, 'longterm': { 'direction': 1, # Default to "sideways" 'confidence': 0.0, 'change': 0.0 } } # Store separate memory for price direction examples self.price_movement_memory = [] # For storing examples of clear price movements # Performance tracking self.losses = [] self.avg_reward = 0.0 self.best_reward = -float('inf') self.no_improvement_count = 0 # Confidence tracking self.confidence_history = [] self.avg_confidence = 0.0 self.max_confidence = 0.0 self.min_confidence = 1.0 # Trade action fee and confidence thresholds self.trade_action_fee = 0.0005 # Small fee to discourage unnecessary trading self.minimum_action_confidence = 0.3 # Minimum confidence to consider trading (lowered from 0.5) self.recent_actions = [] # Track recent actions to avoid oscillations # Violent move detection self.price_history = [] self.volatility_window = 20 # Window size for volatility calculation self.volatility_threshold = 0.0015 # Threshold for considering a move "violent" self.post_violent_move = False # Flag for recent violent move self.violent_move_cooldown = 0 # Cooldown after violent move # Feature integration self.last_hidden_features = None # Store last extracted features self.feature_history = [] # Store history of features for analysis # Real-time tick features integration self.realtime_tick_features = None # Latest tick features from tick processor self.tick_feature_weight = 0.3 # Weight for tick features in decision making # Check if mixed precision training should be used self.use_mixed_precision = False if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ: self.use_mixed_precision = True self.scaler = torch.cuda.amp.GradScaler() logger.info("Mixed precision training enabled") else: logger.info("Mixed precision training disabled") # Track if we're in training mode self.training = True # For compatibility with old code self.state_size = np.prod(state_shape) self.action_size = n_actions self.memory_size = buffer_size self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3] # Default timeframes logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}") logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}") logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}") # Log model parameters total_params = sum(p.numel() for p in self.policy_net.parameters()) logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters") def move_models_to_device(self, device=None): """Move models to the specified device (GPU/CPU)""" if device is not None: self.device = device try: self.policy_net = self.policy_net.to(self.device) self.target_net = self.target_net.to(self.device) logger.info(f"Moved models to {self.device}") return True except Exception as e: logger.error(f"Failed to move models to {self.device}: {str(e)}") return False def remember(self, state: np.ndarray, action: int, reward: float, next_state: np.ndarray, done: bool, is_extrema: bool = False): """ Store experience in memory with prioritization Args: state: Current state action: Action taken reward: Reward received next_state: Next state done: Whether episode is done is_extrema: Whether this is a local extrema sample (for specialized learning) """ experience = (state, action, reward, next_state, done) # Always add to main memory self.memory.append(experience) # Try to extract price change to analyze the experience try: # Extract price feature from sequence data (if available) if len(state.shape) > 1: # 2D state [timeframes, features] current_price = state[-1, -1] # Last timeframe, last feature next_price = next_state[-1, -1] else: # 1D state current_price = state[-1] # Last feature next_price = next_state[-1] # Calculate price change - avoid division by zero if np.isscalar(current_price) and current_price != 0: price_change = (next_price - current_price) / current_price elif isinstance(current_price, np.ndarray): # Handle array case - protect against division by zero with np.errstate(divide='ignore', invalid='ignore'): price_change = (next_price - current_price) / current_price # Replace infinities and NaNs with zeros if isinstance(price_change, np.ndarray): price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0) else: price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change else: price_change = 0.0 # Check if this is a significant price movement if abs(price_change) > 0.002: # Significant price change # Store in price movement memory self.price_movement_memory.append(experience) # Log significant price movements direction = "UP" if price_change > 0 else "DOWN" logger.info(f"Stored significant {direction} price movement: {price_change:.4f}") # For clear price movements, also duplicate in main memory to learn more if abs(price_change) > 0.005: # Very significant movement for _ in range(2): # Add 2 extra copies self.memory.append(experience) except Exception as e: # Skip price movement analysis if it fails pass # Check if this is an extrema point based on our extrema detection head if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2: # Class 0 = bottom, 1 = top, 2 = neither # Only consider high confidence predictions if self.last_extrema_pred['confidence'] > 0.7: self.extrema_memory.append(experience) # Log this special experience extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP" logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}") # For tops and bottoms, also duplicate the experience in memory to learn more from it for _ in range(2): # Add 2 extra copies self.memory.append(experience) # Explicitly marked extrema points also go to extrema memory elif is_extrema: self.extrema_memory.append(experience) # Store positive experiences separately for prioritized replay if reward > 0: self.positive_memory.append(experience) # For very good rewards, duplicate to learn more from them if reward > 0.1: for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards self.positive_memory.append(experience) # Keep memory size under control if len(self.memory) > self.buffer_size: # Keep more recent experiences self.memory = self.memory[-self.buffer_size:] # Keep specialized memories under control too if len(self.positive_memory) > self.buffer_size // 4: self.positive_memory = self.positive_memory[-(self.buffer_size // 4):] if len(self.extrema_memory) > self.buffer_size // 4: self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):] if len(self.price_movement_memory) > self.buffer_size // 4: self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):] def act(self, state: np.ndarray, explore=True) -> int: """Choose action using epsilon-greedy policy with explore flag""" if explore and random.random() < self.epsilon: return random.randrange(self.n_actions) with torch.no_grad(): # Enhance state with real-time tick features enhanced_state = self._enhance_state_with_tick_features(state) # Ensure state is normalized before inference state_tensor = self._normalize_state(enhanced_state) state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device) # Get predictions using the policy network self.policy_net.eval() # Set to evaluation mode for inference action_probs, extrema_pred, price_predictions, hidden_features, advanced_predictions = self.policy_net(state_tensor) self.policy_net.train() # Back to training mode # Store hidden features for integration self.last_hidden_features = hidden_features.cpu().numpy() # Track feature history (limited size) self.feature_history.append(hidden_features.cpu().numpy()) if len(self.feature_history) > 100: self.feature_history = self.feature_history[-100:] # Get the predicted extrema class (0=bottom, 1=top, 2=neither) extrema_class = extrema_pred.argmax(dim=1).item() extrema_confidence = torch.softmax(extrema_pred, dim=1)[0, extrema_class].item() # Log extrema prediction for significant signals if extrema_confidence > 0.7 and extrema_class != 2: # Only log strong top/bottom signals extrema_type = "BOTTOM" if extrema_class == 0 else "TOP" if extrema_class == 1 else "NEITHER" logger.info(f"High confidence {extrema_type} detected! Confidence: {extrema_confidence:.4f}") # Process price predictions price_immediate = torch.softmax(price_predictions['immediate'], dim=1) price_midterm = torch.softmax(price_predictions['midterm'], dim=1) price_longterm = torch.softmax(price_predictions['longterm'], dim=1) price_values = price_predictions['values'] # Get predicted direction for each timeframe (0=down, 1=sideways, 2=up) immediate_direction = price_immediate.argmax(dim=1).item() midterm_direction = price_midterm.argmax(dim=1).item() longterm_direction = price_longterm.argmax(dim=1).item() # Get confidence levels immediate_conf = price_immediate[0, immediate_direction].item() midterm_conf = price_midterm[0, midterm_direction].item() longterm_conf = price_longterm[0, longterm_direction].item() # Get predicted price change percentages price_changes = price_values[0].tolist() # Log significant price movement predictions timeframes = ["1s/1m", "1h", "1d", "1w"] directions = ["DOWN", "SIDEWAYS", "UP"] for i, (direction, conf) in enumerate([ (immediate_direction, immediate_conf), (midterm_direction, midterm_conf), (longterm_direction, longterm_conf) ]): if conf > 0.7 and direction != 1: # Only log high confidence non-sideways predictions logger.info(f"Price prediction: {timeframes[i]} -> {directions[direction]}, " f"Confidence: {conf:.4f}, Expected change: {price_changes[i]:.2f}%") # Store predictions for environment to use self.last_extrema_pred = { 'class': extrema_class, 'confidence': extrema_confidence, 'raw': extrema_pred.cpu().numpy() } self.last_price_pred = { 'immediate': { 'direction': immediate_direction, 'confidence': immediate_conf, 'change': price_changes[0] }, 'midterm': { 'direction': midterm_direction, 'confidence': midterm_conf, 'change': price_changes[1] }, 'longterm': { 'direction': longterm_direction, 'confidence': longterm_conf, 'change': price_changes[2] } } # Get the action with highest Q-value action = action_probs.argmax().item() # Calculate overall confidence in the action q_values_softmax = F.softmax(action_probs, dim=1)[0] action_confidence = q_values_softmax[action].item() # Track confidence metrics self.confidence_history.append(action_confidence) if len(self.confidence_history) > 100: self.confidence_history = self.confidence_history[-100:] # Update confidence metrics self.avg_confidence = sum(self.confidence_history) / len(self.confidence_history) self.max_confidence = max(self.max_confidence, action_confidence) self.min_confidence = min(self.min_confidence, action_confidence) # Log average confidence occasionally if random.random() < 0.01: # 1% of the time logger.info(f"Confidence metrics - Current: {action_confidence:.4f}, Avg: {self.avg_confidence:.4f}, " + f"Min: {self.min_confidence:.4f}, Max: {self.max_confidence:.4f}") # Track price for violent move detection try: # Extract current price from state (assuming it's in the last position) if len(state.shape) > 1: # For 2D state current_price = state[-1, -1] else: # For 1D state current_price = state[-1] self.price_history.append(current_price) if len(self.price_history) > self.volatility_window: self.price_history = self.price_history[-self.volatility_window:] # Detect violent price moves if we have enough price history if len(self.price_history) >= 5: # Calculate short-term volatility recent_prices = self.price_history[-5:] # Make sure we're working with scalar values, not arrays if isinstance(recent_prices[0], np.ndarray): # If prices are arrays, extract the last value (current price) recent_prices = [p[-1] if isinstance(p, np.ndarray) and p.size > 0 else p for p in recent_prices] # Calculate price changes with protection against division by zero price_changes = [] for i in range(1, len(recent_prices)): if recent_prices[i-1] != 0 and not np.isnan(recent_prices[i-1]) and not np.isnan(recent_prices[i]): change = (recent_prices[i] - recent_prices[i-1]) / recent_prices[i-1] price_changes.append(change) else: price_changes.append(0.0) # Calculate volatility as sum of absolute price changes volatility = sum([abs(change) for change in price_changes]) # Check if we've had a violent move if volatility > self.volatility_threshold: logger.info(f"Violent price move detected! Volatility: {volatility:.6f}") self.post_violent_move = True self.violent_move_cooldown = 10 # Set cooldown period # Handle post-violent move period if self.post_violent_move: if self.violent_move_cooldown > 0: self.violent_move_cooldown -= 1 # Increase confidence threshold temporarily after violent moves effective_threshold = self.minimum_action_confidence * 1.1 logger.info(f"Post-violent move period: {self.violent_move_cooldown} steps remaining. " + f"Using higher confidence threshold: {effective_threshold:.4f}") else: self.post_violent_move = False logger.info("Post-violent move period ended") except Exception as e: logger.warning(f"Error in violent move detection: {str(e)}") # Apply trade action fee to buy/sell actions but not to hold # This creates a threshold that must be exceeded to justify a trade action_values = action_probs.clone() # If BUY or SELL, apply fee by reducing the Q-value if action == 0 or action == 1: # BUY or SELL # Check if confidence is above minimum threshold effective_threshold = self.minimum_action_confidence if self.post_violent_move: effective_threshold *= 1.1 # Higher threshold after violent moves if action_confidence < effective_threshold: # If confidence is below threshold, force HOLD action logger.info(f"Action {action} confidence {action_confidence:.4f} below threshold {effective_threshold}, forcing HOLD") action = 2 # HOLD else: # Apply trade action fee to ensure we only trade when there's clear benefit fee_adjusted_action_values = action_values.clone() fee_adjusted_action_values[0, 0] -= self.trade_action_fee # Reduce BUY value fee_adjusted_action_values[0, 1] -= self.trade_action_fee # Reduce SELL value # Hold value remains unchanged # Re-determine the action based on fee-adjusted values fee_adjusted_action = fee_adjusted_action_values.argmax().item() # If the fee changes our decision, log this if fee_adjusted_action != action: logger.info(f"Trade action fee changed decision from {action} to {fee_adjusted_action}") action = fee_adjusted_action # Adjust action based on extrema and price predictions # Prioritize short-term movement for trading decisions if immediate_conf > 0.8: # Only adjust for strong signals if immediate_direction == 2: # UP prediction # Bias toward BUY for strong up predictions if action != 0 and action != 2 and random.random() < 0.3 * immediate_conf: logger.info(f"Adjusting action to BUY based on immediate UP prediction") action = 0 # BUY elif immediate_direction == 0: # DOWN prediction # Bias toward SELL for strong down predictions if action != 1 and action != 2 and random.random() < 0.3 * immediate_conf: logger.info(f"Adjusting action to SELL based on immediate DOWN prediction") action = 1 # SELL # Also consider extrema detection for action adjustment if extrema_confidence > 0.8: # Only adjust for strong signals if extrema_class == 0: # Bottom detected # Bias toward BUY at bottoms if action != 0 and action != 2 and random.random() < 0.3 * extrema_confidence: logger.info(f"Adjusting action to BUY based on bottom detection") action = 0 # BUY elif extrema_class == 1: # Top detected # Bias toward SELL at tops if action != 1 and action != 2 and random.random() < 0.3 * extrema_confidence: logger.info(f"Adjusting action to SELL based on top detection") action = 1 # SELL # Finally, avoid action oscillation by checking recent history if len(self.recent_actions) >= 2: last_action = self.recent_actions[-1] if action != last_action and action != 2 and last_action != 2: # We're switching between BUY and SELL too quickly # Only allow this if we have very high confidence if action_confidence < 0.85: logger.info(f"Preventing oscillation from {last_action} to {action}, forcing HOLD") action = 2 # HOLD # Update recent actions list self.recent_actions.append(action) if len(self.recent_actions) > 5: self.recent_actions = self.recent_actions[-5:] return action def replay(self, experiences=None): """Train the model using experiences from memory""" # Don't train if not in training mode if not self.training: return 0.0 # If no experiences provided, sample from memory if experiences is None: # Skip if memory is too small if len(self.memory) < self.batch_size: return 0.0 # Sample random mini-batch from memory indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False) experiences = [self.memory[i] for i in indices] # Choose appropriate replay method if self.use_mixed_precision: # Convert experiences to tensors for mixed precision states = torch.FloatTensor(np.array([e[0] for e in experiences])).to(self.device) actions = torch.LongTensor(np.array([e[1] for e in experiences])).to(self.device) rewards = torch.FloatTensor(np.array([e[2] for e in experiences])).to(self.device) next_states = torch.FloatTensor(np.array([e[3] for e in experiences])).to(self.device) dones = torch.FloatTensor(np.array([e[4] for e in experiences])).to(self.device) # Use mixed precision replay loss = self._replay_mixed_precision(states, actions, rewards, next_states, dones) else: # Pass experiences directly to standard replay method loss = self._replay_standard(experiences) # Store loss for monitoring self.losses.append(loss) # Track and decay epsilon self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay) # Randomly decide if we should train on extrema points from special memory if random.random() < 0.3 and len(self.extrema_memory) >= self.batch_size: # Train specifically on extrema memory examples extrema_indices = np.random.choice(len(self.extrema_memory), size=min(self.batch_size, len(self.extrema_memory)), replace=False) extrema_batch = [self.extrema_memory[i] for i in extrema_indices] # Extract tensors from extrema batch extrema_states = torch.FloatTensor(np.array([e[0] for e in extrema_batch])).to(self.device) extrema_actions = torch.LongTensor(np.array([e[1] for e in extrema_batch])).to(self.device) extrema_rewards = torch.FloatTensor(np.array([e[2] for e in extrema_batch])).to(self.device) extrema_next_states = torch.FloatTensor(np.array([e[3] for e in extrema_batch])).to(self.device) extrema_dones = torch.FloatTensor(np.array([e[4] for e in extrema_batch])).to(self.device) # Use a slightly reduced learning rate for extrema training old_lr = self.optimizer.param_groups[0]['lr'] self.optimizer.param_groups[0]['lr'] = old_lr * 0.8 # Train on extrema memory if self.use_mixed_precision: extrema_loss = self._replay_mixed_precision(extrema_states, extrema_actions, extrema_rewards, extrema_next_states, extrema_dones) else: extrema_loss = self._replay_standard(extrema_batch) # Reset learning rate self.optimizer.param_groups[0]['lr'] = old_lr # Log extrema loss logger.info(f"Extra training on extrema points, loss: {extrema_loss:.4f}") # Randomly train on price movement examples (similar to extrema) if random.random() < 0.3 and len(self.price_movement_memory) >= self.batch_size: # Train specifically on price movement memory examples price_indices = np.random.choice(len(self.price_movement_memory), size=min(self.batch_size, len(self.price_movement_memory)), replace=False) price_batch = [self.price_movement_memory[i] for i in price_indices] # Extract tensors from price movement batch price_states = torch.FloatTensor(np.array([e[0] for e in price_batch])).to(self.device) price_actions = torch.LongTensor(np.array([e[1] for e in price_batch])).to(self.device) price_rewards = torch.FloatTensor(np.array([e[2] for e in price_batch])).to(self.device) price_next_states = torch.FloatTensor(np.array([e[3] for e in price_batch])).to(self.device) price_dones = torch.FloatTensor(np.array([e[4] for e in price_batch])).to(self.device) # Use a slightly reduced learning rate for price movement training old_lr = self.optimizer.param_groups[0]['lr'] self.optimizer.param_groups[0]['lr'] = old_lr * 0.75 # Train on price movement memory if self.use_mixed_precision: price_loss = self._replay_mixed_precision(price_states, price_actions, price_rewards, price_next_states, price_dones) else: price_loss = self._replay_standard(price_batch) # Reset learning rate self.optimizer.param_groups[0]['lr'] = old_lr # Log price movement loss logger.info(f"Extra training on price movement examples, loss: {price_loss:.4f}") return loss def _replay_standard(self, experiences=None): """Standard training step without mixed precision""" try: # Use experiences if provided, otherwise sample from memory if experiences is None: # If memory is too small, skip training if len(self.memory) < self.batch_size: return 0.0 # Sample random mini-batch from memory indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False) batch = [self.memory[i] for i in indices] experiences = batch # Unpack experiences states, actions, rewards, next_states, dones = zip(*experiences) # Convert to PyTorch tensors states = torch.FloatTensor(np.array(states)).to(self.device) actions = torch.LongTensor(np.array(actions)).to(self.device) rewards = torch.FloatTensor(np.array(rewards)).to(self.device) next_states = torch.FloatTensor(np.array(next_states)).to(self.device) dones = torch.FloatTensor(np.array(dones)).to(self.device) # Get current Q values current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states) current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1) # Get next Q values with target network with torch.no_grad(): next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states) next_q_values = next_q_values.max(1)[0] # Check for dimension mismatch between rewards and next_q_values if rewards.shape[0] != next_q_values.shape[0]: logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}") # Use the smaller size to prevent index error min_size = min(rewards.shape[0], next_q_values.shape[0]) rewards = rewards[:min_size] dones = dones[:min_size] next_q_values = next_q_values[:min_size] current_q_values = current_q_values[:min_size] # Calculate target Q values target_q_values = rewards + (1 - dones) * self.gamma * next_q_values # Compute loss for Q value q_loss = self.criterion(current_q_values, target_q_values) # Try to compute extrema loss if possible try: # Get the target classes from extrema predictions extrema_targets = torch.argmax(current_extrema_pred, dim=1).long() # Compute extrema loss using cross-entropy - this is an auxiliary task extrema_loss = F.cross_entropy(current_extrema_pred, extrema_targets) # Combined loss with emphasis on Q-learning total_loss = q_loss + 0.1 * extrema_loss except Exception as e: logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.") total_loss = q_loss # Reset gradients self.optimizer.zero_grad() # Backward pass total_loss.backward() # Clip gradients to avoid exploding gradients torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0) # Update weights self.optimizer.step() # Update target network if needed self.update_count += 1 if self.update_count % self.target_update == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) # Return loss return total_loss.item() except Exception as e: logger.error(f"Error in replay standard: {str(e)}") import traceback logger.error(traceback.format_exc()) return 0.0 def _replay_mixed_precision(self, states, actions, rewards, next_states, dones): """Mixed precision training step for better GPU performance""" # Check if mixed precision should be explicitly disabled if 'DISABLE_MIXED_PRECISION' in os.environ: logger.info("Mixed precision explicitly disabled by environment variable") return self._replay_standard(states, actions, rewards, next_states, dones) try: # Zero gradients self.optimizer.zero_grad() # Forward pass with amp autocasting with torch.cuda.amp.autocast(): # Get current Q values and extrema predictions current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self.policy_net(states) current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1) # Get next Q values from target network with torch.no_grad(): next_q_values, next_extrema_pred, next_price_pred, next_hidden_features, next_advanced_pred = self.target_net(next_states) next_q_values = next_q_values.max(1)[0] # Check for dimension mismatch and fix it if rewards.shape[0] != next_q_values.shape[0]: # Log the shape mismatch for debugging logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}") # Use the smaller size to prevent index errors min_size = min(rewards.shape[0], next_q_values.shape[0]) rewards = rewards[:min_size] dones = dones[:min_size] next_q_values = next_q_values[:min_size] current_q_values = current_q_values[:min_size] target_q_values = rewards + (1 - dones) * self.gamma * next_q_values # Compute Q-value loss (primary task) q_loss = nn.MSELoss()(current_q_values, target_q_values) # Initialize loss with q_loss loss = q_loss # Try to extract price from current and next states try: # Extract price feature from sequence data (if available) if len(states.shape) == 3: # [batch, seq, features] current_prices = states[:, -1, -1] # Last timestep, last feature next_prices = next_states[:, -1, -1] else: # [batch, features] current_prices = states[:, -1] # Last feature next_prices = next_states[:, -1] # Calculate price change for different timeframes immediate_changes = (next_prices - current_prices) / current_prices # Get the actual batch size for this calculation actual_batch_size = states.shape[0] # Create price direction labels - simplified for training # 0 = down, 1 = sideways, 2 = up immediate_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1 # Default: sideways midterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1 longterm_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 1 # Immediate term direction (1s, 1m) immediate_up = (immediate_changes > 0.0005) immediate_down = (immediate_changes < -0.0005) immediate_labels[immediate_up] = 2 # Up immediate_labels[immediate_down] = 0 # Down # For mid and long term, we can only approximate during training # In a real system, we'd need historical data to validate these # Here we'll use the immediate term with increasing thresholds as approximation # Mid-term (1h) - use slightly higher threshold midterm_up = (immediate_changes > 0.001) midterm_down = (immediate_changes < -0.001) midterm_labels[midterm_up] = 2 # Up midterm_labels[midterm_down] = 0 # Down # Long-term (1d) - use even higher threshold longterm_up = (immediate_changes > 0.002) longterm_down = (immediate_changes < -0.002) longterm_labels[longterm_up] = 2 # Up longterm_labels[longterm_down] = 0 # Down # Generate target values for price change regression # For simplicity, we'll use the immediate change and scaled versions for longer timeframes price_value_targets = torch.zeros((actual_batch_size, 4), device=self.device) price_value_targets[:, 0] = immediate_changes price_value_targets[:, 1] = immediate_changes * 2.0 # Approximate 1h change price_value_targets[:, 2] = immediate_changes * 4.0 # Approximate 1d change price_value_targets[:, 3] = immediate_changes * 6.0 # Approximate 1w change # Calculate loss for price direction prediction (classification) if len(current_price_pred['immediate'].shape) > 1 and current_price_pred['immediate'].shape[0] >= actual_batch_size: # Slice predictions to match the adjusted batch size immediate_pred = current_price_pred['immediate'][:actual_batch_size] midterm_pred = current_price_pred['midterm'][:actual_batch_size] longterm_pred = current_price_pred['longterm'][:actual_batch_size] price_values_pred = current_price_pred['values'][:actual_batch_size] # Compute losses for each task immediate_loss = nn.CrossEntropyLoss()(immediate_pred, immediate_labels) midterm_loss = nn.CrossEntropyLoss()(midterm_pred, midterm_labels) longterm_loss = nn.CrossEntropyLoss()(longterm_pred, longterm_labels) # MSE loss for price value regression price_value_loss = nn.MSELoss()(price_values_pred, price_value_targets) # Combine all price prediction losses price_loss = immediate_loss + 0.7 * midterm_loss + 0.5 * longterm_loss + 0.3 * price_value_loss # Create extrema labels (same as before) extrema_labels = torch.ones(actual_batch_size, dtype=torch.long, device=self.device) * 2 # Default: neither # Identify potential bottoms (significant negative change) bottoms = (immediate_changes < -0.003) extrema_labels[bottoms] = 0 # Identify potential tops (significant positive change) tops = (immediate_changes > 0.003) extrema_labels[tops] = 1 # Calculate extrema prediction loss if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= actual_batch_size: current_extrema_pred = current_extrema_pred[:actual_batch_size] extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels) # Combined loss with all components # Primary task: Q-value learning (RL objective) # Secondary tasks: extrema detection and price prediction (supervised objectives) loss = q_loss + 0.3 * extrema_loss + 0.3 * price_loss # Log loss components occasionally if random.random() < 0.01: # Log 1% of the time logger.info( f"Mixed precision losses: Q-loss={q_loss.item():.4f}, " f"Extrema-loss={extrema_loss.item():.4f}, " f"Price-loss={price_loss.item():.4f}" ) except Exception as e: # Fallback if price extraction fails logger.warning(f"Failed to calculate price prediction loss: {str(e)}. Using only Q-value loss.") # Just use Q-value loss loss = q_loss # Backward pass with scaled gradients self.scaler.scale(loss).backward() # Gradient clipping on scaled gradients self.scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0) # Update with scaler self.scaler.step(self.optimizer) self.scaler.update() # Update target network if needed self.update_count += 1 if self.update_count % self.target_update == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) # Track and decay epsilon self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay) return loss.item() except Exception as e: logger.error(f"Error in mixed precision training: {str(e)}") logger.warning("Falling back to standard precision training") # Fall back to standard training return self._replay_standard(states, actions, rewards, next_states, dones) def train_on_extrema(self, states, actions, rewards, next_states, dones): """ Special training function specifically for extrema points Args: states: Batch of states at extrema points actions: Batch of actions rewards: Batch of rewards next_states: Batch of next states dones: Batch of done flags Returns: float: Training loss """ # Convert to numpy arrays if not already if not isinstance(states, np.ndarray): states = np.array(states) if not isinstance(actions, np.ndarray): actions = np.array(actions) if not isinstance(rewards, np.ndarray): rewards = np.array(rewards) if not isinstance(next_states, np.ndarray): next_states = np.array(next_states) if not isinstance(dones, np.ndarray): dones = np.array(dones, dtype=np.float32) # Normalize states states = np.vstack([self._normalize_state(s) for s in states]) next_states = np.vstack([self._normalize_state(s) for s in next_states]) # Convert to torch tensors and move to device states_tensor = torch.FloatTensor(states).to(self.device) actions_tensor = torch.LongTensor(actions).to(self.device) rewards_tensor = torch.FloatTensor(rewards).to(self.device) next_states_tensor = torch.FloatTensor(next_states).to(self.device) dones_tensor = torch.FloatTensor(dones).to(self.device) # Choose training method based on precision mode if self.use_mixed_precision: return self._replay_mixed_precision( states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor ) else: return self._replay_standard( states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor ) def _normalize_state(self, state: np.ndarray) -> np.ndarray: """Normalize the state data to prevent numerical issues""" # Handle NaN and infinite values state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0) # Check if state is 1D array (happens in some environments) if len(state.shape) == 1: # If 1D, we need to normalize the whole array normalized_state = state.copy() # Convert any timestamp or non-numeric data to float for i in range(len(normalized_state)): # Check for timestamp-like objects if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')): # Convert timestamp to float (seconds since epoch) normalized_state[i] = float(normalized_state[i].timestamp()) elif not isinstance(normalized_state[i], (int, float, np.number)): # Set non-numeric data to 0 normalized_state[i] = 0.0 # Ensure all values are float normalized_state = normalized_state.astype(np.float32) # Simple min-max normalization for 1D state state_min = np.min(normalized_state) state_max = np.max(normalized_state) if state_max > state_min: normalized_state = (normalized_state - state_min) / (state_max - state_min) return normalized_state # Handle 2D arrays normalized_state = np.zeros_like(state, dtype=np.float32) # Convert any timestamp or non-numeric data to float for i in range(state.shape[0]): for j in range(state.shape[1]): if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')): # Convert timestamp to float (seconds since epoch) normalized_state[i, j] = float(state[i, j].timestamp()) elif isinstance(state[i, j], (int, float, np.number)): normalized_state[i, j] = state[i, j] else: # Set non-numeric data to 0 normalized_state[i, j] = 0.0 # Loop through each timeframe's features in the combined state feature_count = state.shape[1] // len(self.timeframes) for tf_idx in range(len(self.timeframes)): start_idx = tf_idx * feature_count end_idx = start_idx + feature_count # Extract this timeframe's features tf_features = normalized_state[:, start_idx:end_idx] # Normalize OHLCV data by the first close price in the window # This makes price movements relative rather than absolute price_idx = 3 # Assuming close price is at index 3 if price_idx < tf_features.shape[1]: reference_price = np.mean(tf_features[:, price_idx]) if reference_price != 0: # Normalize price-related columns (OHLC) for i in range(4): # First 4 columns are OHLC if i < tf_features.shape[1]: normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price # Normalize volume using mean and std vol_idx = 4 # Assuming volume is at index 4 if vol_idx < tf_features.shape[1]: vol_mean = np.mean(tf_features[:, vol_idx]) vol_std = np.std(tf_features[:, vol_idx]) if vol_std > 0: normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std else: normalized_state[:, start_idx + vol_idx] = 0 # Other features (technical indicators) - normalize with min-max scaling for i in range(5, feature_count): if i < tf_features.shape[1]: feature_min = np.min(tf_features[:, i]) feature_max = np.max(tf_features[:, i]) if feature_max > feature_min: normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min) else: normalized_state[:, start_idx + i] = 0 return normalized_state def update_realtime_tick_features(self, tick_features): """Update with real-time tick features from tick processor""" try: if tick_features is not None: self.realtime_tick_features = tick_features # Log high-confidence tick features if tick_features.get('confidence', 0) > 0.8: logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}") except Exception as e: logger.error(f"Error updating real-time tick features: {e}") def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray: """Enhance state with real-time tick features if available""" try: if self.realtime_tick_features is None: return state # Extract neural features from tick processor neural_features = self.realtime_tick_features.get('neural_features', np.array([])) volume_features = self.realtime_tick_features.get('volume_features', np.array([])) microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([])) confidence = self.realtime_tick_features.get('confidence', 0.0) # Combine tick features - make them compact to match state dimensions tick_features = np.concatenate([ neural_features[:3] if len(neural_features) >= 3 else np.zeros(3), # Take first 3 neural features volume_features[:1] if len(volume_features) >= 1 else np.zeros(1), # Take first volume feature microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1), # Take first microstructure feature ]) # Weight the tick features weighted_tick_features = tick_features * self.tick_feature_weight # Enhance the state by adding tick features to each timeframe if len(state.shape) == 1: # 1D state - append tick features enhanced_state = np.concatenate([state, weighted_tick_features]) else: # 2D state - add tick features to each timeframe row num_timeframes, num_features = state.shape # Ensure tick features match the number of original features if len(weighted_tick_features) != num_features: # Pad or truncate tick features to match state feature dimension if len(weighted_tick_features) < num_features: # Pad with zeros padded_features = np.zeros(num_features) padded_features[:len(weighted_tick_features)] = weighted_tick_features weighted_tick_features = padded_features else: # Truncate to match weighted_tick_features = weighted_tick_features[:num_features] # Add tick features to the last row (most recent timeframe) enhanced_state = state.copy() enhanced_state[-1, :] += weighted_tick_features # Add to last timeframe return enhanced_state except Exception as e: logger.error(f"Error enhancing state with tick features: {e}") return state def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01): """Update learning metrics and perform learning rate adjustments if needed""" # Update average reward with exponential moving average if self.avg_reward == 0: self.avg_reward = episode_reward else: self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward # Check if we're making sufficient progress if episode_reward > (1 + best_reward_threshold) * self.best_reward: self.best_reward = episode_reward self.no_improvement_count = 0 return True # Improved else: self.no_improvement_count += 1 # If no improvement for a while, adjust learning rate if self.no_improvement_count >= 10: current_lr = self.optimizer.param_groups[0]['lr'] new_lr = current_lr * 0.5 if new_lr >= 1e-6: # Don't reduce below minimum threshold for param_group in self.optimizer.param_groups: param_group['lr'] = new_lr logger.info(f"Reducing learning rate from {current_lr} to {new_lr}") self.no_improvement_count = 0 return False # No improvement def save(self, path: str): """Save model and agent state""" os.makedirs(os.path.dirname(path), exist_ok=True) # Save policy network self.policy_net.save(f"{path}_policy") # Save target network self.target_net.save(f"{path}_target") # Save agent state state = { 'epsilon': self.epsilon, 'update_count': self.update_count, 'losses': self.losses, 'optimizer_state': self.optimizer.state_dict(), 'best_reward': self.best_reward, 'avg_reward': self.avg_reward } torch.save(state, f"{path}_agent_state.pt") logger.info(f"Agent state saved to {path}_agent_state.pt") def load(self, path: str): """Load model and agent state""" # Load policy network self.policy_net.load(f"{path}_policy") # Load target network self.target_net.load(f"{path}_target") # Load agent state try: agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device) self.epsilon = agent_state['epsilon'] self.update_count = agent_state['update_count'] self.losses = agent_state['losses'] self.optimizer.load_state_dict(agent_state['optimizer_state']) # Load additional metrics if they exist if 'best_reward' in agent_state: self.best_reward = agent_state['best_reward'] if 'avg_reward' in agent_state: self.avg_reward = agent_state['avg_reward'] logger.info(f"Agent state loaded from {path}_agent_state.pt") except FileNotFoundError: logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")