import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List, Dict, Any
import os
import sys
import logging
import torch.nn.functional as F
import time

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

# Import checkpoint management
from utils.checkpoint_manager import save_checkpoint, load_best_checkpoint
from utils.training_integration import get_training_integration

# Configure logger
logger = logging.getLogger(__name__)


class DQNNetwork(nn.Module):
    """
    Massive Deep Q-Network specifically designed for RL trading with unified BaseDataInput features
    Handles 7850 input features from multi-timeframe, multi-asset data
    TARGET: 50M parameters for enhanced learning capacity
    """

    def __init__(self, input_dim: int, n_actions: int):
        super(DQNNetwork, self).__init__()

        # Handle different input dimension formats
        if isinstance(input_dim, (tuple, list)):
            if len(input_dim) == 1:
                self.input_size = input_dim[0]
            else:
                self.input_size = np.prod(input_dim)  # Flatten multi-dimensional input
        else:
            self.input_size = input_dim
        self.n_actions = n_actions

        # MASSIVE network architecture optimized for trading features
        # Target: ~50M parameters
        self.feature_extractor = nn.Sequential(
            # Initial feature extraction with massive width
            nn.Linear(self.input_size, 8192),  # 7850 -> 8192 = ~64M weights
            nn.LayerNorm(8192),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),

            # Deep feature processing layers
            nn.Linear(8192, 6144),  # 8192 -> 6144 = ~50M weights
            nn.LayerNorm(6144),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),

            nn.Linear(6144, 4096),  # 6144 -> 4096 = ~25M weights
            nn.LayerNorm(4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),

            nn.Linear(4096, 3072),  # 4096 -> 3072 = ~12M weights
            nn.LayerNorm(3072),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),

            nn.Linear(3072, 2048),  # 3072 -> 2048 = ~6M weights
            nn.LayerNorm(2048),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
        )

        # Market regime detection head
        self.regime_head = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 4)  # trending, ranging, volatile, mixed
        )

        # Price direction prediction head - outputs direction and confidence
        self.price_direction_head = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 2)  # [direction, confidence]
        )

        # Direction activation (tanh for -1 to 1)
        self.direction_activation = nn.Tanh()
        # Confidence activation (sigmoid for 0 to 1)
        self.confidence_activation = nn.Sigmoid()

        # Volatility prediction head
        self.volatility_head = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(1024, 256),
            nn.LayerNorm(256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 4)  # predicted volatility for 4 timeframes
        )

        # Main Q-value head (dueling architecture)
        self.value_head = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, 1)  # State value
        )

        self.advantage_head = nn.Sequential(
            nn.Linear(2048, 1024),
            nn.LayerNorm(1024),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.LayerNorm(512),
            nn.ReLU(inplace=True),
            nn.Linear(512, n_actions)  # Action advantages
        )

        # Initialize weights
        self._initialize_weights()
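        # --- Editor's sketch (illustrative, kept as comments; not part of the original file) ---
        # The value_head and advantage_head above are combined in forward() via the standard
        # dueling aggregation Q(s,a) = V(s) + A(s,a) - mean_a' A(s,a'), which removes the
        # V/A identifiability ambiguity. A minimal standalone check, assuming only torch:
        #
        #   value = torch.tensor([[1.0]])                 # V(s), shape [batch, 1]
        #   advantage = torch.tensor([[0.5, -0.5, 0.0]])  # A(s, .), shape [batch, n_actions]
        #   q = value + advantage - advantage.mean(dim=1, keepdim=True)
        #   # -> tensor([[1.5, 0.5, 1.0]])
        #
        # Note: with the layer widths used here (7850 -> 8192 -> 6144 -> 4096 -> 3072 -> 2048
        # plus five heads), the total parameter count ends up well above the 50M target stated
        # in the docstring; the first two Linear layers alone contribute roughly 64M + 50M
        # weights, per the inline comments.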
# Log parameter count total_params = sum(p.numel() for p in self.parameters()) logger.info(f"DQN Network initialized with {total_params:,} parameters (target: 50M)") def _initialize_weights(self): """Initialize network weights using Xavier initialization""" for module in self.modules(): if isinstance(module, nn.Linear): nn.init.xavier_uniform_(module.weight) if module.bias is not None: nn.init.constant_(module.bias, 0) elif isinstance(module, nn.LayerNorm): nn.init.constant_(module.bias, 0) nn.init.constant_(module.weight, 1.0) def forward(self, x): """Forward pass through the network""" # Ensure input is properly shaped if x.dim() > 2: x = x.view(x.size(0), -1) # Flatten if needed elif x.dim() == 1: x = x.unsqueeze(0) # Add batch dimension if needed # Feature extraction features = self.feature_extractor(x) # Multiple prediction heads regime_pred = self.regime_head(features) price_direction_raw = self.price_direction_head(features) # Apply separate activations to direction and confidence direction = self.direction_activation(price_direction_raw[:, 0:1]) # -1 to 1 confidence = self.confidence_activation(price_direction_raw[:, 1:2]) # 0 to 1 price_direction_pred = torch.cat([direction, confidence], dim=1) # [batch, 2] volatility_pred = self.volatility_head(features) # Dueling Q-network value = self.value_head(features) advantage = self.advantage_head(features) # Combine value and advantage for Q-values q_values = value + advantage - advantage.mean(dim=1, keepdim=True) return q_values, regime_pred, price_direction_pred, volatility_pred, features def act(self, state, explore=True): """ Select action using epsilon-greedy policy Args: state: Current state (numpy array or tensor) explore: Whether to use epsilon-greedy exploration Returns: action_idx: Selected action index confidence: Confidence score action_probs: Action probabilities """ # Convert state to tensor if needed if isinstance(state, np.ndarray): state = torch.FloatTensor(state) # Move to device device = next(self.parameters()).device state = state.to(device) # Ensure proper shape if state.dim() == 1: state = state.unsqueeze(0) with torch.no_grad(): q_values, regime_pred, price_direction_pred, volatility_pred, features = self.forward(state) # Price direction predictions are processed in the agent's act method # This is just the network forward pass # Get action probabilities using softmax action_probs = F.softmax(q_values, dim=1) # Select action (greedy for inference) action_idx = torch.argmax(q_values, dim=1).item() # Calculate confidence as max probability confidence = float(action_probs[0, action_idx].item()) # Convert probabilities to list probs_list = action_probs.squeeze(0).cpu().numpy().tolist() return action_idx, confidence, probs_list class DQNAgent: """ Deep Q-Network agent for trading Uses Enhanced CNN model as the base network with GPU support for improved performance """ def __init__(self, state_shape: Tuple[int, ...], n_actions: int = 3, # BUY=0, SELL=1, HOLD=2 learning_rate: float = 0.001, epsilon: float = 1.0, epsilon_min: float = 0.01, epsilon_decay: float = 0.995, buffer_size: int = 10000, batch_size: int = 32, target_update: int = 100, priority_memory: bool = True, device=None, model_name: str = "dqn_agent", enable_checkpoints: bool = True): # Checkpoint management self.model_name = model_name self.enable_checkpoints = enable_checkpoints self.training_integration = get_training_integration() if enable_checkpoints else None self.episode_count = 0 self.best_reward = float('-inf') self.reward_history = deque(maxlen=100) 
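        # --- Editor's sketch (illustrative, kept as comments; not part of the original file) ---
        # A minimal way to exercise this agent once construction finishes; the 7850-dim state
        # matches the BaseDataInput feature size referenced in DQNNetwork's docstring, and
        # checkpointing is disabled so the sketch has no side effects:
        #
        #   agent = DQNAgent(state_shape=(7850,), n_actions=3, enable_checkpoints=False)
        #   dummy_state = np.random.randn(7850).astype(np.float32)
        #   action, confidence, probs = agent.act_with_confidence(dummy_state, market_regime='trending')
        #   print(action, round(confidence, 3), probs)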
self.checkpoint_frequency = 100 # Save checkpoint every 100 episodes # Extract state dimensions if isinstance(state_shape, tuple) and len(state_shape) > 1: # Multi-dimensional state (like image or sequence) self.state_dim = state_shape else: # 1D state if isinstance(state_shape, tuple): if len(state_shape) == 0: self.state_dim = 1 # Safe default for empty tuple else: self.state_dim = state_shape[0] else: self.state_dim = state_shape # Store parameters self.n_actions = n_actions self.learning_rate = learning_rate self.epsilon = epsilon self.epsilon_min = epsilon_min self.epsilon_decay = epsilon_decay self.buffer_size = buffer_size self.batch_size = batch_size self.target_update = target_update # Set device for computation (default to GPU if available) if device is None: self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') else: self.device = device logger.info(f"DQN Agent using device: {self.device}") # Initialize models with RL-specific network architecture self.policy_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device) self.target_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device) # Ensure models are on the correct device self.policy_net = self.policy_net.to(self.device) self.target_net = self.target_net.to(self.device) # Initialize the target network with the same weights as the policy network self.target_net.load_state_dict(self.policy_net.state_dict()) # Set models to eval mode (important for batch norm, dropout) self.target_net.eval() # Optimization components self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate) self.criterion = nn.MSELoss() # Experience replay memory self.memory = [] self.positive_memory = [] # Special memory for storing good experiences self.update_count = 0 # Extrema detection tracking self.last_extrema_pred = { 'class': 2, # Default to "neither" (not extrema) 'confidence': 0.0, 'raw': None } self.extrema_memory = [] # DQN hyperparameters self.gamma = 0.99 # Discount factor # Initialize avg_reward for dashboard compatibility self.avg_reward = 0.0 # Average reward tracking for dashboard # Market regime adaptation weights self.market_regime_weights = { 'trending': 1.0, 'sideways': 0.8, 'volatile': 1.2, 'bullish': 1.1, 'bearish': 1.1 } # Load best checkpoint if available if self.enable_checkpoints: self.load_best_checkpoint() logger.info(f"DQN Agent initialized with checkpoint management: {enable_checkpoints}") if enable_checkpoints: logger.info(f"Model name: {model_name}, Checkpoint frequency: {self.checkpoint_frequency}") # Add this line to the __init__ method self.recent_actions = deque(maxlen=10) self.recent_prices = deque(maxlen=20) self.recent_rewards = deque(maxlen=100) # Price direction tracking - stores direction and confidence self.last_price_direction = { 'direction': 0.0, # Single value between -1 and 1 'confidence': 0.0 # Single value between 0 and 1 } # Store separate memory for price direction examples self.price_movement_memory = [] # For storing examples of clear price movements # Performance tracking self.losses = [] self.no_improvement_count = 0 # Confidence tracking self.confidence_history = [] self.avg_confidence = 0.0 self.max_confidence = 0.0 self.min_confidence = 1.0 # Enhanced features from EnhancedDQNAgent # Market adaptation capabilities self.market_regime_weights = { 'trending': 1.2, # Higher confidence in trending markets 'ranging': 0.8, # Lower confidence in ranging markets 'volatile': 0.6 # Much lower confidence in volatile markets } # Dueling network support 
(requires enhanced network architecture) self.use_dueling = True # Prioritized experience replay parameters self.use_prioritized_replay = priority_memory self.alpha = 0.6 # Priority exponent self.beta = 0.4 # Importance sampling exponent self.beta_increment = 0.001 # Double DQN support self.use_double_dqn = True # Enhanced training features from EnhancedDQNAgent self.target_update_freq = target_update # More descriptive name self.training_steps = 0 self.gradient_clip_norm = 1.0 # Gradient clipping # Enhanced statistics tracking self.epsilon_history = [] self.td_errors = [] # Track TD errors for analysis # Trade action fee and confidence thresholds self.trade_action_fee = 0.0005 # Small fee to discourage unnecessary trading self.minimum_action_confidence = 0.3 # Minimum confidence to consider trading (lowered from 0.5) # Violent move detection self.price_history = [] self.volatility_window = 20 # Window size for volatility calculation self.volatility_threshold = 0.0015 # Threshold for considering a move "violent" self.post_violent_move = False # Flag for recent violent move self.violent_move_cooldown = 0 # Cooldown after violent move # Feature integration self.last_hidden_features = None # Store last extracted features self.feature_history = [] # Store history of features for analysis # Real-time tick features integration self.realtime_tick_features = None # Latest tick features from tick processor self.tick_feature_weight = 0.3 # Weight for tick features in decision making # Check if mixed precision training should be used if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ: self.use_mixed_precision = True self.scaler = torch.cuda.amp.GradScaler() logger.info("Mixed precision training enabled") else: self.use_mixed_precision = False logger.info("Mixed precision training disabled") # Track if we're in training mode self.training = True # For compatibility with old code self.state_size = np.prod(state_shape) self.action_size = n_actions self.memory_size = buffer_size self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3] # Default timeframes logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}") logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}") logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}") # Log model parameters total_params = sum(p.numel() for p in self.policy_net.parameters()) logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters") # Position management for 2-action system self.current_position = 0.0 # -1 (short), 0 (neutral), 1 (long) self.position_entry_price = 0.0 self.position_entry_time = None # Different thresholds for entry vs exit decisions - AGGRESSIVE for more training data self.entry_confidence_threshold = 0.35 # Lower threshold for new positions (was 0.7) self.exit_confidence_threshold = 0.15 # Very low threshold for closing positions (was 0.3) self.uncertainty_threshold = 0.1 # When to stay neutral def load_best_checkpoint(self): """Load the best checkpoint for this DQN agent""" try: if not self.enable_checkpoints: return result = load_best_checkpoint(self.model_name) if result: file_path, metadata = result checkpoint = torch.load(file_path, map_location=self.device, weights_only=False) # Load model states if 'policy_net_state_dict' in checkpoint: self.policy_net.load_state_dict(checkpoint['policy_net_state_dict']) if 
'target_net_state_dict' in checkpoint:
                    self.target_net.load_state_dict(checkpoint['target_net_state_dict'])
                if 'optimizer_state_dict' in checkpoint:
                    self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

                # Load training state
                if 'episode_count' in checkpoint:
                    self.episode_count = checkpoint['episode_count']
                if 'epsilon' in checkpoint:
                    self.epsilon = checkpoint['epsilon']
                if 'best_reward' in checkpoint:
                    self.best_reward = checkpoint['best_reward']

                logger.info(f"Loaded DQN checkpoint: {metadata.checkpoint_id}")
                logger.info(f"Episode: {self.episode_count}, Best reward: {self.best_reward:.4f}")

        except Exception as e:
            logger.warning(f"Failed to load checkpoint for {self.model_name}: {e}")

    def save_checkpoint(self, episode_reward: float, force_save: bool = False):
        """Save checkpoint if performance improved or forced"""
        try:
            if not self.enable_checkpoints:
                return False

            self.episode_count += 1
            self.reward_history.append(episode_reward)

            # Calculate average reward over recent episodes
            avg_reward = sum(self.reward_history) / len(self.reward_history)

            # Update best reward
            if episode_reward > self.best_reward:
                self.best_reward = episode_reward

            # Save checkpoint every N episodes or if forced
            should_save = (
                force_save or
                self.episode_count % self.checkpoint_frequency == 0 or
                episode_reward > self.best_reward * 0.95  # Within 5% of best
            )

            if should_save and self.training_integration:
                return self.training_integration.save_rl_checkpoint(
                    rl_agent=self,
                    model_name=self.model_name,
                    episode=self.episode_count,
                    avg_reward=avg_reward,
                    best_reward=self.best_reward,
                    epsilon=self.epsilon,
                    total_pnl=0.0  # Default to 0, can be set by calling code
                )

            return False

        except Exception as e:
            logger.error(f"Error saving DQN checkpoint: {e}")
            return False

    def move_models_to_device(self, device=None):
        """Move models to the specified device (GPU/CPU)"""
        if device is not None:
            self.device = device

        try:
            self.policy_net = self.policy_net.to(self.device)
            self.target_net = self.target_net.to(self.device)
            logger.info(f"Moved models to {self.device}")
            return True
        except Exception as e:
            logger.error(f"Failed to move models to {self.device}: {str(e)}")
            return False

    def to(self, device):
        """PyTorch-style device movement method"""
        self.device = device
        self.policy_net = self.policy_net.to(device)
        self.target_net = self.target_net.to(device)
        return self

    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Try to extract price change to analyze the experience
        try:
            # Extract price feature from sequence data (if available)
            if len(state.shape) > 1:  # 2D state [timeframes, features]
                current_price = state[-1, -1]  # Last timeframe, last feature
                next_price = next_state[-1, -1]
            else:  # 1D state
                current_price = state[-1]  # Last feature
                next_price = next_state[-1]

            # Calculate price change - avoid division by zero
            if np.isscalar(current_price) and current_price != 0:
                price_change = (next_price -
current_price) / current_price elif isinstance(current_price, np.ndarray): # Handle array case - protect against division by zero with np.errstate(divide='ignore', invalid='ignore'): price_change = (next_price - current_price) / current_price # Replace infinities and NaNs with zeros if isinstance(price_change, np.ndarray): price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0) else: price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change else: price_change = 0.0 # Check if this is a significant price movement if abs(price_change) > 0.002: # Significant price change # Store in price movement memory self.price_movement_memory.append(experience) # Log significant price movements direction = "UP" if price_change > 0 else "DOWN" logger.info(f"Stored significant {direction} price movement: {price_change:.4f}") # For clear price movements, also duplicate in main memory to learn more if abs(price_change) > 0.005: # Very significant movement for _ in range(2): # Add 2 extra copies self.memory.append(experience) except Exception as e: # Skip price movement analysis if it fails pass # Check if this is an extrema point based on our extrema detection head if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2: # Class 0 = bottom, 1 = top, 2 = neither # Only consider high confidence predictions if self.last_extrema_pred['confidence'] > 0.7: self.extrema_memory.append(experience) # Log this special experience extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP" logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}") # For tops and bottoms, also duplicate the experience in memory to learn more from it for _ in range(2): # Add 2 extra copies self.memory.append(experience) # Explicitly marked extrema points also go to extrema memory elif is_extrema: self.extrema_memory.append(experience) # Store positive experiences separately for prioritized replay if reward > 0: self.positive_memory.append(experience) # For very good rewards, duplicate to learn more from them if reward > 0.1: for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards self.positive_memory.append(experience) # Keep memory size under control if len(self.memory) > self.buffer_size: # Keep more recent experiences self.memory = self.memory[-self.buffer_size:] # Keep specialized memories under control too if len(self.positive_memory) > self.buffer_size // 4: self.positive_memory = self.positive_memory[-(self.buffer_size // 4):] if len(self.extrema_memory) > self.buffer_size // 4: self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):] if len(self.price_movement_memory) > self.buffer_size // 4: self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):] def act(self, state: np.ndarray, explore=True, current_price=None, market_context=None) -> int: """ Choose action based on current state using 2-action system with intelligent position management Args: state: Current market state explore: Whether to use epsilon-greedy exploration current_price: Current market price for position management market_context: Additional market context for decision making Returns: int: Action (0=BUY, 1=SELL) """ try: # Use the DQNNetwork's act method for consistent behavior action_idx, confidence, action_probs = self.policy_net.act(state, explore=explore) # Process price direction predictions from the network # Get the raw predictions from the network's forward pass with torch.no_grad(): 
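                # Editor's note: this direct forward() call assumes `state` is already a torch tensor
                # on the network's device; DQNNetwork.act() above converts numpy input itself, but this
                # auxiliary pass does not, so callers should pass tensors (or convert before this point).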
q_values, regime_pred, price_direction_pred, volatility_pred, features = self.policy_net.forward(state) if price_direction_pred is not None: self.process_price_direction_predictions(price_direction_pred) # Apply epsilon-greedy exploration if requested if explore and np.random.random() <= self.epsilon: action_idx = np.random.choice(self.n_actions) # Update tracking if current_price: self.recent_prices.append(current_price) self.recent_actions.append(action_idx) return action_idx except Exception as e: logger.error(f"Error in act method: {e}") # Return default action (HOLD/SELL) return 1 def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float, List[float]]: """Choose action with confidence score adapted to market regime""" try: # Convert state to tensor if needed if isinstance(state, np.ndarray): state_tensor = torch.FloatTensor(state) device = next(self.policy_net.parameters()).device state_tensor = state_tensor.to(device) # Ensure proper shape if state_tensor.dim() == 1: state_tensor = state_tensor.unsqueeze(0) else: state_tensor = state # Get network outputs with torch.no_grad(): q_values, regime_pred, price_direction_pred, volatility_pred, features = self.policy_net.forward(state_tensor) # Process price direction predictions if price_direction_pred is not None: self.process_price_direction_predictions(price_direction_pred) # Get action probabilities using softmax action_probs = F.softmax(q_values, dim=1) # Select action (greedy for inference) action_idx = torch.argmax(q_values, dim=1).item() # Calculate confidence as max probability base_confidence = float(action_probs[0, action_idx].item()) # Adapt confidence based on market regime regime_weight = self.market_regime_weights.get(market_regime, 1.0) adapted_confidence = min(base_confidence * regime_weight, 1.0) # Convert probabilities to list probs_list = action_probs.squeeze(0).cpu().numpy().tolist() # Return action, confidence, and probabilities (for orchestrator compatibility) return int(action_idx), float(adapted_confidence), probs_list except Exception as e: logger.error(f"Error in act_with_confidence: {e}") # Return default action with low confidence return 1, 0.1, [0.45, 0.55] # Default to HOLD action def process_price_direction_predictions(self, price_direction_pred: torch.Tensor) -> Dict[str, float]: """ Process price direction predictions and convert to standardized format Args: price_direction_pred: Tensor of shape (batch_size, 2) containing [direction, confidence] Returns: Dict with direction (-1 to 1) and confidence (0 to 1) """ try: if price_direction_pred is None or price_direction_pred.numel() == 0: return self.last_price_direction # Extract direction and confidence values direction_value = float(price_direction_pred[0, 0].item()) # -1 to 1 confidence_value = float(price_direction_pred[0, 1].item()) # 0 to 1 # Update last price direction self.last_price_direction = { 'direction': direction_value, 'confidence': confidence_value } return self.last_price_direction except Exception as e: logger.error(f"Error processing price direction predictions: {e}") return self.last_price_direction def get_price_direction_vector(self) -> Dict[str, float]: """ Get the current price direction and confidence Returns: Dict with direction (-1 to 1) and confidence (0 to 1) """ return self.last_price_direction def get_price_direction_summary(self) -> Dict[str, Any]: """ Get a summary of price direction prediction Returns: Dict containing direction and confidence information """ try: direction_value = 
self.last_price_direction['direction']
            confidence_value = self.last_price_direction['confidence']

            # Convert to discrete direction
            if direction_value > 0.1:
                direction_label = "UP"
                discrete_direction = 1
            elif direction_value < -0.1:
                direction_label = "DOWN"
                discrete_direction = -1
            else:
                direction_label = "SIDEWAYS"
                discrete_direction = 0

            return {
                'direction_value': float(direction_value),
                'confidence_value': float(confidence_value),
                'direction_label': direction_label,
                'discrete_direction': discrete_direction,
                'strength': abs(float(direction_value)),
                'weighted_strength': abs(float(direction_value)) * float(confidence_value)
            }

        except Exception as e:
            logger.error(f"Error calculating price direction summary: {e}")
            return {
                'direction_value': 0.0,
                'confidence_value': 0.0,
                'direction_label': "SIDEWAYS",
                'discrete_direction': 0,
                'strength': 0.0,
                'weighted_strength': 0.0
            }

    def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
        """
        Determine action based on current position and confidence thresholds

        This implements the intelligent position management where:
        - When neutral: Need high confidence to enter position
        - When in position: Need lower confidence to exit
        - Different thresholds for entry vs exit
        """
        # Apply epsilon-greedy exploration
        if explore and np.random.random() <= self.epsilon:
            return np.random.choice([0, 1])

        # Get the dominant signal - FIXED ACTION MAPPING: 0=BUY, 1=SELL
        dominant_action = 0 if buy_conf > sell_conf else 1
        dominant_confidence = max(buy_conf, sell_conf)

        # Decision logic based on current position
        if self.current_position == 0:  # No position - need high confidence to enter
            if dominant_confidence >= self.entry_confidence_threshold:
                # Strong enough signal to enter position
                if dominant_action == 0:  # BUY signal (action 0)
                    self.current_position = 1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 0  # Return BUY action (0)
                else:  # SELL signal (action 1)
                    self.current_position = -1.0
                    self.position_entry_price = current_price
                    self.position_entry_time = time.time()
                    logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
                    return 1  # Return SELL action (1)
            else:
                # Not confident enough to enter position
                return None

        elif self.current_position > 0:  # Long position
            if dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
                # SELL signal (action 1) with enough confidence to close long position
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 1  # Return SELL action (1)
            elif dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong SELL signal - close long and enter short
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
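                # Editor's note: a reversal (flip) deliberately requires the stronger entry threshold
                # (entry_confidence_threshold, 0.35 by default) rather than the looser exit threshold
                # (0.15), so positions are only flipped on comparatively strong opposing signals.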
self.current_position = -1.0 self.position_entry_price = current_price self.position_entry_time = time.time() return 1 # Return SELL action (1) else: # Hold the long position return None elif self.current_position < 0: # Short position if dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold: # BUY signal (action 0) with enough confidence to close short position pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0 logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}") self.current_position = 0.0 self.position_entry_price = 0.0 self.position_entry_time = None return 0 # Return BUY action (0) elif dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold: # Very strong BUY signal - close short and enter long pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0 logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}") self.current_position = 1.0 self.position_entry_price = current_price self.position_entry_time = time.time() return 0 # Return BUY action (0) else: # Hold the short position return None return None def _safe_cnn_forward(self, network, states): """Safely call CNN forward method ensuring we always get 5 return values""" try: result = network(states) if isinstance(result, tuple) and len(result) == 5: return result elif isinstance(result, tuple) and len(result) == 1: # Handle case where only q_values are returned (like in empty tensor case) q_values = result[0] batch_size = q_values.size(0) device = q_values.device default_extrema = torch.zeros(batch_size, 3, device=device) default_price = torch.zeros(batch_size, 1, device=device) default_features = torch.zeros(batch_size, 1024, device=device) default_advanced = torch.zeros(batch_size, 1, device=device) return q_values, default_extrema, default_price, default_features, default_advanced else: # Fallback: create all default tensors batch_size = states.size(0) device = states.device default_q_values = torch.zeros(batch_size, self.n_actions, device=device) default_extrema = torch.zeros(batch_size, 3, device=device) default_price = torch.zeros(batch_size, 1, device=device) default_features = torch.zeros(batch_size, 1024, device=device) default_advanced = torch.zeros(batch_size, 1, device=device) return default_q_values, default_extrema, default_price, default_features, default_advanced except Exception as e: logger.error(f"Error in CNN forward pass: {e}") # Fallback: create all default tensors batch_size = states.size(0) device = states.device default_q_values = torch.zeros(batch_size, self.n_actions, device=device) default_extrema = torch.zeros(batch_size, 3, device=device) default_price = torch.zeros(batch_size, 1, device=device) default_features = torch.zeros(batch_size, 1024, device=device) default_advanced = torch.zeros(batch_size, 1, device=device) return default_q_values, default_extrema, default_price, default_features, default_advanced def replay(self, experiences=None): """Train the model using experiences from memory""" # Don't train if not in training mode if not self.training: return 0.0 # If no experiences provided, sample from memory if experiences is None: # Skip if memory is too small if len(self.memory) < self.batch_size: return 0.0 # Sample random mini-batch from memory indices = 
np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False) experiences = [self.memory[i] for i in indices] # Validate experiences before processing if not experiences or len(experiences) == 0: logger.warning("No experiences provided for training") return 0.0 # Sanitize and validate experiences valid_experiences = [] for i, exp in enumerate(experiences): try: if len(exp) != 5: logger.debug(f"Invalid experience format at index {i}: expected 5 elements, got {len(exp)}") continue state, action, reward, next_state, done = exp # Validate state state = self._validate_and_fix_state(state) next_state = self._validate_and_fix_state(next_state) if state is None or next_state is None: continue # Validate action if isinstance(action, dict): action = action.get('action', action.get('value', 0)) action = int(action) if action is not None else 0 action = max(0, min(action, self.n_actions - 1)) # Clamp to valid range # Validate reward if isinstance(reward, dict): reward = reward.get('reward', reward.get('value', 0.0)) reward = float(reward) if reward is not None else 0.0 # Validate done flag done = bool(done) if done is not None else False valid_experiences.append((state, action, reward, next_state, done)) except Exception as e: logger.debug(f"Error processing experience {i}: {e}") continue if len(valid_experiences) == 0: logger.warning("No valid experiences after sanitization") return 0.0 # Use validated experiences for training experiences = valid_experiences # Extract components states, actions, rewards, next_states, dones = zip(*experiences) # Convert to tensors with proper validation try: # Ensure all data is on CPU first, then move to device states_array = np.array(states, dtype=np.float32) actions_array = np.array(actions, dtype=np.int64) rewards_array = np.array(rewards, dtype=np.float32) next_states_array = np.array(next_states, dtype=np.float32) dones_array = np.array(dones, dtype=np.float32) # Convert to tensors and move to device states = torch.from_numpy(states_array).to(self.device) actions = torch.from_numpy(actions_array).to(self.device) rewards = torch.from_numpy(rewards_array).to(self.device) next_states = torch.from_numpy(next_states_array).to(self.device) dones = torch.from_numpy(dones_array).to(self.device) # Final validation of tensor shapes if states.shape[0] == 0 or actions.shape[0] == 0: logger.warning("Empty tensors after conversion") return 0.0 # Ensure all tensors have the same batch size batch_size = states.shape[0] if not all(tensor.shape[0] == batch_size for tensor in [actions, rewards, next_states, dones]): logger.warning("Inconsistent batch sizes across tensors") return 0.0 except Exception as e: logger.error(f"Error converting experiences to tensors: {e}") return 0.0 # Always use standard training to fix gradient issues loss = self._replay_standard(states, actions, rewards, next_states, dones) # Update epsilon if self.epsilon > self.epsilon_min: self.epsilon *= self.epsilon_decay # Update statistics self.losses.append(loss) if len(self.losses) > 1000: self.losses = self.losses[-500:] # Keep only recent losses return loss def _validate_and_fix_state(self, state): """Validate and fix state to ensure it has correct dimensions and no empty data""" try: # Convert to numpy if needed if isinstance(state, torch.Tensor): state = state.detach().cpu().numpy() elif not isinstance(state, np.ndarray): # Check if state is a dict or complex object if isinstance(state, dict): logger.error(f"State is a dict: {state}") # Extract numerical values from dict 
if possible if 'features' in state: state = state['features'] elif 'state' in state: state = state['state'] else: # Try to extract all numerical values numerical_values = [] for key, value in state.items(): if isinstance(value, (int, float)): numerical_values.append(float(value)) elif isinstance(value, (list, np.ndarray)): try: # Handle nested structures safely flattened = np.array(value).flatten() for x in flattened: if isinstance(x, (int, float)): numerical_values.append(float(x)) elif hasattr(x, 'item'): # numpy scalar numerical_values.append(float(x.item())) except (ValueError, TypeError): continue elif isinstance(value, dict): # Recursively extract from nested dicts try: nested_values = self._extract_numeric_from_dict(value) numerical_values.extend(nested_values) except Exception: continue if numerical_values: state = np.array(numerical_values, dtype=np.float32) else: logger.error("No numerical values found in state dict, using default state") expected_size = getattr(self, 'state_size', 403) if isinstance(expected_size, tuple): expected_size = np.prod(expected_size) return np.zeros(int(expected_size), dtype=np.float32) else: try: state = np.array(state, dtype=np.float32) except (ValueError, TypeError) as e: logger.error(f"Cannot convert state to numpy array: {type(state)}, {e}") expected_size = getattr(self, 'state_size', 403) if isinstance(expected_size, tuple): expected_size = np.prod(expected_size) return np.zeros(int(expected_size), dtype=np.float32) # Flatten if multi-dimensional if state.ndim > 1: state = state.flatten() # Check for empty or invalid state if state.size == 0: logger.warning("Empty state detected, using default") expected_size = getattr(self, 'state_size', 403) if isinstance(expected_size, tuple): expected_size = np.prod(expected_size) return np.zeros(int(expected_size), dtype=np.float32) # Check for NaN or infinite values if np.any(np.isnan(state)) or np.any(np.isinf(state)): logger.warning("NaN or infinite values in state, replacing with zeros") state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0) # Ensure correct dimensions expected_size = getattr(self, 'state_size', 403) if isinstance(expected_size, tuple): expected_size = np.prod(expected_size) expected_size = int(expected_size) if len(state) != expected_size: if len(state) < expected_size: # Pad with zeros padded_state = np.zeros(expected_size, dtype=np.float32) padded_state[:len(state)] = state state = padded_state else: # Truncate state = state[:expected_size] return state.astype(np.float32) except Exception as e: logger.error(f"Error validating state: {e}") # Return default state as fallback expected_size = getattr(self, 'state_size', 403) if isinstance(expected_size, tuple): expected_size = np.prod(expected_size) return np.zeros(int(expected_size), dtype=np.float32) def _replay_standard(self, states, actions, rewards, next_states, dones): """Standard training step without mixed precision""" try: # Validate input tensors if states.shape[0] == 0: logger.warning("Empty batch in _replay_standard") return 0.0 # Ensure model is in training mode for gradients self.policy_net.train() # Get current Q values - use the updated forward method q_values_output = self.policy_net(states) if isinstance(q_values_output, tuple): current_q_values_all = q_values_output[0] # Extract Q-values from tuple else: current_q_values_all = q_values_output current_q_values = current_q_values_all.gather(1, actions.unsqueeze(1)).squeeze(1) # Enhanced Double DQN implementation with torch.no_grad(): if self.use_double_dqn: # 
Double DQN: Use policy network to select actions, target network to evaluate policy_output = self.policy_net(next_states) policy_q_values = policy_output[0] if isinstance(policy_output, tuple) else policy_output next_actions = policy_q_values.argmax(1) target_output = self.target_net(next_states) target_q_values_all = target_output[0] if isinstance(target_output, tuple) else target_output next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1) else: # Standard DQN: Use target network for both selection and evaluation target_output = self.target_net(next_states) target_q_values = target_output[0] if isinstance(target_output, tuple) else target_output next_q_values = target_q_values.max(1)[0] # Ensure tensor shapes are consistent batch_size = states.shape[0] if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size: logger.warning(f"Shape mismatch in replay: batch_size={batch_size}, rewards={rewards.shape}, next_q_values={next_q_values.shape}") min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0]) rewards = rewards[:min_size] dones = dones[:min_size] next_q_values = next_q_values[:min_size] current_q_values = current_q_values[:min_size] # Calculate target Q values target_q_values = rewards + (1 - dones) * self.gamma * next_q_values # Compute loss for Q value - ensure tensors require gradients if not current_q_values.requires_grad: logger.warning("Current Q values do not require gradients") # Force training mode self.policy_net.train() return 0.0 q_loss = self.criterion(current_q_values, target_q_values.detach()) # Calculate auxiliary losses and add to Q-loss total_loss = q_loss # Add auxiliary losses if available try: # Get additional predictions from forward pass if isinstance(q_values_output, tuple) and len(q_values_output) >= 5: current_regime_pred = q_values_output[1] current_price_pred = q_values_output[2] current_volatility_pred = q_values_output[3] current_extrema_pred = current_regime_pred # Use regime as extrema proxy for now # Price direction loss if current_price_pred is not None and current_price_pred.shape[0] > 0: price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions) if price_direction_loss is not None: total_loss = total_loss + 0.2 * price_direction_loss # Extrema loss if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0: extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions) if extrema_loss is not None: total_loss = total_loss + 0.1 * extrema_loss except Exception as e: logger.debug(f"Could not add auxiliary loss in standard training: {e}") # Reset gradients self.optimizer.zero_grad() # Ensure total loss requires gradients if not total_loss.requires_grad: logger.warning("Total loss does not require gradients - policy network may not be in training mode") self.policy_net.train() # Ensure training mode return 0.0 # Backward pass total_loss.backward() # Gradient clipping torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0) # Check if gradients are valid has_valid_gradients = False for param in self.policy_net.parameters(): if param.grad is not None and torch.any(torch.isfinite(param.grad)): has_valid_gradients = True break if not has_valid_gradients: logger.warning("No valid gradients found, skipping optimizer step") return 0.0 # Update weights self.optimizer.step() # Update target network periodically self.training_steps += 1 if self.training_steps % self.target_update_freq == 0: 
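                # Hard update: copy the online (policy) network weights into the target network
                # every `target_update_freq` training steps to keep bootstrapped targets stable.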
self.target_net.load_state_dict(self.policy_net.state_dict()) logger.debug(f"Target network updated at step {self.training_steps}") return total_loss.item() except Exception as e: logger.error(f"Error in standard replay: {e}") return 0.0 def _replay_mixed_precision(self, states, actions, rewards, next_states, dones): """Mixed precision training step""" if not self.use_mixed_precision: logger.warning("Mixed precision not available, falling back to standard replay") return self._replay_standard(states, actions, rewards, next_states, dones) try: # Validate input tensors if states.shape[0] == 0: logger.warning("Empty batch in _replay_mixed_precision") return 0.0 # Zero gradients self.optimizer.zero_grad() # Forward pass with amp autocasting import warnings with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) with torch.cuda.amp.autocast(): # Get current Q values and predictions current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self._safe_cnn_forward(self.policy_net, states) current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1) # Get next Q values from target network with torch.no_grad(): if self.use_double_dqn: # Double DQN policy_q_values, _, _, _, _ = self._safe_cnn_forward(self.policy_net, next_states) next_actions = policy_q_values.argmax(1) target_q_values_all, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states) next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1) else: # Standard DQN next_q_values, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states) next_q_values = next_q_values.max(1)[0] # Ensure consistent shapes batch_size = states.shape[0] if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size: logger.warning(f"Shape mismatch in mixed precision replay") min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0]) rewards = rewards[:min_size] dones = dones[:min_size] next_q_values = next_q_values[:min_size] current_q_values = current_q_values[:min_size] target_q_values = rewards + (1 - dones) * self.gamma * next_q_values # Compute Q-value loss (primary task) q_loss = nn.MSELoss()(current_q_values, target_q_values.detach()) # Initialize loss with q_loss loss = q_loss # Add auxiliary losses if available try: # Price direction loss if current_price_pred is not None and current_price_pred.shape[0] > 0: price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions) if price_direction_loss is not None: loss = loss + 0.2 * price_direction_loss # Extrema loss if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0: extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions) if extrema_loss is not None: loss = loss + 0.1 * extrema_loss except Exception as e: logger.debug(f"Could not add auxiliary loss in mixed precision: {e}") # Check if loss requires gradients if not loss.requires_grad: logger.warning("Loss does not require gradients in mixed precision training") return 0.0 # Scale and backward pass self.scaler.scale(loss).backward() # Unscale gradients and clip self.scaler.unscale_(self.optimizer) torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0) # Check for valid gradients has_valid_gradients = False for param in self.policy_net.parameters(): if param.grad is not None and torch.any(torch.isfinite(param.grad)): has_valid_gradients = True break if not has_valid_gradients: logger.warning("No valid gradients 
in mixed precision training") self.scaler.update() # Still update scaler return 0.0 # Optimizer step with scaler self.scaler.step(self.optimizer) self.scaler.update() # Update target network self.training_steps += 1 if self.training_steps % self.target_update_freq == 0: self.target_net.load_state_dict(self.policy_net.state_dict()) logger.debug(f"Target network updated at step {self.training_steps}") return loss.item() except Exception as e: logger.error(f"Error in mixed precision replay: {e}") return 0.0 def train_on_extrema(self, states, actions, rewards, next_states, dones): """ Special training function specifically for extrema points Args: states: Batch of states at extrema points actions: Batch of actions rewards: Batch of rewards next_states: Batch of next states dones: Batch of done flags Returns: float: Training loss """ # Convert to numpy arrays if not already if not isinstance(states, np.ndarray): states = np.array(states) if not isinstance(actions, np.ndarray): actions = np.array(actions) if not isinstance(rewards, np.ndarray): rewards = np.array(rewards) if not isinstance(next_states, np.ndarray): next_states = np.array(next_states) if not isinstance(dones, np.ndarray): dones = np.array(dones, dtype=np.float32) # Normalize states states = np.vstack([self._normalize_state(s) for s in states]) next_states = np.vstack([self._normalize_state(s) for s in next_states]) # Convert to torch tensors and move to device states_tensor = torch.FloatTensor(states).to(self.device) actions_tensor = torch.LongTensor(actions).to(self.device) rewards_tensor = torch.FloatTensor(rewards).to(self.device) next_states_tensor = torch.FloatTensor(next_states).to(self.device) dones_tensor = torch.FloatTensor(dones).to(self.device) # Choose training method based on precision mode if self.use_mixed_precision: return self._replay_mixed_precision( states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor ) else: return self._replay_standard( states_tensor, actions_tensor, rewards_tensor, next_states_tensor, dones_tensor ) def _normalize_state(self, state: np.ndarray) -> np.ndarray: """Normalize the state data to prevent numerical issues""" # Handle NaN and infinite values state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0) # Check if state is 1D array (happens in some environments) if len(state.shape) == 1: # If 1D, we need to normalize the whole array normalized_state = state.copy() # Convert any timestamp or non-numeric data to float for i in range(len(normalized_state)): # Check for timestamp-like objects if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')): # Convert timestamp to float (seconds since epoch) normalized_state[i] = float(normalized_state[i].timestamp()) elif not isinstance(normalized_state[i], (int, float, np.number)): # Set non-numeric data to 0 normalized_state[i] = 0.0 # Ensure all values are float normalized_state = normalized_state.astype(np.float32) # Simple min-max normalization for 1D state state_min = np.min(normalized_state) state_max = np.max(normalized_state) if state_max > state_min: normalized_state = (normalized_state - state_min) / (state_max - state_min) return normalized_state # Handle 2D arrays normalized_state = np.zeros_like(state, dtype=np.float32) # Convert any timestamp or non-numeric data to float for i in range(state.shape[0]): for j in range(state.shape[1]): if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')): # Convert timestamp to float (seconds since 
epoch) normalized_state[i, j] = float(state[i, j].timestamp()) elif isinstance(state[i, j], (int, float, np.number)): normalized_state[i, j] = state[i, j] else: # Set non-numeric data to 0 normalized_state[i, j] = 0.0 # Loop through each timeframe's features in the combined state feature_count = state.shape[1] // len(self.timeframes) for tf_idx in range(len(self.timeframes)): start_idx = tf_idx * feature_count end_idx = start_idx + feature_count # Extract this timeframe's features tf_features = normalized_state[:, start_idx:end_idx] # Normalize OHLCV data by the first close price in the window # This makes price movements relative rather than absolute price_idx = 3 # Assuming close price is at index 3 if price_idx < tf_features.shape[1]: reference_price = np.mean(tf_features[:, price_idx]) if reference_price != 0: # Normalize price-related columns (OHLC) for i in range(4): # First 4 columns are OHLC if i < tf_features.shape[1]: normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price # Normalize volume using mean and std vol_idx = 4 # Assuming volume is at index 4 if vol_idx < tf_features.shape[1]: vol_mean = np.mean(tf_features[:, vol_idx]) vol_std = np.std(tf_features[:, vol_idx]) if vol_std > 0: normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std else: normalized_state[:, start_idx + vol_idx] = 0 # Other features (technical indicators) - normalize with min-max scaling for i in range(5, feature_count): if i < tf_features.shape[1]: feature_min = np.min(tf_features[:, i]) feature_max = np.max(tf_features[:, i]) if feature_max > feature_min: normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min) else: normalized_state[:, start_idx + i] = 0 return normalized_state def update_realtime_tick_features(self, tick_features): """Update with real-time tick features from tick processor""" try: if tick_features is not None: self.realtime_tick_features = tick_features # Log high-confidence tick features if tick_features.get('confidence', 0) > 0.8: logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}") except Exception as e: logger.error(f"Error updating real-time tick features: {e}") def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray: """Enhance state with real-time tick features if available""" try: if self.realtime_tick_features is None: return state # Extract neural features from tick processor neural_features = self.realtime_tick_features.get('neural_features', np.array([])) volume_features = self.realtime_tick_features.get('volume_features', np.array([])) microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([])) confidence = self.realtime_tick_features.get('confidence', 0.0) # Combine tick features - make them compact to match state dimensions tick_features = np.concatenate([ neural_features[:3] if len(neural_features) >= 3 else np.zeros(3), # Take first 3 neural features volume_features[:1] if len(volume_features) >= 1 else np.zeros(1), # Take first volume feature microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1), # Take first microstructure feature ]) # Weight the tick features weighted_tick_features = tick_features * self.tick_feature_weight # Enhance the state by adding tick features to each timeframe if len(state.shape) == 1: # 1D state - append tick features enhanced_state = np.concatenate([state, weighted_tick_features]) else: # 2D state - add tick 
features to each timeframe row num_timeframes, num_features = state.shape # Ensure tick features match the number of original features if len(weighted_tick_features) != num_features: # Pad or truncate tick features to match state feature dimension if len(weighted_tick_features) < num_features: # Pad with zeros padded_features = np.zeros(num_features) padded_features[:len(weighted_tick_features)] = weighted_tick_features weighted_tick_features = padded_features else: # Truncate to match weighted_tick_features = weighted_tick_features[:num_features] # Add tick features to the last row (most recent timeframe) enhanced_state = state.copy() enhanced_state[-1, :] += weighted_tick_features # Add to last timeframe return enhanced_state except Exception as e: logger.error(f"Error enhancing state with tick features: {e}") return state def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01): """Update learning metrics and perform learning rate adjustments if needed""" # Update average reward with exponential moving average if self.avg_reward == 0: self.avg_reward = episode_reward else: self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward # Check if we're making sufficient progress if episode_reward > (1 + best_reward_threshold) * self.best_reward: self.best_reward = episode_reward self.no_improvement_count = 0 return True # Improved else: self.no_improvement_count += 1 # If no improvement for a while, adjust learning rate if self.no_improvement_count >= 10: current_lr = self.optimizer.param_groups[0]['lr'] new_lr = current_lr * 0.5 if new_lr >= 1e-6: # Don't reduce below minimum threshold for param_group in self.optimizer.param_groups: param_group['lr'] = new_lr logger.info(f"Reducing learning rate from {current_lr} to {new_lr}") self.no_improvement_count = 0 return False # No improvement def save(self, path: str): """Save model and agent state""" os.makedirs(os.path.dirname(path), exist_ok=True) # Save policy network self.policy_net.save(f"{path}_policy") # Save target network self.target_net.save(f"{path}_target") # Save agent state state = { 'epsilon': self.epsilon, 'update_count': self.update_count, 'losses': self.losses, 'optimizer_state': self.optimizer.state_dict(), 'best_reward': self.best_reward, 'avg_reward': self.avg_reward } torch.save(state, f"{path}_agent_state.pt") logger.info(f"Agent state saved to {path}_agent_state.pt") def load(self, path: str): """Load model and agent state""" # Load policy network self.policy_net.load(f"{path}_policy") # Load target network self.target_net.load(f"{path}_target") # Load agent state try: agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device, weights_only=False) self.epsilon = agent_state['epsilon'] self.update_count = agent_state['update_count'] self.losses = agent_state['losses'] self.optimizer.load_state_dict(agent_state['optimizer_state']) # Load additional metrics if they exist if 'best_reward' in agent_state: self.best_reward = agent_state['best_reward'] if 'avg_reward' in agent_state: self.avg_reward = agent_state['avg_reward'] logger.info(f"Agent state loaded from {path}_agent_state.pt") except FileNotFoundError: logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values") def get_position_info(self): """Get current position information""" return { 'position': self.current_position, 'entry_price': self.position_entry_price, 'entry_time': self.position_entry_time, 'entry_threshold': self.entry_confidence_threshold, 'exit_threshold': 
    def _calculate_price_direction_loss(self, price_direction_pred: torch.Tensor,
                                        rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        """
        Calculate loss for price direction predictions

        Args:
            price_direction_pred: Tensor of shape [batch, 2] containing [direction, confidence]
            rewards: Tensor of shape [batch] containing rewards
            actions: Tensor of shape [batch] containing actions

        Returns:
            Price direction loss tensor, or None if the loss cannot be computed
        """
        try:
            if price_direction_pred.size(1) != 2:
                return None

            batch_size = price_direction_pred.size(0)

            # Extract direction and confidence predictions
            direction_pred = price_direction_pred[:, 0]   # -1 to 1
            confidence_pred = price_direction_pred[:, 1]  # 0 to 1

            # Create targets based on rewards and actions
            with torch.no_grad():
                # Direction targets: 1 if reward > 0 and action is BUY,
                # -1 if reward > 0 and action is SELL, 0 otherwise
                direction_targets = torch.zeros(batch_size, device=price_direction_pred.device)

                for i in range(batch_size):
                    if rewards[i] > 0.01:  # Positive reward threshold
                        if actions[i] == 0:    # BUY action
                            direction_targets[i] = 1.0   # UP
                        elif actions[i] == 1:  # SELL action
                            direction_targets[i] = -1.0  # DOWN
                    # else: targets remain 0 (sideways)

                # Confidence targets: based on reward magnitude (higher reward = higher confidence)
                confidence_targets = torch.abs(rewards).clamp(0, 1)

            # Calculate losses for each component
            direction_loss = F.mse_loss(direction_pred, direction_targets)
            confidence_loss = F.mse_loss(confidence_pred, confidence_targets)

            # Combined loss (direction is more important than confidence)
            total_loss = direction_loss + 0.3 * confidence_loss

            return total_loss

        except Exception as e:
            logger.debug(f"Error calculating price direction loss: {e}")
            return None

    def _calculate_extrema_loss(self, extrema_pred: torch.Tensor,
                                rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
        """
        Calculate loss for extrema predictions

        Args:
            extrema_pred: Extrema predictions
            rewards: Tensor containing rewards
            actions: Tensor containing actions

        Returns:
            Extrema loss tensor, or None if the loss cannot be computed
        """
        try:
            batch_size = extrema_pred.size(0)

            # Create targets based on reward patterns
            with torch.no_grad():
                # Default to class 2 ("neither")
                extrema_targets = torch.ones(batch_size, dtype=torch.long, device=extrema_pred.device) * 2

                for i in range(batch_size):
                    # High positive reward suggests we're at a good entry point
                    # (potential bottom for BUY, potential top for SELL)
                    if rewards[i] > 0.05:
                        if actions[i] == 0:    # BUY action
                            extrema_targets[i] = 0  # Bottom
                        elif actions[i] == 1:  # SELL action
                            extrema_targets[i] = 1  # Top

            # Calculate cross-entropy loss
            if extrema_pred.size(1) >= 3:
                extrema_loss = F.cross_entropy(extrema_pred[:, :3], extrema_targets)
            else:
                extrema_loss = F.cross_entropy(extrema_pred, extrema_targets)

            return extrema_loss

        except Exception as e:
            logger.debug(f"Error calculating extrema loss: {e}")
            return None

    def get_enhanced_training_stats(self):
        """Get enhanced RL training statistics with detailed metrics (from EnhancedDQNAgent)"""
        return {
            'buffer_size': len(self.memory),
            'epsilon': self.epsilon,
            'avg_reward': self.avg_reward,
            'best_reward': self.best_reward,
            'recent_rewards': list(self.recent_rewards) if hasattr(self, 'recent_rewards') else [],
            'no_improvement_count': self.no_improvement_count,
            # Enhanced statistics from EnhancedDQNAgent
            'training_steps': self.training_steps,
            'avg_td_error': np.mean(self.td_errors[-100:]) if self.td_errors else 0.0,
            'recent_losses': self.losses[-10:] if self.losses else [],
            'epsilon_trend': self.epsilon_history[-20:] if self.epsilon_history else [],
            'specialized_buffers': {
                'extrema_memory': len(self.extrema_memory),
                'positive_memory': len(self.positive_memory),
                'price_movement_memory': len(self.price_movement_memory)
            },
            'market_regime_weights': self.market_regime_weights,
            'use_double_dqn': self.use_double_dqn,
            'use_prioritized_replay': self.use_prioritized_replay,
            'gradient_clip_norm': self.gradient_clip_norm,
            'target_update_frequency': self.target_update_freq
        }
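    # Worked example (illustrative only) of the targets built in
    # _calculate_price_direction_loss: a BUY (action 0) with reward 0.5 yields
    # direction target +1.0 and confidence target 0.5; a SELL (action 1) with
    # reward -0.2 keeps direction target 0.0 and gets confidence target 0.2.
    #
    #   rewards = torch.tensor([0.5, -0.2])
    #   actions = torch.tensor([0, 1])
    #   # direction_targets  -> tensor([1.0, 0.0])
    #   # confidence_targets -> tensor([0.5, 0.2])   # abs(rewards).clamp(0, 1)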
    def get_params_count(self):
        """Get total number of parameters in the DQN model"""
        total_params = 0
        for param in self.policy_net.parameters():
            total_params += param.numel()
        return total_params

    def _sanitize_state_data(self, state):
        """Sanitize state data to ensure it's a proper numeric array"""
        try:
            # If state is already a numpy array, return it
            if isinstance(state, np.ndarray):
                # Check for empty array
                if state.size == 0:
                    logger.warning("Received empty numpy array state. Using fallback dimensions.")
                    expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
                    if isinstance(expected_size, tuple):
                        expected_size = np.prod(expected_size)
                    return np.zeros(int(expected_size), dtype=np.float32)

                # Check for non-numeric data and handle it
                if state.dtype == object:
                    # Convert object array to float array
                    sanitized = np.zeros_like(state, dtype=np.float32)
                    for i in range(state.shape[0]):
                        if len(state.shape) > 1:
                            for j in range(state.shape[1]):
                                sanitized[i, j] = self._extract_numeric_value(state[i, j])
                        else:
                            sanitized[i] = self._extract_numeric_value(state[i])
                    return sanitized
                else:
                    return state.astype(np.float32)

            # If state is a list or tuple, convert to array
            elif isinstance(state, (list, tuple)):
                # Check for empty list/tuple
                if len(state) == 0:
                    logger.warning("Received empty list/tuple state. Using fallback dimensions.")
                    expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
                    if isinstance(expected_size, tuple):
                        expected_size = np.prod(expected_size)
                    return np.zeros(int(expected_size), dtype=np.float32)

                # Recursively sanitize each element
                sanitized = []
                for item in state:
                    if isinstance(item, (list, tuple)):
                        sanitized_row = []
                        for sub_item in item:
                            sanitized_row.append(self._extract_numeric_value(sub_item))
                        sanitized.append(sanitized_row)
                    else:
                        sanitized.append(self._extract_numeric_value(item))

                result = np.array(sanitized, dtype=np.float32)

                # Check if result is empty and provide fallback
                if result.size == 0:
                    logger.warning("Sanitized state resulted in empty array. Using fallback dimensions.")
                    expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
                    if isinstance(expected_size, tuple):
                        expected_size = np.prod(expected_size)
                    return np.zeros(int(expected_size), dtype=np.float32)

                return result

            # If state is a dict, try to extract values
            elif isinstance(state, dict):
                # Try to extract meaningful values from dict
                values = []
                for key in sorted(state.keys()):  # Sort for consistency
                    values.append(self._extract_numeric_value(state[key]))
                return np.array(values, dtype=np.float32)

            # If state is a single value, make it an array
            else:
                return np.array([self._extract_numeric_value(state)], dtype=np.float32)

        except Exception as e:
            logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
            # Return a zero array as fallback with the expected state dimension
            # Use the state_dim from initialization, fallback to 403 if not available
            expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
            if isinstance(expected_size, tuple):
                expected_size = np.prod(expected_size)
            return np.zeros(int(expected_size), dtype=np.float32)
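    # Illustrative sketch (hypothetical inputs): _sanitize_state_data accepts
    # mixed or malformed inputs and always returns a float32 numpy array.
    #
    #   agent._sanitize_state_data([1, "2.5", None])               # -> array([1.0, 2.5, 0.0], dtype=float32)
    #   agent._sanitize_state_data({'close': 101.2, 'volume': 3})  # sorted keys -> array([101.2, 3.0], dtype=float32)
    #   agent._sanitize_state_data(np.array([]))                   # -> zero array with the fallback state size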
    def _extract_numeric_value(self, value):
        """Extract a numeric value from various data types"""
        try:
            # Handle None values
            if value is None:
                return 0.0

            # Handle boolean values (checked before int/float since bool is a subclass of int)
            if isinstance(value, bool):
                return float(value)

            # Handle numeric types
            if isinstance(value, (int, float, np.number)):
                return float(value)

            # Handle dict values
            elif isinstance(value, dict):
                # Try common keys for numeric data
                for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
                    if key in value:
                        return self._extract_numeric_value(value[key])
                # If no common keys, try to get first numeric value
                for v in value.values():
                    if isinstance(v, (int, float, np.number)):
                        return float(v)
                return 0.0

            # Handle string values that might be numeric
            elif isinstance(value, str):
                try:
                    return float(value)
                except (ValueError, TypeError):
                    return 0.0

            # Handle datetime objects
            elif hasattr(value, 'timestamp'):
                return float(value.timestamp())

            # Handle list/tuple - take first numeric value
            elif isinstance(value, (list, tuple)) and len(value) > 0:
                return self._extract_numeric_value(value[0])

            else:
                return 0.0

        except Exception:
            return 0.0

    def _extract_numeric_from_dict(self, data_dict):
        """Recursively extract all numeric values from a dictionary"""
        numeric_values = []

        try:
            for key, value in data_dict.items():
                if isinstance(value, (int, float)):
                    numeric_values.append(float(value))
                elif isinstance(value, (list, np.ndarray)):
                    try:
                        flattened = np.array(value).flatten()
                        for x in flattened:
                            if isinstance(x, (int, float)):
                                numeric_values.append(float(x))
                            elif hasattr(x, 'item'):  # numpy scalar
                                numeric_values.append(float(x.item()))
                    except (ValueError, TypeError):
                        continue
                elif isinstance(value, dict):
                    # Recursively extract from nested dicts
                    nested_values = self._extract_numeric_from_dict(value)
                    numeric_values.extend(nested_values)
                elif isinstance(value, torch.Tensor):
                    try:
                        numeric_values.append(float(value.item()))
                    except Exception:
                        continue
        except Exception as e:
            logger.debug(f"Error extracting numeric values from dict: {e}")

        return numeric_values
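    # Illustrative sketch (hypothetical input): _extract_numeric_from_dict flattens
    # nested dicts, arrays and tensors into a plain list of floats, in insertion order.
    #
    #   data = {'price': 100.0, 'book': {'bid': 99.9, 'ask': 100.1}, 'sizes': [1, 2]}
    #   agent._extract_numeric_from_dict(data)
    #   # -> [100.0, 99.9, 100.1, 1.0, 2.0]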