import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List
import os
import sys
import logging

# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from NN.models.simple_cnn import CNNModelPyTorch

# Configure logger
logger = logging.getLogger(__name__)


class DQNAgent:
    """
    Deep Q-Network agent for trading

    Uses CNN model as the base network
    """
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 window_size: int,
                 num_features: int,
                 timeframes: List[str],
                 learning_rate: float = 0.0005,   # Reduced learning rate for more stability
                 gamma: float = 0.97,             # Slightly reduced discount factor
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.05,       # Increased minimum epsilon for more exploration
                 epsilon_decay: float = 0.9975,   # Slower decay rate
                 memory_size: int = 20000,        # Increased memory size
                 batch_size: int = 128,           # Larger batch size
                 target_update: int = 5):         # More frequent target updates
        self.state_size = state_size
        self.action_size = action_size
        self.window_size = window_size
        self.num_features = num_features
        self.timeframes = timeframes
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.target_update = target_update

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize networks
        self.policy_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)

        self.target_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Initialize optimizer with weight decay for regularization
        # (gradient clipping is applied per update in replay())
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)

        # Initialize memories with different priorities
        self.memory = deque(maxlen=memory_size)
        self.extrema_memory = deque(maxlen=memory_size // 4)    # For extrema points
        self.positive_memory = deque(maxlen=memory_size // 4)   # For positive rewards

        # Training metrics
        self.update_count = 0
        self.losses = []
        self.avg_reward = 0
        self.no_improvement_count = 0
        self.best_reward = float('-inf')

    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Add to specialized memories if applicable
        if is_extrema:
            self.extrema_memory.append(experience)

        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)

    def act(self, state: np.ndarray, explore: bool = True) -> int:
        """Choose action using epsilon-greedy policy with explore flag"""
        if explore and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            # Ensure state is normalized before inference
            state_tensor = self._normalize_state(state)
            state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
            action_probs, extrema_pred = self.policy_net(state_tensor)
            return action_probs.argmax().item()

    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
        # Handle NaN and infinite values
        state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)

        # Check if state is 1D array (happens in some environments)
        if len(state.shape) == 1:
            # If 1D, we need to normalize the whole array
            normalized_state = state.copy()

            # Convert any timestamp or non-numeric data to float
            for i in range(len(normalized_state)):
                # Check for timestamp-like objects
                if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i] = float(normalized_state[i].timestamp())
                elif not isinstance(normalized_state[i], (int, float, np.number)):
                    # Set non-numeric data to 0
                    normalized_state[i] = 0.0

            # Ensure all values are float
            normalized_state = normalized_state.astype(np.float32)

            # Simple min-max normalization for 1D state
            state_min = np.min(normalized_state)
            state_max = np.max(normalized_state)
            if state_max > state_min:
                normalized_state = (normalized_state - state_min) / (state_max - state_min)

            return normalized_state

        # Handle 2D arrays
        normalized_state = np.zeros_like(state, dtype=np.float32)

        # Convert any timestamp or non-numeric data to float
        for i in range(state.shape[0]):
            for j in range(state.shape[1]):
                if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i, j] = float(state[i, j].timestamp())
                elif isinstance(state[i, j], (int, float, np.number)):
                    normalized_state[i, j] = state[i, j]
                else:
                    # Set non-numeric data to 0
                    normalized_state[i, j] = 0.0

        # Loop through each timeframe's features in the combined state
        feature_count = state.shape[1] // len(self.timeframes)
        for tf_idx in range(len(self.timeframes)):
            start_idx = tf_idx * feature_count
            end_idx = start_idx + feature_count

            # Extract this timeframe's features
            tf_features = normalized_state[:, start_idx:end_idx]

            # Normalize OHLC data by the mean close price in the window
            # This makes price movements relative rather than absolute
            price_idx = 3  # Assuming close price is at index 3
            if price_idx < tf_features.shape[1]:
                reference_price = np.mean(tf_features[:, price_idx])
                if reference_price != 0:
                    # Normalize price-related columns (OHLC)
                    for i in range(4):  # First 4 columns are OHLC
                        if i < tf_features.shape[1]:
                            normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price

            # Normalize volume using mean and std
            vol_idx = 4  # Assuming volume is at index 4
            if vol_idx < tf_features.shape[1]:
                vol_mean = np.mean(tf_features[:, vol_idx])
                vol_std = np.std(tf_features[:, vol_idx])
                if vol_std > 0:
                    normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
                else:
                    normalized_state[:, start_idx + vol_idx] = 0

            # Other features (technical indicators) - normalize with min-max scaling
            for i in range(5, feature_count):
                if i < tf_features.shape[1]:
                    feature_min = np.min(tf_features[:, i])
                    feature_max = np.max(tf_features[:, i])
                    if feature_max > feature_min:
                        normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
                    else:
                        normalized_state[:, start_idx + i] = 0

        return normalized_state

    def replay(self, use_prioritized=True) -> float:
        """
        Train on a batch of experiences with prioritized sampling
        Args:
            use_prioritized: Whether to use prioritized replay

        Returns:
            float: Loss value
        """
        if len(self.memory) < self.batch_size:
            return 0.0

        # Sample batch with prioritization
        batch = []
        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
            # Prioritized sampling from different memory types
            positive_count = min(self.batch_size // 4, len(self.positive_memory))
            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
            regular_count = self.batch_size - positive_count - extrema_count

            positive_samples = random.sample(list(self.positive_memory), positive_count)
            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
            regular_samples = random.sample(list(self.memory), regular_count)

            batch = positive_samples + extrema_samples + regular_samples
        else:
            # Standard sampling
            batch = random.sample(self.memory, self.batch_size)

        states, actions, rewards, next_states, dones = zip(*batch)

        # Normalize states before training
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors and move to device
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Get current Q values
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values from target network (Double DQN approach)
        with torch.no_grad():
            # Get actions from policy network
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)

            # Get Q values from target network for those actions
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

            # Compute target Q values
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

            # Clamp target values to prevent extreme values
            target_q_values = torch.clamp(target_q_values, -100, 100)

        # Compute Huber loss (more robust to outliers than MSE)
        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        # Apply gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        # Record loss so saved agent state reflects training history
        self.losses.append(loss.item())

        # Update target network if needed
        self.update_count += 1
        if self.update_count % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()

    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training method focused on extrema patterns

        Args:
            states: Array of states near extrema points
            actions: Correct actions to take (buy at bottoms, sell at tops)
            rewards: Rewards for each action
            next_states: Next states
            dones: Done flags
        """
        if len(states) == 0:
            return 0.0

        # Normalize states
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Forward pass
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values (Double DQN approach)
        with torch.no_grad():
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)

            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

            # Clamp target values
            target_q_values = torch.clamp(target_q_values, -100, 100)

        # Use Huber loss for extrema training
        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Full loss
        loss = q_loss

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        return loss.item()

    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
        if self.avg_reward == 0:
            self.avg_reward = episode_reward
        else:
            self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward

        # Check if we're making sufficient progress
        if episode_reward > (1 + best_reward_threshold) * self.best_reward:
            self.best_reward = episode_reward
            self.no_improvement_count = 0
            return True  # Improved
        else:
            self.no_improvement_count += 1

            # If no improvement for a while, adjust learning rate
            if self.no_improvement_count >= 10:
                current_lr = self.optimizer.param_groups[0]['lr']
                new_lr = current_lr * 0.5
                if new_lr >= 1e-6:  # Don't reduce below minimum threshold
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
                self.no_improvement_count = 0

            return False  # No improvement

    def save(self, path: str):
        """Save model and agent state"""
        # Create the target directory only if the path actually has one
        save_dir = os.path.dirname(path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        # Save policy network
        self.policy_net.save(f"{path}_policy")

        # Save target network
        self.target_net.save(f"{path}_target")

        # Save agent state
        state = {
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
            'optimizer_state': self.optimizer.state_dict(),
            'best_reward': self.best_reward,
            'avg_reward': self.avg_reward
        }
        torch.save(state, f"{path}_agent_state.pt")
        logger.info(f"Agent state saved to {path}_agent_state.pt")

    def load(self, path: str):
        """Load model and agent state"""
        # Load policy network
        self.policy_net.load(f"{path}_policy")

        # Load target network
        self.target_net.load(f"{path}_target")

        # Load agent state
        try:
            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
            self.epsilon = agent_state['epsilon']
            self.update_count = agent_state['update_count']
            self.losses = agent_state['losses']
            self.optimizer.load_state_dict(agent_state['optimizer_state'])

            # Load additional metrics if they exist
            if 'best_reward' in agent_state:
                self.best_reward = agent_state['best_reward']
            if 'avg_reward' in agent_state:
                self.avg_reward = agent_state['avg_reward']

            logger.info(f"Agent state loaded from {path}_agent_state.pt")
        except FileNotFoundError:
            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")
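
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the training pipeline).
# The state layout (window_size x num_features * len(timeframes)), the
# 3-action encoding, and the CNNModelPyTorch constructor/forward behaviour are
# assumptions inferred from the class above; adjust them to the real project.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    window_size, num_features, timeframes = 20, 5, ["1m", "5m"]
    agent = DQNAgent(
        state_size=window_size * num_features * len(timeframes),
        action_size=3,  # e.g. hold / buy / sell (assumed encoding)
        window_size=window_size,
        num_features=num_features,
        timeframes=timeframes,
    )

    # Fill the replay memory with random transitions, then run one update.
    state_shape = (window_size, num_features * len(timeframes))
    for _ in range(agent.batch_size):
        state = np.random.rand(*state_shape).astype(np.float32)
        next_state = np.random.rand(*state_shape).astype(np.float32)
        action = random.randrange(agent.action_size)
        reward = float(np.random.randn())
        agent.remember(state, action, reward, next_state, done=False)

    sampled_action = agent.act(np.random.rand(*state_shape).astype(np.float32))
    loss = agent.replay()
    logger.info(f"Sampled action={sampled_action}, replay loss={loss:.4f}")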