gogo2/NN/models/dqn_agent.py
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List
import os
import sys
import logging
# Add project root (three levels up) to path so NN.models can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))

from NN.models.simple_cnn import CNNModelPyTorch

# Configure logger
logger = logging.getLogger(__name__)

class DQNAgent:
    """
    Deep Q-Network agent for trading.
    Uses a CNN model as the base network.
    """
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 window_size: int,
                 num_features: int,
                 timeframes: List[str],
                 learning_rate: float = 0.0005,  # Reduced learning rate for more stability
                 gamma: float = 0.97,  # Slightly reduced discount factor
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.05,  # Increased minimum epsilon for more exploration
                 epsilon_decay: float = 0.9975,  # Slower decay rate
                 memory_size: int = 20000,  # Increased memory size
                 batch_size: int = 128,  # Larger batch size
                 target_update: int = 5):  # More frequent target updates
        self.state_size = state_size
        self.action_size = action_size
        self.window_size = window_size
        self.num_features = num_features
        self.timeframes = timeframes
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.target_update = target_update

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize policy and target networks
        self.policy_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)
        self.target_net = CNNModelPyTorch(
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())

        # Initialize optimizer with weight decay for regularization
        # (gradient clipping is applied separately in replay/train_on_extrema)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)

        # Initialize memories with different priorities
        self.memory = deque(maxlen=memory_size)
        self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points
        self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards

        # Training metrics
        self.update_count = 0
        self.losses = []
        self.avg_reward = 0
        self.no_improvement_count = 0
        self.best_reward = float('-inf')
    def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
        Store experience in memory with prioritization

        Args:
            state: Current state
            action: Action taken
            reward: Reward received
            next_state: Next state
            done: Whether episode is done
            is_extrema: Whether this is a local extrema sample (for specialized learning)
        """
        experience = (state, action, reward, next_state, done)

        # Always add to main memory
        self.memory.append(experience)

        # Add to specialized memories if applicable
        if is_extrema:
            self.extrema_memory.append(experience)

        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)
    def act(self, state: np.ndarray, explore=True) -> int:
        """Choose action using epsilon-greedy policy; set explore=False for greedy inference"""
        if explore and random.random() < self.epsilon:
            return random.randrange(self.action_size)

        with torch.no_grad():
            # Ensure state is normalized before inference
            state_tensor = self._normalize_state(state)
            state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
            action_probs, extrema_pred = self.policy_net(state_tensor)
            return action_probs.argmax().item()
    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
        # Handle NaN and infinite values
        state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)

        # Check if state is a 1D array (happens in some environments)
        if len(state.shape) == 1:
            # If 1D, normalize the whole array
            normalized_state = state.copy()

            # Convert any timestamp or non-numeric data to float
            for i in range(len(normalized_state)):
                # Check for timestamp-like objects
                if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i] = float(normalized_state[i].timestamp())
                elif not isinstance(normalized_state[i], (int, float, np.number)):
                    # Set non-numeric data to 0
                    normalized_state[i] = 0.0

            # Ensure all values are float
            normalized_state = normalized_state.astype(np.float32)

            # Simple min-max normalization for 1D state
            state_min = np.min(normalized_state)
            state_max = np.max(normalized_state)
            if state_max > state_min:
                normalized_state = (normalized_state - state_min) / (state_max - state_min)
            return normalized_state

        # Handle 2D arrays
        normalized_state = np.zeros_like(state, dtype=np.float32)

        # Convert any timestamp or non-numeric data to float
        for i in range(state.shape[0]):
            for j in range(state.shape[1]):
                if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
                    # Convert timestamp to float (seconds since epoch)
                    normalized_state[i, j] = float(state[i, j].timestamp())
                elif isinstance(state[i, j], (int, float, np.number)):
                    normalized_state[i, j] = state[i, j]
                else:
                    # Set non-numeric data to 0
                    normalized_state[i, j] = 0.0

        # Loop through each timeframe's features in the combined state
        feature_count = state.shape[1] // len(self.timeframes)
        for tf_idx in range(len(self.timeframes)):
            start_idx = tf_idx * feature_count
            end_idx = start_idx + feature_count

            # Extract this timeframe's features
            tf_features = normalized_state[:, start_idx:end_idx]

            # Normalize OHLC data by the mean close price in the window
            # This makes price movements relative rather than absolute
            price_idx = 3  # Assuming close price is at index 3
            if price_idx < tf_features.shape[1]:
                reference_price = np.mean(tf_features[:, price_idx])
                if reference_price != 0:
                    # Normalize price-related columns (OHLC)
                    for i in range(4):  # First 4 columns are OHLC
                        if i < tf_features.shape[1]:
                            normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price

            # Normalize volume using mean and std
            vol_idx = 4  # Assuming volume is at index 4
            if vol_idx < tf_features.shape[1]:
                vol_mean = np.mean(tf_features[:, vol_idx])
                vol_std = np.std(tf_features[:, vol_idx])
                if vol_std > 0:
                    normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
                else:
                    normalized_state[:, start_idx + vol_idx] = 0

            # Other features (technical indicators) - normalize with min-max scaling
            for i in range(5, feature_count):
                if i < tf_features.shape[1]:
                    feature_min = np.min(tf_features[:, i])
                    feature_max = np.max(tf_features[:, i])
                    if feature_max > feature_min:
                        normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
                    else:
                        normalized_state[:, start_idx + i] = 0

        return normalized_state
    def replay(self, use_prioritized=True) -> float:
        """
        Train on a batch of experiences with prioritized sampling

        Args:
            use_prioritized: Whether to use prioritized replay

        Returns:
            float: Loss value
        """
        if len(self.memory) < self.batch_size:
            return 0.0

        # Sample batch with prioritization
        batch = []
        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
            # Prioritized sampling from different memory types
            positive_count = min(self.batch_size // 4, len(self.positive_memory))
            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
            regular_count = self.batch_size - positive_count - extrema_count

            positive_samples = random.sample(list(self.positive_memory), positive_count)
            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
            regular_samples = random.sample(list(self.memory), regular_count)
            batch = positive_samples + extrema_samples + regular_samples
        else:
            # Standard uniform sampling
            batch = random.sample(list(self.memory), self.batch_size)

        states, actions, rewards, next_states, dones = zip(*batch)

        # Normalize states before training
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors and move to device
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Get current Q values
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values from target network (Double DQN approach)
        with torch.no_grad():
            # Select actions with the policy network
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)

            # Evaluate those actions with the target network
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

        # Compute target Q values
        target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

        # Clamp target values to prevent extreme values
        target_q_values = torch.clamp(target_q_values, -100, 100)

        # Compute Huber loss (more robust to outliers than MSE)
        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()

        # Apply gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        # Record loss so it is available for saving/monitoring
        self.losses.append(loss.item())

        # Update target network if needed
        self.update_count += 1
        if self.update_count % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()
    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training method focused on extrema patterns

        Args:
            states: Array of states near extrema points
            actions: Correct actions to take (buy at bottoms, sell at tops)
            rewards: Rewards for each action
            next_states: Next states
            dones: Done flags
        """
        if len(states) == 0:
            return 0.0

        # Normalize states
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])

        # Convert to tensors
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)

        # Forward pass
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))

        # Get next Q values (Double DQN approach)
        with torch.no_grad():
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)

        target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values

        # Clamp target values
        target_q_values = torch.clamp(target_q_values, -100, 100)

        # Use Huber loss for extrema training
        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)

        # Full loss
        loss = q_loss

        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()

        return loss.item()
    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
        if self.avg_reward == 0:
            self.avg_reward = episode_reward
        else:
            self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward

        # Check if we're making sufficient progress
        if episode_reward > (1 + best_reward_threshold) * self.best_reward:
            self.best_reward = episode_reward
            self.no_improvement_count = 0
            return True  # Improved
        else:
            self.no_improvement_count += 1

            # If no improvement for a while, reduce the learning rate
            if self.no_improvement_count >= 10:
                current_lr = self.optimizer.param_groups[0]['lr']
                new_lr = current_lr * 0.5
                if new_lr >= 1e-6:  # Don't reduce below minimum threshold
                    for param_group in self.optimizer.param_groups:
                        param_group['lr'] = new_lr
                    logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
                self.no_improvement_count = 0
            return False  # No improvement
    def save(self, path: str):
        """Save model and agent state"""
        # Create the target directory; fall back to the current directory when
        # the path has no directory component
        os.makedirs(os.path.dirname(path) or '.', exist_ok=True)

        # Save policy network
        self.policy_net.save(f"{path}_policy")

        # Save target network
        self.target_net.save(f"{path}_target")

        # Save agent state
        state = {
            'epsilon': self.epsilon,
            'update_count': self.update_count,
            'losses': self.losses,
            'optimizer_state': self.optimizer.state_dict(),
            'best_reward': self.best_reward,
            'avg_reward': self.avg_reward
        }
        torch.save(state, f"{path}_agent_state.pt")
        logger.info(f"Agent state saved to {path}_agent_state.pt")
    def load(self, path: str):
        """Load model and agent state"""
        # Load policy network
        self.policy_net.load(f"{path}_policy")

        # Load target network
        self.target_net.load(f"{path}_target")

        # Load agent state
        try:
            agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
            self.epsilon = agent_state['epsilon']
            self.update_count = agent_state['update_count']
            self.losses = agent_state['losses']
            self.optimizer.load_state_dict(agent_state['optimizer_state'])

            # Load additional metrics if they exist
            if 'best_reward' in agent_state:
                self.best_reward = agent_state['best_reward']
            if 'avg_reward' in agent_state:
                self.avg_reward = agent_state['avg_reward']

            logger.info(f"Agent state loaded from {path}_agent_state.pt")
        except FileNotFoundError:
            logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")