# gogo2/NN/models/dqn_agent.py (last updated 2025-07-27)
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List, Dict, Any
import os
import sys
import logging
import torch.nn.functional as F
import time
# Add the project root (three levels up from this file) to the import path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# Import checkpoint management
from utils.checkpoint_manager import save_checkpoint, load_best_checkpoint
from utils.training_integration import get_training_integration
# Configure logger
logger = logging.getLogger(__name__)
class DQNNetwork(nn.Module):
"""
Massive Deep Q-Network specifically designed for RL trading with unified BaseDataInput features
Handles 7850 input features from multi-timeframe, multi-asset data
TARGET: 50M parameters for enhanced learning capacity
"""
def __init__(self, input_dim: int, n_actions: int):
super(DQNNetwork, self).__init__()
# Handle different input dimension formats
if isinstance(input_dim, (tuple, list)):
if len(input_dim) == 1:
self.input_size = input_dim[0]
else:
self.input_size = np.prod(input_dim) # Flatten multi-dimensional input
else:
self.input_size = input_dim
self.n_actions = n_actions
# MASSIVE network architecture optimized for trading features
# Target: ~50M parameters
self.feature_extractor = nn.Sequential(
# Initial feature extraction with massive width
nn.Linear(self.input_size, 8192), # 7850 -> 8192 = ~64M weights
nn.LayerNorm(8192),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
# Deep feature processing layers
nn.Linear(8192, 6144), # 8192 -> 6144 = ~50M weights
nn.LayerNorm(6144),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(6144, 4096), # 6144 -> 4096 = ~25M weights
nn.LayerNorm(4096),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(4096, 3072), # 4096 -> 3072 = ~12M weights
nn.LayerNorm(3072),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(3072, 2048), # 3072 -> 2048 = ~6M weights
nn.LayerNorm(2048),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
)
# Market regime detection head
self.regime_head = nn.Sequential(
nn.Linear(2048, 1024),
nn.LayerNorm(1024),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(1024, 512),
nn.LayerNorm(512),
nn.ReLU(inplace=True),
nn.Linear(512, 4) # trending, ranging, volatile, mixed
)
# Price direction prediction head - outputs direction and confidence
self.price_direction_head = nn.Sequential(
nn.Linear(2048, 1024),
nn.LayerNorm(1024),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(1024, 512),
nn.LayerNorm(512),
nn.ReLU(inplace=True),
nn.Linear(512, 2) # [direction, confidence]
)
# Direction activation (tanh for -1 to 1)
self.direction_activation = nn.Tanh()
# Confidence activation (sigmoid for 0 to 1)
self.confidence_activation = nn.Sigmoid()
# Volatility prediction head
self.volatility_head = nn.Sequential(
nn.Linear(2048, 1024),
nn.LayerNorm(1024),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(1024, 256),
nn.LayerNorm(256),
nn.ReLU(inplace=True),
nn.Linear(256, 4) # predicted volatility for 4 timeframes
)
# Main Q-value head (dueling architecture)
self.value_head = nn.Sequential(
nn.Linear(2048, 1024),
nn.LayerNorm(1024),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(1024, 512),
nn.LayerNorm(512),
nn.ReLU(inplace=True),
nn.Linear(512, 1) # State value
)
self.advantage_head = nn.Sequential(
nn.Linear(2048, 1024),
nn.LayerNorm(1024),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(1024, 512),
nn.LayerNorm(512),
nn.ReLU(inplace=True),
nn.Linear(512, n_actions) # Action advantages
)
# Initialize weights
self._initialize_weights()
# Log parameter count
total_params = sum(p.numel() for p in self.parameters())
logger.info(f"DQN Network initialized with {total_params:,} parameters (target: 50M)")
def _initialize_weights(self):
"""Initialize network weights using Xavier initialization"""
for module in self.modules():
if isinstance(module, nn.Linear):
nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
def forward(self, x):
"""Forward pass through the network"""
# Ensure input is properly shaped
if x.dim() > 2:
x = x.view(x.size(0), -1) # Flatten if needed
elif x.dim() == 1:
x = x.unsqueeze(0) # Add batch dimension if needed
# Feature extraction
features = self.feature_extractor(x)
# Multiple prediction heads
regime_pred = self.regime_head(features)
price_direction_raw = self.price_direction_head(features)
# Apply separate activations to direction and confidence
direction = self.direction_activation(price_direction_raw[:, 0:1]) # -1 to 1
confidence = self.confidence_activation(price_direction_raw[:, 1:2]) # 0 to 1
price_direction_pred = torch.cat([direction, confidence], dim=1) # [batch, 2]
volatility_pred = self.volatility_head(features)
# Dueling Q-network
value = self.value_head(features)
advantage = self.advantage_head(features)
# Combine value and advantage for Q-values
q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
return q_values, regime_pred, price_direction_pred, volatility_pred, features
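# NOTE (added commentary): the dueling aggregation above computes
#   Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')
# so the value/advantage split is identifiable. Illustrative numbers:
#   V = 0.5, A = [0.2, -0.2] -> mean(A) = 0.0 -> Q = [0.7, 0.3]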
def act(self, state, explore=True):
"""
Select action greedily from Q-values (epsilon-greedy exploration is applied by DQNAgent.act, not here)
Args:
state: Current state (numpy array or tensor)
explore: Unused here; kept for interface compatibility
Returns:
action_idx: Selected action index
confidence: Confidence score
action_probs: Action probabilities
"""
# Convert state to tensor if needed
if isinstance(state, np.ndarray):
state = torch.FloatTensor(state).to(next(self.parameters()).device)
# Ensure proper shape
if state.dim() == 1:
state = state.unsqueeze(0)
with torch.no_grad():
q_values, regime_pred, price_direction_pred, volatility_pred, features = self.forward(state)
# Process price direction predictions
if price_direction_pred is not None:
self.process_price_direction_predictions(price_direction_pred)
# Get action probabilities using softmax
action_probs = F.softmax(q_values, dim=1)
# Select action (greedy for inference)
action_idx = torch.argmax(q_values, dim=1).item()
# Calculate confidence as max probability
confidence = float(action_probs[0, action_idx].item())
# Convert probabilities to list
probs_list = action_probs.squeeze(0).cpu().numpy().tolist()
return action_idx, confidence, probs_list
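# Usage sketch (added commentary, illustrative only; shapes assume the documented 7850-feature input):
#   net = DQNNetwork(input_dim=7850, n_actions=2)
#   q, regime, direction, vol, feats = net(torch.randn(1, 7850))  # q: [1, 2], direction: [1, 2]
#   action_idx, confidence, probs = net.act(np.random.rand(7850).astype(np.float32))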
class DQNAgent:
"""
Deep Q-Network agent for trading
Uses Enhanced CNN model as the base network with GPU support for improved performance
"""
def __init__(self,
state_shape: Tuple[int, ...],
n_actions: int = 2,
learning_rate: float = 0.001,
epsilon: float = 1.0,
epsilon_min: float = 0.01,
epsilon_decay: float = 0.995,
buffer_size: int = 10000,
batch_size: int = 32,
target_update: int = 100,
priority_memory: bool = True,
device=None,
model_name: str = "dqn_agent",
enable_checkpoints: bool = True):
# Checkpoint management
self.model_name = model_name
self.enable_checkpoints = enable_checkpoints
self.training_integration = get_training_integration() if enable_checkpoints else None
self.episode_count = 0
self.best_reward = float('-inf')
self.reward_history = deque(maxlen=100)
self.checkpoint_frequency = 100 # Save checkpoint every 100 episodes
# Extract state dimensions
if isinstance(state_shape, tuple) and len(state_shape) > 1:
# Multi-dimensional state (like image or sequence)
self.state_dim = state_shape
else:
# 1D state
if isinstance(state_shape, tuple):
if len(state_shape) == 0:
self.state_dim = 1 # Safe default for empty tuple
else:
self.state_dim = state_shape[0]
else:
self.state_dim = state_shape
# Store parameters
self.n_actions = n_actions
self.learning_rate = learning_rate
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.buffer_size = buffer_size
self.batch_size = batch_size
self.target_update = target_update
# Set device for computation (default to GPU if available)
if device is None:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
self.device = device
logger.info(f"DQN Agent using device: {self.device}")
# Initialize models with RL-specific network architecture
self.policy_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device)
self.target_net = DQNNetwork(self.state_dim, self.n_actions).to(self.device)
# Ensure models are on the correct device
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
# Initialize the target network with the same weights as the policy network
self.target_net.load_state_dict(self.policy_net.state_dict())
# Set target network to eval mode (dropout/norm layers behave deterministically)
self.target_net.eval()
# Optimization components
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
self.criterion = nn.MSELoss()
# Experience replay memory
self.memory = []
self.positive_memory = [] # Special memory for storing good experiences
self.update_count = 0
# Extrema detection tracking
self.last_extrema_pred = {
'class': 2, # Default to "neither" (not extrema)
'confidence': 0.0,
'raw': None
}
self.extrema_memory = []
# DQN hyperparameters
self.gamma = 0.99 # Discount factor
# Initialize avg_reward for dashboard compatibility
self.avg_reward = 0.0 # Average reward tracking for dashboard
# Market regime adaptation weights are set once in the enhanced-features block below
# Load best checkpoint if available
if self.enable_checkpoints:
self.load_best_checkpoint()
logger.info(f"DQN Agent initialized with checkpoint management: {enable_checkpoints}")
if enable_checkpoints:
logger.info(f"Model name: {model_name}, Checkpoint frequency: {self.checkpoint_frequency}")
# Rolling histories of recent actions, prices and rewards
self.recent_actions = deque(maxlen=10)
self.recent_prices = deque(maxlen=20)
self.recent_rewards = deque(maxlen=100)
# Price direction tracking - stores direction and confidence
self.last_price_direction = {
'direction': 0.0, # Single value between -1 and 1
'confidence': 0.0 # Single value between 0 and 1
}
# Store separate memory for price direction examples
self.price_movement_memory = [] # For storing examples of clear price movements
# Performance tracking
self.losses = []
self.no_improvement_count = 0
# Confidence tracking
self.confidence_history = []
self.avg_confidence = 0.0
self.max_confidence = 0.0
self.min_confidence = 1.0
# Enhanced features from EnhancedDQNAgent
# Market adaptation capabilities
self.market_regime_weights = {
'trending': 1.2, # Higher confidence in trending markets
'ranging': 0.8, # Lower confidence in ranging markets
'volatile': 0.6 # Much lower confidence in volatile markets
}
# Dueling network support (requires enhanced network architecture)
self.use_dueling = True
# Prioritized experience replay parameters
self.use_prioritized_replay = priority_memory
self.alpha = 0.6 # Priority exponent
self.beta = 0.4 # Importance sampling exponent
self.beta_increment = 0.001
# Double DQN support
self.use_double_dqn = True
# Enhanced training features from EnhancedDQNAgent
self.target_update_freq = target_update # More descriptive name
self.training_steps = 0
self.gradient_clip_norm = 1.0 # Gradient clipping
# Enhanced statistics tracking
self.epsilon_history = []
self.td_errors = [] # Track TD errors for analysis
# Trade action fee and confidence thresholds
self.trade_action_fee = 0.0005 # Small fee to discourage unnecessary trading
self.minimum_action_confidence = 0.3 # Minimum confidence to consider trading (lowered from 0.5)
# Violent move detection
self.price_history = []
self.volatility_window = 20 # Window size for volatility calculation
self.volatility_threshold = 0.0015 # Threshold for considering a move "violent"
self.post_violent_move = False # Flag for recent violent move
self.violent_move_cooldown = 0 # Cooldown after violent move
# Feature integration
self.last_hidden_features = None # Store last extracted features
self.feature_history = [] # Store history of features for analysis
# Real-time tick features integration
self.realtime_tick_features = None # Latest tick features from tick processor
self.tick_feature_weight = 0.3 # Weight for tick features in decision making
# Check if mixed precision training should be used
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
self.use_mixed_precision = True
self.scaler = torch.cuda.amp.GradScaler()
logger.info("Mixed precision training enabled")
else:
self.use_mixed_precision = False
logger.info("Mixed precision training disabled")
# Track if we're in training mode
self.training = True
# For compatibility with old code
self.state_size = np.prod(state_shape)
self.action_size = n_actions
self.memory_size = buffer_size
self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3] # Default timeframes
logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}")
logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}")
logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}")
# Log model parameters
total_params = sum(p.numel() for p in self.policy_net.parameters())
logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters")
# Position management for 2-action system
self.current_position = 0.0 # -1 (short), 0 (neutral), 1 (long)
self.position_entry_price = 0.0
self.position_entry_time = None
# Different thresholds for entry vs exit decisions - AGGRESSIVE for more training data
self.entry_confidence_threshold = 0.35 # Lower threshold for new positions (was 0.7)
self.exit_confidence_threshold = 0.15 # Very low threshold for closing positions (was 0.3)
self.uncertainty_threshold = 0.1 # When to stay neutral
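# Construction sketch (added commentary, hypothetical values; `state_vector` is a placeholder):
#   agent = DQNAgent(state_shape=(7850,), n_actions=2, batch_size=64, enable_checkpoints=False)
#   action = agent.act(state_vector, explore=True, current_price=3412.5)
# With state_shape=(7850,), state_dim resolves to 7850 and both networks are built on self.device.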
def load_best_checkpoint(self):
"""Load the best checkpoint for this DQN agent"""
try:
if not self.enable_checkpoints:
return
result = load_best_checkpoint(self.model_name)
if result:
file_path, metadata = result
checkpoint = torch.load(file_path, map_location=self.device, weights_only=False)
# Load model states
if 'policy_net_state_dict' in checkpoint:
self.policy_net.load_state_dict(checkpoint['policy_net_state_dict'])
if 'target_net_state_dict' in checkpoint:
self.target_net.load_state_dict(checkpoint['target_net_state_dict'])
if 'optimizer_state_dict' in checkpoint:
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Load training state
if 'episode_count' in checkpoint:
self.episode_count = checkpoint['episode_count']
if 'epsilon' in checkpoint:
self.epsilon = checkpoint['epsilon']
if 'best_reward' in checkpoint:
self.best_reward = checkpoint['best_reward']
logger.info(f"Loaded DQN checkpoint: {metadata.checkpoint_id}")
logger.info(f"Episode: {self.episode_count}, Best reward: {self.best_reward:.4f}")
except Exception as e:
logger.warning(f"Failed to load checkpoint for {self.model_name}: {e}")
def save_checkpoint(self, episode_reward: float, force_save: bool = False):
"""Save checkpoint if performance improved or forced"""
try:
if not self.enable_checkpoints:
return False
self.episode_count += 1
self.reward_history.append(episode_reward)
# Calculate average reward over recent episodes
avg_reward = sum(self.reward_history) / len(self.reward_history)
# Update best reward
if episode_reward > self.best_reward:
self.best_reward = episode_reward
# Save checkpoint every N episodes or if forced
should_save = (
force_save or
self.episode_count % self.checkpoint_frequency == 0 or
episode_reward > self.best_reward * 0.95 # Within 5% of best
)
if should_save and self.training_integration:
return self.training_integration.save_rl_checkpoint(
rl_agent=self,
model_name=self.model_name,
episode=self.episode_count,
avg_reward=avg_reward,
best_reward=self.best_reward,
epsilon=self.epsilon,
total_pnl=0.0 # Default to 0, can be set by calling code
)
return False
except Exception as e:
logger.error(f"Error saving DQN checkpoint: {e}")
return False
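# Typical training cadence (added commentary, sketch only; `env` is a hypothetical environment):
#   state = env.reset()
#   while not done:
#       action = agent.act(state, explore=True)
#       next_state, reward, done = env.step(action)
#       agent.remember(state, action, reward, next_state, done)
#       loss = agent.replay()
#       state = next_state
#   agent.save_checkpoint(episode_reward)  # saves every checkpoint_frequency episodes or near a new best reward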
def move_models_to_device(self, device=None):
"""Move models to the specified device (GPU/CPU)"""
if device is not None:
self.device = device
try:
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
logger.info(f"Moved models to {self.device}")
return True
except Exception as e:
logger.error(f"Failed to move models to {self.device}: {str(e)}")
return False
def to(self, device):
"""PyTorch-style device movement method"""
self.device = device
self.policy_net = self.policy_net.to(device)
self.target_net = self.target_net.to(device)
return self
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
Store experience in memory with prioritization
Args:
state: Current state
action: Action taken
reward: Reward received
next_state: Next state
done: Whether episode is done
is_extrema: Whether this is a local extrema sample (for specialized learning)
"""
experience = (state, action, reward, next_state, done)
# Always add to main memory
self.memory.append(experience)
# Try to extract price change to analyze the experience
try:
# Extract price feature from sequence data (if available)
if len(state.shape) > 1: # 2D state [timeframes, features]
current_price = state[-1, -1] # Last timeframe, last feature
next_price = next_state[-1, -1]
else: # 1D state
current_price = state[-1] # Last feature
next_price = next_state[-1]
# Calculate price change - avoid division by zero
if np.isscalar(current_price) and current_price != 0:
price_change = (next_price - current_price) / current_price
elif isinstance(current_price, np.ndarray):
# Handle array case - protect against division by zero
with np.errstate(divide='ignore', invalid='ignore'):
price_change = (next_price - current_price) / current_price
# Replace infinities and NaNs with zeros
if isinstance(price_change, np.ndarray):
price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0)
else:
price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change
else:
price_change = 0.0
# Check if this is a significant price movement
if abs(price_change) > 0.002: # Significant price change
# Store in price movement memory
self.price_movement_memory.append(experience)
# Log significant price movements
direction = "UP" if price_change > 0 else "DOWN"
logger.info(f"Stored significant {direction} price movement: {price_change:.4f}")
# For clear price movements, also duplicate in main memory to learn more
if abs(price_change) > 0.005: # Very significant movement
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
except Exception as e:
# Skip price movement analysis if it fails
pass
# Check if this is an extrema point based on our extrema detection head
if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
# Class 0 = bottom, 1 = top, 2 = neither
# Only consider high confidence predictions
if self.last_extrema_pred['confidence'] > 0.7:
self.extrema_memory.append(experience)
# Log this special experience
extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
# For tops and bottoms, also duplicate the experience in memory to learn more from it
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
# Explicitly marked extrema points also go to extrema memory
elif is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
# For very good rewards, duplicate to learn more from them
if reward > 0.1:
for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards
self.positive_memory.append(experience)
# Keep memory size under control
if len(self.memory) > self.buffer_size:
# Keep more recent experiences
self.memory = self.memory[-self.buffer_size:]
# Keep specialized memories under control too
if len(self.positive_memory) > self.buffer_size // 4:
self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
if len(self.extrema_memory) > self.buffer_size // 4:
self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
if len(self.price_movement_memory) > self.buffer_size // 4:
self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):]
def act(self, state: np.ndarray, explore=True, current_price=None, market_context=None) -> int:
"""
Choose action based on current state using 2-action system with intelligent position management
Args:
state: Current market state
explore: Whether to use epsilon-greedy exploration
current_price: Current market price for position management
market_context: Additional market context for decision making
Returns:
int: Action (0=BUY, 1=SELL)
"""
try:
# Use the DQNNetwork's act method for consistent behavior
action_idx, confidence, action_probs = self.policy_net.act(state, explore=explore)
# Apply epsilon-greedy exploration if requested
if explore and np.random.random() <= self.epsilon:
action_idx = np.random.choice(self.n_actions)
# Update tracking
if current_price:
self.recent_prices.append(current_price)
self.recent_actions.append(action_idx)
return action_idx
except Exception as e:
logger.error(f"Error in act method: {e}")
# Return default action (SELL)
return 1
def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float, List[float]]:
"""Choose action with confidence score adapted to market regime"""
try:
# Use the DQNNetwork's act method which handles the state properly
action_idx, base_confidence, action_probs = self.policy_net.act(state, explore=False)
# Adapt confidence based on market regime
regime_weight = self.market_regime_weights.get(market_regime, 1.0)
adapted_confidence = min(base_confidence * regime_weight, 1.0)
# Return action, confidence, and probabilities (for orchestrator compatibility)
return int(action_idx), float(adapted_confidence), action_probs
except Exception as e:
logger.error(f"Error in act_with_confidence: {e}")
# Return default action with low confidence
return 1, 0.1, [0.45, 0.55] # Default to SELL with low confidence
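# Regime weighting example (added commentary): with a base confidence of 0.6,
# 'trending' -> min(0.6 * 1.2, 1.0) = 0.72, 'ranging' -> 0.48, 'volatile' -> 0.36,
# per the market_regime_weights set in __init__; unknown regimes fall back to weight 1.0.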
def process_price_direction_predictions(self, price_direction_pred: torch.Tensor) -> Dict[str, float]:
"""
Process price direction predictions and convert to standardized format
Args:
price_direction_pred: Tensor of shape (batch_size, 2) containing [direction, confidence]
Returns:
Dict with direction (-1 to 1) and confidence (0 to 1)
"""
try:
if price_direction_pred is None or price_direction_pred.numel() == 0:
return self.last_price_direction
# Extract direction and confidence values
direction_value = float(price_direction_pred[0, 0].item()) # -1 to 1
confidence_value = float(price_direction_pred[0, 1].item()) # 0 to 1
# Update last price direction
self.last_price_direction = {
'direction': direction_value,
'confidence': confidence_value
}
return self.last_price_direction
except Exception as e:
logger.error(f"Error processing price direction predictions: {e}")
return self.last_price_direction
def get_price_direction_vector(self) -> Dict[str, float]:
"""
Get the current price direction and confidence
Returns:
Dict with direction (-1 to 1) and confidence (0 to 1)
"""
return self.last_price_direction
def get_price_direction_summary(self) -> Dict[str, Any]:
"""
Get a summary of price direction prediction
Returns:
Dict containing direction and confidence information
"""
try:
direction_value = self.last_price_direction['direction']
confidence_value = self.last_price_direction['confidence']
# Convert to discrete direction
if direction_value > 0.1:
direction_label = "UP"
discrete_direction = 1
elif direction_value < -0.1:
direction_label = "DOWN"
discrete_direction = -1
else:
direction_label = "SIDEWAYS"
discrete_direction = 0
return {
'direction_value': float(direction_value),
'confidence_value': float(confidence_value),
'direction_label': direction_label,
'discrete_direction': discrete_direction,
'strength': abs(float(direction_value)),
'weighted_strength': abs(float(direction_value)) * float(confidence_value)
}
except Exception as e:
logger.error(f"Error calculating price direction summary: {e}")
return {
'direction_value': 0.0,
'confidence_value': 0.0,
'direction_label': "SIDEWAYS",
'discrete_direction': 0,
'strength': 0.0,
'weighted_strength': 0.0
}
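# Worked example (added commentary): direction=0.35, confidence=0.8 ->
# direction_label="UP" (0.35 > 0.1), discrete_direction=1, strength=0.35,
# weighted_strength=0.35 * 0.8 = 0.28; values within [-0.1, 0.1] map to "SIDEWAYS".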
def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
"""
Determine action based on current position and confidence thresholds
This implements the intelligent position management where:
- When neutral: Need high confidence to enter position
- When in position: Need lower confidence to exit
- Different thresholds for entry vs exit
"""
# Apply epsilon-greedy exploration
if explore and np.random.random() <= self.epsilon:
return np.random.choice([0, 1])
# Get the dominant signal - FIXED ACTION MAPPING: 0=BUY, 1=SELL
dominant_action = 0 if buy_conf > sell_conf else 1
dominant_confidence = max(buy_conf, sell_conf)
# Decision logic based on current position
if self.current_position == 0: # No position - need high confidence to enter
if dominant_confidence >= self.entry_confidence_threshold:
# Strong enough signal to enter position
if dominant_action == 0: # BUY signal (action 0)
self.current_position = 1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 0 # Return BUY action (0)
else: # SELL signal (action 1)
self.current_position = -1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 1 # Return SELL action (1)
else:
# Not confident enough to enter position
return None
elif self.current_position > 0: # Long position
# Check the stronger (entry-level) signal first so the flip branch is reachable
if dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
# Very strong SELL signal - close long and enter short
pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = -1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
return 1 # Return SELL action (1)
elif dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
# SELL signal (action 1) with enough confidence to close long position
pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 0.0
self.position_entry_price = 0.0
self.position_entry_time = None
return 1 # Return SELL action (1)
else:
# Hold the long position
return None
elif self.current_position < 0: # Short position
if dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
# Very strong BUY signal - close short and enter long
pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
return 0 # Return BUY action (0)
elif dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
# BUY signal (action 0) with enough confidence to close short position
pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
self.current_position = 0.0
self.position_entry_price = 0.0
self.position_entry_time = None
return 0 # Return BUY action (0)
else:
# Hold the short position
return None
return None
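# Threshold walk-through (added commentary, using the defaults set in __init__):
#   neutral + buy confidence 0.40 (>= entry 0.35)            -> enter LONG, returns 0
#   long    + sell confidence 0.40 (>= entry 0.35)           -> flip LONG -> SHORT, returns 1
#   long    + sell confidence 0.20 (>= exit 0.15, < entry)   -> close LONG, returns 1
#   dominant confidence below the relevant threshold         -> returns None (hold current position)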
def _safe_cnn_forward(self, network, states):
"""Safely call CNN forward method ensuring we always get 5 return values"""
try:
result = network(states)
if isinstance(result, tuple) and len(result) == 5:
return result
elif isinstance(result, tuple) and len(result) == 1:
# Handle case where only q_values are returned (like in empty tensor case)
q_values = result[0]
batch_size = q_values.size(0)
device = q_values.device
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return q_values, default_extrema, default_price, default_features, default_advanced
else:
# Fallback: create all default tensors
batch_size = states.size(0)
device = states.device
default_q_values = torch.zeros(batch_size, self.n_actions, device=device)
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return default_q_values, default_extrema, default_price, default_features, default_advanced
except Exception as e:
logger.error(f"Error in CNN forward pass: {e}")
# Fallback: create all default tensors
batch_size = states.size(0)
device = states.device
default_q_values = torch.zeros(batch_size, self.n_actions, device=device)
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return default_q_values, default_extrema, default_price, default_features, default_advanced
def replay(self, experiences=None):
"""Train the model using experiences from memory"""
# Don't train if not in training mode
if not self.training:
return 0.0
# If no experiences provided, sample from memory
if experiences is None:
# Skip if memory is too small
if len(self.memory) < self.batch_size:
return 0.0
# Sample random mini-batch from memory
indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
experiences = [self.memory[i] for i in indices]
# Validate experiences before processing
if not experiences or len(experiences) == 0:
logger.warning("No experiences provided for training")
return 0.0
# Sanitize and validate experiences
valid_experiences = []
for i, exp in enumerate(experiences):
try:
if len(exp) != 5:
logger.debug(f"Invalid experience format at index {i}: expected 5 elements, got {len(exp)}")
continue
state, action, reward, next_state, done = exp
# Validate state
state = self._validate_and_fix_state(state)
next_state = self._validate_and_fix_state(next_state)
if state is None or next_state is None:
continue
# Validate action
if isinstance(action, dict):
action = action.get('action', action.get('value', 0))
action = int(action) if action is not None else 0
action = max(0, min(action, self.n_actions - 1)) # Clamp to valid range
# Validate reward
if isinstance(reward, dict):
reward = reward.get('reward', reward.get('value', 0.0))
reward = float(reward) if reward is not None else 0.0
# Validate done flag
done = bool(done) if done is not None else False
valid_experiences.append((state, action, reward, next_state, done))
except Exception as e:
logger.debug(f"Error processing experience {i}: {e}")
continue
if len(valid_experiences) == 0:
logger.warning("No valid experiences after sanitization")
return 0.0
# Use validated experiences for training
experiences = valid_experiences
# Extract components
states, actions, rewards, next_states, dones = zip(*experiences)
# Convert to tensors with proper validation
try:
# Ensure all data is on CPU first, then move to device
states_array = np.array(states, dtype=np.float32)
actions_array = np.array(actions, dtype=np.int64)
rewards_array = np.array(rewards, dtype=np.float32)
next_states_array = np.array(next_states, dtype=np.float32)
dones_array = np.array(dones, dtype=np.float32)
# Convert to tensors and move to device
states = torch.from_numpy(states_array).to(self.device)
actions = torch.from_numpy(actions_array).to(self.device)
rewards = torch.from_numpy(rewards_array).to(self.device)
next_states = torch.from_numpy(next_states_array).to(self.device)
dones = torch.from_numpy(dones_array).to(self.device)
# Final validation of tensor shapes
if states.shape[0] == 0 or actions.shape[0] == 0:
logger.warning("Empty tensors after conversion")
return 0.0
# Ensure all tensors have the same batch size
batch_size = states.shape[0]
if not all(tensor.shape[0] == batch_size for tensor in [actions, rewards, next_states, dones]):
logger.warning("Inconsistent batch sizes across tensors")
return 0.0
except Exception as e:
logger.error(f"Error converting experiences to tensors: {e}")
return 0.0
# Always use standard training to fix gradient issues
loss = self._replay_standard(states, actions, rewards, next_states, dones)
# Update epsilon
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# Update statistics
self.losses.append(loss)
if len(self.losses) > 1000:
self.losses = self.losses[-500:] # Keep only recent losses
return loss
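# Epsilon schedule note (added commentary): with epsilon=1.0 and decay=0.995 applied once per
# replay() call, epsilon reaches ~0.1 after about 460 calls and the floor of 0.01 after about 920.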
def _validate_and_fix_state(self, state):
"""Validate and fix state to ensure it has correct dimensions and no empty data"""
try:
# Convert to numpy if needed
if isinstance(state, torch.Tensor):
state = state.detach().cpu().numpy()
elif not isinstance(state, np.ndarray):
# Check if state is a dict or complex object
if isinstance(state, dict):
logger.error(f"State is a dict: {state}")
# Extract numerical values from dict if possible
if 'features' in state:
state = state['features']
elif 'state' in state:
state = state['state']
else:
# Try to extract all numerical values
numerical_values = []
for key, value in state.items():
if isinstance(value, (int, float)):
numerical_values.append(float(value))
elif isinstance(value, (list, np.ndarray)):
try:
# Handle nested structures safely
flattened = np.array(value).flatten()
for x in flattened:
if isinstance(x, (int, float)):
numerical_values.append(float(x))
elif hasattr(x, 'item'): # numpy scalar
numerical_values.append(float(x.item()))
except (ValueError, TypeError):
continue
elif isinstance(value, dict):
# Recursively extract from nested dicts
try:
nested_values = self._extract_numeric_from_dict(value)
numerical_values.extend(nested_values)
except Exception:
continue
if numerical_values:
state = np.array(numerical_values, dtype=np.float32)
else:
logger.error("No numerical values found in state dict, using default state")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
else:
try:
state = np.array(state, dtype=np.float32)
except (ValueError, TypeError) as e:
logger.error(f"Cannot convert state to numpy array: {type(state)}, {e}")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Flatten if multi-dimensional
if state.ndim > 1:
state = state.flatten()
# Check for empty or invalid state
if state.size == 0:
logger.warning("Empty state detected, using default")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Check for NaN or infinite values
if np.any(np.isnan(state)) or np.any(np.isinf(state)):
logger.warning("NaN or infinite values in state, replacing with zeros")
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Ensure correct dimensions
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
expected_size = int(expected_size)
if len(state) != expected_size:
if len(state) < expected_size:
# Pad with zeros
padded_state = np.zeros(expected_size, dtype=np.float32)
padded_state[:len(state)] = state
state = padded_state
else:
# Truncate
state = state[:expected_size]
return state.astype(np.float32)
except Exception as e:
logger.error(f"Error validating state: {e}")
# Return default state as fallback
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
def _replay_standard(self, states, actions, rewards, next_states, dones):
"""Standard training step without mixed precision"""
try:
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_standard")
return 0.0
# Ensure model is in training mode for gradients
self.policy_net.train()
# Get current Q values - use the updated forward method
q_values_output = self.policy_net(states)
if isinstance(q_values_output, tuple):
current_q_values_all = q_values_output[0] # Extract Q-values from tuple
else:
current_q_values_all = q_values_output
current_q_values = current_q_values_all.gather(1, actions.unsqueeze(1)).squeeze(1)
# Enhanced Double DQN implementation
with torch.no_grad():
if self.use_double_dqn:
# Double DQN: Use policy network to select actions, target network to evaluate
policy_output = self.policy_net(next_states)
policy_q_values = policy_output[0] if isinstance(policy_output, tuple) else policy_output
next_actions = policy_q_values.argmax(1)
target_output = self.target_net(next_states)
target_q_values_all = target_output[0] if isinstance(target_output, tuple) else target_output
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN: Use target network for both selection and evaluation
target_output = self.target_net(next_states)
target_q_values = target_output[0] if isinstance(target_output, tuple) else target_output
next_q_values = target_q_values.max(1)[0]
# Ensure tensor shapes are consistent
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in replay: batch_size={batch_size}, rewards={rewards.shape}, next_q_values={next_q_values.shape}")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
# Calculate target Q values
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute loss for Q value - ensure tensors require gradients
if not current_q_values.requires_grad:
logger.warning("Current Q values do not require gradients")
# Force training mode
self.policy_net.train()
return 0.0
q_loss = self.criterion(current_q_values, target_q_values.detach())
# Calculate auxiliary losses and add to Q-loss
total_loss = q_loss
# Add auxiliary losses if available
try:
# Get additional predictions from forward pass
if isinstance(q_values_output, tuple) and len(q_values_output) >= 5:
current_regime_pred = q_values_output[1]
current_price_pred = q_values_output[2]
current_volatility_pred = q_values_output[3]
current_extrema_pred = current_regime_pred # Use regime as extrema proxy for now
# Price direction loss
if current_price_pred is not None and current_price_pred.shape[0] > 0:
price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions)
if price_direction_loss is not None:
total_loss = total_loss + 0.2 * price_direction_loss
# Extrema loss
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions)
if extrema_loss is not None:
total_loss = total_loss + 0.1 * extrema_loss
except Exception as e:
logger.debug(f"Could not add auxiliary loss in standard training: {e}")
# Reset gradients
self.optimizer.zero_grad()
# Ensure total loss requires gradients
if not total_loss.requires_grad:
logger.warning("Total loss does not require gradients - policy network may not be in training mode")
self.policy_net.train() # Ensure training mode
return 0.0
# Backward pass
total_loss.backward()
# Gradient clipping
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check if gradients are valid
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients found, skipping optimizer step")
return 0.0
# Update weights
self.optimizer.step()
# Update target network periodically
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
return total_loss.item()
except Exception as e:
logger.error(f"Error in standard replay: {e}")
return 0.0
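# Target construction note (added commentary): the update above minimizes MSE(Q_policy(s, a), y) where
#   y = r + (1 - done) * gamma * Q_target(s', argmax_a Q_policy(s', a))   # Double DQN path
#   y = r + (1 - done) * gamma * max_a Q_target(s', a)                    # standard DQN path
# plus the weighted auxiliary heads (0.2 * price-direction loss, 0.1 * extrema loss) when available.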
def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
"""Mixed precision training step"""
if not self.use_mixed_precision:
logger.warning("Mixed precision not available, falling back to standard replay")
return self._replay_standard(states, actions, rewards, next_states, dones)
try:
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_mixed_precision")
return 0.0
# Zero gradients
self.optimizer.zero_grad()
# Forward pass with amp autocasting
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
with torch.cuda.amp.autocast():
# Get current Q values and predictions
current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self._safe_cnn_forward(self.policy_net, states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
if self.use_double_dqn:
# Double DQN
policy_q_values, _, _, _, _ = self._safe_cnn_forward(self.policy_net, next_states)
next_actions = policy_q_values.argmax(1)
target_q_values_all, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN
next_q_values, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = next_q_values.max(1)[0]
# Ensure consistent shapes
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in mixed precision replay")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values.detach())
# Initialize loss with q_loss
loss = q_loss
# Add auxiliary losses if available
try:
# Price direction loss
if current_price_pred is not None and current_price_pred.shape[0] > 0:
price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions)
if price_direction_loss is not None:
loss = loss + 0.2 * price_direction_loss
# Extrema loss
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions)
if extrema_loss is not None:
loss = loss + 0.1 * extrema_loss
except Exception as e:
logger.debug(f"Could not add auxiliary loss in mixed precision: {e}")
# Check if loss requires gradients
if not loss.requires_grad:
logger.warning("Loss does not require gradients in mixed precision training")
return 0.0
# Scale and backward pass
self.scaler.scale(loss).backward()
# Unscale gradients and clip
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check for valid gradients
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients in mixed precision training")
self.scaler.update() # Still update scaler
return 0.0
# Optimizer step with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
return loss.item()
except Exception as e:
logger.error(f"Error in mixed precision replay: {e}")
return 0.0
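# Mixed-precision note (added commentary): the ordering above follows the standard GradScaler recipe:
# scaler.scale(loss).backward() -> scaler.unscale_(optimizer) -> clip_grad_norm_ -> scaler.step(optimizer)
# -> scaler.update(); unscaling before clipping keeps the clip threshold in true gradient units.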
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training function specifically for extrema points
Args:
states: Batch of states at extrema points
actions: Batch of actions
rewards: Batch of rewards
next_states: Batch of next states
dones: Batch of done flags
Returns:
float: Training loss
"""
# Convert to numpy arrays if not already
if not isinstance(states, np.ndarray):
states = np.array(states)
if not isinstance(actions, np.ndarray):
actions = np.array(actions)
if not isinstance(rewards, np.ndarray):
rewards = np.array(rewards)
if not isinstance(next_states, np.ndarray):
next_states = np.array(next_states)
if not isinstance(dones, np.ndarray):
dones = np.array(dones, dtype=np.float32)
# Normalize states
states = np.vstack([self._normalize_state(s) for s in states])
next_states = np.vstack([self._normalize_state(s) for s in next_states])
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Choose training method based on precision mode
if self.use_mixed_precision:
return self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
return self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
# Handle NaN and infinite values
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Check if state is 1D array (happens in some environments)
if len(state.shape) == 1:
# If 1D, we need to normalize the whole array
normalized_state = state.copy()
# Convert any timestamp or non-numeric data to float
for i in range(len(normalized_state)):
# Check for timestamp-like objects
if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i] = float(normalized_state[i].timestamp())
elif not isinstance(normalized_state[i], (int, float, np.number)):
# Set non-numeric data to 0
normalized_state[i] = 0.0
# Ensure all values are float
normalized_state = normalized_state.astype(np.float32)
# Simple min-max normalization for 1D state
state_min = np.min(normalized_state)
state_max = np.max(normalized_state)
if state_max > state_min:
normalized_state = (normalized_state - state_min) / (state_max - state_min)
return normalized_state
# Handle 2D arrays
normalized_state = np.zeros_like(state, dtype=np.float32)
# Convert any timestamp or non-numeric data to float
for i in range(state.shape[0]):
for j in range(state.shape[1]):
if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i, j] = float(state[i, j].timestamp())
elif isinstance(state[i, j], (int, float, np.number)):
normalized_state[i, j] = state[i, j]
else:
# Set non-numeric data to 0
normalized_state[i, j] = 0.0
# Loop through each timeframe's features in the combined state
feature_count = state.shape[1] // len(self.timeframes)
for tf_idx in range(len(self.timeframes)):
start_idx = tf_idx * feature_count
end_idx = start_idx + feature_count
# Extract this timeframe's features
tf_features = normalized_state[:, start_idx:end_idx]
# Normalize OHLC data by the mean close price in the window
# This makes price movements relative rather than absolute
price_idx = 3 # Assuming close price is at index 3
if price_idx < tf_features.shape[1]:
reference_price = np.mean(tf_features[:, price_idx])
if reference_price != 0:
# Normalize price-related columns (OHLC)
for i in range(4): # First 4 columns are OHLC
if i < tf_features.shape[1]:
normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price
# Normalize volume using mean and std
vol_idx = 4 # Assuming volume is at index 4
if vol_idx < tf_features.shape[1]:
vol_mean = np.mean(tf_features[:, vol_idx])
vol_std = np.std(tf_features[:, vol_idx])
if vol_std > 0:
normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
else:
normalized_state[:, start_idx + vol_idx] = 0
# Other features (technical indicators) - normalize with min-max scaling
for i in range(5, feature_count):
if i < tf_features.shape[1]:
feature_min = np.min(tf_features[:, i])
feature_max = np.max(tf_features[:, i])
if feature_max > feature_min:
normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
else:
normalized_state[:, start_idx + i] = 0
return normalized_state
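# Normalization example (added commentary): for a 2D state, OHLC columns are divided by the mean
# close of the window (e.g. closes [100, 102, 98] -> reference 100 -> [1.00, 1.02, 0.98]),
# volume is z-scored, and the remaining indicator columns are min-max scaled per timeframe block.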
def update_realtime_tick_features(self, tick_features):
"""Update with real-time tick features from tick processor"""
try:
if tick_features is not None:
self.realtime_tick_features = tick_features
# Log high-confidence tick features
if tick_features.get('confidence', 0) > 0.8:
logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}")
except Exception as e:
logger.error(f"Error updating real-time tick features: {e}")
def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray:
"""Enhance state with real-time tick features if available"""
try:
if self.realtime_tick_features is None:
return state
# Extract neural features from tick processor
neural_features = self.realtime_tick_features.get('neural_features', np.array([]))
volume_features = self.realtime_tick_features.get('volume_features', np.array([]))
microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([]))
confidence = self.realtime_tick_features.get('confidence', 0.0)
# Combine tick features - make them compact to match state dimensions
tick_features = np.concatenate([
neural_features[:3] if len(neural_features) >= 3 else np.zeros(3), # Take first 3 neural features
volume_features[:1] if len(volume_features) >= 1 else np.zeros(1), # Take first volume feature
microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1), # Take first microstructure feature
])
# Weight the tick features
weighted_tick_features = tick_features * self.tick_feature_weight
# Enhance the state by adding tick features to each timeframe
if len(state.shape) == 1:
# 1D state - append tick features
enhanced_state = np.concatenate([state, weighted_tick_features])
else:
# 2D state - add tick features to each timeframe row
num_timeframes, num_features = state.shape
# Ensure tick features match the number of original features
if len(weighted_tick_features) != num_features:
# Pad or truncate tick features to match state feature dimension
if len(weighted_tick_features) < num_features:
# Pad with zeros
padded_features = np.zeros(num_features)
padded_features[:len(weighted_tick_features)] = weighted_tick_features
weighted_tick_features = padded_features
else:
# Truncate to match
weighted_tick_features = weighted_tick_features[:num_features]
# Add tick features to the last row (most recent timeframe)
enhanced_state = state.copy()
enhanced_state[-1, :] += weighted_tick_features # Add to last timeframe
return enhanced_state
except Exception as e:
logger.error(f"Error enhancing state with tick features: {e}")
return state
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average
if self.avg_reward == 0:
self.avg_reward = episode_reward
else:
self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward
# Check if we're making sufficient progress
if episode_reward > (1 + best_reward_threshold) * self.best_reward:
self.best_reward = episode_reward
self.no_improvement_count = 0
return True # Improved
else:
self.no_improvement_count += 1
# If no improvement for a while, adjust learning rate
if self.no_improvement_count >= 10:
current_lr = self.optimizer.param_groups[0]['lr']
new_lr = current_lr * 0.5
if new_lr >= 1e-6: # Don't reduce below minimum threshold
for param_group in self.optimizer.param_groups:
param_group['lr'] = new_lr
logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
self.no_improvement_count = 0
return False # No improvement
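# Hedged illustration (numbers are made up): the reward EMA and the plateau-driven
# learning-rate halving reduce to simple arithmetic.
#
#     avg_reward = 0.95 * 0.10 + 0.05 * 0.30   # = 0.11 after a 0.30-reward episode
#     new_lr     = 1e-4 * 0.5                  # = 5e-5, applied only while >= 1e-6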
def save(self, path: str):
"""Save model and agent state"""
os.makedirs(os.path.dirname(path) or ".", exist_ok=True) # Handle paths without a directory component
# Save policy network
self.policy_net.save(f"{path}_policy")
# Save target network
self.target_net.save(f"{path}_target")
# Save agent state
state = {
'epsilon': self.epsilon,
'update_count': self.update_count,
'losses': self.losses,
'optimizer_state': self.optimizer.state_dict(),
'best_reward': self.best_reward,
'avg_reward': self.avg_reward
}
torch.save(state, f"{path}_agent_state.pt")
logger.info(f"Agent state saved to {path}_agent_state.pt")
def load(self, path: str):
"""Load model and agent state"""
# Load policy network
self.policy_net.load(f"{path}_policy")
# Load target network
self.target_net.load(f"{path}_target")
# Load agent state
try:
agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device, weights_only=False)
self.epsilon = agent_state['epsilon']
self.update_count = agent_state['update_count']
self.losses = agent_state['losses']
self.optimizer.load_state_dict(agent_state['optimizer_state'])
# Load additional metrics if they exist
if 'best_reward' in agent_state:
self.best_reward = agent_state['best_reward']
if 'avg_reward' in agent_state:
self.avg_reward = agent_state['avg_reward']
logger.info(f"Agent state loaded from {path}_agent_state.pt")
except FileNotFoundError:
logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")
def get_position_info(self):
"""Get current position information"""
return {
'position': self.current_position,
'entry_price': self.position_entry_price,
'entry_time': self.position_entry_time,
'entry_threshold': self.entry_confidence_threshold,
'exit_threshold': self.exit_confidence_threshold
}
def _calculate_price_direction_loss(self, price_direction_pred: torch.Tensor, rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
"""
Calculate loss for price direction predictions
Args:
price_direction_pred: Tensor of shape [batch, 2] containing [direction, confidence]
rewards: Tensor of shape [batch] containing rewards
actions: Tensor of shape [batch] containing actions
Returns:
Price direction loss tensor, or None if the prediction tensor is malformed
"""
try:
if price_direction_pred.size(1) != 2:
return None
batch_size = price_direction_pred.size(0)
# Extract direction and confidence predictions
direction_pred = price_direction_pred[:, 0] # -1 to 1
confidence_pred = price_direction_pred[:, 1] # 0 to 1
# Create targets based on rewards and actions
with torch.no_grad():
# Direction targets: +1 for BUY and -1 for SELL when the reward exceeds the 0.01 threshold, 0 (sideways) otherwise
direction_targets = torch.zeros(batch_size, device=price_direction_pred.device)
for i in range(batch_size):
if rewards[i] > 0.01: # Positive reward threshold
if actions[i] == 0: # BUY action
direction_targets[i] = 1.0 # UP
elif actions[i] == 1: # SELL action
direction_targets[i] = -1.0 # DOWN
# else: targets remain 0 (sideways)
# Confidence targets: based on reward magnitude (higher reward = higher confidence)
confidence_targets = torch.abs(rewards).clamp(0, 1)
# Calculate losses for each component
direction_loss = F.mse_loss(direction_pred, direction_targets)
confidence_loss = F.mse_loss(confidence_pred, confidence_targets)
# Combined loss (direction is more important than confidence)
total_loss = direction_loss + 0.3 * confidence_loss
return total_loss
except Exception as e:
logger.debug(f"Error calculating price direction loss: {e}")
return None
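# Hedged illustration (not part of the agent; action encoding 0=BUY, 1=SELL as above):
# how direction/confidence targets come out for a tiny batch.
#
#     rewards = torch.tensor([0.5, -0.2, 0.3])
#     actions = torch.tensor([0, 0, 1])
#     # direction targets  -> [ 1.0, 0.0, -1.0 ]  (only trades with reward > 0.01 get a direction)
#     # confidence targets -> [ 0.5, 0.2, 0.3 ]   (|reward| clamped to [0, 1])
#     # total loss = mse(direction) + 0.3 * mse(confidence)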
def _calculate_extrema_loss(self, extrema_pred: torch.Tensor, rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
"""
Calculate loss for extrema predictions
Args:
extrema_pred: Extrema predictions
rewards: Tensor containing rewards
actions: Tensor containing actions
Returns:
Extrema loss tensor, or None if the loss cannot be computed
"""
try:
batch_size = extrema_pred.size(0)
# Create targets based on reward patterns
with torch.no_grad():
extrema_targets = torch.full((batch_size,), 2, dtype=torch.long, device=extrema_pred.device) # Default to "neither"
for i in range(batch_size):
# High positive reward suggests we're at a good entry point (potential bottom for BUY, top for SELL)
if rewards[i] > 0.05:
if actions[i] == 0: # BUY action
extrema_targets[i] = 0 # Bottom
elif actions[i] == 1: # SELL action
extrema_targets[i] = 1 # Top
# Calculate cross-entropy loss
if extrema_pred.size(1) >= 3:
extrema_loss = F.cross_entropy(extrema_pred[:, :3], extrema_targets)
else:
extrema_loss = F.cross_entropy(extrema_pred, extrema_targets)
return extrema_loss
except Exception as e:
logger.debug(f"Error calculating extrema loss: {e}")
return None
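# Hedged illustration (not part of the agent): extrema targets use class indices
# 0=bottom, 1=top, 2=neither, driven by the 0.05 reward threshold above.
#
#     rewards = torch.tensor([0.10, 0.02, 0.08])
#     actions = torch.tensor([0, 0, 1])
#     # targets -> [0, 2, 1]  (strong BUY reward = bottom, strong SELL reward = top)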
def get_enhanced_training_stats(self):
"""Get enhanced RL training statistics with detailed metrics (from EnhancedDQNAgent)"""
return {
'buffer_size': len(self.memory),
'epsilon': self.epsilon,
'avg_reward': self.avg_reward,
'best_reward': self.best_reward,
'recent_rewards': list(self.recent_rewards) if hasattr(self, 'recent_rewards') else [],
'no_improvement_count': self.no_improvement_count,
# Enhanced statistics from EnhancedDQNAgent
'training_steps': self.training_steps,
'avg_td_error': np.mean(self.td_errors[-100:]) if self.td_errors else 0.0,
'recent_losses': self.losses[-10:] if self.losses else [],
'epsilon_trend': self.epsilon_history[-20:] if self.epsilon_history else [],
'specialized_buffers': {
'extrema_memory': len(self.extrema_memory),
'positive_memory': len(self.positive_memory),
'price_movement_memory': len(self.price_movement_memory)
},
'market_regime_weights': self.market_regime_weights,
'use_double_dqn': self.use_double_dqn,
'use_prioritized_replay': self.use_prioritized_replay,
'gradient_clip_norm': self.gradient_clip_norm,
'target_update_frequency': self.target_update_freq
}
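# Hedged usage sketch (`agent` is a hypothetical instance): periodic logging of a
# few headline fields from the stats dict above.
#
#     stats = agent.get_enhanced_training_stats()
#     logger.info("buffer=%d eps=%.3f avg_td_err=%.5f",
#                 stats['buffer_size'], stats['epsilon'], stats['avg_td_error'])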
def get_params_count(self):
"""Get total number of parameters in the DQN model"""
return sum(param.numel() for param in self.policy_net.parameters())
def _sanitize_state_data(self, state):
"""Sanitize state data to ensure it's a proper numeric array"""
try:
# If state is already a numpy array, return it
if isinstance(state, np.ndarray):
# Check for empty array
if state.size == 0:
logger.warning("Received empty numpy array state. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Check for non-numeric data and handle it
if state.dtype == object:
# Convert object array to float array
sanitized = np.zeros_like(state, dtype=np.float32)
for i in range(state.shape[0]):
if len(state.shape) > 1:
for j in range(state.shape[1]):
sanitized[i, j] = self._extract_numeric_value(state[i, j])
else:
sanitized[i] = self._extract_numeric_value(state[i])
return sanitized
else:
return state.astype(np.float32)
# If state is a list or tuple, convert to array
elif isinstance(state, (list, tuple)):
# Check for empty list/tuple
if len(state) == 0:
logger.warning("Received empty list/tuple state. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Recursively sanitize each element
sanitized = []
for item in state:
if isinstance(item, (list, tuple)):
sanitized_row = []
for sub_item in item:
sanitized_row.append(self._extract_numeric_value(sub_item))
sanitized.append(sanitized_row)
else:
sanitized.append(self._extract_numeric_value(item))
result = np.array(sanitized, dtype=np.float32)
# Check if result is empty and provide fallback
if result.size == 0:
logger.warning("Sanitized state resulted in empty array. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
return result
# If state is a dict, try to extract values
elif isinstance(state, dict):
# Try to extract meaningful values from dict
values = []
for key in sorted(state.keys()): # Sort for consistency
values.append(self._extract_numeric_value(state[key]))
return np.array(values, dtype=np.float32)
# If state is a single value, make it an array
else:
return np.array([self._extract_numeric_value(state)], dtype=np.float32)
except Exception as e:
logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
# Return a zero array as fallback with the expected state dimension
# Use the state_dim from initialization, fallback to 403 if not available
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
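# Hedged illustration (`agent` is a hypothetical instance): mixed inputs are coerced
# into a float32 vector; unusable inputs fall back to a zero vector of the expected size.
#
#     agent._sanitize_state_data([1, "2.5", None, {"price": 3.0}])
#     # -> array([1. , 2.5, 0. , 3. ], dtype=float32)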
def _extract_numeric_value(self, value):
"""Extract a numeric value from various data types"""
try:
# Handle None values
if value is None:
return 0.0
# Handle numeric types
if isinstance(value, (int, float, np.number)):
return float(value)
# Handle dict values
elif isinstance(value, dict):
# Try common keys for numeric data
for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
if key in value:
return self._extract_numeric_value(value[key])
# If no common keys, try to get first numeric value
for v in value.values():
if isinstance(v, (int, float, np.number)):
return float(v)
return 0.0
# Handle string values that might be numeric
elif isinstance(value, str):
try:
return float(value)
except (ValueError, TypeError):
return 0.0
# Handle datetime objects
elif hasattr(value, 'timestamp'):
return float(value.timestamp())
# Handle boolean values
elif isinstance(value, bool):
return float(value)
# Handle list/tuple - take first numeric value
elif isinstance(value, (list, tuple)) and len(value) > 0:
return self._extract_numeric_value(value[0])
else:
return 0.0
except Exception:
return 0.0
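# Hedged illustration (`agent` is a hypothetical instance): representative conversions
# performed by the extractor above.
#
#     agent._extract_numeric_value(None)            # -> 0.0
#     agent._extract_numeric_value("abc")           # -> 0.0   (non-numeric string)
#     agent._extract_numeric_value({"close": 101})  # -> 101.0 (common price key)
#     agent._extract_numeric_value([7, 8, 9])       # -> 7.0   (first element of a sequence)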
def _extract_numeric_from_dict(self, data_dict):
"""Recursively extract all numeric values from a dictionary"""
numeric_values = []
try:
for key, value in data_dict.items():
if isinstance(value, (int, float, np.number)):
numeric_values.append(float(value))
elif isinstance(value, (list, np.ndarray)):
try:
flattened = np.array(value).flatten()
for x in flattened:
if isinstance(x, (int, float)):
numeric_values.append(float(x))
elif hasattr(x, 'item'): # numpy scalar
numeric_values.append(float(x.item()))
except (ValueError, TypeError):
continue
elif isinstance(value, dict):
# Recursively extract from nested dicts
nested_values = self._extract_numeric_from_dict(value)
numeric_values.extend(nested_values)
elif isinstance(value, torch.Tensor):
try:
numeric_values.append(float(value.item()))
except Exception:
continue
except Exception as e:
logger.debug(f"Error extracting numeric values from dict: {e}")
return numeric_values
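# Hedged illustration (`agent` is a hypothetical instance): nested dicts, sequences,
# and torch tensors are flattened into one list of floats.
#
#     agent._extract_numeric_from_dict({"a": 1, "b": {"c": [2, 3]}, "d": torch.tensor(4.0)})
#     # -> [1.0, 2.0, 3.0, 4.0]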