# gogo2/NN/models/dqn_agent.py
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random
from typing import Tuple, List, Dict, Any
import os
import sys
import logging
import torch.nn.functional as F
import time
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
# Import checkpoint management
from NN.training.model_manager import save_checkpoint, load_best_checkpoint
from NN.training.model_manager import create_model_manager
# Configure logger
logger = logging.getLogger(__name__)
class DQNNetwork(nn.Module):
"""
Configurable Deep Q-Network specifically designed for RL trading with unified BaseDataInput features
Handles 7850 input features from multi-timeframe, multi-asset data
Architecture is configurable via config.yaml
"""
def __init__(self, input_dim: int, n_actions: int, config: dict = None):
super(DQNNetwork, self).__init__()
# Handle different input dimension formats
if isinstance(input_dim, (tuple, list)):
if len(input_dim) == 1:
self.input_size = input_dim[0]
else:
self.input_size = np.prod(input_dim) # Flatten multi-dimensional input
else:
self.input_size = input_dim
self.n_actions = n_actions
# Get network architecture from config or use defaults
if config and 'network_architecture' in config:
arch_config = config['network_architecture']
feature_layers = arch_config.get('feature_layers', [4096, 3072, 2048, 1536, 1024])
regime_head = arch_config.get('regime_head', [512, 256])
price_direction_head = arch_config.get('price_direction_head', [512, 256])
volatility_head = arch_config.get('volatility_head', [512, 128])
value_head = arch_config.get('value_head', [512, 256])
advantage_head = arch_config.get('advantage_head', [512, 256])
dropout_rate = arch_config.get('dropout_rate', 0.1)
use_layer_norm = arch_config.get('use_layer_norm', True)
else:
# Default reduced architecture (half the original size)
feature_layers = [4096, 3072, 2048, 1536, 1024]
regime_head = [512, 256]
price_direction_head = [512, 256]
volatility_head = [512, 128]
value_head = [512, 256]
advantage_head = [512, 256]
dropout_rate = 0.1
use_layer_norm = True
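        # Illustrative config shape for the keys read above -- a sketch only; the
        # real values are expected under network_architecture in config.yaml:
        #   config = {"network_architecture": {
        #       "feature_layers": [4096, 3072, 2048, 1536, 1024],
        #       "regime_head": [512, 256], "price_direction_head": [512, 256],
        #       "volatility_head": [512, 128], "value_head": [512, 256],
        #       "advantage_head": [512, 256],
        #       "dropout_rate": 0.1, "use_layer_norm": True}}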
# Build configurable feature extractor
feature_layers_list = []
prev_size = self.input_size
for layer_size in feature_layers:
feature_layers_list.append(nn.Linear(prev_size, layer_size))
if use_layer_norm:
feature_layers_list.append(nn.LayerNorm(layer_size))
feature_layers_list.append(nn.ReLU(inplace=True))
feature_layers_list.append(nn.Dropout(dropout_rate))
prev_size = layer_size
self.feature_extractor = nn.Sequential(*feature_layers_list)
self.feature_size = feature_layers[-1] # Final feature size
# Build configurable network heads
def build_head_layers(input_size, layer_sizes, output_size):
layers = []
prev_size = input_size
for layer_size in layer_sizes:
layers.append(nn.Linear(prev_size, layer_size))
if use_layer_norm:
layers.append(nn.LayerNorm(layer_size))
layers.append(nn.ReLU(inplace=True))
layers.append(nn.Dropout(dropout_rate))
prev_size = layer_size
layers.append(nn.Linear(prev_size, output_size))
return nn.Sequential(*layers)
# Market regime detection head
self.regime_head = build_head_layers(
self.feature_size, regime_head, 4 # trending, ranging, volatile, mixed
)
# Price direction prediction head - outputs direction and confidence
self.price_direction_head = build_head_layers(
self.feature_size, price_direction_head, 2 # [direction, confidence]
)
# Direction activation (tanh for -1 to 1)
self.direction_activation = nn.Tanh()
# Confidence activation (sigmoid for 0 to 1)
self.confidence_activation = nn.Sigmoid()
# Volatility prediction head
self.volatility_head = build_head_layers(
self.feature_size, volatility_head, 4 # predicted volatility for 4 timeframes
)
# Main Q-value head (dueling architecture)
self.value_head = build_head_layers(
self.feature_size, value_head, 1 # Single value for dueling architecture
)
# Advantage head (dueling architecture)
self.advantage_head = build_head_layers(
self.feature_size, advantage_head, n_actions # Action advantages
)
# Initialize weights
self._initialize_weights()
# Log parameter count
total_params = sum(p.numel() for p in self.parameters())
logger.info(f"DQN Network initialized with {total_params:,} parameters (target: 50M)")
def _initialize_weights(self):
"""Initialize network weights using Xavier initialization"""
for module in self.modules():
if isinstance(module, nn.Linear):
nn.init.xavier_uniform_(module.weight)
if module.bias is not None:
nn.init.constant_(module.bias, 0)
elif isinstance(module, nn.LayerNorm):
nn.init.constant_(module.bias, 0)
nn.init.constant_(module.weight, 1.0)
def forward(self, x):
"""Forward pass through the network"""
# Ensure input is properly shaped
if x.dim() > 2:
x = x.view(x.size(0), -1) # Flatten if needed
elif x.dim() == 1:
x = x.unsqueeze(0) # Add batch dimension if needed
# Feature extraction
features = self.feature_extractor(x)
# Multiple prediction heads
regime_pred = self.regime_head(features)
price_direction_raw = self.price_direction_head(features)
# Apply separate activations to direction and confidence
direction = self.direction_activation(price_direction_raw[:, 0:1]) # -1 to 1
confidence = self.confidence_activation(price_direction_raw[:, 1:2]) # 0 to 1
price_direction_pred = torch.cat([direction, confidence], dim=1) # [batch, 2]
volatility_pred = self.volatility_head(features)
# Dueling Q-network
value = self.value_head(features)
advantage = self.advantage_head(features)
# Combine value and advantage for Q-values
q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
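        # Output shapes for a batch of size B: q_values [B, n_actions], regime_pred [B, 4],
        # price_direction_pred [B, 2] (tanh direction, sigmoid confidence),
        # volatility_pred [B, 4], features [B, feature_size]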
return q_values, regime_pred, price_direction_pred, volatility_pred, features
def act(self, state, explore=True):
"""
        Greedy action selection from the network's Q-values.
        Note: epsilon-greedy exploration is applied by DQNAgent.act(); the
        `explore` flag is accepted for interface compatibility and is not used here.
        Args:
            state: Current state (numpy array or tensor)
            explore: Unused; kept for interface compatibility
        Returns:
            action_idx: Selected action index
            confidence: Max softmax probability of the selected action
            action_probs: Action probabilities (softmax over Q-values)
"""
# Convert state to tensor if needed
if isinstance(state, np.ndarray):
state = torch.FloatTensor(state)
# Move to device
device = next(self.parameters()).device
state = state.to(device)
# Ensure proper shape
if state.dim() == 1:
state = state.unsqueeze(0)
with torch.no_grad():
q_values, regime_pred, price_direction_pred, volatility_pred, features = self.forward(state)
# Price direction predictions are processed in the agent's act method
# This is just the network forward pass
# Get action probabilities using softmax
action_probs = F.softmax(q_values, dim=1)
# Select action (greedy for inference)
action_idx = torch.argmax(q_values, dim=1).item()
# Calculate confidence as max probability
confidence = float(action_probs[0, action_idx].item())
# Convert probabilities to list
probs_list = action_probs.squeeze(0).cpu().numpy().tolist()
return action_idx, confidence, probs_list
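# Minimal usage sketch for DQNNetwork (illustrative only; the 7850-dim input and
# 3 actions follow the docstring and agent defaults above, not a guaranteed contract):
#   net = DQNNetwork(input_dim=7850, n_actions=3)
#   state = np.random.randn(7850).astype(np.float32)
#   action_idx, confidence, probs = net.act(state)            # greedy inference
#   q, regime, direction, vol, feats = net(torch.FloatTensor(state))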
class DQNAgent:
"""
    Deep Q-Network agent for trading.
    Uses the DQNNetwork defined above as policy and target networks, with GPU support,
    dueling heads, and double-DQN training for improved performance.
"""
def __init__(self,
state_shape: Tuple[int, ...],
n_actions: int = 3, # BUY=0, SELL=1, HOLD=2
learning_rate: float = 0.001,
epsilon: float = 1.0,
epsilon_min: float = 0.01,
epsilon_decay: float = 0.995,
buffer_size: int = 10000,
batch_size: int = 32,
target_update: int = 100,
priority_memory: bool = True,
device=None,
model_name: str = "dqn_agent",
enable_checkpoints: bool = True,
config: dict = None):
# Checkpoint management
self.model_name = model_name
self.enable_checkpoints = enable_checkpoints
self.training_integration = None # Removed dependency on utils.training_integration
self.episode_count = 0
self.best_reward = float('-inf')
self.reward_history = deque(maxlen=100)
self.checkpoint_frequency = 100 # Save checkpoint every 100 episodes
# Extract state dimensions
if isinstance(state_shape, tuple) and len(state_shape) > 1:
# Multi-dimensional state (like image or sequence)
self.state_dim = state_shape
else:
# 1D state
if isinstance(state_shape, tuple):
if len(state_shape) == 0:
self.state_dim = 1 # Safe default for empty tuple
else:
self.state_dim = state_shape[0]
else:
self.state_dim = state_shape
# Store parameters
self.n_actions = n_actions
self.learning_rate = learning_rate
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.buffer_size = buffer_size
self.batch_size = batch_size
self.target_update = target_update
# Set device for computation (default to GPU if available)
if device is None:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
self.device = device
logger.info(f"DQN Agent using device: {self.device}")
# Initialize models with RL-specific network architecture
self.policy_net = DQNNetwork(self.state_dim, self.n_actions, config).to(self.device)
self.target_net = DQNNetwork(self.state_dim, self.n_actions, config).to(self.device)
# Ensure models are on the correct device
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
# Initialize the target network with the same weights as the policy network
self.target_net.load_state_dict(self.policy_net.state_dict())
# Set models to eval mode (important for batch norm, dropout)
self.target_net.eval()
# Optimization components
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
self.criterion = nn.MSELoss()
# Experience replay memory
self.memory = []
self.positive_memory = [] # Special memory for storing good experiences
self.update_count = 0
# Extrema detection tracking
self.last_extrema_pred = {
'class': 2, # Default to "neither" (not extrema)
'confidence': 0.0,
'raw': None
}
self.extrema_memory = []
# DQN hyperparameters
self.gamma = 0.99 # Discount factor
# Initialize avg_reward for dashboard compatibility
self.avg_reward = 0.0 # Average reward tracking for dashboard
# Market regime adaptation weights
self.market_regime_weights = {
'trending': 1.0,
'sideways': 0.8,
'volatile': 1.2,
'bullish': 1.1,
'bearish': 1.1
}
# Load best checkpoint if available
if self.enable_checkpoints:
self.load_best_checkpoint()
logger.info(f"DQN Agent initialized with checkpoint management: {enable_checkpoints}")
if enable_checkpoints:
logger.info(f"Model name: {model_name}, Checkpoint frequency: {self.checkpoint_frequency}")
        # Recent action / price / reward tracking
self.recent_actions = deque(maxlen=10)
self.recent_prices = deque(maxlen=20)
self.recent_rewards = deque(maxlen=100)
# Price direction tracking - stores direction and confidence
self.last_price_direction = {
'direction': 0.0, # Single value between -1 and 1
'confidence': 0.0 # Single value between 0 and 1
}
# Store separate memory for price direction examples
self.price_movement_memory = [] # For storing examples of clear price movements
# Performance tracking
self.losses = []
self.no_improvement_count = 0
# Confidence tracking
self.confidence_history = []
self.avg_confidence = 0.0
self.max_confidence = 0.0
self.min_confidence = 1.0
# Enhanced features from EnhancedDQNAgent
# Market adaptation capabilities
self.market_regime_weights = {
'trending': 1.2, # Higher confidence in trending markets
'ranging': 0.8, # Lower confidence in ranging markets
'volatile': 0.6 # Much lower confidence in volatile markets
}
# Dueling network support (requires enhanced network architecture)
self.use_dueling = True
# Prioritized experience replay parameters
self.use_prioritized_replay = priority_memory
self.alpha = 0.6 # Priority exponent
self.beta = 0.4 # Importance sampling exponent
self.beta_increment = 0.001
# Double DQN support
self.use_double_dqn = True
# Enhanced training features from EnhancedDQNAgent
self.target_update_freq = target_update # More descriptive name
self.training_steps = 0
self.gradient_clip_norm = 1.0 # Gradient clipping
# Enhanced statistics tracking
self.epsilon_history = []
self.td_errors = [] # Track TD errors for analysis
# Trade action fee and confidence thresholds
self.trade_action_fee = 0.0005 # Small fee to discourage unnecessary trading
self.minimum_action_confidence = 0.3 # Minimum confidence to consider trading (lowered from 0.5)
# Violent move detection
self.price_history = []
self.volatility_window = 20 # Window size for volatility calculation
self.volatility_threshold = 0.0015 # Threshold for considering a move "violent"
self.post_violent_move = False # Flag for recent violent move
self.violent_move_cooldown = 0 # Cooldown after violent move
# Feature integration
self.last_hidden_features = None # Store last extracted features
self.feature_history = [] # Store history of features for analysis
# Real-time tick features integration
self.realtime_tick_features = None # Latest tick features from tick processor
self.tick_feature_weight = 0.3 # Weight for tick features in decision making
# Check if mixed precision training should be used
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
self.use_mixed_precision = True
self.scaler = torch.cuda.amp.GradScaler()
logger.info("Mixed precision training enabled")
else:
self.use_mixed_precision = False
logger.info("Mixed precision training disabled")
# Track if we're in training mode
self.training = True
# For compatibility with old code
self.state_size = np.prod(state_shape)
self.action_size = n_actions
self.memory_size = buffer_size
self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0] if isinstance(self.state_dim, tuple) else 3] # Default timeframes
logger.info(f"DQN Agent using Enhanced CNN with device: {self.device}")
logger.info(f"Trade action fee set to {self.trade_action_fee}, minimum confidence: {self.minimum_action_confidence}")
logger.info(f"Real-time tick feature integration enabled with weight: {self.tick_feature_weight}")
# Log model parameters
total_params = sum(p.numel() for p in self.policy_net.parameters())
logger.info(f"Enhanced CNN Policy Network: {total_params:,} parameters")
# Position management for 2-action system
self.current_position = 0.0 # -1 (short), 0 (neutral), 1 (long)
self.position_entry_price = 0.0
self.position_entry_time = None
# Different thresholds for entry vs exit decisions - AGGRESSIVE for more training data
self.entry_confidence_threshold = 0.35 # Lower threshold for new positions (was 0.7)
self.exit_confidence_threshold = 0.15 # Very low threshold for closing positions (was 0.3)
self.uncertainty_threshold = 0.1 # When to stay neutral
def load_best_checkpoint(self):
"""Load the best checkpoint for this DQN agent"""
try:
if not self.enable_checkpoints:
return
result = load_best_checkpoint(self.model_name)
if result:
file_path, metadata = result
checkpoint = torch.load(file_path, map_location=self.device, weights_only=False)
# Load model states
if 'policy_net_state_dict' in checkpoint:
self.policy_net.load_state_dict(checkpoint['policy_net_state_dict'])
if 'target_net_state_dict' in checkpoint:
self.target_net.load_state_dict(checkpoint['target_net_state_dict'])
if 'optimizer_state_dict' in checkpoint:
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Load training state
if 'episode_count' in checkpoint:
self.episode_count = checkpoint['episode_count']
if 'epsilon' in checkpoint:
self.epsilon = checkpoint['epsilon']
if 'best_reward' in checkpoint:
self.best_reward = checkpoint['best_reward']
logger.info(f"Loaded DQN checkpoint: {metadata.checkpoint_id}")
logger.info(f"Episode: {self.episode_count}, Best reward: {self.best_reward:.4f}")
except Exception as e:
logger.warning(f"Failed to load checkpoint for {self.model_name}: {e}")
def save_checkpoint(self, episode_reward: float, force_save: bool = False):
"""Save checkpoint if performance improved or forced"""
try:
if not self.enable_checkpoints:
return False
self.episode_count += 1
self.reward_history.append(episode_reward)
# Calculate average reward over recent episodes
avg_reward = sum(self.reward_history) / len(self.reward_history)
# Update best reward
if episode_reward > self.best_reward:
self.best_reward = episode_reward
# Save checkpoint every N episodes or if forced
should_save = (
force_save or
self.episode_count % self.checkpoint_frequency == 0 or
episode_reward > self.best_reward * 0.95 # Within 5% of best
)
if should_save and self.training_integration:
return self.training_integration.save_rl_checkpoint(
rl_agent=self,
model_name=self.model_name,
episode=self.episode_count,
avg_reward=avg_reward,
best_reward=self.best_reward,
epsilon=self.epsilon,
total_pnl=0.0 # Default to 0, can be set by calling code
)
return False
except Exception as e:
logger.error(f"Error saving DQN checkpoint: {e}")
return False
def move_models_to_device(self, device=None):
"""Move models to the specified device (GPU/CPU)"""
if device is not None:
self.device = device
try:
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
logger.info(f"Moved models to {self.device}")
return True
except Exception as e:
logger.error(f"Failed to move models to {self.device}: {str(e)}")
return False
def to(self, device):
"""PyTorch-style device movement method"""
self.device = device
self.policy_net = self.policy_net.to(device)
self.target_net = self.target_net.to(device)
return self
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
Store experience in memory with prioritization
Args:
state: Current state
action: Action taken
reward: Reward received
next_state: Next state
done: Whether episode is done
is_extrema: Whether this is a local extrema sample (for specialized learning)
"""
# Validate states before storing experience
if state is None or next_state is None:
logger.debug("Skipping experience storage: None state provided")
return
if isinstance(state, dict) and not state:
logger.debug("Skipping experience storage: empty state dictionary")
return
if isinstance(next_state, dict) and not next_state:
logger.debug("Skipping experience storage: empty next_state dictionary")
return
# Check if states are all zeros (invalid)
if hasattr(state, '__iter__') and all(f == 0 for f in np.array(state).flatten()):
logger.debug("Skipping experience storage: state is all zeros")
return
experience = (state, action, reward, next_state, done)
# Always add to main memory
self.memory.append(experience)
# Try to extract price change to analyze the experience
try:
# Extract price feature from sequence data (if available)
if len(state.shape) > 1: # 2D state [timeframes, features]
current_price = state[-1, -1] # Last timeframe, last feature
next_price = next_state[-1, -1]
else: # 1D state
current_price = state[-1] # Last feature
next_price = next_state[-1]
# Calculate price change - avoid division by zero
if np.isscalar(current_price) and current_price != 0:
price_change = (next_price - current_price) / current_price
elif isinstance(current_price, np.ndarray):
# Handle array case - protect against division by zero
with np.errstate(divide='ignore', invalid='ignore'):
price_change = (next_price - current_price) / current_price
# Replace infinities and NaNs with zeros
if isinstance(price_change, np.ndarray):
price_change = np.nan_to_num(price_change, nan=0.0, posinf=0.0, neginf=0.0)
else:
price_change = 0.0 if np.isnan(price_change) or np.isinf(price_change) else price_change
else:
price_change = 0.0
# Check if this is a significant price movement
if abs(price_change) > 0.002: # Significant price change
# Store in price movement memory
self.price_movement_memory.append(experience)
# Log significant price movements
direction = "UP" if price_change > 0 else "DOWN"
logger.info(f"Stored significant {direction} price movement: {price_change:.4f}")
# For clear price movements, also duplicate in main memory to learn more
if abs(price_change) > 0.005: # Very significant movement
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
except Exception as e:
# Skip price movement analysis if it fails
pass
# Check if this is an extrema point based on our extrema detection head
if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
# Class 0 = bottom, 1 = top, 2 = neither
# Only consider high confidence predictions
if self.last_extrema_pred['confidence'] > 0.7:
self.extrema_memory.append(experience)
# Log this special experience
extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
# For tops and bottoms, also duplicate the experience in memory to learn more from it
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
# Explicitly marked extrema points also go to extrema memory
elif is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
# For very good rewards, duplicate to learn more from them
if reward > 0.1:
for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards
self.positive_memory.append(experience)
# Keep memory size under control
if len(self.memory) > self.buffer_size:
# Keep more recent experiences
self.memory = self.memory[-self.buffer_size:]
# Keep specialized memories under control too
if len(self.positive_memory) > self.buffer_size // 4:
self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
if len(self.extrema_memory) > self.buffer_size // 4:
self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
if len(self.price_movement_memory) > self.buffer_size // 4:
self.price_movement_memory = self.price_movement_memory[-(self.buffer_size // 4):]
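    # Sketch of how an experience is stored (values illustrative): calling
    #   agent.remember(state=prev_obs, action=0, reward=0.004, next_state=obs, done=False)
    # also copies positive-reward samples into positive_memory, and price moves larger
    # than 0.2% of the tracked price feature into price_movement_memory.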
def act(self, state: np.ndarray, explore=True, current_price=None, market_context=None) -> int:
"""
        Choose an action for the current state.
        Args:
            state: Current market state
            explore: Whether to apply epsilon-greedy exploration
            current_price: Current market price (recorded for position tracking)
            market_context: Additional market context (currently unused here)
        Returns:
            int: Action index (0=BUY, 1=SELL; 2=HOLD when n_actions=3)
"""
try:
# Validate state first - return early if empty/invalid/None
if state is None:
logger.warning("None state provided to act(), returning SELL action")
return 1 # SELL action (safe default)
if isinstance(state, dict) and not state:
logger.warning("Empty state dictionary provided to act(), returning SELL action")
return 1 # SELL action (safe default)
            # Use the DQNNetwork's act method for consistent behavior
            action_idx, confidence, action_probs = self.policy_net.act(state, explore=explore)
            # Process price direction predictions from the network's forward pass
            # (forward expects a batched tensor, so convert the state first)
            state_tensor = torch.FloatTensor(state) if isinstance(state, np.ndarray) else state
            state_tensor = state_tensor.to(self.device)
            if state_tensor.dim() == 1:
                state_tensor = state_tensor.unsqueeze(0)
            with torch.no_grad():
                q_values, regime_pred, price_direction_pred, volatility_pred, features = self.policy_net.forward(state_tensor)
            if price_direction_pred is not None:
                self.process_price_direction_predictions(price_direction_pred)
# Apply epsilon-greedy exploration if requested
if explore and np.random.random() <= self.epsilon:
action_idx = np.random.choice(self.n_actions)
# Update tracking
if current_price:
self.recent_prices.append(current_price)
self.recent_actions.append(action_idx)
return action_idx
except Exception as e:
logger.error(f"Error in act method: {e}")
            # Return SELL (1) as a safe default
return 1
def act_with_confidence(self, state: np.ndarray, market_regime: str = 'trending') -> Tuple[int, float, List[float]]:
"""Choose action with confidence score adapted to market regime"""
try:
# Validate state first - return early if empty/invalid/None
if state is None:
logger.warning("None state provided to act_with_confidence(), returning safe defaults")
return 1, 0.1, [0.0, 0.9, 0.1] # SELL action with low confidence
if isinstance(state, dict) and not state:
logger.warning("Empty state dictionary provided to act_with_confidence(), returning safe defaults")
                return 1, 0.0, [0.0, 1.0, 0.0]  # SELL action with zero confidence
# Convert state to tensor if needed
if isinstance(state, np.ndarray):
state_tensor = torch.FloatTensor(state)
device = next(self.policy_net.parameters()).device
state_tensor = state_tensor.to(device)
# Ensure proper shape
if state_tensor.dim() == 1:
state_tensor = state_tensor.unsqueeze(0)
else:
state_tensor = state
# Get network outputs
with torch.no_grad():
q_values, regime_pred, price_direction_pred, volatility_pred, features = self.policy_net.forward(state_tensor)
# Process price direction predictions
if price_direction_pred is not None:
self.process_price_direction_predictions(price_direction_pred)
# Get action probabilities using softmax
action_probs = F.softmax(q_values, dim=1)
# Select action (greedy for inference)
action_idx = torch.argmax(q_values, dim=1).item()
# Calculate confidence as max probability
base_confidence = float(action_probs[0, action_idx].item())
# Adapt confidence based on market regime
regime_weight = self.market_regime_weights.get(market_regime, 1.0)
adapted_confidence = min(base_confidence * regime_weight, 1.0)
# Convert probabilities to list
probs_list = action_probs.squeeze(0).cpu().numpy().tolist()
# Return action, confidence, and probabilities (for orchestrator compatibility)
return int(action_idx), float(adapted_confidence), probs_list
except Exception as e:
logger.error(f"Error in act_with_confidence: {e}")
# Return default action with low confidence
            return 1, 0.1, [0.45, 0.55]  # Default to SELL action
def process_price_direction_predictions(self, price_direction_pred: torch.Tensor) -> Dict[str, float]:
"""
Process price direction predictions and convert to standardized format
Args:
price_direction_pred: Tensor of shape (batch_size, 2) containing [direction, confidence]
Returns:
Dict with direction (-1 to 1) and confidence (0 to 1)
"""
try:
if price_direction_pred is None or price_direction_pred.numel() == 0:
return self.last_price_direction
# Extract direction and confidence values
direction_value = float(price_direction_pred[0, 0].item()) # -1 to 1
confidence_value = float(price_direction_pred[0, 1].item()) # 0 to 1
# Update last price direction
self.last_price_direction = {
'direction': direction_value,
'confidence': confidence_value
}
return self.last_price_direction
except Exception as e:
logger.error(f"Error processing price direction predictions: {e}")
return self.last_price_direction
def get_price_direction_vector(self) -> Dict[str, float]:
"""
Get the current price direction and confidence
Returns:
Dict with direction (-1 to 1) and confidence (0 to 1)
"""
return self.last_price_direction
def get_price_direction_summary(self) -> Dict[str, Any]:
"""
Get a summary of price direction prediction
Returns:
Dict containing direction and confidence information
"""
try:
direction_value = self.last_price_direction['direction']
confidence_value = self.last_price_direction['confidence']
# Convert to discrete direction
if direction_value > 0.1:
direction_label = "UP"
discrete_direction = 1
elif direction_value < -0.1:
direction_label = "DOWN"
discrete_direction = -1
else:
direction_label = "SIDEWAYS"
discrete_direction = 0
return {
'direction_value': float(direction_value),
'confidence_value': float(confidence_value),
'direction_label': direction_label,
'discrete_direction': discrete_direction,
'strength': abs(float(direction_value)),
'weighted_strength': abs(float(direction_value)) * float(confidence_value)
}
except Exception as e:
logger.error(f"Error calculating price direction summary: {e}")
return {
'direction_value': 0.0,
'confidence_value': 0.0,
'direction_label': "SIDEWAYS",
'discrete_direction': 0,
'strength': 0.0,
'weighted_strength': 0.0
}
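    # Example summary (illustrative numbers): direction_value 0.45 with confidence 0.8
    # maps to direction_label "UP", discrete_direction 1, strength 0.45 and
    # weighted_strength 0.45 * 0.8 = 0.36.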
def _determine_action_with_position_management(self, sell_conf, buy_conf, current_price, market_context, explore):
"""
Determine action based on current position and confidence thresholds
This implements the intelligent position management where:
- When neutral: Need high confidence to enter position
- When in position: Need lower confidence to exit
- Different thresholds for entry vs exit
"""
# Apply epsilon-greedy exploration
if explore and np.random.random() <= self.epsilon:
return np.random.choice([0, 1])
# Get the dominant signal - FIXED ACTION MAPPING: 0=BUY, 1=SELL
dominant_action = 0 if buy_conf > sell_conf else 1
dominant_confidence = max(buy_conf, sell_conf)
# Decision logic based on current position
if self.current_position == 0: # No position - need high confidence to enter
if dominant_confidence >= self.entry_confidence_threshold:
# Strong enough signal to enter position
if dominant_action == 0: # BUY signal (action 0)
self.current_position = 1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 0 # Return BUY action (0)
else: # SELL signal (action 1)
self.current_position = -1.0
self.position_entry_price = current_price
self.position_entry_time = time.time()
logger.info(f"ENTERING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}")
return 1 # Return SELL action (1)
else:
# Not confident enough to enter position
return None
        elif self.current_position > 0:  # Long position
            # Check the stronger (entry-threshold) SELL signal first so that
            # position flips are reachable; weaker signals just close the position
            if dominant_action == 1 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong SELL signal - close long and enter short
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from LONG to SHORT at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = -1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 1  # Return SELL action (1)
            elif dominant_action == 1 and dominant_confidence >= self.exit_confidence_threshold:
                # SELL signal (action 1) with enough confidence to close the long position
                pnl = (current_price - self.position_entry_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING LONG position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 1  # Return SELL action (1)
            else:
                # Hold the long position
                return None
        elif self.current_position < 0:  # Short position
            # Same ordering on the short side: flip check before plain exit
            if dominant_action == 0 and dominant_confidence >= self.entry_confidence_threshold:
                # Very strong BUY signal - close short and enter long
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"FLIPPING from SHORT to LONG at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 1.0
                self.position_entry_price = current_price
                self.position_entry_time = time.time()
                return 0  # Return BUY action (0)
            elif dominant_action == 0 and dominant_confidence >= self.exit_confidence_threshold:
                # BUY signal (action 0) with enough confidence to close the short position
                pnl = (self.position_entry_price - current_price) / self.position_entry_price if current_price and self.position_entry_price else 0
                logger.info(f"CLOSING SHORT position at {current_price:.4f} with confidence {dominant_confidence:.4f}, PnL: {pnl:.4f}")
                self.current_position = 0.0
                self.position_entry_price = 0.0
                self.position_entry_time = None
                return 0  # Return BUY action (0)
            else:
                # Hold the short position
                return None
return None
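    # Worked example with the default thresholds (entry 0.35, exit 0.15), confidences
    # illustrative: flat + buy_conf 0.40 -> enter LONG (returns 0); LONG + sell_conf 0.20
    # -> close LONG (returns 1); LONG + sell_conf >= 0.35 -> flip to SHORT (returns 1);
    # anything below the relevant threshold -> None (hold / no trade).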
def _safe_cnn_forward(self, network, states):
"""Safely call CNN forward method ensuring we always get 5 return values"""
try:
result = network(states)
if isinstance(result, tuple) and len(result) == 5:
return result
elif isinstance(result, tuple) and len(result) == 1:
# Handle case where only q_values are returned (like in empty tensor case)
q_values = result[0]
batch_size = q_values.size(0)
device = q_values.device
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return q_values, default_extrema, default_price, default_features, default_advanced
else:
# Fallback: create all default tensors
batch_size = states.size(0)
device = states.device
default_q_values = torch.zeros(batch_size, self.n_actions, device=device)
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return default_q_values, default_extrema, default_price, default_features, default_advanced
except Exception as e:
logger.error(f"Error in CNN forward pass: {e}")
# Fallback: create all default tensors
batch_size = states.size(0)
device = states.device
default_q_values = torch.zeros(batch_size, self.n_actions, device=device)
default_extrema = torch.zeros(batch_size, 3, device=device)
default_price = torch.zeros(batch_size, 1, device=device)
default_features = torch.zeros(batch_size, 1024, device=device)
default_advanced = torch.zeros(batch_size, 1, device=device)
return default_q_values, default_extrema, default_price, default_features, default_advanced
def replay(self, experiences=None):
"""Train the model using experiences from memory"""
# Don't train if not in training mode
if not self.training:
return 0.0
# If no experiences provided, sample from memory
if experiences is None:
# Skip if memory is too small (allow early training for GPU warmup)
min_required = min(getattr(self, 'batch_size', 32), 16)
if len(self.memory) < min_required:
return 0.0
# Sample random mini-batch from memory
indices = np.random.choice(len(self.memory), size=min(self.batch_size, len(self.memory)), replace=False)
experiences = [self.memory[i] for i in indices]
# Validate experiences before processing
if not experiences or len(experiences) == 0:
logger.warning("No experiences provided for training")
return 0.0
# Sanitize and validate experiences
valid_experiences = []
for i, exp in enumerate(experiences):
try:
if len(exp) != 5:
logger.debug(f"Invalid experience format at index {i}: expected 5 elements, got {len(exp)}")
continue
state, action, reward, next_state, done = exp
# Validate state
state = self._validate_and_fix_state(state)
next_state = self._validate_and_fix_state(next_state)
if state is None or next_state is None:
continue
# Validate action
if isinstance(action, dict):
action = action.get('action', action.get('value', 0))
action = int(action) if action is not None else 0
action = max(0, min(action, self.n_actions - 1)) # Clamp to valid range
# Validate reward
if isinstance(reward, dict):
reward = reward.get('reward', reward.get('value', 0.0))
reward = float(reward) if reward is not None else 0.0
# Validate done flag
done = bool(done) if done is not None else False
valid_experiences.append((state, action, reward, next_state, done))
except Exception as e:
logger.debug(f"Error processing experience {i}: {e}")
continue
if len(valid_experiences) == 0:
logger.warning("No valid experiences after sanitization")
return 0.0
# Use validated experiences for training
experiences = valid_experiences
# Extract components
states, actions, rewards, next_states, dones = zip(*experiences)
# Convert to tensors with proper validation
try:
# Ensure all data is on CPU first, then move to device
states_array = np.array(states, dtype=np.float32)
actions_array = np.array(actions, dtype=np.int64)
rewards_array = np.array(rewards, dtype=np.float32)
next_states_array = np.array(next_states, dtype=np.float32)
dones_array = np.array(dones, dtype=np.float32)
# Convert to tensors and move to device
states = torch.from_numpy(states_array).to(self.device)
actions = torch.from_numpy(actions_array).to(self.device)
rewards = torch.from_numpy(rewards_array).to(self.device)
next_states = torch.from_numpy(next_states_array).to(self.device)
dones = torch.from_numpy(dones_array).to(self.device)
# Final validation of tensor shapes
if states.shape[0] == 0 or actions.shape[0] == 0:
logger.warning("Empty tensors after conversion")
return 0.0
# Ensure all tensors have the same batch size
batch_size = states.shape[0]
if not all(tensor.shape[0] == batch_size for tensor in [actions, rewards, next_states, dones]):
logger.warning("Inconsistent batch sizes across tensors")
return 0.0
except Exception as e:
logger.error(f"Error converting experiences to tensors: {e}")
return 0.0
# Always use standard training to fix gradient issues
loss = self._replay_standard(states, actions, rewards, next_states, dones)
# Update epsilon
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
# Update statistics
self.losses.append(loss)
if len(self.losses) > 1000:
self.losses = self.losses[-500:] # Keep only recent losses
return loss
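    # Typical training-loop usage (sketch; the surrounding environment objects are
    # hypothetical, only the agent calls below exist in this class):
    #   agent.remember(state, action, reward, next_state, done)
    #   loss = agent.replay()                      # samples its own batch from memory
    #   agent.save_checkpoint(episode_reward)      # optional checkpoint gating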
def _validate_and_fix_state(self, state):
"""Validate and fix state to ensure it has correct dimensions and no empty data"""
try:
# Convert to numpy if needed
if isinstance(state, torch.Tensor):
state = state.detach().cpu().numpy()
elif not isinstance(state, np.ndarray):
# Check if state is a dict or complex object
if isinstance(state, dict):
logger.error(f"State is a dict: {state}")
# Handle empty dictionary case
if not state:
logger.error("Empty state dictionary received, using default state")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Extract numerical values from dict if possible
if 'features' in state:
state = state['features']
elif 'state' in state:
state = state['state']
else:
# Try to extract all numerical values using the helper method
numerical_values = self._extract_numeric_from_dict(state)
if numerical_values:
state = np.array(numerical_values, dtype=np.float32)
else:
logger.error("No numerical values found in state dict, using default state")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
else:
try:
state = np.array(state, dtype=np.float32)
except (ValueError, TypeError) as e:
logger.error(f"Cannot convert state to numpy array: {type(state)}, {e}")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Flatten if multi-dimensional
if state.ndim > 1:
state = state.flatten()
# Check for empty or invalid state
if state.size == 0:
logger.warning("Empty state detected, using default")
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Check for NaN or infinite values
if np.any(np.isnan(state)) or np.any(np.isinf(state)):
logger.warning("NaN or infinite values in state, replacing with zeros")
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Ensure correct dimensions
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
expected_size = int(expected_size)
if len(state) != expected_size:
if len(state) < expected_size:
# Pad with zeros
padded_state = np.zeros(expected_size, dtype=np.float32)
padded_state[:len(state)] = state
state = padded_state
else:
# Truncate
state = state[:expected_size]
return state.astype(np.float32)
except Exception as e:
logger.error(f"Error validating state: {e}")
# Return default state as fallback
expected_size = getattr(self, 'state_size', 403)
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
def _extract_numeric_from_dict(self, data_dict):
"""Recursively extract numerical values from nested dictionaries"""
numerical_values = []
try:
for key, value in data_dict.items():
if isinstance(value, (int, float)):
numerical_values.append(float(value))
elif isinstance(value, (list, np.ndarray)):
try:
flattened = np.array(value).flatten()
for x in flattened:
if isinstance(x, (int, float)):
numerical_values.append(float(x))
elif hasattr(x, 'item'): # numpy scalar
numerical_values.append(float(x.item()))
except (ValueError, TypeError):
continue
elif isinstance(value, dict):
# Recursively extract from nested dicts
nested_values = self._extract_numeric_from_dict(value)
numerical_values.extend(nested_values)
except Exception as e:
logger.debug(f"Error extracting numeric values from dict: {e}")
return numerical_values
def _replay_standard(self, states, actions, rewards, next_states, dones):
"""Standard training step without mixed precision"""
try:
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_standard")
return 0.0
# Ensure model is in training mode for gradients
self.policy_net.train()
# Get current Q values - use the updated forward method
q_values_output = self.policy_net(states)
if isinstance(q_values_output, tuple):
current_q_values_all = q_values_output[0] # Extract Q-values from tuple
else:
current_q_values_all = q_values_output
current_q_values = current_q_values_all.gather(1, actions.unsqueeze(1)).squeeze(1)
# Enhanced Double DQN implementation
with torch.no_grad():
if self.use_double_dqn:
# Double DQN: Use policy network to select actions, target network to evaluate
policy_output = self.policy_net(next_states)
policy_q_values = policy_output[0] if isinstance(policy_output, tuple) else policy_output
next_actions = policy_q_values.argmax(1)
target_output = self.target_net(next_states)
target_q_values_all = target_output[0] if isinstance(target_output, tuple) else target_output
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN: Use target network for both selection and evaluation
target_output = self.target_net(next_states)
target_q_values = target_output[0] if isinstance(target_output, tuple) else target_output
next_q_values = target_q_values.max(1)[0]
# Ensure tensor shapes are consistent
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in replay: batch_size={batch_size}, rewards={rewards.shape}, next_q_values={next_q_values.shape}")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
# Calculate target Q values
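            # Double DQN target: y = r + gamma * (1 - done) * Q_target(s', argmax_a Q_policy(s', a));
            # plain DQN replaces the inner term with max_a Q_target(s', a)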
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute loss for Q value - ensure tensors require gradients
if not current_q_values.requires_grad:
logger.warning("Current Q values do not require gradients")
# Force training mode
self.policy_net.train()
return 0.0
q_loss = self.criterion(current_q_values, target_q_values.detach())
# Calculate auxiliary losses and add to Q-loss
total_loss = q_loss
# Add auxiliary losses if available
try:
# Get additional predictions from forward pass
if isinstance(q_values_output, tuple) and len(q_values_output) >= 5:
current_regime_pred = q_values_output[1]
current_price_pred = q_values_output[2]
current_volatility_pred = q_values_output[3]
current_extrema_pred = current_regime_pred # Use regime as extrema proxy for now
# Price direction loss
if current_price_pred is not None and current_price_pred.shape[0] > 0:
price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions)
if price_direction_loss is not None:
total_loss = total_loss + 0.2 * price_direction_loss
# Extrema loss
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions)
if extrema_loss is not None:
total_loss = total_loss + 0.1 * extrema_loss
except Exception as e:
logger.debug(f"Could not add auxiliary loss in standard training: {e}")
# Reset gradients
self.optimizer.zero_grad()
# Ensure total loss requires gradients
if not total_loss.requires_grad:
logger.warning("Total loss does not require gradients - policy network may not be in training mode")
self.policy_net.train() # Ensure training mode
return 0.0
# Backward pass
total_loss.backward()
# Gradient clipping
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check if gradients are valid
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients found, skipping optimizer step")
return 0.0
# Update weights
self.optimizer.step()
# Update target network periodically
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
return total_loss.item()
except Exception as e:
logger.error(f"Error in standard replay: {e}")
return 0.0
def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
"""Mixed precision training step"""
if not self.use_mixed_precision:
logger.warning("Mixed precision not available, falling back to standard replay")
return self._replay_standard(states, actions, rewards, next_states, dones)
try:
# Validate input tensors
if states.shape[0] == 0:
logger.warning("Empty batch in _replay_mixed_precision")
return 0.0
# Zero gradients
self.optimizer.zero_grad()
# Forward pass with amp autocasting
import warnings
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
with torch.cuda.amp.autocast():
# Get current Q values and predictions
current_q_values, current_extrema_pred, current_price_pred, hidden_features, current_advanced_pred = self._safe_cnn_forward(self.policy_net, states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
if self.use_double_dqn:
# Double DQN
policy_q_values, _, _, _, _ = self._safe_cnn_forward(self.policy_net, next_states)
next_actions = policy_q_values.argmax(1)
target_q_values_all, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = target_q_values_all.gather(1, next_actions.unsqueeze(1)).squeeze(1)
else:
# Standard DQN
next_q_values, _, _, _, _ = self._safe_cnn_forward(self.target_net, next_states)
next_q_values = next_q_values.max(1)[0]
# Ensure consistent shapes
batch_size = states.shape[0]
if rewards.shape[0] != batch_size or next_q_values.shape[0] != batch_size:
logger.warning(f"Shape mismatch in mixed precision replay")
min_size = min(batch_size, rewards.shape[0], next_q_values.shape[0])
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values.detach())
# Initialize loss with q_loss
loss = q_loss
# Add auxiliary losses if available
try:
# Price direction loss
if current_price_pred is not None and current_price_pred.shape[0] > 0:
price_direction_loss = self._calculate_price_direction_loss(current_price_pred, rewards, actions)
if price_direction_loss is not None:
loss = loss + 0.2 * price_direction_loss
# Extrema loss
if current_extrema_pred is not None and current_extrema_pred.shape[0] > 0:
extrema_loss = self._calculate_extrema_loss(current_extrema_pred, rewards, actions)
if extrema_loss is not None:
loss = loss + 0.1 * extrema_loss
except Exception as e:
logger.debug(f"Could not add auxiliary loss in mixed precision: {e}")
# Check if loss requires gradients
if not loss.requires_grad:
logger.warning("Loss does not require gradients in mixed precision training")
return 0.0
# Scale and backward pass
self.scaler.scale(loss).backward()
# Unscale gradients and clip
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
# Check for valid gradients
has_valid_gradients = False
for param in self.policy_net.parameters():
if param.grad is not None and torch.any(torch.isfinite(param.grad)):
has_valid_gradients = True
break
if not has_valid_gradients:
logger.warning("No valid gradients in mixed precision training")
self.scaler.update() # Still update scaler
return 0.0
# Optimizer step with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network
self.training_steps += 1
if self.training_steps % self.target_update_freq == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
logger.debug(f"Target network updated at step {self.training_steps}")
return loss.item()
except Exception as e:
logger.error(f"Error in mixed precision replay: {e}")
return 0.0
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training function specifically for extrema points
Args:
states: Batch of states at extrema points
actions: Batch of actions
rewards: Batch of rewards
next_states: Batch of next states
dones: Batch of done flags
Returns:
float: Training loss
"""
# Convert to numpy arrays if not already
if not isinstance(states, np.ndarray):
states = np.array(states)
if not isinstance(actions, np.ndarray):
actions = np.array(actions)
if not isinstance(rewards, np.ndarray):
rewards = np.array(rewards)
if not isinstance(next_states, np.ndarray):
next_states = np.array(next_states)
if not isinstance(dones, np.ndarray):
dones = np.array(dones, dtype=np.float32)
# Normalize states
states = np.vstack([self._normalize_state(s) for s in states])
next_states = np.vstack([self._normalize_state(s) for s in next_states])
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Choose training method based on precision mode
if self.use_mixed_precision:
return self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
return self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
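    # Sketch of extrema-focused fine-tuning (the sampling strategy here is an
    # assumption, not something this class prescribes):
    #   batch = random.sample(agent.extrema_memory, k=min(32, len(agent.extrema_memory)))
    #   loss = agent.train_on_extrema(*zip(*batch))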
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
# Handle NaN and infinite values
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Check if state is 1D array (happens in some environments)
if len(state.shape) == 1:
# If 1D, we need to normalize the whole array
normalized_state = state.copy()
# Convert any timestamp or non-numeric data to float
for i in range(len(normalized_state)):
# Check for timestamp-like objects
if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i] = float(normalized_state[i].timestamp())
elif not isinstance(normalized_state[i], (int, float, np.number)):
# Set non-numeric data to 0
normalized_state[i] = 0.0
# Ensure all values are float
normalized_state = normalized_state.astype(np.float32)
# Simple min-max normalization for 1D state
state_min = np.min(normalized_state)
state_max = np.max(normalized_state)
if state_max > state_min:
normalized_state = (normalized_state - state_min) / (state_max - state_min)
return normalized_state
# Handle 2D arrays
normalized_state = np.zeros_like(state, dtype=np.float32)
# Convert any timestamp or non-numeric data to float
for i in range(state.shape[0]):
for j in range(state.shape[1]):
if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i, j] = float(state[i, j].timestamp())
elif isinstance(state[i, j], (int, float, np.number)):
normalized_state[i, j] = state[i, j]
else:
# Set non-numeric data to 0
normalized_state[i, j] = 0.0
# Loop through each timeframe's features in the combined state
feature_count = state.shape[1] // len(self.timeframes)
for tf_idx in range(len(self.timeframes)):
start_idx = tf_idx * feature_count
end_idx = start_idx + feature_count
# Extract this timeframe's features
tf_features = normalized_state[:, start_idx:end_idx]
            # Normalize OHLC data by the mean close price in the window
            # so price movements are relative rather than absolute
price_idx = 3 # Assuming close price is at index 3
if price_idx < tf_features.shape[1]:
reference_price = np.mean(tf_features[:, price_idx])
if reference_price != 0:
# Normalize price-related columns (OHLC)
for i in range(4): # First 4 columns are OHLC
if i < tf_features.shape[1]:
normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price
# Normalize volume using mean and std
vol_idx = 4 # Assuming volume is at index 4
if vol_idx < tf_features.shape[1]:
vol_mean = np.mean(tf_features[:, vol_idx])
vol_std = np.std(tf_features[:, vol_idx])
if vol_std > 0:
normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
else:
normalized_state[:, start_idx + vol_idx] = 0
# Other features (technical indicators) - normalize with min-max scaling
for i in range(5, feature_count):
if i < tf_features.shape[1]:
feature_min = np.min(tf_features[:, i])
feature_max = np.max(tf_features[:, i])
if feature_max > feature_min:
normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
else:
normalized_state[:, start_idx + i] = 0
return normalized_state
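    # Example (illustrative sketch, assuming the conventional OHLCV layout used above:
    # close at column 3, volume at column 4, and e.g. 2 timeframes of 5 features each):
    # OHLC columns become ratios to the window's mean close, volume is z-scored, and the
    # remaining indicator columns are min-max scaled per timeframe.
    #
    #     raw = np.abs(np.random.randn(20, 10)) + 100.0
    #     norm = agent._normalize_state(raw)   # OHLC columns now hover around 1.0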
def update_realtime_tick_features(self, tick_features):
"""Update with real-time tick features from tick processor"""
try:
if tick_features is not None:
self.realtime_tick_features = tick_features
# Log high-confidence tick features
if tick_features.get('confidence', 0) > 0.8:
logger.debug(f"High-confidence tick features updated: confidence={tick_features['confidence']:.3f}")
except Exception as e:
logger.error(f"Error updating real-time tick features: {e}")
def _enhance_state_with_tick_features(self, state: np.ndarray) -> np.ndarray:
"""Enhance state with real-time tick features if available"""
try:
if self.realtime_tick_features is None:
return state
# Extract neural features from tick processor
neural_features = self.realtime_tick_features.get('neural_features', np.array([]))
volume_features = self.realtime_tick_features.get('volume_features', np.array([]))
microstructure_features = self.realtime_tick_features.get('microstructure_features', np.array([]))
confidence = self.realtime_tick_features.get('confidence', 0.0)
# Combine tick features - make them compact to match state dimensions
tick_features = np.concatenate([
neural_features[:3] if len(neural_features) >= 3 else np.zeros(3), # Take first 3 neural features
volume_features[:1] if len(volume_features) >= 1 else np.zeros(1), # Take first volume feature
microstructure_features[:1] if len(microstructure_features) >= 1 else np.zeros(1), # Take first microstructure feature
])
# Weight the tick features
weighted_tick_features = tick_features * self.tick_feature_weight
# Enhance the state by adding tick features to each timeframe
if len(state.shape) == 1:
# 1D state - append tick features
enhanced_state = np.concatenate([state, weighted_tick_features])
else:
# 2D state - add tick features to each timeframe row
num_timeframes, num_features = state.shape
# Ensure tick features match the number of original features
if len(weighted_tick_features) != num_features:
# Pad or truncate tick features to match state feature dimension
if len(weighted_tick_features) < num_features:
# Pad with zeros
padded_features = np.zeros(num_features)
padded_features[:len(weighted_tick_features)] = weighted_tick_features
weighted_tick_features = padded_features
else:
# Truncate to match
weighted_tick_features = weighted_tick_features[:num_features]
# Add tick features to the last row (most recent timeframe)
enhanced_state = state.copy()
enhanced_state[-1, :] += weighted_tick_features # Add to last timeframe
return enhanced_state
except Exception as e:
logger.error(f"Error enhancing state with tick features: {e}")
return state
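    # Illustrative note: for a 2D state, the five weighted tick features (3 neural, 1 volume,
    # 1 microstructure) are padded or truncated to the state's feature width and added onto
    # the last row only, so just the most recent timeframe is perturbed. Rough sketch,
    # assuming `tick_feature_weight` has already been configured on the agent:
    #
    #     agent.update_realtime_tick_features({'neural_features': np.ones(8),
    #                                          'volume_features': np.ones(2),
    #                                          'microstructure_features': np.ones(2),
    #                                          'confidence': 0.9})
    #     enhanced = agent._enhance_state_with_tick_features(state)   # same shape as `state`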
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average
if self.avg_reward == 0:
self.avg_reward = episode_reward
else:
self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward
# Check if we're making sufficient progress
if episode_reward > (1 + best_reward_threshold) * self.best_reward:
self.best_reward = episode_reward
self.no_improvement_count = 0
return True # Improved
else:
self.no_improvement_count += 1
# If no improvement for a while, adjust learning rate
if self.no_improvement_count >= 10:
current_lr = self.optimizer.param_groups[0]['lr']
new_lr = current_lr * 0.5
if new_lr >= 1e-6: # Don't reduce below minimum threshold
for param_group in self.optimizer.param_groups:
param_group['lr'] = new_lr
logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
self.no_improvement_count = 0
return False # No improvement
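    # Worked example (illustrative): the running average is an exponential moving average,
    # avg_reward <- 0.95 * avg_reward + 0.05 * episode_reward, and after 10 episodes with no
    # new best reward the learning rate is halved (but never reduced below 1e-6).
    #
    #     improved = agent.update_learning_metrics(episode_reward=0.25)
    #     # with best_reward = 0.20 and threshold 0.01: 0.25 > 1.01 * 0.20, so improved is True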
def save(self, path: str = None):
"""Save model and agent state using unified registry"""
try:
from NN.training.model_manager import save_model
# Use unified registry if no path or if it's a models/ path
if path is None or path.startswith('models/'):
model_name = "dqn_agent"
if path:
model_name = path.split('/')[-1].replace('_agent_state', '').replace('.pt', '')
# Prepare full agent state
agent_state = {
'epsilon': self.epsilon,
'update_count': self.update_count,
'losses': self.losses,
'optimizer_state': self.optimizer.state_dict(),
'best_reward': self.best_reward,
'avg_reward': self.avg_reward,
'policy_net_state': self.policy_net.state_dict(),
'target_net_state': self.target_net.state_dict()
}
success = save_model(
model=self.policy_net, # Save policy net as main model
model_name=model_name,
model_type='dqn',
metadata={'full_agent_state': agent_state}
)
if success:
logger.info(f"DQN agent saved to unified registry: {model_name}")
return
else:
# Legacy direct file save
                os.makedirs(os.path.dirname(path) or '.', exist_ok=True)
# Save policy network
self.policy_net.save(f"{path}_policy")
# Save target network
self.target_net.save(f"{path}_target")
# Save agent state
state = {
'epsilon': self.epsilon,
'update_count': self.update_count,
'losses': self.losses,
'optimizer_state': self.optimizer.state_dict(),
'best_reward': self.best_reward,
'avg_reward': self.avg_reward
}
torch.save(state, f"{path}_agent_state.pt")
logger.info(f"Agent state saved to {path}_agent_state.pt (legacy mode)")
except Exception as e:
logger.error(f"Failed to save DQN agent: {e}")
def load(self, path: str = None):
"""Load model and agent state from unified registry or file"""
try:
from NN.training.model_manager import load_model
# Use unified registry if no path or if it's a models/ path
if path is None or path.startswith('models/'):
model_name = "dqn_agent"
if path:
model_name = path.split('/')[-1].replace('_agent_state', '').replace('.pt', '')
model = load_model(model_name, 'dqn')
if model is None:
logger.warning(f"Could not load DQN agent {model_name} from unified registry")
return
# Load full agent state from metadata
                # NOTE: get_model_registry() is assumed to be available in this module's scope;
                # it is not imported in this method, so a missing import would raise NameError here
                registry = get_model_registry()
if model_name in registry.metadata['models']:
model_data = registry.metadata['models'][model_name]
if 'full_agent_state' in model_data:
agent_state = model_data['full_agent_state']
# Restore agent state
self.epsilon = agent_state['epsilon']
self.update_count = agent_state['update_count']
self.losses = agent_state['losses']
self.optimizer.load_state_dict(agent_state['optimizer_state'])
# Load additional metrics if they exist
if 'best_reward' in agent_state:
self.best_reward = agent_state['best_reward']
if 'avg_reward' in agent_state:
self.avg_reward = agent_state['avg_reward']
# Load network states
if 'policy_net_state' in agent_state:
self.policy_net.load_state_dict(agent_state['policy_net_state'])
if 'target_net_state' in agent_state:
self.target_net.load_state_dict(agent_state['target_net_state'])
logger.info(f"DQN agent loaded from unified registry: {model_name}")
return
else:
# Legacy direct file load
# Load policy network
self.policy_net.load(f"{path}_policy")
# Load target network
self.target_net.load(f"{path}_target")
# Load agent state
try:
agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device, weights_only=False)
self.epsilon = agent_state['epsilon']
self.update_count = agent_state['update_count']
self.losses = agent_state['losses']
self.optimizer.load_state_dict(agent_state['optimizer_state'])
# Load additional metrics if they exist
if 'best_reward' in agent_state:
self.best_reward = agent_state['best_reward']
if 'avg_reward' in agent_state:
self.avg_reward = agent_state['avg_reward']
logger.info(f"Agent state loaded from {path}_agent_state.pt (legacy mode)")
except FileNotFoundError:
logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")
except Exception as e:
logger.error(f"Failed to load DQN agent: {e}")
def get_position_info(self):
"""Get current position information"""
return {
'position': self.current_position,
'entry_price': self.position_entry_price,
'entry_time': self.position_entry_time,
'entry_threshold': self.entry_confidence_threshold,
'exit_threshold': self.exit_confidence_threshold
}
def _calculate_price_direction_loss(self, price_direction_pred: torch.Tensor, rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
"""
Calculate loss for price direction predictions
Args:
price_direction_pred: Tensor of shape [batch, 2] containing [direction, confidence]
rewards: Tensor of shape [batch] containing rewards
actions: Tensor of shape [batch] containing actions
Returns:
Price direction loss tensor
"""
try:
if price_direction_pred.size(1) != 2:
return None
batch_size = price_direction_pred.size(0)
# Extract direction and confidence predictions
direction_pred = price_direction_pred[:, 0] # -1 to 1
confidence_pred = price_direction_pred[:, 1] # 0 to 1
# Create targets based on rewards and actions
with torch.no_grad():
                # Direction targets: +1 if reward exceeds the 0.01 threshold and the action is BUY,
                # -1 if it exceeds the threshold and the action is SELL, 0 (sideways) otherwise
direction_targets = torch.zeros(batch_size, device=price_direction_pred.device)
for i in range(batch_size):
if rewards[i] > 0.01: # Positive reward threshold
if actions[i] == 0: # BUY action
direction_targets[i] = 1.0 # UP
elif actions[i] == 1: # SELL action
direction_targets[i] = -1.0 # DOWN
# else: targets remain 0 (sideways)
# Confidence targets: based on reward magnitude (higher reward = higher confidence)
confidence_targets = torch.abs(rewards).clamp(0, 1)
# Calculate losses for each component
direction_loss = F.mse_loss(direction_pred, direction_targets)
confidence_loss = F.mse_loss(confidence_pred, confidence_targets)
# Combined loss (direction is more important than confidence)
total_loss = direction_loss + 0.3 * confidence_loss
return total_loss
except Exception as e:
logger.debug(f"Error calculating price direction loss: {e}")
return None
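    # Worked example (illustrative): with reward 0.5 on a BUY (action 0), the direction
    # target is +1.0 and the confidence target is clamp(|0.5|, 0, 1) = 0.5, giving
    # loss = MSE(direction_pred, 1.0) + 0.3 * MSE(confidence_pred, 0.5). Rewards at or
    # below the 0.01 threshold leave the direction target at 0 (sideways).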
def _calculate_extrema_loss(self, extrema_pred: torch.Tensor, rewards: torch.Tensor, actions: torch.Tensor) -> torch.Tensor:
"""
Calculate loss for extrema predictions
Args:
extrema_pred: Extrema predictions
rewards: Tensor containing rewards
actions: Tensor containing actions
Returns:
Extrema loss tensor
"""
try:
batch_size = extrema_pred.size(0)
# Create targets based on reward patterns
with torch.no_grad():
                extrema_targets = torch.full((batch_size,), 2, dtype=torch.long, device=extrema_pred.device)  # Default to class 2 = "neither"
for i in range(batch_size):
# High positive reward suggests we're at a good entry point (potential bottom for BUY, top for SELL)
if rewards[i] > 0.05:
if actions[i] == 0: # BUY action
extrema_targets[i] = 0 # Bottom
elif actions[i] == 1: # SELL action
extrema_targets[i] = 1 # Top
# Calculate cross-entropy loss
if extrema_pred.size(1) >= 3:
extrema_loss = F.cross_entropy(extrema_pred[:, :3], extrema_targets)
else:
extrema_loss = F.cross_entropy(extrema_pred, extrema_targets)
return extrema_loss
except Exception as e:
logger.debug(f"Error calculating extrema loss: {e}")
return None
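    # Illustrative mapping: rewards above 0.05 label the transition as an extrema example,
    # class 0 ("bottom") for BUY and class 1 ("top") for SELL; everything else keeps the
    # default class 2 ("neither"), and cross-entropy is taken over the first three logits
    # of extrema_pred when at least three are available.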
def get_enhanced_training_stats(self):
"""Get enhanced RL training statistics with detailed metrics (from EnhancedDQNAgent)"""
return {
'buffer_size': len(self.memory),
'epsilon': self.epsilon,
'avg_reward': self.avg_reward,
'best_reward': self.best_reward,
'recent_rewards': list(self.recent_rewards) if hasattr(self, 'recent_rewards') else [],
'no_improvement_count': self.no_improvement_count,
# Enhanced statistics from EnhancedDQNAgent
'training_steps': self.training_steps,
'avg_td_error': np.mean(self.td_errors[-100:]) if self.td_errors else 0.0,
'recent_losses': self.losses[-10:] if self.losses else [],
'epsilon_trend': self.epsilon_history[-20:] if self.epsilon_history else [],
'specialized_buffers': {
'extrema_memory': len(self.extrema_memory),
'positive_memory': len(self.positive_memory),
'price_movement_memory': len(self.price_movement_memory)
},
'market_regime_weights': self.market_regime_weights,
'use_double_dqn': self.use_double_dqn,
'use_prioritized_replay': self.use_prioritized_replay,
'gradient_clip_norm': self.gradient_clip_norm,
'target_update_frequency': self.target_update_freq
}
    def get_params_count(self):
        """Get total number of parameters in the DQN policy network"""
        return sum(param.numel() for param in self.policy_net.parameters())
def _sanitize_state_data(self, state):
"""Sanitize state data to ensure it's a proper numeric array"""
try:
# If state is already a numpy array, return it
if isinstance(state, np.ndarray):
# Check for empty array
if state.size == 0:
logger.warning("Received empty numpy array state. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Check for non-numeric data and handle it
if state.dtype == object:
# Convert object array to float array
sanitized = np.zeros_like(state, dtype=np.float32)
for i in range(state.shape[0]):
if len(state.shape) > 1:
for j in range(state.shape[1]):
sanitized[i, j] = self._extract_numeric_value(state[i, j])
else:
sanitized[i] = self._extract_numeric_value(state[i])
return sanitized
else:
return state.astype(np.float32)
# If state is a list or tuple, convert to array
elif isinstance(state, (list, tuple)):
# Check for empty list/tuple
if len(state) == 0:
logger.warning("Received empty list/tuple state. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
# Recursively sanitize each element
sanitized = []
for item in state:
if isinstance(item, (list, tuple)):
sanitized_row = []
for sub_item in item:
sanitized_row.append(self._extract_numeric_value(sub_item))
sanitized.append(sanitized_row)
else:
sanitized.append(self._extract_numeric_value(item))
result = np.array(sanitized, dtype=np.float32)
# Check if result is empty and provide fallback
if result.size == 0:
logger.warning("Sanitized state resulted in empty array. Using fallback dimensions.")
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
return result
# If state is a dict, try to extract values
elif isinstance(state, dict):
# Try to extract meaningful values from dict
values = []
for key in sorted(state.keys()): # Sort for consistency
values.append(self._extract_numeric_value(state[key]))
return np.array(values, dtype=np.float32)
# If state is a single value, make it an array
else:
return np.array([self._extract_numeric_value(state)], dtype=np.float32)
except Exception as e:
logger.warning(f"Error sanitizing state data: {e}. Using zero array with expected dimensions.")
# Return a zero array as fallback with the expected state dimension
# Use the state_dim from initialization, fallback to 403 if not available
expected_size = getattr(self, 'state_size', getattr(self, 'state_dim', 403))
if isinstance(expected_size, tuple):
expected_size = np.prod(expected_size)
return np.zeros(int(expected_size), dtype=np.float32)
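    # Example (illustrative sketch): heterogeneous inputs are coerced to a float32 array;
    # non-numeric entries fall back to 0.0 and dict entries are reduced to a single
    # representative value (common keys such as 'price' or 'close' are tried first).
    #
    #     agent._sanitize_state_data([1, "2.5", None, {"price": 101.3}])
    #     # -> array([  1. ,   2.5,   0. , 101.3], dtype=float32)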
def _extract_numeric_value(self, value):
"""Extract a numeric value from various data types"""
try:
# Handle None values
if value is None:
return 0.0
# Handle numeric types
if isinstance(value, (int, float, np.number)):
return float(value)
# Handle dict values
elif isinstance(value, dict):
# Try common keys for numeric data
for key in ['value', 'price', 'close', 'last', 'amount', 'quantity']:
if key in value:
return self._extract_numeric_value(value[key])
# If no common keys, try to get first numeric value
for v in value.values():
if isinstance(v, (int, float, np.number)):
return float(v)
return 0.0
# Handle string values that might be numeric
elif isinstance(value, str):
try:
return float(value)
                except (ValueError, TypeError):
return 0.0
# Handle datetime objects
elif hasattr(value, 'timestamp'):
return float(value.timestamp())
            # Handle boolean values (note: bool is a subclass of int, so this branch is
            # normally already covered by the numeric check above; kept for explicitness)
            elif isinstance(value, bool):
                return float(value)
# Handle list/tuple - take first numeric value
elif isinstance(value, (list, tuple)) and len(value) > 0:
return self._extract_numeric_value(value[0])
else:
return 0.0
        except Exception:
return 0.0
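    # Illustrative behaviour: _extract_numeric_value("3.14") -> 3.14,
    # _extract_numeric_value(None) -> 0.0, _extract_numeric_value({'close': 42.0}) -> 42.0,
    # and datetime-like objects map to their POSIX timestamp; anything unrecognised
    # falls back to 0.0.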
def _extract_numeric_from_dict(self, data_dict):
"""Recursively extract all numeric values from a dictionary"""
numeric_values = []
try:
for key, value in data_dict.items():
if isinstance(value, (int, float)):
numeric_values.append(float(value))
elif isinstance(value, (list, np.ndarray)):
try:
flattened = np.array(value).flatten()
for x in flattened:
if isinstance(x, (int, float)):
numeric_values.append(float(x))
elif hasattr(x, 'item'): # numpy scalar
numeric_values.append(float(x.item()))
except (ValueError, TypeError):
continue
elif isinstance(value, dict):
# Recursively extract from nested dicts
nested_values = self._extract_numeric_from_dict(value)
numeric_values.extend(nested_values)
elif isinstance(value, torch.Tensor):
try:
numeric_values.append(float(value.item()))
except Exception:
continue
except Exception as e:
logger.debug(f"Error extracting numeric values from dict: {e}")
return numeric_values
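    # Example (illustrative sketch): nested structures are flattened into a single list of
    # floats, skipping anything non-numeric:
    #
    #     agent._extract_numeric_from_dict({'a': 1, 'b': {'c': [2.0, 3.0]}, 'd': 'text'})
    #     # -> [1.0, 2.0, 3.0]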