initial model changes to fix performance

Dobromir Popov
2025-04-02 14:03:20 +03:00
parent aec536d007
commit 70eb7bba9b
8 changed files with 1619 additions and 279 deletions


@@ -8,6 +8,7 @@ from typing import Tuple, List
import os
import sys
import logging
import torch.nn.functional as F
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -20,71 +21,124 @@ logger = logging.getLogger(__name__)
class DQNAgent:
"""
Deep Q-Network agent for trading
Uses CNN model as the base network
Uses CNN model as the base network with GPU support
"""
def __init__(self,
state_size: int,
action_size: int,
window_size: int,
num_features: int,
timeframes: List[str],
state_shape: Tuple[int, ...],
n_actions: int,
learning_rate: float = 0.0005, # Reduced learning rate for more stability
gamma: float = 0.97, # Slightly reduced discount factor
epsilon: float = 1.0,
epsilon_min: float = 0.05, # Increased minimum epsilon for more exploration
epsilon_decay: float = 0.9975, # Slower decay rate
memory_size: int = 20000, # Increased memory size
buffer_size: int = 20000, # Increased memory size
batch_size: int = 128, # Larger batch size
target_update: int = 5): # More frequent target updates
target_update: int = 5, # More frequent target updates
device=None): # Device for computations
self.state_size = state_size
self.action_size = action_size
self.window_size = window_size
self.num_features = num_features
self.timeframes = timeframes
# Extract state dimensions
if isinstance(state_shape, tuple) and len(state_shape) > 1:
# Multi-dimensional state (like image or sequence)
self.state_dim = state_shape
else:
# 1D state
if isinstance(state_shape, tuple):
self.state_dim = state_shape[0]
else:
self.state_dim = state_shape
# Store parameters
self.n_actions = n_actions
self.learning_rate = learning_rate
self.gamma = gamma
self.epsilon = epsilon
self.epsilon_min = epsilon_min
self.epsilon_decay = epsilon_decay
self.memory_size = memory_size
self.buffer_size = buffer_size
self.batch_size = batch_size
self.target_update = target_update
# Device configuration
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Set device for computation (default to CPU)
if device is None:
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
else:
self.device = device
# Initialize networks
self.policy_net = CNNModelPyTorch(
window_size=window_size,
num_features=num_features,
output_size=action_size,
timeframes=timeframes
).to(self.device)
# Initialize models with appropriate architecture based on state shape
if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
# For image-like states (from RL environment with CNN)
from NN.models.simple_cnn import SimpleCNN
self.policy_net = SimpleCNN(self.state_dim, self.n_actions)
self.target_net = SimpleCNN(self.state_dim, self.n_actions)
else:
# For 1D state vectors (most environments)
from NN.models.simple_mlp import SimpleMLP
self.policy_net = SimpleMLP(self.state_dim, self.n_actions)
self.target_net = SimpleMLP(self.state_dim, self.n_actions)
self.target_net = CNNModelPyTorch(
window_size=window_size,
num_features=num_features,
output_size=action_size,
timeframes=timeframes
).to(self.device)
# Initialize the target network with the same weights as the policy network
self.target_net.load_state_dict(self.policy_net.state_dict())
# Initialize optimizer with gradient clipping
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
# Set models to eval mode (important for batch norm, dropout)
self.target_net.eval()
# Initialize memories with different priorities
self.memory = deque(maxlen=memory_size)
self.extrema_memory = deque(maxlen=memory_size // 4) # For extrema points
self.positive_memory = deque(maxlen=memory_size // 4) # For positive rewards
# Optimization components
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
self.criterion = nn.MSELoss()
# Training metrics
# Experience replay memory
self.memory = []
self.positive_memory = [] # Special memory for storing good experiences
self.update_count = 0
self.losses = []
self.avg_reward = 0
self.no_improvement_count = 0
self.best_reward = float('-inf')
# Extrema detection tracking
self.last_extrema_pred = {
'class': 2, # Default to "neither" (not extrema)
'confidence': 0.0,
'raw': None
}
self.extrema_memory = [] # Special memory for storing extrema points
# Performance tracking
self.losses = []
self.avg_reward = 0.0
self.best_reward = -float('inf')
self.no_improvement_count = 0
# Check if mixed precision training should be used
self.use_mixed_precision = False
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
self.use_mixed_precision = True
self.scaler = torch.cuda.amp.GradScaler()
logger.info("Mixed precision training enabled")
else:
logger.info("Mixed precision training disabled")
# Track if we're in training mode
self.training = True
# For compatibility with old code
self.state_size = np.prod(state_shape)
self.action_size = n_actions
self.memory_size = buffer_size
self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0]] # Default timeframes
logger.info(f"DQN Agent using device: {self.device}")
def move_models_to_device(self, device=None):
"""Move models to the specified device (GPU/CPU)"""
if device is not None:
self.device = device
try:
self.policy_net = self.policy_net.to(self.device)
self.target_net = self.target_net.to(self.device)
logger.info(f"Moved models to {self.device}")
return True
except Exception as e:
logger.error(f"Failed to move models to {self.device}: {str(e)}")
return False
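# Illustrative usage sketch: constructing the agent with the new state_shape /
# n_actions signature and moving its networks to an explicit device. The import
# path, state shape and hyperparameters below are assumptions for the example.
#
#   import torch
#   from NN.models.dqn_agent import DQNAgent  # hypothetical module path
#
#   agent = DQNAgent(state_shape=(3, 20), n_actions=3,  # 3 timeframes x 20 features (assumed)
#                    buffer_size=10000, batch_size=64)
#   agent.move_models_to_device(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))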
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
@@ -103,25 +157,472 @@ class DQNAgent:
# Always add to main memory
self.memory.append(experience)
# Add to specialized memories if applicable
if is_extrema:
# Check if this is an extrema point based on our extrema detection head
if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
# Class 0 = bottom, 1 = top, 2 = neither
# Only consider high confidence predictions
if self.last_extrema_pred['confidence'] > 0.7:
self.extrema_memory.append(experience)
# Log this special experience
extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
# For tops and bottoms, also duplicate the experience in memory to learn more from it
for _ in range(2): # Add 2 extra copies
self.memory.append(experience)
# Explicitly marked extrema points also go to extrema memory
elif is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
# For very good rewards, duplicate to learn more from them
if reward > 0.1:
for _ in range(min(int(reward * 10), 5)): # Cap at 5 extra copies for very high rewards
self.positive_memory.append(experience)
# Keep memory size under control
if len(self.memory) > self.buffer_size:
# Keep more recent experiences
self.memory = self.memory[-self.buffer_size:]
# Keep specialized memories under control too
if len(self.positive_memory) > self.buffer_size // 4:
self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
if len(self.extrema_memory) > self.buffer_size // 4:
self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
def act(self, state: np.ndarray, explore=True) -> int:
"""Choose action using epsilon-greedy policy with explore flag"""
if explore and random.random() < self.epsilon:
return random.randrange(self.action_size)
return random.randrange(self.n_actions)
with torch.no_grad():
# Ensure state is normalized before inference
state_tensor = self._normalize_state(state)
state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
# Get predictions using the policy network
self.policy_net.eval() # Set to evaluation mode for inference
action_probs, extrema_pred = self.policy_net(state_tensor)
return action_probs.argmax().item()
self.policy_net.train() # Back to training mode
# Get the predicted extrema class (0=bottom, 1=top, 2=neither)
extrema_class = extrema_pred.argmax(dim=1).item()
extrema_confidence = torch.softmax(extrema_pred, dim=1)[0, extrema_class].item()
# Log extrema prediction for significant signals
if extrema_confidence > 0.7 and extrema_class != 2: # Only log strong top/bottom signals
extrema_type = "BOTTOM" if extrema_class == 0 else "TOP" if extrema_class == 1 else "NEITHER"
logger.info(f"High confidence {extrema_type} detected! Confidence: {extrema_confidence:.4f}")
# Store extrema prediction for the environment to use
self.last_extrema_pred = {
'class': extrema_class,
'confidence': extrema_confidence,
'raw': extrema_pred.cpu().numpy()
}
# Get the action with highest Q-value
action = action_probs.argmax().item()
# Adjust action based on extrema prediction (with some probability)
if extrema_confidence > 0.8: # Only adjust for strong signals
if extrema_class == 0: # Bottom detected
# Bias toward BUY at bottoms
if action != 0 and random.random() < 0.3 * extrema_confidence:
logger.info(f"Adjusting action to BUY based on bottom detection")
action = 0 # BUY
elif extrema_class == 1: # Top detected
# Bias toward SELL at tops
if action != 1 and random.random() < 0.3 * extrema_confidence:
logger.info(f"Adjusting action to SELL based on top detection")
action = 1 # SELL
return action
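# Illustrative usage sketch: selecting an action and reading the extrema prediction
# cached by act(). The random state stands in for a real observation; its shape and
# the agent construction mirror the assumed example above.
#
#   import numpy as np
#   agent = DQNAgent(state_shape=(3, 20), n_actions=3)
#   state = np.random.rand(3, 20).astype(np.float32)
#   action = agent.act(state, explore=True)  # 0 = BUY, 1 = SELL in the logic above
#   if agent.last_extrema_pred['class'] != 2 and agent.last_extrema_pred['confidence'] > 0.7:
#       print("strong extrema signal:", agent.last_extrema_pred)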
def replay(self, use_prioritized=True) -> float:
"""Experience replay - learn from stored experiences
Args:
use_prioritized: Whether to use prioritized experience replay
Returns:
float: Training loss
"""
# Check if we have enough samples
if len(self.memory) < self.batch_size:
return 0.0
# Check if mixed precision should be disabled
if 'DISABLE_MIXED_PRECISION' in os.environ:
self.use_mixed_precision = False
# Sample from memory with or without prioritization
if use_prioritized and len(self.positive_memory) > self.batch_size // 4:
# Use prioritized sampling: mix normal samples with positive reward samples
positive_batch_size = min(self.batch_size // 4, len(self.positive_memory))
regular_batch_size = self.batch_size - positive_batch_size
# Get positive examples
positive_batch = random.sample(self.positive_memory, positive_batch_size)
# Get regular examples
regular_batch = random.sample(self.memory, regular_batch_size)
# Combine batches
minibatch = positive_batch + regular_batch
else:
# Use regular uniform sampling
minibatch = random.sample(self.memory, self.batch_size)
# Extract batches with proper tensor conversion
states = np.vstack([self._normalize_state(x[0]) for x in minibatch])
actions = np.array([x[1] for x in minibatch])
rewards = np.array([x[2] for x in minibatch])
next_states = np.vstack([self._normalize_state(x[3]) for x in minibatch])
dones = np.array([x[4] for x in minibatch], dtype=np.float32)
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# First training step with mixed precision if available
if self.use_mixed_precision:
loss = self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
loss = self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
# Occasionally train specifically on extrema points, if we have enough
if hasattr(self, 'extrema_memory') and len(self.extrema_memory) >= self.batch_size // 2:
if random.random() < 0.3: # 30% chance to do extra extrema training
# Sample from extrema memory
extrema_batch_size = min(self.batch_size // 2, len(self.extrema_memory))
extrema_batch = random.sample(self.extrema_memory, extrema_batch_size)
# Extract batches with proper tensor conversion
extrema_states = np.vstack([self._normalize_state(x[0]) for x in extrema_batch])
extrema_actions = np.array([x[1] for x in extrema_batch])
extrema_rewards = np.array([x[2] for x in extrema_batch])
extrema_next_states = np.vstack([self._normalize_state(x[3]) for x in extrema_batch])
extrema_dones = np.array([x[4] for x in extrema_batch], dtype=np.float32)
# Convert to torch tensors and move to device
extrema_states_tensor = torch.FloatTensor(extrema_states).to(self.device)
extrema_actions_tensor = torch.LongTensor(extrema_actions).to(self.device)
extrema_rewards_tensor = torch.FloatTensor(extrema_rewards).to(self.device)
extrema_next_states_tensor = torch.FloatTensor(extrema_next_states).to(self.device)
extrema_dones_tensor = torch.FloatTensor(extrema_dones).to(self.device)
# Additional training step focused on extrema points (with smaller learning rate)
original_lr = self.optimizer.param_groups[0]['lr']
# Temporarily reduce learning rate for fine-tuning on extrema
for param_group in self.optimizer.param_groups:
param_group['lr'] = original_lr * 0.5
# Train on extrema
if self.use_mixed_precision:
extrema_loss = self._replay_mixed_precision(
extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
extrema_next_states_tensor, extrema_dones_tensor
)
else:
extrema_loss = self._replay_standard(
extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
extrema_next_states_tensor, extrema_dones_tensor
)
# Restore original learning rate
for param_group in self.optimizer.param_groups:
param_group['lr'] = original_lr
logger.info(f"Extra training on extrema points: loss={extrema_loss:.4f}")
# Average the loss
loss = (loss + extrema_loss) / 2
# Store and return loss
self.losses.append(loss)
return loss
def _replay_standard(self, states, actions, rewards, next_states, dones):
"""Standard precision training step"""
# Zero gradients
self.optimizer.zero_grad()
# Get current Q values and extrema predictions
current_q_values, current_extrema_pred = self.policy_net(states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
next_q_values, next_extrema_pred = self.target_net(next_states)
next_q_values = next_q_values.max(1)[0]
# Align batch dimensions; min_size is also needed later for the extrema labels,
# so compute it unconditionally (slicing is a no-op when the sizes already match)
min_size = min(rewards.shape[0], next_q_values.shape[0])
if rewards.shape[0] != next_q_values.shape[0]:
# Log the shape mismatch for debugging
logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values)
# Create extrema labels from price movements (crude approximation)
# If the next state price is higher than current, we might be in an uptrend (not a bottom)
# If the next state price is lower than current, we might be in a downtrend (not a top)
# This is a simplified approximation; in real scenarios we'd want to use actual extrema detection
# Try to extract price from current and next states
# Assuming price is in the last feature
try:
# Extract price feature from sequence data (if available)
if len(states.shape) == 3: # [batch, seq, features]
current_prices = states[:, -1, -1] # Last timestep, last feature
next_prices = next_states[:, -1, -1]
else: # [batch, features]
current_prices = states[:, -1] # Last feature
next_prices = next_states[:, -1]
# Compute price changes
price_changes = (next_prices - current_prices) / current_prices
# Create crude extrema labels:
# 0 = bottom: Large negative price change followed by positive change
# 1 = top: Large positive price change followed by negative change
# 2 = neither: Small or inconsistent changes
# Classify based on price change magnitude
extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2 # Default: neither
# Identify potential bottoms (significant negative change)
bottoms = (price_changes < -0.003)
extrema_labels[bottoms] = 0
# Identify potential tops (significant positive change)
tops = (price_changes > 0.003)
extrema_labels[tops] = 1
# Calculate extrema prediction loss (auxiliary task)
if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
current_extrema_pred = current_extrema_pred[:min_size]
extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
# Combined loss (primary + auxiliary with lower weight)
# Typically auxiliary tasks should have lower weight to not dominate the primary task
loss = q_loss + 0.3 * extrema_loss
# Log separate loss components occasionally
if random.random() < 0.01: # Log 1% of the time to avoid flood
logger.info(f"Training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
else:
# Fall back to just Q-value loss if extrema predictions aren't available
loss = q_loss
except Exception as e:
# Fallback if price extraction fails
logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
loss = q_loss
# Backward pass and optimize
loss.backward()
# Gradient clipping to prevent exploding gradients
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
self.optimizer.step()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
"""Mixed precision training step for better GPU performance"""
# Check if mixed precision should be explicitly disabled
if 'DISABLE_MIXED_PRECISION' in os.environ:
logger.info("Mixed precision explicitly disabled by environment variable")
return self._replay_standard(states, actions, rewards, next_states, dones)
try:
# Zero gradients
self.optimizer.zero_grad()
# Forward pass with amp autocasting
with torch.cuda.amp.autocast():
# Get current Q values and extrema predictions
current_q_values, current_extrema_pred = self.policy_net(states)
current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
# Get next Q values from target network
with torch.no_grad():
next_q_values, next_extrema_pred = self.target_net(next_states)
next_q_values = next_q_values.max(1)[0]
# Align batch dimensions; min_size is also needed later for the extrema labels,
# so compute it unconditionally (slicing is a no-op when the sizes already match)
min_size = min(rewards.shape[0], next_q_values.shape[0])
if rewards.shape[0] != next_q_values.shape[0]:
# Log the shape mismatch for debugging
logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
rewards = rewards[:min_size]
dones = dones[:min_size]
next_q_values = next_q_values[:min_size]
current_q_values = current_q_values[:min_size]
target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
# Compute Q-value loss (primary task)
q_loss = nn.MSELoss()(current_q_values, target_q_values)
# Create extrema labels from price movements (crude approximation)
# Try to extract price from current and next states
try:
# Extract price feature from sequence data (if available)
if len(states.shape) == 3: # [batch, seq, features]
current_prices = states[:, -1, -1] # Last timestep, last feature
next_prices = next_states[:, -1, -1]
else: # [batch, features]
current_prices = states[:, -1] # Last feature
next_prices = next_states[:, -1]
# Compute price changes
price_changes = (next_prices - current_prices) / current_prices
# Create crude extrema labels:
# 0 = bottom: Large negative price change followed by positive change
# 1 = top: Large positive price change followed by negative change
# 2 = neither: Small or inconsistent changes
# Classify based on price change magnitude
extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2 # Default: neither
# Identify potential bottoms (significant negative change)
bottoms = (price_changes < -0.003)
extrema_labels[bottoms] = 0
# Identify potential tops (significant positive change)
tops = (price_changes > 0.003)
extrema_labels[tops] = 1
# Calculate extrema prediction loss (auxiliary task)
if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
current_extrema_pred = current_extrema_pred[:min_size]
extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
# Combined loss (primary + auxiliary with lower weight)
loss = q_loss + 0.3 * extrema_loss
# Log separate loss components occasionally
if random.random() < 0.01: # Log 1% of the time to avoid flood
logger.info(f"Mixed precision training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
else:
# Fall back to just Q-value loss
loss = q_loss
except Exception as e:
# Fallback if price extraction fails
logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
loss = q_loss
# Backward pass with scaled gradients
self.scaler.scale(loss).backward()
# Gradient clipping on scaled gradients
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
# Update with scaler
self.scaler.step(self.optimizer)
self.scaler.update()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Track and decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
except Exception as e:
logger.error(f"Error in mixed precision training: {str(e)}")
logger.warning("Falling back to standard precision training")
# Fall back to standard training
return self._replay_standard(states, actions, rewards, next_states, dones)
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training function specifically for extrema points
Args:
states: Batch of states at extrema points
actions: Batch of actions
rewards: Batch of rewards
next_states: Batch of next states
dones: Batch of done flags
Returns:
float: Training loss
"""
# Convert to numpy arrays if not already
if not isinstance(states, np.ndarray):
states = np.array(states)
if not isinstance(actions, np.ndarray):
actions = np.array(actions)
if not isinstance(rewards, np.ndarray):
rewards = np.array(rewards)
if not isinstance(next_states, np.ndarray):
next_states = np.array(next_states)
if not isinstance(dones, np.ndarray):
dones = np.array(dones, dtype=np.float32)
# Normalize states
states = np.vstack([self._normalize_state(s) for s in states])
next_states = np.vstack([self._normalize_state(s) for s in next_states])
# Convert to torch tensors and move to device
states_tensor = torch.FloatTensor(states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Choose training method based on precision mode
if self.use_mixed_precision:
return self._replay_mixed_precision(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
else:
return self._replay_standard(
states_tensor, actions_tensor, rewards_tensor,
next_states_tensor, dones_tensor
)
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
@@ -211,148 +712,6 @@ class DQNAgent:
return normalized_state
def replay(self, use_prioritized=True) -> float:
"""
Train on a batch of experiences with prioritized sampling
Args:
use_prioritized: Whether to use prioritized replay
Returns:
float: Loss value
"""
if len(self.memory) < self.batch_size:
return 0.0
# Sample batch with prioritization
batch = []
if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
# Prioritized sampling from different memory types
positive_count = min(self.batch_size // 4, len(self.positive_memory))
extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
regular_count = self.batch_size - positive_count - extrema_count
positive_samples = random.sample(list(self.positive_memory), positive_count)
extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
regular_samples = random.sample(list(self.memory), regular_count)
batch = positive_samples + extrema_samples + regular_samples
else:
# Standard sampling
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, next_states, dones = zip(*batch)
# Normalize states before training
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors and move to device
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Get current Q values
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values from target network (Double DQN approach)
with torch.no_grad():
# Get actions from policy network
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
# Get Q values from target network for those actions
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
# Compute target Q values
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values to prevent extreme values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Compute Huber loss (more robust to outliers than MSE)
loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Optimize
self.optimizer.zero_grad()
loss.backward()
# Apply gradient clipping to prevent exploding gradients
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
# Update target network if needed
self.update_count += 1
if self.update_count % self.target_update == 0:
self.target_net.load_state_dict(self.policy_net.state_dict())
# Decay epsilon
self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
return loss.item()
def train_on_extrema(self, states, actions, rewards, next_states, dones):
"""
Special training method focused on extrema patterns
Args:
states: Array of states near extrema points
actions: Correct actions to take (buy at bottoms, sell at tops)
rewards: Rewards for each action
next_states: Next states
dones: Done flags
"""
if len(states) == 0:
return 0.0
# Normalize states
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Forward pass
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values (Double DQN approach)
with torch.no_grad():
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Use Huber loss for extrema training
q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Full loss
loss = q_loss
# Optimize
self.optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
return loss.item()
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average


@@ -74,6 +74,107 @@ class AdaptiveNorm(nn.Module):
self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
return self.layer_norm_1d(x)
class SimpleCNN(nn.Module):
"""
Simple CNN model for reinforcement learning with image-like state inputs
"""
def __init__(self, input_shape, n_actions):
super(SimpleCNN, self).__init__()
# Store dimensions
self.input_shape = input_shape
self.n_actions = n_actions
# Calculate input dimensions
if len(input_shape) == 3: # [channels, height, width]
self.channels, self.height, self.width = input_shape
self.feature_dim = self.height * self.width
elif len(input_shape) == 2: # [timeframes, features]
self.channels = input_shape[0]
self.features = input_shape[1]
self.feature_dim = self.features
elif len(input_shape) == 1: # [features]
self.channels = 1
self.features = input_shape[0]
self.feature_dim = self.features
else:
raise ValueError(f"Unsupported input shape: {input_shape}")
# Build network
self._build_network()
# Initialize device
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
logger.info(f"SimpleCNN initialized with input shape: {input_shape}, actions: {n_actions}")
def _build_network(self):
"""Build the neural network with current feature dimensions"""
# Create a flexible architecture that adapts to input dimensions
self.fc_layers = nn.Sequential(
nn.Linear(self.feature_dim, 256),
nn.ReLU(),
nn.Linear(256, 256),
nn.ReLU()
)
# Output heads (Dueling DQN architecture)
self.advantage_head = nn.Linear(256, self.n_actions)
self.value_head = nn.Linear(256, 1)
# Extrema detection head
self.extrema_head = nn.Linear(256, 3) # 0=bottom, 1=top, 2=neither
def _check_rebuild_network(self, features):
"""Check if network needs to be rebuilt for different feature dimensions"""
if features != self.feature_dim:
logger.info(f"Rebuilding network for new feature dimension: {features} (was {self.feature_dim})")
self.feature_dim = features
self._build_network()
# Move to device after rebuilding
self.to(self.device)
return True
return False
def forward(self, x):
"""
Forward pass through the network
Returns both action values and extrema predictions
"""
# Handle different input shapes
if len(x.shape) == 2: # [batch_size, features]
# Simple feature vector
batch_size, features = x.shape
# Check if we need to rebuild the network for new dimensions
self._check_rebuild_network(features)
elif len(x.shape) == 3: # [batch_size, timeframes/channels, features]
# Reshape to flatten timeframes/channels with features
batch_size, timeframes, features = x.shape
total_features = timeframes * features
# Check if we need to rebuild the network for new dimensions
self._check_rebuild_network(total_features)
# Reshape tensor to [batch_size, total_features]
x = x.reshape(batch_size, total_features)
# Apply fully connected layers
fc_out = self.fc_layers(x)
# Dueling architecture
advantage = self.advantage_head(fc_out)
value = self.value_head(fc_out)
# Q-values = value + (advantage - mean(advantage))
action_values = value + advantage - advantage.mean(dim=1, keepdim=True)
# Extrema predictions
extrema_pred = self.extrema_head(fc_out)
return action_values, extrema_pred
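# Illustrative sketch of the dueling combination computed above, Q = V + (A - mean(A)).
# The numbers are invented; the point is that centering the advantages leaves the
# action ranking unchanged while the value head sets the overall level.
#
#   import torch
#   value = torch.tensor([[0.5]])                  # V(s), shape [batch, 1]
#   advantage = torch.tensor([[1.0, -1.0, 0.0]])   # A(s, a), shape [batch, n_actions]
#   q = value + advantage - advantage.mean(dim=1, keepdim=True)
#   # q -> tensor([[1.5, -0.5, 0.5]])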
class CNNModelPyTorch(nn.Module):
"""
CNN model for trading with multiple timeframes

NN/models/simple_mlp.py (new file, 70 added lines)

@@ -0,0 +1,70 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import os
import logging
# Configure logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class SimpleMLP(nn.Module):
"""
Simple Multi-Layer Perceptron for reinforcement learning with vector state inputs
Implements dueling architecture for better Q-learning
"""
def __init__(self, state_dim, n_actions):
super(SimpleMLP, self).__init__()
# Store dimensions
self.state_dim = state_dim
self.n_actions = n_actions
# Calculate input size
if isinstance(state_dim, tuple):
self.input_size = int(np.prod(state_dim))
else:
self.input_size = state_dim
# Hidden layers
self.fc1 = nn.Linear(self.input_size, 256)
self.fc2 = nn.Linear(256, 256)
# Dueling architecture
self.advantage = nn.Linear(256, n_actions)
self.value = nn.Linear(256, 1)
# Extrema detection
self.extrema_head = nn.Linear(256, 3) # 0=bottom, 1=top, 2=neither
# Move to appropriate device
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
logger.info(f"SimpleMLP initialized with input size: {self.input_size}, actions: {n_actions}")
def forward(self, x):
"""
Forward pass through the network
Returns both action values and extrema predictions
"""
# Handle different input shapes
if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
x = x.view(-1, self.input_size)
# Main network
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
# Dueling architecture
advantage = self.advantage(x)
value = self.value(x)
# Combine value and advantage (Q = V + A - mean(A))
q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
# Extrema predictions
extrema = F.softmax(self.extrema_head(x), dim=1)
return q_values, extrema
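# Illustrative usage sketch: the dueling MLP maps a flat state vector to Q-values
# plus a softmaxed extrema classification. The state size and batch size below are
# assumptions for the example.
#
#   import torch
#   net = SimpleMLP(state_dim=(60,), n_actions=3)
#   x = torch.randn(8, 60, device=net.device)  # batch of 8 flat states
#   q_values, extrema = net(x)                 # both have shape [8, 3]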


@@ -29,6 +29,21 @@ logging.basicConfig(
]
)
# Set up device for PyTorch (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Log GPU status
if torch.cuda.is_available():
gpu_count = torch.cuda.device_count()
gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
logger.info(f"Using GPU: {gpu_names}")
# Check for BFloat16 support (available on NVIDIA Ampere and newer GPUs) for faster training
if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
logger.info("BFloat16 precision is supported - will use for faster training")
else:
logger.warning("GPU not available. Using CPU for training (slower).")
class RLTradingEnvironment(gym.Env):
"""
Reinforcement Learning environment for trading with technical indicators
@@ -266,87 +281,151 @@ class RLTradingEnvironment(gym.Env):
def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent",
action_callback=None, episode_callback=None, symbol="BTC/USDT"):
"""
Train DQN agent for RL-based trading with extended training and monitoring
Train a reinforcement learning agent for trading
Args:
env_class: Optional environment class to use, defaults to RLTradingEnvironment
num_episodes: Number of episodes to train
env_class: Optional environment class override
num_episodes: Number of episodes to train for
max_steps: Maximum steps per episode
save_path: Path to save the model
action_callback: Optional callback for each action (step, action, price, reward, info)
episode_callback: Optional callback after each episode (episode, reward, info)
symbol: Trading pair symbol (e.g., "BTC/USDT")
save_path: Path to save the trained model
action_callback: Callback function for monitoring actions
episode_callback: Callback function for monitoring episodes
symbol: Trading symbol to use
Returns:
DQNAgent: The trained agent
tuple: (trained agent, environment)
"""
import pandas as pd
from NN.utils.data_interface import DataInterface
# Load data for the selected symbol
data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
logger.info("Starting DQN training for RL trading")
try:
# Try to load data for the requested symbol using get_historical_data method
data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
if data_1m is None or data_5m is None or data_15m is None:
raise FileNotFoundError("Could not retrieve data for specified symbol")
except Exception as e:
logger.warning(f"Data for {symbol} not available: {str(e)}. Using default data.")
# Try to use cached data if available
symbol = "BTC/USDT"
data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
if data_1m is None or data_5m is None or data_15m is None:
logger.error("Failed to retrieve any data. Cannot continue training.")
raise ValueError("No data available for training")
# Create data interface with specified symbol
data_interface = DataInterface(symbol=symbol)
# Load and preprocess data
logger.info(f"Loading data from multiple timeframes for {symbol}")
features_1m = data_interface.get_training_data("1m", n_candles=2000)
features_5m = data_interface.get_training_data("5m", n_candles=1000)
features_15m = data_interface.get_training_data("15m", n_candles=500)
# Check if we have all the data
if features_1m is None or features_5m is None or features_15m is None:
logger.error("Failed to load training data from one or more timeframes")
return None
# If data is a DataFrame, convert to numpy array excluding the timestamp column
if isinstance(features_1m, pd.DataFrame):
features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
if isinstance(features_5m, pd.DataFrame):
features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
if isinstance(features_15m, pd.DataFrame):
features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
# Initialize environment or use provided class
if env_class is None:
env = RLTradingEnvironment(features_1m, features_5m, features_15m)
# Create features from the data by adding technical indicators and converting to numpy format
if data_1m is not None:
data_1m = data_interface.add_technical_indicators(data_1m)
# Convert to numpy array with close price as the last column
features_1m = np.hstack([
data_1m.drop(['timestamp', 'close'], axis=1).values,
data_1m['close'].values.reshape(-1, 1)
])
else:
features_1m = None
if data_5m is not None:
data_5m = data_interface.add_technical_indicators(data_5m)
# Convert to numpy array with close price as the last column
features_5m = np.hstack([
data_5m.drop(['timestamp', 'close'], axis=1).values,
data_5m['close'].values.reshape(-1, 1)
])
else:
features_5m = None
if data_15m is not None:
data_15m = data_interface.add_technical_indicators(data_15m)
# Convert to numpy array with close price as the last column
features_15m = np.hstack([
data_15m.drop(['timestamp', 'close'], axis=1).values,
data_15m['close'].values.reshape(-1, 1)
])
else:
features_15m = None
# Check if we have all the required features
if features_1m is None or features_5m is None or features_15m is None:
logger.error("Failed to create features for all timeframes.")
raise ValueError("Could not create features for training")
# Create the environment
if env_class:
# Use provided environment class
env = env_class(features_1m, features_5m, features_15m)
else:
# Use the default environment
env = RLTradingEnvironment(features_1m, features_5m, features_15m)
# Set action callback if provided
if action_callback:
def step_callback(action, price, reward, info):
action_callback(env.current_step, action, price, reward, info)
env.set_action_callback(step_callback)
env.set_action_callback(action_callback)
# Initialize agent
window_size = env.window_size
num_features = env.num_features * env.num_timeframes
action_size = env.action_space.n
timeframes = ['1m', '5m', '15m'] # Match the timeframes from the environment
# Get environment properties for agent creation
input_shape = env.observation_space.shape
n_actions = env.action_space.n
# Create the agent
agent = DQNAgent(
state_size=window_size * num_features,
action_size=action_size,
window_size=window_size,
num_features=env.num_features,
timeframes=timeframes,
memory_size=100000,
batch_size=64,
state_shape=input_shape,
n_actions=n_actions,
epsilon=1.0,
epsilon_decay=0.995,
epsilon_min=0.01,
learning_rate=0.0001,
gamma=0.99,
epsilon=1.0,
epsilon_min=0.01,
epsilon_decay=0.995
buffer_size=10000,
batch_size=64,
device=device # Pass device to agent for GPU usage
)
# Training variables
best_reward = -float('inf')
episode_rewards = []
# Check if model file exists and load it
model_file = f"{save_path}_model.pth"
if os.path.exists(model_file):
try:
agent.load(model_file)
logger.info(f"Loaded existing model from {model_file}")
except Exception as e:
logger.error(f"Error loading model: {e}")
else:
logger.info("No existing model found. Starting with a new model.")
# TensorBoard writer for logging
writer = SummaryWriter(log_dir=f'runs/rl_trading_{int(time.time())}')
# Create TensorBoard writer
writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')
# Log GPU status to TensorBoard
writer.add_text("hardware/device", str(device), 0)
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)
# Training loop
total_rewards = []
trade_win_rates = []
best_reward = -np.inf
# Move models to the appropriate device if not already there
agent.move_models_to_device(device)
# Enable mixed precision if GPU and feature is available
use_mixed_precision = False
if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
logger.info("Enabling mixed precision training")
use_mixed_precision = True
scaler = torch.cuda.amp.GradScaler()
# Define step callback for tensorboard logging and model tracking
def step_callback(action, price, reward, info):
# Pass to external callback if provided
if action_callback:
action_callback(env.current_step, action, price, reward, info)
# Main training loop
logger.info(f"Starting training for {num_episodes} episodes...")
logger.info(f"Starting training on device: {agent.device}")
@@ -378,12 +457,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
break
# Track rewards
episode_rewards.append(total_reward)
# Log progress
avg_reward = np.mean(episode_rewards[-100:])
logger.info(f"Episode {episode}/{num_episodes} - Reward: {total_reward:.4f}, " +
f"Avg (100): {avg_reward:.4f}, Epsilon: {agent.epsilon:.4f}")
total_rewards.append(total_reward)
# Calculate trading metrics
win_rate = env.win_rate if hasattr(env, 'win_rate') else 0
@@ -391,15 +465,14 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
# Log to TensorBoard
writer.add_scalar('Reward/Episode', total_reward, episode)
writer.add_scalar('Reward/Average100', avg_reward, episode)
writer.add_scalar('Trade/WinRate', win_rate, episode)
writer.add_scalar('Trade/Count', trades, episode)
# Save best model
if avg_reward > best_reward and episode > 10:
logger.info(f"New best average reward: {avg_reward:.4f}, saving model")
if total_reward > best_reward and episode > 10:
logger.info(f"New best average reward: {total_reward:.4f}, saving model")
agent.save(save_path)
best_reward = avg_reward
best_reward = total_reward
# Periodic save every 100 episodes
if episode % 100 == 0 and episode > 0:
@@ -424,7 +497,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
# Close TensorBoard writer
writer.close()
return agent
return agent, env
if __name__ == "__main__":
train_rl()