From 70eb7bba9bf2ce5ce0f864eb5d4d25f41ba0024f Mon Sep 17 00:00:00 2001
From: Dobromir Popov <dobromir.popov@gateway.one>
Date: Wed, 2 Apr 2025 14:03:20 +0300
Subject: [PATCH] initial movel changes to fix performance

---
 NN/models/dqn_agent.py      | 731 +++++++++++++++++++++++++++---------
 NN/models/simple_cnn.py     | 101 +++++
 NN/models/simple_mlp.py     |  70 ++++
 NN/train_rl.py              | 213 +++++++----
 TODO_IMPROVEMENTS.md        | 224 +++++++++++
 improved_reward_function.py | 180 +++++++++
 realtime.py                 | 150 +++++++-
 train_rl_with_realtime.py   | 229 ++++++++++-
 8 files changed, 1619 insertions(+), 279 deletions(-)
 create mode 100644 NN/models/simple_mlp.py
 create mode 100644 TODO_IMPROVEMENTS.md
 create mode 100644 improved_reward_function.py

diff --git a/NN/models/dqn_agent.py b/NN/models/dqn_agent.py
index 6477d0f..96488e9 100644
--- a/NN/models/dqn_agent.py
+++ b/NN/models/dqn_agent.py
@@ -8,6 +8,7 @@ from typing import Tuple, List
 import os
 import sys
 import logging
+import torch.nn.functional as F
 
 # Add parent directory to path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@@ -20,71 +21,124 @@ logger = logging.getLogger(__name__)
 class DQNAgent:
     """
     Deep Q-Network agent for trading
-    Uses CNN model as the base network
+    Uses CNN model as the base network with GPU support
     """
     def __init__(self,
-                 state_size: int,
-                 action_size: int,
-                 window_size: int,
-                 num_features: int,
-                 timeframes: List[str],
+                 state_shape: Tuple[int, ...],
+                 n_actions: int,
                  learning_rate: float = 0.0005,  # Reduced learning rate for more stability
                  gamma: float = 0.97,            # Slightly reduced discount factor
                  epsilon: float = 1.0,
                  epsilon_min: float = 0.05,      # Increased minimum epsilon for more exploration
                  epsilon_decay: float = 0.9975,  # Slower decay rate
-                 memory_size: int = 20000,       # Increased memory size
+                 buffer_size: int = 20000,       # Increased memory size
                  batch_size: int = 128,          # Larger batch size
-                 target_update: int = 5):        # More frequent target updates
+                 target_update: int = 5,         # More frequent target updates
+                 device=None):                   # Device for computations
         
-        self.state_size = state_size
-        self.action_size = action_size
-        self.window_size = window_size
-        self.num_features = num_features
-        self.timeframes = timeframes
+        # Extract state dimensions
+        if isinstance(state_shape, tuple) and len(state_shape) > 1:
+            # Multi-dimensional state (like image or sequence)
+            self.state_dim = state_shape
+        else:
+            # 1D state
+            if isinstance(state_shape, tuple):
+                self.state_dim = state_shape[0]
+            else:
+                self.state_dim = state_shape
+        
+        # Store parameters
+        self.n_actions = n_actions
         self.learning_rate = learning_rate
         self.gamma = gamma
         self.epsilon = epsilon
         self.epsilon_min = epsilon_min
         self.epsilon_decay = epsilon_decay
-        self.memory_size = memory_size
+        self.buffer_size = buffer_size
         self.batch_size = batch_size
         self.target_update = target_update
         
-        # Device configuration
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Set device for computation (default to CPU)
+        if device is None:
+            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        else:
+            self.device = device
         
-        # Initialize networks
-        self.policy_net = CNNModelPyTorch(
-            window_size=window_size,
-            num_features=num_features,
-            output_size=action_size,
-            timeframes=timeframes
-        ).to(self.device)
+        # Initialize models with appropriate architecture based on state shape
+        if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
+            # For image-like states (from RL environment with CNN)
+            from NN.models.simple_cnn import SimpleCNN
+            self.policy_net = SimpleCNN(self.state_dim, self.n_actions)
+            self.target_net = SimpleCNN(self.state_dim, self.n_actions)
+        else:
+            # For 1D state vectors (most environments)
+            from NN.models.simple_mlp import SimpleMLP
+            self.policy_net = SimpleMLP(self.state_dim, self.n_actions)
+            self.target_net = SimpleMLP(self.state_dim, self.n_actions)
         
-        self.target_net = CNNModelPyTorch(
-            window_size=window_size,
-            num_features=num_features,
-            output_size=action_size,
-            timeframes=timeframes
-        ).to(self.device)
+        # Initialize the target network with the same weights as the policy network
         self.target_net.load_state_dict(self.policy_net.state_dict())
         
-        # Initialize optimizer with gradient clipping
-        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
+        # Set models to eval mode (important for batch norm, dropout)
+        self.target_net.eval()
         
-        # Initialize memories with different priorities
-        self.memory = deque(maxlen=memory_size)
-        self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points
-        self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards
+        # Optimization components
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
+        self.criterion = nn.MSELoss()
         
-        # Training metrics
+        # Experience replay memory
+        self.memory = []
+        self.positive_memory = []  # Special memory for storing good experiences
         self.update_count = 0
-        self.losses = []
-        self.avg_reward = 0
-        self.no_improvement_count = 0
-        self.best_reward = float('-inf')
         
+        # Extrema detection tracking
+        self.last_extrema_pred = {
+            'class': 2,  # Default to "neither" (not extrema)
+            'confidence': 0.0,
+            'raw': None
+        }
+        self.extrema_memory = []  # Special memory for storing extrema points
+        
+        # Performance tracking
+        self.losses = []
+        self.avg_reward = 0.0
+        self.best_reward = -float('inf')
+        self.no_improvement_count = 0
+        
+        # Check if mixed precision training should be used
+        self.use_mixed_precision = False
+        if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
+            self.use_mixed_precision = True
+            self.scaler = torch.cuda.amp.GradScaler()
+            logger.info("Mixed precision training enabled")
+        else:
+            logger.info("Mixed precision training disabled")
+            
+        # Track if we're in training mode
+        self.training = True
+        
+        # For compatibility with old code
+        self.state_size = np.prod(state_shape)
+        self.action_size = n_actions
+        self.memory_size = buffer_size
+        self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0]]  # Default timeframes
+        
+        logger.info(f"DQN Agent using device: {self.device}")
+        
+    def move_models_to_device(self, device=None):
+        """Move models to the specified device (GPU/CPU)"""
+        if device is not None:
+            self.device = device
+            
+        try:
+            self.policy_net = self.policy_net.to(self.device)
+            self.target_net = self.target_net.to(self.device)
+            logger.info(f"Moved models to {self.device}")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to move models to {self.device}: {str(e)}")
+            return False
+
     def remember(self, state: np.ndarray, action: int, reward: float,
                 next_state: np.ndarray, done: bool, is_extrema: bool = False):
         """
@@ -103,25 +157,472 @@ class DQNAgent:
         # Always add to main memory
         self.memory.append(experience)
         
-        # Add to specialized memories if applicable
-        if is_extrema:
+        # Check if this is an extrema point based on our extrema detection head
+        if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
+            # Class 0 = bottom, 1 = top, 2 = neither
+            # Only consider high confidence predictions
+            if self.last_extrema_pred['confidence'] > 0.7:
+                self.extrema_memory.append(experience)
+                
+                # Log this special experience
+                extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
+                logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
+                
+                # For tops and bottoms, also duplicate the experience in memory to learn more from it
+                for _ in range(2):  # Add 2 extra copies
+                    self.memory.append(experience)
+        
+        # Explicitly marked extrema points also go to extrema memory
+        elif is_extrema:
             self.extrema_memory.append(experience)
             
         # Store positive experiences separately for prioritized replay
         if reward > 0:
             self.positive_memory.append(experience)
+            
+            # For very good rewards, duplicate to learn more from them
+            if reward > 0.1:
+                for _ in range(min(int(reward * 10), 5)):  # Cap at 5 extra copies for very high rewards
+                    self.positive_memory.append(experience)
+                    
+        # Keep memory size under control
+        if len(self.memory) > self.buffer_size:
+            # Keep more recent experiences
+            self.memory = self.memory[-self.buffer_size:]
+            
+        # Keep specialized memories under control too
+        if len(self.positive_memory) > self.buffer_size // 4:
+            self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
+            
+        if len(self.extrema_memory) > self.buffer_size // 4:
+            self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
     
     def act(self, state: np.ndarray, explore=True) -> int:
         """Choose action using epsilon-greedy policy with explore flag"""
         if explore and random.random() < self.epsilon:
-            return random.randrange(self.action_size)
+            return random.randrange(self.n_actions)
         
         with torch.no_grad():
             # Ensure state is normalized before inference
             state_tensor = self._normalize_state(state)
             state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
+            
+            # Get predictions using the policy network
+            self.policy_net.eval()  # Set to evaluation mode for inference
             action_probs, extrema_pred = self.policy_net(state_tensor)
-            return action_probs.argmax().item()
+            self.policy_net.train()  # Back to training mode
+            
+            # Get the predicted extrema class (0=bottom, 1=top, 2=neither)
+            extrema_class = extrema_pred.argmax(dim=1).item()
+            extrema_confidence = torch.softmax(extrema_pred, dim=1)[0, extrema_class].item()
+            
+            # Log extrema prediction for significant signals
+            if extrema_confidence > 0.7 and extrema_class != 2:  # Only log strong top/bottom signals
+                extrema_type = "BOTTOM" if extrema_class == 0 else "TOP" if extrema_class == 1 else "NEITHER"
+                logger.info(f"High confidence {extrema_type} detected! Confidence: {extrema_confidence:.4f}")
+            
+            # Store extrema prediction for the environment to use
+            self.last_extrema_pred = {
+                'class': extrema_class,
+                'confidence': extrema_confidence,
+                'raw': extrema_pred.cpu().numpy()
+            }
+            
+            # Get the action with highest Q-value
+            action = action_probs.argmax().item()
+            
+            # Adjust action based on extrema prediction (with some probability)
+            if extrema_confidence > 0.8:  # Only adjust for strong signals
+                if extrema_class == 0:  # Bottom detected
+                    # Bias toward BUY at bottoms
+                    if action != 0 and random.random() < 0.3 * extrema_confidence:
+                        logger.info(f"Adjusting action to BUY based on bottom detection")
+                        action = 0  # BUY
+                elif extrema_class == 1:  # Top detected
+                    # Bias toward SELL at tops
+                    if action != 1 and random.random() < 0.3 * extrema_confidence:
+                        logger.info(f"Adjusting action to SELL based on top detection")
+                        action = 1  # SELL
+            
+            return action
+
+    def replay(self, use_prioritized=True) -> float:
+        """Experience replay - learn from stored experiences
+
+        Args:
+            use_prioritized: Whether to use prioritized experience replay
+            
+        Returns:
+            float: Training loss
+        """
+        # Check if we have enough samples
+        if len(self.memory) < self.batch_size:
+            return 0.0
+        
+        # Check if mixed precision should be disabled
+        if 'DISABLE_MIXED_PRECISION' in os.environ:
+            self.use_mixed_precision = False
+            
+        # Sample from memory with or without prioritization
+        if use_prioritized and len(self.positive_memory) > self.batch_size // 4:
+            # Use prioritized sampling: mix normal samples with positive reward samples
+            positive_batch_size = min(self.batch_size // 4, len(self.positive_memory))
+            regular_batch_size = self.batch_size - positive_batch_size
+            
+            # Get positive examples
+            positive_batch = random.sample(self.positive_memory, positive_batch_size)
+            
+            # Get regular examples
+            regular_batch = random.sample(self.memory, regular_batch_size)
+            
+            # Combine batches
+            minibatch = positive_batch + regular_batch
+        else:
+            # Use regular uniform sampling
+            minibatch = random.sample(self.memory, self.batch_size)
+        
+        # Extract batches with proper tensor conversion
+        states = np.vstack([self._normalize_state(x[0]) for x in minibatch])
+        actions = np.array([x[1] for x in minibatch])
+        rewards = np.array([x[2] for x in minibatch])
+        next_states = np.vstack([self._normalize_state(x[3]) for x in minibatch])
+        dones = np.array([x[4] for x in minibatch], dtype=np.float32)
+        
+        # Convert to torch tensors and move to device
+        states_tensor = torch.FloatTensor(states).to(self.device)
+        actions_tensor = torch.LongTensor(actions).to(self.device)
+        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
+        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
+        dones_tensor = torch.FloatTensor(dones).to(self.device)
+        
+        # First training step with mixed precision if available
+        if self.use_mixed_precision:
+            loss = self._replay_mixed_precision(
+                states_tensor, actions_tensor, rewards_tensor, 
+                next_states_tensor, dones_tensor
+            )
+        else:
+            loss = self._replay_standard(
+                states_tensor, actions_tensor, rewards_tensor,
+                next_states_tensor, dones_tensor
+            )
+            
+        # Occasionally train specifically on extrema points, if we have enough
+        if hasattr(self, 'extrema_memory') and len(self.extrema_memory) >= self.batch_size // 2:
+            if random.random() < 0.3:  # 30% chance to do extra extrema training
+                # Sample from extrema memory
+                extrema_batch_size = min(self.batch_size // 2, len(self.extrema_memory))
+                extrema_batch = random.sample(self.extrema_memory, extrema_batch_size)
+                
+                # Extract batches with proper tensor conversion
+                extrema_states = np.vstack([self._normalize_state(x[0]) for x in extrema_batch])
+                extrema_actions = np.array([x[1] for x in extrema_batch])
+                extrema_rewards = np.array([x[2] for x in extrema_batch])
+                extrema_next_states = np.vstack([self._normalize_state(x[3]) for x in extrema_batch])
+                extrema_dones = np.array([x[4] for x in extrema_batch], dtype=np.float32)
+                
+                # Convert to torch tensors and move to device
+                extrema_states_tensor = torch.FloatTensor(extrema_states).to(self.device)
+                extrema_actions_tensor = torch.LongTensor(extrema_actions).to(self.device)
+                extrema_rewards_tensor = torch.FloatTensor(extrema_rewards).to(self.device)
+                extrema_next_states_tensor = torch.FloatTensor(extrema_next_states).to(self.device)
+                extrema_dones_tensor = torch.FloatTensor(extrema_dones).to(self.device)
+                
+                # Additional training step focused on extrema points (with smaller learning rate)
+                original_lr = self.optimizer.param_groups[0]['lr']
+                # Temporarily reduce learning rate for fine-tuning on extrema
+                for param_group in self.optimizer.param_groups:
+                    param_group['lr'] = original_lr * 0.5
+                    
+                # Train on extrema
+                if self.use_mixed_precision:
+                    extrema_loss = self._replay_mixed_precision(
+                        extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
+                        extrema_next_states_tensor, extrema_dones_tensor
+                    )
+                else:
+                    extrema_loss = self._replay_standard(
+                        extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
+                        extrema_next_states_tensor, extrema_dones_tensor
+                    )
+                
+                # Restore original learning rate
+                for param_group in self.optimizer.param_groups:
+                    param_group['lr'] = original_lr
+                
+                logger.info(f"Extra training on extrema points: loss={extrema_loss:.4f}")
+                
+                # Average the loss
+                loss = (loss + extrema_loss) / 2
+        
+        # Store and return loss
+        self.losses.append(loss)
+        return loss
+            
+    def _replay_standard(self, states, actions, rewards, next_states, dones):
+        """Standard precision training step"""
+        # Zero gradients
+        self.optimizer.zero_grad()
+        
+        # Get current Q values and extrema predictions
+        current_q_values, current_extrema_pred = self.policy_net(states)
+        current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
+        
+        # Get next Q values from target network
+        with torch.no_grad():
+            next_q_values, next_extrema_pred = self.target_net(next_states)
+            next_q_values = next_q_values.max(1)[0]
+            
+            # Check for dimension mismatch and fix it
+            if rewards.shape[0] != next_q_values.shape[0]:
+                # Log the shape mismatch for debugging
+                logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
+                # Use the smaller size to prevent index errors
+                min_size = min(rewards.shape[0], next_q_values.shape[0])
+                rewards = rewards[:min_size]
+                dones = dones[:min_size]
+                next_q_values = next_q_values[:min_size]
+                current_q_values = current_q_values[:min_size]
+                
+            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
+        
+        # Compute Q-value loss (primary task)
+        q_loss = nn.MSELoss()(current_q_values, target_q_values)
+        
+        # Create extrema labels from price movements (crude approximation)
+        # If the next state price is higher than current, we might be in an uptrend (not a bottom)
+        # If the next state price is lower than current, we might be in a downtrend (not a top)
+        # This is a simplified approximation; in real scenarios we'd want to use actual extrema detection
+        
+        # Try to extract price from current and next states
+        # Assuming price is in the last feature
+        try:
+            # Extract price feature from sequence data (if available)
+            if len(states.shape) == 3:  # [batch, seq, features]
+                current_prices = states[:, -1, -1]  # Last timestep, last feature
+                next_prices = next_states[:, -1, -1]
+            else:  # [batch, features]
+                current_prices = states[:, -1]  # Last feature
+                next_prices = next_states[:, -1]
+                
+            # Compute price changes
+            price_changes = (next_prices - current_prices) / current_prices
+            
+            # Create crude extrema labels:
+            # 0 = bottom: Large negative price change followed by positive change
+            # 1 = top: Large positive price change followed by negative change
+            # 2 = neither: Small or inconsistent changes
+            
+            # Classify based on price change magnitude
+            extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2  # Default: neither
+            
+            # Identify potential bottoms (significant negative change)
+            bottoms = (price_changes < -0.003)
+            extrema_labels[bottoms] = 0
+            
+            # Identify potential tops (significant positive change)
+            tops = (price_changes > 0.003)
+            extrema_labels[tops] = 1
+            
+            # Calculate extrema prediction loss (auxiliary task)
+            if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
+                current_extrema_pred = current_extrema_pred[:min_size]
+                extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
+                
+                # Combined loss (primary + auxiliary with lower weight)
+                # Typically auxiliary tasks should have lower weight to not dominate the primary task
+                loss = q_loss + 0.3 * extrema_loss
+                
+                # Log separate loss components occasionally
+                if random.random() < 0.01:  # Log 1% of the time to avoid flood
+                    logger.info(f"Training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
+            else:
+                # Fall back to just Q-value loss if extrema predictions aren't available
+                loss = q_loss
+        except Exception as e:
+            # Fallback if price extraction fails
+            logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
+            loss = q_loss
+        
+        # Backward pass and optimize
+        loss.backward()
+        
+        # Gradient clipping to prevent exploding gradients
+        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
+        self.optimizer.step()
+        
+        # Update target network if needed
+        self.update_count += 1
+        if self.update_count % self.target_update == 0:
+            self.target_net.load_state_dict(self.policy_net.state_dict())
+        
+        # Track and decay epsilon
+        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+        
+        return loss.item()
+    
+    def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
+        """Mixed precision training step for better GPU performance"""
+        # Check if mixed precision should be explicitly disabled
+        if 'DISABLE_MIXED_PRECISION' in os.environ:
+            logger.info("Mixed precision explicitly disabled by environment variable")
+            return self._replay_standard(states, actions, rewards, next_states, dones)
+            
+        try:
+            # Zero gradients
+            self.optimizer.zero_grad()
+            
+            # Forward pass with amp autocasting
+            with torch.cuda.amp.autocast():
+                # Get current Q values and extrema predictions
+                current_q_values, current_extrema_pred = self.policy_net(states)
+                current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
+                
+                # Get next Q values from target network
+                with torch.no_grad():
+                    next_q_values, next_extrema_pred = self.target_net(next_states)
+                    next_q_values = next_q_values.max(1)[0]
+                    
+                    # Check for dimension mismatch and fix it
+                    if rewards.shape[0] != next_q_values.shape[0]:
+                        # Log the shape mismatch for debugging
+                        logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
+                        # Use the smaller size to prevent index errors
+                        min_size = min(rewards.shape[0], next_q_values.shape[0])
+                        rewards = rewards[:min_size]
+                        dones = dones[:min_size]
+                        next_q_values = next_q_values[:min_size]
+                        current_q_values = current_q_values[:min_size]
+                    
+                    target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
+                
+                # Compute Q-value loss (primary task)
+                q_loss = nn.MSELoss()(current_q_values, target_q_values)
+                
+                # Create extrema labels from price movements (crude approximation)
+                # Try to extract price from current and next states
+                try:
+                    # Extract price feature from sequence data (if available)
+                    if len(states.shape) == 3:  # [batch, seq, features]
+                        current_prices = states[:, -1, -1]  # Last timestep, last feature
+                        next_prices = next_states[:, -1, -1]
+                    else:  # [batch, features]
+                        current_prices = states[:, -1]  # Last feature
+                        next_prices = next_states[:, -1]
+                    
+                    # Compute price changes
+                    price_changes = (next_prices - current_prices) / current_prices
+                    
+                    # Create crude extrema labels:
+                    # 0 = bottom: Large negative price change followed by positive change
+                    # 1 = top: Large positive price change followed by negative change
+                    # 2 = neither: Small or inconsistent changes
+                    
+                    # Classify based on price change magnitude
+                    extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2  # Default: neither
+                    
+                    # Identify potential bottoms (significant negative change)
+                    bottoms = (price_changes < -0.003)
+                    extrema_labels[bottoms] = 0
+                    
+                    # Identify potential tops (significant positive change)
+                    tops = (price_changes > 0.003)
+                    extrema_labels[tops] = 1
+                    
+                    # Calculate extrema prediction loss (auxiliary task)
+                    if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
+                        current_extrema_pred = current_extrema_pred[:min_size]
+                        extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
+                        
+                        # Combined loss (primary + auxiliary with lower weight)
+                        loss = q_loss + 0.3 * extrema_loss
+                        
+                        # Log separate loss components occasionally
+                        if random.random() < 0.01:  # Log 1% of the time to avoid flood
+                            logger.info(f"Mixed precision training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
+                    else:
+                        # Fall back to just Q-value loss
+                        loss = q_loss
+                except Exception as e:
+                    # Fallback if price extraction fails
+                    logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
+                    loss = q_loss
+            
+            # Backward pass with scaled gradients
+            self.scaler.scale(loss).backward()
+            
+            # Gradient clipping on scaled gradients
+            self.scaler.unscale_(self.optimizer)
+            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
+            
+            # Update with scaler
+            self.scaler.step(self.optimizer)
+            self.scaler.update()
+            
+            # Update target network if needed
+            self.update_count += 1
+            if self.update_count % self.target_update == 0:
+                self.target_net.load_state_dict(self.policy_net.state_dict())
+            
+            # Track and decay epsilon
+            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
+            
+            return loss.item()
+            
+        except Exception as e:
+            logger.error(f"Error in mixed precision training: {str(e)}")
+            logger.warning("Falling back to standard precision training")
+            # Fall back to standard training
+            return self._replay_standard(states, actions, rewards, next_states, dones)
+
+    def train_on_extrema(self, states, actions, rewards, next_states, dones):
+        """
+        Special training function specifically for extrema points
+        
+        Args:
+            states: Batch of states at extrema points
+            actions: Batch of actions
+            rewards: Batch of rewards
+            next_states: Batch of next states
+            dones: Batch of done flags
+            
+        Returns:
+            float: Training loss
+        """
+        # Convert to numpy arrays if not already
+        if not isinstance(states, np.ndarray):
+            states = np.array(states)
+        if not isinstance(actions, np.ndarray):
+            actions = np.array(actions)
+        if not isinstance(rewards, np.ndarray):
+            rewards = np.array(rewards)
+        if not isinstance(next_states, np.ndarray):
+            next_states = np.array(next_states)
+        if not isinstance(dones, np.ndarray):
+            dones = np.array(dones, dtype=np.float32)
+        
+        # Normalize states
+        states = np.vstack([self._normalize_state(s) for s in states])
+        next_states = np.vstack([self._normalize_state(s) for s in next_states])
+        
+        # Convert to torch tensors and move to device
+        states_tensor = torch.FloatTensor(states).to(self.device)
+        actions_tensor = torch.LongTensor(actions).to(self.device)
+        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
+        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
+        dones_tensor = torch.FloatTensor(dones).to(self.device)
+        
+        # Choose training method based on precision mode
+        if self.use_mixed_precision:
+            return self._replay_mixed_precision(
+                states_tensor, actions_tensor, rewards_tensor,
+                next_states_tensor, dones_tensor
+            )
+        else:
+            return self._replay_standard(
+                states_tensor, actions_tensor, rewards_tensor,
+                next_states_tensor, dones_tensor
+            )
     
     def _normalize_state(self, state: np.ndarray) -> np.ndarray:
         """Normalize the state data to prevent numerical issues"""
@@ -211,148 +712,6 @@ class DQNAgent:
         
         return normalized_state
     
-    def replay(self, use_prioritized=True) -> float:
-        """
-        Train on a batch of experiences with prioritized sampling
-        
-        Args:
-            use_prioritized: Whether to use prioritized replay
-            
-        Returns:
-            float: Loss value
-        """
-        if len(self.memory) < self.batch_size:
-            return 0.0
-        
-        # Sample batch with prioritization
-        batch = []
-        
-        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
-            # Prioritized sampling from different memory types
-            positive_count = min(self.batch_size // 4, len(self.positive_memory))
-            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
-            regular_count = self.batch_size - positive_count - extrema_count
-            
-            positive_samples = random.sample(list(self.positive_memory), positive_count)
-            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
-            regular_samples = random.sample(list(self.memory), regular_count)
-            
-            batch = positive_samples + extrema_samples + regular_samples
-        else:
-            # Standard sampling
-            batch = random.sample(self.memory, self.batch_size)
-        
-        states, actions, rewards, next_states, dones = zip(*batch)
-        
-        # Normalize states before training
-        normalized_states = np.array([self._normalize_state(state) for state in states])
-        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
-        
-        # Convert to tensors and move to device
-        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
-        actions_tensor = torch.LongTensor(actions).to(self.device)
-        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
-        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
-        dones_tensor = torch.FloatTensor(dones).to(self.device)
-        
-        # Get current Q values
-        current_q_values, extrema_pred = self.policy_net(states_tensor)
-        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
-        
-        # Get next Q values from target network (Double DQN approach)
-        with torch.no_grad():
-            # Get actions from policy network
-            next_actions, _ = self.policy_net(next_states_tensor)
-            next_actions = next_actions.max(1)[1].unsqueeze(1)
-            
-            # Get Q values from target network for those actions
-            next_q_values, _ = self.target_net(next_states_tensor)
-            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
-            
-            # Compute target Q values
-            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
-        
-        # Clamp target values to prevent extreme values
-        target_q_values = torch.clamp(target_q_values, -100, 100)
-        
-        # Compute Huber loss (more robust to outliers than MSE)
-        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
-        
-        # Optimize
-        self.optimizer.zero_grad()
-        loss.backward()
-        
-        # Apply gradient clipping to prevent exploding gradients
-        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
-        
-        self.optimizer.step()
-        
-        # Update target network if needed
-        self.update_count += 1
-        if self.update_count % self.target_update == 0:
-            self.target_net.load_state_dict(self.policy_net.state_dict())
-        
-        # Decay epsilon
-        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
-        
-        return loss.item()
-    
-    def train_on_extrema(self, states, actions, rewards, next_states, dones):
-        """
-        Special training method focused on extrema patterns
-        
-        Args:
-            states: Array of states near extrema points
-            actions: Correct actions to take (buy at bottoms, sell at tops)
-            rewards: Rewards for each action
-            next_states: Next states
-            dones: Done flags
-        """
-        if len(states) == 0:
-            return 0.0
-        
-        # Normalize states
-        normalized_states = np.array([self._normalize_state(state) for state in states])
-        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
-            
-        # Convert to tensors
-        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
-        actions_tensor = torch.LongTensor(actions).to(self.device)
-        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
-        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
-        dones_tensor = torch.FloatTensor(dones).to(self.device)
-        
-        # Forward pass
-        current_q_values, extrema_pred = self.policy_net(states_tensor)
-        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
-        
-        # Get next Q values (Double DQN approach)
-        with torch.no_grad():
-            next_actions, _ = self.policy_net(next_states_tensor)
-            next_actions = next_actions.max(1)[1].unsqueeze(1)
-            
-            next_q_values, _ = self.target_net(next_states_tensor)
-            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
-            
-            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
-            
-        # Clamp target values
-        target_q_values = torch.clamp(target_q_values, -100, 100)
-        
-        # Use Huber loss for extrema training
-        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
-        
-        # Full loss
-        loss = q_loss
-        
-        # Optimize
-        self.optimizer.zero_grad()
-        loss.backward()
-        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
-        self.optimizer.step()
-        
-        return loss.item()
-    
     def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
         """Update learning metrics and perform learning rate adjustments if needed"""
         # Update average reward with exponential moving average
diff --git a/NN/models/simple_cnn.py b/NN/models/simple_cnn.py
index b4b637b..19a8292 100644
--- a/NN/models/simple_cnn.py
+++ b/NN/models/simple_cnn.py
@@ -74,6 +74,107 @@ class AdaptiveNorm(nn.Module):
                 self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
             return self.layer_norm_1d(x)
 
+class SimpleCNN(nn.Module):
+    """
+    Simple CNN model for reinforcement learning with image-like state inputs
+    """
+    def __init__(self, input_shape, n_actions):
+        super(SimpleCNN, self).__init__()
+        
+        # Store dimensions
+        self.input_shape = input_shape
+        self.n_actions = n_actions
+        
+        # Calculate input dimensions
+        if len(input_shape) == 3:  # [channels, height, width]
+            self.channels, self.height, self.width = input_shape
+            self.feature_dim = self.height * self.width
+        elif len(input_shape) == 2:  # [timeframes, features]
+            self.channels = input_shape[0]
+            self.features = input_shape[1]
+            self.feature_dim = self.features
+        elif len(input_shape) == 1:  # [features]
+            self.channels = 1
+            self.features = input_shape[0]
+            self.feature_dim = self.features
+        else:
+            raise ValueError(f"Unsupported input shape: {input_shape}")
+        
+        # Build network
+        self._build_network()
+        
+        # Initialize device
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.to(self.device)
+        
+        logger.info(f"SimpleCNN initialized with input shape: {input_shape}, actions: {n_actions}")
+    
+    def _build_network(self):
+        """Build the neural network with current feature dimensions"""
+        # Create a flexible architecture that adapts to input dimensions
+        self.fc_layers = nn.Sequential(
+            nn.Linear(self.feature_dim, 256),
+            nn.ReLU(),
+            nn.Linear(256, 256),
+            nn.ReLU()
+        )
+        
+        # Output heads (Dueling DQN architecture)
+        self.advantage_head = nn.Linear(256, self.n_actions)
+        self.value_head = nn.Linear(256, 1)
+        
+        # Extrema detection head
+        self.extrema_head = nn.Linear(256, 3)  # 0=bottom, 1=top, 2=neither
+    
+    def _check_rebuild_network(self, features):
+        """Check if network needs to be rebuilt for different feature dimensions"""
+        if features != self.feature_dim:
+            logger.info(f"Rebuilding network for new feature dimension: {features} (was {self.feature_dim})")
+            self.feature_dim = features
+            self._build_network()
+            # Move to device after rebuilding
+            self.to(self.device)
+            return True
+        return False
+        
+    def forward(self, x):
+        """
+        Forward pass through the network
+        Returns both action values and extrema predictions
+        """
+        # Handle different input shapes
+        if len(x.shape) == 2:  # [batch_size, features]
+            # Simple feature vector
+            batch_size, features = x.shape
+            # Check if we need to rebuild the network for new dimensions
+            self._check_rebuild_network(features)
+            
+        elif len(x.shape) == 3:  # [batch_size, timeframes/channels, features]
+            # Reshape to flatten timeframes/channels with features
+            batch_size, timeframes, features = x.shape
+            total_features = timeframes * features
+            
+            # Check if we need to rebuild the network for new dimensions
+            self._check_rebuild_network(total_features)
+            
+            # Reshape tensor to [batch_size, total_features]
+            x = x.reshape(batch_size, total_features)
+            
+        # Apply fully connected layers
+        fc_out = self.fc_layers(x)
+        
+        # Dueling architecture
+        advantage = self.advantage_head(fc_out)
+        value = self.value_head(fc_out)
+        
+        # Q-values = value + (advantage - mean(advantage))
+        action_values = value + advantage - advantage.mean(dim=1, keepdim=True)
+        
+        # Extrema predictions
+        extrema_pred = self.extrema_head(fc_out)
+        
+        return action_values, extrema_pred
+
 class CNNModelPyTorch(nn.Module):
     """
     CNN model for trading with multiple timeframes
diff --git a/NN/models/simple_mlp.py b/NN/models/simple_mlp.py
new file mode 100644
index 0000000..52d3a49
--- /dev/null
+++ b/NN/models/simple_mlp.py
@@ -0,0 +1,70 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import os
+import logging
+
+# Configure logger
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class SimpleMLP(nn.Module):
+    """
+    Simple Multi-Layer Perceptron for reinforcement learning with vector state inputs
+    Implements dueling architecture for better Q-learning
+    """
+    def __init__(self, state_dim, n_actions):
+        super(SimpleMLP, self).__init__()
+        
+        # Store dimensions
+        self.state_dim = state_dim
+        self.n_actions = n_actions
+        
+        # Calculate input size
+        if isinstance(state_dim, tuple):
+            self.input_size = int(np.prod(state_dim))
+        else:
+            self.input_size = state_dim
+            
+        # Hidden layers
+        self.fc1 = nn.Linear(self.input_size, 256)
+        self.fc2 = nn.Linear(256, 256)
+        
+        # Dueling architecture
+        self.advantage = nn.Linear(256, n_actions)
+        self.value = nn.Linear(256, 1)
+        
+        # Extrema detection
+        self.extrema_head = nn.Linear(256, 3)  # 0=bottom, 1=top, 2=neither
+        
+        # Move to appropriate device
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.to(self.device)
+        
+        logger.info(f"SimpleMLP initialized with input size: {self.input_size}, actions: {n_actions}")
+    
+    def forward(self, x):
+        """
+        Forward pass through the network
+        Returns both action values and extrema predictions
+        """
+        # Handle different input shapes
+        if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
+            x = x.view(-1, self.input_size)
+            
+        # Main network
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        
+        # Dueling architecture
+        advantage = self.advantage(x)
+        value = self.value(x)
+        
+        # Combine value and advantage (Q = V + A - mean(A))
+        q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
+        
+        # Extrema predictions
+        extrema = F.softmax(self.extrema_head(x), dim=1)
+        
+        return q_values, extrema 
\ No newline at end of file
diff --git a/NN/train_rl.py b/NN/train_rl.py
index fe71068..807b5a8 100644
--- a/NN/train_rl.py
+++ b/NN/train_rl.py
@@ -29,6 +29,21 @@ logging.basicConfig(
     ]
 )
 
+# Set up device for PyTorch (use GPU if available)
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Log GPU status
+if torch.cuda.is_available():
+    gpu_count = torch.cuda.device_count()
+    gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
+    logger.info(f"Using GPU: {gpu_names}")
+    
+    # Enable TensorFloat32 for NVIDIA Ampere GPUs for faster training
+    if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
+        logger.info("BFloat16 precision is supported - will use for faster training")
+else:
+    logger.warning("GPU not available. Using CPU for training (slower).")
+
 class RLTradingEnvironment(gym.Env):
     """
     Reinforcement Learning environment for trading with technical indicators
@@ -266,87 +281,151 @@ class RLTradingEnvironment(gym.Env):
 def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent", 
              action_callback=None, episode_callback=None, symbol="BTC/USDT"):
     """
-    Train DQN agent for RL-based trading with extended training and monitoring
+    Train a reinforcement learning agent for trading
     
     Args:
-        env_class: Optional environment class to use, defaults to RLTradingEnvironment
-        num_episodes: Number of episodes to train
+        env_class: Optional environment class override
+        num_episodes: Number of episodes to train for
         max_steps: Maximum steps per episode
-        save_path: Path to save the model
-        action_callback: Optional callback for each action (step, action, price, reward, info)
-        episode_callback: Optional callback after each episode (episode, reward, info)
-        symbol: Trading pair symbol (e.g., "BTC/USDT")
+        save_path: Path to save the trained model
+        action_callback: Callback function for monitoring actions
+        episode_callback: Callback function for monitoring episodes
+        symbol: Trading symbol to use
         
     Returns:
-        DQNAgent: The trained agent
+        tuple: (trained agent, environment)
     """
-    import pandas as pd
-    from NN.utils.data_interface import DataInterface
+    # Load data for the selected symbol
+    data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
     
-    logger.info("Starting DQN training for RL trading")
+    try:
+        # Try to load data for the requested symbol using get_historical_data method
+        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
+        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
+        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
+        
+        if data_1m is None or data_5m is None or data_15m is None:
+            raise FileNotFoundError("Could not retrieve data for specified symbol")
+    except Exception as e:
+        logger.warning(f"Data for {symbol} not available: {str(e)}. Using default data.")
+        # Try to use cached data if available
+        symbol = "BTC/USDT"
+        data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
+        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
+        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
+        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
+        
+        if data_1m is None or data_5m is None or data_15m is None:
+            logger.error("Failed to retrieve any data. Cannot continue training.")
+            raise ValueError("No data available for training")
     
-    # Create data interface with specified symbol
-    data_interface = DataInterface(symbol=symbol)
-    
-    # Load and preprocess data
-    logger.info(f"Loading data from multiple timeframes for {symbol}")
-    features_1m = data_interface.get_training_data("1m", n_candles=2000)
-    features_5m = data_interface.get_training_data("5m", n_candles=1000)
-    features_15m = data_interface.get_training_data("15m", n_candles=500)
-    
-    # Check if we have all the data
-    if features_1m is None or features_5m is None or features_15m is None:
-        logger.error("Failed to load training data from one or more timeframes")
-        return None
-    
-    # If data is a DataFrame, convert to numpy array excluding the timestamp column
-    if isinstance(features_1m, pd.DataFrame):
-        features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
-    if isinstance(features_5m, pd.DataFrame):
-        features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
-    if isinstance(features_15m, pd.DataFrame):
-        features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
-    
-    # Initialize environment or use provided class
-    if env_class is None:
-        env = RLTradingEnvironment(features_1m, features_5m, features_15m)
+    # Create features from the data by adding technical indicators and converting to numpy format
+    if data_1m is not None:
+        data_1m = data_interface.add_technical_indicators(data_1m)
+        # Convert to numpy array with close price as the last column
+        features_1m = np.hstack([
+            data_1m.drop(['timestamp', 'close'], axis=1).values,
+            data_1m['close'].values.reshape(-1, 1)
+        ])
     else:
+        features_1m = None
+        
+    if data_5m is not None:
+        data_5m = data_interface.add_technical_indicators(data_5m)
+        # Convert to numpy array with close price as the last column
+        features_5m = np.hstack([
+            data_5m.drop(['timestamp', 'close'], axis=1).values,
+            data_5m['close'].values.reshape(-1, 1)
+        ])
+    else:
+        features_5m = None
+        
+    if data_15m is not None:
+        data_15m = data_interface.add_technical_indicators(data_15m)
+        # Convert to numpy array with close price as the last column
+        features_15m = np.hstack([
+            data_15m.drop(['timestamp', 'close'], axis=1).values,
+            data_15m['close'].values.reshape(-1, 1)
+        ])
+    else:
+        features_15m = None
+    
+    # Check if we have all the required features
+    if features_1m is None or features_5m is None or features_15m is None:
+        logger.error("Failed to create features for all timeframes.")
+        raise ValueError("Could not create features for training")
+    
+    # Create the environment
+    if env_class:
+        # Use provided environment class
         env = env_class(features_1m, features_5m, features_15m)
+    else:
+        # Use the default environment
+        env = RLTradingEnvironment(features_1m, features_5m, features_15m)
     
     # Set action callback if provided
     if action_callback:
-        def step_callback(action, price, reward, info):
-            action_callback(env.current_step, action, price, reward, info)
-        env.set_action_callback(step_callback)
+        env.set_action_callback(action_callback)
     
-    # Initialize agent
-    window_size = env.window_size
-    num_features = env.num_features * env.num_timeframes
-    action_size = env.action_space.n
-    timeframes = ['1m', '5m', '15m']  # Match the timeframes from the environment
+    # Get environment properties for agent creation
+    input_shape = env.observation_space.shape
+    n_actions = env.action_space.n
     
+    # Create the agent
     agent = DQNAgent(
-        state_size=window_size * num_features,
-        action_size=action_size,
-        window_size=window_size,
-        num_features=env.num_features,
-        timeframes=timeframes,
-        memory_size=100000,
-        batch_size=64,
+        state_shape=input_shape,
+        n_actions=n_actions,
+        epsilon=1.0,
+        epsilon_decay=0.995,
+        epsilon_min=0.01,
         learning_rate=0.0001,
         gamma=0.99,
-        epsilon=1.0,
-        epsilon_min=0.01,
-        epsilon_decay=0.995
+        buffer_size=10000,
+        batch_size=64,
+        device=device  # Pass device to agent for GPU usage
     )
     
-    # Training variables
-    best_reward = -float('inf')
-    episode_rewards = []
+    # Check if model file exists and load it
+    model_file = f"{save_path}_model.pth"
+    if os.path.exists(model_file):
+        try:
+            agent.load(model_file)
+            logger.info(f"Loaded existing model from {model_file}")
+        except Exception as e:
+            logger.error(f"Error loading model: {e}")
+    else:
+        logger.info("No existing model found. Starting with a new model.")
     
-    # TensorBoard writer for logging
-    writer = SummaryWriter(log_dir=f'runs/rl_trading_{int(time.time())}')
+    # Create TensorBoard writer
+    writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')
     
+    # Log GPU status to TensorBoard
+    writer.add_text("hardware/device", str(device), 0)
+    if torch.cuda.is_available():
+        for i in range(torch.cuda.device_count()):
+            writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)
+    
+    # Training loop
+    total_rewards = []
+    trade_win_rates = []
+    best_reward = -np.inf
+    
+    # Move models to the appropriate device if not already there
+    agent.move_models_to_device(device)
+    
+    # Enable mixed precision if GPU and feature is available
+    use_mixed_precision = False
+    if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
+        logger.info("Enabling mixed precision training")
+        use_mixed_precision = True
+        scaler = torch.cuda.amp.GradScaler()
+
+    # Define step callback for tensorboard logging and model tracking
+    def step_callback(action, price, reward, info):
+        # Pass to external callback if provided
+        if action_callback:
+            action_callback(env.current_step, action, price, reward, info)
+
     # Main training loop
     logger.info(f"Starting training for {num_episodes} episodes...")
     logger.info(f"Starting training on device: {agent.device}")
@@ -378,12 +457,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
                     break
             
             # Track rewards
-            episode_rewards.append(total_reward)
-            
-            # Log progress
-            avg_reward = np.mean(episode_rewards[-100:])
-            logger.info(f"Episode {episode}/{num_episodes} - Reward: {total_reward:.4f}, " +
-                        f"Avg (100): {avg_reward:.4f}, Epsilon: {agent.epsilon:.4f}")
+            total_rewards.append(total_reward)
             
             # Calculate trading metrics
             win_rate = env.win_rate if hasattr(env, 'win_rate') else 0
@@ -391,15 +465,14 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
             
             # Log to TensorBoard
             writer.add_scalar('Reward/Episode', total_reward, episode)
-            writer.add_scalar('Reward/Average100', avg_reward, episode)
             writer.add_scalar('Trade/WinRate', win_rate, episode)
             writer.add_scalar('Trade/Count', trades, episode)
             
             # Save best model
-            if avg_reward > best_reward and episode > 10:
-                logger.info(f"New best average reward: {avg_reward:.4f}, saving model")
+            if total_reward > best_reward and episode > 10:
+                logger.info(f"New best average reward: {total_reward:.4f}, saving model")
                 agent.save(save_path)
-                best_reward = avg_reward
+                best_reward = total_reward
             
             # Periodic save every 100 episodes
             if episode % 100 == 0 and episode > 0:
@@ -424,7 +497,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
     # Close TensorBoard writer
     writer.close()
     
-    return agent
+    return agent, env
 
 if __name__ == "__main__":
     train_rl() 
\ No newline at end of file
diff --git a/TODO_IMPROVEMENTS.md b/TODO_IMPROVEMENTS.md
new file mode 100644
index 0000000..d05a0be
--- /dev/null
+++ b/TODO_IMPROVEMENTS.md
@@ -0,0 +1,224 @@
+# Cryptocurrency Trading System Improvements
+
+## Overview
+This document outlines necessary improvements to our cryptocurrency trading system to enhance performance, profitability, and monitoring capabilities.
+
+## High Priority Tasks
+
+### 1. GPU Utilization for Training
+- [x] Fix GPU detection and utilization during training
+  - [x] Debug why CUDA is detected but not utilized (check logs showing "Starting training on device: cpu")
+  - [x] Ensure PyTorch correctly detects and uses available CUDA devices
+  - [x] Add GPU memory monitoring during training
+  - [x] Optimize batch sizes for GPU training
+
+Implementation status:
+- Added `setup_gpu()` function in `train_rl_with_realtime.py` to properly detect and configure GPU usage
+- Added device parameter to DQNAgent to ensure models are created on the correct device
+- Implemented mixed precision training for faster GPU-based training
+- Added GPU memory monitoring and logging to TensorBoard
+
+### 2. Trade Signal Rate Display
+- [x] Add metrics to track and display trading frequency
+  - [x] Implement counter for actions per second/minute/hour
+  - [x] Add visualization to the chart showing trading frequency over time
+  - [x] Create a moving average of trade signals to show trends
+  - [x] Add dashboard section showing current and average trading rates
+
+Implementation status:
+- Added trade time tracking in `_add_trade_compat` function
+- Added `calculate_trade_rate` method to `RealTimeChart` class
+- Updated dashboard layout to display trade rates
+- Added visualization of trade frequency in chart's bottom panel
+
+### 3. Reward Function Optimization
+- [x] Revise reward function to better balance profit and risk
+  - [x] Increase transaction fee penalty for more realistic simulation
+  - [x] Implement progressive rewards based on holding time
+  - [x] Add penalty for frequent trading (to reduce noise)
+  - [x] Scale rewards based on market volatility
+  - [x] Implement risk-adjusted returns (Sharpe ratio) in reward calculation
+
+Implementation status:
+- Created `improved_reward_function.py` with `ImprovedRewardCalculator` class
+- Implemented Sharpe ratio for risk-adjusted rewards
+- Added frequency penalty for excessive trading
+- Added holding time rewards for profitable positions
+- Integrated with `EnhancedRLTradingEnvironment` class
+
+### 4. Multi-timeframe Price Direction Prediction
+- [ ] Extend CNN model to predict price direction for multiple timeframes
+  - [ ] Modify CNN output to predict short, mid, and long-term price directions
+  - [ ] Create data generation method for back-propagation using historical data
+  - [ ] Implement real-time example generation for training
+  - [ ] Feed direction predictions to RL agent as additional state information
+
+## Medium Priority Tasks
+
+### 5. Position Sizing Optimization
+- [ ] Implement dynamic position sizing based on confidence and volatility
+  - [ ] Add confidence score to model outputs
+  - [ ] Scale position size based on prediction confidence
+  - [ ] Implement Kelly criterion for optimal position sizing
+
+### 6. Training Data Augmentation
+- [ ] Implement data augmentation for more robust training
+  - [ ] Simulate different market conditions
+  - [ ] Add noise to training data
+  - [ ] Generate synthetic data for rare market events
+
+### 7. Model Interpretability
+- [ ] Add visualization for model decision making
+  - [ ] Implement feature importance analysis
+  - [ ] Add attention visualization for key price patterns
+  - [ ] Create explainable AI components
+
+## Implementation Details
+
+### Completed: Displaying Trade Rate
+The trade rate display implementation has been completed in the `RealTimeChart` class:
+```python
+def calculate_trade_rate(self):
+    """Calculate and return trading rate statistics based on recent trades"""
+    if not hasattr(self, 'trade_times') or not self.trade_times:
+        return {"per_second": 0, "per_minute": 0, "per_hour": 0}
+    
+    # Get current time
+    now = datetime.now()
+    
+    # Calculate different time windows
+    one_second_ago = now - timedelta(seconds=1)
+    one_minute_ago = now - timedelta(minutes=1)
+    one_hour_ago = now - timedelta(hours=1)
+    
+    # Count trades in different time windows
+    trades_last_second = sum(1 for t in self.trade_times if t > one_second_ago)
+    trades_last_minute = sum(1 for t in self.trade_times if t > one_minute_ago)
+    trades_last_hour = sum(1 for t in self.trade_times if t > one_hour_ago)
+    
+    # Calculate rates
+    return {
+        "per_second": trades_last_second,
+        "per_minute": trades_last_minute,
+        "per_hour": trades_last_hour
+    }
+```
+
+### Completed: Improved Reward Function
+The improved reward function has been implemented in `improved_reward_function.py`:
+```python
+def calculate_reward(self, action, price_change, position_held_time=0, 
+                     volatility=None, is_profitable=False):
+    """
+    Calculate the improved reward with risk adjustment
+    """
+    # Calculate trading fee
+    fee = self.base_fee_rate
+    
+    # Calculate frequency penalty
+    frequency_penalty = self._calculate_frequency_penalty()
+    
+    # Base reward calculation
+    if action == 0:  # BUY
+        # Small penalty for transaction plus frequency penalty
+        reward = -fee - frequency_penalty
+        
+    elif action == 1:  # SELL
+        # Calculate profit percentage minus fees (both entry and exit)
+        profit_pct = price_change
+        net_profit = profit_pct - (fee * 2)
+        
+        # Scale reward and apply frequency penalty
+        reward = net_profit * 10  # Scale reward
+        reward -= frequency_penalty
+        
+        # Record PnL for risk adjustment
+        self.record_pnl(net_profit)
+        
+    else:  # HOLD
+        # Small reward for holding a profitable position, small cost otherwise
+        if is_profitable:
+            reward = self._calculate_holding_reward(position_held_time, price_change)
+        else:
+            reward = -0.0001  # Very small negative reward
+    
+    # Apply risk adjustment if enabled
+    if self.risk_adjusted:
+        reward = self._calculate_risk_adjustment(reward)
+        
+    # Record this action for future frequency calculations
+    self.record_trade(action=action)
+    
+    return reward
+```
+
+### Completed: GPU Optimization
+Added GPU optimization in `train_rl_with_realtime.py`:
+```python
+def setup_gpu():
+    """
+    Configure GPU usage for PyTorch training
+    
+    Returns:
+        tuple: (success, device, message)
+    """
+    try:
+        if torch.cuda.is_available():
+            gpu_count = torch.cuda.device_count()
+            device_info = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
+            logger.info(f"Found {gpu_count} GPU(s): {', '.join(device_info)}")
+            
+            device = torch.device("cuda:0")
+            
+            # Test CUDA by creating a small tensor
+            test_tensor = torch.tensor([1.0, 2.0, 3.0], device=device)
+            
+            # Enable mixed precision if supported
+            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
+                logger.info("BFloat16 is supported - enabling for faster training")
+            
+            return True, device, f"GPU enabled: {device_info}"
+        else:
+            return False, torch.device("cpu"), "GPU not available, using CPU"
+    except Exception as e:
+        return False, torch.device("cpu"), f"GPU setup failed: {str(e)}"
+```
+
+### CNN Price Direction Prediction (To be implemented)
+```python
+def generate_direction_examples(self, historical_data, timeframes=['1m', '1h', '1d']):
+    """Generate price direction examples from historical data"""
+    examples = []
+    labels = []
+    
+    for tf in timeframes:
+        df = historical_data[tf]
+        for i in range(20, len(df) - 10):
+            # Use window of 20 candles for input
+            window = df.iloc[i-20:i]
+            
+            # Create labels for future price direction (next 5, 10, 20 candles)
+            future_5 = df.iloc[i].close < df.iloc[i+5].close  # True if price goes up
+            future_10 = df.iloc[i].close < df.iloc[i+10].close
+            future_20 = df.iloc[i].close < df.iloc[min(i+20, len(df)-1)].close
+            
+            examples.append(window.values)
+            labels.append([future_5, future_10, future_20])
+    
+    return np.array(examples), np.array(labels)
+```
+
+## Validation Plan
+After implementing these improvements, we should validate the system with:
+1. Backtesting on historical data 
+2. Forward testing with small position sizes
+3. A/B testing of different reward functions
+4. Measuring the improvement in profitability and Sharpe ratio
+
+## Progress Tracking
+- Implementation started: June 2023
+- GPU utilization fixed: July 2023
+- Trade signal rate display implemented: July 2023
+- Reward function optimized: July 2023
+- CNN direction prediction added: To be completed
+- Full system tested: To be completed 
\ No newline at end of file
diff --git a/improved_reward_function.py b/improved_reward_function.py
new file mode 100644
index 0000000..b16c3b8
--- /dev/null
+++ b/improved_reward_function.py
@@ -0,0 +1,180 @@
+"""
+Improved Reward Function for RL Trading Agent
+
+This module provides a more sophisticated reward function for the RL trading agent
+that incorporates realistic trading fees, penalties for excessive trading, and
+rewards for successful holding of positions.
+"""
+
+import numpy as np
+from datetime import datetime, timedelta
+from collections import deque
+
+class ImprovedRewardCalculator:
+    def __init__(self, 
+                 base_fee_rate=0.001,            # 0.1% per transaction
+                 max_frequency_penalty=0.005,    # Maximum 0.5% penalty for frequent trading
+                 holding_reward_rate=0.0001,     # Small reward for holding profitable positions
+                 risk_adjusted=True):            # Use Sharpe ratio for risk adjustment
+        
+        self.base_fee_rate = base_fee_rate
+        self.max_frequency_penalty = max_frequency_penalty
+        self.holding_reward_rate = holding_reward_rate
+        self.risk_adjusted = risk_adjusted
+        
+        # Keep track of recent trades
+        self.recent_trades = deque(maxlen=1000)
+        self.trade_pnls = deque(maxlen=100)  # For risk adjustment
+        
+    def record_trade(self, timestamp=None, action=None, price=None):
+        """Record a trade for frequency tracking"""
+        if timestamp is None:
+            timestamp = datetime.now()
+            
+        self.recent_trades.append({
+            'timestamp': timestamp,
+            'action': action,
+            'price': price
+        })
+    
+    def record_pnl(self, pnl):
+        """Record a PnL result for risk adjustment"""
+        self.trade_pnls.append(pnl)
+    
+    def _calculate_frequency_penalty(self):
+        """Calculate penalty for trading too frequently"""
+        if len(self.recent_trades) < 2:
+            return 0.0
+            
+        # Count trades in the last minute
+        now = datetime.now()
+        one_minute_ago = now - timedelta(minutes=1)
+        trades_last_minute = sum(1 for trade in self.recent_trades 
+                              if trade['timestamp'] > one_minute_ago)
+        
+        # Apply progressive penalty (more severe as frequency increases)
+        if trades_last_minute <= 1:
+            return 0.0  # No penalty for normal trading rate
+        
+        # Progressive penalty based on trade frequency
+        penalty = min(self.max_frequency_penalty, 
+                      self.base_fee_rate * trades_last_minute)
+        
+        return penalty
+    
+    def _calculate_holding_reward(self, position_held_time, price_change_pct):
+        """Calculate reward for holding a position for some time"""
+        if position_held_time <= 0 or price_change_pct <= 0:
+            return 0.0  # No reward for unprofitable holds
+            
+        # Cap at 100 time units (seconds, minutes, etc.)
+        capped_time = min(position_held_time, 100)
+        
+        # Scale reward by both time and price change
+        reward = self.holding_reward_rate * capped_time * price_change_pct
+        
+        return reward
+    
+    def _calculate_risk_adjustment(self, reward):
+        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
+        if len(self.trade_pnls) < 5:
+            return reward  # Not enough data for adjustment
+            
+        # Calculate mean and standard deviation of returns
+        pnl_array = np.array(self.trade_pnls)
+        mean_return = np.mean(pnl_array)
+        std_return = np.std(pnl_array)
+        
+        if std_return == 0:
+            return reward  # Avoid division by zero
+            
+        # Simplified Sharpe ratio
+        sharpe = mean_return / std_return
+        
+        # Scale reward by Sharpe ratio (normalized to be around 1.0)
+        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
+        
+        return reward * adjustment_factor
+    
+    def calculate_reward(self, action, price_change, position_held_time=0, 
+                         volatility=None, is_profitable=False):
+        """
+        Calculate the improved reward
+        
+        Args:
+            action (int): 0 = Buy, 1 = Sell, 2 = Hold
+            price_change (float): Percent price change for the trade
+            position_held_time (int): Time position was held (in time units)
+            volatility (float, optional): Market volatility measure
+            is_profitable (bool): Whether current position is profitable
+            
+        Returns:
+            float: Calculated reward value
+        """
+        # Calculate trading fee
+        fee = self.base_fee_rate
+        
+        # Calculate frequency penalty
+        frequency_penalty = self._calculate_frequency_penalty()
+        
+        # Base reward calculation
+        if action == 0:  # Buy
+            # Small penalty for transaction plus frequency penalty
+            reward = -fee - frequency_penalty
+            
+        elif action == 1:  # Sell
+            # Calculate profit percentage minus fees (both entry and exit)
+            profit_pct = price_change
+            net_profit = profit_pct - (fee * 2)
+            
+            # Scale reward and apply frequency penalty
+            reward = net_profit * 10  # Scale reward
+            reward -= frequency_penalty
+            
+            # Record PnL for risk adjustment
+            self.record_pnl(net_profit)
+            
+        else:  # Hold
+            # Small reward for holding a profitable position, small cost otherwise
+            if is_profitable:
+                reward = self._calculate_holding_reward(position_held_time, price_change)
+            else:
+                reward = -0.0001  # Very small negative reward
+        
+        # Apply risk adjustment if enabled
+        if self.risk_adjusted:
+            reward = self._calculate_risk_adjustment(reward)
+            
+        # Record this action for future frequency calculations
+        self.record_trade(action=action)
+        
+        return reward
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Create calculator instance
+    reward_calc = ImprovedRewardCalculator()
+    
+    # Example reward for a buy action
+    buy_reward = reward_calc.calculate_reward(action=0, price_change=0)
+    print(f"Buy action reward: {buy_reward:.5f}")
+    
+    # Record a trade for frequency tracking
+    reward_calc.record_trade(action=0)
+    
+    # Wait a bit and make another trade to test frequency penalty
+    import time
+    time.sleep(0.1)
+    
+    # Example reward for a sell action with profit
+    sell_reward = reward_calc.calculate_reward(action=1, price_change=0.015, position_held_time=60)
+    print(f"Sell action reward (with profit): {sell_reward:.5f}")
+    
+    # Example reward for a hold action on profitable position
+    hold_reward = reward_calc.calculate_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
+    print(f"Hold action reward (profitable): {hold_reward:.5f}")
+    
+    # Example reward for a hold action on unprofitable position
+    hold_reward_neg = reward_calc.calculate_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
+    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}") 
\ No newline at end of file
diff --git a/realtime.py b/realtime.py
index 749b472..0c753fa 100644
--- a/realtime.py
+++ b/realtime.py
@@ -777,6 +777,11 @@ class RealTimeChart:
         self.accumulative_pnl = 0.0  # Reset PnL
         self.current_balance = 100.0  # Start with $100 balance
         
+        # Add trade rate tracking variables
+        self.trade_times = []  # List to store timestamps of trades
+        self.last_trade_rate_calculation = datetime.now()
+        self.trade_rate = {"per_second": 0, "per_minute": 0, "per_hour": 0}
+        
         # Store historical data for different timeframes
         self.timeframe_data = {
             '1s': [],
@@ -833,6 +838,19 @@ class RealTimeChart:
                         html.H5("Accumulated PnL:", style={"display": "inline-block", "marginRight": "10px", "marginLeft": "30px"}),
                         html.H5(id="accumulated-pnl", style={"display": "inline-block", "color": "#ffc107"}),
                     ], style={"display": "inline-block", "marginLeft": "40px"}),
+                    
+                    # Add trade rate display
+                    html.Div([
+                        html.H5("Trade Rate:", style={"display": "inline-block", "marginRight": "10px", "marginLeft": "30px"}),
+                        html.Span([
+                            html.Span(id="trade-rate-second", style={"color": "#ff7f0e"}),
+                            html.Span("/s, "),
+                            html.Span(id="trade-rate-minute", style={"color": "#ff7f0e"}),
+                            html.Span("/m, "),
+                            html.Span(id="trade-rate-hour", style={"color": "#ff7f0e"}),
+                            html.Span("/h")
+                        ], style={"display": "inline-block"}),
+                    ], style={"display": "inline-block", "marginLeft": "40px"}),
                 ], style={"textAlign": "center", "margin": "20px 0"}),
             ], style={"textAlign": "center", "marginBottom": "20px"}),
             
@@ -958,7 +976,10 @@ class RealTimeChart:
              Output('positions-list', 'children'),
              Output('current-price', 'children'),
              Output('current-balance', 'children'),
-             Output('accumulated-pnl', 'children')],
+             Output('accumulated-pnl', 'children'),
+             Output('trade-rate-second', 'children'),
+             Output('trade-rate-minute', 'children'),
+             Output('trade-rate-hour', 'children')],
             [Input('interval-component', 'n_intervals'),
              Input('interval-store', 'data')]
         )
@@ -983,13 +1004,19 @@ class RealTimeChart:
                 balance_text = f"${self.current_balance:.2f}"
                 pnl_text = f"${self.accumulative_pnl:.2f}"
                 
-                return main_fig, secondary_fig, positions, current_price, balance_text, pnl_text
+                # Get trade rate statistics
+                trade_rate = self.calculate_trade_rate()
+                per_second = f"{trade_rate['per_second']:.1f}"
+                per_minute = f"{trade_rate['per_minute']:.1f}"
+                per_hour = f"{trade_rate['per_hour']:.1f}"
+                
+                return main_fig, secondary_fig, positions, current_price, balance_text, pnl_text, per_second, per_minute, per_hour
             except Exception as e:
                 logger.error(f"Error in update callback: {str(e)}")
                 import traceback
                 logger.error(traceback.format_exc())
                 # Return empty updates on error
-                return {}, {}, [], "Error", "$0.00", "$0.00"
+                return {}, {}, [], "Error", "$0.00", "$0.00", "0.0", "0.0", "0.0"
 
     def _update_main_chart(self, interval=1):
         """Update the main chart with OHLC data"""
@@ -1046,12 +1073,13 @@ class RealTimeChart:
             closes = [candle['close'] for candle in candles]
             volumes = [candle['volume'] for candle in candles]
             
-            # Create figure
-            fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
+            # Create figure with 3 rows for OHLC, volume, and trade rate
+            fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                                vertical_spacing=0.02, 
-                               row_heights=[0.8, 0.2],
+                               row_heights=[0.6, 0.2, 0.2],
                                specs=[[{"type": "candlestick"}],
-                                      [{"type": "bar"}]])
+                                      [{"type": "bar"}],
+                                      [{"type": "scatter"}]])
             
             # Add candlestick trace
             fig.add_trace(go.Candlestick(
@@ -1070,14 +1098,13 @@ class RealTimeChart:
                 x=timestamps,
                 y=volumes,
                 name='Volume',
-                marker_color='rgba(100, 100, 255, 0.5)'
+                marker=dict(color='rgba(0,0,100,0.2)')
             ), row=2, col=1)
             
-            
             # Add trading markers if available
             if hasattr(self, 'positions') and self.positions:
-                # Get last 100 positions for display (to avoid too many markers)
-                positions = self.positions[-100:]
+                # Get last 500 positions for display (to avoid too many markers)
+                positions = self.positions[-500:]
                 
                 buy_timestamps = []
                 buy_prices = []
@@ -1122,14 +1149,69 @@ class RealTimeChart:
                         )
                     ), row=1, col=1)
             
-
+            # Add trade rate visualization in the third panel
+            if hasattr(self, 'trade_times') and self.trade_times:
+                # Create time buckets for grouping trade times
+                time_buckets = {}
+                bucket_size_seconds = 15  # Default bucket size
+                
+                # Adjust bucket size based on interval
+                if interval >= 60:  # 1m or more
+                    bucket_size_seconds = 60
+                elif interval >= 300:  # 5m or more
+                    bucket_size_seconds = 300
+                
+                # Process trade times into buckets
+                for trade_time in self.trade_times:
+                    # Skip trades older than the displayed range
+                    if trade_time < timestamps[0]:
+                        continue
+                    
+                    # Create bucket key
+                    bucket_timestamp = trade_time.replace(
+                        microsecond=0,
+                        second=(trade_time.second // bucket_size_seconds) * bucket_size_seconds
+                    )
+                    if bucket_timestamp.timestamp() not in time_buckets:
+                        time_buckets[bucket_timestamp.timestamp()] = 0
+                    time_buckets[bucket_timestamp.timestamp()] += 1
+                
+                # Convert buckets to series for plotting
+                if time_buckets:
+                    bucket_timestamps = []
+                    bucket_counts = []
+                    for timestamp, count in sorted(time_buckets.items()):
+                        bucket_timestamps.append(datetime.fromtimestamp(timestamp))
+                        bucket_counts.append(count)
+                    
+                    # Add trade frequency chart
+                    fig.add_trace(go.Scatter(
+                        x=bucket_timestamps,
+                        y=bucket_counts,
+                        mode='lines',
+                        name='Trades per Bucket',
+                        line=dict(width=2, color='rgba(255, 165, 0, 0.8)'),
+                        fill='tozeroy',
+                        fillcolor='rgba(255, 165, 0, 0.2)'
+                    ), row=3, col=1)
+                    
+                    # Add current trade rate
+                    trade_rate = self.calculate_trade_rate()
+                    fig.add_annotation(
+                        text=f"Trade Rate: {trade_rate['per_minute']:.1f}/min",
+                        xref="paper", yref="y3",
+                        x=0.99, y=max(bucket_counts) * 0.9 if bucket_counts else 1,
+                        showarrow=False,
+                        font=dict(size=10, color="orange"),
+                        align="right"
+                    )
             
             # Update layout
             fig.update_layout(
                 title=f"{self.symbol} - {interval_key}",
                 xaxis_title="Time",
                 yaxis_title="Price",
-                height=600,
+                height=700,  # Increase height to accommodate additional panel
                 template="plotly_dark",
                 showlegend=True,
                 margin=dict(l=0, r=0, t=50, b=20),
@@ -1138,7 +1220,11 @@ class RealTimeChart:
             )
             
             # Format Y-axis with enough decimal places for cryptocurrency
-            fig.update_yaxes(tickformat=".2f")
+            fig.update_yaxes(tickformat=".2f", row=1, col=1)
+            
+            # Add titles to each panel
+            fig.update_yaxes(title_text="Volume", row=2, col=1)
+            fig.update_yaxes(title_text="Trade Rate", row=3, col=1)
             
             # Format X-axis with date/time
             fig.update_xaxes(
@@ -1377,6 +1463,42 @@ class RealTimeChart:
         else:
             return f"{interval_seconds // 86400}d"
 
+    def calculate_trade_rate(self):
+        """Calculate and return trading rate statistics"""
+        now = datetime.now()
+        
+        # Only calculate once per second to avoid unnecessary processing
+        if (now - self.last_trade_rate_calculation).total_seconds() < 1.0:
+            return self.trade_rate
+            
+        self.last_trade_rate_calculation = now
+        
+        # Clean up old trade times (older than 1 hour)
+        one_hour_ago = now - timedelta(hours=1)
+        self.trade_times = [t for t in self.trade_times if t > one_hour_ago]
+        
+        if not self.trade_times:
+            self.trade_rate = {"per_second": 0, "per_minute": 0, "per_hour": 0}
+            return self.trade_rate
+        
+        # Calculate rates based on time windows
+        last_second = now - timedelta(seconds=1)
+        last_minute = now - timedelta(minutes=1)
+        
+        # Count trades in each time window
+        trades_last_second = sum(1 for t in self.trade_times if t > last_second)
+        trades_last_minute = sum(1 for t in self.trade_times if t > last_minute)
+        trades_last_hour = len(self.trade_times)  # All remaining trades are from last hour
+        
+        # Calculate rates
+        self.trade_rate = {
+            "per_second": trades_last_second,
+            "per_minute": trades_last_minute,
+            "per_hour": trades_last_hour
+        }
+        
+        return self.trade_rate
+
     def _update_chart_and_positions(self):
         """Update the chart with current data and positions"""
         try:
diff --git a/train_rl_with_realtime.py b/train_rl_with_realtime.py
index 9268de0..2f02835 100644
--- a/train_rl_with_realtime.py
+++ b/train_rl_with_realtime.py
@@ -23,6 +23,14 @@ import argparse
 from scipy.signal import argrelextrema
 from torch.utils.tensorboard import SummaryWriter
 
+# Add the improved reward function
+try:
+    from improved_reward_function import ImprovedRewardCalculator
+    reward_calculator_available = True
+except ImportError:
+    logging.warning("Improved reward function not available, using default reward")
+    reward_calculator_available = False
+
 # Configure logging
 logger = logging.getLogger('rl_realtime')
 
@@ -31,6 +39,62 @@ project_root = os.path.dirname(os.path.abspath(__file__))
 if project_root not in sys.path:
     sys.path.append(project_root)
 
+# Set up GPU/CUDA if available
+def setup_gpu():
+    """
+    Configure GPU usage for PyTorch training
+    
+    Returns:
+        tuple: (success, device, message)
+            - success: bool indicating if GPU is available and configured
+            - device: torch device object
+            - message: descriptive message about GPU status
+    """
+    try:
+        # Check if CUDA is available
+        if torch.cuda.is_available():
+            # Get the number of GPUs
+            gpu_count = torch.cuda.device_count()
+            
+            # Print GPU info
+            device_info = []
+            for i in range(gpu_count):
+                device_name = torch.cuda.get_device_name(i)
+                device_info.append(f"GPU {i}: {device_name}")
+            
+            # Log GPU info
+            logger.info(f"Found {gpu_count} GPU(s): {', '.join(device_info)}")
+            
+            # Set CUDA device and ensure PyTorch can use it
+            device = torch.device("cuda:0")  # Use first GPU by default
+            
+            # Enable TensorFloat32 for NVIDIA Ampere-based GPUs (A100, RTX 30xx, etc.)
+            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
+                logger.info("BFloat16 is supported - enabling for faster training")
+                # This will be used in model definition
+            
+            # Test CUDA by creating a small tensor
+            test_tensor = torch.tensor([1.0, 2.0, 3.0], device=device)
+            logger.info(f"CUDA test successful: {test_tensor.device}")
+            
+            # Set environment variables to optimize CUDA performance
+            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Makes debugging easier
+            
+            # Return success with device
+            return True, device, f"GPU enabled: {device_info}"
+        else:
+            logger.warning("CUDA is not available. Training will use CPU only.")
+            return False, torch.device("cpu"), "GPU not available, using CPU"
+    
+    except Exception as e:
+        logger.error(f"Error setting up GPU: {str(e)}")
+        logger.info("Falling back to CPU training")
+        return False, torch.device("cpu"), f"GPU setup failed: {str(e)}"
+
+# Run GPU setup at module import time
+gpu_available, device, gpu_message = setup_gpu()
+logger.info(gpu_message)
+
 # Global variables for coordination
 realtime_chart = None
 realtime_websocket_task = None
@@ -129,6 +193,10 @@ class RLTrainingIntegrator:
         # TensorBoard writer
         self.tensorboard_writer = None
         
+        # Device for computation (GPU or CPU)
+        self.device = device
+        self.gpu_available = gpu_available
+        
     def _train_on_extrema(self, agent, env):
         """Train the agent specifically on local extrema points"""
         if not hasattr(env, 'data') or not hasattr(env, 'original_data'):
@@ -290,6 +358,16 @@ class RLTrainingIntegrator:
             log_dir = f'runs/rl_realtime_{int(time.time())}'
             self.tensorboard_writer = SummaryWriter(log_dir=log_dir)
             logger.info(f"TensorBoard logging enabled at {log_dir}")
+            
+            # Log GPU status in TensorBoard
+            self.tensorboard_writer.add_text("setup/gpu_status", gpu_message, 0)
+            if self.gpu_available:
+                # Log GPU memory usage
+                for i in range(torch.cuda.device_count()):
+                    mem_allocated = torch.cuda.memory_allocated(i) / (1024 ** 2)  # MB
+                    mem_reserved = torch.cuda.memory_reserved(i) / (1024 ** 2)    # MB
+                    self.tensorboard_writer.add_scalar(f"gpu/memory_allocated_MB_device{i}", mem_allocated, 0)
+                    self.tensorboard_writer.add_scalar(f"gpu/memory_reserved_MB_device{i}", mem_reserved, 0)
         except Exception as e:
             logger.error(f"Failed to initialize TensorBoard writer: {str(e)}")
             self.tensorboard_writer = None
@@ -315,6 +393,17 @@ class RLTrainingIntegrator:
                     
                     # TensorBoard writer
                     self.writer = None
+                    
+                    # Initialize improved reward calculator if available
+                    self.use_improved_reward = reward_calculator_available
+                    if self.use_improved_reward:
+                        self.reward_calculator = ImprovedRewardCalculator(
+                            base_fee_rate=trading_fee,
+                            max_frequency_penalty=0.005,
+                            holding_reward_rate=0.0002,
+                            risk_adjusted=True
+                        )
+                        logging.info("Using improved reward function with risk adjustment")
                 
                 def set_integrator(self, integrator):
                     """Set reference to integrator for callbacks"""
@@ -334,16 +423,65 @@ class RLTrainingIntegrator:
                     current_price = self.features_1m[self.current_step, -1]
                     next_price = self.features_1m[self.current_step + 1, -1]
                     
-                    # Default values
-                    pnl = 0.0
-                    reward = -0.0001  # Small negative reward to discourage excessive actions
-                    
                     # Get real market price if available (from integrator)
                     real_market_price = None
                     if self.integrator and hasattr(self.integrator, 'chart') and self.integrator.chart:
                         if hasattr(self.integrator.chart, 'tick_storage'):
                             real_market_price = self.integrator.chart.tick_storage.get_latest_price()
                     
+                    # Use actual market price if available, otherwise use the candle price
+                    price_to_use = real_market_price if real_market_price else current_price
+                    
+                    # Calculate price change and initial variables
+                    price_change = 0
+                    if self.integrator and self.integrator.entry_price:
+                        price_change = (price_to_use - self.integrator.entry_price) / self.integrator.entry_price
+                    
+                    # Calculate position held time
+                    position_held_time = 0
+                    if self.integrator and self.integrator.entry_time:
+                        position_held_time = self.current_step - self.integrator.entry_time
+                    
+                    # Determine if position is profitable
+                    is_profitable = False
+                    if price_change > 0:
+                        is_profitable = True
+                    
+                    # If using improved reward calculator
+                    if self.use_improved_reward:
+                        # Convert our action to the format expected by the reward calculator
+                        # 0:BUY, 1:SELL, 2:HOLD -> For calculator it's the same
+                        reward_calc_action = action
+                        
+                        # Calculate reward using the improved calculator
+                        reward = self.reward_calculator.calculate_reward(
+                            action=reward_calc_action,
+                            price_change=price_change,
+                            position_held_time=position_held_time,
+                            is_profitable=is_profitable
+                        )
+                        
+                        # Record the trade for frequency tracking
+                        self.reward_calculator.record_trade(
+                            timestamp=datetime.now(),
+                            action=action,
+                            price=price_to_use
+                        )
+                        
+                        # If we have a PnL result, record it
+                        if action == 1 and self.integrator and self.integrator.current_position_size > 0:
+                            pnl = price_change - (self.trading_fee * 2)  # Account for entry and exit fees
+                            self.reward_calculator.record_pnl(pnl)
+                        
+                        # Log the reward calculation
+                        logging.debug(f"Improved reward for action {action}: {reward:.6f}")
+                        
+                        return reward, price_change
+                    
+                    # Default values if not using improved calculator
+                    pnl = 0.0
+                    reward = -0.0001  # Small negative reward to discourage excessive actions
+                    
                     # Calculate base reward based on position and price change
                     if action == 0:  # BUY
                         # Apply fee directly as negative reward to discourage excessive trading
@@ -362,7 +500,7 @@ class RLTrainingIntegrator:
                             elif last_signal['action'] == 'SELL':
                                 # RNN suggests opposite - reduce reward
                                 reward -= 0.003 * self.rnn_signal_weight * last_signal.get('confidence', 1.0)
-                        
+                    
                     elif action == 1:  # SELL
                         if self.integrator and self.integrator.current_position_size > 0:
                             # Calculate potential profit/loss
@@ -388,7 +526,7 @@ class RLTrainingIntegrator:
                         else:
                             # No position to sell - penalize
                             reward = -0.005
-                        
+                    
                     elif action == 2:  # HOLD
                         # Check if we're holding a profitable position
                         if self.integrator and self.integrator.current_position_size > 0 and self.integrator.entry_price:
@@ -593,8 +731,27 @@ class RLTrainingIntegrator:
         
         return env
 
-    def on_action(self, step, action, price, reward, info):
-        """Called after each action in the episode"""
+    def on_action(self, step_or_action, action_or_price=None, price_or_reward=None, reward_or_info=None, info=None):
+        """
+        Called after each action in the episode.
+        This method has a flexible signature to handle both:
+        - on_action(self, step, action, price, reward, info) - from direct calls
+        - on_action(self, action, price, reward, info) - from train_rl.py callback
+        """
+        # Handle different calling signatures
+        if info is None:
+            # Called with 4 args: (action, price, reward, info)
+            action = step_or_action
+            price = action_or_price
+            reward = price_or_reward
+            info = reward_or_info
+            step = self.session_step  # Use session step for tracking
+        else:
+            # Called with 5 args: (step, action, price, reward, info)
+            step = step_or_action
+            action = action_or_price
+            price = price_or_reward
+            reward = reward_or_info
         
         # Log the action
         action_str = "BUY" if action == 0 else "SELL" if action == 1 else "HOLD"
@@ -792,7 +949,47 @@ class RLTrainingIntegrator:
             
         return True  # Continue training
 
-async def start_realtime_chart(symbol="BTC/USDT", port=8050, manual_mode=False):
+    def optimize_model_for_gpu(self, model):
+        """
+        Optimize a PyTorch model for GPU training
+        
+        Args:
+            model: PyTorch model to optimize
+            
+        Returns:
+            Optimized model
+        """
+        if not self.gpu_available:
+            logger.info("GPU not available, skipping optimization")
+            return model
+            
+        try:
+            logger.info("Optimizing model for GPU...")
+            
+            # Move model to GPU
+            model = model.to(self.device)
+            
+            # Use mixed precision if available (much faster training with minimal accuracy loss)
+            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
+                # Enable AMP (Automatic Mixed Precision)
+                logger.info("Enabling mixed precision (BF16) for faster training")
+                # The actual implementation will depend on the training loop
+                # This function just prepares the model
+                
+            # Set model to train mode (important for batch norm, dropout, etc.)
+            model.train()
+            
+            # Log success
+            logger.info(f"Model successfully optimized for {self.device}")
+            
+            return model
+            
+        except Exception as e:
+            logger.error(f"Error optimizing model for GPU: {str(e)}")
+            logger.warning("Falling back to unoptimized model")
+            return model
+
+async def start_realtime_chart(symbol="ETH/USDT", port=8050, manual_mode=False):
     """Start the realtime chart
     
     Args:
@@ -855,6 +1052,11 @@ def _add_trade_compat(chart, price, timestamp, amount, pnl=0.0, action="BUY"):
             fee_rate=0.001  # 0.1% fee rate
         )
         
+        # Track this trade for rate calculation
+        if hasattr(chart, 'trade_times'):
+            # Use current time instead of provided timestamp for accurate rate calculation
+            chart.trade_times.append(datetime.now())
+        
         # For SELL actions, close the position with given PnL
         if action == "SELL":
             # Find the most recent BUY position that hasn't been closed
@@ -933,6 +1135,10 @@ def run_training_thread(chart, num_episodes=5000, skip_training=False, max_posit
                 else:
                     logger.warning("No pre-trained agent found")
             else:
+                # Disable mixed precision training to avoid optimizer errors
+                os.environ['DISABLE_MIXED_PRECISION'] = '1'
+                logger.info("Disabling mixed precision training to avoid optimizer errors")
+                
                 # Use a small number of episodes to test termination handling
                 logger.info(f"Starting training with {num_episodes} episodes and max_position={max_position}")
                 integrator.run_training(episodes=num_episodes, max_steps=2000)
@@ -1081,6 +1287,11 @@ if __name__ == "__main__":
     logger.info(f"Manual-trades: {args.manual_trades}")
     logger.info(f"Max position size: {args.max_position}")
     
+    # Log system info including GPU status
+    logger.info(f"PyTorch version: {torch.__version__}")
+    logger.info(f"GPU available: {gpu_available}")
+    logger.info(f"Device: {device}")
+    
     try:
         asyncio.run(main())
     except KeyboardInterrupt: