initial movel changes to fix performance

2025-04-02 14:03:20 +03:00
parent aec536d007
commit 70eb7bba9b
8 changed files with 1619 additions and 279 deletions
--- a/NN/models/dqn_agent.py
+++ b/NN/models/dqn_agent.py
@ -8,6 +8,7 @@ from typing import Tuple, List
 import os
 import sys
 import logging
 import torch.nn.functional as F
 # Add parent directory to path
 sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
@ -20,71 +21,124 @@ logger = logging.getLogger(__name__)
 class DQNAgent:
    """
    Deep Q-Network agent for trading
-    Uses CNN model as the base network
+    Uses CNN model as the base network with GPU support
    """
    def __init__(self,
-                 state_size: int,
+                 state_shape: Tuple[int, ...],
-                 action_size: int,
+                 n_actions: int,
                 window_size: int,
                 num_features: int,
                 timeframes: List[str],
                 learning_rate: float = 0.0005,  # Reduced learning rate for more stability
                 gamma: float = 0.97,            # Slightly reduced discount factor
                 epsilon: float = 1.0,
                 epsilon_min: float = 0.05,      # Increased minimum epsilon for more exploration
                 epsilon_decay: float = 0.9975,  # Slower decay rate
-                 memory_size: int = 20000,       # Increased memory size
+                 buffer_size: int = 20000,       # Increased memory size
                 batch_size: int = 128,          # Larger batch size
-                 target_update: int = 5):        # More frequent target updates
+                 target_update: int = 5,         # More frequent target updates
                 device=None):                   # Device for computations
-        self.state_size = state_size
+        # Extract state dimensions
-        self.action_size = action_size
+        if isinstance(state_shape, tuple) and len(state_shape) > 1:
-        self.window_size = window_size
+            # Multi-dimensional state (like image or sequence)
-        self.num_features = num_features
+            self.state_dim = state_shape
-        self.timeframes = timeframes
+        else:
            # 1D state
            if isinstance(state_shape, tuple):
                self.state_dim = state_shape[0]
            else:
                self.state_dim = state_shape
        # Store parameters
        self.n_actions = n_actions
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
-        self.memory_size = memory_size
+        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.target_update = target_update
-        # Device configuration
+        # Set device for computation (default to CPU)
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        if device is None:
            self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        else:
            self.device = device
-        # Initialize networks
+        # Initialize models with appropriate architecture based on state shape
-        self.policy_net = CNNModelPyTorch(
+        if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
-            window_size=window_size,
+            # For image-like states (from RL environment with CNN)
-            num_features=num_features,
+            from NN.models.simple_cnn import SimpleCNN
-            output_size=action_size,
+            self.policy_net = SimpleCNN(self.state_dim, self.n_actions)
-            timeframes=timeframes
+            self.target_net = SimpleCNN(self.state_dim, self.n_actions)
-        ).to(self.device)
+        else:
            # For 1D state vectors (most environments)
            from NN.models.simple_mlp import SimpleMLP
            self.policy_net = SimpleMLP(self.state_dim, self.n_actions)
            self.target_net = SimpleMLP(self.state_dim, self.n_actions)
-        self.target_net = CNNModelPyTorch(
+        # Initialize the target network with the same weights as the policy network
            window_size=window_size,
            num_features=num_features,
            output_size=action_size,
            timeframes=timeframes
        ).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
-        # Initialize optimizer with gradient clipping
+        # Set models to eval mode (important for batch norm, dropout)
-        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
+        self.target_net.eval()
-        # Initialize memories with different priorities
+        # Optimization components
-        self.memory = deque(maxlen=memory_size)
+        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.learning_rate)
-        self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points
+        self.criterion = nn.MSELoss()
        self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards
-        # Training metrics
+        # Experience replay memory
        self.memory = []
        self.positive_memory = []  # Special memory for storing good experiences
        self.update_count = 0
        self.losses = []
        self.avg_reward = 0
        self.no_improvement_count = 0
        self.best_reward = float('-inf')
        # Extrema detection tracking
        self.last_extrema_pred = {
            'class': 2,  # Default to "neither" (not extrema)
            'confidence': 0.0,
            'raw': None
        }
        self.extrema_memory = []  # Special memory for storing extrema points
        # Performance tracking
        self.losses = []
        self.avg_reward = 0.0
        self.best_reward = -float('inf')
        self.no_improvement_count = 0
        # Check if mixed precision training should be used
        self.use_mixed_precision = False
        if torch.cuda.is_available() and hasattr(torch.cuda, 'amp') and 'DISABLE_MIXED_PRECISION' not in os.environ:
            self.use_mixed_precision = True
            self.scaler = torch.cuda.amp.GradScaler()
            logger.info("Mixed precision training enabled")
        else:
            logger.info("Mixed precision training disabled")
        # Track if we're in training mode
        self.training = True
        # For compatibility with old code
        self.state_size = np.prod(state_shape)
        self.action_size = n_actions
        self.memory_size = buffer_size
        self.timeframes = ["1m", "5m", "15m"][:self.state_dim[0]]  # Default timeframes
        logger.info(f"DQN Agent using device: {self.device}")
    def move_models_to_device(self, device=None):
        """Move models to the specified device (GPU/CPU)"""
        if device is not None:
            self.device = device
        try:
            self.policy_net = self.policy_net.to(self.device)
            self.target_net = self.target_net.to(self.device)
            logger.info(f"Moved models to {self.device}")
            return True
        except Exception as e:
            logger.error(f"Failed to move models to {self.device}: {str(e)}")
            return False
    def remember(self, state: np.ndarray, action: int, reward: float,
                next_state: np.ndarray, done: bool, is_extrema: bool = False):
        """
@ -103,25 +157,472 @@ class DQNAgent:
        # Always add to main memory
        self.memory.append(experience)
-        # Add to specialized memories if applicable
+        # Check if this is an extrema point based on our extrema detection head
-        if is_extrema:
+        if hasattr(self, 'last_extrema_pred') and self.last_extrema_pred['class'] != 2:
            # Class 0 = bottom, 1 = top, 2 = neither
            # Only consider high confidence predictions
            if self.last_extrema_pred['confidence'] > 0.7:
                self.extrema_memory.append(experience)
                # Log this special experience
                extrema_type = "BOTTOM" if self.last_extrema_pred['class'] == 0 else "TOP"
                logger.info(f"Stored {extrema_type} experience with reward {reward:.4f}")
                # For tops and bottoms, also duplicate the experience in memory to learn more from it
                for _ in range(2):  # Add 2 extra copies
                    self.memory.append(experience)
        # Explicitly marked extrema points also go to extrema memory
        elif is_extrema:
            self.extrema_memory.append(experience)
        # Store positive experiences separately for prioritized replay
        if reward > 0:
            self.positive_memory.append(experience)
            # For very good rewards, duplicate to learn more from them
            if reward > 0.1:
                for _ in range(min(int(reward * 10), 5)):  # Cap at 5 extra copies for very high rewards
                    self.positive_memory.append(experience)
        # Keep memory size under control
        if len(self.memory) > self.buffer_size:
            # Keep more recent experiences
            self.memory = self.memory[-self.buffer_size:]
        # Keep specialized memories under control too
        if len(self.positive_memory) > self.buffer_size // 4:
            self.positive_memory = self.positive_memory[-(self.buffer_size // 4):]
        if len(self.extrema_memory) > self.buffer_size // 4:
            self.extrema_memory = self.extrema_memory[-(self.buffer_size // 4):]
    def act(self, state: np.ndarray, explore=True) -> int:
        """Choose action using epsilon-greedy policy with explore flag"""
        if explore and random.random() < self.epsilon:
-            return random.randrange(self.action_size)
+            return random.randrange(self.n_actions)
        with torch.no_grad():
            # Ensure state is normalized before inference
            state_tensor = self._normalize_state(state)
            state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
            # Get predictions using the policy network
            self.policy_net.eval()  # Set to evaluation mode for inference
            action_probs, extrema_pred = self.policy_net(state_tensor)
-            return action_probs.argmax().item()
+            self.policy_net.train()  # Back to training mode
            # Get the predicted extrema class (0=bottom, 1=top, 2=neither)
            extrema_class = extrema_pred.argmax(dim=1).item()
            extrema_confidence = torch.softmax(extrema_pred, dim=1)[0, extrema_class].item()
            # Log extrema prediction for significant signals
            if extrema_confidence > 0.7 and extrema_class != 2:  # Only log strong top/bottom signals
                extrema_type = "BOTTOM" if extrema_class == 0 else "TOP" if extrema_class == 1 else "NEITHER"
                logger.info(f"High confidence {extrema_type} detected! Confidence: {extrema_confidence:.4f}")
            # Store extrema prediction for the environment to use
            self.last_extrema_pred = {
                'class': extrema_class,
                'confidence': extrema_confidence,
                'raw': extrema_pred.cpu().numpy()
            }
            # Get the action with highest Q-value
            action = action_probs.argmax().item()
            # Adjust action based on extrema prediction (with some probability)
            if extrema_confidence > 0.8:  # Only adjust for strong signals
                if extrema_class == 0:  # Bottom detected
                    # Bias toward BUY at bottoms
                    if action != 0 and random.random() < 0.3 * extrema_confidence:
                        logger.info(f"Adjusting action to BUY based on bottom detection")
                        action = 0  # BUY
                elif extrema_class == 1:  # Top detected
                    # Bias toward SELL at tops
                    if action != 1 and random.random() < 0.3 * extrema_confidence:
                        logger.info(f"Adjusting action to SELL based on top detection")
                        action = 1  # SELL
            return action
    def replay(self, use_prioritized=True) -> float:
        """Experience replay - learn from stored experiences
        Args:
            use_prioritized: Whether to use prioritized experience replay
        Returns:
            float: Training loss
        """
        # Check if we have enough samples
        if len(self.memory) < self.batch_size:
            return 0.0
        # Check if mixed precision should be disabled
        if 'DISABLE_MIXED_PRECISION' in os.environ:
            self.use_mixed_precision = False
        # Sample from memory with or without prioritization
        if use_prioritized and len(self.positive_memory) > self.batch_size // 4:
            # Use prioritized sampling: mix normal samples with positive reward samples
            positive_batch_size = min(self.batch_size // 4, len(self.positive_memory))
            regular_batch_size = self.batch_size - positive_batch_size
            # Get positive examples
            positive_batch = random.sample(self.positive_memory, positive_batch_size)
            # Get regular examples
            regular_batch = random.sample(self.memory, regular_batch_size)
            # Combine batches
            minibatch = positive_batch + regular_batch
        else:
            # Use regular uniform sampling
            minibatch = random.sample(self.memory, self.batch_size)
        # Extract batches with proper tensor conversion
        states = np.vstack([self._normalize_state(x[0]) for x in minibatch])
        actions = np.array([x[1] for x in minibatch])
        rewards = np.array([x[2] for x in minibatch])
        next_states = np.vstack([self._normalize_state(x[3]) for x in minibatch])
        dones = np.array([x[4] for x in minibatch], dtype=np.float32)
        # Convert to torch tensors and move to device
        states_tensor = torch.FloatTensor(states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)
        # First training step with mixed precision if available
        if self.use_mixed_precision:
            loss = self._replay_mixed_precision(
                states_tensor, actions_tensor, rewards_tensor, 
                next_states_tensor, dones_tensor
            )
        else:
            loss = self._replay_standard(
                states_tensor, actions_tensor, rewards_tensor,
                next_states_tensor, dones_tensor
            )
        # Occasionally train specifically on extrema points, if we have enough
        if hasattr(self, 'extrema_memory') and len(self.extrema_memory) >= self.batch_size // 2:
            if random.random() < 0.3:  # 30% chance to do extra extrema training
                # Sample from extrema memory
                extrema_batch_size = min(self.batch_size // 2, len(self.extrema_memory))
                extrema_batch = random.sample(self.extrema_memory, extrema_batch_size)
                # Extract batches with proper tensor conversion
                extrema_states = np.vstack([self._normalize_state(x[0]) for x in extrema_batch])
                extrema_actions = np.array([x[1] for x in extrema_batch])
                extrema_rewards = np.array([x[2] for x in extrema_batch])
                extrema_next_states = np.vstack([self._normalize_state(x[3]) for x in extrema_batch])
                extrema_dones = np.array([x[4] for x in extrema_batch], dtype=np.float32)
                # Convert to torch tensors and move to device
                extrema_states_tensor = torch.FloatTensor(extrema_states).to(self.device)
                extrema_actions_tensor = torch.LongTensor(extrema_actions).to(self.device)
                extrema_rewards_tensor = torch.FloatTensor(extrema_rewards).to(self.device)
                extrema_next_states_tensor = torch.FloatTensor(extrema_next_states).to(self.device)
                extrema_dones_tensor = torch.FloatTensor(extrema_dones).to(self.device)
                # Additional training step focused on extrema points (with smaller learning rate)
                original_lr = self.optimizer.param_groups[0]['lr']
                # Temporarily reduce learning rate for fine-tuning on extrema
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = original_lr * 0.5
                # Train on extrema
                if self.use_mixed_precision:
                    extrema_loss = self._replay_mixed_precision(
                        extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
                        extrema_next_states_tensor, extrema_dones_tensor
                    )
                else:
                    extrema_loss = self._replay_standard(
                        extrema_states_tensor, extrema_actions_tensor, extrema_rewards_tensor,
                        extrema_next_states_tensor, extrema_dones_tensor
                    )
                # Restore original learning rate
                for param_group in self.optimizer.param_groups:
                    param_group['lr'] = original_lr
                logger.info(f"Extra training on extrema points: loss={extrema_loss:.4f}")
                # Average the loss
                loss = (loss + extrema_loss) / 2
        # Store and return loss
        self.losses.append(loss)
        return loss
    def _replay_standard(self, states, actions, rewards, next_states, dones):
        """Standard precision training step"""
        # Zero gradients
        self.optimizer.zero_grad()
        # Get current Q values and extrema predictions
        current_q_values, current_extrema_pred = self.policy_net(states)
        current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
        # Get next Q values from target network
        with torch.no_grad():
            next_q_values, next_extrema_pred = self.target_net(next_states)
            next_q_values = next_q_values.max(1)[0]
            # Check for dimension mismatch and fix it
            if rewards.shape[0] != next_q_values.shape[0]:
                # Log the shape mismatch for debugging
                logger.warning(f"Shape mismatch detected in standard replay: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                # Use the smaller size to prevent index errors
                min_size = min(rewards.shape[0], next_q_values.shape[0])
                rewards = rewards[:min_size]
                dones = dones[:min_size]
                next_q_values = next_q_values[:min_size]
                current_q_values = current_q_values[:min_size]
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
        # Compute Q-value loss (primary task)
        q_loss = nn.MSELoss()(current_q_values, target_q_values)
        # Create extrema labels from price movements (crude approximation)
        # If the next state price is higher than current, we might be in an uptrend (not a bottom)
        # If the next state price is lower than current, we might be in a downtrend (not a top)
        # This is a simplified approximation; in real scenarios we'd want to use actual extrema detection
        # Try to extract price from current and next states
        # Assuming price is in the last feature
        try:
            # Extract price feature from sequence data (if available)
            if len(states.shape) == 3:  # [batch, seq, features]
                current_prices = states[:, -1, -1]  # Last timestep, last feature
                next_prices = next_states[:, -1, -1]
            else:  # [batch, features]
                current_prices = states[:, -1]  # Last feature
                next_prices = next_states[:, -1]
            # Compute price changes
            price_changes = (next_prices - current_prices) / current_prices
            # Create crude extrema labels:
            # 0 = bottom: Large negative price change followed by positive change
            # 1 = top: Large positive price change followed by negative change
            # 2 = neither: Small or inconsistent changes
            # Classify based on price change magnitude
            extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2  # Default: neither
            # Identify potential bottoms (significant negative change)
            bottoms = (price_changes < -0.003)
            extrema_labels[bottoms] = 0
            # Identify potential tops (significant positive change)
            tops = (price_changes > 0.003)
            extrema_labels[tops] = 1
            # Calculate extrema prediction loss (auxiliary task)
            if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
                current_extrema_pred = current_extrema_pred[:min_size]
                extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
                # Combined loss (primary + auxiliary with lower weight)
                # Typically auxiliary tasks should have lower weight to not dominate the primary task
                loss = q_loss + 0.3 * extrema_loss
                # Log separate loss components occasionally
                if random.random() < 0.01:  # Log 1% of the time to avoid flood
                    logger.info(f"Training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
            else:
                # Fall back to just Q-value loss if extrema predictions aren't available
                loss = q_loss
        except Exception as e:
            # Fallback if price extraction fails
            logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
            loss = q_loss
        # Backward pass and optimize
        loss.backward()
        # Gradient clipping to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
        self.optimizer.step()
        # Update target network if needed
        self.update_count += 1
        if self.update_count % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        # Track and decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        return loss.item()
    def _replay_mixed_precision(self, states, actions, rewards, next_states, dones):
        """Mixed precision training step for better GPU performance"""
        # Check if mixed precision should be explicitly disabled
        if 'DISABLE_MIXED_PRECISION' in os.environ:
            logger.info("Mixed precision explicitly disabled by environment variable")
            return self._replay_standard(states, actions, rewards, next_states, dones)
        try:
            # Zero gradients
            self.optimizer.zero_grad()
            # Forward pass with amp autocasting
            with torch.cuda.amp.autocast():
                # Get current Q values and extrema predictions
                current_q_values, current_extrema_pred = self.policy_net(states)
                current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1)
                # Get next Q values from target network
                with torch.no_grad():
                    next_q_values, next_extrema_pred = self.target_net(next_states)
                    next_q_values = next_q_values.max(1)[0]
                    # Check for dimension mismatch and fix it
                    if rewards.shape[0] != next_q_values.shape[0]:
                        # Log the shape mismatch for debugging
                        logger.warning(f"Shape mismatch detected: rewards {rewards.shape}, next_q_values {next_q_values.shape}")
                        # Use the smaller size to prevent index errors
                        min_size = min(rewards.shape[0], next_q_values.shape[0])
                        rewards = rewards[:min_size]
                        dones = dones[:min_size]
                        next_q_values = next_q_values[:min_size]
                        current_q_values = current_q_values[:min_size]
                    target_q_values = rewards + (1 - dones) * self.gamma * next_q_values
                # Compute Q-value loss (primary task)
                q_loss = nn.MSELoss()(current_q_values, target_q_values)
                # Create extrema labels from price movements (crude approximation)
                # Try to extract price from current and next states
                try:
                    # Extract price feature from sequence data (if available)
                    if len(states.shape) == 3:  # [batch, seq, features]
                        current_prices = states[:, -1, -1]  # Last timestep, last feature
                        next_prices = next_states[:, -1, -1]
                    else:  # [batch, features]
                        current_prices = states[:, -1]  # Last feature
                        next_prices = next_states[:, -1]
                    # Compute price changes
                    price_changes = (next_prices - current_prices) / current_prices
                    # Create crude extrema labels:
                    # 0 = bottom: Large negative price change followed by positive change
                    # 1 = top: Large positive price change followed by negative change
                    # 2 = neither: Small or inconsistent changes
                    # Classify based on price change magnitude
                    extrema_labels = torch.ones(min_size, dtype=torch.long, device=self.device) * 2  # Default: neither
                    # Identify potential bottoms (significant negative change)
                    bottoms = (price_changes < -0.003)
                    extrema_labels[bottoms] = 0
                    # Identify potential tops (significant positive change)
                    tops = (price_changes > 0.003)
                    extrema_labels[tops] = 1
                    # Calculate extrema prediction loss (auxiliary task)
                    if len(current_extrema_pred.shape) > 1 and current_extrema_pred.shape[0] >= min_size:
                        current_extrema_pred = current_extrema_pred[:min_size]
                        extrema_loss = nn.CrossEntropyLoss()(current_extrema_pred, extrema_labels)
                        # Combined loss (primary + auxiliary with lower weight)
                        loss = q_loss + 0.3 * extrema_loss
                        # Log separate loss components occasionally
                        if random.random() < 0.01:  # Log 1% of the time to avoid flood
                            logger.info(f"Mixed precision training losses: Q-loss={q_loss.item():.4f}, Extrema-loss={extrema_loss.item():.4f}")
                    else:
                        # Fall back to just Q-value loss
                        loss = q_loss
                except Exception as e:
                    # Fallback if price extraction fails
                    logger.warning(f"Failed to calculate extrema loss: {str(e)}. Using only Q-value loss.")
                    loss = q_loss
            # Backward pass with scaled gradients
            self.scaler.scale(loss).backward()
            # Gradient clipping on scaled gradients
            self.scaler.unscale_(self.optimizer)
            torch.nn.utils.clip_grad_norm_(self.policy_net.parameters(), 1.0)
            # Update with scaler
            self.scaler.step(self.optimizer)
            self.scaler.update()
            # Update target network if needed
            self.update_count += 1
            if self.update_count % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            # Track and decay epsilon
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
            return loss.item()
        except Exception as e:
            logger.error(f"Error in mixed precision training: {str(e)}")
            logger.warning("Falling back to standard precision training")
            # Fall back to standard training
            return self._replay_standard(states, actions, rewards, next_states, dones)
    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training function specifically for extrema points
        Args:
            states: Batch of states at extrema points
            actions: Batch of actions
            rewards: Batch of rewards
            next_states: Batch of next states
            dones: Batch of done flags
        Returns:
            float: Training loss
        """
        # Convert to numpy arrays if not already
        if not isinstance(states, np.ndarray):
            states = np.array(states)
        if not isinstance(actions, np.ndarray):
            actions = np.array(actions)
        if not isinstance(rewards, np.ndarray):
            rewards = np.array(rewards)
        if not isinstance(next_states, np.ndarray):
            next_states = np.array(next_states)
        if not isinstance(dones, np.ndarray):
            dones = np.array(dones, dtype=np.float32)
        # Normalize states
        states = np.vstack([self._normalize_state(s) for s in states])
        next_states = np.vstack([self._normalize_state(s) for s in next_states])
        # Convert to torch tensors and move to device
        states_tensor = torch.FloatTensor(states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)
        # Choose training method based on precision mode
        if self.use_mixed_precision:
            return self._replay_mixed_precision(
                states_tensor, actions_tensor, rewards_tensor,
                next_states_tensor, dones_tensor
            )
        else:
            return self._replay_standard(
                states_tensor, actions_tensor, rewards_tensor,
                next_states_tensor, dones_tensor
            )
    def _normalize_state(self, state: np.ndarray) -> np.ndarray:
        """Normalize the state data to prevent numerical issues"""
@ -211,148 +712,6 @@ class DQNAgent:
        return normalized_state
    def replay(self, use_prioritized=True) -> float:
        """
        Train on a batch of experiences with prioritized sampling
        Args:
            use_prioritized: Whether to use prioritized replay
        Returns:
            float: Loss value
        """
        if len(self.memory) < self.batch_size:
            return 0.0
        # Sample batch with prioritization
        batch = []
        if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
            # Prioritized sampling from different memory types
            positive_count = min(self.batch_size // 4, len(self.positive_memory))
            extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
            regular_count = self.batch_size - positive_count - extrema_count
            positive_samples = random.sample(list(self.positive_memory), positive_count)
            extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
            regular_samples = random.sample(list(self.memory), regular_count)
            batch = positive_samples + extrema_samples + regular_samples
        else:
            # Standard sampling
            batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Normalize states before training
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
        # Convert to tensors and move to device
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)
        # Get current Q values
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
        # Get next Q values from target network (Double DQN approach)
        with torch.no_grad():
            # Get actions from policy network
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)
            # Get Q values from target network for those actions
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
            # Compute target Q values
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
        # Clamp target values to prevent extreme values
        target_q_values = torch.clamp(target_q_values, -100, 100)
        # Compute Huber loss (more robust to outliers than MSE)
        loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        # Apply gradient clipping to prevent exploding gradients
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        # Update target network if needed
        self.update_count += 1
        if self.update_count % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        # Decay epsilon
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        return loss.item()
    def train_on_extrema(self, states, actions, rewards, next_states, dones):
        """
        Special training method focused on extrema patterns
        Args:
            states: Array of states near extrema points
            actions: Correct actions to take (buy at bottoms, sell at tops)
            rewards: Rewards for each action
            next_states: Next states
            dones: Done flags
        """
        if len(states) == 0:
            return 0.0
        # Normalize states
        normalized_states = np.array([self._normalize_state(state) for state in states])
        normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
        # Convert to tensors
        states_tensor = torch.FloatTensor(normalized_states).to(self.device)
        actions_tensor = torch.LongTensor(actions).to(self.device)
        rewards_tensor = torch.FloatTensor(rewards).to(self.device)
        next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
        dones_tensor = torch.FloatTensor(dones).to(self.device)
        # Forward pass
        current_q_values, extrema_pred = self.policy_net(states_tensor)
        current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
        # Get next Q values (Double DQN approach)
        with torch.no_grad():
            next_actions, _ = self.policy_net(next_states_tensor)
            next_actions = next_actions.max(1)[1].unsqueeze(1)
            next_q_values, _ = self.target_net(next_states_tensor)
            next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
            target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
        # Clamp target values
        target_q_values = torch.clamp(target_q_values, -100, 100)
        # Use Huber loss for extrema training
        q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
        # Full loss
        loss = q_loss
        # Optimize
        self.optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
        self.optimizer.step()
        return loss.item()
    def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
        """Update learning metrics and perform learning rate adjustments if needed"""
        # Update average reward with exponential moving average
--- a/NN/models/simple_cnn.py
+++ b/NN/models/simple_cnn.py
@ -74,6 +74,107 @@ class AdaptiveNorm(nn.Module):
                self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
            return self.layer_norm_1d(x)
 class SimpleCNN(nn.Module):
    """
    Simple CNN model for reinforcement learning with image-like state inputs
    """
    def __init__(self, input_shape, n_actions):
        super(SimpleCNN, self).__init__()
        # Store dimensions
        self.input_shape = input_shape
        self.n_actions = n_actions
        # Calculate input dimensions
        if len(input_shape) == 3:  # [channels, height, width]
            self.channels, self.height, self.width = input_shape
            self.feature_dim = self.height * self.width
        elif len(input_shape) == 2:  # [timeframes, features]
            self.channels = input_shape[0]
            self.features = input_shape[1]
            self.feature_dim = self.features
        elif len(input_shape) == 1:  # [features]
            self.channels = 1
            self.features = input_shape[0]
            self.feature_dim = self.features
        else:
            raise ValueError(f"Unsupported input shape: {input_shape}")
        # Build network
        self._build_network()
        # Initialize device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        logger.info(f"SimpleCNN initialized with input shape: {input_shape}, actions: {n_actions}")
    def _build_network(self):
        """Build the neural network with current feature dimensions"""
        # Create a flexible architecture that adapts to input dimensions
        self.fc_layers = nn.Sequential(
            nn.Linear(self.feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 256),
            nn.ReLU()
        )
        # Output heads (Dueling DQN architecture)
        self.advantage_head = nn.Linear(256, self.n_actions)
        self.value_head = nn.Linear(256, 1)
        # Extrema detection head
        self.extrema_head = nn.Linear(256, 3)  # 0=bottom, 1=top, 2=neither
    def _check_rebuild_network(self, features):
        """Check if network needs to be rebuilt for different feature dimensions"""
        if features != self.feature_dim:
            logger.info(f"Rebuilding network for new feature dimension: {features} (was {self.feature_dim})")
            self.feature_dim = features
            self._build_network()
            # Move to device after rebuilding
            self.to(self.device)
            return True
        return False
    def forward(self, x):
        """
        Forward pass through the network
        Returns both action values and extrema predictions
        """
        # Handle different input shapes
        if len(x.shape) == 2:  # [batch_size, features]
            # Simple feature vector
            batch_size, features = x.shape
            # Check if we need to rebuild the network for new dimensions
            self._check_rebuild_network(features)
        elif len(x.shape) == 3:  # [batch_size, timeframes/channels, features]
            # Reshape to flatten timeframes/channels with features
            batch_size, timeframes, features = x.shape
            total_features = timeframes * features
            # Check if we need to rebuild the network for new dimensions
            self._check_rebuild_network(total_features)
            # Reshape tensor to [batch_size, total_features]
            x = x.reshape(batch_size, total_features)
        # Apply fully connected layers
        fc_out = self.fc_layers(x)
        # Dueling architecture
        advantage = self.advantage_head(fc_out)
        value = self.value_head(fc_out)
        # Q-values = value + (advantage - mean(advantage))
        action_values = value + advantage - advantage.mean(dim=1, keepdim=True)
        # Extrema predictions
        extrema_pred = self.extrema_head(fc_out)
        return action_values, extrema_pred
 class CNNModelPyTorch(nn.Module):
    """
    CNN model for trading with multiple timeframes
--- a/NN/models/simple_mlp.py
+++ b/NN/models/simple_mlp.py
@ -0,0 +1,70 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 import numpy as np
 import os
 import logging
 # Configure logger
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 class SimpleMLP(nn.Module):
    """
    Simple Multi-Layer Perceptron for reinforcement learning with vector state inputs
    Implements dueling architecture for better Q-learning
    """
    def __init__(self, state_dim, n_actions):
        super(SimpleMLP, self).__init__()
        # Store dimensions
        self.state_dim = state_dim
        self.n_actions = n_actions
        # Calculate input size
        if isinstance(state_dim, tuple):
            self.input_size = int(np.prod(state_dim))
        else:
            self.input_size = state_dim
        # Hidden layers
        self.fc1 = nn.Linear(self.input_size, 256)
        self.fc2 = nn.Linear(256, 256)
        # Dueling architecture
        self.advantage = nn.Linear(256, n_actions)
        self.value = nn.Linear(256, 1)
        # Extrema detection
        self.extrema_head = nn.Linear(256, 3)  # 0=bottom, 1=top, 2=neither
        # Move to appropriate device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        logger.info(f"SimpleMLP initialized with input size: {self.input_size}, actions: {n_actions}")
    def forward(self, x):
        """
        Forward pass through the network
        Returns both action values and extrema predictions
        """
        # Handle different input shapes
        if isinstance(self.state_dim, tuple) and len(self.state_dim) > 1:
            x = x.view(-1, self.input_size)
        # Main network
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Dueling architecture
        advantage = self.advantage(x)
        value = self.value(x)
        # Combine value and advantage (Q = V + A - mean(A))
        q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
        # Extrema predictions
        extrema = F.softmax(self.extrema_head(x), dim=1)
        return q_values, extrema 
--- a/NN/train_rl.py
+++ b/NN/train_rl.py
@ -29,6 +29,21 @@ logging.basicConfig(
    ]
 )
 # Set up device for PyTorch (use GPU if available)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Log GPU status
 if torch.cuda.is_available():
    gpu_count = torch.cuda.device_count()
    gpu_names = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
    logger.info(f"Using GPU: {gpu_names}")
    # Enable TensorFloat32 for NVIDIA Ampere GPUs for faster training
    if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
        logger.info("BFloat16 precision is supported - will use for faster training")
 else:
    logger.warning("GPU not available. Using CPU for training (slower).")
 class RLTradingEnvironment(gym.Env):
    """
    Reinforcement Learning environment for trading with technical indicators
@ -266,87 +281,151 @@ class RLTradingEnvironment(gym.Env):
 def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/models/saved/dqn_agent", 
             action_callback=None, episode_callback=None, symbol="BTC/USDT"):
    """
-    Train DQN agent for RL-based trading with extended training and monitoring
+    Train a reinforcement learning agent for trading
    Args:
-        env_class: Optional environment class to use, defaults to RLTradingEnvironment
+        env_class: Optional environment class override
-        num_episodes: Number of episodes to train
+        num_episodes: Number of episodes to train for
        max_steps: Maximum steps per episode
-        save_path: Path to save the model
+        save_path: Path to save the trained model
-        action_callback: Optional callback for each action (step, action, price, reward, info)
+        action_callback: Callback function for monitoring actions
-        episode_callback: Optional callback after each episode (episode, reward, info)
+        episode_callback: Callback function for monitoring episodes
-        symbol: Trading pair symbol (e.g., "BTC/USDT")
+        symbol: Trading symbol to use
    Returns:
-        DQNAgent: The trained agent
+        tuple: (trained agent, environment)
    """
-    import pandas as pd
+    # Load data for the selected symbol
-    from NN.utils.data_interface import DataInterface
+    data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
-    logger.info("Starting DQN training for RL trading")
+    try:
        # Try to load data for the requested symbol using get_historical_data method
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
        if data_1m is None or data_5m is None or data_15m is None:
            raise FileNotFoundError("Could not retrieve data for specified symbol")
    except Exception as e:
        logger.warning(f"Data for {symbol} not available: {str(e)}. Using default data.")
        # Try to use cached data if available
        symbol = "BTC/USDT"
        data_interface = DataInterface(symbol=symbol, timeframes=['1m', '5m', '15m'])
        data_1m = data_interface.get_historical_data(timeframe='1m', n_candles=5000)
        data_5m = data_interface.get_historical_data(timeframe='5m', n_candles=5000)
        data_15m = data_interface.get_historical_data(timeframe='15m', n_candles=5000)
        if data_1m is None or data_5m is None or data_15m is None:
            logger.error("Failed to retrieve any data. Cannot continue training.")
            raise ValueError("No data available for training")
-    # Create data interface with specified symbol
+    # Create features from the data by adding technical indicators and converting to numpy format
-    data_interface = DataInterface(symbol=symbol)
+    if data_1m is not None:
-    
+        data_1m = data_interface.add_technical_indicators(data_1m)
-    # Load and preprocess data
+        # Convert to numpy array with close price as the last column
-    logger.info(f"Loading data from multiple timeframes for {symbol}")
+        features_1m = np.hstack([
-    features_1m = data_interface.get_training_data("1m", n_candles=2000)
+            data_1m.drop(['timestamp', 'close'], axis=1).values,
-    features_5m = data_interface.get_training_data("5m", n_candles=1000)
+            data_1m['close'].values.reshape(-1, 1)
-    features_15m = data_interface.get_training_data("15m", n_candles=500)
+        ])
    # Check if we have all the data
    if features_1m is None or features_5m is None or features_15m is None:
        logger.error("Failed to load training data from one or more timeframes")
        return None
    # If data is a DataFrame, convert to numpy array excluding the timestamp column
    if isinstance(features_1m, pd.DataFrame):
        features_1m = features_1m.drop('timestamp', axis=1, errors='ignore').values
    if isinstance(features_5m, pd.DataFrame):
        features_5m = features_5m.drop('timestamp', axis=1, errors='ignore').values
    if isinstance(features_15m, pd.DataFrame):
        features_15m = features_15m.drop('timestamp', axis=1, errors='ignore').values
    # Initialize environment or use provided class
    if env_class is None:
        env = RLTradingEnvironment(features_1m, features_5m, features_15m)
    else:
        features_1m = None
    if data_5m is not None:
        data_5m = data_interface.add_technical_indicators(data_5m)
        # Convert to numpy array with close price as the last column
        features_5m = np.hstack([
            data_5m.drop(['timestamp', 'close'], axis=1).values,
            data_5m['close'].values.reshape(-1, 1)
        ])
    else:
        features_5m = None
    if data_15m is not None:
        data_15m = data_interface.add_technical_indicators(data_15m)
        # Convert to numpy array with close price as the last column
        features_15m = np.hstack([
            data_15m.drop(['timestamp', 'close'], axis=1).values,
            data_15m['close'].values.reshape(-1, 1)
        ])
    else:
        features_15m = None
    # Check if we have all the required features
    if features_1m is None or features_5m is None or features_15m is None:
        logger.error("Failed to create features for all timeframes.")
        raise ValueError("Could not create features for training")
    # Create the environment
    if env_class:
        # Use provided environment class
        env = env_class(features_1m, features_5m, features_15m)
    else:
        # Use the default environment
        env = RLTradingEnvironment(features_1m, features_5m, features_15m)
    # Set action callback if provided
    if action_callback:
-        def step_callback(action, price, reward, info):
+        env.set_action_callback(action_callback)
            action_callback(env.current_step, action, price, reward, info)
        env.set_action_callback(step_callback)
-    # Initialize agent
+    # Get environment properties for agent creation
-    window_size = env.window_size
+    input_shape = env.observation_space.shape
-    num_features = env.num_features * env.num_timeframes
+    n_actions = env.action_space.n
    action_size = env.action_space.n
    timeframes = ['1m', '5m', '15m']  # Match the timeframes from the environment
    # Create the agent
    agent = DQNAgent(
-        state_size=window_size * num_features,
+        state_shape=input_shape,
-        action_size=action_size,
+        n_actions=n_actions,
-        window_size=window_size,
+        epsilon=1.0,
-        num_features=env.num_features,
+        epsilon_decay=0.995,
-        timeframes=timeframes,
+        epsilon_min=0.01,
        memory_size=100000,
        batch_size=64,
        learning_rate=0.0001,
        gamma=0.99,
-        epsilon=1.0,
+        buffer_size=10000,
-        epsilon_min=0.01,
+        batch_size=64,
-        epsilon_decay=0.995
+        device=device  # Pass device to agent for GPU usage
    )
-    # Training variables
+    # Check if model file exists and load it
-    best_reward = -float('inf')
+    model_file = f"{save_path}_model.pth"
-    episode_rewards = []
+    if os.path.exists(model_file):
        try:
            agent.load(model_file)
            logger.info(f"Loaded existing model from {model_file}")
        except Exception as e:
            logger.error(f"Error loading model: {e}")
    else:
        logger.info("No existing model found. Starting with a new model.")
-    # TensorBoard writer for logging
+    # Create TensorBoard writer
-    writer = SummaryWriter(log_dir=f'runs/rl_trading_{int(time.time())}')
+    writer = SummaryWriter(log_dir=f'runs/dqn_{int(time.time())}')
    # Log GPU status to TensorBoard
    writer.add_text("hardware/device", str(device), 0)
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            writer.add_text(f"hardware/gpu_{i}", torch.cuda.get_device_name(i), 0)
    # Training loop
    total_rewards = []
    trade_win_rates = []
    best_reward = -np.inf
    # Move models to the appropriate device if not already there
    agent.move_models_to_device(device)
    # Enable mixed precision if GPU and feature is available
    use_mixed_precision = False
    if torch.cuda.is_available() and hasattr(torch.cuda, 'amp'):
        logger.info("Enabling mixed precision training")
        use_mixed_precision = True
        scaler = torch.cuda.amp.GradScaler()
    # Define step callback for tensorboard logging and model tracking
    def step_callback(action, price, reward, info):
        # Pass to external callback if provided
        if action_callback:
            action_callback(env.current_step, action, price, reward, info)
    # Main training loop
    logger.info(f"Starting training for {num_episodes} episodes...")
    logger.info(f"Starting training on device: {agent.device}")
@ -378,12 +457,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
                    break
            # Track rewards
-            episode_rewards.append(total_reward)
+            total_rewards.append(total_reward)
            # Log progress
            avg_reward = np.mean(episode_rewards[-100:])
            logger.info(f"Episode {episode}/{num_episodes} - Reward: {total_reward:.4f}, " +
                        f"Avg (100): {avg_reward:.4f}, Epsilon: {agent.epsilon:.4f}")
            # Calculate trading metrics
            win_rate = env.win_rate if hasattr(env, 'win_rate') else 0
@ -391,15 +465,14 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
            # Log to TensorBoard
            writer.add_scalar('Reward/Episode', total_reward, episode)
            writer.add_scalar('Reward/Average100', avg_reward, episode)
            writer.add_scalar('Trade/WinRate', win_rate, episode)
            writer.add_scalar('Trade/Count', trades, episode)
            # Save best model
-            if avg_reward > best_reward and episode > 10:
+            if total_reward > best_reward and episode > 10:
-                logger.info(f"New best average reward: {avg_reward:.4f}, saving model")
+                logger.info(f"New best average reward: {total_reward:.4f}, saving model")
                agent.save(save_path)
-                best_reward = avg_reward
+                best_reward = total_reward
            # Periodic save every 100 episodes
            if episode % 100 == 0 and episode > 0:
@ -424,7 +497,7 @@ def train_rl(env_class=None, num_episodes=5000, max_steps=2000, save_path="NN/mo
    # Close TensorBoard writer
    writer.close()
-    return agent
+    return agent, env
 if __name__ == "__main__":
    train_rl() 
--- a/TODO_IMPROVEMENTS.md
+++ b/TODO_IMPROVEMENTS.md
@ -0,0 +1,224 @@
 # Cryptocurrency Trading System Improvements
 ## Overview
 This document outlines necessary improvements to our cryptocurrency trading system to enhance performance, profitability, and monitoring capabilities.
 ## High Priority Tasks
 ### 1. GPU Utilization for Training
 - [x] Fix GPU detection and utilization during training
  - [x] Debug why CUDA is detected but not utilized (check logs showing "Starting training on device: cpu")
  - [x] Ensure PyTorch correctly detects and uses available CUDA devices
  - [x] Add GPU memory monitoring during training
  - [x] Optimize batch sizes for GPU training
 Implementation status:
 - Added `setup_gpu()` function in `train_rl_with_realtime.py` to properly detect and configure GPU usage
 - Added device parameter to DQNAgent to ensure models are created on the correct device
 - Implemented mixed precision training for faster GPU-based training
 - Added GPU memory monitoring and logging to TensorBoard
 ### 2. Trade Signal Rate Display
 - [x] Add metrics to track and display trading frequency
  - [x] Implement counter for actions per second/minute/hour
  - [x] Add visualization to the chart showing trading frequency over time
  - [x] Create a moving average of trade signals to show trends
  - [x] Add dashboard section showing current and average trading rates
 Implementation status:
 - Added trade time tracking in `_add_trade_compat` function
 - Added `calculate_trade_rate` method to `RealTimeChart` class
 - Updated dashboard layout to display trade rates
 - Added visualization of trade frequency in chart's bottom panel
 ### 3. Reward Function Optimization
 - [x] Revise reward function to better balance profit and risk
  - [x] Increase transaction fee penalty for more realistic simulation
  - [x] Implement progressive rewards based on holding time
  - [x] Add penalty for frequent trading (to reduce noise)
  - [x] Scale rewards based on market volatility
  - [x] Implement risk-adjusted returns (Sharpe ratio) in reward calculation
 Implementation status:
 - Created `improved_reward_function.py` with `ImprovedRewardCalculator` class
 - Implemented Sharpe ratio for risk-adjusted rewards
 - Added frequency penalty for excessive trading
 - Added holding time rewards for profitable positions
 - Integrated with `EnhancedRLTradingEnvironment` class
 ### 4. Multi-timeframe Price Direction Prediction
 - [ ] Extend CNN model to predict price direction for multiple timeframes
  - [ ] Modify CNN output to predict short, mid, and long-term price directions
  - [ ] Create data generation method for back-propagation using historical data
  - [ ] Implement real-time example generation for training
  - [ ] Feed direction predictions to RL agent as additional state information
 ## Medium Priority Tasks
 ### 5. Position Sizing Optimization
 - [ ] Implement dynamic position sizing based on confidence and volatility
  - [ ] Add confidence score to model outputs
  - [ ] Scale position size based on prediction confidence
  - [ ] Implement Kelly criterion for optimal position sizing
 ### 6. Training Data Augmentation
 - [ ] Implement data augmentation for more robust training
  - [ ] Simulate different market conditions
  - [ ] Add noise to training data
  - [ ] Generate synthetic data for rare market events
 ### 7. Model Interpretability
 - [ ] Add visualization for model decision making
  - [ ] Implement feature importance analysis
  - [ ] Add attention visualization for key price patterns
  - [ ] Create explainable AI components
 ## Implementation Details
 ### Completed: Displaying Trade Rate
 The trade rate display implementation has been completed in the `RealTimeChart` class:
 ```python
 def calculate_trade_rate(self):
    """Calculate and return trading rate statistics based on recent trades"""
    if not hasattr(self, 'trade_times') or not self.trade_times:
        return {"per_second": 0, "per_minute": 0, "per_hour": 0}
    # Get current time
    now = datetime.now()
    # Calculate different time windows
    one_second_ago = now - timedelta(seconds=1)
    one_minute_ago = now - timedelta(minutes=1)
    one_hour_ago = now - timedelta(hours=1)
    # Count trades in different time windows
    trades_last_second = sum(1 for t in self.trade_times if t > one_second_ago)
    trades_last_minute = sum(1 for t in self.trade_times if t > one_minute_ago)
    trades_last_hour = sum(1 for t in self.trade_times if t > one_hour_ago)
    # Calculate rates
    return {
        "per_second": trades_last_second,
        "per_minute": trades_last_minute,
        "per_hour": trades_last_hour
    }
 ```
 ### Completed: Improved Reward Function
 The improved reward function has been implemented in `improved_reward_function.py`:
 ```python
 def calculate_reward(self, action, price_change, position_held_time=0, 
                     volatility=None, is_profitable=False):
    """
    Calculate the improved reward with risk adjustment
    """
    # Calculate trading fee
    fee = self.base_fee_rate
    # Calculate frequency penalty
    frequency_penalty = self._calculate_frequency_penalty()
    # Base reward calculation
    if action == 0:  # BUY
        # Small penalty for transaction plus frequency penalty
        reward = -fee - frequency_penalty
    elif action == 1:  # SELL
        # Calculate profit percentage minus fees (both entry and exit)
        profit_pct = price_change
        net_profit = profit_pct - (fee * 2)
        # Scale reward and apply frequency penalty
        reward = net_profit * 10  # Scale reward
        reward -= frequency_penalty
        # Record PnL for risk adjustment
        self.record_pnl(net_profit)
    else:  # HOLD
        # Small reward for holding a profitable position, small cost otherwise
        if is_profitable:
            reward = self._calculate_holding_reward(position_held_time, price_change)
        else:
            reward = -0.0001  # Very small negative reward
    # Apply risk adjustment if enabled
    if self.risk_adjusted:
        reward = self._calculate_risk_adjustment(reward)
    # Record this action for future frequency calculations
    self.record_trade(action=action)
    return reward
 ```
 ### Completed: GPU Optimization
 Added GPU optimization in `train_rl_with_realtime.py`:
 ```python
 def setup_gpu():
    """
    Configure GPU usage for PyTorch training
    Returns:
        tuple: (success, device, message)
    """
    try:
        if torch.cuda.is_available():
            gpu_count = torch.cuda.device_count()
            device_info = [torch.cuda.get_device_name(i) for i in range(gpu_count)]
            logger.info(f"Found {gpu_count} GPU(s): {', '.join(device_info)}")
            device = torch.device("cuda:0")
            # Test CUDA by creating a small tensor
            test_tensor = torch.tensor([1.0, 2.0, 3.0], device=device)
            # Enable mixed precision if supported
            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
                logger.info("BFloat16 is supported - enabling for faster training")
            return True, device, f"GPU enabled: {device_info}"
        else:
            return False, torch.device("cpu"), "GPU not available, using CPU"
    except Exception as e:
        return False, torch.device("cpu"), f"GPU setup failed: {str(e)}"
 ```
 ### CNN Price Direction Prediction (To be implemented)
 ```python
 def generate_direction_examples(self, historical_data, timeframes=['1m', '1h', '1d']):
    """Generate price direction examples from historical data"""
    examples = []
    labels = []
    for tf in timeframes:
        df = historical_data[tf]
        for i in range(20, len(df) - 10):
            # Use window of 20 candles for input
            window = df.iloc[i-20:i]
            # Create labels for future price direction (next 5, 10, 20 candles)
            future_5 = df.iloc[i].close < df.iloc[i+5].close  # True if price goes up
            future_10 = df.iloc[i].close < df.iloc[i+10].close
            future_20 = df.iloc[i].close < df.iloc[min(i+20, len(df)-1)].close
            examples.append(window.values)
            labels.append([future_5, future_10, future_20])
    return np.array(examples), np.array(labels)
 ```
 ## Validation Plan
 After implementing these improvements, we should validate the system with:
 1. Backtesting on historical data 
 2. Forward testing with small position sizes
 3. A/B testing of different reward functions
 4. Measuring the improvement in profitability and Sharpe ratio
 ## Progress Tracking
 - Implementation started: June 2023
 - GPU utilization fixed: July 2023
 - Trade signal rate display implemented: July 2023
 - Reward function optimized: July 2023
 - CNN direction prediction added: To be completed
 - Full system tested: To be completed 
--- a/improved_reward_function.py
+++ b/improved_reward_function.py
@ -0,0 +1,180 @@
 """
 Improved Reward Function for RL Trading Agent
 This module provides a more sophisticated reward function for the RL trading agent
 that incorporates realistic trading fees, penalties for excessive trading, and
 rewards for successful holding of positions.
 """
 import numpy as np
 from datetime import datetime, timedelta
 from collections import deque
 class ImprovedRewardCalculator:
    def __init__(self, 
                 base_fee_rate=0.001,            # 0.1% per transaction
                 max_frequency_penalty=0.005,    # Maximum 0.5% penalty for frequent trading
                 holding_reward_rate=0.0001,     # Small reward for holding profitable positions
                 risk_adjusted=True):            # Use Sharpe ratio for risk adjustment
        self.base_fee_rate = base_fee_rate
        self.max_frequency_penalty = max_frequency_penalty
        self.holding_reward_rate = holding_reward_rate
        self.risk_adjusted = risk_adjusted
        # Keep track of recent trades
        self.recent_trades = deque(maxlen=1000)
        self.trade_pnls = deque(maxlen=100)  # For risk adjustment
    def record_trade(self, timestamp=None, action=None, price=None):
        """Record a trade for frequency tracking"""
        if timestamp is None:
            timestamp = datetime.now()
        self.recent_trades.append({
            'timestamp': timestamp,
            'action': action,
            'price': price
        })
    def record_pnl(self, pnl):
        """Record a PnL result for risk adjustment"""
        self.trade_pnls.append(pnl)
    def _calculate_frequency_penalty(self):
        """Calculate penalty for trading too frequently"""
        if len(self.recent_trades) < 2:
            return 0.0
        # Count trades in the last minute
        now = datetime.now()
        one_minute_ago = now - timedelta(minutes=1)
        trades_last_minute = sum(1 for trade in self.recent_trades 
                              if trade['timestamp'] > one_minute_ago)
        # Apply progressive penalty (more severe as frequency increases)
        if trades_last_minute <= 1:
            return 0.0  # No penalty for normal trading rate
        # Progressive penalty based on trade frequency
        penalty = min(self.max_frequency_penalty, 
                      self.base_fee_rate * trades_last_minute)
        return penalty
    def _calculate_holding_reward(self, position_held_time, price_change_pct):
        """Calculate reward for holding a position for some time"""
        if position_held_time <= 0 or price_change_pct <= 0:
            return 0.0  # No reward for unprofitable holds
        # Cap at 100 time units (seconds, minutes, etc.)
        capped_time = min(position_held_time, 100)
        # Scale reward by both time and price change
        reward = self.holding_reward_rate * capped_time * price_change_pct
        return reward
    def _calculate_risk_adjustment(self, reward):
        """Adjust rewards based on risk (simple Sharpe ratio implementation)"""
        if len(self.trade_pnls) < 5:
            return reward  # Not enough data for adjustment
        # Calculate mean and standard deviation of returns
        pnl_array = np.array(self.trade_pnls)
        mean_return = np.mean(pnl_array)
        std_return = np.std(pnl_array)
        if std_return == 0:
            return reward  # Avoid division by zero
        # Simplified Sharpe ratio
        sharpe = mean_return / std_return
        # Scale reward by Sharpe ratio (normalized to be around 1.0)
        adjustment_factor = np.clip(1.0 + 0.5 * sharpe, 0.5, 2.0)
        return reward * adjustment_factor
    def calculate_reward(self, action, price_change, position_held_time=0, 
                         volatility=None, is_profitable=False):
        """
        Calculate the improved reward
        Args:
            action (int): 0 = Buy, 1 = Sell, 2 = Hold
            price_change (float): Percent price change for the trade
            position_held_time (int): Time position was held (in time units)
            volatility (float, optional): Market volatility measure
            is_profitable (bool): Whether current position is profitable
        Returns:
            float: Calculated reward value
        """
        # Calculate trading fee
        fee = self.base_fee_rate
        # Calculate frequency penalty
        frequency_penalty = self._calculate_frequency_penalty()
        # Base reward calculation
        if action == 0:  # Buy
            # Small penalty for transaction plus frequency penalty
            reward = -fee - frequency_penalty
        elif action == 1:  # Sell
            # Calculate profit percentage minus fees (both entry and exit)
            profit_pct = price_change
            net_profit = profit_pct - (fee * 2)
            # Scale reward and apply frequency penalty
            reward = net_profit * 10  # Scale reward
            reward -= frequency_penalty
            # Record PnL for risk adjustment
            self.record_pnl(net_profit)
        else:  # Hold
            # Small reward for holding a profitable position, small cost otherwise
            if is_profitable:
                reward = self._calculate_holding_reward(position_held_time, price_change)
            else:
                reward = -0.0001  # Very small negative reward
        # Apply risk adjustment if enabled
        if self.risk_adjusted:
            reward = self._calculate_risk_adjustment(reward)
        # Record this action for future frequency calculations
        self.record_trade(action=action)
        return reward
 # Example usage:
 if __name__ == "__main__":
    # Create calculator instance
    reward_calc = ImprovedRewardCalculator()
    # Example reward for a buy action
    buy_reward = reward_calc.calculate_reward(action=0, price_change=0)
    print(f"Buy action reward: {buy_reward:.5f}")
    # Record a trade for frequency tracking
    reward_calc.record_trade(action=0)
    # Wait a bit and make another trade to test frequency penalty
    import time
    time.sleep(0.1)
    # Example reward for a sell action with profit
    sell_reward = reward_calc.calculate_reward(action=1, price_change=0.015, position_held_time=60)
    print(f"Sell action reward (with profit): {sell_reward:.5f}")
    # Example reward for a hold action on profitable position
    hold_reward = reward_calc.calculate_reward(action=2, price_change=0.01, position_held_time=30, is_profitable=True)
    print(f"Hold action reward (profitable): {hold_reward:.5f}")
    # Example reward for a hold action on unprofitable position
    hold_reward_neg = reward_calc.calculate_reward(action=2, price_change=-0.01, position_held_time=30, is_profitable=False)
    print(f"Hold action reward (unprofitable): {hold_reward_neg:.5f}") 
--- a/realtime.py
+++ b/realtime.py
@ -777,6 +777,11 @@ class RealTimeChart:
        self.accumulative_pnl = 0.0  # Reset PnL
        self.current_balance = 100.0  # Start with $100 balance
        # Add trade rate tracking variables
        self.trade_times = []  # List to store timestamps of trades
        self.last_trade_rate_calculation = datetime.now()
        self.trade_rate = {"per_second": 0, "per_minute": 0, "per_hour": 0}
        # Store historical data for different timeframes
        self.timeframe_data = {
            '1s': [],
@ -833,6 +838,19 @@ class RealTimeChart:
                        html.H5("Accumulated PnL:", style={"display": "inline-block", "marginRight": "10px", "marginLeft": "30px"}),
                        html.H5(id="accumulated-pnl", style={"display": "inline-block", "color": "#ffc107"}),
                    ], style={"display": "inline-block", "marginLeft": "40px"}),
                    # Add trade rate display
                    html.Div([
                        html.H5("Trade Rate:", style={"display": "inline-block", "marginRight": "10px", "marginLeft": "30px"}),
                        html.Span([
                            html.Span(id="trade-rate-second", style={"color": "#ff7f0e"}),
                            html.Span("/s, "),
                            html.Span(id="trade-rate-minute", style={"color": "#ff7f0e"}),
                            html.Span("/m, "),
                            html.Span(id="trade-rate-hour", style={"color": "#ff7f0e"}),
                            html.Span("/h")
                        ], style={"display": "inline-block"}),
                    ], style={"display": "inline-block", "marginLeft": "40px"}),
                ], style={"textAlign": "center", "margin": "20px 0"}),
            ], style={"textAlign": "center", "marginBottom": "20px"}),
@ -958,7 +976,10 @@ class RealTimeChart:
             Output('positions-list', 'children'),
             Output('current-price', 'children'),
             Output('current-balance', 'children'),
-             Output('accumulated-pnl', 'children')],
+             Output('accumulated-pnl', 'children'),
             Output('trade-rate-second', 'children'),
             Output('trade-rate-minute', 'children'),
             Output('trade-rate-hour', 'children')],
            [Input('interval-component', 'n_intervals'),
             Input('interval-store', 'data')]
        )
@ -983,13 +1004,19 @@ class RealTimeChart:
                balance_text = f"${self.current_balance:.2f}"
                pnl_text = f"${self.accumulative_pnl:.2f}"
-                return main_fig, secondary_fig, positions, current_price, balance_text, pnl_text
+                # Get trade rate statistics
                trade_rate = self.calculate_trade_rate()
                per_second = f"{trade_rate['per_second']:.1f}"
                per_minute = f"{trade_rate['per_minute']:.1f}"
                per_hour = f"{trade_rate['per_hour']:.1f}"
                return main_fig, secondary_fig, positions, current_price, balance_text, pnl_text, per_second, per_minute, per_hour
            except Exception as e:
                logger.error(f"Error in update callback: {str(e)}")
                import traceback
                logger.error(traceback.format_exc())
                # Return empty updates on error
-                return {}, {}, [], "Error", "$0.00", "$0.00"
+                return {}, {}, [], "Error", "$0.00", "$0.00", "0.0", "0.0", "0.0"
    def _update_main_chart(self, interval=1):
        """Update the main chart with OHLC data"""
@ -1046,12 +1073,13 @@ class RealTimeChart:
            closes = [candle['close'] for candle in candles]
            volumes = [candle['volume'] for candle in candles]
-            # Create figure
+            # Create figure with 3 rows for OHLC, volume, and trade rate
-            fig = make_subplots(rows=2, cols=1, shared_xaxes=True, 
+            fig = make_subplots(rows=3, cols=1, shared_xaxes=True, 
                               vertical_spacing=0.02, 
-                               row_heights=[0.8, 0.2],
+                               row_heights=[0.6, 0.2, 0.2],
                               specs=[[{"type": "candlestick"}],
-                                      [{"type": "bar"}]])
+                                      [{"type": "bar"}],
                                      [{"type": "scatter"}]])
            # Add candlestick trace
            fig.add_trace(go.Candlestick(
@ -1070,14 +1098,13 @@ class RealTimeChart:
                x=timestamps,
                y=volumes,
                name='Volume',
-                marker_color='rgba(100, 100, 255, 0.5)'
+                marker=dict(color='rgba(0,0,100,0.2)')
            ), row=2, col=1)
            # Add trading markers if available
            if hasattr(self, 'positions') and self.positions:
-                # Get last 100 positions for display (to avoid too many markers)
+                # Get last 500 positions for display (to avoid too many markers)
-                positions = self.positions[-100:]
+                positions = self.positions[-500:]
                buy_timestamps = []
                buy_prices = []
@ -1122,14 +1149,69 @@ class RealTimeChart:
                        )
                    ), row=1, col=1)
-
+            # Add trade rate visualization in the third panel
            if hasattr(self, 'trade_times') and self.trade_times:
                # Create time buckets for grouping trade times
                time_buckets = {}
                bucket_size_seconds = 15  # Default bucket size
                # Adjust bucket size based on interval
                if interval >= 60:  # 1m or more
                    bucket_size_seconds = 60
                elif interval >= 300:  # 5m or more
                    bucket_size_seconds = 300
                # Process trade times into buckets
                for trade_time in self.trade_times:
                    # Skip trades older than the displayed range
                    if trade_time < timestamps[0]:
                        continue
                    # Create bucket key
                    bucket_timestamp = trade_time.replace(
                        microsecond=0,
                        second=(trade_time.second // bucket_size_seconds) * bucket_size_seconds
                    )
                    if bucket_timestamp.timestamp() not in time_buckets:
                        time_buckets[bucket_timestamp.timestamp()] = 0
                    time_buckets[bucket_timestamp.timestamp()] += 1
                # Convert buckets to series for plotting
                if time_buckets:
                    bucket_timestamps = []
                    bucket_counts = []
                    for timestamp, count in sorted(time_buckets.items()):
                        bucket_timestamps.append(datetime.fromtimestamp(timestamp))
                        bucket_counts.append(count)
                    # Add trade frequency chart
                    fig.add_trace(go.Scatter(
                        x=bucket_timestamps,
                        y=bucket_counts,
                        mode='lines',
                        name='Trades per Bucket',
                        line=dict(width=2, color='rgba(255, 165, 0, 0.8)'),
                        fill='tozeroy',
                        fillcolor='rgba(255, 165, 0, 0.2)'
                    ), row=3, col=1)
                    # Add current trade rate
                    trade_rate = self.calculate_trade_rate()
                    fig.add_annotation(
                        text=f"Trade Rate: {trade_rate['per_minute']:.1f}/min",
                        xref="paper", yref="y3",
                        x=0.99, y=max(bucket_counts) * 0.9 if bucket_counts else 1,
                        showarrow=False,
                        font=dict(size=10, color="orange"),
                        align="right"
                    )
            # Update layout
            fig.update_layout(
                title=f"{self.symbol} - {interval_key}",
                xaxis_title="Time",
                yaxis_title="Price",
-                height=600,
+                height=700,  # Increase height to accommodate additional panel
                template="plotly_dark",
                showlegend=True,
                margin=dict(l=0, r=0, t=50, b=20),
@ -1138,7 +1220,11 @@ class RealTimeChart:
            )
            # Format Y-axis with enough decimal places for cryptocurrency
-            fig.update_yaxes(tickformat=".2f")
+            fig.update_yaxes(tickformat=".2f", row=1, col=1)
            # Add titles to each panel
            fig.update_yaxes(title_text="Volume", row=2, col=1)
            fig.update_yaxes(title_text="Trade Rate", row=3, col=1)
            # Format X-axis with date/time
            fig.update_xaxes(
@ -1377,6 +1463,42 @@ class RealTimeChart:
        else:
            return f"{interval_seconds // 86400}d"
    def calculate_trade_rate(self):
        """Calculate and return trading rate statistics"""
        now = datetime.now()
        # Only calculate once per second to avoid unnecessary processing
        if (now - self.last_trade_rate_calculation).total_seconds() < 1.0:
            return self.trade_rate
        self.last_trade_rate_calculation = now
        # Clean up old trade times (older than 1 hour)
        one_hour_ago = now - timedelta(hours=1)
        self.trade_times = [t for t in self.trade_times if t > one_hour_ago]
        if not self.trade_times:
            self.trade_rate = {"per_second": 0, "per_minute": 0, "per_hour": 0}
            return self.trade_rate
        # Calculate rates based on time windows
        last_second = now - timedelta(seconds=1)
        last_minute = now - timedelta(minutes=1)
        # Count trades in each time window
        trades_last_second = sum(1 for t in self.trade_times if t > last_second)
        trades_last_minute = sum(1 for t in self.trade_times if t > last_minute)
        trades_last_hour = len(self.trade_times)  # All remaining trades are from last hour
        # Calculate rates
        self.trade_rate = {
            "per_second": trades_last_second,
            "per_minute": trades_last_minute,
            "per_hour": trades_last_hour
        }
        return self.trade_rate
    def _update_chart_and_positions(self):
        """Update the chart with current data and positions"""
        try:
--- a/train_rl_with_realtime.py
+++ b/train_rl_with_realtime.py
@ -23,6 +23,14 @@ import argparse
 from scipy.signal import argrelextrema
 from torch.utils.tensorboard import SummaryWriter
 # Add the improved reward function
 try:
    from improved_reward_function import ImprovedRewardCalculator
    reward_calculator_available = True
 except ImportError:
    logging.warning("Improved reward function not available, using default reward")
    reward_calculator_available = False
 # Configure logging
 logger = logging.getLogger('rl_realtime')
@ -31,6 +39,62 @@ project_root = os.path.dirname(os.path.abspath(__file__))
 if project_root not in sys.path:
    sys.path.append(project_root)
 # Set up GPU/CUDA if available
 def setup_gpu():
    """
    Configure GPU usage for PyTorch training
    Returns:
        tuple: (success, device, message)
            - success: bool indicating if GPU is available and configured
            - device: torch device object
            - message: descriptive message about GPU status
    """
    try:
        # Check if CUDA is available
        if torch.cuda.is_available():
            # Get the number of GPUs
            gpu_count = torch.cuda.device_count()
            # Print GPU info
            device_info = []
            for i in range(gpu_count):
                device_name = torch.cuda.get_device_name(i)
                device_info.append(f"GPU {i}: {device_name}")
            # Log GPU info
            logger.info(f"Found {gpu_count} GPU(s): {', '.join(device_info)}")
            # Set CUDA device and ensure PyTorch can use it
            device = torch.device("cuda:0")  # Use first GPU by default
            # Enable TensorFloat32 for NVIDIA Ampere-based GPUs (A100, RTX 30xx, etc.)
            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
                logger.info("BFloat16 is supported - enabling for faster training")
                # This will be used in model definition
            # Test CUDA by creating a small tensor
            test_tensor = torch.tensor([1.0, 2.0, 3.0], device=device)
            logger.info(f"CUDA test successful: {test_tensor.device}")
            # Set environment variables to optimize CUDA performance
            os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Makes debugging easier
            # Return success with device
            return True, device, f"GPU enabled: {device_info}"
        else:
            logger.warning("CUDA is not available. Training will use CPU only.")
            return False, torch.device("cpu"), "GPU not available, using CPU"
    except Exception as e:
        logger.error(f"Error setting up GPU: {str(e)}")
        logger.info("Falling back to CPU training")
        return False, torch.device("cpu"), f"GPU setup failed: {str(e)}"
 # Run GPU setup at module import time
 gpu_available, device, gpu_message = setup_gpu()
 logger.info(gpu_message)
 # Global variables for coordination
 realtime_chart = None
 realtime_websocket_task = None
@ -129,6 +193,10 @@ class RLTrainingIntegrator:
        # TensorBoard writer
        self.tensorboard_writer = None
        # Device for computation (GPU or CPU)
        self.device = device
        self.gpu_available = gpu_available
    def _train_on_extrema(self, agent, env):
        """Train the agent specifically on local extrema points"""
        if not hasattr(env, 'data') or not hasattr(env, 'original_data'):
@ -290,6 +358,16 @@ class RLTrainingIntegrator:
            log_dir = f'runs/rl_realtime_{int(time.time())}'
            self.tensorboard_writer = SummaryWriter(log_dir=log_dir)
            logger.info(f"TensorBoard logging enabled at {log_dir}")
            # Log GPU status in TensorBoard
            self.tensorboard_writer.add_text("setup/gpu_status", gpu_message, 0)
            if self.gpu_available:
                # Log GPU memory usage
                for i in range(torch.cuda.device_count()):
                    mem_allocated = torch.cuda.memory_allocated(i) / (1024 ** 2)  # MB
                    mem_reserved = torch.cuda.memory_reserved(i) / (1024 ** 2)    # MB
                    self.tensorboard_writer.add_scalar(f"gpu/memory_allocated_MB_device{i}", mem_allocated, 0)
                    self.tensorboard_writer.add_scalar(f"gpu/memory_reserved_MB_device{i}", mem_reserved, 0)
        except Exception as e:
            logger.error(f"Failed to initialize TensorBoard writer: {str(e)}")
            self.tensorboard_writer = None
@ -315,6 +393,17 @@ class RLTrainingIntegrator:
                    # TensorBoard writer
                    self.writer = None
                    # Initialize improved reward calculator if available
                    self.use_improved_reward = reward_calculator_available
                    if self.use_improved_reward:
                        self.reward_calculator = ImprovedRewardCalculator(
                            base_fee_rate=trading_fee,
                            max_frequency_penalty=0.005,
                            holding_reward_rate=0.0002,
                            risk_adjusted=True
                        )
                        logging.info("Using improved reward function with risk adjustment")
                def set_integrator(self, integrator):
                    """Set reference to integrator for callbacks"""
@ -334,16 +423,65 @@ class RLTrainingIntegrator:
                    current_price = self.features_1m[self.current_step, -1]
                    next_price = self.features_1m[self.current_step + 1, -1]
                    # Default values
                    pnl = 0.0
                    reward = -0.0001  # Small negative reward to discourage excessive actions
                    # Get real market price if available (from integrator)
                    real_market_price = None
                    if self.integrator and hasattr(self.integrator, 'chart') and self.integrator.chart:
                        if hasattr(self.integrator.chart, 'tick_storage'):
                            real_market_price = self.integrator.chart.tick_storage.get_latest_price()
                    # Use actual market price if available, otherwise use the candle price
                    price_to_use = real_market_price if real_market_price else current_price
                    # Calculate price change and initial variables
                    price_change = 0
                    if self.integrator and self.integrator.entry_price:
                        price_change = (price_to_use - self.integrator.entry_price) / self.integrator.entry_price
                    # Calculate position held time
                    position_held_time = 0
                    if self.integrator and self.integrator.entry_time:
                        position_held_time = self.current_step - self.integrator.entry_time
                    # Determine if position is profitable
                    is_profitable = False
                    if price_change > 0:
                        is_profitable = True
                    # If using improved reward calculator
                    if self.use_improved_reward:
                        # Convert our action to the format expected by the reward calculator
                        # 0:BUY, 1:SELL, 2:HOLD -> For calculator it's the same
                        reward_calc_action = action
                        # Calculate reward using the improved calculator
                        reward = self.reward_calculator.calculate_reward(
                            action=reward_calc_action,
                            price_change=price_change,
                            position_held_time=position_held_time,
                            is_profitable=is_profitable
                        )
                        # Record the trade for frequency tracking
                        self.reward_calculator.record_trade(
                            timestamp=datetime.now(),
                            action=action,
                            price=price_to_use
                        )
                        # If we have a PnL result, record it
                        if action == 1 and self.integrator and self.integrator.current_position_size > 0:
                            pnl = price_change - (self.trading_fee * 2)  # Account for entry and exit fees
                            self.reward_calculator.record_pnl(pnl)
                        # Log the reward calculation
                        logging.debug(f"Improved reward for action {action}: {reward:.6f}")
                        return reward, price_change
                    # Default values if not using improved calculator
                    pnl = 0.0
                    reward = -0.0001  # Small negative reward to discourage excessive actions
                    # Calculate base reward based on position and price change
                    if action == 0:  # BUY
                        # Apply fee directly as negative reward to discourage excessive trading
@ -362,7 +500,7 @@ class RLTrainingIntegrator:
                            elif last_signal['action'] == 'SELL':
                                # RNN suggests opposite - reduce reward
                                reward -= 0.003 * self.rnn_signal_weight * last_signal.get('confidence', 1.0)
-                        
+                    
                    elif action == 1:  # SELL
                        if self.integrator and self.integrator.current_position_size > 0:
                            # Calculate potential profit/loss
@ -388,7 +526,7 @@ class RLTrainingIntegrator:
                        else:
                            # No position to sell - penalize
                            reward = -0.005
-                        
+                    
                    elif action == 2:  # HOLD
                        # Check if we're holding a profitable position
                        if self.integrator and self.integrator.current_position_size > 0 and self.integrator.entry_price:
@ -593,8 +731,27 @@ class RLTrainingIntegrator:
        return env
-    def on_action(self, step, action, price, reward, info):
+    def on_action(self, step_or_action, action_or_price=None, price_or_reward=None, reward_or_info=None, info=None):
-        """Called after each action in the episode"""
+        """
        Called after each action in the episode.
        This method has a flexible signature to handle both:
        - on_action(self, step, action, price, reward, info) - from direct calls
        - on_action(self, action, price, reward, info) - from train_rl.py callback
        """
        # Handle different calling signatures
        if info is None:
            # Called with 4 args: (action, price, reward, info)
            action = step_or_action
            price = action_or_price
            reward = price_or_reward
            info = reward_or_info
            step = self.session_step  # Use session step for tracking
        else:
            # Called with 5 args: (step, action, price, reward, info)
            step = step_or_action
            action = action_or_price
            price = price_or_reward
            reward = reward_or_info
        # Log the action
        action_str = "BUY" if action == 0 else "SELL" if action == 1 else "HOLD"
@ -792,7 +949,47 @@ class RLTrainingIntegrator:
        return True  # Continue training
-async def start_realtime_chart(symbol="BTC/USDT", port=8050, manual_mode=False):
+    def optimize_model_for_gpu(self, model):
        """
        Optimize a PyTorch model for GPU training
        Args:
            model: PyTorch model to optimize
        Returns:
            Optimized model
        """
        if not self.gpu_available:
            logger.info("GPU not available, skipping optimization")
            return model
        try:
            logger.info("Optimizing model for GPU...")
            # Move model to GPU
            model = model.to(self.device)
            # Use mixed precision if available (much faster training with minimal accuracy loss)
            if hasattr(torch.cuda, 'amp') and torch.cuda.is_bf16_supported():
                # Enable AMP (Automatic Mixed Precision)
                logger.info("Enabling mixed precision (BF16) for faster training")
                # The actual implementation will depend on the training loop
                # This function just prepares the model
            # Set model to train mode (important for batch norm, dropout, etc.)
            model.train()
            # Log success
            logger.info(f"Model successfully optimized for {self.device}")
            return model
        except Exception as e:
            logger.error(f"Error optimizing model for GPU: {str(e)}")
            logger.warning("Falling back to unoptimized model")
            return model
 async def start_realtime_chart(symbol="ETH/USDT", port=8050, manual_mode=False):
    """Start the realtime chart
    Args:
@ -855,6 +1052,11 @@ def _add_trade_compat(chart, price, timestamp, amount, pnl=0.0, action="BUY"):
            fee_rate=0.001  # 0.1% fee rate
        )
        # Track this trade for rate calculation
        if hasattr(chart, 'trade_times'):
            # Use current time instead of provided timestamp for accurate rate calculation
            chart.trade_times.append(datetime.now())
        # For SELL actions, close the position with given PnL
        if action == "SELL":
            # Find the most recent BUY position that hasn't been closed
@ -933,6 +1135,10 @@ def run_training_thread(chart, num_episodes=5000, skip_training=False, max_posit
                else:
                    logger.warning("No pre-trained agent found")
            else:
                # Disable mixed precision training to avoid optimizer errors
                os.environ['DISABLE_MIXED_PRECISION'] = '1'
                logger.info("Disabling mixed precision training to avoid optimizer errors")
                # Use a small number of episodes to test termination handling
                logger.info(f"Starting training with {num_episodes} episodes and max_position={max_position}")
                integrator.run_training(episodes=num_episodes, max_steps=2000)
@ -1081,6 +1287,11 @@ if __name__ == "__main__":
    logger.info(f"Manual-trades: {args.manual_trades}")
    logger.info(f"Max position size: {args.max_position}")
    # Log system info including GPU status
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"GPU available: {gpu_available}")
    logger.info(f"Device: {device}")
    try:
        asyncio.run(main())
    except KeyboardInterrupt: