new training process and changes to the models (wip)

Dobromir Popov 2025-04-01 18:43:26 +03:00
parent a78906a888
commit 902593b5f3
6 changed files with 5151 additions and 2635 deletions


@@ -28,14 +28,14 @@ class DQNAgent:
window_size: int,
num_features: int,
timeframes: List[str],
learning_rate: float = 0.0005,  # Reduced learning rate for more stability (was 0.001)
gamma: float = 0.97,  # Slightly reduced discount factor (was 0.99)
epsilon: float = 1.0,
epsilon_min: float = 0.05,  # Increased minimum epsilon for more exploration (was 0.01)
epsilon_decay: float = 0.9975,  # Slower decay rate (was 0.995)
memory_size: int = 20000,  # Increased memory size (was 10000)
batch_size: int = 128,  # Larger batch size (was 64)
target_update: int = 5):  # More frequent target updates (was 10)
self.state_size = state_size
self.action_size = action_size
@@ -70,23 +70,25 @@ class DQNAgent:
).to(self.device)
self.target_net.load_state_dict(self.policy_net.state_dict())
# Initialize optimizer with gradient clipping
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)
# Initialize memories with different priorities (previously a single buffer plus a smaller extrema buffer)
self.memory = deque(maxlen=memory_size)
self.extrema_memory = deque(maxlen=memory_size // 4)  # For extrema points (was memory_size // 5)
self.positive_memory = deque(maxlen=memory_size // 4)  # For positive rewards
# Training metrics
self.update_count = 0
self.losses = []
self.avg_reward = 0
self.no_improvement_count = 0
self.best_reward = float('-inf')
def remember(self, state: np.ndarray, action: int, reward: float,
next_state: np.ndarray, done: bool, is_extrema: bool = False):
"""
Store experience in memory with prioritization
Args:
state: Current state
@@ -97,28 +99,124 @@ class DQNAgent:
is_extrema: Whether this is a local extrema sample (for specialized learning)
"""
experience = (state, action, reward, next_state, done)
# Always add to main memory
self.memory.append(experience)
# Add to specialized memories if applicable
if is_extrema:
self.extrema_memory.append(experience)
# Store positive experiences separately for prioritized replay
if reward > 0:
self.positive_memory.append(experience)
def act(self, state: np.ndarray, explore=True) -> int:
"""Choose action using epsilon-greedy policy with explore flag"""
if explore and random.random() < self.epsilon:
return random.randrange(self.action_size)
with torch.no_grad():
# Ensure state is normalized before inference
state_tensor = self._normalize_state(state)
state_tensor = torch.FloatTensor(state_tensor).unsqueeze(0).to(self.device)
action_probs, extrema_pred = self.policy_net(state_tensor)
return action_probs.argmax().item()
def _normalize_state(self, state: np.ndarray) -> np.ndarray:
"""Normalize the state data to prevent numerical issues"""
# Handle NaN and infinite values
state = np.nan_to_num(state, nan=0.0, posinf=1.0, neginf=-1.0)
# Check if state is 1D array (happens in some environments)
if len(state.shape) == 1:
# If 1D, we need to normalize the whole array
normalized_state = state.copy()
# Convert any timestamp or non-numeric data to float
for i in range(len(normalized_state)):
# Check for timestamp-like objects
if hasattr(normalized_state[i], 'timestamp') and callable(getattr(normalized_state[i], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i] = float(normalized_state[i].timestamp())
elif not isinstance(normalized_state[i], (int, float, np.number)):
# Set non-numeric data to 0
normalized_state[i] = 0.0
# Ensure all values are float
normalized_state = normalized_state.astype(np.float32)
# Simple min-max normalization for 1D state
state_min = np.min(normalized_state)
state_max = np.max(normalized_state)
if state_max > state_min:
normalized_state = (normalized_state - state_min) / (state_max - state_min)
return normalized_state
# Handle 2D arrays
normalized_state = np.zeros_like(state, dtype=np.float32)
# Convert any timestamp or non-numeric data to float
for i in range(state.shape[0]):
for j in range(state.shape[1]):
if hasattr(state[i, j], 'timestamp') and callable(getattr(state[i, j], 'timestamp')):
# Convert timestamp to float (seconds since epoch)
normalized_state[i, j] = float(state[i, j].timestamp())
elif isinstance(state[i, j], (int, float, np.number)):
normalized_state[i, j] = state[i, j]
else:
# Set non-numeric data to 0
normalized_state[i, j] = 0.0
# Loop through each timeframe's features in the combined state
feature_count = state.shape[1] // len(self.timeframes)
for tf_idx in range(len(self.timeframes)):
start_idx = tf_idx * feature_count
end_idx = start_idx + feature_count
# Extract this timeframe's features
tf_features = normalized_state[:, start_idx:end_idx]
# Normalize OHLCV data by the first close price in the window
# This makes price movements relative rather than absolute
price_idx = 3 # Assuming close price is at index 3
if price_idx < tf_features.shape[1]:
reference_price = np.mean(tf_features[:, price_idx])
if reference_price != 0:
# Normalize price-related columns (OHLC)
for i in range(4): # First 4 columns are OHLC
if i < tf_features.shape[1]:
normalized_state[:, start_idx + i] = tf_features[:, i] / reference_price
# Normalize volume using mean and std
vol_idx = 4 # Assuming volume is at index 4
if vol_idx < tf_features.shape[1]:
vol_mean = np.mean(tf_features[:, vol_idx])
vol_std = np.std(tf_features[:, vol_idx])
if vol_std > 0:
normalized_state[:, start_idx + vol_idx] = (tf_features[:, vol_idx] - vol_mean) / vol_std
else:
normalized_state[:, start_idx + vol_idx] = 0
# Other features (technical indicators) - normalize with min-max scaling
for i in range(5, feature_count):
if i < tf_features.shape[1]:
feature_min = np.min(tf_features[:, i])
feature_max = np.max(tf_features[:, i])
if feature_max > feature_min:
normalized_state[:, start_idx + i] = (tf_features[:, i] - feature_min) / (feature_max - feature_min)
else:
normalized_state[:, start_idx + i] = 0
return normalized_state
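As a quick illustration of the 1-D branch above, here is a standalone sketch (a hypothetical helper, not part of the agent) that mirrors the NaN handling followed by min-max scaling:

import numpy as np

def minmax_1d(state: np.ndarray) -> np.ndarray:
    # Mirrors the 1-D branch: replace non-finite values, then scale to [0, 1]
    state = np.nan_to_num(state.astype(np.float32), nan=0.0, posinf=1.0, neginf=-1.0)
    lo, hi = state.min(), state.max()
    return (state - lo) / (hi - lo) if hi > lo else state

print(minmax_1d(np.array([10.0, 20.0, np.nan, 40.0])))  # [0.25 0.5  0.   1.  ]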
def replay(self, use_prioritized=True) -> float:
"""
Train on a batch of experiences with prioritized sampling
Args:
use_prioritized: Whether to use prioritized replay (parameter was previously use_extrema)
Returns:
float: Loss value
@@ -126,55 +224,67 @@ class DQNAgent:
if len(self.memory) < self.batch_size:
return 0.0
# Sample batch with prioritization (previously mixed regular and extrema samples only)
batch = []
if use_prioritized and len(self.positive_memory) > 0 and len(self.extrema_memory) > 0:
# Prioritized sampling from different memory types
positive_count = min(self.batch_size // 4, len(self.positive_memory))
extrema_count = min(self.batch_size // 4, len(self.extrema_memory))
regular_count = self.batch_size - positive_count - extrema_count
positive_samples = random.sample(list(self.positive_memory), positive_count)
extrema_samples = random.sample(list(self.extrema_memory), extrema_count)
regular_samples = random.sample(list(self.memory), regular_count)
batch = positive_samples + extrema_samples + regular_samples
else:
# Standard sampling
batch = random.sample(self.memory, self.batch_size)
states, actions, rewards, next_states, dones = zip(*batch)
# Normalize states before training
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors and move to device
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Get current Q values
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values from target network (Double DQN approach)
with torch.no_grad():
# Get actions from policy network
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
# Get Q values from target network for those actions
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
# Compute target Q values
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values to prevent extreme values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Compute Huber loss (more robust to outliers than the previous MSE loss)
loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Optimize
self.optimizer.zero_grad()
loss.backward()
# Apply gradient clipping to prevent exploding gradients
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
# Update target network if needed
@@ -201,36 +311,76 @@ class DQNAgent:
if len(states) == 0:
return 0.0
# Normalize states
normalized_states = np.array([self._normalize_state(state) for state in states])
normalized_next_states = np.array([self._normalize_state(state) for state in next_states])
# Convert to tensors
states_tensor = torch.FloatTensor(normalized_states).to(self.device)
actions_tensor = torch.LongTensor(actions).to(self.device)
rewards_tensor = torch.FloatTensor(rewards).to(self.device)
next_states_tensor = torch.FloatTensor(normalized_next_states).to(self.device)
dones_tensor = torch.FloatTensor(dones).to(self.device)
# Forward pass
current_q_values, extrema_pred = self.policy_net(states_tensor)
current_q_values = current_q_values.gather(1, actions_tensor.unsqueeze(1))
# Get next Q values (Double DQN approach)
with torch.no_grad():
next_actions, _ = self.policy_net(next_states_tensor)
next_actions = next_actions.max(1)[1].unsqueeze(1)
next_q_values, _ = self.target_net(next_states_tensor)
next_q_values = next_q_values.gather(1, next_actions).squeeze(1)
target_q_values = rewards_tensor + (1 - dones_tensor) * self.gamma * next_q_values
# Clamp target values
target_q_values = torch.clamp(target_q_values, -100, 100)
# Use Huber loss for extrema training (was MSE)
q_loss = nn.SmoothL1Loss()(current_q_values.squeeze(), target_q_values)
# Full loss
loss = q_loss
# Optimize
self.optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(self.policy_net.parameters(), max_norm=1.0)
self.optimizer.step()
return loss.item()
def update_learning_metrics(self, episode_reward, best_reward_threshold=0.01):
"""Update learning metrics and perform learning rate adjustments if needed"""
# Update average reward with exponential moving average
if self.avg_reward == 0:
self.avg_reward = episode_reward
else:
self.avg_reward = 0.95 * self.avg_reward + 0.05 * episode_reward
# Check if we're making sufficient progress
if episode_reward > (1 + best_reward_threshold) * self.best_reward:
self.best_reward = episode_reward
self.no_improvement_count = 0
return True # Improved
else:
self.no_improvement_count += 1
# If no improvement for a while, adjust learning rate
if self.no_improvement_count >= 10:
current_lr = self.optimizer.param_groups[0]['lr']
new_lr = current_lr * 0.5
if new_lr >= 1e-6: # Don't reduce below minimum threshold
for param_group in self.optimizer.param_groups:
param_group['lr'] = new_lr
logger.info(f"Reducing learning rate from {current_lr} to {new_lr}")
self.no_improvement_count = 0
return False # No improvement
def save(self, path: str):
"""Save model and agent state"""
os.makedirs(os.path.dirname(path), exist_ok=True)
@@ -246,9 +396,13 @@ class DQNAgent:
'epsilon': self.epsilon,
'update_count': self.update_count,
'losses': self.losses,
'optimizer_state': self.optimizer.state_dict(),
'best_reward': self.best_reward,
'avg_reward': self.avg_reward
}
torch.save(state, f"{path}_agent_state.pt")
logger.info(f"Agent state saved to {path}_agent_state.pt")
def load(self, path: str):
"""Load model and agent state"""
@@ -259,8 +413,19 @@ class DQNAgent:
self.target_net.load(f"{path}_target")
# Load agent state
try:
agent_state = torch.load(f"{path}_agent_state.pt", map_location=self.device)
self.epsilon = agent_state['epsilon']
self.update_count = agent_state['update_count']
self.losses = agent_state['losses']
self.optimizer.load_state_dict(agent_state['optimizer_state'])
# Load additional metrics if they exist
if 'best_reward' in agent_state:
self.best_reward = agent_state['best_reward']
if 'avg_reward' in agent_state:
self.avg_reward = agent_state['avg_reward']
logger.info(f"Agent state loaded from {path}_agent_state.pt")
except FileNotFoundError:
logger.warning(f"Agent state file not found at {path}_agent_state.pt, using default values")


@@ -44,13 +44,46 @@ class PricePatternAttention(nn.Module):
return output, attn_weights
class AdaptiveNorm(nn.Module):
"""
Adaptive normalization layer that chooses between different normalization
methods based on input dimensions
"""
def __init__(self, num_features):
super(AdaptiveNorm, self).__init__()
self.batch_norm = nn.BatchNorm1d(num_features, affine=True)
self.group_norm = nn.GroupNorm(min(32, num_features), num_features)
self.layer_norm = nn.LayerNorm([num_features, 1])
def forward(self, x):
# Check input dimensions
batch_size, channels, seq_len = x.size()
# Choose normalization method:
# - batch_size > 1 and seq_len > 1: BatchNorm
# - batch_size == 1 and seq_len > 1: GroupNorm
# - seq_len == 1: per-sample LayerNorm fallback sized to the actual input
if batch_size > 1 and seq_len > 1:
return self.batch_norm(x)
elif seq_len > 1:
return self.group_norm(x)
else:
# For 1D inputs (seq_len=1), we need to adjust the layer norm
# to the actual input size
if not hasattr(self, 'layer_norm_1d') or self.layer_norm_1d.normalized_shape[0] != channels:
self.layer_norm_1d = nn.LayerNorm([channels, seq_len]).to(x.device)
return self.layer_norm_1d(x)
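A minimal usage sketch of the adaptive layer above, showing which path each input shape takes (shapes chosen purely for illustration; assumes the AdaptiveNorm class defined above is in scope):

import torch

norm = AdaptiveNorm(num_features=64)
regular = torch.randn(8, 64, 20)  # batch > 1 and seq_len > 1 -> BatchNorm1d path
single = torch.randn(1, 64, 20)   # batch == 1, seq_len > 1 -> GroupNorm path
tiny = torch.randn(1, 64, 1)      # seq_len == 1 -> per-sample LayerNorm fallback
print(norm(regular).shape, norm(single).shape, norm(tiny).shape)  # all keep their input shape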
class CNNModelPyTorch(nn.Module):
"""
CNN model for trading with multiple timeframes
"""
def __init__(self, window_size=20, num_features=5, output_size=3, timeframes=None):  # Defaults added; all arguments were previously required
super(CNNModelPyTorch, self).__init__()
if timeframes is None:
timeframes = [1]
self.window_size = window_size
self.num_features = num_features
self.output_size = output_size
@@ -73,27 +106,28 @@ class CNNModelPyTorch(nn.Module):
"""Create all model layers with current feature dimensions"""
# Convolutional layers - use total_features as input channels
self.conv1 = nn.Conv1d(self.total_features, 64, kernel_size=3, padding=1)
self.norm1 = AdaptiveNorm(64)  # Was nn.BatchNorm1d(64)
self.dropout1 = nn.Dropout(0.2)
self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
self.norm2 = AdaptiveNorm(128)  # Was nn.BatchNorm1d(128)
self.dropout2 = nn.Dropout(0.3)
self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
self.norm3 = AdaptiveNorm(256)  # Was nn.BatchNorm1d(256)
self.dropout3 = nn.Dropout(0.4)
# Add price pattern attention layer
self.attention = PricePatternAttention(256)
# Extrema detection specialized convolutional layer
self.extrema_conv = nn.Conv1d(256, 128, kernel_size=3, padding=1)  # Smaller kernel for small inputs (was kernel_size=5, padding=2)
self.extrema_norm = AdaptiveNorm(128)  # Was nn.BatchNorm1d(128)
# Fully connected layers - input size will be determined dynamically
self.fc1 = None  # Will be initialized in forward pass (was nn.Linear(window_size * 256, 512))
self.fc2 = nn.Linear(512, 256)
self.dropout_fc = nn.Dropout(0.5)
# Advantage and Value streams (Dueling DQN architecture)
self.fc3 = nn.Linear(256, self.output_size)  # Advantage stream
@@ -131,46 +165,96 @@ class CNNModelPyTorch(nn.Module):
# Ensure input is on the correct device
x = x.to(self.device)
# Check input dimensions and reshape as needed
if len(x.size()) == 2:
# If input is [batch_size, features], reshape to [batch_size, features, 1]
batch_size, feature_dim = x.size()
# Check and handle if input features don't match model expectations
if feature_dim != self.total_features:
logger.warning(f"Input features ({feature_dim}) don't match model features ({self.total_features}), rebuilding layers")
self.rebuild_conv_layers(feature_dim)
# For 1D input, use a sequence length of 1
seq_len = 1
x = x.unsqueeze(2)  # Reshape to [batch, features, 1]
elif len(x.size()) == 3:
# Standard case: [batch_size, window_size, features]
batch_size, seq_len, feature_dim = x.size()
# Check and handle if input dimensions don't match model expectations
if feature_dim != self.total_features:
logger.warning(f"Input features ({feature_dim}) don't match model features ({self.total_features}), rebuilding layers")
self.rebuild_conv_layers(feature_dim)
# Reshape input: [batch, window_size, features] -> [batch, features, window_size]
x = x.permute(0, 2, 1)
else:
raise ValueError(f"Unexpected input shape: {x.size()}, expected 2D or 3D tensor")
# Convolutional layers with dropout - safely handle small spatial dimensions
try:
x = self.dropout1(F.relu(self.norm1(self.conv1(x))))
x = self.dropout2(F.relu(self.norm2(self.conv2(x))))
x = self.dropout3(F.relu(self.norm3(self.conv3(x))))
except Exception as e:
logger.warning(f"Error in convolutional layers: {str(e)}")
# Fallback for very small inputs: skip some convolutions
if seq_len < 3:
# Apply a simpler convolution for very small inputs
x = F.relu(self.conv1(x))
x = F.relu(self.conv2(x))
# Skip last conv if we get dimension errors
try:
x = F.relu(self.conv3(x))
except:
pass
# Store conv features for extrema detection
conv_features = x
# Get the current shape after convolutions
_, channels, conv_seq_len = x.size()
# Initialize fc1 if not created yet or if the shape has changed
if self.fc1 is None:
flattened_size = channels * conv_seq_len
logger.info(f"Initializing fc1 with input size {flattened_size}")
self.fc1 = nn.Linear(flattened_size, 512).to(self.device)
# Apply extrema detection safely
try:
extrema_features = F.relu(self.extrema_norm(self.extrema_conv(conv_features)))
except Exception as e:
logger.warning(f"Error in extrema detection: {str(e)}")
extrema_features = conv_features  # Fallback
# Handle attention for small sequence lengths
if conv_seq_len > 1:
# Reshape for attention: [batch, channels, seq_len] -> [batch, seq_len, channels]
x_attention = x.permute(0, 2, 1)
# Apply attention
try:
attention_output, attention_weights = self.attention(x_attention)
except Exception as e:
logger.warning(f"Error in attention layer: {str(e)}")
# Fallback: don't use attention
# Flatten - get the actual shape for this batch
flattened_size = channels * conv_seq_len
x = x.view(batch_size, flattened_size)
# Check if we need to recreate fc1 with the correct size
if self.fc1.in_features != flattened_size:
logger.info(f"Recreating fc1 layer to match input size {flattened_size}")
self.fc1 = nn.Linear(flattened_size, 512).to(self.device)
# Reinitialize optimizer after changing the model
self.optimizer = optim.Adam(self.parameters(), lr=0.001)
# Fully connected layers with dropout
x = F.relu(self.fc1(x))
x = self.dropout_fc(F.relu(self.fc2(x)))
# Split into advantage and value streams
advantage = self.fc3(x)
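The hunk ends at the advantage stream; the value stream and the merge of the two heads fall outside this excerpt. For reference, a standard dueling-head combination looks like the sketch below (an assumption about the rest of forward(), not code from this commit):

import torch

def dueling_q(value: torch.Tensor, advantage: torch.Tensor) -> torch.Tensor:
    # Q(s, a) = V(s) + A(s, a) - mean_a A(s, a); centering keeps the two streams identifiable
    return value + advantage - advantage.mean(dim=1, keepdim=True)

q = dueling_q(torch.zeros(2, 1), torch.tensor([[1.0, 2.0, 3.0], [0.0, 0.0, 0.0]]))
print(q[0])  # tensor([-1., 0., 1.])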


@@ -3,6 +3,10 @@ import gym
from gym import spaces
from typing import Dict, Tuple, List
import pandas as pd
import logging
# Configure logger
logger = logging.getLogger(__name__)
class TradingEnvironment(gym.Env):
"""
@@ -12,97 +16,284 @@ class TradingEnvironment(gym.Env):
data: pd.DataFrame,
initial_balance: float = 100.0,
fee_rate: float = 0.0002,
max_steps: int = 1000,
window_size: int = 20,
risk_aversion: float = 0.2,  # Controls how much to penalize volatility
price_scaling: str = 'zscore',  # 'zscore', 'minmax', or 'raw'
reward_scaling: float = 10.0,  # Scale factor for rewards
episode_penalty: float = 0.1):  # Penalty for active positions at end of episode
super(TradingEnvironment, self).__init__()
self.data = data
self.initial_balance = initial_balance
self.fee_rate = fee_rate
self.max_steps = max_steps
self.window_size = window_size
self.risk_aversion = risk_aversion
self.price_scaling = price_scaling
self.reward_scaling = reward_scaling
self.episode_penalty = episode_penalty
# Preprocess data if needed
self._preprocess_data()
# Action space: 0 (BUY), 1 (SELL), 2 (HOLD) (ordering changed from 0=SELL, 1=HOLD, 2=BUY)
self.action_space = spaces.Discrete(3)
# Observation space: price data, technical indicators, and account state
feature_dim = self.data.shape[1] + 3  # Adding position, equity, unrealized_pnl
self.observation_space = spaces.Box(
low=-np.inf,
high=np.inf,
shape=(feature_dim,),  # Was shape=(data.shape[1],)
dtype=np.float32
)
# Initialize state
self.reset()
def _preprocess_data(self):
"""Preprocess data - normalize or standardize features"""
# Store the original data for reference
self.original_data = self.data.copy()
# Normalize price data based on the selected method
if self.price_scaling == 'zscore':
# For each feature, apply z-score normalization
for col in self.data.columns:
if col in ['open', 'high', 'low', 'close']:
mean = self.data[col].mean()
std = self.data[col].std()
if std > 0:
self.data[col] = (self.data[col] - mean) / std
# Normalize volume separately
elif col == 'volume':
mean = self.data[col].mean()
std = self.data[col].std()
if std > 0:
self.data[col] = (self.data[col] - mean) / std
elif self.price_scaling == 'minmax':
# For each feature, apply min-max scaling
for col in self.data.columns:
min_val = self.data[col].min()
max_val = self.data[col].max()
if max_val > min_val:
self.data[col] = (self.data[col] - min_val) / (max_val - min_val)
def reset(self) -> np.ndarray:
"""Reset the environment to initial state"""
self.current_step = self.window_size  # Was 0
self.balance = self.initial_balance
self.position = 0  # 0: no position, 1: long position, -1: short position
self.entry_price = 0
self.entry_time = 0
self.total_trades = 0
self.winning_trades = 0
self.losing_trades = 0
self.total_pnl = 0
self.balance_history = [self.initial_balance]
self.equity_history = [self.initial_balance]
self.max_balance = self.initial_balance
self.max_drawdown = 0
# Trading performance metrics
self.trade_durations = []  # Track how long trades are held
self.returns = []  # Track returns of each trade
# For analyzing trade clustering
self.last_action_time = 0
self.actions_taken = []
return self._get_observation()
def _get_observation(self) -> np.ndarray:
"""Get current observation state with account information"""
# Get market data for the current step
market_data = self.data.iloc[self.current_step].values
# Get current price
current_price = self.original_data.iloc[self.current_step]['close']
# Calculate unrealized PnL
unrealized_pnl = 0
if self.position != 0:
price_diff = current_price - self.entry_price
unrealized_pnl = self.position * price_diff
# Calculate total equity (balance + unrealized PnL)
equity = self.balance + unrealized_pnl
# Normalize account state
normalized_position = self.position # -1, 0, or 1
normalized_equity = equity / self.initial_balance - 1.0 # Percent change from initial
normalized_unrealized_pnl = unrealized_pnl / self.initial_balance if self.initial_balance > 0 else 0
# Combine market data with account state
account_state = np.array([normalized_position, normalized_equity, normalized_unrealized_pnl])
observation = np.concatenate([market_data, account_state])
# Handle any NaN values
observation = np.nan_to_num(observation, nan=0.0)
return observation
def _calculate_reward(self, action: int) -> float:
"""
Calculate reward based on action and outcome with improved risk-adjusted metrics
Args:
action: The action taken (0=BUY, 1=SELL, 2=HOLD)
Returns:
float: Calculated reward value
"""
# Get current price
current_price = self.original_data.iloc[self.current_step]['close']
# Default reward is slightly negative to discourage excessive trading
reward = -0.0001
pnl = 0.0
# Handle different actions based on current position
if self.position == 0: # No position
if action == 0: # BUY
self.position = 1
self.entry_price = current_price
self.entry_time = self.current_step
reward = -self.fee_rate # Small penalty for trading cost
elif action == 1: # SELL (start short position)
self.position = -1
self.entry_price = current_price
self.entry_time = self.current_step
reward = -self.fee_rate # Small penalty for trading cost
# else action == 2 (HOLD) - keep the small negative reward
elif self.position > 0: # Long position
if action == 1: # SELL (close long)
# Calculate profit/loss
price_diff = current_price - self.entry_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate # Account for entry and exit fees
# Adjust reward based on PnL and risk
reward = pnl * self.reward_scaling
# Track trade performance
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Calculate trade duration
trade_duration = self.current_step - self.entry_time
self.trade_durations.append(trade_duration)
# Update returns list
self.returns.append(pnl)
# Update balance and reset position
self.balance *= (1 + pnl)
self.balance_history.append(self.balance)
self.max_balance = max(self.max_balance, self.balance)
self.total_pnl += pnl
# Reset position
self.position = 0
elif action == 0:  # BUY (while already long)
# Penalize trying to increase an already active position
reward = -0.001
# else action == 2 (HOLD) - calculate unrealized P&L for reward
else:
price_diff = current_price - self.entry_price
unrealized_pnl = price_diff / self.entry_price
# Small reward/penalty based on unrealized P&L
reward = unrealized_pnl * 0.05  # Scale down to encourage holding good positions
elif self.position < 0:  # Short position
if action == 0: # BUY (close short)
# Calculate profit/loss
price_diff = self.entry_price - current_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate # Account for entry and exit fees
# Adjust reward based on PnL and risk
reward = pnl * self.reward_scaling
# Track trade performance
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Calculate trade duration
trade_duration = self.current_step - self.entry_time
self.trade_durations.append(trade_duration)
# Update returns list
self.returns.append(pnl)
# Update balance and reset position
self.balance *= (1 + pnl)
self.balance_history.append(self.balance)
self.max_balance = max(self.max_balance, self.balance)
self.total_pnl += pnl
# Reset position
self.position = 0
elif action == 1: # SELL (while already short)
# Penalize trying to increase an already active position
reward = -0.001
# else action == 2 (HOLD) - calculate unrealized P&L for reward
else:
price_diff = self.entry_price - current_price
unrealized_pnl = price_diff / self.entry_price
# Small reward/penalty based on unrealized P&L
reward = unrealized_pnl * 0.05 # Scale down to encourage holding good positions
# Record the action
self.actions_taken.append(action)
self.last_action_time = self.current_step
# Update equity history (balance + unrealized P&L)
current_equity = self.balance
if self.position != 0:
# Calculate unrealized P&L
if self.position > 0: # Long
price_diff = current_price - self.entry_price
unrealized_pnl = price_diff / self.entry_price * self.balance
else: # Short
price_diff = self.entry_price - current_price
unrealized_pnl = price_diff / self.entry_price * self.balance
current_equity = self.balance + unrealized_pnl
self.equity_history.append(current_equity)
# Calculate current drawdown
peak_equity = max(self.equity_history)
current_drawdown = (peak_equity - current_equity) / peak_equity if peak_equity > 0 else 0
self.max_drawdown = max(self.max_drawdown, current_drawdown)
# Apply risk aversion factor - penalize volatility
if len(self.returns) > 1:
returns_std = np.std(self.returns)
reward -= returns_std * self.risk_aversion
return reward, pnl
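As a concrete check of the long-exit arithmetic above (illustrative numbers only): entering long at 100 and closing at 102 with fee_rate=0.0002 and reward_scaling=10.0 gives

pnl = (102 - 100) / 100 - 2 * 0.0002   # 0.0196
reward = pnl * 10.0                    # 0.196, before the risk_aversion * std(returns) penalty is subtracted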
def step(self, action: int) -> Tuple[np.ndarray, float, bool, Dict]:
"""Execute one step in the environment"""
# Calculate reward and update state
reward, pnl = self._calculate_reward(action)
# Move to next step
self.current_step += 1
@@ -110,26 +301,70 @@ class TradingEnvironment(gym.Env):
# Check if episode is done
done = self.current_step >= min(self.max_steps - 1, len(self.data) - 1)
# Apply penalty if episode ends with open position
if done and self.position != 0:
reward -= self.episode_penalty
# Force close the position at the end if still open
current_price = self.original_data.iloc[self.current_step]['close']
if self.position > 0: # Long position
price_diff = current_price - self.entry_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate
else: # Short position
price_diff = self.entry_price - current_price
pnl = price_diff / self.entry_price - 2 * self.fee_rate
# Update balance
self.balance *= (1 + pnl)
self.total_pnl += pnl
# Track trade
self.total_trades += 1
if pnl > 0:
self.winning_trades += 1
else:
self.losing_trades += 1
# Reset position
self.position = 0
# Get next observation
observation = self._get_observation()
# Calculate sharpe ratio and sortino ratio if possible
sharpe_ratio = 0
sortino_ratio = 0
win_rate = self.winning_trades / max(1, self.total_trades)
if len(self.returns) > 1:
mean_return = np.mean(self.returns)
std_return = np.std(self.returns)
if std_return > 0:
sharpe_ratio = mean_return / std_return
# For sortino, we only consider downside deviation
downside_returns = [r for r in self.returns if r < 0]
if downside_returns:
downside_deviation = np.std(downside_returns)
if downside_deviation > 0:
sortino_ratio = mean_return / downside_deviation
# Calculate average trade duration
avg_trade_duration = np.mean(self.trade_durations) if self.trade_durations else 0
# Additional info
info = {
'balance': self.balance,
'position': self.position,
'total_trades': self.total_trades,
'win_rate': win_rate,
'total_pnl': self.total_pnl,
'max_drawdown': self.max_drawdown,
'sharpe_ratio': sharpe_ratio,
'sortino_ratio': sortino_ratio,
'avg_trade_duration': avg_trade_duration,
'pnl': pnl,
'gain': (self.balance - self.initial_balance) / self.initial_balance
}
return observation, reward, done, info
@@ -143,20 +378,19 @@ class TradingEnvironment(gym.Env):
print(f"Total Trades: {self.total_trades}")
print(f"Win Rate: {self.winning_trades/max(1, self.total_trades):.2%}")
print(f"Total PnL: ${self.total_pnl:.2f}")
print(f"Max Drawdown: {self.max_drawdown:.2%}")
print(f"Sharpe Ratio: {self._calculate_sharpe_ratio():.4f}")
print("-" * 50)
def _calculate_sharpe_ratio(self):  # Replaces the old _calculate_max_drawdown(); drawdown is now tracked incrementally as self.max_drawdown
"""Calculate Sharpe ratio from returns"""
if len(self.returns) < 2:
return 0.0
mean_return = np.mean(self.returns)
std_return = np.std(self.returns)
if std_return == 0:
return 0.0
return mean_return / std_return
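For symmetry with the Sharpe helper above, the Sortino calculation used inline in step() can be restated as a standalone sketch (a hypothetical helper, not a method this class defines):

import numpy as np

def sortino_ratio(returns):
    # Mean return divided by downside deviation, mirroring the inline logic in step()
    if len(returns) < 2:
        return 0.0
    downside = [r for r in returns if r < 0]
    if not downside:
        return 0.0
    dd = np.std(downside)
    return float(np.mean(returns) / dd) if dd > 0 else 0.0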

File diff suppressed because it is too large.

realtime_old.py (new file, 3083 lines)

File diff suppressed because it is too large.

train_improved_rl.py (new file, 547 lines)

@@ -0,0 +1,547 @@
#!/usr/bin/env python
"""
Improved RL Trading with Enhanced Training and Monitoring
This script provides an improved version of the RL training process,
implementing better normalization, reward structure, and model training.
"""
import os
import sys
import logging
import argparse
import time
from datetime import datetime
import numpy as np
import torch
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
# Add project directory to path if needed
project_root = os.path.dirname(os.path.abspath(__file__))
if project_root not in sys.path:
sys.path.append(project_root)
# Import our custom modules
from NN.models.dqn_agent import DQNAgent
from NN.utils.trading_env import TradingEnvironment
from NN.utils.data_interface import DataInterface
from realtime import BinanceHistoricalData, RealTimeChart
# Configure logging
log_filename = f'improved_rl_training_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
handlers=[
logging.FileHandler(log_filename),
logging.StreamHandler()
]
)
logger = logging.getLogger('improved_rl')
# Parse command line arguments
parser = argparse.ArgumentParser(description='Improved RL Trading with Enhanced Training')
parser.add_argument('--episodes', type=int, default=20, help='Number of episodes to train')
parser.add_argument('--visualize', action='store_true', help='Visualize trades during training')
parser.add_argument('--save-path', type=str, default='NN/models/saved/improved_dqn_agent', help='Path to save trained model')
parser.add_argument('--symbol', type=str, default='ETH/USDT', help='Trading pair symbol')
args = parser.parse_args()
def create_training_environment(symbol, window_size=20):
"""Create and prepare the training environment with data"""
logger.info(f"Setting up training environment for {symbol}")
# Fetch historical data from multiple timeframes
data_interface = DataInterface(symbol)
# Use Binance data provider for fetching data
historical_data = BinanceHistoricalData()
# Fetch data for each timeframe
df_1m = historical_data.get_historical_candles(symbol, interval_seconds=60, limit=1000)
df_5m = historical_data.get_historical_candles(symbol, interval_seconds=300, limit=1000)
df_15m = historical_data.get_historical_candles(symbol, interval_seconds=900, limit=500)
# Ensure all dataframes have index as timestamp type
if df_1m is not None and not df_1m.empty:
if 'timestamp' in df_1m.columns:
df_1m = df_1m.set_index('timestamp')
if df_5m is not None and not df_5m.empty:
if 'timestamp' in df_5m.columns:
df_5m = df_5m.set_index('timestamp')
if df_15m is not None and not df_15m.empty:
if 'timestamp' in df_15m.columns:
df_15m = df_15m.set_index('timestamp')
# Preprocess data (add technical indicators)
df_1m = preprocess_dataframe(df_1m)
df_5m = preprocess_dataframe(df_5m)
df_15m = preprocess_dataframe(df_15m)
# Create environment with all timeframes
env = create_multi_timeframe_env(df_1m, df_5m, df_15m, window_size)
return env, (df_1m, df_5m, df_15m)
def preprocess_dataframe(df):
"""Add technical indicators and preprocess dataframe"""
if df is None or df.empty:
return None
# Drop any missing values
df = df.dropna()
# Ensure it has OHLCV columns
required_columns = ['open', 'high', 'low', 'close', 'volume']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
logger.warning(f"Missing required columns: {missing_columns}")
for col in missing_columns:
# Fill with close price for OHLC if missing
if col in ['open', 'high', 'low'] and 'close' in df.columns:
df[col] = df['close']
# Fill with zeros for volume if missing
elif col == 'volume':
df[col] = 0
# Add simple technical indicators
# 1. Simple Moving Averages
df['sma_5'] = df['close'].rolling(window=5).mean()
df['sma_10'] = df['close'].rolling(window=10).mean()
# 2. Relative Strength Index (RSI)
delta = df['close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
rs = gain / loss
df['rsi'] = 100 - (100 / (1 + rs))
# 3. Bollinger Bands
df['bb_middle'] = df['close'].rolling(window=20).mean()
df['bb_std'] = df['close'].rolling(window=20).std()
df['bb_upper'] = df['bb_middle'] + 2 * df['bb_std']
df['bb_lower'] = df['bb_middle'] - 2 * df['bb_std']
# 4. MACD
df['ema_12'] = df['close'].ewm(span=12, adjust=False).mean()
df['ema_26'] = df['close'].ewm(span=26, adjust=False).mean()
df['macd'] = df['ema_12'] - df['ema_26']
df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
# 5. Price rate of change
df['roc'] = df['close'].pct_change(periods=10) * 100
# Fill any remaining NaN values with 0
df = df.fillna(0)
return df
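A quick smoke test of the preprocessing above on synthetic candles (illustrative only; assumes preprocess_dataframe from this script is in scope):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
close = 100 + rng.normal(0, 1, 200).cumsum()
df = pd.DataFrame({'open': close, 'high': close + 0.5, 'low': close - 0.5,
                   'close': close, 'volume': rng.uniform(1, 10, 200)})
out = preprocess_dataframe(df)
print(out.shape)               # 200 rows, OHLCV plus the indicator columns
print(out.isna().sum().sum())  # 0 - remaining NaNs are filled with 0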
def create_multi_timeframe_env(df_1m, df_5m, df_15m, window_size=20):
"""Create a custom environment that handles multiple timeframes"""
# Ensure we have complete data for all timeframes
min_required_length = window_size + 100 # Add buffer for training
if (df_1m is None or len(df_1m) < min_required_length or
df_5m is None or len(df_5m) < min_required_length or
df_15m is None or len(df_15m) < min_required_length):
raise ValueError(f"Insufficient data for training. Need at least {min_required_length} candles per timeframe.")
# Ensure we only use the last N valid data points
df_1m = df_1m.iloc[-900:].copy() if len(df_1m) > 900 else df_1m.copy()
df_5m = df_5m.iloc[-180:].copy() if len(df_5m) > 180 else df_5m.copy()
df_15m = df_15m.iloc[-60:].copy() if len(df_15m) > 60 else df_15m.copy()
# Reset index to make sure we have continuous integers
df_1m = df_1m.reset_index(drop=True)
df_5m = df_5m.reset_index(drop=True)
df_15m = df_15m.reset_index(drop=True)
# For simplicity, we'll use the 1m data as the base environment
# The other timeframes will be incorporated through observation
env = TradingEnvironment(
data=df_1m,
initial_balance=100.0,
fee_rate=0.0005, # 0.05% fee (typical for crypto exchanges)
max_steps=len(df_1m) - window_size - 50, # Leave some room at the end
window_size=window_size,
risk_aversion=0.2, # Moderately risk-averse
price_scaling='zscore', # Use z-score normalization
reward_scaling=10.0, # Scale rewards for better learning
episode_penalty=0.2 # Penalty for holding positions at end of episode
)
return env
def initialize_agent(env, window_size=20, num_features=0, timeframes=None):
"""Initialize the DQN agent with appropriate parameters"""
if timeframes is None:
timeframes = ['1m', '5m', '15m']
# Calculate input dimensions
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
# If num_features wasn't provided, infer from environment
if num_features == 0:
# Calculate features per timeframe from state dimension and number of timeframes
# Accounting for the 3 additional features (position, equity, unrealized_pnl)
num_features = (state_dim - 3) // len(timeframes)
logger.info(f"Initializing DQN agent: state_dim={state_dim}, action_dim={action_dim}, features={num_features}")
agent = DQNAgent(
state_size=state_dim,
action_size=action_dim,
window_size=window_size,
num_features=num_features,
timeframes=timeframes,
learning_rate=0.0005, # Start with a moderate learning rate
gamma=0.97, # Slightly reduced discount factor for stable learning
epsilon=1.0, # Start with full exploration
epsilon_min=0.05, # Maintain some exploration even at the end
epsilon_decay=0.9975, # Slower decay for more exploration
memory_size=20000, # Larger replay buffer
batch_size=128, # Larger batch size for more stable gradients
target_update=5 # More frequent target network updates
)
return agent
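For instance, if each preprocessed timeframe ends up with 17 feature columns (OHLCV plus the 12 indicators added above), the 1m environment reports state_dim = 17 + 3 = 20, and the inference above yields num_features = (20 - 3) // 3 = 5 per timeframe (illustrative arithmetic; the exact column count depends on preprocess_dataframe).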
def train_agent(env, agent, num_episodes=20, visualize=False, chart=None, save_path=None, save_freq=5):
"""
Train the DQN agent with improved training loop
Args:
env: The trading environment
agent: The DQN agent
num_episodes: Number of episodes to train
visualize: Whether to visualize trades during training
chart: The visualization chart (if visualize=True)
save_path: Path to save the model
save_freq: How often to save checkpoints (in episodes)
Returns:
tuple: (rewards, win_rates, best_reward, best_model_path)
"""
logger.info(f"Starting training for {num_episodes} episodes")
# Initialize metrics tracking
rewards = []
win_rates = []
total_train_time = 0
best_reward = float('-inf')
best_model_path = None
# Create directory for checkpoints if needed
if save_path:
os.makedirs(os.path.dirname(save_path), exist_ok=True)
checkpoint_dir = os.path.join(os.path.dirname(save_path), 'checkpoints')
os.makedirs(checkpoint_dir, exist_ok=True)
# For tracking improvement
last_improved_episode = 0
patience = 10 # Episodes to wait for improvement before early stopping
for episode in range(num_episodes):
start_time = time.time()
# Reset environment and get initial state
state = env.reset()
done = False
episode_reward = 0
step = 0
# Action metrics for this episode
actions_taken = {0: 0, 1: 0, 2: 0} # Track BUY, SELL, HOLD actions
while not done:
# Select action
action = agent.act(state)
# Execute action
next_state, reward, done, info = env.step(action)
# Store experience in replay buffer
is_extrema = False # In a real implementation, detect extrema points
agent.remember(state, action, reward, next_state, done, is_extrema)
# Learn from experience
if len(agent.memory) >= agent.batch_size:
use_prioritized = episode > 1 # Start using prioritized replay after first episode
loss = agent.replay(use_prioritized=use_prioritized)
# Update state and metrics
state = next_state
episode_reward += reward
actions_taken[action] += 1
# Every 100 steps, log progress
if step % 100 == 0 or step < 10:
action_str = "BUY" if action == 0 else "SELL" if action == 1 else "HOLD"
current_price = info.get('current_price', 0)
pnl = info.get('pnl', 0)
balance = info.get('balance', 0)
logger.info(f"Episode {episode}, Step {step}: Action={action_str}, "
f"Reward={reward:.4f}, Balance=${balance:.2f}, PnL={pnl:.4f}")
# Add trade to visualization if enabled
if visualize and chart and action in [0, 1]: # BUY or SELL
chart.add_trade(
price=current_price,
timestamp=datetime.now(),
amount=0.1,
pnl=pnl,
action=action_str
)
step += 1
# Episode finished - calculate metrics
episode_time = time.time() - start_time
total_train_time += episode_time
# Get environment info
win_rate = env.winning_trades / max(1, env.total_trades)
trades = env.total_trades
balance = env.balance
gain = (balance - env.initial_balance) / env.initial_balance
max_drawdown = env.max_drawdown
# Record metrics
rewards.append(episode_reward)
win_rates.append(win_rate)
# Update agent's learning metrics
improved = agent.update_learning_metrics(episode_reward)
# If this is best performance, save the model
if episode_reward > best_reward:
best_reward = episode_reward
if save_path:
best_model_path = f"{save_path}_best"
agent.save(best_model_path)
logger.info(f"New best model saved to {best_model_path} (reward: {best_reward:.2f})")
last_improved_episode = episode
# Regular checkpoint saving
if save_path and episode % save_freq == 0:
checkpoint_path = os.path.join(checkpoint_dir, f"checkpoint_episode_{episode}")
agent.save(checkpoint_path)
# Print episode summary
actions_summary = ", ".join([f"{k}:{v}" for k, v in actions_taken.items()])
logger.info(f"Episode {episode} completed in {episode_time:.2f}s")
logger.info(f" Total reward: {episode_reward:.4f}")
logger.info(f" Actions taken: {actions_summary}")
logger.info(f" Trades: {trades}, Win rate: {win_rate:.2%}")
logger.info(f" Balance: ${balance:.2f}, Gain: {gain:.2%}")
logger.info(f" Max Drawdown: {max_drawdown:.2%}")
# Early stopping check
if episode - last_improved_episode >= patience:
logger.info(f"No improvement for {patience} episodes. Early stopping.")
break
# Training complete
avg_time_per_episode = total_train_time / max(1, len(rewards))
logger.info(f"Training completed in {total_train_time:.2f}s ({avg_time_per_episode:.2f}s per episode)")
# Save final model
if save_path:
agent.save(f"{save_path}_final")
logger.info(f"Final model saved to {save_path}_final")
# Return training metrics
return rewards, win_rates, best_reward, best_model_path
def plot_training_results(rewards, win_rates, save_dir=None):
"""Plot training metrics and save the figure"""
plt.figure(figsize=(12, 8))
# Plot rewards
plt.subplot(2, 1, 1)
plt.plot(rewards, 'b-')
plt.title('Training Rewards per Episode')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.grid(True)
# Plot win rates
plt.subplot(2, 1, 2)
plt.plot(win_rates, 'g-')
plt.title('Win Rate per Episode')
plt.xlabel('Episode')
plt.ylabel('Win Rate')
plt.grid(True)
plt.tight_layout()
# Save figure if directory provided
if save_dir:
os.makedirs(save_dir, exist_ok=True)
plt.savefig(os.path.join(save_dir, f'training_results_{datetime.now().strftime("%Y%m%d_%H%M%S")}.png'))
plt.close()
def evaluate_agent(env, agent, num_episodes=5, visualize=False, chart=None):
"""
Evaluate a trained agent on the environment
Args:
env: The trading environment
agent: The trained DQN agent
num_episodes: Number of evaluation episodes
visualize: Whether to visualize trades
chart: The visualization chart (if visualize=True)
Returns:
dict: Evaluation metrics
"""
logger.info(f"Evaluating agent over {num_episodes} episodes")
# Metrics to track
total_rewards = []
total_trades = []
win_rates = []
sharpe_ratios = []
sortino_ratios = []
max_drawdowns = []
final_balances = []
for episode in range(num_episodes):
# Reset environment
state = env.reset()
done = False
episode_reward = 0
# Run episode without exploration
while not done:
action = agent.act(state, explore=False) # No exploration during evaluation
next_state, reward, done, info = env.step(action)
episode_reward += reward
state = next_state
# Add trade to visualization if enabled
if visualize and chart and action in [0, 1]: # BUY or SELL
action_str = "BUY" if action == 0 else "SELL"
current_price = info.get('current_price', 0)
pnl = info.get('pnl', 0)
chart.add_trade(
price=current_price,
timestamp=datetime.now(),
amount=0.1,
pnl=pnl,
action=action_str
)
# Record metrics
total_rewards.append(episode_reward)
total_trades.append(env.total_trades)
win_rates.append(env.winning_trades / max(1, env.total_trades))
sharpe_ratios.append(info.get('sharpe_ratio', 0))
sortino_ratios.append(info.get('sortino_ratio', 0))
max_drawdowns.append(env.max_drawdown)
final_balances.append(env.balance)
logger.info(f"Evaluation episode {episode} - Reward: {episode_reward:.4f}, "
f"Balance: ${env.balance:.2f}, Win rate: {win_rates[-1]:.2%}")
# Calculate average metrics
avg_reward = np.mean(total_rewards)
avg_trades = np.mean(total_trades)
avg_win_rate = np.mean(win_rates)
avg_sharpe = np.mean(sharpe_ratios)
avg_sortino = np.mean(sortino_ratios)
avg_max_drawdown = np.mean(max_drawdowns)
avg_final_balance = np.mean(final_balances)
# Log evaluation summary
logger.info("Evaluation completed:")
logger.info(f" Average reward: {avg_reward:.4f}")
logger.info(f" Average trades per episode: {avg_trades:.2f}")
logger.info(f" Average win rate: {avg_win_rate:.2%}")
logger.info(f" Average Sharpe ratio: {avg_sharpe:.4f}")
logger.info(f" Average Sortino ratio: {avg_sortino:.4f}")
logger.info(f" Average max drawdown: {avg_max_drawdown:.2%}")
logger.info(f" Average final balance: ${avg_final_balance:.2f}")
# Return evaluation metrics
return {
'avg_reward': avg_reward,
'avg_trades': avg_trades,
'avg_win_rate': avg_win_rate,
'avg_sharpe': avg_sharpe,
'avg_sortino': avg_sortino,
'avg_max_drawdown': avg_max_drawdown,
'avg_final_balance': avg_final_balance
}
def main():
"""Main function to run the improved RL training"""
start_time = time.time()
logger.info(f"Starting improved RL training for {args.symbol}")
# Create environment
env, data_frames = create_training_environment(args.symbol)
# Initialize visualization if enabled
chart = None
if args.visualize:
logger.info("Initializing visualization chart")
chart = RealTimeChart(args.symbol)
time.sleep(2) # Give time for chart to initialize
# Initialize agent
agent = initialize_agent(env)
# Train agent
rewards, win_rates, best_reward, best_model_path = train_agent(
env=env,
agent=agent,
num_episodes=args.episodes,
visualize=args.visualize,
chart=chart,
save_path=args.save_path
)
# Plot training results
plot_dir = os.path.join(os.path.dirname(args.save_path), 'plots')
plot_training_results(rewards, win_rates, save_dir=plot_dir)
# Evaluate best model
logger.info("Evaluating best model")
# Load best model for evaluation
if best_model_path:
best_agent = initialize_agent(env)
best_agent.load(best_model_path)
# Evaluate the best model
eval_metrics = evaluate_agent(
env=env,
agent=best_agent,
visualize=args.visualize,
chart=chart
)
# Log evaluation results
logger.info("Best model evaluation complete:")
for metric, value in eval_metrics.items():
logger.info(f" {metric}: {value}")
# Total run time
total_time = time.time() - start_time
logger.info(f"Total run time: {total_time:.2f} seconds")
if __name__ == "__main__":
main()
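Assuming the defaults defined in the argument parser above, a typical invocation of this script might look like:

python train_improved_rl.py --episodes 30 --symbol ETH/USDT --visualize --save-path NN/models/saved/improved_dqn_agent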