#!/usr/bin/env python3
"""
CNN Model - PyTorch Implementation (Optimized for Short-Term High-Leverage Trading)

This module implements an enhanced CNN model using PyTorch for time series analysis,
with a focus on detecting short-term high-leverage trading opportunities. Key
improvements include attention mechanisms, rapid pattern detection, and optimized
decision thresholds for trading signals.
"""

import os
import logging
import math
from datetime import datetime

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Configure logging
logger = logging.getLogger(__name__)


class AttentionLayer(nn.Module):
    """Self-attention layer for time series data"""

    def __init__(self, input_dim):
        super(AttentionLayer, self).__init__()
        self.query = nn.Linear(input_dim, input_dim)
        self.key = nn.Linear(input_dim, input_dim)
        self.value = nn.Linear(input_dim, input_dim)
        self.scale = math.sqrt(input_dim)

    def forward(self, x):
        # x shape: [batch, channels, seq_len]
        batch, channels, seq_len = x.size()

        # Reshape for attention computation
        x_reshaped = x.transpose(1, 2)  # [batch, seq_len, channels]

        # Compute query, key, value projections
        q = self.query(x_reshaped)  # [batch, seq_len, channels]
        k = self.key(x_reshaped)    # [batch, seq_len, channels]
        v = self.value(x_reshaped)  # [batch, seq_len, channels]

        # Scaled dot-product attention scores
        attn_scores = torch.bmm(q, k.transpose(1, 2)) / self.scale  # [batch, seq_len, seq_len]
        attn_weights = F.softmax(attn_scores, dim=2)

        # Apply attention
        out = torch.bmm(attn_weights, v)  # [batch, seq_len, channels]
        out = out.transpose(1, 2)         # [batch, channels, seq_len]

        return out
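
# A minimal shape check for AttentionLayer (illustrative only; the batch size,
# channel count, and sequence length below are arbitrary):
#
#     attn = AttentionLayer(input_dim=64)
#     x = torch.randn(8, 64, 32)           # [batch, channels, seq_len]
#     out = attn(x)
#     assert out.shape == (8, 64, 32)      # self-attention preserves the shape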

class CNNPyTorch(nn.Module):
    """
    CNN model for time series analysis using PyTorch.
    """

    def __init__(self, input_shape, output_size=3):
        """
        Initialize the CNN architecture.

        Args:
            input_shape (tuple): Shape of input data (window_size, features)
            output_size (int): Number of output classes
        """
        super(CNNPyTorch, self).__init__()

        window_size, num_features = input_shape
        self.window_size = window_size

        # Increased dropout for better generalization
        dropout_rate = 0.25

        # Convolutional layers with wider kernels for better pattern detection
        self.conv1 = nn.Sequential(
            nn.Conv1d(num_features, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate)
        )

        self.conv2 = nn.Sequential(
            nn.Conv1d(64, 128, kernel_size=5, padding=2),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate)
        )

        # Micro-movement detection with smaller kernels
        self.micro_conv = nn.Sequential(
            nn.Conv1d(num_features, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(0.1),
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate)
        )

        # Attention mechanism for pattern importance weighting
        self.attention = nn.Conv1d(64, 1, kernel_size=1)
        self.softmax = nn.Softmax(dim=2)

        # Define a fixed output size for conv features to avoid dimension mismatch
        fixed_conv_size = 10  # This must match the sizes assumed in forward()

        # Use adaptive pooling to get a fixed size regardless of input length
        self.adaptive_pool = nn.AdaptiveAvgPool1d(fixed_conv_size)

        # Calculate the input size for the fully connected layer.
        # After adaptive pooling, dimensions are [batch_size, channels, fixed_conv_size].
        conv2_flat_size = 128 * fixed_conv_size  # From conv2
        micro_flat_size = 64 * fixed_conv_size   # From micro_conv
        fc_input_size = conv2_flat_size + micro_flat_size

        # Shared fully connected layers
        self.shared_fc = nn.Sequential(
            nn.Linear(fc_input_size, 256),
            nn.BatchNorm1d(256),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate)
        )

        # Action prediction head
        self.action_fc = nn.Sequential(
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate),
            nn.Linear(64, output_size)
        )

        # Price prediction head
        self.price_fc = nn.Sequential(
            nn.Linear(256, 64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(0.1),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 1)  # Predict price change percentage
        )

        # Confidence thresholds for decision making
        self.buy_threshold = 0.55   # Higher threshold for BUY signals
        self.sell_threshold = 0.55  # Higher threshold for SELL signals
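
    # Worked example of the fully connected input size computed above, with the
    # default fixed_conv_size of 10:
    #
    #     conv2:      [batch, 128, 10] -> flattened 128 * 10 = 1280
    #     micro_conv: [batch,  64, 10] -> flattened  64 * 10 =  640
    #     fc_input_size = 1280 + 640 = 1920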

    def forward(self, x):
        """
        Forward pass through the network with enhanced pattern detection.

        Args:
            x: Input tensor of shape [batch_size, window_size, features]

        Returns:
            Tuple of (action_probs, price_pred)
        """
        # Transpose for conv1d: [batch, features, window]
        x = x.transpose(1, 2)

        # Main convolutional layers
        conv1_out = self.conv1(x)
        conv2_out = self.conv2(conv1_out)  # Use conv1_out as input to conv2

        # Micro-movement pattern detection
        micro_out = self.micro_conv(x)

        # Apply adaptive pooling so both paths have the same size at dimension 2
        micro_out = self.adaptive_pool(micro_out)  # Output: [batch, 64, 10]
        conv2_out = self.adaptive_pool(conv2_out)  # Output: [batch, 128, 10]

        # Attention weights over the conv1 output to detect important patterns.
        # NOTE: these weights are computed but not currently applied to the
        # feature maps; only the conv2 and micro_conv paths feed the heads.
        attention = self.attention(conv1_out)
        attention = self.softmax(attention)

        # Flatten and concatenate features
        conv2_flat = conv2_out.reshape(conv2_out.size(0), -1)  # [batch, 128*10]
        micro_flat = micro_out.reshape(micro_out.size(0), -1)  # [batch, 64*10]
        features = torch.cat([conv2_flat, micro_flat], dim=1)

        # Shared layers
        shared_features = self.shared_fc(features)

        # Action head
        action_logits = self.action_fc(shared_features)
        action_probs = F.softmax(action_logits, dim=1)

        # Price prediction head
        price_pred = self.price_fc(shared_features)

        # Bias the output distribution toward decisive trading actions.
        # Applied only at inference time: mutating the probabilities in-place
        # inside no_grad during training would corrupt the autograd graph and
        # break backpropagation.
        if not self.training:
            with torch.no_grad():
                # Reduce HOLD probabilities aggressively for short-term trading
                action_probs[:, 1] *= 0.4  # HOLD is index 1

                # Identify high-confidence signals so they can be boosted further
                sell_mask = action_probs[:, 0] > self.sell_threshold
                buy_mask = action_probs[:, 2] > self.buy_threshold

                # Extra boost for high-confidence signals
                action_probs[sell_mask, 0] *= 1.8  # High-confidence SELL
                action_probs[buy_mask, 2] *= 1.8   # High-confidence BUY

                # Base boost applied to all SELL/BUY probabilities
                # (high-confidence signals therefore receive both factors)
                action_probs[:, 0] *= 1.4  # Boost SELL probabilities
                action_probs[:, 2] *= 1.4  # Boost BUY probabilities

                # Re-normalize to sum to 1
                action_probs = action_probs / action_probs.sum(dim=1, keepdim=True)

        return action_probs, price_pred
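
# Worked example of the inference-time bias above on a single probability row
# [SELL, HOLD, BUY] = [0.30, 0.40, 0.30] (no entry clears the 0.55 thresholds):
#
#     HOLD * 0.4       -> [0.30, 0.16, 0.30]
#     SELL, BUY * 1.4  -> [0.42, 0.16, 0.42]
#     the row already sums to 1.0, so re-normalization leaves it unchanged
#
# The net effect shifts probability mass from HOLD to the two trading actions.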

class CNNModelPyTorch:
    """
    CNN model wrapper class for time series analysis using PyTorch.

    This class provides methods for building, training, evaluating, and making
    predictions with the CNN model, optimized for short-term trading opportunities.
    """

    def __init__(self, window_size, num_features, output_size=3, timeframes=None):
        """
        Initialize the CNN model.

        Args:
            window_size (int): Size of the input window
            num_features (int): Number of features in the input data
            output_size (int): Size of the output (default: 3 for BUY/HOLD/SELL)
            timeframes (list): List of timeframes used (for logging)
        """
        # Action tracking
        self.action_counts = {
            'BUY': 0,
            'SELL': 0,
            'HOLD': 0
        }

        self.window_size = window_size
        self.num_features = num_features
        self.output_size = output_size
        self.timeframes = timeframes or []

        # Determine device (GPU or CPU)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        logger.info(f"Using device: {self.device}")

        # Initialize model
        self.model = None
        self.build_model()

        # Initialize training history
        self.history = {
            'loss': [],
            'val_loss': [],
            'accuracy': [],
            'val_accuracy': []
        }

        # Sensitivity parameters for high-leverage trading
        self.confidence_threshold = 0.65      # Minimum confidence for trading actions
        self.max_consecutive_same_action = 3  # Limit consecutive identical actions
        self.last_actions = []                # Track recent actions

    def build_model(self):
        """Build the CNN model architecture"""
        logger.info(f"Building PyTorch CNN model with window_size={self.window_size}, "
                    f"num_features={self.num_features}, output_size={self.output_size}")

        # Ensure the window size is not smaller than the supported minimum
        input_window_size = max(self.window_size, 20)  # Use at least 20 as minimum window size

        self.model = CNNPyTorch(
            input_shape=(input_window_size, self.num_features),
            output_size=self.output_size
        ).to(self.device)

        # Initialize optimizer with a higher learning rate for faster adaptation
        self.optimizer = optim.Adam(self.model.parameters(), lr=0.002)

        # Learning rate scheduler with faster decay
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='max', factor=0.6, patience=6, verbose=True
        )

        # Loss function with higher weights for BUY/SELL. The network's forward()
        # returns probabilities rather than logits, so NLLLoss over their log is
        # used instead of CrossEntropyLoss (which would re-apply log_softmax).
        class_weights = torch.tensor([7.0, 1.0, 7.0]).to(self.device)
        self.criterion = nn.NLLLoss(weight=class_weights)

        logger.info(f"Model built successfully with {sum(p.numel() for p in self.model.parameters())} parameters")
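
    # Worked example of the class weighting above: NLLLoss with mean reduction
    # scales each sample's loss by weight[target] and divides by the sum of the
    # weights used. A batch with one true-SELL and one true-HOLD sample, each
    # with per-sample NLL 0.5, averages (7.0 * 0.5 + 1.0 * 0.5) / (7.0 + 1.0)
    # = 0.5, but the gradient from the SELL sample is 7x that of the HOLD one.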

    def compute_trading_loss(self, action_probs, price_pred, targets, future_prices=None):
        """
        Custom loss function that prioritizes profitable trades.

        Args:
            action_probs: Predicted action probabilities [batch_size, 3]
            price_pred: Predicted price changes [batch_size, 1]
            targets: Target actions [batch_size]
            future_prices: Actual future price changes [batch_size]

        Returns:
            Tuple of (total_loss, action_loss, price_loss)
        """
        batch_size = action_probs.size(0)

        # Base classification loss (NLL over log-probabilities, see build_model)
        action_loss = self.criterion(torch.log(action_probs + 1e-8), targets)

        # Initialize price and profitability losses
        price_loss = torch.tensor(0.0, device=self.device)
        profit_loss = torch.tensor(0.0, device=self.device)
        diversity_loss = torch.tensor(0.0, device=self.device)

        # Get predicted actions
        pred_actions = torch.argmax(action_probs, dim=1)

        # Calculate a signal diversity loss to prevent the model from always
        # predicting the same action. Count each action's share of the batch.
        buy_count = (pred_actions == 2).float().sum() / batch_size
        sell_count = (pred_actions == 0).float().sum() / batch_size
        hold_count = (pred_actions == 1).float().sum() / batch_size

        # Enhanced diversity mechanism. For short-term high-leverage trading we
        # want a fairly balanced distribution with a slight preference for
        # actions over holds. The ideal distribution varies with the training
        # phase: start conservative and gradually shift to more aggressive.
        if hasattr(self, 'training_progress'):
            self.training_progress += 1
        else:
            self.training_progress = 0

        if self.training_progress < 500:
            # Early training phase - more balanced, with a higher HOLD share
            ideal_buy = 0.3
            ideal_sell = 0.3
            ideal_hold = 0.4
        elif self.training_progress < 1500:
            # Mid training phase - balanced trading signals
            ideal_buy = 0.35
            ideal_sell = 0.35
            ideal_hold = 0.3
        else:
            # Late training phase - more aggressive, with tactical HOLDs
            ideal_buy = 0.4
            ideal_sell = 0.4
            ideal_hold = 0.2

        # Diversity loss: a Kullback-Leibler divergence approximation plus an
        # additional penalty for extreme imbalance.
        # NOTE: these terms are built from argmax counts, so they are constant
        # with respect to the network parameters and act as a batch-level
        # penalty scale rather than a direct gradient signal.
        actual_dist = torch.stack([sell_count, hold_count, buy_count])
        ideal_dist = torch.tensor([ideal_sell, ideal_hold, ideal_buy], device=self.device)

        # KL divergence component (approximation)
        eps = 1e-8  # Small constant to avoid division by zero
        kl_div = torch.sum(actual_dist * torch.log((actual_dist + eps) / (ideal_dist + eps)))

        # Strong penalty for extreme predictions (nearly all the same class)
        max_ratio = torch.max(actual_dist)
        if max_ratio > 0.9:    # More than 90% of predictions are the same class
            diversity_loss = kl_div + (max_ratio - 0.9) * 5.0  # Stronger penalty
        elif max_ratio > 0.7:  # More than 70% of predictions are the same class
            diversity_loss = kl_div + (max_ratio - 0.7) * 2.0  # Moderate penalty
        else:
            diversity_loss = kl_div

        # Additional penalty if any class has too few predictions. This is
        # critical for avoiding scenarios where the model never predicts a
        # certain class.
        zero_class_penalty = 0.0
        min_class_ratio = 0.1  # We want at least 10% of each class
        if buy_count < min_class_ratio:
            zero_class_penalty += (min_class_ratio - buy_count) * 3.0
        if sell_count < min_class_ratio:
            zero_class_penalty += (min_class_ratio - sell_count) * 3.0
        if hold_count < min_class_ratio:
            zero_class_penalty += (min_class_ratio - hold_count) * 2.0  # Slightly lower penalty for HOLD

        diversity_loss += zero_class_penalty
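
        # Worked example of the diversity penalty: in the mid phase the ideal
        # distribution is [0.35, 0.30, 0.35]. If a batch comes out 80% SELL,
        # 10% HOLD, 10% BUY:
        #
        #     kl_div ~= 0.8*ln(0.8/0.35) + 0.1*ln(0.1/0.3) + 0.1*ln(0.1/0.35)
        #            ~= 0.661 - 0.110 - 0.125 ~= 0.426
        #     max_ratio = 0.8 > 0.7, so diversity_loss ~= 0.426 + 0.1*2.0 = 0.626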

        # If we have future prices, calculate profitability-based losses
        if future_prices is not None and future_prices.numel() > 0:
            # Price direction loss - penalize wrong direction predictions
            if price_pred is not None:
                # Only use samples where a future price is available
                valid_mask = ~torch.isnan(future_prices) & (future_prices != 0)
                if valid_mask.any():
                    valid_future = future_prices[valid_mask]
                    valid_price_pred = price_pred.view(-1)[valid_mask]

                    # Mean squared error for price prediction
                    price_loss = F.mse_loss(valid_price_pred, valid_future)

                    # Direction loss - penalize wrong direction predictions more heavily
                    pred_direction = torch.sign(valid_price_pred)
                    true_direction = torch.sign(valid_future)
                    direction_loss = ((pred_direction != true_direction) & (true_direction != 0)).float().mean()

                    # Add direction loss to price loss with a higher weight
                    price_loss = price_loss + direction_loss * 2.0

            # Trade profitability loss: penalizes unprofitable trades more than
            # just wrong classifications. Like the diversity loss, it depends on
            # argmax actions and therefore carries no gradient of its own.
            profitable_trades = 0
            unprofitable_trades = 0

            for i in range(batch_size):
                if i < future_prices.size(0) and not torch.isnan(future_prices[i]) and future_prices[i] != 0:
                    price_change = future_prices[i].item()

                    # Expected profit/loss given the predicted action
                    if pred_actions[i] == 0:    # SELL
                        expected_pnl = -price_change  # Negative price change is profit for SELL
                    elif pred_actions[i] == 2:  # BUY
                        expected_pnl = price_change   # Positive price change is profit for BUY
                    else:                       # HOLD
                        expected_pnl = 0              # No profit/loss for HOLD

                    # Enhanced profit/loss penalties with a larger penalty for bad trades
                    if expected_pnl < 0:
                        # Exponential penalty for larger losses
                        severity = abs(expected_pnl) ** 1.5  # Higher exponent for short-term trading
                        profit_loss = profit_loss + torch.tensor(severity, device=self.device) * 2.5
                        unprofitable_trades += 1
                    elif expected_pnl > 0:
                        # Reward profitable trades (negative loss contribution),
                        # with a higher reward for larger profits
                        reward = expected_pnl * 0.9
                        profit_loss = profit_loss - torch.tensor(reward, device=self.device)
                        profitable_trades += 1

            # Adjust the profit loss according to the batch win rate
            if profitable_trades + unprofitable_trades > 0:
                win_rate = profitable_trades / (profitable_trades + unprofitable_trades)
                if win_rate < 0.5:
                    # Extra penalty when the win rate is below 50%
                    profit_loss = profit_loss * (1.0 + (0.5 - win_rate) * 2.5)
                elif win_rate > 0.6:
                    # Small reward when the win rate is high
                    profit_loss = profit_loss * (1.0 - (win_rate - 0.6) * 0.5)

        # Combine all loss components with dynamic weighting based on training
        # progress: early training focuses on classification accuracy, late
        # training emphasizes profitability and diversity.
        if self.training_progress < 500:
            action_weight = 1.0
            price_weight = 0.2
            profit_weight = 0.5
            diversity_weight = 0.3
        elif self.training_progress < 1500:
            action_weight = 0.8
            price_weight = 0.3
            profit_weight = 0.8
            diversity_weight = 0.5
        else:
            action_weight = 0.6
            price_weight = 0.3
            profit_weight = 1.0
            diversity_weight = 0.7

        total_loss = (action_weight * action_loss +
                      price_weight * price_loss +
                      profit_weight * profit_loss +
                      diversity_weight * diversity_loss)

        return total_loss, action_loss, price_loss
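
    # Because the profit term above keys off argmax, it cannot shape gradients.
    # A differentiable alternative (an illustrative sketch, not used by this
    # module) is to weight the PnL of every action by its predicted probability:
    #
    #     # pnl_per_action: [batch, 3] = [-price_change, 0, +price_change]
    #     pnl_per_action = torch.stack(
    #         [-future_prices, torch.zeros_like(future_prices), future_prices], dim=1)
    #     expected_pnl = (action_probs * pnl_per_action).sum(dim=1)
    #     profit_loss = -expected_pnl.mean()  # maximize expected PnL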

    def train_epoch(self, X_train, y_train, future_prices, batch_size):
        """Train the model for one epoch with focus on short-term pattern recognition"""
        self.model.train()
        total_action_loss = 0
        total_price_loss = 0
        total_correct = 0
        total_samples = 0

        # Per-epoch signal statistics (accumulated across batches)
        buy_count = sell_count = hold_count = 0
        buy_correct = sell_correct = 0

        # Convert inputs to tensors
        X_train_tensor = torch.FloatTensor(X_train).to(self.device)
        y_train_tensor = torch.LongTensor(y_train).to(self.device)
        future_prices_tensor = torch.FloatTensor(future_prices).to(self.device) if future_prices is not None else None

        # Create dataset and dataloader
        if future_prices_tensor is not None:
            dataset = TensorDataset(X_train_tensor, y_train_tensor, future_prices_tensor)
        else:
            dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        # Training loop
        for batch_data in train_loader:
            self.optimizer.zero_grad()

            # Extract batch data
            if len(batch_data) == 3:
                batch_X, batch_y, batch_future_prices = batch_data
            else:
                batch_X, batch_y = batch_data
                batch_future_prices = None

            # Forward pass
            action_probs, price_pred = self.model(batch_X)

            # Calculate loss using the custom trading loss function
            total_loss, action_loss, price_loss = self.compute_trading_loss(
                action_probs, price_pred, batch_y, batch_future_prices
            )

            # Backward pass and optimization
            total_loss.backward()

            # Apply gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

            self.optimizer.step()

            # Update metrics
            total_action_loss += action_loss.item()
            total_price_loss += price_loss.item() if hasattr(price_loss, 'item') else 0

            predictions = torch.argmax(action_probs, dim=1)
            total_correct += (predictions == batch_y).sum().item()
            total_samples += batch_y.size(0)

            # Track trading signals for logging (accumulated over the whole
            # epoch, not just the final batch)
            buy_count += (predictions == 2).sum().item()
            sell_count += (predictions == 0).sum().item()
            hold_count += (predictions == 1).sum().item()
            buy_correct += ((predictions == 2) & (batch_y == 2)).sum().item()
            sell_correct += ((predictions == 0) & (batch_y == 0)).sum().item()

        # Calculate average losses and accuracy
        avg_action_loss = total_action_loss / len(train_loader)
        avg_price_loss = total_price_loss / len(train_loader)
        accuracy = total_correct / total_samples

        # Log trading signals
        logger.info(f"Trading signals: BUY={buy_count}, SELL={sell_count}, HOLD={hold_count}")
        logger.info(f"Signal precision: BUY={buy_correct/max(1, buy_count):.4f}, SELL={sell_correct/max(1, sell_count):.4f}")

        # Update learning rate
        self.scheduler.step(accuracy)

        return avg_action_loss, avg_price_loss, accuracy

    def evaluate(self, X_val, y_val, future_prices=None):
        """Evaluate the model with focus on short-term trading performance metrics"""
        self.model.eval()

        # Additional metrics for trading performance
        trade_signals = {'BUY': 0, 'SELL': 0, 'HOLD': 0}
        correct_signals = {'BUY': 0, 'SELL': 0, 'HOLD': 0}

        # Convert inputs to tensors
        X_val_tensor = torch.FloatTensor(X_val).to(self.device)
        y_val_tensor = torch.LongTensor(y_val).to(self.device)
        future_prices_tensor = torch.FloatTensor(future_prices).to(self.device) if future_prices is not None else None

        with torch.no_grad():
            # Forward pass
            action_probs, price_pred = self.model(X_val_tensor)

            # Calculate loss using the custom trading loss function
            total_loss, action_loss, price_loss = self.compute_trading_loss(
                action_probs, price_pred, y_val_tensor, future_prices_tensor
            )

            # Calculate predictions
            predictions = torch.argmax(action_probs, dim=1)

            # Count prediction types and correct predictions
            for i in range(predictions.shape[0]):
                pred = predictions[i].item()
                if pred == 0:
                    trade_signals['SELL'] += 1
                    if y_val_tensor[i].item() == pred:
                        correct_signals['SELL'] += 1
                elif pred == 1:
                    trade_signals['HOLD'] += 1
                    if y_val_tensor[i].item() == pred:
                        correct_signals['HOLD'] += 1
                elif pred == 2:
                    trade_signals['BUY'] += 1
                    if y_val_tensor[i].item() == pred:
                        correct_signals['BUY'] += 1

            # Update metrics
            total_action_loss = action_loss.item()
            total_price_loss = price_loss.item() if hasattr(price_loss, 'item') else 0
            total_correct = (predictions == y_val_tensor).sum().item()
            total_samples = y_val_tensor.size(0)

        # Calculate accuracy
        accuracy = total_correct / total_samples if total_samples > 0 else 0

        # Calculate signal precision (crucial for short-term trading)
        buy_precision = correct_signals['BUY'] / trade_signals['BUY'] if trade_signals['BUY'] > 0 else 0
        sell_precision = correct_signals['SELL'] / trade_signals['SELL'] if trade_signals['SELL'] > 0 else 0

        # Log trading-specific metrics
        logger.info(f"Trading signals: BUY={trade_signals['BUY']}, SELL={trade_signals['SELL']}, HOLD={trade_signals['HOLD']}")
        logger.info(f"Signal precision: BUY={buy_precision:.4f}, SELL={sell_precision:.4f}")

        # Return action loss, price loss, and accuracy
        return total_action_loss, total_price_loss, accuracy
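
    # Signal precision above is per-action: for example, 10 predicted BUYs of
    # which 6 matched a true BUY label give BUY precision 6 / 10 = 0.6. For a
    # high-leverage strategy, precision on BUY/SELL matters far more than raw
    # accuracy, since HOLD predictions never open a position.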

    def predict(self, X):
        """Make predictions optimized for short-term high-leverage trading signals"""
        self.model.eval()

        # Convert to tensor if not already
        if not isinstance(X, torch.Tensor):
            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        else:
            X_tensor = X.to(self.device)

        with torch.no_grad():
            action_probs, price_pred = self.model(X_tensor)

        # Post-processing optimized for short-term trading signals
        action_probs_np = action_probs.cpu().numpy()

        # Apply more aggressive HOLD reduction for short-term trading
        action_probs_np[:, 1] *= 0.5

        # Boost BUY/SELL probabilities
        action_probs_np[:, 0] *= 1.3  # Boost SELL probabilities
        action_probs_np[:, 2] *= 1.3  # Boost BUY probabilities

        # Filter signals based on previous actions to avoid oscillation
        if len(self.last_actions) >= self.max_consecutive_same_action:
            recent = self.last_actions[-self.max_consecutive_same_action:]
            if all(a == 0 for a in recent):
                # Too many consecutive SELLs - reduce the SELL probability
                action_probs_np[:, 0] *= 0.7
            elif all(a == 2 for a in recent):
                # Too many consecutive BUYs - reduce the BUY probability
                action_probs_np[:, 2] *= 0.7

        # Apply the confidence threshold to reduce noise
        max_probs = np.max(action_probs_np, axis=1)
        for i in range(len(action_probs_np)):
            if max_probs[i] < self.confidence_threshold:
                # If confidence is too low, force HOLD
                action_probs_np[i] = np.array([0.1, 0.8, 0.1])

        # Re-normalize
        action_probs_np = action_probs_np / action_probs_np.sum(axis=1, keepdims=True)

        # Store the predicted action for the most recent input
        if action_probs_np.shape[0] > 0:
            latest_action = np.argmax(action_probs_np[-1])
            self.last_actions.append(int(latest_action))
            # Keep only the most recent actions
            self.last_actions = self.last_actions[-10:]  # Store last 10 actions

        # Update action counts for stats
        actions = np.argmax(action_probs_np, axis=1)
        unique, counts = np.unique(actions, return_counts=True)
        action_dict = dict(zip(unique, counts))
        if 0 in action_dict:
            self.action_counts['SELL'] += action_dict[0]
        if 1 in action_dict:
            self.action_counts['HOLD'] += action_dict[1]
        if 2 in action_dict:
            self.action_counts['BUY'] += action_dict[2]

        # Current close prices from the input (assumes the close price is
        # feature index 3, i.e. OHLC column order)
        current_prices = X_tensor[:, -1, 3].cpu().numpy() if X_tensor.shape[2] > 3 else np.zeros(X_tensor.shape[0])

        # Price direction implied by the probabilities: BUY minus SELL.
        # NOTE: the network's price head output (price_pred) is not used here;
        # the returned price is a heuristic scaled by signal strength.
        price_directions = action_probs_np[:, 2] - action_probs_np[:, 0]
        price_preds = current_prices * (1 + price_directions * 0.002)

        return action_probs_np, price_preds.reshape(-1, 1)
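
    # Worked example of the noise filters in predict(): with
    # confidence_threshold = 0.65, a post-boost row of [0.50, 0.10, 0.40] has a
    # maximum of 0.50 < 0.65 and is replaced by the forced-HOLD row
    # [0.1, 0.8, 0.1]. Separately, if the last 3 stored actions are all BUY,
    # every BUY probability in the next batch is damped by 0.7 before
    # re-normalization, which breaks runs of identical signals.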

    def predict_next_candles(self, X, n_candles=3):
        """
        Predict the next n candles with focus on short-term signals.

        Args:
            X: Input data of shape [batch_size, window_size, features]
            n_candles: Number of future candles to predict

        Returns:
            Dictionary of predictions for each timeframe
        """
        self.model.eval()
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)

        with torch.no_grad():
            # Get initial predictions
            action_probs, price_pred = self.model(X_tensor)
            action_probs_np = action_probs.cpu().numpy()

            # Apply more aggressive processing for short-term signals
            action_probs_np[:, 1] *= 0.5  # Reduce HOLD
            action_probs_np[:, 0] *= 1.3  # Boost SELL
            action_probs_np[:, 2] *= 1.3  # Boost BUY

            # Re-normalize
            action_probs_np = action_probs_np / action_probs_np.sum(axis=1, keepdims=True)

            # For short-term predictions, decay the signal over time: the first
            # candle carries the full signal, later candles fade toward HOLD
            # (short-term signals shouldn't persist too long). Note that the
            # same decayed sequence is currently produced for every timeframe.
            predictions = {}
            for i, tf in enumerate(self.timeframes):
                tf_preds = np.zeros((n_candles, action_probs_np.shape[0], 3))
                for j in range(n_candles):
                    decay_factor = max(0.1, 1.0 - j * 0.3)

                    # Move probabilities toward HOLD with decay
                    decayed_probs = action_probs_np.copy()
                    decayed_probs[:, 0] = action_probs_np[:, 0] * decay_factor  # Decay SELL
                    decayed_probs[:, 2] = action_probs_np[:, 2] * decay_factor  # Decay BUY

                    # Increase the HOLD probability to compensate
                    hold_increase = (1.0 - decay_factor) * (action_probs_np[:, 0] + action_probs_np[:, 2])
                    decayed_probs[:, 1] = action_probs_np[:, 1] + hold_increase

                    # Re-normalize
                    decayed_probs = decayed_probs / decayed_probs.sum(axis=1, keepdims=True)

                    # Store in the predictions array
                    tf_preds[j] = decayed_probs

                # Store in the output dictionary
                predictions[tf] = tf_preds

        return predictions
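
    # Worked example of the signal decay in predict_next_candles(): for a row
    # [SELL, HOLD, BUY] = [0.42, 0.16, 0.42], candle j=1 has
    # decay_factor = 1.0 - 0.3 = 0.7, so
    #
    #     SELL, BUY: 0.42 * 0.7 = 0.294
    #     HOLD:      0.16 + 0.3 * (0.42 + 0.42) = 0.412
    #
    # The row [0.294, 0.412, 0.294] already sums to 1, and by j=2 (decay 0.4)
    # HOLD dominates, encoding that the signal has gone stale.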

    def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, epochs=100):
        """
        Train the CNN model.

        Args:
            X_train: Training input data
            y_train: Training target data
            X_val: Validation input data
            y_val: Validation target data
            batch_size: Batch size for training
            epochs: Number of training epochs

        Returns:
            Training history
        """
        logger.info(f"Training PyTorch CNN model with {len(X_train)} samples, "
                    f"batch_size={batch_size}, epochs={epochs}")

        # Convert numpy arrays to PyTorch tensors
        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(self.device)

        # Handle different output sizes for y_train
        if self.output_size == 1:
            y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(self.device)
        else:
            y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(self.device)

        # Create DataLoader for training data
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

        # Create DataLoader for validation data if provided
        if X_val is not None and y_val is not None:
            X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(self.device)
            if self.output_size == 1:
                y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(self.device)
            else:
                y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(self.device)
            val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)
        else:
            val_loader = None

        # Training loop
        for epoch in range(epochs):
            # Training phase
            self.model.train()
            running_loss = 0.0
            correct = 0
            total = 0

            for inputs, targets in train_loader:
                # Zero the parameter gradients
                self.optimizer.zero_grad()

                # Forward pass
                action_probs, price_pred = self.model(inputs)

                # Calculate loss. With a single output the price head is treated
                # as a regressor against float targets; otherwise use the
                # weighted NLL over the log of the action probabilities.
                if self.output_size == 1:
                    loss = F.mse_loss(price_pred, targets.unsqueeze(1))
                else:
                    loss = self.criterion(torch.log(action_probs + 1e-8), targets)

                # Backward pass and optimize
                loss.backward()
                self.optimizer.step()

                # Statistics
                running_loss += loss.item()
                _, predicted = torch.max(action_probs, 1)
                total += targets.size(0)
                correct += (predicted == targets).sum().item()

            epoch_loss = running_loss / len(train_loader)
            epoch_acc = correct / total if total > 0 else 0

            # Validation phase
            if val_loader is not None:
                # evaluate() returns (action_loss, price_loss, accuracy)
                val_loss, _, val_acc = self.evaluate(X_val, y_val)
                logger.info(f"Epoch {epoch+1}/{epochs} - "
                            f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f} - "
                            f"val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")

                # Update history
                self.history['loss'].append(epoch_loss)
                self.history['accuracy'].append(epoch_acc)
                self.history['val_loss'].append(val_loss)
                self.history['val_accuracy'].append(val_acc)
            else:
                logger.info(f"Epoch {epoch+1}/{epochs} - "
                            f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f}")

                # Update history without validation
                self.history['loss'].append(epoch_loss)
                self.history['accuracy'].append(epoch_acc)

        logger.info("Training completed")
        return self.history

    def evaluate_metrics(self, X_test, y_test):
        """
        Calculate and return comprehensive evaluation metrics as a dict.
        """
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(self.device)

        self.model.eval()
        with torch.no_grad():
            # The model returns (action_probs, price_pred); only the action
            # probabilities are needed for classification metrics
            action_probs, _ = self.model(X_test_tensor)

        if self.output_size > 1:
            _, y_pred_class = torch.max(action_probs, 1)
            y_pred_class = y_pred_class.cpu().numpy()
        else:
            y_pred_class = (action_probs.cpu().numpy() > 0.5).astype(int).flatten()

        metrics = {
            'accuracy': accuracy_score(y_test, y_pred_class),
            'precision': precision_score(y_test, y_pred_class, average='weighted', zero_division=0),
            'recall': recall_score(y_test, y_pred_class, average='weighted', zero_division=0),
            'f1_score': f1_score(y_test, y_pred_class, average='weighted', zero_division=0)
        }
        return metrics

    def save(self, filepath):
        """
        Save the model to a file along with the trading configuration.

        Args:
            filepath: Path to save the model (".pt" is appended)
        """
        # Create the directory if it doesn't exist
        if os.path.dirname(filepath):
            os.makedirs(os.path.dirname(filepath), exist_ok=True)

        # Save the model state with additional trading parameters
        model_state = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'history': self.history,
            'window_size': self.window_size,
            'num_features': self.num_features,
            'output_size': self.output_size,
            'timeframes': self.timeframes,
            # Trading configuration
            'confidence_threshold': self.confidence_threshold,
            'max_consecutive_same_action': self.max_consecutive_same_action,
            'action_counts': self.action_counts,
            'last_actions': self.last_actions,
            # Model version information
            'model_version': 'short_term_optimized_v1.0',
            'timestamp': datetime.now().strftime('%Y%m%d_%H%M%S')
        }

        torch.save(model_state, f"{filepath}.pt")
        logger.info(f"Model saved to {filepath}.pt with short-term trading optimizations")

        # Also save a timestamped backup copy on every call
        os.makedirs(f"{filepath}_backup", exist_ok=True)
        backup_path = os.path.join(f"{filepath}_backup", f"model_{datetime.now().strftime('%Y%m%d_%H%M%S')}.pt")
        torch.save(model_state, backup_path)
        logger.info(f"Backup saved to {backup_path}")

    def load(self, filepath):
        """Load model weights and configuration from file"""
        if not os.path.exists(f"{filepath}.pt"):
            logger.error(f"Model file {filepath}.pt not found")
            return False

        try:
            # Load the model state
            model_state = torch.load(f"{filepath}.pt", map_location=self.device)

            # Update model parameters
            self.window_size = model_state['window_size']
            self.num_features = model_state['num_features']
            self.output_size = model_state['output_size']
            self.timeframes = model_state.get('timeframes', ["1m"])

            # Load the network weights (into the wrapped model, not the wrapper)
            self.model.load_state_dict(model_state['model_state_dict'])

            # Load optimizer state if available
            if 'optimizer_state_dict' in model_state:
                self.optimizer.load_state_dict(model_state['optimizer_state_dict'])

            # Load trading configuration if available
            if 'confidence_threshold' in model_state:
                self.confidence_threshold = model_state['confidence_threshold']
            if 'max_consecutive_same_action' in model_state:
                self.max_consecutive_same_action = model_state['max_consecutive_same_action']

            # Log model version information if available
            if 'model_version' in model_state:
                logger.info(f"Model version: {model_state['model_version']}")
            if 'timestamp' in model_state:
                logger.info(f"Model timestamp: {model_state['timestamp']}")

            return True
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return False
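
    # Save/load round trip (illustrative paths): save("NN/models/saved/cnn")
    # writes "NN/models/saved/cnn.pt" plus a timestamped copy under
    # "NN/models/saved/cnn_backup/", and load("NN/models/saved/cnn") restores
    # the weights, optimizer state, and trading thresholds from the same file.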

    def plot_training_history(self, metrics_file="NN/models/saved/training_metrics.json"):
        """
        Plot training history from saved metrics.

        Args:
            metrics_file: Path to the saved metrics JSON file
        """
        try:
            import json
            import matplotlib.dates as mdates

            # Load metrics
            with open(metrics_file, 'r') as f:
                metrics = json.load(f)

            # Create the plots directory
            plots_dir = os.path.join(os.path.dirname(metrics_file), 'plots')
            os.makedirs(plots_dir, exist_ok=True)

            # Convert timestamps to datetime objects
            timestamps = [datetime.fromisoformat(ts) for ts in metrics['timestamps']]

            # 1. Plot loss and accuracy
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

            # Loss plot
            ax1.plot(timestamps, metrics['train_loss'], 'b-', label='Training Loss')
            ax1.plot(timestamps, metrics['val_loss'], 'r-', label='Validation Loss')
            ax1.set_title('Model Loss Over Time')
            ax1.set_ylabel('Loss')
            ax1.legend()
            ax1.grid(True)

            # Accuracy plot
            ax2.plot(timestamps, metrics['train_acc'], 'g-', label='Training Accuracy')
            ax2.plot(timestamps, metrics['val_acc'], 'm-', label='Validation Accuracy')
            ax2.set_title('Model Accuracy Over Time')
            ax2.set_ylabel('Accuracy')
            ax2.legend()
            ax2.grid(True)

            # Format x-axis
            ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
            plt.xticks(rotation=45)

            # Save the plot
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, 'loss_accuracy.png'))
            plt.close()

            # 2. Plot PnL and win rate
            fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

            # PnL plot
            ax1.plot(timestamps, metrics['train_pnl'], 'g-', label='Training PnL')
            ax1.plot(timestamps, metrics['val_pnl'], 'r-', label='Validation PnL')
            ax1.set_title('PnL Over Time')
            ax1.set_ylabel('PnL')
            ax1.legend()
            ax1.grid(True)

            # Win rate plot
            ax2.plot(timestamps, metrics['train_win_rate'], 'b-', label='Training Win Rate')
            ax2.plot(timestamps, metrics['val_win_rate'], 'm-', label='Validation Win Rate')
            ax2.set_title('Win Rate Over Time')
            ax2.set_ylabel('Win Rate')
            ax2.legend()
            ax2.grid(True)

            # Format x-axis
            ax2.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d %H:%M'))
            plt.xticks(rotation=45)

            # Save the plot
            plt.tight_layout()
            plt.savefig(os.path.join(plots_dir, 'pnl_winrate.png'))
            plt.close()

            logger.info(f"Performance visualizations saved to {plots_dir}")
            return True
        except Exception as e:
            logger.error(f"Error generating plots: {str(e)}")
            import traceback
            logger.error(traceback.format_exc())
            return False

    def extract_hidden_features(self, X):
        """
        Extract hidden features from the model - the output of the shared fully
        connected layer, before the action/price heads.

        Args:
            X: Input data

        Returns:
            Hidden features (output of the shared dense layer)
        """
        # Convert to a PyTorch tensor
        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)

        # Replicate the forward pass up to (and including) the shared layers
        self.model.eval()
        with torch.no_grad():
            x_t = X_tensor.transpose(1, 2)
            conv1_out = self.model.conv1(x_t)
            conv2_out = self.model.adaptive_pool(self.model.conv2(conv1_out))
            micro_out = self.model.adaptive_pool(self.model.micro_conv(x_t))

            conv2_flat = conv2_out.reshape(conv2_out.size(0), -1)
            micro_flat = micro_out.reshape(micro_out.size(0), -1)
            features = torch.cat([conv2_flat, micro_flat], dim=1)

            hidden = self.model.shared_fc(features)

        return hidden.cpu().numpy()
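

# A minimal smoke test on synthetic data (illustrative only: real usage feeds
# windowed OHLCV features, with the close price at column index 3 as assumed
# by predict()).
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Toy dimensions: 4 samples, a 20-step window, 5 features per step
    demo_model = CNNModelPyTorch(window_size=20, num_features=5, timeframes=["1m"])
    demo_X = np.random.randn(4, 20, 5).astype(np.float32)

    demo_probs, demo_prices = demo_model.predict(demo_X)
    print("action probabilities:", demo_probs.shape)  # (4, 3)
    print("price predictions:  ", demo_prices.shape)  # (4, 1)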