cleanup, CNN fixes

2025-07-05 00:12:40 +03:00
parent ce8c00a9d1
commit 5ca7493708
18 changed files with 587 additions and 5181 deletions
--- a/NN/models/init.py
+++ b/NN/models/init.py
@@ -4,17 +4,16 @@ Neural Network Models

 This package contains the neural network models used in the trading system:
 - CNN Model: Deep convolutional neural network for feature extraction  
- Transformer Model: Processes high-level features for improved pattern recognition
- MoE: Mixture of Experts model that combines multiple neural networks
+- DQN Agent: Deep Q-Network for reinforcement learning
+- COB RL Model: Specialized RL model for order book data
+- Advanced Transformer: High-performance transformer for trading

 PyTorch implementation only.
 """

-from NN.models.cnn_model_pytorch import EnhancedCNNModel as CNNModel
-from NN.models.transformer_model_pytorch import (
-    TransformerModelPyTorch as TransformerModel,
-    MixtureOfExpertsModelPyTorch as MixtureOfExpertsModel
-)
+from NN.models.cnn_model import EnhancedCNNModel as CNNModel
+from NN.models.dqn_agent import DQNAgent
 from NN.models.cob_rl_model import MassiveRLNetwork, COBRLModelInterface
+from NN.models.advanced_transformer_trading import AdvancedTradingTransformer, TradingTransformerConfig

-__all__ = ['CNNModel', 'TransformerModel', 'MixtureOfExpertsModel', 'MassiveRLNetwork', 'COBRLModelInterface']
+__all__ = ['CNNModel', 'DQNAgent', 'MassiveRLNetwork', 'COBRLModelInterface', 'AdvancedTradingTransformer', 'TradingTransformerConfig']
--- a/NN/models/cnn_model.py
+++ b/NN/models/cnn_model.py
@@ -329,13 +329,13 @@ class EnhancedCNNModel(nn.Module):
            x = x.unsqueeze(0)
        elif len(x.shape) > 3:
            # Input has extra dimensions - flatten to [batch, seq, features]
-            x = x.view(x.shape[0], -1, x.shape[-1])
+            x = x.reshape(x.shape[0], -1, x.shape[-1])
        
        x = self._memory_barrier(x)  # Apply barrier after shape changes
        batch_size, seq_len, features = x.shape
        
        # Reshape for processing: [batch, seq, features] -> [batch*seq, features]
-        x_reshaped = x.view(-1, features)
+        x_reshaped = x.reshape(-1, features)
        x_reshaped = self._memory_barrier(x_reshaped)
        
        # Input embedding
@@ -343,7 +343,7 @@ class EnhancedCNNModel(nn.Module):
        embedded = self._memory_barrier(embedded)
        
        # Reshape back for conv1d: [batch*seq, channels] -> [batch, channels, seq]
-        embedded = embedded.view(batch_size, seq_len, -1).transpose(1, 2).contiguous()
+        embedded = embedded.reshape(batch_size, seq_len, -1).transpose(1, 2).contiguous()
        embedded = self._memory_barrier(embedded)
        
        # Multi-scale feature extraction - ensure each path creates independent tensors
@@ -380,10 +380,10 @@ class EnhancedCNNModel(nn.Module):
        
        # Global aggregation - create independent tensors
        avg_pooled = self.global_pool(attended_features)
-        avg_pooled = self._memory_barrier(avg_pooled.view(avg_pooled.shape[0], -1))  # Flatten instead of squeeze
+        avg_pooled = self._memory_barrier(avg_pooled.reshape(avg_pooled.shape[0], -1))  # Flatten instead of squeeze
        
        max_pooled = self.global_max_pool(attended_features) 
-        max_pooled = self._memory_barrier(max_pooled.view(max_pooled.shape[0], -1))  # Flatten instead of squeeze
+        max_pooled = self._memory_barrier(max_pooled.reshape(max_pooled.shape[0], -1))  # Flatten instead of squeeze
        
        # Combine global features - create new tensor
        global_features = torch.cat([avg_pooled, max_pooled], dim=1)
@@ -399,7 +399,7 @@ class EnhancedCNNModel(nn.Module):
        
        # Combine all features for final decision (8 regime classes + 1 volatility)
        # Create completely independent tensors for concatenation
-        vol_pred_flat = self._memory_barrier(volatility_pred.view(volatility_pred.shape[0], -1))  # Flatten instead of squeeze
+        vol_pred_flat = self._memory_barrier(volatility_pred.reshape(volatility_pred.shape[0], -1))  # Flatten instead of squeeze
        combined_features = torch.cat([processed_features, regime_probs, vol_pred_flat], dim=1)
        combined_features = self._memory_barrier(combined_features)
        
@@ -411,15 +411,15 @@ class EnhancedCNNModel(nn.Module):
        trading_probs = self._memory_barrier(F.softmax(scaled_logits, dim=1))
        
        # Flatten confidence to ensure consistent shape
-        confidence_flat = self._memory_barrier(confidence.view(confidence.shape[0], -1))
-        volatility_flat = self._memory_barrier(volatility_pred.view(volatility_pred.shape[0], -1))
+        confidence_flat = self._memory_barrier(confidence.reshape(confidence.shape[0], -1))
+        volatility_flat = self._memory_barrier(volatility_pred.reshape(volatility_pred.shape[0], -1))
        
        return {
            'logits': self._memory_barrier(trading_logits),
            'probabilities': self._memory_barrier(trading_probs),
-            'confidence': confidence_flat[:, 0] if confidence_flat.shape[1] > 0 else confidence_flat.view(-1)[0],
+            'confidence': confidence_flat[:, 0] if confidence_flat.shape[1] > 0 else confidence_flat.reshape(-1)[0],
            'regime': self._memory_barrier(regime_probs),
-            'volatility': volatility_flat[:, 0] if volatility_flat.shape[1] > 0 else volatility_flat.view(-1)[0],
+            'volatility': volatility_flat[:, 0] if volatility_flat.shape[1] > 0 else volatility_flat.reshape(-1)[0],
            'features': self._memory_barrier(processed_features)
        }
    
--- a/NN/models/cnn_model_pytorch.py
+++ b/NN/models/cnn_model_pytorch.py
@@ -1,610 +0,0 @@
-#!/usr/bin/env python3
-"""
-Enhanced CNN Model for Trading - PyTorch Implementation
-Much larger and more sophisticated architecture for better learning
-"""
-
-import os
-import logging
-import numpy as np
-import matplotlib.pyplot as plt
-from datetime import datetime
-import math
-
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-import torch.nn.functional as F
-from typing import Dict, Any, Optional, Tuple
-
-# Configure logging
-logger = logging.getLogger(__name__)
-
-class MultiHeadAttention(nn.Module):
-    """Multi-head attention mechanism for sequence data"""
-    
-    def __init__(self, d_model: int, num_heads: int = 8, dropout: float = 0.1):
-        super().__init__()
-        assert d_model % num_heads == 0
-        
-        self.d_model = d_model
-        self.num_heads = num_heads
-        self.d_k = d_model // num_heads
-        
-        self.w_q = nn.Linear(d_model, d_model)
-        self.w_k = nn.Linear(d_model, d_model)
-        self.w_v = nn.Linear(d_model, d_model)
-        self.w_o = nn.Linear(d_model, d_model)
-        
-        self.dropout = nn.Dropout(dropout)
-        self.scale = math.sqrt(self.d_k)
-    
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        batch_size, seq_len, _ = x.size()
-        
-        # Compute Q, K, V
-        Q = self.w_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
-        K = self.w_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
-        V = self.w_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
-        
-        # Attention weights
-        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
-        attention_weights = F.softmax(scores, dim=-1)
-        attention_weights = self.dropout(attention_weights)
-        
-        # Apply attention
-        attention_output = torch.matmul(attention_weights, V)
-        attention_output = attention_output.transpose(1, 2).contiguous().view(
-            batch_size, seq_len, self.d_model
-        )
-        
-        return self.w_o(attention_output)
-
-class ResidualBlock(nn.Module):
-    """Residual block with normalization and dropout"""
-    
-    def __init__(self, channels: int, dropout: float = 0.1):
-        super().__init__()
-        self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
-        self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
-        self.norm1 = nn.BatchNorm1d(channels)
-        self.norm2 = nn.BatchNorm1d(channels)
-        self.dropout = nn.Dropout(dropout)
-        
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        residual = x
-        
-        out = F.relu(self.norm1(self.conv1(x)))
-        out = self.dropout(out)
-        out = self.norm2(self.conv2(out))
-        
-        # Add residual connection (avoid in-place operation)
-        out = out + residual
-        return F.relu(out)
-
-class SpatialAttentionBlock(nn.Module):
-    """Spatial attention for feature maps"""
-    
-    def __init__(self, channels: int):
-        super().__init__()
-        self.conv = nn.Conv1d(channels, 1, kernel_size=1)
-        
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # Compute attention weights
-        attention = torch.sigmoid(self.conv(x))
-        # Avoid in-place operation by creating new tensor
-        return torch.mul(x, attention)
-
-class EnhancedCNNModel(nn.Module):
-    """
-    Much larger and more sophisticated CNN architecture for trading
-    Features:
-    - Deep convolutional layers with residual connections
-    - Multi-head attention mechanisms
-    - Spatial attention blocks
-    - Multiple feature extraction paths
-    - Large capacity for complex pattern learning
-    """
-    
-    def __init__(self, 
-                 input_size: int = 60,
-                 feature_dim: int = 50,
-                 output_size: int = 2,  # BUY/SELL for 2-action system
-                 base_channels: int = 256,  # Increased from 128 to 256
-                 num_blocks: int = 12,  # Increased from 6 to 12
-                 num_attention_heads: int = 16,  # Increased from 8 to 16
-                 dropout_rate: float = 0.2):
-        super().__init__()
-        
-        self.input_size = input_size
-        self.feature_dim = feature_dim
-        self.output_size = output_size
-        self.base_channels = base_channels
-        
-        # Much larger input embedding - project features to higher dimension
-        self.input_embedding = nn.Sequential(
-            nn.Linear(feature_dim, base_channels // 2),
-            nn.BatchNorm1d(base_channels // 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            nn.Linear(base_channels // 2, base_channels),
-            nn.BatchNorm1d(base_channels),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate)
-        )
-        
-        # Multi-scale convolutional feature extraction with more channels
-        self.conv_path1 = self._build_conv_path(base_channels, base_channels, 3)
-        self.conv_path2 = self._build_conv_path(base_channels, base_channels, 5)
-        self.conv_path3 = self._build_conv_path(base_channels, base_channels, 7)
-        self.conv_path4 = self._build_conv_path(base_channels, base_channels, 9)  # Additional path
-        
-        # Feature fusion with more capacity
-        self.feature_fusion = nn.Sequential(
-            nn.Conv1d(base_channels * 4, base_channels * 3, kernel_size=1),  # 4 paths now
-            nn.BatchNorm1d(base_channels * 3),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            nn.Conv1d(base_channels * 3, base_channels * 2, kernel_size=1),
-            nn.BatchNorm1d(base_channels * 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate)
-        )
-        
-        # Much deeper residual blocks for complex pattern learning
-        self.residual_blocks = nn.ModuleList([
-            ResidualBlock(base_channels * 2, dropout_rate) for _ in range(num_blocks)
-        ])
-        
-        # More spatial attention blocks
-        self.spatial_attention = nn.ModuleList([
-            SpatialAttentionBlock(base_channels * 2) for _ in range(6)  # Increased from 3 to 6
-        ])
-        
-        # Multiple temporal attention layers
-        self.temporal_attention1 = MultiHeadAttention(
-            d_model=base_channels * 2,
-            num_heads=num_attention_heads,
-            dropout=dropout_rate
-        )
-        self.temporal_attention2 = MultiHeadAttention(
-            d_model=base_channels * 2,
-            num_heads=num_attention_heads // 2,
-            dropout=dropout_rate
-        )
-        
-        # Global feature aggregation
-        self.global_pool = nn.AdaptiveAvgPool1d(1)
-        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
-        
-        # Much larger advanced feature processing
-        self.advanced_features = nn.Sequential(
-            nn.Linear(base_channels * 4, base_channels * 6),  # Increased capacity
-            nn.BatchNorm1d(base_channels * 6),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels * 6, base_channels * 4),
-            nn.BatchNorm1d(base_channels * 4),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels * 4, base_channels * 3),
-            nn.BatchNorm1d(base_channels * 3),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels * 3, base_channels * 2),
-            nn.BatchNorm1d(base_channels * 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels * 2, base_channels),
-            nn.BatchNorm1d(base_channels),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate)
-        )
-        
-        # Enhanced market regime detection branch
-        self.regime_detector = nn.Sequential(
-            nn.Linear(base_channels, base_channels // 2),
-            nn.BatchNorm1d(base_channels // 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            nn.Linear(base_channels // 2, base_channels // 4),
-            nn.BatchNorm1d(base_channels // 4),
-            nn.ReLU(),
-            nn.Linear(base_channels // 4, 8),  # 8 market regimes instead of 4
-            nn.Softmax(dim=1)
-        )
-        
-        # Enhanced volatility prediction branch
-        self.volatility_predictor = nn.Sequential(
-            nn.Linear(base_channels, base_channels // 2),
-            nn.BatchNorm1d(base_channels // 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            nn.Linear(base_channels // 2, base_channels // 4),
-            nn.BatchNorm1d(base_channels // 4),
-            nn.ReLU(),
-            nn.Linear(base_channels // 4, 1),
-            nn.Sigmoid()
-        )
-        
-        # Main trading decision head
-        self.decision_head = nn.Sequential(
-            nn.Linear(base_channels + 8 + 1, base_channels),  # 8 regime classes + 1 volatility
-            nn.BatchNorm1d(base_channels),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels, base_channels // 2),
-            nn.BatchNorm1d(base_channels // 2),
-            nn.ReLU(),
-            nn.Dropout(dropout_rate),
-            
-            nn.Linear(base_channels // 2, output_size)
-        )
-        
-        # Confidence estimation head
-        self.confidence_head = nn.Sequential(
-            nn.Linear(base_channels, base_channels // 2),
-            nn.ReLU(),
-            nn.Linear(base_channels // 2, 1),
-            nn.Sigmoid()
-        )
-        
-        # Initialize weights
-        self._initialize_weights()
-        
-    def _build_conv_path(self, in_channels: int, out_channels: int, kernel_size: int) -> nn.Module:
-        """Build a convolutional path with multiple layers"""
-        return nn.Sequential(
-            nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2),
-            nn.BatchNorm1d(out_channels),
-            nn.ReLU(),
-            nn.Dropout(0.1),
-            
-            nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
-            nn.BatchNorm1d(out_channels),
-            nn.ReLU(),
-            nn.Dropout(0.1),
-            
-            nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
-            nn.BatchNorm1d(out_channels),
-            nn.ReLU()
-        )
-    
-    def _initialize_weights(self):
-        """Initialize model weights"""
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                nn.init.xavier_normal_(m.weight)
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.BatchNorm1d):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-    
-    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
-        """
-        Forward pass with multiple outputs
-        Args:
-            x: Input tensor of shape [batch_size, sequence_length, features]
-        Returns:
-            Dictionary with predictions, confidence, regime, and volatility
-        """
-        batch_size, seq_len, features = x.shape
-        
-        # Reshape for processing: [batch, seq, features] -> [batch*seq, features]
-        x_reshaped = x.reshape(-1, features)
-        x_reshaped = self._memory_barrier(x_reshaped)
-        
-        # Input embedding
-        embedded = self.input_embedding(x_reshaped)  # [batch*seq, base_channels]
-        embedded = self._memory_barrier(embedded)
-        
-        # Reshape back for conv1d: [batch*seq, channels] -> [batch, channels, seq]
-        embedded = embedded.reshape(batch_size, seq_len, -1).transpose(1, 2).contiguous()
-        
-        # Multi-scale feature extraction
-        path1 = self.conv_path1(embedded)
-        path2 = self.conv_path2(embedded)
-        path3 = self.conv_path3(embedded)
-        path4 = self.conv_path4(embedded)
-        
-        # Feature fusion
-        fused_features = torch.cat([path1, path2, path3, path4], dim=1)
-        fused_features = self.feature_fusion(fused_features)
-        
-        # Apply residual blocks with spatial attention
-        current_features = fused_features
-        for i, (res_block, attention) in enumerate(zip(self.residual_blocks, self.spatial_attention)):
-            current_features = res_block(current_features)
-            if i % 2 == 0:  # Apply attention every other block
-                current_features = attention(current_features)
-        
-        # Apply remaining residual blocks
-        for res_block in self.residual_blocks[len(self.spatial_attention):]:
-            current_features = res_block(current_features)
-        
-        # Temporal attention - apply both attention layers
-        # Reshape for attention: [batch, channels, seq] -> [batch, seq, channels]
-        attention_input = current_features.transpose(1, 2)
-        attended_features = self.temporal_attention1(attention_input)
-        attended_features = self.temporal_attention2(attended_features)
-        # Back to conv format: [batch, seq, channels] -> [batch, channels, seq]
-        attended_features = attended_features.transpose(1, 2)
-        
-        # Global aggregation
-        avg_pooled = self.global_pool(attended_features).squeeze(-1)  # [batch, channels]
-        max_pooled = self.global_max_pool(attended_features).squeeze(-1)  # [batch, channels]
-        
-        # Combine global features
-        global_features = torch.cat([avg_pooled, max_pooled], dim=1)
-        
-        # Advanced feature processing
-        processed_features = self.advanced_features(global_features)
-        
-        # Multi-task predictions
-        regime_probs = self.regime_detector(processed_features)
-        volatility_pred = self.volatility_predictor(processed_features)
-        confidence = self.confidence_head(processed_features)
-        
-        # Combine all features for final decision (8 regime classes + 1 volatility)
-        combined_features = torch.cat([processed_features, regime_probs, volatility_pred], dim=1)
-        trading_logits = self.decision_head(combined_features)
-        
-        # Apply temperature scaling for better calibration
-        temperature = 1.5
-        trading_probs = F.softmax(trading_logits / temperature, dim=1)
-        
-        return {
-            'logits': trading_logits,
-            'probabilities': trading_probs,
-            'confidence': confidence.squeeze(-1),
-            'regime': regime_probs,
-            'volatility': volatility_pred.squeeze(-1),
-            'features': processed_features
-        }
-    
-    def predict(self, feature_matrix: np.ndarray) -> Dict[str, Any]:
-        """
-        Make predictions on feature matrix
-        Args:
-            feature_matrix: numpy array of shape [sequence_length, features]
-        Returns:
-            Dictionary with prediction results
-        """
-        self.eval()
-        
-        with torch.no_grad():
-            # Convert to tensor and add batch dimension
-            if isinstance(feature_matrix, np.ndarray):
-                x = torch.FloatTensor(feature_matrix).unsqueeze(0)  # Add batch dim
-            else:
-                x = feature_matrix.unsqueeze(0)
-            
-            # Move to device
-            device = next(self.parameters()).device
-            x = x.to(device)
-            
-            # Forward pass
-            outputs = self.forward(x)
-            
-            # Extract results with proper shape handling
-            probs = outputs['probabilities'].cpu().numpy()[0]
-            confidence_tensor = outputs['confidence'].cpu().numpy()
-            regime = outputs['regime'].cpu().numpy()[0]
-            volatility_tensor = outputs['volatility'].cpu().numpy()
-            
-            # Handle confidence shape properly to avoid scalar conversion errors
-            if isinstance(confidence_tensor, np.ndarray):
-                if confidence_tensor.ndim == 0:
-                    confidence = float(confidence_tensor.item())
-                elif confidence_tensor.size == 1:
-                    confidence = float(confidence_tensor.flatten()[0])
-                else:
-                    confidence = float(confidence_tensor[0] if len(confidence_tensor) > 0 else 0.7)
-            else:
-                confidence = float(confidence_tensor)
-            
-            # Handle volatility shape properly
-            if isinstance(volatility_tensor, np.ndarray):
-                if volatility_tensor.ndim == 0:
-                    volatility = float(volatility_tensor.item())
-                elif volatility_tensor.size == 1:
-                    volatility = float(volatility_tensor.flatten()[0])
-                else:
-                    volatility = float(volatility_tensor[0] if len(volatility_tensor) > 0 else 0.0)
-            else:
-                volatility = float(volatility_tensor)
-            
-            # Determine action (0=BUY, 1=SELL for 2-action system)
-            action = int(np.argmax(probs))
-            action_confidence = float(probs[action])
-            
-            return {
-                'action': action,
-                'action_name': 'BUY' if action == 0 else 'SELL',
-                'confidence': confidence,  # Already converted to float above
-                'action_confidence': action_confidence,
-                'probabilities': probs.tolist(),
-                'regime_probabilities': regime.tolist(),
-                'volatility_prediction': volatility,  # Already converted to float above
-                'raw_logits': outputs['logits'].cpu().numpy()[0].tolist()
-            }
-    
-    def get_memory_usage(self) -> Dict[str, Any]:
-        """Get model memory usage statistics"""
-        total_params = sum(p.numel() for p in self.parameters())
-        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
-        
-        param_size = sum(p.numel() * p.element_size() for p in self.parameters())
-        buffer_size = sum(b.numel() * b.element_size() for b in self.buffers())
-        
-        return {
-            'total_parameters': total_params,
-            'trainable_parameters': trainable_params,
-            'parameter_size_mb': param_size / (1024 * 1024),
-            'buffer_size_mb': buffer_size / (1024 * 1024),
-            'total_size_mb': (param_size + buffer_size) / (1024 * 1024)
-        }
-    
-    def to_device(self, device: str):
-        """Move model to specified device"""
-        return self.to(torch.device(device))
-
-class CNNModelTrainer:
-    """Enhanced trainer for the beefed-up CNN model"""
-    
-    def __init__(self, model: EnhancedCNNModel, learning_rate: float = 0.0001, device: str = 'cuda'):
-        self.model = model.to(device)
-        self.device = device
-        self.learning_rate = learning_rate
-        
-        # Use AdamW optimizer with weight decay
-        self.optimizer = torch.optim.AdamW(
-            model.parameters(), 
-            lr=learning_rate, 
-            weight_decay=0.01,
-            betas=(0.9, 0.999)
-        )
-        
-        # Learning rate scheduler
-        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
-            self.optimizer,
-            max_lr=learning_rate * 10,
-            total_steps=10000,  # Will be updated based on actual training
-            pct_start=0.1,
-            anneal_strategy='cos'
-        )
-        
-        # Multi-task loss functions
-        self.main_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
-        self.confidence_criterion = nn.BCELoss()
-        self.regime_criterion = nn.CrossEntropyLoss()
-        self.volatility_criterion = nn.MSELoss()
-        
-        self.training_history = []
-        
-    def train_step(self, x: torch.Tensor, y: torch.Tensor, 
-                   confidence_targets: Optional[torch.Tensor] = None,
-                   regime_targets: Optional[torch.Tensor] = None,
-                   volatility_targets: Optional[torch.Tensor] = None) -> Dict[str, float]:
-        """Single training step with multi-task learning"""
-        
-        self.model.train()
-        self.optimizer.zero_grad()
-        
-        # Forward pass
-        outputs = self.model(x)
-        
-        # Main trading loss
-        main_loss = self.main_criterion(outputs['logits'], y)
-        total_loss = main_loss
-        
-        losses = {'main_loss': main_loss.item()}
-        
-        # Confidence loss (if targets provided)
-        if confidence_targets is not None:
-            conf_loss = self.confidence_criterion(outputs['confidence'], confidence_targets)
-            total_loss += 0.1 * conf_loss
-            losses['confidence_loss'] = conf_loss.item()
-        
-        # Regime classification loss (if targets provided)
-        if regime_targets is not None:
-            regime_loss = self.regime_criterion(outputs['regime'], regime_targets)
-            total_loss += 0.05 * regime_loss
-            losses['regime_loss'] = regime_loss.item()
-        
-        # Volatility prediction loss (if targets provided)
-        if volatility_targets is not None:
-            vol_loss = self.volatility_criterion(outputs['volatility'], volatility_targets)
-            total_loss += 0.05 * vol_loss
-            losses['volatility_loss'] = vol_loss.item()
-        
-        losses['total_loss'] = total_loss.item()
-        
-        # Backward pass
-        total_loss.backward()
-        
-        # Gradient clipping
-        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
-        
-        self.optimizer.step()
-        self.scheduler.step()
-        
-        # Calculate accuracy
-        with torch.no_grad():
-            predictions = torch.argmax(outputs['probabilities'], dim=1)
-            accuracy = (predictions == y).float().mean().item()
-            losses['accuracy'] = accuracy
-        
-        return losses
-    
-    def save_model(self, filepath: str, metadata: Optional[Dict] = None):
-        """Save model with metadata"""
-        save_dict = {
-            'model_state_dict': self.model.state_dict(),
-            'optimizer_state_dict': self.optimizer.state_dict(),
-            'scheduler_state_dict': self.scheduler.state_dict(),
-            'training_history': self.training_history,
-            'model_config': {
-                'input_size': self.model.input_size,
-                'feature_dim': self.model.feature_dim,
-                'output_size': self.model.output_size,
-                'base_channels': self.model.base_channels
-            }
-        }
-        
-        if metadata:
-            save_dict['metadata'] = metadata
-            
-        torch.save(save_dict, filepath)
-        logger.info(f"Enhanced CNN model saved to {filepath}")
-    
-    def load_model(self, filepath: str) -> Dict:
-        """Load model from file"""
-        checkpoint = torch.load(filepath, map_location=self.device)
-        
-        self.model.load_state_dict(checkpoint['model_state_dict'])
-        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
-        
-        if 'scheduler_state_dict' in checkpoint:
-            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
-        
-        if 'training_history' in checkpoint:
-            self.training_history = checkpoint['training_history']
-        
-        logger.info(f"Enhanced CNN model loaded from {filepath}")
-        return checkpoint.get('metadata', {})
-
-def create_enhanced_cnn_model(input_size: int = 60, 
-                            feature_dim: int = 50, 
-                            output_size: int = 2,
-                            base_channels: int = 256,
-                            device: str = 'cuda') -> Tuple[EnhancedCNNModel, CNNModelTrainer]:
-    """Create enhanced CNN model and trainer"""
-    
-    model = EnhancedCNNModel(
-        input_size=input_size,
-        feature_dim=feature_dim,
-        output_size=output_size,
-        base_channels=base_channels,
-        num_blocks=12,
-        num_attention_heads=16,
-        dropout_rate=0.2
-    )
-    
-    trainer = CNNModelTrainer(model, learning_rate=0.0001, device=device)
-    
-    logger.info(f"Created enhanced CNN model with {model.get_memory_usage()['total_parameters']:,} parameters")
-    
-    return model, trainer
--- a/NN/models/dqn_agent.py
+++ b/NN/models/dqn_agent.py
@@ -461,6 +461,10 @@ class DQNAgent:
        action_values = q_values.cpu().data.numpy()[0]
        
        # Calculate confidence scores
+        # Ensure q_values has correct shape for softmax
+        if q_values.dim() == 1:
+            q_values = q_values.unsqueeze(0)
+        
        sell_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
        buy_confidence = torch.softmax(q_values, dim=1)[0, 1].item()
        
@@ -486,6 +490,10 @@ class DQNAgent:
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values = self.policy_net(state_tensor)
            
+            # Ensure q_values has correct shape for softmax
+            if q_values.dim() == 1:
+                q_values = q_values.unsqueeze(0)
+            
            # Convert Q-values to probabilities
            action_probs = torch.softmax(q_values, dim=1)
            action = q_values.argmax().item()
--- a/NN/models/enhanced_cnn_with_orderbook.py
+++ b/NN/models/enhanced_cnn_with_orderbook.py
@@ -1,603 +0,0 @@
-"""
-Enhanced CNN Model with Bookmap Order Book Integration
-
-This module extends the enhanced CNN to incorporate:
- Traditional market data (OHLCV, indicators)
- Order book depth features (COB)
- Volume profile features (SVP)
- Order flow signals (sweeps, absorptions, momentum)
- Market microstructure metrics
-
-The integrated model provides comprehensive market awareness for superior trading decisions.
-"""
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import numpy as np
-import logging
-from typing import Dict, List, Optional, Tuple, Any
-
-logger = logging.getLogger(__name__)
-
-class ResidualBlock(nn.Module):
-    """Enhanced residual block with skip connections"""
-    
-    def __init__(self, in_channels, out_channels, stride=1):
-        super(ResidualBlock, self).__init__()
-        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
-        self.bn1 = nn.BatchNorm1d(out_channels)
-        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
-        self.bn2 = nn.BatchNorm1d(out_channels)
-        
-        # Shortcut connection
-        self.shortcut = nn.Sequential()
-        if stride != 1 or in_channels != out_channels:
-            self.shortcut = nn.Sequential(
-                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
-                nn.BatchNorm1d(out_channels)
-            )
-    
-    def forward(self, x):
-        out = F.relu(self.bn1(self.conv1(x)))
-        out = self.bn2(self.conv2(out))
-        # Avoid in-place operation
-        out = out + self.shortcut(x)
-        out = F.relu(out)
-        return out
-
-class MultiHeadAttention(nn.Module):
-    """Multi-head attention mechanism"""
-    
-    def __init__(self, dim, num_heads=8, dropout=0.1):
-        super(MultiHeadAttention, self).__init__()
-        self.dim = dim
-        self.num_heads = num_heads
-        self.head_dim = dim // num_heads
-        
-        self.q_linear = nn.Linear(dim, dim)
-        self.k_linear = nn.Linear(dim, dim)
-        self.v_linear = nn.Linear(dim, dim)
-        self.dropout = nn.Dropout(dropout)
-        self.out = nn.Linear(dim, dim)
-    
-    def forward(self, x):
-        batch_size, seq_len, dim = x.size()
-        
-        # Linear transformations
-        q = self.q_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
-        k = self.k_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
-        v = self.v_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
-        
-        # Transpose for attention
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        
-        # Scaled dot-product attention
-        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.head_dim)
-        attn_weights = F.softmax(scores, dim=-1)
-        attn_weights = self.dropout(attn_weights)
-        
-        attn_output = torch.matmul(attn_weights, v)
-        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, dim)
-        
-        return self.out(attn_output), attn_weights
-
-class OrderBookEncoder(nn.Module):
-    """Specialized encoder for order book data"""
-    
-    def __init__(self, input_dim=100, hidden_dim=512):
-        super(OrderBookEncoder, self).__init__()
-        
-        # Order book feature processing
-        self.bid_encoder = nn.Sequential(
-            nn.Linear(40, 128),  # 20 levels x 2 features
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(128, 256),
-            nn.ReLU(),
-            nn.Dropout(0.2)
-        )
-        
-        self.ask_encoder = nn.Sequential(
-            nn.Linear(40, 128),  # 20 levels x 2 features
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(128, 256),
-            nn.ReLU(),
-            nn.Dropout(0.2)
-        )
-        
-        # Microstructure features
-        self.microstructure_encoder = nn.Sequential(
-            nn.Linear(15, 64),  # Liquidity + imbalance + flow features
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(64, 128),
-            nn.ReLU(),
-            nn.Dropout(0.2)
-        )
-        
-        # Cross-attention between bids and asks
-        self.cross_attention = MultiHeadAttention(256, num_heads=8)
-        
-        # Output projection
-        self.output_projection = nn.Sequential(
-            nn.Linear(256 + 256 + 128, hidden_dim),  # Combine all features
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(hidden_dim, hidden_dim)
-        )
-    
-    def forward(self, orderbook_features):
-        """
-        Process order book features
-        
-        Args:
-            orderbook_features: Tensor of shape [batch, 100] containing:
-                - 40 bid features (20 levels x 2)
-                - 40 ask features (20 levels x 2) 
-                - 15 microstructure features
-                - 5 flow signal features
-        """
-        # Split features
-        bid_features = orderbook_features[:, :40]      # First 40 features
-        ask_features = orderbook_features[:, 40:80]    # Next 40 features
-        micro_features = orderbook_features[:, 80:95]  # Next 15 features
-        # flow_features = orderbook_features[:, 95:100]  # Last 5 features (included in micro)
-        
-        # Encode each component
-        bid_encoded = self.bid_encoder(bid_features)      # [batch, 256]
-        ask_encoded = self.ask_encoder(ask_features)      # [batch, 256]
-        micro_encoded = self.microstructure_encoder(micro_features)  # [batch, 128]
-        
-        # Add sequence dimension for attention
-        bid_seq = bid_encoded.unsqueeze(1)  # [batch, 1, 256]
-        ask_seq = ask_encoded.unsqueeze(1)  # [batch, 1, 256]
-        
-        # Cross-attention between bids and asks
-        combined_seq = torch.cat([bid_seq, ask_seq], dim=1)  # [batch, 2, 256]
-        attended_features, attention_weights = self.cross_attention(combined_seq)
-        
-        # Flatten attended features
-        attended_flat = attended_features.reshape(attended_features.size(0), -1)  # [batch, 512]
-        
-        # Combine with microstructure features
-        combined_features = torch.cat([attended_flat, micro_encoded], dim=1)  # [batch, 640]
-        
-        # Final projection
-        output = self.output_projection(combined_features)
-        
-        return output
-
-class VolumeProfileEncoder(nn.Module):
-    """Encoder for volume profile data"""
-    
-    def __init__(self, max_levels=50, hidden_dim=256):
-        super(VolumeProfileEncoder, self).__init__()
-        
-        self.max_levels = max_levels
-        
-        # Process volume profile levels
-        self.level_encoder = nn.Sequential(
-            nn.Linear(7, 32),  # price, volume, buy_vol, sell_vol, trades, vwap, net_vol
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(32, 64),
-            nn.ReLU()
-        )
-        
-        # Attention over price levels
-        self.level_attention = MultiHeadAttention(64, num_heads=4)
-        
-        # Final aggregation
-        self.aggregator = nn.Sequential(
-            nn.Linear(64, hidden_dim),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(hidden_dim, hidden_dim)
-        )
-    
-    def forward(self, volume_profile_data):
-        """
-        Process volume profile data
-        
-        Args:
-            volume_profile_data: List of dicts or tensor with volume profile levels
-        """
-        # If input is list of dicts, convert to tensor
-        if isinstance(volume_profile_data, list):
-            if not volume_profile_data:
-                # Return zero features if no data
-                return torch.zeros(1, 256, device=torch.device('cpu'))  # Hardcoded output dim as per hidden_dim in class init
-            
-            # Convert to tensor
-            features = []
-            for level in volume_profile_data[:self.max_levels]:
-                level_features = [
-                    level.get('price', 0.0),
-                    level.get('volume', 0.0),
-                    level.get('buy_volume', 0.0),
-                    level.get('sell_volume', 0.0),
-                    level.get('trades_count', 0.0),
-                    level.get('vwap', 0.0),
-                    level.get('net_volume', 0.0)
-                ]
-                features.append(level_features)
-            
-            # Pad if needed
-            while len(features) < self.max_levels:
-                features.append([0.0] * 7)
-            
-            volume_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
-        else:
-            volume_tensor = volume_profile_data
-        
-        batch_size, num_levels, feature_dim = volume_tensor.shape
-        
-        # Encode each level
-        level_features = self.level_encoder(volume_tensor.view(-1, feature_dim))
-        level_features = level_features.reshape(batch_size, num_levels, -1)
-        
-        # Apply attention across levels
-        attended_levels, _ = self.level_attention(level_features)
-        
-        # Global average pooling
-        aggregated = torch.mean(attended_levels, dim=1)
-        
-        # Final processing
-        output = self.aggregator(aggregated)
-        
-        return output
-
-class EnhancedCNNWithOrderBook(nn.Module):
-    """
-    Enhanced CNN model integrating traditional market data with order book analysis
-    
-    Features:
-    - Multi-scale convolutional processing for time series data
-    - Specialized order book feature extraction
-    - Volume profile analysis
-    - Order flow signal integration
-    - Multi-head attention mechanisms
-    - Dueling architecture for value and advantage estimation
-    """
-    
-    def __init__(self, 
-                 market_input_shape=(60, 50),  # Traditional market data
-                 orderbook_features=100,       # Order book feature dimension
-                 n_actions=2,
-                 confidence_threshold=0.5):
-        super(EnhancedCNNWithOrderBook, self).__init__()
-        
-        self.market_input_shape = market_input_shape
-        self.orderbook_features = orderbook_features
-        self.n_actions = n_actions
-        self.confidence_threshold = confidence_threshold
-        
-        # Traditional market data processing
-        self.market_encoder = self._build_market_encoder()
-        
-        # Order book data processing
-        self.orderbook_encoder = OrderBookEncoder(
-            input_dim=orderbook_features,
-            hidden_dim=512
-        )
-        
-        # Volume profile processing
-        self.volume_encoder = VolumeProfileEncoder(
-            max_levels=50,
-            hidden_dim=256
-        )
-        
-        # Feature fusion
-        total_features = 1024 + 512 + 256  # market + orderbook + volume
-        self.feature_fusion = nn.Sequential(
-            nn.Linear(total_features, 1536),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(1536, 1024),
-            nn.ReLU(),
-            nn.Dropout(0.3)
-        )
-        
-        # Multi-head attention for integrated features
-        self.integrated_attention = MultiHeadAttention(1024, num_heads=16)
-        
-        # Dueling architecture
-        self.advantage_stream = nn.Sequential(
-            nn.Linear(1024, 512),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(512, 256),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(256, n_actions)
-        )
-        
-        self.value_stream = nn.Sequential(
-            nn.Linear(1024, 512),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(512, 256),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(256, 1)
-        )
-        
-        # Auxiliary heads for multi-task learning
-        self.extrema_head = nn.Sequential(
-            nn.Linear(1024, 512),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(512, 256),
-            nn.ReLU(),
-            nn.Linear(256, 3)  # bottom, top, neither
-        )
-        
-        self.market_regime_head = nn.Sequential(
-            nn.Linear(1024, 512),
-            nn.ReLU(),
-            nn.Dropout(0.3),
-            nn.Linear(512, 256),
-            nn.ReLU(),
-            nn.Linear(256, 8)  # trending, ranging, volatile, etc.
-        )
-        
-        self.confidence_head = nn.Sequential(
-            nn.Linear(1024, 256),
-            nn.ReLU(),
-            nn.Linear(256, 1),
-            nn.Sigmoid()
-        )
-        
-        # Initialize weights
-        self._initialize_weights()
-        
-        # Device management
-        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.to(self.device)
-        
-        logger.info(f"Enhanced CNN with Order Book initialized")
-        logger.info(f"Market input shape: {market_input_shape}")
-        logger.info(f"Order book features: {orderbook_features}")
-        logger.info(f"Output actions: {n_actions}")
-    
-    def _build_market_encoder(self):
-        """Build traditional market data encoder"""
-        seq_len, feature_dim = self.market_input_shape
-        
-        return nn.Sequential(
-            # Input projection
-            nn.Linear(feature_dim, 128),
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            
-            # Convolutional layers for temporal patterns
-            nn.Conv1d(128, 256, kernel_size=5, padding=2),
-            nn.BatchNorm1d(256),
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            
-            ResidualBlock(256, 512),
-            ResidualBlock(512, 512),
-            ResidualBlock(512, 768),
-            ResidualBlock(768, 768),
-            
-            # Global pooling
-            nn.AdaptiveAvgPool1d(1),
-            nn.Flatten(),
-            
-            # Final projection
-            nn.Linear(768, 1024),
-            nn.ReLU(),
-            nn.Dropout(0.3)
-        )
-    
-    def _initialize_weights(self):
-        """Initialize model weights"""
-        for m in self.modules():
-            if isinstance(m, nn.Conv1d):
-                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.Linear):
-                nn.init.xavier_normal_(m.weight)
-                if m.bias is not None:
-                    nn.init.constant_(m.bias, 0)
-            elif isinstance(m, nn.BatchNorm1d):
-                nn.init.constant_(m.weight, 1)
-                nn.init.constant_(m.bias, 0)
-    
-    def forward(self, market_data, orderbook_data, volume_profile_data=None):
-        """
-        Forward pass through integrated model
-        
-        Args:
-            market_data: Traditional market data [batch, seq_len, features]
-            orderbook_data: Order book features [batch, orderbook_features]
-            volume_profile_data: Volume profile data (optional)
-        
-        Returns:
-            Dictionary with Q-values, confidence, regime, and auxiliary predictions
-        """
-        # Process market data - ensure batch dimension first
-        if len(market_data.shape) == 2:
-            market_data = market_data.unsqueeze(0)
-        
-        batch_size = market_data.size(0)  # Get correct batch size after shape adjustment
-        
-        # Reshape for convolutional processing with safe dimensions
-        market_reshaped = market_data.reshape(batch_size, -1, market_data.size(-1))
-        market_features = self.market_encoder(market_reshaped.transpose(1, 2))
-        
-        # Process order book data
-        orderbook_features = self.orderbook_encoder(orderbook_data)
-        
-        # Process volume profile data
-        if volume_profile_data is not None:
-            volume_features = self.volume_encoder(volume_profile_data)
-        else:
-            volume_features = torch.zeros(batch_size, 256, device=market_data.device)
-        
-        # Fuse all features
-        combined_features = torch.cat([
-            market_features,
-            orderbook_features,
-            volume_features
-        ], dim=1)
-        
-        # Feature fusion
-        fused_features = self.feature_fusion(combined_features)
-        
-        # Apply attention
-        attended_features = fused_features.unsqueeze(1)  # Add sequence dimension
-        attended_output, attention_weights = self.integrated_attention(attended_features)
-        final_features = attended_output.squeeze(1)  # Remove sequence dimension
-        
-        # Dueling architecture
-        advantage = self.advantage_stream(final_features)
-        value = self.value_stream(final_features)
-        
-        # Combine value and advantage
-        q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
-        
-        # Auxiliary predictions
-        extrema_pred = self.extrema_head(final_features)
-        regime_pred = self.market_regime_head(final_features)
-        confidence = self.confidence_head(final_features)
-        
-        return {
-            'q_values': q_values,
-            'confidence': confidence,
-            'extrema_prediction': extrema_pred,
-            'market_regime': regime_pred,
-            'attention_weights': attention_weights,
-            'integrated_features': final_features
-        }
-    
-    def predict(self, market_data, orderbook_data, volume_profile_data=None):
-        """Make prediction with confidence thresholding"""
-        self.eval()
-        
-        with torch.no_grad():
-            # Convert inputs to tensors if needed
-            if isinstance(market_data, np.ndarray):
-                market_data = torch.FloatTensor(market_data).to(self.device)
-            if isinstance(orderbook_data, np.ndarray):
-                orderbook_data = torch.FloatTensor(orderbook_data).to(self.device)
-            
-            # Ensure batch dimension
-            if len(market_data.shape) == 2:
-                market_data = market_data.unsqueeze(0)
-            if len(orderbook_data.shape) == 1:
-                orderbook_data = orderbook_data.unsqueeze(0)
-            
-            # Forward pass
-            outputs = self.forward(market_data, orderbook_data, volume_profile_data)
-            
-            # Get probabilities
-            q_values = outputs['q_values']
-            probs = F.softmax(q_values, dim=1)
-            
-            # Handle confidence shape properly to avoid scalar conversion errors
-            confidence_tensor = outputs['confidence']
-            if isinstance(confidence_tensor, torch.Tensor):
-                if confidence_tensor.numel() == 1:
-                    confidence = confidence_tensor.item()
-                else:
-                    confidence = confidence_tensor.flatten()[0].item()
-            else:
-                confidence = float(confidence_tensor)
-            
-            # Action selection with confidence thresholding
-            if confidence >= self.confidence_threshold:
-                action = torch.argmax(q_values, dim=1).item()
-            else:
-                action = None  # No action due to low confidence
-            
-            return {
-                'action': action,
-                'probabilities': probs.cpu().numpy()[0],
-                'confidence': confidence,
-                'q_values': q_values.cpu().numpy()[0],
-                'extrema_prediction': F.softmax(outputs['extrema_prediction'], dim=1).cpu().numpy()[0],
-                'market_regime': F.softmax(outputs['market_regime'], dim=1).cpu().numpy()[0]
-            }
-    
-    def get_feature_importance(self, market_data, orderbook_data, volume_profile_data=None):
-        """Analyze feature importance using gradients"""
-        self.eval()
-        
-        # Enable gradient computation for inputs
-        market_data.requires_grad_(True)
-        orderbook_data.requires_grad_(True)
-        
-        # Forward pass
-        outputs = self.forward(market_data, orderbook_data, volume_profile_data)
-        
-        # Compute gradients for Q-values
-        q_values = outputs['q_values']
-        q_values.sum().backward()
-        
-        # Get gradient magnitudes
-        market_importance = torch.abs(market_data.grad).mean().item()
-        orderbook_importance = torch.abs(orderbook_data.grad).mean().item()
-        
-        return {
-            'market_importance': market_importance,
-            'orderbook_importance': orderbook_importance,
-            'total_importance': market_importance + orderbook_importance
-        }
-    
-    def save(self, path):
-        """Save model state"""
-        torch.save({
-            'model_state_dict': self.state_dict(),
-            'market_input_shape': self.market_input_shape,
-            'orderbook_features': self.orderbook_features,
-            'n_actions': self.n_actions,
-            'confidence_threshold': self.confidence_threshold
-        }, path)
-        logger.info(f"Enhanced CNN with Order Book saved to {path}")
-    
-    def load(self, path):
-        """Load model state"""
-        checkpoint = torch.load(path, map_location=self.device)
-        self.load_state_dict(checkpoint['model_state_dict'])
-        logger.info(f"Enhanced CNN with Order Book loaded from {path}")
-    
-    def get_memory_usage(self):
-        """Get model memory usage statistics"""
-        total_params = sum(p.numel() for p in self.parameters())
-        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
-        
-        return {
-            'total_parameters': total_params,
-            'trainable_parameters': trainable_params,
-            'model_size_mb': total_params * 4 / (1024 * 1024),  # Assuming float32
-        }
-
-def create_enhanced_cnn_with_orderbook(
-    market_input_shape=(60, 50),
-    orderbook_features=100,
-    n_actions=2,
-    device='cuda'
-):
-    """Create and initialize enhanced CNN with order book integration"""
-    
-    model = EnhancedCNNWithOrderBook(
-        market_input_shape=market_input_shape,
-        orderbook_features=orderbook_features,
-        n_actions=n_actions
-    )
-    
-    if device and torch.cuda.is_available():
-        model = model.to(device)
-    
-    memory_usage = model.get_memory_usage()
-    logger.info(f"Created Enhanced CNN with Order Book: {memory_usage['total_parameters']:,} parameters")
-    logger.info(f"Model size: {memory_usage['model_size_mb']:.1f} MB")
-    
-    return model 
--- a/NN/models/transformer_model_pytorch.py
+++ b/NN/models/transformer_model_pytorch.py
@@ -1,653 +0,0 @@
-#!/usr/bin/env python3
-"""
-Transformer Model - PyTorch Implementation
-
-This module implements a Transformer model using PyTorch for time series analysis.
-The model consists of a Transformer encoder and a Mixture of Experts model.
-"""
-
-import os
-import logging
-import numpy as np
-import matplotlib.pyplot as plt
-from datetime import datetime
-
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from torch.utils.data import DataLoader, TensorDataset
-from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
-
-# Configure logging
-logger = logging.getLogger(__name__)
-
-class TransformerBlock(nn.Module):
-    """Transformer Block with self-attention mechanism"""
-    
-    def __init__(self, input_dim, num_heads=4, ff_dim=64, dropout=0.1):
-        super(TransformerBlock, self).__init__()
-        
-        self.attention = nn.MultiheadAttention(
-            embed_dim=input_dim,
-            num_heads=num_heads,
-            dropout=dropout,
-            batch_first=True
-        )
-        
-        self.feed_forward = nn.Sequential(
-            nn.Linear(input_dim, ff_dim),
-            nn.ReLU(),
-            nn.Linear(ff_dim, input_dim)
-        )
-        
-        self.layernorm1 = nn.LayerNorm(input_dim)
-        self.layernorm2 = nn.LayerNorm(input_dim)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-    
-    def forward(self, x):
-        # Self-attention
-        attn_output, _ = self.attention(x, x, x)
-        x = x + self.dropout1(attn_output)
-        x = self.layernorm1(x)
-        
-        # Feed forward
-        ff_output = self.feed_forward(x)
-        x = x + self.dropout2(ff_output)
-        x = self.layernorm2(x)
-        
-        return x
-
-class TransformerModelPyTorch(nn.Module):
-    """PyTorch Transformer model for time series analysis"""
-    
-    def __init__(self, input_shape, output_size=3, num_heads=4, ff_dim=64, num_transformer_blocks=2):
-        """
-        Initialize the Transformer model.
-        
-        Args:
-            input_shape (tuple): Shape of input data (window_size, features)
-            output_size (int): Size of output (1 for regression, 3 for classification)
-            num_heads (int): Number of attention heads
-            ff_dim (int): Feed forward dimension
-            num_transformer_blocks (int): Number of transformer blocks
-        """
-        super(TransformerModelPyTorch, self).__init__()
-        
-        window_size, num_features = input_shape
-        
-        # Positional encoding
-        self.pos_encoding = nn.Parameter(
-            torch.zeros(1, window_size, num_features),
-            requires_grad=True
-        )
-        
-        # Transformer blocks
-        self.transformer_blocks = nn.ModuleList([
-            TransformerBlock(
-                input_dim=num_features,
-                num_heads=num_heads,
-                ff_dim=ff_dim
-            ) for _ in range(num_transformer_blocks)
-        ])
-        
-        # Global average pooling
-        self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
-        
-        # Dense layers
-        self.dense = nn.Sequential(
-            nn.Linear(num_features, 64),
-            nn.ReLU(),
-            nn.BatchNorm1d(64),
-            nn.Dropout(0.3),
-            nn.Linear(64, output_size)
-        )
-        
-        # Activation based on output size
-        if output_size == 1:
-            self.activation = nn.Sigmoid()  # Binary classification or regression
-        elif output_size > 1:
-            self.activation = nn.Softmax(dim=1)  # Multi-class classification
-        else:
-            self.activation = nn.Identity()  # No activation
-    
-    def forward(self, x):
-        """
-        Forward pass through the network.
-        
-        Args:
-            x: Input tensor of shape [batch_size, window_size, features]
-            
-        Returns:
-            Output tensor of shape [batch_size, output_size]
-        """
-        # Add positional encoding
-        x = x + self.pos_encoding
-        
-        # Apply transformer blocks
-        for transformer_block in self.transformer_blocks:
-            x = transformer_block(x)
-        
-        # Global average pooling
-        x = x.transpose(1, 2)  # [batch, features, window]
-        x = self.global_avg_pool(x)  # [batch, features, 1]
-        x = x.squeeze(-1)  # [batch, features]
-        
-        # Dense layers
-        x = self.dense(x)
-        
-        # Apply activation
-        return self.activation(x)
-
-
-class TransformerModelPyTorchWrapper:
-    """
-    Transformer model wrapper class for time series analysis using PyTorch.
-    
-    This class provides methods for building, training, evaluating, and making
-    predictions with the Transformer model.
-    """
-    
-    def __init__(self, window_size, num_features, output_size=3, timeframes=None):
-        """
-        Initialize the Transformer model.
-        
-        Args:
-            window_size (int): Size of the input window
-            num_features (int): Number of features in the input data
-            output_size (int): Size of the output (1 for regression, 3 for classification)
-            timeframes (list): List of timeframes used (for logging)
-        """
-        self.window_size = window_size
-        self.num_features = num_features
-        self.output_size = output_size
-        self.timeframes = timeframes or []
-        
-        # Determine device (GPU or CPU)
-        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        logger.info(f"Using device: {self.device}")
-        
-        # Initialize model
-        self.model = None
-        self.build_model()
-        
-        # Initialize training history
-        self.history = {
-            'loss': [],
-            'val_loss': [],
-            'accuracy': [],
-            'val_accuracy': []
-        }
-    
-    def build_model(self):
-        """Build the Transformer model architecture"""
-        logger.info(f"Building PyTorch Transformer model with window_size={self.window_size}, "
-                   f"num_features={self.num_features}, output_size={self.output_size}")
-        
-        self.model = TransformerModelPyTorch(
-            input_shape=(self.window_size, self.num_features),
-            output_size=self.output_size
-        ).to(self.device)
-        
-        # Initialize optimizer
-        self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
-        
-        # Initialize loss function based on output size
-        if self.output_size == 1:
-            self.criterion = nn.BCELoss()  # Binary classification
-        elif self.output_size > 1:
-            self.criterion = nn.CrossEntropyLoss()  # Multi-class classification
-        else:
-            self.criterion = nn.MSELoss()  # Regression
-        
-        logger.info(f"Model built successfully with {sum(p.numel() for p in self.model.parameters())} parameters")
-    
-    def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, epochs=100):
-        """
-        Train the Transformer model.
-        
-        Args:
-            X_train: Training input data
-            y_train: Training target data
-            X_val: Validation input data
-            y_val: Validation target data
-            batch_size: Batch size for training
-            epochs: Number of training epochs
-            
-        Returns:
-            Training history
-        """
-        logger.info(f"Training PyTorch Transformer model with {len(X_train)} samples, "
-                   f"batch_size={batch_size}, epochs={epochs}")
-        
-        # Convert numpy arrays to PyTorch tensors
-        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(self.device)
-        
-        # Handle different output sizes for y_train
-        if self.output_size == 1:
-            y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(self.device)
-        else:
-            y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(self.device)
-        
-        # Create DataLoader for training data
-        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
-        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
-        
-        # Create DataLoader for validation data if provided
-        if X_val is not None and y_val is not None:
-            X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(self.device)
-            if self.output_size == 1:
-                y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(self.device)
-            else:
-                y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(self.device)
-                
-            val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
-            val_loader = DataLoader(val_dataset, batch_size=batch_size)
-        else:
-            val_loader = None
-        
-        # Training loop
-        for epoch in range(epochs):
-            # Training phase
-            self.model.train()
-            running_loss = 0.0
-            correct = 0
-            total = 0
-            
-            for inputs, targets in train_loader:
-                # Zero the parameter gradients
-                self.optimizer.zero_grad()
-                
-                # Forward pass
-                outputs = self.model(inputs)
-                
-                # Calculate loss
-                if self.output_size == 1:
-                    loss = self.criterion(outputs, targets.unsqueeze(1))
-                else:
-                    loss = self.criterion(outputs, targets)
-                
-                # Backward pass and optimize
-                loss.backward()
-                self.optimizer.step()
-                
-                # Statistics
-                running_loss += loss.item()
-                if self.output_size > 1:
-                    _, predicted = torch.max(outputs, 1)
-                    total += targets.size(0)
-                    correct += (predicted == targets).sum().item()
-            
-            epoch_loss = running_loss / len(train_loader)
-            epoch_acc = correct / total if total > 0 else 0
-            
-            # Validation phase
-            if val_loader is not None:
-                val_loss, val_acc = self._validate(val_loader)
-                
-                logger.info(f"Epoch {epoch+1}/{epochs} - "
-                           f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f} - "
-                           f"val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")
-                
-                # Update history
-                self.history['loss'].append(epoch_loss)
-                self.history['accuracy'].append(epoch_acc)
-                self.history['val_loss'].append(val_loss)
-                self.history['val_accuracy'].append(val_acc)
-            else:
-                logger.info(f"Epoch {epoch+1}/{epochs} - "
-                           f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f}")
-                
-                # Update history without validation
-                self.history['loss'].append(epoch_loss)
-                self.history['accuracy'].append(epoch_acc)
-        
-        logger.info("Training completed")
-        return self.history
-    
-    def _validate(self, val_loader):
-        """Validate the model using the validation set"""
-        self.model.eval()
-        val_loss = 0.0
-        correct = 0
-        total = 0
-        
-        with torch.no_grad():
-            for inputs, targets in val_loader:
-                # Forward pass
-                outputs = self.model(inputs)
-                
-                # Calculate loss
-                if self.output_size == 1:
-                    loss = self.criterion(outputs, targets.unsqueeze(1))
-                else:
-                    loss = self.criterion(outputs, targets)
-                
-                val_loss += loss.item()
-                
-                # Calculate accuracy
-                if self.output_size > 1:
-                    _, predicted = torch.max(outputs, 1)
-                    total += targets.size(0)
-                    correct += (predicted == targets).sum().item()
-        
-        return val_loss / len(val_loader), correct / total if total > 0 else 0
-    
-    def evaluate(self, X_test, y_test):
-        """
-        Evaluate the model on test data.
-        
-        Args:
-            X_test: Test input data
-            y_test: Test target data
-            
-        Returns:
-            dict: Evaluation metrics
-        """
-        logger.info(f"Evaluating model on {len(X_test)} samples")
-        
-        # Convert to PyTorch tensors
-        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(self.device)
-        
-        # Get predictions
-        self.model.eval()
-        with torch.no_grad():
-            y_pred = self.model(X_test_tensor)
-            
-            if self.output_size > 1:
-                _, y_pred_class = torch.max(y_pred, 1)
-                y_pred_class = y_pred_class.cpu().numpy()
-            else:
-                y_pred_class = (y_pred.cpu().numpy() > 0.5).astype(int).flatten()
-        
-        # Calculate metrics
-        if self.output_size > 1:
-            accuracy = accuracy_score(y_test, y_pred_class)
-            precision = precision_score(y_test, y_pred_class, average='weighted')
-            recall = recall_score(y_test, y_pred_class, average='weighted')
-            f1 = f1_score(y_test, y_pred_class, average='weighted')
-            
-            metrics = {
-                'accuracy': accuracy,
-                'precision': precision,
-                'recall': recall,
-                'f1_score': f1
-            }
-        else:
-            accuracy = accuracy_score(y_test, y_pred_class)
-            precision = precision_score(y_test, y_pred_class)
-            recall = recall_score(y_test, y_pred_class)
-            f1 = f1_score(y_test, y_pred_class)
-            
-            metrics = {
-                'accuracy': accuracy,
-                'precision': precision,
-                'recall': recall,
-                'f1_score': f1
-            }
-        
-        logger.info(f"Evaluation metrics: {metrics}")
-        return metrics
-    
-    def predict(self, X):
-        """
-        Make predictions with the model.
-        
-        Args:
-            X: Input data
-            
-        Returns:
-            Predictions
-        """
-        # Convert to PyTorch tensor
-        X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
-        
-        # Get predictions
-        self.model.eval()
-        with torch.no_grad():
-            predictions = self.model(X_tensor)
-            
-            if self.output_size > 1:
-                # Multi-class classification
-                probs = predictions.cpu().numpy()
-                _, class_preds = torch.max(predictions, 1)
-                class_preds = class_preds.cpu().numpy()
-                return class_preds, probs
-            else:
-                # Binary classification or regression
-                preds = predictions.cpu().numpy()
-                if self.output_size == 1:
-                    # Binary classification
-                    class_preds = (preds > 0.5).astype(int)
-                    return class_preds.flatten(), preds.flatten()
-                else:
-                    # Regression
-                    return preds.flatten(), None
-    
-    def save(self, filepath):
-        """
-        Save the model to a file.
-        
-        Args:
-            filepath: Path to save the model
-        """
-        # Create directory if it doesn't exist
-        os.makedirs(os.path.dirname(filepath), exist_ok=True)
-        
-        # Save the model state
-        model_state = {
-            'model_state_dict': self.model.state_dict(),
-            'optimizer_state_dict': self.optimizer.state_dict(),
-            'history': self.history,
-            'window_size': self.window_size,
-            'num_features': self.num_features,
-            'output_size': self.output_size,
-            'timeframes': self.timeframes
-        }
-        
-        torch.save(model_state, f"{filepath}.pt")
-        logger.info(f"Model saved to {filepath}.pt")
-    
-    def load(self, filepath):
-        """
-        Load the model from a file.
-        
-        Args:
-            filepath: Path to load the model from
-        """
-        # Check if file exists
-        if not os.path.exists(f"{filepath}.pt"):
-            logger.error(f"Model file {filepath}.pt not found")
-            return False
-        
-        # Load the model state
-        model_state = torch.load(f"{filepath}.pt", map_location=self.device)
-        
-        # Update model parameters
-        self.window_size = model_state['window_size']
-        self.num_features = model_state['num_features']
-        self.output_size = model_state['output_size']
-        self.timeframes = model_state['timeframes']
-        
-        # Rebuild the model
-        self.build_model()
-        
-        # Load the model state
-        self.model.load_state_dict(model_state['model_state_dict'])
-        self.optimizer.load_state_dict(model_state['optimizer_state_dict'])
-        self.history = model_state['history']
-        
-        logger.info(f"Model loaded from {filepath}.pt")
-        return True
-
-class MixtureOfExpertsModelPyTorch:
-    """
-    Mixture of Experts model implementation using PyTorch.
-    
-    This model combines predictions from multiple models (experts) using a 
-    learned weighting scheme.
-    """
-    
-    def __init__(self, output_size=3, timeframes=None):
-        """
-        Initialize the Mixture of Experts model.
-        
-        Args:
-            output_size (int): Size of the output (1 for regression, 3 for classification)
-            timeframes (list): List of timeframes used (for logging)
-        """
-        self.output_size = output_size
-        self.timeframes = timeframes or []
-        self.experts = {}
-        self.expert_weights = {}
-        
-        # Determine device (GPU or CPU)
-        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-        logger.info(f"Using device: {self.device}")
-        
-        # Initialize model and training history
-        self.model = None
-        self.history = {
-            'loss': [],
-            'val_loss': [],
-            'accuracy': [],
-            'val_accuracy': []
-        }
-    
-    def add_expert(self, name, model):
-        """
-        Add an expert model.
-        
-        Args:
-            name (str): Name of the expert
-            model: Expert model
-        """
-        self.experts[name] = model
-        logger.info(f"Added expert: {name}")
-    
-    def predict(self, X):
-        """
-        Make predictions using all experts and combine them.
-        
-        Args:
-            X: Input data
-            
-        Returns:
-            Combined predictions
-        """
-        if not self.experts:
-            logger.error("No experts added to the MoE model")
-            return None
-        
-        # Get predictions from each expert
-        expert_predictions = {}
-        for name, expert in self.experts.items():
-            pred, _ = expert.predict(X)
-            expert_predictions[name] = pred
-        
-        # Combine predictions based on weights
-        final_pred = None
-        for name, pred in expert_predictions.items():
-            weight = self.expert_weights.get(name, 1.0 / len(self.experts))
-            if final_pred is None:
-                final_pred = weight * pred
-            else:
-                final_pred += weight * pred
-        
-        # For classification, convert to class indices
-        if self.output_size > 1:
-            # Get class with highest probability
-            class_pred = np.argmax(final_pred, axis=1)
-            return class_pred, final_pred
-        else:
-            # Binary classification
-            class_pred = (final_pred > 0.5).astype(int)
-            return class_pred, final_pred
-    
-    def evaluate(self, X_test, y_test):
-        """
-        Evaluate the model on test data.
-        
-        Args:
-            X_test: Test input data
-            y_test: Test target data
-            
-        Returns:
-            dict: Evaluation metrics
-        """
-        logger.info(f"Evaluating MoE model on {len(X_test)} samples")
-        
-        # Get predictions
-        y_pred_class, _ = self.predict(X_test)
-        
-        # Calculate metrics
-        if self.output_size > 1:
-            accuracy = accuracy_score(y_test, y_pred_class)
-            precision = precision_score(y_test, y_pred_class, average='weighted')
-            recall = recall_score(y_test, y_pred_class, average='weighted')
-            f1 = f1_score(y_test, y_pred_class, average='weighted')
-            
-            metrics = {
-                'accuracy': accuracy,
-                'precision': precision,
-                'recall': recall,
-                'f1_score': f1
-            }
-        else:
-            accuracy = accuracy_score(y_test, y_pred_class)
-            precision = precision_score(y_test, y_pred_class)
-            recall = recall_score(y_test, y_pred_class)
-            f1 = f1_score(y_test, y_pred_class)
-            
-            metrics = {
-                'accuracy': accuracy,
-                'precision': precision,
-                'recall': recall,
-                'f1_score': f1
-            }
-        
-        logger.info(f"MoE evaluation metrics: {metrics}")
-        return metrics
-    
-    def save(self, filepath):
-        """
-        Save the model weights to a file.
-        
-        Args:
-            filepath: Path to save the model
-        """
-        # Create directory if it doesn't exist
-        os.makedirs(os.path.dirname(filepath), exist_ok=True)
-        
-        # Save the model state
-        model_state = {
-            'expert_weights': self.expert_weights,
-            'output_size': self.output_size,
-            'timeframes': self.timeframes
-        }
-        
-        torch.save(model_state, f"{filepath}_moe.pt")
-        logger.info(f"MoE model saved to {filepath}_moe.pt")
-    
-    def load(self, filepath):
-        """
-        Load the model from a file.
-        
-        Args:
-            filepath: Path to load the model from
-        """
-        # Check if file exists
-        if not os.path.exists(f"{filepath}_moe.pt"):
-            logger.error(f"MoE model file {filepath}_moe.pt not found")
-            return False
-        
-        # Load the model state
-        model_state = torch.load(f"{filepath}_moe.pt", map_location=self.device)
-        
-        # Update model parameters
-        self.expert_weights = model_state['expert_weights']
-        self.output_size = model_state['output_size']
-        self.timeframes = model_state['timeframes']
-        
-        logger.info(f"MoE model loaded from {filepath}_moe.pt")
-        return True