order flow WIP, chart broken

2025-06-18 13:51:08 +03:00
parent 5bce17a21a
commit e238ce374b
16 changed files with 1768 additions and 1333 deletions
--- a/NN/models/init.py
+++ b/NN/models/init.py
@@ -10,7 +10,7 @@ This package contains the neural network models used in the trading system:
 PyTorch implementation only.
 """

-from NN.models.cnn_model_pytorch import CNNModelPyTorch as CNNModel
+from NN.models.cnn_model_pytorch import EnhancedCNNModel as CNNModel
 from NN.models.transformer_model_pytorch import (
    TransformerModelPyTorch as TransformerModel,
    MixtureOfExpertsModelPyTorch as MixtureOfExpertsModel
--- a/NN/models/cnn_model.py
+++ b/NN/models/cnn_model.py
@@ -0,0 +1,725 @@
+#!/usr/bin/env python3
+"""
+Enhanced CNN Model for Trading - PyTorch Implementation
+Much larger and more sophisticated architecture for better learning
+"""
+
+import os
+import logging
+import numpy as np
+import matplotlib.pyplot as plt
+from datetime import datetime
+import math
+
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import DataLoader, TensorDataset
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+import torch.nn.functional as F
+from typing import Dict, Any, Optional, Tuple
+
+# Configure logging
+logger = logging.getLogger(__name__)
+
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention mechanism for sequence data"""
+    
+    def __init__(self, d_model: int, num_heads: int = 8, dropout: float = 0.1):
+        super().__init__()
+        assert d_model % num_heads == 0
+        
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.d_k = d_model // num_heads
+        
+        self.w_q = nn.Linear(d_model, d_model)
+        self.w_k = nn.Linear(d_model, d_model)
+        self.w_v = nn.Linear(d_model, d_model)
+        self.w_o = nn.Linear(d_model, d_model)
+        
+        self.dropout = nn.Dropout(dropout)
+        self.scale = math.sqrt(self.d_k)
+    
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        batch_size, seq_len, _ = x.size()
+        
+        # Compute Q, K, V
+        Q = self.w_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
+        K = self.w_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
+        V = self.w_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
+        
+        # Attention weights
+        scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
+        attention_weights = F.softmax(scores, dim=-1)
+        attention_weights = self.dropout(attention_weights)
+        
+        # Apply attention
+        attention_output = torch.matmul(attention_weights, V)
+        attention_output = attention_output.transpose(1, 2).contiguous().view(
+            batch_size, seq_len, self.d_model
+        )
+        
+        return self.w_o(attention_output)
+
+class ResidualBlock(nn.Module):
+    """Residual block with normalization and dropout"""
+    
+    def __init__(self, channels: int, dropout: float = 0.1):
+        super().__init__()
+        self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
+        self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
+        self.norm1 = nn.BatchNorm1d(channels)
+        self.norm2 = nn.BatchNorm1d(channels)
+        self.dropout = nn.Dropout(dropout)
+        
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        residual = x
+        
+        out = F.relu(self.norm1(self.conv1(x)))
+        out = self.dropout(out)
+        out = self.norm2(self.conv2(out))
+        
+        # Add residual connection (avoid in-place operation)
+        out = out + residual
+        return F.relu(out)
+
+class SpatialAttentionBlock(nn.Module):
+    """Spatial attention for feature maps"""
+    
+    def __init__(self, channels: int):
+        super().__init__()
+        self.conv = nn.Conv1d(channels, 1, kernel_size=1)
+        
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Compute attention weights
+        attention = torch.sigmoid(self.conv(x))
+        # Avoid in-place operation by creating new tensor
+        return torch.mul(x, attention)
+
+class EnhancedCNNModel(nn.Module):
+    """
+    Much larger and more sophisticated CNN architecture for trading
+    Features:
+    - Deep convolutional layers with residual connections
+    - Multi-head attention mechanisms
+    - Spatial attention blocks
+    - Multiple feature extraction paths
+    - Large capacity for complex pattern learning
+    """
+    
+    def __init__(self, 
+                 input_size: int = 60,
+                 feature_dim: int = 50,
+                 output_size: int = 2,  # BUY/SELL for 2-action system
+                 base_channels: int = 256,  # Increased from 128 to 256
+                 num_blocks: int = 12,  # Increased from 6 to 12
+                 num_attention_heads: int = 16,  # Increased from 8 to 16
+                 dropout_rate: float = 0.2):
+        super().__init__()
+        
+        self.input_size = input_size
+        self.feature_dim = feature_dim
+        self.output_size = output_size
+        self.base_channels = base_channels
+        
+        # Much larger input embedding - project features to higher dimension
+        self.input_embedding = nn.Sequential(
+            nn.Linear(feature_dim, base_channels // 2),
+            nn.LayerNorm(base_channels // 2),  # Changed from BatchNorm1d for batch_size=1 compatibility
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Linear(base_channels // 2, base_channels),
+            nn.LayerNorm(base_channels),  # Changed from BatchNorm1d for batch_size=1 compatibility
+            nn.ReLU(),
+            nn.Dropout(dropout_rate)
+        )
+        
+        # Multi-scale convolutional feature extraction with more channels
+        self.conv_path1 = self._build_conv_path(base_channels, base_channels, 3)
+        self.conv_path2 = self._build_conv_path(base_channels, base_channels, 5)
+        self.conv_path3 = self._build_conv_path(base_channels, base_channels, 7)
+        self.conv_path4 = self._build_conv_path(base_channels, base_channels, 9)  # Additional path
+        
+        # Feature fusion with more capacity
+        self.feature_fusion = nn.Sequential(
+            nn.Conv1d(base_channels * 4, base_channels * 3, kernel_size=1),  # 4 paths now
+            nn.BatchNorm1d(base_channels * 3),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Conv1d(base_channels * 3, base_channels * 2, kernel_size=1),
+            nn.BatchNorm1d(base_channels * 2),
+            nn.ReLU(),
+            nn.Dropout(dropout_rate)
+        )
+        
+        # Much deeper residual blocks for complex pattern learning
+        self.residual_blocks = nn.ModuleList([
+            ResidualBlock(base_channels * 2, dropout_rate) for _ in range(num_blocks)
+        ])
+        
+        # More spatial attention blocks
+        self.spatial_attention = nn.ModuleList([
+            SpatialAttentionBlock(base_channels * 2) for _ in range(6)  # Increased from 3 to 6
+        ])
+        
+        # Multiple temporal attention layers
+        self.temporal_attention1 = MultiHeadAttention(
+            d_model=base_channels * 2,
+            num_heads=num_attention_heads,
+            dropout=dropout_rate
+        )
+        self.temporal_attention2 = MultiHeadAttention(
+            d_model=base_channels * 2,
+            num_heads=num_attention_heads // 2,
+            dropout=dropout_rate
+        )
+        
+        # Global feature aggregation
+        self.global_pool = nn.AdaptiveAvgPool1d(1)
+        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
+        
+        # Much larger advanced feature processing (using LayerNorm for batch_size=1 compatibility)
+        self.advanced_features = nn.Sequential(
+            nn.Linear(base_channels * 4, base_channels * 6),  # Increased capacity
+            nn.LayerNorm(base_channels * 6),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels * 6, base_channels * 4),
+            nn.LayerNorm(base_channels * 4),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels * 4, base_channels * 3),
+            nn.LayerNorm(base_channels * 3),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels * 3, base_channels * 2),
+            nn.LayerNorm(base_channels * 2),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels * 2, base_channels),
+            nn.LayerNorm(base_channels),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate)
+        )
+        
+        # Enhanced market regime detection branch (using LayerNorm for batch_size=1 compatibility)
+        self.regime_detector = nn.Sequential(
+            nn.Linear(base_channels, base_channels // 2),
+            nn.LayerNorm(base_channels // 2),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Linear(base_channels // 2, base_channels // 4),
+            nn.LayerNorm(base_channels // 4),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Linear(base_channels // 4, 8),  # 8 market regimes instead of 4
+            nn.Softmax(dim=1)
+        )
+        
+        # Enhanced volatility prediction branch (using LayerNorm for batch_size=1 compatibility)
+        self.volatility_predictor = nn.Sequential(
+            nn.Linear(base_channels, base_channels // 2),
+            nn.LayerNorm(base_channels // 2),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            nn.Linear(base_channels // 2, base_channels // 4),
+            nn.LayerNorm(base_channels // 4),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Linear(base_channels // 4, 1),
+            nn.Sigmoid()
+        )
+        
+        # Main trading decision head (using LayerNorm for batch_size=1 compatibility)
+        self.decision_head = nn.Sequential(
+            nn.Linear(base_channels + 8 + 1, base_channels),  # 8 regime classes + 1 volatility
+            nn.LayerNorm(base_channels),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels, base_channels // 2),
+            nn.LayerNorm(base_channels // 2),  # Changed from BatchNorm1d
+            nn.ReLU(),
+            nn.Dropout(dropout_rate),
+            
+            nn.Linear(base_channels // 2, output_size)
+        )
+        
+        # Confidence estimation head
+        self.confidence_head = nn.Sequential(
+            nn.Linear(base_channels, base_channels // 2),
+            nn.ReLU(),
+            nn.Linear(base_channels // 2, 1),
+            nn.Sigmoid()
+        )
+        
+        # Initialize weights
+        self._initialize_weights()
+        
+    def _build_conv_path(self, in_channels: int, out_channels: int, kernel_size: int) -> nn.Module:
+        """Build a convolutional path with multiple layers"""
+        return nn.Sequential(
+            nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2),
+            nn.BatchNorm1d(out_channels),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            
+            nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
+            nn.BatchNorm1d(out_channels),
+            nn.ReLU(),
+            nn.Dropout(0.1),
+            
+            nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
+            nn.BatchNorm1d(out_channels),
+            nn.ReLU()
+        )
+    
+    def _initialize_weights(self):
+        """Initialize model weights"""
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.xavier_normal_(m.weight)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm1d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+    
+    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
+        """
+        Forward pass with multiple outputs
+        Args:
+            x: Input tensor of shape [batch_size, sequence_length, features]
+        Returns:
+            Dictionary with predictions, confidence, regime, and volatility
+        """
+        # Handle input shapes flexibly
+        if len(x.shape) == 2:
+            # Input is [seq_len, features] - add batch dimension
+            x = x.unsqueeze(0)
+        elif len(x.shape) > 3:
+            # Input has extra dimensions - flatten to [batch, seq, features]
+            x = x.view(x.shape[0], -1, x.shape[-1])
+        
+        batch_size, seq_len, features = x.shape
+        
+        # Reshape for processing: [batch, seq, features] -> [batch*seq, features]
+        x_reshaped = x.view(-1, features)
+        
+        # Input embedding
+        embedded = self.input_embedding(x_reshaped)  # [batch*seq, base_channels]
+        
+        # Reshape back for conv1d: [batch*seq, channels] -> [batch, channels, seq]
+        embedded = embedded.view(batch_size, seq_len, -1).transpose(1, 2)
+        
+        # Multi-scale feature extraction
+        path1 = self.conv_path1(embedded)
+        path2 = self.conv_path2(embedded)
+        path3 = self.conv_path3(embedded)
+        path4 = self.conv_path4(embedded)
+        
+        # Feature fusion
+        fused_features = torch.cat([path1, path2, path3, path4], dim=1)
+        fused_features = self.feature_fusion(fused_features)
+        
+        # Apply residual blocks with spatial attention
+        current_features = fused_features
+        for i, (res_block, attention) in enumerate(zip(self.residual_blocks, self.spatial_attention)):
+            current_features = res_block(current_features)
+            if i % 2 == 0:  # Apply attention every other block
+                current_features = attention(current_features)
+        
+        # Apply remaining residual blocks
+        for res_block in self.residual_blocks[len(self.spatial_attention):]:
+            current_features = res_block(current_features)
+        
+        # Temporal attention - apply both attention layers
+        # Reshape for attention: [batch, channels, seq] -> [batch, seq, channels]
+        attention_input = current_features.transpose(1, 2)
+        attended_features = self.temporal_attention1(attention_input)
+        attended_features = self.temporal_attention2(attended_features)
+        # Back to conv format: [batch, seq, channels] -> [batch, channels, seq]
+        attended_features = attended_features.transpose(1, 2)
+        
+        # Global aggregation
+        avg_pooled = self.global_pool(attended_features).squeeze(-1)  # [batch, channels]
+        max_pooled = self.global_max_pool(attended_features).squeeze(-1)  # [batch, channels]
+        
+        # Combine global features
+        global_features = torch.cat([avg_pooled, max_pooled], dim=1)
+        
+        # Advanced feature processing
+        processed_features = self.advanced_features(global_features)
+        
+        # Multi-task predictions
+        regime_probs = self.regime_detector(processed_features)
+        volatility_pred = self.volatility_predictor(processed_features)
+        confidence = self.confidence_head(processed_features)
+        
+        # Combine all features for final decision (8 regime classes + 1 volatility)
+        combined_features = torch.cat([processed_features, regime_probs, volatility_pred], dim=1)
+        trading_logits = self.decision_head(combined_features)
+        
+        # Apply temperature scaling for better calibration
+        temperature = 1.5
+        trading_probs = F.softmax(trading_logits / temperature, dim=1)
+        
+        return {
+            'logits': trading_logits,
+            'probabilities': trading_probs,
+            'confidence': confidence.squeeze(-1),
+            'regime': regime_probs,
+            'volatility': volatility_pred.squeeze(-1),
+            'features': processed_features
+        }
+    
+    def predict(self, feature_matrix: np.ndarray) -> Dict[str, Any]:
+        """
+        Make predictions on feature matrix
+        Args:
+            feature_matrix: numpy array of shape [sequence_length, features]
+        Returns:
+            Dictionary with prediction results
+        """
+        self.eval()
+        
+        with torch.no_grad():
+            # Convert to tensor and add batch dimension
+            if isinstance(feature_matrix, np.ndarray):
+                x = torch.FloatTensor(feature_matrix).unsqueeze(0)  # Add batch dim
+            else:
+                x = feature_matrix.unsqueeze(0)
+            
+            # Move to device
+            device = next(self.parameters()).device
+            x = x.to(device)
+            
+            # Forward pass
+            outputs = self.forward(x)
+            
+            # Extract results
+            probs = outputs['probabilities'].cpu().numpy()[0]
+            confidence = outputs['confidence'].cpu().numpy()[0]
+            regime = outputs['regime'].cpu().numpy()[0]
+            volatility = outputs['volatility'].cpu().numpy()[0]
+            
+            # Determine action (0=BUY, 1=SELL for 2-action system)
+            action = int(np.argmax(probs))
+            action_confidence = float(probs[action])
+            
+            return {
+                'action': action,
+                'action_name': 'BUY' if action == 0 else 'SELL',
+                'confidence': float(confidence),
+                'action_confidence': action_confidence,
+                'probabilities': probs.tolist(),
+                'regime_probabilities': regime.tolist(),
+                'volatility_prediction': float(volatility),
+                'raw_logits': outputs['logits'].cpu().numpy()[0].tolist()
+            }
+    
+    def get_memory_usage(self) -> Dict[str, Any]:
+        """Get model memory usage statistics"""
+        total_params = sum(p.numel() for p in self.parameters())
+        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        
+        param_size = sum(p.numel() * p.element_size() for p in self.parameters())
+        buffer_size = sum(b.numel() * b.element_size() for b in self.buffers())
+        
+        return {
+            'total_parameters': total_params,
+            'trainable_parameters': trainable_params,
+            'parameter_size_mb': param_size / (1024 * 1024),
+            'buffer_size_mb': buffer_size / (1024 * 1024),
+            'total_size_mb': (param_size + buffer_size) / (1024 * 1024)
+        }
+    
+    def to_device(self, device: str):
+        """Move model to specified device"""
+        return self.to(torch.device(device))
+
+class CNNModelTrainer:
+    """Enhanced trainer for the beefed-up CNN model"""
+    
+    def __init__(self, model: EnhancedCNNModel, learning_rate: float = 0.0001, device: str = 'cuda'):
+        self.model = model.to(device)
+        self.device = device
+        self.learning_rate = learning_rate
+        
+        # Use AdamW optimizer with weight decay
+        self.optimizer = torch.optim.AdamW(
+            model.parameters(), 
+            lr=learning_rate, 
+            weight_decay=0.01,
+            betas=(0.9, 0.999)
+        )
+        
+        # Learning rate scheduler
+        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
+            self.optimizer,
+            max_lr=learning_rate * 10,
+            total_steps=10000,  # Will be updated based on actual training
+            pct_start=0.1,
+            anneal_strategy='cos'
+        )
+        
+        # Multi-task loss functions
+        self.main_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
+        self.confidence_criterion = nn.BCELoss()
+        self.regime_criterion = nn.CrossEntropyLoss()
+        self.volatility_criterion = nn.MSELoss()
+        
+        self.training_history = []
+        
+    def train_step(self, x: torch.Tensor, y: torch.Tensor, 
+                   confidence_targets: Optional[torch.Tensor] = None,
+                   regime_targets: Optional[torch.Tensor] = None,
+                   volatility_targets: Optional[torch.Tensor] = None) -> Dict[str, float]:
+        """Single training step with multi-task learning"""
+        
+        self.model.train()
+        self.optimizer.zero_grad()
+        
+        # Forward pass
+        outputs = self.model(x)
+        
+        # Main trading loss
+        main_loss = self.main_criterion(outputs['logits'], y)
+        total_loss = main_loss
+        
+        losses = {'main_loss': main_loss.item()}
+        
+        # Confidence loss (if targets provided)
+        if confidence_targets is not None:
+            conf_loss = self.confidence_criterion(outputs['confidence'], confidence_targets)
+            total_loss += 0.1 * conf_loss
+            losses['confidence_loss'] = conf_loss.item()
+        
+        # Regime classification loss (if targets provided)
+        if regime_targets is not None:
+            regime_loss = self.regime_criterion(outputs['regime'], regime_targets)
+            total_loss += 0.05 * regime_loss
+            losses['regime_loss'] = regime_loss.item()
+        
+        # Volatility prediction loss (if targets provided)
+        if volatility_targets is not None:
+            vol_loss = self.volatility_criterion(outputs['volatility'], volatility_targets)
+            total_loss += 0.05 * vol_loss
+            losses['volatility_loss'] = vol_loss.item()
+        
+        losses['total_loss'] = total_loss.item()
+        
+        # Backward pass
+        total_loss.backward()
+        
+        # Gradient clipping
+        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
+        
+        self.optimizer.step()
+        self.scheduler.step()
+        
+        # Calculate accuracy
+        with torch.no_grad():
+            predictions = torch.argmax(outputs['probabilities'], dim=1)
+            accuracy = (predictions == y).float().mean().item()
+            losses['accuracy'] = accuracy
+        
+        return losses
+    
+    def save_model(self, filepath: str, metadata: Optional[Dict] = None):
+        """Save model with metadata"""
+        save_dict = {
+            'model_state_dict': self.model.state_dict(),
+            'optimizer_state_dict': self.optimizer.state_dict(),
+            'scheduler_state_dict': self.scheduler.state_dict(),
+            'training_history': self.training_history,
+            'model_config': {
+                'input_size': self.model.input_size,
+                'feature_dim': self.model.feature_dim,
+                'output_size': self.model.output_size,
+                'base_channels': self.model.base_channels
+            }
+        }
+        
+        if metadata:
+            save_dict['metadata'] = metadata
+            
+        torch.save(save_dict, filepath)
+        logger.info(f"Enhanced CNN model saved to {filepath}")
+    
+    def load_model(self, filepath: str) -> Dict:
+        """Load model from file"""
+        checkpoint = torch.load(filepath, map_location=self.device)
+        
+        self.model.load_state_dict(checkpoint['model_state_dict'])
+        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
+        
+        if 'scheduler_state_dict' in checkpoint:
+            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
+        
+        if 'training_history' in checkpoint:
+            self.training_history = checkpoint['training_history']
+        
+        logger.info(f"Enhanced CNN model loaded from {filepath}")
+        return checkpoint.get('metadata', {})
+
+def create_enhanced_cnn_model(input_size: int = 60, 
+                            feature_dim: int = 50, 
+                            output_size: int = 2,
+                            base_channels: int = 256,
+                            device: str = 'cuda') -> Tuple[EnhancedCNNModel, CNNModelTrainer]:
+    """Create enhanced CNN model and trainer"""
+    
+    model = EnhancedCNNModel(
+        input_size=input_size,
+        feature_dim=feature_dim,
+        output_size=output_size,
+        base_channels=base_channels,
+        num_blocks=12,
+        num_attention_heads=16,
+        dropout_rate=0.2
+    )
+    
+    trainer = CNNModelTrainer(model, learning_rate=0.0001, device=device)
+    
+    logger.info(f"Created enhanced CNN model with {model.get_memory_usage()['total_parameters']:,} parameters")
+    
+    return model, trainer
+
+# Compatibility wrapper for williams_market_structure.py
+class CNNModel:
+    """
+    Compatibility wrapper for the enhanced CNN model
+    """
+    
+    def __init__(self, input_shape=(900, 50), output_size=10, model_path=None):
+        self.input_shape = input_shape
+        self.output_size = output_size
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        
+        # Create the enhanced model
+        self.model = EnhancedCNNModel(
+            input_size=input_shape[0],
+            feature_dim=input_shape[1],
+            output_size=output_size
+        )
+        self.trainer = CNNModelTrainer(self.model, device=self.device)
+        
+        logger.info(f"CNN Model wrapper initialized: input_shape={input_shape}, output_size={output_size}")
+        
+        if model_path and os.path.exists(model_path):
+            self.load(model_path)
+    
+    def build_model(self, **kwargs):
+        """Build/configure the model"""
+        logger.info("CNN Model build_model called")
+        return self
+    
+    def predict(self, X):
+        """Make predictions on input data"""
+        try:
+            if isinstance(X, np.ndarray):
+                result = self.model.predict(X)
+                pred_class = np.array([result['action']])
+                pred_proba = np.array([result['probabilities']])
+            else:
+                # Handle tensor input
+                result = self.model.predict(X.cpu().numpy() if hasattr(X, 'cpu') else X)
+                pred_class = np.array([result['action']])
+                pred_proba = np.array([result['probabilities']])
+                
+            logger.debug(f"CNN prediction: class={pred_class}, proba_shape={pred_proba.shape}")
+            return pred_class, pred_proba
+            
+        except Exception as e:
+            logger.error(f"Error in CNN prediction: {e}")
+            import traceback
+            logger.error(f"Full traceback: {traceback.format_exc()}")
+            # Return dummy prediction
+            pred_class = np.array([0])
+            pred_proba = np.array([[0.1] * self.output_size])
+            return pred_class, pred_proba
+    
+    def fit(self, X, y, **kwargs):
+        """Train the model on input data"""
+        try:
+            # Convert to tensors if needed (create new tensors to avoid in-place modifications)
+            if isinstance(X, np.ndarray):
+                X = torch.FloatTensor(X.copy())  # Use copy to avoid in-place modifications
+            elif isinstance(X, torch.Tensor):
+                X = X.clone().detach()  # Clone to avoid in-place modifications
+                
+            if isinstance(y, np.ndarray):
+                y = torch.LongTensor(y.copy())  # Use copy to avoid in-place modifications
+            elif isinstance(y, torch.Tensor):
+                y = y.clone().detach().long()  # Clone to avoid in-place modifications
+            
+            # Ensure proper shapes and consistent batch sizes
+            if len(X.shape) == 2:
+                X = X.unsqueeze(0)  # [seq, features] -> [1, seq, features]
+            
+            # Handle target tensor - ensure it matches batch size (avoid in-place operations)
+            if len(y.shape) == 0:
+                y = y.unsqueeze(0)  # scalar -> [1]
+            elif len(y.shape) == 2 and y.shape[0] == 1:
+                # Already correct shape [1, num_classes] -> get class index
+                y = torch.argmax(y, dim=1)  # [1, num_classes] -> [1]
+            elif len(y.shape) == 1 and len(y) > 1:
+                # Multi-class probabilities -> get class index, ensure batch size 1
+                y = torch.argmax(y).unsqueeze(0)  # [num_classes] -> [1]
+            elif len(y.shape) == 1 and len(y) == 1:
+                pass  # Already correct [1]
+            else:
+                # Fallback: take first element and ensure batch size 1
+                y = y.view(-1)[:1]  # Take only first element
+            
+            # Move to device (create new tensors on device, don't modify in-place)
+            X = X.to(self.device, non_blocking=True)
+            y = y.to(self.device, non_blocking=True)
+            
+            # Use trainer's train_step
+            loss_dict = self.trainer.train_step(X, y)
+            logger.info(f"CNN training: X_shape={X.shape}, y_shape={y.shape}, loss={loss_dict.get('total_loss', 0):.4f}")
+            
+            return self
+            
+        except Exception as e:
+            logger.error(f"Error in CNN training: {e}")
+            return self
+    
+    def save(self, filepath: str):
+        """Save the model"""
+        try:
+            self.trainer.save_model(filepath)
+            logger.info(f"CNN model saved to {filepath}")
+        except Exception as e:
+            logger.error(f"Error saving CNN model: {e}")
+    
+    def load(self, filepath: str):
+        """Load the model"""
+        try:
+            self.trainer.load_model(filepath)
+            logger.info(f"CNN model loaded from {filepath}")
+        except Exception as e:
+            logger.error(f"Error loading CNN model: {e}")
+    
+    def to_device(self, device):
+        """Move model to device"""
+        self.device = device
+        self.model.to(device)
+        return self
+    
+    def get_memory_usage(self):
+        """Get model memory usage"""
+        try:
+            return self.model.get_memory_usage()
+        except Exception as e:
+            logger.error(f"Error getting memory usage: {e}")
+            return {'total_parameters': 0, 'memory_mb': 0}
--- a/NN/models/cnn_model_pytorch.py
+++ b/NN/models/cnn_model_pytorch.py
@@ -80,8 +80,8 @@ class ResidualBlock(nn.Module):
        out = self.dropout(out)
        out = self.norm2(self.conv2(out))
        
-        # Add residual connection
-        out += residual
+        # Add residual connection (avoid in-place operation)
+        out = out + residual
        return F.relu(out)

 class SpatialAttentionBlock(nn.Module):
@@ -94,7 +94,8 @@ class SpatialAttentionBlock(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Compute attention weights
        attention = torch.sigmoid(self.conv(x))
-        return x * attention
+        # Avoid in-place operation by creating new tensor
+        return torch.mul(x, attention)

 class EnhancedCNNModel(nn.Module):
    """
--- a/NN/models/enhanced_cnn_with_orderbook.py
+++ b/NN/models/enhanced_cnn_with_orderbook.py
@@ -0,0 +1,595 @@
+"""
+Enhanced CNN Model with Bookmap Order Book Integration
+
+This module extends the enhanced CNN to incorporate:
+- Traditional market data (OHLCV, indicators)
+- Order book depth features (COB)
+- Volume profile features (SVP)
+- Order flow signals (sweeps, absorptions, momentum)
+- Market microstructure metrics
+
+The integrated model provides comprehensive market awareness for superior trading decisions.
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+import logging
+from typing import Dict, List, Optional, Tuple, Any
+
+logger = logging.getLogger(__name__)
+
+class ResidualBlock(nn.Module):
+    """Enhanced residual block with skip connections"""
+    
+    def __init__(self, in_channels, out_channels, stride=1):
+        super(ResidualBlock, self).__init__()
+        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
+        self.bn1 = nn.BatchNorm1d(out_channels)
+        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.bn2 = nn.BatchNorm1d(out_channels)
+        
+        # Shortcut connection
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_channels != out_channels:
+            self.shortcut = nn.Sequential(
+                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
+                nn.BatchNorm1d(out_channels)
+            )
+    
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        # Avoid in-place operation
+        out = out + self.shortcut(x)
+        out = F.relu(out)
+        return out
+
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention mechanism"""
+    
+    def __init__(self, dim, num_heads=8, dropout=0.1):
+        super(MultiHeadAttention, self).__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        
+        self.q_linear = nn.Linear(dim, dim)
+        self.k_linear = nn.Linear(dim, dim)
+        self.v_linear = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(dropout)
+        self.out = nn.Linear(dim, dim)
+    
+    def forward(self, x):
+        batch_size, seq_len, dim = x.size()
+        
+        # Linear transformations
+        q = self.q_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
+        k = self.k_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
+        v = self.v_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
+        
+        # Transpose for attention
+        q = q.transpose(1, 2)
+        k = k.transpose(1, 2)
+        v = v.transpose(1, 2)
+        
+        # Scaled dot-product attention
+        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.head_dim)
+        attn_weights = F.softmax(scores, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+        
+        attn_output = torch.matmul(attn_weights, v)
+        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, dim)
+        
+        return self.out(attn_output), attn_weights
+
+class OrderBookEncoder(nn.Module):
+    """Specialized encoder for order book data"""
+    
+    def __init__(self, input_dim=100, hidden_dim=512):
+        super(OrderBookEncoder, self).__init__()
+        
+        # Order book feature processing
+        self.bid_encoder = nn.Sequential(
+            nn.Linear(40, 128),  # 20 levels x 2 features
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(128, 256),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+        
+        self.ask_encoder = nn.Sequential(
+            nn.Linear(40, 128),  # 20 levels x 2 features
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(128, 256),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+        
+        # Microstructure features
+        self.microstructure_encoder = nn.Sequential(
+            nn.Linear(15, 64),  # Liquidity + imbalance + flow features
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(64, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2)
+        )
+        
+        # Cross-attention between bids and asks
+        self.cross_attention = MultiHeadAttention(256, num_heads=8)
+        
+        # Output projection
+        self.output_projection = nn.Sequential(
+            nn.Linear(256 + 256 + 128, hidden_dim),  # Combine all features
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(hidden_dim, hidden_dim)
+        )
+    
+    def forward(self, orderbook_features):
+        """
+        Process order book features
+        
+        Args:
+            orderbook_features: Tensor of shape [batch, 100] containing:
+                - 40 bid features (20 levels x 2)
+                - 40 ask features (20 levels x 2) 
+                - 15 microstructure features
+                - 5 flow signal features
+        """
+        # Split features
+        bid_features = orderbook_features[:, :40]      # First 40 features
+        ask_features = orderbook_features[:, 40:80]    # Next 40 features
+        micro_features = orderbook_features[:, 80:95]  # Next 15 features
+        # flow_features = orderbook_features[:, 95:100]  # Last 5 features (included in micro)
+        
+        # Encode each component
+        bid_encoded = self.bid_encoder(bid_features)      # [batch, 256]
+        ask_encoded = self.ask_encoder(ask_features)      # [batch, 256]
+        micro_encoded = self.microstructure_encoder(micro_features)  # [batch, 128]
+        
+        # Add sequence dimension for attention
+        bid_seq = bid_encoded.unsqueeze(1)  # [batch, 1, 256]
+        ask_seq = ask_encoded.unsqueeze(1)  # [batch, 1, 256]
+        
+        # Cross-attention between bids and asks
+        combined_seq = torch.cat([bid_seq, ask_seq], dim=1)  # [batch, 2, 256]
+        attended_features, attention_weights = self.cross_attention(combined_seq)
+        
+        # Flatten attended features
+        attended_flat = attended_features.view(attended_features.size(0), -1)  # [batch, 512]
+        
+        # Combine with microstructure features
+        combined_features = torch.cat([attended_flat, micro_encoded], dim=1)  # [batch, 640]
+        
+        # Final projection
+        output = self.output_projection(combined_features)
+        
+        return output
+
+class VolumeProfileEncoder(nn.Module):
+    """Encoder for volume profile data"""
+    
+    def __init__(self, max_levels=50, hidden_dim=256):
+        super(VolumeProfileEncoder, self).__init__()
+        
+        self.max_levels = max_levels
+        
+        # Process volume profile levels
+        self.level_encoder = nn.Sequential(
+            nn.Linear(7, 32),  # price, volume, buy_vol, sell_vol, trades, vwap, net_vol
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            nn.Linear(32, 64),
+            nn.ReLU()
+        )
+        
+        # Attention over price levels
+        self.level_attention = MultiHeadAttention(64, num_heads=4)
+        
+        # Final aggregation
+        self.aggregator = nn.Sequential(
+            nn.Linear(64, hidden_dim),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(hidden_dim, hidden_dim)
+        )
+    
+    def forward(self, volume_profile_data):
+        """
+        Process volume profile data
+        
+        Args:
+            volume_profile_data: List of dicts or tensor with volume profile levels
+        """
+        # If input is list of dicts, convert to tensor
+        if isinstance(volume_profile_data, list):
+            if not volume_profile_data:
+                # Return zero features if no data
+                batch_size = 1
+                return torch.zeros(batch_size, self.aggregator[-1].out_features)
+            
+            # Convert to tensor
+            features = []
+            for level in volume_profile_data[:self.max_levels]:
+                level_features = [
+                    level.get('price', 0.0),
+                    level.get('volume', 0.0),
+                    level.get('buy_volume', 0.0),
+                    level.get('sell_volume', 0.0),
+                    level.get('trades_count', 0.0),
+                    level.get('vwap', 0.0),
+                    level.get('net_volume', 0.0)
+                ]
+                features.append(level_features)
+            
+            # Pad if needed
+            while len(features) < self.max_levels:
+                features.append([0.0] * 7)
+            
+            volume_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
+        else:
+            volume_tensor = volume_profile_data
+        
+        batch_size, num_levels, feature_dim = volume_tensor.shape
+        
+        # Encode each level
+        level_features = self.level_encoder(volume_tensor.view(-1, feature_dim))
+        level_features = level_features.view(batch_size, num_levels, -1)
+        
+        # Apply attention across levels
+        attended_levels, _ = self.level_attention(level_features)
+        
+        # Global average pooling
+        aggregated = torch.mean(attended_levels, dim=1)
+        
+        # Final processing
+        output = self.aggregator(aggregated)
+        
+        return output
+
+class EnhancedCNNWithOrderBook(nn.Module):
+    """
+    Enhanced CNN model integrating traditional market data with order book analysis
+    
+    Features:
+    - Multi-scale convolutional processing for time series data
+    - Specialized order book feature extraction
+    - Volume profile analysis
+    - Order flow signal integration
+    - Multi-head attention mechanisms
+    - Dueling architecture for value and advantage estimation
+    """
+    
+    def __init__(self, 
+                 market_input_shape=(60, 50),  # Traditional market data
+                 orderbook_features=100,       # Order book feature dimension
+                 n_actions=2,
+                 confidence_threshold=0.5):
+        super(EnhancedCNNWithOrderBook, self).__init__()
+        
+        self.market_input_shape = market_input_shape
+        self.orderbook_features = orderbook_features
+        self.n_actions = n_actions
+        self.confidence_threshold = confidence_threshold
+        
+        # Traditional market data processing
+        self.market_encoder = self._build_market_encoder()
+        
+        # Order book data processing
+        self.orderbook_encoder = OrderBookEncoder(
+            input_dim=orderbook_features,
+            hidden_dim=512
+        )
+        
+        # Volume profile processing
+        self.volume_encoder = VolumeProfileEncoder(
+            max_levels=50,
+            hidden_dim=256
+        )
+        
+        # Feature fusion
+        total_features = 1024 + 512 + 256  # market + orderbook + volume
+        self.feature_fusion = nn.Sequential(
+            nn.Linear(total_features, 1536),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(1536, 1024),
+            nn.ReLU(),
+            nn.Dropout(0.3)
+        )
+        
+        # Multi-head attention for integrated features
+        self.integrated_attention = MultiHeadAttention(1024, num_heads=16)
+        
+        # Dueling architecture
+        self.advantage_stream = nn.Sequential(
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(256, n_actions)
+        )
+        
+        self.value_stream = nn.Sequential(
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(256, 1)
+        )
+        
+        # Auxiliary heads for multi-task learning
+        self.extrema_head = nn.Sequential(
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Linear(256, 3)  # bottom, top, neither
+        )
+        
+        self.market_regime_head = nn.Sequential(
+            nn.Linear(1024, 512),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(512, 256),
+            nn.ReLU(),
+            nn.Linear(256, 8)  # trending, ranging, volatile, etc.
+        )
+        
+        self.confidence_head = nn.Sequential(
+            nn.Linear(1024, 256),
+            nn.ReLU(),
+            nn.Linear(256, 1),
+            nn.Sigmoid()
+        )
+        
+        # Initialize weights
+        self._initialize_weights()
+        
+        # Device management
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.to(self.device)
+        
+        logger.info(f"Enhanced CNN with Order Book initialized")
+        logger.info(f"Market input shape: {market_input_shape}")
+        logger.info(f"Order book features: {orderbook_features}")
+        logger.info(f"Output actions: {n_actions}")
+    
+    def _build_market_encoder(self):
+        """Build traditional market data encoder"""
+        seq_len, feature_dim = self.market_input_shape
+        
+        return nn.Sequential(
+            # Input projection
+            nn.Linear(feature_dim, 128),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            
+            # Convolutional layers for temporal patterns
+            nn.Conv1d(128, 256, kernel_size=5, padding=2),
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Dropout(0.2),
+            
+            ResidualBlock(256, 512),
+            ResidualBlock(512, 512),
+            ResidualBlock(512, 768),
+            ResidualBlock(768, 768),
+            
+            # Global pooling
+            nn.AdaptiveAvgPool1d(1),
+            nn.Flatten(),
+            
+            # Final projection
+            nn.Linear(768, 1024),
+            nn.ReLU(),
+            nn.Dropout(0.3)
+        )
+    
+    def _initialize_weights(self):
+        """Initialize model weights"""
+        for m in self.modules():
+            if isinstance(m, nn.Conv1d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.xavier_normal_(m.weight)
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm1d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+    
+    def forward(self, market_data, orderbook_data, volume_profile_data=None):
+        """
+        Forward pass through integrated model
+        
+        Args:
+            market_data: Traditional market data [batch, seq_len, features]
+            orderbook_data: Order book features [batch, orderbook_features]
+            volume_profile_data: Volume profile data (optional)
+        
+        Returns:
+            Dictionary with Q-values, confidence, regime, and auxiliary predictions
+        """
+        batch_size = market_data.size(0)
+        
+        # Process market data
+        if len(market_data.shape) == 2:
+            market_data = market_data.unsqueeze(0)
+        
+        # Reshape for convolutional processing
+        market_reshaped = market_data.view(batch_size, -1, market_data.size(-1))
+        market_features = self.market_encoder(market_reshaped.transpose(1, 2))
+        
+        # Process order book data
+        orderbook_features = self.orderbook_encoder(orderbook_data)
+        
+        # Process volume profile data
+        if volume_profile_data is not None:
+            volume_features = self.volume_encoder(volume_profile_data)
+        else:
+            volume_features = torch.zeros(batch_size, 256, device=self.device)
+        
+        # Fuse all features
+        combined_features = torch.cat([
+            market_features,
+            orderbook_features,
+            volume_features
+        ], dim=1)
+        
+        # Feature fusion
+        fused_features = self.feature_fusion(combined_features)
+        
+        # Apply attention
+        attended_features = fused_features.unsqueeze(1)  # Add sequence dimension
+        attended_output, attention_weights = self.integrated_attention(attended_features)
+        final_features = attended_output.squeeze(1)  # Remove sequence dimension
+        
+        # Dueling architecture
+        advantage = self.advantage_stream(final_features)
+        value = self.value_stream(final_features)
+        
+        # Combine value and advantage
+        q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
+        
+        # Auxiliary predictions
+        extrema_pred = self.extrema_head(final_features)
+        regime_pred = self.market_regime_head(final_features)
+        confidence = self.confidence_head(final_features)
+        
+        return {
+            'q_values': q_values,
+            'confidence': confidence,
+            'extrema_prediction': extrema_pred,
+            'market_regime': regime_pred,
+            'attention_weights': attention_weights,
+            'integrated_features': final_features
+        }
+    
+    def predict(self, market_data, orderbook_data, volume_profile_data=None):
+        """Make prediction with confidence thresholding"""
+        self.eval()
+        
+        with torch.no_grad():
+            # Convert inputs to tensors if needed
+            if isinstance(market_data, np.ndarray):
+                market_data = torch.FloatTensor(market_data).to(self.device)
+            if isinstance(orderbook_data, np.ndarray):
+                orderbook_data = torch.FloatTensor(orderbook_data).to(self.device)
+            
+            # Ensure batch dimension
+            if len(market_data.shape) == 2:
+                market_data = market_data.unsqueeze(0)
+            if len(orderbook_data.shape) == 1:
+                orderbook_data = orderbook_data.unsqueeze(0)
+            
+            # Forward pass
+            outputs = self.forward(market_data, orderbook_data, volume_profile_data)
+            
+            # Get probabilities
+            q_values = outputs['q_values']
+            probs = F.softmax(q_values, dim=1)
+            confidence = outputs['confidence'].item()
+            
+            # Action selection with confidence thresholding
+            if confidence >= self.confidence_threshold:
+                action = torch.argmax(q_values, dim=1).item()
+            else:
+                action = None  # No action due to low confidence
+            
+            return {
+                'action': action,
+                'probabilities': probs.cpu().numpy()[0],
+                'confidence': confidence,
+                'q_values': q_values.cpu().numpy()[0],
+                'extrema_prediction': F.softmax(outputs['extrema_prediction'], dim=1).cpu().numpy()[0],
+                'market_regime': F.softmax(outputs['market_regime'], dim=1).cpu().numpy()[0]
+            }
+    
+    def get_feature_importance(self, market_data, orderbook_data, volume_profile_data=None):
+        """Analyze feature importance using gradients"""
+        self.eval()
+        
+        # Enable gradient computation for inputs
+        market_data.requires_grad_(True)
+        orderbook_data.requires_grad_(True)
+        
+        # Forward pass
+        outputs = self.forward(market_data, orderbook_data, volume_profile_data)
+        
+        # Compute gradients for Q-values
+        q_values = outputs['q_values']
+        q_values.sum().backward()
+        
+        # Get gradient magnitudes
+        market_importance = torch.abs(market_data.grad).mean().item()
+        orderbook_importance = torch.abs(orderbook_data.grad).mean().item()
+        
+        return {
+            'market_importance': market_importance,
+            'orderbook_importance': orderbook_importance,
+            'total_importance': market_importance + orderbook_importance
+        }
+    
+    def save(self, path):
+        """Save model state"""
+        torch.save({
+            'model_state_dict': self.state_dict(),
+            'market_input_shape': self.market_input_shape,
+            'orderbook_features': self.orderbook_features,
+            'n_actions': self.n_actions,
+            'confidence_threshold': self.confidence_threshold
+        }, path)
+        logger.info(f"Enhanced CNN with Order Book saved to {path}")
+    
+    def load(self, path):
+        """Load model state"""
+        checkpoint = torch.load(path, map_location=self.device)
+        self.load_state_dict(checkpoint['model_state_dict'])
+        logger.info(f"Enhanced CNN with Order Book loaded from {path}")
+    
+    def get_memory_usage(self):
+        """Get model memory usage statistics"""
+        total_params = sum(p.numel() for p in self.parameters())
+        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
+        
+        return {
+            'total_parameters': total_params,
+            'trainable_parameters': trainable_params,
+            'model_size_mb': total_params * 4 / (1024 * 1024),  # Assuming float32
+        }
+
+def create_enhanced_cnn_with_orderbook(
+    market_input_shape=(60, 50),
+    orderbook_features=100,
+    n_actions=2,
+    device='cuda'
+):
+    """Create and initialize enhanced CNN with order book integration"""
+    
+    model = EnhancedCNNWithOrderBook(
+        market_input_shape=market_input_shape,
+        orderbook_features=orderbook_features,
+        n_actions=n_actions
+    )
+    
+    if device and torch.cuda.is_available():
+        model = model.to(device)
+    
+    memory_usage = model.get_memory_usage()
+    logger.info(f"Created Enhanced CNN with Order Book: {memory_usage['total_parameters']:,} parameters")
+    logger.info(f"Model size: {memory_usage['model_size_mb']:.1f} MB")
+    
+    return model