"""
Enhanced CNN Model with Bookmap Order Book Integration

This module extends the enhanced CNN to incorporate:
- Traditional market data (OHLCV, indicators)
- Order book depth features (COB)
- Volume profile features (SVP)
- Order flow signals (sweeps, absorptions, momentum)
- Market microstructure metrics

The integrated model provides comprehensive market awareness for superior trading decisions.
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
from typing import Dict, List, Optional, Tuple, Any

logger = logging.getLogger(__name__)

class ResidualBlock(nn.Module):
    """Enhanced residual block with skip connections"""
    
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResidualBlock, self).__init__()
        self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm1d(out_channels)
        
        # Shortcut connection
        self.shortcut = nn.Sequential()
        if stride != 1 or in_channels != out_channels:
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm1d(out_channels)
            )
    
    def forward(self, x):
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Avoid in-place operation
        out = out + self.shortcut(x)
        out = F.relu(out)
        return out

class MultiHeadAttention(nn.Module):
    """Multi-head attention mechanism"""
    
    def __init__(self, dim, num_heads=8, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        
        self.q_linear = nn.Linear(dim, dim)
        self.k_linear = nn.Linear(dim, dim)
        self.v_linear = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dim, dim)
    
    def forward(self, x):
        batch_size, seq_len, dim = x.size()
        
        # Linear transformations
        q = self.q_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        k = self.k_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        v = self.v_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
        
        # Transpose for attention
        q = q.transpose(1, 2)
        k = k.transpose(1, 2)
        v = v.transpose(1, 2)
        
        # Scaled dot-product attention
        scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.head_dim)
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        attn_output = torch.matmul(attn_weights, v)
        attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, dim)
        
        return self.out(attn_output), attn_weights

class OrderBookEncoder(nn.Module):
    """Specialized encoder for order book data"""
    
    def __init__(self, input_dim=100, hidden_dim=512):
        super(OrderBookEncoder, self).__init__()
        
        # Order book feature processing
        self.bid_encoder = nn.Sequential(
            nn.Linear(40, 128),  # 20 levels x 2 features
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        self.ask_encoder = nn.Sequential(
            nn.Linear(40, 128),  # 20 levels x 2 features
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Microstructure features
        self.microstructure_encoder = nn.Sequential(
            nn.Linear(15, 64),  # Liquidity + imbalance + flow features
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Cross-attention between bids and asks
        self.cross_attention = MultiHeadAttention(256, num_heads=8)
        
        # Output projection
        self.output_projection = nn.Sequential(
            nn.Linear(256 + 256 + 128, hidden_dim),  # Combine all features
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim)
        )
    
    def forward(self, orderbook_features):
        """
        Process order book features
        
        Args:
            orderbook_features: Tensor of shape [batch, 100] containing:
                - 40 bid features (20 levels x 2)
                - 40 ask features (20 levels x 2) 
                - 15 microstructure features
                - 5 flow signal features
        """
        # Split features
        bid_features = orderbook_features[:, :40]      # First 40 features
        ask_features = orderbook_features[:, 40:80]    # Next 40 features
        micro_features = orderbook_features[:, 80:95]  # Next 15 features
        # flow_features = orderbook_features[:, 95:100]  # Last 5 features (included in micro)
        
        # Encode each component
        bid_encoded = self.bid_encoder(bid_features)      # [batch, 256]
        ask_encoded = self.ask_encoder(ask_features)      # [batch, 256]
        micro_encoded = self.microstructure_encoder(micro_features)  # [batch, 128]
        
        # Add sequence dimension for attention
        bid_seq = bid_encoded.unsqueeze(1)  # [batch, 1, 256]
        ask_seq = ask_encoded.unsqueeze(1)  # [batch, 1, 256]
        
        # Cross-attention between bids and asks
        combined_seq = torch.cat([bid_seq, ask_seq], dim=1)  # [batch, 2, 256]
        attended_features, attention_weights = self.cross_attention(combined_seq)
        
        # Flatten attended features
        attended_flat = attended_features.view(attended_features.size(0), -1)  # [batch, 512]
        
        # Combine with microstructure features
        combined_features = torch.cat([attended_flat, micro_encoded], dim=1)  # [batch, 640]
        
        # Final projection
        output = self.output_projection(combined_features)
        
        return output

class VolumeProfileEncoder(nn.Module):
    """Encoder for volume profile data"""
    
    def __init__(self, max_levels=50, hidden_dim=256):
        super(VolumeProfileEncoder, self).__init__()
        
        self.max_levels = max_levels
        
        # Process volume profile levels
        self.level_encoder = nn.Sequential(
            nn.Linear(7, 32),  # price, volume, buy_vol, sell_vol, trades, vwap, net_vol
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 64),
            nn.ReLU()
        )
        
        # Attention over price levels
        self.level_attention = MultiHeadAttention(64, num_heads=4)
        
        # Final aggregation
        self.aggregator = nn.Sequential(
            nn.Linear(64, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, hidden_dim)
        )
    
    def forward(self, volume_profile_data):
        """
        Process volume profile data
        
        Args:
            volume_profile_data: List of dicts or tensor with volume profile levels
        """
        # If input is list of dicts, convert to tensor
        if isinstance(volume_profile_data, list):
            if not volume_profile_data:
                # Return zero features if no data
                batch_size = 1
                return torch.zeros(batch_size, self.aggregator[-1].out_features)
            
            # Convert to tensor
            features = []
            for level in volume_profile_data[:self.max_levels]:
                level_features = [
                    level.get('price', 0.0),
                    level.get('volume', 0.0),
                    level.get('buy_volume', 0.0),
                    level.get('sell_volume', 0.0),
                    level.get('trades_count', 0.0),
                    level.get('vwap', 0.0),
                    level.get('net_volume', 0.0)
                ]
                features.append(level_features)
            
            # Pad if needed
            while len(features) < self.max_levels:
                features.append([0.0] * 7)
            
            volume_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
        else:
            volume_tensor = volume_profile_data
        
        batch_size, num_levels, feature_dim = volume_tensor.shape
        
        # Encode each level
        level_features = self.level_encoder(volume_tensor.view(-1, feature_dim))
        level_features = level_features.view(batch_size, num_levels, -1)
        
        # Apply attention across levels
        attended_levels, _ = self.level_attention(level_features)
        
        # Global average pooling
        aggregated = torch.mean(attended_levels, dim=1)
        
        # Final processing
        output = self.aggregator(aggregated)
        
        return output

class EnhancedCNNWithOrderBook(nn.Module):
    """
    Enhanced CNN model integrating traditional market data with order book analysis
    
    Features:
    - Multi-scale convolutional processing for time series data
    - Specialized order book feature extraction
    - Volume profile analysis
    - Order flow signal integration
    - Multi-head attention mechanisms
    - Dueling architecture for value and advantage estimation
    """
    
    def __init__(self, 
                 market_input_shape=(60, 50),  # Traditional market data
                 orderbook_features=100,       # Order book feature dimension
                 n_actions=2,
                 confidence_threshold=0.5):
        super(EnhancedCNNWithOrderBook, self).__init__()
        
        self.market_input_shape = market_input_shape
        self.orderbook_features = orderbook_features
        self.n_actions = n_actions
        self.confidence_threshold = confidence_threshold
        
        # Traditional market data processing
        self.market_encoder = self._build_market_encoder()
        
        # Order book data processing
        self.orderbook_encoder = OrderBookEncoder(
            input_dim=orderbook_features,
            hidden_dim=512
        )
        
        # Volume profile processing
        self.volume_encoder = VolumeProfileEncoder(
            max_levels=50,
            hidden_dim=256
        )
        
        # Feature fusion
        total_features = 1024 + 512 + 256  # market + orderbook + volume
        self.feature_fusion = nn.Sequential(
            nn.Linear(total_features, 1536),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(1536, 1024),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Multi-head attention for integrated features
        self.integrated_attention = MultiHeadAttention(1024, num_heads=16)
        
        # Dueling architecture
        self.advantage_stream = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, n_actions)
        )
        
        self.value_stream = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )
        
        # Auxiliary heads for multi-task learning
        self.extrema_head = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 3)  # bottom, top, neither
        )
        
        self.market_regime_head = nn.Sequential(
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 8)  # trending, ranging, volatile, etc.
        )
        
        self.confidence_head = nn.Sequential(
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )
        
        # Initialize weights
        self._initialize_weights()
        
        # Device management
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        
        logger.info(f"Enhanced CNN with Order Book initialized")
        logger.info(f"Market input shape: {market_input_shape}")
        logger.info(f"Order book features: {orderbook_features}")
        logger.info(f"Output actions: {n_actions}")
    
    def _build_market_encoder(self):
        """Build traditional market data encoder"""
        seq_len, feature_dim = self.market_input_shape
        
        return nn.Sequential(
            # Input projection
            nn.Linear(feature_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # Convolutional layers for temporal patterns
            nn.Conv1d(128, 256, kernel_size=5, padding=2),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            ResidualBlock(256, 512),
            ResidualBlock(512, 512),
            ResidualBlock(512, 768),
            ResidualBlock(768, 768),
            
            # Global pooling
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            
            # Final projection
            nn.Linear(768, 1024),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
    
    def _initialize_weights(self):
        """Initialize model weights"""
        for m in self.modules():
            if isinstance(m, nn.Conv1d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
    
    def forward(self, market_data, orderbook_data, volume_profile_data=None):
        """
        Forward pass through integrated model
        
        Args:
            market_data: Traditional market data [batch, seq_len, features]
            orderbook_data: Order book features [batch, orderbook_features]
            volume_profile_data: Volume profile data (optional)
        
        Returns:
            Dictionary with Q-values, confidence, regime, and auxiliary predictions
        """
        batch_size = market_data.size(0)
        
        # Process market data
        if len(market_data.shape) == 2:
            market_data = market_data.unsqueeze(0)
        
        # Reshape for convolutional processing
        market_reshaped = market_data.view(batch_size, -1, market_data.size(-1))
        market_features = self.market_encoder(market_reshaped.transpose(1, 2))
        
        # Process order book data
        orderbook_features = self.orderbook_encoder(orderbook_data)
        
        # Process volume profile data
        if volume_profile_data is not None:
            volume_features = self.volume_encoder(volume_profile_data)
        else:
            volume_features = torch.zeros(batch_size, 256, device=self.device)
        
        # Fuse all features
        combined_features = torch.cat([
            market_features,
            orderbook_features,
            volume_features
        ], dim=1)
        
        # Feature fusion
        fused_features = self.feature_fusion(combined_features)
        
        # Apply attention
        attended_features = fused_features.unsqueeze(1)  # Add sequence dimension
        attended_output, attention_weights = self.integrated_attention(attended_features)
        final_features = attended_output.squeeze(1)  # Remove sequence dimension
        
        # Dueling architecture
        advantage = self.advantage_stream(final_features)
        value = self.value_stream(final_features)
        
        # Combine value and advantage
        q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
        
        # Auxiliary predictions
        extrema_pred = self.extrema_head(final_features)
        regime_pred = self.market_regime_head(final_features)
        confidence = self.confidence_head(final_features)
        
        return {
            'q_values': q_values,
            'confidence': confidence,
            'extrema_prediction': extrema_pred,
            'market_regime': regime_pred,
            'attention_weights': attention_weights,
            'integrated_features': final_features
        }
    
    def predict(self, market_data, orderbook_data, volume_profile_data=None):
        """Make prediction with confidence thresholding"""
        self.eval()
        
        with torch.no_grad():
            # Convert inputs to tensors if needed
            if isinstance(market_data, np.ndarray):
                market_data = torch.FloatTensor(market_data).to(self.device)
            if isinstance(orderbook_data, np.ndarray):
                orderbook_data = torch.FloatTensor(orderbook_data).to(self.device)
            
            # Ensure batch dimension
            if len(market_data.shape) == 2:
                market_data = market_data.unsqueeze(0)
            if len(orderbook_data.shape) == 1:
                orderbook_data = orderbook_data.unsqueeze(0)
            
            # Forward pass
            outputs = self.forward(market_data, orderbook_data, volume_profile_data)
            
            # Get probabilities
            q_values = outputs['q_values']
            probs = F.softmax(q_values, dim=1)
            
            # Handle confidence shape properly to avoid scalar conversion errors
            confidence_tensor = outputs['confidence']
            if isinstance(confidence_tensor, torch.Tensor):
                if confidence_tensor.numel() == 1:
                    confidence = confidence_tensor.item()
                else:
                    confidence = confidence_tensor.flatten()[0].item()
            else:
                confidence = float(confidence_tensor)
            
            # Action selection with confidence thresholding
            if confidence >= self.confidence_threshold:
                action = torch.argmax(q_values, dim=1).item()
            else:
                action = None  # No action due to low confidence
            
            return {
                'action': action,
                'probabilities': probs.cpu().numpy()[0],
                'confidence': confidence,
                'q_values': q_values.cpu().numpy()[0],
                'extrema_prediction': F.softmax(outputs['extrema_prediction'], dim=1).cpu().numpy()[0],
                'market_regime': F.softmax(outputs['market_regime'], dim=1).cpu().numpy()[0]
            }
    
    def get_feature_importance(self, market_data, orderbook_data, volume_profile_data=None):
        """Analyze feature importance using gradients"""
        self.eval()
        
        # Enable gradient computation for inputs
        market_data.requires_grad_(True)
        orderbook_data.requires_grad_(True)
        
        # Forward pass
        outputs = self.forward(market_data, orderbook_data, volume_profile_data)
        
        # Compute gradients for Q-values
        q_values = outputs['q_values']
        q_values.sum().backward()
        
        # Get gradient magnitudes
        market_importance = torch.abs(market_data.grad).mean().item()
        orderbook_importance = torch.abs(orderbook_data.grad).mean().item()
        
        return {
            'market_importance': market_importance,
            'orderbook_importance': orderbook_importance,
            'total_importance': market_importance + orderbook_importance
        }
    
    def save(self, path):
        """Save model state"""
        torch.save({
            'model_state_dict': self.state_dict(),
            'market_input_shape': self.market_input_shape,
            'orderbook_features': self.orderbook_features,
            'n_actions': self.n_actions,
            'confidence_threshold': self.confidence_threshold
        }, path)
        logger.info(f"Enhanced CNN with Order Book saved to {path}")
    
    def load(self, path):
        """Load model state"""
        checkpoint = torch.load(path, map_location=self.device)
        self.load_state_dict(checkpoint['model_state_dict'])
        logger.info(f"Enhanced CNN with Order Book loaded from {path}")
    
    def get_memory_usage(self):
        """Get model memory usage statistics"""
        total_params = sum(p.numel() for p in self.parameters())
        trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
        
        return {
            'total_parameters': total_params,
            'trainable_parameters': trainable_params,
            'model_size_mb': total_params * 4 / (1024 * 1024),  # Assuming float32
        }

def create_enhanced_cnn_with_orderbook(
    market_input_shape=(60, 50),
    orderbook_features=100,
    n_actions=2,
    device='cuda'
):
    """Create and initialize enhanced CNN with order book integration"""
    
    model = EnhancedCNNWithOrderBook(
        market_input_shape=market_input_shape,
        orderbook_features=orderbook_features,
        n_actions=n_actions
    )
    
    if device and torch.cuda.is_available():
        model = model.to(device)
    
    memory_usage = model.get_memory_usage()
    logger.info(f"Created Enhanced CNN with Order Book: {memory_usage['total_parameters']:,} parameters")
    logger.info(f"Model size: {memory_usage['model_size_mb']:.1f} MB")
    
    return model