#!/usr/bin/env python3
"""
Enhanced CNN Model for Trading - PyTorch Implementation
Much larger and more sophisticated architecture for better learning
"""

import os
import logging
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from typing import Dict, Any, Optional, Tuple

# Configure logging
logger = logging.getLogger(__name__)

class MultiHeadAttention(nn.Module):
    """Scaled dot-product self-attention computed over several heads.

    The input is linearly projected to queries, keys and values, attention is
    evaluated independently for each head, and the re-joined heads pass through
    a final linear projection.
    """

    def __init__(self, d_model: int, num_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        # Each head works on an equal-sized slice of the model dimension.
        assert d_model % num_heads == 0

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # Query, key, value and output projections, registered in this order
        # so parameter names and initialisation order stay stable.
        for projection_name in ("w_q", "w_k", "w_v", "w_o"):
            setattr(self, projection_name, nn.Linear(d_model, d_model))

        self.dropout = nn.Dropout(dropout)
        self.scale = math.sqrt(self.d_k)

    def _split_heads(self, projected: torch.Tensor) -> torch.Tensor:
        """Reshape [batch, seq, d_model] into [batch, heads, seq, d_k]."""
        batch_size, seq_len, _ = projected.size()
        per_head = projected.view(batch_size, seq_len, self.num_heads, self.d_k)
        return per_head.transpose(1, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply self-attention to ``x`` of shape [batch, seq, d_model].

        Returns a tensor with the same shape as ``x``.
        """
        batch_size, seq_len, _ = x.size()

        queries = self._split_heads(self.w_q(x))
        keys = self._split_heads(self.w_k(x))
        values = self._split_heads(self.w_v(x))

        # Similarity of every position with every other one, scaled by sqrt(d_k).
        similarity = torch.matmul(queries, keys.transpose(-2, -1)) / self.scale
        weights = self.dropout(F.softmax(similarity, dim=-1))

        # Weighted sum of the values, then merge the heads back together.
        context = torch.matmul(weights, values)
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(batch_size, seq_len, self.d_model)

        return self.w_o(merged)

class ResidualBlock(nn.Module):
    """Two 3-wide Conv1d + BatchNorm stages wrapped by a skip connection."""

    def __init__(self, channels: int, dropout: float = 0.1):
        super().__init__()
        # kernel_size=3 with padding=1 keeps the sequence length unchanged,
        # which the skip connection in forward() relies on.
        self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
        self.norm1 = nn.BatchNorm1d(channels)
        self.norm2 = nn.BatchNorm1d(channels)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return relu(x + F(x)) where F is conv-norm-relu-dropout-conv-norm."""
        # First stage: convolution, normalisation, activation, dropout.
        hidden = self.conv1(x)
        hidden = self.norm1(hidden)
        hidden = F.relu(hidden)
        hidden = self.dropout(hidden)

        # Second stage has no activation before the skip connection is added.
        hidden = self.conv2(hidden)
        hidden = self.norm2(hidden)

        return F.relu(torch.add(hidden, x))

class SpatialAttentionBlock(nn.Module):
    """Gate a feature map with a per-position sigmoid weight.

    A 1x1 convolution collapses the channels to a single value per position;
    its sigmoid is broadcast over all channels as a multiplicative gate.
    """

    def __init__(self, channels: int):
        super().__init__()
        self.conv = nn.Conv1d(channels, 1, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Scale ``x`` ([batch, channels, seq]) by its attention gate."""
        gate = self.conv(x).sigmoid()  # [batch, 1, seq], values in (0, 1)
        return x * gate

class EnhancedCNNModel(nn.Module):
    """
    Much larger and more sophisticated CNN architecture for trading
    Features:
    - Deep convolutional layers with residual connections
    - Multi-head attention mechanisms
    - Spatial attention blocks
    - Multiple feature extraction paths
    - Large capacity for complex pattern learning
    """
    
    def __init__(self,
                 input_size: int = 60,
                 feature_dim: int = 50,
                 output_size: int = 2,  # BUY/SELL for 2-action system
                 base_channels: int = 256,  # Increased from 128 to 256
                 num_blocks: int = 12,  # Increased from 6 to 12
                 num_attention_heads: int = 16,  # Increased from 8 to 16
                 dropout_rate: float = 0.2,
                 num_regimes: int = 8):
        """Build every sub-network of the model.

        Args:
            input_size: Stored on the instance only; no layer created here
                depends on it.
            feature_dim: Number of input features per time step (input width
                of the embedding).
            output_size: Number of trading actions produced by the decision
                head.
            base_channels: Base width; most layers are sized as multiples or
                fractions of it.
            num_blocks: Number of residual blocks.
            num_attention_heads: Heads of the first temporal attention layer.
                The second layer uses half as many (at least one).
                ``base_channels * 2`` must be divisible by both head counts
                because MultiHeadAttention asserts it.
            dropout_rate: Dropout probability for the layers built in this
                constructor (the conv paths use their own fixed rate).
            num_regimes: Number of classes emitted by the regime detector.

        NOTE: the order in which sub-modules are created below determines the
        state-dict layout and the order in which random initial weights are
        drawn, so it should not be rearranged casually.
        """
        super().__init__()

        self.input_size = input_size
        self.feature_dim = feature_dim
        self.output_size = output_size
        self.base_channels = base_channels
        # Keep the remaining hyper-parameters so the architecture can be
        # inspected or re-created from an instance.
        self.num_blocks = num_blocks
        self.num_attention_heads = num_attention_heads
        self.dropout_rate = dropout_rate
        self.num_regimes = num_regimes

        # Per-time-step embedding: project raw features up to base_channels.
        self.input_embedding = nn.Sequential(
            nn.Linear(feature_dim, base_channels // 2),
            nn.BatchNorm1d(base_channels // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(base_channels // 2, base_channels),
            nn.BatchNorm1d(base_channels),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Four parallel convolutional paths with kernel sizes 3, 5, 7 and 9.
        self.conv_path1 = self._build_conv_path(base_channels, base_channels, 3)
        self.conv_path2 = self._build_conv_path(base_channels, base_channels, 5)
        self.conv_path3 = self._build_conv_path(base_channels, base_channels, 7)
        self.conv_path4 = self._build_conv_path(base_channels, base_channels, 9)

        # Fuse the concatenated paths (4 * base_channels) down to 2 * base_channels.
        self.feature_fusion = nn.Sequential(
            nn.Conv1d(base_channels * 4, base_channels * 3, kernel_size=1),
            nn.BatchNorm1d(base_channels * 3),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Conv1d(base_channels * 3, base_channels * 2, kernel_size=1),
            nn.BatchNorm1d(base_channels * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Stack of residual blocks operating on the fused features.
        self.residual_blocks = nn.ModuleList([
            ResidualBlock(base_channels * 2, dropout_rate) for _ in range(num_blocks)
        ])

        # Six spatial attention gates, interleaved with the first residual
        # blocks inside forward().
        self.spatial_attention = nn.ModuleList([
            SpatialAttentionBlock(base_channels * 2) for _ in range(6)
        ])

        # Two stacked temporal self-attention layers.
        self.temporal_attention1 = MultiHeadAttention(
            d_model=base_channels * 2,
            num_heads=num_attention_heads,
            dropout=dropout_rate
        )
        self.temporal_attention2 = MultiHeadAttention(
            d_model=base_channels * 2,
            # Halve the head count but never drop to zero: with a single head
            # ``num_attention_heads // 2`` would be 0 and the attention layer
            # would fail with a division by zero.
            num_heads=max(1, num_attention_heads // 2),
            dropout=dropout_rate
        )

        # Global aggregation over the sequence axis (average and max).
        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)

        # MLP on the concatenated avg+max pooled features
        # (2 * 2 * base_channels inputs) ending at base_channels.
        self.advanced_features = nn.Sequential(
            nn.Linear(base_channels * 4, base_channels * 6),
            nn.BatchNorm1d(base_channels * 6),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels * 6, base_channels * 4),
            nn.BatchNorm1d(base_channels * 4),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels * 4, base_channels * 3),
            nn.BatchNorm1d(base_channels * 3),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels * 3, base_channels * 2),
            nn.BatchNorm1d(base_channels * 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels * 2, base_channels),
            nn.BatchNorm1d(base_channels),
            nn.ReLU(),
            nn.Dropout(dropout_rate)
        )

        # Market regime branch: outputs softmax probabilities over num_regimes.
        self.regime_detector = nn.Sequential(
            nn.Linear(base_channels, base_channels // 2),
            nn.BatchNorm1d(base_channels // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(base_channels // 2, base_channels // 4),
            nn.BatchNorm1d(base_channels // 4),
            nn.ReLU(),
            nn.Linear(base_channels // 4, num_regimes),
            nn.Softmax(dim=1)
        )

        # Volatility branch: a single sigmoid-squashed value per sample.
        self.volatility_predictor = nn.Sequential(
            nn.Linear(base_channels, base_channels // 2),
            nn.BatchNorm1d(base_channels // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(base_channels // 2, base_channels // 4),
            nn.BatchNorm1d(base_channels // 4),
            nn.ReLU(),
            nn.Linear(base_channels // 4, 1),
            nn.Sigmoid()
        )

        # Main decision head: consumes the processed features concatenated
        # with the regime probabilities and the volatility estimate.
        self.decision_head = nn.Sequential(
            nn.Linear(base_channels + num_regimes + 1, base_channels),
            nn.BatchNorm1d(base_channels),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels, base_channels // 2),
            nn.BatchNorm1d(base_channels // 2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(base_channels // 2, output_size)
        )

        # Confidence head: a single sigmoid-squashed value per sample.
        self.confidence_head = nn.Sequential(
            nn.Linear(base_channels, base_channels // 2),
            nn.ReLU(),
            nn.Linear(base_channels // 2, 1),
            nn.Sigmoid()
        )

        # Re-initialise Conv1d / Linear / BatchNorm1d parameters.
        self._initialize_weights()
        
    def _build_conv_path(self, in_channels: int, out_channels: int, kernel_size: int,
                         dropout: float = 0.1) -> nn.Module:
        """Build a three-stage Conv1d -> BatchNorm1d -> ReLU path.

        Dropout follows the first two stages only, giving an ``nn.Sequential``
        of 11 layers. Padding is ``kernel_size // 2``, which preserves the
        sequence length for odd kernel sizes.

        Args:
            in_channels: Channels of the incoming feature map.
            out_channels: Channels produced by every convolution in the path.
            kernel_size: Kernel width shared by the three convolutions.
            dropout: Dropout probability between stages (defaults to the
                previously hard-coded 0.1).

        Returns:
            The assembled ``nn.Sequential`` path.
        """
        num_stages = 3
        padding = kernel_size // 2

        layers = []
        stage_in = in_channels
        for stage in range(num_stages):
            layers.append(nn.Conv1d(stage_in, out_channels, kernel_size, padding=padding))
            layers.append(nn.BatchNorm1d(out_channels))
            layers.append(nn.ReLU())
            # No dropout after the final stage.
            if stage < num_stages - 1:
                layers.append(nn.Dropout(dropout))
            stage_in = out_channels

        return nn.Sequential(*layers)
    
    def _initialize_weights(self):
        """Re-initialise Conv1d, Linear and BatchNorm1d parameters.

        Conv1d weights use Kaiming-normal (fan_out, ReLU), Linear weights use
        Xavier-normal, biases are zeroed, and BatchNorm1d starts as identity
        (weight 1, bias 0). Other module types are left untouched.
        """
        for module in self.modules():
            if isinstance(module, nn.BatchNorm1d):
                nn.init.ones_(module.weight)
                nn.init.zeros_(module.bias)
                continue

            if isinstance(module, nn.Conv1d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
            else:
                continue

            # Shared by the Conv1d and Linear branches.
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    
    def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
        """Run the full network and return all task outputs.

        Args:
            x: Input tensor of shape [batch_size, sequence_length, features].
                It does not need to be contiguous.

        Returns:
            Dictionary with:
                'logits': raw decision-head outputs, [batch, output_size]
                'probabilities': softmax of the logits divided by a fixed
                    temperature of 1.5
                'confidence': confidence head output, [batch]
                'regime': regime probabilities, [batch, num_regimes]
                'volatility': volatility head output, [batch]
                'features': processed feature vector feeding all heads

        Raises:
            ValueError: If ``x`` is not 3-dimensional (shape unpacking fails).
        """
        batch_size, seq_len, features = x.shape

        # Flatten time into the batch axis so the embedding MLP (and its
        # BatchNorm1d) sees [batch*seq, features]. reshape() instead of view()
        # so non-contiguous inputs (e.g. transposed tensors) are accepted.
        flat_steps = x.reshape(-1, features)
        embedded = self.input_embedding(flat_steps)  # [batch*seq, base_channels]

        # Restore the sequence axis and switch to Conv1d layout:
        # [batch*seq, channels] -> [batch, channels, seq]
        embedded = embedded.view(batch_size, seq_len, -1).transpose(1, 2)

        # Multi-scale feature extraction, concatenated along the channel axis.
        multi_scale = torch.cat(
            [
                self.conv_path1(embedded),
                self.conv_path2(embedded),
                self.conv_path3(embedded),
                self.conv_path4(embedded),
            ],
            dim=1,
        )
        current_features = self.feature_fusion(multi_scale)

        # Residual stack. For the first len(spatial_attention) blocks, the
        # attention module with the same index is applied after every
        # even-indexed block; later blocks run without attention.
        # NOTE: attention modules at odd indices are therefore never called.
        num_gated = len(self.spatial_attention)
        for index, res_block in enumerate(self.residual_blocks):
            current_features = res_block(current_features)
            if index < num_gated and index % 2 == 0:
                current_features = self.spatial_attention[index](current_features)

        # Temporal attention works on [batch, seq, channels]; convert there
        # and back to the conv layout afterwards.
        attended = current_features.transpose(1, 2)
        attended = self.temporal_attention1(attended)
        attended = self.temporal_attention2(attended)
        attended = attended.transpose(1, 2)

        # Collapse the sequence axis with both average and max pooling.
        avg_pooled = self.global_pool(attended).squeeze(-1)      # [batch, channels]
        max_pooled = self.global_max_pool(attended).squeeze(-1)  # [batch, channels]
        global_features = torch.cat([avg_pooled, max_pooled], dim=1)

        processed_features = self.advanced_features(global_features)

        # Auxiliary task heads.
        regime_probs = self.regime_detector(processed_features)
        volatility_pred = self.volatility_predictor(processed_features)
        confidence = self.confidence_head(processed_features)

        # The decision head also sees the regime and volatility estimates.
        combined_features = torch.cat(
            [processed_features, regime_probs, volatility_pred], dim=1
        )
        trading_logits = self.decision_head(combined_features)

        # Temperature > 1 softens the reported probabilities; the returned
        # logits themselves are left unscaled.
        temperature = 1.5
        trading_probs = F.softmax(trading_logits / temperature, dim=1)

        return {
            'logits': trading_logits,
            'probabilities': trading_probs,
            'confidence': confidence.squeeze(-1),
            'regime': regime_probs,
            'volatility': volatility_pred.squeeze(-1),
            'features': processed_features
        }
    
    def predict(self, feature_matrix: np.ndarray) -> Dict[str, Any]:
        """Run inference on a single sequence.

        The model is switched to eval mode for the forward pass and its
        previous train/eval mode is restored afterwards, so calling this in
        the middle of training does not silently disable dropout/batch-norm
        updates.

        Args:
            feature_matrix: Array (or tensor) of shape
                [sequence_length, features]. It is converted to float32.

        Returns:
            Dictionary with the chosen action index and name, the confidence
            head output, the probability of the chosen action, all action
            probabilities, regime probabilities, the volatility estimate and
            the raw logits, all as plain Python floats/lists.
            NOTE: 'action_name' is 'BUY' for index 0 and 'SELL' for any other
            index, which matches the default two-action setup only.

        Raises:
            ValueError: If ``feature_matrix`` is not 2-dimensional.
        """
        was_training = self.training
        self.eval()

        try:
            with torch.no_grad():
                # Accept arrays and tensors of any numeric dtype.
                x = torch.as_tensor(feature_matrix, dtype=torch.float32)
                if x.dim() != 2:
                    raise ValueError(
                        "feature_matrix must have shape [sequence_length, features], "
                        f"got {tuple(x.shape)}"
                    )

                # Add the batch dimension and move to the model's device.
                device = next(self.parameters()).device
                x = x.unsqueeze(0).to(device)

                # Call the module (not .forward) so registered hooks run.
                outputs = self(x)

                # Strip the batch dimension from every output.
                probs = outputs['probabilities'].cpu().numpy()[0]
                confidence = outputs['confidence'].cpu().numpy()[0]
                regime = outputs['regime'].cpu().numpy()[0]
                volatility = outputs['volatility'].cpu().numpy()[0]
                raw_logits = outputs['logits'].cpu().numpy()[0]

                # Determine action (0=BUY, 1=SELL for 2-action system)
                action = int(np.argmax(probs))
                action_confidence = float(probs[action])

                return {
                    'action': action,
                    'action_name': 'BUY' if action == 0 else 'SELL',
                    'confidence': float(confidence),
                    'action_confidence': action_confidence,
                    'probabilities': probs.tolist(),
                    'regime_probabilities': regime.tolist(),
                    'volatility_prediction': float(volatility),
                    'raw_logits': raw_logits.tolist()
                }
        finally:
            # Restore whatever mode the caller had the model in.
            self.train(was_training)
    
    def get_memory_usage(self) -> Dict[str, Any]:
        """Report parameter counts and the storage used by parameters/buffers.

        Returns:
            Dictionary with the total and trainable parameter counts and the
            parameter, buffer and combined sizes in mebibytes.
        """
        bytes_per_mb = 1024 * 1024

        param_count = 0
        trainable_count = 0
        param_bytes = 0
        for param in self.parameters():
            count = param.numel()
            param_count += count
            param_bytes += count * param.element_size()
            if param.requires_grad:
                trainable_count += count

        buffer_bytes = 0
        for buf in self.buffers():
            buffer_bytes += buf.numel() * buf.element_size()

        return {
            'total_parameters': param_count,
            'trainable_parameters': trainable_count,
            'parameter_size_mb': param_bytes / bytes_per_mb,
            'buffer_size_mb': buffer_bytes / bytes_per_mb,
            'total_size_mb': (param_bytes + buffer_bytes) / bytes_per_mb
        }
    
    def to_device(self, device: str):
        """Move the model to the device named by ``device`` and return it."""
        target = torch.device(device)
        return self.to(target)

class CNNModelTrainer:
    """Multi-task trainer for EnhancedCNNModel.

    Owns the AdamW optimizer, a OneCycleLR schedule and the loss functions for
    the main trading head and the optional confidence / regime / volatility
    heads, and provides checkpoint save/load helpers.
    """

    def __init__(self, model: "EnhancedCNNModel", learning_rate: float = 0.0001,
                 device: str = 'cuda', total_steps: int = 10000):
        """Set up optimizer, scheduler and loss functions.

        Args:
            model: Model to train; it is moved to ``device``.
            learning_rate: Base learning rate. The one-cycle schedule peaks at
                ten times this value.
            device: Target device. If a CUDA device is requested but CUDA is
                not available, the trainer falls back to ``'cpu'`` with a
                warning instead of failing.
            total_steps: Length of the one-cycle schedule in optimizer steps.
                Steps taken beyond it keep the final learning rate.
        """
        if torch.device(device).type == 'cuda' and not torch.cuda.is_available():
            logger.warning("CUDA requested but not available; falling back to CPU")
            device = 'cpu'

        self.model = model.to(device)
        self.device = device
        self.learning_rate = learning_rate

        # Use AdamW optimizer with weight decay
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=learning_rate,
            weight_decay=0.01,
            betas=(0.9, 0.999)
        )

        # One-cycle learning rate schedule: warm up for the first 10% of the
        # steps, then cosine-anneal.
        self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
            self.optimizer,
            max_lr=learning_rate * 10,
            total_steps=total_steps,
            pct_start=0.1,
            anneal_strategy='cos'
        )

        # Multi-task loss functions
        self.main_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        self.confidence_criterion = nn.BCELoss()
        self.regime_criterion = nn.CrossEntropyLoss()
        self.volatility_criterion = nn.MSELoss()

        self.training_history = []

    def train_step(self, x: torch.Tensor, y: torch.Tensor,
                   confidence_targets: Optional[torch.Tensor] = None,
                   regime_targets: Optional[torch.Tensor] = None,
                   volatility_targets: Optional[torch.Tensor] = None) -> Dict[str, float]:
        """Run one optimisation step and return the scalar metrics.

        Args:
            x: Input batch for the model.
            y: Class-index targets for the main trading head.
            confidence_targets: Optional targets in [0, 1] for the confidence
                head (weight 0.1).
            regime_targets: Optional class-index targets for the regime head
                (weight 0.05).
            volatility_targets: Optional targets for the volatility head
                (weight 0.05).

        Returns:
            Dictionary with 'main_loss', 'total_loss', 'accuracy' and one
            entry per auxiliary loss whose targets were supplied.
        """
        self.model.train()
        self.optimizer.zero_grad()

        # Make sure the batch lives on the same device as the model.
        x = x.to(self.device)
        y = y.to(self.device)

        outputs = self.model(x)

        # Main trading loss
        main_loss = self.main_criterion(outputs['logits'], y)
        total_loss = main_loss
        losses = {'main_loss': main_loss.item()}

        # Confidence loss (if targets provided)
        if confidence_targets is not None:
            conf_loss = self.confidence_criterion(
                outputs['confidence'], confidence_targets.to(self.device)
            )
            total_loss = total_loss + 0.1 * conf_loss
            losses['confidence_loss'] = conf_loss.item()

        # Regime classification loss (if targets provided)
        if regime_targets is not None:
            # The model's regime output is already a softmax distribution,
            # while CrossEntropyLoss applies log-softmax itself. Feeding it
            # log-probabilities makes that internal log-softmax an identity,
            # yielding the proper negative log-likelihood instead of a
            # double-softmax loss.
            regime_log_probs = torch.log(outputs['regime'].clamp_min(1e-8))
            regime_loss = self.regime_criterion(
                regime_log_probs, regime_targets.to(self.device)
            )
            total_loss = total_loss + 0.05 * regime_loss
            losses['regime_loss'] = regime_loss.item()

        # Volatility prediction loss (if targets provided)
        if volatility_targets is not None:
            vol_loss = self.volatility_criterion(
                outputs['volatility'], volatility_targets.to(self.device)
            )
            total_loss = total_loss + 0.05 * vol_loss
            losses['volatility_loss'] = vol_loss.item()

        losses['total_loss'] = total_loss.item()

        total_loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

        self.optimizer.step()
        # OneCycleLR raises once it is stepped past total_steps; after the
        # schedule is exhausted keep training at the final learning rate.
        if self.scheduler.last_epoch < self.scheduler.total_steps:
            self.scheduler.step()

        # Batch accuracy of the main head.
        with torch.no_grad():
            predictions = torch.argmax(outputs['probabilities'], dim=1)
            losses['accuracy'] = (predictions == y).float().mean().item()

        return losses

    def save_model(self, filepath: str, metadata: Optional[Dict] = None):
        """Write model, optimizer and scheduler state plus history to ``filepath``.

        Args:
            filepath: Destination path passed to ``torch.save``.
            metadata: Optional extra dictionary stored under 'metadata'.
        """
        save_dict = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict(),
            'training_history': self.training_history,
            'model_config': {
                'input_size': self.model.input_size,
                'feature_dim': self.model.feature_dim,
                'output_size': self.model.output_size,
                'base_channels': self.model.base_channels
            }
        }

        if metadata:
            save_dict['metadata'] = metadata

        torch.save(save_dict, filepath)
        logger.info(f"Enhanced CNN model saved to {filepath}")

    def load_model(self, filepath: str) -> Dict:
        """Restore trainer state from a checkpoint written by ``save_model``.

        Args:
            filepath: Path of the checkpoint file.

        Returns:
            The 'metadata' dictionary stored in the checkpoint, or an empty
            dictionary if none was saved.
        """
        # NOTE(security): torch.load unpickles the file; only load checkpoints
        # that come from a trusted source.
        checkpoint = torch.load(filepath, map_location=self.device)

        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # Scheduler state and history are optional in the checkpoint.
        if 'scheduler_state_dict' in checkpoint:
            self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

        if 'training_history' in checkpoint:
            self.training_history = checkpoint['training_history']

        logger.info(f"Enhanced CNN model loaded from {filepath}")
        return checkpoint.get('metadata', {})

def create_enhanced_cnn_model(input_size: int = 60, 
                            feature_dim: int = 50, 
                            output_size: int = 2,
                            base_channels: int = 256,
                            device: str = 'cuda') -> Tuple[EnhancedCNNModel, CNNModelTrainer]:
    """Build an EnhancedCNNModel together with its trainer.

    The model uses 12 residual blocks, 16 attention heads and a dropout rate
    of 0.2; the trainer uses a learning rate of 0.0001 on ``device``.

    Returns:
        Tuple ``(model, trainer)``.
    """
    # Architecture settings that this factory fixes for every model it builds.
    model_kwargs = dict(
        input_size=input_size,
        feature_dim=feature_dim,
        output_size=output_size,
        base_channels=base_channels,
        num_blocks=12,
        num_attention_heads=16,
        dropout_rate=0.2,
    )
    model = EnhancedCNNModel(**model_kwargs)

    trainer = CNNModelTrainer(model, learning_rate=0.0001, device=device)

    total_parameters = model.get_memory_usage()['total_parameters']
    logger.info(f"Created enhanced CNN model with {total_parameters:,} parameters")

    return model, trainer