import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import os
import logging
import torch.nn.functional as F
from typing import List, Tuple

# Configure logging for this module.
# NOTE(review): basicConfig() runs at import time and sets up the root logger
# at INFO level (per the logging docs it is a no-op when the root logger
# already has handlers) - confirm this import-time side effect is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger shared by the model classes defined below.
logger = logging.getLogger(__name__)

class PricePatternAttention(nn.Module):
    """
    Single-head scaled dot-product self-attention over a feature sequence.

    Designed to focus on price patterns that might indicate local extrema
    or trend reversals. Queries, keys and values are all linear projections
    of the same input.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        hidden_dim: Size of the query/key/value projections, and therefore
            of the last dimension of the attended output.
    """
    def __init__(self, input_dim, hidden_dim=64):
        super().__init__()
        self.query = nn.Linear(input_dim, hidden_dim)
        self.key = nn.Linear(input_dim, hidden_dim)
        self.value = nn.Linear(input_dim, hidden_dim)
        # Registered as a buffer so it follows the module across
        # .to()/.cuda() calls; non-persistent so the state_dict keys stay
        # exactly as before and existing checkpoints keep loading.
        self.register_buffer(
            'scale',
            torch.sqrt(torch.tensor(hidden_dim, dtype=torch.float32)),
            persistent=False,
        )

    def forward(self, x):
        """
        Apply self-attention to an input sequence.

        Args:
            x: Tensor of shape [batch_size, seq_len, features].

        Returns:
            Tuple ``(output, attn_weights)`` where ``output`` has shape
            [batch_size, seq_len, hidden_dim] and ``attn_weights`` has shape
            [batch_size, seq_len, seq_len]; each row of ``attn_weights``
            sums to 1 (softmax over the last dimension).

        Raises:
            ValueError: If ``x`` is not a 3D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                f"Expected 3D input [batch_size, seq_len, features], got shape {tuple(x.size())}"
            )

        # Project input to query, key, value: [batch_size, seq_len, hidden_dim]
        q = self.query(x)
        k = self.key(x)
        v = self.value(x)

        # Scaled dot-product scores: [batch_size, seq_len, seq_len]
        scores = torch.matmul(q, k.transpose(-2, -1)) / self.scale

        # Normalise over the key axis to get attention weights
        attn_weights = F.softmax(scores, dim=-1)

        # Weighted sum of values: [batch_size, seq_len, hidden_dim]
        output = torch.matmul(attn_weights, v)

        return output, attn_weights

class AdaptiveNorm(nn.Module):
    """
    Adaptive normalization layer that chooses between different normalization
    methods based on input dimensions.

    For an input of shape [batch_size, channels, seq_len]:
      - batch_size > 1 and seq_len > 1: BatchNorm1d
      - batch_size <= 1 and seq_len > 1: GroupNorm
      - seq_len <= 1: LayerNorm over (channels, seq_len)

    All three sub-layers are created in ``__init__`` so their parameters are
    visible to optimizers and to ``state_dict()`` from the start; no module
    is created during ``forward``.

    Args:
        num_features: Number of channels expected in the input.
    """
    def __init__(self, num_features):
        super().__init__()
        self.batch_norm = nn.BatchNorm1d(num_features, affine=True)

        # GroupNorm requires num_features to be divisible by the group count.
        # Start from min(32, num_features) and step down to the nearest
        # divisor so channel counts such as 48 do not raise at construction.
        num_groups = min(32, num_features)
        while num_groups > 1 and num_features % num_groups:
            num_groups -= 1
        self.group_norm = nn.GroupNorm(num_groups, num_features)

        self.layer_norm = nn.LayerNorm([num_features, 1])

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        """Map legacy ``layer_norm_1d.*`` checkpoint keys onto ``layer_norm``.

        Earlier versions of this layer created a ``layer_norm_1d`` sub-module
        lazily inside ``forward``; checkpoints saved after that happened carry
        its keys. It was the layer actually applied for seq_len == 1, so its
        weights take the place of ``layer_norm``'s.
        """
        for param_name in ('weight', 'bias'):
            legacy_key = f"{prefix}layer_norm_1d.{param_name}"
            if legacy_key in state_dict:
                state_dict[f"{prefix}layer_norm.{param_name}"] = state_dict.pop(legacy_key)
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    def forward(self, x):
        """
        Normalize ``x`` with the method selected from its shape.

        Args:
            x: Tensor of shape [batch_size, channels, seq_len].

        Returns:
            Normalized tensor with the same shape as ``x``.
        """
        batch_size, channels, seq_len = x.size()

        if batch_size > 1 and seq_len > 1:
            return self.batch_norm(x)
        if seq_len > 1:
            return self.group_norm(x)

        # seq_len <= 1: use the registered LayerNorm when the shape matches.
        if (channels, seq_len) == tuple(self.layer_norm.normalized_shape):
            return self.layer_norm(x)

        # Shape differs from what the layer was built for: normalize without
        # learnable parameters (equivalent to a freshly initialised LayerNorm)
        # rather than registering a new module mid-forward.
        return F.layer_norm(x, (channels, seq_len))

class SimpleCNN(nn.Module):
    """
    Simple CNN model for reinforcement learning with image-like state inputs
    """
    def __init__(self, input_shape, n_actions):
        super(SimpleCNN, self).__init__()
        
        # Store dimensions
        self.input_shape = input_shape
        self.n_actions = n_actions
        
        # Calculate input dimensions
        if len(input_shape) == 3:  # [channels, height, width]
            self.channels, self.height, self.width = input_shape
            self.feature_dim = self.height * self.width
        elif len(input_shape) == 2:  # [timeframes, features]
            self.channels = input_shape[0]
            self.features = input_shape[1]
            self.feature_dim = self.features
        elif len(input_shape) == 1:  # [features]
            self.channels = 1
            self.features = input_shape[0]
            self.feature_dim = self.features
        else:
            raise ValueError(f"Unsupported input shape: {input_shape}")
        
        # Build network
        self._build_network()
        
        # Initialize device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)
        
        logger.info(f"SimpleCNN initialized with input shape: {input_shape}, actions: {n_actions}")
    
    def _build_network(self):
        """Build the neural network with current feature dimensions"""
        # Create a flexible architecture that adapts to input dimensions
        # Increased complexity
        self.fc_layers = nn.Sequential(
            nn.Linear(self.feature_dim, 512), # Increased size
            nn.ReLU(),
            nn.Dropout(0.2), # Added dropout
            nn.Linear(512, 512),             # Increased size
            nn.ReLU(),
            nn.Dropout(0.2),             # Added dropout
            nn.Linear(512, 512),             # Added layer
            nn.ReLU(),
            nn.Dropout(0.2)              # Added dropout
        )
        
        # Output heads (Dueling DQN architecture)
        self.advantage_head = nn.Linear(512, self.n_actions) # Updated input size
        self.value_head = nn.Linear(512, 1)                 # Updated input size
        
        # Extrema detection head
        self.extrema_head = nn.Linear(512, 3)  # 0=bottom, 1=top, 2=neither, Updated input size
        
        # Price prediction heads for different timeframes
        self.price_pred_immediate = nn.Linear(512, 3)  # Updated input size
        self.price_pred_midterm = nn.Linear(512, 3)    # Updated input size
        self.price_pred_longterm = nn.Linear(512, 3)   # Updated input size
        
        # Regression heads for exact price prediction
        self.price_pred_value = nn.Linear(512, 4)  # Updated input size
    
    def _check_rebuild_network(self, features):
        """Check if network needs to be rebuilt for different feature dimensions"""
        if features != self.feature_dim:
            logger.info(f"Rebuilding network for new feature dimension: {features} (was {self.feature_dim})")
            self.feature_dim = features
            self._build_network()
            # Move to device after rebuilding
            self.to(self.device)
            return True
        return False
        
    def forward(self, x):
        """Forward pass through the network"""
        # Flatten input if needed to ensure it matches the expected feature dimension
        batch_size = x.size(0)
        
        # Reshape input if needed
        if len(x.shape) > 2:  # Handle multi-dimensional input
            # For 3D input: [batch, seq_len, features] or [batch, channels, features]
            x = x.reshape(batch_size, -1)  # Flatten to [batch, seq_len*features]
        
        # Check if the feature dimension matches and rebuild if necessary
        if x.size(1) != self.feature_dim:
            self._check_rebuild_network(x.size(1))
        
        # Apply fully connected layers with ReLU activation
        x = self.fc_layers(x)
        
        # Branch 1: Action values (Q-values)
        action_values = self.advantage_head(x)
        
        # Branch 2: Extrema detection (market top/bottom classification)
        extrema_pred = self.extrema_head(x)
        
        # Branch 3: Price movement prediction over different timeframes
        # Split into three timeframes: immediate, midterm, longterm
        price_immediate = self.price_pred_immediate(x)
        price_midterm = self.price_pred_midterm(x)
        price_longterm = self.price_pred_longterm(x)
        
        # Branch 4: Value prediction (regression for expected price changes)
        price_values = self.price_pred_value(x)
        
        # Package price predictions
        price_predictions = {
            'immediate': price_immediate,  # Classification (up/down/sideways)
            'midterm': price_midterm,      # Classification (up/down/sideways)
            'longterm': price_longterm,    # Classification (up/down/sideways)
            'values': price_values         # Regression (expected % change)
        }
        
        # Return all outputs and the hidden feature representation
        return action_values, extrema_pred, price_predictions, x
    
    def extract_features(self, x):
        """Extract hidden features from the input and return both action values and features"""
        # Flatten input if needed to ensure it matches the expected feature dimension
        batch_size = x.size(0)
        
        # Reshape input if needed
        if len(x.shape) > 2:  # Handle multi-dimensional input
            # For 3D input: [batch, seq_len, features] or [batch, channels, features]
            x = x.reshape(batch_size, -1)  # Flatten to [batch, seq_len*features]
        
        # Check if the feature dimension matches and rebuild if necessary
        if x.size(1) != self.feature_dim:
            self._check_rebuild_network(x.size(1))
            
        # Apply fully connected layers with ReLU activation
        x_features = self.fc_layers(x)
        
        # Branch 1: Action values (Q-values)
        action_values = self.advantage_head(x_features)
        
        # Return action values and the hidden feature representation
        return action_values, x_features
        
    def save(self, path):
        """Save model weights and architecture"""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save({
            'state_dict': self.state_dict(),
            'input_shape': self.input_shape,
            'n_actions': self.n_actions,
            'feature_dim': self.feature_dim
        }, f"{path}.pt")
        logger.info(f"Model saved to {path}.pt")
        
    def load(self, path):
        """Load model weights and architecture"""
        try:
            checkpoint = torch.load(f"{path}.pt", map_location=self.device)
            self.input_shape = checkpoint['input_shape']
            self.n_actions = checkpoint['n_actions']
            self.feature_dim = checkpoint['feature_dim']
            self._build_network()
            self.load_state_dict(checkpoint['state_dict'])
            self.to(self.device)
            logger.info(f"Model loaded from {path}.pt")
            return True
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return False

class CNNModelPyTorch(nn.Module):
    """
    CNN model for trading with multiple timeframes.

    Three Conv1d blocks (each followed by AdaptiveNorm, ReLU and dropout)
    feed a fully connected trunk with a dueling Q-value output and a
    3-class extrema head. The first fully connected layer (``fc1``) is
    created lazily on the first forward pass, once the flattened size of
    the convolution output is known.

    The model owns its Adam optimizer (``self.optimizer``) and a
    ReduceLROnPlateau scheduler (``self.scheduler``). Both are recreated
    whenever ``fc1`` is created or resized so that they always cover the
    model's current parameters; read them from the model after the first
    forward pass instead of caching references earlier.

    Args:
        window_size: Expected sequence length (stored, not enforced).
        num_features: Total number of input features across all timeframes;
            used as the number of input channels of the first convolution.
        output_size: Number of actions (size of the Q-value output).
        timeframes: List of timeframes (stored for reference); defaults to [1].
    """
    def __init__(self, window_size=20, num_features=5, output_size=3, timeframes=None):
        super().__init__()

        if timeframes is None:
            timeframes = [1]

        self.window_size = window_size
        self.num_features = num_features
        self.output_size = output_size
        self.timeframes = timeframes

        # num_features should already be the total features across all timeframes
        self.total_features = num_features
        logger.info(f"CNNModelPyTorch initialized with window_size={window_size}, num_features={num_features}, "
                   f"total_features={self.total_features}, output_size={output_size}, timeframes={timeframes}")

        # Device configuration
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Using device: {self.device}")

        # Create model architecture
        self._create_layers()

        # Move model to device
        self.to(self.device)

    def _create_layers(self):
        """Create all model layers with current feature dimensions"""
        # Convolutional layers - use total_features as input channels
        self.conv1 = nn.Conv1d(self.total_features, 64, kernel_size=3, padding=1)
        self.norm1 = AdaptiveNorm(64)
        self.dropout1 = nn.Dropout(0.2)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.norm2 = AdaptiveNorm(128)
        self.dropout2 = nn.Dropout(0.3)

        self.conv3 = nn.Conv1d(128, 256, kernel_size=3, padding=1)
        self.norm3 = AdaptiveNorm(256)
        self.dropout3 = nn.Dropout(0.4)

        # NOTE: the attention and extrema convolution layers below are not
        # wired into any output of forward(). They are still created so that
        # the parameter set and state_dict keys stay compatible with
        # existing checkpoints.
        self.attention = PricePatternAttention(256)
        self.extrema_conv = nn.Conv1d(256, 128, kernel_size=3, padding=1)
        self.extrema_norm = AdaptiveNorm(128)

        # Fully connected layers - fc1's input size is only known once the
        # first batch has passed through the convolutions
        self.fc1 = None
        self.fc2 = nn.Linear(512, 256)
        self.dropout_fc = nn.Dropout(0.5)

        # Advantage and Value streams (Dueling DQN architecture)
        self.fc3 = nn.Linear(256, self.output_size)  # Advantage stream
        self.value_fc = nn.Linear(256, 1)  # Value stream

        # Additional prediction head for extrema detection (tops/bottoms)
        self.extrema_fc = nn.Linear(256, 3)  # 0=bottom, 1=top, 2=neither

        # Initialize optimizer and scheduler
        self._reset_optimizer()

    def _reset_optimizer(self):
        """(Re)create the optimizer and its scheduler over the current parameters."""
        self.optimizer = optim.Adam(self.parameters(), lr=0.001)
        # The scheduler holds a reference to its optimizer, so it has to be
        # rebuilt together with it. The deprecated ``verbose`` argument is
        # intentionally not passed.
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, mode='max', factor=0.5, patience=5
        )

    def _set_fc1(self, in_features):
        """Create ``fc1`` for ``in_features`` inputs and register it with the optimizer."""
        self.fc1 = nn.Linear(in_features, 512).to(self.device)
        # Without this the new layer's parameters would never be optimised.
        self._reset_optimizer()

    def rebuild_conv_layers(self, input_channels):
        """
        Rebuild convolutional layers for different input dimensions

        All layers (not only the convolutions) are recreated with fresh
        weights, ``fc1`` is reset to None, and the optimizer and scheduler
        are recreated.

        Args:
            input_channels: Number of input channels (features) in the data
        """
        logger.info(f"Rebuilding convolutional layers for {input_channels} input channels")

        # Update total features
        self.total_features = input_channels

        # Recreate all layers with new dimensions
        self._create_layers()

        # Move layers to device
        self.to(self.device)

    def _match_feature_dim(self, x):
        """Zero-pad or truncate the last dimension of ``x`` to ``total_features``."""
        feature_dim = x.size(-1)
        if feature_dim == self.total_features:
            return x

        logger.warning(f"Input features ({feature_dim}) don't match model features ({self.total_features})")
        if not getattr(self, 'rebuild_warning_shown', False):
            logger.error(f"Dimension mismatch: Expected {self.total_features} features but got {feature_dim}")
            self.rebuild_warning_shown = True

        # Don't rebuild - instead adapt the input
        if feature_dim < self.total_features:
            padding = torch.zeros(
                *x.shape[:-1], self.total_features - feature_dim, device=self.device
            )
            return torch.cat([x, padding], dim=-1)
        return x[..., :self.total_features]

    def _apply_conv_stack(self, x, seq_len):
        """
        Run the three conv blocks on ``x`` ([batch, channels, seq_len]).

        If the normal path fails for a very short sequence (seq_len < 3) a
        reduced path without normalisation and dropout is tried, starting
        again from the unmodified input. For longer sequences the error is
        re-raised, because continuing would silently use a partially
        transformed tensor.
        """
        conv_input = x
        try:
            x = self.dropout1(F.relu(self.norm1(self.conv1(x))))
            x = self.dropout2(F.relu(self.norm2(self.conv2(x))))
            x = self.dropout3(F.relu(self.norm3(self.conv3(x))))
            return x
        except Exception as e:
            logger.warning(f"Error in convolutional layers: {str(e)}")
            if seq_len >= 3:
                raise

        # Fallback for very small inputs: plain convolutions only
        x = F.relu(self.conv1(conv_input))
        x = F.relu(self.conv2(x))
        try:
            x = F.relu(self.conv3(x))
        except RuntimeError as e:
            # Best effort: continue with the conv2 output
            logger.warning(f"Skipping conv3 in fallback path: {str(e)}")
        return x

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Forward pass through the network.

        Args:
            x: Either [batch_size, features] (treated as a sequence of
                length 1) or [batch_size, window_size, features]. If the
                feature count differs from ``total_features`` the input is
                zero-padded or truncated along the feature axis.

        Returns:
            Tuple ``(q_values, extrema_pred)`` with shapes
            [batch_size, output_size] and [batch_size, 3].

        Raises:
            ValueError: If ``x`` is neither 2D nor 3D.
        """
        # Ensure input is on the correct device
        x = x.to(self.device)
        logger.debug(f"Input tensor shape: {x.size()}")

        # Bring the input into [batch, features, seq_len] layout
        if x.dim() == 2:
            x = self._match_feature_dim(x)
            seq_len = 1
            x = x.unsqueeze(2)
        elif x.dim() == 3:
            x = self._match_feature_dim(x)
            seq_len = x.size(1)
            x = x.permute(0, 2, 1)
        else:
            raise ValueError(f"Unexpected input shape: {x.size()}, expected 2D or 3D tensor")
        batch_size = x.size(0)

        logger.debug(f"Reshaped tensor for convolution: {x.size()}")

        x = self._apply_conv_stack(x, seq_len)

        # Flatten - the size depends on the sequence length of this batch
        flattened_size = x.size(1) * x.size(2)
        if self.fc1 is None:
            logger.info(f"Initializing fc1 with input size {flattened_size}")
            self._set_fc1(flattened_size)
        elif self.fc1.in_features != flattened_size:
            logger.info(f"Recreating fc1 layer to match input size {flattened_size}")
            self._set_fc1(flattened_size)
        x = x.reshape(batch_size, flattened_size)

        # Fully connected layers with dropout
        x = F.relu(self.fc1(x))
        x = self.dropout_fc(F.relu(self.fc2(x)))

        # Dueling combination: Q = V + (A - mean(A))
        advantage = self.fc3(x)
        value = self.value_fc(x)
        q_values = value + (advantage - advantage.mean(dim=1, keepdim=True))

        # Extrema prediction uses the same trunk features
        extrema_pred = self.extrema_fc(x)

        return q_values, extrema_pred

    def predict(self, X):
        """
        Make predictions.

        Switches the model to eval mode (and leaves it there).

        Args:
            X: Input batch as a tensor or anything ``torch.tensor`` accepts.

        Returns:
            Tuple ``(actions, q_values, extrema_classes)`` of numpy arrays:
            the argmax action per sample, the raw Q-values, and the argmax
            extrema class per sample.
        """
        self.eval()

        # Convert to tensor if not already
        if not isinstance(X, torch.Tensor):
            X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
        else:
            X_tensor = X.to(self.device)

        with torch.no_grad():
            q_values, extrema_pred = self(X_tensor)
            q_values_np = q_values.cpu().numpy()
            actions = np.argmax(q_values_np, axis=1)

            # Also return extrema predictions
            extrema_np = extrema_pred.cpu().numpy()
            extrema_classes = np.argmax(extrema_np, axis=1)

        return actions, q_values_np, extrema_classes

    def save(self, path: str):
        """
        Save model weights to ``{path}.pt``.

        Args:
            path: Destination path without the ``.pt`` extension. Missing
                parent directories are created.
        """
        directory = os.path.dirname(path)
        # A bare filename has no directory part; os.makedirs('') would raise.
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), f"{path}.pt")
        logger.info(f"Model saved to {path}.pt")

    def load(self, path: str):
        """
        Load model weights from ``{path}.pt`` and switch to eval mode.

        Because ``fc1`` is created lazily, a freshly constructed model does
        not have it yet; it is created here with the input size stored in
        the checkpoint so that the weights can be loaded.

        Args:
            path: Source path without the ``.pt`` extension.
        """
        state_dict = torch.load(f"{path}.pt", map_location=self.device)

        fc1_weight = state_dict.get('fc1.weight')
        if fc1_weight is not None:
            in_features = fc1_weight.shape[1]
            if self.fc1 is None or self.fc1.in_features != in_features:
                self._set_fc1(in_features)

        self.load_state_dict(state_dict)
        self.eval()
        logger.info(f"Model loaded from {path}.pt")