import os
import sys
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, 
    GlobalAveragePooling1D, Concatenate, Add, Activation, Flatten
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import (
    EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, 
    TensorBoard, CSVLogger
)
import matplotlib.pyplot as plt
import logging
import time
import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('nn_transformer_model.log')
    ]
)

logger = logging.getLogger('transformer_model')

class TransformerBlock(tf.keras.layers.Layer):
    """
    Transformer block with multi-head self-attention and feed-forward network
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
        
    def call(self, inputs, training=False):
        # Normalization and attention
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        
        # Feed-forward network
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        
        # Skip connection and normalization
        return self.layernorm2(out1 + ffn_output)

class TransformerModel:
    """
    Transformer-based model for financial time series analysis.
    This model processes both raw time series data and high-level features from the CNN model.
    """
    
    def __init__(self, ts_input_shape=(20, 5), feature_input_shape=128, output_size=3, model_dir='NN/models/saved'):
        """
        Initialize the Transformer model
        
        Args:
            ts_input_shape: Shape of time series input data (sequence_length, features)
            feature_input_shape: Shape of high-level feature input (from CNN)
            output_size: Number of output classes or values
            model_dir: Directory to save model files
        """
        self.ts_input_shape = ts_input_shape
        self.feature_input_shape = feature_input_shape
        self.output_size = output_size
        self.model_dir = model_dir
        self.model = None
        self.history = None
        
        # Create model directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)
        
        logger.info(f"Initialized TransformerModel with time series input shape {ts_input_shape}, "
                   f"feature input shape {feature_input_shape}, and output size {output_size}")
    
    def build_model(self, embed_dim=64, num_heads=4, ff_dim=128, num_transformer_blocks=2, 
                   dropout_rate=0.2, learning_rate=0.001):
        """
        Build the Transformer model architecture
        
        Args:
            embed_dim: Embedding dimension for the transformer
            num_heads: Number of attention heads
            ff_dim: Hidden layer size in the feed-forward network
            num_transformer_blocks: Number of transformer blocks to stack
            dropout_rate: Dropout rate for regularization
            learning_rate: Learning rate for the optimizer
            
        Returns:
            Compiled Keras model
        """
        # Time series input (price and volume data)
        ts_inputs = Input(shape=self.ts_input_shape, name='time_series_input')
        
        # High-level feature input (from CNN or other sources)
        feature_inputs = Input(shape=(self.feature_input_shape,), name='feature_input')
        
        # Process time series with transformer blocks
        x = ts_inputs
        for _ in range(num_transformer_blocks):
            x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x)
        
        # Global pooling to get fixed-size representation
        x = GlobalAveragePooling1D()(x)
        
        # Combine with the high-level features
        combined = Concatenate()([x, feature_inputs])
        
        # Dense layers
        dense1 = Dense(128, activation='relu')(combined)
        dropout1 = Dropout(dropout_rate)(dense1)
        dense2 = Dense(64, activation='relu')(dropout1)
        dropout2 = Dropout(dropout_rate)(dense2)
        
        # Output layer
        if self.output_size == 1:
            # Binary classification
            outputs = Dense(1, activation='sigmoid')(dropout2)
        elif self.output_size == 3:
            # For BUY/HOLD/SELL signals (3 classes)
            outputs = Dense(3, activation='softmax')(dropout2)
        else:
            # Regression or multi-class classification
            outputs = Dense(self.output_size, activation='linear')(dropout2)
        
        # Create and compile the model
        model = Model(inputs=[ts_inputs, feature_inputs], outputs=outputs)
        
        if self.output_size == 1:
            # Binary classification
            model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )
        elif self.output_size == 3:
            # Multi-class classification for BUY/HOLD/SELL
            model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )
        else:
            # Regression
            model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='mse',
                metrics=['mae']
            )
        
        self.model = model
        logger.info(f"Model built with {model.count_params()} parameters")
        model.summary(print_fn=logger.info)
        
        return model
    
    def train(self, X_ts, X_features, y, batch_size=32, epochs=100, validation_split=0.2,
             early_stopping_patience=20, reduce_lr_patience=10, verbose=1):
        """
        Train the Transformer model
        
        Args:
            X_ts: Time series input data
            X_features: High-level feature input data
            y: Target values
            batch_size: Batch size for training
            epochs: Maximum number of epochs
            validation_split: Fraction of data to use for validation
            early_stopping_patience: Patience for early stopping
            reduce_lr_patience: Patience for learning rate reduction
            verbose: Verbosity level
            
        Returns:
            Training history
        """
        if self.model is None:
            logger.warning("Model not built yet, building with default parameters")
            self.build_model()
        
        # Create a timestamp for this training run
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        model_name = f"transformer_model_{timestamp}"
        
        # Set up callbacks
        callbacks = [
            # Early stopping to prevent overfitting
            EarlyStopping(
                monitor='val_loss', 
                patience=early_stopping_patience,
                restore_best_weights=True,
                verbose=1
            ),
            
            # Reduce learning rate when training plateaus
            ReduceLROnPlateau(
                monitor='val_loss', 
                factor=0.5,
                patience=reduce_lr_patience, 
                min_lr=1e-6,
                verbose=1
            ),
            
            # Save the best model
            ModelCheckpoint(
                filepath=os.path.join(self.model_dir, f"{model_name}_best.h5"),
                monitor='val_loss',
                save_best_only=True,
                verbose=1
            ),
            
            # TensorBoard logging
            TensorBoard(
                log_dir=os.path.join(self.model_dir, 'logs', model_name),
                histogram_freq=1
            ),
            
            # CSV logging
            CSVLogger(
                filename=os.path.join(self.model_dir, f"{model_name}_training.csv"),
                separator=',', 
                append=False
            )
        ]
        
        # Train the model
        logger.info(f"Starting training with {len(X_ts)} samples, {epochs} max epochs")
        
        start_time = time.time()
        history = self.model.fit(
            [X_ts, X_features], y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            callbacks=callbacks,
            verbose=verbose
        )
        
        # Calculate training time
        training_time = time.time() - start_time
        logger.info(f"Training completed in {training_time:.2f} seconds")
        
        # Save the final model
        self.model.save(os.path.join(self.model_dir, f"{model_name}_final.h5"))
        logger.info(f"Model saved to {os.path.join(self.model_dir, model_name + '_final.h5')}")
        
        # Save training history
        hist_df = pd.DataFrame(history.history)
        hist_df.to_csv(os.path.join(self.model_dir, f"{model_name}_history.csv"), index=False)
        
        self.history = history
        return history
    
    def predict(self, X_ts, X_features, threshold=0.5):
        """
        Make predictions with the model
        
        Args:
            X_ts: Time series input data
            X_features: High-level feature input data
            threshold: Threshold for binary classification
            
        Returns:
            Predicted values or classes
        """
        if self.model is None:
            logger.error("Model not built or trained yet")
            return None
        
        # Get raw predictions
        y_pred_proba = self.model.predict([X_ts, X_features])
        
        # Format predictions based on output type
        if self.output_size == 1:
            # Binary classification
            y_pred = (y_pred_proba > threshold).astype(int).flatten()
            return y_pred, y_pred_proba.flatten()
        elif self.output_size == 3:
            # Multi-class (BUY/HOLD/SELL)
            y_pred = np.argmax(y_pred_proba, axis=1)
            return y_pred, y_pred_proba
        else:
            # Regression
            return y_pred_proba
    
    def save_model(self, filepath=None):
        """
        Save the model to a file
        
        Args:
            filepath: Path to save the model to
            
        Returns:
            Path to the saved model
        """
        if self.model is None:
            logger.error("Model not built or trained yet")
            return None
        
        if filepath is None:
            # Create a default filepath
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = os.path.join(self.model_dir, f"transformer_model_{timestamp}.h5")
        
        self.model.save(filepath)
        logger.info(f"Model saved to {filepath}")
        
        return filepath
    
    def load_model(self, filepath):
        """
        Load a model from a file
        
        Args:
            filepath: Path to load the model from
            
        Returns:
            Loaded model
        """
        try:
            self.model = tf.keras.models.load_model(filepath)
            logger.info(f"Model loaded from {filepath}")
            return self.model
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            return None

class MixtureOfExpertsModel:
    """
    Mixture of Experts (MoE) model that combines predictions from multiple models.
    This implementation focuses on combining CNN and Transformer models for financial analysis.
    """
    
    def __init__(self, output_size=3, model_dir='NN/models/saved'):
        """
        Initialize the MoE model
        
        Args:
            output_size: Number of output classes or values
            model_dir: Directory to save model files
        """
        self.output_size = output_size
        self.model_dir = model_dir
        self.models = {}  # Dictionary to store expert models
        self.gating_model = None  # Model to determine which expert to use
        self.model = None  # Combined MoE model
        
        # Create model directory if it doesn't exist
        os.makedirs(model_dir, exist_ok=True)
        
        logger.info(f"Initialized MixtureOfExpertsModel with output size {output_size}")
    
    def add_expert(self, name, model):
        """
        Add an expert model to the MoE
        
        Args:
            name: Name of the expert
            model: Expert model instance
            
        Returns:
            None
        """
        self.models[name] = model
        logger.info(f"Added expert model '{name}' to MoE")
    
    def build_model(self, ts_input_shape=(20, 5), expert_weights=None, learning_rate=0.001):
        """
        Build the MoE model architecture
        
        Args:
            ts_input_shape: Shape of time series input data
            expert_weights: Dictionary of expert weights (if None, equal weighting)
            learning_rate: Learning rate for the optimizer
            
        Returns:
            Compiled Keras model
        """
        if not self.models:
            logger.error("No expert models added to MoE")
            return None
        
        # Time series input
        ts_inputs = Input(shape=ts_input_shape, name='time_series_input')
        
        # Get predictions from each expert
        expert_outputs = []
        expert_names = []
        
        for name, model in self.models.items():
            if hasattr(model, 'predict') and callable(model.predict):
                expert_names.append(name)
                if name == 'cnn':
                    # For CNN, we directly use the time series input
                    # We need to extract the raw prediction function from the model's predict method
                    # which typically returns both predictions and probabilities
                    expert_outputs.append(model.model(ts_inputs))
                elif name == 'transformer':
                    # For transformer, we need features from the CNN as well
                    # This is a simplification - in a real implementation, we would need to
                    # extract features from the CNN model and pass them to the transformer
                    # Here we just create dummy features
                    dummy_features = Dense(128, activation='relu')(Flatten()(ts_inputs))
                    expert_outputs.append(model.model([ts_inputs, dummy_features]))
                else:
                    logger.warning(f"Unknown model type: {name}, skipping")
        
        if not expert_outputs:
            logger.error("No valid expert models found")
            return None
        
        # Use expert weighting
        if expert_weights is None:
            # Equal weighting
            weights = [1.0 / len(expert_outputs)] * len(expert_outputs)
        else:
            # User-provided weights
            weights = [expert_weights.get(name, 1.0 / len(expert_outputs)) for name in expert_names]
            # Normalize weights
            weights = [w / sum(weights) for w in weights]
        
        # Combine expert outputs using weighted average
        if len(expert_outputs) == 1:
            # Only one expert, use its output directly
            combined_output = expert_outputs[0]
        else:
            # Multiple experts, compute weighted average
            weighted_outputs = [output * weight for output, weight in zip(expert_outputs, weights)]
            combined_output = Add()(weighted_outputs)
        
        # Create the MoE model
        moe_model = Model(inputs=ts_inputs, outputs=combined_output)
        
        # Compile the model
        if self.output_size == 1:
            # Binary classification
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )
        elif self.output_size == 3:
            # Multi-class classification for BUY/HOLD/SELL
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )
        else:
            # Regression
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='mse',
                metrics=['mae']
            )
        
        self.model = moe_model
        logger.info(f"MoE model built with experts: {expert_names}, weights: {weights}")
        moe_model.summary(print_fn=logger.info)
        
        return moe_model
    
    def predict(self, X, threshold=0.5):
        """
        Make predictions with the MoE model
        
        Args:
            X: Input data
            threshold: Threshold for binary classification
            
        Returns:
            Predicted values or classes
        """
        if self.model is None:
            logger.error("MoE model not built yet")
            return None
        
        # Get raw predictions
        y_pred_proba = self.model.predict(X)
        
        # Format predictions based on output type
        if self.output_size == 1:
            # Binary classification
            y_pred = (y_pred_proba > threshold).astype(int).flatten()
            return y_pred, y_pred_proba.flatten()
        elif self.output_size == 3:
            # Multi-class (BUY/HOLD/SELL)
            y_pred = np.argmax(y_pred_proba, axis=1)
            return y_pred, y_pred_proba
        else:
            # Regression
            return y_pred_proba
    
    def save_model(self, filepath=None):
        """
        Save the MoE model to a file
        
        Args:
            filepath: Path to save the model to
            
        Returns:
            Path to the saved model
        """
        if self.model is None:
            logger.error("MoE model not built yet")
            return None
        
        if filepath is None:
            # Create a default filepath
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = os.path.join(self.model_dir, f"moe_model_{timestamp}.h5")
        
        self.model.save(filepath)
        logger.info(f"MoE model saved to {filepath}")
        
        return filepath
    
    def load_model(self, filepath):
        """
        Load an MoE model from a file
        
        Args:
            filepath: Path to load the model from
            
        Returns:
            Loaded model
        """
        try:
            self.model = tf.keras.models.load_model(filepath)
            logger.info(f"MoE model loaded from {filepath}")
            return self.model
        except Exception as e:
            logger.error(f"Error loading MoE model: {str(e)}")
            return None

# Example usage:
if __name__ == "__main__":
    # This would be a complete implementation in a real system
    print("Transformer and MoE models defined, but not implemented here.")