import os import sys import numpy as np import pandas as pd import tensorflow as tf from tensorflow.keras.models import Model from tensorflow.keras.layers import ( Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Concatenate, Add, Activation, Flatten ) from tensorflow.keras.optimizers import Adam from tensorflow.keras.callbacks import ( EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, TensorBoard, CSVLogger ) import matplotlib.pyplot as plt import logging import time import datetime # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', handlers=[ logging.StreamHandler(), logging.FileHandler('nn_transformer_model.log') ] ) logger = logging.getLogger('transformer_model') class TransformerBlock(tf.keras.layers.Layer): """ Transformer block with multi-head self-attention and feed-forward network """ def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): super(TransformerBlock, self).__init__() self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) self.ffn = tf.keras.Sequential([ Dense(ff_dim, activation="relu"), Dense(embed_dim) ]) self.layernorm1 = LayerNormalization(epsilon=1e-6) self.layernorm2 = LayerNormalization(epsilon=1e-6) self.dropout1 = Dropout(rate) self.dropout2 = Dropout(rate) def call(self, inputs, training=False): # Normalization and attention attn_output = self.att(inputs, inputs) attn_output = self.dropout1(attn_output, training=training) out1 = self.layernorm1(inputs + attn_output) # Feed-forward network ffn_output = self.ffn(out1) ffn_output = self.dropout2(ffn_output, training=training) # Skip connection and normalization return self.layernorm2(out1 + ffn_output) class TransformerModel: """ Transformer-based model for financial time series analysis. This model processes both raw time series data and high-level features from the CNN model. """ def __init__(self, ts_input_shape=(20, 5), feature_input_shape=128, output_size=3, model_dir='NN/models/saved'): """ Initialize the Transformer model Args: ts_input_shape: Shape of time series input data (sequence_length, features) feature_input_shape: Shape of high-level feature input (from CNN) output_size: Number of output classes or values model_dir: Directory to save model files """ self.ts_input_shape = ts_input_shape self.feature_input_shape = feature_input_shape self.output_size = output_size self.model_dir = model_dir self.model = None self.history = None # Create model directory if it doesn't exist os.makedirs(model_dir, exist_ok=True) logger.info(f"Initialized TransformerModel with time series input shape {ts_input_shape}, " f"feature input shape {feature_input_shape}, and output size {output_size}") def build_model(self, embed_dim=64, num_heads=4, ff_dim=128, num_transformer_blocks=2, dropout_rate=0.2, learning_rate=0.001): """ Build the Transformer model architecture Args: embed_dim: Embedding dimension for the transformer num_heads: Number of attention heads ff_dim: Hidden layer size in the feed-forward network num_transformer_blocks: Number of transformer blocks to stack dropout_rate: Dropout rate for regularization learning_rate: Learning rate for the optimizer Returns: Compiled Keras model """ # Time series input (price and volume data) ts_inputs = Input(shape=self.ts_input_shape, name='time_series_input') # High-level feature input (from CNN or other sources) feature_inputs = Input(shape=(self.feature_input_shape,), name='feature_input') # Process time series with transformer blocks x = ts_inputs for _ in range(num_transformer_blocks): x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x) # Global pooling to get fixed-size representation x = GlobalAveragePooling1D()(x) # Combine with the high-level features combined = Concatenate()([x, feature_inputs]) # Dense layers dense1 = Dense(128, activation='relu')(combined) dropout1 = Dropout(dropout_rate)(dense1) dense2 = Dense(64, activation='relu')(dropout1) dropout2 = Dropout(dropout_rate)(dense2) # Output layer if self.output_size == 1: # Binary classification outputs = Dense(1, activation='sigmoid')(dropout2) elif self.output_size == 3: # For BUY/HOLD/SELL signals (3 classes) outputs = Dense(3, activation='softmax')(dropout2) else: # Regression or multi-class classification outputs = Dense(self.output_size, activation='linear')(dropout2) # Create and compile the model model = Model(inputs=[ts_inputs, feature_inputs], outputs=outputs) if self.output_size == 1: # Binary classification model.compile( optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'] ) elif self.output_size == 3: # Multi-class classification for BUY/HOLD/SELL model.compile( optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'] ) else: # Regression model.compile( optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'] ) self.model = model logger.info(f"Model built with {model.count_params()} parameters") model.summary(print_fn=logger.info) return model def train(self, X_ts, X_features, y, batch_size=32, epochs=100, validation_split=0.2, early_stopping_patience=20, reduce_lr_patience=10, verbose=1): """ Train the Transformer model Args: X_ts: Time series input data X_features: High-level feature input data y: Target values batch_size: Batch size for training epochs: Maximum number of epochs validation_split: Fraction of data to use for validation early_stopping_patience: Patience for early stopping reduce_lr_patience: Patience for learning rate reduction verbose: Verbosity level Returns: Training history """ if self.model is None: logger.warning("Model not built yet, building with default parameters") self.build_model() # Create a timestamp for this training run timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") model_name = f"transformer_model_{timestamp}" # Set up callbacks callbacks = [ # Early stopping to prevent overfitting EarlyStopping( monitor='val_loss', patience=early_stopping_patience, restore_best_weights=True, verbose=1 ), # Reduce learning rate when training plateaus ReduceLROnPlateau( monitor='val_loss', factor=0.5, patience=reduce_lr_patience, min_lr=1e-6, verbose=1 ), # Save the best model ModelCheckpoint( filepath=os.path.join(self.model_dir, f"{model_name}_best.h5"), monitor='val_loss', save_best_only=True, verbose=1 ), # TensorBoard logging TensorBoard( log_dir=os.path.join(self.model_dir, 'logs', model_name), histogram_freq=1 ), # CSV logging CSVLogger( filename=os.path.join(self.model_dir, f"{model_name}_training.csv"), separator=',', append=False ) ] # Train the model logger.info(f"Starting training with {len(X_ts)} samples, {epochs} max epochs") start_time = time.time() history = self.model.fit( [X_ts, X_features], y, batch_size=batch_size, epochs=epochs, validation_split=validation_split, callbacks=callbacks, verbose=verbose ) # Calculate training time training_time = time.time() - start_time logger.info(f"Training completed in {training_time:.2f} seconds") # Save the final model self.model.save(os.path.join(self.model_dir, f"{model_name}_final.h5")) logger.info(f"Model saved to {os.path.join(self.model_dir, model_name + '_final.h5')}") # Save training history hist_df = pd.DataFrame(history.history) hist_df.to_csv(os.path.join(self.model_dir, f"{model_name}_history.csv"), index=False) self.history = history return history def predict(self, X_ts, X_features, threshold=0.5): """ Make predictions with the model Args: X_ts: Time series input data X_features: High-level feature input data threshold: Threshold for binary classification Returns: Predicted values or classes """ if self.model is None: logger.error("Model not built or trained yet") return None # Get raw predictions y_pred_proba = self.model.predict([X_ts, X_features]) # Format predictions based on output type if self.output_size == 1: # Binary classification y_pred = (y_pred_proba > threshold).astype(int).flatten() return y_pred, y_pred_proba.flatten() elif self.output_size == 3: # Multi-class (BUY/HOLD/SELL) y_pred = np.argmax(y_pred_proba, axis=1) return y_pred, y_pred_proba else: # Regression return y_pred_proba def save_model(self, filepath=None): """ Save the model to a file Args: filepath: Path to save the model to Returns: Path to the saved model """ if self.model is None: logger.error("Model not built or trained yet") return None if filepath is None: # Create a default filepath timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") filepath = os.path.join(self.model_dir, f"transformer_model_{timestamp}.h5") self.model.save(filepath) logger.info(f"Model saved to {filepath}") return filepath def load_model(self, filepath): """ Load a model from a file Args: filepath: Path to load the model from Returns: Loaded model """ try: self.model = tf.keras.models.load_model(filepath) logger.info(f"Model loaded from {filepath}") return self.model except Exception as e: logger.error(f"Error loading model: {str(e)}") return None class MixtureOfExpertsModel: """ Mixture of Experts (MoE) model that combines predictions from multiple models. This implementation focuses on combining CNN and Transformer models for financial analysis. """ def __init__(self, output_size=3, model_dir='NN/models/saved'): """ Initialize the MoE model Args: output_size: Number of output classes or values model_dir: Directory to save model files """ self.output_size = output_size self.model_dir = model_dir self.models = {} # Dictionary to store expert models self.gating_model = None # Model to determine which expert to use self.model = None # Combined MoE model # Create model directory if it doesn't exist os.makedirs(model_dir, exist_ok=True) logger.info(f"Initialized MixtureOfExpertsModel with output size {output_size}") def add_expert(self, name, model): """ Add an expert model to the MoE Args: name: Name of the expert model: Expert model instance Returns: None """ self.models[name] = model logger.info(f"Added expert model '{name}' to MoE") def build_model(self, ts_input_shape=(20, 5), expert_weights=None, learning_rate=0.001): """ Build the MoE model architecture Args: ts_input_shape: Shape of time series input data expert_weights: Dictionary of expert weights (if None, equal weighting) learning_rate: Learning rate for the optimizer Returns: Compiled Keras model """ if not self.models: logger.error("No expert models added to MoE") return None # Time series input ts_inputs = Input(shape=ts_input_shape, name='time_series_input') # Get predictions from each expert expert_outputs = [] expert_names = [] for name, model in self.models.items(): if hasattr(model, 'predict') and callable(model.predict): expert_names.append(name) if name == 'cnn': # For CNN, we directly use the time series input # We need to extract the raw prediction function from the model's predict method # which typically returns both predictions and probabilities expert_outputs.append(model.model(ts_inputs)) elif name == 'transformer': # For transformer, we need features from the CNN as well # This is a simplification - in a real implementation, we would need to # extract features from the CNN model and pass them to the transformer # Here we just create dummy features dummy_features = Dense(128, activation='relu')(Flatten()(ts_inputs)) expert_outputs.append(model.model([ts_inputs, dummy_features])) else: logger.warning(f"Unknown model type: {name}, skipping") if not expert_outputs: logger.error("No valid expert models found") return None # Use expert weighting if expert_weights is None: # Equal weighting weights = [1.0 / len(expert_outputs)] * len(expert_outputs) else: # User-provided weights weights = [expert_weights.get(name, 1.0 / len(expert_outputs)) for name in expert_names] # Normalize weights weights = [w / sum(weights) for w in weights] # Combine expert outputs using weighted average if len(expert_outputs) == 1: # Only one expert, use its output directly combined_output = expert_outputs[0] else: # Multiple experts, compute weighted average weighted_outputs = [output * weight for output, weight in zip(expert_outputs, weights)] combined_output = Add()(weighted_outputs) # Create the MoE model moe_model = Model(inputs=ts_inputs, outputs=combined_output) # Compile the model if self.output_size == 1: # Binary classification moe_model.compile( optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'] ) elif self.output_size == 3: # Multi-class classification for BUY/HOLD/SELL moe_model.compile( optimizer=Adam(learning_rate=learning_rate), loss='categorical_crossentropy', metrics=['accuracy'] ) else: # Regression moe_model.compile( optimizer=Adam(learning_rate=learning_rate), loss='mse', metrics=['mae'] ) self.model = moe_model logger.info(f"MoE model built with experts: {expert_names}, weights: {weights}") moe_model.summary(print_fn=logger.info) return moe_model def predict(self, X, threshold=0.5): """ Make predictions with the MoE model Args: X: Input data threshold: Threshold for binary classification Returns: Predicted values or classes """ if self.model is None: logger.error("MoE model not built yet") return None # Get raw predictions y_pred_proba = self.model.predict(X) # Format predictions based on output type if self.output_size == 1: # Binary classification y_pred = (y_pred_proba > threshold).astype(int).flatten() return y_pred, y_pred_proba.flatten() elif self.output_size == 3: # Multi-class (BUY/HOLD/SELL) y_pred = np.argmax(y_pred_proba, axis=1) return y_pred, y_pred_proba else: # Regression return y_pred_proba def save_model(self, filepath=None): """ Save the MoE model to a file Args: filepath: Path to save the model to Returns: Path to the saved model """ if self.model is None: logger.error("MoE model not built yet") return None if filepath is None: # Create a default filepath timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") filepath = os.path.join(self.model_dir, f"moe_model_{timestamp}.h5") self.model.save(filepath) logger.info(f"MoE model saved to {filepath}") return filepath def load_model(self, filepath): """ Load an MoE model from a file Args: filepath: Path to load the model from Returns: Loaded model """ try: self.model = tf.keras.models.load_model(filepath) logger.info(f"MoE model loaded from {filepath}") return self.model except Exception as e: logger.error(f"Error loading MoE model: {str(e)}") return None # Example usage: if __name__ == "__main__": # This would be a complete implementation in a real system print("Transformer and MoE models defined, but not implemented here.")