""" Transformer Neural Network for timeseries analysis This module implements a Transformer model with attention mechanisms for cryptocurrency price analysis. It also includes a Mixture of Experts model that combines predictions from multiple models. """ import os import logging import numpy as np import matplotlib.pyplot as plt import tensorflow as tf from tensorflow.keras.models import Model, load_model from tensorflow.keras.layers import ( Input, Dense, Dropout, BatchNormalization, Concatenate, Layer, LayerNormalization, MultiHeadAttention, Add, GlobalAveragePooling1D, Conv1D, Reshape ) from tensorflow.keras.optimizers import Adam from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau import datetime import json logger = logging.getLogger(__name__) class TransformerBlock(Layer): """ Transformer block implementation with multi-head attention and feed-forward networks. """ def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): super(TransformerBlock, self).__init__() self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) self.ffn = tf.keras.Sequential([ Dense(ff_dim, activation="relu"), Dense(embed_dim), ]) self.layernorm1 = LayerNormalization(epsilon=1e-6) self.layernorm2 = LayerNormalization(epsilon=1e-6) self.dropout1 = Dropout(rate) self.dropout2 = Dropout(rate) def call(self, inputs, training=False): attn_output = self.att(inputs, inputs) attn_output = self.dropout1(attn_output, training=training) out1 = self.layernorm1(inputs + attn_output) ffn_output = self.ffn(out1) ffn_output = self.dropout2(ffn_output, training=training) return self.layernorm2(out1 + ffn_output) def get_config(self): config = super().get_config() config.update({ 'att': self.att, 'ffn': self.ffn, 'layernorm1': self.layernorm1, 'layernorm2': self.layernorm2, 'dropout1': self.dropout1, 'dropout2': self.dropout2 }) return config class PositionalEncoding(Layer): """ Positional encoding layer to add position information to input embeddings. """ def __init__(self, position, d_model): super(PositionalEncoding, self).__init__() self.position = position self.d_model = d_model self.pos_encoding = self.positional_encoding(position, d_model) def get_angles(self, position, i, d_model): angles = 1 / tf.pow(10000, (2 * (i // 2)) / tf.cast(d_model, tf.float32)) return position * angles def positional_encoding(self, position, d_model): angle_rads = self.get_angles( position=tf.range(position, dtype=tf.float32)[:, tf.newaxis], i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :], d_model=d_model ) # Apply sin to even indices in the array sines = tf.math.sin(angle_rads[:, 0::2]) # Apply cos to odd indices in the array cosines = tf.math.cos(angle_rads[:, 1::2]) pos_encoding = tf.concat([sines, cosines], axis=-1) pos_encoding = pos_encoding[tf.newaxis, ...] return tf.cast(pos_encoding, tf.float32) def call(self, inputs): return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :] def get_config(self): config = super().get_config() config.update({ 'position': self.position, 'd_model': self.d_model, 'pos_encoding': self.pos_encoding }) return config class TransformerModel: """ Transformer Neural Network for time series analysis. This model uses self-attention mechanisms to capture relationships between different time points in the input data. """ def __init__(self, ts_input_shape=(20, 5), feature_input_shape=64, output_size=1, model_dir="NN/models/saved"): """ Initialize the Transformer model. 


class TransformerModel:
    """
    Transformer Neural Network for time series analysis.

    This model uses self-attention mechanisms to capture relationships
    between different time points in the input data.
    """

    def __init__(self, ts_input_shape=(20, 5), feature_input_shape=64,
                 output_size=1, model_dir="NN/models/saved"):
        """
        Initialize the Transformer model.

        Args:
            ts_input_shape (tuple): Shape of time series input data (sequence_length, features)
            feature_input_shape (int): Size of the additional feature input vector (e.g., from a CNN)
            output_size (int): Number of output classes (1 for binary, 3 for buy/hold/sell)
            model_dir (str): Directory to save trained models
        """
        self.ts_input_shape = ts_input_shape
        self.feature_input_shape = feature_input_shape
        self.output_size = output_size
        self.model_dir = model_dir
        self.model = None
        self.history = None

        # Create the model directory if it doesn't exist
        os.makedirs(self.model_dir, exist_ok=True)

        logger.info(f"Initialized Transformer model with TS input shape {ts_input_shape}, "
                    f"feature input shape {feature_input_shape}, and output size {output_size}")

    def build_model(self, embed_dim=32, num_heads=4, ff_dim=64, num_transformer_blocks=2,
                    dropout_rate=0.1, learning_rate=0.001):
        """
        Build the Transformer model architecture.

        Args:
            embed_dim (int): Embedding dimension for the transformer
            num_heads (int): Number of attention heads
            ff_dim (int): Hidden dimension of the feed-forward network
            num_transformer_blocks (int): Number of transformer blocks
            dropout_rate (float): Dropout rate for regularization
            learning_rate (float): Learning rate for the Adam optimizer

        Returns:
            The compiled model
        """
        # Time series input
        ts_inputs = Input(shape=self.ts_input_shape, name="ts_input")

        # Additional feature input (e.g., from CNN)
        feature_inputs = Input(shape=(self.feature_input_shape,), name="feature_input")

        # Process the time series with the transformer.
        # First, project the input to the embedding dimension
        x = Conv1D(embed_dim, 1, activation="relu")(ts_inputs)

        # Add positional encoding
        x = PositionalEncoding(self.ts_input_shape[0], embed_dim)(x)

        # Add transformer blocks
        for _ in range(num_transformer_blocks):
            x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x)

        # Global pooling to get a single vector representation
        x = GlobalAveragePooling1D()(x)
        x = Dropout(dropout_rate)(x)

        # Combine with the additional features
        combined = Concatenate()([x, feature_inputs])

        # Dense layers for the final classification/regression
        x = Dense(64, activation="relu")(combined)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)

        # Output layer
        if self.output_size == 1:
            # Binary classification (up/down)
            outputs = Dense(1, activation='sigmoid', name='output')(x)
            loss = 'binary_crossentropy'
            metrics = ['accuracy']
        elif self.output_size == 3:
            # Multi-class classification (buy/hold/sell)
            outputs = Dense(3, activation='softmax', name='output')(x)
            loss = 'categorical_crossentropy'
            metrics = ['accuracy']
        else:
            # Regression
            outputs = Dense(self.output_size, activation='linear', name='output')(x)
            loss = 'mse'
            metrics = ['mae']

        # Create and compile the model
        self.model = Model(inputs=[ts_inputs, feature_inputs], outputs=outputs)

        # Compile with the Adam optimizer
        self.model.compile(
            optimizer=Adam(learning_rate=learning_rate),
            loss=loss,
            metrics=metrics
        )

        # Log the model summary
        self.model.summary(print_fn=lambda x: logger.info(x))

        return self.model

    def train(self, X_ts, X_features, y, batch_size=32, epochs=100, validation_split=0.2,
              callbacks=None, class_weights=None):
        """
        Train the Transformer model on the provided data.

        Args:
            X_ts (numpy.ndarray): Time series input features
            X_features (numpy.ndarray): Additional input features
            y (numpy.ndarray): Target labels
            batch_size (int): Batch size
            epochs (int): Number of epochs
            validation_split (float): Fraction of data to use for validation
            callbacks (list): List of Keras callbacks
            class_weights (dict): Class weights for imbalanced datasets

        Returns:
            History object containing training metrics
        """
        if self.model is None:
            self.build_model()

        # Default callbacks if none provided
        if callbacks is None:
            # Create a timestamp for model checkpoints
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            callbacks = [
                EarlyStopping(
                    monitor='val_loss',
                    patience=10,
                    restore_best_weights=True
                ),
                ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.5,
                    patience=5,
                    min_lr=1e-6
                ),
                ModelCheckpoint(
                    filepath=os.path.join(self.model_dir, f"transformer_model_{timestamp}.h5"),
                    monitor='val_loss',
                    save_best_only=True
                )
            ]

        # Check if y needs to be one-hot encoded for multi-class
        if self.output_size == 3 and len(y.shape) == 1:
            y = tf.keras.utils.to_categorical(y, num_classes=3)

        # Train the model
        logger.info(f"Training Transformer model with {len(X_ts)} samples, "
                    f"batch size {batch_size}, epochs {epochs}")
        self.history = self.model.fit(
            [X_ts, X_features], y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            callbacks=callbacks,
            class_weight=class_weights,
            verbose=2
        )

        # Save the trained model
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        model_path = os.path.join(self.model_dir, f"transformer_model_final_{timestamp}.h5")
        self.model.save(model_path)
        logger.info(f"Model saved to {model_path}")

        # Save the training history
        history_path = os.path.join(self.model_dir, f"transformer_model_history_{timestamp}.json")
        with open(history_path, 'w') as f:
            # Convert numpy values to Python native types for JSON serialization
            history_dict = {key: [float(value) for value in values]
                            for key, values in self.history.history.items()}
            json.dump(history_dict, f, indent=2)

        return self.history

    def evaluate(self, X_ts, X_features, y):
        """
        Evaluate the model on test data.

        Args:
            X_ts (numpy.ndarray): Time series input features
            X_features (numpy.ndarray): Additional input features
            y (numpy.ndarray): Target labels

        Returns:
            dict: Evaluation metrics
        """
        if self.model is None:
            raise ValueError("Model has not been built or trained yet")

        # Convert y to one-hot encoding for multi-class
        if self.output_size == 3 and len(y.shape) == 1:
            y = tf.keras.utils.to_categorical(y, num_classes=3)

        # Evaluate the model
        logger.info(f"Evaluating Transformer model on {len(X_ts)} samples")
        eval_results = self.model.evaluate([X_ts, X_features], y, verbose=0)

        metrics = {}
        for metric, value in zip(self.model.metrics_names, eval_results):
            metrics[metric] = value
            logger.info(f"{metric}: {value:.4f}")

        return metrics

    def predict(self, X_ts, X_features=None):
        """
        Make predictions on new data.

        Args:
            X_ts (numpy.ndarray): Time series input features
            X_features (numpy.ndarray): Additional input features

        Returns:
            tuple: (y_pred, y_proba) where:
                y_pred is the predicted class (0/1 for binary, 0/1/2 for multi-class)
                y_proba is the class probability
        """
        if self.model is None:
            raise ValueError("Model has not been built or trained yet")

        # Ensure X_ts has the right shape
        if len(X_ts.shape) == 2:
            # Single sample, add batch dimension
            X_ts = np.expand_dims(X_ts, axis=0)

        # Ensure X_features has the right shape
        if X_features is None:
            # Create dummy features with zeros
            X_features = np.zeros((X_ts.shape[0], self.feature_input_shape))
        elif len(X_features.shape) == 1:
            # Single sample, add batch dimension
            X_features = np.expand_dims(X_features, axis=0)

        # Get predictions
        y_proba = self.model.predict([X_ts, X_features])

        # Process based on output type
        if self.output_size == 1:
            # Binary classification
            y_pred = (y_proba > 0.5).astype(int).flatten()
            return y_pred, y_proba.flatten()
        elif self.output_size == 3:
            # Multi-class classification
            y_pred = np.argmax(y_proba, axis=1)
            return y_pred, y_proba
        else:
            # Regression
            return y_proba, y_proba

    def save(self, filepath=None):
        """
        Save the model to disk.

        Args:
            filepath (str): Path to save the model

        Returns:
            str: Path where the model was saved
        """
        if self.model is None:
            raise ValueError("Model has not been built yet")

        if filepath is None:
            # Create a default filepath with a timestamp
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = os.path.join(self.model_dir, f"transformer_model_{timestamp}.h5")

        self.model.save(filepath)
        logger.info(f"Model saved to {filepath}")
        return filepath

    def load(self, filepath):
        """
        Load a saved model from disk.

        Args:
            filepath (str): Path to the saved model

        Returns:
            The loaded model
        """
        # Register custom layers
        custom_objects = {
            'TransformerBlock': TransformerBlock,
            'PositionalEncoding': PositionalEncoding
        }

        self.model = load_model(filepath, custom_objects=custom_objects)
        logger.info(f"Model loaded from {filepath}")
        return self.model

    def plot_training_history(self):
        """
        Plot training history (loss and metrics).

        Returns:
            str: Path to the saved plot
        """
        if self.history is None:
            raise ValueError("Model has not been trained yet")

        plt.figure(figsize=(12, 5))

        # Plot loss
        plt.subplot(1, 2, 1)
        plt.plot(self.history.history['loss'], label='Training Loss')
        if 'val_loss' in self.history.history:
            plt.plot(self.history.history['val_loss'], label='Validation Loss')
        plt.title('Model Loss')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()

        # Plot accuracy (or MAE for regression models)
        plt.subplot(1, 2, 2)
        if 'accuracy' in self.history.history:
            plt.plot(self.history.history['accuracy'], label='Training Accuracy')
            if 'val_accuracy' in self.history.history:
                plt.plot(self.history.history['val_accuracy'], label='Validation Accuracy')
            plt.title('Model Accuracy')
            plt.ylabel('Accuracy')
        elif 'mae' in self.history.history:
            plt.plot(self.history.history['mae'], label='Training MAE')
            if 'val_mae' in self.history.history:
                plt.plot(self.history.history['val_mae'], label='Validation MAE')
            plt.title('Model MAE')
            plt.ylabel('MAE')
        plt.xlabel('Epoch')
        plt.legend()

        plt.tight_layout()

        # Save the figure
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        fig_path = os.path.join(self.model_dir, f"transformer_training_history_{timestamp}.png")
        plt.savefig(fig_path)
        plt.close()

        logger.info(f"Training history plot saved to {fig_path}")
        return fig_path
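

# A minimal end-to-end sketch of how TransformerModel is intended to be used. It is
# illustrative only: the helper name `_demo_transformer_model`, the synthetic data, and
# the small epoch count are assumptions, not part of the original pipeline. With the
# default settings (ts_input_shape=(20, 5), feature_input_shape=64, output_size=1) the
# model expects two inputs per sample, a (20, 5) window of time series features and a
# 64-dimensional vector of extra features, and outputs a single up/down probability.
def _demo_transformer_model(n_samples=256):
    """Build, train, and query a TransformerModel on random data."""
    model = TransformerModel(ts_input_shape=(20, 5), feature_input_shape=64, output_size=1)
    model.build_model(embed_dim=32, num_heads=4, ff_dim=64, num_transformer_blocks=2)

    # Synthetic inputs and binary labels, purely for demonstration
    X_ts = np.random.rand(n_samples, 20, 5).astype(np.float32)
    X_features = np.random.rand(n_samples, 64).astype(np.float32)
    y = np.random.randint(0, 2, size=n_samples)

    model.train(X_ts, X_features, y, batch_size=32, epochs=2)
    y_pred, y_proba = model.predict(X_ts[:4], X_features[:4])
    return y_pred, y_proba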


class MixtureOfExpertsModel:
    """
    Mixture of Experts (MoE) model.

    This model combines predictions from multiple expert models
    (such as CNN and Transformer) using a weighted ensemble approach.
    """

    def __init__(self, output_size=1, model_dir="NN/models/saved"):
        """
        Initialize the MoE model.

        Args:
            output_size (int): Number of output classes (1 for binary, 3 for buy/hold/sell)
            model_dir (str): Directory to save trained models
        """
        self.output_size = output_size
        self.model_dir = model_dir
        self.model = None
        self.history = None
        self.experts = {}

        # Create the model directory if it doesn't exist
        os.makedirs(self.model_dir, exist_ok=True)

        logger.info(f"Initialized Mixture of Experts model with output size {output_size}")

    def add_expert(self, name, model):
        """
        Add an expert model to the MoE.

        Args:
            name (str): Name of the expert model
            model: The expert model instance

        Returns:
            None
        """
        self.experts[name] = model
        logger.info(f"Added expert model '{name}' to MoE")

    def build_model(self, ts_input_shape=(20, 5), expert_weights=None, learning_rate=0.001):
        """
        Build the MoE model by combining expert models.

        Args:
            ts_input_shape (tuple): Shape of time series input data
            expert_weights (dict): Weights for each expert model
            learning_rate (float): Learning rate for the Adam optimizer

        Returns:
            The compiled model
        """
        # Time series input
        ts_inputs = Input(shape=ts_input_shape, name="ts_input")

        # Additional feature input (from CNN)
        feature_inputs = Input(shape=(64,), name="feature_input")  # Default size for features

        # Process with each expert model
        expert_outputs = []
        expert_names = []

        for name, expert in self.experts.items():
            # Skip if the expert model is not valid or doesn't have a call/predict method
            if expert is None:
                logger.warning(f"Expert model '{name}' is None, skipping")
                continue

            try:
                # Different handling based on model type
                if name == 'cnn':
                    # CNN model takes only the time series input
                    expert_output = expert(ts_inputs)
                    expert_outputs.append(expert_output)
                    expert_names.append(name)
                elif name == 'transformer':
                    # Transformer model takes both time series and feature inputs
                    expert_output = expert([ts_inputs, feature_inputs])
                    expert_outputs.append(expert_output)
                    expert_names.append(name)
                else:
                    logger.warning(f"Unknown expert model type: {name}")
            except Exception as e:
                logger.error(f"Error adding expert '{name}': {str(e)}")

        if not expert_outputs:
            logger.error("No valid expert models found")
            return None

        # Determine expert weighting
        if expert_weights is None:
            # Equal weighting
            weights = [1.0 / len(expert_outputs)] * len(expert_outputs)
        else:
            # User-provided weights
            weights = [expert_weights.get(name, 1.0 / len(expert_outputs)) for name in expert_names]

        # Normalize weights
        weights = [w / sum(weights) for w in weights]

        # Combine expert outputs using a weighted average
        if len(expert_outputs) == 1:
            # Only one expert, use its output directly
            combined_output = expert_outputs[0]
        else:
            # Multiple experts, compute the weighted average
            weighted_outputs = [output * weight for output, weight in zip(expert_outputs, weights)]
            combined_output = Add()(weighted_outputs)

        # Create the MoE model
        moe_model = Model(inputs=[ts_inputs, feature_inputs], outputs=combined_output)

        # Compile the model
        if self.output_size == 1:
            # Binary classification
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='binary_crossentropy',
                metrics=['accuracy']
            )
        elif self.output_size == 3:
            # Multi-class classification for BUY/HOLD/SELL
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='categorical_crossentropy',
                metrics=['accuracy']
            )
        else:
            # Regression
            moe_model.compile(
                optimizer=Adam(learning_rate=learning_rate),
                loss='mse',
                metrics=['mae']
            )

        self.model = moe_model

        # Log the model summary
        self.model.summary(print_fn=lambda x: logger.info(x))

        logger.info(f"Built MoE model with weights: {weights}")
        return self.model

    def train(self, X_ts, X_features, y, batch_size=32, epochs=100, validation_split=0.2,
              callbacks=None, class_weights=None):
        """
        Train the MoE model on the provided data.

        Args:
            X_ts (numpy.ndarray): Time series input features
            X_features (numpy.ndarray): Additional input features
            y (numpy.ndarray): Target labels
            batch_size (int): Batch size
            epochs (int): Number of epochs
            validation_split (float): Fraction of data to use for validation
            callbacks (list): List of Keras callbacks
            class_weights (dict): Class weights for imbalanced datasets

        Returns:
            History object containing training metrics
        """
        if self.model is None:
            logger.error("MoE model has not been built yet")
            return None

        # Default callbacks if none provided
        if callbacks is None:
            # Create a timestamp for model checkpoints
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            callbacks = [
                EarlyStopping(
                    monitor='val_loss',
                    patience=10,
                    restore_best_weights=True
                ),
                ReduceLROnPlateau(
                    monitor='val_loss',
                    factor=0.5,
                    patience=5,
                    min_lr=1e-6
                ),
                ModelCheckpoint(
                    filepath=os.path.join(self.model_dir, f"moe_model_{timestamp}.h5"),
                    monitor='val_loss',
                    save_best_only=True
                )
            ]

        # Check if y needs to be one-hot encoded for multi-class
        if self.output_size == 3 and len(y.shape) == 1:
            y = tf.keras.utils.to_categorical(y, num_classes=3)

        # Train the model
        logger.info(f"Training MoE model with {len(X_ts)} samples, "
                    f"batch size {batch_size}, epochs {epochs}")
        self.history = self.model.fit(
            [X_ts, X_features], y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            callbacks=callbacks,
            class_weight=class_weights,
            verbose=2
        )

        # Save the trained model
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        model_path = os.path.join(self.model_dir, f"moe_model_final_{timestamp}.h5")
        self.model.save(model_path)
        logger.info(f"Model saved to {model_path}")

        # Save the training history
        history_path = os.path.join(self.model_dir, f"moe_model_history_{timestamp}.json")
        with open(history_path, 'w') as f:
            # Convert numpy values to Python native types for JSON serialization
            history_dict = {key: [float(value) for value in values]
                            for key, values in self.history.history.items()}
            json.dump(history_dict, f, indent=2)

        return self.history

    def predict(self, X_ts, X_features=None):
        """
        Make predictions on new data.

        Args:
            X_ts (numpy.ndarray): Time series input features
            X_features (numpy.ndarray): Additional input features

        Returns:
            tuple: (y_pred, y_proba) where:
                y_pred is the predicted class (0/1 for binary, 0/1/2 for multi-class)
                y_proba is the class probability
        """
        if self.model is None:
            raise ValueError("Model has not been built or trained yet")

        # Ensure X_ts has the right shape
        if len(X_ts.shape) == 2:
            # Single sample, add batch dimension
            X_ts = np.expand_dims(X_ts, axis=0)

        # Ensure X_features has the right shape
        if X_features is None:
            # Create dummy features with zeros
            X_features = np.zeros((X_ts.shape[0], 64))  # Default size
        elif len(X_features.shape) == 1:
            # Single sample, add batch dimension
            X_features = np.expand_dims(X_features, axis=0)

        # Get predictions
        y_proba = self.model.predict([X_ts, X_features])

        # Process based on output type
        if self.output_size == 1:
            # Binary classification
            y_pred = (y_proba > 0.5).astype(int).flatten()
            return y_pred, y_proba.flatten()
        elif self.output_size == 3:
            # Multi-class classification
            y_pred = np.argmax(y_proba, axis=1)
            return y_pred, y_proba
        else:
            # Regression
            return y_proba, y_proba

    def save(self, filepath=None):
        """
        Save the model to disk.

        Args:
            filepath (str): Path to save the model

        Returns:
            str: Path where the model was saved
        """
        if self.model is None:
            raise ValueError("Model has not been built yet")

        if filepath is None:
            # Create a default filepath with a timestamp
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filepath = os.path.join(self.model_dir, f"moe_model_{timestamp}.h5")

        self.model.save(filepath)
        logger.info(f"Model saved to {filepath}")
        return filepath

    def load(self, filepath):
        """
        Load a saved model from disk.

        Args:
            filepath (str): Path to the saved model

        Returns:
            The loaded model
        """
        # Register custom layers
        custom_objects = {
            'TransformerBlock': TransformerBlock,
            'PositionalEncoding': PositionalEncoding
        }

        self.model = load_model(filepath, custom_objects=custom_objects)
        logger.info(f"Model loaded from {filepath}")
        return self.model


# Example usage:
if __name__ == "__main__":
    # This would be a complete implementation in a real system
    print("Transformer and MoE models defined, but not implemented here.")
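
    # Beyond the placeholder message above, the following is an illustrative sketch of
    # how a MoE ensemble could be assembled from the classes in this module. It is an
    # assumption about intended usage, not the original demo: only a Transformer expert
    # is wired in (the CNN expert referenced by MixtureOfExpertsModel.build_model lives
    # in another module), and all data below is random, so the predictions are meaningless.
    transformer = TransformerModel(ts_input_shape=(20, 5), feature_input_shape=64, output_size=3)
    transformer.build_model()

    moe = MixtureOfExpertsModel(output_size=3)
    # The expert registered here is the underlying Keras model, which build_model
    # calls on its own symbolic inputs
    moe.add_expert('transformer', transformer.model)
    moe.build_model(ts_input_shape=(20, 5), expert_weights={'transformer': 1.0})

    # Query the ensemble with a single random sample (predict adds the batch dimension)
    sample_ts = np.random.rand(20, 5).astype(np.float32)
    sample_features = np.random.rand(64).astype(np.float32)
    y_pred, y_proba = moe.predict(sample_ts, sample_features)
    print(f"MoE prediction: class {y_pred[0]}, probabilities {y_proba[0]}")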