cleanup, CNN fixes

This commit is contained in:
Dobromir Popov
2025-07-05 00:12:40 +03:00
parent ce8c00a9d1
commit 5ca7493708
18 changed files with 587 additions and 5181 deletions

View File

@ -4,17 +4,16 @@ Neural Network Models
This package contains the neural network models used in the trading system:
- CNN Model: Deep convolutional neural network for feature extraction
- Transformer Model: Processes high-level features for improved pattern recognition
- MoE: Mixture of Experts model that combines multiple neural networks
- DQN Agent: Deep Q-Network for reinforcement learning
- COB RL Model: Specialized RL model for order book data
- Advanced Transformer: High-performance transformer for trading
PyTorch implementation only.
"""
from NN.models.cnn_model_pytorch import EnhancedCNNModel as CNNModel
from NN.models.transformer_model_pytorch import (
TransformerModelPyTorch as TransformerModel,
MixtureOfExpertsModelPyTorch as MixtureOfExpertsModel
)
from NN.models.cnn_model import EnhancedCNNModel as CNNModel
from NN.models.dqn_agent import DQNAgent
from NN.models.cob_rl_model import MassiveRLNetwork, COBRLModelInterface
from NN.models.advanced_transformer_trading import AdvancedTradingTransformer, TradingTransformerConfig
__all__ = ['CNNModel', 'TransformerModel', 'MixtureOfExpertsModel', 'MassiveRLNetwork', 'COBRLModelInterface']
__all__ = ['CNNModel', 'DQNAgent', 'MassiveRLNetwork', 'COBRLModelInterface', 'AdvancedTradingTransformer', 'TradingTransformerConfig']

View File

@ -329,13 +329,13 @@ class EnhancedCNNModel(nn.Module):
x = x.unsqueeze(0)
elif len(x.shape) > 3:
# Input has extra dimensions - flatten to [batch, seq, features]
x = x.view(x.shape[0], -1, x.shape[-1])
x = x.reshape(x.shape[0], -1, x.shape[-1])
x = self._memory_barrier(x) # Apply barrier after shape changes
batch_size, seq_len, features = x.shape
# Reshape for processing: [batch, seq, features] -> [batch*seq, features]
x_reshaped = x.view(-1, features)
x_reshaped = x.reshape(-1, features)
x_reshaped = self._memory_barrier(x_reshaped)
# Input embedding
@ -343,7 +343,7 @@ class EnhancedCNNModel(nn.Module):
embedded = self._memory_barrier(embedded)
# Reshape back for conv1d: [batch*seq, channels] -> [batch, channels, seq]
embedded = embedded.view(batch_size, seq_len, -1).transpose(1, 2).contiguous()
embedded = embedded.reshape(batch_size, seq_len, -1).transpose(1, 2).contiguous()
embedded = self._memory_barrier(embedded)
# Multi-scale feature extraction - ensure each path creates independent tensors
@ -380,10 +380,10 @@ class EnhancedCNNModel(nn.Module):
# Global aggregation - create independent tensors
avg_pooled = self.global_pool(attended_features)
avg_pooled = self._memory_barrier(avg_pooled.view(avg_pooled.shape[0], -1)) # Flatten instead of squeeze
avg_pooled = self._memory_barrier(avg_pooled.reshape(avg_pooled.shape[0], -1)) # Flatten instead of squeeze
max_pooled = self.global_max_pool(attended_features)
max_pooled = self._memory_barrier(max_pooled.view(max_pooled.shape[0], -1)) # Flatten instead of squeeze
max_pooled = self._memory_barrier(max_pooled.reshape(max_pooled.shape[0], -1)) # Flatten instead of squeeze
# Combine global features - create new tensor
global_features = torch.cat([avg_pooled, max_pooled], dim=1)
@ -399,7 +399,7 @@ class EnhancedCNNModel(nn.Module):
# Combine all features for final decision (8 regime classes + 1 volatility)
# Create completely independent tensors for concatenation
vol_pred_flat = self._memory_barrier(volatility_pred.view(volatility_pred.shape[0], -1)) # Flatten instead of squeeze
vol_pred_flat = self._memory_barrier(volatility_pred.reshape(volatility_pred.shape[0], -1)) # Flatten instead of squeeze
combined_features = torch.cat([processed_features, regime_probs, vol_pred_flat], dim=1)
combined_features = self._memory_barrier(combined_features)
@ -411,15 +411,15 @@ class EnhancedCNNModel(nn.Module):
trading_probs = self._memory_barrier(F.softmax(scaled_logits, dim=1))
# Flatten confidence to ensure consistent shape
confidence_flat = self._memory_barrier(confidence.view(confidence.shape[0], -1))
volatility_flat = self._memory_barrier(volatility_pred.view(volatility_pred.shape[0], -1))
confidence_flat = self._memory_barrier(confidence.reshape(confidence.shape[0], -1))
volatility_flat = self._memory_barrier(volatility_pred.reshape(volatility_pred.shape[0], -1))
return {
'logits': self._memory_barrier(trading_logits),
'probabilities': self._memory_barrier(trading_probs),
'confidence': confidence_flat[:, 0] if confidence_flat.shape[1] > 0 else confidence_flat.view(-1)[0],
'confidence': confidence_flat[:, 0] if confidence_flat.shape[1] > 0 else confidence_flat.reshape(-1)[0],
'regime': self._memory_barrier(regime_probs),
'volatility': volatility_flat[:, 0] if volatility_flat.shape[1] > 0 else volatility_flat.view(-1)[0],
'volatility': volatility_flat[:, 0] if volatility_flat.shape[1] > 0 else volatility_flat.reshape(-1)[0],
'features': self._memory_barrier(processed_features)
}

View File

@ -1,610 +0,0 @@
#!/usr/bin/env python3
"""
Enhanced CNN Model for Trading - PyTorch Implementation
Much larger and more sophisticated architecture for better learning
"""
import os
import logging
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import math
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F
from typing import Dict, Any, Optional, Tuple
# Configure logging
logger = logging.getLogger(__name__)
class MultiHeadAttention(nn.Module):
"""Multi-head attention mechanism for sequence data"""
def __init__(self, d_model: int, num_heads: int = 8, dropout: float = 0.1):
super().__init__()
assert d_model % num_heads == 0
self.d_model = d_model
self.num_heads = num_heads
self.d_k = d_model // num_heads
self.w_q = nn.Linear(d_model, d_model)
self.w_k = nn.Linear(d_model, d_model)
self.w_v = nn.Linear(d_model, d_model)
self.w_o = nn.Linear(d_model, d_model)
self.dropout = nn.Dropout(dropout)
self.scale = math.sqrt(self.d_k)
def forward(self, x: torch.Tensor) -> torch.Tensor:
batch_size, seq_len, _ = x.size()
# Compute Q, K, V
Q = self.w_q(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
K = self.w_k(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
V = self.w_v(x).view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
# Attention weights
scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
attention_weights = F.softmax(scores, dim=-1)
attention_weights = self.dropout(attention_weights)
# Apply attention
attention_output = torch.matmul(attention_weights, V)
attention_output = attention_output.transpose(1, 2).contiguous().view(
batch_size, seq_len, self.d_model
)
return self.w_o(attention_output)
class ResidualBlock(nn.Module):
"""Residual block with normalization and dropout"""
def __init__(self, channels: int, dropout: float = 0.1):
super().__init__()
self.conv1 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
self.conv2 = nn.Conv1d(channels, channels, kernel_size=3, padding=1)
self.norm1 = nn.BatchNorm1d(channels)
self.norm2 = nn.BatchNorm1d(channels)
self.dropout = nn.Dropout(dropout)
def forward(self, x: torch.Tensor) -> torch.Tensor:
residual = x
out = F.relu(self.norm1(self.conv1(x)))
out = self.dropout(out)
out = self.norm2(self.conv2(out))
# Add residual connection (avoid in-place operation)
out = out + residual
return F.relu(out)
class SpatialAttentionBlock(nn.Module):
"""Spatial attention for feature maps"""
def __init__(self, channels: int):
super().__init__()
self.conv = nn.Conv1d(channels, 1, kernel_size=1)
def forward(self, x: torch.Tensor) -> torch.Tensor:
# Compute attention weights
attention = torch.sigmoid(self.conv(x))
# Avoid in-place operation by creating new tensor
return torch.mul(x, attention)
class EnhancedCNNModel(nn.Module):
"""
Much larger and more sophisticated CNN architecture for trading
Features:
- Deep convolutional layers with residual connections
- Multi-head attention mechanisms
- Spatial attention blocks
- Multiple feature extraction paths
- Large capacity for complex pattern learning
"""
def __init__(self,
input_size: int = 60,
feature_dim: int = 50,
output_size: int = 2, # BUY/SELL for 2-action system
base_channels: int = 256, # Increased from 128 to 256
num_blocks: int = 12, # Increased from 6 to 12
num_attention_heads: int = 16, # Increased from 8 to 16
dropout_rate: float = 0.2):
super().__init__()
self.input_size = input_size
self.feature_dim = feature_dim
self.output_size = output_size
self.base_channels = base_channels
# Much larger input embedding - project features to higher dimension
self.input_embedding = nn.Sequential(
nn.Linear(feature_dim, base_channels // 2),
nn.BatchNorm1d(base_channels // 2),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels // 2, base_channels),
nn.BatchNorm1d(base_channels),
nn.ReLU(),
nn.Dropout(dropout_rate)
)
# Multi-scale convolutional feature extraction with more channels
self.conv_path1 = self._build_conv_path(base_channels, base_channels, 3)
self.conv_path2 = self._build_conv_path(base_channels, base_channels, 5)
self.conv_path3 = self._build_conv_path(base_channels, base_channels, 7)
self.conv_path4 = self._build_conv_path(base_channels, base_channels, 9) # Additional path
# Feature fusion with more capacity
self.feature_fusion = nn.Sequential(
nn.Conv1d(base_channels * 4, base_channels * 3, kernel_size=1), # 4 paths now
nn.BatchNorm1d(base_channels * 3),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Conv1d(base_channels * 3, base_channels * 2, kernel_size=1),
nn.BatchNorm1d(base_channels * 2),
nn.ReLU(),
nn.Dropout(dropout_rate)
)
# Much deeper residual blocks for complex pattern learning
self.residual_blocks = nn.ModuleList([
ResidualBlock(base_channels * 2, dropout_rate) for _ in range(num_blocks)
])
# More spatial attention blocks
self.spatial_attention = nn.ModuleList([
SpatialAttentionBlock(base_channels * 2) for _ in range(6) # Increased from 3 to 6
])
# Multiple temporal attention layers
self.temporal_attention1 = MultiHeadAttention(
d_model=base_channels * 2,
num_heads=num_attention_heads,
dropout=dropout_rate
)
self.temporal_attention2 = MultiHeadAttention(
d_model=base_channels * 2,
num_heads=num_attention_heads // 2,
dropout=dropout_rate
)
# Global feature aggregation
self.global_pool = nn.AdaptiveAvgPool1d(1)
self.global_max_pool = nn.AdaptiveMaxPool1d(1)
# Much larger advanced feature processing
self.advanced_features = nn.Sequential(
nn.Linear(base_channels * 4, base_channels * 6), # Increased capacity
nn.BatchNorm1d(base_channels * 6),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels * 6, base_channels * 4),
nn.BatchNorm1d(base_channels * 4),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels * 4, base_channels * 3),
nn.BatchNorm1d(base_channels * 3),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels * 3, base_channels * 2),
nn.BatchNorm1d(base_channels * 2),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels * 2, base_channels),
nn.BatchNorm1d(base_channels),
nn.ReLU(),
nn.Dropout(dropout_rate)
)
# Enhanced market regime detection branch
self.regime_detector = nn.Sequential(
nn.Linear(base_channels, base_channels // 2),
nn.BatchNorm1d(base_channels // 2),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels // 2, base_channels // 4),
nn.BatchNorm1d(base_channels // 4),
nn.ReLU(),
nn.Linear(base_channels // 4, 8), # 8 market regimes instead of 4
nn.Softmax(dim=1)
)
# Enhanced volatility prediction branch
self.volatility_predictor = nn.Sequential(
nn.Linear(base_channels, base_channels // 2),
nn.BatchNorm1d(base_channels // 2),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels // 2, base_channels // 4),
nn.BatchNorm1d(base_channels // 4),
nn.ReLU(),
nn.Linear(base_channels // 4, 1),
nn.Sigmoid()
)
# Main trading decision head
self.decision_head = nn.Sequential(
nn.Linear(base_channels + 8 + 1, base_channels), # 8 regime classes + 1 volatility
nn.BatchNorm1d(base_channels),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels, base_channels // 2),
nn.BatchNorm1d(base_channels // 2),
nn.ReLU(),
nn.Dropout(dropout_rate),
nn.Linear(base_channels // 2, output_size)
)
# Confidence estimation head
self.confidence_head = nn.Sequential(
nn.Linear(base_channels, base_channels // 2),
nn.ReLU(),
nn.Linear(base_channels // 2, 1),
nn.Sigmoid()
)
# Initialize weights
self._initialize_weights()
def _build_conv_path(self, in_channels: int, out_channels: int, kernel_size: int) -> nn.Module:
"""Build a convolutional path with multiple layers"""
return nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2),
nn.BatchNorm1d(out_channels),
nn.ReLU(),
nn.Dropout(0.1),
nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
nn.BatchNorm1d(out_channels),
nn.ReLU(),
nn.Dropout(0.1),
nn.Conv1d(out_channels, out_channels, kernel_size, padding=kernel_size//2),
nn.BatchNorm1d(out_channels),
nn.ReLU()
)
def _initialize_weights(self):
"""Initialize model weights"""
for m in self.modules():
if isinstance(m, nn.Conv1d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.xavier_normal_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm1d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def forward(self, x: torch.Tensor) -> Dict[str, torch.Tensor]:
"""
Forward pass with multiple outputs
Args:
x: Input tensor of shape [batch_size, sequence_length, features]
Returns:
Dictionary with predictions, confidence, regime, and volatility
"""
batch_size, seq_len, features = x.shape
# Reshape for processing: [batch, seq, features] -> [batch*seq, features]
x_reshaped = x.reshape(-1, features)
x_reshaped = self._memory_barrier(x_reshaped)
# Input embedding
embedded = self.input_embedding(x_reshaped) # [batch*seq, base_channels]
embedded = self._memory_barrier(embedded)
# Reshape back for conv1d: [batch*seq, channels] -> [batch, channels, seq]
embedded = embedded.reshape(batch_size, seq_len, -1).transpose(1, 2).contiguous()
# Multi-scale feature extraction
path1 = self.conv_path1(embedded)
path2 = self.conv_path2(embedded)
path3 = self.conv_path3(embedded)
path4 = self.conv_path4(embedded)
# Feature fusion
fused_features = torch.cat([path1, path2, path3, path4], dim=1)
fused_features = self.feature_fusion(fused_features)
# Apply residual blocks with spatial attention
current_features = fused_features
for i, (res_block, attention) in enumerate(zip(self.residual_blocks, self.spatial_attention)):
current_features = res_block(current_features)
if i % 2 == 0: # Apply attention every other block
current_features = attention(current_features)
# Apply remaining residual blocks
for res_block in self.residual_blocks[len(self.spatial_attention):]:
current_features = res_block(current_features)
# Temporal attention - apply both attention layers
# Reshape for attention: [batch, channels, seq] -> [batch, seq, channels]
attention_input = current_features.transpose(1, 2)
attended_features = self.temporal_attention1(attention_input)
attended_features = self.temporal_attention2(attended_features)
# Back to conv format: [batch, seq, channels] -> [batch, channels, seq]
attended_features = attended_features.transpose(1, 2)
# Global aggregation
avg_pooled = self.global_pool(attended_features).squeeze(-1) # [batch, channels]
max_pooled = self.global_max_pool(attended_features).squeeze(-1) # [batch, channels]
# Combine global features
global_features = torch.cat([avg_pooled, max_pooled], dim=1)
# Advanced feature processing
processed_features = self.advanced_features(global_features)
# Multi-task predictions
regime_probs = self.regime_detector(processed_features)
volatility_pred = self.volatility_predictor(processed_features)
confidence = self.confidence_head(processed_features)
# Combine all features for final decision (8 regime classes + 1 volatility)
combined_features = torch.cat([processed_features, regime_probs, volatility_pred], dim=1)
trading_logits = self.decision_head(combined_features)
# Apply temperature scaling for better calibration
temperature = 1.5
trading_probs = F.softmax(trading_logits / temperature, dim=1)
return {
'logits': trading_logits,
'probabilities': trading_probs,
'confidence': confidence.squeeze(-1),
'regime': regime_probs,
'volatility': volatility_pred.squeeze(-1),
'features': processed_features
}
def predict(self, feature_matrix: np.ndarray) -> Dict[str, Any]:
"""
Make predictions on feature matrix
Args:
feature_matrix: numpy array of shape [sequence_length, features]
Returns:
Dictionary with prediction results
"""
self.eval()
with torch.no_grad():
# Convert to tensor and add batch dimension
if isinstance(feature_matrix, np.ndarray):
x = torch.FloatTensor(feature_matrix).unsqueeze(0) # Add batch dim
else:
x = feature_matrix.unsqueeze(0)
# Move to device
device = next(self.parameters()).device
x = x.to(device)
# Forward pass
outputs = self.forward(x)
# Extract results with proper shape handling
probs = outputs['probabilities'].cpu().numpy()[0]
confidence_tensor = outputs['confidence'].cpu().numpy()
regime = outputs['regime'].cpu().numpy()[0]
volatility_tensor = outputs['volatility'].cpu().numpy()
# Handle confidence shape properly to avoid scalar conversion errors
if isinstance(confidence_tensor, np.ndarray):
if confidence_tensor.ndim == 0:
confidence = float(confidence_tensor.item())
elif confidence_tensor.size == 1:
confidence = float(confidence_tensor.flatten()[0])
else:
confidence = float(confidence_tensor[0] if len(confidence_tensor) > 0 else 0.7)
else:
confidence = float(confidence_tensor)
# Handle volatility shape properly
if isinstance(volatility_tensor, np.ndarray):
if volatility_tensor.ndim == 0:
volatility = float(volatility_tensor.item())
elif volatility_tensor.size == 1:
volatility = float(volatility_tensor.flatten()[0])
else:
volatility = float(volatility_tensor[0] if len(volatility_tensor) > 0 else 0.0)
else:
volatility = float(volatility_tensor)
# Determine action (0=BUY, 1=SELL for 2-action system)
action = int(np.argmax(probs))
action_confidence = float(probs[action])
return {
'action': action,
'action_name': 'BUY' if action == 0 else 'SELL',
'confidence': confidence, # Already converted to float above
'action_confidence': action_confidence,
'probabilities': probs.tolist(),
'regime_probabilities': regime.tolist(),
'volatility_prediction': volatility, # Already converted to float above
'raw_logits': outputs['logits'].cpu().numpy()[0].tolist()
}
def get_memory_usage(self) -> Dict[str, Any]:
"""Get model memory usage statistics"""
total_params = sum(p.numel() for p in self.parameters())
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
param_size = sum(p.numel() * p.element_size() for p in self.parameters())
buffer_size = sum(b.numel() * b.element_size() for b in self.buffers())
return {
'total_parameters': total_params,
'trainable_parameters': trainable_params,
'parameter_size_mb': param_size / (1024 * 1024),
'buffer_size_mb': buffer_size / (1024 * 1024),
'total_size_mb': (param_size + buffer_size) / (1024 * 1024)
}
def to_device(self, device: str):
"""Move model to specified device"""
return self.to(torch.device(device))
class CNNModelTrainer:
"""Enhanced trainer for the beefed-up CNN model"""
def __init__(self, model: EnhancedCNNModel, learning_rate: float = 0.0001, device: str = 'cuda'):
self.model = model.to(device)
self.device = device
self.learning_rate = learning_rate
# Use AdamW optimizer with weight decay
self.optimizer = torch.optim.AdamW(
model.parameters(),
lr=learning_rate,
weight_decay=0.01,
betas=(0.9, 0.999)
)
# Learning rate scheduler
self.scheduler = torch.optim.lr_scheduler.OneCycleLR(
self.optimizer,
max_lr=learning_rate * 10,
total_steps=10000, # Will be updated based on actual training
pct_start=0.1,
anneal_strategy='cos'
)
# Multi-task loss functions
self.main_criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
self.confidence_criterion = nn.BCELoss()
self.regime_criterion = nn.CrossEntropyLoss()
self.volatility_criterion = nn.MSELoss()
self.training_history = []
def train_step(self, x: torch.Tensor, y: torch.Tensor,
confidence_targets: Optional[torch.Tensor] = None,
regime_targets: Optional[torch.Tensor] = None,
volatility_targets: Optional[torch.Tensor] = None) -> Dict[str, float]:
"""Single training step with multi-task learning"""
self.model.train()
self.optimizer.zero_grad()
# Forward pass
outputs = self.model(x)
# Main trading loss
main_loss = self.main_criterion(outputs['logits'], y)
total_loss = main_loss
losses = {'main_loss': main_loss.item()}
# Confidence loss (if targets provided)
if confidence_targets is not None:
conf_loss = self.confidence_criterion(outputs['confidence'], confidence_targets)
total_loss += 0.1 * conf_loss
losses['confidence_loss'] = conf_loss.item()
# Regime classification loss (if targets provided)
if regime_targets is not None:
regime_loss = self.regime_criterion(outputs['regime'], regime_targets)
total_loss += 0.05 * regime_loss
losses['regime_loss'] = regime_loss.item()
# Volatility prediction loss (if targets provided)
if volatility_targets is not None:
vol_loss = self.volatility_criterion(outputs['volatility'], volatility_targets)
total_loss += 0.05 * vol_loss
losses['volatility_loss'] = vol_loss.item()
losses['total_loss'] = total_loss.item()
# Backward pass
total_loss.backward()
# Gradient clipping
torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
self.optimizer.step()
self.scheduler.step()
# Calculate accuracy
with torch.no_grad():
predictions = torch.argmax(outputs['probabilities'], dim=1)
accuracy = (predictions == y).float().mean().item()
losses['accuracy'] = accuracy
return losses
def save_model(self, filepath: str, metadata: Optional[Dict] = None):
"""Save model with metadata"""
save_dict = {
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'scheduler_state_dict': self.scheduler.state_dict(),
'training_history': self.training_history,
'model_config': {
'input_size': self.model.input_size,
'feature_dim': self.model.feature_dim,
'output_size': self.model.output_size,
'base_channels': self.model.base_channels
}
}
if metadata:
save_dict['metadata'] = metadata
torch.save(save_dict, filepath)
logger.info(f"Enhanced CNN model saved to {filepath}")
def load_model(self, filepath: str) -> Dict:
"""Load model from file"""
checkpoint = torch.load(filepath, map_location=self.device)
self.model.load_state_dict(checkpoint['model_state_dict'])
self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
if 'scheduler_state_dict' in checkpoint:
self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
if 'training_history' in checkpoint:
self.training_history = checkpoint['training_history']
logger.info(f"Enhanced CNN model loaded from {filepath}")
return checkpoint.get('metadata', {})
def create_enhanced_cnn_model(input_size: int = 60,
feature_dim: int = 50,
output_size: int = 2,
base_channels: int = 256,
device: str = 'cuda') -> Tuple[EnhancedCNNModel, CNNModelTrainer]:
"""Create enhanced CNN model and trainer"""
model = EnhancedCNNModel(
input_size=input_size,
feature_dim=feature_dim,
output_size=output_size,
base_channels=base_channels,
num_blocks=12,
num_attention_heads=16,
dropout_rate=0.2
)
trainer = CNNModelTrainer(model, learning_rate=0.0001, device=device)
logger.info(f"Created enhanced CNN model with {model.get_memory_usage()['total_parameters']:,} parameters")
return model, trainer

View File

@ -461,6 +461,10 @@ class DQNAgent:
action_values = q_values.cpu().data.numpy()[0]
# Calculate confidence scores
# Ensure q_values has correct shape for softmax
if q_values.dim() == 1:
q_values = q_values.unsqueeze(0)
sell_confidence = torch.softmax(q_values, dim=1)[0, 0].item()
buy_confidence = torch.softmax(q_values, dim=1)[0, 1].item()
@ -486,6 +490,10 @@ class DQNAgent:
state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
q_values = self.policy_net(state_tensor)
# Ensure q_values has correct shape for softmax
if q_values.dim() == 1:
q_values = q_values.unsqueeze(0)
# Convert Q-values to probabilities
action_probs = torch.softmax(q_values, dim=1)
action = q_values.argmax().item()

View File

@ -1,603 +0,0 @@
"""
Enhanced CNN Model with Bookmap Order Book Integration
This module extends the enhanced CNN to incorporate:
- Traditional market data (OHLCV, indicators)
- Order book depth features (COB)
- Volume profile features (SVP)
- Order flow signals (sweeps, absorptions, momentum)
- Market microstructure metrics
The integrated model provides comprehensive market awareness for superior trading decisions.
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import logging
from typing import Dict, List, Optional, Tuple, Any
logger = logging.getLogger(__name__)
class ResidualBlock(nn.Module):
"""Enhanced residual block with skip connections"""
def __init__(self, in_channels, out_channels, stride=1):
super(ResidualBlock, self).__init__()
self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
self.bn1 = nn.BatchNorm1d(out_channels)
self.conv2 = nn.Conv1d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
self.bn2 = nn.BatchNorm1d(out_channels)
# Shortcut connection
self.shortcut = nn.Sequential()
if stride != 1 or in_channels != out_channels:
self.shortcut = nn.Sequential(
nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
nn.BatchNorm1d(out_channels)
)
def forward(self, x):
out = F.relu(self.bn1(self.conv1(x)))
out = self.bn2(self.conv2(out))
# Avoid in-place operation
out = out + self.shortcut(x)
out = F.relu(out)
return out
class MultiHeadAttention(nn.Module):
"""Multi-head attention mechanism"""
def __init__(self, dim, num_heads=8, dropout=0.1):
super(MultiHeadAttention, self).__init__()
self.dim = dim
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.q_linear = nn.Linear(dim, dim)
self.k_linear = nn.Linear(dim, dim)
self.v_linear = nn.Linear(dim, dim)
self.dropout = nn.Dropout(dropout)
self.out = nn.Linear(dim, dim)
def forward(self, x):
batch_size, seq_len, dim = x.size()
# Linear transformations
q = self.q_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
k = self.k_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
v = self.v_linear(x).view(batch_size, seq_len, self.num_heads, self.head_dim)
# Transpose for attention
q = q.transpose(1, 2)
k = k.transpose(1, 2)
v = v.transpose(1, 2)
# Scaled dot-product attention
scores = torch.matmul(q, k.transpose(-2, -1)) / np.sqrt(self.head_dim)
attn_weights = F.softmax(scores, dim=-1)
attn_weights = self.dropout(attn_weights)
attn_output = torch.matmul(attn_weights, v)
attn_output = attn_output.transpose(1, 2).contiguous().view(batch_size, seq_len, dim)
return self.out(attn_output), attn_weights
class OrderBookEncoder(nn.Module):
"""Specialized encoder for order book data"""
def __init__(self, input_dim=100, hidden_dim=512):
super(OrderBookEncoder, self).__init__()
# Order book feature processing
self.bid_encoder = nn.Sequential(
nn.Linear(40, 128), # 20 levels x 2 features
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, 256),
nn.ReLU(),
nn.Dropout(0.2)
)
self.ask_encoder = nn.Sequential(
nn.Linear(40, 128), # 20 levels x 2 features
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(128, 256),
nn.ReLU(),
nn.Dropout(0.2)
)
# Microstructure features
self.microstructure_encoder = nn.Sequential(
nn.Linear(15, 64), # Liquidity + imbalance + flow features
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(64, 128),
nn.ReLU(),
nn.Dropout(0.2)
)
# Cross-attention between bids and asks
self.cross_attention = MultiHeadAttention(256, num_heads=8)
# Output projection
self.output_projection = nn.Sequential(
nn.Linear(256 + 256 + 128, hidden_dim), # Combine all features
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(hidden_dim, hidden_dim)
)
def forward(self, orderbook_features):
"""
Process order book features
Args:
orderbook_features: Tensor of shape [batch, 100] containing:
- 40 bid features (20 levels x 2)
- 40 ask features (20 levels x 2)
- 15 microstructure features
- 5 flow signal features
"""
# Split features
bid_features = orderbook_features[:, :40] # First 40 features
ask_features = orderbook_features[:, 40:80] # Next 40 features
micro_features = orderbook_features[:, 80:95] # Next 15 features
# flow_features = orderbook_features[:, 95:100] # Last 5 features (included in micro)
# Encode each component
bid_encoded = self.bid_encoder(bid_features) # [batch, 256]
ask_encoded = self.ask_encoder(ask_features) # [batch, 256]
micro_encoded = self.microstructure_encoder(micro_features) # [batch, 128]
# Add sequence dimension for attention
bid_seq = bid_encoded.unsqueeze(1) # [batch, 1, 256]
ask_seq = ask_encoded.unsqueeze(1) # [batch, 1, 256]
# Cross-attention between bids and asks
combined_seq = torch.cat([bid_seq, ask_seq], dim=1) # [batch, 2, 256]
attended_features, attention_weights = self.cross_attention(combined_seq)
# Flatten attended features
attended_flat = attended_features.reshape(attended_features.size(0), -1) # [batch, 512]
# Combine with microstructure features
combined_features = torch.cat([attended_flat, micro_encoded], dim=1) # [batch, 640]
# Final projection
output = self.output_projection(combined_features)
return output
class VolumeProfileEncoder(nn.Module):
"""Encoder for volume profile data"""
def __init__(self, max_levels=50, hidden_dim=256):
super(VolumeProfileEncoder, self).__init__()
self.max_levels = max_levels
# Process volume profile levels
self.level_encoder = nn.Sequential(
nn.Linear(7, 32), # price, volume, buy_vol, sell_vol, trades, vwap, net_vol
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(32, 64),
nn.ReLU()
)
# Attention over price levels
self.level_attention = MultiHeadAttention(64, num_heads=4)
# Final aggregation
self.aggregator = nn.Sequential(
nn.Linear(64, hidden_dim),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(hidden_dim, hidden_dim)
)
def forward(self, volume_profile_data):
"""
Process volume profile data
Args:
volume_profile_data: List of dicts or tensor with volume profile levels
"""
# If input is list of dicts, convert to tensor
if isinstance(volume_profile_data, list):
if not volume_profile_data:
# Return zero features if no data
return torch.zeros(1, 256, device=torch.device('cpu')) # Hardcoded output dim as per hidden_dim in class init
# Convert to tensor
features = []
for level in volume_profile_data[:self.max_levels]:
level_features = [
level.get('price', 0.0),
level.get('volume', 0.0),
level.get('buy_volume', 0.0),
level.get('sell_volume', 0.0),
level.get('trades_count', 0.0),
level.get('vwap', 0.0),
level.get('net_volume', 0.0)
]
features.append(level_features)
# Pad if needed
while len(features) < self.max_levels:
features.append([0.0] * 7)
volume_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
else:
volume_tensor = volume_profile_data
batch_size, num_levels, feature_dim = volume_tensor.shape
# Encode each level
level_features = self.level_encoder(volume_tensor.view(-1, feature_dim))
level_features = level_features.reshape(batch_size, num_levels, -1)
# Apply attention across levels
attended_levels, _ = self.level_attention(level_features)
# Global average pooling
aggregated = torch.mean(attended_levels, dim=1)
# Final processing
output = self.aggregator(aggregated)
return output
class EnhancedCNNWithOrderBook(nn.Module):
"""
Enhanced CNN model integrating traditional market data with order book analysis
Features:
- Multi-scale convolutional processing for time series data
- Specialized order book feature extraction
- Volume profile analysis
- Order flow signal integration
- Multi-head attention mechanisms
- Dueling architecture for value and advantage estimation
"""
def __init__(self,
market_input_shape=(60, 50), # Traditional market data
orderbook_features=100, # Order book feature dimension
n_actions=2,
confidence_threshold=0.5):
super(EnhancedCNNWithOrderBook, self).__init__()
self.market_input_shape = market_input_shape
self.orderbook_features = orderbook_features
self.n_actions = n_actions
self.confidence_threshold = confidence_threshold
# Traditional market data processing
self.market_encoder = self._build_market_encoder()
# Order book data processing
self.orderbook_encoder = OrderBookEncoder(
input_dim=orderbook_features,
hidden_dim=512
)
# Volume profile processing
self.volume_encoder = VolumeProfileEncoder(
max_levels=50,
hidden_dim=256
)
# Feature fusion
total_features = 1024 + 512 + 256 # market + orderbook + volume
self.feature_fusion = nn.Sequential(
nn.Linear(total_features, 1536),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(1536, 1024),
nn.ReLU(),
nn.Dropout(0.3)
)
# Multi-head attention for integrated features
self.integrated_attention = MultiHeadAttention(1024, num_heads=16)
# Dueling architecture
self.advantage_stream = nn.Sequential(
nn.Linear(1024, 512),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(512, 256),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(256, n_actions)
)
self.value_stream = nn.Sequential(
nn.Linear(1024, 512),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(512, 256),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(256, 1)
)
# Auxiliary heads for multi-task learning
self.extrema_head = nn.Sequential(
nn.Linear(1024, 512),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(512, 256),
nn.ReLU(),
nn.Linear(256, 3) # bottom, top, neither
)
self.market_regime_head = nn.Sequential(
nn.Linear(1024, 512),
nn.ReLU(),
nn.Dropout(0.3),
nn.Linear(512, 256),
nn.ReLU(),
nn.Linear(256, 8) # trending, ranging, volatile, etc.
)
self.confidence_head = nn.Sequential(
nn.Linear(1024, 256),
nn.ReLU(),
nn.Linear(256, 1),
nn.Sigmoid()
)
# Initialize weights
self._initialize_weights()
# Device management
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
self.to(self.device)
logger.info(f"Enhanced CNN with Order Book initialized")
logger.info(f"Market input shape: {market_input_shape}")
logger.info(f"Order book features: {orderbook_features}")
logger.info(f"Output actions: {n_actions}")
def _build_market_encoder(self):
"""Build traditional market data encoder"""
seq_len, feature_dim = self.market_input_shape
return nn.Sequential(
# Input projection
nn.Linear(feature_dim, 128),
nn.ReLU(),
nn.Dropout(0.2),
# Convolutional layers for temporal patterns
nn.Conv1d(128, 256, kernel_size=5, padding=2),
nn.BatchNorm1d(256),
nn.ReLU(),
nn.Dropout(0.2),
ResidualBlock(256, 512),
ResidualBlock(512, 512),
ResidualBlock(512, 768),
ResidualBlock(768, 768),
# Global pooling
nn.AdaptiveAvgPool1d(1),
nn.Flatten(),
# Final projection
nn.Linear(768, 1024),
nn.ReLU(),
nn.Dropout(0.3)
)
def _initialize_weights(self):
"""Initialize model weights"""
for m in self.modules():
if isinstance(m, nn.Conv1d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.Linear):
nn.init.xavier_normal_(m.weight)
if m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.BatchNorm1d):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
def forward(self, market_data, orderbook_data, volume_profile_data=None):
"""
Forward pass through integrated model
Args:
market_data: Traditional market data [batch, seq_len, features]
orderbook_data: Order book features [batch, orderbook_features]
volume_profile_data: Volume profile data (optional)
Returns:
Dictionary with Q-values, confidence, regime, and auxiliary predictions
"""
# Process market data - ensure batch dimension first
if len(market_data.shape) == 2:
market_data = market_data.unsqueeze(0)
batch_size = market_data.size(0) # Get correct batch size after shape adjustment
# Reshape for convolutional processing with safe dimensions
market_reshaped = market_data.reshape(batch_size, -1, market_data.size(-1))
market_features = self.market_encoder(market_reshaped.transpose(1, 2))
# Process order book data
orderbook_features = self.orderbook_encoder(orderbook_data)
# Process volume profile data
if volume_profile_data is not None:
volume_features = self.volume_encoder(volume_profile_data)
else:
volume_features = torch.zeros(batch_size, 256, device=market_data.device)
# Fuse all features
combined_features = torch.cat([
market_features,
orderbook_features,
volume_features
], dim=1)
# Feature fusion
fused_features = self.feature_fusion(combined_features)
# Apply attention
attended_features = fused_features.unsqueeze(1) # Add sequence dimension
attended_output, attention_weights = self.integrated_attention(attended_features)
final_features = attended_output.squeeze(1) # Remove sequence dimension
# Dueling architecture
advantage = self.advantage_stream(final_features)
value = self.value_stream(final_features)
# Combine value and advantage
q_values = value + advantage - advantage.mean(dim=1, keepdim=True)
# Auxiliary predictions
extrema_pred = self.extrema_head(final_features)
regime_pred = self.market_regime_head(final_features)
confidence = self.confidence_head(final_features)
return {
'q_values': q_values,
'confidence': confidence,
'extrema_prediction': extrema_pred,
'market_regime': regime_pred,
'attention_weights': attention_weights,
'integrated_features': final_features
}
def predict(self, market_data, orderbook_data, volume_profile_data=None):
"""Make prediction with confidence thresholding"""
self.eval()
with torch.no_grad():
# Convert inputs to tensors if needed
if isinstance(market_data, np.ndarray):
market_data = torch.FloatTensor(market_data).to(self.device)
if isinstance(orderbook_data, np.ndarray):
orderbook_data = torch.FloatTensor(orderbook_data).to(self.device)
# Ensure batch dimension
if len(market_data.shape) == 2:
market_data = market_data.unsqueeze(0)
if len(orderbook_data.shape) == 1:
orderbook_data = orderbook_data.unsqueeze(0)
# Forward pass
outputs = self.forward(market_data, orderbook_data, volume_profile_data)
# Get probabilities
q_values = outputs['q_values']
probs = F.softmax(q_values, dim=1)
# Handle confidence shape properly to avoid scalar conversion errors
confidence_tensor = outputs['confidence']
if isinstance(confidence_tensor, torch.Tensor):
if confidence_tensor.numel() == 1:
confidence = confidence_tensor.item()
else:
confidence = confidence_tensor.flatten()[0].item()
else:
confidence = float(confidence_tensor)
# Action selection with confidence thresholding
if confidence >= self.confidence_threshold:
action = torch.argmax(q_values, dim=1).item()
else:
action = None # No action due to low confidence
return {
'action': action,
'probabilities': probs.cpu().numpy()[0],
'confidence': confidence,
'q_values': q_values.cpu().numpy()[0],
'extrema_prediction': F.softmax(outputs['extrema_prediction'], dim=1).cpu().numpy()[0],
'market_regime': F.softmax(outputs['market_regime'], dim=1).cpu().numpy()[0]
}
def get_feature_importance(self, market_data, orderbook_data, volume_profile_data=None):
"""Analyze feature importance using gradients"""
self.eval()
# Enable gradient computation for inputs
market_data.requires_grad_(True)
orderbook_data.requires_grad_(True)
# Forward pass
outputs = self.forward(market_data, orderbook_data, volume_profile_data)
# Compute gradients for Q-values
q_values = outputs['q_values']
q_values.sum().backward()
# Get gradient magnitudes
market_importance = torch.abs(market_data.grad).mean().item()
orderbook_importance = torch.abs(orderbook_data.grad).mean().item()
return {
'market_importance': market_importance,
'orderbook_importance': orderbook_importance,
'total_importance': market_importance + orderbook_importance
}
def save(self, path):
"""Save model state"""
torch.save({
'model_state_dict': self.state_dict(),
'market_input_shape': self.market_input_shape,
'orderbook_features': self.orderbook_features,
'n_actions': self.n_actions,
'confidence_threshold': self.confidence_threshold
}, path)
logger.info(f"Enhanced CNN with Order Book saved to {path}")
def load(self, path):
"""Load model state"""
checkpoint = torch.load(path, map_location=self.device)
self.load_state_dict(checkpoint['model_state_dict'])
logger.info(f"Enhanced CNN with Order Book loaded from {path}")
def get_memory_usage(self):
"""Get model memory usage statistics"""
total_params = sum(p.numel() for p in self.parameters())
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
return {
'total_parameters': total_params,
'trainable_parameters': trainable_params,
'model_size_mb': total_params * 4 / (1024 * 1024), # Assuming float32
}
def create_enhanced_cnn_with_orderbook(
market_input_shape=(60, 50),
orderbook_features=100,
n_actions=2,
device='cuda'
):
"""Create and initialize enhanced CNN with order book integration"""
model = EnhancedCNNWithOrderBook(
market_input_shape=market_input_shape,
orderbook_features=orderbook_features,
n_actions=n_actions
)
if device and torch.cuda.is_available():
model = model.to(device)
memory_usage = model.get_memory_usage()
logger.info(f"Created Enhanced CNN with Order Book: {memory_usage['total_parameters']:,} parameters")
logger.info(f"Model size: {memory_usage['model_size_mb']:.1f} MB")
return model

View File

@ -1,653 +0,0 @@
#!/usr/bin/env python3
"""
Transformer Model - PyTorch Implementation
This module implements a Transformer model using PyTorch for time series analysis.
The model consists of a Transformer encoder and a Mixture of Experts model.
"""
import os
import logging
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Configure logging
logger = logging.getLogger(__name__)
class TransformerBlock(nn.Module):
"""Transformer Block with self-attention mechanism"""
def __init__(self, input_dim, num_heads=4, ff_dim=64, dropout=0.1):
super(TransformerBlock, self).__init__()
self.attention = nn.MultiheadAttention(
embed_dim=input_dim,
num_heads=num_heads,
dropout=dropout,
batch_first=True
)
self.feed_forward = nn.Sequential(
nn.Linear(input_dim, ff_dim),
nn.ReLU(),
nn.Linear(ff_dim, input_dim)
)
self.layernorm1 = nn.LayerNorm(input_dim)
self.layernorm2 = nn.LayerNorm(input_dim)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
def forward(self, x):
# Self-attention
attn_output, _ = self.attention(x, x, x)
x = x + self.dropout1(attn_output)
x = self.layernorm1(x)
# Feed forward
ff_output = self.feed_forward(x)
x = x + self.dropout2(ff_output)
x = self.layernorm2(x)
return x
class TransformerModelPyTorch(nn.Module):
"""PyTorch Transformer model for time series analysis"""
def __init__(self, input_shape, output_size=3, num_heads=4, ff_dim=64, num_transformer_blocks=2):
"""
Initialize the Transformer model.
Args:
input_shape (tuple): Shape of input data (window_size, features)
output_size (int): Size of output (1 for regression, 3 for classification)
num_heads (int): Number of attention heads
ff_dim (int): Feed forward dimension
num_transformer_blocks (int): Number of transformer blocks
"""
super(TransformerModelPyTorch, self).__init__()
window_size, num_features = input_shape
# Positional encoding
self.pos_encoding = nn.Parameter(
torch.zeros(1, window_size, num_features),
requires_grad=True
)
# Transformer blocks
self.transformer_blocks = nn.ModuleList([
TransformerBlock(
input_dim=num_features,
num_heads=num_heads,
ff_dim=ff_dim
) for _ in range(num_transformer_blocks)
])
# Global average pooling
self.global_avg_pool = nn.AdaptiveAvgPool1d(1)
# Dense layers
self.dense = nn.Sequential(
nn.Linear(num_features, 64),
nn.ReLU(),
nn.BatchNorm1d(64),
nn.Dropout(0.3),
nn.Linear(64, output_size)
)
# Activation based on output size
if output_size == 1:
self.activation = nn.Sigmoid() # Binary classification or regression
elif output_size > 1:
self.activation = nn.Softmax(dim=1) # Multi-class classification
else:
self.activation = nn.Identity() # No activation
def forward(self, x):
"""
Forward pass through the network.
Args:
x: Input tensor of shape [batch_size, window_size, features]
Returns:
Output tensor of shape [batch_size, output_size]
"""
# Add positional encoding
x = x + self.pos_encoding
# Apply transformer blocks
for transformer_block in self.transformer_blocks:
x = transformer_block(x)
# Global average pooling
x = x.transpose(1, 2) # [batch, features, window]
x = self.global_avg_pool(x) # [batch, features, 1]
x = x.squeeze(-1) # [batch, features]
# Dense layers
x = self.dense(x)
# Apply activation
return self.activation(x)
class TransformerModelPyTorchWrapper:
"""
Transformer model wrapper class for time series analysis using PyTorch.
This class provides methods for building, training, evaluating, and making
predictions with the Transformer model.
"""
def __init__(self, window_size, num_features, output_size=3, timeframes=None):
"""
Initialize the Transformer model.
Args:
window_size (int): Size of the input window
num_features (int): Number of features in the input data
output_size (int): Size of the output (1 for regression, 3 for classification)
timeframes (list): List of timeframes used (for logging)
"""
self.window_size = window_size
self.num_features = num_features
self.output_size = output_size
self.timeframes = timeframes or []
# Determine device (GPU or CPU)
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {self.device}")
# Initialize model
self.model = None
self.build_model()
# Initialize training history
self.history = {
'loss': [],
'val_loss': [],
'accuracy': [],
'val_accuracy': []
}
def build_model(self):
"""Build the Transformer model architecture"""
logger.info(f"Building PyTorch Transformer model with window_size={self.window_size}, "
f"num_features={self.num_features}, output_size={self.output_size}")
self.model = TransformerModelPyTorch(
input_shape=(self.window_size, self.num_features),
output_size=self.output_size
).to(self.device)
# Initialize optimizer
self.optimizer = optim.Adam(self.model.parameters(), lr=0.001)
# Initialize loss function based on output size
if self.output_size == 1:
self.criterion = nn.BCELoss() # Binary classification
elif self.output_size > 1:
self.criterion = nn.CrossEntropyLoss() # Multi-class classification
else:
self.criterion = nn.MSELoss() # Regression
logger.info(f"Model built successfully with {sum(p.numel() for p in self.model.parameters())} parameters")
def train(self, X_train, y_train, X_val=None, y_val=None, batch_size=32, epochs=100):
"""
Train the Transformer model.
Args:
X_train: Training input data
y_train: Training target data
X_val: Validation input data
y_val: Validation target data
batch_size: Batch size for training
epochs: Number of training epochs
Returns:
Training history
"""
logger.info(f"Training PyTorch Transformer model with {len(X_train)} samples, "
f"batch_size={batch_size}, epochs={epochs}")
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(self.device)
# Handle different output sizes for y_train
if self.output_size == 1:
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(self.device)
else:
y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(self.device)
# Create DataLoader for training data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Create DataLoader for validation data if provided
if X_val is not None and y_val is not None:
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(self.device)
if self.output_size == 1:
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(self.device)
else:
y_val_tensor = torch.tensor(y_val, dtype=torch.long).to(self.device)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
else:
val_loader = None
# Training loop
for epoch in range(epochs):
# Training phase
self.model.train()
running_loss = 0.0
correct = 0
total = 0
for inputs, targets in train_loader:
# Zero the parameter gradients
self.optimizer.zero_grad()
# Forward pass
outputs = self.model(inputs)
# Calculate loss
if self.output_size == 1:
loss = self.criterion(outputs, targets.unsqueeze(1))
else:
loss = self.criterion(outputs, targets)
# Backward pass and optimize
loss.backward()
self.optimizer.step()
# Statistics
running_loss += loss.item()
if self.output_size > 1:
_, predicted = torch.max(outputs, 1)
total += targets.size(0)
correct += (predicted == targets).sum().item()
epoch_loss = running_loss / len(train_loader)
epoch_acc = correct / total if total > 0 else 0
# Validation phase
if val_loader is not None:
val_loss, val_acc = self._validate(val_loader)
logger.info(f"Epoch {epoch+1}/{epochs} - "
f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f} - "
f"val_loss: {val_loss:.4f} - val_acc: {val_acc:.4f}")
# Update history
self.history['loss'].append(epoch_loss)
self.history['accuracy'].append(epoch_acc)
self.history['val_loss'].append(val_loss)
self.history['val_accuracy'].append(val_acc)
else:
logger.info(f"Epoch {epoch+1}/{epochs} - "
f"loss: {epoch_loss:.4f} - acc: {epoch_acc:.4f}")
# Update history without validation
self.history['loss'].append(epoch_loss)
self.history['accuracy'].append(epoch_acc)
logger.info("Training completed")
return self.history
def _validate(self, val_loader):
"""Validate the model using the validation set"""
self.model.eval()
val_loss = 0.0
correct = 0
total = 0
with torch.no_grad():
for inputs, targets in val_loader:
# Forward pass
outputs = self.model(inputs)
# Calculate loss
if self.output_size == 1:
loss = self.criterion(outputs, targets.unsqueeze(1))
else:
loss = self.criterion(outputs, targets)
val_loss += loss.item()
# Calculate accuracy
if self.output_size > 1:
_, predicted = torch.max(outputs, 1)
total += targets.size(0)
correct += (predicted == targets).sum().item()
return val_loss / len(val_loader), correct / total if total > 0 else 0
def evaluate(self, X_test, y_test):
"""
Evaluate the model on test data.
Args:
X_test: Test input data
y_test: Test target data
Returns:
dict: Evaluation metrics
"""
logger.info(f"Evaluating model on {len(X_test)} samples")
# Convert to PyTorch tensors
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(self.device)
# Get predictions
self.model.eval()
with torch.no_grad():
y_pred = self.model(X_test_tensor)
if self.output_size > 1:
_, y_pred_class = torch.max(y_pred, 1)
y_pred_class = y_pred_class.cpu().numpy()
else:
y_pred_class = (y_pred.cpu().numpy() > 0.5).astype(int).flatten()
# Calculate metrics
if self.output_size > 1:
accuracy = accuracy_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class, average='weighted')
recall = recall_score(y_test, y_pred_class, average='weighted')
f1 = f1_score(y_test, y_pred_class, average='weighted')
metrics = {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_score': f1
}
else:
accuracy = accuracy_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)
metrics = {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_score': f1
}
logger.info(f"Evaluation metrics: {metrics}")
return metrics
def predict(self, X):
"""
Make predictions with the model.
Args:
X: Input data
Returns:
Predictions
"""
# Convert to PyTorch tensor
X_tensor = torch.tensor(X, dtype=torch.float32).to(self.device)
# Get predictions
self.model.eval()
with torch.no_grad():
predictions = self.model(X_tensor)
if self.output_size > 1:
# Multi-class classification
probs = predictions.cpu().numpy()
_, class_preds = torch.max(predictions, 1)
class_preds = class_preds.cpu().numpy()
return class_preds, probs
else:
# Binary classification or regression
preds = predictions.cpu().numpy()
if self.output_size == 1:
# Binary classification
class_preds = (preds > 0.5).astype(int)
return class_preds.flatten(), preds.flatten()
else:
# Regression
return preds.flatten(), None
def save(self, filepath):
"""
Save the model to a file.
Args:
filepath: Path to save the model
"""
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Save the model state
model_state = {
'model_state_dict': self.model.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict(),
'history': self.history,
'window_size': self.window_size,
'num_features': self.num_features,
'output_size': self.output_size,
'timeframes': self.timeframes
}
torch.save(model_state, f"{filepath}.pt")
logger.info(f"Model saved to {filepath}.pt")
def load(self, filepath):
"""
Load the model from a file.
Args:
filepath: Path to load the model from
"""
# Check if file exists
if not os.path.exists(f"{filepath}.pt"):
logger.error(f"Model file {filepath}.pt not found")
return False
# Load the model state
model_state = torch.load(f"{filepath}.pt", map_location=self.device)
# Update model parameters
self.window_size = model_state['window_size']
self.num_features = model_state['num_features']
self.output_size = model_state['output_size']
self.timeframes = model_state['timeframes']
# Rebuild the model
self.build_model()
# Load the model state
self.model.load_state_dict(model_state['model_state_dict'])
self.optimizer.load_state_dict(model_state['optimizer_state_dict'])
self.history = model_state['history']
logger.info(f"Model loaded from {filepath}.pt")
return True
class MixtureOfExpertsModelPyTorch:
"""
Mixture of Experts model implementation using PyTorch.
This model combines predictions from multiple models (experts) using a
learned weighting scheme.
"""
def __init__(self, output_size=3, timeframes=None):
"""
Initialize the Mixture of Experts model.
Args:
output_size (int): Size of the output (1 for regression, 3 for classification)
timeframes (list): List of timeframes used (for logging)
"""
self.output_size = output_size
self.timeframes = timeframes or []
self.experts = {}
self.expert_weights = {}
# Determine device (GPU or CPU)
self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {self.device}")
# Initialize model and training history
self.model = None
self.history = {
'loss': [],
'val_loss': [],
'accuracy': [],
'val_accuracy': []
}
def add_expert(self, name, model):
"""
Add an expert model.
Args:
name (str): Name of the expert
model: Expert model
"""
self.experts[name] = model
logger.info(f"Added expert: {name}")
def predict(self, X):
"""
Make predictions using all experts and combine them.
Args:
X: Input data
Returns:
Combined predictions
"""
if not self.experts:
logger.error("No experts added to the MoE model")
return None
# Get predictions from each expert
expert_predictions = {}
for name, expert in self.experts.items():
pred, _ = expert.predict(X)
expert_predictions[name] = pred
# Combine predictions based on weights
final_pred = None
for name, pred in expert_predictions.items():
weight = self.expert_weights.get(name, 1.0 / len(self.experts))
if final_pred is None:
final_pred = weight * pred
else:
final_pred += weight * pred
# For classification, convert to class indices
if self.output_size > 1:
# Get class with highest probability
class_pred = np.argmax(final_pred, axis=1)
return class_pred, final_pred
else:
# Binary classification
class_pred = (final_pred > 0.5).astype(int)
return class_pred, final_pred
def evaluate(self, X_test, y_test):
"""
Evaluate the model on test data.
Args:
X_test: Test input data
y_test: Test target data
Returns:
dict: Evaluation metrics
"""
logger.info(f"Evaluating MoE model on {len(X_test)} samples")
# Get predictions
y_pred_class, _ = self.predict(X_test)
# Calculate metrics
if self.output_size > 1:
accuracy = accuracy_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class, average='weighted')
recall = recall_score(y_test, y_pred_class, average='weighted')
f1 = f1_score(y_test, y_pred_class, average='weighted')
metrics = {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_score': f1
}
else:
accuracy = accuracy_score(y_test, y_pred_class)
precision = precision_score(y_test, y_pred_class)
recall = recall_score(y_test, y_pred_class)
f1 = f1_score(y_test, y_pred_class)
metrics = {
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_score': f1
}
logger.info(f"MoE evaluation metrics: {metrics}")
return metrics
def save(self, filepath):
"""
Save the model weights to a file.
Args:
filepath: Path to save the model
"""
# Create directory if it doesn't exist
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Save the model state
model_state = {
'expert_weights': self.expert_weights,
'output_size': self.output_size,
'timeframes': self.timeframes
}
torch.save(model_state, f"{filepath}_moe.pt")
logger.info(f"MoE model saved to {filepath}_moe.pt")
def load(self, filepath):
"""
Load the model from a file.
Args:
filepath: Path to load the model from
"""
# Check if file exists
if not os.path.exists(f"{filepath}_moe.pt"):
logger.error(f"MoE model file {filepath}_moe.pt not found")
return False
# Load the model state
model_state = torch.load(f"{filepath}_moe.pt", map_location=self.device)
# Update model parameters
self.expert_weights = model_state['expert_weights']
self.output_size = model_state['output_size']
self.timeframes = model_state['timeframes']
logger.info(f"MoE model loaded from {filepath}_moe.pt")
return True