diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 28e6ccb..2fc7b59 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -23,35 +23,40 @@ logger = logging.getLogger(__name__)
 
 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models"""
-    # Model architecture
-    d_model: int = 512              # Model dimension
-    n_heads: int = 8                # Number of attention heads
-    n_layers: int = 6               # Number of transformer layers
-    d_ff: int = 2048                # Feed-forward dimension
+    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
+    # Model architecture - SCALED UP
+    d_model: int = 1024             # Model dimension (2x increase)
+    n_heads: int = 16               # Number of attention heads (2x increase)
+    n_layers: int = 12              # Number of transformer layers (2x increase)
+    d_ff: int = 4096                # Feed-forward dimension (2x increase)
     dropout: float = 0.1            # Dropout rate
 
-    # Input dimensions
-    seq_len: int = 100              # Sequence length for time series
-    cob_features: int = 50          # COB feature dimension
-    tech_features: int = 20         # Technical indicator features
-    market_features: int = 15       # Market microstructure features
+    # Input dimensions - ENHANCED
+    seq_len: int = 150              # Sequence length for time series (1.5x increase)
+    cob_features: int = 100         # COB feature dimension (2x increase)
+    tech_features: int = 40         # Technical indicator features (2x increase)
+    market_features: int = 30       # Market microstructure features (2x increase)
 
     # Output configuration
     n_actions: int = 3              # BUY, SELL, HOLD
     confidence_output: bool = True  # Output confidence scores
 
-    # Training configuration
-    learning_rate: float = 1e-4
-    weight_decay: float = 1e-5
-    warmup_steps: int = 4000
-    max_grad_norm: float = 1.0
+    # Training configuration - OPTIMIZED FOR LARGER MODEL
+    learning_rate: float = 5e-5     # Reduced for larger model
+    weight_decay: float = 1e-4      # Increased regularization
+    warmup_steps: int = 8000        # More warmup steps
+    max_grad_norm: float = 0.5      # Tighter gradient clipping
 
-    # Advanced features
+    # Advanced features - ENHANCED
     use_relative_position: bool = True
     use_multi_scale_attention: bool = True
     use_market_regime_detection: bool = True
     use_uncertainty_estimation: bool = True
+
+    # NEW: Additional scaling features
+    use_deep_attention: bool = True          # Deeper attention mechanisms
+    use_residual_connections: bool = True    # Enhanced residual connections
+    use_layer_norm_variants: bool = True     # Advanced normalization
 
 class PositionalEncoding(nn.Module):
     """Sinusoidal positional encoding for transformer"""
@@ -102,10 +107,10 @@ class RelativePositionalEncoding(nn.Module):
         return self.relative_position_embeddings(final_mat)
 
 
-class MultiScaleAttention(nn.Module):
-    """Multi-scale attention for capturing different time horizons"""
+class DeepMultiScaleAttention(nn.Module):
+    """Enhanced multi-scale attention with deeper mechanisms for 46M parameter model"""
 
-    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7]):
+    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7, 11, 15]):
         super().__init__()
         self.d_model = d_model
         self.n_heads = n_heads
@@ -114,18 +119,49 @@ class MultiScaleAttention(nn.Module):
 
         assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
 
-        # Multi-scale projections
+        # Enhanced multi-scale projections with deeper architecture
         self.scale_projections = nn.ModuleList([
             nn.ModuleDict({
-                'query': nn.Linear(d_model, d_model),
-                'key': nn.Linear(d_model, d_model),
-                'value': nn.Linear(d_model, d_model),
-                'conv': nn.Conv1d(d_model, d_model, kernel_size=scale,
-                                  padding=scale//2, groups=d_model)
+                'query': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'key': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'value': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'conv': nn.Sequential(
+                    nn.Conv1d(d_model, d_model * 2, kernel_size=scale,
+                              padding=scale//2, groups=d_model),
+                    nn.GELU(),
+                    nn.Conv1d(d_model * 2, d_model, kernel_size=1)
+                )
             }) for scale in scales
         ])
 
-        self.output_projection = nn.Linear(d_model * len(scales), d_model)
+        # Enhanced output projection with residual connection
+        self.output_projection = nn.Sequential(
+            nn.Linear(d_model * len(scales), d_model * 2),
+            nn.GELU(),
+            nn.Dropout(0.1),
+            nn.Linear(d_model * 2, d_model)
+        )
+
+        # Additional attention mechanisms
+        self.cross_scale_attention = nn.MultiheadAttention(
+            d_model, n_heads // 2, dropout=0.1, batch_first=True
+        )
+        self.dropout = nn.Dropout(0.1)
 
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -133,10 +169,10 @@ class MultiScaleAttention(nn.Module):
         scale_outputs = []
 
         for scale_proj in self.scale_projections:
-            # Apply temporal convolution for this scale
+            # Apply enhanced temporal convolution for this scale
             x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
 
-            # Standard attention computation
+            # Enhanced attention computation with deeper projections
             Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
@@ -160,9 +196,15 @@ class MultiScaleAttention(nn.Module):
             scale_outputs.append(output)
 
-        # Combine multi-scale outputs
+        # Combine multi-scale outputs with enhanced projection
         combined = torch.cat(scale_outputs, dim=-1)
-        return self.output_projection(combined)
+        output = self.output_projection(combined)
+
+        # Apply cross-scale attention for better integration
+        cross_attended, _ = self.cross_scale_attention(output, output, output, attn_mask=mask)
+
+        # Residual connection
+        return output + cross_attended
 
 
 class MarketRegimeDetector(nn.Module):
     """Market regime detection module for adaptive behavior"""
@@ -249,9 +291,9 @@ class TradingTransformerLayer(nn.Module):
         super().__init__()
         self.config = config
 
-        # Multi-scale attention or standard attention
+        # Enhanced multi-scale attention or standard attention
         if config.use_multi_scale_attention:
-            self.attention = MultiScaleAttention(config.d_model, config.n_heads)
+            self.attention = DeepMultiScaleAttention(config.d_model, config.n_heads)
         else:
             self.attention = nn.MultiheadAttention(
                 config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
@@ -278,7 +320,7 @@ class TradingTransformerLayer(nn.Module):
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
         # Self-attention with residual connection
-        if isinstance(self.attention, MultiScaleAttention):
+        if isinstance(self.attention, DeepMultiScaleAttention):
             attn_output = self.attention(x, mask)
         else:
             attn_output, _ = self.attention(x, x, x, attn_mask=mask)
@@ -323,8 +365,11 @@ class AdvancedTradingTransformer(nn.Module):
             TradingTransformerLayer(config) for _ in range(config.n_layers)
         ])
 
-        # Output heads
+        # Enhanced output heads for 46M parameter model
         self.action_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
             nn.Linear(config.d_model, config.d_model // 2),
             nn.GELU(),
             nn.Dropout(config.dropout),
@@ -333,25 +378,48 @@ class AdvancedTradingTransformer(nn.Module):
 
         if config.confidence_output:
             self.confidence_head = nn.Sequential(
-                nn.Linear(config.d_model, config.d_model // 4),
+                nn.Linear(config.d_model, config.d_model // 2),
+                nn.GELU(),
+                nn.Dropout(config.dropout),
+                nn.Linear(config.d_model // 2, config.d_model // 4),
                 nn.GELU(),
                 nn.Dropout(config.dropout),
                 nn.Linear(config.d_model // 4, 1),
                 nn.Sigmoid()
             )
 
-        # Uncertainty estimation
+        # Enhanced uncertainty estimation
         if config.use_uncertainty_estimation:
            self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
 
-        # Price prediction head (auxiliary task)
+        # Enhanced price prediction head (auxiliary task)
         self.price_head = nn.Sequential(
-            nn.Linear(config.d_model, config.d_model // 4),
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, config.d_model // 4),
             nn.GELU(),
             nn.Dropout(config.dropout),
             nn.Linear(config.d_model // 4, 1)
         )
 
+        # Additional specialized heads for 46M model
+        self.volatility_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Softplus()
+        )
+
+        self.trend_strength_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Tanh()
+        )
+
         # Initialize weights
         self._init_weights()
 
@@ -434,10 +502,17 @@ class AdvancedTradingTransformer(nn.Module):
             outputs['uncertainty_mean'] = uncertainty_mean
             outputs['uncertainty_std'] = uncertainty_std
 
-        # Price prediction (auxiliary task)
+        # Enhanced price prediction (auxiliary task)
         price_pred = self.price_head(pooled)
         outputs['price_prediction'] = price_pred
 
+        # Additional specialized predictions for 46M model
+        volatility_pred = self.volatility_head(pooled)
+        outputs['volatility_prediction'] = volatility_pred
+
+        trend_strength_pred = self.trend_strength_head(pooled)
+        outputs['trend_strength_prediction'] = trend_strength_pred
+
         # Market regime information
         if regime_probs_history:
             outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
diff --git a/NN/models/saved/checkpoint_metadata.json b/NN/models/saved/checkpoint_metadata.json
index d2413d7..427d241 100644
--- a/NN/models/saved/checkpoint_metadata.json
+++ b/NN/models/saved/checkpoint_metadata.json
@@ -271,15 +271,15 @@
   ],
   "decision": [
     {
-      "checkpoint_id": "decision_20250702_011418",
+      "checkpoint_id": "decision_20250702_012558",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011418.pt",
-      "created_at": "2025-07-02T01:14:18.986083",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012558.pt",
+      "created_at": "2025-07-02T01:25:58.614455",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990526608928,
+      "performance_score": 9.999991886192655,
       "accuracy": null,
-      "loss": 9.473391072236024e-06,
+      "loss": 8.113807345618998e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -291,15 +291,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011324",
+      "checkpoint_id": "decision_20250702_012504",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011324.pt",
-      "created_at": "2025-07-02T01:13:24.579781",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012504.pt",
+      "created_at": "2025-07-02T01:25:04.285477",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990382249775,
+      "performance_score": 9.999991852067678,
       "accuracy": null,
-      "loss": 9.617750224931245e-06,
+      "loss": 8.147932321987486e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -311,15 +311,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011348",
+      "checkpoint_id": "decision_20250702_012502",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011348.pt",
-      "created_at": "2025-07-02T01:13:48.808520",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+      "created_at": "2025-07-02T01:25:02.958656",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990223319509,
+      "performance_score": 9.999991847589234,
       "accuracy": null,
-      "loss": 9.776680491212022e-06,
+      "loss": 8.152410765381393e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -331,15 +331,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011333",
+      "checkpoint_id": "decision_20250702_012503",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011333.pt",
-      "created_at": "2025-07-02T01:13:33.679719",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012503.pt",
+      "created_at": "2025-07-02T01:25:03.108239",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999989776407977,
+      "performance_score": 9.99999184242316,
       "accuracy": null,
-      "loss": 1.0223592022232505e-05,
+      "loss": 8.157576839933662e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -351,15 +351,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011411",
+      "checkpoint_id": "decision_20250702_012502",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011411.pt",
-      "created_at": "2025-07-02T01:14:11.738925",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+      "created_at": "2025-07-02T01:25:02.603966",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.99998973893185,
+      "performance_score": 9.999991812171043,
       "accuracy": null,
-      "loss": 1.0261068149069225e-05,
+      "loss": 8.187828957696905e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
diff --git a/web/clean_dashboard.py b/web/clean_dashboard.py
index c11534f..87a1bfd 100644
--- a/web/clean_dashboard.py
+++ b/web/clean_dashboard.py
@@ -239,14 +239,17 @@ class CleanTradingDashboard:
                 from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
 
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,   # 4x increase for 46M parameters
+                    n_heads=16,     # 2x increase
+                    n_layers=12,    # 3x increase
+                    seq_len=150,    # 3x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
                 model, trainer = create_trading_transformer(config)
 
@@ -4600,14 +4603,17 @@ class CleanTradingDashboard:
 
         # Create transformer if not exists
         if transformer_model is None or transformer_trainer is None:
             config = TradingTransformerConfig(
-                d_model=256,
-                n_heads=8,
-                n_layers=4,
-                seq_len=50,
+                d_model=1024,   # 4x increase for 46M parameters
+                n_heads=16,     # 2x increase
+                n_layers=12,    # 3x increase
+                seq_len=150,    # 3x increase
                 n_actions=3,
                 use_multi_scale_attention=True,
                 use_market_regime_detection=True,
-                use_uncertainty_estimation=True
+                use_uncertainty_estimation=True,
+                use_deep_attention=True,
+                use_residual_connections=True,
+                use_layer_norm_variants=True
             )
             transformer_model, transformer_trainer = create_trading_transformer(config)
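
Note on the parameter count: the comments in this change assert the scaled-up configuration lands at roughly 46M parameters, but nothing in the diff measures it, and the deeper multi-scale projections plus the new volatility and trend-strength heads make the total hard to estimate by eye. Below is a minimal sanity-check sketch, not part of the change itself; it assumes only the import path and the create_trading_transformer(config) -> (model, trainer) factory already used in web/clean_dashboard.py, and prints the figure to compare against the 46M target.

# Rough sanity check of the "46M parameters" claim (assumes the factory shown in this diff).
from NN.models.advanced_transformer_trading import (
    TradingTransformerConfig,
    create_trading_transformer,
)

# Same settings the dashboard now passes in.
config = TradingTransformerConfig(
    d_model=1024,
    n_heads=16,
    n_layers=12,
    seq_len=150,
    n_actions=3,
    use_multi_scale_attention=True,
    use_market_regime_detection=True,
    use_uncertainty_estimation=True,
    use_deep_attention=True,
    use_residual_connections=True,
    use_layer_norm_variants=True,
)

model, _trainer = create_trading_transformer(config)

# Count parameters and compare the printed total against the intended 46M.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total / 1e6:.1f}M  trainable: {trainable / 1e6:.1f}M")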