beef up T model

Dobromir Popov
2025-07-02 01:26:07 +03:00
parent 0c8ae823ba
commit 8645f6e8dd
3 changed files with 156 additions and 75 deletions

View File

@@ -23,35 +23,40 @@ logger = logging.getLogger(__name__)
 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models"""
+    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
 
-    # Model architecture
-    d_model: int = 512    # Model dimension
-    n_heads: int = 8      # Number of attention heads
-    n_layers: int = 6     # Number of transformer layers
-    d_ff: int = 2048      # Feed-forward dimension
+    # Model architecture - SCALED UP
+    d_model: int = 1024   # Model dimension (2x increase)
+    n_heads: int = 16     # Number of attention heads (2x increase)
+    n_layers: int = 12    # Number of transformer layers (2x increase)
+    d_ff: int = 4096      # Feed-forward dimension (2x increase)
     dropout: float = 0.1  # Dropout rate
 
-    # Input dimensions
-    seq_len: int = 100         # Sequence length for time series
-    cob_features: int = 50     # COB feature dimension
-    tech_features: int = 20    # Technical indicator features
-    market_features: int = 15  # Market microstructure features
+    # Input dimensions - ENHANCED
+    seq_len: int = 150         # Sequence length for time series (1.5x increase)
+    cob_features: int = 100    # COB feature dimension (2x increase)
+    tech_features: int = 40    # Technical indicator features (2x increase)
+    market_features: int = 30  # Market microstructure features (2x increase)
 
     # Output configuration
     n_actions: int = 3              # BUY, SELL, HOLD
     confidence_output: bool = True  # Output confidence scores
 
-    # Training configuration
-    learning_rate: float = 1e-4
-    weight_decay: float = 1e-5
-    warmup_steps: int = 4000
-    max_grad_norm: float = 1.0
+    # Training configuration - OPTIMIZED FOR LARGER MODEL
+    learning_rate: float = 5e-5   # Reduced for larger model
+    weight_decay: float = 1e-4    # Increased regularization
+    warmup_steps: int = 8000      # More warmup steps
+    max_grad_norm: float = 0.5    # Tighter gradient clipping
 
-    # Advanced features
+    # Advanced features - ENHANCED
     use_relative_position: bool = True
     use_multi_scale_attention: bool = True
     use_market_regime_detection: bool = True
     use_uncertainty_estimation: bool = True
+
+    # NEW: Additional scaling features
+    use_deep_attention: bool = True          # Deeper attention mechanisms
+    use_residual_connections: bool = True    # Enhanced residual connections
+    use_layer_norm_variants: bool = True     # Advanced normalization
 
 class PositionalEncoding(nn.Module):
     """Sinusoidal positional encoding for transformer"""
@@ -102,10 +107,10 @@ class RelativePositionalEncoding(nn.Module):
         return self.relative_position_embeddings(final_mat)
 
-class MultiScaleAttention(nn.Module):
-    """Multi-scale attention for capturing different time horizons"""
+class DeepMultiScaleAttention(nn.Module):
+    """Enhanced multi-scale attention with deeper mechanisms for 46M parameter model"""
 
-    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7]):
+    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7, 11, 15]):
         super().__init__()
         self.d_model = d_model
         self.n_heads = n_heads
@@ -114,18 +119,49 @@ class MultiScaleAttention(nn.Module):
         assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
 
-        # Multi-scale projections
+        # Enhanced multi-scale projections with deeper architecture
         self.scale_projections = nn.ModuleList([
             nn.ModuleDict({
-                'query': nn.Linear(d_model, d_model),
-                'key': nn.Linear(d_model, d_model),
-                'value': nn.Linear(d_model, d_model),
-                'conv': nn.Conv1d(d_model, d_model, kernel_size=scale,
-                                 padding=scale//2, groups=d_model)
+                'query': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'key': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'value': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'conv': nn.Sequential(
+                    nn.Conv1d(d_model, d_model * 2, kernel_size=scale,
+                             padding=scale//2, groups=d_model),
+                    nn.GELU(),
+                    nn.Conv1d(d_model * 2, d_model, kernel_size=1)
+                )
             }) for scale in scales
         ])
 
-        self.output_projection = nn.Linear(d_model * len(scales), d_model)
+        # Enhanced output projection with residual connection
+        self.output_projection = nn.Sequential(
+            nn.Linear(d_model * len(scales), d_model * 2),
+            nn.GELU(),
+            nn.Dropout(0.1),
+            nn.Linear(d_model * 2, d_model)
+        )
+
+        # Additional attention mechanisms
+        self.cross_scale_attention = nn.MultiheadAttention(
+            d_model, n_heads // 2, dropout=0.1, batch_first=True
+        )
+
         self.dropout = nn.Dropout(0.1)
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -133,10 +169,10 @@ class MultiScaleAttention(nn.Module):
         scale_outputs = []
 
         for scale_proj in self.scale_projections:
-            # Apply temporal convolution for this scale
+            # Apply enhanced temporal convolution for this scale
            x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
 
-            # Standard attention computation
+            # Enhanced attention computation with deeper projections
             Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
@@ -160,9 +196,15 @@ class MultiScaleAttention(nn.Module):
             scale_outputs.append(output)
 
-        # Combine multi-scale outputs
+        # Combine multi-scale outputs with enhanced projection
         combined = torch.cat(scale_outputs, dim=-1)
-        return self.output_projection(combined)
+        output = self.output_projection(combined)
+
+        # Apply cross-scale attention for better integration
+        cross_attended, _ = self.cross_scale_attention(output, output, output, attn_mask=mask)
+
+        # Residual connection
+        return output + cross_attended
 
 class MarketRegimeDetector(nn.Module):
     """Market regime detection module for adaptive behavior"""
@@ -249,9 +291,9 @@ class TradingTransformerLayer(nn.Module):
         super().__init__()
         self.config = config
 
-        # Multi-scale attention or standard attention
+        # Enhanced multi-scale attention or standard attention
         if config.use_multi_scale_attention:
-            self.attention = MultiScaleAttention(config.d_model, config.n_heads)
+            self.attention = DeepMultiScaleAttention(config.d_model, config.n_heads)
         else:
             self.attention = nn.MultiheadAttention(
                 config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
@@ -278,7 +320,7 @@ class TradingTransformerLayer(nn.Module):
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
         # Self-attention with residual connection
-        if isinstance(self.attention, MultiScaleAttention):
+        if isinstance(self.attention, DeepMultiScaleAttention):
             attn_output = self.attention(x, mask)
         else:
             attn_output, _ = self.attention(x, x, x, attn_mask=mask)
@@ -323,8 +365,11 @@ class AdvancedTradingTransformer(nn.Module):
             TradingTransformerLayer(config) for _ in range(config.n_layers)
         ])
 
-        # Output heads
+        # Enhanced output heads for 46M parameter model
         self.action_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
             nn.Linear(config.d_model, config.d_model // 2),
             nn.GELU(),
             nn.Dropout(config.dropout),
@@ -333,25 +378,48 @@ class AdvancedTradingTransformer(nn.Module):
         if config.confidence_output:
             self.confidence_head = nn.Sequential(
-                nn.Linear(config.d_model, config.d_model // 4),
+                nn.Linear(config.d_model, config.d_model // 2),
+                nn.GELU(),
+                nn.Dropout(config.dropout),
+                nn.Linear(config.d_model // 2, config.d_model // 4),
                 nn.GELU(),
                 nn.Dropout(config.dropout),
                 nn.Linear(config.d_model // 4, 1),
                 nn.Sigmoid()
             )
 
-        # Uncertainty estimation
+        # Enhanced uncertainty estimation
         if config.use_uncertainty_estimation:
             self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
 
-        # Price prediction head (auxiliary task)
+        # Enhanced price prediction head (auxiliary task)
         self.price_head = nn.Sequential(
-            nn.Linear(config.d_model, config.d_model // 4),
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, config.d_model // 4),
             nn.GELU(),
             nn.Dropout(config.dropout),
             nn.Linear(config.d_model // 4, 1)
         )
+
+        # Additional specialized heads for 46M model
+        self.volatility_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Softplus()
+        )
+
+        self.trend_strength_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Tanh()
+        )
 
         # Initialize weights
         self._init_weights()
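
The two new heads are small MLPs over the pooled sequence representation. A standalone sketch mirroring them at the scaled defaults (d_model=1024, dropout=0.1), to show the output ranges the final activations imply:

    import torch
    import torch.nn as nn

    d_model, dropout = 1024, 0.1
    volatility_head = nn.Sequential(
        nn.Linear(d_model, d_model // 2), nn.GELU(), nn.Dropout(dropout),
        nn.Linear(d_model // 2, 1), nn.Softplus()   # non-negative volatility estimate
    )
    trend_strength_head = nn.Sequential(
        nn.Linear(d_model, d_model // 2), nn.GELU(), nn.Dropout(dropout),
        nn.Linear(d_model // 2, 1), nn.Tanh()       # trend strength bounded in [-1, 1]
    )
    pooled = torch.randn(4, d_model)                # stand-in for pooled transformer features
    print(volatility_head(pooled).shape, trend_strength_head(pooled).shape)  # torch.Size([4, 1]) each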
@@ -434,10 +502,17 @@ class AdvancedTradingTransformer(nn.Module):
             outputs['uncertainty_mean'] = uncertainty_mean
             outputs['uncertainty_std'] = uncertainty_std
 
-        # Price prediction (auxiliary task)
+        # Enhanced price prediction (auxiliary task)
         price_pred = self.price_head(pooled)
         outputs['price_prediction'] = price_pred
+
+        # Additional specialized predictions for 46M model
+        volatility_pred = self.volatility_head(pooled)
+        outputs['volatility_prediction'] = volatility_pred
+
+        trend_strength_pred = self.trend_strength_head(pooled)
+        outputs['trend_strength_prediction'] = trend_strength_pred
 
         # Market regime information
         if regime_probs_history:
             outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
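
This commit only adds the new predictions to the outputs dict; how they feed training is not shown. One option is an auxiliary multi-task loss over the new keys, sketched below with illustrative weights and hypothetical target tensors (only the output key names come from the diff):

    import torch.nn.functional as F

    def auxiliary_loss(outputs, price_target, volatility_target, trend_target):
        loss_price = F.mse_loss(outputs['price_prediction'].squeeze(-1), price_target)
        loss_vol = F.mse_loss(outputs['volatility_prediction'].squeeze(-1), volatility_target)
        loss_trend = F.mse_loss(outputs['trend_strength_prediction'].squeeze(-1), trend_target)
        return loss_price + 0.5 * loss_vol + 0.5 * loss_trend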

View File

@@ -271,15 +271,15 @@
         ],
         "decision": [
             {
-                "checkpoint_id": "decision_20250702_011418",
+                "checkpoint_id": "decision_20250702_012558",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011418.pt",
-                "created_at": "2025-07-02T01:14:18.986083",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012558.pt",
+                "created_at": "2025-07-02T01:25:58.614455",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990526608928,
+                "performance_score": 9.999991886192655,
                 "accuracy": null,
-                "loss": 9.473391072236024e-06,
+                "loss": 8.113807345618998e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -291,15 +291,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011324",
+                "checkpoint_id": "decision_20250702_012504",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011324.pt",
-                "created_at": "2025-07-02T01:13:24.579781",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012504.pt",
+                "created_at": "2025-07-02T01:25:04.285477",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990382249775,
+                "performance_score": 9.999991852067678,
                 "accuracy": null,
-                "loss": 9.617750224931245e-06,
+                "loss": 8.147932321987486e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -311,15 +311,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011348",
+                "checkpoint_id": "decision_20250702_012502",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011348.pt",
-                "created_at": "2025-07-02T01:13:48.808520",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+                "created_at": "2025-07-02T01:25:02.958656",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999990223319509,
+                "performance_score": 9.999991847589234,
                 "accuracy": null,
-                "loss": 9.776680491212022e-06,
+                "loss": 8.152410765381393e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -331,15 +331,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011333",
+                "checkpoint_id": "decision_20250702_012503",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011333.pt",
-                "created_at": "2025-07-02T01:13:33.679719",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012503.pt",
+                "created_at": "2025-07-02T01:25:03.108239",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.999989776407977,
+                "performance_score": 9.99999184242316,
                 "accuracy": null,
-                "loss": 1.0223592022232505e-05,
+                "loss": 8.157576839933662e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,
@@ -351,15 +351,15 @@
                 "wandb_artifact_name": null
             },
             {
-                "checkpoint_id": "decision_20250702_011411",
+                "checkpoint_id": "decision_20250702_012502",
                 "model_name": "decision",
                 "model_type": "decision_fusion",
-                "file_path": "NN\\models\\saved\\decision\\decision_20250702_011411.pt",
-                "created_at": "2025-07-02T01:14:11.738925",
+                "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+                "created_at": "2025-07-02T01:25:02.603966",
                 "file_size_mb": 0.06720924377441406,
-                "performance_score": 9.99998973893185,
+                "performance_score": 9.999991812171043,
                 "accuracy": null,
-                "loss": 1.0261068149069225e-05,
+                "loss": 8.187828957696905e-06,
                 "val_accuracy": null,
                 "val_loss": null,
                 "reward": null,

View File

@@ -239,14 +239,17 @@ class CleanTradingDashboard:
                 from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
 
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,  # 2x increase for 46M parameters
+                    n_heads=16,    # 2x increase
+                    n_layers=12,   # 2x increase
+                    seq_len=150,   # 1.5x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
 
                 model, trainer = create_trading_transformer(config)
@@ -4600,14 +4603,17 @@ class CleanTradingDashboard:
             # Create transformer if not exists
             if transformer_model is None or transformer_trainer is None:
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,  # 2x increase for 46M parameters
+                    n_heads=16,    # 2x increase
+                    n_layers=12,   # 2x increase
+                    seq_len=150,   # 1.5x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
                 transformer_model, transformer_trainer = create_trading_transformer(config)