diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 28e6ccb..2fc7b59 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -23,35 +23,40 @@ logger = logging.getLogger(__name__)
 
 @dataclass
 class TradingTransformerConfig:
-    """Configuration for trading transformer models"""
-    # Model architecture
-    d_model: int = 512              # Model dimension
-    n_heads: int = 8                # Number of attention heads
-    n_layers: int = 6               # Number of transformer layers
-    d_ff: int = 2048                # Feed-forward dimension
+    """Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
+    # Model architecture - SCALED UP
+    d_model: int = 1024             # Model dimension (2x increase)
+    n_heads: int = 16               # Number of attention heads (2x increase)
+    n_layers: int = 12              # Number of transformer layers (2x increase)
+    d_ff: int = 4096                # Feed-forward dimension (2x increase)
     dropout: float = 0.1            # Dropout rate
 
-    # Input dimensions
-    seq_len: int = 100              # Sequence length for time series
-    cob_features: int = 50          # COB feature dimension
-    tech_features: int = 20         # Technical indicator features
-    market_features: int = 15       # Market microstructure features
+    # Input dimensions - ENHANCED
+    seq_len: int = 150              # Sequence length for time series (1.5x increase)
+    cob_features: int = 100         # COB feature dimension (2x increase)
+    tech_features: int = 40         # Technical indicator features (2x increase)
+    market_features: int = 30       # Market microstructure features (2x increase)
 
     # Output configuration
     n_actions: int = 3              # BUY, SELL, HOLD
     confidence_output: bool = True  # Output confidence scores
 
-    # Training configuration
-    learning_rate: float = 1e-4
-    weight_decay: float = 1e-5
-    warmup_steps: int = 4000
-    max_grad_norm: float = 1.0
+    # Training configuration - OPTIMIZED FOR LARGER MODEL
+    learning_rate: float = 5e-5     # Reduced for larger model
+    weight_decay: float = 1e-4      # Increased regularization
+    warmup_steps: int = 8000        # More warmup steps
+    max_grad_norm: float = 0.5      # Tighter gradient clipping
 
-    # Advanced features
+    # Advanced features - ENHANCED
     use_relative_position: bool = True
     use_multi_scale_attention: bool = True
     use_market_regime_detection: bool = True
     use_uncertainty_estimation: bool = True
+
+    # NEW: Additional scaling features
+    use_deep_attention: bool = True          # Deeper attention mechanisms
+    use_residual_connections: bool = True    # Enhanced residual connections
+    use_layer_norm_variants: bool = True     # Advanced normalization
 
 class PositionalEncoding(nn.Module):
     """Sinusoidal positional encoding for transformer"""
@@ -102,10 +107,10 @@ class RelativePositionalEncoding(nn.Module):
         return self.relative_position_embeddings(final_mat)
 
 
-class MultiScaleAttention(nn.Module):
-    """Multi-scale attention for capturing different time horizons"""
+class DeepMultiScaleAttention(nn.Module):
+    """Enhanced multi-scale attention with deeper mechanisms for 46M parameter model"""
 
-    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7]):
+    def __init__(self, d_model: int, n_heads: int, scales: List[int] = [1, 3, 5, 7, 11, 15]):
         super().__init__()
         self.d_model = d_model
         self.n_heads = n_heads
@@ -114,18 +119,49 @@ class MultiScaleAttention(nn.Module):
 
         assert d_model % n_heads == 0, "d_model must be divisible by n_heads"
 
-        # Multi-scale projections
+        # Enhanced multi-scale projections with deeper architecture
         self.scale_projections = nn.ModuleList([
             nn.ModuleDict({
-                'query': nn.Linear(d_model, d_model),
-                'key': nn.Linear(d_model, d_model),
-                'value': nn.Linear(d_model, d_model),
-                'conv': nn.Conv1d(d_model, d_model, kernel_size=scale,
-                                  padding=scale//2, groups=d_model)
+                'query': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'key': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'value': nn.Sequential(
+                    nn.Linear(d_model, d_model * 2),
+                    nn.GELU(),
+                    nn.Dropout(0.1),
+                    nn.Linear(d_model * 2, d_model)
+                ),
+                'conv': nn.Sequential(
+                    nn.Conv1d(d_model, d_model * 2, kernel_size=scale,
+                              padding=scale//2, groups=d_model),
+                    nn.GELU(),
+                    nn.Conv1d(d_model * 2, d_model, kernel_size=1)
+                )
             }) for scale in scales
         ])
 
-        self.output_projection = nn.Linear(d_model * len(scales), d_model)
+        # Enhanced output projection with residual connection
+        self.output_projection = nn.Sequential(
+            nn.Linear(d_model * len(scales), d_model * 2),
+            nn.GELU(),
+            nn.Dropout(0.1),
+            nn.Linear(d_model * 2, d_model)
+        )
+
+        # Additional attention mechanisms
+        self.cross_scale_attention = nn.MultiheadAttention(
+            d_model, n_heads // 2, dropout=0.1, batch_first=True
+        )
+        self.dropout = nn.Dropout(0.1)
 
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -133,10 +169,10 @@ class MultiScaleAttention(nn.Module):
         scale_outputs = []
 
         for scale_proj in self.scale_projections:
-            # Apply temporal convolution for this scale
+            # Apply enhanced temporal convolution for this scale
             x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
 
-            # Standard attention computation
+            # Enhanced attention computation with deeper projections
             Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
             V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
@@ -160,9 +196,15 @@ class MultiScaleAttention(nn.Module):
             scale_outputs.append(output)
 
-        # Combine multi-scale outputs
+        # Combine multi-scale outputs with enhanced projection
         combined = torch.cat(scale_outputs, dim=-1)
-        return self.output_projection(combined)
+        output = self.output_projection(combined)
+
+        # Apply cross-scale attention for better integration
+        cross_attended, _ = self.cross_scale_attention(output, output, output, attn_mask=mask)
+
+        # Residual connection
+        return output + cross_attended
 
 
 class MarketRegimeDetector(nn.Module):
     """Market regime detection module for adaptive behavior"""
@@ -249,9 +291,9 @@ class TradingTransformerLayer(nn.Module):
         super().__init__()
         self.config = config
 
-        # Multi-scale attention or standard attention
+        # Enhanced multi-scale attention or standard attention
         if config.use_multi_scale_attention:
-            self.attention = MultiScaleAttention(config.d_model, config.n_heads)
+            self.attention = DeepMultiScaleAttention(config.d_model, config.n_heads)
         else:
             self.attention = nn.MultiheadAttention(
                 config.d_model, config.n_heads, dropout=config.dropout, batch_first=True
@@ -278,7 +320,7 @@ class TradingTransformerLayer(nn.Module):
 
     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
         # Self-attention with residual connection
-        if isinstance(self.attention, MultiScaleAttention):
+        if isinstance(self.attention, DeepMultiScaleAttention):
             attn_output = self.attention(x, mask)
         else:
             attn_output, _ = self.attention(x, x, x, attn_mask=mask)
@@ -323,8 +365,11 @@ class AdvancedTradingTransformer(nn.Module):
             TradingTransformerLayer(config) for _ in range(config.n_layers)
         ])
 
-        # Output heads
+        # Enhanced output heads for 46M parameter model
         self.action_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
             nn.Linear(config.d_model, config.d_model // 2),
             nn.GELU(),
             nn.Dropout(config.dropout),
@@ -333,25 +378,48 @@ class AdvancedTradingTransformer(nn.Module):
 
         if config.confidence_output:
             self.confidence_head = nn.Sequential(
-                nn.Linear(config.d_model, config.d_model // 4),
+                nn.Linear(config.d_model, config.d_model // 2),
+                nn.GELU(),
+                nn.Dropout(config.dropout),
+                nn.Linear(config.d_model // 2, config.d_model // 4),
                 nn.GELU(),
                 nn.Dropout(config.dropout),
                 nn.Linear(config.d_model // 4, 1),
                 nn.Sigmoid()
             )
 
-        # Uncertainty estimation
+        # Enhanced uncertainty estimation
         if config.use_uncertainty_estimation:
            self.uncertainty_estimator = UncertaintyEstimation(config.d_model)
 
-        # Price prediction head (auxiliary task)
+        # Enhanced price prediction head (auxiliary task)
         self.price_head = nn.Sequential(
-            nn.Linear(config.d_model, config.d_model // 4),
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, config.d_model // 4),
             nn.GELU(),
             nn.Dropout(config.dropout),
             nn.Linear(config.d_model // 4, 1)
         )
 
+        # Additional specialized heads for 46M model
+        self.volatility_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Softplus()
+        )
+
+        self.trend_strength_head = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model // 2),
+            nn.GELU(),
+            nn.Dropout(config.dropout),
+            nn.Linear(config.d_model // 2, 1),
+            nn.Tanh()
+        )
+
         # Initialize weights
         self._init_weights()
 
@@ -434,10 +502,17 @@ class AdvancedTradingTransformer(nn.Module):
             outputs['uncertainty_mean'] = uncertainty_mean
             outputs['uncertainty_std'] = uncertainty_std
 
-        # Price prediction (auxiliary task)
+        # Enhanced price prediction (auxiliary task)
         price_pred = self.price_head(pooled)
         outputs['price_prediction'] = price_pred
 
+        # Additional specialized predictions for 46M model
+        volatility_pred = self.volatility_head(pooled)
+        outputs['volatility_prediction'] = volatility_pred
+
+        trend_strength_pred = self.trend_strength_head(pooled)
+        outputs['trend_strength_prediction'] = trend_strength_pred
+
         # Market regime information
         if regime_probs_history:
             outputs['regime_probs'] = torch.stack(regime_probs_history, dim=1)
diff --git a/NN/models/saved/checkpoint_metadata.json b/NN/models/saved/checkpoint_metadata.json
index d2413d7..427d241 100644
--- a/NN/models/saved/checkpoint_metadata.json
+++ b/NN/models/saved/checkpoint_metadata.json
@@ -271,15 +271,15 @@
   ],
   "decision": [
     {
-      "checkpoint_id": "decision_20250702_011418",
+      "checkpoint_id": "decision_20250702_012558",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011418.pt",
-      "created_at": "2025-07-02T01:14:18.986083",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012558.pt",
+      "created_at": "2025-07-02T01:25:58.614455",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990526608928,
+      "performance_score": 9.999991886192655,
       "accuracy": null,
-      "loss": 9.473391072236024e-06,
+      "loss": 8.113807345618998e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -291,15 +291,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011324",
+      "checkpoint_id": "decision_20250702_012504",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011324.pt",
-      "created_at": "2025-07-02T01:13:24.579781",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012504.pt",
+      "created_at": "2025-07-02T01:25:04.285477",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990382249775,
+      "performance_score": 9.999991852067678,
       "accuracy": null,
-      "loss": 9.617750224931245e-06,
+      "loss": 8.147932321987486e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -311,15 +311,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011348",
+      "checkpoint_id": "decision_20250702_012502",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011348.pt",
-      "created_at": "2025-07-02T01:13:48.808520",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+      "created_at": "2025-07-02T01:25:02.958656",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999990223319509,
+      "performance_score": 9.999991847589234,
       "accuracy": null,
-      "loss": 9.776680491212022e-06,
+      "loss": 8.152410765381393e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -331,15 +331,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011333",
+      "checkpoint_id": "decision_20250702_012503",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011333.pt",
-      "created_at": "2025-07-02T01:13:33.679719",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012503.pt",
+      "created_at": "2025-07-02T01:25:03.108239",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.999989776407977,
+      "performance_score": 9.99999184242316,
       "accuracy": null,
-      "loss": 1.0223592022232505e-05,
+      "loss": 8.157576839933662e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
@@ -351,15 +351,15 @@
       "wandb_artifact_name": null
     },
     {
-      "checkpoint_id": "decision_20250702_011411",
+      "checkpoint_id": "decision_20250702_012502",
       "model_name": "decision",
       "model_type": "decision_fusion",
-      "file_path": "NN\\models\\saved\\decision\\decision_20250702_011411.pt",
-      "created_at": "2025-07-02T01:14:11.738925",
+      "file_path": "NN\\models\\saved\\decision\\decision_20250702_012502.pt",
+      "created_at": "2025-07-02T01:25:02.603966",
       "file_size_mb": 0.06720924377441406,
-      "performance_score": 9.99998973893185,
+      "performance_score": 9.999991812171043,
       "accuracy": null,
-      "loss": 1.0261068149069225e-05,
+      "loss": 8.187828957696905e-06,
       "val_accuracy": null,
       "val_loss": null,
       "reward": null,
diff --git a/web/clean_dashboard.py b/web/clean_dashboard.py
index c11534f..87a1bfd 100644
--- a/web/clean_dashboard.py
+++ b/web/clean_dashboard.py
@@ -239,14 +239,17 @@ class CleanTradingDashboard:
                 from NN.models.advanced_transformer_trading import create_trading_transformer, TradingTransformerConfig
 
                 config = TradingTransformerConfig(
-                    d_model=256,
-                    n_heads=8,
-                    n_layers=4,
-                    seq_len=50,
+                    d_model=1024,   # 4x increase for 46M parameters
+                    n_heads=16,     # 2x increase
+                    n_layers=12,    # 3x increase
+                    seq_len=150,    # 3x increase
                     n_actions=3,
                     use_multi_scale_attention=True,
                     use_market_regime_detection=True,
-                    use_uncertainty_estimation=True
+                    use_uncertainty_estimation=True,
+                    use_deep_attention=True,
+                    use_residual_connections=True,
+                    use_layer_norm_variants=True
                 )
                 model, trainer = create_trading_transformer(config)
 
@@ -4600,14 +4603,17 @@ class CleanTradingDashboard:
 
         # Create transformer if not exists
         if transformer_model is None or transformer_trainer is None:
             config = TradingTransformerConfig(
-                d_model=256,
-                n_heads=8,
-                n_layers=4,
-                seq_len=50,
+                d_model=1024,   # 4x increase for 46M parameters
+                n_heads=16,     # 2x increase
+                n_layers=12,    # 3x increase
+                seq_len=150,    # 3x increase
                 n_actions=3,
                 use_multi_scale_attention=True,
                 use_market_regime_detection=True,
-                use_uncertainty_estimation=True
+                use_uncertainty_estimation=True,
+                use_deep_attention=True,
+                use_residual_connections=True,
+                use_layer_norm_variants=True
             )
             transformer_model, transformer_trainer = create_trading_transformer(config)
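
Note on the parameter count: the comments in this change assert the scaled-up configuration lands at roughly 46M parameters, but nothing in the diff measures it, and the deeper multi-scale projections plus the new volatility and trend-strength heads make the total hard to estimate by eye. Below is a minimal sanity-check sketch, not part of the change itself; it assumes only the import path and the create_trading_transformer(config) -> (model, trainer) factory already used in web/clean_dashboard.py, and prints the figure to compare against the 46M target.

# Rough sanity check of the "46M parameters" claim (assumes the factory shown in this diff).
from NN.models.advanced_transformer_trading import (
    TradingTransformerConfig,
    create_trading_transformer,
)

# Same settings the dashboard now passes in.
config = TradingTransformerConfig(
    d_model=1024,
    n_heads=16,
    n_layers=12,
    seq_len=150,
    n_actions=3,
    use_multi_scale_attention=True,
    use_market_regime_detection=True,
    use_uncertainty_estimation=True,
    use_deep_attention=True,
    use_residual_connections=True,
    use_layer_norm_variants=True,
)

model, _trainer = create_trading_transformer(config)

# Count parameters and compare the printed total against the intended 46M.
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total / 1e6:.1f}M  trainable: {trainable / 1e6:.1f}M")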