training progress

Dobromir Popov
2025-11-10 20:01:07 +02:00
parent a2d34c6d7c
commit 999dea9eb0
2 changed files with 116 additions and 38 deletions


@@ -23,19 +23,19 @@ logger = logging.getLogger(__name__)
@dataclass
class TradingTransformerConfig:
"""Configuration for trading transformer models - SCALED TO 46M PARAMETERS"""
# Model architecture - SCALED UP
d_model: int = 1024 # Model dimension (2x increase)
n_heads: int = 16 # Number of attention heads (2x increase)
n_layers: int = 12 # Number of transformer layers (2x increase)
d_ff: int = 4096 # Feed-forward dimension (2x increase)
"""Configuration for trading transformer models - WITH PROPER MEMORY MANAGEMENT"""
# Model architecture - RESTORED to original size (memory leak fixed)
d_model: int = 1024 # Model dimension
n_heads: int = 16 # Number of attention heads
n_layers: int = 12 # Number of transformer layers
d_ff: int = 4096 # Feed-forward dimension
dropout: float = 0.1 # Dropout rate
# Input dimensions - ENHANCED
seq_len: int = 150 # Sequence length for time series (1.5x increase)
cob_features: int = 100 # COB feature dimension (2x increase)
tech_features: int = 40 # Technical indicator features (2x increase)
market_features: int = 30 # Market microstructure features (2x increase)
# Input dimensions - RESTORED
seq_len: int = 200 # Sequence length for time series
cob_features: int = 100 # COB feature dimension
tech_features: int = 40 # Technical indicator features
market_features: int = 30 # Market microstructure features
# Output configuration
n_actions: int = 3 # BUY, SELL, HOLD
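The docstring still advertises 46M parameters while the restored dimensions (d_model=1024, n_layers=12, d_ff=4096) imply a different total, so it is worth counting parameters directly after a config change. A minimal sketch, assuming the config and model classes shown in this diff can be imported from the module they live in (the import path below is illustrative, and the constructor is assumed to take the config object):

# Hypothetical import path; point it at the module this diff modifies.
from advanced_trading_transformer import TradingTransformerConfig, AdvancedTradingTransformer

config = TradingTransformerConfig()          # d_model=1024, n_layers=12, d_ff=4096
model = AdvancedTradingTransformer(config)   # assumption: constructor takes the config

total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"total: {total / 1e6:.1f}M params, trainable: {trainable / 1e6:.1f}M")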
@@ -358,6 +358,7 @@ class AdvancedTradingTransformer(nn.Module):
# SERIAL: Shared pattern encoder (learns candle patterns ONCE for all timeframes)
# This is applied to each timeframe independently but uses SAME weights
# RESTORED: Original dimensions (memory leak fixed)
self.shared_pattern_encoder = nn.Sequential(
nn.Linear(5, config.d_model // 4), # 5 OHLCV -> 256
nn.LayerNorm(config.d_model // 4),
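The "SERIAL" comment describes one encoder, with one set of weights, applied to every timeframe. A minimal sketch of that weight-sharing pattern; the batch size, sequence length, and timeframe names are purely illustrative, and the GELU activation is an assumption since the hunk cuts off after the LayerNorm:

import torch
import torch.nn as nn

d_model = 1024
shared_pattern_encoder = nn.Sequential(
    nn.Linear(5, d_model // 4),     # 5 OHLCV features -> 256
    nn.LayerNorm(d_model // 4),
    nn.GELU(),                      # assumed activation
)

# Illustrative per-timeframe inputs: [batch, seq_len, 5 OHLCV]
timeframe_inputs = {tf: torch.randn(8, 200, 5) for tf in ("1m", "5m", "1h")}

# Calling the SAME module for each timeframe reuses the same parameters,
# so candle patterns are learned once and shared across all timeframes.
encoded = {tf: shared_pattern_encoder(x) for tf, x in timeframe_inputs.items()}
print({tf: tuple(t.shape) for tf, t in encoded.items()})   # each -> (8, 200, 256)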
@@ -1163,12 +1164,13 @@ class TradingTransformerTrainer:
self.model.train()
# Only zero gradients if not accumulating
# Use set_to_none=True for better memory efficiency
if not accumulate_gradients:
self.optimizer.zero_grad()
self.optimizer.zero_grad(set_to_none=True)
# Move batch to device WITHOUT cloning to avoid version tracking issues
# The detach().clone() was causing gradient computation errors
batch = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
batch = {k: v.to(self.device, non_blocking=True) if isinstance(v, torch.Tensor) else v
for k, v in batch.items()}
# Use automatic mixed precision (FP16) for memory efficiency
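Two of the changes in this hunk are small but easy to get wrong elsewhere: zero_grad(set_to_none=True) and non_blocking device transfers. A minimal, generic sketch of both idioms; the helper names are illustrative and not part of the trainer:

import torch

def prepare_batch(batch: dict, device: torch.device) -> dict:
    # non_blocking=True lets host->GPU copies overlap with computation, but it
    # only helps when the source tensors are in pinned memory (for example via
    # DataLoader(pin_memory=True)); otherwise it falls back to a normal copy.
    return {k: v.to(device, non_blocking=True) if isinstance(v, torch.Tensor) else v
            for k, v in batch.items()}

def reset_gradients(optimizer: torch.optim.Optimizer, accumulate_gradients: bool) -> None:
    # set_to_none=True frees the .grad tensors instead of writing zeros into
    # them, saving memory and one kernel launch per parameter; the gradients
    # are recreated automatically on the next backward().
    if not accumulate_gradients:
        optimizer.zero_grad(set_to_none=True)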
@@ -1189,10 +1191,26 @@ class TradingTransformerTrainer:
# Calculate losses
action_loss = self.action_criterion(outputs['action_logits'], batch['actions'])
price_loss = self.price_criterion(outputs['price_prediction'], batch['future_prices'])
# FIXED: Ensure shapes match for MSELoss
price_pred = outputs['price_prediction']
price_target = batch['future_prices']
# Both should be [batch, 1], but ensure they match
if price_pred.shape != price_target.shape:
logger.debug(f"Reshaping price target from {price_target.shape} to {price_pred.shape}")
price_target = price_target.view(price_pred.shape)
price_loss = self.price_criterion(price_pred, price_target)
# Start with base losses - avoid inplace operations on computation graph
total_loss = action_loss + 0.1 * price_loss # Weight auxiliary task
# CRITICAL FIX: Scale loss for gradient accumulation
# This prevents gradient explosion when accumulating over multiple batches
if accumulate_gradients:
# Assume accumulation over 5 steps (should match training loop)
total_loss = total_loss / 5.0
# Add confidence loss if available
if 'confidence' in outputs and 'trade_success' in batch:
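Dividing the loss by the number of accumulation steps keeps the summed gradients comparable to one update over a single large batch, because backward() adds into .grad across calls. A generic sketch of the surrounding loop, using the 5 steps assumed in the comment above; this is not the project's training loop, just the pattern:

import torch

def accumulation_sketch(model, optimizer, criterion, batches, accumulation_steps=5):
    # Gradients accumulate across backward() calls, so each loss is scaled by
    # 1/accumulation_steps and the optimizer only steps every N batches.
    optimizer.zero_grad(set_to_none=True)
    for i, (inputs, targets) in enumerate(batches):
        loss = criterion(model(inputs), targets) / accumulation_steps
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)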
@@ -1200,7 +1218,7 @@ class TradingTransformerTrainer:
confidence_pred = outputs['confidence']
trade_target = batch['trade_success'].float()
# Ensure both are 2D tensors [batch_size, 1]
# FIXED: Ensure both are 2D tensors [batch_size, 1]
# Handle different input shapes robustly
if confidence_pred.dim() == 0:
# Scalar -> [1, 1]
@@ -1208,6 +1226,9 @@ class TradingTransformerTrainer:
elif confidence_pred.dim() == 1:
# [batch_size] -> [batch_size, 1]
confidence_pred = confidence_pred.unsqueeze(-1)
elif confidence_pred.dim() == 3:
# [batch_size, seq_len, 1] -> [batch_size, 1] (take last timestep)
confidence_pred = confidence_pred[:, -1, :]
if trade_target.dim() == 0:
# Scalar -> [1, 1]
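The branches above coerce confidence_pred and trade_target to the same [batch_size, 1] shape before the confidence loss. The same normalization can be expressed as one small helper; a sketch with an illustrative name:

import torch

def to_batch_column(t: torch.Tensor) -> torch.Tensor:
    # Mirror the shape handling above: scalar -> [1, 1], [batch] -> [batch, 1],
    # [batch, seq_len, 1] -> last timestep [batch, 1]; anything else is assumed
    # to already be [batch, 1].
    if t.dim() == 0:
        return t.view(1, 1)
    if t.dim() == 1:
        return t.unsqueeze(-1)
    if t.dim() == 3:
        return t[:, -1, :]
    return t

confidence_pred = to_batch_column(torch.rand(8, 150, 1))             # -> [8, 1]
trade_target = to_batch_column(torch.randint(0, 2, (8,)).float())    # -> [8, 1]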
@@ -1292,11 +1313,31 @@ class TradingTransformerTrainer:
'learning_rate': self.scheduler.get_last_lr()[0]
}
# Delete large tensors to free memory immediately
# CRITICAL: Delete large tensors to free memory immediately
# This prevents memory accumulation across batches
del outputs, total_loss, action_loss, price_loss, predictions, accuracy
if torch.cuda.is_available():
torch.cuda.empty_cache()
return result
except torch.cuda.OutOfMemoryError as oom_error:
logger.error(f"CUDA OOM in train_step: {oom_error}")
# Aggressive cleanup on OOM
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
# Reset optimizer state to prevent corruption
self.optimizer.zero_grad(set_to_none=True)
# Return zero loss to continue training
return {
'total_loss': 0.0,
'action_loss': 0.0,
'price_loss': 0.0,
'accuracy': 0.0,
'candle_accuracy': 0.0,
'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
}
except Exception as e:
logger.error(f"Error in train_step: {e}", exc_info=True)
# Clear any partial computations
@@ -1309,6 +1350,7 @@ class TradingTransformerTrainer:
'action_loss': 0.0,
'price_loss': 0.0,
'accuracy': 0.0,
'candle_accuracy': 0.0,
'learning_rate': self.scheduler.get_last_lr()[0] if hasattr(self, 'scheduler') else 0.0
}
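The new except block turns a CUDA out-of-memory error into a zero-loss result so the training loop can keep going. A minimal sketch of that recover-and-continue pattern around an arbitrary step function; torch.cuda.OutOfMemoryError exists in recent PyTorch releases (older versions raise a plain RuntimeError), and the wrapper name is illustrative:

import logging
import torch

logger = logging.getLogger(__name__)

def safe_train_step(step_fn, batch):
    # On OOM: log, release cached blocks, wait for in-flight kernels, and hand
    # back a sentinel result so the caller simply skips this batch.
    try:
        return step_fn(batch)
    except torch.cuda.OutOfMemoryError as oom_error:
        logger.error("CUDA OOM in train step, skipping batch: %s", oom_error)
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.synchronize()
        return {"total_loss": 0.0, "skipped": True}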