LR training wip

Dobromir Popov
2025-12-08 21:52:26 +02:00
parent 1ab1c02889
commit 08ee2b6a3a
3 changed files with 61 additions and 31 deletions


@@ -2530,13 +2530,20 @@ class RealTrainingAdapter:
OPTIMIZATION: Batches are already on GPU and grouped for efficient processing.
Each mini-batch contains 5 samples for better GPU utilization.
IMPORTANT: Creates a shallow copy of batch dict to prevent in-place modifications
from affecting subsequent epochs. Tensors themselves are shared (not copied).
CRITICAL FIX: Clone tensors for each epoch to avoid autograd version conflicts.
When the same tensor is reused across multiple forward/backward passes, any
in-place operation on it bumps its autograd version counter and invalidates
the values saved for backprop.
"""
for batch in grouped_batches:
# Create shallow copy of batch dict to prevent modifications
# Tensors are shared (not cloned) for memory efficiency
batch_copy = {k: v for k, v in batch.items()}
# CRITICAL: Clone all tensors to avoid version conflicts across epochs
# This prevents "modified by an inplace operation" errors during backward pass
batch_copy = {}
for k, v in batch.items():
if isinstance(v, torch.Tensor):
# Clone tensor to create independent copy with fresh version number
batch_copy[k] = v.clone()
else:
batch_copy[k] = v
yield batch_copy
total_batches = len(grouped_batches)
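
As a standalone illustration of the cloning pattern this hunk introduces, the sketch below replays a cached batch list across epochs; the generator name and the batch keys are placeholders, not the adapter's real ones.

import torch

def iter_epoch_batches(grouped_batches):
    # Yield per-epoch copies of cached batches (illustrative sketch).
    for batch in grouped_batches:
        batch_copy = {}
        for k, v in batch.items():
            # clone() gives this epoch an independent tensor with its own
            # autograd version counter; non-tensor values pass through as-is
            batch_copy[k] = v.clone() if isinstance(v, torch.Tensor) else v
        yield batch_copy

# Hypothetical usage: the same cached list can be replayed safely every epoch,
# because in-place edits only ever touch that epoch's clones.
cached = [{'prices': torch.randn(5, 200, 5), 'labels': torch.tensor([1, 0, 2, 1, 0])}]
for epoch in range(3):
    for batch in iter_epoch_batches(cached):
        batch['prices'].mul_(1.0)  # in-place op on the clone; the cached tensor is untouched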


@@ -144,19 +144,23 @@ class DeepMultiScaleAttention(nn.Module):
batch_size, seq_len, _ = x.size()
scale_outputs = []
# Clone input to avoid inplace modification issues
x_input = x.clone()
for scale_proj in self.scale_projections:
# Apply enhanced temporal convolution for this scale
x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
x_conv = scale_proj['conv'](x_input.transpose(1, 2)).transpose(1, 2)
# Enhanced attention computation with deeper projections
Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
# Use contiguous() before view() to ensure memory layout is correct
Q = scale_proj['query'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)
K = scale_proj['key'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)
V = scale_proj['value'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)
# Transpose for attention computation
Q = Q.transpose(1, 2) # (batch, n_heads, seq_len, head_dim)
K = K.transpose(1, 2)
V = V.transpose(1, 2)
Q = Q.transpose(1, 2).contiguous() # (batch, n_heads, seq_len, head_dim)
K = K.transpose(1, 2).contiguous()
V = V.transpose(1, 2).contiguous()
# Scaled dot-product attention
scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
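
The hunk ends at the score computation; for reference, the sketch below shows the standard scaled dot-product completion that those contiguous()/view() reshapes feed into (function and argument names are illustrative, not the module's own).

import math
import torch

def scaled_dot_product_heads(q, k, v, n_heads, head_dim):
    # q/k/v: [batch, seq_len, n_heads * head_dim] projections
    batch_size, seq_len, _ = q.size()
    # contiguous() guarantees the memory layout view() needs after prior transposes
    Q = q.contiguous().view(batch_size, seq_len, n_heads, head_dim).transpose(1, 2)
    K = k.contiguous().view(batch_size, seq_len, n_heads, head_dim).transpose(1, 2)
    V = v.contiguous().view(batch_size, seq_len, n_heads, head_dim).transpose(1, 2)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(head_dim)
    weights = torch.softmax(scores, dim=-1)
    context = torch.matmul(weights, V)  # [batch, n_heads, seq_len, head_dim]
    return context.transpose(1, 2).contiguous().view(batch_size, seq_len, n_heads * head_dim)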
@@ -293,24 +297,29 @@ class TradingTransformerLayer(nn.Module):
self.regime_detector = MarketRegimeDetector(config.d_model)
def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
# CRITICAL: Clone input to avoid version conflicts during backpropagation
# This prevents "modified by an inplace operation" errors when the same
# batch is used across multiple epochs
x_residual = x.clone()
# Self-attention with residual connection
# Store residual before any operations to avoid version conflicts
if isinstance(self.attention, DeepMultiScaleAttention):
attn_output = self.attention(x, mask)
attn_output = self.attention(x_residual, mask)
else:
attn_output, _ = self.attention(x, x, x, attn_mask=mask)
attn_output, _ = self.attention(x_residual, x_residual, x_residual, attn_mask=mask)
# Create new tensor for residual to avoid inplace modification tracking
x_new = self.norm1(x + self.dropout(attn_output))
x_new = self.norm1(x_residual + self.dropout(attn_output))
# Market regime adaptation
regime_probs = None
if hasattr(self, 'regime_detector'):
x_new, regime_probs = self.regime_detector(x_new)
# Feed-forward with residual connection
ff_output = self.feed_forward(x_new)
x_out = self.norm2(x_new + self.dropout(ff_output))
# Feed-forward with residual connection - clone to avoid version conflicts
x_ff_residual = x_new.clone()
ff_output = self.feed_forward(x_ff_residual)
x_out = self.norm2(x_ff_residual + self.dropout(ff_output))
return {
'output': x_out,
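
The residual/clone pattern this layer now follows can be read in isolation from the sketch below, a generic post-norm block in which nn.MultiheadAttention and the default dimensions stand in for the project's own attention modules and config.

import torch
import torch.nn as nn

class PostNormBlockSketch(nn.Module):
    def __init__(self, d_model=256, n_heads=8, d_ff=1024, dropout=0.1):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ff = nn.Sequential(nn.Linear(d_model, d_ff), nn.GELU(), nn.Linear(d_ff, d_model))
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x_residual = x.clone()  # residual kept independent of the caller's tensor
        attn_out, _ = self.attn(x_residual, x_residual, x_residual)
        x_new = self.norm1(x_residual + self.dropout(attn_out))
        x_ff_residual = x_new.clone()  # second residual kept separate as well
        x_out = self.norm2(x_ff_residual + self.dropout(self.ff(x_ff_residual)))
        return x_out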
@@ -557,19 +566,23 @@ class AdvancedTradingTransformer(nn.Module):
timeframe_indices = []
for idx, (tf_name, tf_data) in enumerate(available_tfs):
# CRITICAL: Clone input data to avoid modifying original tensors
# This prevents version conflicts when batches are reused across epochs
tf_data_processed = tf_data.clone()
# Ensure correct sequence length
if tf_data.shape[1] != seq_len:
if tf_data.shape[1] < seq_len:
if tf_data_processed.shape[1] != seq_len:
if tf_data_processed.shape[1] < seq_len:
# Pad with last candle
padding = tf_data[:, -1:, :].expand(batch_size, seq_len - tf_data.shape[1], 5)
tf_data = torch.cat([tf_data, padding], dim=1)
padding = tf_data_processed[:, -1:, :].expand(batch_size, seq_len - tf_data_processed.shape[1], 5)
tf_data_processed = torch.cat([tf_data_processed, padding], dim=1)
else:
# Truncate to seq_len
tf_data = tf_data[:, :seq_len, :]
tf_data_processed = tf_data_processed[:, :seq_len, :]
# Apply SHARED pattern encoder (learns patterns once for all timeframes)
# Shape: [batch, seq_len, 5] -> [batch, seq_len, d_model]
tf_encoded = self.shared_pattern_encoder(tf_data)
tf_encoded = self.shared_pattern_encoder(tf_data_processed)
# Add timeframe-specific embedding (helps model know which timeframe)
# Get timeframe index
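
In isolation, the pad/truncate step above amounts to the following sketch (the helper name and the fixed 5-feature OHLCV width are assumptions for illustration).

import torch

def fit_to_seq_len(tf_data: torch.Tensor, seq_len: int) -> torch.Tensor:
    # tf_data: [batch, time, 5] OHLCV candles for one timeframe
    tf_data = tf_data.clone()  # never modify the caller's tensor
    batch_size, cur_len, n_features = tf_data.shape
    if cur_len < seq_len:
        # pad by repeating the last candle
        padding = tf_data[:, -1:, :].expand(batch_size, seq_len - cur_len, n_features)
        tf_data = torch.cat([tf_data, padding], dim=1)
    elif cur_len > seq_len:
        # keep only the first seq_len candles
        tf_data = tf_data[:, :seq_len, :]
    return tf_data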
@@ -1321,7 +1334,7 @@ class TradingTransformerTrainer:
# Enable anomaly detection temporarily to debug inplace operation issues
# NOTE: This significantly slows down training (2-3x slower), use only for debugging
# Set to True to find the exact inplace operation causing errors
enable_anomaly_detection = False # DISABLED - inplace operations fixed
enable_anomaly_detection = False # DISABLED - inplace operation issues fixed
if enable_anomaly_detection:
torch.autograd.set_detect_anomaly(True)
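
If the flag ever needs to come back on, anomaly detection can also be scoped to just the backward pass under investigation rather than enabled globally; a minimal sketch, assuming loss is whatever the training step produced.

import torch

def debug_backward(loss: torch.Tensor) -> None:
    # detect_anomaly() pinpoints the op that produced a NaN or an in-place
    # version error, at roughly 2-3x the normal backward cost
    with torch.autograd.detect_anomaly():
        loss.backward()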
@@ -1374,9 +1387,15 @@ class TradingTransformerTrainer:
else:
batch_on_device[k] = v
else:
# Batch is already on GPU, but still create a copy of the dict
# to avoid modifying the original batch dict
# CRITICAL FIX: Batch is already on GPU, but we must clone tensors
# to avoid version conflicts when the same batch is reused across epochs.
# Without cloning, any in-place operation on a shared tensor bumps its
# autograd version counter, breaking backpropagation.
for k, v in batch.items():
if isinstance(v, torch.Tensor):
# Clone tensor to create independent copy with fresh version number
batch_on_device[k] = v.clone()
else:
batch_on_device[k] = v
# Ensure all batch tensors are on the same device as the model
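
Taken together, the two branches of this device-handling block behave roughly like the sketch below (the helper name is hypothetical and the device comparison is simplified).

import torch

def prepare_batch_for_device(batch: dict, device: torch.device) -> dict:
    batch_on_device = {}
    for k, v in batch.items():
        if isinstance(v, torch.Tensor):
            # already-resident tensors are cloned so each epoch gets an
            # independent copy; everything else is moved to the target device
            batch_on_device[k] = v.clone() if v.device == device else v.to(device)
        else:
            batch_on_device[k] = v
    return batch_on_device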


@@ -613,7 +613,11 @@ class TradingOrchestrator:
# CRITICAL: Initialize checkpoint manager for saving training progress
self.checkpoint_manager = None
self.training_iterations = 0 # Track training iterations for periodic saves
try:
self._initialize_checkpoint_manager()
except Exception as e:
logger.error(f"Failed to initialize checkpoint manager in __init__: {e}")
self.checkpoint_manager = None
# Initialize models, COB integration, and training system
self._initialize_ml_models()
@@ -828,7 +832,7 @@ class TradingOrchestrator:
# Try to load best checkpoint
checkpoint_loaded = False
try:
if self.checkpoint_manager:
if hasattr(self, 'checkpoint_manager') and self.checkpoint_manager:
checkpoint_path, checkpoint_metadata = self.checkpoint_manager.load_best_checkpoint("transformer")
if checkpoint_path and checkpoint_metadata:
# Load the checkpoint
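
Both orchestrator hunks apply the same defensive pattern, sketched here with placeholder names rather than the orchestrator's real API: initialization failures leave checkpoint_manager defined as None, and every later use guards on the attribute before touching it.

import logging

logger = logging.getLogger(__name__)

class CheckpointGuardSketch:
    def __init__(self, manager_factory):
        # a failed initialization must still leave the attribute defined (as None)
        self.checkpoint_manager = None
        try:
            self.checkpoint_manager = manager_factory()
        except Exception as e:
            logger.error(f"Failed to initialize checkpoint manager: {e}")
            self.checkpoint_manager = None

    def load_best(self, model_name: str):
        # guard both the attribute's existence and its value before use
        if hasattr(self, 'checkpoint_manager') and self.checkpoint_manager:
            return self.checkpoint_manager.load_best_checkpoint(model_name)
        return None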