From 08ee2b6a3acede7928fa59d1f94cbe9886e0b66f Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Mon, 8 Dec 2025 21:52:26 +0200
Subject: [PATCH] LR training wip

---
 ANNOTATE/core/real_training_adapter.py    | 17 ++++--
 NN/models/advanced_transformer_trading.py | 67 +++++++++++++++--------
 core/orchestrator.py                      |  8 ++-
 3 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py
index 9aa038a..783b119 100644
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -2530,13 +2530,20 @@ class RealTrainingAdapter:
         OPTIMIZATION: Batches are already on GPU and grouped for efficient processing.
         Each mini-batch contains 5 samples for better GPU utilization.

-        IMPORTANT: Creates a shallow copy of batch dict to prevent in-place modifications
-        from affecting subsequent epochs. Tensors themselves are shared (not copied).
+        CRITICAL FIX: Clone tensors for each epoch to avoid autograd version conflicts.
+        When the same tensor is used across multiple forward passes, operations like
+        .contiguous() and .view() modify the tensor's version number, breaking backprop.
         """
         for batch in grouped_batches:
-            # Create shallow copy of batch dict to prevent modifications
-            # Tensors are shared (not cloned) for memory efficiency
-            batch_copy = {k: v for k, v in batch.items()}
+            # CRITICAL: Clone all tensors to avoid version conflicts across epochs
+            # This prevents "modified by an inplace operation" errors during backward pass
+            batch_copy = {}
+            for k, v in batch.items():
+                if isinstance(v, torch.Tensor):
+                    # Clone tensor to create independent copy with fresh version number
+                    batch_copy[k] = v.clone()
+                else:
+                    batch_copy[k] = v
             yield batch_copy

         total_batches = len(grouped_batches)
diff --git a/NN/models/advanced_transformer_trading.py b/NN/models/advanced_transformer_trading.py
index 61fdff5..d09f8c5 100644
--- a/NN/models/advanced_transformer_trading.py
+++ b/NN/models/advanced_transformer_trading.py
@@ -144,19 +144,23 @@ class DeepMultiScaleAttention(nn.Module):
         batch_size, seq_len, _ = x.size()
         scale_outputs = []

+        # Clone input to avoid inplace modification issues
+        x_input = x.clone()
+
         for scale_proj in self.scale_projections:
             # Apply enhanced temporal convolution for this scale
-            x_conv = scale_proj['conv'](x.transpose(1, 2)).transpose(1, 2)
+            x_conv = scale_proj['conv'](x_input.transpose(1, 2)).transpose(1, 2)

             # Enhanced attention computation with deeper projections
-            Q = scale_proj['query'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
-            K = scale_proj['key'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
-            V = scale_proj['value'](x_conv).view(batch_size, seq_len, self.n_heads, self.head_dim)
+            # Use contiguous() before view() to ensure memory layout is correct
+            Q = scale_proj['query'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)
+            K = scale_proj['key'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)
+            V = scale_proj['value'](x_conv).contiguous().view(batch_size, seq_len, self.n_heads, self.head_dim)

             # Transpose for attention computation
-            Q = Q.transpose(1, 2)  # (batch, n_heads, seq_len, head_dim)
-            K = K.transpose(1, 2)
-            V = V.transpose(1, 2)
+            Q = Q.transpose(1, 2).contiguous()  # (batch, n_heads, seq_len, head_dim)
+            K = K.transpose(1, 2).contiguous()
+            V = V.transpose(1, 2).contiguous()

             # Scaled dot-product attention
             scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
@@ -293,24 +297,29 @@ class TradingTransformerLayer(nn.Module):
             self.regime_detector = MarketRegimeDetector(config.d_model)

     def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        # CRITICAL: Clone input to avoid version conflicts during backpropagation
+        # This prevents "modified by an inplace operation" errors when the same
+        # batch is used across multiple epochs
+        x_residual = x.clone()
+
         # Self-attention with residual connection
-        # Store residual before any operations to avoid version conflicts
         if isinstance(self.attention, DeepMultiScaleAttention):
-            attn_output = self.attention(x, mask)
+            attn_output = self.attention(x_residual, mask)
         else:
-            attn_output, _ = self.attention(x, x, x, attn_mask=mask)
+            attn_output, _ = self.attention(x_residual, x_residual, x_residual, attn_mask=mask)

         # Create new tensor for residual to avoid inplace modification tracking
-        x_new = self.norm1(x + self.dropout(attn_output))
+        x_new = self.norm1(x_residual + self.dropout(attn_output))

         # Market regime adaptation
         regime_probs = None
         if hasattr(self, 'regime_detector'):
             x_new, regime_probs = self.regime_detector(x_new)

-        # Feed-forward with residual connection
-        ff_output = self.feed_forward(x_new)
-        x_out = self.norm2(x_new + self.dropout(ff_output))
+        # Feed-forward with residual connection - clone to avoid version conflicts
+        x_ff_residual = x_new.clone()
+        ff_output = self.feed_forward(x_ff_residual)
+        x_out = self.norm2(x_ff_residual + self.dropout(ff_output))

         return {
             'output': x_out,
@@ -557,19 +566,23 @@ class AdvancedTradingTransformer(nn.Module):
         timeframe_indices = []

         for idx, (tf_name, tf_data) in enumerate(available_tfs):
+            # CRITICAL: Clone input data to avoid modifying original tensors
+            # This prevents version conflicts when batches are reused across epochs
+            tf_data_processed = tf_data.clone()
+
             # Ensure correct sequence length
-            if tf_data.shape[1] != seq_len:
-                if tf_data.shape[1] < seq_len:
+            if tf_data_processed.shape[1] != seq_len:
+                if tf_data_processed.shape[1] < seq_len:
                     # Pad with last candle
-                    padding = tf_data[:, -1:, :].expand(batch_size, seq_len - tf_data.shape[1], 5)
-                    tf_data = torch.cat([tf_data, padding], dim=1)
+                    padding = tf_data_processed[:, -1:, :].expand(batch_size, seq_len - tf_data_processed.shape[1], 5)
+                    tf_data_processed = torch.cat([tf_data_processed, padding], dim=1)
                 else:
                     # Truncate to seq_len
-                    tf_data = tf_data[:, :seq_len, :]
+                    tf_data_processed = tf_data_processed[:, :seq_len, :]

             # Apply SHARED pattern encoder (learns patterns once for all timeframes)
             # Shape: [batch, seq_len, 5] -> [batch, seq_len, d_model]
-            tf_encoded = self.shared_pattern_encoder(tf_data)
+            tf_encoded = self.shared_pattern_encoder(tf_data_processed)

             # Add timeframe-specific embedding (helps model know which timeframe)
             # Get timeframe index
@@ -1321,7 +1334,7 @@ class TradingTransformerTrainer:
         # Enable anomaly detection temporarily to debug inplace operation issues
         # NOTE: This significantly slows down training (2-3x slower), use only for debugging
         # Set to True to find exact inplace operation causing errors
-        enable_anomaly_detection = False  # DISABLED - inplace operations fixed
+        enable_anomaly_detection = False  # DISABLED - inplace operation issues fixed
         if enable_anomaly_detection:
             torch.autograd.set_detect_anomaly(True)

@@ -1374,10 +1387,16 @@ class TradingTransformerTrainer:
                     else:
                         batch_on_device[k] = v
             else:
-                # Batch is already on GPU, but still create a copy of the dict
-                # to avoid modifying the original batch dict
+                # CRITICAL FIX: Batch is already on GPU, but we must clone tensors
+                # to avoid version conflicts when the same batch is reused across epochs.
+                # Without cloning, operations like .contiguous() and .view() modify
+                # the tensor's version number, breaking backpropagation.
                 for k, v in batch.items():
-                    batch_on_device[k] = v
+                    if isinstance(v, torch.Tensor):
+                        # Clone tensor to create independent copy with fresh version number
+                        batch_on_device[k] = v.clone()
+                    else:
+                        batch_on_device[k] = v

             # Ensure all batch tensors are on the same device as the model
             # This is critical to avoid device mismatch errors
diff --git a/core/orchestrator.py b/core/orchestrator.py
index 0409406..d75184b 100644
--- a/core/orchestrator.py
+++ b/core/orchestrator.py
@@ -613,7 +613,11 @@ class TradingOrchestrator:
         # CRITICAL: Initialize checkpoint manager for saving training progress
         self.checkpoint_manager = None
         self.training_iterations = 0  # Track training iterations for periodic saves
-        self._initialize_checkpoint_manager()
+        try:
+            self._initialize_checkpoint_manager()
+        except Exception as e:
+            logger.error(f"Failed to initialize checkpoint manager in __init__: {e}")
+            self.checkpoint_manager = None

         # Initialize models, COB integration, and training system
         self._initialize_ml_models()
@@ -828,7 +832,7 @@ class TradingOrchestrator:
         # Try to load best checkpoint
         checkpoint_loaded = False
         try:
-            if self.checkpoint_manager:
+            if hasattr(self, 'checkpoint_manager') and self.checkpoint_manager:
                 checkpoint_path, checkpoint_metadata = self.checkpoint_manager.load_best_checkpoint("transformer")
                 if checkpoint_path and checkpoint_metadata:
                     # Load the checkpoint
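
Note on the pattern applied above: every hunk in this patch uses the same clone-before-reuse idea, so that batches cached on the GPU and fed through repeated forward/backward passes never share storage with the originals. Below is a minimal, self-contained sketch of that pattern outside the patch; `grouped_batches`, `loss_fn`, and the batch keys `inputs`/`targets` are illustrative placeholders, not the repository's actual names.

    import torch

    def batch_generator(grouped_batches):
        """Yield per-epoch copies of pre-built batches so tensors cached on
        the GPU are never shared between epochs (sketch of the patch's idea)."""
        for batch in grouped_batches:
            yield {
                # clone tensors; pass non-tensor values through unchanged
                k: (v.clone() if isinstance(v, torch.Tensor) else v)
                for k, v in batch.items()
            }

    def train_epochs(model, optimizer, loss_fn, grouped_batches, epochs=3):
        model.train()
        for _ in range(epochs):
            for batch in batch_generator(grouped_batches):  # fresh tensors each epoch
                optimizer.zero_grad()
                output = model(batch["inputs"])            # hypothetical batch key
                loss = loss_fn(output, batch["targets"])   # hypothetical batch key
                loss.backward()
                optimizer.step()

The trade-off is one extra copy of each batch tensor per epoch; since .clone() preserves device and dtype, the copies stay resident on the GPU alongside the cached originals.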