LR training wip

Dobromir Popov
2025-12-08 21:52:26 +02:00
parent 1ab1c02889
commit 08ee2b6a3a
3 changed files with 61 additions and 31 deletions


@@ -2530,13 +2530,20 @@ class RealTrainingAdapter:
             OPTIMIZATION: Batches are already on GPU and grouped for efficient processing.
             Each mini-batch contains 5 samples for better GPU utilization.

-            IMPORTANT: Creates a shallow copy of batch dict to prevent in-place modifications
-            from affecting subsequent epochs. Tensors themselves are shared (not copied).
+            CRITICAL FIX: Clone tensors for each epoch to avoid autograd version conflicts.
+            When the same tensor is used across multiple forward passes, operations like
+            .contiguous() and .view() modify the tensor's version number, breaking backprop.
             """
             for batch in grouped_batches:
-                # Create shallow copy of batch dict to prevent modifications
-                # Tensors are shared (not cloned) for memory efficiency
-                batch_copy = {k: v for k, v in batch.items()}
+                # CRITICAL: Clone all tensors to avoid version conflicts across epochs
+                # This prevents "modified by an inplace operation" errors during backward pass
+                batch_copy = {}
+                for k, v in batch.items():
+                    if isinstance(v, torch.Tensor):
+                        # Clone tensor to create independent copy with fresh version number
+                        batch_copy[k] = v.clone()
+                    else:
+                        batch_copy[k] = v
                 yield batch_copy

         total_batches = len(grouped_batches)
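
For context, a minimal standalone sketch of the per-epoch cloning pattern this hunk introduces. The names here (`_cloned_batches`, `grouped_batches`, the dummy model and keys) are hypothetical stand-ins, not the adapter's real API; only the clone-per-epoch idea is taken from the diff.

```python
import torch

def _cloned_batches(grouped_batches):
    """Yield per-epoch copies so one epoch cannot mutate tensors seen by the next."""
    for batch in grouped_batches:
        yield {
            k: v.clone() if isinstance(v, torch.Tensor) else v
            for k, v in batch.items()
        }

# Hypothetical setup: a tiny model and pre-grouped batches kept in memory.
model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
grouped_batches = [
    {"features": torch.randn(5, 8), "target": torch.randn(5, 1)}
    for _ in range(4)
]

for epoch in range(3):
    # Re-cloning each epoch gives every forward/backward pass fresh tensors,
    # so in-place writes or reshapes on batch tensors in the training step
    # cannot trigger "modified by an inplace operation" autograd errors later.
    for batch in _cloned_batches(grouped_batches):
        optimizer.zero_grad()
        loss = torch.nn.functional.mse_loss(model(batch["features"]), batch["target"])
        loss.backward()
        optimizer.step()
```

The trade-off is the one the removed comment pointed at: cloning costs extra GPU memory per batch, but it isolates each epoch's autograd graph from the cached tensors.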