wip training

Dobromir Popov
2025-11-17 13:28:36 +02:00
parent 43a7d75daf
commit 37e90a1c3c
3 changed files with 381 additions and 33 deletions


@@ -1789,17 +1789,47 @@ class RealTrainingAdapter:
             import torch

-            # MEMORY FIX: Pre-convert batches ONCE and cache them
-            # This avoids recreating batches every epoch (major leak!)
-            logger.info(" Pre-converting batches (one-time operation)...")
+            # OPTIMIZATION: Pre-convert batches ONCE and move to GPU immediately
+            # This avoids CPU→GPU transfer bottleneck during training
+            logger.info(" Pre-converting batches and moving to GPU (one-time operation)...")
+
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            use_gpu = torch.cuda.is_available()
+
+            if use_gpu:
+                logger.info(f" GPU: {torch.cuda.get_device_name(0)}")
+                logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

             cached_batches = []
             for i, data in enumerate(training_data):
                 batch = self._convert_annotation_to_transformer_batch(data)
                 if batch is not None:
-                    cached_batches.append(batch)
+                    # OPTIMIZATION: Move batch to GPU immediately with pinned memory
+                    if use_gpu:
+                        batch_gpu = {}
+                        for k, v in batch.items():
+                            if isinstance(v, torch.Tensor):
+                                # Use pin_memory() for faster CPU→GPU transfer
+                                # Then move to GPU with non_blocking=True
+                                batch_gpu[k] = v.pin_memory().to(device, non_blocking=True)
+                            else:
+                                batch_gpu[k] = v
+                        cached_batches.append(batch_gpu)
+                        del batch  # Free CPU memory immediately
+                    else:
+                        cached_batches.append(batch)
+
+                    # Show progress every 10 batches
+                    if (i + 1) % 10 == 0 or i == 0:
+                        logger.info(f" Processed {i + 1}/{len(training_data)} batches...")
                 else:
                     logger.warning(f" Failed to convert sample {i+1}")
+
+            # Synchronize GPU operations
+            if use_gpu:
+                torch.cuda.synchronize()
+                logger.info(f" All {len(cached_batches)} batches now on GPU")

             # Clear training_data to free memory
             training_data.clear()
             del training_data
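
For readers unfamiliar with the pattern used in this hunk, here is a minimal, self-contained sketch of an up-front pinned-memory transfer of a batch dict to the GPU. The function name move_batch_to_gpu and the toy tensors are illustrative only and are not part of the repository:

    import torch

    def move_batch_to_gpu(batch: dict, device: torch.device) -> dict:
        """Copy every tensor in a batch dict to the GPU once, via pinned host memory."""
        gpu_batch = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                # pin_memory() returns a page-locked CPU copy, which lets the
                # following .to(..., non_blocking=True) copy run asynchronously.
                gpu_batch[key] = value.pin_memory().to(device, non_blocking=True)
            else:
                gpu_batch[key] = value  # leave non-tensor entries untouched
        return gpu_batch

    if torch.cuda.is_available():
        device = torch.device('cuda')
        # Toy batch standing in for a real converted annotation batch.
        batch = {'price_data': torch.randn(1, 600, 5), 'actions': torch.tensor([1])}
        gpu_batch = move_batch_to_gpu(batch, device)
        torch.cuda.synchronize()  # wait for the asynchronous copies to finish

Pinning the host memory is what makes the non_blocking copy actually asynchronous; a single synchronize after the loop (as the commit does) is enough to guarantee all batches have arrived before training starts.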
@@ -1809,25 +1839,16 @@ class RealTrainingAdapter:
             def batch_generator():
                 """
-                Yield pre-converted batches with proper memory management
+                Yield pre-converted batches (already on GPU)

-                CRITICAL: Each batch must be cloned and detached to prevent:
-                1. GPU memory accumulation across epochs
-                2. Computation graph retention
-                3. Version tracking issues
+                OPTIMIZATION: Batches are already on GPU and detached.
+                No cloning needed - just yield directly for maximum performance.
+                Each batch is independent (no gradient accumulation across batches).
                 """
                 for batch in cached_batches:
-                    # Clone and detach each tensor in the batch
-                    # This creates a fresh copy without gradient history
-                    cloned_batch = {}
-                    for key, value in batch.items():
-                        if isinstance(value, torch.Tensor):
-                            # detach() removes from computation graph
-                            # clone() creates new memory (prevents aliasing)
-                            cloned_batch[key] = value.detach().clone()
-                        else:
-                            cloned_batch[key] = value
-                    yield cloned_batch
+                    # Simply yield the batch - no cloning needed!
+                    # Batches are already on GPU and properly detached
+                    yield batch

             total_batches = len(cached_batches)
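
A minimal sketch of why the clone/detach step can be dropped here, assuming (as the new docstring states) that the cached tensors carry no autograd history and are never modified in place; the names below are illustrative, not taken from the repository:

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # A cached batch: plain leaf tensors with no computation graph attached,
    # so yielding the same objects every epoch retains nothing extra.
    cached_batches = [{'features': torch.randn(1, 8, device=device)}]

    def batch_generator():
        for batch in cached_batches:
            yield batch  # no detach()/clone() needed for graph-free, read-only tensors

    # By contrast, a tensor produced by a forward pass keeps its graph alive;
    # detach() (plus clone() if it might be mutated) is what breaks that link.
    attached = torch.randn(1, 8, device=device, requires_grad=True) * 2.0
    safe_copy = attached.detach().clone()
    assert not safe_copy.requires_grad

If any later code were to write into the yielded tensors in place, the per-batch clone would become necessary again.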
@@ -1860,6 +1881,12 @@ class RealTrainingAdapter:
                 epoch_accuracy = 0.0
                 num_batches = 0

+                # Log GPU status at start of epoch
+                if use_gpu:
+                    mem_allocated = torch.cuda.memory_allocated(device) / 1024**3
+                    mem_reserved = torch.cuda.memory_reserved(device) / 1024**3
+                    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+
                 # MEMORY FIX: Aggressive cleanup before epoch
                 gc.collect()
                 if torch.cuda.is_available():