new backtesting feature

2025-11-17 19:13:30 +02:00
parent 37e90a1c3c
commit ebb062bdae
5 changed files with 1106 additions and 36 deletions
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -1789,35 +1789,25 @@ class RealTrainingAdapter:
            
            import torch
            
-            # OPTIMIZATION: Pre-convert batches ONCE and move to GPU immediately
-            # This avoids CPU→GPU transfer bottleneck during training
-            logger.info("   Pre-converting batches and moving to GPU (one-time operation)...")
+            # OPTIMIZATION: Pre-convert batches ONCE
+            # NOTE: Batches are stored on CPU to avoid ROCm/HIP kernel issues;
+            # the trainer moves them to the GPU for the forward/backward passes.
+            logger.info("   Pre-converting batches (one-time operation)...")
            
-            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            device = torch.device('cpu')  # Store batches on CPU
            use_gpu = torch.cuda.is_available()
            
            if use_gpu:
-                logger.info(f"    GPU: {torch.cuda.get_device_name(0)}")
+                logger.info(f"    GPU available: {torch.cuda.get_device_name(0)}")
                logger.info(f"    GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
+                logger.info(f"    Batches will be stored on CPU, moved to GPU during training")
            
            cached_batches = []
            for i, data in enumerate(training_data):
                batch = self._convert_annotation_to_transformer_batch(data)
                if batch is not None:
-                    # OPTIMIZATION: Move batch to GPU immediately with pinned memory
-                    if use_gpu:
-                        batch_gpu = {}
-                        for k, v in batch.items():
-                            if isinstance(v, torch.Tensor):
-                                # Use pin_memory() for faster CPU→GPU transfer
-                                # Then move to GPU with non_blocking=True
-                                batch_gpu[k] = v.pin_memory().to(device, non_blocking=True)
-                            else:
-                                batch_gpu[k] = v
-                        cached_batches.append(batch_gpu)
-                        del batch  # Free CPU memory immediately
-                    else:
-                        cached_batches.append(batch)
+                    # Store batches on CPU (trainer will move to GPU)
+                    cached_batches.append(batch)
                    
                    # Show progress every 10 batches
                    if (i + 1) % 10 == 0 or i == 0:
@@ -1825,11 +1815,6 @@ class RealTrainingAdapter:
                else:
                    logger.warning(f"   Failed to convert sample {i+1}")
            
-            # Synchronize GPU operations
-            if use_gpu:
-                torch.cuda.synchronize()
-                logger.info(f"    All {len(cached_batches)} batches now on GPU")
-            
            # Clear training_data to free memory
            training_data.clear()
            del training_data