wip training

Author: Dobromir Popov
Date: 2025-11-17 13:28:36 +02:00
Parent: 43a7d75daf
Commit: 37e90a1c3c
3 changed files with 381 additions and 33 deletions

@@ -1229,21 +1229,30 @@ class TradingTransformerTrainer:
         if not is_accumulation_step or self.current_accumulation_step == 1:
             self.optimizer.zero_grad(set_to_none=True)

-        # Move batch to device and DELETE original CPU tensors to prevent memory leak
-        # CRITICAL: Store original keys to delete CPU tensors after moving to GPU
-        batch_gpu = {}
-        for k, v in batch.items():
+        # OPTIMIZATION: Only move batch to device if not already there
+        # Check if first tensor is already on correct device
+        needs_transfer = False
+        for v in batch.values():
             if isinstance(v, torch.Tensor):
-                # Move to device (creates GPU copy)
-                batch_gpu[k] = v.to(self.device, non_blocking=True)
-                # Delete CPU tensor immediately to free memory
-                del batch[k]
-            else:
-                batch_gpu[k] = v
+                needs_transfer = (v.device != self.device)
+                break

-        # Replace batch with GPU version
-        batch = batch_gpu
-        del batch_gpu
+        if needs_transfer:
+            # Move batch to device and DELETE original CPU tensors to prevent memory leak
+            batch_gpu = {}
+            for k, v in batch.items():
+                if isinstance(v, torch.Tensor):
+                    # Move to device (creates GPU copy)
+                    batch_gpu[k] = v.to(self.device, non_blocking=True)
+                    # Delete CPU tensor immediately to free memory
+                    del batch[k]
+                else:
+                    batch_gpu[k] = v
+            # Replace batch with GPU version
+            batch = batch_gpu
+            del batch_gpu
+        # else: batch is already on GPU, use it directly!

         # Use automatic mixed precision (FP16) for memory efficiency
         # Support both CUDA and ROCm (AMD) devices
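
The change above boils down to one pattern: probe the first tensor in the batch and skip the host-to-device copy when everything already lives on the target device. The standalone sketch below illustrates that pattern; it is not the commit's actual code, `move_batch_to_device` is a hypothetical helper name, and it iterates over `list(batch.items())` so the source dict can be emptied safely inside the loop (deleting keys during a plain `.items()` iteration raises a RuntimeError).

import torch

def move_batch_to_device(batch: dict, device: torch.device) -> dict:
    """Return `batch` with all tensors on `device`, copying only when needed."""
    # Peek at the first tensor to decide whether any transfer is required.
    needs_transfer = False
    for v in batch.values():
        if isinstance(v, torch.Tensor):
            needs_transfer = (v.device != device)
            break

    if not needs_transfer:
        # Already on the target device: reuse the batch as-is.
        return batch

    batch_gpu = {}
    # Iterate over a snapshot so keys can be deleted from `batch` inside the loop.
    for k, v in list(batch.items()):
        if isinstance(v, torch.Tensor):
            # non_blocking=True lets the copy overlap with compute when the
            # source tensor lives in pinned (page-locked) memory.
            batch_gpu[k] = v.to(device, non_blocking=True)
            del batch[k]  # drop the CPU reference so it can be freed early
        else:
            batch_gpu[k] = v
    return batch_gpu

# Example usage: the second call is a no-op because the tensors already sit on `device`.
# Note the explicit index: torch.device('cuda') != torch.device('cuda:0'), so an
# index-less device would defeat the equality check after the first transfer.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
batch = {'inputs': torch.randn(4, 150, 5), 'labels': torch.tensor([0, 1, 1, 0])}
batch = move_batch_to_device(batch, device)
batch = move_batch_to_device(batch, device)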