Dobromir Popov
2025-12-10 00:45:41 +02:00
parent c21d8cbea1
commit fadfa8c741
5 changed files with 256 additions and 117 deletions


@@ -163,8 +163,12 @@ class RealTrainingAdapter:
# CRITICAL: Training lock to prevent concurrent model access
# Multiple threads (batch training + per-candle training) can corrupt
# the computation graph if they access the model simultaneously
# Use RLock (reentrant lock) so the same thread can acquire the lock multiple times
import threading
self._training_lock = threading.Lock()
self._training_lock = threading.RLock()
# Track which thread currently holds the training lock (for debugging)
self._training_lock_holder = None
# Use orchestrator's inference training coordinator (if available)
# This reduces duplication and centralizes coordination logic
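A minimal standalone sketch (not part of this commit) of why the switch from threading.Lock to threading.RLock matters here: with a plain Lock, a thread that already holds the lock blocks forever if it re-enters a locked code path, while an RLock lets the same thread acquire it again safely.

import threading

rlock = threading.RLock()

def inner():
    with rlock:  # same thread acquires again; a plain Lock would deadlock here
        print("re-entered safely")

def outer():
    with rlock:  # first acquisition by this thread
        inner()

outer()  # prints "re-entered safely"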
@@ -4142,7 +4146,16 @@ class RealTrainingAdapter:
# CRITICAL: Acquire training lock to prevent concurrent model access
# This prevents "inplace operation" errors when batch training runs simultaneously
import torch
with self._training_lock:
import threading
# Try to acquire the lock with a timeout to prevent deadlock
lock_acquired = self._training_lock.acquire(timeout=5.0)
if not lock_acquired:
logger.warning("Could not acquire training lock within 5 seconds - skipping this training step")
return
try:
self._training_lock_holder = threading.current_thread().name
with torch.enable_grad():
trainer.model.train()
result = trainer.train_step(batch, accumulate_gradients=False)
@@ -4193,6 +4206,10 @@ class RealTrainingAdapter:
improved=improved
)
self.realtime_training_metrics['last_checkpoint_step'] = self.realtime_training_metrics['total_steps']
finally:
# CRITICAL: Always release the lock, even if an exception occurs
self._training_lock_holder = None
self._training_lock.release()
except Exception as e:
logger.warning(f"Error training transformer on sample: {e}")