Dobromir Popov
2025-12-10 00:45:41 +02:00
parent c21d8cbea1
commit fadfa8c741
5 changed files with 256 additions and 117 deletions


@@ -163,8 +163,12 @@ class RealTrainingAdapter:
# CRITICAL: Training lock to prevent concurrent model access
# Multiple threads (batch training + per-candle training) can corrupt
# the computation graph if they access the model simultaneously
# Use RLock (reentrant lock) so the same thread can acquire the lock multiple times
import threading
self._training_lock = threading.Lock()
self._training_lock = threading.RLock()
# Track which thread currently holds the training lock (for debugging)
self._training_lock_holder = None
# Use orchestrator's inference training coordinator (if available)
# This reduces duplication and centralizes coordination logic
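A minimal standalone sketch (not part of this commit) of why the switch from threading.Lock to threading.RLock matters here: with a plain Lock, a thread that already holds the lock blocks forever if it re-enters a locked code path, while an RLock lets the same thread acquire it again safely.

import threading

rlock = threading.RLock()

def inner():
    with rlock:  # same thread acquires again; a plain Lock would deadlock here
        print("re-entered safely")

def outer():
    with rlock:  # first acquisition by this thread
        inner()

outer()  # prints "re-entered safely"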
@@ -4142,7 +4146,16 @@ class RealTrainingAdapter:
# CRITICAL: Acquire training lock to prevent concurrent model access
# This prevents "inplace operation" errors when batch training runs simultaneously
import torch
with self._training_lock:
import threading
# Try to acquire the lock with a timeout to prevent deadlock
lock_acquired = self._training_lock.acquire(timeout=5.0)
if not lock_acquired:
logger.warning("Could not acquire training lock within 5 seconds - skipping this training step")
return
try:
self._training_lock_holder = threading.current_thread().name
with torch.enable_grad():
trainer.model.train()
result = trainer.train_step(batch, accumulate_gradients=False)
@@ -4193,6 +4206,10 @@ class RealTrainingAdapter:
improved=improved
)
self.realtime_training_metrics['last_checkpoint_step'] = self.realtime_training_metrics['total_steps']
finally:
# CRITICAL: Always release the lock, even if an exception occurs
self._training_lock_holder = None
self._training_lock.release()
except Exception as e:
logger.warning(f"Error training transformer on sample: {e}")