wip
@@ -163,8 +163,12 @@ class RealTrainingAdapter:
         # CRITICAL: Training lock to prevent concurrent model access
         # Multiple threads (batch training + per-candle training) can corrupt
         # the computation graph if they access the model simultaneously
+        # Use RLock (reentrant lock) to allow same thread to acquire multiple times
         import threading
-        self._training_lock = threading.Lock()
+        self._training_lock = threading.RLock()
 
+        # Track which thread currently holds the training lock (for debugging)
+        self._training_lock_holder = None
+
         # Use orchestrator's inference training coordinator (if available)
         # This reduces duplication and centralizes coordination logic
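
Note on the hunk above: a plain threading.Lock cannot be re-acquired by the thread that already holds it, so any re-entrant call path in the training code (one locked helper calling another) would deadlock; an RLock lets the owning thread nest acquisitions. A minimal standalone sketch of that difference (the helper names are illustrative, not from the repository):

    # Sketch only: shows why RLock tolerates nested acquisition by one thread,
    # where a plain threading.Lock would block forever on the second acquire.
    import threading

    rlock = threading.RLock()

    def outer_step():
        with rlock:        # first acquisition by this thread
            inner_step()   # nested call also takes the lock

    def inner_step():
        with rlock:        # re-acquired by the same thread: fine with RLock,
            pass           # a plain Lock would deadlock here

    outer_step()
    print("nested acquisition succeeded")
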
@@ -4142,7 +4146,16 @@ class RealTrainingAdapter:
             # CRITICAL: Acquire training lock to prevent concurrent model access
             # This prevents "inplace operation" errors when batch training runs simultaneously
             import torch
-            with self._training_lock:
+            import threading
+
+            # Try to acquire lock with timeout to prevent deadlock
+            lock_acquired = self._training_lock.acquire(timeout=5.0)
+            if not lock_acquired:
+                logger.warning("Could not acquire training lock within 5 seconds - skipping this training step")
+                return
+
+            try:
+                self._training_lock_holder = threading.current_thread().name
                 with torch.enable_grad():
                     trainer.model.train()
                     result = trainer.train_step(batch, accumulate_gradients=False)
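
Note on the hunk above: replacing the bare `with self._training_lock:` with `acquire(timeout=5.0)` means a wedged batch-training thread can only delay, never permanently block, the per-candle path, and the try/finally (completed in the next hunk) guarantees release. A condensed, self-contained sketch of the same pattern (`guarded_training_step` and `step_fn` are hypothetical names, not the adapter's API):

    # Sketch of the acquire-with-timeout pattern: return instead of blocking
    # forever, and always release in finally even if the step raises.
    import logging
    import threading

    logger = logging.getLogger(__name__)
    _training_lock = threading.RLock()
    _training_lock_holder = None

    def guarded_training_step(step_fn):
        """Run step_fn under the training lock, skipping the step on timeout."""
        global _training_lock_holder
        if not _training_lock.acquire(timeout=5.0):
            logger.warning("Could not acquire training lock within 5 seconds - skipping")
            return None
        try:
            _training_lock_holder = threading.current_thread().name
            return step_fn()
        finally:
            _training_lock_holder = None
            _training_lock.release()
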
@@ -4193,6 +4206,10 @@ class RealTrainingAdapter:
                                 improved=improved
                             )
                             self.realtime_training_metrics['last_checkpoint_step'] = self.realtime_training_metrics['total_steps']
+            finally:
+                # CRITICAL: Always release the lock, even if an exception occurs
+                self._training_lock_holder = None
+                self._training_lock.release()
 
         except Exception as e:
             logger.warning(f"Error training transformer on sample: {e}")
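
Possible follow-up, not part of this commit: the timeout acquire and the guaranteed release could be packaged as a context manager so future call sites cannot forget the finally block. A sketch assuming the desired behaviour on timeout is still to skip the step:

    # Alternative sketch (not what the commit does): timeout acquire plus
    # guaranteed release wrapped in a context manager.
    import threading
    from contextlib import contextmanager

    @contextmanager
    def training_lock(lock: threading.RLock, timeout: float = 5.0):
        acquired = lock.acquire(timeout=timeout)
        if not acquired:
            # Caller checks the yielded flag and skips the step on timeout
            yield False
            return
        try:
            yield True
        finally:
            lock.release()

    # Usage: skip the training step instead of blocking indefinitely
    lock = threading.RLock()
    with training_lock(lock) as got_it:
        if got_it:
            pass  # run the training step here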