wip training

Dobromir Popov
2025-11-17 13:28:36 +02:00
parent 43a7d75daf
commit 37e90a1c3c
3 changed files with 381 additions and 33 deletions


@@ -1789,17 +1789,47 @@ class RealTrainingAdapter:
             import torch

-            # MEMORY FIX: Pre-convert batches ONCE and cache them
-            # This avoids recreating batches every epoch (major leak!)
-            logger.info(" Pre-converting batches (one-time operation)...")
+            # OPTIMIZATION: Pre-convert batches ONCE and move to GPU immediately
+            # This avoids CPU→GPU transfer bottleneck during training
+            logger.info(" Pre-converting batches and moving to GPU (one-time operation)...")
+
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            use_gpu = torch.cuda.is_available()
+
+            if use_gpu:
+                logger.info(f" GPU: {torch.cuda.get_device_name(0)}")
+                logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

             cached_batches = []
             for i, data in enumerate(training_data):
                 batch = self._convert_annotation_to_transformer_batch(data)
                 if batch is not None:
-                    cached_batches.append(batch)
+                    # OPTIMIZATION: Move batch to GPU immediately with pinned memory
+                    if use_gpu:
+                        batch_gpu = {}
+                        for k, v in batch.items():
+                            if isinstance(v, torch.Tensor):
+                                # Use pin_memory() for faster CPU→GPU transfer
+                                # Then move to GPU with non_blocking=True
+                                batch_gpu[k] = v.pin_memory().to(device, non_blocking=True)
+                            else:
+                                batch_gpu[k] = v
+                        cached_batches.append(batch_gpu)
+                        del batch  # Free CPU memory immediately
+                    else:
+                        cached_batches.append(batch)
+
+                    # Show progress every 10 batches
+                    if (i + 1) % 10 == 0 or i == 0:
+                        logger.info(f" Processed {i + 1}/{len(training_data)} batches...")
                 else:
                     logger.warning(f" Failed to convert sample {i+1}")
+
+            # Synchronize GPU operations
+            if use_gpu:
+                torch.cuda.synchronize()
+                logger.info(f" All {len(cached_batches)} batches now on GPU")

             # Clear training_data to free memory
             training_data.clear()
             del training_data
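
For readers unfamiliar with the pattern used in this hunk, here is a minimal, self-contained sketch of an up-front pinned-memory transfer of a batch dict to the GPU. The function name move_batch_to_gpu and the toy tensors are illustrative only and are not part of the repository:

    import torch

    def move_batch_to_gpu(batch: dict, device: torch.device) -> dict:
        """Copy every tensor in a batch dict to the GPU once, via pinned host memory."""
        gpu_batch = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor):
                # pin_memory() returns a page-locked CPU copy, which lets the
                # following .to(..., non_blocking=True) copy run asynchronously.
                gpu_batch[key] = value.pin_memory().to(device, non_blocking=True)
            else:
                gpu_batch[key] = value  # leave non-tensor entries untouched
        return gpu_batch

    if torch.cuda.is_available():
        device = torch.device('cuda')
        # Toy batch standing in for a real converted annotation batch.
        batch = {'price_data': torch.randn(1, 600, 5), 'actions': torch.tensor([1])}
        gpu_batch = move_batch_to_gpu(batch, device)
        torch.cuda.synchronize()  # wait for the asynchronous copies to finish

Pinning the host memory is what makes the non_blocking copy actually asynchronous; a single synchronize after the loop (as the commit does) is enough to guarantee all batches have arrived before training starts.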
@@ -1809,25 +1839,16 @@ class RealTrainingAdapter:
             def batch_generator():
                 """
-                Yield pre-converted batches with proper memory management
+                Yield pre-converted batches (already on GPU)

-                CRITICAL: Each batch must be cloned and detached to prevent:
-                1. GPU memory accumulation across epochs
-                2. Computation graph retention
-                3. Version tracking issues
+                OPTIMIZATION: Batches are already on GPU and detached.
+                No cloning needed - just yield directly for maximum performance.
+                Each batch is independent (no gradient accumulation across batches).
                 """
                 for batch in cached_batches:
-                    # Clone and detach each tensor in the batch
-                    # This creates a fresh copy without gradient history
-                    cloned_batch = {}
-                    for key, value in batch.items():
-                        if isinstance(value, torch.Tensor):
-                            # detach() removes from computation graph
-                            # clone() creates new memory (prevents aliasing)
-                            cloned_batch[key] = value.detach().clone()
-                        else:
-                            cloned_batch[key] = value
-                    yield cloned_batch
+                    # Simply yield the batch - no cloning needed!
+                    # Batches are already on GPU and properly detached
+                    yield batch

             total_batches = len(cached_batches)
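
A minimal sketch of why the clone/detach step can be dropped here, assuming (as the new docstring states) that the cached tensors carry no autograd history and are never modified in place; the names below are illustrative, not taken from the repository:

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # A cached batch: plain leaf tensors with no computation graph attached,
    # so yielding the same objects every epoch retains nothing extra.
    cached_batches = [{'features': torch.randn(1, 8, device=device)}]

    def batch_generator():
        for batch in cached_batches:
            yield batch  # no detach()/clone() needed for graph-free, read-only tensors

    # By contrast, a tensor produced by a forward pass keeps its graph alive;
    # detach() (plus clone() if it might be mutated) is what breaks that link.
    attached = torch.randn(1, 8, device=device, requires_grad=True) * 2.0
    safe_copy = attached.detach().clone()
    assert not safe_copy.requires_grad

If any later code were to write into the yielded tensors in place, the per-batch clone would become necessary again.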
@@ -1860,6 +1881,12 @@ class RealTrainingAdapter:
                 epoch_accuracy = 0.0
                 num_batches = 0

+                # Log GPU status at start of epoch
+                if use_gpu:
+                    mem_allocated = torch.cuda.memory_allocated(device) / 1024**3
+                    mem_reserved = torch.cuda.memory_reserved(device) / 1024**3
+                    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+
                 # MEMORY FIX: Aggressive cleanup before epoch
                 gc.collect()
                 if torch.cuda.is_available():