wip training
@@ -1789,17 +1789,47 @@ class RealTrainingAdapter:
         import torch

-        # MEMORY FIX: Pre-convert batches ONCE and cache them
-        # This avoids recreating batches every epoch (major leak!)
-        logger.info(" Pre-converting batches (one-time operation)...")
+        # OPTIMIZATION: Pre-convert batches ONCE and move to GPU immediately
+        # This avoids CPU→GPU transfer bottleneck during training
+        logger.info(" Pre-converting batches and moving to GPU (one-time operation)...")
+
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        use_gpu = torch.cuda.is_available()
+
+        if use_gpu:
+            logger.info(f" GPU: {torch.cuda.get_device_name(0)}")
+            logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

         cached_batches = []
         for i, data in enumerate(training_data):
             batch = self._convert_annotation_to_transformer_batch(data)
             if batch is not None:
-                cached_batches.append(batch)
+                # OPTIMIZATION: Move batch to GPU immediately with pinned memory
+                if use_gpu:
+                    batch_gpu = {}
+                    for k, v in batch.items():
+                        if isinstance(v, torch.Tensor):
+                            # Use pin_memory() for faster CPU→GPU transfer
+                            # Then move to GPU with non_blocking=True
+                            batch_gpu[k] = v.pin_memory().to(device, non_blocking=True)
+                        else:
+                            batch_gpu[k] = v
+                    cached_batches.append(batch_gpu)
+                    del batch  # Free CPU memory immediately
+                else:
+                    cached_batches.append(batch)

                 # Show progress every 10 batches
                 if (i + 1) % 10 == 0 or i == 0:
                     logger.info(f" Processed {i + 1}/{len(training_data)} batches...")
             else:
                 logger.warning(f" Failed to convert sample {i+1}")

+        # Synchronize GPU operations
+        if use_gpu:
+            torch.cuda.synchronize()
+            logger.info(f" All {len(cached_batches)} batches now on GPU")
+
         # Clear training_data to free memory
         training_data.clear()
         del training_data
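The hunk above hinges on the pin_memory() + non_blocking=True transfer idiom. A minimal, self-contained sketch of that idiom follows; the function name preload_batches_to_gpu and the dummy batches are hypothetical illustrations, not part of this codebase:

```python
import torch

def preload_batches_to_gpu(batches, device):
    """Move dict-style batches to `device` once, using pinned (page-locked) memory.

    pin_memory() keeps the CPU copy in page-locked RAM, which lets the following
    .to(device, non_blocking=True) run as an asynchronous copy.
    """
    cached = []
    for batch in batches:
        moved = {}
        for key, value in batch.items():
            if isinstance(value, torch.Tensor) and device.type == 'cuda':
                moved[key] = value.pin_memory().to(device, non_blocking=True)
            else:
                moved[key] = value
        cached.append(moved)
    if device.type == 'cuda':
        # Wait for all queued async copies to finish before the caller relies on them
        torch.cuda.synchronize()
    return cached

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dummy_batches = [{'input_ids': torch.randint(0, 100, (4, 16)), 'label_idx': i} for i in range(3)]
cached = preload_batches_to_gpu(dummy_batches, device)
print(cached[0]['input_ids'].device)
```

The final torch.cuda.synchronize() mirrors the diff: with non_blocking copies, nothing guarantees the transfers have completed until an explicit sync (or a later kernel launch) forces it.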
@@ -1809,25 +1839,16 @@ class RealTrainingAdapter:
         def batch_generator():
             """
-            Yield pre-converted batches with proper memory management
+            Yield pre-converted batches (already on GPU)

-            CRITICAL: Each batch must be cloned and detached to prevent:
-            1. GPU memory accumulation across epochs
-            2. Computation graph retention
-            3. Version tracking issues
+            OPTIMIZATION: Batches are already on GPU and detached.
+            No cloning needed - just yield directly for maximum performance.
+            Each batch is independent (no gradient accumulation across batches).
             """
             for batch in cached_batches:
-                # Clone and detach each tensor in the batch
-                # This creates a fresh copy without gradient history
-                cloned_batch = {}
-                for key, value in batch.items():
-                    if isinstance(value, torch.Tensor):
-                        # detach() removes from computation graph
-                        # clone() creates new memory (prevents aliasing)
-                        cloned_batch[key] = value.detach().clone()
-                    else:
-                        cloned_batch[key] = value
-                yield cloned_batch
+                # Simply yield the batch - no cloning needed!
+                # Batches are already on GPU and properly detached
+                yield batch

         total_batches = len(cached_batches)
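For context, this is roughly how a generator over pre-cached GPU batches is consumed across epochs. The model, optimizer, and batch keys below are placeholders for illustration, not the adapter's real objects:

```python
import torch
import torch.nn as nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Stand-ins for the real transformer model and pre-converted batches
model = nn.Linear(8, 3).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
cached_batches = [
    {'inputs': torch.randn(4, 8, device=device),
     'targets': torch.randint(0, 3, (4,), device=device)}
    for _ in range(5)
]

def batch_generator():
    # Batches already live on the target device and carry no autograd history,
    # so every epoch can simply re-yield them without cloning.
    for batch in cached_batches:
        yield batch

for epoch in range(2):
    for batch in batch_generator():
        optimizer.zero_grad()
        loss = loss_fn(model(batch['inputs']), batch['targets'])
        loss.backward()
        optimizer.step()
```

Skipping the per-batch detach().clone() is safe in this sketch because the cached tensors are plain inputs with requires_grad=False; gradients only accumulate in the model's parameters, which the optimizer zeroes each step.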
@@ -1860,6 +1881,12 @@ class RealTrainingAdapter:
             epoch_accuracy = 0.0
             num_batches = 0

+            # Log GPU status at start of epoch
+            if use_gpu:
+                mem_allocated = torch.cuda.memory_allocated(device) / 1024**3
+                mem_reserved = torch.cuda.memory_reserved(device) / 1024**3
+                logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+
             # MEMORY FIX: Aggressive cleanup before epoch
             gc.collect()
             if torch.cuda.is_available():
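The last hunk is truncated, but the pattern it combines, per-epoch memory reporting plus garbage collection, can be sketched on its own. The helper name and the torch.cuda.empty_cache() call are assumptions; the cut-off hunk does not show what follows the final if:

```python
import gc
import logging

import torch

logger = logging.getLogger(__name__)

def report_and_trim_gpu_memory(device, tag=''):
    """Log allocated vs. reserved CUDA memory, then release unused cached blocks.

    memory_allocated() counts memory held by live tensors; memory_reserved()
    also includes blocks kept by the caching allocator, so it is usually larger.
    """
    gc.collect()  # drop Python-level references first so CUDA blocks become free-able
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated(device) / 1024**3
        reserved = torch.cuda.memory_reserved(device) / 1024**3
        logger.info(f"{tag} GPU memory: {allocated:.2f} GB allocated, {reserved:.2f} GB reserved")
        torch.cuda.empty_cache()  # assumption: return cached blocks to the driver
```

Calling something like this once per epoch, as the diff does with its inline version, makes leaks visible early: allocated memory that grows epoch over epoch points at retained tensors (e.g., un-detached losses), while growth in reserved memory alone is usually just the caching allocator holding on to blocks.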