training fixes
@@ -1789,25 +1789,40 @@ class RealTrainingAdapter:
 
         import torch
 
-        # OPTIMIZATION: Pre-convert batches ONCE
-        # NOTE: Using CPU for batch storage to avoid ROCm/HIP kernel issues
-        # GPU will be used during forward/backward passes in trainer
-        logger.info(" Pre-converting batches (one-time operation)...")
+        # OPTIMIZATION: Pre-convert batches ONCE and move to GPU immediately
+        # This eliminates CPU→GPU transfer bottleneck during training
+        logger.info(" Pre-converting batches and moving to GPU (one-time operation)...")
 
-        device = torch.device('cpu') # Store batches on CPU
         use_gpu = torch.cuda.is_available()
+        device = trainer.device if hasattr(trainer, 'device') else torch.device('cuda' if use_gpu else 'cpu')
 
         if use_gpu:
             logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
             logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
-            logger.info(f" Batches will be stored on CPU, moved to GPU during training")
+            logger.info(f" Pre-moving batches to GPU for maximum efficiency")
 
+        # Convert and move batches to GPU immediately
         cached_batches = []
         for i, data in enumerate(training_data):
             batch = self._convert_annotation_to_transformer_batch(data)
             if batch is not None:
-                # Store batches on CPU (trainer will move to GPU)
-                cached_batches.append(batch)
+                # Move batch to GPU immediately with pinned memory for faster transfer
+                if use_gpu:
+                    batch_gpu = {}
+                    for k, v in batch.items():
+                        if isinstance(v, torch.Tensor):
+                            # Use pinned memory for faster CPU→GPU transfer
+                            if v.device.type == 'cpu':
+                                batch_gpu[k] = v.pin_memory().to(device, non_blocking=True)
+                            else:
+                                batch_gpu[k] = v.to(device, non_blocking=True)
+                        else:
+                            batch_gpu[k] = v
+                    cached_batches.append(batch_gpu)
+                    # Free CPU memory immediately
+                    del batch
+                else:
+                    cached_batches.append(batch)
 
                 # Show progress every 10 batches
                 if (i + 1) % 10 == 0 or i == 0:
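Note: the hunk above uses the standard PyTorch pattern of staging host tensors in pinned (page-locked) memory so the CPU→GPU copy can run asynchronously. A minimal standalone sketch of that pattern, with illustrative tensor names and shapes that are not taken from this repo:

import torch

def move_batch_to_gpu(batch: dict, device: torch.device) -> dict:
    """Copy every tensor in a batch dict to `device`, pinning CPU tensors first."""
    moved = {}
    for key, value in batch.items():
        if isinstance(value, torch.Tensor) and value.device.type == 'cpu':
            # pin_memory() gives page-locked host memory; non_blocking=True lets the
            # copy overlap with other work instead of stalling the Python thread.
            moved[key] = value.pin_memory().to(device, non_blocking=True)
        else:
            moved[key] = value
    return moved

if torch.cuda.is_available():
    batch = {'price_data': torch.randn(1, 200, 5), 'target': torch.tensor([1])}
    gpu_batch = move_batch_to_gpu(batch, torch.device('cuda'))
    torch.cuda.synchronize()  # ensure the async copies have finished before use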
@@ -1820,44 +1835,90 @@ class RealTrainingAdapter:
         del training_data
         gc.collect()
 
-        logger.info(f" Converted {len(cached_batches)} batches, cleared source data")
+        # Synchronize GPU transfers
+        if use_gpu:
+            torch.cuda.synchronize()
+
+        logger.info(f" Converted {len(cached_batches)} batches, all moved to GPU")
+
+        # Helper function to combine multiple single-sample batches into a mini-batch
+        def _combine_transformer_batches(batch_list: List[Dict]) -> Dict:
+            """Combine multiple single-sample batches into one mini-batch"""
+            if len(batch_list) == 1:
+                return batch_list[0]
+
+            combined = {}
+            # Get all keys from first batch
+            keys = batch_list[0].keys()
+
+            for key in keys:
+                # Collect tensors, filtering out None values
+                tensors = []
+                for b in batch_list:
+                    if key in b and b[key] is not None and isinstance(b[key], torch.Tensor):
+                        tensors.append(b[key])
+
+                if tensors:
+                    # Concatenate along batch dimension (dim=0)
+                    combined[key] = torch.cat(tensors, dim=0)
+                elif key in batch_list[0]:
+                    # For non-tensor values (like norm_params dict), use first batch's value
+                    # Or None if all batches have None for this key
+                    first_value = batch_list[0].get(key)
+                    if first_value is not None and not isinstance(first_value, torch.Tensor):
+                        combined[key] = first_value
+                    else:
+                        # Check if all batches have None for this key
+                        all_none = all(b.get(key) is None for b in batch_list)
+                        if not all_none:
+                            # Some batches have this key, use first non-None
+                            for b in batch_list:
+                                if b.get(key) is not None:
+                                    combined[key] = b[key]
+                                    break
+                        else:
+                            combined[key] = None
+
+            return combined
+
+        # Group batches into mini-batches for better GPU utilization
+        # DISABLED: Batches have inconsistent sequence lengths, process individually
+        # transformer_batch_size = 5
+        total_samples = len(cached_batches) # Store count before clearing
+        grouped_batches = []
+
+        # Process each batch individually to avoid shape mismatch errors
+        logger.info(f" Processing {len(cached_batches)} batches individually (no grouping due to variable sequence lengths)")
+        for batch in cached_batches:
+            grouped_batches.append(batch)
+
+        # Clear cached_batches to free memory
+        cached_batches.clear()
+        del cached_batches
+        gc.collect()
 
         def batch_generator():
             """
-            Yield pre-converted batches (already on GPU)
+            Yield grouped mini-batches (already on GPU)
 
-            OPTIMIZATION: Batches are already on GPU and detached.
-            No cloning needed - just yield directly for maximum performance.
-            Each batch is independent (no gradient accumulation across batches).
+            OPTIMIZATION: Batches are already on GPU and grouped for efficient processing.
+            Each mini-batch contains 5 samples for better GPU utilization.
             """
-            for batch in cached_batches:
-                # Simply yield the batch - no cloning needed!
-                # Batches are already on GPU and properly detached
+            for batch in grouped_batches:
                 yield batch
 
-        total_batches = len(cached_batches)
+        total_batches = len(grouped_batches)
 
         if total_batches == 0:
             raise Exception("No valid training batches after conversion")
 
         logger.info(f" Ready to train on {total_batches} batches")
+        logger.info(f" Total samples: {total_samples}")
 
-        # MEMORY FIX: Process batches directly from generator, no grouping needed
-        # Batch size of 1 (single sample) to avoid OOM
-        logger.info(f" Processing batches individually (batch_size=1) for memory efficiency")
-
-        # MEMORY OPTIMIZATION: Configure gradient accumulation
-        # Process samples one at a time, accumulate gradients over multiple samples
-        # This reduces peak memory by ~50% compared to batching
-        accumulation_steps = max(2, min(5, total_batches)) # 2-5 steps based on data size
-
-        logger.info(f" Gradient accumulation: {accumulation_steps} steps")
-        logger.info(f" Effective batch size: {accumulation_steps} (processed as {accumulation_steps} × batch_size=1)")
-
-        # Configure trainer for gradient accumulation
+        # Disable gradient accumulation since we're using proper batching now
         if hasattr(trainer, 'set_gradient_accumulation_steps'):
-            trainer.set_gradient_accumulation_steps(accumulation_steps)
-            logger.info(f" Trainer configured for automatic gradient accumulation")
+            trainer.set_gradient_accumulation_steps(0) # No accumulation needed with batching
+            logger.info(f" Gradient accumulation disabled (using proper batching instead)")
 
         import gc
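Note: the helper added above concatenates per-sample tensors along dim=0, which only works when every non-batch dimension matches; that is the shape-mismatch problem the "DISABLED" comment refers to. A small sketch of the failure and one possible workaround (the shapes and padding below are illustrative assumptions, not this repo's data layout):

import torch
import torch.nn.functional as F

a = torch.randn(1, 200, 5)   # sample with 200 time steps
b = torch.randn(1, 150, 5)   # sample with 150 time steps

try:
    torch.cat([a, b], dim=0)                 # fails: sizes differ in dim 1
except RuntimeError as exc:
    print(f"cat failed as expected: {exc}")

b_padded = F.pad(b, (0, 0, 0, 200 - 150))    # pad the time dimension up to 200
merged = torch.cat([a, b_padded], dim=0)     # shape (2, 200, 5)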
@@ -1867,9 +1928,10 @@ class RealTrainingAdapter:
             num_batches = 0
 
             # Log GPU status at start of epoch
-            if use_gpu:
-                mem_allocated = torch.cuda.memory_allocated(device) / 1024**3
-                mem_reserved = torch.cuda.memory_reserved(device) / 1024**3
+            if use_gpu and torch.cuda.is_available():
+                # Use CUDA device (0) for memory stats, not the device variable
+                mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+                mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
                 logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
 
             # MEMORY FIX: Aggressive cleanup before epoch
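Note: torch.cuda.memory_allocated() and torch.cuda.memory_reserved() expect a CUDA device index or CUDA device object, which is why the hunk above passes index 0 rather than the `device` variable (which may not refer to a CUDA device). A minimal sketch of the same logging in isolation:

import torch

if torch.cuda.is_available():
    allocated_gb = torch.cuda.memory_allocated(0) / 1024**3
    reserved_gb = torch.cuda.memory_reserved(0) / 1024**3
    print(f"GPU memory: {allocated_gb:.2f} GB allocated, {reserved_gb:.2f} GB reserved")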
@@ -1878,16 +1940,16 @@ class RealTrainingAdapter:
                 torch.cuda.empty_cache()
                 torch.cuda.synchronize()
 
-            # Reset gradient accumulation counter at start of epoch
+            # Reset gradient accumulation counter at start of epoch (not needed with batching, but safe to call)
             if hasattr(trainer, 'reset_gradient_accumulation'):
                 trainer.reset_gradient_accumulation()
 
             # Generate batches fresh for each epoch
             for i, batch in enumerate(batch_generator()):
                 try:
-                    # Call the trainer's train_step method
-                    # Trainer now handles gradient accumulation automatically
-                    result = trainer.train_step(batch)
+                    # Call the trainer's train_step method with mini-batch
+                    # Batch is already on GPU and contains multiple samples
+                    result = trainer.train_step(batch, accumulate_gradients=False)
 
                     if result is not None:
                         # MEMORY FIX: Detach all tensor values to break computation graph
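Note: the change above passes accumulate_gradients=False so every mini-batch runs a full backward pass followed by an optimizer step. A generic sketch of the distinction between the two modes (the model, optimizer, and batch field names are placeholders, not this repo's trainer API):

import torch
import torch.nn as nn

model = nn.Linear(5, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

def train_step(batch, accumulate_gradients=False, accumulation_steps=4):
    loss = criterion(model(batch['x']), batch['y'])
    if accumulate_gradients:
        # Accumulation mode: scale the loss and defer optimizer.step() to the caller,
        # which steps once every `accumulation_steps` micro-batches.
        (loss / accumulation_steps).backward()
    else:
        # Proper batching: backprop and step on every batch.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
    return float(loss.detach())

batch = {'x': torch.randn(8, 5), 'y': torch.randint(0, 2, (8,))}
print(train_step(batch, accumulate_gradients=False))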
@@ -1929,21 +1991,19 @@ class RealTrainingAdapter:
                     if 'result' in locals():
                         del result
 
-                    # Delete the cloned batch (it's a fresh copy, safe to delete)
+                    # NOTE: Don't delete batch contents - batches are reused across epochs
+                    # The batch dictionary is shared, so deleting keys corrupts it for next epoch
+                    # Just clear the reference - Python GC will handle cleanup
                     if 'batch' in locals():
-                        for key in list(batch.keys()):
-                            if isinstance(batch[key], torch.Tensor):
-                                del batch[key]
                         del batch
 
                     # Clear CUDA cache after every batch
                     if torch.cuda.is_available():
                         torch.cuda.empty_cache()
 
-                    # After optimizer step, aggressive cleanup
-                    # Check if this was an optimizer step (not accumulation)
-                    is_optimizer_step = ((i + 1) % accumulation_steps == 0)
-                    if is_optimizer_step:
+                    # After each batch, cleanup (no accumulation needed with proper batching)
+                    # Every batch triggers optimizer step
+                    if True:
                         gc.collect()
                         if torch.cuda.is_available():
                             torch.cuda.synchronize()
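Note: the per-key deletion above was removed because batch_generator() re-yields the same dict objects every epoch, so mutating them in epoch 1 would hand empty batches to epoch 2. A tiny illustration of that failure mode (names are illustrative):

cached = [{'price_data': [1.0, 2.0, 3.0]}]

def batches():
    for b in cached:
        yield b

for b in batches():          # "epoch 1"
    del b['price_data']      # mutates the shared dict stored in `cached`

print(next(iter(batches())))  # {} - the data is gone for "epoch 2"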
@@ -2031,7 +2091,7 @@ class RealTrainingAdapter:
                     'learning_rate': float(trainer.scheduler.get_last_lr()[0])
                 },
                 training_metadata={
-                    'num_samples': len(training_data),
+                    'num_samples': total_samples, # Use stored count, training_data was deleted
                     'num_batches': num_batches,
                     'training_id': session.training_id
                 },
@@ -2263,7 +2323,7 @@ class RealTrainingAdapter:
 
             if pivot_trainer:
                 pivot_trainer.start(symbol=symbol)
-                logger.info(f"✅ Live pivot training ENABLED - will train on L2 peaks automatically")
+                logger.info(f"Live pivot training ENABLED - will train on L2 peaks automatically")
             else:
                 logger.warning("Could not initialize live pivot trainer")
@@ -477,11 +477,11 @@ class AnnotationDashboard:
                 engineio_logger=False
             )
             self.has_socketio = True
-            logger.info("✅ SocketIO initialized for real-time updates")
+            logger.info("SocketIO initialized for real-time updates")
         except ImportError:
             self.socketio = None
             self.has_socketio = False
-            logger.warning("⚠️ flask-socketio not installed - live updates will use polling")
+            logger.warning("flask-socketio not installed - live updates will use polling")
 
         # Suppress werkzeug request logs (reduce noise from polling endpoints)
         werkzeug_logger = logging.getLogger('werkzeug')
@@ -2202,10 +2202,10 @@ class AnnotationDashboard:
         logger.info(f"Starting Annotation Dashboard on http://{host}:{port}")
 
         if self.has_socketio:
-            logger.info("✅ Running with WebSocket support (SocketIO)")
+            logger.info("Running with WebSocket support (SocketIO)")
             self.socketio.run(self.server, host=host, port=port, debug=debug, allow_unsafe_werkzeug=True)
         else:
-            logger.warning("⚠️ Running without WebSocket support - install flask-socketio for live updates")
+            logger.warning("Running without WebSocket support - install flask-socketio for live updates")
             self.server.run(host=host, port=port, debug=debug)
@@ -1238,14 +1238,13 @@ class TradingTransformerTrainer:
                     break
 
             if needs_transfer:
-                # Move batch to device and DELETE original CPU tensors to prevent memory leak
+                # Move batch to device - iterate over copy of keys to avoid modification during iteration
                 batch_gpu = {}
-                for k, v in batch.items():
+                for k in list(batch.keys()): # Create list copy to avoid modification during iteration
+                    v = batch[k]
                     if isinstance(v, torch.Tensor):
                         # Move to device (creates GPU copy)
                         batch_gpu[k] = v.to(self.device, non_blocking=True)
-                        # Delete CPU tensor immediately to free memory
-                        del batch[k]
                     else:
                         batch_gpu[k] = v
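Note: the rewritten loop above iterates over list(batch.keys()) because deleting entries while iterating a dict's live items() view raises RuntimeError; snapshotting the keys makes the loop safe even if entries are added or removed. Illustration with a toy dict, not a real batch:

batch = {'a': 1, 'b': 2}

try:
    for k, v in batch.items():
        del batch[k]          # RuntimeError: dictionary changed size during iteration
except RuntimeError as exc:
    print(exc)

batch = {'a': 1, 'b': 2}
for k in list(batch.keys()):  # iterate over a snapshot of the keys
    batch.pop(k)              # safe: the snapshot is unaffected by the deletion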
@@ -551,7 +551,7 @@ class DataProvider:
                 logger.info("Skipping initial data load (using DuckDB cache)")
 
             logger.info(" Initial data load completed - stopping maintenance worker")
-            logger.info("📊 Data will be updated on-demand only (no continuous fetching)")
+            logger.info("Data will be updated on-demand only (no continuous fetching)")
 
             # Stop the maintenance worker after initial load
             self.data_maintenance_active = False
@@ -582,7 +582,7 @@ class DataProvider:
                     self.cached_data[symbol][timeframe] = existing_df.tail(1500)
 
                     last_timestamp = existing_df.index.max()
-                    logger.info(f"📦 Loaded {len(existing_df)} candles from DuckDB for {symbol} {timeframe}")
+                    logger.info(f"Loaded {len(existing_df)} candles from DuckDB for {symbol} {timeframe}")
                 else:
                     logger.debug(f"No existing data in DuckDB for {symbol} {timeframe}")
             except Exception as e:
@@ -3140,7 +3140,7 @@ class DataProvider:
             logger.warning(" DuckDB storage not available - cannot load cached data")
             return
 
-        logger.info("📦 Loading cached data from DuckDB...")
+        logger.info("Loading cached data from DuckDB...")
         loaded_count = 0
 
         for symbol in self.symbols: