Author: Dobromir Popov
Date: 2025-11-19 10:47:43 +02:00
Parent: 8ee8558829
Commit: df5f9b47f2
4 changed files with 303 additions and 16 deletions


@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
             logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
             logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
             logger.info(f" Pre-moving batches to GPU for maximum efficiency")
+
+            # Log initial GPU status
+            try:
+                from utils.gpu_monitor import get_gpu_monitor
+                gpu_monitor = get_gpu_monitor()
+                gpu_monitor.log_gpu_status("Initial GPU status")
+            except Exception as e:
+                logger.debug(f"GPU monitor not available: {e}")

             # Convert and move batches to GPU immediately
             cached_batches = []
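All four hunks in this commit call into a `utils.gpu_monitor` helper, presumably one of the other changed files, whose contents are not shown here. Only the names used in the diff — `get_gpu_monitor()`, `log_gpu_status()`, and `get_summary_string()` — are taken from the commit; the rest of the sketch below is an assumption about what a minimal implementation backed by plain `torch.cuda` memory counters could look like (the real module may also report utilization via NVML).

# Hypothetical sketch of utils/gpu_monitor.py -- interface names from the diff,
# implementation assumed (torch.cuda memory stats, module-level singleton).
import logging

logger = logging.getLogger(__name__)

try:
    import torch
except ImportError:
    torch = None


class GPUMonitor:
    """Small helper that summarizes CUDA memory usage for log lines."""

    def get_summary_string(self) -> str:
        if torch is None or not torch.cuda.is_available():
            return "GPU monitoring not available"
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        return f"GPU Mem: {allocated:.2f}GB alloc, {reserved:.2f}GB reserved, {total:.1f}GB total"

    def log_gpu_status(self, prefix: str = "") -> None:
        summary = self.get_summary_string()
        logger.info(f"{prefix}: {summary}" if prefix else summary)


_monitor = None


def get_gpu_monitor() -> "GPUMonitor":
    """Return a shared GPUMonitor instance (assumed singleton pattern)."""
    global _monitor
    if _monitor is None:
        _monitor = GPUMonitor()
    return _monitor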
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:
             # Log GPU status at start of epoch
             if use_gpu and torch.cuda.is_available():
-                # Use CUDA device (0) for memory stats, not the device variable
-                mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
-                mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
-                logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+                # Use GPU monitor for detailed metrics
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
+                except Exception as e:
+                    # Fallback to basic memory stats if monitor not available
+                    logger.debug(f"GPU monitor not available: {e}")
+                    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+                    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
+                    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

             # MEMORY FIX: Aggressive cleanup before epoch
             gc.collect()
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
                         denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
                         denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

+                    # Get GPU utilization during training
+                    gpu_info = ""
+                    if use_gpu and torch.cuda.is_available():
+                        try:
+                            from utils.gpu_monitor import get_gpu_monitor
+                            gpu_monitor = get_gpu_monitor()
+                            gpu_summary = gpu_monitor.get_summary_string()
+                            if gpu_summary != "GPU monitoring not available":
+                                gpu_info = f" | {gpu_summary}"
+                        except Exception:
+                            pass  # GPU monitoring optional
+
                     logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
                                 f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
                 else:
                     logger.warning(f" Batch {i + 1} returned None result - skipping")
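The comment in the hunk above asks for GPU utilization, but the memory fallback paths only report allocated and reserved bytes, which is all plain `torch.cuda` counters provide; utilization percentages would have to come from NVML or `nvidia-smi`. The following is a hedged sketch of how a `get_summary_string()`-style helper could gather both, assuming the `pynvml` package — this is not taken from the commit.

# Hedged sketch: utilization + memory via NVML (pynvml is an assumption, not part of this commit).
def nvml_summary_string(index: int = 0) -> str:
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu and .memory are percentages
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .used and .total are bytes
        return (f"GPU {util.gpu}% util | "
                f"Mem {mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f}GB")
    except Exception:
        # Mirrors the sentinel string checked in the batch-logging hunk above
        return "GPU monitoring not available"

Whatever string it returns would then flow straight into the `gpu_info` suffix appended to the batch log line above.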
@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
             # Check memory usage
             log_memory_usage(f" Epoch {epoch + 1} end - ")

+            # Log GPU status at end of epoch
+            if use_gpu and torch.cuda.is_available():
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
+                except Exception:
+                    pass  # GPU monitoring optional
+
             logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

             session.final_loss = session.current_loss