log GPU status and memory usage during training
@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
logger.info(f" Pre-moving batches to GPU for maximum efficiency")

# Log initial GPU status
try:
    from utils.gpu_monitor import get_gpu_monitor
    gpu_monitor = get_gpu_monitor()
    gpu_monitor.log_gpu_status("Initial GPU status")
except Exception as e:
    logger.debug(f"GPU monitor not available: {e}")

# Convert and move batches to GPU immediately
cached_batches = []
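Note: the utils.gpu_monitor module itself is not part of this diff, so its interface can only be inferred from the call sites above. A minimal sketch of what get_gpu_monitor(), log_gpu_status(), and get_summary_string() might look like, using only torch.cuda calls (illustrative, not the repository's actual implementation):

    # Hypothetical sketch of the interface utils/gpu_monitor.py is assumed to expose.
    import logging
    import torch

    logger = logging.getLogger(__name__)

    class GPUMonitor:
        def get_summary_string(self) -> str:
            # Compact one-line GPU summary, or a sentinel when no GPU is usable.
            if not torch.cuda.is_available():
                return "GPU monitoring not available"
            allocated = torch.cuda.memory_allocated(0) / 1024**3
            reserved = torch.cuda.memory_reserved(0) / 1024**3
            return f"GPU: {allocated:.2f}GB alloc / {reserved:.2f}GB reserved"

        def log_gpu_status(self, label: str) -> None:
            # Log the summary string under a caller-supplied label.
            logger.info(f"{label}: {self.get_summary_string()}")

    _monitor = GPUMonitor()

    def get_gpu_monitor() -> GPUMonitor:
        # Module-level accessor used by the training adapter.
        return _monitor

Wrapping the import and lookup in try/except, as the adapter does, keeps GPU logging strictly optional.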
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:

# Log GPU status at start of epoch
if use_gpu and torch.cuda.is_available():
    # Use CUDA device (0) for memory stats, not the device variable
    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
    # Use GPU monitor for detailed metrics
    try:
        from utils.gpu_monitor import get_gpu_monitor
        gpu_monitor = get_gpu_monitor()
        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
    except Exception as e:
        # Fallback to basic memory stats if monitor not available
        logger.debug(f"GPU monitor not available: {e}")
        mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
        mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
        logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

# MEMORY FIX: Aggressive cleanup before epoch
gc.collect()
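Note: torch.cuda.memory_allocated() and torch.cuda.memory_reserved() return byte counts for the given device index, so dividing by 1024**3 converts them to GB; passing the explicit index 0 avoids handing them a device variable that might not refer to a CUDA device. A small standalone check (the helper name is illustrative, not from the codebase):

    import torch

    def cuda_mem_gb(device_index: int = 0):
        # Bytes reported by the CUDA caching allocator, converted to GB.
        allocated = torch.cuda.memory_allocated(device_index) / 1024**3
        reserved = torch.cuda.memory_reserved(device_index) / 1024**3
        return allocated, reserved

    if torch.cuda.is_available():
        alloc_gb, reserved_gb = cuda_mem_gb(0)
        print(f"allocated={alloc_gb:.2f}GB reserved={reserved_gb:.2f}GB")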
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
    denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
    denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

    # Get GPU utilization during training
    gpu_info = ""
    if use_gpu and torch.cuda.is_available():
        try:
            from utils.gpu_monitor import get_gpu_monitor
            gpu_monitor = get_gpu_monitor()
            gpu_summary = gpu_monitor.get_summary_string()
            if gpu_summary != "GPU monitoring not available":
                gpu_info = f" | {gpu_summary}"
        except Exception:
            pass # GPU monitoring optional

    logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
                f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-               f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+               f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
else:
    logger.warning(f" Batch {i + 1} returned None result - skipping")
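Note: this hunk expects get_summary_string() to return the sentinel "GPU monitoring not available" whenever it cannot report anything. torch.cuda only exposes allocator memory statistics, so a summary that also includes compute utilization would typically query NVML. A hedged sketch using the pynvml package (an assumption about how such a monitor could be built, not the repository's implementation):

    import pynvml

    def gpu_summary_string(index: int = 0) -> str:
        # Query utilization and memory via NVML; return the sentinel on any failure.
        try:
            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu is a percentage
            mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .used / .total are bytes
            return (f"GPU {index}: {util.gpu}% util, "
                    f"{mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f} GB")
        except Exception:
            return "GPU monitoring not available"
        finally:
            try:
                pynvml.nvmlShutdown()
            except Exception:
                pass

Keeping the NVML path behind try/except mirrors the adapter's "GPU monitoring optional" behavior above.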
@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
# Check memory usage
log_memory_usage(f" Epoch {epoch + 1} end - ")

# Log GPU status at end of epoch
if use_gpu and torch.cuda.is_available():
    try:
        from utils.gpu_monitor import get_gpu_monitor
        gpu_monitor = get_gpu_monitor()
        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
    except Exception:
        pass # GPU monitoring optional

logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

session.final_loss = session.current_loss
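Note: log_memory_usage() is called here but not defined in this diff. Assuming it reports host RAM for the training process, a minimal psutil-based version consistent with how it is called (a label prefix followed by a single info log) could be:

    import logging
    import psutil

    logger = logging.getLogger(__name__)

    def log_memory_usage(prefix: str = "") -> None:
        # Log the current process's resident set size in GB.
        rss_gb = psutil.Process().memory_info().rss / 1024**3
        logger.info(f"{prefix}RAM: {rss_gb:.2f} GB")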