log GPU
@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
 logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
 logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
 logger.info(f" Pre-moving batches to GPU for maximum efficiency")

+# Log initial GPU status
+try:
+    from utils.gpu_monitor import get_gpu_monitor
+    gpu_monitor = get_gpu_monitor()
+    gpu_monitor.log_gpu_status("Initial GPU status")
+except Exception as e:
+    logger.debug(f"GPU monitor not available: {e}")
+
 # Convert and move batches to GPU immediately
 cached_batches = []
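For context, "pre-moving batches to GPU" here means converting every prepared batch to device tensors once, before the epoch loop, so the same tensors are reused each epoch without per-step host-to-device copies. A minimal sketch of that idea, not part of this commit: the names prepared_batches and the toy tensors are illustrative, and the adapter's actual conversion code sits outside this hunk.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative stand-in for the adapter's prepared training data: dicts of CPU tensors.
prepared_batches = [{"features": torch.randn(32, 10), "labels": torch.randint(0, 3, (32,))}]

cached_batches = []
for cpu_batch in prepared_batches:
    gpu_batch = {
        key: value.to(device, non_blocking=True) if torch.is_tensor(value) else value
        for key, value in cpu_batch.items()
    }
    cached_batches.append(gpu_batch)  # reused every epoch with no further copies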
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:

 # Log GPU status at start of epoch
 if use_gpu and torch.cuda.is_available():
-    # Use CUDA device (0) for memory stats, not the device variable
-    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
-    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
-    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+    # Use GPU monitor for detailed metrics
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
+    except Exception as e:
+        # Fallback to basic memory stats if monitor not available
+        logger.debug(f"GPU monitor not available: {e}")
+        mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+        mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
+        logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

 # MEMORY FIX: Aggressive cleanup before epoch
 gc.collect()
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
 denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
 denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

+# Get GPU utilization during training
+gpu_info = ""
+if use_gpu and torch.cuda.is_available():
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_summary = gpu_monitor.get_summary_string()
+        if gpu_summary != "GPU monitoring not available":
+            gpu_info = f" | {gpu_summary}"
+    except Exception:
+        pass  # GPU monitoring optional
+
 logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
             f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-            f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+            f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
 else:
     logger.warning(f" Batch {i + 1} returned None result - skipping")
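With the monitor available, each per-batch log line now ends with the short summary returned by get_summary_string(), joined with " | ". A line that previously ended with the RMSE figures would therefore carry a suffix such as " | GPU: 87.0%, Mem: 5.12GB (42.7%), Temp: 63C" (the values here are purely illustrative; the format comes from get_summary_string in utils/gpu_monitor.py below).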
@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
 # Check memory usage
 log_memory_usage(f" Epoch {epoch + 1} end - ")

+# Log GPU status at end of epoch
+if use_gpu and torch.cuda.is_available():
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
+    except Exception:
+        pass  # GPU monitoring optional
+
 logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

 session.final_loss = session.current_loss
@@ -13,7 +13,6 @@ seaborn>=0.12.0
 ta>=0.11.0
 ccxt>=4.0.0
 dash-bootstrap-components>=2.0.0
-asyncio-compat>=0.1.2
 wandb>=0.16.0
 pybit>=5.11.0
 requests>=2.31.0
@@ -38,6 +37,10 @@ duckdb>=0.9.0
 Flask>=3.0.0
 flask-cors>=4.0.0

+# Optional GPU monitoring (for NVIDIA GPUs - install if you want detailed GPU utilization metrics)
+pynvml>=11.5.0  # NVIDIA Management Library for GPU utilization monitoring
+# GPUtil>=1.4.0  # Alternative GPU monitoring library
+
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling wrong GPU deps.
 # We only need torch (not torchvision/torchaudio) for trading systems.
 #
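After installing pynvml, a quick sanity check (not part of this commit; device index 0 is assumed, matching the monitor below) confirms that the NVML bindings can reach the GPU:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu and .memory are percentages
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .used / .total are in bytes
print(f"GPU util: {util.gpu}%, VRAM: {mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f} GB")
pynvml.nvmlShutdown()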
utils/gpu_monitor.py (new file, 248 lines)
@@ -0,0 +1,248 @@
"""
GPU Utilization Monitor
Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs
"""

import logging
import time
from typing import Dict, Optional, Any
import platform

logger = logging.getLogger(__name__)

# Try to import GPU monitoring libraries
try:
    import torch
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False
    torch = None

# Try NVIDIA Management Library (pynvml)
try:
    import pynvml
    HAS_NVML = True
except ImportError:
    HAS_NVML = False
    pynvml = None

# Try GPUtil (alternative NVIDIA monitoring)
try:
    import GPUtil
    HAS_GPUTIL = True
except ImportError:
    HAS_GPUTIL = False
    GPUtil = None

class GPUMonitor:
    """Monitor GPU utilization and performance metrics"""

    def __init__(self):
        self.monitoring_enabled = False
        self.gpu_type = None
        self.device_id = 0

        # Initialize monitoring based on available libraries
        if HAS_TORCH and torch.cuda.is_available():
            self.monitoring_enabled = True
            self.device_id = 0

            # Try to determine GPU vendor
            try:
                gpu_name = torch.cuda.get_device_name(0)
                if 'nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower() or 'rtx' in gpu_name.lower() or 'gtx' in gpu_name.lower():
                    self.gpu_type = 'nvidia'
                elif 'amd' in gpu_name.lower() or 'radeon' in gpu_name.lower():
                    self.gpu_type = 'amd'
                else:
                    self.gpu_type = 'unknown'
            except Exception:
                self.gpu_type = 'unknown'

            # Initialize NVIDIA monitoring if available
            if self.gpu_type == 'nvidia' and HAS_NVML:
                try:
                    pynvml.nvmlInit()
                    self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
                    logger.info("GPU monitoring initialized: NVIDIA GPU with NVML")
                except Exception as e:
                    logger.debug(f"NVML initialization failed: {e}, will use PyTorch metrics only")
                    self.nvml_handle = None
            else:
                self.nvml_handle = None
        else:
            logger.debug("GPU monitoring disabled: No CUDA GPU available")

    def get_gpu_utilization(self) -> Optional[Dict[str, Any]]:
        """
        Get current GPU utilization metrics

        Returns:
            Dictionary with GPU utilization metrics or None if not available
        """
        if not self.monitoring_enabled or not HAS_TORCH:
            return None

        try:
            metrics = {
                'device_id': self.device_id,
                'gpu_type': self.gpu_type,
                'timestamp': time.time()
            }

            # Get GPU memory usage (always available via PyTorch)
            if torch.cuda.is_available():
                metrics['memory_allocated_gb'] = torch.cuda.memory_allocated(self.device_id) / 1024**3
                metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / 1024**3

                try:
                    props = torch.cuda.get_device_properties(self.device_id)
                    metrics['memory_total_gb'] = props.total_memory / 1024**3
                    metrics['memory_usage_percent'] = (metrics['memory_allocated_gb'] / metrics['memory_total_gb']) * 100
                except Exception:
                    metrics['memory_total_gb'] = None
                    metrics['memory_usage_percent'] = None

                metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id)

            # Get GPU utilization percentage (NVIDIA only via NVML)
            if self.gpu_type == 'nvidia' and self.nvml_handle is not None:
                try:
                    # Get utilization rates
                    util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle)
                    metrics['gpu_utilization_percent'] = util.gpu
                    metrics['memory_utilization_percent'] = util.memory

                    # Get power usage
                    try:
                        power = pynvml.nvmlDeviceGetPowerUsage(self.nvml_handle) / 1000.0  # Convert mW to W
                        metrics['power_usage_watts'] = power
                    except Exception:
                        metrics['power_usage_watts'] = None

                    # Get temperature
                    try:
                        temp = pynvml.nvmlDeviceGetTemperature(self.nvml_handle, pynvml.NVML_TEMPERATURE_GPU)
                        metrics['temperature_celsius'] = temp
                    except Exception:
                        metrics['temperature_celsius'] = None

                except Exception as e:
                    logger.debug(f"Failed to get NVML metrics: {e}")
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            # Fallback to GPUtil if NVML not available
            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus and len(gpus) > self.device_id:
                        gpu = gpus[self.device_id]
                        metrics['gpu_utilization_percent'] = gpu.load * 100
                        metrics['memory_utilization_percent'] = (gpu.memoryUsed / gpu.memoryTotal) * 100
                        metrics['temperature_celsius'] = gpu.temperature
                    else:
                        metrics['gpu_utilization_percent'] = None
                        metrics['memory_utilization_percent'] = None
                except Exception as e:
                    logger.debug(f"Failed to get GPUtil metrics: {e}")
                    metrics['gpu_utilization_percent'] = None

            # For AMD GPUs or when NVML/GPUtil not available, estimate utilization
            # based on memory usage and activity
            else:
                # Estimate GPU utilization based on memory activity
                # This is a rough estimate - actual GPU compute utilization requires vendor-specific APIs
                if metrics.get('memory_usage_percent') is not None:
                    # If memory is being used actively, GPU is likely active
                    # This is a heuristic, not exact
                    metrics['gpu_utilization_percent'] = min(metrics['memory_usage_percent'] * 1.2, 100)
                    metrics['memory_utilization_percent'] = metrics['memory_usage_percent']
                else:
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            return metrics

        except Exception as e:
            logger.debug(f"Error getting GPU utilization: {e}")
            return None

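Note that the non-NVIDIA fallback above only infers utilization from memory pressure: with, say, 50% of VRAM allocated it reports min(50 * 1.2, 100) = 60%, and the figure saturates at 100% once allocation passes roughly 83%. It tracks memory residency rather than actual compute load, which is why the NVML path is preferred on NVIDIA hardware.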
    def log_gpu_status(self, context: str = "") -> None:
        """
        Log current GPU status

        Args:
            context: Optional context string to include in log message
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return

        context_str = f"{context} - " if context else ""

        # Build log message
        parts = []

        if metrics.get('gpu_name'):
            parts.append(f"GPU: {metrics['gpu_name']}")

        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"Util: {metrics['gpu_utilization_percent']:.1f}%")

        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_total_gb'):
                mem_str += f"/{metrics['memory_total_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)

        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        if metrics.get('power_usage_watts') is not None:
            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")

        if parts:
            logger.info(f"{context_str}{', '.join(parts)}")

    def get_summary_string(self) -> str:
        """
        Get a summary string of current GPU status

        Returns:
            Formatted string with GPU metrics
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return "GPU monitoring not available"

        parts = []

        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"GPU: {metrics['gpu_utilization_percent']:.1f}%")

        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)

        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        return ", ".join(parts) if parts else "No metrics available"

# Global instance
_gpu_monitor = None


def get_gpu_monitor() -> GPUMonitor:
    """Get or create global GPU monitor instance"""
    global _gpu_monitor
    if _gpu_monitor is None:
        _gpu_monitor = GPUMonitor()
    return _gpu_monitor
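Outside the training adapter, the same singleton can be used directly. A minimal sketch, assuming the module is importable as utils.gpu_monitor exactly as in the hunks above (logging is configured here only so the INFO line is visible):

import logging
logging.basicConfig(level=logging.INFO)

from utils.gpu_monitor import get_gpu_monitor

monitor = get_gpu_monitor()
monitor.log_gpu_status("Standalone check")  # one INFO line: name, utilization, memory, temp, power
print(monitor.get_summary_string())         # short form appended to the per-batch training log
print(monitor.get_gpu_utilization())        # raw metrics dict, or None when no CUDA GPU is present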
@@ -4533,8 +4533,8 @@ class CleanTradingDashboard:

 # Add high pivots for this level
 if highs_x:
     fig.add_trace(
         go.Scatter(
             x=highs_x, y=highs_y,
             mode='markers',
             name=f'L{level_num} Pivot High',
@@ -4547,14 +4547,14 @@ class CleanTradingDashboard:
             ),
             showlegend=(level_num == 1),  # Only show legend for Level 1
             hovertemplate=f"Level {level_num} High: ${{y:.2f}}<extra></extra>"
         ),
         row=row, col=1
     )

 # Add low pivots for this level
 if lows_x:
     fig.add_trace(
         go.Scatter(
             x=lows_x, y=lows_y,
             mode='markers',
             name=f'L{level_num} Pivot Low',
@@ -4567,9 +4567,9 @@ class CleanTradingDashboard:
             ),
             showlegend=(level_num == 1),  # Only show legend for Level 1
             hovertemplate=f"Level {level_num} Low: ${{y:.2f}}<extra></extra>"
         ),
         row=row, col=1
     )

 # Build external legend HTML (no annotation on chart to avoid scale distortion)
 legend_children = []