gogo2/utils/gpu_monitor.py (Python, 249 lines, 9.5 KiB; commit df5f9b47f2 "log GPU" by Dobromir Popov, 2025-11-19 10:47:43 +02:00)

"""
GPU Utilization Monitor
Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs
"""
import logging
import time
from typing import Dict, Optional, Any
import platform
logger = logging.getLogger(__name__)
# Try to import GPU monitoring libraries
try:
    import torch
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False
    torch = None

# Try NVIDIA Management Library (pynvml)
try:
    import pynvml
    HAS_NVML = True
except ImportError:
    HAS_NVML = False
    pynvml = None

# Try GPUtil (alternative NVIDIA monitoring)
try:
    import GPUtil
    HAS_GPUTIL = True
except ImportError:
    HAS_GPUTIL = False
    GPUtil = None
class GPUMonitor:
    """Monitor GPU utilization and performance metrics"""

    def __init__(self):
        self.monitoring_enabled = False
        self.gpu_type = None
        self.device_id = 0
        # Always define the NVML handle so later attribute access is safe
        self.nvml_handle = None

        # Initialize monitoring based on available libraries
        if HAS_TORCH and torch.cuda.is_available():
            self.monitoring_enabled = True
            self.device_id = 0

            # Try to determine the GPU vendor from the device name
            try:
                gpu_name = torch.cuda.get_device_name(0).lower()
                if any(tag in gpu_name for tag in ('nvidia', 'geforce', 'rtx', 'gtx')):
                    self.gpu_type = 'nvidia'
                elif 'amd' in gpu_name or 'radeon' in gpu_name:
                    self.gpu_type = 'amd'
                else:
                    self.gpu_type = 'unknown'
            except Exception:
                self.gpu_type = 'unknown'

            # Initialize NVIDIA monitoring if available
            if self.gpu_type == 'nvidia' and HAS_NVML:
                try:
                    pynvml.nvmlInit()
                    self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
                    logger.info("GPU monitoring initialized: NVIDIA GPU with NVML")
                except Exception as e:
                    logger.debug(f"NVML initialization failed: {e}, will use PyTorch metrics only")
                    self.nvml_handle = None
        else:
            logger.debug("GPU monitoring disabled: No CUDA GPU available")
    def get_gpu_utilization(self) -> Optional[Dict[str, Any]]:
        """
        Get current GPU utilization metrics

        Returns:
            Dictionary with GPU utilization metrics or None if not available
        """
        if not self.monitoring_enabled or not HAS_TORCH:
            return None

        try:
            metrics = {
                'device_id': self.device_id,
                'gpu_type': self.gpu_type,
                'timestamp': time.time()
            }

            # Get GPU memory usage (always available via PyTorch)
            if torch.cuda.is_available():
                metrics['memory_allocated_gb'] = torch.cuda.memory_allocated(self.device_id) / 1024**3
                metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / 1024**3
                try:
                    props = torch.cuda.get_device_properties(self.device_id)
                    metrics['memory_total_gb'] = props.total_memory / 1024**3
                    metrics['memory_usage_percent'] = (metrics['memory_allocated_gb'] / metrics['memory_total_gb']) * 100
                except Exception:
                    metrics['memory_total_gb'] = None
                    metrics['memory_usage_percent'] = None
                metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id)

            # Get GPU utilization percentage (NVIDIA only via NVML)
            if self.gpu_type == 'nvidia' and self.nvml_handle is not None:
                try:
                    # Get utilization rates
                    util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle)
                    metrics['gpu_utilization_percent'] = util.gpu
                    metrics['memory_utilization_percent'] = util.memory

                    # Get power usage
                    try:
                        power = pynvml.nvmlDeviceGetPowerUsage(self.nvml_handle) / 1000.0  # Convert mW to W
                        metrics['power_usage_watts'] = power
                    except Exception:
                        metrics['power_usage_watts'] = None

                    # Get temperature
                    try:
                        temp = pynvml.nvmlDeviceGetTemperature(self.nvml_handle, pynvml.NVML_TEMPERATURE_GPU)
                        metrics['temperature_celsius'] = temp
                    except Exception:
                        metrics['temperature_celsius'] = None
                except Exception as e:
                    logger.debug(f"Failed to get NVML metrics: {e}")
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            # Fallback to GPUtil if NVML not available
            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus and len(gpus) > self.device_id:
                        gpu = gpus[self.device_id]
                        metrics['gpu_utilization_percent'] = gpu.load * 100
                        metrics['memory_utilization_percent'] = (gpu.memoryUsed / gpu.memoryTotal) * 100
                        metrics['temperature_celsius'] = gpu.temperature
                    else:
                        metrics['gpu_utilization_percent'] = None
                        metrics['memory_utilization_percent'] = None
                except Exception as e:
                    logger.debug(f"Failed to get GPUtil metrics: {e}")
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            # For AMD GPUs, or when neither NVML nor GPUtil is available, estimate
            # utilization from memory usage and activity
            else:
                # Rough estimate only: actual GPU compute utilization requires vendor-specific APIs
                if metrics.get('memory_usage_percent') is not None:
                    # If memory is being used actively, the GPU is likely active (a heuristic, not exact)
                    metrics['gpu_utilization_percent'] = min(metrics['memory_usage_percent'] * 1.2, 100)
                    metrics['memory_utilization_percent'] = metrics['memory_usage_percent']
                else:
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            return metrics
        except Exception as e:
            logger.debug(f"Error getting GPU utilization: {e}")
            return None
    def log_gpu_status(self, context: str = "") -> None:
        """
        Log current GPU status

        Args:
            context: Optional context string to include in log message
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return

        context_str = f"{context} - " if context else ""

        # Build log message
        parts = []
        if metrics.get('gpu_name'):
            parts.append(f"GPU: {metrics['gpu_name']}")
        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"Util: {metrics['gpu_utilization_percent']:.1f}%")
        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_total_gb'):
                mem_str += f"/{metrics['memory_total_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)
        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")
        if metrics.get('power_usage_watts') is not None:
            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")

        if parts:
            logger.info(f"{context_str}{', '.join(parts)}")
    def get_summary_string(self) -> str:
        """
        Get a summary string of current GPU status

        Returns:
            Formatted string with GPU metrics
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return "GPU monitoring not available"

        parts = []
        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"GPU: {metrics['gpu_utilization_percent']:.1f}%")
        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)
        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        return ", ".join(parts) if parts else "No metrics available"
# Global instance
_gpu_monitor = None
def get_gpu_monitor() -> GPUMonitor:
    """Get or create global GPU monitor instance"""
    global _gpu_monitor
    if _gpu_monitor is None:
        _gpu_monitor = GPUMonitor()
    return _gpu_monitor
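
# Usage sketch: a minimal example of the intended call pattern for this module.
# The iteration count and sleep interval below are illustrative assumptions, not
# values taken from the gogo2 project; only functions defined above
# (get_gpu_monitor, get_gpu_utilization, get_summary_string, log_gpu_status) and
# modules already imported here (logging, time) are used.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    monitor = get_gpu_monitor()

    # Full metrics dict; returns None when no CUDA device or PyTorch is available
    print("Raw metrics:", monitor.get_gpu_utilization())

    # One-line summary suitable for dashboards or status bars
    print("Summary:", monitor.get_summary_string())

    # Periodic logging, e.g. wrapped around a training loop
    for step in range(3):
        monitor.log_gpu_status(context=f"step {step}")
        time.sleep(1.0)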