""" GPU Utilization Monitor Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs """ import logging import time from typing import Dict, Optional, Any import platform logger = logging.getLogger(__name__) # Try to import GPU monitoring libraries try: import torch HAS_TORCH = True except ImportError: HAS_TORCH = False torch = None # Try NVIDIA Management Library (pynvml) try: import pynvml HAS_NVML = True except ImportError: HAS_NVML = False pynvml = None # Try GPUtil (alternative NVIDIA monitoring) try: import GPUtil HAS_GPUTIL = True except ImportError: HAS_GPUTIL = False GPUtil = None class GPUMonitor: """Monitor GPU utilization and performance metrics""" def __init__(self): self.monitoring_enabled = False self.gpu_type = None self.device_id = 0 # Initialize monitoring based on available libraries if HAS_TORCH and torch.cuda.is_available(): self.monitoring_enabled = True self.device_id = 0 # Try to determine GPU vendor try: gpu_name = torch.cuda.get_device_name(0) if 'nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower() or 'rtx' in gpu_name.lower() or 'gtx' in gpu_name.lower(): self.gpu_type = 'nvidia' elif 'amd' in gpu_name.lower() or 'radeon' in gpu_name.lower(): self.gpu_type = 'amd' else: self.gpu_type = 'unknown' except Exception: self.gpu_type = 'unknown' # Initialize NVIDIA monitoring if available if self.gpu_type == 'nvidia' and HAS_NVML: try: pynvml.nvmlInit() self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id) logger.info("GPU monitoring initialized: NVIDIA GPU with NVML") except Exception as e: logger.debug(f"NVML initialization failed: {e}, will use PyTorch metrics only") self.nvml_handle = None else: self.nvml_handle = None else: logger.debug("GPU monitoring disabled: No CUDA GPU available") def get_gpu_utilization(self) -> Optional[Dict[str, Any]]: """ Get current GPU utilization metrics Returns: Dictionary with GPU utilization metrics or None if not available """ if not self.monitoring_enabled or not HAS_TORCH: return None try: metrics = { 'device_id': self.device_id, 'gpu_type': self.gpu_type, 'timestamp': time.time() } # Get GPU memory usage (always available via PyTorch) if torch.cuda.is_available(): metrics['memory_allocated_gb'] = torch.cuda.memory_allocated(self.device_id) / 1024**3 metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / 1024**3 try: props = torch.cuda.get_device_properties(self.device_id) metrics['memory_total_gb'] = props.total_memory / 1024**3 metrics['memory_usage_percent'] = (metrics['memory_allocated_gb'] / metrics['memory_total_gb']) * 100 except Exception: metrics['memory_total_gb'] = None metrics['memory_usage_percent'] = None metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id) # Get GPU utilization percentage (NVIDIA only via NVML) if self.gpu_type == 'nvidia' and self.nvml_handle is not None: try: # Get utilization rates util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle) metrics['gpu_utilization_percent'] = util.gpu metrics['memory_utilization_percent'] = util.memory # Get power usage try: power = pynvml.nvmlDeviceGetPowerUsage(self.nvml_handle) / 1000.0 # Convert mW to W metrics['power_usage_watts'] = power except Exception: metrics['power_usage_watts'] = None # Get temperature try: temp = pynvml.nvmlDeviceGetTemperature(self.nvml_handle, pynvml.NVML_TEMPERATURE_GPU) metrics['temperature_celsius'] = temp except Exception: metrics['temperature_celsius'] = None except Exception as e: logger.debug(f"Failed to get NVML metrics: {e}") 
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None
            # Fallback to GPUtil if NVML is not available
            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus and len(gpus) > self.device_id:
                        gpu = gpus[self.device_id]
                        metrics['gpu_utilization_percent'] = gpu.load * 100
                        metrics['memory_utilization_percent'] = (gpu.memoryUsed / gpu.memoryTotal) * 100
                        metrics['temperature_celsius'] = gpu.temperature
                    else:
                        metrics['gpu_utilization_percent'] = None
                        metrics['memory_utilization_percent'] = None
                except Exception as e:
                    logger.debug(f"Failed to get GPUtil metrics: {e}")
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None
            # For AMD GPUs, or when neither NVML nor GPUtil is available, estimate
            # utilization based on memory usage and activity
            else:
                # Estimate GPU utilization based on memory activity.
                # This is a rough estimate: actual GPU compute utilization
                # requires vendor-specific APIs.
                if metrics.get('memory_usage_percent') is not None:
                    # If memory is being used actively, the GPU is likely active.
                    # This is a heuristic, not an exact measurement.
                    metrics['gpu_utilization_percent'] = min(metrics['memory_usage_percent'] * 1.2, 100)
                    metrics['memory_utilization_percent'] = metrics['memory_usage_percent']
                else:
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            return metrics

        except Exception as e:
            logger.debug(f"Error getting GPU utilization: {e}")
            return None

    def log_gpu_status(self, context: str = "") -> None:
        """
        Log current GPU status.

        Args:
            context: Optional context string to include in the log message
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return

        context_str = f"{context} - " if context else ""

        # Build log message
        parts = []
        if metrics.get('gpu_name'):
            parts.append(f"GPU: {metrics['gpu_name']}")
        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"Util: {metrics['gpu_utilization_percent']:.1f}%")
        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_total_gb'):
                mem_str += f"/{metrics['memory_total_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)
        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")
        if metrics.get('power_usage_watts') is not None:
            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")

        if parts:
            logger.info(f"{context_str}{', '.join(parts)}")

    def get_summary_string(self) -> str:
        """
        Get a summary string of the current GPU status.

        Returns:
            Formatted string with GPU metrics
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return "GPU monitoring not available"

        parts = []
        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"GPU: {metrics['gpu_utilization_percent']:.1f}%")
        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)
        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        return ", ".join(parts) if parts else "No metrics available"


# Global instance
_gpu_monitor = None


def get_gpu_monitor() -> GPUMonitor:
    """Get or create the global GPU monitor instance."""
    global _gpu_monitor
    if _gpu_monitor is None:
        _gpu_monitor = GPUMonitor()
    return _gpu_monitor
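

# A minimal usage sketch (not part of the monitoring API): poll the global
# monitor a few times and print one-line summaries. The polling interval and
# iteration count below are illustrative assumptions, not requirements.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    monitor = get_gpu_monitor()
    if not monitor.monitoring_enabled:
        print("GPU monitoring not available on this machine")
    else:
        for _ in range(3):
            # log_gpu_status() writes a detailed line via the module logger;
            # get_summary_string() returns a compact, human-readable summary.
            monitor.log_gpu_status(context="demo")
            print(monitor.get_summary_string())
            time.sleep(1.0)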