log GPU

2025-11-19 10:47:43 +02:00
parent 8ee8558829
commit df5f9b47f2
4 changed files with 303 additions and 16 deletions
--- a/utils/gpu_monitor.py
+++ b/utils/gpu_monitor.py
@@ -0,0 +1,248 @@
+"""
+GPU Utilization Monitor
+Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs
+"""
+
+import logging
+import time
+from typing import Dict, Optional, Any
+import platform
+
+logger = logging.getLogger(__name__)
+
+# Try to import GPU monitoring libraries
+try:
+    import torch
+    HAS_TORCH = True
+except ImportError:
+    HAS_TORCH = False
+    torch = None
+
+# Try NVIDIA Management Library (pynvml)
+try:
+    import pynvml
+    HAS_NVML = True
+except ImportError:
+    HAS_NVML = False
+    pynvml = None
+
+# Try GPUtil (alternative NVIDIA monitoring)
+try:
+    import GPUtil
+    HAS_GPUTIL = True
+except ImportError:
+    HAS_GPUTIL = False
+    GPUtil = None
+
+
+class GPUMonitor:
+    """Monitor GPU utilization and performance metrics.
+
+    Memory figures are read through ``torch.cuda.memory_allocated`` /
+    ``torch.cuda.memory_reserved``.  On NVIDIA hardware, compute utilization,
+    power and temperature come from NVML (preferred) or GPUtil.  When neither
+    source is usable (AMD, unknown vendor, missing libraries) the utilization
+    is only a rough guess derived from memory usage; such readings are marked
+    with ``utilization_estimated=True`` and rendered with an ``(est.)`` suffix.
+
+    Attributes:
+        monitoring_enabled: True when PyTorch reports an available GPU.
+        gpu_type: ``'nvidia'``, ``'amd'``, ``'unknown'``, or None when disabled.
+        device_id: Device index handed to PyTorch, NVML and GPUtil.
+        nvml_handle: NVML device handle, or None when NVML is not in use.
+    """
+
+    # Device-name substrings, consulted only when the PyTorch build itself
+    # does not reveal the vendor.
+    _NVIDIA_NAME_HINTS = ('nvidia', 'geforce', 'rtx', 'gtx')
+    _AMD_NAME_HINTS = ('amd', 'radeon')
+    _BYTES_PER_GB = 1024 ** 3
+
+    def __init__(self):
+        self.monitoring_enabled = False
+        self.gpu_type = None
+        self.device_id = 0
+        # Defined on every path so attribute access never raises, even when
+        # monitoring is disabled.
+        self.nvml_handle = None
+
+        if not (HAS_TORCH and torch.cuda.is_available()):
+            logger.debug("GPU monitoring disabled: No CUDA GPU available")
+            return
+
+        self.monitoring_enabled = True
+        self.gpu_type = self._detect_gpu_type(self.device_id)
+
+        if self.gpu_type == 'nvidia' and HAS_NVML:
+            try:
+                pynvml.nvmlInit()
+                # NOTE(review): NVML appears to enumerate physical devices
+                # regardless of CUDA_VISIBLE_DEVICES, so this index may not be
+                # PyTorch's device 0 on multi-GPU hosts -- confirm.
+                self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
+                logger.info("GPU monitoring initialized: NVIDIA GPU with NVML")
+            except Exception as e:
+                logger.debug(
+                    "NVML initialization failed: %s, will use PyTorch metrics only", e
+                )
+                self.nvml_handle = None
+
+    @classmethod
+    def _detect_gpu_type(cls, device_id: int) -> str:
+        """Return ``'nvidia'``, ``'amd'`` or ``'unknown'`` for *device_id*.
+
+        The PyTorch build is checked first: ROCm builds set
+        ``torch.version.hip`` and CUDA builds set ``torch.version.cuda``.
+        This also covers NVIDIA cards whose name lacks the usual keywords
+        (e.g. "Tesla ..." or "Quadro ...").  The device name is a fallback.
+        """
+        version = getattr(torch, 'version', None)
+        if getattr(version, 'hip', None):
+            return 'amd'
+        if getattr(version, 'cuda', None):
+            return 'nvidia'
+
+        try:
+            name = torch.cuda.get_device_name(device_id).lower()
+        except Exception:
+            return 'unknown'
+        if any(hint in name for hint in cls._NVIDIA_NAME_HINTS):
+            return 'nvidia'
+        if any(hint in name for hint in cls._AMD_NAME_HINTS):
+            return 'amd'
+        return 'unknown'
+
+    def get_gpu_utilization(self) -> Optional[Dict[str, Any]]:
+        """
+        Get current GPU utilization metrics
+
+        Returns:
+            Dictionary with GPU utilization metrics or None if not available.
+            ``utilization_estimated`` is True when ``gpu_utilization_percent``
+            is a memory-based guess rather than a measured value.
+        """
+        if not self.monitoring_enabled or not HAS_TORCH:
+            return None
+
+        try:
+            metrics: Dict[str, Any] = {
+                'device_id': self.device_id,
+                'gpu_type': self.gpu_type,
+                'timestamp': time.time(),
+                'utilization_estimated': False,
+            }
+
+            if torch.cuda.is_available():
+                self._add_torch_memory_metrics(metrics)
+
+            nvml_handle = getattr(self, 'nvml_handle', None)
+            if self.gpu_type == 'nvidia' and nvml_handle is not None:
+                self._add_nvml_metrics(metrics, nvml_handle)
+            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
+                self._add_gputil_metrics(metrics)
+            else:
+                self._add_estimated_metrics(metrics)
+
+            return metrics
+
+        except Exception as e:
+            logger.debug("Error getting GPU utilization: %s", e)
+            return None
+
+    def _add_torch_memory_metrics(self, metrics: Dict[str, Any]) -> None:
+        """Add allocator memory figures and the device name from PyTorch."""
+        gb = self._BYTES_PER_GB
+        allocated_gb = torch.cuda.memory_allocated(self.device_id) / gb
+        metrics['memory_allocated_gb'] = allocated_gb
+        metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / gb
+
+        try:
+            props = torch.cuda.get_device_properties(self.device_id)
+            total_gb = props.total_memory / gb
+            usage_percent = (allocated_gb / total_gb) * 100
+        except Exception:
+            total_gb = None
+            usage_percent = None
+        metrics['memory_total_gb'] = total_gb
+        metrics['memory_usage_percent'] = usage_percent
+
+        metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id)
+
+    def _add_nvml_metrics(self, metrics: Dict[str, Any], handle: Any) -> None:
+        """Add measured utilization, power and temperature from NVML."""
+        try:
+            util = pynvml.nvmlDeviceGetUtilizationRates(handle)
+            gpu_percent, memory_percent = util.gpu, util.memory
+        except Exception as e:
+            logger.debug("Failed to get NVML metrics: %s", e)
+            metrics['gpu_utilization_percent'] = None
+            metrics['memory_utilization_percent'] = None
+            return
+
+        metrics['gpu_utilization_percent'] = gpu_percent
+        metrics['memory_utilization_percent'] = memory_percent
+
+        # Power and temperature sensors are optional; a failure in one must
+        # not discard the utilization reading.
+        try:
+            # Convert mW to W
+            metrics['power_usage_watts'] = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0
+        except Exception:
+            metrics['power_usage_watts'] = None
+
+        try:
+            metrics['temperature_celsius'] = pynvml.nvmlDeviceGetTemperature(
+                handle, pynvml.NVML_TEMPERATURE_GPU
+            )
+        except Exception:
+            metrics['temperature_celsius'] = None
+
+    def _add_gputil_metrics(self, metrics: Dict[str, Any]) -> None:
+        """Add measured utilization and temperature from GPUtil."""
+        # Both keys are always present, whether or not the lookup succeeds.
+        metrics['gpu_utilization_percent'] = None
+        metrics['memory_utilization_percent'] = None
+        try:
+            gpus = GPUtil.getGPUs()
+            if gpus and len(gpus) > self.device_id:
+                gpu = gpus[self.device_id]
+                measured = {
+                    'gpu_utilization_percent': gpu.load * 100,
+                    'memory_utilization_percent': (gpu.memoryUsed / gpu.memoryTotal) * 100,
+                    'temperature_celsius': gpu.temperature,
+                }
+                metrics.update(measured)
+        except Exception as e:
+            logger.debug("Failed to get GPUtil metrics: %s", e)
+
+    @staticmethod
+    def _add_estimated_metrics(metrics: Dict[str, Any]) -> None:
+        """Add a memory-based utilization guess when no vendor API is usable.
+
+        This is a heuristic, not a measurement: real compute utilization
+        requires vendor-specific APIs.
+        """
+        usage_percent = metrics.get('memory_usage_percent')
+        if usage_percent is None:
+            metrics['gpu_utilization_percent'] = None
+            metrics['memory_utilization_percent'] = None
+            return
+
+        metrics['gpu_utilization_percent'] = min(usage_percent * 1.2, 100)
+        metrics['memory_utilization_percent'] = usage_percent
+        metrics['utilization_estimated'] = True
+
+    @staticmethod
+    def _format_utilization(metrics: Dict[str, Any]) -> Optional[str]:
+        """Return e.g. ``'42.0%'`` (``' (est.)'`` appended for guesses) or None."""
+        value = metrics.get('gpu_utilization_percent')
+        if value is None:
+            return None
+        suffix = " (est.)" if metrics.get('utilization_estimated') else ""
+        return f"{value:.1f}%{suffix}"
+
+    @staticmethod
+    def _format_memory(metrics: Dict[str, Any], include_total: bool) -> Optional[str]:
+        """Return the ``Mem: ...`` fragment, or None without allocation data."""
+        allocated_gb = metrics.get('memory_allocated_gb')
+        if allocated_gb is None:
+            return None
+        text = f"Mem: {allocated_gb:.2f}GB"
+        total_gb = metrics.get('memory_total_gb')
+        if include_total and total_gb is not None:
+            text += f"/{total_gb:.2f}GB"
+        # Compare against None so a legitimate 0.0% is still shown.
+        usage_percent = metrics.get('memory_usage_percent')
+        if usage_percent is not None:
+            text += f" ({usage_percent:.1f}%)"
+        return text
+
+    def log_gpu_status(self, context: str = "") -> None:
+        """
+        Log current GPU status
+
+        Args:
+            context: Optional context string to include in log message
+        """
+        metrics = self.get_gpu_utilization()
+        if not metrics:
+            return
+
+        parts = []
+
+        if metrics.get('gpu_name'):
+            parts.append(f"GPU: {metrics['gpu_name']}")
+
+        utilization = self._format_utilization(metrics)
+        if utilization is not None:
+            parts.append(f"Util: {utilization}")
+
+        memory = self._format_memory(metrics, include_total=True)
+        if memory is not None:
+            parts.append(memory)
+
+        if metrics.get('temperature_celsius') is not None:
+            parts.append(f"Temp: {metrics['temperature_celsius']}C")
+
+        if metrics.get('power_usage_watts') is not None:
+            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")
+
+        if parts:
+            context_str = f"{context} - " if context else ""
+            logger.info("%s%s", context_str, ', '.join(parts))
+
+    def get_summary_string(self) -> str:
+        """
+        Get a summary string of current GPU status
+
+        Returns:
+            Formatted string with GPU metrics
+        """
+        metrics = self.get_gpu_utilization()
+        if not metrics:
+            return "GPU monitoring not available"
+
+        parts = []
+
+        utilization = self._format_utilization(metrics)
+        if utilization is not None:
+            parts.append(f"GPU: {utilization}")
+
+        memory = self._format_memory(metrics, include_total=False)
+        if memory is not None:
+            parts.append(memory)
+
+        if metrics.get('temperature_celsius') is not None:
+            parts.append(f"Temp: {metrics['temperature_celsius']}C")
+
+        return ", ".join(parts) if parts else "No metrics available"
+
+
+# Module-wide monitor, created on first request
+_gpu_monitor = None
+
+def get_gpu_monitor() -> GPUMonitor:
+    """Return the shared GPUMonitor, constructing it on the first call."""
+    global _gpu_monitor
+    if _gpu_monitor is not None:
+        return _gpu_monitor
+    _gpu_monitor = GPUMonitor()
+    return _gpu_monitor
+