log GPU
@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
 logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
 logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
 logger.info(f" Pre-moving batches to GPU for maximum efficiency")

+# Log initial GPU status
+try:
+    from utils.gpu_monitor import get_gpu_monitor
+    gpu_monitor = get_gpu_monitor()
+    gpu_monitor.log_gpu_status("Initial GPU status")
+except Exception as e:
+    logger.debug(f"GPU monitor not available: {e}")
+
 # Convert and move batches to GPU immediately
 cached_batches = []
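For context, "pre-moving batches to GPU" here means converting every prepared batch to device tensors once, before the epoch loop, so the same tensors are reused each epoch without per-step host-to-device copies. A minimal sketch of that idea, not part of this commit: the names prepared_batches and the toy tensors are illustrative, and the adapter's actual conversion code sits outside this hunk.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Illustrative stand-in for the adapter's prepared training data: dicts of CPU tensors.
prepared_batches = [{"features": torch.randn(32, 10), "labels": torch.randint(0, 3, (32,))}]

cached_batches = []
for cpu_batch in prepared_batches:
    gpu_batch = {
        key: value.to(device, non_blocking=True) if torch.is_tensor(value) else value
        for key, value in cpu_batch.items()
    }
    cached_batches.append(gpu_batch)  # reused every epoch with no further copies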
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:

 # Log GPU status at start of epoch
 if use_gpu and torch.cuda.is_available():
-    # Use CUDA device (0) for memory stats, not the device variable
-    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
-    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
-    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+    # Use GPU monitor for detailed metrics
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
+    except Exception as e:
+        # Fallback to basic memory stats if monitor not available
+        logger.debug(f"GPU monitor not available: {e}")
+        mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+        mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
+        logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

 # MEMORY FIX: Aggressive cleanup before epoch
 gc.collect()
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
 denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
 denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

+# Get GPU utilization during training
+gpu_info = ""
+if use_gpu and torch.cuda.is_available():
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_summary = gpu_monitor.get_summary_string()
+        if gpu_summary != "GPU monitoring not available":
+            gpu_info = f" | {gpu_summary}"
+    except Exception:
+        pass  # GPU monitoring optional
+
 logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
             f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-            f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+            f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
 else:
     logger.warning(f" Batch {i + 1} returned None result - skipping")
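With the monitor available, each per-batch log line now ends with the short summary returned by get_summary_string(), joined with " | ". A line that previously ended with the RMSE figures would therefore carry a suffix such as " | GPU: 87.0%, Mem: 5.12GB (42.7%), Temp: 63C" (the values here are purely illustrative; the format comes from get_summary_string in utils/gpu_monitor.py below).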
@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
 # Check memory usage
 log_memory_usage(f" Epoch {epoch + 1} end - ")

+# Log GPU status at end of epoch
+if use_gpu and torch.cuda.is_available():
+    try:
+        from utils.gpu_monitor import get_gpu_monitor
+        gpu_monitor = get_gpu_monitor()
+        gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
+    except Exception:
+        pass  # GPU monitoring optional
+
 logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

 session.final_loss = session.current_loss
@@ -13,7 +13,6 @@ seaborn>=0.12.0
 ta>=0.11.0
 ccxt>=4.0.0
 dash-bootstrap-components>=2.0.0
-asyncio-compat>=0.1.2
 wandb>=0.16.0
 pybit>=5.11.0
 requests>=2.31.0
@@ -38,6 +37,10 @@ duckdb>=0.9.0
 Flask>=3.0.0
 flask-cors>=4.0.0

+# Optional GPU monitoring (for NVIDIA GPUs - install if you want detailed GPU utilization metrics)
+pynvml>=11.5.0  # NVIDIA Management Library for GPU utilization monitoring
+# GPUtil>=1.4.0  # Alternative GPU monitoring library
+
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling wrong GPU deps.
 # We only need torch (not torchvision/torchaudio) for trading systems.
 #
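After installing pynvml, a quick sanity check (not part of this commit; device index 0 is assumed, matching the monitor below) confirms that the NVML bindings can reach the GPU:

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu and .memory are percentages
mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .used / .total are in bytes
print(f"GPU util: {util.gpu}%, VRAM: {mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f} GB")
pynvml.nvmlShutdown()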
utils/gpu_monitor.py (new file, 248 lines)
@@ -0,0 +1,248 @@
"""
GPU Utilization Monitor
Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs
"""

import logging
import time
from typing import Dict, Optional, Any
import platform

logger = logging.getLogger(__name__)

# Try to import GPU monitoring libraries
try:
    import torch
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False
    torch = None

# Try NVIDIA Management Library (pynvml)
try:
    import pynvml
    HAS_NVML = True
except ImportError:
    HAS_NVML = False
    pynvml = None

# Try GPUtil (alternative NVIDIA monitoring)
try:
    import GPUtil
    HAS_GPUTIL = True
except ImportError:
    HAS_GPUTIL = False
    GPUtil = None

class GPUMonitor:
    """Monitor GPU utilization and performance metrics"""

    def __init__(self):
        self.monitoring_enabled = False
        self.gpu_type = None
        self.device_id = 0

        # Initialize monitoring based on available libraries
        if HAS_TORCH and torch.cuda.is_available():
            self.monitoring_enabled = True
            self.device_id = 0

            # Try to determine GPU vendor
            try:
                gpu_name = torch.cuda.get_device_name(0)
                if 'nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower() or 'rtx' in gpu_name.lower() or 'gtx' in gpu_name.lower():
                    self.gpu_type = 'nvidia'
                elif 'amd' in gpu_name.lower() or 'radeon' in gpu_name.lower():
                    self.gpu_type = 'amd'
                else:
                    self.gpu_type = 'unknown'
            except Exception:
                self.gpu_type = 'unknown'

            # Initialize NVIDIA monitoring if available
            if self.gpu_type == 'nvidia' and HAS_NVML:
                try:
                    pynvml.nvmlInit()
                    self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
                    logger.info("GPU monitoring initialized: NVIDIA GPU with NVML")
                except Exception as e:
                    logger.debug(f"NVML initialization failed: {e}, will use PyTorch metrics only")
                    self.nvml_handle = None
            else:
                self.nvml_handle = None
        else:
            logger.debug("GPU monitoring disabled: No CUDA GPU available")

    def get_gpu_utilization(self) -> Optional[Dict[str, Any]]:
        """
        Get current GPU utilization metrics

        Returns:
            Dictionary with GPU utilization metrics or None if not available
        """
        if not self.monitoring_enabled or not HAS_TORCH:
            return None

        try:
            metrics = {
                'device_id': self.device_id,
                'gpu_type': self.gpu_type,
                'timestamp': time.time()
            }

            # Get GPU memory usage (always available via PyTorch)
            if torch.cuda.is_available():
                metrics['memory_allocated_gb'] = torch.cuda.memory_allocated(self.device_id) / 1024**3
                metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / 1024**3

                try:
                    props = torch.cuda.get_device_properties(self.device_id)
                    metrics['memory_total_gb'] = props.total_memory / 1024**3
                    metrics['memory_usage_percent'] = (metrics['memory_allocated_gb'] / metrics['memory_total_gb']) * 100
                except Exception:
                    metrics['memory_total_gb'] = None
                    metrics['memory_usage_percent'] = None

                metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id)

            # Get GPU utilization percentage (NVIDIA only via NVML)
            if self.gpu_type == 'nvidia' and self.nvml_handle is not None:
                try:
                    # Get utilization rates
                    util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle)
                    metrics['gpu_utilization_percent'] = util.gpu
                    metrics['memory_utilization_percent'] = util.memory

                    # Get power usage
                    try:
                        power = pynvml.nvmlDeviceGetPowerUsage(self.nvml_handle) / 1000.0  # Convert mW to W
                        metrics['power_usage_watts'] = power
                    except Exception:
                        metrics['power_usage_watts'] = None

                    # Get temperature
                    try:
                        temp = pynvml.nvmlDeviceGetTemperature(self.nvml_handle, pynvml.NVML_TEMPERATURE_GPU)
                        metrics['temperature_celsius'] = temp
                    except Exception:
                        metrics['temperature_celsius'] = None

                except Exception as e:
                    logger.debug(f"Failed to get NVML metrics: {e}")
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            # Fallback to GPUtil if NVML not available
            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus and len(gpus) > self.device_id:
                        gpu = gpus[self.device_id]
                        metrics['gpu_utilization_percent'] = gpu.load * 100
                        metrics['memory_utilization_percent'] = (gpu.memoryUsed / gpu.memoryTotal) * 100
                        metrics['temperature_celsius'] = gpu.temperature
                    else:
                        metrics['gpu_utilization_percent'] = None
                        metrics['memory_utilization_percent'] = None
                except Exception as e:
                    logger.debug(f"Failed to get GPUtil metrics: {e}")
                    metrics['gpu_utilization_percent'] = None

            # For AMD GPUs or when NVML/GPUtil not available, estimate utilization
            # based on memory usage and activity
            else:
                # Estimate GPU utilization based on memory activity
                # This is a rough estimate - actual GPU compute utilization requires vendor-specific APIs
                if metrics.get('memory_usage_percent') is not None:
                    # If memory is being used actively, GPU is likely active
                    # This is a heuristic, not exact
                    metrics['gpu_utilization_percent'] = min(metrics['memory_usage_percent'] * 1.2, 100)
                    metrics['memory_utilization_percent'] = metrics['memory_usage_percent']
                else:
                    metrics['gpu_utilization_percent'] = None
                    metrics['memory_utilization_percent'] = None

            return metrics

        except Exception as e:
            logger.debug(f"Error getting GPU utilization: {e}")
            return None

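Note that the non-NVIDIA fallback above only infers utilization from memory pressure: with, say, 50% of VRAM allocated it reports min(50 * 1.2, 100) = 60%, and the figure saturates at 100% once allocation passes roughly 83%. It tracks memory residency rather than actual compute load, which is why the NVML path is preferred on NVIDIA hardware.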
    def log_gpu_status(self, context: str = "") -> None:
        """
        Log current GPU status

        Args:
            context: Optional context string to include in log message
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return

        context_str = f"{context} - " if context else ""

        # Build log message
        parts = []

        if metrics.get('gpu_name'):
            parts.append(f"GPU: {metrics['gpu_name']}")

        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"Util: {metrics['gpu_utilization_percent']:.1f}%")

        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_total_gb'):
                mem_str += f"/{metrics['memory_total_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)

        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        if metrics.get('power_usage_watts') is not None:
            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")

        if parts:
            logger.info(f"{context_str}{', '.join(parts)}")

    def get_summary_string(self) -> str:
        """
        Get a summary string of current GPU status

        Returns:
            Formatted string with GPU metrics
        """
        metrics = self.get_gpu_utilization()
        if not metrics:
            return "GPU monitoring not available"

        parts = []

        if metrics.get('gpu_utilization_percent') is not None:
            parts.append(f"GPU: {metrics['gpu_utilization_percent']:.1f}%")

        if metrics.get('memory_allocated_gb') is not None:
            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
            if metrics.get('memory_usage_percent'):
                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
            parts.append(mem_str)

        if metrics.get('temperature_celsius') is not None:
            parts.append(f"Temp: {metrics['temperature_celsius']}C")

        return ", ".join(parts) if parts else "No metrics available"

# Global instance
_gpu_monitor = None


def get_gpu_monitor() -> GPUMonitor:
    """Get or create global GPU monitor instance"""
    global _gpu_monitor
    if _gpu_monitor is None:
        _gpu_monitor = GPUMonitor()
    return _gpu_monitor
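Outside the training adapter, the same singleton can be used directly. A minimal sketch, assuming the module is importable as utils.gpu_monitor exactly as in the hunks above (logging is configured here only so the INFO line is visible):

import logging
logging.basicConfig(level=logging.INFO)

from utils.gpu_monitor import get_gpu_monitor

monitor = get_gpu_monitor()
monitor.log_gpu_status("Standalone check")  # one INFO line: name, utilization, memory, temp, power
print(monitor.get_summary_string())         # short form appended to the per-batch training log
print(monitor.get_gpu_utilization())        # raw metrics dict, or None when no CUDA GPU is present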
@@ -4533,8 +4533,8 @@ class CleanTradingDashboard:

 # Add high pivots for this level
 if highs_x:
     fig.add_trace(
         go.Scatter(
             x=highs_x, y=highs_y,
             mode='markers',
             name=f'L{level_num} Pivot High',
@@ -4547,14 +4547,14 @@ class CleanTradingDashboard:
             ),
             showlegend=(level_num == 1),  # Only show legend for Level 1
             hovertemplate=f"Level {level_num} High: ${{y:.2f}}<extra></extra>"
         ),
         row=row, col=1
     )

 # Add low pivots for this level
 if lows_x:
     fig.add_trace(
         go.Scatter(
             x=lows_x, y=lows_y,
             mode='markers',
             name=f'L{level_num} Pivot Low',
@@ -4567,9 +4567,9 @@ class CleanTradingDashboard:
             ),
             showlegend=(level_num == 1),  # Only show legend for Level 1
             hovertemplate=f"Level {level_num} Low: ${{y:.2f}}<extra></extra>"
         ),
         row=row, col=1
     )

 # Build external legend HTML (no annotation on chart to avoid scale distortion)
 legend_children = []