diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py
index a16dca3..e605d24 100644
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
                 logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
                 logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
                 logger.info(f" Pre-moving batches to GPU for maximum efficiency")
+
+                # Log initial GPU status
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status("Initial GPU status")
+                except Exception as e:
+                    logger.debug(f"GPU monitor not available: {e}")

            # Convert and move batches to GPU immediately
            cached_batches = []
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:

            # Log GPU status at start of epoch
            if use_gpu and torch.cuda.is_available():
-                # Use CUDA device (0) for memory stats, not the device variable
-                mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
-                mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
-                logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+                # Use GPU monitor for detailed metrics
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
+                except Exception as e:
+                    # Fallback to basic memory stats if monitor not available
+                    logger.debug(f"GPU monitor not available: {e}")
+                    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+                    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
+                    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

            # MEMORY FIX: Aggressive cleanup before epoch
            gc.collect()
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
                        denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
                        denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

+                    # Get GPU utilization during training
+                    gpu_info = ""
+                    if use_gpu and torch.cuda.is_available():
+                        try:
+                            from utils.gpu_monitor import get_gpu_monitor
+                            gpu_monitor = get_gpu_monitor()
+                            gpu_summary = gpu_monitor.get_summary_string()
+                            if gpu_summary != "GPU monitoring not available":
+                                gpu_info = f" | {gpu_summary}"
+                        except Exception:
+                            pass # GPU monitoring optional
+
                    logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
                                f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
                else:
                    logger.warning(f" Batch {i + 1} returned None result - skipping")

@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
            # Check memory usage
            log_memory_usage(f" Epoch {epoch + 1} end - ")

+            # Log GPU status at end of epoch
+            if use_gpu and torch.cuda.is_available():
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
+                except Exception:
+                    pass # GPU monitoring optional
+
            logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

            session.final_loss = session.current_loss
diff --git a/requirements.txt b/requirements.txt
index 71b91e9..98392d5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,6 @@ seaborn>=0.12.0
 ta>=0.11.0
 ccxt>=4.0.0
 dash-bootstrap-components>=2.0.0
-asyncio-compat>=0.1.2
 wandb>=0.16.0
 pybit>=5.11.0
 requests>=2.31.0
@@ -38,6 +37,10 @@ duckdb>=0.9.0
 Flask>=3.0.0
 flask-cors>=4.0.0

+# Optional GPU monitoring (for NVIDIA GPUs - install if you want detailed GPU utilization metrics)
+pynvml>=11.5.0 # NVIDIA Management Library for GPU utilization monitoring
+# GPUtil>=1.4.0 # Alternative GPU monitoring library
+
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling wrong GPU deps.
 # We only need torch (not torchvision/torchaudio) for trading systems.
 #
diff --git a/utils/gpu_monitor.py b/utils/gpu_monitor.py
new file mode 100644
index 0000000..d4b0146
--- /dev/null
+++ b/utils/gpu_monitor.py
@@ -0,0 +1,248 @@
+"""
+GPU Utilization Monitor
+Provides real-time GPU utilization metrics for NVIDIA and AMD GPUs
+"""
+
+import logging
+import time
+from typing import Dict, Optional, Any
+import platform
+
+logger = logging.getLogger(__name__)
+
+# Try to import GPU monitoring libraries
+try:
+    import torch
+    HAS_TORCH = True
+except ImportError:
+    HAS_TORCH = False
+    torch = None
+
+# Try NVIDIA Management Library (pynvml)
+try:
+    import pynvml
+    HAS_NVML = True
+except ImportError:
+    HAS_NVML = False
+    pynvml = None
+
+# Try GPUtil (alternative NVIDIA monitoring)
+try:
+    import GPUtil
+    HAS_GPUTIL = True
+except ImportError:
+    HAS_GPUTIL = False
+    GPUtil = None
+
+
+class GPUMonitor:
+    """Monitor GPU utilization and performance metrics"""
+
+    def __init__(self):
+        self.monitoring_enabled = False
+        self.gpu_type = None
+        self.device_id = 0
+
+        # Initialize monitoring based on available libraries
+        if HAS_TORCH and torch.cuda.is_available():
+            self.monitoring_enabled = True
+            self.device_id = 0
+
+            # Try to determine GPU vendor
+            try:
+                gpu_name = torch.cuda.get_device_name(0)
+                if 'nvidia' in gpu_name.lower() or 'geforce' in gpu_name.lower() or 'rtx' in gpu_name.lower() or 'gtx' in gpu_name.lower():
+                    self.gpu_type = 'nvidia'
+                elif 'amd' in gpu_name.lower() or 'radeon' in gpu_name.lower():
+                    self.gpu_type = 'amd'
+                else:
+                    self.gpu_type = 'unknown'
+            except Exception:
+                self.gpu_type = 'unknown'
+
+            # Initialize NVIDIA monitoring if available
+            if self.gpu_type == 'nvidia' and HAS_NVML:
+                try:
+                    pynvml.nvmlInit()
+                    self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
+                    logger.info("GPU monitoring initialized: NVIDIA GPU with NVML")
+                except Exception as e:
+                    logger.debug(f"NVML initialization failed: {e}, will use PyTorch metrics only")
+                    self.nvml_handle = None
+            else:
+                self.nvml_handle = None
+        else:
+            logger.debug("GPU monitoring disabled: No CUDA GPU available")
+
+    def get_gpu_utilization(self) -> Optional[Dict[str, Any]]:
+        """
+        Get current GPU utilization metrics
+
+        Returns:
+            Dictionary with GPU utilization metrics or None if not available
+        """
+        if not self.monitoring_enabled or not HAS_TORCH:
+            return None
+
+        try:
+            metrics = {
+                'device_id': self.device_id,
+                'gpu_type': self.gpu_type,
+                'timestamp': time.time()
+            }
+
+            # Get GPU memory usage (always available via PyTorch)
+            if torch.cuda.is_available():
+                metrics['memory_allocated_gb'] = torch.cuda.memory_allocated(self.device_id) / 1024**3
+                metrics['memory_reserved_gb'] = torch.cuda.memory_reserved(self.device_id) / 1024**3
+
+                try:
+                    props = torch.cuda.get_device_properties(self.device_id)
+                    metrics['memory_total_gb'] = props.total_memory / 1024**3
+                    metrics['memory_usage_percent'] = (metrics['memory_allocated_gb'] / metrics['memory_total_gb']) * 100
+                except Exception:
+                    metrics['memory_total_gb'] = None
+                    metrics['memory_usage_percent'] = None
+
+                metrics['gpu_name'] = torch.cuda.get_device_name(self.device_id)
+
+            # Get GPU utilization percentage (NVIDIA only via NVML)
+            if self.gpu_type == 'nvidia' and self.nvml_handle is not None:
+                try:
+                    # Get utilization rates
+                    util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle)
+                    metrics['gpu_utilization_percent'] = util.gpu
+                    metrics['memory_utilization_percent'] = util.memory
+
+                    # Get power usage
+                    try:
+                        power = pynvml.nvmlDeviceGetPowerUsage(self.nvml_handle) / 1000.0 # Convert mW to W
+                        metrics['power_usage_watts'] = power
+                    except Exception:
+                        metrics['power_usage_watts'] = None
+
+                    # Get temperature
+                    try:
+                        temp = pynvml.nvmlDeviceGetTemperature(self.nvml_handle, pynvml.NVML_TEMPERATURE_GPU)
+                        metrics['temperature_celsius'] = temp
+                    except Exception:
+                        metrics['temperature_celsius'] = None
+
+                except Exception as e:
+                    logger.debug(f"Failed to get NVML metrics: {e}")
+                    metrics['gpu_utilization_percent'] = None
+                    metrics['memory_utilization_percent'] = None
+
+            # Fallback to GPUtil if NVML not available
+            elif self.gpu_type == 'nvidia' and HAS_GPUTIL:
+                try:
+                    gpus = GPUtil.getGPUs()
+                    if gpus and len(gpus) > self.device_id:
+                        gpu = gpus[self.device_id]
+                        metrics['gpu_utilization_percent'] = gpu.load * 100
+                        metrics['memory_utilization_percent'] = (gpu.memoryUsed / gpu.memoryTotal) * 100
+                        metrics['temperature_celsius'] = gpu.temperature
+                    else:
+                        metrics['gpu_utilization_percent'] = None
+                        metrics['memory_utilization_percent'] = None
+                except Exception as e:
+                    logger.debug(f"Failed to get GPUtil metrics: {e}")
+                    metrics['gpu_utilization_percent'] = None
+
+            # For AMD GPUs or when NVML/GPUtil not available, estimate utilization
+            # based on memory usage and activity
+            else:
+                # Estimate GPU utilization based on memory activity
+                # This is a rough estimate - actual GPU compute utilization requires vendor-specific APIs
+                if metrics.get('memory_usage_percent') is not None:
+                    # If memory is being used actively, GPU is likely active
+                    # This is a heuristic, not exact
+                    metrics['gpu_utilization_percent'] = min(metrics['memory_usage_percent'] * 1.2, 100)
+                    metrics['memory_utilization_percent'] = metrics['memory_usage_percent']
+                else:
+                    metrics['gpu_utilization_percent'] = None
+                    metrics['memory_utilization_percent'] = None
+
+            return metrics
+
+        except Exception as e:
+            logger.debug(f"Error getting GPU utilization: {e}")
+            return None
+
+    def log_gpu_status(self, context: str = "") -> None:
+        """
+        Log current GPU status
+
+        Args:
+            context: Optional context string to include in log message
+        """
+        metrics = self.get_gpu_utilization()
+        if not metrics:
+            return
+
+        context_str = f"{context} - " if context else ""
+
+        # Build log message
+        parts = []
+
+        if metrics.get('gpu_name'):
+            parts.append(f"GPU: {metrics['gpu_name']}")
+
+        if metrics.get('gpu_utilization_percent') is not None:
+            parts.append(f"Util: {metrics['gpu_utilization_percent']:.1f}%")
+
+        if metrics.get('memory_allocated_gb') is not None:
+            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
+            if metrics.get('memory_total_gb'):
+                mem_str += f"/{metrics['memory_total_gb']:.2f}GB"
+            if metrics.get('memory_usage_percent'):
+                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
+            parts.append(mem_str)
+
+        if metrics.get('temperature_celsius') is not None:
+            parts.append(f"Temp: {metrics['temperature_celsius']}C")
+
+        if metrics.get('power_usage_watts') is not None:
+            parts.append(f"Power: {metrics['power_usage_watts']:.1f}W")
+
+        if parts:
+            logger.info(f"{context_str}{', '.join(parts)}")
+
+    def get_summary_string(self) -> str:
+        """
+        Get a summary string of current GPU status
+
+        Returns:
+            Formatted string with GPU metrics
+        """
+        metrics = self.get_gpu_utilization()
+        if not metrics:
+            return "GPU monitoring not available"
+
+        parts = []
+
+        if metrics.get('gpu_utilization_percent') is not None:
+            parts.append(f"GPU: {metrics['gpu_utilization_percent']:.1f}%")
+
+        if metrics.get('memory_allocated_gb') is not None:
+            mem_str = f"Mem: {metrics['memory_allocated_gb']:.2f}GB"
+            if metrics.get('memory_usage_percent'):
+                mem_str += f" ({metrics['memory_usage_percent']:.1f}%)"
+            parts.append(mem_str)
+
+        if metrics.get('temperature_celsius') is not None:
+            parts.append(f"Temp: {metrics['temperature_celsius']}C")
+
+        return ", ".join(parts) if parts else "No metrics available"
+
+
+# Global instance
+_gpu_monitor = None
+
+def get_gpu_monitor() -> GPUMonitor:
+    """Get or create global GPU monitor instance"""
+    global _gpu_monitor
+    if _gpu_monitor is None:
+        _gpu_monitor = GPUMonitor()
+    return _gpu_monitor
+
diff --git a/web/clean_dashboard.py b/web/clean_dashboard.py
index a6ee9f9..04fb4e0 100644
--- a/web/clean_dashboard.py
+++ b/web/clean_dashboard.py
@@ -4533,8 +4533,8 @@ class CleanTradingDashboard:

                # Add high pivots for this level
                if highs_x:
-                    fig.add_trace(
-                        go.Scatter(
+                    fig.add_trace(
+                        go.Scatter(
                            x=highs_x, y=highs_y,
                            mode='markers',
                            name=f'L{level_num} Pivot High',
@@ -4547,14 +4547,14 @@ class CleanTradingDashboard:
                            ),
                            showlegend=(level_num == 1), # Only show legend for Level 1
                            hovertemplate=f"Level {level_num} High: ${{y:.2f}}"
-                        ),
-                        row=row, col=1
-                    )
+                        ),
+                        row=row, col=1
+                    )

                # Add low pivots for this level
                if lows_x:
-                    fig.add_trace(
-                        go.Scatter(
+                    fig.add_trace(
+                        go.Scatter(
                            x=lows_x, y=lows_y,
                            mode='markers',
                            name=f'L{level_num} Pivot Low',
@@ -4567,9 +4567,9 @@ class CleanTradingDashboard:
                            ),
                            showlegend=(level_num == 1), # Only show legend for Level 1
                            hovertemplate=f"Level {level_num} Low: ${{y:.2f}}"
-                        ),
-                        row=row, col=1
-                    )
+                        ),
+                        row=row, col=1
+                    )

            # Build external legend HTML (no annotation on chart to avoid scale distortion)
            legend_children = []
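
Note (illustrative, not part of the patch): the sketch below shows how the new `utils.gpu_monitor` API added above is intended to be consumed from a training loop, mirroring the calls introduced in `real_training_adapter.py`. The `run_training` wrapper and the `num_epochs` value are hypothetical stand-ins; only `get_gpu_monitor`, `log_gpu_status`, `get_summary_string`, and `get_gpu_utilization` come from the patch.

```python
# Minimal usage sketch for utils.gpu_monitor (assumes the module from this patch is importable).
import logging

from utils.gpu_monitor import get_gpu_monitor

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def run_training(num_epochs: int = 3) -> None:
    monitor = get_gpu_monitor()                    # cached global GPUMonitor instance
    monitor.log_gpu_status("Initial GPU status")   # logs name/util/memory/temp/power when available

    for epoch in range(num_epochs):
        # ... per-batch training would go here (hypothetical) ...

        # Compact one-line summary for batch/epoch log lines
        summary = monitor.get_summary_string()
        if summary != "GPU monitoring not available":
            logger.info(f"Epoch {epoch + 1}: {summary}")

        # Raw metrics dict (None when no CUDA device is present)
        metrics = monitor.get_gpu_utilization()
        if metrics and metrics.get("memory_usage_percent") is not None:
            logger.info(f"Memory usage: {metrics['memory_usage_percent']:.1f}%")

        monitor.log_gpu_status(f"Epoch {epoch + 1} end")


if __name__ == "__main__":
    run_training()
```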
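Note (illustrative, not part of the patch): a minimal standalone sketch for verifying the optional `pynvml>=11.5.0` dependency pinned in `requirements.txt`, using the same NVML calls that `GPUMonitor` relies on. It assumes an NVIDIA driver is installed and degrades gracefully otherwise, just as the monitor does.

```python
# Quick pynvml sanity check (NVIDIA only); safe to run on machines without a GPU.
import pynvml

try:
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)   # .gpu / .memory are percentages
    temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
    power_w = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000.0  # NVML reports milliwatts
    print(f"GPU util: {util.gpu}%, mem util: {util.memory}%, temp: {temp}C, power: {power_w:.1f}W")
except pynvml.NVMLError as e:
    # Expected when no NVIDIA driver is present; GPUMonitor then falls back to PyTorch memory stats.
    print(f"NVML not available: {e}")
finally:
    try:
        pynvml.nvmlShutdown()
    except pynvml.NVMLError:
        pass
```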