Author: Dobromir Popov
Date: 2025-11-19 10:47:43 +02:00
Parent: 8ee8558829
Commit: df5f9b47f2
4 changed files with 303 additions and 16 deletions


@@ -1800,6 +1800,14 @@ class RealTrainingAdapter:
             logger.info(f" GPU available: {torch.cuda.get_device_name(0)}")
             logger.info(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
             logger.info(f" Pre-moving batches to GPU for maximum efficiency")
+
+            # Log initial GPU status
+            try:
+                from utils.gpu_monitor import get_gpu_monitor
+                gpu_monitor = get_gpu_monitor()
+                gpu_monitor.log_gpu_status("Initial GPU status")
+            except Exception as e:
+                logger.debug(f"GPU monitor not available: {e}")

             # Convert and move batches to GPU immediately
             cached_batches = []
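All four hunks in this commit call into a `utils.gpu_monitor` helper, presumably one of the other changed files, whose contents are not shown here. Only the names used in the diff — `get_gpu_monitor()`, `log_gpu_status()`, and `get_summary_string()` — are taken from the commit; the rest of the sketch below is an assumption about what a minimal implementation backed by plain `torch.cuda` memory counters could look like (the real module may also report utilization via NVML).

# Hypothetical sketch of utils/gpu_monitor.py -- interface names from the diff,
# implementation assumed (torch.cuda memory stats, module-level singleton).
import logging

logger = logging.getLogger(__name__)

try:
    import torch
except ImportError:
    torch = None


class GPUMonitor:
    """Small helper that summarizes CUDA memory usage for log lines."""

    def get_summary_string(self) -> str:
        if torch is None or not torch.cuda.is_available():
            return "GPU monitoring not available"
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        total = torch.cuda.get_device_properties(0).total_memory / 1024**3
        return f"GPU Mem: {allocated:.2f}GB alloc, {reserved:.2f}GB reserved, {total:.1f}GB total"

    def log_gpu_status(self, prefix: str = "") -> None:
        summary = self.get_summary_string()
        logger.info(f"{prefix}: {summary}" if prefix else summary)


_monitor = None


def get_gpu_monitor() -> "GPUMonitor":
    """Return a shared GPUMonitor instance (assumed singleton pattern)."""
    global _monitor
    if _monitor is None:
        _monitor = GPUMonitor()
    return _monitor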
@@ -1929,10 +1937,17 @@ class RealTrainingAdapter:
             # Log GPU status at start of epoch
             if use_gpu and torch.cuda.is_available():
-                # Use CUDA device (0) for memory stats, not the device variable
-                mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
-                mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
-                logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")
+                # Use GPU monitor for detailed metrics
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1}/{session.total_epochs}")
+                except Exception as e:
+                    # Fallback to basic memory stats if monitor not available
+                    logger.debug(f"GPU monitor not available: {e}")
+                    mem_allocated = torch.cuda.memory_allocated(0) / 1024**3
+                    mem_reserved = torch.cuda.memory_reserved(0) / 1024**3
+                    logger.info(f" Epoch {epoch + 1}/{session.total_epochs} - GPU Memory: {mem_allocated:.2f}GB allocated, {mem_reserved:.2f}GB reserved")

             # MEMORY FIX: Aggressive cleanup before epoch
             gc.collect()
@@ -1980,9 +1995,21 @@ class RealTrainingAdapter:
                         denorm_values = [f"{tf}=${loss:.2f}" for tf, loss in batch_candle_loss_denorm.items()]
                         denorm_str = f", Real RMSE: {', '.join(denorm_values)}"

+                    # Get GPU utilization during training
+                    gpu_info = ""
+                    if use_gpu and torch.cuda.is_available():
+                        try:
+                            from utils.gpu_monitor import get_gpu_monitor
+                            gpu_monitor = get_gpu_monitor()
+                            gpu_summary = gpu_monitor.get_summary_string()
+                            if gpu_summary != "GPU monitoring not available":
+                                gpu_info = f" | {gpu_summary}"
+                        except Exception:
+                            pass  # GPU monitoring optional
+
                     logger.info(f" Batch {i + 1}/{total_batches}, Loss: {batch_loss:.6f}, "
                                 f"Candle Acc: {batch_accuracy:.1%}, Trend Acc: {batch_trend_accuracy:.1%}, "
-                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}")
+                                f"Action Acc: {batch_action_accuracy:.1%}{rmse_str}{denorm_str}{gpu_info}")
                 else:
                     logger.warning(f" Batch {i + 1} returned None result - skipping")
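The comment in the hunk above asks for GPU utilization, but the memory fallback paths only report allocated and reserved bytes, which is all plain `torch.cuda` counters provide; utilization percentages would have to come from NVML or `nvidia-smi`. The following is a hedged sketch of how a `get_summary_string()`-style helper could gather both, assuming the `pynvml` package — this is not taken from the commit.

# Hedged sketch: utilization + memory via NVML (pynvml is an assumption, not part of this commit).
def nvml_summary_string(index: int = 0) -> str:
    try:
        import pynvml
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(index)
        util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # .gpu and .memory are percentages
        mem = pynvml.nvmlDeviceGetMemoryInfo(handle)         # .used and .total are bytes
        return (f"GPU {util.gpu}% util | "
                f"Mem {mem.used / 1024**3:.2f}/{mem.total / 1024**3:.2f}GB")
    except Exception:
        # Mirrors the sentinel string checked in the batch-logging hunk above
        return "GPU monitoring not available"

Whatever string it returns would then flow straight into the `gpu_info` suffix appended to the batch log line above.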
@@ -2120,6 +2147,15 @@ class RealTrainingAdapter:
             # Check memory usage
             log_memory_usage(f" Epoch {epoch + 1} end - ")

+            # Log GPU status at end of epoch
+            if use_gpu and torch.cuda.is_available():
+                try:
+                    from utils.gpu_monitor import get_gpu_monitor
+                    gpu_monitor = get_gpu_monitor()
+                    gpu_monitor.log_gpu_status(f"Epoch {epoch + 1} end")
+                except Exception:
+                    pass  # GPU monitoring optional
+
             logger.info(f" Epoch {epoch + 1}/{session.total_epochs}, Loss: {avg_loss:.6f}, Accuracy: {avg_accuracy:.2%} ({num_batches} batches)")

             session.final_loss = session.current_loss