# gogo2/overnight_training_monitor.py

#!/usr/bin/env python3
"""
Overnight Training Monitor - 504M Parameter Massive Model
================================================================================

Comprehensive monitoring system for the overnight RL training session with:
- 504.89 Million parameter Enhanced CNN + DQN Agent
- 4GB VRAM utilization
- Real-time performance tracking
- Automated model checkpointing
- Training analytics and reporting
- Memory usage optimization
- Profit maximization metrics

Run this script to monitor the entire overnight training session.
"""

import json
import logging
import subprocess
import time
from datetime import datetime, timedelta
from pathlib import Path
from threading import Thread
from typing import Any, Dict, List, Optional

import GPUtil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import torch

# Logging setup: every run writes to its own timestamped file under
# logs/overnight_training and mirrors the same records to the console.
log_dir = Path("logs/overnight_training")
log_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Messages in this module contain emoji; an explicit UTF-8 encoding keeps
        # the file handler from failing where the locale encoding cannot represent them.
        logging.FileHandler(
            log_dir / f"overnight_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)

class OvernightTrainingMonitor:
    """Comprehensive overnight training monitor for massive 504M parameter model"""

    def __init__(self):
        """Create the monitor's in-memory state and announce the session in the log.

        Nothing is measured here: the model figures are fixed constants and every
        metric store starts at zero or empty.
        """
        self.start_time = datetime.now()
        self.monitoring = True  # the monitoring loop runs while this stays True

        # Fixed description of the model under training (reported, never recomputed)
        self.model_specs = dict(
            total_parameters=504_889_098,
            enhanced_cnn_params=168_296_366,
            dqn_agent_params=336_592_732,
            memory_usage_mb=1926.7,
            target_vram_gb=4.0,
            architecture='Massive Enhanced CNN + DQN Agent',
        )

        # Scalar training statistics, each starting from a neutral value
        self.training_metrics = dict(
            episodes_completed=0,
            total_reward=0.0,
            best_reward=float('-inf'),
            average_reward=0.0,
            win_rate=0.0,
            total_trades=0,
            profit_factor=0.0,
            sharpe_ratio=0.0,
            max_drawdown=0.0,
            final_balance=0.0,
            training_loss=0.0,
        )

        # One growing sample list per system resource
        self.system_metrics = {
            series: []
            for series in (
                'cpu_usage',
                'memory_usage',
                'gpu_usage',
                'gpu_memory',
                'disk_io',
                'network_io',
            )
        }

        # Status snapshots and observed checkpoint timestamps
        self.performance_history = []
        self.checkpoint_times = []

        # Simulated account figures (500x leverage)
        opening_balance = 10000.0
        self.profit_metrics = dict(
            starting_balance=opening_balance,
            current_balance=opening_balance,
            total_pnl=0.0,
            realized_pnl=0.0,
            unrealized_pnl=0.0,
            leverage=500,
            fees_paid=0.0,
            roi_percentage=0.0,
        )

        specs = self.model_specs
        logger.info("🚀 OVERNIGHT TRAINING MONITOR INITIALIZED")
        logger.info(f"📊 Model: {specs['total_parameters']:,} parameters")
        logger.info(f"💾 Memory: {specs['memory_usage_mb']:.1f} MB")
        logger.info(f"🎯 Target VRAM: {specs['target_vram_gb']} GB")
        logger.info(f"⚡ Leverage: {self.profit_metrics['leverage']}x")

    def check_system_resources(self) -> Dict:
        """Take one snapshot of CPU, RAM, GPU, disk and network usage.

        Blocks for about one second because ``psutil.cpu_percent`` is sampled
        with ``interval=1``.

        Returns:
            A dict with the keys ``timestamp``, ``cpu_usage``, ``memory_percent``,
            ``memory_used_gb``, ``memory_total_gb``, ``gpu_usage``,
            ``gpu_memory_used_gb``, ``gpu_memory_total_gb``, ``gpu_memory_percent``,
            ``disk_read_gb``, ``disk_write_gb``, ``network_sent_gb`` and
            ``network_recv_gb``; or an empty dict if sampling failed (the error
            is logged, never raised).
        """
        bytes_per_gb = 1024 ** 3

        try:
            # CPU and memory
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            memory_percent = memory.percent
            memory_used_gb = memory.used / bytes_per_gb
            memory_total_gb = memory.total / bytes_per_gb

            # GPU defaults used when CUDA is unavailable
            gpu_usage = 0
            gpu_memory_used = 0
            gpu_memory_total = 0

            if torch.cuda.is_available():
                gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / bytes_per_gb
                # Fallback figure: memory_allocated() only counts tensors owned by
                # THIS process, so it misses VRAM held by a separate training process.
                gpu_memory_used = torch.cuda.memory_allocated() / bytes_per_gb

                # Prefer GPUtil's device-wide numbers when they can be obtained.
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus:
                        gpu_usage = gpus[0].load * 100
                        # GPUtil reports memory in MB; convert to GB.
                        gpu_memory_used = gpus[0].memoryUsed / 1024
                except Exception as gpu_error:
                    # Best effort: keep the torch-based figures and 0% utilisation.
                    logger.debug(f"GPUtil query failed: {gpu_error}")
                    gpu_usage = 0

            # Cumulative disk and network counters (may be None on some systems)
            disk_io = psutil.disk_io_counters()
            network_io = psutil.net_io_counters()

            system_info = {
                'timestamp': datetime.now(),
                'cpu_usage': cpu_percent,
                'memory_percent': memory_percent,
                'memory_used_gb': memory_used_gb,
                'memory_total_gb': memory_total_gb,
                'gpu_usage': gpu_usage,
                'gpu_memory_used_gb': gpu_memory_used,
                'gpu_memory_total_gb': gpu_memory_total,
                'gpu_memory_percent': (gpu_memory_used / gpu_memory_total * 100) if gpu_memory_total > 0 else 0,
                'disk_read_gb': disk_io.read_bytes / bytes_per_gb if disk_io else 0,
                'disk_write_gb': disk_io.write_bytes / bytes_per_gb if disk_io else 0,
                'network_sent_gb': network_io.bytes_sent / bytes_per_gb if network_io else 0,
                'network_recv_gb': network_io.bytes_recv / bytes_per_gb if network_io else 0
            }

            return system_info

        except Exception as e:
            logger.error(f"Error checking system resources: {e}")
            return {}

    def _parse_training_metrics(self) -> Dict[str, Any]:
        """Extract metrics from the tail of ``logs/trading.log``.

        Only the last 100 lines of the log are examined; no values are
        synthesised.

        Returns:
            The dict produced by ``_extract_real_metrics`` when it holds at
            least one metric besides the ``timestamp``/``data_source``
            bookkeeping keys; otherwise the result of ``_default_metrics()``.
            Errors are logged and also yield the default metrics.
        """
        from collections import deque  # function-scope stdlib import

        tail_size = 100
        bookkeeping_keys = ('timestamp', 'data_source')

        try:
            training_log_path = Path("logs/trading.log")
            if not training_log_path.exists():
                logger.warning("⚠️ No training log found - metrics unavailable")
                return self._default_metrics()

            # Explicit UTF-8 with replacement so a locale-dependent default codec
            # cannot fail on non-ASCII log content. A bounded deque keeps only the
            # tail instead of loading the entire log into memory.
            with open(training_log_path, 'r', encoding='utf-8', errors='replace') as f:
                recent_lines = list(deque(f, maxlen=tail_size))

            real_metrics = self._extract_real_metrics(recent_lines)

            # Bookkeeping keys alone do not count as parsed metrics.
            found = [key for key in real_metrics if key not in bookkeeping_keys]
            if found:
                logger.info(f"✅ Parsed {len(found)} real training metrics")
                return real_metrics

            logger.warning("⚠️ No real metrics found in logs")
            return self._default_metrics()

        except Exception as e:
            logger.error(f"❌ Error parsing real training metrics: {e}")
            return self._default_metrics()

    def _extract_real_metrics(self, log_lines: List[str]) -> Dict[str, Any]:
        """Pull loss, trade and PnL figures out of raw training-log lines.

        Recognised line shapes:
          * loss: a line containing ``loss:`` (any case) or ``Loss``, followed
            by a number, e.g. ``Loss: 1.5e-05``;
          * trade: a line containing both ``TRADE`` and ``OPENED``;
          * PnL: a line containing ``PnL:`` followed by an amount such as
            ``$5.00``, ``-$5.00``, ``$-5.00`` or ``$1,250.50``.

        Args:
            log_lines: Log lines to scan, in file order.

        Returns:
            A dict holding any of ``current_loss`` (mean of the losses seen),
            ``loss_trend``, ``trades_per_hour`` (count of trade-opened lines in
            the given lines, not normalised by time), ``total_pnl``,
            ``avg_pnl`` and ``win_rate`` (fraction of PnL values above zero),
            plus ``timestamp`` and ``data_source='real_training_logs'``.
            An empty dict is returned when nothing was found or on error.
        """
        import re  # imported once for the whole function so every branch can use it

        # Decimal or scientific notation; a plain digit run would cut "1.5e-05" to 1.5.
        number = r'(?:\d[\d,]*(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?'
        loss_pattern = re.compile(r'loss[:\s]+([+-]?' + number + r')', re.IGNORECASE)
        # The sign may sit before or after the currency symbol.
        pnl_pattern = re.compile(r'PnL[:\s]+([+-]?)\$?([+-]?' + number + r')')

        metrics = {}

        try:
            loss_values = []
            pnl_values = []
            trades_opened = 0

            for line in log_lines:
                if "loss:" in line.lower() or "Loss" in line:
                    loss_match = loss_pattern.search(line)
                    if loss_match:
                        loss_values.append(float(loss_match.group(1).replace(',', '')))

                if "TRADE" in line and "OPENED" in line:
                    trades_opened += 1

                if "PnL:" in line:
                    pnl_match = pnl_pattern.search(line)
                    if pnl_match:
                        amount = float(pnl_match.group(2).replace(',', ''))
                        if pnl_match.group(1) == '-':
                            amount = -amount
                        pnl_values.append(amount)

            if loss_values:
                metrics['current_loss'] = sum(loss_values) / len(loss_values)
                falling = len(loss_values) > 1 and loss_values[-1] < loss_values[0]
                metrics['loss_trend'] = 'decreasing' if falling else 'stable'

            if trades_opened:
                metrics['trades_per_hour'] = trades_opened

            if pnl_values:
                metrics['total_pnl'] = sum(pnl_values)
                metrics['avg_pnl'] = sum(pnl_values) / len(pnl_values)
                metrics['win_rate'] = sum(1 for p in pnl_values if p > 0) / len(pnl_values)

            # Stay empty when nothing was parsed, so callers can tell
            # "no data" apart from real data.
            if not metrics:
                return {}

            metrics['timestamp'] = datetime.now()
            metrics['data_source'] = 'real_training_logs'
            return metrics

        except Exception as e:
            logger.error(f"❌ Error extracting real metrics: {e}")
            return {}

    def _default_metrics(self) -> Dict[str, Any]:
        """Build the zero-valued placeholder used when no log data could be parsed.

        The ``data_source`` entry is ``'no_real_data_available'`` and
        ``timestamp`` is the time of the call.
        """
        placeholder = dict(
            current_loss=0.0,
            trades_per_hour=0,
            total_pnl=0.0,
            avg_pnl=0.0,
            win_rate=0.0,
            timestamp=datetime.now(),
            data_source='no_real_data_available',
            loss_trend='unknown',
        )
        return placeholder

    def update_training_metrics(self):
        """Refresh ``self.training_metrics`` from on-disk artefacts and the training log.

        Steps:
          1. Log the name of the most recently modified entry under ``runs/``
             (the TensorBoard files themselves are not parsed).
          2. Record the modification time of the newest ``models/rl/*.pt``
             checkpoint, once per distinct time.
          3. Copy PnL, win-rate, loss and trade figures from
             ``_parse_training_metrics()``, or zero them when no real data
             is available.
          4. Store current RAM/GPU utilisation and elapsed training time.

        Errors are logged and reset the log-derived metrics to zero; nothing
        is raised.

        NOTE(review): ``self.profit_metrics`` and the ``episodes_completed`` /
        ``total_trades`` counters are not updated here — confirm whether another
        component is expected to maintain them.
        """
        zeroed_metrics = {
            'total_pnl': 0.0,
            'avg_pnl': 0.0,
            'win_rate': 0.0,
            'current_loss': 0.0,
            'trades_per_hour': 0
        }

        try:
            # Latest TensorBoard run directory (name only)
            runs_dir = Path("runs")
            if runs_dir.exists():
                latest_run = max(runs_dir.glob("*"), key=lambda p: p.stat().st_mtime, default=None)
                if latest_run:
                    logger.info(f"📈 Latest training run: {latest_run.name}")

            # Newest model checkpoint
            models_dir = Path("models/rl")
            if models_dir.exists():
                checkpoints = list(models_dir.glob("*.pt"))
                if checkpoints:
                    latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime)
                    checkpoint_time = datetime.fromtimestamp(latest_checkpoint.stat().st_mtime)
                    # This runs on every poll; record each checkpoint time only once.
                    if checkpoint_time not in self.checkpoint_times:
                        self.checkpoint_times.append(checkpoint_time)
                    logger.info(f"💾 Latest checkpoint: {latest_checkpoint.name} at {checkpoint_time}")

            # Metrics parsed from the real training log - no synthetic data
            real_metrics = self._parse_training_metrics()

            if real_metrics.get('data_source') == 'real_training_logs':
                logger.info("✅ Using REAL training metrics")
                for key, fallback in zeroed_metrics.items():
                    self.training_metrics[key] = real_metrics.get(key, fallback)
            else:
                logger.warning("⚠️ No real training metrics available - using zero values")
                self.training_metrics.update(zeroed_metrics)

            # Sample system resources once (each sample blocks ~1 s). Read with
            # defaults because a failed sample is an empty dict, which must not
            # discard the metrics parsed above.
            resources = self.check_system_resources()
            self.training_metrics['memory_usage'] = resources.get('memory_percent', 0.0)
            self.training_metrics['gpu_usage'] = resources.get('gpu_usage', 0.0)
            self.training_metrics['training_time'] = (datetime.now() - self.start_time).total_seconds()

            logger.info("🔄 Real Training Metrics Updated:")
            logger.info(f"   💰 Total PnL: ${self.training_metrics['total_pnl']:.2f}")
            logger.info(f"   📊 Win Rate: {self.training_metrics['win_rate']:.1%}")
            logger.info(f"   🔢 Trades: {self.training_metrics['trades_per_hour']}")
            logger.info(f"   📉 Loss: {self.training_metrics['current_loss']:.4f}")
            logger.info(f"   💾 Memory: {self.training_metrics['memory_usage']:.1f}%")
            logger.info(f"   🎮 GPU: {self.training_metrics['gpu_usage']:.1f}%")

        except Exception as e:
            logger.error(f"❌ Error updating real training metrics: {e}")
            # Safe defaults on error (no synthetic fallback)
            self.training_metrics.update(zeroed_metrics)

    def log_comprehensive_status(self):
        """Write a full status report to the log and record a history snapshot.

        Samples system resources, refreshes the training metrics, logs the
        training, profit, model and system sections plus hourly rates and an
        8-hour projection, then appends a snapshot dict to
        ``self.performance_history``.
        """
        system_info = self.check_system_resources()
        self.update_training_metrics()

        say = logger.info
        rule = "=" * 80
        training = self.training_metrics
        profit = self.profit_metrics
        specs = self.model_specs

        runtime = datetime.now() - self.start_time
        runtime_hours = runtime.total_seconds() / 3600

        say(rule)
        say("🚀 MASSIVE MODEL OVERNIGHT TRAINING STATUS")
        say(rule)

        # --- training progress ---
        say("📊 TRAINING PROGRESS:")
        say(f"   ⏱️  Runtime: {runtime}")
        say(f"   📈 Episodes: {training['episodes_completed']:,}")
        say(f"   🎯 Average Reward: {training['average_reward']:.2f}")
        say(f"   🏆 Win Rate: {training['win_rate']:.1%}")
        say(f"   💹 Total Trades: {training['total_trades']:,}")

        # --- simulated account (500x leverage) ---
        say("💰 PROFIT METRICS (500x LEVERAGE):")
        say(f"   💵 Starting Balance: ${profit['starting_balance']:,.2f}")
        say(f"   💰 Current Balance: ${profit['current_balance']:,.2f}")
        say(f"   📈 Total P&L: ${profit['total_pnl']:+,.2f}")
        say(f"   📊 ROI: {profit['roi_percentage']:+.2f}%")
        say(f"   ⚡ Leverage: {profit['leverage']}x")

        # --- static model facts ---
        say("🤖 MODEL SPECIFICATIONS:")
        say(f"   🧠 Total Parameters: {specs['total_parameters']:,}")
        say(f"   🏗️  Enhanced CNN: {specs['enhanced_cnn_params']:,}")
        say(f"   🎮 DQN Agent: {specs['dqn_agent_params']:,}")
        say(f"   💾 Memory Usage: {specs['memory_usage_mb']:.1f} MB")

        # --- system resources (skipped when the sample failed and is empty) ---
        if system_info:
            say("💻 SYSTEM RESOURCES:")
            say(f"   🔄 CPU Usage: {system_info['cpu_usage']:.1f}%")
            say(f"   🧠 RAM Usage: {system_info['memory_used_gb']:.1f}/{system_info['memory_total_gb']:.1f} GB ({system_info['memory_percent']:.1f}%)")
            say(f"   🎮 GPU Usage: {system_info['gpu_usage']:.1f}%")
            say(f"   🔥 VRAM Usage: {system_info['gpu_memory_used_gb']:.1f}/{system_info['gpu_memory_total_gb']:.1f} GB ({system_info['gpu_memory_percent']:.1f}%)")

            # Keep the samples for later averaging/plotting
            samples = self.system_metrics
            samples['cpu_usage'].append(system_info['cpu_usage'])
            samples['memory_usage'].append(system_info['memory_percent'])
            samples['gpu_usage'].append(system_info['gpu_usage'])
            samples['gpu_memory'].append(system_info['gpu_memory_percent'])

        # --- hourly rates and projection over an 8-hour night ---
        if runtime_hours > 0:
            episodes_per_hour = training['episodes_completed'] / runtime_hours
            trades_per_hour = training['total_trades'] / runtime_hours
            profit_per_hour = profit['total_pnl'] / runtime_hours

            say("⚡ PERFORMANCE ESTIMATES:")
            say(f"   📊 Episodes/Hour: {episodes_per_hour:.1f}")
            say(f"   💹 Trades/Hour: {trades_per_hour:.1f}")
            say(f"   💰 Profit/Hour: ${profit_per_hour:+.2f}")

            hours_remaining = max(0, 8 - runtime_hours)
            if hours_remaining > 0:
                projected_episodes = training['episodes_completed'] + (episodes_per_hour * hours_remaining)
                projected_profit = profit['total_pnl'] + (profit_per_hour * hours_remaining)

                say("🔮 OVERNIGHT PROJECTIONS:")
                say(f"   ⏰ Hours Remaining: {hours_remaining:.1f}")
                say(f"   📈 Projected Episodes: {projected_episodes:.0f}")
                say(f"   💰 Projected Profit: ${projected_profit:+,.2f}")

        say(rule)

        # Point-in-time copy of the metrics for plots and reports
        self.performance_history.append({
            'timestamp': datetime.now().isoformat(),
            'runtime_hours': runtime_hours,
            'training_metrics': training.copy(),
            'profit_metrics': profit.copy(),
            'system_info': system_info
        })

    def create_performance_plots(self):
        """Render the six-panel dashboard PNG from ``self.performance_history``.

        Panels: episodes, average reward, win rate, P&L, ROI and system
        resources (CPU % and VRAM %), all plotted against runtime in hours.
        The image is written to
        ``plots/overnight_training/training_dashboard_<timestamp>.png``.

        Does nothing when fewer than two snapshots exist. Errors are logged,
        never raised, and the figure is always closed.
        """
        history = self.performance_history
        if len(history) < 2:
            return

        try:
            runtime_hours = [snap['runtime_hours'] for snap in history]

            episodes = [snap['training_metrics']['episodes_completed'] for snap in history]
            rewards = [snap['training_metrics']['average_reward'] for snap in history]
            win_rate_pct = [snap['training_metrics']['win_rate'] * 100 for snap in history]

            profits = [snap['profit_metrics']['total_pnl'] for snap in history]
            roi = [snap['profit_metrics']['roi_percentage'] for snap in history]

            # system_info is an empty dict when a resource sample failed
            cpu_usage = [snap['system_info'].get('cpu_usage', 0) for snap in history]
            gpu_memory = [snap['system_info'].get('gpu_memory_percent', 0) for snap in history]

            # (grid position, series, colour, line width, marker, title, y label, draw zero line)
            single_series_panels = [
                ((0, 0), episodes, 'cyan', 2, 'o', '📈 Training Episodes', 'Episodes Completed', False),
                ((0, 1), rewards, 'lime', 2, 's', '🎯 Average Reward', 'Average Reward', False),
                ((0, 2), win_rate_pct, 'gold', 2, '^', '🏆 Win Rate (%)', 'Win Rate (%)', False),
                ((1, 0), profits, 'magenta', 3, 'D', '💰 P&L (500x Leverage)', 'Total P&L ($)', True),
                ((1, 1), roi, 'orange', 2, '*', '📊 ROI (%)', 'ROI (%)', True),
            ]

            def decorate(ax, title, ylabel):
                """Apply the title, axis labels and grid shared by every panel."""
                ax.set_title(title, fontsize=14, fontweight='bold')
                ax.set_xlabel('Runtime (Hours)')
                ax.set_ylabel(ylabel)
                ax.grid(True, alpha=0.3)

            # Scope the dark theme to this figure instead of changing the
            # process-wide matplotlib style.
            with plt.style.context('dark_background'):
                fig, axes = plt.subplots(2, 3, figsize=(20, 12))
                try:
                    fig.suptitle('🚀 MASSIVE MODEL OVERNIGHT TRAINING DASHBOARD 🚀', fontsize=16, fontweight='bold')

                    for position, series, color, width, marker, title, ylabel, zero_line in single_series_panels:
                        ax = axes[position]
                        ax.plot(runtime_hours, series, color=color, linewidth=width, marker=marker)
                        if zero_line:
                            # Break-even reference
                            ax.axhline(y=0, color='red', linestyle='--', alpha=0.7)
                        decorate(ax, title, ylabel)

                    # System resources: two series sharing one panel
                    resources_ax = axes[1, 2]
                    resources_ax.plot(runtime_hours, cpu_usage, color='red', linewidth=2, label='CPU %', marker='o')
                    resources_ax.plot(runtime_hours, gpu_memory, color='cyan', linewidth=2, label='VRAM %', marker='s')
                    decorate(resources_ax, '💻 System Resources', 'Usage (%)')
                    resources_ax.legend()

                    fig.tight_layout()

                    plots_dir = Path("plots/overnight_training")
                    plots_dir.mkdir(parents=True, exist_ok=True)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    plot_path = plots_dir / f"training_dashboard_{timestamp}.png"
                    fig.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='black')
                finally:
                    # Release the figure even if plotting or saving failed, so a
                    # long session does not accumulate open figures.
                    plt.close(fig)

            logger.info(f"📊 Performance dashboard saved: {plot_path}")

        except Exception as e:
            logger.error(f"Error creating performance plots: {e}")

    def save_progress_report(self):
        """Dump the session state to a timestamped JSON file.

        The report is written to
        ``reports/overnight_training/progress_report_<timestamp>.json`` and
        holds session timing, model specs, training and profit metrics,
        averaged system usage and the full snapshot history. Values that JSON
        cannot encode natively are written via ``str``. Errors are logged,
        never raised.
        """
        try:
            elapsed = datetime.now() - self.start_time

            def mean_or_zero(samples):
                # No samples collected yet -> report 0 rather than averaging nothing
                return np.mean(samples) if samples else 0

            session_info = {
                'start_time': self.start_time.isoformat(),
                'current_time': datetime.now().isoformat(),
                'runtime': str(elapsed),
                'runtime_hours': elapsed.total_seconds() / 3600
            }

            usage_summary = {
                summary_key: mean_or_zero(self.system_metrics[series])
                for summary_key, series in (
                    ('avg_cpu_usage', 'cpu_usage'),
                    ('avg_memory_usage', 'memory_usage'),
                    ('avg_gpu_usage', 'gpu_usage'),
                    ('avg_gpu_memory', 'gpu_memory'),
                )
            }

            report = {
                'session_info': session_info,
                'model_specifications': self.model_specs,
                'training_metrics': self.training_metrics,
                'profit_metrics': self.profit_metrics,
                'system_metrics_summary': usage_summary,
                'performance_history': self.performance_history
            }

            # Destination: one file per call, named by the current time
            reports_dir = Path("reports/overnight_training")
            reports_dir.mkdir(parents=True, exist_ok=True)
            stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_path = reports_dir / f"progress_report_{stamp}.json"

            with open(report_path, 'w') as out:
                json.dump(report, out, indent=2, default=str)

            logger.info(f"📄 Progress report saved: {report_path}")

        except Exception as e:
            logger.error(f"Error saving progress report: {e}")

    def monitor_overnight_training(self, check_interval: int = 300):
        """Run the monitoring loop until the session ends.

        Each iteration logs a full status report, then sleeps for
        ``check_interval`` seconds. Dashboards are rendered roughly once per
        hour and progress reports roughly every two hours, with both cadences
        derived from ``check_interval``. The loop ends after 8 hours of
        runtime, on Ctrl+C, on an unexpected error, or when
        ``self.monitoring`` becomes False; the session is finalised in the
        first three cases.

        Args:
            check_interval: Seconds between status checks (default 300).
        """
        logger.info("🌙 STARTING OVERNIGHT TRAINING MONITORING")
        logger.info(f"⏰ Check interval: {check_interval} seconds ({check_interval/60:.1f} minutes)")
        logger.info("🚀 Monitoring the MASSIVE 504M parameter model training...")

        # Number of status checks that make up about one hour. With the default
        # 300 s interval this is 12 (plots) and 24 (reports). The fallback avoids
        # dividing by a non-positive interval.
        checks_per_hour = max(1, round(3600 / check_interval)) if check_interval > 0 else 1
        plot_every = checks_per_hour
        report_every = 2 * checks_per_hour

        try:
            while self.monitoring:
                self.log_comprehensive_status()

                # One snapshot is recorded per status check
                checks_done = len(self.performance_history)

                # Dashboard roughly every hour
                if checks_done > 0 and checks_done % plot_every == 0:
                    self.create_performance_plots()

                # Progress report roughly every two hours
                if checks_done > 0 and checks_done % report_every == 0:
                    self.save_progress_report()

                # A full overnight session is 8 hours
                runtime_hours = (datetime.now() - self.start_time).total_seconds() / 3600
                if runtime_hours >= 8:
                    logger.info("🌅 OVERNIGHT TRAINING SESSION COMPLETED (8+ hours)")
                    self.finalize_overnight_session()
                    break

                time.sleep(check_interval)

        except KeyboardInterrupt:
            logger.info("🛑 MONITORING STOPPED BY USER")
            self.finalize_overnight_session()
        except Exception as e:
            # exc_info keeps the traceback in the log for post-mortem analysis
            logger.error(f"❌ MONITORING ERROR: {e}", exc_info=True)
            self.finalize_overnight_session()

    def finalize_overnight_session(self):
        """Close out the session: final status, final plots, final report, summary.

        After the summary has been logged, ``self.monitoring`` is set to False.
        """
        say = logger.info
        rule = "=" * 80

        say("🏁 FINALIZING OVERNIGHT TRAINING SESSION")

        # Last status log, dashboard and report, in that order
        self.log_comprehensive_status()
        self.create_performance_plots()
        self.save_progress_report()

        elapsed = datetime.now() - self.start_time
        training = self.training_metrics
        profit = self.profit_metrics

        # Closing summary
        say(rule)
        say("🌅 OVERNIGHT TRAINING SESSION COMPLETE")
        say(rule)
        say(f"⏰ Total Runtime: {elapsed}")
        say(f"📊 Total Episodes: {training['episodes_completed']:,}")
        say(f"💹 Total Trades: {training['total_trades']:,}")
        say(f"💰 Final P&L: ${profit['total_pnl']:+,.2f}")
        say(f"📈 Final ROI: {profit['roi_percentage']:+.2f}%")
        say(f"🏆 Final Win Rate: {training['win_rate']:.1%}")
        say(f"🎯 Avg Reward: {training['average_reward']:.2f}")
        say(rule)
        say("🚀 MASSIVE 504M PARAMETER MODEL TRAINING SESSION COMPLETED!")
        say(rule)

        self.monitoring = False

def main():
    """Script entry point: build a monitor and poll every five minutes.

    Any exception that escapes is logged and its traceback printed; it is not
    re-raised.
    """
    import traceback

    startup_banner = (
        "🚀 INITIALIZING OVERNIGHT TRAINING MONITOR",
        "💡 Monitoring 504.89 Million Parameter Enhanced CNN + DQN Agent",
        "🎯 Target: 4GB VRAM utilization with maximum profit optimization",
    )

    try:
        for banner_line in startup_banner:
            logger.info(banner_line)

        # 300 seconds = one status check every five minutes
        OvernightTrainingMonitor().monitor_overnight_training(check_interval=300)

    except Exception as e:
        logger.error(f"Fatal error in overnight monitoring: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()