#!/usr/bin/env python3
"""
Overnight Training Monitor - 504M Parameter Massive Model
================================================================================
Comprehensive monitoring system for the overnight RL training session with:
- 504.89 Million parameter Enhanced CNN + DQN Agent
- 4GB VRAM utilization
- Real-time performance tracking
- Automated model checkpointing
- Training analytics and reporting
- Memory usage optimization
- Profit maximization metrics

Run this script to monitor the entire overnight training session.
"""

import json
import logging
import re
import subprocess
import time
from datetime import datetime, timedelta
from pathlib import Path
from threading import Thread
from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import torch

try:
    import GPUtil  # optional: only used for GPU utilization readings
except ImportError:
    GPUtil = None

# Setup comprehensive logging
log_dir = Path("logs/overnight_training")
log_dir.mkdir(parents=True, exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_dir / f"overnight_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


class OvernightTrainingMonitor:
    """Comprehensive overnight training monitor for the massive 504M parameter model"""

    def __init__(self):
        """Initialize the overnight training monitor"""
        self.start_time = datetime.now()
        self.monitoring = True

        # Model specifications
        self.model_specs = {
            'total_parameters': 504_889_098,
            'enhanced_cnn_params': 168_296_366,
            'dqn_agent_params': 336_592_732,
            'memory_usage_mb': 1926.7,
            'target_vram_gb': 4.0,
            'architecture': 'Massive Enhanced CNN + DQN Agent'
        }

        # Training metrics tracking
        self.training_metrics = {
            'episodes_completed': 0,
            'total_reward': 0.0,
            'best_reward': -float('inf'),
            'average_reward': 0.0,
            'win_rate': 0.0,
            'total_trades': 0,
            'profit_factor': 0.0,
            'sharpe_ratio': 0.0,
            'max_drawdown': 0.0,
            'final_balance': 0.0,
            'training_loss': 0.0
        }

        # System monitoring
        self.system_metrics = {
            'cpu_usage': [],
            'memory_usage': [],
            'gpu_usage': [],
            'gpu_memory': [],
            'disk_io': [],
            'network_io': []
        }

        # Performance tracking
        self.performance_history = []
        self.checkpoint_times = []

        # Profit tracking (500x leverage simulation)
        self.profit_metrics = {
            'starting_balance': 10000.0,
            'current_balance': 10000.0,
            'total_pnl': 0.0,
            'realized_pnl': 0.0,
            'unrealized_pnl': 0.0,
            'leverage': 500,
            'fees_paid': 0.0,
            'roi_percentage': 0.0
        }

        logger.info("🚀 OVERNIGHT TRAINING MONITOR INITIALIZED")
        logger.info(f"📊 Model: {self.model_specs['total_parameters']:,} parameters")
        logger.info(f"💾 Memory: {self.model_specs['memory_usage_mb']:.1f} MB")
        logger.info(f"🎯 Target VRAM: {self.model_specs['target_vram_gb']} GB")
        logger.info(f"⚡ Leverage: {self.profit_metrics['leverage']}x")

    def check_system_resources(self) -> Dict:
        """Check current system resource usage"""
        try:
            # CPU and memory
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()
            memory_percent = memory.percent
            memory_used_gb = memory.used / (1024**3)
            memory_total_gb = memory.total / (1024**3)

            # GPU monitoring
            gpu_usage = 0
            gpu_memory_used = 0
            gpu_memory_total = 0

            if torch.cuda.is_available():
                gpu_memory_used = torch.cuda.memory_allocated() / (1024**3)  # GB
                gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB

                # Try to get GPU utilization (GPUtil may not be installed)
                try:
                    if GPUtil is not None:
                        gpus = GPUtil.getGPUs()
                        if gpus:
                            gpu_usage = gpus[0].load * 100
                except Exception:
                    gpu_usage = 0

            # Disk I/O
            disk_io = psutil.disk_io_counters()

            # Network I/O
            network_io = psutil.net_io_counters()

            system_info = {
                'timestamp': datetime.now(),
                'cpu_usage': cpu_percent,
                'memory_percent': memory_percent,
                'memory_used_gb': memory_used_gb,
                'memory_total_gb': memory_total_gb,
                'gpu_usage': gpu_usage,
                'gpu_memory_used_gb': gpu_memory_used,
                'gpu_memory_total_gb': gpu_memory_total,
                'gpu_memory_percent': (gpu_memory_used / gpu_memory_total * 100) if gpu_memory_total > 0 else 0,
                'disk_read_gb': disk_io.read_bytes / (1024**3) if disk_io else 0,
                'disk_write_gb': disk_io.write_bytes / (1024**3) if disk_io else 0,
                'network_sent_gb': network_io.bytes_sent / (1024**3) if network_io else 0,
                'network_recv_gb': network_io.bytes_recv / (1024**3) if network_io else 0
            }

            return system_info

        except Exception as e:
            logger.error(f"Error checking system resources: {e}")
            return {}

    def _parse_training_metrics(self) -> Dict[str, Any]:
        """Parse REAL training metrics from log files - NO SYNTHETIC DATA"""
        try:
            # Read actual training logs for real metrics
            training_log_path = Path("logs/trading.log")

            if not training_log_path.exists():
                logger.warning("⚠️ No training log found - metrics unavailable")
                return self._default_metrics()

            # Parse real metrics from the tail of the training log
            with open(training_log_path, 'r') as f:
                recent_lines = f.readlines()[-100:]  # last 100 lines

            # Extract real metrics from log lines
            real_metrics = self._extract_real_metrics(recent_lines)

            if real_metrics:
                logger.info(f"✅ Parsed {len(real_metrics)} real training metrics")
                return real_metrics

            logger.warning("⚠️ No real metrics found in logs")
            return self._default_metrics()

        except Exception as e:
            logger.error(f"❌ Error parsing real training metrics: {e}")
            return self._default_metrics()

    def _extract_real_metrics(self, log_lines: List[str]) -> Dict[str, Any]:
        """Extract real metrics from training log lines"""
        metrics = {}

        try:
            # Look for real training indicators
            loss_values = []
            trade_counts = []
            pnl_values = []

            for line in log_lines:
                # Extract real loss values
                if "loss:" in line.lower() or "Loss" in line:
                    try:
                        loss_match = re.search(r'loss[:\s]+([\d\.]+)', line, re.IGNORECASE)
                        if loss_match:
                            loss_values.append(float(loss_match.group(1)))
                    except Exception:
                        pass

                # Extract real trade information
                if "TRADE" in line and "OPENED" in line:
                    trade_counts.append(1)

                # Extract real PnL values
                if "PnL:" in line:
                    try:
                        pnl_match = re.search(r'PnL[:\s]+\$?([+-]?[\d\.]+)', line)
                        if pnl_match:
                            pnl_values.append(float(pnl_match.group(1)))
                    except Exception:
                        pass

            # Nothing usable found: return an empty dict so the caller falls back
            # to _default_metrics() (keeps the "no real data" path reachable)
            if not (loss_values or trade_counts or pnl_values):
                return {}

            # Calculate real averages
            if loss_values:
                metrics['current_loss'] = sum(loss_values) / len(loss_values)
                metrics['loss_trend'] = 'decreasing' if len(loss_values) > 1 and loss_values[-1] < loss_values[0] else 'stable'

            if trade_counts:
                # Count of trades seen in the sampled log window
                metrics['trades_per_hour'] = len(trade_counts)

            if pnl_values:
                metrics['total_pnl'] = sum(pnl_values)
                metrics['avg_pnl'] = sum(pnl_values) / len(pnl_values)
                metrics['win_rate'] = len([p for p in pnl_values if p > 0]) / len(pnl_values)

            # Add timestamp and provenance
            metrics['timestamp'] = datetime.now()
            metrics['data_source'] = 'real_training_logs'

            return metrics

        except Exception as e:
            logger.error(f"❌ Error extracting real metrics: {e}")
            return {}

    def _default_metrics(self) -> Dict[str, Any]:
        """Return default metrics when no real data is available"""
        return {
            'current_loss': 0.0,
            'trades_per_hour': 0,
            'total_pnl': 0.0,
            'avg_pnl': 0.0,
            'win_rate': 0.0,
            'timestamp': datetime.now(),
            'data_source': 'no_real_data_available',
            'loss_trend': 'unknown'
        }
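
    # --- Optional helper (illustrative sketch, not part of the original monitor) ---
    # update_training_metrics() below only logs the name of the latest TensorBoard run
    # ("Parse TensorBoard logs (simplified)"). If actual scalar values are wanted, a
    # helper along these lines could work. It assumes the `tensorboard` package is
    # installed; the trainer's scalar tag names are unknown here, so it simply returns
    # the latest value of every scalar tag it finds in the run directory.
    def _read_tensorboard_scalars(self, run_dir: Path) -> Dict[str, float]:
        """Sketch: latest value of every scalar tag in a TensorBoard run directory."""
        latest_values: Dict[str, float] = {}
        try:
            # Imported lazily so the monitor still runs without tensorboard installed
            from tensorboard.backend.event_processing.event_accumulator import EventAccumulator

            accumulator = EventAccumulator(str(run_dir))
            accumulator.Reload()  # read event files from disk
            for tag in accumulator.Tags().get('scalars', []):
                events = accumulator.Scalars(tag)
                if events:
                    latest_values[tag] = events[-1].value  # most recent scalar for this tag
        except ImportError:
            logger.warning("tensorboard not installed - skipping scalar parsing")
        except Exception as e:
            logger.error(f"Error reading TensorBoard scalars from {run_dir}: {e}")
        return latest_values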

    def update_training_metrics(self):
        """Update training metrics from TensorBoard logs and saved models"""
        try:
            # Look for TensorBoard log files
            runs_dir = Path("runs")
            if runs_dir.exists():
                latest_run = max(runs_dir.glob("*"), key=lambda p: p.stat().st_mtime, default=None)
                if latest_run:
                    # Parse TensorBoard logs (simplified)
                    logger.info(f"📈 Latest training run: {latest_run.name}")

            # Check for model checkpoints
            models_dir = Path("models/rl")
            if models_dir.exists():
                checkpoints = list(models_dir.glob("*.pt"))
                if checkpoints:
                    latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime)
                    checkpoint_time = datetime.fromtimestamp(latest_checkpoint.stat().st_mtime)
                    self.checkpoint_times.append(checkpoint_time)
                    logger.info(f"💾 Latest checkpoint: {latest_checkpoint.name} at {checkpoint_time}")

            # Parse REAL training metrics from logs - NO SYNTHETIC DATA
            real_metrics = self._parse_training_metrics()

            if real_metrics['data_source'] == 'real_training_logs':
                # Use real metrics from training logs
                logger.info("✅ Using REAL training metrics")
                self.training_metrics['total_pnl'] = real_metrics.get('total_pnl', 0.0)
                self.training_metrics['avg_pnl'] = real_metrics.get('avg_pnl', 0.0)
                self.training_metrics['win_rate'] = real_metrics.get('win_rate', 0.0)
                self.training_metrics['current_loss'] = real_metrics.get('current_loss', 0.0)
                self.training_metrics['trades_per_hour'] = real_metrics.get('trades_per_hour', 0)
            else:
                # No real data available - use safe defaults (NO SYNTHETIC)
                logger.warning("⚠️ No real training metrics available - using zero values")
                self.training_metrics['total_pnl'] = 0.0
                self.training_metrics['avg_pnl'] = 0.0
                self.training_metrics['win_rate'] = 0.0
                self.training_metrics['current_loss'] = 0.0
                self.training_metrics['trades_per_hour'] = 0

            # Update other real metrics (single resource snapshot; .get() guards the
            # error path where check_system_resources() returns an empty dict)
            resources = self.check_system_resources()
            self.training_metrics['memory_usage'] = resources.get('memory_percent', 0.0)
            self.training_metrics['gpu_usage'] = resources.get('gpu_usage', 0.0)
            self.training_metrics['training_time'] = (datetime.now() - self.start_time).total_seconds()

            # Log real metrics
            logger.info("🔄 Real Training Metrics Updated:")
            logger.info(f"   💰 Total PnL: ${self.training_metrics['total_pnl']:.2f}")
            logger.info(f"   📊 Win Rate: {self.training_metrics['win_rate']:.1%}")
            logger.info(f"   🔢 Trades: {self.training_metrics['trades_per_hour']}")
            logger.info(f"   📉 Loss: {self.training_metrics['current_loss']:.4f}")
            logger.info(f"   💾 Memory: {self.training_metrics['memory_usage']:.1f}%")
            logger.info(f"   🎮 GPU: {self.training_metrics['gpu_usage']:.1f}%")

        except Exception as e:
            logger.error(f"❌ Error updating real training metrics: {e}")
            # Set safe defaults on error (NO SYNTHETIC FALLBACK)
            self.training_metrics.update({
                'total_pnl': 0.0,
                'avg_pnl': 0.0,
                'win_rate': 0.0,
                'current_loss': 0.0,
                'trades_per_hour': 0
            })
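
    # --- Optional helper (illustrative sketch, not part of the original monitor) ---
    # profit_metrics tracks current_balance and roi_percentage, but nothing in this
    # script recomputes them once total_pnl changes. A minimal sketch of that update,
    # assuming the total PnL parsed from the logs is realized P&L in account currency,
    # could look like this; it would be called right after update_training_metrics().
    def _update_profit_metrics(self) -> None:
        """Sketch: derive balance and ROI from the most recently parsed total P&L."""
        total_pnl = float(self.training_metrics.get('total_pnl', 0.0))
        starting = self.profit_metrics['starting_balance']

        self.profit_metrics['total_pnl'] = total_pnl
        self.profit_metrics['current_balance'] = starting + total_pnl
        # ROI is expressed against the starting balance, not the leveraged notional
        self.profit_metrics['roi_percentage'] = (total_pnl / starting * 100.0) if starting else 0.0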

    def log_comprehensive_status(self):
        """Log comprehensive training status"""
        system_info = self.check_system_resources()
        self.update_training_metrics()

        runtime = datetime.now() - self.start_time
        runtime_hours = runtime.total_seconds() / 3600

        logger.info("=" * 80)
        logger.info("🚀 MASSIVE MODEL OVERNIGHT TRAINING STATUS")
        logger.info("=" * 80)

        # Training progress
        logger.info("📊 TRAINING PROGRESS:")
        logger.info(f"   ⏱️ Runtime: {runtime}")
        logger.info(f"   📈 Episodes: {self.training_metrics['episodes_completed']:,}")
        logger.info(f"   🎯 Average Reward: {self.training_metrics['average_reward']:.2f}")
        logger.info(f"   🏆 Win Rate: {self.training_metrics['win_rate']:.1%}")
        logger.info(f"   💹 Total Trades: {self.training_metrics['total_trades']:,}")

        # Profit metrics (500x leverage)
        logger.info("💰 PROFIT METRICS (500x LEVERAGE):")
        logger.info(f"   💵 Starting Balance: ${self.profit_metrics['starting_balance']:,.2f}")
        logger.info(f"   💰 Current Balance: ${self.profit_metrics['current_balance']:,.2f}")
        logger.info(f"   📈 Total P&L: ${self.profit_metrics['total_pnl']:+,.2f}")
        logger.info(f"   📊 ROI: {self.profit_metrics['roi_percentage']:+.2f}%")
        logger.info(f"   ⚡ Leverage: {self.profit_metrics['leverage']}x")

        # Model specifications
        logger.info("🤖 MODEL SPECIFICATIONS:")
        logger.info(f"   🧠 Total Parameters: {self.model_specs['total_parameters']:,}")
        logger.info(f"   🏗️ Enhanced CNN: {self.model_specs['enhanced_cnn_params']:,}")
        logger.info(f"   🎮 DQN Agent: {self.model_specs['dqn_agent_params']:,}")
        logger.info(f"   💾 Memory Usage: {self.model_specs['memory_usage_mb']:.1f} MB")

        # System resources
        if system_info:
            logger.info("💻 SYSTEM RESOURCES:")
            logger.info(f"   🔄 CPU Usage: {system_info['cpu_usage']:.1f}%")
            logger.info(f"   🧠 RAM Usage: {system_info['memory_used_gb']:.1f}/{system_info['memory_total_gb']:.1f} GB ({system_info['memory_percent']:.1f}%)")
            logger.info(f"   🎮 GPU Usage: {system_info['gpu_usage']:.1f}%")
            logger.info(f"   🔥 VRAM Usage: {system_info['gpu_memory_used_gb']:.1f}/{system_info['gpu_memory_total_gb']:.1f} GB ({system_info['gpu_memory_percent']:.1f}%)")

            # Store metrics for plotting
            self.system_metrics['cpu_usage'].append(system_info['cpu_usage'])
            self.system_metrics['memory_usage'].append(system_info['memory_percent'])
            self.system_metrics['gpu_usage'].append(system_info['gpu_usage'])
            self.system_metrics['gpu_memory'].append(system_info['gpu_memory_percent'])

        # Performance estimates
        if runtime_hours > 0:
            episodes_per_hour = self.training_metrics['episodes_completed'] / runtime_hours
            trades_per_hour = self.training_metrics['total_trades'] / runtime_hours
            profit_per_hour = self.profit_metrics['total_pnl'] / runtime_hours

            logger.info("⚡ PERFORMANCE ESTIMATES:")
            logger.info(f"   📊 Episodes/Hour: {episodes_per_hour:.1f}")
            logger.info(f"   💹 Trades/Hour: {trades_per_hour:.1f}")
            logger.info(f"   💰 Profit/Hour: ${profit_per_hour:+.2f}")

            # Projections for the full night (8 hours)
            hours_remaining = max(0, 8 - runtime_hours)
            if hours_remaining > 0:
                projected_episodes = self.training_metrics['episodes_completed'] + (episodes_per_hour * hours_remaining)
                projected_profit = self.profit_metrics['total_pnl'] + (profit_per_hour * hours_remaining)

                logger.info("🔮 OVERNIGHT PROJECTIONS:")
                logger.info(f"   ⏰ Hours Remaining: {hours_remaining:.1f}")
                logger.info(f"   📈 Projected Episodes: {projected_episodes:.0f}")
                logger.info(f"   💰 Projected Profit: ${projected_profit:+,.2f}")

        logger.info("=" * 80)

        # Save performance snapshot
        snapshot = {
            'timestamp': datetime.now().isoformat(),
            'runtime_hours': runtime_hours,
            'training_metrics': self.training_metrics.copy(),
            'profit_metrics': self.profit_metrics.copy(),
            'system_info': system_info
        }
        self.performance_history.append(snapshot)

    def create_performance_plots(self):
        """Create real-time performance visualization plots"""
        try:
            if len(self.performance_history) < 2:
                return

            # Extract time series data
            runtime_hours = [h['runtime_hours'] for h in self.performance_history]

            # Training metrics
            episodes = [h['training_metrics']['episodes_completed'] for h in self.performance_history]
            rewards = [h['training_metrics']['average_reward'] for h in self.performance_history]
            win_rates = [h['training_metrics']['win_rate'] for h in self.performance_history]

            # Profit metrics
            profits = [h['profit_metrics']['total_pnl'] for h in self.performance_history]
            roi = [h['profit_metrics']['roi_percentage'] for h in self.performance_history]

            # System metrics
            cpu_usage = [h['system_info'].get('cpu_usage', 0) for h in self.performance_history]
            gpu_memory = [h['system_info'].get('gpu_memory_percent', 0) for h in self.performance_history]

            # Create comprehensive dashboard
            plt.style.use('dark_background')
            fig, axes = plt.subplots(2, 3, figsize=(20, 12))
            fig.suptitle('🚀 MASSIVE MODEL OVERNIGHT TRAINING DASHBOARD 🚀', fontsize=16, fontweight='bold')

            # Training episodes
            axes[0, 0].plot(runtime_hours, episodes, 'cyan', linewidth=2, marker='o')
            axes[0, 0].set_title('📈 Training Episodes', fontsize=14, fontweight='bold')
            axes[0, 0].set_xlabel('Runtime (Hours)')
            axes[0, 0].set_ylabel('Episodes Completed')
            axes[0, 0].grid(True, alpha=0.3)

            # Average reward
            axes[0, 1].plot(runtime_hours, rewards, 'lime', linewidth=2, marker='s')
            axes[0, 1].set_title('🎯 Average Reward', fontsize=14, fontweight='bold')
            axes[0, 1].set_xlabel('Runtime (Hours)')
            axes[0, 1].set_ylabel('Average Reward')
            axes[0, 1].grid(True, alpha=0.3)

            # Win rate
            axes[0, 2].plot(runtime_hours, [w * 100 for w in win_rates], 'gold', linewidth=2, marker='^')
            axes[0, 2].set_title('🏆 Win Rate (%)', fontsize=14, fontweight='bold')
            axes[0, 2].set_xlabel('Runtime (Hours)')
            axes[0, 2].set_ylabel('Win Rate (%)')
            axes[0, 2].grid(True, alpha=0.3)

            # Profit/loss (500x leverage)
            axes[1, 0].plot(runtime_hours, profits, 'magenta', linewidth=3, marker='D')
            axes[1, 0].axhline(y=0, color='red', linestyle='--', alpha=0.7)
            axes[1, 0].set_title('💰 P&L (500x Leverage)', fontsize=14, fontweight='bold')
            axes[1, 0].set_xlabel('Runtime (Hours)')
            axes[1, 0].set_ylabel('Total P&L ($)')
            axes[1, 0].grid(True, alpha=0.3)

            # ROI percentage
            axes[1, 1].plot(runtime_hours, roi, 'orange', linewidth=2, marker='*')
            axes[1, 1].axhline(y=0, color='red', linestyle='--', alpha=0.7)
            axes[1, 1].set_title('📊 ROI (%)', fontsize=14, fontweight='bold')
            axes[1, 1].set_xlabel('Runtime (Hours)')
            axes[1, 1].set_ylabel('ROI (%)')
            axes[1, 1].grid(True, alpha=0.3)

            # System resources
            axes[1, 2].plot(runtime_hours, cpu_usage, 'red', linewidth=2, label='CPU %', marker='o')
            axes[1, 2].plot(runtime_hours, gpu_memory, 'cyan', linewidth=2, label='VRAM %', marker='s')
            axes[1, 2].set_title('💻 System Resources', fontsize=14, fontweight='bold')
            axes[1, 2].set_xlabel('Runtime (Hours)')
            axes[1, 2].set_ylabel('Usage (%)')
            axes[1, 2].legend()
            axes[1, 2].grid(True, alpha=0.3)

            plt.tight_layout()

            # Save plot
            plots_dir = Path("plots/overnight_training")
            plots_dir.mkdir(parents=True, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            plot_path = plots_dir / f"training_dashboard_{timestamp}.png"
            plt.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='black')
            plt.close()

            logger.info(f"📊 Performance dashboard saved: {plot_path}")

        except Exception as e:
            logger.error(f"Error creating performance plots: {e}")

    def save_progress_report(self):
        """Save comprehensive progress report"""
        try:
            runtime = datetime.now() - self.start_time

            report = {
                'session_info': {
                    'start_time': self.start_time.isoformat(),
                    'current_time': datetime.now().isoformat(),
                    'runtime': str(runtime),
                    'runtime_hours': runtime.total_seconds() / 3600
                },
                'model_specifications': self.model_specs,
                'training_metrics': self.training_metrics,
                'profit_metrics': self.profit_metrics,
                'system_metrics_summary': {
                    'avg_cpu_usage': np.mean(self.system_metrics['cpu_usage']) if self.system_metrics['cpu_usage'] else 0,
                    'avg_memory_usage': np.mean(self.system_metrics['memory_usage']) if self.system_metrics['memory_usage'] else 0,
                    'avg_gpu_usage': np.mean(self.system_metrics['gpu_usage']) if self.system_metrics['gpu_usage'] else 0,
                    'avg_gpu_memory': np.mean(self.system_metrics['gpu_memory']) if self.system_metrics['gpu_memory'] else 0
                },
                'performance_history': self.performance_history
            }

            # Save report
            reports_dir = Path("reports/overnight_training")
            reports_dir.mkdir(parents=True, exist_ok=True)

            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            report_path = reports_dir / f"progress_report_{timestamp}.json"

            with open(report_path, 'w') as f:
                json.dump(report, f, indent=2, default=str)

            logger.info(f"📄 Progress report saved: {report_path}")

        except Exception as e:
            logger.error(f"Error saving progress report: {e}")

    def monitor_overnight_training(self, check_interval: int = 300):
        """Main monitoring loop for overnight training"""
        logger.info("🌙 STARTING OVERNIGHT TRAINING MONITORING")
        logger.info(f"⏰ Check interval: {check_interval} seconds ({check_interval/60:.1f} minutes)")
        logger.info("🚀 Monitoring the MASSIVE 504M parameter model training...")

        try:
            while self.monitoring:
                # Log comprehensive status
                self.log_comprehensive_status()

                runtime_hours = (datetime.now() - self.start_time).total_seconds() / 3600

                # Create performance plots every hour (12 checks * 5 min at the default interval)
                if len(self.performance_history) > 0 and len(self.performance_history) % 12 == 0:
                    self.create_performance_plots()

                # Save progress report every 2 hours (24 checks at the default interval)
                if len(self.performance_history) > 0 and len(self.performance_history) % 24 == 0:
                    self.save_progress_report()

                # Check if we've been running for 8+ hours (full overnight session)
                if runtime_hours >= 8:
                    logger.info("🌅 OVERNIGHT TRAINING SESSION COMPLETED (8+ hours)")
                    self.finalize_overnight_session()
                    break

                # Wait for next check
                time.sleep(check_interval)

        except KeyboardInterrupt:
            logger.info("🛑 MONITORING STOPPED BY USER")
            self.finalize_overnight_session()
        except Exception as e:
            logger.error(f"❌ MONITORING ERROR: {e}")
            self.finalize_overnight_session()

    def finalize_overnight_session(self):
        """Finalize the overnight training session"""
        logger.info("🏁 FINALIZING OVERNIGHT TRAINING SESSION")

        # Final status log
        self.log_comprehensive_status()

        # Create final performance plots
        self.create_performance_plots()

        # Save final comprehensive report
        self.save_progress_report()

        # Session summary
        runtime = datetime.now() - self.start_time

        logger.info("=" * 80)
        logger.info("🌅 OVERNIGHT TRAINING SESSION COMPLETE")
        logger.info("=" * 80)
        logger.info(f"⏰ Total Runtime: {runtime}")
        logger.info(f"📊 Total Episodes: {self.training_metrics['episodes_completed']:,}")
        logger.info(f"💹 Total Trades: {self.training_metrics['total_trades']:,}")
        logger.info(f"💰 Final P&L: ${self.profit_metrics['total_pnl']:+,.2f}")
        logger.info(f"📈 Final ROI: {self.profit_metrics['roi_percentage']:+.2f}%")
        logger.info(f"🏆 Final Win Rate: {self.training_metrics['win_rate']:.1%}")
        logger.info(f"🎯 Avg Reward: {self.training_metrics['average_reward']:.2f}")
        logger.info("=" * 80)
        logger.info("🚀 MASSIVE 504M PARAMETER MODEL TRAINING SESSION COMPLETED!")
        logger.info("=" * 80)

        self.monitoring = False
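

# --- Optional usage sketch (not part of the original entry point) ---
# `Thread` is imported above but never used. If the monitor should run alongside a
# training process launched from the same interpreter, one possible wiring (an
# assumption, not the author's design) is to run the monitoring loop on a daemon thread:
def start_monitor_in_background(check_interval: int = 300) -> Thread:
    """Sketch: run the monitor on a daemon thread and return the thread handle."""
    monitor = OvernightTrainingMonitor()
    thread = Thread(
        target=monitor.monitor_overnight_training,
        kwargs={'check_interval': check_interval},
        daemon=True,  # do not block interpreter shutdown
    )
    thread.start()
    return thread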


def main():
    """Main function to start overnight monitoring"""
    try:
        logger.info("🚀 INITIALIZING OVERNIGHT TRAINING MONITOR")
        logger.info("💡 Monitoring 504.89 Million Parameter Enhanced CNN + DQN Agent")
        logger.info("🎯 Target: 4GB VRAM utilization with maximum profit optimization")

        # Create monitor
        monitor = OvernightTrainingMonitor()

        # Start monitoring (check every 5 minutes)
        monitor.monitor_overnight_training(check_interval=300)

    except Exception as e:
        logger.error(f"Fatal error in overnight monitoring: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()