gogo2/overnight_training_monitor.py
2025-05-25 00:28:52 +03:00

630 lines
28 KiB
Python

#!/usr/bin/env python3
"""
Overnight Training Monitor - 504M Parameter Massive Model
================================================================================
Comprehensive monitoring system for the overnight RL training session with:
- 504.89 Million parameter Enhanced CNN + DQN Agent
- 4GB VRAM utilization
- Real-time performance tracking
- Automated model checkpointing
- Training analytics and reporting
- Memory usage optimization
- Profit maximization metrics
Run this script to monitor the entire overnight training session.
"""
import json
import logging
import re
import subprocess
import time
from collections import deque
from datetime import datetime, timedelta
from pathlib import Path
from threading import Thread
from typing import Any, Dict, List, Optional

import GPUtil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psutil
import torch
# Setup comprehensive logging
log_dir = Path("logs/overnight_training")
log_dir.mkdir(parents=True, exist_ok=True)

# Configure detailed logging: every record goes both to a per-session,
# timestamped log file inside ``log_dir`` and to the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # The messages emitted by this script contain emoji, so the file is
        # opened explicitly as UTF-8 instead of relying on the platform's
        # default encoding (which may be unable to encode them).
        logging.FileHandler(
            log_dir / f"overnight_training_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log",
            encoding='utf-8',
        ),
        logging.StreamHandler(),
    ],
)
logger = logging.getLogger(__name__)
class OvernightTrainingMonitor:
    """Comprehensive overnight training monitor for massive 504M parameter model.

    On every monitoring cycle the monitor samples host resources (CPU, RAM,
    GPU, disk and network counters), scrapes training metrics out of the tail
    of ``logs/trading.log``, logs a status report and stores a snapshot in
    ``performance_history``. Dashboard plots and JSON progress reports are
    written to ``plots/overnight_training`` and ``reports/overnight_training``.
    """

    # Length of a full overnight session in hours; the monitoring loop stops
    # once this much time has elapsed and projections are made up to it.
    SESSION_HOURS = 8.0

    # Number of trailing lines of the training log scanned on each update.
    LOG_TAIL_LINES = 100

    # Patterns used to pull numeric values out of training log lines.
    _LOSS_PATTERN = re.compile(r'loss[:\s]+([\d\.]+)', re.IGNORECASE)
    _PNL_PATTERN = re.compile(r'PnL[:\s]+\$?([+-]?[\d\.]+)')

    # Metrics taken from the training log, with the value used when the log
    # does not provide them.
    _PARSED_METRIC_DEFAULTS = {
        'total_pnl': 0.0,
        'avg_pnl': 0.0,
        'win_rate': 0.0,
        'current_loss': 0.0,
        'trades_per_hour': 0,
    }

    def __init__(self):
        """Initialize the overnight training monitor"""
        self.start_time = datetime.now()
        self.monitoring = True

        # Model specifications (static values, only used for reporting)
        self.model_specs = {
            'total_parameters': 504_889_098,
            'enhanced_cnn_params': 168_296_366,
            'dqn_agent_params': 336_592_732,
            'memory_usage_mb': 1926.7,
            'target_vram_gb': 4.0,
            'architecture': 'Massive Enhanced CNN + DQN Agent'
        }

        # Training metrics tracking.
        # NOTE(review): nothing in this class updates 'episodes_completed',
        # 'average_reward' or 'total_trades' after initialization, so they
        # are reported with these initial values unless a caller sets them.
        self.training_metrics = {
            'episodes_completed': 0,
            'total_reward': 0.0,
            'best_reward': -float('inf'),
            'average_reward': 0.0,
            'win_rate': 0.0,
            'total_trades': 0,
            'profit_factor': 0.0,
            'sharpe_ratio': 0.0,
            'max_drawdown': 0.0,
            'final_balance': 0.0,
            'training_loss': 0.0,
            # Keys written by update_training_metrics(); initialized here so
            # the dictionary has a stable schema before the first update.
            'total_pnl': 0.0,
            'avg_pnl': 0.0,
            'current_loss': 0.0,
            'trades_per_hour': 0,
            'memory_usage': 0.0,
            'gpu_usage': 0.0,
            'training_time': 0.0
        }

        # System monitoring (time series appended to on every status log)
        self.system_metrics = {
            'cpu_usage': [],
            'memory_usage': [],
            'gpu_usage': [],
            'gpu_memory': [],
            'disk_io': [],
            'network_io': []
        }

        # Performance tracking
        self.performance_history = []
        self.checkpoint_times = []

        # Profit tracking (500x leverage simulation).
        # NOTE(review): these values are never modified by this class after
        # initialization; the P&L parsed from the training log is stored in
        # training_metrics instead.
        self.profit_metrics = {
            'starting_balance': 10000.0,
            'current_balance': 10000.0,
            'total_pnl': 0.0,
            'realized_pnl': 0.0,
            'unrealized_pnl': 0.0,
            'leverage': 500,
            'fees_paid': 0.0,
            'roi_percentage': 0.0
        }

        logger.info("🚀 OVERNIGHT TRAINING MONITOR INITIALIZED")
        logger.info(f"📊 Model: {self.model_specs['total_parameters']:,} parameters")
        logger.info(f"💾 Memory: {self.model_specs['memory_usage_mb']:.1f} MB")
        logger.info(f"🎯 Target VRAM: {self.model_specs['target_vram_gb']} GB")
        logger.info(f"⚡ Leverage: {self.profit_metrics['leverage']}x")

    def check_system_resources(self) -> Dict:
        """Check current system resource usage.

        Blocks for about one second because the CPU load is sampled with
        ``psutil.cpu_percent(interval=1)``.

        Returns:
            A dictionary with a ``timestamp`` plus CPU, RAM, GPU, disk and
            network figures (sizes in GiB, loads in percent), or an empty
            dictionary if sampling failed.
        """
        gib = 1024 ** 3
        try:
            # CPU and Memory
            cpu_percent = psutil.cpu_percent(interval=1)
            memory = psutil.virtual_memory()

            # GPU monitoring
            gpu_usage = 0
            gpu_memory_used = 0
            gpu_memory_total = 0
            if torch.cuda.is_available():
                # NOTE(review): memory_allocated() only counts tensors
                # allocated by the current process; if training runs in a
                # separate process this will not reflect its VRAM usage.
                gpu_memory_used = torch.cuda.memory_allocated() / gib
                gpu_memory_total = torch.cuda.get_device_properties(0).total_memory / gib
                # GPU utilization is best-effort: a failing GPUtil query must
                # not discard the rest of the sample.
                try:
                    gpus = GPUtil.getGPUs()
                    if gpus:
                        gpu_usage = gpus[0].load * 100
                except Exception as e:
                    logger.debug(f"GPU utilization unavailable: {e}")
                    gpu_usage = 0

            # Disk and network I/O counters (psutil may return None for these)
            disk_io = psutil.disk_io_counters()
            network_io = psutil.net_io_counters()

            return {
                'timestamp': datetime.now(),
                'cpu_usage': cpu_percent,
                'memory_percent': memory.percent,
                'memory_used_gb': memory.used / gib,
                'memory_total_gb': memory.total / gib,
                'gpu_usage': gpu_usage,
                'gpu_memory_used_gb': gpu_memory_used,
                'gpu_memory_total_gb': gpu_memory_total,
                'gpu_memory_percent': (gpu_memory_used / gpu_memory_total * 100) if gpu_memory_total > 0 else 0,
                'disk_read_gb': disk_io.read_bytes / gib if disk_io else 0,
                'disk_write_gb': disk_io.write_bytes / gib if disk_io else 0,
                'network_sent_gb': network_io.bytes_sent / gib if network_io else 0,
                'network_recv_gb': network_io.bytes_recv / gib if network_io else 0
            }
        except Exception as e:
            logger.error(f"Error checking system resources: {e}")
            return {}

    def _parse_training_metrics(self) -> Dict[str, Any]:
        """Parse REAL training metrics from log files - NO SYNTHETIC DATA.

        Reads the last ``LOG_TAIL_LINES`` lines of ``logs/trading.log`` and
        extracts metrics from them.

        Returns:
            The extracted metrics, or the result of ``_default_metrics()``
            when the log is missing, contains nothing usable, or cannot be
            read.
        """
        try:
            training_log_path = Path("logs/trading.log")
            if not training_log_path.exists():
                logger.warning("⚠️ No training log found - metrics unavailable")
                return self._default_metrics()

            # Stream the file through a bounded deque so only the tail is kept
            # in memory; undecodable bytes are replaced rather than aborting.
            with open(training_log_path, 'r', encoding='utf-8', errors='replace') as f:
                recent_lines = list(deque(f, maxlen=self.LOG_TAIL_LINES))

            real_metrics = self._extract_real_metrics(recent_lines)
            if real_metrics:
                logger.info(f"✅ Parsed {len(real_metrics)} real training metrics")
                return real_metrics

            logger.warning("⚠️ No real metrics found in logs")
            return self._default_metrics()
        except Exception as e:
            logger.error(f"❌ Error parsing real training metrics: {e}")
            return self._default_metrics()

    def _extract_real_metrics(self, log_lines: List[str]) -> Dict[str, Any]:
        """Extract real metrics from training log lines.

        Args:
            log_lines: Raw lines of the training log, oldest first.

        Returns:
            A dictionary containing whichever of ``current_loss``,
            ``loss_trend``, ``trades_per_hour``, ``total_pnl``, ``avg_pnl``
            and ``win_rate`` could be derived, plus ``timestamp`` and
            ``data_source``. An empty dictionary is returned when no loss,
            trade or PnL information was found, or on error.
        """
        metrics: Dict[str, Any] = {}
        try:
            loss_values = []
            trade_count = 0
            pnl_values = []

            for line in log_lines:
                # Extract real loss values
                if "loss:" in line.lower() or "Loss" in line:
                    loss_match = self._LOSS_PATTERN.search(line)
                    if loss_match:
                        try:
                            loss_values.append(float(loss_match.group(1)))
                        except ValueError:
                            # The pattern also matches non-numbers like "1.2.3"
                            pass

                # Extract real trade information
                if "TRADE" in line and "OPENED" in line:
                    trade_count += 1

                # Extract real PnL values
                if "PnL:" in line:
                    pnl_match = self._PNL_PATTERN.search(line)
                    if pnl_match:
                        try:
                            pnl_values.append(float(pnl_match.group(1)))
                        except ValueError:
                            pass

            # Nothing recognisable in the log: report "no data" instead of a
            # dictionary that only carries bookkeeping fields.
            if not (loss_values or trade_count or pnl_values):
                return {}

            # Calculate real averages
            if loss_values:
                metrics['current_loss'] = sum(loss_values) / len(loss_values)
                metrics['loss_trend'] = (
                    'decreasing'
                    if len(loss_values) > 1 and loss_values[-1] < loss_values[0]
                    else 'stable'
                )
            if trade_count:
                # Number of trade-open lines within the scanned log tail
                metrics['trades_per_hour'] = trade_count
            if pnl_values:
                metrics['total_pnl'] = sum(pnl_values)
                metrics['avg_pnl'] = sum(pnl_values) / len(pnl_values)
                metrics['win_rate'] = len([p for p in pnl_values if p > 0]) / len(pnl_values)

            # Add timestamp and provenance marker
            metrics['timestamp'] = datetime.now()
            metrics['data_source'] = 'real_training_logs'
            return metrics
        except Exception as e:
            logger.error(f"❌ Error extracting real metrics: {e}")
            return {}

    def _default_metrics(self) -> Dict[str, Any]:
        """Return default metrics when no real data is available"""
        return {
            'current_loss': 0.0,
            'trades_per_hour': 0,
            'total_pnl': 0.0,
            'avg_pnl': 0.0,
            'win_rate': 0.0,
            'timestamp': datetime.now(),
            'data_source': 'no_real_data_available',
            'loss_trend': 'unknown'
        }

    def update_training_metrics(self, system_info: Optional[Dict] = None):
        """Update training metrics from the run directory, checkpoints and logs.

        Logs the most recently modified entry in ``runs/`` and the newest
        ``*.pt`` file in ``models/rl``, then refreshes ``training_metrics``
        from the training log and the current system sample.

        Args:
            system_info: An already collected result of
                ``check_system_resources()``. When omitted, a fresh sample is
                taken (which blocks for about one second).
        """
        try:
            # Look for TensorBoard log files
            runs_dir = Path("runs")
            if runs_dir.exists():
                latest_run = max(runs_dir.glob("*"), key=lambda p: p.stat().st_mtime, default=None)
                if latest_run:
                    # Parse TensorBoard logs (simplified)
                    logger.info(f"📈 Latest training run: {latest_run.name}")

            # Check for model checkpoints
            models_dir = Path("models/rl")
            if models_dir.exists():
                checkpoints = list(models_dir.glob("*.pt"))
                if checkpoints:
                    latest_checkpoint = max(checkpoints, key=lambda p: p.stat().st_mtime)
                    checkpoint_time = datetime.fromtimestamp(latest_checkpoint.stat().st_mtime)
                    # Record each checkpoint time once, not once per cycle
                    if not self.checkpoint_times or self.checkpoint_times[-1] != checkpoint_time:
                        self.checkpoint_times.append(checkpoint_time)
                    logger.info(f"💾 Latest checkpoint: {latest_checkpoint.name} at {checkpoint_time}")

            # Parse REAL training metrics from logs - NO SYNTHETIC DATA
            real_metrics = self._parse_training_metrics()
            if real_metrics.get('data_source') == 'real_training_logs':
                logger.info("✅ Using REAL training metrics")
            else:
                # No real data available - use safe defaults (NO SYNTHETIC)
                logger.warning("⚠️ No real training metrics available - using zero values")
                real_metrics = {}
            for key, default in self._PARSED_METRIC_DEFAULTS.items():
                self.training_metrics[key] = real_metrics.get(key, default)

            # Update other real metrics from a single system sample; missing
            # keys (failed sample) fall back to zero instead of raising.
            if system_info is None:
                system_info = self.check_system_resources()
            self.training_metrics['memory_usage'] = system_info.get('memory_percent', 0.0)
            self.training_metrics['gpu_usage'] = system_info.get('gpu_usage', 0.0)
            self.training_metrics['training_time'] = (datetime.now() - self.start_time).total_seconds()

            # Log real metrics
            logger.info(f"🔄 Real Training Metrics Updated:")
            logger.info(f" 💰 Total PnL: ${self.training_metrics['total_pnl']:.2f}")
            logger.info(f" 📊 Win Rate: {self.training_metrics['win_rate']:.1%}")
            logger.info(f" 🔢 Trades: {self.training_metrics['trades_per_hour']}")
            logger.info(f" 📉 Loss: {self.training_metrics['current_loss']:.4f}")
            logger.info(f" 💾 Memory: {self.training_metrics['memory_usage']:.1f}%")
            logger.info(f" 🎮 GPU: {self.training_metrics['gpu_usage']:.1f}%")
        except Exception as e:
            logger.error(f"❌ Error updating real training metrics: {e}")
            # Set safe defaults on error (NO SYNTHETIC FALLBACK)
            self.training_metrics.update(self._PARSED_METRIC_DEFAULTS)

    def log_comprehensive_status(self):
        """Log comprehensive training status.

        Samples system resources, refreshes the training metrics, writes the
        full status report to the log and appends a snapshot of the current
        state to ``performance_history``.
        """
        system_info = self.check_system_resources()
        # Reuse the sample above so the update does not block on a second one
        self.update_training_metrics(system_info)

        runtime = datetime.now() - self.start_time
        runtime_hours = runtime.total_seconds() / 3600

        logger.info("="*80)
        logger.info("🚀 MASSIVE MODEL OVERNIGHT TRAINING STATUS")
        logger.info("="*80)

        # Training Progress
        logger.info("📊 TRAINING PROGRESS:")
        logger.info(f" ⏱️ Runtime: {runtime}")
        logger.info(f" 📈 Episodes: {self.training_metrics['episodes_completed']:,}")
        logger.info(f" 🎯 Average Reward: {self.training_metrics['average_reward']:.2f}")
        logger.info(f" 🏆 Win Rate: {self.training_metrics['win_rate']:.1%}")
        logger.info(f" 💹 Total Trades: {self.training_metrics['total_trades']:,}")

        # Profit Metrics (500x Leverage)
        logger.info("💰 PROFIT METRICS (500x LEVERAGE):")
        logger.info(f" 💵 Starting Balance: ${self.profit_metrics['starting_balance']:,.2f}")
        logger.info(f" 💰 Current Balance: ${self.profit_metrics['current_balance']:,.2f}")
        logger.info(f" 📈 Total P&L: ${self.profit_metrics['total_pnl']:+,.2f}")
        logger.info(f" 📊 ROI: {self.profit_metrics['roi_percentage']:+.2f}%")
        logger.info(f" ⚡ Leverage: {self.profit_metrics['leverage']}x")

        # Model Specifications
        logger.info("🤖 MODEL SPECIFICATIONS:")
        logger.info(f" 🧠 Total Parameters: {self.model_specs['total_parameters']:,}")
        logger.info(f" 🏗️ Enhanced CNN: {self.model_specs['enhanced_cnn_params']:,}")
        logger.info(f" 🎮 DQN Agent: {self.model_specs['dqn_agent_params']:,}")
        logger.info(f" 💾 Memory Usage: {self.model_specs['memory_usage_mb']:.1f} MB")

        # System Resources (skipped when the sample failed and is empty)
        if system_info:
            logger.info("💻 SYSTEM RESOURCES:")
            logger.info(f" 🔄 CPU Usage: {system_info['cpu_usage']:.1f}%")
            logger.info(f" 🧠 RAM Usage: {system_info['memory_used_gb']:.1f}/{system_info['memory_total_gb']:.1f} GB ({system_info['memory_percent']:.1f}%)")
            logger.info(f" 🎮 GPU Usage: {system_info['gpu_usage']:.1f}%")
            logger.info(f" 🔥 VRAM Usage: {system_info['gpu_memory_used_gb']:.1f}/{system_info['gpu_memory_total_gb']:.1f} GB ({system_info['gpu_memory_percent']:.1f}%)")

            # Store metrics for plotting
            self.system_metrics['cpu_usage'].append(system_info['cpu_usage'])
            self.system_metrics['memory_usage'].append(system_info['memory_percent'])
            self.system_metrics['gpu_usage'].append(system_info['gpu_usage'])
            self.system_metrics['gpu_memory'].append(system_info['gpu_memory_percent'])

        # Performance estimate
        if runtime_hours > 0:
            episodes_per_hour = self.training_metrics['episodes_completed'] / runtime_hours
            trades_per_hour = self.training_metrics['total_trades'] / runtime_hours
            profit_per_hour = self.profit_metrics['total_pnl'] / runtime_hours

            logger.info("⚡ PERFORMANCE ESTIMATES:")
            logger.info(f" 📊 Episodes/Hour: {episodes_per_hour:.1f}")
            logger.info(f" 💹 Trades/Hour: {trades_per_hour:.1f}")
            logger.info(f" 💰 Profit/Hour: ${profit_per_hour:+.2f}")

            # Projections for the remainder of the full session
            hours_remaining = max(0, self.SESSION_HOURS - runtime_hours)
            if hours_remaining > 0:
                projected_episodes = self.training_metrics['episodes_completed'] + (episodes_per_hour * hours_remaining)
                projected_profit = self.profit_metrics['total_pnl'] + (profit_per_hour * hours_remaining)

                logger.info("🔮 OVERNIGHT PROJECTIONS:")
                logger.info(f" ⏰ Hours Remaining: {hours_remaining:.1f}")
                logger.info(f" 📈 Projected Episodes: {projected_episodes:.0f}")
                logger.info(f" 💰 Projected Profit: ${projected_profit:+,.2f}")

        logger.info("="*80)

        # Save performance snapshot (copies, so later updates do not rewrite it)
        self.performance_history.append({
            'timestamp': datetime.now().isoformat(),
            'runtime_hours': runtime_hours,
            'training_metrics': self.training_metrics.copy(),
            'profit_metrics': self.profit_metrics.copy(),
            'system_info': system_info
        })

    @staticmethod
    def _draw_panel(ax, x_values, series, title, y_label, zero_line=False):
        """Draw one dashboard panel.

        Args:
            ax: The matplotlib axes to draw on.
            x_values: Shared x coordinates (runtime in hours).
            series: Iterable of ``(values, color, line_width, marker, label)``
                tuples; a legend is shown if any label is not ``None``.
            title: Panel title.
            y_label: Label of the y axis.
            zero_line: Whether to draw a dashed red reference line at y=0.
        """
        for values, color, line_width, marker, label in series:
            ax.plot(x_values, values, color, linewidth=line_width, marker=marker, label=label)
        if zero_line:
            ax.axhline(y=0, color='red', linestyle='--', alpha=0.7)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_xlabel('Runtime (Hours)')
        ax.set_ylabel(y_label)
        if any(label is not None for *_, label in series):
            ax.legend()
        ax.grid(True, alpha=0.3)

    def create_performance_plots(self):
        """Create real-time performance visualization plots.

        Renders a 2x3 dashboard from ``performance_history`` and saves it as a
        timestamped PNG under ``plots/overnight_training``. Does nothing when
        fewer than two snapshots exist. Errors are logged, not raised.
        """
        if len(self.performance_history) < 2:
            return

        fig = None
        try:
            history = self.performance_history

            # Extract time series data
            runtime_hours = [h['runtime_hours'] for h in history]
            episodes = [h['training_metrics']['episodes_completed'] for h in history]
            rewards = [h['training_metrics']['average_reward'] for h in history]
            win_rate_pct = [h['training_metrics']['win_rate'] * 100 for h in history]
            profits = [h['profit_metrics']['total_pnl'] for h in history]
            roi = [h['profit_metrics']['roi_percentage'] for h in history]
            cpu_usage = [h['system_info'].get('cpu_usage', 0) for h in history]
            gpu_memory = [h['system_info'].get('gpu_memory_percent', 0) for h in history]

            # Apply the dark style only while this figure is built and saved,
            # leaving matplotlib's global settings untouched.
            with plt.style.context('dark_background'):
                fig, axes = plt.subplots(2, 3, figsize=(20, 12))
                fig.suptitle('🚀 MASSIVE MODEL OVERNIGHT TRAINING DASHBOARD 🚀', fontsize=16, fontweight='bold')

                self._draw_panel(
                    axes[0, 0], runtime_hours, [(episodes, 'cyan', 2, 'o', None)],
                    '📈 Training Episodes', 'Episodes Completed')
                self._draw_panel(
                    axes[0, 1], runtime_hours, [(rewards, 'lime', 2, 's', None)],
                    '🎯 Average Reward', 'Average Reward')
                self._draw_panel(
                    axes[0, 2], runtime_hours, [(win_rate_pct, 'gold', 2, '^', None)],
                    '🏆 Win Rate (%)', 'Win Rate (%)')
                self._draw_panel(
                    axes[1, 0], runtime_hours, [(profits, 'magenta', 3, 'D', None)],
                    '💰 P&L (500x Leverage)', 'Total P&L ($)', zero_line=True)
                self._draw_panel(
                    axes[1, 1], runtime_hours, [(roi, 'orange', 2, '*', None)],
                    '📊 ROI (%)', 'ROI (%)', zero_line=True)
                self._draw_panel(
                    axes[1, 2], runtime_hours,
                    [(cpu_usage, 'red', 2, 'o', 'CPU %'), (gpu_memory, 'cyan', 2, 's', 'VRAM %')],
                    '💻 System Resources', 'Usage (%)')

                fig.tight_layout()

                # Save plot
                plots_dir = Path("plots/overnight_training")
                plots_dir.mkdir(parents=True, exist_ok=True)
                timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                plot_path = plots_dir / f"training_dashboard_{timestamp}.png"
                fig.savefig(plot_path, dpi=300, bbox_inches='tight', facecolor='black')

            logger.info(f"📊 Performance dashboard saved: {plot_path}")
        except Exception as e:
            logger.error(f"Error creating performance plots: {e}")
        finally:
            # Always release the figure, even when plotting or saving failed
            if fig is not None:
                plt.close(fig)

    @staticmethod
    def _mean_or_zero(values) -> float:
        """Return the arithmetic mean of ``values``, or 0 for an empty sequence."""
        return float(np.mean(values)) if values else 0

    def save_progress_report(self):
        """Save comprehensive progress report.

        Writes session info, model specs, current metrics, averaged system
        metrics and the full ``performance_history`` as a timestamped JSON
        file under ``reports/overnight_training``. Errors are logged, not
        raised.
        """
        try:
            now = datetime.now()
            runtime = now - self.start_time
            report = {
                'session_info': {
                    'start_time': self.start_time.isoformat(),
                    'current_time': now.isoformat(),
                    'runtime': str(runtime),
                    'runtime_hours': runtime.total_seconds() / 3600
                },
                'model_specifications': self.model_specs,
                'training_metrics': self.training_metrics,
                'profit_metrics': self.profit_metrics,
                'system_metrics_summary': {
                    'avg_cpu_usage': self._mean_or_zero(self.system_metrics['cpu_usage']),
                    'avg_memory_usage': self._mean_or_zero(self.system_metrics['memory_usage']),
                    'avg_gpu_usage': self._mean_or_zero(self.system_metrics['gpu_usage']),
                    'avg_gpu_memory': self._mean_or_zero(self.system_metrics['gpu_memory'])
                },
                'performance_history': self.performance_history
            }

            # Save report
            reports_dir = Path("reports/overnight_training")
            reports_dir.mkdir(parents=True, exist_ok=True)
            timestamp = now.strftime("%Y%m%d_%H%M%S")
            report_path = reports_dir / f"progress_report_{timestamp}.json"
            with open(report_path, 'w', encoding='utf-8') as f:
                # default=str serializes the datetime objects held in snapshots
                json.dump(report, f, indent=2, default=str)

            logger.info(f"📄 Progress report saved: {report_path}")
        except Exception as e:
            logger.error(f"Error saving progress report: {e}")

    @staticmethod
    def _cycles_per(period_seconds: float, check_interval: float) -> int:
        """Return how many monitoring cycles make up ``period_seconds``.

        The result is at least 1, also for a non-positive ``check_interval``.
        """
        if check_interval <= 0:
            return 1
        return max(1, round(period_seconds / check_interval))

    def monitor_overnight_training(self, check_interval: int = 300):
        """Main monitoring loop for overnight training.

        Logs the status every ``check_interval`` seconds, creates dashboard
        plots roughly every hour and progress reports roughly every two
        hours, and finalizes the session after ``SESSION_HOURS`` hours, on
        Ctrl-C, or on an unexpected error.

        Args:
            check_interval: Seconds to sleep between two status checks.
        """
        logger.info("🌙 STARTING OVERNIGHT TRAINING MONITORING")
        logger.info(f"⏰ Check interval: {check_interval} seconds ({check_interval/60:.1f} minutes)")
        logger.info("🚀 Monitoring the MASSIVE 504M parameter model training...")

        # Derive the plot/report cadence from the interval so that "hourly"
        # stays hourly for any check_interval (12 and 24 cycles at 300 s).
        plot_every = self._cycles_per(3600, check_interval)
        report_every = self._cycles_per(7200, check_interval)

        try:
            while self.monitoring:
                # Log comprehensive status (adds one snapshot to the history)
                self.log_comprehensive_status()

                runtime_hours = (datetime.now() - self.start_time).total_seconds() / 3600
                snapshots = len(self.performance_history)

                # Create performance plots every hour
                if snapshots > 0 and snapshots % plot_every == 0:
                    self.create_performance_plots()

                # Save progress report every 2 hours
                if snapshots > 0 and snapshots % report_every == 0:
                    self.save_progress_report()

                # Check if we've been running for a full overnight session
                if runtime_hours >= self.SESSION_HOURS:
                    logger.info(f"🌅 OVERNIGHT TRAINING SESSION COMPLETED ({self.SESSION_HOURS:g}+ hours)")
                    self.finalize_overnight_session()
                    break

                # Wait for next check
                time.sleep(check_interval)
        except KeyboardInterrupt:
            logger.info("🛑 MONITORING STOPPED BY USER")
            self.finalize_overnight_session()
        except Exception as e:
            logger.error(f"❌ MONITORING ERROR: {e}")
            self.finalize_overnight_session()

    def finalize_overnight_session(self):
        """Finalize the overnight training session.

        Logs a final status, writes the final plots and report, logs the
        session summary and clears the ``monitoring`` flag.
        """
        logger.info("🏁 FINALIZING OVERNIGHT TRAINING SESSION")

        # Final status log, plots and report
        self.log_comprehensive_status()
        self.create_performance_plots()
        self.save_progress_report()

        # Calculate session summary
        runtime = datetime.now() - self.start_time

        logger.info("="*80)
        logger.info("🌅 OVERNIGHT TRAINING SESSION COMPLETE")
        logger.info("="*80)
        logger.info(f"⏰ Total Runtime: {runtime}")
        logger.info(f"📊 Total Episodes: {self.training_metrics['episodes_completed']:,}")
        logger.info(f"💹 Total Trades: {self.training_metrics['total_trades']:,}")
        logger.info(f"💰 Final P&L: ${self.profit_metrics['total_pnl']:+,.2f}")
        logger.info(f"📈 Final ROI: {self.profit_metrics['roi_percentage']:+.2f}%")
        logger.info(f"🏆 Final Win Rate: {self.training_metrics['win_rate']:.1%}")
        logger.info(f"🎯 Avg Reward: {self.training_metrics['average_reward']:.2f}")
        logger.info("="*80)
        logger.info("🚀 MASSIVE 504M PARAMETER MODEL TRAINING SESSION COMPLETED!")
        logger.info("="*80)

        self.monitoring = False
def main():
    """Main function to start overnight monitoring.

    Creates an ``OvernightTrainingMonitor`` and runs its monitoring loop with
    a check interval of 300 seconds. Any exception escaping the monitor is
    logged together with its traceback instead of being propagated.
    """
    try:
        logger.info("🚀 INITIALIZING OVERNIGHT TRAINING MONITOR")
        logger.info("💡 Monitoring 504.89 Million Parameter Enhanced CNN + DQN Agent")
        logger.info("🎯 Target: 4GB VRAM utilization with maximum profit optimization")

        # Create monitor
        monitor = OvernightTrainingMonitor()

        # Start monitoring (check every 5 minutes)
        monitor.monitor_overnight_training(check_interval=300)
    except Exception as e:
        # logger.exception attaches the traceback to the record, so it reaches
        # every handler of the logger rather than being printed to stderr only.
        logger.exception(f"Fatal error in overnight monitoring: {e}")


# Run the monitor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()