#!/usr/bin/env python3
"""
Overnight Training Restart Script

Keeps main.py running continuously, restarting it if it crashes.
Designed for overnight training sessions with unstable code.

Usage:
    python restart_main_overnight.py

Press Ctrl+C to stop the restart loop.
"""

import subprocess
import sys
import time
import logging
from datetime import datetime
from pathlib import Path
import signal
import os

# Tunables for the restart loop (all values in seconds unless noted).
RESTART_DELAY_SECONDS = 10          # pause between a normal exit and the next start
FAST_EXIT_THRESHOLD_SECONDS = 30    # runs shorter than this count as "fast exits"
MAX_CONSECUTIVE_FAST_EXITS = 5      # abort the loop after this many fast exits in a row
MAX_FAST_EXIT_DELAY_SECONDS = 60    # upper bound for the fast-exit back-off delay
ERROR_RETRY_DELAY_SECONDS = 30      # pause after an unexpected error in the loop itself
STOP_TIMEOUT_SECONDS = 10           # grace period between terminate() and kill()

# PowerShell pipeline that prints the PID of every python*/pythonw* process
# whose command line mentions main.py. It deliberately contains no double
# quotes so it survives Windows command-line quoting unchanged.
_FIND_MAIN_PIDS_POWERSHELL = (
    "Get-CimInstance Win32_Process | "
    "Where-Object { $_.Name -like 'python*' -and "
    r"$_.CommandLine -match '\bmain\.py\b' } | "
    "ForEach-Object { $_.ProcessId }"
)


def setup_restart_logging():
    """Configure logging for restart events and return the script's logger.

    Creates a ``logs`` directory in the current working directory (if needed)
    and configures the root logger to write INFO-level records both to a
    timestamped ``restart_main_<timestamp>.log`` file and to stdout.

    Returns:
        The ``logging.Logger`` named after this module.
    """
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    # One log file per supervisor session, named by start time.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"restart_main_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file, encoding='utf-8'),
            logging.StreamHandler(sys.stdout),
        ],
    )

    logger = logging.getLogger(__name__)
    logger.info(f"Restart script logging to: {log_file}")
    return logger


def _find_stale_main_pids():
    """Return PIDs of other Python processes running main.py (Windows only).

    This script's own PID and its parent's PID are never returned, so the
    supervisor (and a virtual-environment launcher that may have spawned it)
    cannot be selected for termination.
    """
    completed = subprocess.run(
        ["powershell", "-NoProfile", "-NonInteractive", "-Command",
         _FIND_MAIN_PIDS_POWERSHELL],
        capture_output=True,
        text=True,
        check=False,
        timeout=30,
    )
    protected = {os.getpid(), os.getppid()}
    return [
        int(token)
        for token in completed.stdout.split()
        if token.isdigit() and int(token) not in protected
    ]


def kill_existing_processes(logger):
    """Kill any existing main.py processes to avoid conflicts.

    Only acts on Windows; on other platforms this is a no-op. Only Python
    processes whose command line mentions main.py are killed. A blanket
    ``taskkill /im python.exe`` would also kill this restart script itself
    and every unrelated Python program on the machine.

    The operation is best-effort: any failure is logged as a warning and
    never raised.

    Args:
        logger: Logger used to report killed processes and failures.
    """
    if os.name != 'nt':
        return

    try:
        stale_pids = _find_stale_main_pids()
        for pid in stale_pids:
            logger.info(f"[CLEANUP] Killing stale main.py process (PID {pid})")
            subprocess.run(['taskkill', '/f', '/pid', str(pid)],
                           capture_output=True, check=False)
        if stale_pids:
            # Give the OS a moment to release whatever the old process held.
            time.sleep(2)
    except Exception as e:
        logger.warning(f"Could not kill existing processes: {e}")


def _stop_process(process, logger):
    """Terminate ``process`` if it is still running, force-killing on timeout."""
    if process.poll() is not None:
        return

    process.terminate()
    try:
        process.wait(timeout=STOP_TIMEOUT_SECONDS)
    except subprocess.TimeoutExpired:
        logger.warning("[FORCE KILL] Process didn't terminate, force killing...")
        process.kill()
        process.wait()
    except KeyboardInterrupt:
        # A second interrupt while waiting: stop being polite.
        process.kill()
        raise


def _run_main_once(logger):
    """Run main.py once, forward its output to stdout, and return its exit code.

    The child is always stopped and its pipe closed before this function
    returns or raises, so a failure here can never leave a stray main.py
    running next to the one started by the following restart.
    """
    process = subprocess.Popen(
        [sys.executable, "main.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        errors="replace",  # undecodable child output must not crash the supervisor
        bufsize=1,
    )
    logger.info(f"[PROCESS] main.py started with PID: {process.pid}")

    try:
        if process.stdout:
            # Iterating blocks until a line arrives and ends at EOF, so there
            # is no busy loop if the child closes stdout while still running.
            for line in process.stdout:
                print(f"[MAIN] {line.rstrip()}")
        return process.wait()
    except KeyboardInterrupt:
        logger.info("[INTERRUPT] Ctrl+C received, stopping main.py...")
        raise
    finally:
        _stop_process(process, logger)
        if process.stdout:
            process.stdout.close()


def run_main_with_restart(logger):
    """Run main.py in a loop, restarting it every time it exits.

    The loop ends when the user interrupts it (KeyboardInterrupt) or after
    ``MAX_CONSECUTIVE_FAST_EXITS`` consecutive runs shorter than
    ``FAST_EXIT_THRESHOLD_SECONDS``. Unexpected errors inside the loop are
    logged and the loop continues after ``ERROR_RETRY_DELAY_SECONDS``.
    A session summary is logged before returning.

    Args:
        logger: Logger used for all restart events.
    """
    restart_count = 0
    consecutive_fast_exits = 0
    start_time = datetime.now()

    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING RESTART SCRIPT STARTED")
    logger.info("=" * 60)
    logger.info("Press Ctrl+C to stop the restart loop")
    logger.info("Main script: main.py")
    logger.info(f"Restart delay on crash: {RESTART_DELAY_SECONDS} seconds")
    logger.info("Fast exit protection: Enabled")
    logger.info("=" * 60)

    try:
        # Clear out leftovers from a previous session before the first start.
        kill_existing_processes(logger)

        while True:
            try:
                restart_count += 1
                run_start_time = datetime.now()
                logger.info(
                    f"[RESTART #{restart_count}] Starting main.py at "
                    f"{run_start_time.strftime('%H:%M:%S')}"
                )

                exit_code = _run_main_once(logger)

                run_duration = (datetime.now() - run_start_time).total_seconds()
                logger.info(f"[EXIT] main.py exited with code {exit_code}")
                logger.info(f"[DURATION] Process ran for {run_duration:.1f} seconds")

                # Fast exits usually mean a configuration/startup problem:
                # back off progressively and eventually give up.
                if run_duration < FAST_EXIT_THRESHOLD_SECONDS:
                    consecutive_fast_exits += 1
                    logger.warning(
                        f"[FAST EXIT] Process exited quickly "
                        f"({consecutive_fast_exits} consecutive)"
                    )

                    if consecutive_fast_exits >= MAX_CONSECUTIVE_FAST_EXITS:
                        logger.error(
                            f"[ABORT] Too many consecutive fast exits "
                            f"({MAX_CONSECUTIVE_FAST_EXITS}+)"
                        )
                        logger.error("This indicates a configuration or startup problem")
                        logger.error("Please check the main.py script manually")
                        break

                    delay = min(MAX_FAST_EXIT_DELAY_SECONDS,
                                RESTART_DELAY_SECONDS * consecutive_fast_exits)
                    logger.info(
                        f"[DELAY] Waiting {delay} seconds before restart due to fast exit..."
                    )
                    time.sleep(delay)
                else:
                    consecutive_fast_exits = 0  # a healthy run resets the counter
                    logger.info(
                        f"[DELAY] Waiting {RESTART_DELAY_SECONDS} seconds before restart..."
                    )
                    time.sleep(RESTART_DELAY_SECONDS)

                # Log session statistics every 10 restarts.
                if restart_count % 10 == 0:
                    total_duration = (datetime.now() - start_time).total_seconds()
                    logger.info(
                        f"[STATS] Session: {restart_count} restarts in "
                        f"{total_duration/3600:.1f} hours"
                    )

            except Exception as e:
                logger.error(f"[ERROR] Unexpected error in restart loop: {e}")
                logger.error(
                    f"Continuing restart loop after {ERROR_RETRY_DELAY_SECONDS} second delay..."
                )
                time.sleep(ERROR_RETRY_DELAY_SECONDS)

    except KeyboardInterrupt:
        # Wraps the whole loop so an interrupt during any sleep (including the
        # error back-off above) still ends the session cleanly.
        logger.info("[SHUTDOWN] Restart loop interrupted by user")

    total_duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING SESSION COMPLETE")
    logger.info(f"Total restarts: {restart_count}")
    logger.info(f"Total session time: {total_duration/3600:.1f} hours")
    logger.info("=" * 60)


def main():
    """Main entry point.

    Sets up logging, installs SIGINT/SIGTERM handlers, and runs the restart
    loop.

    Returns:
        0 on normal or user-requested shutdown, 1 on a fatal error.
    """
    # Logging is configured before the handlers are installed so a handler
    # can never run while ``logger`` is still undefined.
    global logger
    logger = setup_restart_logging()

    def signal_handler(signum, frame):
        logger.info(f"[SIGNAL] Received signal {signum}, shutting down...")
        # Raise KeyboardInterrupt rather than calling sys.exit(): SystemExit
        # would bypass the loop's interrupt handling, leaving main.py running
        # and the session summary unwritten.
        raise KeyboardInterrupt

    signal.signal(signal.SIGINT, signal_handler)
    if hasattr(signal, 'SIGTERM'):
        signal.signal(signal.SIGTERM, signal_handler)

    try:
        run_main_with_restart(logger)
    except KeyboardInterrupt:
        # A further signal arrived while the loop was already shutting down.
        logger.info("[SHUTDOWN] Interrupted during shutdown, exiting")
        return 0
    except Exception as e:
        logger.error(f"[FATAL] Fatal error in restart script: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return 1

    return 0


if __name__ == "__main__":
    sys.exit(main())