Files
gogo2/scripts/restart_main_overnight.py
2025-06-25 15:16:49 +03:00

188 lines
6.9 KiB
Python

#!/usr/bin/env python3
"""
Overnight Training Restart Script
Keeps main.py running continuously, restarting it if it crashes.
Designed for overnight training sessions with unstable code.
Usage:
python restart_main_overnight.py
Press Ctrl+C to stop the restart loop.
"""
import subprocess
import sys
import time
import logging
from datetime import datetime
from pathlib import Path
import signal
import os
# Setup logging for the restart script
def setup_restart_logging():
    """Configure logging for the restart script and return its logger.

    Ensures a ``logs`` directory exists under the current working directory,
    then configures the root logger at INFO level with two handlers: a UTF-8
    file handler writing to ``logs/restart_main_<YYYYmmdd_HHMMSS>.log`` and a
    stream handler on ``sys.stdout``.

    Returns:
        The logger for this module, after it has logged the log file path.
    """
    logs_folder = Path("logs")
    logs_folder.mkdir(exist_ok=True)

    # One log file per invocation, named after the start time.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = logs_folder / f"restart_main_{stamp}.log"

    file_handler = logging.FileHandler(target, encoding='utf-8')
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )

    restart_logger = logging.getLogger(__name__)
    restart_logger.info(f"Restart script logging to: {target}")
    return restart_logger
def kill_existing_processes(logger):
    """Kill leftover ``main.py`` processes so a fresh run does not conflict.

    Only Windows (``os.name == 'nt'``) is handled; on every other platform
    this function does nothing.

    The previous implementation ran ``taskkill /f /im python.exe``, which
    terminates *every* Python process on the machine, including the
    interpreter running this restart script. This version asks PowerShell for
    ``python.exe`` / ``pythonw.exe`` processes whose command line mentions
    ``main.py`` and never targets the current process.

    Args:
        logger: Logger used to report a failure of the cleanup attempt.

    Returns:
        None. Failures are logged as warnings and never raised, because the
        cleanup is best-effort.
    """
    if os.name != 'nt':
        return

    own_pid = os.getpid()
    # Single-quoted PowerShell strings only, so the script survives Windows
    # command-line quoting unchanged.
    # NOTE: the '*main.py*' wildcard also matches other scripts whose path
    # merely contains "main.py" (e.g. "domain.py").
    ps_script = (
        "Get-CimInstance Win32_Process | "
        "Where-Object { "
        "($_.Name -eq 'python.exe' -or $_.Name -eq 'pythonw.exe') "
        "-and $_.CommandLine -like '*main.py*' "
        f"-and $_.ProcessId -ne {own_pid} "
        "} | "
        "ForEach-Object { Stop-Process -Id $_.ProcessId -Force "
        "-ErrorAction SilentlyContinue }"
    )

    try:
        subprocess.run(
            ['powershell', '-NoProfile', '-NonInteractive',
             '-Command', ps_script],
            capture_output=True,
            check=False,
            timeout=30,  # never let a stuck PowerShell block the start-up
        )
        # Give the OS a moment to release resources held by killed processes.
        time.sleep(2)
    except Exception as e:
        logger.warning(f"Could not kill existing processes: {e}")
def _stream_child_output(process):
    """Forward the child's combined stdout/stderr to our stdout until EOF."""
    if process.stdout is None:
        # If no stdout, just wait for process to complete
        process.wait()
        return
    # Iterating the pipe blocks until a line or EOF arrives, so there is no
    # busy-spin if the child closes stdout but keeps running.
    for line in process.stdout:
        # Forward output from main.py (remove extra newlines)
        print(f"[MAIN] {line.rstrip()}")
    process.wait()


def _stop_child(process, logger):
    """Terminate the child, escalating to kill after 10 s; no-op if exited."""
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        logger.warning("[FORCE KILL] Process didn't terminate, force killing...")
        process.kill()
        process.wait()


def _run_child_once(logger):
    """Run main.py once, stream its output, and return its exit code."""
    process = subprocess.Popen(
        [sys.executable, "main.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        bufsize=1,
        # Undecodable bytes in the child's output must not raise in the
        # supervisor; replace them instead.
        errors="replace",
    )
    logger.info(f"[PROCESS] main.py started with PID: {process.pid}")
    try:
        _stream_child_output(process)
    except (KeyboardInterrupt, SystemExit):
        logger.info("[INTERRUPT] Ctrl+C received, stopping main.py...")
        raise
    finally:
        # Whatever went wrong, never leave a child running: otherwise the
        # next loop iteration would start a second main.py alongside it.
        _stop_child(process, logger)
        if process.stdout is not None:
            process.stdout.close()
    return process.returncode


def run_main_with_restart(logger):
    """Run ``main.py`` in a loop, restarting it whenever it exits.

    Behaviour:
      * ``main.py`` is started with the current interpreter and its output is
        echoed to stdout with a ``[MAIN]`` prefix.
      * A run shorter than 30 seconds counts as a "fast exit". The restart
        delay grows by 10 seconds per consecutive fast exit (capped at 60),
        and the loop aborts after 5 consecutive fast exits.
      * Any other exit is followed by a 10 second delay and a restart.
      * Unexpected errors in the loop are logged and followed by a 30 second
        delay before the next attempt.
      * ``KeyboardInterrupt`` or ``SystemExit`` (the latter is what a signal
        handler calling ``sys.exit`` raises) stops the child and ends the loop.

    A session summary is logged before returning.

    Args:
        logger: Logger that receives all supervisor messages.

    Returns:
        None.
    """
    restart_count = 0
    consecutive_fast_exits = 0
    start_time = datetime.now()

    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING RESTART SCRIPT STARTED")
    logger.info("=" * 60)
    logger.info("Press Ctrl+C to stop the restart loop")
    logger.info("Main script: main.py")
    logger.info("Restart delay on crash: 10 seconds")
    logger.info("Fast exit protection: Enabled")
    logger.info("=" * 60)

    # Kill any existing processes
    kill_existing_processes(logger)

    # The interrupt handler wraps the whole loop so that an interrupt arriving
    # during the error back-off sleep also ends the session cleanly.
    try:
        while True:
            try:
                restart_count += 1
                run_start_time = datetime.now()
                logger.info(f"[RESTART #{restart_count}] Starting main.py at {run_start_time.strftime('%H:%M:%S')}")

                exit_code = _run_child_once(logger)

                run_duration = (datetime.now() - run_start_time).total_seconds()
                logger.info(f"[EXIT] main.py exited with code {exit_code}")
                logger.info(f"[DURATION] Process ran for {run_duration:.1f} seconds")

                # Check for fast exits (potential configuration issues)
                if run_duration < 30:
                    consecutive_fast_exits += 1
                    logger.warning(f"[FAST EXIT] Process exited quickly ({consecutive_fast_exits} consecutive)")
                    if consecutive_fast_exits >= 5:
                        logger.error("[ABORT] Too many consecutive fast exits (5+)")
                        logger.error("This indicates a configuration or startup problem")
                        logger.error("Please check the main.py script manually")
                        break
                    # Longer delay for fast exits
                    delay = min(60, 10 * consecutive_fast_exits)
                    logger.info(f"[DELAY] Waiting {delay} seconds before restart due to fast exit...")
                    time.sleep(delay)
                else:
                    consecutive_fast_exits = 0  # Reset counter
                    logger.info("[DELAY] Waiting 10 seconds before restart...")
                    time.sleep(10)

                # Log session statistics every 10 restarts
                if restart_count % 10 == 0:
                    total_duration = (datetime.now() - start_time).total_seconds()
                    logger.info(f"[STATS] Session: {restart_count} restarts in {total_duration/3600:.1f} hours")
            except Exception as e:
                logger.error(f"[ERROR] Unexpected error in restart loop: {e}")
                logger.error("Continuing restart loop after 30 second delay...")
                time.sleep(30)
    except (KeyboardInterrupt, SystemExit):
        logger.info("[SHUTDOWN] Restart loop interrupted by user")

    total_duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING SESSION COMPLETE")
    logger.info(f"Total restarts: {restart_count}")
    logger.info(f"Total session time: {total_duration/3600:.1f} hours")
    logger.info("=" * 60)
def main():
    """Entry point: configure logging, install signal handlers, run the loop.

    Logging is set up *before* the signal handlers are installed, because the
    handler logs through ``logger``; installing it first would raise
    ``NameError`` if a signal arrived during start-up.

    The handler raises ``KeyboardInterrupt`` instead of calling
    ``sys.exit``. ``sys.exit`` raises ``SystemExit``, which skips the
    ``except KeyboardInterrupt`` cleanup in the restart loop, so the child
    would not be stopped and no session summary would be logged.

    Returns:
        0 on a normal or user-requested shutdown, 1 on a fatal error.
    """
    # Setup logging
    global logger
    logger = setup_restart_logging()

    # Setup signal handlers for clean shutdown
    def signal_handler(signum, frame):
        logger.info(f"[SIGNAL] Received signal {signum}, shutting down...")
        # Reuse the Ctrl+C shutdown path for every handled signal.
        raise KeyboardInterrupt

    signal.signal(signal.SIGINT, signal_handler)
    if hasattr(signal, 'SIGTERM'):
        signal.signal(signal.SIGTERM, signal_handler)

    try:
        run_main_with_restart(logger)
    except KeyboardInterrupt:
        # The interrupt landed outside the restart loop's own handlers.
        logger.info("[SHUTDOWN] Restart script interrupted")
        return 0
    except Exception as e:
        logger.error(f"[FATAL] Fatal error in restart script: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return 1
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())