188 lines
6.9 KiB
Python
188 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Overnight Training Restart Script

Keeps main.py running continuously, restarting it whenever it exits
(crash or clean exit). Gives up after 5 consecutive runs that each last
less than 30 seconds, since that usually means a startup problem.
Designed for overnight training sessions with unstable code.

Usage:
    python restart_main_overnight.py

Press Ctrl+C to stop the restart loop.
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import logging
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
import signal
|
|
import os
|
|
|
|
# Setup logging for the restart script
|
|
def setup_restart_logging():
    """Configure logging for the restart script and return its logger.

    Ensures a ``logs`` directory exists under the current working directory,
    opens a new ``restart_main_<YYYYmmdd_HHMMSS>.log`` file inside it, and asks
    ``logging.basicConfig`` to send INFO-and-above records to both that file
    and standard output.

    Returns:
        logging.Logger: The logger named after this module.
    """
    logs_path = Path("logs")
    logs_path.mkdir(exist_ok=True)

    # One log file per launch, named after the launch time.
    stamp = f"{datetime.now():%Y%m%d_%H%M%S}"
    target = logs_path / ("restart_main_" + stamp + ".log")

    file_handler = logging.FileHandler(target, encoding='utf-8')
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
        level=logging.INFO,
    )

    restart_logger = logging.getLogger(__name__)
    restart_logger.info(f"Restart script logging to: {target}")
    return restart_logger
|
|
|
|
def kill_existing_processes(logger):
    """Kill leftover ``main.py`` interpreters so a new run does not conflict.

    Only acts on Windows (``os.name == 'nt'``); on every other platform this
    function returns immediately without doing anything.

    On Windows it asks PowerShell to force-stop each ``python.exe`` /
    ``pythonw.exe`` process whose command line mentions ``main.py``. The
    current process is excluded explicitly: killing by image name alone would
    terminate this restart script itself, along with every unrelated Python
    program on the machine.

    The operation is best-effort: any failure (PowerShell missing, timeout,
    ...) is reported through ``logger.warning`` and otherwise ignored.

    Args:
        logger: Logger-like object; only its ``warning`` method is used.
    """
    if os.name != 'nt':
        return

    own_pid = os.getpid()
    # The script deliberately contains no double quotes so that it survives
    # Windows command-line quoting unchanged. ``\b`` anchors keep the pattern
    # from matching names such as ``domain.py`` or ``restart_main.py``.
    ps_script = (
        "Get-CimInstance Win32_Process | "
        "Where-Object { "
        "($_.Name -eq 'python.exe' -or $_.Name -eq 'pythonw.exe') "
        f"-and $_.ProcessId -ne {own_pid} "
        r"-and $_.CommandLine -match '\bmain\.py\b' "
        "} | "
        "ForEach-Object { Stop-Process -Id $_.ProcessId -Force "
        "-ErrorAction SilentlyContinue }"
    )

    try:
        subprocess.run(
            ['powershell', '-NoProfile', '-NonInteractive',
             '-Command', ps_script],
            capture_output=True, check=False,
            timeout=60,  # never let a hung helper block the supervisor
        )
        # Give the operating system a moment to tear the processes down.
        time.sleep(2)
    except Exception as e:
        logger.warning(f"Could not kill existing processes: {e}")
|
|
|
|
def _stream_child_output(process):
    """Echo the child's combined stdout/stderr until it has exited.

    Reads the pipe line by line until end-of-file, prefixing each line with
    ``[MAIN]``, then waits for the process so ``process.returncode`` is set
    when this function returns.
    """
    if process.stdout:
        # Iterating stops at EOF, so a child that closes stdout while still
        # running cannot turn this into a busy loop.
        for line in process.stdout:
            # Forward output from main.py (remove extra newlines)
            print(f"[MAIN] {line.rstrip()}")
    process.wait()


def _stop_child(process, logger, timeout=10):
    """Ensure *process* is no longer running, escalating to a hard kill.

    Does nothing if the process has already exited. Otherwise sends a
    terminate request, waits up to *timeout* seconds, then kills and reaps it.
    """
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.warning("[FORCE KILL] Process didn't terminate, force killing...")
        process.kill()
        process.wait()  # reap the killed child


def run_main_with_restart(logger):
    """Run ``main.py`` in a loop, restarting it every time it exits.

    Each iteration launches ``main.py`` with the current interpreter, forwards
    its output to this script's stdout, and then waits before relaunching:

    * a run shorter than 30 seconds counts as a "fast exit"; the delay grows
      by 10 seconds per consecutive fast exit (capped at 60), and after 5 in a
      row the loop aborts;
    * any longer run resets the fast-exit counter and waits 10 seconds.

    The loop also ends on ``KeyboardInterrupt``. Whatever ends an iteration
    (normal exit, interrupt, or any other exception), the child process is
    stopped and its pipe closed before control leaves that iteration, so a
    retry never runs alongside a stale ``main.py``.

    Args:
        logger: Logger used for all status messages.

    Returns:
        None. A session summary is logged when the loop ends.
    """
    restart_count = 0  # number of launches so far, including the first one
    consecutive_fast_exits = 0
    start_time = datetime.now()

    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING RESTART SCRIPT STARTED")
    logger.info("=" * 60)
    logger.info("Press Ctrl+C to stop the restart loop")
    logger.info("Main script: main.py")
    logger.info("Restart delay on crash: 10 seconds")
    logger.info("Fast exit protection: Enabled")
    logger.info("=" * 60)

    # Kill any existing processes
    kill_existing_processes(logger)

    while True:
        try:
            restart_count += 1
            run_start_time = datetime.now()

            logger.info(f"[RESTART #{restart_count}] Starting main.py at {run_start_time.strftime('%H:%M:%S')}")

            # Start main.py as subprocess. errors="replace" keeps undecodable
            # bytes in the child's output from raising while we stream it.
            process = subprocess.Popen(
                [sys.executable, "main.py"],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                universal_newlines=True, errors="replace", bufsize=1)

            logger.info(f"[PROCESS] main.py started with PID: {process.pid}")

            try:
                # Stream output from main.py until it exits
                _stream_child_output(process)
            except KeyboardInterrupt:
                logger.info("[INTERRUPT] Ctrl+C received, stopping main.py...")
                raise
            finally:
                # Runs on every exit path; a no-op stop when the child has
                # already finished on its own.
                _stop_child(process, logger)
                if process.stdout:
                    process.stdout.close()

            # Process has exited
            exit_code = process.returncode
            run_end_time = datetime.now()
            run_duration = (run_end_time - run_start_time).total_seconds()

            logger.info(f"[EXIT] main.py exited with code {exit_code}")
            logger.info(f"[DURATION] Process ran for {run_duration:.1f} seconds")

            # Check for fast exits (potential configuration issues)
            if run_duration < 30:  # Less than 30 seconds
                consecutive_fast_exits += 1
                logger.warning(f"[FAST EXIT] Process exited quickly ({consecutive_fast_exits} consecutive)")

                if consecutive_fast_exits >= 5:
                    logger.error("[ABORT] Too many consecutive fast exits (5+)")
                    logger.error("This indicates a configuration or startup problem")
                    logger.error("Please check the main.py script manually")
                    break

                # Longer delay for fast exits
                delay = min(60, 10 * consecutive_fast_exits)
                logger.info(f"[DELAY] Waiting {delay} seconds before restart due to fast exit...")
                time.sleep(delay)
            else:
                consecutive_fast_exits = 0  # Reset counter
                logger.info("[DELAY] Waiting 10 seconds before restart...")
                time.sleep(10)

            # Log session statistics every 10 restarts
            if restart_count % 10 == 0:
                total_duration = (datetime.now() - start_time).total_seconds()
                logger.info(f"[STATS] Session: {restart_count} restarts in {total_duration/3600:.1f} hours")

        except KeyboardInterrupt:
            logger.info("[SHUTDOWN] Restart loop interrupted by user")
            break
        except Exception as e:
            logger.error(f"[ERROR] Unexpected error in restart loop: {e}")
            logger.error("Continuing restart loop after 30 second delay...")
            time.sleep(30)

    total_duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING SESSION COMPLETE")
    logger.info(f"Total restarts: {restart_count}")
    logger.info(f"Total session time: {total_duration/3600:.1f} hours")
    logger.info("=" * 60)
|
|
|
|
def main():
    """Script entry point: set up logging and signals, then run the loop.

    Logging is configured before the signal handlers are installed because
    the handler writes to the module-level ``logger``.

    SIGINT (and SIGTERM where available) are turned into
    ``KeyboardInterrupt`` rather than an immediate ``sys.exit``. The restart
    loop handles ``KeyboardInterrupt`` by stopping ``main.py`` and logging the
    session summary; exiting straight from the handler would skip both.

    Returns:
        int: 0 when the loop ends normally or is interrupted by a signal,
        1 when an unexpected exception escapes the restart loop.
    """
    # Setup logging
    global logger
    logger = setup_restart_logging()

    # Setup signal handlers for clean shutdown
    def signal_handler(signum, frame):
        logger.info(f"[SIGNAL] Received signal {signum}, shutting down...")
        # Route the shutdown through the loop's KeyboardInterrupt handling.
        raise KeyboardInterrupt

    signal.signal(signal.SIGINT, signal_handler)
    if hasattr(signal, 'SIGTERM'):
        signal.signal(signal.SIGTERM, signal_handler)

    try:
        run_main_with_restart(logger)
    except KeyboardInterrupt:
        # The signal arrived outside the loop's own handlers (for example
        # during start-up); still a requested shutdown, so exit status is 0.
        return 0
    except Exception as e:
        logger.error(f"[FATAL] Fatal error in restart script: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return 1

    return 0
|
|
|
|
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())