cleanup and reorganization
This commit is contained in:
188
scripts/restart_main_overnight.py
Normal file
188
scripts/restart_main_overnight.py
Normal file
@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Overnight Training Restart Script
|
||||
Keeps main.py running continuously, restarting it if it crashes.
|
||||
Designed for overnight training sessions with unstable code.
|
||||
|
||||
Usage:
|
||||
python restart_main_overnight.py
|
||||
|
||||
Press Ctrl+C to stop the restart loop.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
import signal
|
||||
import os
|
||||
|
||||
# Setup logging for the restart script
|
||||
def setup_restart_logging():
    """Configure logging for the restart wrapper and return its logger.

    Makes sure a ``logs`` directory exists relative to the current working
    directory, then calls ``logging.basicConfig`` so that INFO-and-above
    records go both to a new, timestamp-named ``restart_main_*.log`` file
    (UTF-8 encoded) and to stdout.

    Returns:
        The logger named after this module. Its first record announces the
        path of the log file that was just created.
    """
    logs_folder = Path("logs")
    logs_folder.mkdir(exist_ok=True)

    # One log file per invocation, named after the wall-clock start time.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = logs_folder / f"restart_main_{timestamp}.log"

    file_handler = logging.FileHandler(log_file, encoding='utf-8')
    console_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[file_handler, console_handler],
    )

    restart_logger = logging.getLogger(__name__)
    restart_logger.info(f"Restart script logging to: {log_file}")
    return restart_logger
|
||||
|
||||
def kill_existing_processes(logger):
    """Kill stale main.py interpreters left over from an earlier session.

    Only Windows is handled; on any other platform this function returns
    immediately without doing anything.

    Candidates are selected by image name (``python.exe`` / ``pythonw.exe``)
    AND by a command line that references ``main.py``. This process and its
    parent are always excluded. A blanket ``taskkill /f /im python.exe``
    would also terminate this restart script itself and every unrelated
    Python program running on the machine.

    The cleanup is best-effort: any exception raised while running the
    command is logged as a warning and swallowed.

    Args:
        logger: Logger used to report a failure to run the cleanup command.
    """
    if os.name != 'nt':
        return

    # Never kill ourselves or the process that launched us.
    protected_pids = sorted({os.getpid(), os.getppid()})
    not_protected = " -and ".join(
        f"$_.ProcessId -ne {pid}" for pid in protected_pids
    )

    # The script deliberately contains no double quotes so it survives
    # Windows command-line quoting unchanged. -match is case-insensitive and
    # evaluates to False for processes whose CommandLine is not readable.
    ps_script = (
        "Get-CimInstance Win32_Process | Where-Object { "
        "($_.Name -eq 'python.exe' -or $_.Name -eq 'pythonw.exe') -and "
        r"$_.CommandLine -match '(^|[\\/\s])main\.py\b' -and "
        f"{not_protected} "
        "} | ForEach-Object { "
        "Stop-Process -Id $_.ProcessId -Force -ErrorAction SilentlyContinue }"
    )

    try:
        subprocess.run(
            ['powershell', '-NoProfile', '-NonInteractive', '-Command', ps_script],
            capture_output=True,
            check=False,
        )
        # Give the OS a moment to release files/GPU handles held by the
        # processes that were just terminated.
        time.sleep(2)
    except Exception as e:
        logger.warning(f"Could not kill existing processes: {e}")
|
||||
|
||||
def _stop_process(process, logger, timeout=10):
    """Make sure *process* is no longer running.

    Sends ``terminate()``, waits up to *timeout* seconds, then falls back to
    ``kill()`` and reaps the process. Does nothing if it has already exited.
    """
    if process.poll() is not None:
        return
    process.terminate()
    try:
        process.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        logger.warning("[FORCE KILL] Process didn't terminate, force killing...")
        process.kill()
        process.wait()


def _run_main_once(logger):
    """Launch main.py once, relay its output, and return its exit code.

    Whatever way this function is left (normal exit, Ctrl+C, SystemExit or an
    unexpected error), the child is stopped and its pipe closed first, so the
    caller can never end up with two copies of main.py running.
    """
    process = subprocess.Popen(
        [sys.executable, "main.py"],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        # Undecodable bytes in the child's output must not abort the relay.
        errors="replace",
        bufsize=1,
    )
    logger.info(f"[PROCESS] main.py started with PID: {process.pid}")

    try:
        # Iterating the pipe blocks per line and ends at EOF, so there is no
        # busy-wait if the child closes stdout before it exits.
        for line in process.stdout:
            # Forward output from main.py (remove extra newlines)
            print(f"[MAIN] {line.rstrip()}")
        return process.wait()
    except KeyboardInterrupt:
        logger.info("[INTERRUPT] Ctrl+C received, stopping main.py...")
        raise
    finally:
        try:
            _stop_process(process, logger)
        finally:
            process.stdout.close()


def run_main_with_restart(logger):
    """Run main.py in a loop, restarting it every time it exits.

    Behavior per iteration:
      * main.py is started with the current interpreter and its combined
        stdout/stderr is echoed to this process's stdout with a ``[MAIN]``
        prefix.
      * A run shorter than 30 seconds counts as a "fast exit". The delay
        before the next start grows by 10 seconds per consecutive fast exit
        (capped at 60), and after 5 consecutive fast exits the loop aborts.
      * Any longer run resets the fast-exit counter and is followed by a
        fixed 10 second delay.
      * Every 10th start, cumulative session statistics are logged.

    A KeyboardInterrupt ends the loop after main.py has been stopped. Any
    other exception is logged and the loop continues after 30 seconds. A
    session summary is logged before returning.

    Args:
        logger: Logger that receives all restart-loop events.
    """
    restart_count = 0
    consecutive_fast_exits = 0
    start_time = datetime.now()

    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING RESTART SCRIPT STARTED")
    logger.info("=" * 60)
    logger.info("Press Ctrl+C to stop the restart loop")
    logger.info("Main script: main.py")
    logger.info("Restart delay on crash: 10 seconds")
    logger.info("Fast exit protection: Enabled")
    logger.info("=" * 60)

    # Kill any existing processes
    kill_existing_processes(logger)

    while True:
        try:
            restart_count += 1
            run_start_time = datetime.now()

            logger.info(f"[RESTART #{restart_count}] Starting main.py at {run_start_time.strftime('%H:%M:%S')}")

            exit_code = _run_main_once(logger)

            # Process has exited
            run_duration = (datetime.now() - run_start_time).total_seconds()

            logger.info(f"[EXIT] main.py exited with code {exit_code}")
            logger.info(f"[DURATION] Process ran for {run_duration:.1f} seconds")

            # Check for fast exits (potential configuration issues)
            if run_duration < 30:  # Less than 30 seconds
                consecutive_fast_exits += 1
                logger.warning(f"[FAST EXIT] Process exited quickly ({consecutive_fast_exits} consecutive)")

                if consecutive_fast_exits >= 5:
                    logger.error("[ABORT] Too many consecutive fast exits (5+)")
                    logger.error("This indicates a configuration or startup problem")
                    logger.error("Please check the main.py script manually")
                    break

                # Longer delay for fast exits
                delay = min(60, 10 * consecutive_fast_exits)
                logger.info(f"[DELAY] Waiting {delay} seconds before restart due to fast exit...")
                time.sleep(delay)
            else:
                consecutive_fast_exits = 0  # Reset counter
                logger.info("[DELAY] Waiting 10 seconds before restart...")
                time.sleep(10)

            # Log session statistics every 10 restarts
            if restart_count % 10 == 0:
                total_duration = (datetime.now() - start_time).total_seconds()
                logger.info(f"[STATS] Session: {restart_count} restarts in {total_duration/3600:.1f} hours")

        except KeyboardInterrupt:
            logger.info("[SHUTDOWN] Restart loop interrupted by user")
            break
        except Exception as e:
            # _run_main_once has already stopped the child, so starting a
            # new one after the delay cannot leave a duplicate running.
            logger.error(f"[ERROR] Unexpected error in restart loop: {e}")
            logger.error("Continuing restart loop after 30 second delay...")
            time.sleep(30)

    total_duration = (datetime.now() - start_time).total_seconds()
    logger.info("=" * 60)
    logger.info("OVERNIGHT TRAINING SESSION COMPLETE")
    logger.info(f"Total restarts: {restart_count}")
    logger.info(f"Total session time: {total_duration/3600:.1f} hours")
    logger.info("=" * 60)
|
||||
|
||||
def main():
    """Entry point: configure logging, install signal handlers, run the loop.

    Logging is set up before the signal handlers are installed, because the
    handler writes to the module-level ``logger``.

    SIGINT and (where available) SIGTERM are turned into KeyboardInterrupt
    instead of calling ``sys.exit``. KeyboardInterrupt is the exception the
    restart loop catches to stop main.py and log its session summary; a
    SystemExit raised from the handler would skip that shutdown path.

    Returns:
        0 after a normal or user-requested shutdown, 1 if an unexpected
        exception escaped the restart loop.
    """
    # Setup logging
    global logger
    logger = setup_restart_logging()

    # Setup signal handlers for clean shutdown
    def signal_handler(signum, frame):
        logger.info(f"[SIGNAL] Received signal {signum}, shutting down...")
        raise KeyboardInterrupt

    signal.signal(signal.SIGINT, signal_handler)
    if hasattr(signal, 'SIGTERM'):
        signal.signal(signal.SIGTERM, signal_handler)

    try:
        run_main_with_restart(logger)
    except KeyboardInterrupt:
        # The signal arrived outside the part of the loop that handles it
        # (for example during startup or during an error back-off sleep).
        logger.info("[SHUTDOWN] Restart script interrupted by user")
        return 0
    except Exception as e:
        logger.error(f"[FATAL] Fatal error in restart script: {e}")
        import traceback
        logger.error(traceback.format_exc())
        return 1

    return 0
|
||||
|
||||
# When executed as a script, run the restart loop and use main()'s return
# value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
Reference in New Issue
Block a user