160 lines
5.3 KiB
Python
160 lines
5.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Helper script to start monitoring services for RL training
|
|
"""
|
|
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
import requests
|
|
import os
|
|
import json
|
|
from pathlib import Path
|
|
|
|
# Candidate TensorBoard ports, 6006 through 6012 inclusive
TENSORBOARD_PORTS = list(range(6006, 6013))
|
|
|
|
def check_port(port, service_name):
    """Probe ``http://localhost:<port>`` and report whether anything answers.

    Args:
        port: Port on localhost to probe with an HTTP GET (3 second timeout).
        service_name: Label used only in the success message.

    Returns:
        True when the GET request completes without ``requests`` raising
        (the status code is not inspected), False when a
        ``requests.exceptions.RequestException`` is raised.
    """
    try:
        # Only reachability matters; the response itself is never inspected.
        requests.get(f"http://localhost:{port}", timeout=3)
    except requests.exceptions.RequestException:
        return False
    print(f"✅ {service_name} is running on port {port}")
    return True
|
|
|
|
def is_port_in_use(port):
    """Report whether ``port`` on localhost is already taken.

    A throwaway IPv4 TCP socket is bound to ``('localhost', port)``. If the
    bind raises ``OSError`` the port is reported as in use (True); if the
    bind succeeds the port is reported as free (False). The probe socket is
    always closed before returning.
    """
    import socket

    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(('localhost', port))
    except OSError:
        occupied = True
    else:
        occupied = False
    finally:
        probe.close()
    return occupied
|
|
|
|
def find_available_port(ports_list, service_name):
    """Return the first port in ``ports_list`` that is not in use.

    Ports are tried in list order and every occupied one is reported on
    stdout. ``service_name`` only appears in the success message. Returns
    ``None`` when no port in the list is free.
    """
    for port in ports_list:
        if is_port_in_use(port):
            print(f"⚠️ Port {port} is already in use")
            continue
        print(f"🔍 Found available port {port} for {service_name}")
        return port
    return None
|
|
|
|
def save_port_config(tensorboard_port, web_dashboard_port=8051):
    """Write the monitoring port configuration to ``monitoring_ports.json``.

    The file is created (or overwritten) in the current working directory
    as indented JSON with the keys ``tensorboard_port`` and
    ``web_dashboard_port``.

    Args:
        tensorboard_port: Port recorded for TensorBoard.
        web_dashboard_port: Port recorded for the web dashboard. Defaults
            to 8051, the value that used to be hard-coded here, so existing
            single-argument callers produce the same file as before.
    """
    config = {
        "tensorboard_port": tensorboard_port,
        "web_dashboard_port": web_dashboard_port,
    }
    # Explicit encoding keeps the file identical across platforms.
    with open("monitoring_ports.json", "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print("💾 Port configuration saved to monitoring_ports.json")
|
|
|
|
def start_tensorboard():
    """Start TensorBoard in the background on an available port.

    Steps:
      1. Probe every port in ``TENSORBOARD_PORTS`` over HTTP; the first one
         that answers is treated as an already-running TensorBoard and
         reused.
      2. Otherwise pick the first free port, make sure the ``runs``
         directory exists and launch ``python -m tensorboard`` as a child
         process (new console on Windows, output discarded elsewhere).
      3. Poll the port every 2 seconds, at most 15 times (30 seconds).

    Returns:
        The port TensorBoard answers on, or ``None`` when no port is free,
        the child process exits before answering, startup times out, or an
        unexpected exception occurs (it is printed rather than raised).
    """
    try:
        # First check if TensorBoard is already running on any of our ports
        for port in TENSORBOARD_PORTS:
            if check_port(port, "TensorBoard"):
                print(f"✅ TensorBoard already running on port {port}")
                save_port_config(port)
                return port

        # Find an available port
        port = find_available_port(TENSORBOARD_PORTS, "TensorBoard")
        if port is None:
            print(f"❌ No available ports found in range {TENSORBOARD_PORTS}")
            return None

        print(f"🚀 Starting TensorBoard on port {port}...")

        # Create runs directory if it doesn't exist
        Path("runs").mkdir(exist_ok=True)

        # The command line is the same on every platform; only the way the
        # child process is attached differs.
        command = [
            sys.executable, "-m", "tensorboard",
            "--logdir=runs", f"--port={port}", "--reload_interval=1",
        ]
        if os.name == 'nt':  # Windows
            popen_kwargs = {"creationflags": subprocess.CREATE_NEW_CONSOLE}
        else:  # Linux/Mac
            popen_kwargs = {
                "stdout": subprocess.DEVNULL,
                "stderr": subprocess.DEVNULL,
            }
        process = subprocess.Popen(command, **popen_kwargs)

        # Wait for TensorBoard to start
        print(f"⏳ Waiting for TensorBoard to start on port {port}...")
        for _ in range(15):
            time.sleep(2)
            if check_port(port, "TensorBoard"):
                save_port_config(port)
                return port
            # A child that has already exited will never answer, so stop
            # waiting instead of sitting out the full 30 seconds.
            if process.poll() is not None:
                print(f"❌ TensorBoard process exited early with code {process.returncode}")
                return None

        print(f"⚠️ TensorBoard failed to start on port {port} within 30 seconds")
        return None

    except Exception as e:
        print(f"❌ Error starting TensorBoard: {e}")
        return None
|
|
|
|
def check_web_dashboard_port():
    """Choose a port for the web dashboard, preferring 8051.

    Returns 8051 when it is free. Otherwise the first free port among
    8052, 8053, 8054 and 8055 is returned. If none of those is free either,
    8051 is returned even though it was found to be in use.
    """
    port = 8051
    if not is_port_in_use(port):
        print(f"✅ Web dashboard port {port} is available")
        return port

    print(f"⚠️ Web dashboard port {port} is in use")
    # Try alternative ports; the generator stops probing at the first free one
    free_alternatives = (
        candidate
        for candidate in [8052, 8053, 8054, 8055]
        if not is_port_in_use(candidate)
    )
    alt_port = next(free_alternatives, None)
    if alt_port is not None:
        print(f"🔍 Alternative port {alt_port} available for web dashboard")
        return alt_port

    print("❌ No alternative ports found for web dashboard")
    return port
|
|
|
|
def main():
    """Set up monitoring for RL training and print a status summary.

    Picks a web dashboard port, starts (or reuses) TensorBoard, then prints
    the resulting status, the suggested training command and the available
    URLs. The ``--web-port`` option is suggested whenever the dashboard
    port differs from the default 8051, whether or not TensorBoard started.
    """
    banner = "=" * 60

    print(banner)
    print("🎯 RL TRAINING MONITORING SETUP")
    print(banner)

    # Check web dashboard port
    web_port = check_web_dashboard_port()

    # Start TensorBoard
    tensorboard_port = start_tensorboard()

    print("\n" + banner)
    print("📊 MONITORING STATUS")
    print(banner)

    if tensorboard_port:
        print(f"✅ TensorBoard: http://localhost:{tensorboard_port}")
        # Update port config
        save_port_config(tensorboard_port)
    else:
        print("❌ TensorBoard: Failed to start")
        print(" Manual start: python -m tensorboard --logdir=runs --port=6007")

    if web_port:
        print(f"✅ Web Dashboard: Ready on port {web_port}")

    print("\n🎯 Ready to start RL training!")
    # The web-port hint depends only on the dashboard port: a TensorBoard
    # failure must not hide the fact that the default port is unavailable.
    if web_port and web_port != 8051:
        print(f"Run: python train_realtime_with_tensorboard.py --episodes 10 --web-port {web_port}")
    else:
        print("Run: python train_realtime_with_tensorboard.py --episodes 10")

    print("\n📋 Available URLs:")
    if tensorboard_port:
        print(f" 📊 TensorBoard: http://localhost:{tensorboard_port}")
    if web_port:
        print(f" 🌐 Web Dashboard: http://localhost:{web_port} (starts with training)")
|
|
# Run the monitoring setup only when executed as a script, not on import.
if __name__ == "__main__":
    main()