wip
This commit is contained in:
160
start_monitoring.py
Normal file
160
start_monitoring.py
Normal file
@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Helper script to start monitoring services for RL training
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import requests
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
# Available ports to try for TensorBoard
|
||||
TENSORBOARD_PORTS = [6006, 6007, 6008, 6009, 6010, 6011, 6012]  # probed in list order by start_tensorboard()
|
||||
|
||||
def check_port(port, service_name):
    """Report whether an HTTP service answers on ``localhost:port``.

    Args:
        port: TCP port to probe on localhost.
        service_name: Human-readable name used in the success message.

    Returns:
        True if the GET request completed (whatever the HTTP status),
        False if ``requests`` raised a ``RequestException`` such as a
        refused connection or a timeout.
    """
    url = f"http://localhost:{port}"
    try:
        # Only reachability matters; the response itself is not inspected.
        requests.get(url, timeout=3)
    except requests.exceptions.RequestException:
        return False
    else:
        print(f"✅ {service_name} is running on port {port}")
        return True
|
||||
|
||||
def is_port_in_use(port):
    """Return True when ``port`` cannot be bound on localhost.

    A throwaway TCP socket is bound to ``('localhost', port)``; any
    ``OSError`` from the bind is treated as "port in use". The socket is
    always closed before returning.

    Args:
        port: TCP port number to test.

    Returns:
        False if the bind succeeded, True if it raised ``OSError``.
    """
    import socket

    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(('localhost', port))
    except OSError:
        return True
    else:
        return False
    finally:
        probe.close()
|
||||
|
||||
def find_available_port(ports_list, service_name):
    """Return the first port in ``ports_list`` that is not in use.

    Ports are tested in order with ``is_port_in_use``; each busy port is
    reported on stdout before moving on to the next one.

    Args:
        ports_list: Iterable of candidate port numbers.
        service_name: Name shown in the "found" message.

    Returns:
        The first free port, or None if every candidate is busy (or the
        iterable is empty).
    """
    for candidate in ports_list:
        if is_port_in_use(candidate):
            print(f"⚠️ Port {candidate} is already in use")
            continue
        print(f"🔍 Found available port {candidate} for {service_name}")
        return candidate
    return None
|
||||
|
||||
def save_port_config(tensorboard_port, web_dashboard_port=8051):
    """Write the monitoring port configuration to ``monitoring_ports.json``.

    The file is created (or overwritten) in the current working directory
    and contains the keys ``tensorboard_port`` and ``web_dashboard_port``.

    Args:
        tensorboard_port: Port TensorBoard is serving on.
        web_dashboard_port: Port the web dashboard should use. Defaults to
            8051, the value that was previously hard-coded, so existing
            one-argument callers keep producing the same file.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    config = {
        "tensorboard_port": tensorboard_port,
        "web_dashboard_port": web_dashboard_port,
    }
    with open("monitoring_ports.json", "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2)
    print("💾 Port configuration saved to monitoring_ports.json")
|
||||
|
||||
def start_tensorboard():
    """Start TensorBoard in the background on the first usable port.

    Steps:
      1. If something already answers HTTP on one of ``TENSORBOARD_PORTS``,
         reuse it.
      2. Otherwise pick the first free port from that list, make sure the
         ``runs`` log directory exists, and launch ``python -m tensorboard``
         as a child process.
      3. Poll the port every 2 seconds (15 attempts). Polling stops early
         if the child process has already exited, e.g. because tensorboard
         is not installed.

    On success the chosen port is persisted via ``save_port_config``.

    Returns:
        The port TensorBoard is reachable on, or None if no port was free,
        the process exited early, startup timed out, or an error occurred.
    """
    try:
        # Reuse an instance that is already answering on one of our ports.
        for port in TENSORBOARD_PORTS:
            if check_port(port, "TensorBoard"):
                print(f"✅ TensorBoard already running on port {port}")
                save_port_config(port)
                return port

        port = find_available_port(TENSORBOARD_PORTS, "TensorBoard")
        if port is None:
            print(f"❌ No available ports found in range {TENSORBOARD_PORTS}")
            return None

        print(f"🚀 Starting TensorBoard on port {port}...")

        # TensorBoard reads event files from ./runs; create it up front.
        Path("runs").mkdir(exist_ok=True)

        command = [
            sys.executable, "-m", "tensorboard",
            "--logdir=runs", f"--port={port}", "--reload_interval=1",
        ]
        if os.name == 'nt':  # Windows: give TensorBoard its own console
            popen_kwargs = {"creationflags": subprocess.CREATE_NEW_CONSOLE}
        else:  # Linux/Mac: run silently in the background
            popen_kwargs = {
                "stdout": subprocess.DEVNULL,
                "stderr": subprocess.DEVNULL,
            }
        process = subprocess.Popen(command, **popen_kwargs)

        print(f"⏳ Waiting for TensorBoard to start on port {port}...")
        for _ in range(15):
            time.sleep(2)
            if check_port(port, "TensorBoard"):
                save_port_config(port)
                return port
            # Stop waiting as soon as the child has died instead of
            # burning the remaining attempts on a server that cannot appear.
            exit_code = process.poll()
            if exit_code is not None:
                print(f"❌ TensorBoard process exited early with code {exit_code}")
                return None

        print(f"⚠️ TensorBoard failed to start on port {port} within 30 seconds")
        return None

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and let the
        # caller continue without TensorBoard.
        print(f"❌ Error starting TensorBoard: {e}")
        return None
|
||||
|
||||
def check_web_dashboard_port():
    """Choose a port for the web dashboard.

    The default port 8051 is preferred. If it is busy, ports 8052-8055 are
    tried in order and the first free one is returned.

    Returns:
        8051 when it is free, otherwise the first free alternative. If no
        alternative is free either, 8051 is still returned (after printing
        a warning), so the result is always an int.
    """
    port = 8051
    if not is_port_in_use(port):
        print(f"✅ Web dashboard port {port} is available")
        return port

    print(f"⚠️ Web dashboard port {port} is in use")
    # Lazy scan: stops probing at the first free alternative.
    fallback = next(
        (alt for alt in (8052, 8053, 8054, 8055) if not is_port_in_use(alt)),
        None,
    )
    if fallback is not None:
        print(f"🔍 Alternative port {fallback} available for web dashboard")
        return fallback

    print("❌ No alternative ports found for web dashboard")
    return port
|
||||
|
||||
def main():
    """Set up monitoring for RL training and print a status summary.

    Picks a web dashboard port, starts (or reuses) TensorBoard, then prints
    the resulting URLs and the training command to run. The ``--web-port``
    flag is suggested whenever the dashboard port differs from the default
    8051, whether or not TensorBoard started, because the training script
    needs the non-default port in either case.
    """
    banner = "=" * 60
    print(banner)
    print("🎯 RL TRAINING MONITORING SETUP")
    print(banner)

    # Check web dashboard port
    web_port = check_web_dashboard_port()

    # Start TensorBoard
    tensorboard_port = start_tensorboard()

    print("\n" + banner)
    print("📊 MONITORING STATUS")
    print(banner)

    if tensorboard_port:
        print(f"✅ TensorBoard: http://localhost:{tensorboard_port}")
        # Update port config
        save_port_config(tensorboard_port)
    else:
        print("❌ TensorBoard: Failed to start")
        print(" Manual start: python -m tensorboard --logdir=runs --port=6007")

    if web_port:
        print(f"✅ Web Dashboard: Ready on port {web_port}")

    print("\n🎯 Ready to start RL training!")
    # The hint depends only on the dashboard port; previously it was also
    # gated on TensorBoard having started, which hid a required flag.
    if web_port != 8051:
        print(f"Run: python train_realtime_with_tensorboard.py --episodes 10 --web-port {web_port}")
    else:
        print("Run: python train_realtime_with_tensorboard.py --episodes 10")

    print("\n📋 Available URLs:")
    if tensorboard_port:
        print(f" 📊 TensorBoard: http://localhost:{tensorboard_port}")
    if web_port:
        print(f" 🌐 Web Dashboard: http://localhost:{web_port} (starts with training)")
|
||||
|
||||
# Run the monitoring setup only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
Reference in New Issue
Block a user