83 lines
3.0 KiB
Python
83 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Training Monitor Script
|
|
|
|
Quick script to check the status of realtime training and show key metrics.
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import glob
|
|
|
|
def _report_tensorboard_logs(runs_dir):
    """Print a summary of the three last ``rl_training_*`` entries in *runs_dir*.

    Does nothing when *runs_dir* does not exist.
    """
    if not runs_dir.exists():
        return

    log_dirs = list(runs_dir.glob("rl_training_*"))
    # "Last" is decided by name order, not by timestamp.
    recent_logs = sorted(log_dirs, key=lambda x: x.name)[-3:]  # Last 3 sessions

    print("\n📊 RECENT TENSORBOARD LOGS:")
    for log_dir in recent_logs:
        # NOTE: st_ctime is the creation time on Windows but the time of the
        # last metadata change on POSIX, so "Created" is approximate there.
        created = datetime.fromtimestamp(log_dir.stat().st_ctime)
        event_files = list(log_dir.glob("*.tfevents.*"))

        print(f" 📁 {log_dir.name}")
        print(f" Created: {created.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f" Event files: {len(event_files)}")

        if event_files:
            # The newest event file tells us when the session last wrote data.
            latest_mtime = max(f.stat().st_mtime for f in event_files)
            modified = datetime.fromtimestamp(latest_mtime)
            print(f" Last update: {modified.strftime('%Y-%m-%d %H:%M:%S')}")
        print()


def _report_python_processes():
    """Print how many Python processes are running and list the first five.

    Uses ``tasklist`` on Windows and ``ps`` elsewhere. Any failure to run the
    command is reported on stdout instead of being raised, so the rest of the
    status report is still printed.
    """
    import subprocess  # local import: only this check needs it

    if os.name == "nt":
        command = ["tasklist"]
        marker = "python.exe"
    else:
        # "pid=" / "comm=" request those columns without a header line.
        command = ["ps", "-e", "-o", "pid=", "-o", "comm="]
        marker = "python"

    print("🔍 PROCESS STATUS:")
    try:
        # No shell: the command is a fixed argument list. check=True turns a
        # failing command into an error instead of a misleading count of 0;
        # errors="replace" keeps odd console encodings from aborting the check.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",
            check=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f" Error checking processes: {e}")
        return

    python_processes = [
        line for line in result.stdout.splitlines() if marker in line
    ]
    print(f" Python processes running: {len(python_processes)}")
    for i, proc in enumerate(python_processes[:5], start=1):  # Show first 5
        print(f" {i}. {proc.strip()}")


def _report_saved_models(models_dir):
    """Print the count of ``realtime_agent_*.pt`` files and the three newest.

    Does nothing when *models_dir* does not exist.
    """
    if not models_dir.exists():
        return

    model_files = list(models_dir.glob("realtime_agent_*.pt"))
    print(f"\n💾 SAVED MODELS: {len(model_files)}")
    # Oldest-to-newest by modification time; keep the last three.
    for model_file in sorted(model_files, key=lambda x: x.stat().st_mtime)[-3:]:
        modified = datetime.fromtimestamp(model_file.stat().st_mtime)
        print(f" 📄 {model_file.name} - {modified.strftime('%Y-%m-%d %H:%M:%S')}")


def check_training_status(base_dir="."):
    """Check status of training processes and logs.

    Prints, in order: the most recent TensorBoard log directories, the running
    Python processes, the monitoring URLs, and the most recent saved models.

    Args:
        base_dir: Directory containing the ``runs`` and ``models/rl`` folders.
            Defaults to the current working directory.
    """
    root = Path(base_dir)

    print("=" * 60)
    print("REALTIME RL TRAINING STATUS CHECK")
    print("=" * 60)

    # Check TensorBoard logs
    _report_tensorboard_logs(root / "runs")

    # Check running processes
    _report_python_processes()

    # Check web services (the URLs are only printed, not probed)
    print("\n🌐 WEB SERVICES:")
    print(" TensorBoard: http://localhost:6006")
    print(" Web Dashboard: http://localhost:8051")

    # Check model saves
    _report_saved_models(root / "models" / "rl")

    print("\n" + "=" * 60)
    print("✅ MONITORING URLs:")
    print("📊 TensorBoard: http://localhost:6006")
    print("🌐 Dashboard: http://localhost:8051")
    print("=" * 60)
|
|
|
|
if __name__ == "__main__":
|
|
try:
|
|
check_training_status()
|
|
except KeyboardInterrupt:
|
|
print("\nMonitoring stopped.")
|
|
except Exception as e:
|
|
print(f"Error: {e}") |