misc
This commit is contained in:
83
monitor_training.py
Normal file
83
monitor_training.py
Normal file
@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Training Monitor Script
|
||||
|
||||
Quick script to check the status of realtime training and show key metrics.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import glob
|
||||
|
||||
def check_training_status():
    """Print a one-shot status report for the realtime RL training run.

    The report is written to stdout and covers, in order:

    1. Up to three ``runs/rl_training_*`` TensorBoard log directories (the
       last three when sorted by name), each with its event-file count and
       the modification time of its newest event file.
    2. The Python processes currently running on this machine.
    3. The TensorBoard and web-dashboard URLs (listed, not probed).
    4. Up to three ``models/rl/realtime_agent_*.pt`` files, most recently
       modified last.

    All paths are relative to the current working directory. A section
    whose directory does not exist is skipped without any output.

    Returns:
        None.
    """
    print("=" * 60)
    print("REALTIME RL TRAINING STATUS CHECK")
    print("=" * 60)

    _report_tensorboard_logs(Path("runs"))
    _report_python_processes()

    # The URLs are only printed; nothing here checks that the services are up.
    print("\n🌐 WEB SERVICES:")
    print(" TensorBoard: http://localhost:6006")
    print(" Web Dashboard: http://localhost:8051")

    _report_saved_models(Path("models/rl"))

    print("\n" + "=" * 60)
    print("✅ MONITORING URLs:")
    print("📊 TensorBoard: http://localhost:6006")
    print("🌐 Dashboard: http://localhost:8051")
    print("=" * 60)


def _creation_timestamp(stat_result):
    """Return the best available creation timestamp from an ``os.stat_result``."""
    # st_ctime is the metadata-change time on Unix, not the creation time, so
    # prefer st_birthtime on platforms that expose it and fall back otherwise.
    try:
        return stat_result.st_birthtime
    except AttributeError:
        return stat_result.st_ctime


def _report_tensorboard_logs(runs_dir):
    """Print the last three ``rl_training_*`` log directories under *runs_dir*."""
    if not runs_dir.exists():
        return

    # Only directories can hold event files; a stray file that matches the
    # pattern must not take one of the three slots.
    log_dirs = [entry for entry in runs_dir.glob("rl_training_*") if entry.is_dir()]
    recent_logs = sorted(log_dirs, key=lambda entry: entry.name)[-3:]

    print("\n📊 RECENT TENSORBOARD LOGS:")
    for log_dir in recent_logs:
        created = datetime.fromtimestamp(_creation_timestamp(log_dir.stat()))
        event_files = list(log_dir.glob("*.tfevents.*"))

        print(f" 📁 {log_dir.name}")
        print(f" Created: {created.strftime('%Y-%m-%d %H:%M:%S')}")
        print(f" Event files: {len(event_files)}")

        if event_files:
            # Each file is stat'ed once; only the newest mtime is needed.
            newest_mtime = max(event.stat().st_mtime for event in event_files)
            modified = datetime.fromtimestamp(newest_mtime)
            print(f" Last update: {modified.strftime('%Y-%m-%d %H:%M:%S')}")
        print()


def _report_python_processes():
    """Print how many Python processes are running and list the first five."""
    import subprocess  # kept local, as in the original, so file-level imports stay as-is

    print("🔍 PROCESS STATUS:")

    on_windows = os.name == "nt"
    if on_windows:
        command = ["tasklist"]
    else:
        # POSIX ps: one -o per column; the trailing "=" suppresses the header.
        command = ["ps", "-e", "-o", "pid=", "-o", "comm="]

    try:
        # No shell is needed for a fixed argument list. check=True turns a
        # failed command into a reported error instead of a silent count of 0.
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",
            check=True,
        )
    except (OSError, subprocess.SubprocessError) as e:
        print(f" Error checking processes: {e}")
        return

    lines = result.stdout.splitlines()
    if on_windows:
        python_processes = [line for line in lines if "python.exe" in line]
    else:
        # Case-insensitive so an executable named "Python" is also counted.
        python_processes = [line for line in lines if "python" in line.lower()]

    print(f" Python processes running: {len(python_processes)}")
    for position, proc in enumerate(python_processes[:5], start=1):  # Show first 5
        print(f" {position}. {proc.strip()}")


def _report_saved_models(models_dir):
    """Print the count of saved models in *models_dir* and the newest three."""
    if not models_dir.exists():
        return

    model_files = list(models_dir.glob("realtime_agent_*.pt"))
    print(f"\n💾 SAVED MODELS: {len(model_files)}")

    # Stat each file once and reuse the mtime for both sorting and display.
    mtimes = {model_file: model_file.stat().st_mtime for model_file in model_files}
    for model_file in sorted(model_files, key=mtimes.get)[-3:]:
        modified = datetime.fromtimestamp(mtimes[model_file])
        print(f" 📄 {model_file.name} - {modified.strftime('%Y-%m-%d %H:%M:%S')}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: produce the status report once, then exit.
    try:
        check_training_status()
    except KeyboardInterrupt:
        # Ctrl+C ends the run with a short notice rather than a traceback.
        print("\nMonitoring stopped.")
    except Exception as err:
        # Any other failure is reduced to a single-line message on stdout.
        print(f"Error: {err}")
|
Reference in New Issue
Block a user