gogo2/monitor_training.py
Dobromir Popov 0fe8286787 misc
2025-05-24 09:58:36 +03:00

83 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""
Training Monitor Script
Quick script to check the status of realtime training and show key metrics.
"""
import os
import time
from pathlib import Path
from datetime import datetime
import glob
def _creation_time(path):
    """Return the best available creation time of *path* as a datetime.

    ``st_birthtime`` is preferred when the platform's stat result exposes it.
    Otherwise this falls back to ``st_ctime``, which on POSIX systems is the
    time of the last metadata change rather than a true creation time.
    """
    info = path.stat()
    birth = getattr(info, "st_birthtime", None)
    return datetime.fromtimestamp(info.st_ctime if birth is None else birth)


def _list_python_processes():
    """Return the process-listing lines that belong to Python interpreters.

    Uses ``tasklist`` on Windows and ``ps`` elsewhere. The command is run
    without a shell, so a missing tool raises ``OSError`` instead of silently
    producing an empty listing.

    Raises:
        OSError: the listing command could not be started.
        RuntimeError: the listing command exited with a non-zero status.
        subprocess.TimeoutExpired: the listing command did not finish in time.
    """
    # Imported locally so the block does not depend on a file-level import.
    import subprocess

    on_windows = os.name == "nt"
    command = ["tasklist"] if on_windows else ["ps", "-e", "-o", "pid,args"]
    result = subprocess.run(
        command,
        capture_output=True,
        text=True,
        errors="replace",  # odd bytes in process names must not abort the check
        timeout=15,
    )
    if result.returncode != 0:
        raise RuntimeError(
            f"{command[0]} exited with code {result.returncode}: "
            f"{result.stderr.strip()}"
        )

    lines = result.stdout.splitlines()
    if on_windows:
        return [line for line in lines if "python.exe" in line.lower()]

    matches = []
    for line in lines:
        # Columns are "PID ARGS..."; the first ARGS token is the executable.
        # The header row ("PID COMMAND") never matches the prefix test.
        fields = line.split(None, 2)
        if len(fields) < 2:
            continue
        if os.path.basename(fields[1]).lower().startswith("python"):
            matches.append(line)
    return matches


def check_training_status():
    """Print a status report for realtime RL training.

    The report covers, in order:
      * the three most recent ``runs/rl_training_*`` TensorBoard log
        directories (by name), with creation time, event-file count and the
        last event-file modification time;
      * up to five running Python processes;
      * the TensorBoard and dashboard URLs (printed only, not probed);
      * the three most recently modified ``models/rl/realtime_agent_*.pt``
        files.

    All paths are resolved relative to the current working directory.
    Sections whose directory does not exist are skipped. Returns ``None``.
    """
    banner = "=" * 60
    print(banner)
    print("REALTIME RL TRAINING STATUS CHECK")
    print(banner)

    # TensorBoard logs
    runs_dir = Path("runs")
    if runs_dir.exists():
        log_dirs = list(runs_dir.glob("rl_training_*"))
        recent_logs = sorted(log_dirs, key=lambda x: x.name)[-3:]  # Last 3 sessions
        print("\n📊 RECENT TENSORBOARD LOGS:")
        for log_dir in recent_logs:
            created = _creation_time(log_dir)
            event_files = list(log_dir.glob("*.tfevents.*"))
            print(f" 📁 {log_dir.name}")
            print(f" Created: {created.strftime('%Y-%m-%d %H:%M:%S')}")
            print(f" Event files: {len(event_files)}")
            if event_files:
                # Stat each event file once and keep the newest mtime.
                newest_mtime = max(f.stat().st_mtime for f in event_files)
                modified = datetime.fromtimestamp(newest_mtime)
                print(f" Last update: {modified.strftime('%Y-%m-%d %H:%M:%S')}")
            print()

    # Running processes
    print("🔍 PROCESS STATUS:")
    try:
        python_processes = _list_python_processes()
    except Exception as e:
        # Best-effort section: report the failure and continue with the rest.
        print(f" Error checking processes: {e}")
    else:
        print(f" Python processes running: {len(python_processes)}")
        for i, proc in enumerate(python_processes[:5], start=1):  # Show first 5
            print(f" {i}. {proc.strip()}")

    # Web services: the URLs are only listed, no connectivity check is made.
    print("\n🌐 WEB SERVICES:")
    print(" TensorBoard: http://localhost:6006")
    print(" Web Dashboard: http://localhost:8051")

    # Saved model checkpoints
    models_dir = Path("models/rl")
    if models_dir.exists():
        model_files = list(models_dir.glob("realtime_agent_*.pt"))
        print(f"\n💾 SAVED MODELS: {len(model_files)}")
        for model_file in sorted(model_files, key=lambda x: x.stat().st_mtime)[-3:]:
            modified = datetime.fromtimestamp(model_file.stat().st_mtime)
            print(f" 📄 {model_file.name} - {modified.strftime('%Y-%m-%d %H:%M:%S')}")

    print("\n" + banner)
    print("✅ MONITORING URLs:")
    print("📊 TensorBoard: http://localhost:6006")
    print("🌐 Dashboard: http://localhost:8051")
    print(banner)
if __name__ == "__main__":
    # Script entry point: run one status check and exit.
    try:
        check_training_status()
    except KeyboardInterrupt:
        # Ctrl+C is a normal way to leave; say goodbye without a traceback.
        print("\nMonitoring stopped.")
    except Exception as exc:
        # Anything else is reported as a one-line message.
        print(f"Error: {exc}")