This commit is contained in:
Dobromir Popov
2025-05-24 09:58:36 +03:00
parent ef71160282
commit 0fe8286787
11 changed files with 1396 additions and 483 deletions

83
monitor_training.py Normal file
View File

@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Training Monitor Script
Quick script to check the status of realtime training and show key metrics.
"""
import os
import time
from pathlib import Path
from datetime import datetime
import glob
def _print_log_dir_summary(log_dir):
    """Print name, creation time and event-file summary for one run directory."""
    stat = log_dir.stat()
    # st_ctime is only a creation time on Windows; on POSIX it is the last
    # metadata change. Prefer st_birthtime where the platform exposes it.
    created = datetime.fromtimestamp(getattr(stat, "st_birthtime", stat.st_ctime))
    event_files = list(log_dir.glob("*.tfevents.*"))

    print(f" 📁 {log_dir.name}")
    print(f" Created: {created.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Event files: {len(event_files)}")
    if event_files:
        # The newest event file tells us when the run last wrote anything.
        last_mtime = max(event.stat().st_mtime for event in event_files)
        modified = datetime.fromtimestamp(last_mtime)
        print(f" Last update: {modified.strftime('%Y-%m-%d %H:%M:%S')}")
    print()


def _print_python_processes():
    """Print how many Python processes are running and list the first five.

    Uses ``tasklist`` on Windows and ``ps`` elsewhere. Any failure to run the
    listing command (missing binary, non-zero exit status, undecodable
    output) is reported as an error line instead of a misleading zero count.
    """
    import subprocess  # local import: only this step needs it

    if os.name == "nt":
        command, needle = ["tasklist"], "python.exe"
    else:
        command, needle = ["ps", "-e", "-o", "pid,args"], "python"

    try:
        # No shell: the command is a fixed argument list, and without a shell
        # a missing executable raises instead of silently yielding no output.
        result = subprocess.run(command, capture_output=True, text=True, check=True)
    except (OSError, subprocess.SubprocessError, UnicodeError) as e:
        print(f" Error checking processes: {e}")
        return

    python_processes = [
        line.strip() for line in result.stdout.splitlines() if needle in line
    ]
    print(f" Python processes running: {len(python_processes)}")
    for index, proc in enumerate(python_processes[:5], start=1):  # Show first 5
        print(f" {index}. {proc}")


def _print_saved_models(models_dir):
    """Print the number of saved agent checkpoints and the three newest ones."""
    model_files = list(models_dir.glob("realtime_agent_*.pt"))
    print(f"\n💾 SAVED MODELS: {len(model_files)}")
    newest = sorted(model_files, key=lambda path: path.stat().st_mtime)[-3:]
    for model_file in newest:
        modified = datetime.fromtimestamp(model_file.stat().st_mtime)
        print(f" 📄 {model_file.name} - {modified.strftime('%Y-%m-%d %H:%M:%S')}")


def check_training_status():
    """Print a status report for the realtime RL training session.

    The report, written to stdout, covers:

    * the last three ``runs/rl_training_*`` directories (ordered by name)
      with their TensorBoard event-file count and last update time;
    * the Python processes currently running on this machine;
    * the fixed local URLs of TensorBoard and the web dashboard;
    * the three most recently modified ``models/rl/realtime_agent_*.pt`` files.

    Paths are resolved relative to the current working directory. Sections
    whose directory does not exist are skipped. Returns ``None``.
    """
    banner = "=" * 60
    print(banner)
    print("REALTIME RL TRAINING STATUS CHECK")
    print(banner)

    # Check TensorBoard logs
    runs_dir = Path("runs")
    if runs_dir.is_dir():
        log_dirs = sorted(runs_dir.glob("rl_training_*"), key=lambda path: path.name)
        print("\n📊 RECENT TENSORBOARD LOGS:")
        for log_dir in log_dirs[-3:]:  # Last 3 sessions
            _print_log_dir_summary(log_dir)

    # Check running processes
    print("🔍 PROCESS STATUS:")
    _print_python_processes()

    # Check web services (informational only: the URLs are not probed)
    print("\n🌐 WEB SERVICES:")
    print(" TensorBoard: http://localhost:6006")
    print(" Web Dashboard: http://localhost:8051")

    # Check model saves
    models_dir = Path("models/rl")
    if models_dir.is_dir():
        _print_saved_models(models_dir)

    print("\n" + banner)
    print("✅ MONITORING URLs:")
    print("📊 TensorBoard: http://localhost:6006")
    print("🌐 Dashboard: http://localhost:8051")
    print(banner)
if __name__ == "__main__":
    # Script entry point: run one status check and translate failures into a
    # process exit status that shells and schedulers can act on.
    try:
        check_training_status()
    except KeyboardInterrupt:
        # Ctrl+C is a deliberate user action, not a failure.
        print("\nMonitoring stopped.")
    except Exception as e:
        # Top-level boundary: SystemExit with a string writes the message to
        # stderr and exits with status 1, instead of reporting success.
        raise SystemExit(f"Error: {e}") from e