sync active training with UI if running

This commit is contained in:
Dobromir Popov
2025-11-12 22:37:20 +02:00
parent fcbc475686
commit 1af3124be7
7 changed files with 384 additions and 10 deletions

12
.vscode/tasks.json vendored
View File

@@ -4,17 +4,13 @@
{
"label": "Kill Stale Processes",
"type": "shell",
"command": "python",
"command": "${command:python.interpreterPath}",
"args": [
<<<<<<< HEAD
"kill_dashboard.py"
=======
"-ExecutionPolicy",
"Bypass",
"-File",
"scripts/kill_stale_processes.ps1"
>>>>>>> d49a473ed6f4aef55bfdd47d6370e53582be6b7b
],
"options": {
"cwd": "${workspaceFolder}"
},
"group": "build",
"presentation": {
"echo": true,

View File

@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
'duration_seconds': session.duration_seconds,
'error': session.error
}
def get_active_training_session(self) -> Optional[Dict]:
    """
    Return a summary of the first training session whose status is 'running'.

    Lets the UI pick up progress tracking again after a page reload.

    Returns:
        A dict with the session's id, status, model name, test-case count,
        epoch counters, current loss and start time, or None when no entry in
        ``self.training_sessions`` is running.
    """
    # Sessions are scanned in dict order; the first running one wins.
    running_entry = next(
        (
            (session_id, candidate)
            for session_id, candidate in self.training_sessions.items()
            if candidate.status == 'running'
        ),
        None,
    )
    if running_entry is None:
        return None

    session_id, active = running_entry
    return {
        'training_id': session_id,
        'status': active.status,
        'model_name': active.model_name,
        'test_cases_count': active.test_cases_count,
        'current_epoch': active.current_epoch,
        'total_epochs': active.total_epochs,
        'current_loss': active.current_loss,
        'start_time': active.start_time,
    }
def get_all_training_sessions(self) -> List[Dict]:
    """
    Summarise every known training session (for debugging/monitoring).

    Returns:
        One dict per entry in ``self.training_sessions``, in dict order,
        carrying the session id, status, model name, epoch counters, start
        time and duration. Empty list when there are no sessions.
    """
    return [
        {
            'training_id': session_id,
            'status': entry.status,
            'model_name': entry.model_name,
            'current_epoch': entry.current_epoch,
            'total_epochs': entry.total_epochs,
            'start_time': entry.start_time,
            'duration_seconds': entry.duration_seconds,
        }
        for session_id, entry in self.training_sessions.items()
    ]
# Real-time inference support

View File

@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
}
})
@self.server.route('/api/active-training', methods=['GET'])
def get_active_training():
    """
    Report the currently running training session, if there is one.

    Lets the UI resume progress tracking after a page reload or from a
    second client. The JSON body always carries 'success' and 'active';
    'session' is present only when a running session was found, and
    'error' only when the adapter is missing or the lookup raised.
    """
    try:
        # Guard: without an adapter there is nothing to query.
        if not self.training_adapter:
            unavailable = {
                'success': False,
                'active': False,
                'error': {
                    'code': 'TRAINING_UNAVAILABLE',
                    'message': 'Real training adapter not available'
                }
            }
            return jsonify(unavailable)

        session_info = self.training_adapter.get_active_training_session()
        if not session_info:
            return jsonify({
                'success': True,
                'active': False
            })

        return jsonify({
            'success': True,
            'active': True,
            'session': session_info
        })
    except Exception as e:
        logger.error(f"Error getting active training: {e}")
        failure = {
            'success': False,
            'active': False,
            'error': {
                'code': 'ACTIVE_TRAINING_ERROR',
                'message': str(e)
            }
        }
        return jsonify(failure)
# Live Training API Endpoints
@self.server.route('/api/live-training/start', methods=['POST'])
def start_live_training():

View File

@@ -92,6 +92,16 @@
// Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
loadInitialData();
// Load available models for training panel
if (typeof loadAvailableModels === 'function') {
loadAvailableModels();
}
// Check for active training session (resume tracking after page reload)
if (typeof checkActiveTraining === 'function') {
checkActiveTraining();
}
// Setup keyboard shortcuts
setupKeyboardShortcuts();
});

View File

@@ -109,6 +109,30 @@
// Track model states
// NOTE(review): where modelStates and selectedModel are populated is not visible here — confirm in the rest of the file.
let modelStates = [];
let selectedModel = null;
// ID of the training session the UI is tracking, or null when there is none.
// Assigned from the server response when training starts or when an active
// session is found on page load; reset to null once training completes or fails.
let activeTrainingId = null; // Track active training session
function checkActiveTraining() {
    /**
     * Ask the server whether a training session is already running and, if
     * so, resume progress tracking for it. Intended to run on page load so a
     * reload does not lose track of ongoing training.
     */
    const resumeIfActive = function (data) {
        const hasActiveSession = data.success && data.active && data.session;
        if (!hasActiveSession) {
            console.log('No active training session');
            return;
        }

        console.log('Active training session found:', data.session);

        // Remember the session, show the status panel, then start polling.
        activeTrainingId = data.session.training_id;
        showTrainingStatus();
        pollTrainingProgress(activeTrainingId);
    };

    const reportFailure = function (error) {
        console.error('Error checking active training:', error);
    };

    fetch('/api/active-training')
        .then(function (response) { return response.json(); })
        .then(resumeIfActive)
        .catch(reportFailure);
}
function loadAvailableModels() {
fetch('/api/available-models')
@@ -290,11 +314,16 @@
startTraining(modelName, annotationIds);
});
function startTraining(modelName, annotationIds) {
// Show training status
function showTrainingStatus() {
    // Switch the panel into "training in progress" mode: reveal the status
    // area, hide any previous results and lock the train button.
    const statusPanel = document.getElementById('training-status');
    const resultsPanel = document.getElementById('training-results');
    const trainButton = document.getElementById('train-model-btn');

    statusPanel.style.display = 'block';
    resultsPanel.style.display = 'none';
    trainButton.disabled = true;
}
function startTraining(modelName, annotationIds) {
// Show training status
showTrainingStatus();
// Reset progress
document.getElementById('training-progress-bar').style.width = '0%';
@@ -313,18 +342,22 @@
.then(response => response.json())
.then(data => {
if (data.success) {
// Store active training ID for persistence across reloads
activeTrainingId = data.training_id;
// Start polling for training progress
pollTrainingProgress(data.training_id);
} else {
showError('Failed to start training: ' + data.error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
activeTrainingId = null;
}
})
.catch(error => {
showError('Network error: ' + error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
activeTrainingId = null;
});
}
@@ -350,9 +383,11 @@
// Check if complete
if (progress.status === 'completed') {
clearInterval(pollInterval);
activeTrainingId = null; // Clear active training
showTrainingResults(progress);
} else if (progress.status === 'failed') {
clearInterval(pollInterval);
activeTrainingId = null; // Clear active training
showError('Training failed: ' + progress.error);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;
@@ -361,6 +396,7 @@
})
.catch(error => {
clearInterval(pollInterval);
// Don't clear activeTrainingId on network error - training might still be running
showError('Failed to get training progress: ' + error.message);
document.getElementById('training-status').style.display = 'none';
document.getElementById('train-model-btn').disabled = false;

225
kill_dashboard.py Normal file
View File

@@ -0,0 +1,225 @@
#!/usr/bin/env python3
"""
Cross-platform script to kill stale Python dashboard processes.
Works on Linux, macOS, and Windows.
"""
import os
import sys
import signal
import psutil
import time
from pathlib import Path
# Dashboard script names to look for
# Each entry is matched as a case-insensitive substring of a process's joined
# command line (see is_dashboard_process).
DASHBOARD_SCRIPTS = [
'main_dashboard.py',
'run_clean_dashboard.py',
'web/clean_dashboard.py',
'web/cob_realtime_dashboard.py',
'ANNOTATE/web/app.py',
'COBY/main.py',
'run_integrated_rl_cob_dashboard.py',
'run_realtime_rl_cob_trader.py',
]
# Ports used by dashboards
# Compared against the local port of each process connection to spot
# dashboards that were not matched by script name.
DASHBOARD_PORTS = [8050, 8051, 8052, 8080, 8081, 5001]
def is_dashboard_process(proc):
    """Check if a process is a dashboard process.

    A process qualifies when its command line mentions ``python`` and either
    contains one of the ``DASHBOARD_SCRIPTS`` entries or the process has a
    connection whose local port is listed in ``DASHBOARD_PORTS``.

    Matching is case-insensitive. Backslashes in the command line are treated
    as forward slashes so that Windows-style paths such as
    ``web\\clean_dashboard.py`` match the ``/``-separated script entries.

    Args:
        proc: A ``psutil.Process``-like object providing ``cmdline()`` and
            ``connections()``.

    Returns:
        True if the process looks like a dashboard, otherwise False. False is
        also returned when the process has vanished, is a zombie, or cannot
        be inspected.
    """
    try:
        cmdline = proc.cmdline()
        if not cmdline:
            return False

        # Join for substring search; normalise case and path separators so the
        # same script list works on Windows and POSIX.
        cmd_str = ' '.join(cmdline).lower().replace('\\', '/')

        # Only Python processes are candidates.
        if 'python' not in cmd_str:
            return False

        # Match by script name.
        if any(script.lower() in cmd_str for script in DASHBOARD_SCRIPTS):
            return True

        # Fall back to matching by local port.
        try:
            for conn in proc.connections():
                if hasattr(conn, 'laddr') and conn.laddr:
                    if conn.laddr.port in DASHBOARD_PORTS:
                        return True
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Connections are not readable; the script check already failed.
            pass

        return False
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
def kill_process(proc, force=False):
    """Kill a process gracefully or forcefully.

    Always goes through the psutil process object rather than ``os.kill`` so
    that permission failures surface as ``psutil.AccessDenied`` (handled
    below) instead of an uncaught ``PermissionError``, and so psutil can
    refuse to signal a PID that has been reused by another process.

    Args:
        proc: A ``psutil.Process``-like object providing ``pid``, ``name()``,
            ``terminate()`` and ``kill()``.
        force: When True, kill the process outright; otherwise request a
            graceful termination.

    Returns:
        True if the request was delivered, False if the process no longer
        exists or access was denied.
    """
    try:
        pid = proc.pid
        name = proc.name()

        if force:
            # Force kill
            proc.kill()
            print(f" Force killed PID {pid} ({name})")
        else:
            # Graceful termination
            proc.terminate()
            print(f" Terminated PID {pid} ({name})")

        return True
    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError) as e:
        print(f" Could not kill PID {proc.pid}: {e}")
        return False
def find_processes_on_ports():
    """Find processes listening on dashboard ports.

    Scans every process's connections and records those in LISTEN state
    whose local port is in ``DASHBOARD_PORTS``. A process holding several
    listening sockets on the same port is reported once for that port.

    Returns:
        A list of dicts with keys 'pid', 'name' and 'port', in process
        iteration order. Processes that vanish or cannot be inspected are
        skipped.
    """
    port_processes = []
    seen = set()  # (pid, port) pairs already reported

    for proc in psutil.process_iter(['pid', 'name']):
        try:
            for conn in proc.connections():
                if not (hasattr(conn, 'laddr') and conn.laddr):
                    continue
                port = conn.laddr.port
                if port not in DASHBOARD_PORTS or conn.status != 'LISTEN':
                    continue
                key = (proc.pid, port)
                if key in seen:
                    continue
                entry = {
                    'pid': proc.pid,
                    'name': proc.name(),
                    'port': port
                }
                seen.add(key)
                port_processes.append(entry)
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            pass

    return port_processes
def main():
    """Find and kill stale dashboard processes, then report port status.

    Steps:
        1. Collect processes that ``is_dashboard_process`` recognises.
        2. Add any process listening on a dashboard port that was not already
           collected.
        3. Ask every collected process to terminate, wait up to two seconds
           for them to exit, then force kill whatever is still alive.
        4. Print, for each dashboard port, whether it is free or which PID is
           still listening on it.
    """
    print("=" * 60)
    print("Killing Stale Dashboard Processes")
    print("=" * 60)

    # Find dashboard processes
    print("\nScanning for dashboard processes...")
    dashboard_procs = [
        proc
        for proc in psutil.process_iter(['pid', 'name', 'cmdline'])
        if is_dashboard_process(proc)
    ]

    if not dashboard_procs:
        print(" No dashboard processes found")
    else:
        print(f" Found {len(dashboard_procs)} dashboard process(es):")
        for proc in dashboard_procs:
            try:
                cmdline = ' '.join(proc.cmdline())
                print(f" - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                print(f" - PID {proc.pid}: (access denied)")

    # Find processes on dashboard ports
    print("\nChecking for processes on dashboard ports...")
    port_procs = find_processes_on_ports()

    if not port_procs:
        print(" No processes found on dashboard ports")
    else:
        print(f" Found {len(port_procs)} process(es) on dashboard ports:")
        for p in port_procs:
            print(f" - PID {p['pid']} ({p['name']}) on port {p['port']}")
            # Add to kill list if not already there
            try:
                proc = psutil.Process(p['pid'])
                if proc not in dashboard_procs:
                    dashboard_procs.append(proc)
            except psutil.NoSuchProcess:
                pass

    # Kill all found processes
    if dashboard_procs:
        print("\nTerminating processes...")

        # First, try graceful termination
        killed_count = 0
        for proc in dashboard_procs:
            if kill_process(proc, force=False):
                killed_count += 1

        # Wait for processes to exit. wait_procs returns as soon as all of
        # them are gone (or after the timeout) and reports which are still
        # alive, so no fixed sleep is needed.
        print("\nWaiting for processes to exit...")
        _gone, remaining = psutil.wait_procs(dashboard_procs, timeout=2)

        # Force kill any remaining processes
        if remaining:
            print(f"\nForce killing {len(remaining)} remaining process(es)...")
            for proc in remaining:
                kill_process(proc, force=True)
            # Give the kills a moment to land so the port report below
            # reflects the final state.
            psutil.wait_procs(remaining, timeout=2)

        print(f"\n✓ Killed {killed_count} dashboard process(es)")
    else:
        print("\n✓ No processes to kill")

    # Report port status from a single scan instead of rescanning every
    # process once per port. The first listener found for a port is reported.
    print("\nPort status:")
    listeners = {}
    for entry in find_processes_on_ports():
        listeners.setdefault(entry['port'], entry['pid'])

    for port in DASHBOARD_PORTS:
        if port in listeners:
            print(f" Port {port}: IN USE by PID {listeners[port]}")
        else:
            print(f" Port {port}: FREE")

    print("\n" + "=" * 60)
    print("Process cleanup completed")
    print("=" * 60)
if __name__ == '__main__':
    # Script entry point: run the cleanup and exit with status 1 when it is
    # interrupted or fails; a normal run falls through with status 0.
    exit_status = 0
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        exit_status = 1
    except Exception as exc:
        print(f"\nError: {exc}")
        import traceback
        traceback.print_exc()
        exit_status = 1

    if exit_status:
        sys.exit(exit_status)

View File

@@ -19,6 +19,25 @@ pybit>=5.11.0
requests>=2.31.0
asyncpg>=0.29.0
# COBY System Dependencies
fastapi>=0.104.0
uvicorn>=0.24.0
pydantic>=2.5.0
python-multipart>=0.0.6
aiohttp>=3.9.0
redis>=5.0.0
scipy>=1.11.0
structlog>=23.2.0
click>=8.1.0
rich>=13.7.0
# DuckDB for data storage
duckdb>=0.9.0
# Flask for ANNOTATE system
Flask>=3.0.0
flask-cors>=4.0.0
# NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
# Install one of the following sets manually depending on your hardware:
#