From 1af3124be74853e36c0b01e2e4b83770ff2e9584 Mon Sep 17 00:00:00 2001
From: Dobromir Popov
Date: Wed, 12 Nov 2025 22:37:20 +0200
Subject: [PATCH] sync active training with UI if running

---
 .vscode/tasks.json                            |  12 +-
 ANNOTATE/core/real_training_adapter.py        |  46 ++++
 ANNOTATE/web/app.py                           |  42 ++++
 .../web/templates/annotation_dashboard.html   |  10 +
 .../templates/components/training_panel.html  |  40 +++-
 kill_dashboard.py                             | 247 ++++++++++++++++++
 requirements.txt                              |  19 ++
 7 files changed, 406 insertions(+), 10 deletions(-)
 create mode 100644 kill_dashboard.py

diff --git a/.vscode/tasks.json b/.vscode/tasks.json
index a533f33..e033218 100644
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -4,17 +4,13 @@
         {
             "label": "Kill Stale Processes",
             "type": "shell",
-            "command": "python",
+            "command": "${command:python.interpreterPath}",
             "args": [
-<<<<<<< HEAD
                 "kill_dashboard.py"
-=======
-                "-ExecutionPolicy",
-                "Bypass",
-                "-File",
-                "scripts/kill_stale_processes.ps1"
->>>>>>> d49a473ed6f4aef55bfdd47d6370e53582be6b7b
             ],
+            "options": {
+                "cwd": "${workspaceFolder}"
+            },
             "group": "build",
             "presentation": {
                 "echo": true,
diff --git a/ANNOTATE/core/real_training_adapter.py b/ANNOTATE/core/real_training_adapter.py
index f14fdd6..d4abd95 100644
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
             'duration_seconds': session.duration_seconds,
             'error': session.error
         }
+
+    def get_active_training_session(self) -> Optional[Dict]:
+        """
+        Get currently active training session (if any)
+
+        This allows the UI to resume tracking training progress after page reload
+
+        Returns:
+            Dict with training info if active session exists, None otherwise
+        """
+        # Find any session with 'running' status
+        for training_id, session in self.training_sessions.items():
+            if session.status == 'running':
+                return {
+                    'training_id': training_id,
+                    'status': session.status,
+                    'model_name': session.model_name,
+                    'test_cases_count': session.test_cases_count,
+                    'current_epoch': session.current_epoch,
+                    'total_epochs': session.total_epochs,
+                    'current_loss': session.current_loss,
+                    'start_time': session.start_time
+                }
+
+        return None
+
+    def get_all_training_sessions(self) -> List[Dict]:
+        """
+        Get all training sessions (for debugging/monitoring)
+
+        Returns:
+            List of all training session summaries
+        """
+        sessions = []
+        for training_id, session in self.training_sessions.items():
+            sessions.append({
+                'training_id': training_id,
+                'status': session.status,
+                'model_name': session.model_name,
+                'current_epoch': session.current_epoch,
+                'total_epochs': session.total_epochs,
+                'start_time': session.start_time,
+                'duration_seconds': session.duration_seconds
+            })
+
+        return sessions
 
     # Real-time inference support
 
diff --git a/ANNOTATE/web/app.py b/ANNOTATE/web/app.py
index f651de3..5305688 100644
--- a/ANNOTATE/web/app.py
+++ b/ANNOTATE/web/app.py
@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
                     }
                 })
 
+        @self.server.route('/api/active-training', methods=['GET'])
+        def get_active_training():
+            """
+            Get currently active training session (if any)
+            Allows UI to resume tracking after page reload or across multiple clients
+            """
+            try:
+                if not self.training_adapter:
+                    return jsonify({
+                        'success': False,
+                        'active': False,
+                        'error': {
+                            'code': 'TRAINING_UNAVAILABLE',
+                            'message': 'Real training adapter not available'
+                        }
+                    })
+
+                active_session = self.training_adapter.get_active_training_session()
+
+                if active_session:
+                    return jsonify({
+                        'success': True,
+                        'active': True,
+                        'session': active_session
+                    })
+                else:
+                    return jsonify({
+                        'success': True,
+                        'active': False
+                    })
+
+            except Exception as e:
+                logger.error(f"Error getting active training: {e}")
+                return jsonify({
+                    'success': False,
+                    'active': False,
+                    'error': {
+                        'code': 'ACTIVE_TRAINING_ERROR',
+                        'message': str(e)
+                    }
+                })
+
         # Live Training API Endpoints
         @self.server.route('/api/live-training/start', methods=['POST'])
         def start_live_training():
diff --git a/ANNOTATE/web/templates/annotation_dashboard.html b/ANNOTATE/web/templates/annotation_dashboard.html
index 30815dd..04cd479 100644
--- a/ANNOTATE/web/templates/annotation_dashboard.html
+++ b/ANNOTATE/web/templates/annotation_dashboard.html
@@ -92,6 +92,16 @@
             // Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
             loadInitialData();
 
+            // Load available models for training panel
+            if (typeof loadAvailableModels === 'function') {
+                loadAvailableModels();
+            }
+
+            // Check for active training session (resume tracking after page reload)
+            if (typeof checkActiveTraining === 'function') {
+                checkActiveTraining();
+            }
+
             // Setup keyboard shortcuts
             setupKeyboardShortcuts();
         });
diff --git a/ANNOTATE/web/templates/components/training_panel.html b/ANNOTATE/web/templates/components/training_panel.html
index 72441c0..586d87e 100644
--- a/ANNOTATE/web/templates/components/training_panel.html
+++ b/ANNOTATE/web/templates/components/training_panel.html
@@ -109,6 +109,30 @@
     // Track model states
     let modelStates = [];
     let selectedModel = null;
+    let activeTrainingId = null; // Track active training session
+
+    function checkActiveTraining() {
+        /**
+         * Check if there's an active training session on page load
+         * This allows resuming progress tracking after page reload
+         */
+        fetch('/api/active-training')
+            .then(response => response.json())
+            .then(data => {
+                if (data.success && data.active && data.session) {
+                    console.log('Active training session found:', data.session);
+                    // Resume tracking
+                    activeTrainingId = data.session.training_id;
+                    showTrainingStatus();
+                    pollTrainingProgress(activeTrainingId);
+                } else {
+                    console.log('No active training session');
+                }
+            })
+            .catch(error => {
+                console.error('Error checking active training:', error);
+            });
+    }
 
     function loadAvailableModels() {
         fetch('/api/available-models')
@@ -290,11 +314,16 @@
         startTraining(modelName, annotationIds);
     });
 
-    function startTraining(modelName, annotationIds) {
-        // Show training status
+    function showTrainingStatus() {
+        // Show training status UI
         document.getElementById('training-status').style.display = 'block';
         document.getElementById('training-results').style.display = 'none';
         document.getElementById('train-model-btn').disabled = true;
+    }
+
+    function startTraining(modelName, annotationIds) {
+        // Show training status
+        showTrainingStatus();
 
         // Reset progress
         document.getElementById('training-progress-bar').style.width = '0%';
@@ -313,18 +342,22 @@
             .then(response => response.json())
             .then(data => {
                 if (data.success) {
+                    // Store active training ID for persistence across reloads
+                    activeTrainingId = data.training_id;
                     // Start polling for training progress
                     pollTrainingProgress(data.training_id);
                 } else {
                     showError('Failed to start training: ' + data.error.message);
                     document.getElementById('training-status').style.display = 'none';
                     document.getElementById('train-model-btn').disabled = false;
+                    activeTrainingId = null;
                 }
             })
             .catch(error => {
                 showError('Network error: ' + error.message);
                 document.getElementById('training-status').style.display = 'none';
                 document.getElementById('train-model-btn').disabled = false;
+                activeTrainingId = null;
             });
     }
 
@@ -350,9 +383,11 @@
                         // Check if complete
                         if (progress.status === 'completed') {
                             clearInterval(pollInterval);
+                            activeTrainingId = null; // Clear active training
                             showTrainingResults(progress);
                         } else if (progress.status === 'failed') {
                             clearInterval(pollInterval);
+                            activeTrainingId = null; // Clear active training
                             showError('Training failed: ' + progress.error);
                             document.getElementById('training-status').style.display = 'none';
                             document.getElementById('train-model-btn').disabled = false;
@@ -361,6 +396,7 @@
                 })
                 .catch(error => {
                     clearInterval(pollInterval);
+                    // Don't clear activeTrainingId on network error - training might still be running
                     showError('Failed to get training progress: ' + error.message);
                     document.getElementById('training-status').style.display = 'none';
                     document.getElementById('train-model-btn').disabled = false;
diff --git a/kill_dashboard.py b/kill_dashboard.py
new file mode 100644
index 0000000..f516952
--- /dev/null
+++ b/kill_dashboard.py
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+Cross-platform script to kill stale Python dashboard processes.
+Works on Linux, macOS, and Windows.
+"""
+
+import os
+import sys
+import signal
+import psutil
+import time
+from pathlib import Path
+
+# Dashboard script names to look for
+DASHBOARD_SCRIPTS = [
+    'main_dashboard.py',
+    'run_clean_dashboard.py',
+    'web/clean_dashboard.py',
+    'web/cob_realtime_dashboard.py',
+    'ANNOTATE/web/app.py',
+    'COBY/main.py',
+    'run_integrated_rl_cob_dashboard.py',
+    'run_realtime_rl_cob_trader.py',
+]
+
+# Ports used by dashboards
+DASHBOARD_PORTS = [8050, 8051, 8052, 8080, 8081, 5001]
+
+
+def _connections(proc):
+    """Return the socket connections of a psutil process.
+
+    psutil 6.0 renamed Process.connections() to net_connections() and
+    deprecated the old name, so prefer the new method when it exists and
+    fall back to the old one on older psutil releases.
+    """
+    getter = getattr(proc, 'net_connections', None) or proc.connections
+    return getter()
+
+
+def is_dashboard_process(proc):
+    """Check if a process is a dashboard process
+
+    A process counts as a dashboard process when its command line mentions
+    Python and either names one of DASHBOARD_SCRIPTS or it holds a socket
+    whose local port is in DASHBOARD_PORTS. The current process is never
+    reported. Returns False when the process cannot be inspected.
+    """
+    try:
+        # Never report ourselves, otherwise this script could kill itself
+        if proc.pid == os.getpid():
+            return False
+
+        # Get process info
+        cmdline = proc.cmdline()
+        if not cmdline:
+            return False
+
+        # Join command line for easier searching; normalise separators so
+        # 'web/clean_dashboard.py' also matches Windows 'web\...' paths
+        cmd_str = ' '.join(cmdline).lower().replace('\\', '/')
+
+        # Check if it's a Python process
+        if 'python' not in cmd_str:
+            return False
+
+        # Check if it's running one of our dashboard scripts
+        if any(script.lower() in cmd_str for script in DASHBOARD_SCRIPTS):
+            return True
+
+        # Check if it's using one of our dashboard ports
+        try:
+            for conn in _connections(proc):
+                if hasattr(conn, 'laddr') and conn.laddr:
+                    if conn.laddr.port in DASHBOARD_PORTS:
+                        return True
+        except (psutil.AccessDenied, psutil.NoSuchProcess):
+            pass
+
+        return False
+
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+        return False
+
+
+def kill_process(proc, force=False):
+    """Kill a process gracefully or forcefully
+
+    Returns True when the signal was delivered, False when the process is
+    already gone or may not be signalled.
+    """
+    try:
+        pid = proc.pid
+        name = proc.name()
+
+        # psutil's terminate()/kill() already pick the right mechanism per
+        # platform and report permission problems as psutil.AccessDenied;
+        # a raw os.kill() raises PermissionError, which was not handled here
+        if force:
+            proc.kill()
+            print(f" Force killed PID {pid} ({name})")
+        else:
+            proc.terminate()
+            print(f" Terminated PID {pid} ({name})")
+
+        return True
+
+    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError) as e:
+        print(f" Could not kill PID {proc.pid}: {e}")
+        return False
+
+
+def find_processes_on_ports():
+    """Find processes listening on dashboard ports
+
+    Returns a list of dicts with 'pid', 'name' and 'port' keys, one entry
+    per listening socket on a port in DASHBOARD_PORTS.
+    """
+    port_processes = []
+
+    for proc in psutil.process_iter(['pid', 'name']):
+        try:
+            for conn in _connections(proc):
+                if hasattr(conn, 'laddr') and conn.laddr:
+                    if conn.laddr.port in DASHBOARD_PORTS and conn.status == 'LISTEN':
+                        port_processes.append({
+                            'pid': proc.pid,
+                            'name': proc.name(),
+                            'port': conn.laddr.port
+                        })
+        except (psutil.AccessDenied, psutil.NoSuchProcess):
+            pass
+
+    return port_processes
+
+
+def main():
+    """Find dashboard processes, stop them and report the port status"""
+    print("=" * 60)
+    print("Killing Stale Dashboard Processes")
+    print("=" * 60)
+
+    # Find dashboard processes
+    dashboard_procs = []
+
+    print("\nScanning for dashboard processes...")
+    for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
+        if is_dashboard_process(proc):
+            dashboard_procs.append(proc)
+
+    if not dashboard_procs:
+        print(" No dashboard processes found")
+    else:
+        print(f" Found {len(dashboard_procs)} dashboard process(es):")
+        for proc in dashboard_procs:
+            try:
+                cmdline = ' '.join(proc.cmdline())
+                print(f" - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                print(f" - PID {proc.pid}: (access denied)")
+
+    # Find processes on dashboard ports
+    print("\nChecking for processes on dashboard ports...")
+    port_procs = find_processes_on_ports()
+
+    if not port_procs:
+        print(" No processes found on dashboard ports")
+    else:
+        print(f" Found {len(port_procs)} process(es) on dashboard ports:")
+        for p in port_procs:
+            print(f" - PID {p['pid']} ({p['name']}) on port {p['port']}")
+            # Add to kill list if not already there
+            try:
+                proc = psutil.Process(p['pid'])
+                if proc not in dashboard_procs:
+                    dashboard_procs.append(proc)
+            except psutil.NoSuchProcess:
+                pass
+
+    # Kill all found processes
+    if dashboard_procs:
+        print("\nTerminating processes...")
+        killed_count = 0
+
+        # First, try graceful termination
+        for proc in dashboard_procs:
+            if kill_process(proc, force=False):
+                killed_count += 1
+
+        # Wait for processes to exit
+        print("\nWaiting for processes to exit...")
+        time.sleep(2)
+
+        # Force kill any remaining processes
+        remaining = []
+        for proc in dashboard_procs:
+            try:
+                if proc.is_running():
+                    remaining.append(proc)
+            except psutil.NoSuchProcess:
+                pass
+
+        if remaining:
+            print(f"\nForce killing {len(remaining)} remaining process(es)...")
+            for proc in remaining:
+                kill_process(proc, force=True)
+
+        print(f"\n✓ Killed {killed_count} dashboard process(es)")
+    else:
+        print("\n✓ No processes to kill")
+
+    # Report each port once, from a single scan of all processes
+    print("\nPort status:")
+    listeners = {}
+    for proc in psutil.process_iter():
+        try:
+            for conn in _connections(proc):
+                if hasattr(conn, 'laddr') and conn.laddr:
+                    if conn.status == 'LISTEN':
+                        # Keep the first listener found for each port
+                        listeners.setdefault(conn.laddr.port, proc.pid)
+        except (psutil.AccessDenied, psutil.NoSuchProcess):
+            pass
+
+    for port in DASHBOARD_PORTS:
+        if port in listeners:
+            print(f" Port {port}: IN USE by PID {listeners[port]}")
+        else:
+            print(f" Port {port}: FREE")
+
+    print("\n" + "=" * 60)
+    print("Process cleanup completed")
+    print("=" * 60)
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\nInterrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\nError: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
diff --git a/requirements.txt b/requirements.txt
index 913e2c3..4dff287 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,25 @@ pybit>=5.11.0
 requests>=2.31.0
 asyncpg>=0.29.0
 
+# COBY System Dependencies
+fastapi>=0.104.0
+uvicorn>=0.24.0
+pydantic>=2.5.0
+python-multipart>=0.0.6
+aiohttp>=3.9.0
+redis>=5.0.0
+scipy>=1.11.0
+structlog>=23.2.0
+click>=8.1.0
+rich>=13.7.0
+
+# DuckDB for data storage
+duckdb>=0.9.0
+
+# Flask for ANNOTATE system
+Flask>=3.0.0
+flask-cors>=4.0.0
+
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
 # Install one of the following sets manually depending on your hardware:
 #