sync active training with UI if running

2025-11-12 22:37:20 +02:00
parent fcbc475686
commit 1af3124be7
7 changed files with 384 additions and 10 deletions
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -4,17 +4,13 @@
        {
            "label": "Kill Stale Processes",
            "type": "shell",
-            "command": "python",
+            "command": "${command:python.interpreterPath}",
            "args": [
                "kill_dashboard.py"
            ],
            "options": {
                "cwd": "${workspaceFolder}"
            },
            "group": "build",
            "presentation": {
                "echo": true,
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
            'duration_seconds': session.duration_seconds,
            'error': session.error
        }
    def get_active_training_session(self) -> Optional[Dict]:
        """
        Look up the first training session whose status is 'running'.

        The UI calls this to pick progress tracking back up after a page reload.

        Returns:
            Summary dict for the running session, or None when nothing is running
        """
        # Take the first running entry in dict iteration order (if there is one)
        running_entry = next(
            (
                (session_id, candidate)
                for session_id, candidate in self.training_sessions.items()
                if candidate.status == 'running'
            ),
            None,
        )
        if running_entry is None:
            return None

        session_id, candidate = running_entry
        return {
            'training_id': session_id,
            'status': candidate.status,
            'model_name': candidate.model_name,
            'test_cases_count': candidate.test_cases_count,
            'current_epoch': candidate.current_epoch,
            'total_epochs': candidate.total_epochs,
            'current_loss': candidate.current_loss,
            'start_time': candidate.start_time
        }
    def get_all_training_sessions(self) -> List[Dict]:
        """
        Summarize every known training session (debugging / monitoring aid).

        Returns:
            One summary dict per session, in the iteration order of
            self.training_sessions
        """
        return [
            {
                'training_id': session_id,
                'status': entry.status,
                'model_name': entry.model_name,
                'current_epoch': entry.current_epoch,
                'total_epochs': entry.total_epochs,
                'start_time': entry.start_time,
                'duration_seconds': entry.duration_seconds
            }
            for session_id, entry in self.training_sessions.items()
        ]
    # Real-time inference support
--- a/ANNOTATE/web/app.py
+++ b/ANNOTATE/web/app.py
@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
                    }
                })
        @self.server.route('/api/active-training', methods=['GET'])
        def get_active_training():
            """
            Report the currently running training session, if there is one.

            Used by the UI to resume progress tracking after a page reload and
            to keep several clients in sync with the same session.
            """
            def _failure(code, message):
                # Shared shape for both failure responses of this endpoint
                return jsonify({
                    'success': False,
                    'active': False,
                    'error': {
                        'code': code,
                        'message': message
                    }
                })

            try:
                if not self.training_adapter:
                    return _failure('TRAINING_UNAVAILABLE',
                                    'Real training adapter not available')

                running = self.training_adapter.get_active_training_session()
                if not running:
                    return jsonify({
                        'success': True,
                        'active': False
                    })

                return jsonify({
                    'success': True,
                    'active': True,
                    'session': running
                })
            except Exception as e:
                logger.error(f"Error getting active training: {e}")
                return _failure('ACTIVE_TRAINING_ERROR', str(e))
        # Live Training API Endpoints
        @self.server.route('/api/live-training/start', methods=['POST'])
        def start_live_training():
--- a/ANNOTATE/web/templates/annotation_dashboard.html
+++ b/ANNOTATE/web/templates/annotation_dashboard.html
@@ -92,6 +92,16 @@
        // Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
        loadInitialData();
        // Load available models for training panel
        if (typeof loadAvailableModels === 'function') {
            loadAvailableModels();
        }
        // Check for active training session (resume tracking after page reload)
        if (typeof checkActiveTraining === 'function') {
            checkActiveTraining();
        }
        // Setup keyboard shortcuts
        setupKeyboardShortcuts();
    });
--- a/ANNOTATE/web/templates/components/training_panel.html
+++ b/ANNOTATE/web/templates/components/training_panel.html
@@ -109,6 +109,30 @@
    // Track model states
    let modelStates = [];
    let selectedModel = null;
    let activeTrainingId = null;  // Track active training session
    function checkActiveTraining() {
        /**
         * On page load, ask the backend whether a training session is running.
         * If one is, remember its id and resume progress polling for it.
         */
        const resumeIfActive = (payload) => {
            const hasSession = payload.success && payload.active && payload.session;
            if (!hasSession) {
                console.log('No active training session');
                return;
            }

            console.log('Active training session found:', payload.session);

            // Pick tracking back up where the previous page left off
            activeTrainingId = payload.session.training_id;
            showTrainingStatus();
            pollTrainingProgress(activeTrainingId);
        };

        fetch('/api/active-training')
            .then((res) => res.json())
            .then(resumeIfActive)
            .catch((err) => {
                console.error('Error checking active training:', err);
            });
    }
    function loadAvailableModels() {
        fetch('/api/available-models')
@@ -290,11 +314,16 @@
        startTraining(modelName, annotationIds);
    });
-    function startTraining(modelName, annotationIds) {
+    function showTrainingStatus() {
-        // Show training status
+        // Show training status UI
        document.getElementById('training-status').style.display = 'block';
        document.getElementById('training-results').style.display = 'none';
        document.getElementById('train-model-btn').disabled = true;
    }
    function startTraining(modelName, annotationIds) {
        // Show training status
        showTrainingStatus();
        // Reset progress
        document.getElementById('training-progress-bar').style.width = '0%';
@@ -313,18 +342,22 @@
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    // Store active training ID for persistence across reloads
                    activeTrainingId = data.training_id;
                    // Start polling for training progress
                    pollTrainingProgress(data.training_id);
                } else {
                    showError('Failed to start training: ' + data.error.message);
                    document.getElementById('training-status').style.display = 'none';
                    document.getElementById('train-model-btn').disabled = false;
                    activeTrainingId = null;
                }
            })
            .catch(error => {
                showError('Network error: ' + error.message);
                document.getElementById('training-status').style.display = 'none';
                document.getElementById('train-model-btn').disabled = false;
                activeTrainingId = null;
            });
    }
@@ -350,9 +383,11 @@
                        // Check if complete
                        if (progress.status === 'completed') {
                            clearInterval(pollInterval);
                            activeTrainingId = null;  // Clear active training
                            showTrainingResults(progress);
                        } else if (progress.status === 'failed') {
                            clearInterval(pollInterval);
                            activeTrainingId = null;  // Clear active training
                            showError('Training failed: ' + progress.error);
                            document.getElementById('training-status').style.display = 'none';
                            document.getElementById('train-model-btn').disabled = false;
@@ -361,6 +396,7 @@
                })
                .catch(error => {
                    clearInterval(pollInterval);
                    // Don't clear activeTrainingId on network error - training might still be running
                    showError('Failed to get training progress: ' + error.message);
                    document.getElementById('training-status').style.display = 'none';
                    document.getElementById('train-model-btn').disabled = false;
--- a/kill_dashboard.py
+++ b/kill_dashboard.py
@@ -0,0 +1,225 @@
 #!/usr/bin/env python3
 """
 Cross-platform script to kill stale Python dashboard processes.
 Works on Linux, macOS, and Windows.
 """
 import os
 import sys
 import signal
 import psutil
 import time
 from pathlib import Path
 # Dashboard script names to look for.
 # is_dashboard_process() lower-cases each entry and looks for it as a
 # substring of the process's lower-cased, space-joined command line.
 DASHBOARD_SCRIPTS = [
    'main_dashboard.py',
    'run_clean_dashboard.py',
    'web/clean_dashboard.py',
    'web/cob_realtime_dashboard.py',
    'ANNOTATE/web/app.py',
    'COBY/main.py',
    'run_integrated_rl_cob_dashboard.py',
    'run_realtime_rl_cob_trader.py',
 ]
 # Ports used by dashboards.
 # Compared against the local port (laddr.port) of each process connection.
 DASHBOARD_PORTS = [8050, 8051, 8052, 8080, 8081, 5001]
 def is_dashboard_process(proc):
    """Tell whether *proc* looks like one of our Python dashboard processes.

    A process qualifies when its command line mentions 'python' and either
    names one of DASHBOARD_SCRIPTS or has a connection whose local port is
    in DASHBOARD_PORTS. Processes that vanish or deny access count as
    "not a dashboard".
    """
    try:
        argv = proc.cmdline()
        if not argv:
            return False

        # Case-insensitive matching against the whole command line
        joined = ' '.join(argv).lower()

        # Anything that is not a Python process is out
        if 'python' not in joined:
            return False

        # Known dashboard script on the command line?
        if any(script.lower() in joined for script in DASHBOARD_SCRIPTS):
            return True

        # Otherwise fall back to the ports the process is bound to
        try:
            return any(
                sock.laddr.port in DASHBOARD_PORTS
                for sock in proc.connections()
                if hasattr(sock, 'laddr') and sock.laddr
            )
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            pass

        return False
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
 def kill_process(proc, force=False):
    """Stop a process, gracefully by default or forcefully on request.

    Args:
        proc: psutil.Process (or compatible object) to stop.
        force: When True send a hard kill; otherwise request termination.

    Returns:
        True if the request was delivered, False if the process was already
        gone or could not be signalled (e.g. insufficient permissions).
    """
    try:
        pid = proc.pid
        name = proc.name()
        # Use psutil's own terminate()/kill() on every platform instead of
        # raw os.kill(): they map to SIGTERM/SIGKILL on POSIX, work on
        # Windows, and report permission problems as psutil.AccessDenied
        # rather than a bare PermissionError that would abort the cleanup.
        if force:
            proc.kill()
            print(f"  Force killed PID {pid} ({name})")
        else:
            proc.terminate()
            print(f"  Terminated PID {pid} ({name})")
        return True
    except (psutil.NoSuchProcess, psutil.AccessDenied,
            ProcessLookupError, PermissionError) as e:
        print(f"  Could not kill PID {proc.pid}: {e}")
        return False
 def find_processes_on_ports():
    """Collect processes that are listening on any of DASHBOARD_PORTS.

    Returns:
        List of dicts with 'pid', 'name' and 'port', one per matching
        listening connection. Processes that deny access or disappear
        mid-scan are skipped.
    """
    listeners = []
    for candidate in psutil.process_iter(['pid', 'name']):
        try:
            for sock in candidate.connections():
                # Ignore sockets without a local address
                if not (hasattr(sock, 'laddr') and sock.laddr):
                    continue
                # Only listening sockets on a dashboard port are of interest
                if sock.laddr.port not in DASHBOARD_PORTS or sock.status != 'LISTEN':
                    continue
                listeners.append({
                    'pid': candidate.pid,
                    'name': candidate.name(),
                    'port': sock.laddr.port
                })
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            pass
    return listeners
 def main():
    """Find stale dashboard processes, stop them and report port status.

    Steps:
      1. Collect processes recognised by is_dashboard_process().
      2. Add any process listening on a dashboard port.
      3. Request termination, wait two seconds, then force kill survivors.
      4. Print whether each port in DASHBOARD_PORTS is free or still in use.
    """
    print("=" * 60)
    print("Killing Stale Dashboard Processes")
    print("=" * 60)
    # Find dashboard processes
    print("\nScanning for dashboard processes...")
    dashboard_procs = [
        proc for proc in psutil.process_iter(['pid', 'name', 'cmdline'])
        if is_dashboard_process(proc)
    ]
    if not dashboard_procs:
        print("  No dashboard processes found")
    else:
        print(f"  Found {len(dashboard_procs)} dashboard process(es):")
        for proc in dashboard_procs:
            try:
                cmdline = ' '.join(proc.cmdline())
                print(f"    - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                print(f"    - PID {proc.pid}: (access denied)")
    # Find processes on dashboard ports
    print("\nChecking for processes on dashboard ports...")
    port_procs = find_processes_on_ports()
    if not port_procs:
        print("  No processes found on dashboard ports")
    else:
        print(f"  Found {len(port_procs)} process(es) on dashboard ports:")
        for p in port_procs:
            print(f"    - PID {p['pid']} ({p['name']}) on port {p['port']}")
            # Add to kill list if not already there
            try:
                proc = psutil.Process(p['pid'])
                if proc not in dashboard_procs:
                    dashboard_procs.append(proc)
            except psutil.NoSuchProcess:
                pass
    # Kill all found processes
    if dashboard_procs:
        print("\nTerminating processes...")
        # First, try graceful termination; count the requests that went through
        killed_count = sum(
            1 for proc in dashboard_procs if kill_process(proc, force=False)
        )
        # Wait for processes to exit
        print("\nWaiting for processes to exit...")
        time.sleep(2)
        # Force kill any remaining processes
        remaining = []
        for proc in dashboard_procs:
            try:
                if proc.is_running():
                    remaining.append(proc)
            except psutil.NoSuchProcess:
                pass
        if remaining:
            print(f"\nForce killing {len(remaining)} remaining process(es)...")
            for proc in remaining:
                kill_process(proc, force=True)
        print(f"\n✓ Killed {killed_count} dashboard process(es)")
    else:
        print("\n✓ No processes to kill")
    print("\nPort status:")
    # Scan the process table once (instead of once per port) and remember the
    # first listener seen for each port, which is what gets reported.
    first_listener = {}
    for entry in find_processes_on_ports():
        first_listener.setdefault(entry['port'], entry['pid'])
    for port in DASHBOARD_PORTS:
        if port in first_listener:
            print(f"  Port {port}: IN USE by PID {first_listener[port]}")
        else:
            print(f"  Port {port}: FREE")
    print("\n" + "=" * 60)
    print("Process cleanup completed")
    print("=" * 60)
 # Script entry point: run the cleanup and turn any failure into exit code 1.
 if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl+C: report the interruption and exit with a non-zero status
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        # Any other error: print the message and full traceback, then exit 1
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,25 @@ pybit>=5.11.0
 requests>=2.31.0
 asyncpg>=0.29.0
 # COBY System Dependencies
 fastapi>=0.104.0
 uvicorn>=0.24.0
 pydantic>=2.5.0
 python-multipart>=0.0.6
 aiohttp>=3.9.0
 redis>=5.0.0
 scipy>=1.11.0
 structlog>=23.2.0
 click>=8.1.0
 rich>=13.7.0
 # DuckDB for data storage
 duckdb>=0.9.0
 # Flask for ANNOTATE system
 Flask>=3.0.0
 flask-cors>=4.0.0
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
 # Install one of the following sets manually depending on your hardware:
 #