sync active training with UI if running

2025-11-12 22:37:20 +02:00
parent fcbc475686
commit 1af3124be7
7 changed files with 384 additions and 10 deletions
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -4,17 +4,13 @@
        {
            "label": "Kill Stale Processes",
            "type": "shell",
-            "command": "python",
+            "command": "${command:python.interpreterPath}",
            "args": [
-<<<<<<< HEAD
                "kill_dashboard.py"
-=======
-                "-ExecutionPolicy",
-                "Bypass",
-                "-File",
-                "scripts/kill_stale_processes.ps1"
->>>>>>> d49a473ed6f4aef55bfdd47d6370e53582be6b7b
            ],
+            "options": {
+                "cwd": "${workspaceFolder}"
+            },
            "group": "build",
            "presentation": {
                "echo": true,
--- a/ANNOTATE/core/real_training_adapter.py
+++ b/ANNOTATE/core/real_training_adapter.py
@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
            'duration_seconds': session.duration_seconds,
            'error': session.error
        }
+    
+    def get_active_training_session(self) -> Optional[Dict]:
+        """
+        Get currently active training session (if any)
+        
+        This allows the UI to resume tracking training progress after page reload
+        
+        Returns:
+            Dict with training info if active session exists, None otherwise
+        """
+        # Find any session with 'running' status
+        for training_id, session in self.training_sessions.items():
+            if session.status == 'running':
+                return {
+                    'training_id': training_id,
+                    'status': session.status,
+                    'model_name': session.model_name,
+                    'test_cases_count': session.test_cases_count,
+                    'current_epoch': session.current_epoch,
+                    'total_epochs': session.total_epochs,
+                    'current_loss': session.current_loss,
+                    'start_time': session.start_time
+                }
+        
+        return None
+    
+    def get_all_training_sessions(self) -> List[Dict]:
+        """
+        Get all training sessions (for debugging/monitoring)
+        
+        Returns:
+            List of all training session summaries
+        """
+        sessions = []
+        for training_id, session in self.training_sessions.items():
+            sessions.append({
+                'training_id': training_id,
+                'status': session.status,
+                'model_name': session.model_name,
+                'current_epoch': session.current_epoch,
+                'total_epochs': session.total_epochs,
+                'start_time': session.start_time,
+                'duration_seconds': session.duration_seconds
+            })
+        
+        return sessions

    
    # Real-time inference support
--- a/ANNOTATE/web/app.py
+++ b/ANNOTATE/web/app.py
@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
                    }
                })
        
+        @self.server.route('/api/active-training', methods=['GET'])
+        def get_active_training():
+            """
+            Get currently active training session (if any)
+            Allows UI to resume tracking after page reload or across multiple clients
+            """
+            try:
+                if not self.training_adapter:
+                    return jsonify({
+                        'success': False,
+                        'active': False,
+                        'error': {
+                            'code': 'TRAINING_UNAVAILABLE',
+                            'message': 'Real training adapter not available'
+                        }
+                    })
+                
+                active_session = self.training_adapter.get_active_training_session()
+                
+                if active_session:
+                    return jsonify({
+                        'success': True,
+                        'active': True,
+                        'session': active_session
+                    })
+                else:
+                    return jsonify({
+                        'success': True,
+                        'active': False
+                    })
+                
+            except Exception as e:
+                logger.error(f"Error getting active training: {e}")
+                return jsonify({
+                    'success': False,
+                    'active': False,
+                    'error': {
+                        'code': 'ACTIVE_TRAINING_ERROR',
+                        'message': str(e)
+                    }
+                })
+        
        # Live Training API Endpoints
        @self.server.route('/api/live-training/start', methods=['POST'])
        def start_live_training():
--- a/ANNOTATE/web/templates/annotation_dashboard.html
+++ b/ANNOTATE/web/templates/annotation_dashboard.html
@@ -92,6 +92,16 @@
        // Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
        loadInitialData();

+        // Load available models for training panel
+        if (typeof loadAvailableModels === 'function') {
+            loadAvailableModels();
+        }
+
+        // Check for active training session (resume tracking after page reload)
+        if (typeof checkActiveTraining === 'function') {
+            checkActiveTraining();
+        }
+
        // Setup keyboard shortcuts
        setupKeyboardShortcuts();
    });
--- a/ANNOTATE/web/templates/components/training_panel.html
+++ b/ANNOTATE/web/templates/components/training_panel.html
@@ -109,6 +109,30 @@
    // Track model states
    let modelStates = [];
    let selectedModel = null;
+    let activeTrainingId = null;  // Track active training session
+
+    function checkActiveTraining() {
+        /**
+         * Check if there's an active training session on page load
+         * This allows resuming progress tracking after page reload
+         */
+        fetch('/api/active-training')
+            .then(response => response.json())
+            .then(data => {
+                if (data.success && data.active && data.session) {
+                    console.log('Active training session found:', data.session);
+                    // Resume tracking
+                    activeTrainingId = data.session.training_id;
+                    showTrainingStatus();
+                    pollTrainingProgress(activeTrainingId);
+                } else {
+                    console.log('No active training session');
+                }
+            })
+            .catch(error => {
+                console.error('Error checking active training:', error);
+            });
+    }

    function loadAvailableModels() {
        fetch('/api/available-models')
@@ -290,11 +314,16 @@
        startTraining(modelName, annotationIds);
    });

-    function startTraining(modelName, annotationIds) {
-        // Show training status
+    function showTrainingStatus() {
+        // Show training status UI
        document.getElementById('training-status').style.display = 'block';
        document.getElementById('training-results').style.display = 'none';
        document.getElementById('train-model-btn').disabled = true;
+    }
+
+    function startTraining(modelName, annotationIds) {
+        // Show training status
+        showTrainingStatus();

        // Reset progress
        document.getElementById('training-progress-bar').style.width = '0%';
@@ -313,18 +342,22 @@
            .then(response => response.json())
            .then(data => {
                if (data.success) {
+                    // Store active training ID for persistence across reloads
+                    activeTrainingId = data.training_id;
                    // Start polling for training progress
                    pollTrainingProgress(data.training_id);
                } else {
                    showError('Failed to start training: ' + data.error.message);
                    document.getElementById('training-status').style.display = 'none';
                    document.getElementById('train-model-btn').disabled = false;
+                    activeTrainingId = null;
                }
            })
            .catch(error => {
                showError('Network error: ' + error.message);
                document.getElementById('training-status').style.display = 'none';
                document.getElementById('train-model-btn').disabled = false;
+                activeTrainingId = null;
            });
    }

@@ -350,9 +383,11 @@
                        // Check if complete
                        if (progress.status === 'completed') {
                            clearInterval(pollInterval);
+                            activeTrainingId = null;  // Clear active training
                            showTrainingResults(progress);
                        } else if (progress.status === 'failed') {
                            clearInterval(pollInterval);
+                            activeTrainingId = null;  // Clear active training
                            showError('Training failed: ' + progress.error);
                            document.getElementById('training-status').style.display = 'none';
                            document.getElementById('train-model-btn').disabled = false;
@@ -361,6 +396,7 @@
                })
                .catch(error => {
                    clearInterval(pollInterval);
+                    // Don't clear activeTrainingId on network error - training might still be running
                    showError('Failed to get training progress: ' + error.message);
                    document.getElementById('training-status').style.display = 'none';
                    document.getElementById('train-model-btn').disabled = false;
--- a/kill_dashboard.py
+++ b/kill_dashboard.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""
+Cross-platform script to kill stale Python dashboard processes.
+Works on Linux, macOS, and Windows.
+"""
+
+import os
+import sys
+import signal
+import psutil
+import time
+from pathlib import Path
+
+# Dashboard script names to look for
+DASHBOARD_SCRIPTS = [
+    'main_dashboard.py',
+    'run_clean_dashboard.py',
+    'web/clean_dashboard.py',
+    'web/cob_realtime_dashboard.py',
+    'ANNOTATE/web/app.py',
+    'COBY/main.py',
+    'run_integrated_rl_cob_dashboard.py',
+    'run_realtime_rl_cob_trader.py',
+]
+
+# Ports used by dashboards
+DASHBOARD_PORTS = [8050, 8051, 8052, 8080, 8081, 5001]
+
+
+def is_dashboard_process(proc):
+    """Check whether a process is one of our dashboard processes.
+
+    Returns True if proc is a Python process that runs one of
+    DASHBOARD_SCRIPTS or holds a socket bound to one of DASHBOARD_PORTS.
+    Processes that vanish or cannot be inspected are reported as False.
+    """
+    try:
+        cmdline = proc.cmdline()
+        if not cmdline:
+            return False
+
+        # Lower-case and normalise Windows '\' separators so the
+        # forward-slash entries in DASHBOARD_SCRIPTS match on every platform
+        cmd_str = ' '.join(cmdline).lower().replace('\\', '/')
+
+        # Check if it's a Python process
+        if 'python' not in cmd_str:
+            return False
+
+        # Check if it's running one of our dashboard scripts
+        if any(script.lower() in cmd_str for script in DASHBOARD_SCRIPTS):
+            return True
+
+        # Check if it's using one of our dashboard ports
+        try:
+            for conn in proc.connections():
+                if getattr(conn, 'laddr', None) and conn.laddr.port in DASHBOARD_PORTS:
+                    return True
+        except (psutil.AccessDenied, psutil.NoSuchProcess):
+            pass  # best effort: the command-line check already ran
+
+        return False
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
+        return False
+
+
+def kill_process(proc, force=False):
+    """Send a termination request to a process.
+
+    Args:
+        proc: psutil.Process to signal.
+        force: if True send a hard kill (SIGKILL on POSIX), otherwise a
+            graceful terminate (SIGTERM on POSIX).
+
+    Returns:
+        True if the request was sent, False if it could not be delivered.
+    """
+    try:
+        pid = proc.pid
+        name = proc.name()
+        # Use psutil's own methods on every platform: unlike os.kill() they
+        # guard against PID reuse and report permission problems as
+        # psutil.AccessDenied instead of an uncaught PermissionError.
+        if force:
+            proc.kill()
+        else:
+            proc.terminate()
+        print(f"  {'Force killed' if force else 'Terminated'} PID {pid} ({name})")
+        return True
+    except (psutil.NoSuchProcess, psutil.AccessDenied, OSError) as e:
+        print(f"  Could not kill PID {proc.pid}: {e}")
+        return False
+
+
+def find_processes_on_ports():
+    """Find processes listening on dashboard ports"""
+    port_processes = []
+    
+    for proc in psutil.process_iter(['pid', 'name']):
+        try:
+            connections = proc.connections()
+            for conn in connections:
+                if hasattr(conn, 'laddr') and conn.laddr:
+                    if conn.laddr.port in DASHBOARD_PORTS and conn.status == 'LISTEN':
+                        port_processes.append({
+                            'pid': proc.pid,
+                            'name': proc.name(),
+                            'port': conn.laddr.port
+                        })
+        except (psutil.AccessDenied, psutil.NoSuchProcess):
+            pass
+    
+    return port_processes
+
+
+def main():
+    print("=" * 60)
+    print("Killing Stale Dashboard Processes")
+    print("=" * 60)
+    
+    # Find dashboard processes
+    dashboard_procs = []
+    
+    print("\nScanning for dashboard processes...")
+    for proc in psutil.process_iter(['pid', 'name', 'cmdline']):
+        if is_dashboard_process(proc):
+            dashboard_procs.append(proc)
+    
+    if not dashboard_procs:
+        print("  No dashboard processes found")
+    else:
+        print(f"  Found {len(dashboard_procs)} dashboard process(es):")
+        for proc in dashboard_procs:
+            try:
+                cmdline = ' '.join(proc.cmdline())
+                print(f"    - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
+            except (psutil.NoSuchProcess, psutil.AccessDenied):
+                print(f"    - PID {proc.pid}: (access denied)")
+    
+    # Find processes on dashboard ports
+    print("\nChecking for processes on dashboard ports...")
+    port_procs = find_processes_on_ports()
+    
+    if not port_procs:
+        print("  No processes found on dashboard ports")
+    else:
+        print(f"  Found {len(port_procs)} process(es) on dashboard ports:")
+        for p in port_procs:
+            print(f"    - PID {p['pid']} ({p['name']}) on port {p['port']}")
+            # Add to kill list if not already there
+            try:
+                proc = psutil.Process(p['pid'])
+                if proc not in dashboard_procs:
+                    dashboard_procs.append(proc)
+            except psutil.NoSuchProcess:
+                pass
+    
+    # Kill all found processes
+    if dashboard_procs:
+        print("\nTerminating processes...")
+        killed_count = 0
+        
+        # First, try graceful termination
+        for proc in dashboard_procs:
+            if kill_process(proc, force=False):
+                killed_count += 1
+        
+        # Wait for processes to exit
+        print("\nWaiting for processes to exit...")
+        time.sleep(2)
+        
+        # Force kill any remaining processes
+        remaining = []
+        for proc in dashboard_procs:
+            try:
+                if proc.is_running():
+                    remaining.append(proc)
+            except psutil.NoSuchProcess:
+                pass
+        
+        if remaining:
+            print(f"\nForce killing {len(remaining)} remaining process(es)...")
+            for proc in remaining:
+                kill_process(proc, force=True)
+        
+        print(f"\n✓ Killed {killed_count} dashboard process(es)")
+    else:
+        print("\n✓ No processes to kill")
+    
+    print("\nPort status:")
+    for port in DASHBOARD_PORTS:
+        port_free = True
+        for proc in psutil.process_iter():
+            try:
+                for conn in proc.connections():
+                    if hasattr(conn, 'laddr') and conn.laddr:
+                        if conn.laddr.port == port and conn.status == 'LISTEN':
+                            print(f"  Port {port}: IN USE by PID {proc.pid}")
+                            port_free = False
+                            break
+                if not port_free:
+                    break
+            except (psutil.AccessDenied, psutil.NoSuchProcess):
+                pass
+        
+        if port_free:
+            print(f"  Port {port}: FREE")
+    
+    print("\n" + "=" * 60)
+    print("Process cleanup completed")
+    print("=" * 60)
+
+
+if __name__ == '__main__':
+    try:
+        main()
+    except KeyboardInterrupt:
+        print("\n\nInterrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\nError: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,6 +19,25 @@ pybit>=5.11.0
 requests>=2.31.0
 asyncpg>=0.29.0

+# COBY System Dependencies
+fastapi>=0.104.0
+uvicorn>=0.24.0
+pydantic>=2.5.0
+python-multipart>=0.0.6
+aiohttp>=3.9.0
+redis>=5.0.0
+scipy>=1.11.0
+structlog>=23.2.0
+click>=8.1.0
+rich>=13.7.0
+
+# DuckDB for data storage
+duckdb>=0.9.0
+
+# Flask for ANNOTATE system
+Flask>=3.0.0
+flask-cors>=4.0.0
+
 # NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
 # Install one of the following sets manually depending on your hardware:
 #