sync active training with UI if running
This commit is contained in:
12
.vscode/tasks.json
vendored
12
.vscode/tasks.json
vendored
@@ -4,17 +4,13 @@
|
|||||||
{
|
{
|
||||||
"label": "Kill Stale Processes",
|
"label": "Kill Stale Processes",
|
||||||
"type": "shell",
|
"type": "shell",
|
||||||
"command": "python",
|
"command": "${command:python.interpreterPath}",
|
||||||
"args": [
|
"args": [
|
||||||
<<<<<<< HEAD
|
|
||||||
"kill_dashboard.py"
|
"kill_dashboard.py"
|
||||||
=======
|
|
||||||
"-ExecutionPolicy",
|
|
||||||
"Bypass",
|
|
||||||
"-File",
|
|
||||||
"scripts/kill_stale_processes.ps1"
|
|
||||||
>>>>>>> d49a473ed6f4aef55bfdd47d6370e53582be6b7b
|
|
||||||
],
|
],
|
||||||
|
"options": {
|
||||||
|
"cwd": "${workspaceFolder}"
|
||||||
|
},
|
||||||
"group": "build",
|
"group": "build",
|
||||||
"presentation": {
|
"presentation": {
|
||||||
"echo": true,
|
"echo": true,
|
||||||
|
|||||||
@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
|
|||||||
'duration_seconds': session.duration_seconds,
|
'duration_seconds': session.duration_seconds,
|
||||||
'error': session.error
|
'error': session.error
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def get_active_training_session(self) -> Optional[Dict]:
    """
    Return a summary of the first session whose status is 'running'.

    The UI calls this after a page reload so it can pick progress
    tracking back up for a training run that is still in flight.

    Returns:
        Dict describing the running session, or None when no session
        in ``self.training_sessions`` is currently running.
    """
    # Locate the first (id, session) pair that is still running, if any.
    running = next(
        (
            (session_id, entry)
            for session_id, entry in self.training_sessions.items()
            if entry.status == 'running'
        ),
        None,
    )
    if running is None:
        return None

    session_id, entry = running
    return {
        'training_id': session_id,
        'status': entry.status,
        'model_name': entry.model_name,
        'test_cases_count': entry.test_cases_count,
        'current_epoch': entry.current_epoch,
        'total_epochs': entry.total_epochs,
        'current_loss': entry.current_loss,
        'start_time': entry.start_time
    }
|
||||||
|
|
||||||
|
def get_all_training_sessions(self) -> List[Dict]:
    """
    Summarise every known training session (debugging / monitoring aid).

    Returns:
        One summary dict per entry in ``self.training_sessions``, in the
        mapping's iteration order; an empty list when there are none.
    """
    return [
        {
            'training_id': session_id,
            'status': entry.status,
            'model_name': entry.model_name,
            'current_epoch': entry.current_epoch,
            'total_epochs': entry.total_epochs,
            'start_time': entry.start_time,
            'duration_seconds': entry.duration_seconds
        }
        for session_id, entry in self.training_sessions.items()
    ]
|
||||||
|
|
||||||
|
|
||||||
# Real-time inference support
|
# Real-time inference support
|
||||||
|
|||||||
@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@self.server.route('/api/active-training', methods=['GET'])
def get_active_training():
    """
    Report the currently running training session, if there is one.

    Lets the UI resume progress tracking after a page reload, and lets
    several clients observe the same run.
    """
    try:
        adapter = self.training_adapter
        if not adapter:
            # No adapter configured: report the feature as unavailable.
            unavailable = {
                'code': 'TRAINING_UNAVAILABLE',
                'message': 'Real training adapter not available'
            }
            return jsonify({
                'success': False,
                'active': False,
                'error': unavailable
            })

        session_info = adapter.get_active_training_session()
        if not session_info:
            # The lookup succeeded, there is just nothing running.
            return jsonify({
                'success': True,
                'active': False
            })

        return jsonify({
            'success': True,
            'active': True,
            'session': session_info
        })

    except Exception as e:
        logger.error(f"Error getting active training: {e}")
        failure = {
            'code': 'ACTIVE_TRAINING_ERROR',
            'message': str(e)
        }
        return jsonify({
            'success': False,
            'active': False,
            'error': failure
        })
|
||||||
|
|
||||||
# Live Training API Endpoints
|
# Live Training API Endpoints
|
||||||
@self.server.route('/api/live-training/start', methods=['POST'])
|
@self.server.route('/api/live-training/start', methods=['POST'])
|
||||||
def start_live_training():
|
def start_live_training():
|
||||||
|
|||||||
@@ -92,6 +92,16 @@
|
|||||||
// Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
|
// Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
|
||||||
loadInitialData();
|
loadInitialData();
|
||||||
|
|
||||||
|
// Load available models for training panel
|
||||||
|
if (typeof loadAvailableModels === 'function') {
|
||||||
|
loadAvailableModels();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for active training session (resume tracking after page reload)
|
||||||
|
if (typeof checkActiveTraining === 'function') {
|
||||||
|
checkActiveTraining();
|
||||||
|
}
|
||||||
|
|
||||||
// Setup keyboard shortcuts
|
// Setup keyboard shortcuts
|
||||||
setupKeyboardShortcuts();
|
setupKeyboardShortcuts();
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -109,6 +109,30 @@
|
|||||||
// Track model states
|
// Track model states
|
||||||
let modelStates = [];
|
let modelStates = [];
|
||||||
let selectedModel = null;
|
let selectedModel = null;
|
||||||
|
let activeTrainingId = null; // Track active training session
|
||||||
|
|
||||||
|
function checkActiveTraining() {
    // On page load, ask the backend whether a training session is still
    // running so progress tracking can resume after a reload.
    fetch('/api/active-training')
        .then(response => response.json())
        .then(data => {
            const resumable = data.success && data.active && data.session;
            if (!resumable) {
                console.log('No active training session');
                return;
            }

            console.log('Active training session found:', data.session);
            // Remember the session, restore the status UI, and resume polling.
            activeTrainingId = data.session.training_id;
            showTrainingStatus();
            pollTrainingProgress(activeTrainingId);
        })
        .catch(error => {
            console.error('Error checking active training:', error);
        });
}
|
||||||
|
|
||||||
function loadAvailableModels() {
|
function loadAvailableModels() {
|
||||||
fetch('/api/available-models')
|
fetch('/api/available-models')
|
||||||
@@ -290,11 +314,16 @@
|
|||||||
startTraining(modelName, annotationIds);
|
startTraining(modelName, annotationIds);
|
||||||
});
|
});
|
||||||
|
|
||||||
function startTraining(modelName, annotationIds) {
|
function showTrainingStatus() {
|
||||||
// Show training status
|
// Show training status UI
|
||||||
document.getElementById('training-status').style.display = 'block';
|
document.getElementById('training-status').style.display = 'block';
|
||||||
document.getElementById('training-results').style.display = 'none';
|
document.getElementById('training-results').style.display = 'none';
|
||||||
document.getElementById('train-model-btn').disabled = true;
|
document.getElementById('train-model-btn').disabled = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
function startTraining(modelName, annotationIds) {
|
||||||
|
// Show training status
|
||||||
|
showTrainingStatus();
|
||||||
|
|
||||||
// Reset progress
|
// Reset progress
|
||||||
document.getElementById('training-progress-bar').style.width = '0%';
|
document.getElementById('training-progress-bar').style.width = '0%';
|
||||||
@@ -313,18 +342,22 @@
|
|||||||
.then(response => response.json())
|
.then(response => response.json())
|
||||||
.then(data => {
|
.then(data => {
|
||||||
if (data.success) {
|
if (data.success) {
|
||||||
|
// Store active training ID for persistence across reloads
|
||||||
|
activeTrainingId = data.training_id;
|
||||||
// Start polling for training progress
|
// Start polling for training progress
|
||||||
pollTrainingProgress(data.training_id);
|
pollTrainingProgress(data.training_id);
|
||||||
} else {
|
} else {
|
||||||
showError('Failed to start training: ' + data.error.message);
|
showError('Failed to start training: ' + data.error.message);
|
||||||
document.getElementById('training-status').style.display = 'none';
|
document.getElementById('training-status').style.display = 'none';
|
||||||
document.getElementById('train-model-btn').disabled = false;
|
document.getElementById('train-model-btn').disabled = false;
|
||||||
|
activeTrainingId = null;
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.catch(error => {
|
.catch(error => {
|
||||||
showError('Network error: ' + error.message);
|
showError('Network error: ' + error.message);
|
||||||
document.getElementById('training-status').style.display = 'none';
|
document.getElementById('training-status').style.display = 'none';
|
||||||
document.getElementById('train-model-btn').disabled = false;
|
document.getElementById('train-model-btn').disabled = false;
|
||||||
|
activeTrainingId = null;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -350,9 +383,11 @@
|
|||||||
// Check if complete
|
// Check if complete
|
||||||
if (progress.status === 'completed') {
|
if (progress.status === 'completed') {
|
||||||
clearInterval(pollInterval);
|
clearInterval(pollInterval);
|
||||||
|
activeTrainingId = null; // Clear active training
|
||||||
showTrainingResults(progress);
|
showTrainingResults(progress);
|
||||||
} else if (progress.status === 'failed') {
|
} else if (progress.status === 'failed') {
|
||||||
clearInterval(pollInterval);
|
clearInterval(pollInterval);
|
||||||
|
activeTrainingId = null; // Clear active training
|
||||||
showError('Training failed: ' + progress.error);
|
showError('Training failed: ' + progress.error);
|
||||||
document.getElementById('training-status').style.display = 'none';
|
document.getElementById('training-status').style.display = 'none';
|
||||||
document.getElementById('train-model-btn').disabled = false;
|
document.getElementById('train-model-btn').disabled = false;
|
||||||
@@ -361,6 +396,7 @@
|
|||||||
})
|
})
|
||||||
.catch(error => {
|
.catch(error => {
|
||||||
clearInterval(pollInterval);
|
clearInterval(pollInterval);
|
||||||
|
// Don't clear activeTrainingId on network error - training might still be running
|
||||||
showError('Failed to get training progress: ' + error.message);
|
showError('Failed to get training progress: ' + error.message);
|
||||||
document.getElementById('training-status').style.display = 'none';
|
document.getElementById('training-status').style.display = 'none';
|
||||||
document.getElementById('train-model-btn').disabled = false;
|
document.getElementById('train-model-btn').disabled = false;
|
||||||
|
|||||||
225
kill_dashboard.py
Normal file
225
kill_dashboard.py
Normal file
@@ -0,0 +1,225 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Cross-platform script to kill stale Python dashboard processes.
|
||||||
|
Works on Linux, macOS, and Windows.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import signal
|
||||||
|
import psutil
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Dashboard script names to look for
|
||||||
|
DASHBOARD_SCRIPTS = [
|
||||||
|
'main_dashboard.py',
|
||||||
|
'run_clean_dashboard.py',
|
||||||
|
'web/clean_dashboard.py',
|
||||||
|
'web/cob_realtime_dashboard.py',
|
||||||
|
'ANNOTATE/web/app.py',
|
||||||
|
'COBY/main.py',
|
||||||
|
'run_integrated_rl_cob_dashboard.py',
|
||||||
|
'run_realtime_rl_cob_trader.py',
|
||||||
|
]
|
||||||
|
|
||||||
|
# Ports used by dashboards
|
||||||
|
DASHBOARD_PORTS = [8050, 8051, 8052, 8080, 8081, 5001]
|
||||||
|
|
||||||
|
|
||||||
|
def is_dashboard_process(proc):
    """Return True if *proc* looks like one of the dashboard processes.

    A process qualifies when its command line mentions ``python`` and it
    either runs one of ``DASHBOARD_SCRIPTS`` or has a socket whose local
    port is in ``DASHBOARD_PORTS``.

    Args:
        proc: a ``psutil.Process``-like object exposing ``cmdline()`` and
            ``connections()``.

    Returns:
        bool: False for non-matching processes and for processes that
        vanished, are zombies, or cannot be inspected.
    """
    try:
        cmdline = proc.cmdline()
        if not cmdline:
            return False

        # Lower-case for case-insensitive matching, and normalise Windows
        # backslash separators so entries written with forward slashes
        # (e.g. 'web/clean_dashboard.py') match on every platform.
        cmd_str = ' '.join(cmdline).lower().replace('\\', '/')

        # Only Python processes are candidates.
        if 'python' not in cmd_str:
            return False

        # Running one of the known dashboard scripts?
        if any(script.lower() in cmd_str for script in DASHBOARD_SCRIPTS):
            return True

        # Otherwise fall back to checking for a socket on a dashboard port.
        try:
            connections = proc.connections()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Cannot inspect sockets; treat as not a dashboard process.
            return False

        return any(
            bool(getattr(conn, 'laddr', None))
            and conn.laddr.port in DASHBOARD_PORTS
            for conn in connections
        )

    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def kill_process(proc, force=False):
    """Terminate *proc*, gracefully by default or forcefully on request.

    Uses psutil's own ``terminate()`` / ``kill()`` on every platform. These
    send SIGTERM / SIGKILL on POSIX and call TerminateProcess on Windows,
    guard against PID reuse, and report permission problems as
    ``psutil.AccessDenied`` (a raw ``os.kill`` would raise an uncaught
    ``PermissionError`` instead).

    Args:
        proc: a ``psutil.Process``-like object.
        force: when True, kill immediately instead of asking the process
            to exit.

    Returns:
        bool: True if the signal was delivered, False if the process was
        already gone or could not be signalled.
    """
    try:
        pid = proc.pid
        name = proc.name()

        if force:
            proc.kill()
            print(f" Force killed PID {pid} ({name})")
        else:
            proc.terminate()
            print(f" Terminated PID {pid} ({name})")

        return True

    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError) as e:
        print(f" Could not kill PID {proc.pid}: {e}")
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def find_processes_on_ports():
    """Find processes listening on any of the dashboard ports.

    Returns:
        list[dict]: one ``{'pid', 'name', 'port'}`` entry per distinct
        (process, port) pair in LISTEN state on a port from
        ``DASHBOARD_PORTS``. A process bound to the same port on several
        addresses (e.g. IPv4 and IPv6) is reported once.
    """
    port_processes = []
    seen = set()  # (pid, port) pairs already reported

    for proc in psutil.process_iter(['pid', 'name']):
        try:
            for conn in proc.connections():
                laddr = getattr(conn, 'laddr', None)
                if not laddr or conn.status != 'LISTEN':
                    continue
                if laddr.port not in DASHBOARD_PORTS:
                    continue

                key = (proc.pid, laddr.port)
                if key in seen:
                    continue
                seen.add(key)

                port_processes.append({
                    'pid': proc.pid,
                    'name': proc.name(),
                    'port': laddr.port
                })
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Process vanished or is not inspectable; skip it (best effort).
            continue

    return port_processes
|
||||||
|
|
||||||
|
|
||||||
|
def _listening_dashboard_ports():
    """Map each dashboard port in LISTEN state to the first PID found on it."""
    listeners = {}
    for proc in psutil.process_iter():
        try:
            for conn in proc.connections():
                laddr = getattr(conn, 'laddr', None)
                if laddr and conn.status == 'LISTEN' and laddr.port in DASHBOARD_PORTS:
                    # Keep the first owner seen, one line per port in the report.
                    listeners.setdefault(laddr.port, proc.pid)
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            continue
    return listeners


def main():
    """Find and kill stale dashboard processes, then report port status.

    Steps:
      1. Collect Python processes that look like dashboards.
      2. Add any process listening on a dashboard port.
      3. Ask each one to terminate, wait two seconds, then force-kill
         whatever is still running.
      4. Print, for every dashboard port, whether it is free or in use.
    """
    print("=" * 60)
    print("Killing Stale Dashboard Processes")
    print("=" * 60)

    # Step 1: dashboard processes identified by command line or port.
    print("\nScanning for dashboard processes...")
    dashboard_procs = [
        proc
        for proc in psutil.process_iter(['pid', 'name', 'cmdline'])
        if is_dashboard_process(proc)
    ]

    if not dashboard_procs:
        print(" No dashboard processes found")
    else:
        print(f" Found {len(dashboard_procs)} dashboard process(es):")
        for proc in dashboard_procs:
            try:
                cmdline = ' '.join(proc.cmdline())
                print(f" - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                print(f" - PID {proc.pid}: (access denied)")

    # Step 2: anything listening on a dashboard port, Python or not.
    print("\nChecking for processes on dashboard ports...")
    port_procs = find_processes_on_ports()

    if not port_procs:
        print(" No processes found on dashboard ports")
    else:
        print(f" Found {len(port_procs)} process(es) on dashboard ports:")
        for p in port_procs:
            print(f" - PID {p['pid']} ({p['name']}) on port {p['port']}")
            # Add to the kill list unless it is already there.
            try:
                proc = psutil.Process(p['pid'])
                if proc not in dashboard_procs:
                    dashboard_procs.append(proc)
            except psutil.NoSuchProcess:
                pass

    # Step 3: graceful termination first, force kill for stragglers.
    if dashboard_procs:
        print("\nTerminating processes...")
        killed_count = 0
        for proc in dashboard_procs:
            if kill_process(proc, force=False):
                killed_count += 1

        print("\nWaiting for processes to exit...")
        time.sleep(2)

        remaining = []
        for proc in dashboard_procs:
            try:
                if proc.is_running():
                    remaining.append(proc)
            except psutil.NoSuchProcess:
                pass

        if remaining:
            print(f"\nForce killing {len(remaining)} remaining process(es)...")
            for proc in remaining:
                kill_process(proc, force=True)

        print(f"\n✓ Killed {killed_count} dashboard process(es)")
    else:
        print("\n✓ No processes to kill")

    # Step 4: report port status from a single scan of all processes
    # (instead of rescanning every process once per port).
    print("\nPort status:")
    listeners = _listening_dashboard_ports()
    for port in DASHBOARD_PORTS:
        if port in listeners:
            print(f" Port {port}: IN USE by PID {listeners[port]}")
        else:
            print(f" Port {port}: FREE")

    print("\n" + "=" * 60)
    print("Process cleanup completed")
    print("=" * 60)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Script entry point: run the cleanup and turn any failure into exit code 1.
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C: stop quietly without a traceback.
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as exc:
        # Top-level boundary: show the error and full traceback, then fail.
        print(f"\nError: {exc}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
|
||||||
|
|
||||||
@@ -19,6 +19,25 @@ pybit>=5.11.0
|
|||||||
requests>=2.31.0
|
requests>=2.31.0
|
||||||
asyncpg>=0.29.0
|
asyncpg>=0.29.0
|
||||||
|
|
||||||
|
# COBY System Dependencies
|
||||||
|
fastapi>=0.104.0
|
||||||
|
uvicorn>=0.24.0
|
||||||
|
pydantic>=2.5.0
|
||||||
|
python-multipart>=0.0.6
|
||||||
|
aiohttp>=3.9.0
|
||||||
|
redis>=5.0.0
|
||||||
|
scipy>=1.11.0
|
||||||
|
structlog>=23.2.0
|
||||||
|
click>=8.1.0
|
||||||
|
rich>=13.7.0
|
||||||
|
|
||||||
|
# DuckDB for data storage
|
||||||
|
duckdb>=0.9.0
|
||||||
|
|
||||||
|
# Flask for ANNOTATE system
|
||||||
|
Flask>=3.0.0
|
||||||
|
flask-cors>=4.0.0
|
||||||
|
|
||||||
# NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
|
# NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
|
||||||
# Install one of the following sets manually depending on your hardware:
|
# Install one of the following sets manually depending on your hardware:
|
||||||
#
|
#
|
||||||
|
|||||||
Reference in New Issue
Block a user