sync active training with UI if running
This commit is contained in:
12
.vscode/tasks.json
vendored
12
.vscode/tasks.json
vendored
@@ -4,17 +4,13 @@
|
||||
{
|
||||
"label": "Kill Stale Processes",
|
||||
"type": "shell",
|
||||
"command": "python",
|
||||
"command": "${command:python.interpreterPath}",
|
||||
"args": [
|
||||
<<<<<<< HEAD
|
||||
"kill_dashboard.py"
|
||||
=======
|
||||
"-ExecutionPolicy",
|
||||
"Bypass",
|
||||
"-File",
|
||||
"scripts/kill_stale_processes.ps1"
|
||||
>>>>>>> d49a473ed6f4aef55bfdd47d6370e53582be6b7b
|
||||
],
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
"group": "build",
|
||||
"presentation": {
|
||||
"echo": true,
|
||||
|
||||
@@ -2054,6 +2054,52 @@ class RealTrainingAdapter:
|
||||
'duration_seconds': session.duration_seconds,
|
||||
'error': session.error
|
||||
}
|
||||
|
||||
def get_active_training_session(self) -> Optional[Dict]:
    """
    Look up the training session that is currently running, if there is one.

    The UI calls this to pick progress tracking back up after a page reload.

    Returns:
        Summary dict for the first session in ``self.training_sessions``
        whose status is 'running', or None when no session is running.
    """
    # First (training_id, session) pair in insertion order that is running
    running = next(
        (
            (session_id, sess)
            for session_id, sess in self.training_sessions.items()
            if sess.status == 'running'
        ),
        None,
    )
    if running is None:
        return None

    session_id, sess = running
    return {
        'training_id': session_id,
        'status': sess.status,
        'model_name': sess.model_name,
        'test_cases_count': sess.test_cases_count,
        'current_epoch': sess.current_epoch,
        'total_epochs': sess.total_epochs,
        'current_loss': sess.current_loss,
        'start_time': sess.start_time
    }
|
||||
|
||||
def get_all_training_sessions(self) -> List[Dict]:
    """
    Summarise every known training session (for debugging/monitoring).

    Returns:
        One summary dict per entry in ``self.training_sessions``, in the
        dict's iteration order, regardless of session status.
    """
    return [
        {
            'training_id': session_id,
            'status': sess.status,
            'model_name': sess.model_name,
            'current_epoch': sess.current_epoch,
            'total_epochs': sess.total_epochs,
            'start_time': sess.start_time,
            'duration_seconds': sess.duration_seconds
        }
        for session_id, sess in self.training_sessions.items()
    ]
|
||||
|
||||
|
||||
# Real-time inference support
|
||||
|
||||
@@ -1241,6 +1241,48 @@ class AnnotationDashboard:
|
||||
}
|
||||
})
|
||||
|
||||
@self.server.route('/api/active-training', methods=['GET'])
def get_active_training():
    """
    Report the currently active training session, if any.

    Lets the UI resume tracking after a page reload or from another client.
    Every response carries 'success' and 'active'; 'session' is included
    only when a session is running, 'error' only on failure.
    """
    try:
        # No adapter configured: training cannot be queried at all
        if not self.training_adapter:
            unavailable = {
                'success': False,
                'active': False,
                'error': {
                    'code': 'TRAINING_UNAVAILABLE',
                    'message': 'Real training adapter not available'
                }
            }
            return jsonify(unavailable)

        active_session = self.training_adapter.get_active_training_session()

        if not active_session:
            return jsonify({
                'success': True,
                'active': False
            })

        return jsonify({
            'success': True,
            'active': True,
            'session': active_session
        })

    except Exception as e:
        logger.error(f"Error getting active training: {e}")
        failure = {
            'success': False,
            'active': False,
            'error': {
                'code': 'ACTIVE_TRAINING_ERROR',
                'message': str(e)
            }
        }
        return jsonify(failure)
|
||||
|
||||
# Live Training API Endpoints
|
||||
@self.server.route('/api/live-training/start', methods=['POST'])
|
||||
def start_live_training():
|
||||
|
||||
@@ -92,6 +92,16 @@
|
||||
// Load initial data (may call renderAnnotationsList which needs deleteAnnotation)
|
||||
loadInitialData();
|
||||
|
||||
// Load available models for training panel
|
||||
if (typeof loadAvailableModels === 'function') {
|
||||
loadAvailableModels();
|
||||
}
|
||||
|
||||
// Check for active training session (resume tracking after page reload)
|
||||
if (typeof checkActiveTraining === 'function') {
|
||||
checkActiveTraining();
|
||||
}
|
||||
|
||||
// Setup keyboard shortcuts
|
||||
setupKeyboardShortcuts();
|
||||
});
|
||||
|
||||
@@ -109,6 +109,30 @@
|
||||
// Track model states
|
||||
let modelStates = [];
|
||||
let selectedModel = null;
|
||||
let activeTrainingId = null; // Track active training session
|
||||
|
||||
/**
 * Ask the server on page load whether a training session is active.
 * If one is, remember its id, show the status panel and resume polling,
 * so progress tracking survives a page reload.
 */
function checkActiveTraining() {
    fetch('/api/active-training')
        .then(response => response.json())
        .then(data => {
            const resumable = data.success && data.active && data.session;
            if (!resumable) {
                console.log('No active training session');
                return;
            }

            console.log('Active training session found:', data.session);
            // Resume tracking
            activeTrainingId = data.session.training_id;
            showTrainingStatus();
            pollTrainingProgress(activeTrainingId);
        })
        .catch(error => {
            console.error('Error checking active training:', error);
        });
}
|
||||
|
||||
function loadAvailableModels() {
|
||||
fetch('/api/available-models')
|
||||
@@ -290,11 +314,16 @@
|
||||
startTraining(modelName, annotationIds);
|
||||
});
|
||||
|
||||
function startTraining(modelName, annotationIds) {
|
||||
// Show training status
|
||||
function showTrainingStatus() {
    // Reveal the status panel, hide any previous results, and disable the
    // train button.
    const statusPanel = document.getElementById('training-status');
    const resultsPanel = document.getElementById('training-results');
    const trainButton = document.getElementById('train-model-btn');

    statusPanel.style.display = 'block';
    resultsPanel.style.display = 'none';
    trainButton.disabled = true;
}
|
||||
|
||||
function startTraining(modelName, annotationIds) {
|
||||
// Show training status
|
||||
showTrainingStatus();
|
||||
|
||||
// Reset progress
|
||||
document.getElementById('training-progress-bar').style.width = '0%';
|
||||
@@ -313,18 +342,22 @@
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
if (data.success) {
|
||||
// Store active training ID for persistence across reloads
|
||||
activeTrainingId = data.training_id;
|
||||
// Start polling for training progress
|
||||
pollTrainingProgress(data.training_id);
|
||||
} else {
|
||||
showError('Failed to start training: ' + data.error.message);
|
||||
document.getElementById('training-status').style.display = 'none';
|
||||
document.getElementById('train-model-btn').disabled = false;
|
||||
activeTrainingId = null;
|
||||
}
|
||||
})
|
||||
.catch(error => {
|
||||
showError('Network error: ' + error.message);
|
||||
document.getElementById('training-status').style.display = 'none';
|
||||
document.getElementById('train-model-btn').disabled = false;
|
||||
activeTrainingId = null;
|
||||
});
|
||||
}
|
||||
|
||||
@@ -350,9 +383,11 @@
|
||||
// Check if complete
|
||||
if (progress.status === 'completed') {
|
||||
clearInterval(pollInterval);
|
||||
activeTrainingId = null; // Clear active training
|
||||
showTrainingResults(progress);
|
||||
} else if (progress.status === 'failed') {
|
||||
clearInterval(pollInterval);
|
||||
activeTrainingId = null; // Clear active training
|
||||
showError('Training failed: ' + progress.error);
|
||||
document.getElementById('training-status').style.display = 'none';
|
||||
document.getElementById('train-model-btn').disabled = false;
|
||||
@@ -361,6 +396,7 @@
|
||||
})
|
||||
.catch(error => {
|
||||
clearInterval(pollInterval);
|
||||
// Don't clear activeTrainingId on network error - training might still be running
|
||||
showError('Failed to get training progress: ' + error.message);
|
||||
document.getElementById('training-status').style.display = 'none';
|
||||
document.getElementById('train-model-btn').disabled = false;
|
||||
|
||||
225
kill_dashboard.py
Normal file
225
kill_dashboard.py
Normal file
@@ -0,0 +1,225 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cross-platform script to kill stale Python dashboard processes.
|
||||
Works on Linux, macOS, and Windows.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import psutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Script names/paths that mark a process as a dashboard; each entry is
# matched as a substring of the process command line.
DASHBOARD_SCRIPTS = [
    'main_dashboard.py',
    'run_clean_dashboard.py',
    'web/clean_dashboard.py',
    'web/cob_realtime_dashboard.py',
    'ANNOTATE/web/app.py',
    'COBY/main.py',
    'run_integrated_rl_cob_dashboard.py',
    'run_realtime_rl_cob_trader.py',
]

# Local ports used by the dashboards
DASHBOARD_PORTS = [
    8050, 8051, 8052,
    8080, 8081,
    5001,
]
|
||||
|
||||
|
||||
def is_dashboard_process(proc):
    """Return True if ``proc`` looks like one of our dashboard processes.

    A process qualifies when its command line mentions "python" and either
    contains one of ``DASHBOARD_SCRIPTS`` or the process has a connection
    whose local port is in ``DASHBOARD_PORTS``.

    Args:
        proc: A ``psutil.Process``-like object.

    Returns:
        bool: True for a dashboard process. False otherwise, including when
        the process has vanished or cannot be inspected.
    """
    try:
        cmdline = proc.cmdline()
        if not cmdline:
            return False

        # Lower-case for case-insensitive matching, and normalise Windows
        # back-slashes so entries such as 'web/clean_dashboard.py' also
        # match a command line containing 'web\clean_dashboard.py'.
        cmd_str = ' '.join(cmdline).lower().replace('\\', '/')

        if 'python' not in cmd_str:
            return False

        if any(script.lower() in cmd_str for script in DASHBOARD_SCRIPTS):
            return True

        # psutil 6.0 deprecated Process.connections() in favour of
        # Process.net_connections(); use the new name when it exists.
        get_connections = getattr(proc, 'net_connections', None) or proc.connections
        try:
            connections = get_connections()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Cannot inspect sockets: treat as "not on a dashboard port"
            return False

        for conn in connections:
            laddr = getattr(conn, 'laddr', None)
            if laddr and laddr.port in DASHBOARD_PORTS:
                return True

        return False

    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False
|
||||
|
||||
|
||||
def kill_process(proc, force=False):
    """Terminate ``proc`` gracefully, or kill it outright when ``force`` is set.

    Uses psutil's own ``terminate()`` / ``kill()`` on every platform. On
    UNIX these send SIGTERM / SIGKILL, on Windows both end the process.
    Unlike a raw ``os.kill`` they report failures as psutil exceptions
    (handled below) and guard against a recycled PID.

    Args:
        proc: A ``psutil.Process``-like object.
        force: When True, kill immediately instead of asking the process
            to terminate.

    Returns:
        bool: True if the signal was delivered, False if the process was
        already gone or could not be signalled.
    """
    try:
        pid = proc.pid
        name = proc.name()

        if force:
            proc.kill()
            print(f" Force killed PID {pid} ({name})")
        else:
            proc.terminate()
            print(f" Terminated PID {pid} ({name})")

        return True

    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError) as e:
        print(f" Could not kill PID {proc.pid}: {e}")
        return False
|
||||
|
||||
|
||||
def find_processes_on_ports():
    """Find processes listening on any of ``DASHBOARD_PORTS``.

    Returns:
        list[dict]: One ``{'pid', 'name', 'port'}`` entry per distinct
        (process, port) pair. A process bound to the same port on several
        sockets (e.g. IPv4 and IPv6) is reported once. Processes that
        vanish or cannot be inspected are skipped.
    """
    port_processes = []
    seen = set()  # (pid, port) pairs already reported

    for proc in psutil.process_iter(['pid', 'name']):
        try:
            # psutil 6.0 deprecated Process.connections() in favour of
            # Process.net_connections(); use the new name when it exists.
            get_connections = getattr(proc, 'net_connections', None) or proc.connections
            for conn in get_connections():
                laddr = getattr(conn, 'laddr', None)
                if not laddr:
                    continue
                if laddr.port not in DASHBOARD_PORTS or conn.status != psutil.CONN_LISTEN:
                    continue

                key = (proc.pid, laddr.port)
                if key in seen:
                    continue
                seen.add(key)
                port_processes.append({
                    'pid': proc.pid,
                    'name': proc.name(),
                    'port': laddr.port
                })
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Best effort: skip processes we cannot inspect
            continue

    return port_processes
|
||||
|
||||
|
||||
def main():
    """Find and stop stale dashboard processes, then report port status.

    Steps:
      1. Collect processes matched by ``is_dashboard_process``.
      2. Add any process listening on a dashboard port.
      3. Ask each one to terminate, wait up to 2 seconds, then force-kill
         whatever is still alive.
      4. Print, for every port in ``DASHBOARD_PORTS``, whether it is free.
    """
    print("=" * 60)
    print("Killing Stale Dashboard Processes")
    print("=" * 60)

    # Find dashboard processes
    print("\nScanning for dashboard processes...")
    dashboard_procs = [
        proc
        for proc in psutil.process_iter(['pid', 'name', 'cmdline'])
        if is_dashboard_process(proc)
    ]

    if not dashboard_procs:
        print(" No dashboard processes found")
    else:
        print(f" Found {len(dashboard_procs)} dashboard process(es):")
        for proc in dashboard_procs:
            try:
                cmdline = ' '.join(proc.cmdline())
                print(f" - PID {proc.pid}: {cmdline[:80]}{'...' if len(cmdline) > 80 else ''}")
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                print(f" - PID {proc.pid}: (access denied)")

    # Find processes on dashboard ports
    print("\nChecking for processes on dashboard ports...")
    port_procs = find_processes_on_ports()

    if not port_procs:
        print(" No processes found on dashboard ports")
    else:
        print(f" Found {len(port_procs)} process(es) on dashboard ports:")
        for p in port_procs:
            print(f" - PID {p['pid']} ({p['name']}) on port {p['port']}")
            # Add to kill list if not already there
            try:
                proc = psutil.Process(p['pid'])
                if proc not in dashboard_procs:
                    dashboard_procs.append(proc)
            except psutil.NoSuchProcess:
                pass  # exited between the scan and now

    # Kill all found processes
    if dashboard_procs:
        print("\nTerminating processes...")

        # First, try graceful termination
        killed_count = sum(
            1 for proc in dashboard_procs if kill_process(proc, force=False)
        )

        # Wait for processes to exit. wait_procs returns as soon as all of
        # them are gone, instead of always sleeping the full two seconds.
        print("\nWaiting for processes to exit...")
        _, remaining = psutil.wait_procs(dashboard_procs, timeout=2)

        # Force kill any remaining processes
        if remaining:
            print(f"\nForce killing {len(remaining)} remaining process(es)...")
            for proc in remaining:
                kill_process(proc, force=True)
            # Give the kills a moment to land so the port report is accurate
            psutil.wait_procs(remaining, timeout=1)

        print(f"\n✓ Killed {killed_count} dashboard process(es)")
    else:
        print("\n✓ No processes to kill")

    # Port status: scan every process once and remember the first listener
    # found per port, rather than rescanning all processes for each port.
    print("\nPort status:")
    listeners = {}
    for proc in psutil.process_iter():
        try:
            # psutil 6.0 deprecated Process.connections() in favour of
            # Process.net_connections(); use the new name when it exists.
            get_connections = getattr(proc, 'net_connections', None) or proc.connections
            for conn in get_connections():
                laddr = getattr(conn, 'laddr', None)
                if laddr and laddr.port in DASHBOARD_PORTS and conn.status == psutil.CONN_LISTEN:
                    listeners.setdefault(laddr.port, proc.pid)
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            continue

    for port in DASHBOARD_PORTS:
        if port in listeners:
            print(f" Port {port}: IN USE by PID {listeners[port]}")
        else:
            print(f" Port {port}: FREE")

    print("\n" + "=" * 60)
    print("Process cleanup completed")
    print("=" * 60)
|
||||
|
||||
|
||||
# Script entry point: run the cleanup and exit with status 1 on Ctrl-C or
# on any unexpected error (after printing the traceback).
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        print("\n\nInterrupted by user")
        sys.exit(1)
    except Exception as e:
        import traceback

        print(f"\nError: {e}")
        traceback.print_exc()
        sys.exit(1)
|
||||
|
||||
@@ -19,6 +19,25 @@ pybit>=5.11.0
|
||||
requests>=2.31.0
|
||||
asyncpg>=0.29.0
|
||||
|
||||
# COBY System Dependencies
|
||||
fastapi>=0.104.0
|
||||
uvicorn>=0.24.0
|
||||
pydantic>=2.5.0
|
||||
python-multipart>=0.0.6
|
||||
aiohttp>=3.9.0
|
||||
redis>=5.0.0
|
||||
scipy>=1.11.0
|
||||
structlog>=23.2.0
|
||||
click>=8.1.0
|
||||
rich>=13.7.0
|
||||
|
||||
# DuckDB for data storage
|
||||
duckdb>=0.9.0
|
||||
|
||||
# Flask for ANNOTATE system
|
||||
Flask>=3.0.0
|
||||
flask-cors>=4.0.0
|
||||
|
||||
# NOTE: PyTorch is intentionally not pinned here to avoid pulling NVIDIA CUDA deps on AMD machines.
|
||||
# Install one of the following sets manually depending on your hardware:
|
||||
#
|
||||
|
||||
Reference in New Issue
Block a user