286 lines
11 KiB
Python
286 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Clean Trading Dashboard Runner with Enhanced Stability and Error Handling
|
|
"""
|
|
|
|
# Ensure we run with the project's virtual environment Python
|
|
try:
    import os
    import sys
    from pathlib import Path
    import platform

    def _ensure_project_venv():
        """Re-exec this script under ``<project>/venv`` if not already inside it.

        The project root is the directory containing this file. If it holds a
        ``venv`` with a Python executable and the running interpreter does not
        belong to that venv, the current process is replaced (``os.execv``) by
        the venv interpreter with the same command-line arguments.

        Any failure is swallowed on purpose: the script then simply continues
        with whatever interpreter started it.
        """
        try:
            project_root = Path(__file__).resolve().parent
            venv_dir = project_root / 'venv'
            if platform.system().lower().startswith('win'):
                venv_python = venv_dir / 'Scripts' / 'python.exe'
            else:
                venv_python = venv_dir / 'bin' / 'python'

            if not venv_python.exists():
                return

            # Detect "already in the venv" through sys.prefix rather than by
            # comparing resolved executable paths: on POSIX venv/bin/python is
            # normally a symlink to the base interpreter, so both resolved
            # paths are identical and the comparison can never trigger.
            if Path(sys.prefix).resolve() == venv_dir.resolve():
                return

            # Loop guard: if a previous re-exec did not land us inside the
            # venv (e.g. a broken venv), do not exec again and again.
            if os.environ.get('_DASHBOARD_VENV_REEXEC') == '1':
                return
            os.environ['_DASHBOARD_VENV_REEXEC'] = '1'

            # Exec the *unresolved* venv path; following the symlink would
            # start the base interpreter without the venv's site-packages.
            os.execv(str(venv_python), [str(venv_python), *sys.argv])
        except Exception:
            # If anything goes wrong, continue with current interpreter
            pass

    _ensure_project_venv()
except Exception:
    # Best effort only: the bootstrap must never prevent the script from starting
    pass
|
|
|
|
import sys
|
|
import logging
|
|
import traceback
|
|
import gc
|
|
import time
|
|
import psutil
|
|
from pathlib import Path
|
|
|
|
# Try to import torch
|
|
# PyTorch is optional: HAS_TORCH records whether the import succeeded and
# `torch` is bound to None when it did not.
try:
    import torch
except ImportError:
    torch = None
    HAS_TORCH = False
else:
    HAS_TORCH = True
|
|
|
|
# Set up logging
|
|
# Root logging configuration: INFO and above, each record rendered as
# "<time> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
def clear_gpu_memory():
    """Empty the CUDA cache and synchronize, when torch with CUDA is usable.

    Does nothing if torch could not be imported (``HAS_TORCH`` is False) or
    if ``torch.cuda.is_available()`` reports no CUDA device.
    """
    if not HAS_TORCH:
        return
    if not torch.cuda.is_available():
        return

    torch.cuda.empty_cache()
    torch.cuda.synchronize()
|
|
|
|
def check_system_resources(min_available_gb=2.0):
    """Report whether enough RAM is available to start the dashboard.

    Args:
        min_available_gb: Minimum available memory, computed as
            ``psutil.virtual_memory().available / 1024**3``, required for the
            check to pass. Defaults to 2.0, the previously hard-coded value.

    Returns:
        True when at least ``min_available_gb`` is available. Otherwise a
        warning is logged, a garbage collection pass and a GPU cache clear
        are triggered to release memory for a later attempt, and False is
        returned (memory is not re-measured after the cleanup).
    """
    available_ram = psutil.virtual_memory().available / 1024**3
    if available_ram >= min_available_gb:
        return True

    logger.warning(f"Low RAM: {available_ram:.1f} GB available")
    gc.collect()
    clear_gpu_memory()
    return False
|
|
|
|
def _run_tool(command, timeout):
    """Run an external listing tool; return its CompletedProcess, or None if it cannot be run or times out."""
    import subprocess

    try:
        return subprocess.run(command, capture_output=True, text=True, timeout=timeout)
    except (subprocess.TimeoutExpired, OSError):
        # OSError covers a missing binary (FileNotFoundError) and permission problems
        return None


def _listening_pids_via_lsof(port, timeout):
    """Return PID strings of processes listening on TCP *port* per lsof, or None if lsof could not be run."""
    # -sTCP:LISTEN restricts the result to listeners; a plain "-i :port" also
    # lists clients that merely hold a connection to the port (e.g. a browser).
    result = _run_tool(['lsof', '-t', f'-iTCP:{port}', '-sTCP:LISTEN'], timeout)
    if result is None:
        return None
    if result.returncode != 0:
        return []
    return result.stdout.split()


def _terminate_pid(pid_text, message, handled_pids, grace_seconds=1):
    """Send SIGTERM, wait *grace_seconds*, then SIGKILL one process; PIDs in *handled_pids* are skipped."""
    import os
    import signal

    try:
        pid = int(pid_text)
    except ValueError as e:
        logger.debug(f"Ignoring non-numeric PID {pid_text!r}: {e}")
        return

    if pid in handled_pids:
        return
    handled_pids.add(pid)

    try:
        logger.info(message)
        os.kill(pid, signal.SIGTERM)
        time.sleep(grace_seconds)
        # Force kill if still running; raises ProcessLookupError when SIGTERM
        # was enough. SIGKILL does not exist on Windows, hence the fallback.
        os.kill(pid, getattr(signal, 'SIGKILL', signal.SIGTERM))
    except ProcessLookupError as e:
        logger.debug(f"Process {pid} already terminated: {e}")
    except Exception as e:
        logger.warning(f"Error killing process {pid}: {e}")


def kill_existing_dashboard_processes(port=8050):
    """Kill any existing dashboard processes and free the dashboard port.

    Three discovery methods are tried in turn; each is skipped when its tool
    is missing or times out:

    1. ``lsof`` for processes listening on TCP ``port``;
    2. ``ps aux`` for command lines containing ``clean_dashboard``
       (a plain substring match, so unrelated commands mentioning that
       name are matched too);
    3. ``netstat -tlnp`` for listeners whose local address ends in ``:port``.

    Every process found is sent SIGTERM and, one second later, SIGKILL. The
    current process and its parent are never signalled: the ``ps`` scan
    would otherwise match this launcher itself and terminate it.

    Args:
        port: TCP port to free. Defaults to 8050.

    Returns:
        True if ``lsof`` reports no listener on the port afterwards, or if
        that verification could not be run; False if the port is still in
        use or an unexpected error occurred during cleanup.
    """
    import os

    try:
        logger.info(f"Checking for processes using port {port}...")

        # PIDs that must not be signalled (ourselves, our parent) plus the
        # ones already handled, so no process is killed twice.
        handled_pids = {os.getpid(), os.getppid()}

        # Method 1: Use lsof to find processes listening on the port
        pids = _listening_pids_via_lsof(port, timeout=10)
        if pids is None:
            logger.debug("lsof not available or timed out")
        elif pids:
            logger.info(f"Found processes using port {port}: {pids}")
            for pid in pids:
                _terminate_pid(pid, f"Killing process {pid}", handled_pids)

        # Method 2: Use ps to find dashboard processes by command line
        result = _run_tool(['ps', 'aux'], timeout=10)
        if result is None:
            logger.debug("ps not available or timed out")
        elif result.returncode == 0:
            for line in result.stdout.splitlines():
                # 'run_clean_dashboard' contains 'clean_dashboard', so one test suffices
                if 'clean_dashboard' not in line:
                    continue
                parts = line.split()
                if len(parts) > 1:
                    pid = parts[1]  # second column of `ps aux` is the PID
                    _terminate_pid(pid, f"Killing dashboard process {pid}", handled_pids)

        # Method 3: Use netstat to find processes listening on the port
        result = _run_tool(['netstat', '-tlnp'], timeout=10)
        if result is None:
            logger.debug("netstat not available or timed out")
        elif result.returncode == 0:
            port_suffix = f":{port}"
            for line in result.stdout.splitlines():
                parts = line.split()
                if len(parts) <= 6 or 'LISTEN' not in line:
                    continue
                # Column 4 is the local address; compare its suffix so that
                # e.g. ":80501" or a foreign address does not match.
                if not parts[3].endswith(port_suffix):
                    continue
                pid_part = parts[6]  # "PID/program", or "-" without permission
                if '/' in pid_part:
                    pid = pid_part.split('/')[0]
                    _terminate_pid(pid, f"Killing process {pid} using port {port}", handled_pids)

        # Wait a bit for processes to fully terminate
        time.sleep(2)

        # Verify port is free
        remaining = _listening_pids_via_lsof(port, timeout=5)
        if remaining is None:
            logger.info(f"Port {port} cleanup verification skipped")
            return True
        if remaining:
            logger.warning(f"Port {port} still in use after cleanup")
            return False
        logger.info(f"Port {port} is now free")
        return True

    except Exception as e:
        logger.error(f"Error during process cleanup: {e}")
        return False
|
|
|
|
def check_port_availability(port=8050):
    """Return True if a TCP socket can be bound to 127.0.0.1:``port``.

    The probe socket is closed again before returning. Any ``OSError``
    raised while creating, binding or closing it yields False.
    """
    import socket

    loopback_address = ('127.0.0.1', port)
    try:
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(loopback_address)
        finally:
            probe.close()
    except OSError:
        return False
    return True
|
|
|
|
def run_dashboard_with_recovery():
    """Build and serve the clean trading dashboard, retrying after failures.

    Each attempt:

    1. frees port 8050 if it is occupied;
    2. checks that enough RAM is available;
    3. creates the data provider, trading orchestrator, trading executor
       and dashboard;
    4. blocks in the dashboard server on http://127.0.0.1:8050.

    Every failed attempt consumes one of ``max_retries`` (3) attempts. That
    covers a pre-flight check that does not pass as well as an exception
    while building or serving, so the loop always terminates.

    Returns:
        None when the server stops cleanly or is interrupted by the user.

    Raises:
        SystemExit: with status 1 once all attempts have been used.
    """
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            logger.info(f"Starting Clean Trading Dashboard (attempt {retry_count + 1}/{max_retries})")

            # Clean up existing processes and free port 8050
            if not check_port_availability(8050):
                logger.info("Port 8050 is in use, cleaning up existing processes...")
                if not kill_existing_dashboard_processes():
                    # Count the failure so an unfreeable port cannot loop forever
                    retry_count += 1
                    if retry_count < max_retries:
                        logger.warning("Failed to free port 8050, waiting 10 seconds...")
                        time.sleep(10)
                    else:
                        logger.warning("Failed to free port 8050")
                    continue

            # Check system resources
            if not check_system_resources():
                # Counted as well, for the same reason as above
                retry_count += 1
                if retry_count < max_retries:
                    logger.warning("System resources low, waiting 30 seconds...")
                    time.sleep(30)
                else:
                    logger.warning("System resources low")
                continue

            # Import here to avoid memory issues on restart
            from core.data_provider import DataProvider
            from core.orchestrator import TradingOrchestrator
            from core.trading_executor import TradingExecutor
            from web.clean_dashboard import create_clean_dashboard
            # NOTE(review): get_data_stream_monitor is not used below; the import is
            # kept in case the module relies on import-time setup -- confirm and drop.
            from data_stream_monitor import get_data_stream_monitor

            logger.info("Creating data provider...")
            data_provider = DataProvider()

            logger.info("Creating trading orchestrator...")
            orchestrator = TradingOrchestrator(
                data_provider=data_provider,
                enhanced_rl_training=True
            )

            logger.info("Creating trading executor...")
            trading_executor = TradingExecutor()

            logger.info("Creating clean dashboard...")
            dashboard = create_clean_dashboard(data_provider, orchestrator, trading_executor)

            # The data stream is owned by the orchestrator; only its status is reported here
            logger.info("Data stream is managed by orchestrator; no separate control needed")
            try:
                status = orchestrator.get_data_stream_status()
                logger.info(f"Data Stream: connected={status.get('connected')} streaming={status.get('streaming')}")
            except Exception as e:
                # Purely informational: never block startup, but leave a trace
                logger.debug(f"Data stream status unavailable: {e}")

            logger.info("Dashboard created successfully")
            logger.info("=== Clean Trading Dashboard Status ===")
            logger.info("- Data Provider: Active")
            logger.info("- Trading Orchestrator: Active")
            logger.info("- Trading Executor: Active")
            logger.info("- Enhanced Training: Active")
            logger.info("- Data Stream Monitor: Active")
            logger.info("- Dashboard: Ready")
            logger.info("=======================================")

            # Start the dashboard server with error handling
            try:
                logger.info("Starting dashboard server on http://127.0.0.1:8050")
                dashboard.run_server(host='127.0.0.1', port=8050, debug=False)
            except KeyboardInterrupt:
                logger.info("Dashboard stopped by user")
                return
            except Exception as e:
                logger.error(f"Dashboard server error: {e}")
                logger.error(traceback.format_exc())
                raise

            # run_server returned without raising: treat it as a clean shutdown
            # instead of silently rebuilding and restarting everything.
            logger.info("Dashboard server stopped")
            return

        except Exception as e:
            logger.error(f"Critical error in dashboard: {e}")
            logger.error(traceback.format_exc())

            retry_count += 1
            if retry_count < max_retries:
                logger.info(f"Attempting recovery... ({retry_count}/{max_retries})")

                # Cleanup
                gc.collect()
                clear_gpu_memory()

                # Wait before retry
                wait_time = 30 * retry_count  # Linear backoff: 30s, then 60s
                logger.info(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)

    # Only reached when every attempt has failed
    logger.error("Max retries reached. Exiting.")
    sys.exit(1)
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the dashboard and map the outcome to an exit
    # status (0 for a user interrupt, 1 for an unexpected error). A normal
    # return falls through without calling sys.exit.
    exit_status = None
    try:
        run_dashboard_with_recovery()
    except KeyboardInterrupt:
        logger.info("Application stopped by user")
        exit_status = 0
    except Exception as fatal:
        logger.error(f"Fatal error: {fatal}")
        logger.error(traceback.format_exc())
        exit_status = 1

    if exit_status is not None:
        sys.exit(exit_status)