# gogo2/run_clean_dashboard.py

#!/usr/bin/env python3
"""
Clean Trading Dashboard Runner with Enhanced Stability and Error Handling
"""

# Ensure we run with the project's virtual environment Python.
# These imports are unguarded on purpose: later code in this file relies on
# `os` being defined at module level.
import os
import sys
from pathlib import Path
import platform


def _ensure_project_venv():
    """Re-execute this script under ``<project>/venv`` if it is not already active.

    The project root is the directory containing this file. If the venv's
    interpreter exists and the running interpreter is not using that venv,
    the current process is replaced via ``os.execv`` with the venv's Python,
    passing ``sys.argv`` through unchanged.

    The whole operation is best-effort: any failure is swallowed so the
    script continues with the current interpreter. Logging is not configured
    yet at this point, so failures are intentionally silent.
    """
    # Marker inherited by the re-executed process. It prevents an endless
    # exec loop if the venv is broken and never reports itself as active.
    guard_var = 'CLEAN_DASHBOARD_VENV_REEXEC'
    try:
        if os.environ.get(guard_var) == '1':
            return

        project_root = Path(__file__).resolve().parent
        venv_dir = project_root / 'venv'
        if platform.system().lower().startswith('win'):
            venv_python = venv_dir / 'Scripts' / 'python.exe'
        else:
            venv_python = venv_dir / 'bin' / 'python'

        if not venv_python.exists():
            return

        # Compare the active prefix with the venv directory instead of
        # comparing resolved executables: on POSIX the venv's python is
        # usually a symlink to the base interpreter, so resolving it makes
        # "inside the venv" and "outside the venv" look identical.
        if Path(sys.prefix).resolve() == venv_dir.resolve():
            return

        os.environ[guard_var] = '1'
        # Exec the *unresolved* venv path; following the symlink would start
        # the base interpreter without the venv's site-packages.
        launcher = str(venv_python)
        os.execv(launcher, [launcher, *sys.argv])
    except Exception:
        # If anything goes wrong, continue with current interpreter
        pass


_ensure_project_venv()

import sys
import logging
import traceback
import gc
import time
import psutil
from pathlib import Path

# PyTorch is optional; when it is missing, `torch` is None and HAS_TORCH is False
try:
    import torch
except ImportError:
    torch = None
HAS_TORCH = torch is not None

# Configure logging once and create this module's logger
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

def clear_gpu_memory():
    """Empty the CUDA cache and synchronize, if torch and CUDA are available.

    Does nothing when torch could not be imported or when
    ``torch.cuda.is_available()`` is false.
    """
    if not HAS_TORCH:
        return
    if not torch.cuda.is_available():
        return

    cuda = torch.cuda
    cuda.empty_cache()
    cuda.synchronize()

def check_system_resources():
    """Report whether at least 2 GB of RAM is currently available.

    Returns:
        True when ``psutil`` reports 2 GB or more available. Otherwise a
        warning is logged, a garbage collection and GPU cache cleanup are
        triggered, and False is returned.
    """
    bytes_per_gb = 1024**3
    free_gb = psutil.virtual_memory().available / bytes_per_gb

    if not free_gb < 2.0:
        return True

    # Below the 2 GB threshold: warn and try to reclaim memory
    logger.warning(f"Low RAM: {free_gb:.1f} GB available")
    gc.collect()
    clear_gpu_memory()
    return False

def _command_stdout(argv, timeout=10):
    """Run *argv* and return its stdout, or None if it failed or is unavailable.

    None is returned when the command exits non-zero, is not installed, or
    exceeds *timeout* seconds; the last two cases are logged at debug level.
    """
    import subprocess

    try:
        result = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    except (subprocess.TimeoutExpired, FileNotFoundError):
        logger.debug(f"{argv[0]} not available or timed out")
        return None
    return result.stdout if result.returncode == 0 else None


def _terminate_pid(pid_text, handled_pids, prefix="process", suffix=""):
    """Send SIGTERM, then SIGKILL one second later, to a single process.

    Args:
        pid_text: PID as text, as parsed from a tool's output.
        handled_pids: Set of PIDs already processed in this cleanup run;
            updated in place so each PID is signalled at most once.
        prefix: Noun used in the "Killing ..." log line.
        suffix: Text appended to the "Killing ..." log line.
    """
    import signal

    try:
        pid = int(pid_text)
    except ValueError as e:
        logger.debug(f"Ignoring non-numeric PID {pid_text!r}: {e}")
        return

    if pid in handled_pids:
        return
    handled_pids.add(pid)

    # Never signal ourselves or our parent: the `ps` scan matches this very
    # script's command line, and killing a wrapping shell/sudo takes us down too.
    if pid in (os.getpid(), os.getppid()):
        logger.debug(f"Skipping {prefix} {pid}: it is this process or its parent")
        return

    try:
        logger.info(f"Killing {prefix} {pid}{suffix}")
        os.kill(pid, signal.SIGTERM)
        time.sleep(1)
        # Force kill if still running; raises ProcessLookupError if it is gone
        os.kill(pid, signal.SIGKILL)
    except ProcessLookupError as e:
        logger.debug(f"Process {pid} already terminated: {e}")
    except Exception as e:
        logger.warning(f"Error killing process {pid}: {e}")


def kill_existing_dashboard_processes(port=8050):
    """Kill any existing dashboard processes and free the dashboard port.

    Three discovery methods are tried in turn, each skipped when its tool is
    missing or times out: ``lsof`` for listeners on the port, ``ps aux`` for
    command lines containing ``clean_dashboard``, and ``netstat -tlnp`` for
    listeners on the port. Every discovered process except this one and its
    parent receives SIGTERM followed by SIGKILL.

    Args:
        port: TCP port the dashboard listens on.

    Returns:
        False if ``lsof`` still reports a listener on the port afterwards or
        an unexpected error occurred; True otherwise, including when the
        final ``lsof`` verification could not be run.
    """
    import subprocess

    handled_pids = set()
    # Restrict lsof to LISTEN sockets so clients merely connected to the
    # port (for example a browser showing the dashboard) are not killed.
    lsof_listeners = ['lsof', '-t', f'-iTCP:{port}', '-sTCP:LISTEN']

    try:
        logger.info(f"Checking for processes using port {port}...")

        # Method 1: Use lsof to find processes listening on the port
        listing = _command_stdout(lsof_listeners)
        if listing and listing.strip():
            pids = listing.split()
            logger.info(f"Found processes using port {port}: {pids}")
            for pid in pids:
                _terminate_pid(pid, handled_pids)

        # Method 2: Use ps to find dashboard processes by command line.
        # 'run_clean_dashboard' contains 'clean_dashboard', so one test covers both.
        listing = _command_stdout(['ps', 'aux'])
        if listing:
            for line in listing.splitlines():
                if 'clean_dashboard' not in line:
                    continue
                parts = line.split()
                if len(parts) > 1:
                    _terminate_pid(parts[1], handled_pids, prefix="dashboard process")

        # Method 3: Use netstat to find processes listening on the port
        listing = _command_stdout(['netstat', '-tlnp'])
        if listing:
            for line in listing.splitlines():
                if 'LISTEN' not in line:
                    continue
                parts = line.split()
                # Column 4 is the local address; match its port suffix exactly
                # so that e.g. :80501 is not mistaken for :8050.
                if len(parts) > 6 and parts[3].endswith(f':{port}'):
                    pid_text, slash, _program = parts[6].partition('/')
                    if slash:
                        _terminate_pid(pid_text, handled_pids,
                                       suffix=f" using port {port}")

        # Wait a bit for processes to fully terminate
        time.sleep(2)

        # Verify port is free
        try:
            result = subprocess.run(lsof_listeners,
                                    capture_output=True, text=True, timeout=5)
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.info(f"Port {port} cleanup verification skipped")
            return True

        if result.returncode == 0 and result.stdout.strip():
            logger.warning(f"Port {port} still in use after cleanup")
            return False

        logger.info(f"Port {port} is now free")
        return True

    except Exception as e:
        logger.error(f"Error during process cleanup: {e}")
        return False

def check_port_availability(port=8050):
    """Return True if *port* can be bound on 127.0.0.1, False otherwise.

    A throwaway TCP socket is bound to the loopback address and closed again
    immediately; any ``OSError`` along the way counts as "not available".
    """
    import socket

    loopback_address = ('127.0.0.1', port)
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.bind(loopback_address)
    except OSError:
        return False
    return True

def run_dashboard_with_recovery():
    """Run dashboard with automatic error recovery.

    Each loop iteration frees port 8050 if needed, checks available RAM,
    builds the data provider, orchestrator, executor and dashboard, and
    starts the server on http://127.0.0.1:8050.

    An exception during setup or serving counts as a failed attempt. After a
    failure the function cleans up memory and waits 30 s times the number of
    failures so far before retrying. After three failures it calls
    ``sys.exit(1)``. A ``KeyboardInterrupt`` raised by the server ends the
    loop normally.

    NOTE(review): the two pre-flight checks (port still busy, low RAM) wait
    and ``continue`` without consuming a retry, so they can repeat
    indefinitely. Likewise, if ``run_server`` returns without raising, the
    loop starts the dashboard again. Confirm this is intended.
    """
    max_retries = 3
    retry_count = 0

    while retry_count < max_retries:
        try:
            logger.info(f"Starting Clean Trading Dashboard (attempt {retry_count + 1}/{max_retries})")

            # Clean up existing processes and free port 8050
            if not check_port_availability(8050):
                logger.info("Port 8050 is in use, cleaning up existing processes...")
                if not kill_existing_dashboard_processes():
                    logger.warning("Failed to free port 8050, waiting 10 seconds...")
                    time.sleep(10)
                    continue

            # Check system resources
            if not check_system_resources():
                logger.warning("System resources low, waiting 30 seconds...")
                time.sleep(30)
                continue

            # Import here to avoid memory issues on restart
            from core.data_provider import DataProvider
            from core.orchestrator import TradingOrchestrator
            from core.trading_executor import TradingExecutor
            from web.clean_dashboard import create_clean_dashboard
            from data_stream_monitor import get_data_stream_monitor

            logger.info("Creating data provider...")
            data_provider = DataProvider()

            logger.info("Creating trading orchestrator...")
            orchestrator = TradingOrchestrator(
                data_provider=data_provider,
                enhanced_rl_training=True
            )

            logger.info("Creating trading executor...")
            trading_executor = TradingExecutor()

            logger.info("Creating clean dashboard...")
            dashboard = create_clean_dashboard(data_provider, orchestrator, trading_executor)

            # Data stream is managed by the orchestrator; only report its status.
            logger.info("Data stream is managed by orchestrator; no separate control needed")
            try:
                status = orchestrator.get_data_stream_status()
                logger.info(f"Data Stream: connected={status.get('connected')} streaming={status.get('streaming')}")
            except Exception as e:
                # The status report is informational only; do not fail startup
                # over it, but leave a trace instead of swallowing silently.
                logger.debug(f"Data stream status unavailable: {e}")

            logger.info("Dashboard created successfully")
            logger.info("=== Clean Trading Dashboard Status ===")
            logger.info("- Data Provider: Active")
            logger.info("- Trading Orchestrator: Active")
            logger.info("- Trading Executor: Active")
            logger.info("- Enhanced Training: Active")
            logger.info("- Data Stream Monitor: Active")
            logger.info("- Dashboard: Ready")
            logger.info("=======================================")

            # Start the dashboard server with error handling
            try:
                logger.info("Starting dashboard server on http://127.0.0.1:8050")
                dashboard.run_server(host='127.0.0.1', port=8050, debug=False)
            except KeyboardInterrupt:
                logger.info("Dashboard stopped by user")
                break
            except Exception as e:
                logger.error(f"Dashboard server error: {e}")
                logger.error(traceback.format_exc())
                # Re-raise so the outer handler counts this as a failed attempt
                raise

        except Exception as e:
            logger.error(f"Critical error in dashboard: {e}")
            logger.error(traceback.format_exc())

            retry_count += 1
            if retry_count < max_retries:
                logger.info(f"Attempting recovery... ({retry_count}/{max_retries})")

                # Cleanup
                gc.collect()
                clear_gpu_memory()

                # Wait before retry
                wait_time = 30 * retry_count  # Linear backoff: 30 s, then 60 s
                logger.info(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
            else:
                logger.error("Max retries reached. Exiting.")
                sys.exit(1)

def _main():
    """Script entry point: run the dashboard and map outcomes to exit codes.

    A ``KeyboardInterrupt`` exits with status 0. Any other ``Exception`` is
    logged with its traceback and exits with status 1.
    """
    try:
        run_dashboard_with_recovery()
    except KeyboardInterrupt:
        logger.info("Application stopped by user")
        sys.exit(0)
    except Exception as exc:
        logger.error(f"Fatal error: {exc}")
        logger.error(traceback.format_exc())
        sys.exit(1)


if __name__ == "__main__":
    _main()