gogo2/run_clean_dashboard.py

#!/usr/bin/env python3
"""
Run Clean Trading Dashboard with Full Training Pipeline
Integrated system with both training loop and clean web dashboard
"""

import os
# Fix OpenMP library conflicts before importing other modules
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
os.environ['OMP_NUM_THREADS'] = '4'

# Fix matplotlib backend issue - set non-interactive backend before any imports
import matplotlib
matplotlib.use('Agg')  # Use non-interactive Agg backend

import asyncio
import logging
import sys
import platform
from safe_logging import setup_safe_logging
import threading
import time
from pathlib import Path

# Windows-specific async event loop configuration
if platform.system() == "Windows":
    # Use ProactorEventLoop on Windows for better I/O handling
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from core.config import get_config, setup_logging
from core.data_provider import DataProvider

# Import checkpoint management
from utils.checkpoint_manager import get_checkpoint_manager
from utils.training_integration import get_training_integration

# Setup logging
setup_safe_logging()
logger = logging.getLogger(__name__)

async def start_training_pipeline(orchestrator, trading_executor):
    """Start the training pipeline in the background with comprehensive error handling"""
    logger.info("=" * 70)
    logger.info("STARTING TRAINING PIPELINE WITH CLEAN DASHBOARD")
    logger.info("=" * 70)

    # Set up async exception handler
    def handle_async_exception(loop, context):
        """Handle uncaught async exceptions"""
        exception = context.get('exception')
        if exception:
            logger.error(f"Uncaught async exception: {exception}")
            logger.error(f"Context: {context}")
        else:
            logger.error(f"Async error: {context.get('message', 'Unknown error')}")

    # Get current event loop and set exception handler
    loop = asyncio.get_running_loop()
    loop.set_exception_handler(handle_async_exception)

    # Initialize checkpoint management
    checkpoint_manager = get_checkpoint_manager()
    training_integration = get_training_integration()

    # Training statistics
    training_stats = {
        'iteration_count': 0,
        'total_decisions': 0,
        'successful_trades': 0,
        'best_performance': 0.0,
        'last_checkpoint_iteration': 0
    }

    try:
        # Start real-time processing with error handling
        try:
            if hasattr(orchestrator, 'start_realtime_processing'):
                await orchestrator.start_realtime_processing()
                logger.info("Real-time processing started")
        except Exception as e:
            logger.error(f"Error starting real-time processing: {e}")

        # Start COB integration with error handling
        try:
            if hasattr(orchestrator, 'start_cob_integration'):
                await orchestrator.start_cob_integration()
                logger.info("COB integration started - 5-minute data matrix active")
            else:
                logger.info("COB integration not available")
        except Exception as e:
            logger.error(f"Error starting COB integration: {e}")

        # Main training loop
        iteration = 0
        last_checkpoint_time = time.time()

        while True:
            try:
                iteration += 1
                training_stats['iteration_count'] = iteration

                # Get symbols to process
                symbols = orchestrator.symbols if hasattr(orchestrator, 'symbols') else ['ETH/USDT']

                # Process each symbol
                for symbol in symbols:
                    try:
                        # Make trading decision (this triggers model training)
                        decision = await orchestrator.make_trading_decision(symbol)
                        if decision:
                            training_stats['total_decisions'] += 1
                            logger.debug(f"[{symbol}] Decision: {decision.action} @ {decision.confidence:.1%}")

                    except Exception as e:
                        logger.warning(f"Error processing {symbol}: {e}")

                # Status logging every 100 iterations
                if iteration % 100 == 0:
                    current_time = time.time()
                    elapsed = current_time - last_checkpoint_time

                    logger.info(f"[TRAINING] Iteration {iteration}, Decisions: {training_stats['total_decisions']}, Time: {elapsed:.1f}s")

                    # Models will save their own checkpoints when performance improves
                    training_stats['last_checkpoint_iteration'] = iteration
                    last_checkpoint_time = current_time

                # Brief pause to prevent overwhelming the system
                await asyncio.sleep(0.1)  # 100ms between iterations

            except Exception as e:
                logger.error(f"Training loop error: {e}")
                await asyncio.sleep(5)  # Wait longer on error

    except Exception as e:
        logger.error(f"Training pipeline error: {e}")
        import traceback
        logger.error(traceback.format_exc())

def start_clean_dashboard_with_training():
    """Start clean dashboard with full training pipeline"""
    try:
        logger.info("=" * 80)
        logger.info("CLEAN TRADING DASHBOARD + FULL TRAINING PIPELINE")
        logger.info("=" * 80)
        logger.info("Features: Real-time Training, COB Integration, Clean UI")
        logger.info("Universal Data Stream: ENABLED")
        logger.info("Neural Decision Fusion: ENABLED")
        logger.info("COB Integration: ENABLED")
        logger.info("GPU Training: ENABLED")
        logger.info("TensorBoard Integration: ENABLED")
        logger.info("Multi-symbol: ETH/USDT, BTC/USDT")

        # Get port from environment or use default
        dashboard_port = int(os.environ.get('DASHBOARD_PORT', '8051'))
        tensorboard_port = int(os.environ.get('TENSORBOARD_PORT', '6006'))
        logger.info(f"Dashboard: http://127.0.0.1:{dashboard_port}")
        logger.info(f"TensorBoard: http://127.0.0.1:{tensorboard_port}")
        logger.info("=" * 80)

        # Check environment variables
        enable_universal_stream = os.environ.get('ENABLE_UNIVERSAL_DATA_STREAM', '1') == '1'
        enable_nn_fusion = os.environ.get('ENABLE_NN_DECISION_FUSION', '1') == '1'
        enable_cob = os.environ.get('ENABLE_COB_INTEGRATION', '1') == '1'

        logger.info(f"Universal Data Stream: {'ENABLED' if enable_universal_stream else 'DISABLED'}")
        logger.info(f"Neural Decision Fusion: {'ENABLED' if enable_nn_fusion else 'DISABLED'}")
        logger.info(f"COB Integration: {'ENABLED' if enable_cob else 'DISABLED'}")

        # Get configuration
        config = get_config()

        # Initialize core components with standardized versions
        from core.standardized_data_provider import StandardizedDataProvider
        from core.orchestrator import TradingOrchestrator
        from core.trading_executor import TradingExecutor

        # Create standardized data provider
        data_provider = StandardizedDataProvider()
        logger.info("StandardizedDataProvider created with BaseDataInput support")

        # Create enhanced orchestrator with standardized data provider
        orchestrator = TradingOrchestrator(data_provider, enhanced_rl_training=True)
        logger.info("Enhanced Trading Orchestrator created with COB integration")

        # Create trading executor
        trading_executor = TradingExecutor(config_path="config.yaml")
        logger.info(f"Creating trading executor with {trading_executor.primary_name} configuration...")


        # Connect trading executor to orchestrator
        orchestrator.trading_executor = trading_executor
        logger.info("Trading Executor connected to Orchestrator")

        # Initialize system resource monitoring
        from utils.system_monitor import start_system_monitoring
        system_monitor = start_system_monitoring()

        # Set up cleanup callback for memory management
        def cleanup_callback():
            """Custom cleanup for memory management"""
            try:
                # Clear orchestrator caches
                if hasattr(orchestrator, 'recent_decisions'):
                    for symbol in orchestrator.recent_decisions:
                        if len(orchestrator.recent_decisions[symbol]) > 50:
                            orchestrator.recent_decisions[symbol] = orchestrator.recent_decisions[symbol][-25:]

                # Clear data provider caches
                if hasattr(data_provider, 'clear_old_data'):
                    data_provider.clear_old_data()

                logger.info("Custom memory cleanup completed")
            except Exception as e:
                logger.error(f"Error in custom cleanup: {e}")

        system_monitor.set_callbacks(cleanup=cleanup_callback)
        logger.info("System resource monitoring started with memory cleanup")

        # Import clean dashboard
        from web.clean_dashboard import create_clean_dashboard

        # Create clean dashboard
        logger.info("Creating clean dashboard...")
        dashboard = create_clean_dashboard(data_provider, orchestrator, trading_executor)
        logger.info("Clean Trading Dashboard created")

        # Add memory cleanup method to dashboard
        def cleanup_dashboard_memory():
            """Clean up dashboard memory caches"""
            try:
                if hasattr(dashboard, 'recent_decisions'):
                    dashboard.recent_decisions = dashboard.recent_decisions[-50:]  # Keep last 50
                if hasattr(dashboard, 'closed_trades'):
                    dashboard.closed_trades = dashboard.closed_trades[-100:]  # Keep last 100
                if hasattr(dashboard, 'tick_cache'):
                    dashboard.tick_cache = dashboard.tick_cache[-1000:]  # Keep last 1000
                logger.debug("Dashboard memory cleanup completed")
            except Exception as e:
                logger.error(f"Error in dashboard memory cleanup: {e}")

        # Set cleanup method on dashboard
        dashboard.cleanup_memory = cleanup_dashboard_memory

        # Start training pipeline in background thread with enhanced error handling
        def training_worker():
            """Run training pipeline in background with comprehensive error handling"""
            try:
                asyncio.run(start_training_pipeline(orchestrator, trading_executor))
            except KeyboardInterrupt:
                logger.info("Training worker stopped by user")
            except Exception as e:
                logger.error(f"Training worker error: {e}")
                import traceback
                logger.error(f"Training worker traceback: {traceback.format_exc()}")
                # Don't exit - let main thread handle restart

        training_thread = threading.Thread(target=training_worker, daemon=True)
        training_thread.start()
        logger.info("Training pipeline started in background with error handling")

        # Wait a moment for training to initialize
        time.sleep(3)

        # Start TensorBoard in background
        from web.tensorboard_integration import get_tensorboard_integration
        tensorboard_port = int(os.environ.get('TENSORBOARD_PORT', '6006'))
        tensorboard_integration = get_tensorboard_integration(log_dir="runs", port=tensorboard_port)

        # Start TensorBoard server
        tensorboard_started = tensorboard_integration.start_tensorboard(open_browser=False)
        if tensorboard_started:
            logger.info(f"TensorBoard started at {tensorboard_integration.get_tensorboard_url()}")
        else:
            logger.warning("Failed to start TensorBoard - training metrics will not be visualized")

        # Start dashboard server with error handling (this blocks)
        logger.info("Starting Clean Dashboard Server with error handling...")
        try:
            dashboard.run_server(host='127.0.0.1', port=dashboard_port, debug=False)
        except Exception as e:
            logger.error(f"Dashboard server error: {e}")
            import traceback
            logger.error(f"Dashboard server traceback: {traceback.format_exc()}")
            raise  # Re-raise to trigger main error handling

    except KeyboardInterrupt:
        logger.info("System stopped by user")
        # Stop TensorBoard
        try:
            tensorboard_integration = get_tensorboard_integration()
            tensorboard_integration.stop_tensorboard()
        except:
            pass
    except Exception as e:
        logger.error(f"Error running clean dashboard with training: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

def main():
    """Main function with comprehensive error handling"""
    try:
        start_clean_dashboard_with_training()
    except KeyboardInterrupt:
        logger.info("Dashboard stopped by user (Ctrl+C)")
        sys.exit(0)
    except Exception as e:
        logger.error(f"Critical error in main: {e}")
        import traceback
        logger.error(traceback.format_exc())
        sys.exit(1)

if __name__ == "__main__":
    # Ensure logging is flushed on exit
    import atexit
    def flush_logs():
        logging.shutdown()
    atexit.register(flush_logs)

    main()