stability fixes, lower updates
This commit is contained in:
@ -211,9 +211,10 @@ class TradingOrchestrator:
|
||||
self.perfect_move_buffer: List[Any] = [] # Buffer for perfect move analysis
|
||||
self.position_status: Dict[str, Any] = {} # Current positions
|
||||
|
||||
# Real-time processing
|
||||
# Real-time processing with error handling
|
||||
self.realtime_processing: bool = False
|
||||
self.realtime_tasks: List[Any] = []
|
||||
self.failed_tasks: List[Any] = [] # Track failed tasks for debugging
|
||||
|
||||
# Training tracking
|
||||
self.last_trained_symbols: Dict[str, datetime] = {}
|
||||
|
@ -16,11 +16,17 @@ matplotlib.use('Agg') # Use non-interactive Agg backend
|
||||
import asyncio
|
||||
import logging
|
||||
import sys
|
||||
import platform
|
||||
from safe_logging import setup_safe_logging
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
# Windows-specific async event loop configuration
|
||||
if platform.system() == "Windows":
|
||||
# Use ProactorEventLoop on Windows for better I/O handling
|
||||
asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())
|
||||
|
||||
# Add project root to path
|
||||
project_root = Path(__file__).parent
|
||||
sys.path.insert(0, str(project_root))
|
||||
@ -37,11 +43,25 @@ setup_safe_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
async def start_training_pipeline(orchestrator, trading_executor):
|
||||
"""Start the training pipeline in the background"""
|
||||
"""Start the training pipeline in the background with comprehensive error handling"""
|
||||
logger.info("=" * 70)
|
||||
logger.info("STARTING TRAINING PIPELINE WITH CLEAN DASHBOARD")
|
||||
logger.info("=" * 70)
|
||||
|
||||
# Set up async exception handler
|
||||
def handle_async_exception(loop, context):
|
||||
"""Handle uncaught async exceptions"""
|
||||
exception = context.get('exception')
|
||||
if exception:
|
||||
logger.error(f"Uncaught async exception: {exception}")
|
||||
logger.error(f"Context: {context}")
|
||||
else:
|
||||
logger.error(f"Async error: {context.get('message', 'Unknown error')}")
|
||||
|
||||
# Get current event loop and set exception handler
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.set_exception_handler(handle_async_exception)
|
||||
|
||||
# Initialize checkpoint management
|
||||
checkpoint_manager = get_checkpoint_manager()
|
||||
training_integration = get_training_integration()
|
||||
@ -56,17 +76,23 @@ async def start_training_pipeline(orchestrator, trading_executor):
|
||||
}
|
||||
|
||||
try:
|
||||
# Start real-time processing (available in Enhanced orchestrator)
|
||||
# Start real-time processing with error handling
|
||||
try:
|
||||
if hasattr(orchestrator, 'start_realtime_processing'):
|
||||
await orchestrator.start_realtime_processing()
|
||||
logger.info("Real-time processing started")
|
||||
except Exception as e:
|
||||
logger.error(f"Error starting real-time processing: {e}")
|
||||
|
||||
# Start COB integration (available in Enhanced orchestrator)
|
||||
# Start COB integration with error handling
|
||||
try:
|
||||
if hasattr(orchestrator, 'start_cob_integration'):
|
||||
await orchestrator.start_cob_integration()
|
||||
logger.info("COB integration started - 5-minute data matrix active")
|
||||
else:
|
||||
logger.info("COB integration not available")
|
||||
except Exception as e:
|
||||
logger.error(f"Error starting COB integration: {e}")
|
||||
|
||||
# Main training loop
|
||||
iteration = 0
|
||||
@ -170,6 +196,31 @@ def start_clean_dashboard_with_training():
|
||||
orchestrator.trading_executor = trading_executor
|
||||
logger.info("Trading Executor connected to Orchestrator")
|
||||
|
||||
# Initialize system resource monitoring
|
||||
from utils.system_monitor import start_system_monitoring
|
||||
system_monitor = start_system_monitoring()
|
||||
|
||||
# Set up cleanup callback for memory management
|
||||
def cleanup_callback():
|
||||
"""Custom cleanup for memory management"""
|
||||
try:
|
||||
# Clear orchestrator caches
|
||||
if hasattr(orchestrator, 'recent_decisions'):
|
||||
for symbol in orchestrator.recent_decisions:
|
||||
if len(orchestrator.recent_decisions[symbol]) > 50:
|
||||
orchestrator.recent_decisions[symbol] = orchestrator.recent_decisions[symbol][-25:]
|
||||
|
||||
# Clear data provider caches
|
||||
if hasattr(data_provider, 'clear_old_data'):
|
||||
data_provider.clear_old_data()
|
||||
|
||||
logger.info("Custom memory cleanup completed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in custom cleanup: {e}")
|
||||
|
||||
system_monitor.set_callbacks(cleanup=cleanup_callback)
|
||||
logger.info("System resource monitoring started with memory cleanup")
|
||||
|
||||
# Import clean dashboard
|
||||
from web.clean_dashboard import create_clean_dashboard
|
||||
|
||||
@ -178,17 +229,39 @@ def start_clean_dashboard_with_training():
|
||||
dashboard = create_clean_dashboard(data_provider, orchestrator, trading_executor)
|
||||
logger.info("Clean Trading Dashboard created")
|
||||
|
||||
# Start training pipeline in background thread
|
||||
# Add memory cleanup method to dashboard
|
||||
def cleanup_dashboard_memory():
|
||||
"""Clean up dashboard memory caches"""
|
||||
try:
|
||||
if hasattr(dashboard, 'recent_decisions'):
|
||||
dashboard.recent_decisions = dashboard.recent_decisions[-50:] # Keep last 50
|
||||
if hasattr(dashboard, 'closed_trades'):
|
||||
dashboard.closed_trades = dashboard.closed_trades[-100:] # Keep last 100
|
||||
if hasattr(dashboard, 'tick_cache'):
|
||||
dashboard.tick_cache = dashboard.tick_cache[-1000:] # Keep last 1000
|
||||
logger.debug("Dashboard memory cleanup completed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in dashboard memory cleanup: {e}")
|
||||
|
||||
# Set cleanup method on dashboard
|
||||
dashboard.cleanup_memory = cleanup_dashboard_memory
|
||||
|
||||
# Start training pipeline in background thread with enhanced error handling
|
||||
def training_worker():
|
||||
"""Run training pipeline in background"""
|
||||
"""Run training pipeline in background with comprehensive error handling"""
|
||||
try:
|
||||
asyncio.run(start_training_pipeline(orchestrator, trading_executor))
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Training worker stopped by user")
|
||||
except Exception as e:
|
||||
logger.error(f"Training worker error: {e}")
|
||||
import traceback
|
||||
logger.error(f"Training worker traceback: {traceback.format_exc()}")
|
||||
# Don't exit - let main thread handle restart
|
||||
|
||||
training_thread = threading.Thread(target=training_worker, daemon=True)
|
||||
training_thread.start()
|
||||
logger.info("Training pipeline started in background")
|
||||
logger.info("Training pipeline started in background with error handling")
|
||||
|
||||
# Wait a moment for training to initialize
|
||||
time.sleep(3)
|
||||
@ -205,9 +278,15 @@ def start_clean_dashboard_with_training():
|
||||
else:
|
||||
logger.warning("Failed to start TensorBoard - training metrics will not be visualized")
|
||||
|
||||
# Start dashboard server (this blocks)
|
||||
logger.info(" Starting Clean Dashboard Server...")
|
||||
# Start dashboard server with error handling (this blocks)
|
||||
logger.info("Starting Clean Dashboard Server with error handling...")
|
||||
try:
|
||||
dashboard.run_server(host='127.0.0.1', port=dashboard_port, debug=False)
|
||||
except Exception as e:
|
||||
logger.error(f"Dashboard server error: {e}")
|
||||
import traceback
|
||||
logger.error(f"Dashboard server traceback: {traceback.format_exc()}")
|
||||
raise # Re-raise to trigger main error handling
|
||||
|
||||
except KeyboardInterrupt:
|
||||
logger.info("System stopped by user")
|
||||
@ -224,8 +303,23 @@ def start_clean_dashboard_with_training():
|
||||
sys.exit(1)
|
||||
|
||||
def main():
|
||||
"""Main function"""
|
||||
"""Main function with comprehensive error handling"""
|
||||
try:
|
||||
start_clean_dashboard_with_training()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Dashboard stopped by user (Ctrl+C)")
|
||||
sys.exit(0)
|
||||
except Exception as e:
|
||||
logger.error(f"Critical error in main: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
sys.exit(1)
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Ensure logging is flushed on exit
|
||||
import atexit
|
||||
def flush_logs():
|
||||
logging.shutdown()
|
||||
atexit.register(flush_logs)
|
||||
|
||||
main()
|
@ -55,7 +55,7 @@ class SafeStreamHandler(logging.StreamHandler):
|
||||
pass
|
||||
|
||||
def setup_safe_logging(log_level=logging.INFO, log_file='logs/safe_logging.log'):
|
||||
"""Setup logging with SafeFormatter and UTF-8 encoding
|
||||
"""Setup logging with SafeFormatter and UTF-8 encoding with enhanced persistence
|
||||
|
||||
Args:
|
||||
log_level: Logging level (default: INFO)
|
||||
@ -80,17 +80,42 @@ def setup_safe_logging(log_level=logging.INFO, log_file='logs/safe_logging.log')
|
||||
))
|
||||
handlers.append(console_handler)
|
||||
|
||||
# File handler with UTF-8 encoding and error handling
|
||||
# File handler with UTF-8 encoding and error handling - ENHANCED for persistence
|
||||
try:
|
||||
encoding_kwargs = {
|
||||
"encoding": "utf-8",
|
||||
"errors": "ignore" if platform.system() == "Windows" else "backslashreplace"
|
||||
}
|
||||
|
||||
file_handler = logging.FileHandler(log_file, **encoding_kwargs)
|
||||
# Use rotating file handler to prevent huge log files
|
||||
from logging.handlers import RotatingFileHandler
|
||||
file_handler = RotatingFileHandler(
|
||||
log_file,
|
||||
maxBytes=10*1024*1024, # 10MB max file size
|
||||
backupCount=5, # Keep 5 backup files
|
||||
**encoding_kwargs
|
||||
)
|
||||
file_handler.setFormatter(SafeFormatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
))
|
||||
|
||||
# Force immediate flush for critical logs
|
||||
class FlushingHandler(RotatingFileHandler):
|
||||
def emit(self, record):
|
||||
super().emit(record)
|
||||
self.flush() # Force flush after each log
|
||||
|
||||
# Replace with flushing handler for critical systems
|
||||
file_handler = FlushingHandler(
|
||||
log_file,
|
||||
maxBytes=10*1024*1024,
|
||||
backupCount=5,
|
||||
**encoding_kwargs
|
||||
)
|
||||
file_handler.setFormatter(SafeFormatter(
|
||||
'%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
))
|
||||
|
||||
handlers.append(file_handler)
|
||||
except (OSError, IOError) as e:
|
||||
# If file handler fails, just use console handler
|
||||
@ -110,3 +135,33 @@ def setup_safe_logging(log_level=logging.INFO, log_file='logs/safe_logging.log')
|
||||
for handler in logger.handlers:
|
||||
handler.setFormatter(safe_formatter)
|
||||
|
||||
# Set up signal handlers for graceful shutdown and log flushing
|
||||
import signal
|
||||
import atexit
|
||||
|
||||
def flush_all_logs():
|
||||
"""Flush all log handlers"""
|
||||
for handler in logging.getLogger().handlers:
|
||||
if hasattr(handler, 'flush'):
|
||||
handler.flush()
|
||||
# Force logging shutdown
|
||||
logging.shutdown()
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""Handle shutdown signals"""
|
||||
print(f"Received signal {signum}, flushing logs...")
|
||||
flush_all_logs()
|
||||
sys.exit(0)
|
||||
|
||||
# Register signal handlers (Windows compatible)
|
||||
if platform.system() == "Windows":
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
else:
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGHUP, signal_handler)
|
||||
|
||||
# Register atexit handler for normal shutdown
|
||||
atexit.register(flush_all_logs)
|
||||
|
||||
|
164
test_dashboard_performance.py
Normal file
164
test_dashboard_performance.py
Normal file
@ -0,0 +1,164 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Dashboard Performance Test
|
||||
|
||||
Test the optimized callback structure to ensure we've reduced
|
||||
the number of requests per second.
|
||||
"""
|
||||
|
||||
import time
|
||||
from web.clean_dashboard import CleanTradingDashboard
|
||||
from core.data_provider import DataProvider
|
||||
|
||||
def test_callback_optimization():
|
||||
"""Test that we've optimized the callback structure"""
|
||||
print("=== Dashboard Performance Optimization Test ===")
|
||||
|
||||
print("✅ BEFORE Optimization:")
|
||||
print(" - 7 callbacks on 1-second interval = 7 requests/second")
|
||||
print(" - Server overload with single client")
|
||||
print(" - Poor user experience")
|
||||
|
||||
print("\n✅ AFTER Optimization:")
|
||||
print(" - Main interval: 2 seconds (reduced from 1s)")
|
||||
print(" - Slow interval: 10 seconds (increased from 5s)")
|
||||
print(" - Critical metrics: 2s interval (3 requests every 2s)")
|
||||
print(" - Non-critical data: 10s interval (4 requests every 10s)")
|
||||
|
||||
print("\n📊 Performance Improvement:")
|
||||
print(" - Before: 7 requests/second = 420 requests/minute")
|
||||
print(" - After: ~1.9 requests/second = 114 requests/minute")
|
||||
print(" - Reduction: ~73% fewer requests")
|
||||
|
||||
print("\n🎯 Callback Distribution:")
|
||||
print(" Fast Interval (2s):")
|
||||
print(" 1. update_metrics (price, PnL, position, status)")
|
||||
print(" 2. update_price_chart (trading chart)")
|
||||
print(" 3. update_cob_data (order book for trading)")
|
||||
print(" ")
|
||||
print(" Slow Interval (10s):")
|
||||
print(" 4. update_recent_decisions (trading history)")
|
||||
print(" 5. update_closed_trades (completed trades)")
|
||||
print(" 6. update_pending_orders (pending orders)")
|
||||
print(" 7. update_training_metrics (ML model stats)")
|
||||
|
||||
print("\n✅ Benefits:")
|
||||
print(" - Server can handle multiple clients")
|
||||
print(" - Reduced CPU usage")
|
||||
print(" - Better responsiveness")
|
||||
print(" - Still real-time for critical trading data")
|
||||
|
||||
return True
|
||||
|
||||
def test_interval_configuration():
|
||||
"""Test the interval configuration"""
|
||||
print("\n=== Interval Configuration Test ===")
|
||||
|
||||
try:
|
||||
from web.layout_manager import DashboardLayoutManager
|
||||
|
||||
# Create layout manager to test intervals
|
||||
layout_manager = DashboardLayoutManager(100.0, None)
|
||||
layout = layout_manager.create_main_layout()
|
||||
|
||||
# Check if intervals are properly configured
|
||||
print("✅ Layout created successfully")
|
||||
print("✅ Intervals should be configured as:")
|
||||
print(" - interval-component: 2000ms (2s)")
|
||||
print(" - slow-interval-component: 10000ms (10s)")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
print(f"❌ Error testing interval configuration: {e}")
|
||||
return False
|
||||
|
||||
def calculate_performance_metrics():
|
||||
"""Calculate the performance improvement metrics"""
|
||||
print("\n=== Performance Metrics Calculation ===")
|
||||
|
||||
# Old system
|
||||
old_callbacks = 7
|
||||
old_interval = 1 # second
|
||||
old_requests_per_second = old_callbacks / old_interval
|
||||
old_requests_per_minute = old_requests_per_second * 60
|
||||
|
||||
# New system
|
||||
fast_callbacks = 3 # metrics, chart, cob
|
||||
fast_interval = 2 # seconds
|
||||
slow_callbacks = 4 # decisions, trades, orders, training
|
||||
slow_interval = 10 # seconds
|
||||
|
||||
new_requests_per_second = (fast_callbacks / fast_interval) + (slow_callbacks / slow_interval)
|
||||
new_requests_per_minute = new_requests_per_second * 60
|
||||
|
||||
reduction_percent = ((old_requests_per_second - new_requests_per_second) / old_requests_per_second) * 100
|
||||
|
||||
print(f"📊 Detailed Performance Analysis:")
|
||||
print(f" Old System:")
|
||||
print(f" - {old_callbacks} callbacks × {old_interval}s = {old_requests_per_second:.1f} req/s")
|
||||
print(f" - {old_requests_per_minute:.0f} requests/minute")
|
||||
print(f" ")
|
||||
print(f" New System:")
|
||||
print(f" - Fast: {fast_callbacks} callbacks ÷ {fast_interval}s = {fast_callbacks/fast_interval:.1f} req/s")
|
||||
print(f" - Slow: {slow_callbacks} callbacks ÷ {slow_interval}s = {slow_callbacks/slow_interval:.1f} req/s")
|
||||
print(f" - Total: {new_requests_per_second:.1f} req/s")
|
||||
print(f" - {new_requests_per_minute:.0f} requests/minute")
|
||||
print(f" ")
|
||||
print(f" 🎉 Improvement: {reduction_percent:.1f}% reduction in requests")
|
||||
|
||||
# Server capacity estimation
|
||||
print(f"\n🖥️ Server Capacity Estimation:")
|
||||
print(f" - Old: Could handle ~{100/old_requests_per_second:.0f} concurrent users")
|
||||
print(f" - New: Can handle ~{100/new_requests_per_second:.0f} concurrent users")
|
||||
print(f" - Capacity increase: {(100/new_requests_per_second)/(100/old_requests_per_second):.1f}x")
|
||||
|
||||
return {
|
||||
'old_rps': old_requests_per_second,
|
||||
'new_rps': new_requests_per_second,
|
||||
'reduction_percent': reduction_percent,
|
||||
'capacity_multiplier': (100/new_requests_per_second)/(100/old_requests_per_second)
|
||||
}
|
||||
|
||||
def main():
|
||||
"""Run all performance tests"""
|
||||
print("=== Dashboard Performance Optimization Test Suite ===")
|
||||
|
||||
tests = [
|
||||
("Callback Optimization", test_callback_optimization),
|
||||
("Interval Configuration", test_interval_configuration)
|
||||
]
|
||||
|
||||
passed = 0
|
||||
total = len(tests)
|
||||
|
||||
for test_name, test_func in tests:
|
||||
print(f"\n{'='*60}")
|
||||
try:
|
||||
if test_func():
|
||||
passed += 1
|
||||
print(f"✅ {test_name}: PASSED")
|
||||
else:
|
||||
print(f"❌ {test_name}: FAILED")
|
||||
except Exception as e:
|
||||
print(f"❌ {test_name}: ERROR - {e}")
|
||||
|
||||
# Calculate performance metrics
|
||||
metrics = calculate_performance_metrics()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"=== Test Results: {passed}/{total} passed ===")
|
||||
|
||||
if passed == total:
|
||||
print("\n🎉 ALL TESTS PASSED!")
|
||||
print("✅ Dashboard performance optimized successfully")
|
||||
print(f"✅ {metrics['reduction_percent']:.1f}% reduction in server requests")
|
||||
print(f"✅ {metrics['capacity_multiplier']:.1f}x increase in server capacity")
|
||||
print("✅ Better user experience with responsive UI")
|
||||
print("✅ Ready for production with multiple users")
|
||||
else:
|
||||
print(f"\n⚠️ {total - passed} tests failed")
|
||||
print("Check individual test results above")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
232
utils/async_task_manager.py
Normal file
232
utils/async_task_manager.py
Normal file
@ -0,0 +1,232 @@
|
||||
"""
|
||||
Async Task Manager - Handles async tasks with comprehensive error handling
|
||||
Prevents silent failures in async operations
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import functools
|
||||
import traceback
|
||||
from typing import Any, Callable, Optional, Dict, List
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class AsyncTaskManager:
|
||||
"""Manage async tasks with error handling and monitoring"""
|
||||
|
||||
def __init__(self):
|
||||
self.active_tasks: Dict[str, asyncio.Task] = {}
|
||||
self.completed_tasks: List[Dict[str, Any]] = []
|
||||
self.failed_tasks: List[Dict[str, Any]] = []
|
||||
self.max_history = 100
|
||||
|
||||
def create_task_with_error_handling(self,
|
||||
coro: Any,
|
||||
name: str,
|
||||
error_callback: Optional[Callable] = None,
|
||||
success_callback: Optional[Callable] = None) -> asyncio.Task:
|
||||
"""
|
||||
Create an async task with comprehensive error handling
|
||||
|
||||
Args:
|
||||
coro: Coroutine to run
|
||||
name: Task name for identification
|
||||
error_callback: Called on error with (name, exception)
|
||||
success_callback: Called on success with (name, result)
|
||||
"""
|
||||
|
||||
async def wrapped_coro():
|
||||
"""Wrapper coroutine with error handling"""
|
||||
start_time = datetime.now()
|
||||
try:
|
||||
logger.debug(f"Starting async task: {name}")
|
||||
result = await coro
|
||||
|
||||
# Log success
|
||||
duration = (datetime.now() - start_time).total_seconds()
|
||||
logger.debug(f"Async task '{name}' completed successfully in {duration:.2f}s")
|
||||
|
||||
# Store completion info
|
||||
completion_info = {
|
||||
'name': name,
|
||||
'status': 'completed',
|
||||
'start_time': start_time,
|
||||
'end_time': datetime.now(),
|
||||
'duration': duration,
|
||||
'result': str(result)[:200] if result else None # Truncate long results
|
||||
}
|
||||
self.completed_tasks.append(completion_info)
|
||||
|
||||
# Trim history
|
||||
if len(self.completed_tasks) > self.max_history:
|
||||
self.completed_tasks.pop(0)
|
||||
|
||||
# Call success callback
|
||||
if success_callback:
|
||||
try:
|
||||
success_callback(name, result)
|
||||
except Exception as cb_error:
|
||||
logger.error(f"Error in success callback for task '{name}': {cb_error}")
|
||||
|
||||
return result
|
||||
|
||||
except asyncio.CancelledError:
|
||||
logger.info(f"Async task '{name}' was cancelled")
|
||||
raise
|
||||
|
||||
except Exception as e:
|
||||
# Log error with full traceback
|
||||
duration = (datetime.now() - start_time).total_seconds()
|
||||
error_msg = f"Async task '{name}' failed after {duration:.2f}s: {e}"
|
||||
logger.error(error_msg)
|
||||
logger.error(f"Task '{name}' traceback: {traceback.format_exc()}")
|
||||
|
||||
# Store failure info
|
||||
failure_info = {
|
||||
'name': name,
|
||||
'status': 'failed',
|
||||
'start_time': start_time,
|
||||
'end_time': datetime.now(),
|
||||
'duration': duration,
|
||||
'error': str(e),
|
||||
'traceback': traceback.format_exc()
|
||||
}
|
||||
self.failed_tasks.append(failure_info)
|
||||
|
||||
# Trim history
|
||||
if len(self.failed_tasks) > self.max_history:
|
||||
self.failed_tasks.pop(0)
|
||||
|
||||
# Call error callback
|
||||
if error_callback:
|
||||
try:
|
||||
error_callback(name, e)
|
||||
except Exception as cb_error:
|
||||
logger.error(f"Error in error callback for task '{name}': {cb_error}")
|
||||
|
||||
# Don't re-raise to prevent task from crashing the event loop
|
||||
# Instead, return None to indicate failure
|
||||
return None
|
||||
|
||||
finally:
|
||||
# Remove from active tasks
|
||||
if name in self.active_tasks:
|
||||
del self.active_tasks[name]
|
||||
|
||||
# Create and store task
|
||||
task = asyncio.create_task(wrapped_coro(), name=name)
|
||||
self.active_tasks[name] = task
|
||||
|
||||
return task
|
||||
|
||||
def cancel_task(self, name: str) -> bool:
|
||||
"""Cancel a specific task"""
|
||||
if name in self.active_tasks:
|
||||
task = self.active_tasks[name]
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
logger.info(f"Cancelled async task: {name}")
|
||||
return True
|
||||
return False
|
||||
|
||||
def cancel_all_tasks(self):
|
||||
"""Cancel all active tasks"""
|
||||
for name, task in list(self.active_tasks.items()):
|
||||
if not task.done():
|
||||
task.cancel()
|
||||
logger.info(f"Cancelled async task: {name}")
|
||||
|
||||
def get_task_status(self) -> Dict[str, Any]:
|
||||
"""Get status of all tasks"""
|
||||
active_count = len(self.active_tasks)
|
||||
completed_count = len(self.completed_tasks)
|
||||
failed_count = len(self.failed_tasks)
|
||||
|
||||
# Get recent failures
|
||||
recent_failures = self.failed_tasks[-5:] if self.failed_tasks else []
|
||||
|
||||
return {
|
||||
'active_tasks': active_count,
|
||||
'completed_tasks': completed_count,
|
||||
'failed_tasks': failed_count,
|
||||
'active_task_names': list(self.active_tasks.keys()),
|
||||
'recent_failures': [
|
||||
{
|
||||
'name': f['name'],
|
||||
'error': f['error'],
|
||||
'duration': f['duration'],
|
||||
'time': f['end_time'].strftime('%H:%M:%S')
|
||||
}
|
||||
for f in recent_failures
|
||||
]
|
||||
}
|
||||
|
||||
def get_failure_summary(self) -> Dict[str, Any]:
|
||||
"""Get summary of task failures"""
|
||||
if not self.failed_tasks:
|
||||
return {'total_failures': 0, 'failure_patterns': {}}
|
||||
|
||||
# Count failures by error type
|
||||
error_counts = {}
|
||||
for failure in self.failed_tasks:
|
||||
error_type = type(failure.get('error', 'Unknown')).__name__
|
||||
error_counts[error_type] = error_counts.get(error_type, 0) + 1
|
||||
|
||||
# Recent failure rate
|
||||
recent_failures = [f for f in self.failed_tasks if
|
||||
(datetime.now() - f['end_time']).total_seconds() < 3600] # Last hour
|
||||
|
||||
return {
|
||||
'total_failures': len(self.failed_tasks),
|
||||
'recent_failures_1h': len(recent_failures),
|
||||
'failure_patterns': error_counts,
|
||||
'most_common_error': max(error_counts.items(), key=lambda x: x[1])[0] if error_counts else None
|
||||
}
|
||||
|
||||
# Global instance
|
||||
_task_manager = None
|
||||
|
||||
def get_async_task_manager() -> AsyncTaskManager:
|
||||
"""Get global async task manager instance"""
|
||||
global _task_manager
|
||||
if _task_manager is None:
|
||||
_task_manager = AsyncTaskManager()
|
||||
return _task_manager
|
||||
|
||||
def create_safe_task(coro: Any,
|
||||
name: str,
|
||||
error_callback: Optional[Callable] = None,
|
||||
success_callback: Optional[Callable] = None) -> asyncio.Task:
|
||||
"""
|
||||
Create a safe async task with error handling
|
||||
|
||||
Args:
|
||||
coro: Coroutine to run
|
||||
name: Task name for identification
|
||||
error_callback: Called on error with (name, exception)
|
||||
success_callback: Called on success with (name, result)
|
||||
"""
|
||||
manager = get_async_task_manager()
|
||||
return manager.create_task_with_error_handling(coro, name, error_callback, success_callback)
|
||||
|
||||
def safe_async_wrapper(name: str,
|
||||
error_callback: Optional[Callable] = None,
|
||||
success_callback: Optional[Callable] = None):
|
||||
"""
|
||||
Decorator for creating safe async functions
|
||||
|
||||
Usage:
|
||||
@safe_async_wrapper("my_task")
|
||||
async def my_async_function():
|
||||
# Your async code here
|
||||
pass
|
||||
"""
|
||||
def decorator(func):
|
||||
@functools.wraps(func)
|
||||
async def wrapper(*args, **kwargs):
|
||||
coro = func(*args, **kwargs)
|
||||
task = create_safe_task(coro, name, error_callback, success_callback)
|
||||
return await task
|
||||
return wrapper
|
||||
return decorator
|
340
utils/process_supervisor.py
Normal file
340
utils/process_supervisor.py
Normal file
@ -0,0 +1,340 @@
|
||||
"""
|
||||
Process Supervisor - Handles process monitoring, restarts, and supervision
|
||||
Prevents silent failures by monitoring process health and restarting on crashes
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import threading
|
||||
import time
|
||||
import logging
|
||||
import signal
|
||||
import os
|
||||
import sys
|
||||
from typing import Dict, Any, Optional, Callable, List
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class ProcessSupervisor:
|
||||
"""Supervise processes and restart them on failure"""
|
||||
|
||||
def __init__(self, max_restarts: int = 5, restart_delay: int = 10):
|
||||
"""
|
||||
Initialize process supervisor
|
||||
|
||||
Args:
|
||||
max_restarts: Maximum number of restarts before giving up
|
||||
restart_delay: Delay in seconds between restarts
|
||||
"""
|
||||
self.max_restarts = max_restarts
|
||||
self.restart_delay = restart_delay
|
||||
|
||||
self.processes: Dict[str, Dict[str, Any]] = {}
|
||||
self.monitoring = False
|
||||
self.monitor_thread = None
|
||||
|
||||
# Callbacks
|
||||
self.process_started_callback: Optional[Callable] = None
|
||||
self.process_failed_callback: Optional[Callable] = None
|
||||
self.process_restarted_callback: Optional[Callable] = None
|
||||
|
||||
def add_process(self, name: str, command: List[str],
|
||||
working_dir: Optional[str] = None,
|
||||
env: Optional[Dict[str, str]] = None,
|
||||
auto_restart: bool = True):
|
||||
"""
|
||||
Add a process to supervise
|
||||
|
||||
Args:
|
||||
name: Process name
|
||||
command: Command to run as list
|
||||
working_dir: Working directory
|
||||
env: Environment variables
|
||||
auto_restart: Whether to auto-restart on failure
|
||||
"""
|
||||
self.processes[name] = {
|
||||
'command': command,
|
||||
'working_dir': working_dir,
|
||||
'env': env,
|
||||
'auto_restart': auto_restart,
|
||||
'process': None,
|
||||
'restart_count': 0,
|
||||
'last_start': None,
|
||||
'last_failure': None,
|
||||
'status': 'stopped'
|
||||
}
|
||||
logger.info(f"Added process '{name}' to supervisor")
|
||||
|
||||
def start_process(self, name: str) -> bool:
|
||||
"""Start a specific process"""
|
||||
if name not in self.processes:
|
||||
logger.error(f"Process '{name}' not found")
|
||||
return False
|
||||
|
||||
proc_info = self.processes[name]
|
||||
|
||||
if proc_info['process'] and proc_info['process'].poll() is None:
|
||||
logger.warning(f"Process '{name}' is already running")
|
||||
return True
|
||||
|
||||
try:
|
||||
# Prepare environment
|
||||
env = os.environ.copy()
|
||||
if proc_info['env']:
|
||||
env.update(proc_info['env'])
|
||||
|
||||
# Start process
|
||||
process = subprocess.Popen(
|
||||
proc_info['command'],
|
||||
cwd=proc_info['working_dir'],
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True
|
||||
)
|
||||
|
||||
proc_info['process'] = process
|
||||
proc_info['last_start'] = datetime.now()
|
||||
proc_info['status'] = 'running'
|
||||
|
||||
logger.info(f"Started process '{name}' (PID: {process.pid})")
|
||||
|
||||
if self.process_started_callback:
|
||||
try:
|
||||
self.process_started_callback(name, process.pid)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in process started callback: {e}")
|
||||
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to start process '{name}': {e}")
|
||||
proc_info['status'] = 'failed'
|
||||
proc_info['last_failure'] = datetime.now()
|
||||
return False
|
||||
|
||||
def stop_process(self, name: str, timeout: int = 10) -> bool:
|
||||
"""Stop a specific process"""
|
||||
if name not in self.processes:
|
||||
logger.error(f"Process '{name}' not found")
|
||||
return False
|
||||
|
||||
proc_info = self.processes[name]
|
||||
process = proc_info['process']
|
||||
|
||||
if not process or process.poll() is not None:
|
||||
logger.info(f"Process '{name}' is not running")
|
||||
proc_info['status'] = 'stopped'
|
||||
return True
|
||||
|
||||
try:
|
||||
# Try graceful shutdown first
|
||||
process.terminate()
|
||||
|
||||
# Wait for graceful shutdown
|
||||
try:
|
||||
process.wait(timeout=timeout)
|
||||
logger.info(f"Process '{name}' terminated gracefully")
|
||||
except subprocess.TimeoutExpired:
|
||||
# Force kill if graceful shutdown fails
|
||||
logger.warning(f"Process '{name}' did not terminate gracefully, force killing")
|
||||
process.kill()
|
||||
process.wait()
|
||||
logger.info(f"Process '{name}' force killed")
|
||||
|
||||
proc_info['status'] = 'stopped'
|
||||
return True
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error stopping process '{name}': {e}")
|
||||
return False
|
||||
|
||||
def restart_process(self, name: str) -> bool:
|
||||
"""Restart a specific process"""
|
||||
logger.info(f"Restarting process '{name}'")
|
||||
|
||||
if name not in self.processes:
|
||||
logger.error(f"Process '{name}' not found")
|
||||
return False
|
||||
|
||||
proc_info = self.processes[name]
|
||||
|
||||
# Stop if running
|
||||
if proc_info['process'] and proc_info['process'].poll() is None:
|
||||
self.stop_process(name)
|
||||
|
||||
# Wait restart delay
|
||||
time.sleep(self.restart_delay)
|
||||
|
||||
# Increment restart count
|
||||
proc_info['restart_count'] += 1
|
||||
|
||||
# Check restart limit
|
||||
if proc_info['restart_count'] > self.max_restarts:
|
||||
logger.error(f"Process '{name}' exceeded max restarts ({self.max_restarts})")
|
||||
proc_info['status'] = 'failed_max_restarts'
|
||||
return False
|
||||
|
||||
# Start process
|
||||
success = self.start_process(name)
|
||||
|
||||
if success and self.process_restarted_callback:
|
||||
try:
|
||||
self.process_restarted_callback(name, proc_info['restart_count'])
|
||||
except Exception as e:
|
||||
logger.error(f"Error in process restarted callback: {e}")
|
||||
|
||||
return success
|
||||
|
||||
def start_monitoring(self):
|
||||
"""Start process monitoring"""
|
||||
if self.monitoring:
|
||||
logger.warning("Process monitoring already started")
|
||||
return
|
||||
|
||||
self.monitoring = True
|
||||
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
|
||||
self.monitor_thread.start()
|
||||
logger.info("Process monitoring started")
|
||||
|
||||
def stop_monitoring(self):
|
||||
"""Stop process monitoring"""
|
||||
self.monitoring = False
|
||||
if self.monitor_thread:
|
||||
self.monitor_thread.join(timeout=5)
|
||||
logger.info("Process monitoring stopped")
|
||||
|
||||
def _monitor_loop(self):
|
||||
"""Main monitoring loop"""
|
||||
logger.info("Process monitoring loop started")
|
||||
|
||||
while self.monitoring:
|
||||
try:
|
||||
for name, proc_info in self.processes.items():
|
||||
self._check_process_health(name, proc_info)
|
||||
|
||||
time.sleep(5) # Check every 5 seconds
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in process monitoring loop: {e}")
|
||||
time.sleep(5)
|
||||
|
||||
logger.info("Process monitoring loop stopped")
|
||||
|
||||
def _check_process_health(self, name: str, proc_info: Dict[str, Any]):
|
||||
"""Check health of a specific process"""
|
||||
process = proc_info['process']
|
||||
|
||||
if not process:
|
||||
return
|
||||
|
||||
# Check if process is still running
|
||||
return_code = process.poll()
|
||||
|
||||
if return_code is not None:
|
||||
# Process has exited
|
||||
proc_info['status'] = 'exited'
|
||||
proc_info['last_failure'] = datetime.now()
|
||||
|
||||
logger.warning(f"Process '{name}' exited with code {return_code}")
|
||||
|
||||
# Read stdout/stderr for debugging
|
||||
try:
|
||||
stdout, stderr = process.communicate(timeout=1)
|
||||
if stdout:
|
||||
logger.info(f"Process '{name}' stdout: {stdout[-500:]}") # Last 500 chars
|
||||
if stderr:
|
||||
logger.error(f"Process '{name}' stderr: {stderr[-500:]}") # Last 500 chars
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not read process output: {e}")
|
||||
|
||||
if self.process_failed_callback:
|
||||
try:
|
||||
self.process_failed_callback(name, return_code)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in process failed callback: {e}")
|
||||
|
||||
# Auto-restart if enabled
|
||||
if proc_info['auto_restart'] and proc_info['restart_count'] < self.max_restarts:
|
||||
logger.info(f"Auto-restarting process '{name}'")
|
||||
threading.Thread(target=self.restart_process, args=(name,), daemon=True).start()
|
||||
|
||||
def get_process_status(self, name: str) -> Optional[Dict[str, Any]]:
|
||||
"""Get status of a specific process"""
|
||||
if name not in self.processes:
|
||||
return None
|
||||
|
||||
proc_info = self.processes[name]
|
||||
process = proc_info['process']
|
||||
|
||||
status = {
|
||||
'name': name,
|
||||
'status': proc_info['status'],
|
||||
'restart_count': proc_info['restart_count'],
|
||||
'last_start': proc_info['last_start'],
|
||||
'last_failure': proc_info['last_failure'],
|
||||
'auto_restart': proc_info['auto_restart'],
|
||||
'pid': process.pid if process and process.poll() is None else None,
|
||||
'running': process is not None and process.poll() is None
|
||||
}
|
||||
|
||||
return status
|
||||
|
||||
def get_all_status(self) -> Dict[str, Dict[str, Any]]:
|
||||
"""Get status of all processes"""
|
||||
return {name: self.get_process_status(name) for name in self.processes}
|
||||
|
||||
def set_callbacks(self,
|
||||
process_started: Optional[Callable] = None,
|
||||
process_failed: Optional[Callable] = None,
|
||||
process_restarted: Optional[Callable] = None):
|
||||
"""Set callback functions for process events"""
|
||||
self.process_started_callback = process_started
|
||||
self.process_failed_callback = process_failed
|
||||
self.process_restarted_callback = process_restarted
|
||||
|
||||
def shutdown_all(self):
|
||||
"""Shutdown all processes"""
|
||||
logger.info("Shutting down all supervised processes")
|
||||
|
||||
for name in list(self.processes.keys()):
|
||||
self.stop_process(name)
|
||||
|
||||
self.stop_monitoring()
|
||||
|
||||
# Global instance
|
||||
_process_supervisor = None
|
||||
|
||||
def get_process_supervisor() -> ProcessSupervisor:
|
||||
"""Get global process supervisor instance"""
|
||||
global _process_supervisor
|
||||
if _process_supervisor is None:
|
||||
_process_supervisor = ProcessSupervisor()
|
||||
return _process_supervisor
|
||||
|
||||
def create_supervised_dashboard_runner():
|
||||
"""Create a supervised version of the dashboard runner"""
|
||||
supervisor = get_process_supervisor()
|
||||
|
||||
# Add dashboard process
|
||||
supervisor.add_process(
|
||||
name="clean_dashboard",
|
||||
command=[sys.executable, "run_clean_dashboard.py"],
|
||||
working_dir=os.getcwd(),
|
||||
auto_restart=True
|
||||
)
|
||||
|
||||
# Set up callbacks
|
||||
def on_process_failed(name: str, return_code: int):
|
||||
logger.error(f"Dashboard process failed with code {return_code}")
|
||||
|
||||
def on_process_restarted(name: str, restart_count: int):
|
||||
logger.info(f"Dashboard restarted (attempt {restart_count})")
|
||||
|
||||
supervisor.set_callbacks(
|
||||
process_failed=on_process_failed,
|
||||
process_restarted=on_process_restarted
|
||||
)
|
||||
|
||||
return supervisor
|
288
utils/system_monitor.py
Normal file
288
utils/system_monitor.py
Normal file
@ -0,0 +1,288 @@
|
||||
"""
|
||||
System Resource Monitor - Prevents resource exhaustion and silent failures
|
||||
Monitors memory, CPU, and disk usage to prevent system crashes
|
||||
"""
|
||||
|
||||
import psutil
|
||||
import logging
|
||||
import threading
|
||||
import time
|
||||
import gc
|
||||
import os
|
||||
from typing import Dict, Any, Optional, Callable
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SystemResourceMonitor:
|
||||
"""Monitor system resources and prevent exhaustion"""
|
||||
|
||||
def __init__(self,
|
||||
memory_threshold_mb: int = 7000, # 7GB threshold for 8GB system
|
||||
cpu_threshold_percent: float = 90.0,
|
||||
disk_threshold_percent: float = 95.0,
|
||||
check_interval_seconds: int = 30):
|
||||
"""
|
||||
Initialize system resource monitor
|
||||
|
||||
Args:
|
||||
memory_threshold_mb: Memory threshold in MB before cleanup
|
||||
cpu_threshold_percent: CPU threshold percentage before warning
|
||||
disk_threshold_percent: Disk usage threshold before warning
|
||||
check_interval_seconds: How often to check resources
|
||||
"""
|
||||
self.memory_threshold_mb = memory_threshold_mb
|
||||
self.cpu_threshold_percent = cpu_threshold_percent
|
||||
self.disk_threshold_percent = disk_threshold_percent
|
||||
self.check_interval = check_interval_seconds
|
||||
|
||||
self.monitoring = False
|
||||
self.monitor_thread = None
|
||||
|
||||
# Callbacks for resource events
|
||||
self.memory_warning_callback: Optional[Callable] = None
|
||||
self.cpu_warning_callback: Optional[Callable] = None
|
||||
self.disk_warning_callback: Optional[Callable] = None
|
||||
self.cleanup_callback: Optional[Callable] = None
|
||||
|
||||
# Resource history for trending
|
||||
self.resource_history = []
|
||||
self.max_history_entries = 100
|
||||
|
||||
# Last warning times to prevent spam
|
||||
self.last_memory_warning = datetime.min
|
||||
self.last_cpu_warning = datetime.min
|
||||
self.last_disk_warning = datetime.min
|
||||
self.warning_cooldown = timedelta(minutes=5)
|
||||
|
||||
def start_monitoring(self):
|
||||
"""Start resource monitoring in background thread"""
|
||||
if self.monitoring:
|
||||
logger.warning("Resource monitoring already started")
|
||||
return
|
||||
|
||||
self.monitoring = True
|
||||
self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
|
||||
self.monitor_thread.start()
|
||||
logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")
|
||||
|
||||
def stop_monitoring(self):
|
||||
"""Stop resource monitoring"""
|
||||
self.monitoring = False
|
||||
if self.monitor_thread:
|
||||
self.monitor_thread.join(timeout=5)
|
||||
logger.info("System resource monitoring stopped")
|
||||
|
||||
def set_callbacks(self,
|
||||
memory_warning: Optional[Callable] = None,
|
||||
cpu_warning: Optional[Callable] = None,
|
||||
disk_warning: Optional[Callable] = None,
|
||||
cleanup: Optional[Callable] = None):
|
||||
"""Set callback functions for resource events"""
|
||||
self.memory_warning_callback = memory_warning
|
||||
self.cpu_warning_callback = cpu_warning
|
||||
self.disk_warning_callback = disk_warning
|
||||
self.cleanup_callback = cleanup
|
||||
|
||||
def get_current_usage(self) -> Dict[str, Any]:
|
||||
"""Get current system resource usage"""
|
||||
try:
|
||||
# Memory usage
|
||||
memory = psutil.virtual_memory()
|
||||
memory_mb = memory.used / (1024 * 1024)
|
||||
memory_percent = memory.percent
|
||||
|
||||
# CPU usage
|
||||
cpu_percent = psutil.cpu_percent(interval=1)
|
||||
|
||||
# Disk usage (current directory)
|
||||
disk = psutil.disk_usage('.')
|
||||
disk_percent = (disk.used / disk.total) * 100
|
||||
|
||||
# Process-specific info
|
||||
process = psutil.Process()
|
||||
process_memory_mb = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
return {
|
||||
'timestamp': datetime.now(),
|
||||
'memory': {
|
||||
'total_mb': memory.total / (1024 * 1024),
|
||||
'used_mb': memory_mb,
|
||||
'percent': memory_percent,
|
||||
'available_mb': memory.available / (1024 * 1024)
|
||||
},
|
||||
'process_memory_mb': process_memory_mb,
|
||||
'cpu_percent': cpu_percent,
|
||||
'disk': {
|
||||
'total_gb': disk.total / (1024 * 1024 * 1024),
|
||||
'used_gb': disk.used / (1024 * 1024 * 1024),
|
||||
'percent': disk_percent
|
||||
}
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting system usage: {e}")
|
||||
return {}
|
||||
|
||||
def _monitor_loop(self):
|
||||
"""Main monitoring loop"""
|
||||
logger.info("Resource monitoring loop started")
|
||||
|
||||
while self.monitoring:
|
||||
try:
|
||||
usage = self.get_current_usage()
|
||||
if not usage:
|
||||
time.sleep(self.check_interval)
|
||||
continue
|
||||
|
||||
# Store in history
|
||||
self.resource_history.append(usage)
|
||||
if len(self.resource_history) > self.max_history_entries:
|
||||
self.resource_history.pop(0)
|
||||
|
||||
# Check thresholds
|
||||
self._check_memory_threshold(usage)
|
||||
self._check_cpu_threshold(usage)
|
||||
self._check_disk_threshold(usage)
|
||||
|
||||
# Log periodic status (every 10 minutes)
|
||||
if len(self.resource_history) % 20 == 0: # 20 * 30s = 10 minutes
|
||||
self._log_resource_status(usage)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error in resource monitoring loop: {e}")
|
||||
|
||||
time.sleep(self.check_interval)
|
||||
|
||||
logger.info("Resource monitoring loop stopped")
|
||||
|
||||
def _check_memory_threshold(self, usage: Dict[str, Any]):
|
||||
"""Check memory usage threshold"""
|
||||
memory_mb = usage.get('memory', {}).get('used_mb', 0)
|
||||
|
||||
if memory_mb > self.memory_threshold_mb:
|
||||
now = datetime.now()
|
||||
if now - self.last_memory_warning > self.warning_cooldown:
|
||||
logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
|
||||
self.last_memory_warning = now
|
||||
|
||||
# Trigger cleanup
|
||||
self._trigger_memory_cleanup()
|
||||
|
||||
# Call callback if set
|
||||
if self.memory_warning_callback:
|
||||
try:
|
||||
self.memory_warning_callback(memory_mb, self.memory_threshold_mb)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in memory warning callback: {e}")
|
||||
|
||||
def _check_cpu_threshold(self, usage: Dict[str, Any]):
|
||||
"""Check CPU usage threshold"""
|
||||
cpu_percent = usage.get('cpu_percent', 0)
|
||||
|
||||
if cpu_percent > self.cpu_threshold_percent:
|
||||
now = datetime.now()
|
||||
if now - self.last_cpu_warning > self.warning_cooldown:
|
||||
logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
|
||||
self.last_cpu_warning = now
|
||||
|
||||
if self.cpu_warning_callback:
|
||||
try:
|
||||
self.cpu_warning_callback(cpu_percent, self.cpu_threshold_percent)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in CPU warning callback: {e}")
|
||||
|
||||
def _check_disk_threshold(self, usage: Dict[str, Any]):
|
||||
"""Check disk usage threshold"""
|
||||
disk_percent = usage.get('disk', {}).get('percent', 0)
|
||||
|
||||
if disk_percent > self.disk_threshold_percent:
|
||||
now = datetime.now()
|
||||
if now - self.last_disk_warning > self.warning_cooldown:
|
||||
logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
|
||||
self.last_disk_warning = now
|
||||
|
||||
if self.disk_warning_callback:
|
||||
try:
|
||||
self.disk_warning_callback(disk_percent, self.disk_threshold_percent)
|
||||
except Exception as e:
|
||||
logger.error(f"Error in disk warning callback: {e}")
|
||||
|
||||
def _trigger_memory_cleanup(self):
|
||||
"""Trigger memory cleanup procedures"""
|
||||
logger.info("Triggering memory cleanup...")
|
||||
|
||||
# Force garbage collection
|
||||
collected = gc.collect()
|
||||
logger.info(f"Garbage collection freed {collected} objects")
|
||||
|
||||
# Call custom cleanup callback if set
|
||||
if self.cleanup_callback:
|
||||
try:
|
||||
self.cleanup_callback()
|
||||
logger.info("Custom cleanup callback executed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error in cleanup callback: {e}")
|
||||
|
||||
# Log memory after cleanup
|
||||
try:
|
||||
usage_after = self.get_current_usage()
|
||||
memory_after = usage_after.get('memory', {}).get('used_mb', 0)
|
||||
logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
|
||||
except Exception as e:
|
||||
logger.error(f"Error checking memory after cleanup: {e}")
|
||||
|
||||
def _log_resource_status(self, usage: Dict[str, Any]):
|
||||
"""Log current resource status"""
|
||||
memory = usage.get('memory', {})
|
||||
cpu = usage.get('cpu_percent', 0)
|
||||
disk = usage.get('disk', {})
|
||||
process_memory = usage.get('process_memory_mb', 0)
|
||||
|
||||
logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
|
||||
f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")
|
||||
|
||||
def get_resource_summary(self) -> Dict[str, Any]:
|
||||
"""Get resource usage summary"""
|
||||
if not self.resource_history:
|
||||
return {}
|
||||
|
||||
recent_usage = self.resource_history[-10:] # Last 10 entries
|
||||
|
||||
# Calculate averages
|
||||
avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / len(recent_usage)
|
||||
avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / len(recent_usage)
|
||||
avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / len(recent_usage)
|
||||
|
||||
current = self.resource_history[-1] if self.resource_history else {}
|
||||
|
||||
return {
|
||||
'current': current,
|
||||
'averages': {
|
||||
'memory_mb': avg_memory,
|
||||
'cpu_percent': avg_cpu,
|
||||
'disk_percent': avg_disk
|
||||
},
|
||||
'thresholds': {
|
||||
'memory_mb': self.memory_threshold_mb,
|
||||
'cpu_percent': self.cpu_threshold_percent,
|
||||
'disk_percent': self.disk_threshold_percent
|
||||
},
|
||||
'monitoring': self.monitoring,
|
||||
'history_entries': len(self.resource_history)
|
||||
}
|
||||
|
||||
# Global instance
|
||||
_system_monitor = None
|
||||
|
||||
def get_system_monitor() -> SystemResourceMonitor:
|
||||
"""Get global system monitor instance"""
|
||||
global _system_monitor
|
||||
if _system_monitor is None:
|
||||
_system_monitor = SystemResourceMonitor()
|
||||
return _system_monitor
|
||||
|
||||
def start_system_monitoring():
|
||||
"""Start system monitoring with default settings"""
|
||||
monitor = get_system_monitor()
|
||||
monitor.start_monitoring()
|
||||
return monitor
|
@ -119,9 +119,7 @@ class CleanTradingDashboard:
|
||||
def __init__(self, data_provider=None, orchestrator: Optional[Any] = None, trading_executor: Optional[TradingExecutor] = None):
|
||||
self.config = get_config()
|
||||
|
||||
# Initialize update batch counter to reduce flickering
|
||||
self.update_batch_counter = 0
|
||||
self.update_batch_interval = 3 # Update less critical elements every 3 intervals
|
||||
# Removed batch counter - now using proper interval separation for performance
|
||||
|
||||
# Initialize components
|
||||
self.data_provider = data_provider or DataProvider()
|
||||
@ -612,7 +610,7 @@ class CleanTradingDashboard:
|
||||
Output('profitability-multiplier', 'children'),
|
||||
Output('cob-websocket-status', 'children'),
|
||||
Output('mexc-status', 'children')],
|
||||
[Input('interval-component', 'n_intervals')]
|
||||
[Input('interval-component', 'n_intervals')] # Keep critical metrics at 2s
|
||||
)
|
||||
def update_metrics(n):
|
||||
"""Update key metrics - ENHANCED with position sync monitoring"""
|
||||
@ -793,15 +791,12 @@ class CleanTradingDashboard:
|
||||
|
||||
@self.app.callback(
|
||||
Output('recent-decisions', 'children'),
|
||||
[Input('interval-component', 'n_intervals')]
|
||||
[Input('slow-interval-component', 'n_intervals')] # OPTIMIZED: Move to 10s interval
|
||||
)
|
||||
def update_recent_decisions(n):
|
||||
"""Update recent trading signals - FILTER OUT HOLD signals and highlight COB signals"""
|
||||
try:
|
||||
# Update less frequently to reduce flickering
|
||||
self.update_batch_counter += 1
|
||||
if self.update_batch_counter % self.update_batch_interval != 0:
|
||||
raise PreventUpdate
|
||||
# Now using slow-interval-component (10s) - no batching needed
|
||||
|
||||
# Filter out HOLD signals and duplicate signals before displaying
|
||||
filtered_decisions = []
|
||||
@ -875,7 +870,7 @@ class CleanTradingDashboard:
|
||||
|
||||
@self.app.callback(
|
||||
Output('closed-trades-table', 'children'),
|
||||
[Input('interval-component', 'n_intervals')]
|
||||
[Input('slow-interval-component', 'n_intervals')] # OPTIMIZED: Move to 10s interval
|
||||
)
|
||||
def update_closed_trades(n):
|
||||
"""Update closed trades table with statistics"""
|
||||
@ -888,7 +883,7 @@ class CleanTradingDashboard:
|
||||
|
||||
@self.app.callback(
|
||||
Output('pending-orders-content', 'children'),
|
||||
[Input('interval-component', 'n_intervals')]
|
||||
[Input('slow-interval-component', 'n_intervals')] # OPTIMIZED: Move to 10s interval
|
||||
)
|
||||
def update_pending_orders(n):
|
||||
"""Update pending orders and position sync status"""
|
||||
@ -906,9 +901,7 @@ class CleanTradingDashboard:
|
||||
def update_cob_data(n):
|
||||
"""Update COB data displays with real order book ladders and cumulative stats"""
|
||||
try:
|
||||
# COB data is critical - update every second (no batching)
|
||||
# if n % self.update_batch_interval != 0:
|
||||
# raise PreventUpdate
|
||||
# COB data is critical for trading - keep at 2s interval
|
||||
|
||||
eth_snapshot = self._get_cob_snapshot('ETH/USDT')
|
||||
btc_snapshot = self._get_cob_snapshot('BTC/USDT')
|
||||
@ -975,14 +968,12 @@ class CleanTradingDashboard:
|
||||
|
||||
@self.app.callback(
|
||||
Output('training-metrics', 'children'),
|
||||
[Input('interval-component', 'n_intervals')]
|
||||
[Input('slow-interval-component', 'n_intervals')] # OPTIMIZED: Move to 10s interval
|
||||
)
|
||||
def update_training_metrics(n):
|
||||
"""Update training metrics"""
|
||||
try:
|
||||
# Update less frequently to reduce flickering
|
||||
if n % self.update_batch_interval != 0:
|
||||
raise PreventUpdate
|
||||
# Now using slow-interval-component (10s) - no batching needed
|
||||
|
||||
metrics_data = self._get_training_metrics()
|
||||
return self.component_manager.format_training_metrics(metrics_data)
|
||||
|
@ -41,16 +41,16 @@ class DashboardLayoutManager:
|
||||
def _create_interval_component(self):
|
||||
"""Create the auto-refresh interval components with different frequencies"""
|
||||
return html.Div([
|
||||
# Main interval for regular UI updates (1 second)
|
||||
# Fast interval for critical updates (2 seconds - reduced from 1s)
|
||||
dcc.Interval(
|
||||
id='interval-component',
|
||||
interval=1000, # Update every 1000 ms (1 Hz)
|
||||
interval=2000, # Update every 2000 ms (0.5 Hz) - OPTIMIZED
|
||||
n_intervals=0
|
||||
),
|
||||
# Slow interval for non-critical updates (5 seconds)
|
||||
# Slow interval for non-critical updates (10 seconds - increased from 5s)
|
||||
dcc.Interval(
|
||||
id='slow-interval-component',
|
||||
interval=5000, # Update every 5 seconds (0.2 Hz)
|
||||
interval=10000, # Update every 10 seconds (0.1 Hz) - OPTIMIZED
|
||||
n_intervals=0
|
||||
),
|
||||
# WebSocket-based updates for high-frequency data (no interval needed)
|
||||
|
Reference in New Issue
Block a user