#!/usr/bin/env python3
"""
System Stability Audit and Monitoring

This script performs a comprehensive audit of the trading system to identify
and fix stability issues, memory leaks, and performance bottlenecks.
"""

import os
import sys
import psutil
import logging
import time
import threading
import gc
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import traceback

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from core.config import setup_logging, get_config

# Setup logging
setup_logging()
logger = logging.getLogger(__name__)


class SystemStabilityAuditor:
    """
    Comprehensive system stability auditor and monitor

    Monitors:
    - Memory usage and leaks
    - CPU usage and performance
    - Thread health and deadlocks

    Planned, not yet instrumented in this class:
    - Model performance and stability
    - Dashboard responsiveness
    - Data provider health
    """

    def __init__(self):
        """Initialize the stability auditor"""
        self.config = get_config()
        self.monitoring_active = False
        self.monitoring_thread = None

        # Performance baselines (note: the first cpu_percent() call returns 0.0
        # and only establishes psutil's internal reference point)
        self.baseline_memory = psutil.virtual_memory().used
        self.baseline_cpu = psutil.cpu_percent()

        # Monitoring data
        self.memory_history = []
        self.cpu_history = []
        self.thread_history = []
        self.error_history = []

        # Stability metrics
        self.stability_score = 100.0
        self.critical_issues = []
        self.warnings = []

        logger.info("System Stability Auditor initialized")

    def start_monitoring(self):
        """Start continuous system monitoring"""
        if self.monitoring_active:
            logger.warning("Monitoring already active")
            return

        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
        self.monitoring_thread.start()
        logger.info("System stability monitoring started")

    def stop_monitoring(self):
        """Stop system monitoring"""
        self.monitoring_active = False
        if self.monitoring_thread:
            self.monitoring_thread.join(timeout=5)
        logger.info("System stability monitoring stopped")

    def _monitoring_loop(self):
        """Main monitoring loop"""
        cycle = 0
        while self.monitoring_active:
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Check for memory leaks
                self._check_memory_leaks()

                # Check CPU usage
                self._check_cpu_usage()

                # Check thread health
                self._check_thread_health()

                # Check for deadlocks
                self._check_for_deadlocks()

                # Update stability score
                self._update_stability_score()

                # Log status roughly every 60 seconds (every 12th ~5s cycle).
                # A cycle counter is used rather than len(self.memory_history),
                # because the history length stops changing once it is trimmed
                # to its 720-entry cap.
                cycle += 1
                if cycle % 12 == 0:
                    self._log_stability_status()

                time.sleep(5)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                self.error_history.append({
                    'timestamp': datetime.now(),
                    'error': str(e),
                    'traceback': traceback.format_exc()
                })
                time.sleep(10)  # Wait longer on error

    def _collect_system_metrics(self):
        """Collect system performance metrics"""
        try:
            # Memory metrics
            memory = psutil.virtual_memory()
            memory_data = {
                'timestamp': datetime.now(),
                'used_gb': memory.used / (1024**3),
                'available_gb': memory.available / (1024**3),
                'percent': memory.percent
            }
            self.memory_history.append(memory_data)

            # Keep only last 720 entries (1 hour at 5s intervals)
            if len(self.memory_history) > 720:
                self.memory_history = self.memory_history[-720:]
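
            # Hedged addition: virtual_memory() above is system-wide, so a leak
            # in this process can be masked by other workloads on the host.
            # Tracking the current process's RSS as well (psutil.Process() with
            # no pid refers to this interpreter) makes leak trends easier to
            # attribute. Kept at DEBUG so existing log output is unchanged.
            process_rss_gb = psutil.Process().memory_info().rss / (1024**3)
            logger.debug(f"Process RSS: {process_rss_gb:.2f}GB")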

            # CPU metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_data = {
                'timestamp': datetime.now(),
                'percent': cpu_percent,
                'cores': psutil.cpu_count()
            }
            self.cpu_history.append(cpu_data)

            # Keep only last 720 entries
            if len(self.cpu_history) > 720:
                self.cpu_history = self.cpu_history[-720:]

            # Thread metrics
            thread_count = threading.active_count()
            thread_data = {
                'timestamp': datetime.now(),
                'count': thread_count,
                'threads': [t.name for t in threading.enumerate()]
            }
            self.thread_history.append(thread_data)

            # Keep only last 720 entries
            if len(self.thread_history) > 720:
                self.thread_history = self.thread_history[-720:]

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _check_memory_leaks(self):
        """Check for memory leaks"""
        try:
            if len(self.memory_history) < 10:
                return

            # Check if memory usage is consistently increasing
            recent_memory = [m['used_gb'] for m in self.memory_history[-10:]]
            memory_trend = sum(recent_memory[-5:]) / 5 - sum(recent_memory[:5]) / 5

            # If memory increased by more than 100MB in last 10 checks
            if memory_trend > 0.1:
                warning = f"Potential memory leak detected: +{memory_trend:.2f}GB in last 50s"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

                # Force garbage collection
                gc.collect()
                logger.info("Forced garbage collection to free memory")

            # Check for excessive memory usage
            current_memory = self.memory_history[-1]['percent']
            if current_memory > 85:
                critical = f"High memory usage: {current_memory:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

        except Exception as e:
            logger.error(f"Error checking memory leaks: {e}")

    def _check_cpu_usage(self):
        """Check CPU usage patterns"""
        try:
            if len(self.cpu_history) < 10:
                return

            # Check for sustained high CPU usage
            recent_cpu = [c['percent'] for c in self.cpu_history[-10:]]
            avg_cpu = sum(recent_cpu) / len(recent_cpu)

            if avg_cpu > 90:
                critical = f"Sustained high CPU usage: {avg_cpu:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)
            elif avg_cpu > 75:
                warning = f"High CPU usage: {avg_cpu:.1f}%"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking CPU usage: {e}")

    def _check_thread_health(self):
        """Check thread health and detect issues"""
        try:
            if len(self.thread_history) < 5:
                return

            current_threads = self.thread_history[-1]['count']

            # Check for thread explosion
            if current_threads > 50:
                critical = f"Thread explosion detected: {current_threads} active threads"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

                    # Log thread names for debugging
                    thread_names = self.thread_history[-1]['threads']
                    logger.error(f"Active threads: {thread_names}")

            # Check for thread leaks (gradually increasing thread count)
            if len(self.thread_history) >= 10:
                thread_counts = [t['count'] for t in self.thread_history[-10:]]
                thread_trend = sum(thread_counts[-5:]) / 5 - sum(thread_counts[:5]) / 5

                if thread_trend > 2:  # More than 2 threads increase on average
                    warning = f"Potential thread leak: +{thread_trend:.1f} threads in last 50s"
                    if warning not in self.warnings:
                        self.warnings.append(warning)
                        logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking thread health: {e}")

    def _check_for_deadlocks(self):
        """Check for potential deadlocks"""
        try:
            # Simplified placeholder: real deadlock detection is complex and
            # would require analysing what each thread is blocked on (lock
            # ownership, per-thread stack frames), not just counting threads.
            # The public is_alive() API is used here instead of the private
            # Thread._is_stopped attribute.
            live_threads = [t for t in threading.enumerate() if t.is_alive()]
            logger.debug(f"Deadlock check: {len(live_threads)} live threads")
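
            # A hedged sketch of the thread-state analysis mentioned above,
            # assuming it is acceptable to touch sys._current_frames(), which
            # is a CPython-specific introspection hook rather than a stable
            # public API. Logging each thread's current frame at DEBUG level
            # makes it possible to spot threads that sit on the same line
            # across successive checks - classic deadlock candidates.
            for thread_id, frame in sys._current_frames().items():
                top = traceback.extract_stack(frame)[-1]
                logger.debug(
                    f"Thread {thread_id} at {top.filename}:{top.lineno} in {top.name}()"
                )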

        except Exception as e:
            logger.error(f"Error checking for deadlocks: {e}")

    def _update_stability_score(self):
        """Update overall system stability score"""
        try:
            score = 100.0

            # Deduct points for critical issues
            score -= len(self.critical_issues) * 20

            # Deduct points for warnings
            score -= len(self.warnings) * 5

            # Deduct points for recent errors
            recent_errors = [e for e in self.error_history
                             if e['timestamp'] > datetime.now() - timedelta(minutes=10)]
            score -= len(recent_errors) * 10

            # Deduct points for high resource usage
            if self.memory_history:
                current_memory = self.memory_history[-1]['percent']
                if current_memory > 80:
                    score -= (current_memory - 80) * 2

            if self.cpu_history:
                current_cpu = self.cpu_history[-1]['percent']
                if current_cpu > 80:
                    score -= (current_cpu - 80) * 1

            self.stability_score = max(0, score)

        except Exception as e:
            logger.error(f"Error updating stability score: {e}")

    def _log_stability_status(self):
        """Log current stability status"""
        try:
            logger.info("=" * 50)
            logger.info("SYSTEM STABILITY STATUS")
            logger.info("=" * 50)
            logger.info(f"Stability Score: {self.stability_score:.1f}/100")

            if self.memory_history:
                mem = self.memory_history[-1]
                logger.info(f"Memory: {mem['used_gb']:.1f}GB used ({mem['percent']:.1f}%)")

            if self.cpu_history:
                cpu = self.cpu_history[-1]
                logger.info(f"CPU: {cpu['percent']:.1f}%")

            if self.thread_history:
                threads = self.thread_history[-1]
                logger.info(f"Threads: {threads['count']} active")

            if self.critical_issues:
                logger.error(f"Critical Issues ({len(self.critical_issues)}):")
                for issue in self.critical_issues[-5:]:  # Show last 5
                    logger.error(f"  - {issue}")

            if self.warnings:
                logger.warning(f"Warnings ({len(self.warnings)}):")
                for warning in self.warnings[-5:]:  # Show last 5
                    logger.warning(f"  - {warning}")

            logger.info("=" * 50)

        except Exception as e:
            logger.error(f"Error logging stability status: {e}")

    def get_stability_report(self) -> Dict[str, Any]:
        """Get comprehensive stability report"""
        try:
            return {
                'stability_score': self.stability_score,
                'critical_issues': self.critical_issues,
                'warnings': self.warnings,
                'memory_usage': self.memory_history[-1] if self.memory_history else None,
                'cpu_usage': self.cpu_history[-1] if self.cpu_history else None,
                'thread_count': self.thread_history[-1]['count'] if self.thread_history else 0,
                'recent_errors': len([e for e in self.error_history
                                      if e['timestamp'] > datetime.now() - timedelta(minutes=10)]),
                'monitoring_active': self.monitoring_active
            }
        except Exception as e:
            logger.error(f"Error generating stability report: {e}")
            return {'error': str(e)}

    def fix_common_issues(self):
        """Attempt to fix common stability issues"""
        try:
            logger.info("Attempting to fix common stability issues...")

            # Force garbage collection
            gc.collect()
            logger.info("✓ Forced garbage collection")

            # Clear old history to free memory
            if len(self.memory_history) > 360:  # Keep only 30 minutes
                self.memory_history = self.memory_history[-360:]
            if len(self.cpu_history) > 360:
                self.cpu_history = self.cpu_history[-360:]
            if len(self.thread_history) > 360:
                self.thread_history = self.thread_history[-360:]
            logger.info("✓ Cleared old monitoring history")

            # Clear old errors
            cutoff_time = datetime.now() - timedelta(hours=1)
            self.error_history = [e for e in self.error_history if e['timestamp'] > cutoff_time]
            logger.info("✓ Cleared old error history")

            # Reset warnings and critical issues that might be stale
            self.warnings = []
            self.critical_issues = []
            logger.info("✓ Reset stale warnings and critical issues")

            logger.info("Common stability fixes applied")

        except Exception as e:
            logger.error(f"Error fixing common issues: {e}")
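

# A minimal, hedged sketch (not part of the original audit flow): other
# components such as a dashboard could poll the auditor via
# get_stability_report(). The helper below shows one way to persist that
# report; the function name and file path are assumptions for illustration.
def write_stability_report(auditor: SystemStabilityAuditor,
                           path: str = "stability_report.json") -> None:
    """Serialize the current stability report to JSON (datetimes become strings)."""
    import json  # local import keeps the sketch self-contained

    report = auditor.get_stability_report()
    with open(path, "w") as f:
        json.dump(report, f, indent=2, default=str)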
applied") except Exception as e: logger.error(f"Error fixing common issues: {e}") def main(): """Main function for standalone execution""" try: logger.info("Starting System Stability Audit") auditor = SystemStabilityAuditor() auditor.start_monitoring() # Run for 5 minutes then generate report time.sleep(300) report = auditor.get_stability_report() logger.info("FINAL STABILITY REPORT:") logger.info(f"Stability Score: {report['stability_score']:.1f}/100") logger.info(f"Critical Issues: {len(report['critical_issues'])}") logger.info(f"Warnings: {len(report['warnings'])}") # Attempt fixes if needed if report['stability_score'] < 80: auditor.fix_common_issues() auditor.stop_monitoring() except KeyboardInterrupt: logger.info("Audit interrupted by user") except Exception as e: logger.error(f"Error in stability audit: {e}") if __name__ == "__main__": main()