# gogo2/system_stability_audit.py

#!/usr/bin/env python3
"""
System Stability Audit and Monitoring

This script performs a comprehensive audit of the trading system to identify
and fix stability issues, memory leaks, and performance bottlenecks.
"""

import os
import sys
import psutil
import logging
import time
import threading
import gc
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import traceback

# Add project root to path
# The directory containing this file is pushed to the front of sys.path, and
# this must happen before the `core.config` import below.
# NOTE(review): presumably this is what lets `core` resolve when the script is
# launched from another working directory — confirm against the project layout.
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from core.config import setup_logging, get_config

# Setup logging
# This runs at import time, so merely importing this module calls
# setup_logging() and creates the module-level logger.
setup_logging()
logger = logging.getLogger(__name__)

class SystemStabilityAuditor:
    """
    System stability auditor and monitor.

    A background daemon thread periodically samples system-wide memory and
    CPU usage (through psutil) together with the number of live Python
    threads, derives warnings / critical issues from those samples and keeps
    a 0-100 stability score up to date.

    Implemented checks:
    - Memory growth trend and high memory usage
    - Sustained high CPU usage
    - Thread count explosion and thread count growth trend

    Deadlock detection is only a placeholder (see ``_check_for_deadlocks``).

    Public attributes:
    - ``memory_history`` / ``cpu_history`` / ``thread_history``: lists of
      sample dicts, capped at ``MAX_HISTORY_ENTRIES`` entries each.
    - ``error_history``: dicts with ``timestamp``, ``error``, ``traceback``.
    - ``warnings`` / ``critical_issues``: lists of message strings holding at
      most one (latest) message per issue category.
    - ``stability_score``: float in the range 0-100.

    No locking is performed; history lists are trimmed in place so that a
    concurrent ``append`` from the monitoring thread is not lost.
    """

    # Target period of one monitoring cycle, and the wait after a failed one.
    SAMPLE_INTERVAL_SECONDS = 5
    ERROR_BACKOFF_SECONDS = 10
    # 12 cycles * 5s = one status log per 60 seconds.
    STATUS_LOG_EVERY_N_CYCLES = 12
    # 720 samples * 5s = 1 hour of history; 360 samples = 30 minutes.
    MAX_HISTORY_ENTRIES = 720
    COMPACT_HISTORY_ENTRIES = 360
    # Errors newer than this count against the stability score.
    RECENT_ERROR_WINDOW = timedelta(minutes=10)

    def __init__(self):
        """Initialize the stability auditor"""
        self.config = get_config()
        self.monitoring_active = False
        self.monitoring_thread = None
        # Set by stop_monitoring() so the loop wakes up immediately instead
        # of finishing a multi-second sleep.
        self._stop_event = threading.Event()

        # Performance baselines
        self.baseline_memory = psutil.virtual_memory().used
        # psutil documents that the first cpu_percent() call without an
        # interval returns a meaningless 0.0, so take a short blocking sample.
        self.baseline_cpu = psutil.cpu_percent(interval=0.1)

        # Monitoring data
        self.memory_history: List[Dict[str, Any]] = []
        self.cpu_history: List[Dict[str, Any]] = []
        self.thread_history: List[Dict[str, Any]] = []
        self.error_history: List[Dict[str, Any]] = []

        # Stability metrics
        self.stability_score = 100.0
        self.critical_issues: List[str] = []
        self.warnings: List[str] = []

        logger.info("System Stability Auditor initialized")

    # ------------------------------------------------------------------
    # Small shared helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _trim_history(history: List[Any], keep: int) -> None:
        """Drop the oldest entries of ``history`` in place so at most ``keep`` remain."""
        overflow = len(history) - keep
        if overflow > 0:
            del history[:overflow]

    @staticmethod
    def _trend(values: List[float]) -> float:
        """Return mean of the newer half of ``values`` minus mean of the older half.

        ``values`` must contain at least two entries.
        """
        half = len(values) // 2
        return sum(values[-half:]) / half - sum(values[:half]) / half

    @staticmethod
    def _record_issue(issues: List[str], prefix: str, message: str) -> bool:
        """Insert ``message`` into ``issues``, keeping one entry per category.

        Messages embed the current reading (e.g. ``91.3%``), so comparing whole
        strings would treat every new reading as a new issue. The constant
        ``prefix`` identifies the category instead: an existing entry with the
        same prefix is refreshed in place.

        Returns:
            True if the category was not present before (a new issue),
            False if an existing entry was refreshed.
        """
        for index, existing in enumerate(issues):
            if existing.startswith(prefix):
                issues[index] = message
                return False
        issues.append(message)
        return True

    def _record_error(self, context: str, exc: Exception) -> None:
        """Log ``exc`` and remember it in ``error_history``.

        Must be called from inside an ``except`` block so that
        ``traceback.format_exc()`` can see the active exception.
        """
        logger.error(f"{context}: {exc}")
        self.error_history.append({
            'timestamp': datetime.now(),
            'error': str(exc),
            'traceback': traceback.format_exc()
        })
        self._trim_history(self.error_history, self.MAX_HISTORY_ENTRIES)

    def _recent_error_count(self) -> int:
        """Return the number of recorded errors inside ``RECENT_ERROR_WINDOW``."""
        cutoff = datetime.now() - self.RECENT_ERROR_WINDOW
        return sum(1 for entry in self.error_history if entry['timestamp'] > cutoff)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def start_monitoring(self):
        """Start continuous system monitoring in a daemon thread.

        Does nothing (besides logging a warning) when monitoring is already
        active or a previous monitoring thread has not finished yet.
        """
        if self.monitoring_active:
            logger.warning("Monitoring already active")
            return

        # Clearing the stop event while an old thread is still alive would
        # resurrect it, leaving two samplers running.
        previous = self.monitoring_thread
        if previous is not None and previous.is_alive():
            previous.join(timeout=self.SAMPLE_INTERVAL_SECONDS)
            if previous.is_alive():
                logger.warning("Previous monitoring thread is still shutting down; not starting another")
                return

        self._stop_event.clear()
        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(
            target=self._monitoring_loop,
            name="SystemStabilityMonitor",
            daemon=True,
        )
        self.monitoring_thread.start()

        logger.info("System stability monitoring started")

    def stop_monitoring(self):
        """Stop system monitoring and wait up to 5 seconds for the thread to exit."""
        self.monitoring_active = False
        self._stop_event.set()

        worker = self.monitoring_thread
        # A thread cannot join itself (threading raises RuntimeError).
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=5)
            if worker.is_alive():
                logger.warning("Monitoring thread did not stop within 5 seconds")

        logger.info("System stability monitoring stopped")

    def _monitoring_loop(self):
        """Main monitoring loop: sample, run checks, update score, log status."""
        cycles_since_status = 0

        while self.monitoring_active:
            cycle_started = time.monotonic()
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Run the individual checks
                self._check_memory_leaks()
                self._check_cpu_usage()
                self._check_thread_health()
                self._check_for_deadlocks()

                # Update stability score
                self._update_stability_score()

                # Count cycles rather than inspecting the history length: the
                # histories are capped, so their length stops changing.
                cycles_since_status += 1
                if cycles_since_status >= self.STATUS_LOG_EVERY_N_CYCLES:
                    cycles_since_status = 0
                    self._log_stability_status()

                # Subtract the time the cycle itself took (the CPU sample
                # alone blocks for 1s) so samples stay ~5s apart.
                elapsed = time.monotonic() - cycle_started
                delay = max(0.0, self.SAMPLE_INTERVAL_SECONDS - elapsed)

            except Exception as e:
                self._record_error("Error in monitoring loop", e)
                delay = self.ERROR_BACKOFF_SECONDS  # Wait longer on error

            # Event.wait returns True as soon as stop_monitoring() sets it.
            if self._stop_event.wait(delay):
                break

    # ------------------------------------------------------------------
    # Sampling and checks
    # ------------------------------------------------------------------

    def _collect_system_metrics(self):
        """Append one memory, CPU and thread sample to the histories.

        The CPU sample uses ``psutil.cpu_percent(interval=1)`` and therefore
        blocks for about one second.
        """
        try:
            # Memory metrics (system-wide, as reported by psutil)
            memory = psutil.virtual_memory()
            self.memory_history.append({
                'timestamp': datetime.now(),
                'used_gb': memory.used / (1024**3),
                'available_gb': memory.available / (1024**3),
                'percent': memory.percent
            })
            self._trim_history(self.memory_history, self.MAX_HISTORY_ENTRIES)

            # CPU metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            self.cpu_history.append({
                'timestamp': datetime.now(),
                'percent': cpu_percent,
                'cores': psutil.cpu_count()
            })
            self._trim_history(self.cpu_history, self.MAX_HISTORY_ENTRIES)

            # Thread metrics (live threading.Thread objects)
            self.thread_history.append({
                'timestamp': datetime.now(),
                'count': threading.active_count(),
                'threads': [t.name for t in threading.enumerate()]
            })
            self._trim_history(self.thread_history, self.MAX_HISTORY_ENTRIES)

        except Exception as e:
            self._record_error("Error collecting system metrics", e)

    def _check_memory_leaks(self):
        """Flag a rising memory trend (warning) and usage above 85% (critical).

        Needs at least 10 memory samples; does nothing before that.
        """
        try:
            if len(self.memory_history) < 10:
                return

            # Compare the average of the newest 5 samples with the 5 before.
            recent_memory = [m['used_gb'] for m in self.memory_history[-10:]]
            memory_trend = self._trend(recent_memory)

            # If memory increased by more than ~100MB across the window
            if memory_trend > 0.1:
                warning = f"Potential memory leak detected: +{memory_trend:.2f}GB in last 50s"
                if self._record_issue(self.warnings, "Potential memory leak detected:", warning):
                    logger.warning(warning)

                    # Force garbage collection
                    gc.collect()
                    logger.info("Forced garbage collection to free memory")

            # Check for excessive memory usage
            current_memory = self.memory_history[-1]['percent']
            if current_memory > 85:
                critical = f"High memory usage: {current_memory:.1f}%"
                if self._record_issue(self.critical_issues, "High memory usage:", critical):
                    logger.error(critical)

        except Exception as e:
            self._record_error("Error checking memory leaks", e)

    def _check_cpu_usage(self):
        """Flag average CPU over the last 10 samples: >90% critical, >75% warning."""
        try:
            if len(self.cpu_history) < 10:
                return

            recent_cpu = [c['percent'] for c in self.cpu_history[-10:]]
            avg_cpu = sum(recent_cpu) / len(recent_cpu)

            if avg_cpu > 90:
                critical = f"Sustained high CPU usage: {avg_cpu:.1f}%"
                if self._record_issue(self.critical_issues, "Sustained high CPU usage:", critical):
                    logger.error(critical)
            elif avg_cpu > 75:
                warning = f"High CPU usage: {avg_cpu:.1f}%"
                if self._record_issue(self.warnings, "High CPU usage:", warning):
                    logger.warning(warning)

        except Exception as e:
            self._record_error("Error checking CPU usage", e)

    def _check_thread_health(self):
        """Flag more than 50 live threads (critical) and a growing count (warning).

        Needs at least 5 thread samples for the absolute check and 10 for the
        trend check.
        """
        try:
            if len(self.thread_history) < 5:
                return

            latest = self.thread_history[-1]
            current_threads = latest['count']

            # Check for thread explosion
            if current_threads > 50:
                critical = f"Thread explosion detected: {current_threads} active threads"
                if self._record_issue(self.critical_issues, "Thread explosion detected:", critical):
                    logger.error(critical)

                    # Log thread names for debugging
                    logger.error(f"Active threads: {latest['threads']}")

            # Check for thread leaks (gradually increasing thread count)
            if len(self.thread_history) >= 10:
                thread_counts = [t['count'] for t in self.thread_history[-10:]]
                thread_trend = self._trend(thread_counts)

                if thread_trend > 2:  # More than 2 threads increase on average
                    warning = f"Potential thread leak: +{thread_trend:.1f} threads in last 50s"
                    if self._record_issue(self.warnings, "Potential thread leak:", warning):
                        logger.warning(warning)

        except Exception as e:
            self._record_error("Error checking thread health", e)

    def _check_for_deadlocks(self):
        """Placeholder for deadlock detection; currently performs no check.

        The standard ``threading`` API does not expose whether a live thread is
        blocked, so nothing is inspected here. The method is kept so the
        monitoring loop has a stable hook for a future implementation.
        """
        return None

    def _update_stability_score(self):
        """Recompute ``stability_score`` (0-100) from the current state.

        Deductions: 20 per critical issue, 5 per warning, 10 per error in the
        last 10 minutes, 2 per memory percentage point above 80 and 1 per CPU
        percentage point above 80 (latest samples).
        """
        try:
            score = 100.0

            score -= len(self.critical_issues) * 20
            score -= len(self.warnings) * 5
            score -= self._recent_error_count() * 10

            # Deduct points for high resource usage
            if self.memory_history:
                current_memory = self.memory_history[-1]['percent']
                if current_memory > 80:
                    score -= (current_memory - 80) * 2

            if self.cpu_history:
                current_cpu = self.cpu_history[-1]['percent']
                if current_cpu > 80:
                    score -= (current_cpu - 80) * 1

            self.stability_score = max(0.0, score)

        except Exception as e:
            self._record_error("Error updating stability score", e)

    # ------------------------------------------------------------------
    # Reporting and maintenance
    # ------------------------------------------------------------------

    def _log_stability_status(self):
        """Log the score, latest samples and the last 5 issues of each kind."""
        try:
            logger.info("=" * 50)
            logger.info("SYSTEM STABILITY STATUS")
            logger.info("=" * 50)
            logger.info(f"Stability Score: {self.stability_score:.1f}/100")

            if self.memory_history:
                mem = self.memory_history[-1]
                logger.info(f"Memory: {mem['used_gb']:.1f}GB used ({mem['percent']:.1f}%)")

            if self.cpu_history:
                cpu = self.cpu_history[-1]
                logger.info(f"CPU: {cpu['percent']:.1f}%")

            if self.thread_history:
                threads = self.thread_history[-1]
                logger.info(f"Threads: {threads['count']} active")

            if self.critical_issues:
                logger.error(f"Critical Issues ({len(self.critical_issues)}):")
                for issue in self.critical_issues[-5:]:  # Show last 5
                    logger.error(f"  - {issue}")

            if self.warnings:
                logger.warning(f"Warnings ({len(self.warnings)}):")
                for warning in self.warnings[-5:]:  # Show last 5
                    logger.warning(f"  - {warning}")

            logger.info("=" * 50)

        except Exception as e:
            logger.error(f"Error logging stability status: {e}")

    def get_stability_report(self) -> Dict[str, Any]:
        """Get a snapshot of the current stability state.

        Returns:
            A dict with ``stability_score``, ``critical_issues``, ``warnings``,
            ``memory_usage`` / ``cpu_usage`` (latest sample dict or None),
            ``thread_count``, ``recent_errors`` and ``monitoring_active``.
            Lists and sample dicts are copies, so later monitoring activity
            does not change a report that was already handed out.
            On failure, ``{'error': <message>}`` is returned instead.
        """
        try:
            return {
                'stability_score': self.stability_score,
                'critical_issues': list(self.critical_issues),
                'warnings': list(self.warnings),
                'memory_usage': dict(self.memory_history[-1]) if self.memory_history else None,
                'cpu_usage': dict(self.cpu_history[-1]) if self.cpu_history else None,
                'thread_count': self.thread_history[-1]['count'] if self.thread_history else 0,
                'recent_errors': self._recent_error_count(),
                'monitoring_active': self.monitoring_active
            }
        except Exception as e:
            logger.error(f"Error generating stability report: {e}")
            return {'error': str(e)}

    def fix_common_issues(self):
        """Attempt to fix common stability issues.

        Runs a garbage collection, shrinks the sample histories to the last
        30 minutes, drops errors older than one hour and resets the warning
        and critical issue lists.
        """
        try:
            logger.info("Attempting to fix common stability issues...")

            # Force garbage collection
            gc.collect()
            logger.info("✓ Forced garbage collection")

            # Clear old history to free memory (trimmed in place so samples
            # appended concurrently by the monitoring thread are kept)
            for history in (self.memory_history, self.cpu_history, self.thread_history):
                self._trim_history(history, self.COMPACT_HISTORY_ENTRIES)

            logger.info("✓ Cleared old monitoring history")

            # Clear old errors
            cutoff_time = datetime.now() - timedelta(hours=1)
            self.error_history = [e for e in self.error_history if e['timestamp'] > cutoff_time]
            logger.info("✓ Cleared old error history")

            # Reset warnings and critical issues that might be stale
            self.warnings = []
            self.critical_issues = []
            logger.info("✓ Reset stale warnings and critical issues")

            logger.info("Common stability fixes applied")

        except Exception as e:
            logger.error(f"Error fixing common issues: {e}")

def main(duration_seconds: float = 300):
    """Run a standalone stability audit and log a final report.

    Starts a ``SystemStabilityAuditor``, lets it monitor for
    ``duration_seconds``, logs the resulting report and applies
    ``fix_common_issues()`` when the stability score is below 80.

    Args:
        duration_seconds: How long to monitor before reporting. Defaults to
            300 (5 minutes), the previously hard-coded duration.

    All exceptions are logged rather than propagated; the monitor is stopped
    on every exit path, including Ctrl-C.
    """
    auditor = None
    try:
        logger.info("Starting System Stability Audit")

        auditor = SystemStabilityAuditor()
        auditor.start_monitoring()

        # Let the monitor gather samples, then generate the report
        time.sleep(duration_seconds)

        report = auditor.get_stability_report()
        # get_stability_report() returns {'error': ...} when it fails; handle
        # that explicitly instead of tripping over a missing key below.
        if 'error' in report:
            logger.error(f"Could not generate stability report: {report['error']}")
            return

        logger.info("FINAL STABILITY REPORT:")
        logger.info(f"Stability Score: {report['stability_score']:.1f}/100")
        logger.info(f"Critical Issues: {len(report['critical_issues'])}")
        logger.info(f"Warnings: {len(report['warnings'])}")

        # Attempt fixes if needed
        if report['stability_score'] < 80:
            auditor.fix_common_issues()

    except KeyboardInterrupt:
        logger.info("Audit interrupted by user")
    except Exception as e:
        logger.error(f"Error in stability audit: {e}")
    finally:
        # auditor is None when construction itself failed.
        if auditor is not None:
            auditor.stop_monitoring()

# Run the audit only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()