Files
gogo2/system_stability_audit.py
2025-07-22 15:44:59 +03:00

426 lines
16 KiB
Python

#!/usr/bin/env python3
"""
System Stability Audit and Monitoring
This script performs a comprehensive audit of the trading system to identify
and fix stability issues, memory leaks, and performance bottlenecks.
"""
import os
import sys
import psutil
import logging
import time
import threading
import gc
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import traceback
# Put the directory containing this script at the front of sys.path so the
# `core` package imported below can be found when the script is run directly
# (presumably `core/` lives next to this file - confirm against repo layout).
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))
# NOTE: this import must stay below the sys.path tweak above.
from core.config import setup_logging, get_config
# Configure logging via the project helper (its exact handlers/levels are
# defined in core.config and are not visible here), then create the
# module-level logger used by everything in this file.
setup_logging()
logger = logging.getLogger(__name__)
class SystemStabilityAuditor:
    """
    System stability auditor and monitor.

    A daemon thread periodically samples system-wide memory usage, CPU usage
    and the number of Python threads in this process (via ``psutil`` and
    ``threading``), records warnings / critical issues, and derives a
    stability score between 0 and 100.

    Implemented checks:
    - Memory usage and upward memory trend (possible leaks)
    - Sustained CPU usage
    - Thread count (explosion / gradual growth)

    Deadlock detection is only a placeholder (see ``_check_for_deadlocks``).

    Public state read by callers:
    - ``stability_score``: float in [0, 100]
    - ``critical_issues`` / ``warnings``: lists of human-readable messages,
      at most one entry per issue category (latest message wins)
    - ``memory_history`` / ``cpu_history`` / ``thread_history``: bounded
      lists of sample dicts
    - ``error_history``: bounded list of monitoring-loop failures

    No locking is used; the lists are shared between the monitoring thread
    and whoever calls the public methods.
    """

    # Cadence and retention settings for the monitoring loop.
    CHECK_INTERVAL_SECONDS = 5.0   # target time between two samples
    ERROR_BACKOFF_SECONDS = 10.0   # pause after a failed iteration
    STATUS_LOG_EVERY = 12          # iterations between status logs (~60s)
    MAX_HISTORY = 720              # samples kept per metric (~1h at 5s)
    TRIMMED_HISTORY = 360          # samples kept by fix_common_issues (~30min)
    TREND_WINDOW = 10              # samples used for trend detection (~50s)
    RECENT_ERROR_WINDOW = timedelta(minutes=10)

    def __init__(self):
        """Initialize the stability auditor and capture resource baselines."""
        self.config = get_config()
        self.monitoring_active = False
        self.monitoring_thread: Optional[threading.Thread] = None
        # Lets stop_monitoring() wake the loop immediately instead of
        # waiting for a time.sleep() to finish.
        self._stop_event = threading.Event()
        # Completed loop iterations; drives the periodic status log.
        self._iterations = 0

        # Performance baselines
        self.baseline_memory = psutil.virtual_memory().used
        # cpu_percent() without an interval returns a meaningless 0.0 on its
        # first call, so measure over a short blocking interval instead.
        self.baseline_cpu = psutil.cpu_percent(interval=0.1)

        # Monitoring data
        self.memory_history: List[Dict[str, Any]] = []
        self.cpu_history: List[Dict[str, Any]] = []
        self.thread_history: List[Dict[str, Any]] = []
        self.error_history: List[Dict[str, Any]] = []

        # Stability metrics
        self.stability_score = 100.0
        self.critical_issues: List[str] = []
        self.warnings: List[str] = []

        logger.info("System Stability Auditor initialized")

    # ------------------------------------------------------------------
    # Small pure helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _trim(history: List[Any], limit: int) -> None:
        """Drop the oldest entries in place so at most ``limit`` remain."""
        overflow = len(history) - limit
        if overflow > 0:
            del history[:overflow]

    @staticmethod
    def _append_bounded(history: List[Any], entry: Any, limit: int) -> None:
        """Append ``entry`` and keep ``history`` at most ``limit`` long."""
        history.append(entry)
        SystemStabilityAuditor._trim(history, limit)

    @staticmethod
    def _half_window_trend(values: List[float]) -> float:
        """
        Return mean(second half) - mean(first half) of ``values``.

        ``values`` must contain at least two items; callers pass exactly
        ``TREND_WINDOW`` samples.
        """
        half = len(values) // 2
        return sum(values[-half:]) / half - sum(values[:half]) / half

    @staticmethod
    def _record_issue(issues: List[str], message: str) -> bool:
        """
        Store ``message`` in ``issues`` with one entry per category.

        The category is the text before the first ``:`` in the message
        (e.g. ``"High CPU usage"``). Because the messages embed changing
        measurements, comparing whole strings would never de-duplicate and
        the list would grow on every check. If an entry of the same category
        already exists it is replaced by the newer message.

        Returns:
            True if the category was new (worth logging), False if an
            existing entry was refreshed.
        """
        category = message.partition(":")[0]
        for index, existing in enumerate(issues):
            if existing.partition(":")[0] == category:
                issues[index] = message
                return False
        issues.append(message)
        return True

    def _count_recent_errors(self) -> int:
        """Count monitoring-loop errors newer than ``RECENT_ERROR_WINDOW``."""
        cutoff = datetime.now() - self.RECENT_ERROR_WINDOW
        return sum(1 for err in self.error_history if err['timestamp'] > cutoff)

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    def start_monitoring(self):
        """Start continuous system monitoring in a daemon thread."""
        if self.monitoring_active:
            logger.warning("Monitoring already active")
            return
        self._stop_event.clear()
        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(
            target=self._monitoring_loop,
            name="SystemStabilityMonitor",
            daemon=True,
        )
        self.monitoring_thread.start()
        logger.info("System stability monitoring started")

    def stop_monitoring(self):
        """Stop system monitoring and wait up to 5 seconds for the thread."""
        self.monitoring_active = False
        # Wake the loop if it is currently waiting between iterations.
        self._stop_event.set()
        if self.monitoring_thread is not None:
            self.monitoring_thread.join(timeout=5)
        logger.info("System stability monitoring stopped")

    def _monitoring_loop(self):
        """
        Main monitoring loop; runs until ``monitoring_active`` is cleared.

        Each iteration collects metrics, runs all checks and refreshes the
        stability score. A failing iteration is recorded in
        ``error_history`` and followed by a longer pause.
        """
        while self.monitoring_active:
            started = time.monotonic()
            try:
                self._collect_system_metrics()
                self._check_memory_leaks()
                self._check_cpu_usage()
                self._check_thread_health()
                self._check_for_deadlocks()
                self._update_stability_score()

                # Count iterations rather than len(memory_history): the
                # history is capped, so its length stops changing after an
                # hour and a modulo on it would fire on every iteration.
                self._iterations += 1
                if self._iterations % self.STATUS_LOG_EVERY == 0:
                    self._log_stability_status()

                # Subtract the time spent on the checks (the CPU sample
                # alone blocks ~1s) so samples stay ~CHECK_INTERVAL apart.
                elapsed = time.monotonic() - started
                pause = max(0.0, self.CHECK_INTERVAL_SECONDS - elapsed)
            except Exception as e:
                logger.exception("Error in monitoring loop: %s", e)
                self._append_bounded(
                    self.error_history,
                    {
                        'timestamp': datetime.now(),
                        'error': str(e),
                        'traceback': traceback.format_exc(),
                    },
                    self.MAX_HISTORY,
                )
                pause = self.ERROR_BACKOFF_SECONDS  # Wait longer on error

            # Interruptible wait: returns True as soon as stop is requested.
            if self._stop_event.wait(pause):
                break

    # ------------------------------------------------------------------
    # Sampling and checks
    # ------------------------------------------------------------------
    def _collect_system_metrics(self):
        """Sample memory, CPU and thread metrics into the history lists."""
        try:
            # Memory metrics (system-wide, not just this process)
            memory = psutil.virtual_memory()
            self._append_bounded(
                self.memory_history,
                {
                    'timestamp': datetime.now(),
                    'used_gb': memory.used / (1024**3),
                    'available_gb': memory.available / (1024**3),
                    'percent': memory.percent,
                },
                self.MAX_HISTORY,
            )

            # CPU metrics; interval=1 blocks for one second while measuring
            cpu_percent = psutil.cpu_percent(interval=1)
            self._append_bounded(
                self.cpu_history,
                {
                    'timestamp': datetime.now(),
                    'percent': cpu_percent,
                    'cores': psutil.cpu_count(),
                },
                self.MAX_HISTORY,
            )

            # Thread metrics for the current Python process
            self._append_bounded(
                self.thread_history,
                {
                    'timestamp': datetime.now(),
                    'count': threading.active_count(),
                    'threads': [t.name for t in threading.enumerate()],
                },
                self.MAX_HISTORY,
            )
        except Exception as e:
            logger.exception("Error collecting system metrics: %s", e)

    def _check_memory_leaks(self):
        """Flag a rising memory trend (warning) and >85% usage (critical)."""
        try:
            if len(self.memory_history) < self.TREND_WINDOW:
                return

            # Compare the average of the newer half of the window with the
            # older half; a rise of more than 0.1 GB is treated as a leak.
            recent_memory = [
                m['used_gb'] for m in self.memory_history[-self.TREND_WINDOW:]
            ]
            memory_trend = self._half_window_trend(recent_memory)
            if memory_trend > 0.1:
                warning = f"Potential memory leak detected: +{memory_trend:.2f}GB in last 50s"
                if self._record_issue(self.warnings, warning):
                    logger.warning(warning)
                # Force garbage collection
                gc.collect()
                logger.info("Forced garbage collection to free memory")

            # Check for excessive memory usage
            current_memory = self.memory_history[-1]['percent']
            if current_memory > 85:
                critical = f"High memory usage: {current_memory:.1f}%"
                if self._record_issue(self.critical_issues, critical):
                    logger.error(critical)
        except Exception as e:
            logger.exception("Error checking memory leaks: %s", e)

    def _check_cpu_usage(self):
        """Flag sustained CPU load: >90% critical, >75% warning."""
        try:
            if len(self.cpu_history) < self.TREND_WINDOW:
                return

            recent_cpu = [
                c['percent'] for c in self.cpu_history[-self.TREND_WINDOW:]
            ]
            avg_cpu = sum(recent_cpu) / len(recent_cpu)
            if avg_cpu > 90:
                critical = f"Sustained high CPU usage: {avg_cpu:.1f}%"
                if self._record_issue(self.critical_issues, critical):
                    logger.error(critical)
            elif avg_cpu > 75:
                warning = f"High CPU usage: {avg_cpu:.1f}%"
                if self._record_issue(self.warnings, warning):
                    logger.warning(warning)
        except Exception as e:
            logger.exception("Error checking CPU usage: %s", e)

    def _check_thread_health(self):
        """Flag >50 active threads (critical) and a rising count (warning)."""
        try:
            if len(self.thread_history) < 5:
                return

            latest = self.thread_history[-1]
            current_threads = latest['count']

            # Check for thread explosion
            if current_threads > 50:
                critical = f"Thread explosion detected: {current_threads} active threads"
                if self._record_issue(self.critical_issues, critical):
                    logger.error(critical)
                    # Log thread names for debugging
                    logger.error(f"Active threads: {latest['threads']}")

            # Check for thread leaks (gradually increasing thread count)
            if len(self.thread_history) >= self.TREND_WINDOW:
                thread_counts = [
                    t['count'] for t in self.thread_history[-self.TREND_WINDOW:]
                ]
                thread_trend = self._half_window_trend(thread_counts)
                if thread_trend > 2:  # More than 2 threads increase on average
                    warning = f"Potential thread leak: +{thread_trend:.1f} threads in last 50s"
                    if self._record_issue(self.warnings, warning):
                        logger.warning(warning)
        except Exception as e:
            logger.exception("Error checking thread health: %s", e)

    def _check_for_deadlocks(self):
        """
        Placeholder for deadlock detection; currently performs no checks.

        The previous body iterated over the live threads and inspected a
        private ``threading.Thread`` attribute without acting on the result.
        Real deadlock detection needs lock-ownership / thread-state analysis
        and is not implemented here.
        """
        return None

    def _update_stability_score(self):
        """
        Recompute ``stability_score`` (float, clamped to a minimum of 0).

        Starting from 100: -20 per critical issue, -5 per warning, -10 per
        monitoring error in the last 10 minutes, -2 per memory percentage
        point above 80 and -1 per CPU percentage point above 80.
        """
        try:
            score = 100.0
            score -= len(self.critical_issues) * 20
            score -= len(self.warnings) * 5
            score -= self._count_recent_errors() * 10

            # Deduct points for high resource usage
            if self.memory_history:
                current_memory = self.memory_history[-1]['percent']
                if current_memory > 80:
                    score -= (current_memory - 80) * 2
            if self.cpu_history:
                current_cpu = self.cpu_history[-1]['percent']
                if current_cpu > 80:
                    score -= (current_cpu - 80) * 1

            # 0.0 (not 0) so the score is always a float.
            self.stability_score = max(0.0, score)
        except Exception as e:
            logger.exception("Error updating stability score: %s", e)

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def _log_stability_status(self):
        """Log the score, latest samples and up to 5 issues per severity."""
        try:
            logger.info("=" * 50)
            logger.info("SYSTEM STABILITY STATUS")
            logger.info("=" * 50)
            logger.info(f"Stability Score: {self.stability_score:.1f}/100")

            if self.memory_history:
                mem = self.memory_history[-1]
                logger.info(f"Memory: {mem['used_gb']:.1f}GB used ({mem['percent']:.1f}%)")
            if self.cpu_history:
                cpu = self.cpu_history[-1]
                logger.info(f"CPU: {cpu['percent']:.1f}%")
            if self.thread_history:
                threads = self.thread_history[-1]
                logger.info(f"Threads: {threads['count']} active")

            if self.critical_issues:
                logger.error(f"Critical Issues ({len(self.critical_issues)}):")
                for issue in self.critical_issues[-5:]:  # Show last 5
                    logger.error(f" - {issue}")
            if self.warnings:
                logger.warning(f"Warnings ({len(self.warnings)}):")
                for warning in self.warnings[-5:]:  # Show last 5
                    logger.warning(f" - {warning}")

            logger.info("=" * 50)
        except Exception as e:
            logger.exception("Error logging stability status: %s", e)

    def get_stability_report(self) -> Dict[str, Any]:
        """
        Return a snapshot of the current stability state.

        Returns:
            Dict with keys ``stability_score``, ``critical_issues``,
            ``warnings``, ``memory_usage``, ``cpu_usage``, ``thread_count``,
            ``recent_errors`` and ``monitoring_active``. The issue lists are
            copies, so later monitoring activity does not change a report
            already handed out. On failure returns ``{'error': <message>}``.
        """
        try:
            return {
                'stability_score': self.stability_score,
                'critical_issues': list(self.critical_issues),
                'warnings': list(self.warnings),
                'memory_usage': self.memory_history[-1] if self.memory_history else None,
                'cpu_usage': self.cpu_history[-1] if self.cpu_history else None,
                'thread_count': self.thread_history[-1]['count'] if self.thread_history else 0,
                'recent_errors': self._count_recent_errors(),
                'monitoring_active': self.monitoring_active,
            }
        except Exception as e:
            logger.exception("Error generating stability report: %s", e)
            return {'error': str(e)}

    def fix_common_issues(self):
        """
        Apply generic clean-up: run the garbage collector, shrink the metric
        histories to ``TRIMMED_HISTORY`` samples, drop errors older than one
        hour and reset the recorded warnings / critical issues.
        """
        try:
            logger.info("Attempting to fix common stability issues...")

            # Force garbage collection
            gc.collect()
            logger.info("✓ Forced garbage collection")

            # Clear old history to free memory. Trimming in place means a
            # sample appended concurrently by the monitor thread is not lost
            # to a list reassignment.
            for history in (self.memory_history, self.cpu_history, self.thread_history):
                self._trim(history, self.TRIMMED_HISTORY)
            logger.info("✓ Cleared old monitoring history")

            # Clear old errors
            cutoff_time = datetime.now() - timedelta(hours=1)
            self.error_history[:] = [
                e for e in self.error_history if e['timestamp'] > cutoff_time
            ]
            logger.info("✓ Cleared old error history")

            # Reset warnings and critical issues that might be stale
            self.warnings = []
            self.critical_issues = []
            logger.info("✓ Reset stale warnings and critical issues")

            logger.info("Common stability fixes applied")
        except Exception as e:
            logger.exception("Error fixing common issues: %s", e)
def main(duration_seconds: float = 300):
    """
    Run a standalone stability audit.

    Starts the auditor, lets it monitor for ``duration_seconds`` (default
    300, i.e. 5 minutes), logs a final report and applies the generic fixes
    when the stability score is below 80. Monitoring is always stopped on
    the way out, including after Ctrl-C or an unexpected error.

    Args:
        duration_seconds: How long to monitor before generating the report.
    """
    auditor = None
    try:
        logger.info("Starting System Stability Audit")
        auditor = SystemStabilityAuditor()
        auditor.start_monitoring()

        # Let the monitor gather data, then generate the report
        time.sleep(duration_seconds)
        report = auditor.get_stability_report()

        # get_stability_report() returns {'error': ...} when it fails; say
        # so explicitly instead of tripping over a missing key below.
        if 'error' in report:
            logger.error(f"Stability report unavailable: {report['error']}")
            return

        logger.info("FINAL STABILITY REPORT:")
        logger.info(f"Stability Score: {report['stability_score']:.1f}/100")
        logger.info(f"Critical Issues: {len(report['critical_issues'])}")
        logger.info(f"Warnings: {len(report['warnings'])}")

        # Attempt fixes if needed
        if report['stability_score'] < 80:
            auditor.fix_common_issues()
    except KeyboardInterrupt:
        logger.info("Audit interrupted by user")
    except Exception as e:
        logger.error(f"Error in stability audit: {e}")
    finally:
        # Previously the monitor was only stopped on the success path.
        if auditor is not None:
            auditor.stop_monitoring()


if __name__ == "__main__":
    main()