#!/usr/bin/env python3
"""
System Stability Audit and Monitoring

This script performs a comprehensive audit of the trading system to identify
and fix stability issues, memory leaks, and performance bottlenecks.
"""

import os
import sys
import psutil
import logging
import time
import threading
import gc
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import traceback

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from core.config import setup_logging, get_config

# Setup logging
setup_logging()
logger = logging.getLogger(__name__)

class SystemStabilityAuditor:
    """
    Comprehensive system stability auditor and monitor

    Monitors:
    - Memory usage and leaks
    - CPU usage and performance
    - Thread health and deadlocks
    - Model performance and stability
    - Dashboard responsiveness
    - Data provider health
    """

    def __init__(self):
        """Initialize the stability auditor"""
        self.config = get_config()
        self.monitoring_active = False
        self.monitoring_thread = None

        # Performance baselines
        self.baseline_memory = psutil.virtual_memory().used
        self.baseline_cpu = psutil.cpu_percent()

        # Monitoring data
        self.memory_history = []
        self.cpu_history = []
        self.thread_history = []
        self.error_history = []

        # Stability metrics
        self.stability_score = 100.0
        self.critical_issues = []
        self.warnings = []

        logger.info("System Stability Auditor initialized")

    def start_monitoring(self):
        """Start continuous system monitoring"""
        if self.monitoring_active:
            logger.warning("Monitoring already active")
            return

        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
        self.monitoring_thread.start()

        logger.info("System stability monitoring started")

    def stop_monitoring(self):
        """Stop system monitoring"""
        self.monitoring_active = False
        if self.monitoring_thread:
            self.monitoring_thread.join(timeout=5)

        logger.info("System stability monitoring stopped")

    def _monitoring_loop(self):
        """Main monitoring loop"""
        while self.monitoring_active:
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Check for memory leaks
                self._check_memory_leaks()

                # Check CPU usage
                self._check_cpu_usage()

                # Check thread health
                self._check_thread_health()

                # Check for deadlocks
                self._check_for_deadlocks()

                # Update stability score
                self._update_stability_score()

                # Log status every 60 seconds
                if len(self.memory_history) % 12 == 0:  # Every 12 * 5s = 60s
                    self._log_stability_status()

                time.sleep(5)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                self.error_history.append({
                    'timestamp': datetime.now(),
                    'error': str(e),
                    'traceback': traceback.format_exc()
                })
                time.sleep(10)  # Wait longer on error

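    # Note on sampling cadence: psutil.cpu_percent(interval=1) in the method
    # below blocks for one second while it samples, so combined with the
    # 5-second sleep in _monitoring_loop the effective sampling period is
    # closer to 6 seconds than 5.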
    def _collect_system_metrics(self):
        """Collect system performance metrics"""
        try:
            # Memory metrics
            memory = psutil.virtual_memory()
            memory_data = {
                'timestamp': datetime.now(),
                'used_gb': memory.used / (1024**3),
                'available_gb': memory.available / (1024**3),
                'percent': memory.percent
            }
            self.memory_history.append(memory_data)

            # Keep only last 720 entries (1 hour at 5s intervals)
            if len(self.memory_history) > 720:
                self.memory_history = self.memory_history[-720:]

            # CPU metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_data = {
                'timestamp': datetime.now(),
                'percent': cpu_percent,
                'cores': psutil.cpu_count()
            }
            self.cpu_history.append(cpu_data)

            # Keep only last 720 entries
            if len(self.cpu_history) > 720:
                self.cpu_history = self.cpu_history[-720:]

            # Thread metrics
            thread_count = threading.active_count()
            thread_data = {
                'timestamp': datetime.now(),
                'count': thread_count,
                'threads': [t.name for t in threading.enumerate()]
            }
            self.thread_history.append(thread_data)

            # Keep only last 720 entries
            if len(self.thread_history) > 720:
                self.thread_history = self.thread_history[-720:]

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

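    # Leak heuristic used below: compare the mean of the 5 most recent memory
    # samples against the mean of the 5 samples before them. With the 5-second
    # sampling cadence that window covers roughly the last 50 seconds, and the
    # 0.1 threshold is in GB (the history stores used_gb), i.e. roughly a
    # 100 MB rise. This is a coarse trend check, not proof of a leak.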
    def _check_memory_leaks(self):
        """Check for memory leaks"""
        try:
            if len(self.memory_history) < 10:
                return

            # Check if memory usage is consistently increasing
            recent_memory = [m['used_gb'] for m in self.memory_history[-10:]]
            memory_trend = sum(recent_memory[-5:]) / 5 - sum(recent_memory[:5]) / 5

            # If memory increased by more than 100MB in last 10 checks
            if memory_trend > 0.1:
                warning = f"Potential memory leak detected: +{memory_trend:.2f}GB in last 50s"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

                # Force garbage collection
                gc.collect()
                logger.info("Forced garbage collection to free memory")

            # Check for excessive memory usage
            current_memory = self.memory_history[-1]['percent']
            if current_memory > 85:
                critical = f"High memory usage: {current_memory:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

        except Exception as e:
            logger.error(f"Error checking memory leaks: {e}")

    def _check_cpu_usage(self):
        """Check CPU usage patterns"""
        try:
            if len(self.cpu_history) < 10:
                return

            # Check for sustained high CPU usage
            recent_cpu = [c['percent'] for c in self.cpu_history[-10:]]
            avg_cpu = sum(recent_cpu) / len(recent_cpu)

            if avg_cpu > 90:
                critical = f"Sustained high CPU usage: {avg_cpu:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)
            elif avg_cpu > 75:
                warning = f"High CPU usage: {avg_cpu:.1f}%"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking CPU usage: {e}")

    def _check_thread_health(self):
        """Check thread health and detect issues"""
        try:
            if len(self.thread_history) < 5:
                return

            current_threads = self.thread_history[-1]['count']

            # Check for thread explosion
            if current_threads > 50:
                critical = f"Thread explosion detected: {current_threads} active threads"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

                # Log thread names for debugging
                thread_names = self.thread_history[-1]['threads']
                logger.error(f"Active threads: {thread_names}")

            # Check for thread leaks (gradually increasing thread count)
            if len(self.thread_history) >= 10:
                thread_counts = [t['count'] for t in self.thread_history[-10:]]
                thread_trend = sum(thread_counts[-5:]) / 5 - sum(thread_counts[:5]) / 5

                if thread_trend > 2:  # More than 2 threads increase on average
                    warning = f"Potential thread leak: +{thread_trend:.1f} threads in last 50s"
                    if warning not in self.warnings:
                        self.warnings.append(warning)
                        logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking thread health: {e}")

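    # _check_for_deadlocks below is intentionally a lightweight placeholder.
    # One possible extension (not implemented here) would be to snapshot
    # sys._current_frames() periodically and flag threads whose top stack
    # frame has not changed across several samples; that needs extra
    # bookkeeping and is left out of this audit script.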
    def _check_for_deadlocks(self):
        """Check for potential deadlocks"""
        try:
            # Simple deadlock detection based on thread states
            all_threads = threading.enumerate()
            blocked_threads = []

            for thread in all_threads:
                if hasattr(thread, '_is_stopped') and not thread._is_stopped:
                    # Thread is running but might be blocked
                    # This is a simplified check - real deadlock detection is complex
                    pass

            # For now, just check if we have threads that haven't been active
            # More sophisticated deadlock detection would require thread state analysis

        except Exception as e:
            logger.error(f"Error checking for deadlocks: {e}")

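    # Scoring used below, starting from 100: -20 per critical issue, -5 per
    # warning, -10 per error in the last 10 minutes, -2 per memory percentage
    # point above 80%, and -1 per CPU percentage point above 80%, floored at 0.
    # Example: 1 critical issue, 2 warnings, 90% memory and 85% CPU usage
    # gives 100 - 20 - 10 - 20 - 5 = 45.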
    def _update_stability_score(self):
        """Update overall system stability score"""
        try:
            score = 100.0

            # Deduct points for critical issues
            score -= len(self.critical_issues) * 20

            # Deduct points for warnings
            score -= len(self.warnings) * 5

            # Deduct points for recent errors
            recent_errors = [e for e in self.error_history
                             if e['timestamp'] > datetime.now() - timedelta(minutes=10)]
            score -= len(recent_errors) * 10

            # Deduct points for high resource usage
            if self.memory_history:
                current_memory = self.memory_history[-1]['percent']
                if current_memory > 80:
                    score -= (current_memory - 80) * 2

            if self.cpu_history:
                current_cpu = self.cpu_history[-1]['percent']
                if current_cpu > 80:
                    score -= (current_cpu - 80) * 1

            self.stability_score = max(0, score)

        except Exception as e:
            logger.error(f"Error updating stability score: {e}")

    def _log_stability_status(self):
        """Log current stability status"""
        try:
            logger.info("=" * 50)
            logger.info("SYSTEM STABILITY STATUS")
            logger.info("=" * 50)
            logger.info(f"Stability Score: {self.stability_score:.1f}/100")

            if self.memory_history:
                mem = self.memory_history[-1]
                logger.info(f"Memory: {mem['used_gb']:.1f}GB used ({mem['percent']:.1f}%)")

            if self.cpu_history:
                cpu = self.cpu_history[-1]
                logger.info(f"CPU: {cpu['percent']:.1f}%")

            if self.thread_history:
                threads = self.thread_history[-1]
                logger.info(f"Threads: {threads['count']} active")

            if self.critical_issues:
                logger.error(f"Critical Issues ({len(self.critical_issues)}):")
                for issue in self.critical_issues[-5:]:  # Show last 5
                    logger.error(f"  - {issue}")

            if self.warnings:
                logger.warning(f"Warnings ({len(self.warnings)}):")
                for warning in self.warnings[-5:]:  # Show last 5
                    logger.warning(f"  - {warning}")

            logger.info("=" * 50)

        except Exception as e:
            logger.error(f"Error logging stability status: {e}")

    def get_stability_report(self) -> Dict[str, Any]:
        """Get comprehensive stability report"""
        try:
            return {
                'stability_score': self.stability_score,
                'critical_issues': self.critical_issues,
                'warnings': self.warnings,
                'memory_usage': self.memory_history[-1] if self.memory_history else None,
                'cpu_usage': self.cpu_history[-1] if self.cpu_history else None,
                'thread_count': self.thread_history[-1]['count'] if self.thread_history else 0,
                'recent_errors': len([e for e in self.error_history
                                      if e['timestamp'] > datetime.now() - timedelta(minutes=10)]),
                'monitoring_active': self.monitoring_active
            }
        except Exception as e:
            logger.error(f"Error generating stability report: {e}")
            return {'error': str(e)}

    def fix_common_issues(self):
        """Attempt to fix common stability issues"""
        try:
            logger.info("Attempting to fix common stability issues...")

            # Force garbage collection
            gc.collect()
            logger.info("✓ Forced garbage collection")

            # Clear old history to free memory
            if len(self.memory_history) > 360:  # Keep only 30 minutes
                self.memory_history = self.memory_history[-360:]
            if len(self.cpu_history) > 360:
                self.cpu_history = self.cpu_history[-360:]
            if len(self.thread_history) > 360:
                self.thread_history = self.thread_history[-360:]

            logger.info("✓ Cleared old monitoring history")

            # Clear old errors
            cutoff_time = datetime.now() - timedelta(hours=1)
            self.error_history = [e for e in self.error_history if e['timestamp'] > cutoff_time]
            logger.info("✓ Cleared old error history")

            # Reset warnings and critical issues that might be stale
            self.warnings = []
            self.critical_issues = []
            logger.info("✓ Reset stale warnings and critical issues")

            logger.info("Common stability fixes applied")

        except Exception as e:
            logger.error(f"Error fixing common issues: {e}")

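# Illustrative embedding sketch (not called anywhere in this script): a
# long-running service could run the auditor alongside its own work loop.
# The 60-second poll interval and the < 80 score threshold are arbitrary
# choices for this example, mirroring the threshold used in main() below.
def run_embedded_audit_example(duration_seconds: int = 300) -> Dict[str, Any]:
    """Example of embedding SystemStabilityAuditor in another process."""
    auditor = SystemStabilityAuditor()
    auditor.start_monitoring()
    try:
        deadline = time.time() + duration_seconds
        while time.time() < deadline:
            time.sleep(60)  # poll once a minute (arbitrary for this sketch)
            report = auditor.get_stability_report()
            if report.get('stability_score', 100) < 80:
                auditor.fix_common_issues()
        return auditor.get_stability_report()
    finally:
        auditor.stop_monitoring()
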
def main():
    """Main function for standalone execution"""
    try:
        logger.info("Starting System Stability Audit")

        auditor = SystemStabilityAuditor()
        auditor.start_monitoring()

        # Run for 5 minutes then generate report
        time.sleep(300)

        report = auditor.get_stability_report()
        logger.info("FINAL STABILITY REPORT:")
        logger.info(f"Stability Score: {report['stability_score']:.1f}/100")
        logger.info(f"Critical Issues: {len(report['critical_issues'])}")
        logger.info(f"Warnings: {len(report['warnings'])}")

        # Attempt fixes if needed
        if report['stability_score'] < 80:
            auditor.fix_common_issues()

        auditor.stop_monitoring()

    except KeyboardInterrupt:
        logger.info("Audit interrupted by user")
    except Exception as e:
        logger.error(f"Error in stability audit: {e}")


if __name__ == "__main__":
    main()