fix model mappings, dashboard updates, trading
system_stability_audit.py (new file, 426 lines)
@@ -0,0 +1,426 @@
#!/usr/bin/env python3
"""
System Stability Audit and Monitoring

This script performs a comprehensive audit of the trading system to identify
and fix stability issues, memory leaks, and performance bottlenecks.
"""

import os
import sys
import psutil
import logging
import time
import threading
import gc
from datetime import datetime, timedelta
from pathlib import Path
from typing import Dict, List, Optional, Any
import traceback

# Add project root to path
project_root = Path(__file__).parent
sys.path.insert(0, str(project_root))

from core.config import setup_logging, get_config

# Setup logging
setup_logging()
logger = logging.getLogger(__name__)


class SystemStabilityAuditor:
    """
    Comprehensive system stability auditor and monitor

    Monitors:
    - Memory usage and leaks
    - CPU usage and performance
    - Thread health and deadlocks
    - Model performance and stability
    - Dashboard responsiveness
    - Data provider health
    """

    def __init__(self):
        """Initialize the stability auditor"""
        self.config = get_config()
        self.monitoring_active = False
        self.monitoring_thread = None

        # Performance baselines
        self.baseline_memory = psutil.virtual_memory().used
        self.baseline_cpu = psutil.cpu_percent()

        # Monitoring data
        self.memory_history = []
        self.cpu_history = []
        self.thread_history = []
        self.error_history = []

        # Stability metrics
        self.stability_score = 100.0
        self.critical_issues = []
        self.warnings = []

        logger.info("System Stability Auditor initialized")

    def start_monitoring(self):
        """Start continuous system monitoring"""
        if self.monitoring_active:
            logger.warning("Monitoring already active")
            return

        self.monitoring_active = True
        self.monitoring_thread = threading.Thread(target=self._monitoring_loop, daemon=True)
        self.monitoring_thread.start()

        logger.info("System stability monitoring started")

    def stop_monitoring(self):
        """Stop system monitoring"""
        self.monitoring_active = False
        if self.monitoring_thread:
            self.monitoring_thread.join(timeout=5)

        logger.info("System stability monitoring stopped")

    def _monitoring_loop(self):
        """Main monitoring loop"""
        while self.monitoring_active:
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Check for memory leaks
                self._check_memory_leaks()

                # Check CPU usage
                self._check_cpu_usage()

                # Check thread health
                self._check_thread_health()

                # Check for deadlocks
                self._check_for_deadlocks()

                # Update stability score
                self._update_stability_score()

                # Log status every 60 seconds
                if len(self.memory_history) % 12 == 0:  # Every 12 * 5s = 60s
                    self._log_stability_status()

                time.sleep(5)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                self.error_history.append({
                    'timestamp': datetime.now(),
                    'error': str(e),
                    'traceback': traceback.format_exc()
                })
                time.sleep(10)  # Wait longer on error

    def _collect_system_metrics(self):
        """Collect system performance metrics"""
        try:
            # Memory metrics
            memory = psutil.virtual_memory()
            memory_data = {
                'timestamp': datetime.now(),
                'used_gb': memory.used / (1024**3),
                'available_gb': memory.available / (1024**3),
                'percent': memory.percent
            }
            self.memory_history.append(memory_data)

            # Keep only last 720 entries (1 hour at 5s intervals)
            if len(self.memory_history) > 720:
                self.memory_history = self.memory_history[-720:]

            # CPU metrics
            cpu_percent = psutil.cpu_percent(interval=1)
            cpu_data = {
                'timestamp': datetime.now(),
                'percent': cpu_percent,
                'cores': psutil.cpu_count()
            }
            self.cpu_history.append(cpu_data)

            # Keep only last 720 entries
            if len(self.cpu_history) > 720:
                self.cpu_history = self.cpu_history[-720:]

            # Thread metrics
            thread_count = threading.active_count()
            thread_data = {
                'timestamp': datetime.now(),
                'count': thread_count,
                'threads': [t.name for t in threading.enumerate()]
            }
            self.thread_history.append(thread_data)

            # Keep only last 720 entries
            if len(self.thread_history) > 720:
                self.thread_history = self.thread_history[-720:]

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _check_memory_leaks(self):
        """Check for memory leaks"""
        try:
            if len(self.memory_history) < 10:
                return

            # Check if memory usage is consistently increasing
            recent_memory = [m['used_gb'] for m in self.memory_history[-10:]]
            memory_trend = sum(recent_memory[-5:]) / 5 - sum(recent_memory[:5]) / 5
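            # memory_trend is the mean of the newest 5 samples minus the mean of the
            # 5 before them, so a value of 0.1 corresponds to roughly 100 MB of growth
            # across the 10-sample (~50 s) window checked below.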

            # If memory increased by more than 100MB in last 10 checks
            if memory_trend > 0.1:
                warning = f"Potential memory leak detected: +{memory_trend:.2f}GB in last 50s"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

                # Force garbage collection
                gc.collect()
                logger.info("Forced garbage collection to free memory")

            # Check for excessive memory usage
            current_memory = self.memory_history[-1]['percent']
            if current_memory > 85:
                critical = f"High memory usage: {current_memory:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

        except Exception as e:
            logger.error(f"Error checking memory leaks: {e}")

    def _check_cpu_usage(self):
        """Check CPU usage patterns"""
        try:
            if len(self.cpu_history) < 10:
                return

            # Check for sustained high CPU usage
            recent_cpu = [c['percent'] for c in self.cpu_history[-10:]]
            avg_cpu = sum(recent_cpu) / len(recent_cpu)

            if avg_cpu > 90:
                critical = f"Sustained high CPU usage: {avg_cpu:.1f}%"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)
            elif avg_cpu > 75:
                warning = f"High CPU usage: {avg_cpu:.1f}%"
                if warning not in self.warnings:
                    self.warnings.append(warning)
                    logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking CPU usage: {e}")

    def _check_thread_health(self):
        """Check thread health and detect issues"""
        try:
            if len(self.thread_history) < 5:
                return

            current_threads = self.thread_history[-1]['count']

            # Check for thread explosion
            if current_threads > 50:
                critical = f"Thread explosion detected: {current_threads} active threads"
                if critical not in self.critical_issues:
                    self.critical_issues.append(critical)
                    logger.error(critical)

                    # Log thread names for debugging
                    thread_names = self.thread_history[-1]['threads']
                    logger.error(f"Active threads: {thread_names}")

            # Check for thread leaks (gradually increasing thread count)
            if len(self.thread_history) >= 10:
                thread_counts = [t['count'] for t in self.thread_history[-10:]]
                thread_trend = sum(thread_counts[-5:]) / 5 - sum(thread_counts[:5]) / 5

                if thread_trend > 2:  # More than 2 threads increase on average
                    warning = f"Potential thread leak: +{thread_trend:.1f} threads in last 50s"
                    if warning not in self.warnings:
                        self.warnings.append(warning)
                        logger.warning(warning)

        except Exception as e:
            logger.error(f"Error checking thread health: {e}")

    def _check_for_deadlocks(self):
        """Check for potential deadlocks"""
        try:
            # Simple deadlock detection based on thread states
            all_threads = threading.enumerate()
            blocked_threads = []

            for thread in all_threads:
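                # Thread._is_stopped is a CPython-internal attribute, hence the
                # hasattr() guard; on interpreters without it this branch never runs.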
                if hasattr(thread, '_is_stopped') and not thread._is_stopped:
                    # Thread is running but might be blocked
                    # This is a simplified check - real deadlock detection is complex
                    pass

            # For now, just check if we have threads that haven't been active
            # More sophisticated deadlock detection would require thread state analysis

        except Exception as e:
            logger.error(f"Error checking for deadlocks: {e}")

    def _update_stability_score(self):
        """Update overall system stability score"""
        try:
            score = 100.0
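            # Example: 1 critical issue (-20), 2 warnings (-10) and 1 recent error
            # (-10) would leave a score of 60 before the resource-usage penalties.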

            # Deduct points for critical issues
            score -= len(self.critical_issues) * 20

            # Deduct points for warnings
            score -= len(self.warnings) * 5

            # Deduct points for recent errors
            recent_errors = [e for e in self.error_history
                             if e['timestamp'] > datetime.now() - timedelta(minutes=10)]
            score -= len(recent_errors) * 10

            # Deduct points for high resource usage
            if self.memory_history:
                current_memory = self.memory_history[-1]['percent']
                if current_memory > 80:
                    score -= (current_memory - 80) * 2

            if self.cpu_history:
                current_cpu = self.cpu_history[-1]['percent']
                if current_cpu > 80:
                    score -= (current_cpu - 80) * 1

            self.stability_score = max(0, score)

        except Exception as e:
            logger.error(f"Error updating stability score: {e}")

    def _log_stability_status(self):
        """Log current stability status"""
        try:
            logger.info("=" * 50)
            logger.info("SYSTEM STABILITY STATUS")
            logger.info("=" * 50)
            logger.info(f"Stability Score: {self.stability_score:.1f}/100")

            if self.memory_history:
                mem = self.memory_history[-1]
                logger.info(f"Memory: {mem['used_gb']:.1f}GB used ({mem['percent']:.1f}%)")

            if self.cpu_history:
                cpu = self.cpu_history[-1]
                logger.info(f"CPU: {cpu['percent']:.1f}%")

            if self.thread_history:
                threads = self.thread_history[-1]
                logger.info(f"Threads: {threads['count']} active")

            if self.critical_issues:
                logger.error(f"Critical Issues ({len(self.critical_issues)}):")
                for issue in self.critical_issues[-5:]:  # Show last 5
                    logger.error(f"  - {issue}")

            if self.warnings:
                logger.warning(f"Warnings ({len(self.warnings)}):")
                for warning in self.warnings[-5:]:  # Show last 5
                    logger.warning(f"  - {warning}")

            logger.info("=" * 50)

        except Exception as e:
            logger.error(f"Error logging stability status: {e}")

    def get_stability_report(self) -> Dict[str, Any]:
        """Get comprehensive stability report"""
        try:
            return {
                'stability_score': self.stability_score,
                'critical_issues': self.critical_issues,
                'warnings': self.warnings,
                'memory_usage': self.memory_history[-1] if self.memory_history else None,
                'cpu_usage': self.cpu_history[-1] if self.cpu_history else None,
                'thread_count': self.thread_history[-1]['count'] if self.thread_history else 0,
                'recent_errors': len([e for e in self.error_history
                                      if e['timestamp'] > datetime.now() - timedelta(minutes=10)]),
                'monitoring_active': self.monitoring_active
            }
        except Exception as e:
            logger.error(f"Error generating stability report: {e}")
            return {'error': str(e)}

    def fix_common_issues(self):
        """Attempt to fix common stability issues"""
        try:
            logger.info("Attempting to fix common stability issues...")

            # Force garbage collection
            gc.collect()
            logger.info("✓ Forced garbage collection")

            # Clear old history to free memory
            if len(self.memory_history) > 360:  # Keep only 30 minutes
                self.memory_history = self.memory_history[-360:]
            if len(self.cpu_history) > 360:
                self.cpu_history = self.cpu_history[-360:]
            if len(self.thread_history) > 360:
                self.thread_history = self.thread_history[-360:]

            logger.info("✓ Cleared old monitoring history")

            # Clear old errors
            cutoff_time = datetime.now() - timedelta(hours=1)
            self.error_history = [e for e in self.error_history if e['timestamp'] > cutoff_time]
            logger.info("✓ Cleared old error history")

            # Reset warnings and critical issues that might be stale
            self.warnings = []
            self.critical_issues = []
            logger.info("✓ Reset stale warnings and critical issues")

            logger.info("Common stability fixes applied")

        except Exception as e:
            logger.error(f"Error fixing common issues: {e}")


def main():
    """Main function for standalone execution"""
    try:
        logger.info("Starting System Stability Audit")

        auditor = SystemStabilityAuditor()
        auditor.start_monitoring()

        # Run for 5 minutes then generate report
        time.sleep(300)

        report = auditor.get_stability_report()
        logger.info("FINAL STABILITY REPORT:")
        logger.info(f"Stability Score: {report['stability_score']:.1f}/100")
        logger.info(f"Critical Issues: {len(report['critical_issues'])}")
        logger.info(f"Warnings: {len(report['warnings'])}")

        # Attempt fixes if needed
        if report['stability_score'] < 80:
            auditor.fix_common_issues()

        auditor.stop_monitoring()

    except KeyboardInterrupt:
        logger.info("Audit interrupted by user")
    except Exception as e:
        logger.error(f"Error in stability audit: {e}")


if __name__ == "__main__":
    main()
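
A minimal sketch of programmatic use, mirroring what main() above already does; it assumes the module is imported from the project root so that core.config resolves:

    import time
    from system_stability_audit import SystemStabilityAuditor

    auditor = SystemStabilityAuditor()
    auditor.start_monitoring()              # background daemon thread, ~5s sampling
    time.sleep(60)                          # let it collect about a minute of metrics
    report = auditor.get_stability_report()
    print(f"Stability score: {report['stability_score']:.1f}/100")
    if report['stability_score'] < 80:      # same threshold main() applies
        auditor.fix_common_issues()
    auditor.stop_monitoring()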