288 lines
11 KiB
Python
288 lines
11 KiB
Python
"""
|
|
System Resource Monitor - Prevents resource exhaustion and silent failures
|
|
Monitors memory, CPU, and disk usage to prevent system crashes
|
|
"""
|
|
|
|
import psutil
|
|
import logging
|
|
import threading
|
|
import time
|
|
import gc
|
|
import os
|
|
from typing import Dict, Any, Optional, Callable
|
|
from datetime import datetime, timedelta
|
|
|
|
# Module-level logger named after this module; every log line below goes through it.
logger = logging.getLogger(__name__)
|
|
|
|
class SystemResourceMonitor:
    """Monitor system memory, CPU and disk usage from a background thread.

    A daemon thread samples usage via ``psutil`` every ``check_interval``
    seconds, keeps the most recent samples in ``resource_history``, logs a
    warning (rate-limited by ``warning_cooldown``) when a threshold is
    exceeded, and invokes the optional user callbacks.
    """

    # Number of successfully recorded samples between two periodic status
    # log lines (20 samples at the default 30s interval = 10 minutes).
    STATUS_LOG_EVERY_N_CHECKS = 20

    def __init__(self,
                 memory_threshold_mb: int = 7000,  # 7GB threshold for 8GB system
                 cpu_threshold_percent: float = 90.0,
                 disk_threshold_percent: float = 95.0,
                 check_interval_seconds: int = 30):
        """
        Initialize system resource monitor

        Args:
            memory_threshold_mb: Memory threshold in MB before cleanup
            cpu_threshold_percent: CPU threshold percentage before warning
            disk_threshold_percent: Disk usage threshold before warning
            check_interval_seconds: How often to check resources
        """
        self.memory_threshold_mb = memory_threshold_mb
        self.cpu_threshold_percent = cpu_threshold_percent
        self.disk_threshold_percent = disk_threshold_percent
        self.check_interval = check_interval_seconds

        self.monitoring = False
        self.monitor_thread = None
        # Set by stop_monitoring() to wake the loop out of its wait at once.
        self._stop_event = threading.Event()

        # Callbacks for resource events
        self.memory_warning_callback: Optional[Callable] = None
        self.cpu_warning_callback: Optional[Callable] = None
        self.disk_warning_callback: Optional[Callable] = None
        self.cleanup_callback: Optional[Callable] = None

        # Resource history for trending (list of get_current_usage() dicts)
        self.resource_history = []
        self.max_history_entries = 100
        # Counts recorded samples independently of the capped history length,
        # so the periodic status log keeps its cadence once history is full.
        self._samples_recorded = 0

        # Last warning times to prevent spam
        self.last_memory_warning = datetime.min
        self.last_cpu_warning = datetime.min
        self.last_disk_warning = datetime.min
        self.warning_cooldown = timedelta(minutes=5)

    def start_monitoring(self):
        """Start resource monitoring in a background daemon thread.

        Does nothing (besides logging a warning) when already started.
        """
        if self.monitoring:
            logger.warning("Resource monitoring already started")
            return

        # A fresh event per run: a previous loop that has not finished yet
        # keeps its own (already set) event and can never be revived by
        # this restart.
        stop_event = threading.Event()
        self._stop_event = stop_event
        self.monitoring = True
        self.monitor_thread = threading.Thread(
            target=self._monitor_loop, args=(stop_event,), daemon=True
        )
        self.monitor_thread.start()
        logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")

    def stop_monitoring(self):
        """Stop resource monitoring and wait up to 5 seconds for the thread."""
        self.monitoring = False
        self._stop_event.set()
        thread = self.monitor_thread
        # Joining the current thread raises RuntimeError; that would happen
        # if a callback running on the monitor thread called this method.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=5)
        logger.info("System resource monitoring stopped")

    def set_callbacks(self,
                      memory_warning: Optional[Callable] = None,
                      cpu_warning: Optional[Callable] = None,
                      disk_warning: Optional[Callable] = None,
                      cleanup: Optional[Callable] = None):
        """Set callback functions for resource events.

        Every call replaces all four callbacks; omitted ones are reset to
        None. The warning callbacks are called as
        ``callback(current_value, threshold)``; ``cleanup`` takes no
        arguments.
        """
        self.memory_warning_callback = memory_warning
        self.cpu_warning_callback = cpu_warning
        self.disk_warning_callback = disk_warning
        self.cleanup_callback = cleanup

    def get_current_usage(self) -> Dict[str, Any]:
        """Get current system resource usage.

        Returns:
            A dict with keys ``timestamp``, ``memory`` (total_mb, used_mb,
            percent, available_mb), ``process_memory_mb``, ``cpu_percent``
            and ``disk`` (total_gb, used_gb, percent), or an empty dict
            when sampling failed (the error is logged).
        """
        mb = 1024 * 1024
        gb = 1024 * 1024 * 1024
        try:
            # Memory usage
            memory = psutil.virtual_memory()

            # CPU usage; called with interval=1, as in the original code
            cpu_percent = psutil.cpu_percent(interval=1)

            # Disk usage (current directory)
            disk = psutil.disk_usage('.')
            # Guard against a zero-sized filesystem instead of failing the
            # whole sample with ZeroDivisionError.
            disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0

            # Process-specific info
            process = psutil.Process()
            process_memory_mb = process.memory_info().rss / mb

            return {
                'timestamp': datetime.now(),
                'memory': {
                    'total_mb': memory.total / mb,
                    'used_mb': memory.used / mb,
                    'percent': memory.percent,
                    'available_mb': memory.available / mb
                },
                'process_memory_mb': process_memory_mb,
                'cpu_percent': cpu_percent,
                'disk': {
                    'total_gb': disk.total / gb,
                    'used_gb': disk.used / gb,
                    'percent': disk_percent
                }
            }
        except Exception as e:
            # Best effort: the monitor must keep running if sampling fails.
            logger.error(f"Error getting system usage: {e}")
            return {}

    def _record_usage(self, usage: Dict[str, Any]) -> bool:
        """Append *usage* to the capped history.

        Returns:
            True when a periodic status log is due, i.e. on every
            ``STATUS_LOG_EVERY_N_CHECKS``-th recorded sample.
        """
        self.resource_history.append(usage)
        overflow = len(self.resource_history) - self.max_history_entries
        if overflow > 0:
            del self.resource_history[:overflow]

        self._samples_recorded += 1
        return self._samples_recorded % self.STATUS_LOG_EVERY_N_CHECKS == 0

    def _monitor_loop(self, stop_event: Optional[threading.Event] = None):
        """Main monitoring loop; runs until monitoring is stopped.

        Args:
            stop_event: Event that ends this particular run; defaults to
                the monitor's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event
        logger.info("Resource monitoring loop started")

        while self.monitoring and not stop_event.is_set():
            try:
                usage = self.get_current_usage()
                if usage:
                    # Store in history
                    status_due = self._record_usage(usage)

                    # Check thresholds
                    self._check_memory_threshold(usage)
                    self._check_cpu_threshold(usage)
                    self._check_disk_threshold(usage)

                    # Log periodic status
                    if status_due:
                        self._log_resource_status(usage)
            except Exception as e:
                logger.error(f"Error in resource monitoring loop: {e}")

            # Unlike time.sleep, this returns immediately once stop is
            # requested, so stop_monitoring() does not wait a full interval.
            stop_event.wait(self.check_interval)

        logger.info("Resource monitoring loop stopped")

    def _invoke_callback(self, callback: Optional[Callable], description: str, *args) -> bool:
        """Call *callback* with *args*, logging instead of raising on error.

        Returns:
            True when the callback was set and completed without raising.
        """
        if not callback:
            return False
        try:
            callback(*args)
        except Exception as e:
            logger.error(f"Error in {description} callback: {e}")
            return False
        return True

    def _check_memory_threshold(self, usage: Dict[str, Any]):
        """Warn, run cleanup and notify when used memory exceeds the threshold."""
        memory_mb = usage.get('memory', {}).get('used_mb', 0)
        if memory_mb <= self.memory_threshold_mb:
            return

        now = datetime.now()
        if now - self.last_memory_warning <= self.warning_cooldown:
            return  # still in cooldown; avoid log/callback spam

        logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
        self.last_memory_warning = now

        # Trigger cleanup
        self._trigger_memory_cleanup()

        # Call callback if set
        self._invoke_callback(
            self.memory_warning_callback, "memory warning",
            memory_mb, self.memory_threshold_mb,
        )

    def _check_cpu_threshold(self, usage: Dict[str, Any]):
        """Warn and notify when CPU usage exceeds the threshold."""
        cpu_percent = usage.get('cpu_percent', 0)
        if cpu_percent <= self.cpu_threshold_percent:
            return

        now = datetime.now()
        if now - self.last_cpu_warning <= self.warning_cooldown:
            return  # still in cooldown

        logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
        self.last_cpu_warning = now

        self._invoke_callback(
            self.cpu_warning_callback, "CPU warning",
            cpu_percent, self.cpu_threshold_percent,
        )

    def _check_disk_threshold(self, usage: Dict[str, Any]):
        """Warn and notify when disk usage exceeds the threshold."""
        disk_percent = usage.get('disk', {}).get('percent', 0)
        if disk_percent <= self.disk_threshold_percent:
            return

        now = datetime.now()
        if now - self.last_disk_warning <= self.warning_cooldown:
            return  # still in cooldown

        logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
        self.last_disk_warning = now

        self._invoke_callback(
            self.disk_warning_callback, "disk warning",
            disk_percent, self.disk_threshold_percent,
        )

    def _trigger_memory_cleanup(self):
        """Run garbage collection and the custom cleanup callback, then log memory."""
        logger.info("Triggering memory cleanup...")

        # Force garbage collection
        collected = gc.collect()
        logger.info(f"Garbage collection freed {collected} objects")

        # Call custom cleanup callback if set
        if self._invoke_callback(self.cleanup_callback, "cleanup"):
            logger.info("Custom cleanup callback executed")

        # Log memory after cleanup
        try:
            usage_after = self.get_current_usage()
            memory_after = usage_after.get('memory', {}).get('used_mb')
            # An empty sample means get_current_usage() failed (and logged
            # it); reporting "0.0MB" in that case would be misleading.
            if memory_after is not None:
                logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
        except Exception as e:
            logger.error(f"Error checking memory after cleanup: {e}")

    def _log_resource_status(self, usage: Dict[str, Any]):
        """Log current resource status as a single info line."""
        memory = usage.get('memory', {})
        cpu = usage.get('cpu_percent', 0)
        disk = usage.get('disk', {})
        process_memory = usage.get('process_memory_mb', 0)

        logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
                    f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")

    def get_resource_summary(self) -> Dict[str, Any]:
        """Get resource usage summary.

        Returns:
            An empty dict when no sample has been recorded yet; otherwise a
            dict with the latest sample (``current``), averages over the
            last 10 samples, the configured thresholds, the monitoring flag
            and the number of history entries.
        """
        # Work on a snapshot so the result is self-consistent even if the
        # monitor thread appends or trims while this runs.
        history = list(self.resource_history)
        if not history:
            return {}

        recent_usage = history[-10:]  # Last 10 entries
        count = len(recent_usage)

        # Calculate averages
        avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / count
        avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / count
        avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / count

        return {
            'current': history[-1],
            'averages': {
                'memory_mb': avg_memory,
                'cpu_percent': avg_cpu,
                'disk_percent': avg_disk
            },
            'thresholds': {
                'memory_mb': self.memory_threshold_mb,
                'cpu_percent': self.cpu_threshold_percent,
                'disk_percent': self.disk_threshold_percent
            },
            'monitoring': self.monitoring,
            'history_entries': len(history)
        }
|
|
|
|
# Process-wide shared monitor; created lazily by get_system_monitor()
_system_monitor = None


def get_system_monitor() -> SystemResourceMonitor:
    """Return the shared SystemResourceMonitor, creating it on first use.

    The instance is built with the default constructor arguments and cached
    in the module-level ``_system_monitor`` variable.
    """
    global _system_monitor
    shared = _system_monitor
    if shared is None:
        shared = SystemResourceMonitor()
        _system_monitor = shared
    return shared
|
|
|
|
def start_system_monitoring():
    """Start monitoring on the shared monitor instance and return it.

    Fetches the monitor from get_system_monitor() and calls its
    start_monitoring() method.
    """
    shared_monitor = get_system_monitor()
    shared_monitor.start_monitoring()
    return shared_monitor