Files
gogo2/utils/system_monitor.py
2025-07-26 22:32:45 +03:00

288 lines
11 KiB
Python

"""
System Resource Monitor - Prevents resource exhaustion and silent failures
Monitors memory, CPU, and disk usage to prevent system crashes
"""
import psutil
import logging
import threading
import time
import gc
import os
from typing import Dict, Any, Optional, Callable
from datetime import datetime, timedelta
logger = logging.getLogger(__name__)
class SystemResourceMonitor:
    """Monitor system resources and prevent exhaustion.

    A background daemon thread periodically samples system memory, CPU and
    disk usage via ``psutil``, keeps a bounded history of samples, logs a
    warning (rate-limited by ``warning_cooldown``) when a threshold is
    exceeded, and invokes optional user callbacks.

    Note that the memory threshold is compared against *system-wide* used
    memory, while the built-in cleanup (``gc.collect()``) only affects the
    current process.
    """

    # Conversion factors for the byte counts reported by psutil.
    _BYTES_PER_MB = 1024 * 1024
    _BYTES_PER_GB = 1024 * 1024 * 1024

    # Number of successful checks between periodic status log lines
    # (20 checks * the default 30s interval = roughly 10 minutes).
    STATUS_LOG_EVERY_CHECKS = 20

    def __init__(self,
                 memory_threshold_mb: int = 7000,  # 7GB threshold for 8GB system
                 cpu_threshold_percent: float = 90.0,
                 disk_threshold_percent: float = 95.0,
                 check_interval_seconds: int = 30):
        """
        Initialize system resource monitor

        Args:
            memory_threshold_mb: Memory threshold in MB before cleanup
            cpu_threshold_percent: CPU threshold percentage before warning
            disk_threshold_percent: Disk usage threshold before warning
            check_interval_seconds: How often to check resources
        """
        self.memory_threshold_mb = memory_threshold_mb
        self.cpu_threshold_percent = cpu_threshold_percent
        self.disk_threshold_percent = disk_threshold_percent
        self.check_interval = check_interval_seconds

        self.monitoring = False
        self.monitor_thread: Optional[threading.Thread] = None
        # Signalled by stop_monitoring() so the loop wakes up immediately
        # instead of finishing a full check_interval sleep. A fresh event is
        # created for every start so a lingering old thread can never be
        # revived by a later start_monitoring() call.
        self._stop_event = threading.Event()

        # Callbacks for resource events
        self.memory_warning_callback: Optional[Callable] = None
        self.cpu_warning_callback: Optional[Callable] = None
        self.disk_warning_callback: Optional[Callable] = None
        self.cleanup_callback: Optional[Callable] = None

        # Resource history for trending (oldest sample first)
        self.resource_history: list = []
        self.max_history_entries = 100
        # Successful checks since the last periodic status log line
        self._checks_since_status_log = 0

        # Last warning times to prevent spam
        self.last_memory_warning = datetime.min
        self.last_cpu_warning = datetime.min
        self.last_disk_warning = datetime.min
        self.warning_cooldown = timedelta(minutes=5)

    def start_monitoring(self):
        """Start resource monitoring in a background daemon thread.

        Logs a warning and does nothing if monitoring is already active.
        """
        if self.monitoring:
            logger.warning("Resource monitoring already started")
            return

        self.monitoring = True
        self._stop_event = threading.Event()
        self.monitor_thread = threading.Thread(
            target=self._monitor_loop,
            args=(self._stop_event,),
            daemon=True,
        )
        self.monitor_thread.start()
        logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")

    def stop_monitoring(self):
        """Stop resource monitoring and wait briefly for the thread to exit."""
        self.monitoring = False
        self._stop_event.set()

        thread = self.monitor_thread
        # Joining the current thread raises RuntimeError; this can happen
        # when stop_monitoring() is called from inside a callback.
        if thread is not None and thread is not threading.current_thread():
            thread.join(timeout=5)
        logger.info("System resource monitoring stopped")

    def set_callbacks(self,
                      memory_warning: Optional[Callable] = None,
                      cpu_warning: Optional[Callable] = None,
                      disk_warning: Optional[Callable] = None,
                      cleanup: Optional[Callable] = None):
        """Set callback functions for resource events.

        All four callbacks are replaced on every call; any argument left at
        ``None`` clears the corresponding callback.

        Args:
            memory_warning: Called as ``memory_warning(used_mb, threshold_mb)``
            cpu_warning: Called as ``cpu_warning(cpu_percent, threshold_percent)``
            disk_warning: Called as ``disk_warning(disk_percent, threshold_percent)``
            cleanup: Called without arguments during memory cleanup
        """
        self.memory_warning_callback = memory_warning
        self.cpu_warning_callback = cpu_warning
        self.disk_warning_callback = disk_warning
        self.cleanup_callback = cleanup

    def get_current_usage(self) -> Dict[str, Any]:
        """Get current system resource usage.

        Blocks for about one second while CPU utilisation is sampled.

        Returns:
            A dict with ``timestamp``, ``memory`` (``total_mb``, ``used_mb``,
            ``percent``, ``available_mb``), ``process_memory_mb``,
            ``cpu_percent`` and ``disk`` (``total_gb``, ``used_gb``,
            ``percent``), or an empty dict if sampling failed.
        """
        try:
            mb = self._BYTES_PER_MB
            gb = self._BYTES_PER_GB

            # Memory usage
            memory = psutil.virtual_memory()

            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=1)

            # Disk usage (current directory)
            disk = psutil.disk_usage('.')
            # A zero-sized filesystem must not discard the whole sample.
            disk_percent = (disk.used / disk.total) * 100 if disk.total else 0.0

            # Process-specific info
            process_memory_mb = psutil.Process().memory_info().rss / mb

            return {
                'timestamp': datetime.now(),
                'memory': {
                    'total_mb': memory.total / mb,
                    'used_mb': memory.used / mb,
                    'percent': memory.percent,
                    'available_mb': memory.available / mb
                },
                'process_memory_mb': process_memory_mb,
                'cpu_percent': cpu_percent,
                'disk': {
                    'total_gb': disk.total / gb,
                    'used_gb': disk.used / gb,
                    'percent': disk_percent
                }
            }
        except Exception as e:
            # Best-effort sampling: report the failure and let the caller
            # treat the empty dict as "no data".
            logger.error(f"Error getting system usage: {e}")
            return {}

    def _monitor_loop(self, stop_event: Optional[threading.Event] = None):
        """Main monitoring loop.

        Args:
            stop_event: Event that ends this particular loop when set.
                Defaults to the monitor's current stop event.
        """
        if stop_event is None:
            stop_event = self._stop_event

        logger.info("Resource monitoring loop started")
        while self.monitoring and not stop_event.is_set():
            try:
                usage = self.get_current_usage()
                if usage:
                    # Store in history
                    self._record_sample(usage)

                    # Check thresholds
                    self._check_memory_threshold(usage)
                    self._check_cpu_threshold(usage)
                    self._check_disk_threshold(usage)

                    # Log periodic status
                    if self._status_log_due():
                        self._log_resource_status(usage)
            except Exception as e:
                logger.error(f"Error in resource monitoring loop: {e}")

            # Interruptible sleep: returns early once stop is requested.
            stop_event.wait(self.check_interval)
        logger.info("Resource monitoring loop stopped")

    def _record_sample(self, usage: Dict[str, Any]):
        """Append a sample to the history and drop the oldest overflow."""
        self.resource_history.append(usage)
        overflow = len(self.resource_history) - self.max_history_entries
        if overflow > 0:
            del self.resource_history[:overflow]

    def _status_log_due(self) -> bool:
        """Count one completed check; return True on every Nth check.

        Uses its own counter rather than the history length, because the
        history length stops changing once the history is full.
        """
        self._checks_since_status_log += 1
        if self._checks_since_status_log < self.STATUS_LOG_EVERY_CHECKS:
            return False
        self._checks_since_status_log = 0
        return True

    def _run_warning_callback(self, callback: Optional[Callable], label: str,
                              value: float, threshold: float):
        """Invoke a warning callback, logging (not raising) any failure."""
        if not callback:
            return
        try:
            callback(value, threshold)
        except Exception as e:
            logger.error(f"Error in {label} warning callback: {e}")

    def _check_memory_threshold(self, usage: Dict[str, Any]):
        """Check memory usage threshold; warn, clean up and notify if exceeded."""
        memory_mb = usage.get('memory', {}).get('used_mb', 0)
        if memory_mb <= self.memory_threshold_mb:
            return

        now = datetime.now()
        if now - self.last_memory_warning <= self.warning_cooldown:
            return  # still inside the cooldown window

        logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
        self.last_memory_warning = now

        # Trigger cleanup
        self._trigger_memory_cleanup()

        # Call callback if set
        self._run_warning_callback(self.memory_warning_callback, "memory",
                                   memory_mb, self.memory_threshold_mb)

    def _check_cpu_threshold(self, usage: Dict[str, Any]):
        """Check CPU usage threshold; warn and notify if exceeded."""
        cpu_percent = usage.get('cpu_percent', 0)
        if cpu_percent <= self.cpu_threshold_percent:
            return

        now = datetime.now()
        if now - self.last_cpu_warning <= self.warning_cooldown:
            return  # still inside the cooldown window

        logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
        self.last_cpu_warning = now

        self._run_warning_callback(self.cpu_warning_callback, "CPU",
                                   cpu_percent, self.cpu_threshold_percent)

    def _check_disk_threshold(self, usage: Dict[str, Any]):
        """Check disk usage threshold; warn and notify if exceeded."""
        disk_percent = usage.get('disk', {}).get('percent', 0)
        if disk_percent <= self.disk_threshold_percent:
            return

        now = datetime.now()
        if now - self.last_disk_warning <= self.warning_cooldown:
            return  # still inside the cooldown window

        logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
        self.last_disk_warning = now

        self._run_warning_callback(self.disk_warning_callback, "disk",
                                   disk_percent, self.disk_threshold_percent)

    def _trigger_memory_cleanup(self):
        """Trigger memory cleanup procedures"""
        logger.info("Triggering memory cleanup...")

        # Force garbage collection
        collected = gc.collect()
        logger.info(f"Garbage collection freed {collected} objects")

        # Call custom cleanup callback if set
        if self.cleanup_callback:
            try:
                self.cleanup_callback()
                logger.info("Custom cleanup callback executed")
            except Exception as e:
                logger.error(f"Error in cleanup callback: {e}")

        # Log memory after cleanup. Only memory is read here, so the
        # one-second CPU sampling of get_current_usage() is avoided and a
        # failed read is reported as an error instead of as "0.0MB".
        try:
            memory_after = psutil.virtual_memory().used / self._BYTES_PER_MB
            logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
        except Exception as e:
            logger.error(f"Error checking memory after cleanup: {e}")

    def _log_resource_status(self, usage: Dict[str, Any]):
        """Log current resource status"""
        memory = usage.get('memory', {})
        cpu = usage.get('cpu_percent', 0)
        disk = usage.get('disk', {})
        process_memory = usage.get('process_memory_mb', 0)

        logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
                    f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")

    def get_resource_summary(self) -> Dict[str, Any]:
        """Get resource usage summary.

        Returns:
            An empty dict when no samples have been recorded yet; otherwise
            a dict with the latest sample (``current``), ``averages`` over
            the last 10 samples, the configured ``thresholds``, the
            ``monitoring`` flag and the number of ``history_entries``.
        """
        # Work from one slice so the averages and "current" describe the
        # same samples even if the monitor thread appends concurrently.
        recent_usage = self.resource_history[-10:]  # Last 10 entries
        if not recent_usage:
            return {}

        count = len(recent_usage)

        # Calculate averages
        avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / count
        avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / count
        avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / count

        return {
            'current': recent_usage[-1],
            'averages': {
                'memory_mb': avg_memory,
                'cpu_percent': avg_cpu,
                'disk_percent': avg_disk
            },
            'thresholds': {
                'memory_mb': self.memory_threshold_mb,
                'cpu_percent': self.cpu_threshold_percent,
                'disk_percent': self.disk_threshold_percent
            },
            'monitoring': self.monitoring,
            'history_entries': len(self.resource_history)
        }
# Process-wide monitor, created lazily on first request
_system_monitor = None


def get_system_monitor() -> SystemResourceMonitor:
    """Return the shared SystemResourceMonitor, creating it on first use."""
    global _system_monitor
    monitor = _system_monitor
    if monitor is None:
        # First call: build a monitor with default thresholds and cache it.
        monitor = _system_monitor = SystemResourceMonitor()
    return monitor
def start_system_monitoring():
    """Start the shared system monitor and return it.

    The monitor comes from get_system_monitor(), so it uses the default
    settings unless it was already created and configured elsewhere.
    """
    shared_monitor = get_system_monitor()
    shared_monitor.start_monitoring()
    return shared_monitor