""" System Resource Monitor - Prevents resource exhaustion and silent failures Monitors memory, CPU, and disk usage to prevent system crashes """ import psutil import logging import threading import time import gc import os from typing import Dict, Any, Optional, Callable from datetime import datetime, timedelta logger = logging.getLogger(__name__) class SystemResourceMonitor: """Monitor system resources and prevent exhaustion""" def __init__(self, memory_threshold_mb: int = 7000, # 7GB threshold for 8GB system cpu_threshold_percent: float = 90.0, disk_threshold_percent: float = 95.0, check_interval_seconds: int = 30): """ Initialize system resource monitor Args: memory_threshold_mb: Memory threshold in MB before cleanup cpu_threshold_percent: CPU threshold percentage before warning disk_threshold_percent: Disk usage threshold before warning check_interval_seconds: How often to check resources """ self.memory_threshold_mb = memory_threshold_mb self.cpu_threshold_percent = cpu_threshold_percent self.disk_threshold_percent = disk_threshold_percent self.check_interval = check_interval_seconds self.monitoring = False self.monitor_thread = None # Callbacks for resource events self.memory_warning_callback: Optional[Callable] = None self.cpu_warning_callback: Optional[Callable] = None self.disk_warning_callback: Optional[Callable] = None self.cleanup_callback: Optional[Callable] = None # Resource history for trending self.resource_history = [] self.max_history_entries = 100 # Last warning times to prevent spam self.last_memory_warning = datetime.min self.last_cpu_warning = datetime.min self.last_disk_warning = datetime.min self.warning_cooldown = timedelta(minutes=5) def start_monitoring(self): """Start resource monitoring in background thread""" if self.monitoring: logger.warning("Resource monitoring already started") return self.monitoring = True self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True) self.monitor_thread.start() logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)") def stop_monitoring(self): """Stop resource monitoring""" self.monitoring = False if self.monitor_thread: self.monitor_thread.join(timeout=5) logger.info("System resource monitoring stopped") def set_callbacks(self, memory_warning: Optional[Callable] = None, cpu_warning: Optional[Callable] = None, disk_warning: Optional[Callable] = None, cleanup: Optional[Callable] = None): """Set callback functions for resource events""" self.memory_warning_callback = memory_warning self.cpu_warning_callback = cpu_warning self.disk_warning_callback = disk_warning self.cleanup_callback = cleanup def get_current_usage(self) -> Dict[str, Any]: """Get current system resource usage""" try: # Memory usage memory = psutil.virtual_memory() memory_mb = memory.used / (1024 * 1024) memory_percent = memory.percent # CPU usage cpu_percent = psutil.cpu_percent(interval=1) # Disk usage (current directory) disk = psutil.disk_usage('.') disk_percent = (disk.used / disk.total) * 100 # Process-specific info process = psutil.Process() process_memory_mb = process.memory_info().rss / (1024 * 1024) return { 'timestamp': datetime.now(), 'memory': { 'total_mb': memory.total / (1024 * 1024), 'used_mb': memory_mb, 'percent': memory_percent, 'available_mb': memory.available / (1024 * 1024) }, 'process_memory_mb': process_memory_mb, 'cpu_percent': cpu_percent, 'disk': { 'total_gb': disk.total / (1024 * 1024 * 1024), 'used_gb': disk.used / (1024 * 1024 * 1024), 'percent': disk_percent } } except Exception as e: logger.error(f"Error getting system usage: {e}") return {} def _monitor_loop(self): """Main monitoring loop""" logger.info("Resource monitoring loop started") while self.monitoring: try: usage = self.get_current_usage() if not usage: time.sleep(self.check_interval) continue # Store in history self.resource_history.append(usage) if len(self.resource_history) > self.max_history_entries: self.resource_history.pop(0) # Check thresholds self._check_memory_threshold(usage) self._check_cpu_threshold(usage) self._check_disk_threshold(usage) # Log periodic status (every 10 minutes) if len(self.resource_history) % 20 == 0: # 20 * 30s = 10 minutes self._log_resource_status(usage) except Exception as e: logger.error(f"Error in resource monitoring loop: {e}") time.sleep(self.check_interval) logger.info("Resource monitoring loop stopped") def _check_memory_threshold(self, usage: Dict[str, Any]): """Check memory usage threshold""" memory_mb = usage.get('memory', {}).get('used_mb', 0) if memory_mb > self.memory_threshold_mb: now = datetime.now() if now - self.last_memory_warning > self.warning_cooldown: logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold") self.last_memory_warning = now # Trigger cleanup self._trigger_memory_cleanup() # Call callback if set if self.memory_warning_callback: try: self.memory_warning_callback(memory_mb, self.memory_threshold_mb) except Exception as e: logger.error(f"Error in memory warning callback: {e}") def _check_cpu_threshold(self, usage: Dict[str, Any]): """Check CPU usage threshold""" cpu_percent = usage.get('cpu_percent', 0) if cpu_percent > self.cpu_threshold_percent: now = datetime.now() if now - self.last_cpu_warning > self.warning_cooldown: logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold") self.last_cpu_warning = now if self.cpu_warning_callback: try: self.cpu_warning_callback(cpu_percent, self.cpu_threshold_percent) except Exception as e: logger.error(f"Error in CPU warning callback: {e}") def _check_disk_threshold(self, usage: Dict[str, Any]): """Check disk usage threshold""" disk_percent = usage.get('disk', {}).get('percent', 0) if disk_percent > self.disk_threshold_percent: now = datetime.now() if now - self.last_disk_warning > self.warning_cooldown: logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold") self.last_disk_warning = now if self.disk_warning_callback: try: self.disk_warning_callback(disk_percent, self.disk_threshold_percent) except Exception as e: logger.error(f"Error in disk warning callback: {e}") def _trigger_memory_cleanup(self): """Trigger memory cleanup procedures""" logger.info("Triggering memory cleanup...") # Force garbage collection collected = gc.collect() logger.info(f"Garbage collection freed {collected} objects") # Call custom cleanup callback if set if self.cleanup_callback: try: self.cleanup_callback() logger.info("Custom cleanup callback executed") except Exception as e: logger.error(f"Error in cleanup callback: {e}") # Log memory after cleanup try: usage_after = self.get_current_usage() memory_after = usage_after.get('memory', {}).get('used_mb', 0) logger.info(f"Memory after cleanup: {memory_after:.1f}MB") except Exception as e: logger.error(f"Error checking memory after cleanup: {e}") def _log_resource_status(self, usage: Dict[str, Any]): """Log current resource status""" memory = usage.get('memory', {}) cpu = usage.get('cpu_percent', 0) disk = usage.get('disk', {}) process_memory = usage.get('process_memory_mb', 0) logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), " f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%") def get_resource_summary(self) -> Dict[str, Any]: """Get resource usage summary""" if not self.resource_history: return {} recent_usage = self.resource_history[-10:] # Last 10 entries # Calculate averages avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / len(recent_usage) avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / len(recent_usage) avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / len(recent_usage) current = self.resource_history[-1] if self.resource_history else {} return { 'current': current, 'averages': { 'memory_mb': avg_memory, 'cpu_percent': avg_cpu, 'disk_percent': avg_disk }, 'thresholds': { 'memory_mb': self.memory_threshold_mb, 'cpu_percent': self.cpu_threshold_percent, 'disk_percent': self.disk_threshold_percent }, 'monitoring': self.monitoring, 'history_entries': len(self.resource_history) } # Global instance _system_monitor = None def get_system_monitor() -> SystemResourceMonitor: """Get global system monitor instance""" global _system_monitor if _system_monitor is None: _system_monitor = SystemResourceMonitor() return _system_monitor def start_system_monitoring(): """Start system monitoring with default settings""" monitor = get_system_monitor() monitor.start_monitoring() return monitor