stability fixes, lower updates

2025-07-26 22:32:45 +03:00
parent 9576c52039
commit 7c61c12b70
9 changed files with 1210 additions and 45 deletions
--- a/utils/async_task_manager.py
+++ b/utils/async_task_manager.py
@@ -0,0 +1,232 @@
+"""
+Async Task Manager - Handles async tasks with comprehensive error handling
+Prevents silent failures in async operations
+"""
+
+import asyncio
+import logging
+import functools
+import traceback
+from typing import Any, Callable, Optional, Dict, List
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+class AsyncTaskManager:
+    """Create and track asyncio tasks whose failures are logged and recorded.
+
+    Every task created through the manager runs inside a wrapper that logs
+    the outcome, appends a record to ``completed_tasks`` or ``failed_tasks``
+    (each capped at ``max_history`` entries) and invokes the optional
+    callbacks. An exception raised by the wrapped coroutine is swallowed and
+    the task resolves to ``None``; cancellation is re-raised.
+    """
+
+    def __init__(self):
+        # Unfinished tasks, keyed by the name they were created with
+        self.active_tasks: Dict[str, asyncio.Task] = {}
+        # Outcome records (plain dicts), oldest first
+        self.completed_tasks: List[Dict[str, Any]] = []
+        self.failed_tasks: List[Dict[str, Any]] = []
+        # Maximum number of records kept in each history list
+        self.max_history = 100
+
+    def _remember(self, history: List[Dict[str, Any]], record: Dict[str, Any]) -> None:
+        """Append ``record`` to ``history`` and drop the oldest overflow"""
+        history.append(record)
+        overflow = len(history) - self.max_history
+        if overflow > 0:
+            del history[:overflow]
+
+    def _forget_task(self, name: str, task: asyncio.Task) -> None:
+        """Deregister ``task`` once it is done"""
+        # Compare identities: a newer task may have been registered under the
+        # same name in the meantime and must stay tracked.
+        if self.active_tasks.get(name) is task:
+            del self.active_tasks[name]
+
+    def create_task_with_error_handling(self,
+                                      coro: Any,
+                                      name: str,
+                                      error_callback: Optional[Callable] = None,
+                                      success_callback: Optional[Callable] = None) -> asyncio.Task:
+        """
+        Create an async task with comprehensive error handling
+
+        Must be called while an event loop is running, because the task is
+        created with ``asyncio.create_task``.
+
+        Args:
+            coro: Coroutine to run
+            name: Task name for identification
+            error_callback: Called on error with (name, exception)
+            success_callback: Called on success with (name, result)
+
+        Returns:
+            The created task. Awaiting it yields the coroutine's result, or
+            None when the coroutine raised (the error is logged and stored in
+            ``failed_tasks`` instead of being propagated).
+        """
+
+        async def wrapped_coro():
+            """Await ``coro`` and record how it ended"""
+            start_time = datetime.now()
+            try:
+                logger.debug(f"Starting async task: {name}")
+                result = await coro
+
+                duration = (datetime.now() - start_time).total_seconds()
+                logger.debug(f"Async task '{name}' completed successfully in {duration:.2f}s")
+
+                self._remember(self.completed_tasks, {
+                    'name': name,
+                    'status': 'completed',
+                    'start_time': start_time,
+                    'end_time': datetime.now(),
+                    'duration': duration,
+                    # Compare against None so falsy results such as 0, False
+                    # or "" are still recorded; long results are truncated.
+                    'result': str(result)[:200] if result is not None else None
+                })
+
+                if success_callback:
+                    try:
+                        success_callback(name, result)
+                    except Exception as cb_error:
+                        logger.error(f"Error in success callback for task '{name}': {cb_error}")
+
+                return result
+
+            except asyncio.CancelledError:
+                logger.info(f"Async task '{name}' was cancelled")
+                raise
+
+            except Exception as e:
+                duration = (datetime.now() - start_time).total_seconds()
+                # Format the traceback once and reuse it for log and record
+                tb_text = traceback.format_exc()
+                logger.error(f"Async task '{name}' failed after {duration:.2f}s: {e}")
+                logger.error(f"Task '{name}' traceback: {tb_text}")
+
+                self._remember(self.failed_tasks, {
+                    'name': name,
+                    'status': 'failed',
+                    'start_time': start_time,
+                    'end_time': datetime.now(),
+                    'duration': duration,
+                    'error': str(e),
+                    # Kept separately because 'error' is only the message text
+                    'error_type': type(e).__name__,
+                    'traceback': tb_text
+                })
+
+                if error_callback:
+                    try:
+                        error_callback(name, e)
+                    except Exception as cb_error:
+                        logger.error(f"Error in error callback for task '{name}': {cb_error}")
+
+                # Don't re-raise; return None to indicate failure
+                return None
+
+        task = asyncio.create_task(wrapped_coro(), name=name)
+        if name in self.active_tasks:
+            logger.warning(f"Async task name '{name}' is already active; "
+                           f"the older task is no longer tracked under that name")
+        self.active_tasks[name] = task
+        # A done callback (rather than a finally block inside the wrapper)
+        # also fires when the task is cancelled before it ever starts running.
+        task.add_done_callback(functools.partial(self._forget_task, name))
+
+        return task
+
+    def cancel_task(self, name: str) -> bool:
+        """Cancel the active task registered as ``name``
+
+        Returns:
+            True if a cancellation was requested, False if no unfinished task
+            with that name is tracked.
+        """
+        task = self.active_tasks.get(name)
+        if task is None or task.done():
+            return False
+        task.cancel()
+        logger.info(f"Cancelled async task: {name}")
+        return True
+
+    def cancel_all_tasks(self):
+        """Request cancellation of every unfinished tracked task"""
+        # Iterate over a snapshot: finished tasks deregister themselves
+        for name, task in list(self.active_tasks.items()):
+            if not task.done():
+                task.cancel()
+                logger.info(f"Cancelled async task: {name}")
+
+    def get_task_status(self) -> Dict[str, Any]:
+        """Get counts of active/completed/failed tasks and the last 5 failures"""
+        return {
+            'active_tasks': len(self.active_tasks),
+            'completed_tasks': len(self.completed_tasks),
+            'failed_tasks': len(self.failed_tasks),
+            'active_task_names': list(self.active_tasks.keys()),
+            'recent_failures': [
+                {
+                    'name': f['name'],
+                    'error': f['error'],
+                    'duration': f['duration'],
+                    'time': f['end_time'].strftime('%H:%M:%S')
+                }
+                for f in self.failed_tasks[-5:]
+            ]
+        }
+
+    def get_failure_summary(self) -> Dict[str, Any]:
+        """Get summary of task failures
+
+        Returns:
+            A dict with the total number of recorded failures, the number of
+            failures that ended within the last hour, a count per exception
+            class name and the most frequent exception class name. When no
+            failure is recorded only ``total_failures`` and an empty
+            ``failure_patterns`` are returned.
+        """
+        if not self.failed_tasks:
+            return {'total_failures': 0, 'failure_patterns': {}}
+
+        # Count failures by exception class name
+        error_counts: Dict[str, int] = {}
+        for failure in self.failed_tasks:
+            error_type = failure.get('error_type', 'Unknown')
+            error_counts[error_type] = error_counts.get(error_type, 0) + 1
+
+        now = datetime.now()
+        recent_failures = [f for f in self.failed_tasks
+                           if (now - f['end_time']).total_seconds() < 3600]  # Last hour
+
+        return {
+            'total_failures': len(self.failed_tasks),
+            'recent_failures_1h': len(recent_failures),
+            'failure_patterns': error_counts,
+            'most_common_error': max(error_counts, key=error_counts.get) if error_counts else None
+        }
+
+# Process-wide manager, created lazily on first request
+_task_manager = None
+
+def get_async_task_manager() -> AsyncTaskManager:
+    """Return the shared AsyncTaskManager, creating it on the first call"""
+    global _task_manager
+    manager = _task_manager
+    if manager is None:
+        manager = AsyncTaskManager()
+        _task_manager = manager
+    return manager
+
+def create_safe_task(coro: Any, 
+                    name: str,
+                    error_callback: Optional[Callable] = None,
+                    success_callback: Optional[Callable] = None) -> asyncio.Task:
+    """
+    Schedule ``coro`` as an error-handled task on the global task manager
+
+    Args:
+        coro: Coroutine to run
+        name: Task name for identification
+        error_callback: Called on error with (name, exception)
+        success_callback: Called on success with (name, result)
+
+    Returns:
+        The task returned by the manager's create_task_with_error_handling
+    """
+    return get_async_task_manager().create_task_with_error_handling(
+        coro,
+        name,
+        error_callback,
+        success_callback,
+    )
+
+def safe_async_wrapper(name: str, 
+                      error_callback: Optional[Callable] = None,
+                      success_callback: Optional[Callable] = None):
+    """
+    Decorator that runs each call of an async function as a safe task
+
+    Every invocation of the decorated function is handed to create_safe_task
+    under the same task name ``name`` and the resulting task is awaited.
+
+    Usage:
+        @safe_async_wrapper("my_task")
+        async def my_async_function():
+            # Your async code here
+            pass
+    """
+    def decorate(func):
+        @functools.wraps(func)
+        async def guarded(*args, **kwargs):
+            # The coroutine object is built first, then scheduled and awaited
+            return await create_safe_task(
+                func(*args, **kwargs), name, error_callback, success_callback
+            )
+        return guarded
+    return decorate
--- a/utils/process_supervisor.py
+++ b/utils/process_supervisor.py
@@ -0,0 +1,340 @@
+"""
+Process Supervisor - Handles process monitoring, restarts, and supervision
+Prevents silent failures by monitoring process health and restarting on crashes
+"""
+
+import subprocess
+import threading
+import time
+import logging
+import signal
+import os
+import sys
+from typing import Dict, Any, Optional, Callable, List
+from datetime import datetime, timedelta
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+class ProcessSupervisor:
+    """Start, monitor and restart child processes.
+
+    Processes are registered with add_process() and launched with
+    start_process(). While monitoring is active a background thread polls
+    every registered process. An exit is logged and reported through
+    ``process_failed_callback`` once; when ``auto_restart`` is set and the
+    restart budget is not exhausted, a restart is then run on a helper thread.
+    """
+
+    # Seconds between two health checks of the monitoring thread
+    _POLL_INTERVAL_SECONDS = 5
+    # Trailing characters of stdout/stderr kept for the exit log
+    _OUTPUT_TAIL_CHARS = 500
+
+    def __init__(self, max_restarts: int = 5, restart_delay: int = 10):
+        """
+        Initialize process supervisor
+
+        Args:
+            max_restarts: Maximum number of restarts before giving up
+            restart_delay: Delay in seconds between restarts
+        """
+        self.max_restarts = max_restarts
+        self.restart_delay = restart_delay
+
+        # Per-process bookkeeping dicts, keyed by process name
+        self.processes: Dict[str, Dict[str, Any]] = {}
+        self.monitoring = False
+        self.monitor_thread = None
+        # Set by stop_monitoring() to wake the monitor thread out of its wait
+        self._stop_event = threading.Event()
+
+        # Callbacks
+        self.process_started_callback: Optional[Callable] = None
+        self.process_failed_callback: Optional[Callable] = None
+        self.process_restarted_callback: Optional[Callable] = None
+
+    def add_process(self, name: str, command: List[str], 
+                   working_dir: Optional[str] = None,
+                   env: Optional[Dict[str, str]] = None,
+                   auto_restart: bool = True):
+        """
+        Add a process to supervise
+
+        Registering does not start the process; call start_process() for that.
+        Registering an existing name replaces its bookkeeping entry.
+
+        Args:
+            name: Process name
+            command: Command to run as list
+            working_dir: Working directory
+            env: Environment variables (merged over the current environment)
+            auto_restart: Whether to auto-restart on failure
+        """
+        self.processes[name] = {
+            'command': command,
+            'working_dir': working_dir,
+            'env': env,
+            'auto_restart': auto_restart,
+            'process': None,
+            'restart_count': 0,
+            'last_start': None,
+            'last_failure': None,
+            'status': 'stopped',
+            # True while a background restart for this process is in flight
+            'restart_pending': False,
+            # Last characters of each output stream, filled by reader threads
+            'output_tail': {'stdout': '', 'stderr': ''},
+            'output_readers': []
+        }
+        logger.info(f"Added process '{name}' to supervisor")
+
+    def _drain_stream(self, stream, tails: Dict[str, str], key: str) -> None:
+        """Read ``stream`` until EOF, keeping only its last characters in ``tails[key]``"""
+        try:
+            for line in stream:
+                tails[key] = (tails[key] + line)[-self._OUTPUT_TAIL_CHARS:]
+        except (OSError, ValueError) as e:
+            # The pipe was closed underneath us; nothing more to read
+            logger.debug(f"Stopped reading process {key}: {e}")
+        finally:
+            try:
+                stream.close()
+            except OSError:
+                pass
+
+    def _start_output_readers(self, proc_info: Dict[str, Any], process) -> None:
+        """Start one daemon thread per output pipe of ``process``"""
+        # Without a reader the child blocks as soon as it has written more
+        # than the OS pipe buffer holds.
+        tails = {'stdout': '', 'stderr': ''}
+        readers = []
+        for key, stream in (('stdout', process.stdout), ('stderr', process.stderr)):
+            reader = threading.Thread(target=self._drain_stream,
+                                      args=(stream, tails, key), daemon=True)
+            reader.start()
+            readers.append(reader)
+        proc_info['output_tail'] = tails
+        proc_info['output_readers'] = readers
+
+    def start_process(self, name: str) -> bool:
+        """Start a specific process
+
+        Returns:
+            True if the process was started or is already running, False if
+            the name is unknown or launching the command failed.
+        """
+        if name not in self.processes:
+            logger.error(f"Process '{name}' not found")
+            return False
+
+        proc_info = self.processes[name]
+
+        if proc_info['process'] and proc_info['process'].poll() is None:
+            logger.warning(f"Process '{name}' is already running")
+            return True
+
+        try:
+            # Child environment: ours plus the per-process overrides
+            env = os.environ.copy()
+            if proc_info['env']:
+                env.update(proc_info['env'])
+
+            process = subprocess.Popen(
+                proc_info['command'],
+                cwd=proc_info['working_dir'],
+                env=env,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                text=True,
+                # Undecodable output must not kill the reader threads
+                errors='replace'
+            )
+            self._start_output_readers(proc_info, process)
+
+            proc_info['process'] = process
+            proc_info['last_start'] = datetime.now()
+            proc_info['status'] = 'running'
+
+            logger.info(f"Started process '{name}' (PID: {process.pid})")
+
+            if self.process_started_callback:
+                try:
+                    self.process_started_callback(name, process.pid)
+                except Exception as e:
+                    logger.error(f"Error in process started callback: {e}")
+
+            return True
+
+        except Exception as e:
+            logger.error(f"Failed to start process '{name}': {e}")
+            proc_info['status'] = 'failed'
+            proc_info['last_failure'] = datetime.now()
+            return False
+
+    def stop_process(self, name: str, timeout: int = 10) -> bool:
+        """Stop a specific process
+
+        Sends terminate() first and falls back to kill() when the process has
+        not exited after ``timeout`` seconds.
+
+        Returns:
+            True if the process is stopped afterwards (or was not running),
+            False if the name is unknown or stopping raised an error.
+        """
+        if name not in self.processes:
+            logger.error(f"Process '{name}' not found")
+            return False
+
+        proc_info = self.processes[name]
+        process = proc_info['process']
+
+        if not process or process.poll() is not None:
+            logger.info(f"Process '{name}' is not running")
+            proc_info['status'] = 'stopped'
+            return True
+
+        # Mark the stop as deliberate before signalling, so the monitor does
+        # not mistake the exit for a crash and restart the process.
+        previous_status = proc_info['status']
+        proc_info['status'] = 'stopping'
+
+        try:
+            # Try graceful shutdown first
+            process.terminate()
+
+            try:
+                process.wait(timeout=timeout)
+                logger.info(f"Process '{name}' terminated gracefully")
+            except subprocess.TimeoutExpired:
+                # Force kill if graceful shutdown fails
+                logger.warning(f"Process '{name}' did not terminate gracefully, force killing")
+                process.kill()
+                process.wait()
+                logger.info(f"Process '{name}' force killed")
+
+            proc_info['status'] = 'stopped'
+            return True
+
+        except Exception as e:
+            logger.error(f"Error stopping process '{name}': {e}")
+            proc_info['status'] = previous_status
+            return False
+
+    def restart_process(self, name: str) -> bool:
+        """Restart a specific process
+
+        Stops the process if it is running, counts the restart, waits
+        ``restart_delay`` seconds and starts it again. This call blocks for
+        the duration of the delay.
+
+        Returns:
+            True if the process was started, False if the name is unknown,
+            the restart limit is exceeded or starting failed.
+        """
+        logger.info(f"Restarting process '{name}'")
+
+        if name not in self.processes:
+            logger.error(f"Process '{name}' not found")
+            return False
+
+        proc_info = self.processes[name]
+
+        # Stop if running
+        if proc_info['process'] and proc_info['process'].poll() is None:
+            self.stop_process(name)
+
+        proc_info['restart_count'] += 1
+
+        # Check the limit before sleeping: no point waiting for a refusal
+        if proc_info['restart_count'] > self.max_restarts:
+            logger.error(f"Process '{name}' exceeded max restarts ({self.max_restarts})")
+            proc_info['status'] = 'failed_max_restarts'
+            return False
+
+        time.sleep(self.restart_delay)
+
+        success = self.start_process(name)
+
+        if success and self.process_restarted_callback:
+            try:
+                self.process_restarted_callback(name, proc_info['restart_count'])
+            except Exception as e:
+                logger.error(f"Error in process restarted callback: {e}")
+
+        return success
+
+    def start_monitoring(self):
+        """Start process monitoring in a daemon thread (no-op if already started)"""
+        if self.monitoring:
+            logger.warning("Process monitoring already started")
+            return
+
+        self.monitoring = True
+        self._stop_event.clear()
+        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
+        self.monitor_thread.start()
+        logger.info("Process monitoring started")
+
+    def stop_monitoring(self):
+        """Stop process monitoring and wait up to 5 seconds for the thread"""
+        self.monitoring = False
+        self._stop_event.set()
+        thread = self.monitor_thread
+        # A callback running on the monitor thread may call this method;
+        # a thread cannot join itself.
+        if thread and thread is not threading.current_thread():
+            thread.join(timeout=5)
+        logger.info("Process monitoring stopped")
+
+    def _monitor_loop(self):
+        """Main monitoring loop: check every process, then wait for the next poll"""
+        logger.info("Process monitoring loop started")
+
+        while self.monitoring:
+            try:
+                # Snapshot: add_process() may run on another thread meanwhile
+                for name, proc_info in list(self.processes.items()):
+                    self._check_process_health(name, proc_info)
+            except Exception as e:
+                logger.error(f"Error in process monitoring loop: {e}")
+
+            # Returns early (True) when stop_monitoring() sets the event
+            if self._stop_event.wait(self._POLL_INTERVAL_SECONDS):
+                break
+
+        logger.info("Process monitoring loop stopped")
+
+    def _report_exit(self, name: str, proc_info: Dict[str, Any], return_code: int) -> None:
+        """Record and log an unexpected exit and notify the failure callback"""
+        proc_info['status'] = 'exited'
+        proc_info['last_failure'] = datetime.now()
+
+        logger.warning(f"Process '{name}' exited with code {return_code}")
+
+        # Give the reader threads a moment to reach EOF, then log what the
+        # process printed last.
+        for reader in proc_info.get('output_readers', ()):
+            reader.join(timeout=1)
+        tails = proc_info.get('output_tail') or {}
+        stdout = tails.get('stdout')
+        stderr = tails.get('stderr')
+        if stdout:
+            logger.info(f"Process '{name}' stdout: {stdout}")
+        if stderr:
+            logger.error(f"Process '{name}' stderr: {stderr}")
+
+        if self.process_failed_callback:
+            try:
+                self.process_failed_callback(name, return_code)
+            except Exception as e:
+                logger.error(f"Error in process failed callback: {e}")
+
+    def _restart_in_background(self, name: str, proc_info: Dict[str, Any]) -> None:
+        """Thread target: run one restart and clear the pending flag afterwards"""
+        try:
+            self.restart_process(name)
+        except Exception as e:
+            logger.error(f"Error auto-restarting process '{name}': {e}")
+        finally:
+            proc_info['restart_pending'] = False
+
+    def _check_process_health(self, name: str, proc_info: Dict[str, Any]):
+        """Check health of a specific process
+
+        An exit of a process in state 'running' is reported exactly once.
+        A restart is scheduled for a freshly exited process, and for one
+        whose previous (re)start attempt failed, as long as ``auto_restart``
+        is set and the restart budget is not exhausted.
+        """
+        process = proc_info['process']
+
+        if not process:
+            return
+
+        # A restart that is already scheduled decides the next state
+        if proc_info.get('restart_pending'):
+            return
+
+        status = proc_info['status']
+        if status == 'running':
+            return_code = process.poll()
+            if return_code is None:
+                return
+            self._report_exit(name, proc_info, return_code)
+        elif status != 'failed':
+            # 'stopping'/'stopped' are deliberate, 'exited' has been reported
+            # already and 'failed_max_restarts' is final.
+            return
+
+        if proc_info['auto_restart'] and proc_info['restart_count'] < self.max_restarts:
+            logger.info(f"Auto-restarting process '{name}'")
+            proc_info['restart_pending'] = True
+            # restart_process() sleeps, so keep it off the monitor thread
+            threading.Thread(target=self._restart_in_background,
+                             args=(name, proc_info), daemon=True).start()
+
+    def get_process_status(self, name: str) -> Optional[Dict[str, Any]]:
+        """Get status of a specific process, or None if the name is unknown"""
+        if name not in self.processes:
+            return None
+
+        proc_info = self.processes[name]
+        process = proc_info['process']
+        running = process is not None and process.poll() is None
+
+        return {
+            'name': name,
+            'status': proc_info['status'],
+            'restart_count': proc_info['restart_count'],
+            'last_start': proc_info['last_start'],
+            'last_failure': proc_info['last_failure'],
+            'auto_restart': proc_info['auto_restart'],
+            'pid': process.pid if running else None,
+            'running': running
+        }
+
+    def get_all_status(self) -> Dict[str, Dict[str, Any]]:
+        """Get status of all processes, keyed by process name"""
+        return {name: self.get_process_status(name) for name in list(self.processes)}
+
+    def set_callbacks(self,
+                     process_started: Optional[Callable] = None,
+                     process_failed: Optional[Callable] = None,
+                     process_restarted: Optional[Callable] = None):
+        """Set callback functions for process events
+
+        All three callbacks are replaced; an omitted one is reset to None.
+
+        Args:
+            process_started: Called with (name, pid) after a start
+            process_failed: Called with (name, return_code) after an exit
+            process_restarted: Called with (name, restart_count) after a restart
+        """
+        self.process_started_callback = process_started
+        self.process_failed_callback = process_failed
+        self.process_restarted_callback = process_restarted
+
+    def shutdown_all(self):
+        """Stop every registered process, then stop monitoring"""
+        logger.info("Shutting down all supervised processes")
+
+        for name in list(self.processes.keys()):
+            self.stop_process(name)
+
+        self.stop_monitoring()
+
+# Process-wide supervisor, created lazily on first request
+_process_supervisor = None
+
+def get_process_supervisor() -> ProcessSupervisor:
+    """Return the shared ProcessSupervisor, creating it on the first call"""
+    global _process_supervisor
+    supervisor = _process_supervisor
+    if supervisor is None:
+        supervisor = ProcessSupervisor()
+        _process_supervisor = supervisor
+    return supervisor
+
+def create_supervised_dashboard_runner():
+    """Register the dashboard script with the global supervisor and return it
+
+    The process "clean_dashboard" runs run_clean_dashboard.py with the current
+    interpreter from the current working directory and is auto-restarted.
+    Failure and restart events are logged; the supervisor's started callback
+    is reset to None by set_callbacks. The process is only registered here,
+    not started.
+    """
+    def log_failure(name: str, return_code: int):
+        logger.error(f"Dashboard process failed with code {return_code}")
+
+    def log_restart(name: str, restart_count: int):
+        logger.info(f"Dashboard restarted (attempt {restart_count})")
+
+    dashboard_supervisor = get_process_supervisor()
+    dashboard_command = [sys.executable, "run_clean_dashboard.py"]
+
+    dashboard_supervisor.add_process(
+        name="clean_dashboard",
+        command=dashboard_command,
+        working_dir=os.getcwd(),
+        auto_restart=True
+    )
+    dashboard_supervisor.set_callbacks(
+        process_failed=log_failure,
+        process_restarted=log_restart
+    )
+
+    return dashboard_supervisor
--- a/utils/system_monitor.py
+++ b/utils/system_monitor.py
@@ -0,0 +1,288 @@
+"""
+System Resource Monitor - Prevents resource exhaustion and silent failures
+Monitors memory, CPU, and disk usage to prevent system crashes
+"""
+
+import psutil
+import logging
+import threading
+import time
+import gc
+import os
+from typing import Dict, Any, Optional, Callable
+from datetime import datetime, timedelta
+
+logger = logging.getLogger(__name__)
+
+class SystemResourceMonitor:
+    """Monitor system resources and prevent exhaustion
+
+    A background thread samples memory, CPU and disk usage every
+    ``check_interval`` seconds, keeps the most recent samples in
+    ``resource_history`` and compares them with the configured thresholds.
+    Exceeding a threshold logs a warning (at most once per
+    ``warning_cooldown`` per resource) and calls the matching callback; the
+    memory threshold additionally triggers garbage collection and the
+    cleanup callback.
+    """
+
+    # Target spacing of the periodic "RESOURCE STATUS" log line, in seconds
+    _STATUS_LOG_SECONDS = 600
+
+    def __init__(self, 
+                 memory_threshold_mb: int = 7000,  # 7GB threshold for 8GB system
+                 cpu_threshold_percent: float = 90.0,
+                 disk_threshold_percent: float = 95.0,
+                 check_interval_seconds: int = 30):
+        """
+        Initialize system resource monitor
+
+        Args:
+            memory_threshold_mb: Memory threshold in MB before cleanup
+            cpu_threshold_percent: CPU threshold percentage before warning
+            disk_threshold_percent: Disk usage threshold before warning
+            check_interval_seconds: How often to check resources
+        """
+        self.memory_threshold_mb = memory_threshold_mb
+        self.cpu_threshold_percent = cpu_threshold_percent
+        self.disk_threshold_percent = disk_threshold_percent
+        self.check_interval = check_interval_seconds
+
+        self.monitoring = False
+        self.monitor_thread = None
+        # Set by stop_monitoring() to wake the monitor thread out of its wait
+        self._stop_event = threading.Event()
+
+        # Callbacks for resource events
+        self.memory_warning_callback: Optional[Callable] = None
+        self.cpu_warning_callback: Optional[Callable] = None
+        self.disk_warning_callback: Optional[Callable] = None
+        self.cleanup_callback: Optional[Callable] = None
+
+        # Resource history for trending (oldest first, capped)
+        self.resource_history = []
+        self.max_history_entries = 100
+        # Samples processed so far; unlike len(resource_history) this keeps
+        # growing after the history is capped, so it can pace the status log
+        self._samples_seen = 0
+
+        # Last warning times to prevent spam
+        self.last_memory_warning = datetime.min
+        self.last_cpu_warning = datetime.min
+        self.last_disk_warning = datetime.min
+        self.warning_cooldown = timedelta(minutes=5)
+
+    def start_monitoring(self):
+        """Start resource monitoring in background thread (no-op if already started)"""
+        if self.monitoring:
+            logger.warning("Resource monitoring already started")
+            return
+
+        self.monitoring = True
+        self._stop_event.clear()
+        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
+        self.monitor_thread.start()
+        logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")
+
+    def stop_monitoring(self):
+        """Stop resource monitoring and wait up to 5 seconds for the thread"""
+        self.monitoring = False
+        self._stop_event.set()
+        thread = self.monitor_thread
+        # A callback running on the monitor thread may call this method;
+        # a thread cannot join itself.
+        if thread and thread is not threading.current_thread():
+            thread.join(timeout=5)
+        logger.info("System resource monitoring stopped")
+
+    def set_callbacks(self, 
+                     memory_warning: Optional[Callable] = None,
+                     cpu_warning: Optional[Callable] = None,
+                     disk_warning: Optional[Callable] = None,
+                     cleanup: Optional[Callable] = None):
+        """Set callback functions for resource events
+
+        All four callbacks are replaced; an omitted one is reset to None.
+
+        Args:
+            memory_warning: Called with (used_mb, threshold_mb)
+            cpu_warning: Called with (cpu_percent, threshold_percent)
+            disk_warning: Called with (disk_percent, threshold_percent)
+            cleanup: Called without arguments during memory cleanup
+        """
+        self.memory_warning_callback = memory_warning
+        self.cpu_warning_callback = cpu_warning
+        self.disk_warning_callback = disk_warning
+        self.cleanup_callback = cleanup
+
+    def get_current_usage(self) -> Dict[str, Any]:
+        """Get current system resource usage
+
+        Blocks for about one second because CPU usage is measured over a
+        one-second interval.
+
+        Returns:
+            A dict with 'timestamp', 'memory', 'process_memory_mb',
+            'cpu_percent' and 'disk' entries, or an empty dict if reading
+            any of the values failed.
+        """
+        try:
+            mb = 1024 * 1024
+            gb = mb * 1024
+
+            memory = psutil.virtual_memory()
+            cpu_percent = psutil.cpu_percent(interval=1)
+            # Disk usage of the filesystem holding the current directory
+            disk = psutil.disk_usage('.')
+            # Resident memory of this Python process
+            process_memory_mb = psutil.Process().memory_info().rss / mb
+
+            return {
+                'timestamp': datetime.now(),
+                'memory': {
+                    'total_mb': memory.total / mb,
+                    'used_mb': memory.used / mb,
+                    'percent': memory.percent,
+                    'available_mb': memory.available / mb
+                },
+                'process_memory_mb': process_memory_mb,
+                'cpu_percent': cpu_percent,
+                'disk': {
+                    'total_gb': disk.total / gb,
+                    'used_gb': disk.used / gb,
+                    'percent': (disk.used / disk.total) * 100
+                }
+            }
+        except Exception as e:
+            logger.error(f"Error getting system usage: {e}")
+            return {}
+
+    def _status_log_every(self) -> int:
+        """Number of samples between two periodic status log lines"""
+        if self.check_interval <= 0:
+            return 1
+        return max(1, round(self._STATUS_LOG_SECONDS / self.check_interval))
+
+    def _process_sample(self, usage: Dict[str, Any]) -> None:
+        """Store one usage sample, check all thresholds and log status periodically"""
+        self.resource_history.append(usage)
+        overflow = len(self.resource_history) - self.max_history_entries
+        if overflow > 0:
+            del self.resource_history[:overflow]
+
+        self._check_memory_threshold(usage)
+        self._check_cpu_threshold(usage)
+        self._check_disk_threshold(usage)
+
+        # Roughly every 10 minutes, independent of the capped history length
+        self._samples_seen += 1
+        if self._samples_seen % self._status_log_every() == 0:
+            self._log_resource_status(usage)
+
+    def _monitor_loop(self):
+        """Main monitoring loop: sample, evaluate, wait for the next interval"""
+        logger.info("Resource monitoring loop started")
+
+        while self.monitoring:
+            try:
+                usage = self.get_current_usage()
+                # An empty dict means sampling failed; just try again later
+                if usage:
+                    self._process_sample(usage)
+            except Exception as e:
+                logger.error(f"Error in resource monitoring loop: {e}")
+
+            # Returns early (True) when stop_monitoring() sets the event
+            if self._stop_event.wait(self.check_interval):
+                break
+
+        logger.info("Resource monitoring loop stopped")
+
+    def _check_memory_threshold(self, usage: Dict[str, Any]):
+        """Warn, clean up and notify when used memory exceeds the threshold"""
+        memory_mb = usage.get('memory', {}).get('used_mb', 0)
+        if memory_mb <= self.memory_threshold_mb:
+            return
+
+        now = datetime.now()
+        if now - self.last_memory_warning <= self.warning_cooldown:
+            return
+
+        logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
+        self.last_memory_warning = now
+
+        self._trigger_memory_cleanup()
+
+        if self.memory_warning_callback:
+            try:
+                self.memory_warning_callback(memory_mb, self.memory_threshold_mb)
+            except Exception as e:
+                logger.error(f"Error in memory warning callback: {e}")
+
+    def _check_cpu_threshold(self, usage: Dict[str, Any]):
+        """Warn and notify when CPU usage exceeds the threshold"""
+        cpu_percent = usage.get('cpu_percent', 0)
+        if cpu_percent <= self.cpu_threshold_percent:
+            return
+
+        now = datetime.now()
+        if now - self.last_cpu_warning <= self.warning_cooldown:
+            return
+
+        logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
+        self.last_cpu_warning = now
+
+        if self.cpu_warning_callback:
+            try:
+                self.cpu_warning_callback(cpu_percent, self.cpu_threshold_percent)
+            except Exception as e:
+                logger.error(f"Error in CPU warning callback: {e}")
+
+    def _check_disk_threshold(self, usage: Dict[str, Any]):
+        """Warn and notify when disk usage exceeds the threshold"""
+        disk_percent = usage.get('disk', {}).get('percent', 0)
+        if disk_percent <= self.disk_threshold_percent:
+            return
+
+        now = datetime.now()
+        if now - self.last_disk_warning <= self.warning_cooldown:
+            return
+
+        logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
+        self.last_disk_warning = now
+
+        if self.disk_warning_callback:
+            try:
+                self.disk_warning_callback(disk_percent, self.disk_threshold_percent)
+            except Exception as e:
+                logger.error(f"Error in disk warning callback: {e}")
+
+    def _trigger_memory_cleanup(self):
+        """Run garbage collection and the cleanup callback, then log memory usage"""
+        logger.info("Triggering memory cleanup...")
+
+        # Force garbage collection
+        collected = gc.collect()
+        logger.info(f"Garbage collection freed {collected} objects")
+
+        # Call custom cleanup callback if set
+        if self.cleanup_callback:
+            try:
+                self.cleanup_callback()
+                logger.info("Custom cleanup callback executed")
+            except Exception as e:
+                logger.error(f"Error in cleanup callback: {e}")
+
+        # Log memory after cleanup
+        try:
+            usage_after = self.get_current_usage()
+            memory_after = usage_after.get('memory', {}).get('used_mb', 0)
+            logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
+        except Exception as e:
+            logger.error(f"Error checking memory after cleanup: {e}")
+
+    def _log_resource_status(self, usage: Dict[str, Any]):
+        """Log one summary line for the given usage sample"""
+        memory = usage.get('memory', {})
+        cpu = usage.get('cpu_percent', 0)
+        disk = usage.get('disk', {})
+        process_memory = usage.get('process_memory_mb', 0)
+
+        logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
+                   f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")
+
+    def get_resource_summary(self) -> Dict[str, Any]:
+        """Get resource usage summary
+
+        Returns:
+            An empty dict when no sample has been recorded yet; otherwise the
+            latest sample, averages over the last 10 samples, the configured
+            thresholds, the monitoring flag and the history length.
+        """
+        # Work on a snapshot; the monitor thread may append concurrently
+        history = list(self.resource_history)
+        if not history:
+            return {}
+
+        recent_usage = history[-10:]  # Last 10 entries
+        count = len(recent_usage)
+
+        avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / count
+        avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / count
+        avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / count
+
+        return {
+            'current': history[-1],
+            'averages': {
+                'memory_mb': avg_memory,
+                'cpu_percent': avg_cpu,
+                'disk_percent': avg_disk
+            },
+            'thresholds': {
+                'memory_mb': self.memory_threshold_mb,
+                'cpu_percent': self.cpu_threshold_percent,
+                'disk_percent': self.disk_threshold_percent
+            },
+            'monitoring': self.monitoring,
+            'history_entries': len(history)
+        }
+
+# Process-wide monitor, created lazily on first request
+_system_monitor = None
+
+def get_system_monitor() -> SystemResourceMonitor:
+    """Return the shared SystemResourceMonitor, creating it on the first call"""
+    global _system_monitor
+    monitor = _system_monitor
+    if monitor is None:
+        monitor = SystemResourceMonitor()
+        _system_monitor = monitor
+    return monitor
+
+def start_system_monitoring():
+    """Start monitoring on the global system monitor and return that monitor"""
+    shared_monitor = get_system_monitor()
+    shared_monitor.start_monitoring()
+    return shared_monitor