stability fixes, lower update frequency
utils/async_task_manager.py · 232 lines · new file
@@ -0,0 +1,232 @@
"""
Async Task Manager - Handles async tasks with comprehensive error handling
Prevents silent failures in async operations
"""

import asyncio
import logging
import functools
import traceback
from typing import Any, Callable, Optional, Dict, List
from datetime import datetime

logger = logging.getLogger(__name__)


class AsyncTaskManager:
    """Manage async tasks with error handling and monitoring"""

    def __init__(self):
        self.active_tasks: Dict[str, asyncio.Task] = {}
        self.completed_tasks: List[Dict[str, Any]] = []
        self.failed_tasks: List[Dict[str, Any]] = []
        self.max_history = 100

    def create_task_with_error_handling(self,
                                        coro: Any,
                                        name: str,
                                        error_callback: Optional[Callable] = None,
                                        success_callback: Optional[Callable] = None) -> asyncio.Task:
        """
        Create an async task with comprehensive error handling

        Args:
            coro: Coroutine to run
            name: Task name for identification
            error_callback: Called on error with (name, exception)
            success_callback: Called on success with (name, result)
        """

        async def wrapped_coro():
            """Wrapper coroutine with error handling"""
            start_time = datetime.now()
            try:
                logger.debug(f"Starting async task: {name}")
                result = await coro

                # Log success
                duration = (datetime.now() - start_time).total_seconds()
                logger.debug(f"Async task '{name}' completed successfully in {duration:.2f}s")

                # Store completion info (truncate long results; keep falsy results such as 0)
                completion_info = {
                    'name': name,
                    'status': 'completed',
                    'start_time': start_time,
                    'end_time': datetime.now(),
                    'duration': duration,
                    'result': str(result)[:200] if result is not None else None
                }
                self.completed_tasks.append(completion_info)

                # Trim history
                if len(self.completed_tasks) > self.max_history:
                    self.completed_tasks.pop(0)

                # Call success callback
                if success_callback:
                    try:
                        success_callback(name, result)
                    except Exception as cb_error:
                        logger.error(f"Error in success callback for task '{name}': {cb_error}")

                return result

            except asyncio.CancelledError:
                logger.info(f"Async task '{name}' was cancelled")
                raise

            except Exception as e:
                # Log error with full traceback
                duration = (datetime.now() - start_time).total_seconds()
                logger.error(f"Async task '{name}' failed after {duration:.2f}s: {e}")
                logger.error(f"Task '{name}' traceback: {traceback.format_exc()}")

                # Store failure info (error_type enables pattern counting in get_failure_summary)
                failure_info = {
                    'name': name,
                    'status': 'failed',
                    'start_time': start_time,
                    'end_time': datetime.now(),
                    'duration': duration,
                    'error': str(e),
                    'error_type': type(e).__name__,
                    'traceback': traceback.format_exc()
                }
                self.failed_tasks.append(failure_info)

                # Trim history
                if len(self.failed_tasks) > self.max_history:
                    self.failed_tasks.pop(0)

                # Call error callback
                if error_callback:
                    try:
                        error_callback(name, e)
                    except Exception as cb_error:
                        logger.error(f"Error in error callback for task '{name}': {cb_error}")

                # Don't re-raise, to avoid crashing the event loop;
                # return None to signal failure to the awaiter
                return None

            finally:
                # Remove from active tasks
                if name in self.active_tasks:
                    del self.active_tasks[name]

        # Create and store task
        task = asyncio.create_task(wrapped_coro(), name=name)
        self.active_tasks[name] = task

        return task

    def cancel_task(self, name: str) -> bool:
        """Cancel a specific task"""
        if name in self.active_tasks:
            task = self.active_tasks[name]
            if not task.done():
                task.cancel()
                logger.info(f"Cancelled async task: {name}")
                return True
        return False

    def cancel_all_tasks(self):
        """Cancel all active tasks"""
        for name, task in list(self.active_tasks.items()):
            if not task.done():
                task.cancel()
                logger.info(f"Cancelled async task: {name}")

    def get_task_status(self) -> Dict[str, Any]:
        """Get status of all tasks"""
        active_count = len(self.active_tasks)
        completed_count = len(self.completed_tasks)
        failed_count = len(self.failed_tasks)

        # Get recent failures
        recent_failures = self.failed_tasks[-5:]

        return {
            'active_tasks': active_count,
            'completed_tasks': completed_count,
            'failed_tasks': failed_count,
            'active_task_names': list(self.active_tasks.keys()),
            'recent_failures': [
                {
                    'name': f['name'],
                    'error': f['error'],
                    'duration': f['duration'],
                    'time': f['end_time'].strftime('%H:%M:%S')
                }
                for f in recent_failures
            ]
        }

    def get_failure_summary(self) -> Dict[str, Any]:
        """Get summary of task failures"""
        if not self.failed_tasks:
            return {'total_failures': 0, 'failure_patterns': {}}

        # Count failures by exception type (recorded when the failure was stored)
        error_counts = {}
        for failure in self.failed_tasks:
            error_type = failure.get('error_type', 'Unknown')
            error_counts[error_type] = error_counts.get(error_type, 0) + 1

        # Recent failure rate (last hour)
        recent_failures = [f for f in self.failed_tasks if
                           (datetime.now() - f['end_time']).total_seconds() < 3600]

        return {
            'total_failures': len(self.failed_tasks),
            'recent_failures_1h': len(recent_failures),
            'failure_patterns': error_counts,
            'most_common_error': max(error_counts.items(), key=lambda x: x[1])[0] if error_counts else None
        }


# Global instance
_task_manager = None


def get_async_task_manager() -> AsyncTaskManager:
    """Get global async task manager instance"""
    global _task_manager
    if _task_manager is None:
        _task_manager = AsyncTaskManager()
    return _task_manager


def create_safe_task(coro: Any,
                     name: str,
                     error_callback: Optional[Callable] = None,
                     success_callback: Optional[Callable] = None) -> asyncio.Task:
    """
    Create a safe async task with error handling

    Args:
        coro: Coroutine to run
        name: Task name for identification
        error_callback: Called on error with (name, exception)
        success_callback: Called on success with (name, result)
    """
    manager = get_async_task_manager()
    return manager.create_task_with_error_handling(coro, name, error_callback, success_callback)


def safe_async_wrapper(name: str,
                       error_callback: Optional[Callable] = None,
                       success_callback: Optional[Callable] = None):
    """
    Decorator for creating safe async functions

    Usage:
        @safe_async_wrapper("my_task")
        async def my_async_function():
            # Your async code here
            pass
    """
    def decorator(func):
        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            coro = func(*args, **kwargs)
            task = create_safe_task(coro, name, error_callback, success_callback)
            return await task
        return wrapper
    return decorator
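
A minimal usage sketch for the manager above (not part of the commit), assuming the module is importable as utils.async_task_manager; the fetch_price coroutine and on_error callback are hypothetical names for illustration:

import asyncio

from utils.async_task_manager import create_safe_task, get_async_task_manager

async def fetch_price() -> float:
    # Hypothetical workload standing in for a real async job
    await asyncio.sleep(0.1)
    return 42.0

def on_error(name: str, exc: Exception) -> None:
    # Receives (name, exception), matching the error_callback contract above
    print(f"task {name} failed: {exc}")

async def main() -> None:
    # Must run inside the event loop, since asyncio.create_task is used internally
    task = create_safe_task(fetch_price(), "fetch_price", error_callback=on_error)
    result = await task  # the wrapper returns None on failure instead of raising
    print(result)
    print(get_async_task_manager().get_task_status())

asyncio.run(main())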
utils/process_supervisor.py · 340 lines · new file
@@ -0,0 +1,340 @@
"""
Process Supervisor - Handles process monitoring, restarts, and supervision
Prevents silent failures by monitoring process health and restarting on crashes
"""

import subprocess
import threading
import time
import logging
import os
import sys
from typing import Dict, Any, Optional, Callable, List
from datetime import datetime

logger = logging.getLogger(__name__)


class ProcessSupervisor:
    """Supervise processes and restart them on failure"""

    def __init__(self, max_restarts: int = 5, restart_delay: int = 10):
        """
        Initialize process supervisor

        Args:
            max_restarts: Maximum number of restarts before giving up
            restart_delay: Delay in seconds between restarts
        """
        self.max_restarts = max_restarts
        self.restart_delay = restart_delay

        self.processes: Dict[str, Dict[str, Any]] = {}
        self.monitoring = False
        self.monitor_thread = None

        # Callbacks
        self.process_started_callback: Optional[Callable] = None
        self.process_failed_callback: Optional[Callable] = None
        self.process_restarted_callback: Optional[Callable] = None

    def add_process(self, name: str, command: List[str],
                    working_dir: Optional[str] = None,
                    env: Optional[Dict[str, str]] = None,
                    auto_restart: bool = True):
        """
        Add a process to supervise

        Args:
            name: Process name
            command: Command to run, as an argument list
            working_dir: Working directory
            env: Extra environment variables (merged over os.environ)
            auto_restart: Whether to auto-restart on failure
        """
        self.processes[name] = {
            'command': command,
            'working_dir': working_dir,
            'env': env,
            'auto_restart': auto_restart,
            'process': None,
            'restart_count': 0,
            'last_start': None,
            'last_failure': None,
            'status': 'stopped'
        }
        logger.info(f"Added process '{name}' to supervisor")

    def start_process(self, name: str) -> bool:
        """Start a specific process"""
        if name not in self.processes:
            logger.error(f"Process '{name}' not found")
            return False

        proc_info = self.processes[name]

        if proc_info['process'] and proc_info['process'].poll() is None:
            logger.warning(f"Process '{name}' is already running")
            return True

        try:
            # Prepare environment
            env = os.environ.copy()
            if proc_info['env']:
                env.update(proc_info['env'])

            # Start process. NOTE: output is only drained after exit (see
            # _check_process_health), so very chatty children can fill the
            # pipe buffer and stall.
            process = subprocess.Popen(
                proc_info['command'],
                cwd=proc_info['working_dir'],
                env=env,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                text=True
            )

            proc_info['process'] = process
            proc_info['last_start'] = datetime.now()
            proc_info['status'] = 'running'

            logger.info(f"Started process '{name}' (PID: {process.pid})")

            if self.process_started_callback:
                try:
                    self.process_started_callback(name, process.pid)
                except Exception as e:
                    logger.error(f"Error in process started callback: {e}")

            return True

        except Exception as e:
            logger.error(f"Failed to start process '{name}': {e}")
            proc_info['status'] = 'failed'
            proc_info['last_failure'] = datetime.now()
            return False

    def stop_process(self, name: str, timeout: int = 10) -> bool:
        """Stop a specific process"""
        if name not in self.processes:
            logger.error(f"Process '{name}' not found")
            return False

        proc_info = self.processes[name]
        process = proc_info['process']

        if not process or process.poll() is not None:
            logger.info(f"Process '{name}' is not running")
            proc_info['status'] = 'stopped'
            return True

        try:
            # Try graceful shutdown first
            process.terminate()

            # Wait for graceful shutdown
            try:
                process.wait(timeout=timeout)
                logger.info(f"Process '{name}' terminated gracefully")
            except subprocess.TimeoutExpired:
                # Force kill if graceful shutdown fails
                logger.warning(f"Process '{name}' did not terminate gracefully, force killing")
                process.kill()
                process.wait()
                logger.info(f"Process '{name}' force killed")

            proc_info['status'] = 'stopped'
            return True

        except Exception as e:
            logger.error(f"Error stopping process '{name}': {e}")
            return False

    def restart_process(self, name: str) -> bool:
        """Restart a specific process"""
        logger.info(f"Restarting process '{name}'")

        if name not in self.processes:
            logger.error(f"Process '{name}' not found")
            return False

        proc_info = self.processes[name]

        # Stop if running
        if proc_info['process'] and proc_info['process'].poll() is None:
            self.stop_process(name)

        # Wait restart delay
        time.sleep(self.restart_delay)

        # Increment restart count
        proc_info['restart_count'] += 1

        # Check restart limit
        if proc_info['restart_count'] > self.max_restarts:
            logger.error(f"Process '{name}' exceeded max restarts ({self.max_restarts})")
            proc_info['status'] = 'failed_max_restarts'
            return False

        # Start process
        success = self.start_process(name)

        if success and self.process_restarted_callback:
            try:
                self.process_restarted_callback(name, proc_info['restart_count'])
            except Exception as e:
                logger.error(f"Error in process restarted callback: {e}")

        return success

    def start_monitoring(self):
        """Start process monitoring"""
        if self.monitoring:
            logger.warning("Process monitoring already started")
            return

        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info("Process monitoring started")

    def stop_monitoring(self):
        """Stop process monitoring"""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5)
        logger.info("Process monitoring stopped")

    def _monitor_loop(self):
        """Main monitoring loop"""
        logger.info("Process monitoring loop started")

        while self.monitoring:
            try:
                for name, proc_info in self.processes.items():
                    self._check_process_health(name, proc_info)

                time.sleep(5)  # Check every 5 seconds

            except Exception as e:
                logger.error(f"Error in process monitoring loop: {e}")
                time.sleep(5)

        logger.info("Process monitoring loop stopped")

    def _check_process_health(self, name: str, proc_info: Dict[str, Any]):
        """Check health of a specific process"""
        process = proc_info['process']

        if not process:
            return

        # Check if process is still running
        return_code = process.poll()

        if return_code is not None:
            # Process has exited. Clear the handle so the next monitor pass
            # does not re-detect the same exit and schedule a second restart.
            proc_info['process'] = None
            proc_info['status'] = 'exited'
            proc_info['last_failure'] = datetime.now()

            logger.warning(f"Process '{name}' exited with code {return_code}")

            # Read stdout/stderr for debugging (last 500 chars of each)
            try:
                stdout, stderr = process.communicate(timeout=1)
                if stdout:
                    logger.info(f"Process '{name}' stdout: {stdout[-500:]}")
                if stderr:
                    logger.error(f"Process '{name}' stderr: {stderr[-500:]}")
            except Exception as e:
                logger.warning(f"Could not read process output: {e}")

            if self.process_failed_callback:
                try:
                    self.process_failed_callback(name, return_code)
                except Exception as e:
                    logger.error(f"Error in process failed callback: {e}")

            # Auto-restart if enabled (in a thread, so restart_delay doesn't block the loop)
            if proc_info['auto_restart'] and proc_info['restart_count'] < self.max_restarts:
                logger.info(f"Auto-restarting process '{name}'")
                threading.Thread(target=self.restart_process, args=(name,), daemon=True).start()

    def get_process_status(self, name: str) -> Optional[Dict[str, Any]]:
        """Get status of a specific process"""
        if name not in self.processes:
            return None

        proc_info = self.processes[name]
        process = proc_info['process']

        status = {
            'name': name,
            'status': proc_info['status'],
            'restart_count': proc_info['restart_count'],
            'last_start': proc_info['last_start'],
            'last_failure': proc_info['last_failure'],
            'auto_restart': proc_info['auto_restart'],
            'pid': process.pid if process and process.poll() is None else None,
            'running': process is not None and process.poll() is None
        }

        return status

    def get_all_status(self) -> Dict[str, Dict[str, Any]]:
        """Get status of all processes"""
        return {name: self.get_process_status(name) for name in self.processes}

    def set_callbacks(self,
                      process_started: Optional[Callable] = None,
                      process_failed: Optional[Callable] = None,
                      process_restarted: Optional[Callable] = None):
        """Set callback functions for process events"""
        self.process_started_callback = process_started
        self.process_failed_callback = process_failed
        self.process_restarted_callback = process_restarted

    def shutdown_all(self):
        """Shutdown all processes"""
        logger.info("Shutting down all supervised processes")

        for name in list(self.processes.keys()):
            self.stop_process(name)

        self.stop_monitoring()


# Global instance
_process_supervisor = None


def get_process_supervisor() -> ProcessSupervisor:
    """Get global process supervisor instance"""
    global _process_supervisor
    if _process_supervisor is None:
        _process_supervisor = ProcessSupervisor()
    return _process_supervisor


def create_supervised_dashboard_runner():
    """Create a supervised version of the dashboard runner"""
    supervisor = get_process_supervisor()

    # Add dashboard process
    supervisor.add_process(
        name="clean_dashboard",
        command=[sys.executable, "run_clean_dashboard.py"],
        working_dir=os.getcwd(),
        auto_restart=True
    )

    # Set up callbacks
    def on_process_failed(name: str, return_code: int):
        logger.error(f"Dashboard process failed with code {return_code}")

    def on_process_restarted(name: str, restart_count: int):
        logger.info(f"Dashboard restarted (attempt {restart_count})")

    supervisor.set_callbacks(
        process_failed=on_process_failed,
        process_restarted=on_process_restarted
    )

    return supervisor
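
A minimal supervision sketch (not part of the commit), assuming utils.process_supervisor is on the import path; the "worker" name and its command are placeholders:

import time

from utils.process_supervisor import get_process_supervisor

supervisor = get_process_supervisor()
# Placeholder child process: sleeps 30s, then exits
supervisor.add_process("worker", ["python", "-c", "import time; time.sleep(30)"])
supervisor.start_process("worker")
supervisor.start_monitoring()  # health checks every 5s; exits trigger auto-restart

time.sleep(15)
print(supervisor.get_all_status())
supervisor.shutdown_all()  # stops the child and the monitor thread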
utils/system_monitor.py · 288 lines · new file
@@ -0,0 +1,288 @@
"""
System Resource Monitor - Prevents resource exhaustion and silent failures
Monitors memory, CPU, and disk usage to prevent system crashes
"""

import psutil
import logging
import threading
import time
import gc
from typing import Dict, Any, Optional, Callable
from datetime import datetime, timedelta

logger = logging.getLogger(__name__)


class SystemResourceMonitor:
    """Monitor system resources and prevent exhaustion"""

    def __init__(self,
                 memory_threshold_mb: int = 7000,  # 7GB threshold for an 8GB system
                 cpu_threshold_percent: float = 90.0,
                 disk_threshold_percent: float = 95.0,
                 check_interval_seconds: int = 30):
        """
        Initialize system resource monitor

        Args:
            memory_threshold_mb: Memory threshold in MB before cleanup
            cpu_threshold_percent: CPU threshold percentage before warning
            disk_threshold_percent: Disk usage threshold before warning
            check_interval_seconds: How often to check resources
        """
        self.memory_threshold_mb = memory_threshold_mb
        self.cpu_threshold_percent = cpu_threshold_percent
        self.disk_threshold_percent = disk_threshold_percent
        self.check_interval = check_interval_seconds

        self.monitoring = False
        self.monitor_thread = None

        # Callbacks for resource events
        self.memory_warning_callback: Optional[Callable] = None
        self.cpu_warning_callback: Optional[Callable] = None
        self.disk_warning_callback: Optional[Callable] = None
        self.cleanup_callback: Optional[Callable] = None

        # Resource history for trending
        self.resource_history = []
        self.max_history_entries = 100

        # Last warning times to prevent spam
        self.last_memory_warning = datetime.min
        self.last_cpu_warning = datetime.min
        self.last_disk_warning = datetime.min
        self.warning_cooldown = timedelta(minutes=5)

    def start_monitoring(self):
        """Start resource monitoring in background thread"""
        if self.monitoring:
            logger.warning("Resource monitoring already started")
            return

        self.monitoring = True
        self.monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self.monitor_thread.start()
        logger.info(f"System resource monitoring started (memory threshold: {self.memory_threshold_mb}MB)")

    def stop_monitoring(self):
        """Stop resource monitoring"""
        self.monitoring = False
        if self.monitor_thread:
            self.monitor_thread.join(timeout=5)
        logger.info("System resource monitoring stopped")

    def set_callbacks(self,
                      memory_warning: Optional[Callable] = None,
                      cpu_warning: Optional[Callable] = None,
                      disk_warning: Optional[Callable] = None,
                      cleanup: Optional[Callable] = None):
        """Set callback functions for resource events"""
        self.memory_warning_callback = memory_warning
        self.cpu_warning_callback = cpu_warning
        self.disk_warning_callback = disk_warning
        self.cleanup_callback = cleanup

    def get_current_usage(self) -> Dict[str, Any]:
        """Get current system resource usage"""
        try:
            # Memory usage
            memory = psutil.virtual_memory()
            memory_mb = memory.used / (1024 * 1024)
            memory_percent = memory.percent

            # CPU usage (blocks for the 1s sampling interval)
            cpu_percent = psutil.cpu_percent(interval=1)

            # Disk usage (filesystem of the current directory)
            disk = psutil.disk_usage('.')
            disk_percent = (disk.used / disk.total) * 100

            # Process-specific info
            process = psutil.Process()
            process_memory_mb = process.memory_info().rss / (1024 * 1024)

            return {
                'timestamp': datetime.now(),
                'memory': {
                    'total_mb': memory.total / (1024 * 1024),
                    'used_mb': memory_mb,
                    'percent': memory_percent,
                    'available_mb': memory.available / (1024 * 1024)
                },
                'process_memory_mb': process_memory_mb,
                'cpu_percent': cpu_percent,
                'disk': {
                    'total_gb': disk.total / (1024 * 1024 * 1024),
                    'used_gb': disk.used / (1024 * 1024 * 1024),
                    'percent': disk_percent
                }
            }
        except Exception as e:
            logger.error(f"Error getting system usage: {e}")
            return {}

    def _monitor_loop(self):
        """Main monitoring loop"""
        logger.info("Resource monitoring loop started")

        while self.monitoring:
            try:
                usage = self.get_current_usage()
                if not usage:
                    time.sleep(self.check_interval)
                    continue

                # Store in history
                self.resource_history.append(usage)
                if len(self.resource_history) > self.max_history_entries:
                    self.resource_history.pop(0)

                # Check thresholds
                self._check_memory_threshold(usage)
                self._check_cpu_threshold(usage)
                self._check_disk_threshold(usage)

                # Log periodic status (every 20 checks = 10 minutes at the 30s interval)
                if len(self.resource_history) % 20 == 0:
                    self._log_resource_status(usage)

            except Exception as e:
                logger.error(f"Error in resource monitoring loop: {e}")

            time.sleep(self.check_interval)

        logger.info("Resource monitoring loop stopped")

    def _check_memory_threshold(self, usage: Dict[str, Any]):
        """Check memory usage threshold"""
        memory_mb = usage.get('memory', {}).get('used_mb', 0)

        if memory_mb > self.memory_threshold_mb:
            now = datetime.now()
            if now - self.last_memory_warning > self.warning_cooldown:
                logger.warning(f"HIGH MEMORY USAGE: {memory_mb:.1f}MB / {self.memory_threshold_mb}MB threshold")
                self.last_memory_warning = now

                # Trigger cleanup
                self._trigger_memory_cleanup()

                # Call callback if set
                if self.memory_warning_callback:
                    try:
                        self.memory_warning_callback(memory_mb, self.memory_threshold_mb)
                    except Exception as e:
                        logger.error(f"Error in memory warning callback: {e}")

    def _check_cpu_threshold(self, usage: Dict[str, Any]):
        """Check CPU usage threshold"""
        cpu_percent = usage.get('cpu_percent', 0)

        if cpu_percent > self.cpu_threshold_percent:
            now = datetime.now()
            if now - self.last_cpu_warning > self.warning_cooldown:
                logger.warning(f"HIGH CPU USAGE: {cpu_percent:.1f}% / {self.cpu_threshold_percent}% threshold")
                self.last_cpu_warning = now

                if self.cpu_warning_callback:
                    try:
                        self.cpu_warning_callback(cpu_percent, self.cpu_threshold_percent)
                    except Exception as e:
                        logger.error(f"Error in CPU warning callback: {e}")

    def _check_disk_threshold(self, usage: Dict[str, Any]):
        """Check disk usage threshold"""
        disk_percent = usage.get('disk', {}).get('percent', 0)

        if disk_percent > self.disk_threshold_percent:
            now = datetime.now()
            if now - self.last_disk_warning > self.warning_cooldown:
                logger.warning(f"HIGH DISK USAGE: {disk_percent:.1f}% / {self.disk_threshold_percent}% threshold")
                self.last_disk_warning = now

                if self.disk_warning_callback:
                    try:
                        self.disk_warning_callback(disk_percent, self.disk_threshold_percent)
                    except Exception as e:
                        logger.error(f"Error in disk warning callback: {e}")

    def _trigger_memory_cleanup(self):
        """Trigger memory cleanup procedures"""
        logger.info("Triggering memory cleanup...")

        # Force garbage collection
        collected = gc.collect()
        logger.info(f"Garbage collection freed {collected} objects")

        # Call custom cleanup callback if set
        if self.cleanup_callback:
            try:
                self.cleanup_callback()
                logger.info("Custom cleanup callback executed")
            except Exception as e:
                logger.error(f"Error in cleanup callback: {e}")

        # Log memory after cleanup
        try:
            usage_after = self.get_current_usage()
            memory_after = usage_after.get('memory', {}).get('used_mb', 0)
            logger.info(f"Memory after cleanup: {memory_after:.1f}MB")
        except Exception as e:
            logger.error(f"Error checking memory after cleanup: {e}")

    def _log_resource_status(self, usage: Dict[str, Any]):
        """Log current resource status"""
        memory = usage.get('memory', {})
        cpu = usage.get('cpu_percent', 0)
        disk = usage.get('disk', {})
        process_memory = usage.get('process_memory_mb', 0)

        logger.info(f"RESOURCE STATUS - Memory: {memory.get('used_mb', 0):.1f}MB ({memory.get('percent', 0):.1f}%), "
                    f"Process: {process_memory:.1f}MB, CPU: {cpu:.1f}%, Disk: {disk.get('percent', 0):.1f}%")

    def get_resource_summary(self) -> Dict[str, Any]:
        """Get resource usage summary"""
        if not self.resource_history:
            return {}

        recent_usage = self.resource_history[-10:]  # Last 10 entries

        # Calculate averages
        avg_memory = sum(u.get('memory', {}).get('used_mb', 0) for u in recent_usage) / len(recent_usage)
        avg_cpu = sum(u.get('cpu_percent', 0) for u in recent_usage) / len(recent_usage)
        avg_disk = sum(u.get('disk', {}).get('percent', 0) for u in recent_usage) / len(recent_usage)

        return {
            'current': self.resource_history[-1],
            'averages': {
                'memory_mb': avg_memory,
                'cpu_percent': avg_cpu,
                'disk_percent': avg_disk
            },
            'thresholds': {
                'memory_mb': self.memory_threshold_mb,
                'cpu_percent': self.cpu_threshold_percent,
                'disk_percent': self.disk_threshold_percent
            },
            'monitoring': self.monitoring,
            'history_entries': len(self.resource_history)
        }


# Global instance
_system_monitor = None


def get_system_monitor() -> SystemResourceMonitor:
    """Get global system monitor instance"""
    global _system_monitor
    if _system_monitor is None:
        _system_monitor = SystemResourceMonitor()
    return _system_monitor


def start_system_monitoring():
    """Start system monitoring with default settings"""
    monitor = get_system_monitor()
    monitor.start_monitoring()
    return monitor
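
And a minimal monitoring sketch for the resource monitor (again not part of the commit, assuming the utils package import path); the on_memory handler is hypothetical:

import time

from utils.system_monitor import get_system_monitor

monitor = get_system_monitor()

def on_memory(used_mb: float, threshold_mb: float) -> None:
    # Matches the (memory_mb, memory_threshold_mb) callback signature;
    # fires at most once per 5-minute cooldown
    print(f"memory high: {used_mb:.0f}MB of {threshold_mb}MB threshold")

monitor.set_callbacks(memory_warning=on_memory)
monitor.start_monitoring()
time.sleep(65)  # allow a couple of 30s checks (each blocks ~1s for CPU sampling)
print(monitor.get_resource_summary())
monitor.stop_monitoring()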