""" Performance monitoring dashboard and real-time performance tracking. """ import time import asyncio import threading from typing import Dict, List, Optional, Any, Callable from collections import defaultdict, deque from datetime import datetime, timezone, timedelta from dataclasses import dataclass, field from ..utils.logging import get_logger from ..utils.timing import get_current_timestamp from .metrics_collector import MetricsCollector logger = get_logger(__name__) @dataclass class PerformanceAlert: """Performance alert definition""" name: str metric_name: str threshold: float comparison: str # 'gt', 'lt', 'eq' duration: int # seconds message: str severity: str = 'warning' # 'info', 'warning', 'critical' triggered_at: Optional[datetime] = None resolved_at: Optional[datetime] = None def is_triggered(self, value: float) -> bool: """Check if alert should be triggered""" if self.comparison == 'gt': return value > self.threshold elif self.comparison == 'lt': return value < self.threshold elif self.comparison == 'eq': return abs(value - self.threshold) < 0.001 return False @dataclass class PerformanceThresholds: """Performance threshold configuration""" max_cpu_usage: float = 80.0 # % max_memory_usage: float = 85.0 # % min_memory_available: float = 1.0 # GB max_latency_ms: float = 100.0 # milliseconds max_error_rate: float = 5.0 # % min_throughput: float = 100.0 # operations/second class PerformanceMonitor: """ Real-time performance monitoring with alerting and dashboard data. Monitors system performance, tracks KPIs, and provides alerts. """ def __init__(self, metrics_collector: MetricsCollector = None): """ Initialize performance monitor. Args: metrics_collector: Metrics collector instance """ if metrics_collector is None: from .metrics_collector import metrics_collector as default_collector self.metrics_collector = default_collector else: self.metrics_collector = metrics_collector self.thresholds = PerformanceThresholds() # Alert management self.alerts: Dict[str, PerformanceAlert] = {} self.active_alerts: Dict[str, PerformanceAlert] = {} self.alert_history: deque = deque(maxlen=1000) # Performance tracking self.performance_history: deque = deque(maxlen=10000) self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000)) # Monitoring control self._monitoring = False self._monitor_thread: Optional[threading.Thread] = None self._monitor_interval = 5.0 # seconds # Alert callbacks self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = [] # Initialize default alerts self._setup_default_alerts() logger.info("Performance monitor initialized") def _setup_default_alerts(self) -> None: """Setup default performance alerts""" default_alerts = [ PerformanceAlert( name="high_cpu_usage", metric_name="system_cpu_usage", threshold=self.thresholds.max_cpu_usage, comparison="gt", duration=30, message="CPU usage is above {threshold}%", severity="warning" ), PerformanceAlert( name="high_memory_usage", metric_name="system_memory_usage", threshold=self.thresholds.max_memory_usage, comparison="gt", duration=30, message="Memory usage is above {threshold}%", severity="warning" ), PerformanceAlert( name="low_memory_available", metric_name="system_memory_available_gb", threshold=self.thresholds.min_memory_available, comparison="lt", duration=60, message="Available memory is below {threshold}GB", severity="critical" ), PerformanceAlert( name="high_latency", metric_name="processing_latency_ms", threshold=self.thresholds.max_latency_ms, comparison="gt", duration=60, message="Processing 


@dataclass
class PerformanceThresholds:
    """Performance threshold configuration"""
    max_cpu_usage: float = 80.0  # %
    max_memory_usage: float = 85.0  # %
    min_memory_available: float = 1.0  # GB
    max_latency_ms: float = 100.0  # milliseconds
    max_error_rate: float = 5.0  # %
    min_throughput: float = 100.0  # operations/second


class PerformanceMonitor:
    """
    Real-time performance monitoring with alerting and dashboard data.

    Monitors system performance, tracks KPIs, and provides alerts.
    """

    def __init__(self, metrics_collector: Optional[MetricsCollector] = None):
        """
        Initialize performance monitor.

        Args:
            metrics_collector: Metrics collector instance
        """
        if metrics_collector is None:
            from .metrics_collector import metrics_collector as default_collector
            self.metrics_collector = default_collector
        else:
            self.metrics_collector = metrics_collector

        self.thresholds = PerformanceThresholds()

        # Alert management
        self.alerts: Dict[str, PerformanceAlert] = {}
        self.active_alerts: Dict[str, PerformanceAlert] = {}
        self.alert_history: deque = deque(maxlen=1000)

        # Performance tracking
        self.performance_history: deque = deque(maxlen=10000)
        self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))

        # Monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._monitor_interval = 5.0  # seconds

        # Alert callbacks
        self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = []

        # Initialize default alerts
        self._setup_default_alerts()

        logger.info("Performance monitor initialized")

    def _setup_default_alerts(self) -> None:
        """Setup default performance alerts"""
        default_alerts = [
            PerformanceAlert(
                name="high_cpu_usage",
                metric_name="system_cpu_usage",
                threshold=self.thresholds.max_cpu_usage,
                comparison="gt",
                duration=30,
                message="CPU usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="high_memory_usage",
                metric_name="system_memory_usage",
                threshold=self.thresholds.max_memory_usage,
                comparison="gt",
                duration=30,
                message="Memory usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="low_memory_available",
                metric_name="system_memory_available_gb",
                threshold=self.thresholds.min_memory_available,
                comparison="lt",
                duration=60,
                message="Available memory is below {threshold}GB",
                severity="critical"
            ),
            PerformanceAlert(
                name="high_latency",
                metric_name="processing_latency_ms",
                threshold=self.thresholds.max_latency_ms,
                comparison="gt",
                duration=60,
                message="Processing latency is above {threshold}ms",
                severity="warning"
            )
        ]

        for alert in default_alerts:
            self.add_alert(alert)

    def start_monitoring(self) -> None:
        """Start performance monitoring"""
        if self._monitoring:
            logger.warning("Performance monitoring already running")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="PerformanceMonitor",
            daemon=True
        )
        self._monitor_thread.start()

        logger.info("Started performance monitoring")

    def stop_monitoring(self) -> None:
        """Stop performance monitoring"""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)

        logger.info("Stopped performance monitoring")

    def _monitoring_loop(self) -> None:
        """Main monitoring loop"""
        while self._monitoring:
            try:
                # Collect current performance data
                self._collect_performance_data()

                # Check alerts
                self._check_alerts()

                # Update KPIs
                self._update_kpis()

                time.sleep(self._monitor_interval)

            except Exception as e:
                logger.error(f"Error in performance monitoring loop: {e}")
                time.sleep(self._monitor_interval)

    def _collect_performance_data(self) -> None:
        """Collect current performance data"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()
            if current_metrics is None:
                return

            # Create performance snapshot
            performance_data = {
                'timestamp': get_current_timestamp(),
                'system': current_metrics.get('system', {}),
                'counters': current_metrics.get('counters', {}),
                'gauges': current_metrics.get('gauges', {}),
                'histograms': current_metrics.get('histograms', {})
            }

            self.performance_history.append(performance_data)

        except Exception as e:
            logger.error(f"Error collecting performance data: {e}")

    def _check_alerts(self) -> None:
        """Check all alerts against current metrics"""
        if self.metrics_collector is None:
            return

        current_metrics = self.metrics_collector.get_current_metrics()
        if current_metrics is None:
            return

        current_time = get_current_timestamp()

        for alert_name, alert in self.alerts.items():
            try:
                # Get metric value
                metric_value = self._get_metric_value(alert.metric_name, current_metrics)
                if metric_value is None:
                    continue

                # Check if alert should be triggered
                should_trigger = alert.is_triggered(metric_value)

                if should_trigger and alert_name not in self.active_alerts:
                    # Trigger alert
                    alert.triggered_at = current_time
                    self.active_alerts[alert_name] = alert
                    self.alert_history.append(alert)

                    # Format message
                    message = alert.message.format(
                        threshold=alert.threshold,
                        value=metric_value
                    )

                    logger.warning(f"Performance alert triggered: {alert.name} - {message}")

                    # Notify callbacks
                    for callback in self.alert_callbacks:
                        try:
                            callback(alert)
                        except Exception as e:
                            logger.error(f"Error in alert callback: {e}")

                elif not should_trigger and alert_name in self.active_alerts:
                    # Resolve alert
                    resolved_alert = self.active_alerts.pop(alert_name)
                    resolved_alert.resolved_at = current_time

                    logger.info(f"Performance alert resolved: {alert.name}")

            except Exception as e:
                logger.error(f"Error checking alert {alert_name}: {e}")
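
    # Sketch of an alert callback (illustrative, not part of this module):
    # callbacks registered via register_alert_callback() are invoked
    # synchronously from _check_alerts() on the monitoring thread when an alert
    # first trips; exceptions they raise are caught and logged.
    #
    #   def page_on_alert(alert: PerformanceAlert) -> None:   # hypothetical sink
    #       print(f"[{alert.severity}] {alert.name}: triggered at {alert.triggered_at}")
    #
    #   monitor.register_alert_callback(page_on_alert)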

    def _get_metric_value(self, metric_name: str, metrics: Dict[str, Any]) -> Optional[float]:
        """Get metric value from metrics data"""
        if not metrics:
            return None

        # Check gauges first
        gauges = metrics.get('gauges', {})
        if gauges and metric_name in gauges:
            return gauges[metric_name]

        # Check counters
        counters = metrics.get('counters', {})
        if counters and metric_name in counters:
            return counters[metric_name]

        # Check histograms (use average)
        histograms = metrics.get('histograms', {})
        if histograms and metric_name in histograms:
            hist_data = histograms[metric_name]
            if hist_data and isinstance(hist_data, dict):
                return hist_data.get('avg', 0)

        # Check system metrics
        system_metrics = metrics.get('system', {})
        if system_metrics and metric_name in system_metrics:
            return system_metrics[metric_name]

        return None

    def _update_kpis(self) -> None:
        """Update key performance indicators"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()
            if current_metrics is None:
                return

            timestamp = get_current_timestamp()

            # Calculate throughput (operations per second)
            throughput = self._calculate_throughput()
            self.kpi_history['throughput_ops_per_sec'].append({
                'value': throughput,
                'timestamp': timestamp
            })

            # Calculate error rate
            error_rate = self._calculate_error_rate()
            self.kpi_history['error_rate_percent'].append({
                'value': error_rate,
                'timestamp': timestamp
            })

            # Calculate average latency
            avg_latency = self._calculate_average_latency()
            self.kpi_history['avg_latency_ms'].append({
                'value': avg_latency,
                'timestamp': timestamp
            })

            # Update metrics collector with KPIs
            self.metrics_collector.set_gauge('kpi_throughput_ops_per_sec', throughput)
            self.metrics_collector.set_gauge('kpi_error_rate_percent', error_rate)
            self.metrics_collector.set_gauge('kpi_avg_latency_ms', avg_latency)

        except Exception as e:
            logger.error(f"Error updating KPIs: {e}")

    def _calculate_throughput(self) -> float:
        """Calculate operations per second throughput"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            counters = current_metrics.get('counters', {})

            # Sum up relevant operation counters
            total_ops = 0
            for name, value in counters.items():
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed']):
                    total_ops += value

            # Calculate rate (simple approximation)
            if len(self.performance_history) >= 2:
                prev_data = self.performance_history[-2]
                current_data = self.performance_history[-1]

                time_diff = (current_data['timestamp'] - prev_data['timestamp']).total_seconds()
                if time_diff > 0:
                    prev_ops = sum(
                        value for name, value in prev_data.get('counters', {}).items()
                        if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed'])
                    )
                    return (total_ops - prev_ops) / time_diff

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating throughput: {e}")
            return 0.0
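
    # Worked example (illustrative numbers): if the previous snapshot recorded a
    # "documents_processed" counter of 1000 and the current snapshot, taken 5.0
    # seconds later, records 1250, _calculate_throughput() reports
    # (1250 - 1000) / 5.0 = 50.0 operations per second. Counter names are
    # matched by the 'processed' / 'handled' / 'completed' keywords above.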

    def _calculate_error_rate(self) -> float:
        """Calculate error rate percentage"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            counters = current_metrics.get('counters', {})

            # Count errors and total operations
            total_errors = sum(
                value for name, value in counters.items()
                if 'error' in name.lower() or 'failed' in name.lower()
            )

            total_operations = sum(
                value for name, value in counters.items()
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed', 'total'])
            )

            if total_operations > 0:
                return (total_errors / total_operations) * 100

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating error rate: {e}")
            return 0.0

    def _calculate_average_latency(self) -> float:
        """Calculate average latency across all operations"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            histograms = current_metrics.get('histograms', {})

            # Find latency histograms
            latency_values = []
            for name, stats in histograms.items():
                if 'latency' in name.lower():
                    latency_values.append(stats.get('avg', 0))

            if latency_values:
                return sum(latency_values) / len(latency_values)

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating average latency: {e}")
            return 0.0

    def add_alert(self, alert: PerformanceAlert) -> None:
        """Add a performance alert"""
        self.alerts[alert.name] = alert
        logger.info(f"Added performance alert: {alert.name}")

    def remove_alert(self, alert_name: str) -> None:
        """Remove a performance alert"""
        if alert_name in self.alerts:
            del self.alerts[alert_name]
            # Also remove from active alerts if present
            self.active_alerts.pop(alert_name, None)
            logger.info(f"Removed performance alert: {alert_name}")

    def get_active_alerts(self) -> List[PerformanceAlert]:
        """Get currently active alerts"""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[PerformanceAlert]:
        """Get alert history"""
        return list(self.alert_history)[-limit:]

    def get_performance_dashboard_data(self) -> Dict[str, Any]:
        """Get data for performance dashboard"""
        current_metrics = {}
        if self.metrics_collector:
            current_metrics = self.metrics_collector.get_current_metrics() or {}

        system_metrics = current_metrics.get('system', {}) or {}

        return {
            'timestamp': get_current_timestamp().isoformat(),
            'system_metrics': system_metrics,
            'kpis': {
                name: list(history)[-10:] if history else []  # Last 10 points
                for name, history in self.kpi_history.items()
            },
            'active_alerts': [
                {
                    'name': alert.name,
                    'message': alert.message.format(threshold=alert.threshold),
                    'severity': alert.severity.value if hasattr(alert.severity, 'value') else str(alert.severity),
                    'triggered_at': alert.triggered_at.isoformat() if alert.triggered_at else None
                }
                for alert in self.active_alerts.values()
            ],
            'performance_summary': {
                'cpu_usage': system_metrics.get('cpu_usage', 0) if system_metrics else 0,
                'memory_usage': system_metrics.get('memory_usage', 0) if system_metrics else 0,
                'active_connections': system_metrics.get('active_connections', 0) if system_metrics else 0,
                'throughput': self.kpi_history['throughput_ops_per_sec'][-1]['value'] if self.kpi_history['throughput_ops_per_sec'] else 0,
                'error_rate': self.kpi_history['error_rate_percent'][-1]['value'] if self.kpi_history['error_rate_percent'] else 0,
                'avg_latency': self.kpi_history['avg_latency_ms'][-1]['value'] if self.kpi_history['avg_latency_ms'] else 0
            }
        }

    def register_alert_callback(self, callback: Callable[[PerformanceAlert], None]) -> None:
        """Register callback for alert notifications"""
        self.alert_callbacks.append(callback)
        logger.info(f"Registered alert callback: {callback.__name__}")

    def update_thresholds(self, **kwargs) -> None:
        """Update performance thresholds"""
        for key, value in kwargs.items():
            if hasattr(self.thresholds, key):
                setattr(self.thresholds, key, value)
                logger.info(f"Updated threshold {key} to {value}")
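
    # Sketch: adjusting thresholds at runtime. Only keyword arguments that match
    # an existing PerformanceThresholds attribute are applied; unknown keys are
    # silently ignored. Note that alerts created earlier keep the threshold they
    # were constructed with.
    #
    #   monitor.update_thresholds(max_cpu_usage=70.0, max_latency_ms=50.0)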

    def get_performance_trends(self, hours: int = 24) -> Dict[str, Any]:
        """Get performance trends over specified time period"""
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter performance history
        recent_data = [
            data for data in self.performance_history
            if data and data.get('timestamp') and data['timestamp'] >= cutoff_time
        ]

        if not recent_data:
            return {}

        # Calculate trends
        trends = {}

        # CPU usage trend
        cpu_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                cpu_values.append(system_data.get('cpu_usage', 0))

        if cpu_values:
            trends['cpu_usage'] = {
                'current': cpu_values[-1],
                'average': sum(cpu_values) / len(cpu_values),
                'max': max(cpu_values),
                'trend': 'increasing' if len(cpu_values) > 1 and cpu_values[-1] > cpu_values[0] else 'stable'
            }

        # Memory usage trend
        memory_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                memory_values.append(system_data.get('memory_usage', 0))

        if memory_values:
            trends['memory_usage'] = {
                'current': memory_values[-1],
                'average': sum(memory_values) / len(memory_values),
                'max': max(memory_values),
                'trend': 'increasing' if len(memory_values) > 1 and memory_values[-1] > memory_values[0] else 'stable'
            }

        return trends

    def get_stats(self) -> Dict[str, Any]:
        """Get performance monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'monitor_interval': self._monitor_interval,
            'total_alerts': len(self.alerts),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'performance_history_count': len(self.performance_history),
            'kpi_metrics': list(self.kpi_history.keys()),
            'registered_callbacks': len(self.alert_callbacks),
            'thresholds': {
                'max_cpu_usage': self.thresholds.max_cpu_usage,
                'max_memory_usage': self.thresholds.max_memory_usage,
                'min_memory_available': self.thresholds.min_memory_available,
                'max_latency_ms': self.thresholds.max_latency_ms,
                'max_error_rate': self.thresholds.max_error_rate,
                'min_throughput': self.thresholds.min_throughput
            }
        }


# Global performance monitor instance (initialized lazily)
performance_monitor = None


def get_performance_monitor():
    """Get or create global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
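

# Usage sketch: this module uses relative imports, so the block below only runs
# when the file is executed as part of its package (e.g. with ``python -m``).
# It assumes the default metrics collector from .metrics_collector is already
# recording data; the alert below uses an illustrative metric name.
if __name__ == "__main__":  # pragma: no cover
    monitor = get_performance_monitor()

    # Alert on a hypothetical application metric.
    monitor.add_alert(PerformanceAlert(
        name="slow_requests",
        metric_name="request_latency_ms",  # illustrative metric name
        threshold=250.0,
        comparison="gt",
        duration=60,
        message="Request latency is above {threshold}ms",
        severity="critical",
    ))

    # Print alerts as they trip (callbacks run on the monitoring thread).
    monitor.register_alert_callback(
        lambda alert: print(f"ALERT [{alert.severity}] {alert.name}")
    )

    monitor.start_monitoring()
    try:
        time.sleep(30)  # let a few monitoring intervals elapse
        print(monitor.get_performance_dashboard_data())
    finally:
        monitor.stop_monitoring()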