"""
Performance monitoring dashboard and real-time performance tracking.
"""
import time
import threading
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timedelta
from dataclasses import dataclass
from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
from .metrics_collector import MetricsCollector
logger = get_logger(__name__)
@dataclass
class PerformanceAlert:
"""Performance alert definition"""
name: str
metric_name: str
threshold: float
comparison: str # 'gt', 'lt', 'eq'
duration: int # seconds
message: str
severity: str = 'warning' # 'info', 'warning', 'critical'
triggered_at: Optional[datetime] = None
resolved_at: Optional[datetime] = None
def is_triggered(self, value: float) -> bool:
"""Check if alert should be triggered"""
if self.comparison == 'gt':
return value > self.threshold
elif self.comparison == 'lt':
return value < self.threshold
elif self.comparison == 'eq':
return abs(value - self.threshold) < 0.001
return False
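# A minimal usage sketch for PerformanceAlert (the metric name and values are
# illustrative, not taken from a live collector):
#
#   cpu_alert = PerformanceAlert(
#       name="cpu_spike",
#       metric_name="system_cpu_usage",
#       threshold=90.0,
#       comparison="gt",
#       duration=60,  # breach must persist this many seconds
#       message="CPU above {threshold}%",
#       severity="critical",
#   )
#   cpu_alert.is_triggered(95.0)  # True: 95.0 > 90.0
#   cpu_alert.is_triggered(42.0)  # False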
@dataclass
class PerformanceThresholds:
"""Performance threshold configuration"""
max_cpu_usage: float = 80.0 # %
max_memory_usage: float = 85.0 # %
min_memory_available: float = 1.0 # GB
max_latency_ms: float = 100.0 # milliseconds
max_error_rate: float = 5.0 # %
min_throughput: float = 100.0 # operations/second
class PerformanceMonitor:
"""
Real-time performance monitoring with alerting and dashboard data.
Monitors system performance, tracks KPIs, and provides alerts.
"""
    def __init__(self, metrics_collector: Optional[MetricsCollector] = None):
"""
Initialize performance monitor.
Args:
metrics_collector: Metrics collector instance
"""
if metrics_collector is None:
from .metrics_collector import metrics_collector as default_collector
self.metrics_collector = default_collector
else:
self.metrics_collector = metrics_collector
self.thresholds = PerformanceThresholds()
        # Alert management
        self.alerts: Dict[str, PerformanceAlert] = {}
        self.active_alerts: Dict[str, PerformanceAlert] = {}
        self.alert_history: deque = deque(maxlen=1000)
        # When each alert's condition first became true (for duration enforcement)
        self._breach_started: Dict[str, datetime] = {}
# Performance tracking
self.performance_history: deque = deque(maxlen=10000)
self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))
# Monitoring control
self._monitoring = False
self._monitor_thread: Optional[threading.Thread] = None
self._monitor_interval = 5.0 # seconds
# Alert callbacks
self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = []
# Initialize default alerts
self._setup_default_alerts()
logger.info("Performance monitor initialized")
def _setup_default_alerts(self) -> None:
"""Setup default performance alerts"""
default_alerts = [
PerformanceAlert(
name="high_cpu_usage",
metric_name="system_cpu_usage",
threshold=self.thresholds.max_cpu_usage,
comparison="gt",
duration=30,
message="CPU usage is above {threshold}%",
severity="warning"
),
PerformanceAlert(
name="high_memory_usage",
metric_name="system_memory_usage",
threshold=self.thresholds.max_memory_usage,
comparison="gt",
duration=30,
message="Memory usage is above {threshold}%",
severity="warning"
),
PerformanceAlert(
name="low_memory_available",
metric_name="system_memory_available_gb",
threshold=self.thresholds.min_memory_available,
comparison="lt",
duration=60,
message="Available memory is below {threshold}GB",
severity="critical"
),
PerformanceAlert(
name="high_latency",
metric_name="processing_latency_ms",
threshold=self.thresholds.max_latency_ms,
comparison="gt",
duration=60,
message="Processing latency is above {threshold}ms",
severity="warning"
)
]
for alert in default_alerts:
self.add_alert(alert)
def start_monitoring(self) -> None:
"""Start performance monitoring"""
if self._monitoring:
logger.warning("Performance monitoring already running")
return
self._monitoring = True
self._monitor_thread = threading.Thread(
target=self._monitoring_loop,
name="PerformanceMonitor",
daemon=True
)
self._monitor_thread.start()
logger.info("Started performance monitoring")
def stop_monitoring(self) -> None:
"""Stop performance monitoring"""
if not self._monitoring:
return
self._monitoring = False
if self._monitor_thread:
self._monitor_thread.join(timeout=5.0)
logger.info("Stopped performance monitoring")
def _monitoring_loop(self) -> None:
"""Main monitoring loop"""
while self._monitoring:
try:
# Collect current performance data
self._collect_performance_data()
# Check alerts
self._check_alerts()
# Update KPIs
self._update_kpis()
time.sleep(self._monitor_interval)
except Exception as e:
logger.error(f"Error in performance monitoring loop: {e}")
time.sleep(self._monitor_interval)
def _collect_performance_data(self) -> None:
"""Collect current performance data"""
try:
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
# Create performance snapshot
performance_data = {
'timestamp': get_current_timestamp(),
'system': current_metrics.get('system', {}),
'counters': current_metrics.get('counters', {}),
'gauges': current_metrics.get('gauges', {}),
'histograms': current_metrics.get('histograms', {})
}
self.performance_history.append(performance_data)
except Exception as e:
logger.error(f"Error collecting performance data: {e}")
def _check_alerts(self) -> None:
"""Check all alerts against current metrics"""
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
current_time = get_current_timestamp()
for alert_name, alert in self.alerts.items():
try:
# Get metric value
metric_value = self._get_metric_value(alert.metric_name, current_metrics)
if metric_value is None:
continue
                # Check whether the threshold is currently breached
                breached = alert.is_triggered(metric_value)
                if breached:
                    # Record when the breach began; only trigger once it has
                    # persisted for the alert's configured duration
                    breach_start = self._breach_started.setdefault(alert_name, current_time)
                    sustained = (current_time - breach_start).total_seconds() >= alert.duration
                    if sustained and alert_name not in self.active_alerts:
                        # Trigger alert
                        alert.triggered_at = current_time
                        alert.resolved_at = None
                        self.active_alerts[alert_name] = alert
                        self.alert_history.append(alert)
                        # Format message
                        message = alert.message.format(
                            threshold=alert.threshold,
                            value=metric_value
                        )
                        logger.warning(f"Performance alert triggered: {alert.name} - {message}")
                        # Notify callbacks
                        for callback in self.alert_callbacks:
                            try:
                                callback(alert)
                            except Exception as e:
                                logger.error(f"Error in alert callback: {e}")
                else:
                    # Condition cleared: reset breach tracking and resolve if active
                    self._breach_started.pop(alert_name, None)
                    if alert_name in self.active_alerts:
                        resolved_alert = self.active_alerts.pop(alert_name)
                        resolved_alert.resolved_at = current_time
                        logger.info(f"Performance alert resolved: {alert.name}")
except Exception as e:
logger.error(f"Error checking alert {alert_name}: {e}")
def _get_metric_value(self, metric_name: str, metrics: Dict[str, Any]) -> Optional[float]:
"""Get metric value from metrics data"""
if not metrics:
return None
# Check gauges first
gauges = metrics.get('gauges', {})
if gauges and metric_name in gauges:
return gauges[metric_name]
# Check counters
counters = metrics.get('counters', {})
if counters and metric_name in counters:
return counters[metric_name]
# Check histograms (use average)
histograms = metrics.get('histograms', {})
if histograms and metric_name in histograms:
hist_data = histograms[metric_name]
if hist_data and isinstance(hist_data, dict):
return hist_data.get('avg', 0)
# Check system metrics
system_metrics = metrics.get('system', {})
if system_metrics and metric_name in system_metrics:
return system_metrics[metric_name]
return None
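    # For reference, _get_metric_value expects the collector snapshot to look
    # roughly like this (a sketch; the exact keys come from MetricsCollector):
    #
    #   {
    #       'system': {'cpu_usage': 42.0, 'memory_usage': 61.5},
    #       'counters': {'events_processed': 1200},
    #       'gauges': {'queue_depth': 3},
    #       'histograms': {'processing_latency_ms': {'avg': 12.4, 'count': 87}}
    #   }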
def _update_kpis(self) -> None:
"""Update key performance indicators"""
try:
if self.metrics_collector is None:
return
current_metrics = self.metrics_collector.get_current_metrics()
if current_metrics is None:
return
timestamp = get_current_timestamp()
# Calculate throughput (operations per second)
throughput = self._calculate_throughput()
self.kpi_history['throughput_ops_per_sec'].append({
'value': throughput,
'timestamp': timestamp
})
# Calculate error rate
error_rate = self._calculate_error_rate()
self.kpi_history['error_rate_percent'].append({
'value': error_rate,
'timestamp': timestamp
})
# Calculate average latency
avg_latency = self._calculate_average_latency()
self.kpi_history['avg_latency_ms'].append({
'value': avg_latency,
'timestamp': timestamp
})
# Update metrics collector with KPIs
self.metrics_collector.set_gauge('kpi_throughput_ops_per_sec', throughput)
self.metrics_collector.set_gauge('kpi_error_rate_percent', error_rate)
self.metrics_collector.set_gauge('kpi_avg_latency_ms', avg_latency)
except Exception as e:
logger.error(f"Error updating KPIs: {e}")
    def _calculate_throughput(self) -> float:
        """Calculate operations per second throughput"""
        try:
            # Use the two most recent snapshots so both samples come from the
            # same source; counter names matching these keywords are treated
            # as completed-operation counts (a heuristic)
            if len(self.performance_history) < 2:
                return 0.0
            prev_data = self.performance_history[-2]
            current_data = self.performance_history[-1]
            time_diff = (current_data['timestamp'] - prev_data['timestamp']).total_seconds()
            if time_diff <= 0:
                return 0.0
            keywords = ('processed', 'handled', 'completed')
            def _count_ops(counters: Dict[str, Any]) -> float:
                return sum(
                    value for name, value in counters.items()
                    if any(keyword in name.lower() for keyword in keywords)
                )
            current_ops = _count_ops(current_data.get('counters', {}))
            prev_ops = _count_ops(prev_data.get('counters', {}))
            return (current_ops - prev_ops) / time_diff
        except Exception as e:
            logger.error(f"Error calculating throughput: {e}")
            return 0.0
def _calculate_error_rate(self) -> float:
"""Calculate error rate percentage"""
try:
current_metrics = self.metrics_collector.get_current_metrics()
counters = current_metrics.get('counters', {})
# Count errors and total operations
total_errors = sum(
value for name, value in counters.items()
if 'error' in name.lower() or 'failed' in name.lower()
)
total_operations = sum(
value for name, value in counters.items()
if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed', 'total'])
)
if total_operations > 0:
return (total_errors / total_operations) * 100
return 0.0
except Exception as e:
logger.error(f"Error calculating error rate: {e}")
return 0.0
def _calculate_average_latency(self) -> float:
"""Calculate average latency across all operations"""
try:
current_metrics = self.metrics_collector.get_current_metrics()
histograms = current_metrics.get('histograms', {})
# Find latency histograms
latency_values = []
for name, stats in histograms.items():
if 'latency' in name.lower():
latency_values.append(stats.get('avg', 0))
if latency_values:
return sum(latency_values) / len(latency_values)
return 0.0
except Exception as e:
logger.error(f"Error calculating average latency: {e}")
return 0.0
def add_alert(self, alert: PerformanceAlert) -> None:
"""Add a performance alert"""
self.alerts[alert.name] = alert
logger.info(f"Added performance alert: {alert.name}")
def remove_alert(self, alert_name: str) -> None:
"""Remove a performance alert"""
if alert_name in self.alerts:
del self.alerts[alert_name]
# Also remove from active alerts if present
self.active_alerts.pop(alert_name, None)
logger.info(f"Removed performance alert: {alert_name}")
def get_active_alerts(self) -> List[PerformanceAlert]:
"""Get currently active alerts"""
return list(self.active_alerts.values())
def get_alert_history(self, limit: int = 100) -> List[PerformanceAlert]:
"""Get alert history"""
return list(self.alert_history)[-limit:]
def get_performance_dashboard_data(self) -> Dict[str, Any]:
"""Get data for performance dashboard"""
current_metrics = {}
if self.metrics_collector:
current_metrics = self.metrics_collector.get_current_metrics() or {}
system_metrics = current_metrics.get('system', {}) or {}
return {
'timestamp': get_current_timestamp().isoformat(),
'system_metrics': system_metrics,
'kpis': {
name: list(history)[-10:] if history else [] # Last 10 points
for name, history in self.kpi_history.items()
},
'active_alerts': [
{
'name': alert.name,
'message': alert.message,
                    'severity': alert.severity,
'triggered_at': alert.triggered_at.isoformat() if alert.triggered_at else None
}
for alert in self.active_alerts.values()
],
            'performance_summary': {
                'cpu_usage': system_metrics.get('cpu_usage', 0),
                'memory_usage': system_metrics.get('memory_usage', 0),
                'active_connections': system_metrics.get('active_connections', 0),
'throughput': self.kpi_history['throughput_ops_per_sec'][-1]['value'] if self.kpi_history['throughput_ops_per_sec'] else 0,
'error_rate': self.kpi_history['error_rate_percent'][-1]['value'] if self.kpi_history['error_rate_percent'] else 0,
'avg_latency': self.kpi_history['avg_latency_ms'][-1]['value'] if self.kpi_history['avg_latency_ms'] else 0
}
}
def register_alert_callback(self, callback: Callable[[PerformanceAlert], None]) -> None:
"""Register callback for alert notifications"""
self.alert_callbacks.append(callback)
logger.info(f"Registered alert callback: {callback.__name__}")
    def update_thresholds(self, **kwargs) -> None:
        """Update performance thresholds"""
        for key, value in kwargs.items():
            if hasattr(self.thresholds, key):
                setattr(self.thresholds, key, value)
                logger.info(f"Updated threshold {key} to {value}")
            else:
                logger.warning(f"Ignoring unknown threshold: {key}")
def get_performance_trends(self, hours: int = 24) -> Dict[str, Any]:
"""Get performance trends over specified time period"""
cutoff_time = get_current_timestamp() - timedelta(hours=hours)
# Filter performance history
recent_data = [
data for data in self.performance_history
if data and data.get('timestamp') and data['timestamp'] >= cutoff_time
]
if not recent_data:
return {}
        # Calculate trends for key system metrics
        trends = {}
        def _summarize(metric_key: str) -> Optional[Dict[str, Any]]:
            # Collect one system metric across snapshots and summarize it
            values = [
                data['system'].get(metric_key, 0)
                for data in recent_data
                if data.get('system')
            ]
            if not values:
                return None
            if len(values) > 1 and values[-1] > values[0]:
                direction = 'increasing'
            elif len(values) > 1 and values[-1] < values[0]:
                direction = 'decreasing'
            else:
                direction = 'stable'
            return {
                'current': values[-1],
                'average': sum(values) / len(values),
                'max': max(values),
                'trend': direction
            }
        for metric_key in ('cpu_usage', 'memory_usage'):
            summary = _summarize(metric_key)
            if summary:
                trends[metric_key] = summary
return trends
def get_stats(self) -> Dict[str, Any]:
"""Get performance monitor statistics"""
return {
'monitoring': self._monitoring,
'monitor_interval': self._monitor_interval,
'total_alerts': len(self.alerts),
'active_alerts': len(self.active_alerts),
'alert_history_count': len(self.alert_history),
'performance_history_count': len(self.performance_history),
'kpi_metrics': list(self.kpi_history.keys()),
'registered_callbacks': len(self.alert_callbacks),
'thresholds': {
'max_cpu_usage': self.thresholds.max_cpu_usage,
'max_memory_usage': self.thresholds.max_memory_usage,
'min_memory_available': self.thresholds.min_memory_available,
'max_latency_ms': self.thresholds.max_latency_ms,
'max_error_rate': self.thresholds.max_error_rate,
'min_throughput': self.thresholds.min_throughput
}
}
# Global performance monitor instance (initialized lazily)
performance_monitor: Optional[PerformanceMonitor] = None
def get_performance_monitor() -> PerformanceMonitor:
    """Get or create global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
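# Minimal usage sketch. This assumes MetricsCollector feeds the metric names
# referenced by the default alerts; the 15-second sleep is only there to let
# a few monitoring cycles run.
if __name__ == "__main__":
    monitor = get_performance_monitor()
    def on_alert(alert: PerformanceAlert) -> None:
        # Simple callback: print alerts as they fire
        print(f"[{alert.severity}] {alert.name}: {alert.message}")
    monitor.register_alert_callback(on_alert)
    monitor.update_thresholds(max_cpu_usage=90.0)  # example threshold override
    monitor.start_monitoring()
    try:
        time.sleep(15)
        print(monitor.get_performance_dashboard_data()['performance_summary'])
    finally:
        monitor.stop_monitoring()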