16. monitoring
COBY/monitoring/performance_monitor.py
556 lines · Normal file
@@ -0,0 +1,556 @@
"""
Performance monitoring dashboard and real-time performance tracking.
"""

import time
import threading
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timedelta
from dataclasses import dataclass

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp
from .metrics_collector import MetricsCollector

logger = get_logger(__name__)


@dataclass
class PerformanceAlert:
    """Performance alert definition"""
    name: str
    metric_name: str
    threshold: float
    comparison: str  # 'gt', 'lt', 'eq'
    duration: int  # seconds
    message: str
    severity: str = 'warning'  # 'info', 'warning', 'critical'
    triggered_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None

    def is_triggered(self, value: float) -> bool:
        """Check if alert should be triggered"""
        if self.comparison == 'gt':
            return value > self.threshold
        elif self.comparison == 'lt':
            return value < self.threshold
        elif self.comparison == 'eq':
            return abs(value - self.threshold) < 0.001
        return False

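# Illustrative sketch (not part of the module): how an alert definition is
# evaluated against a single reading. The metric name and readings below are
# made-up values for demonstration only.
#
#     demo_alert = PerformanceAlert(
#         name="demo_high_cpu",
#         metric_name="system_cpu_usage",
#         threshold=80.0,
#         comparison="gt",
#         duration=30,
#         message="CPU usage is above {threshold}%",
#     )
#     demo_alert.is_triggered(91.5)  # True, since 91.5 > 80.0
#     demo_alert.is_triggered(42.0)  # False
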
@dataclass
class PerformanceThresholds:
    """Performance threshold configuration"""
    max_cpu_usage: float = 80.0        # %
    max_memory_usage: float = 85.0     # %
    min_memory_available: float = 1.0  # GB
    max_latency_ms: float = 100.0      # milliseconds
    max_error_rate: float = 5.0        # %
    min_throughput: float = 100.0      # operations/second

class PerformanceMonitor:
    """
    Real-time performance monitoring with alerting and dashboard data.

    Monitors system performance, tracks KPIs, and provides alerts.
    """

    def __init__(self, metrics_collector: MetricsCollector = None):
        """
        Initialize the performance monitor.

        Args:
            metrics_collector: Metrics collector instance (defaults to the
                module-level collector when omitted)
        """
        if metrics_collector is None:
            from .metrics_collector import metrics_collector as default_collector
            self.metrics_collector = default_collector
        else:
            self.metrics_collector = metrics_collector
        self.thresholds = PerformanceThresholds()

        # Alert management
        self.alerts: Dict[str, PerformanceAlert] = {}
        self.active_alerts: Dict[str, PerformanceAlert] = {}
        self.alert_history: deque = deque(maxlen=1000)

        # Performance tracking
        self.performance_history: deque = deque(maxlen=10000)
        self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))

        # Monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._monitor_interval = 5.0  # seconds

        # Alert callbacks
        self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = []

        # Initialize default alerts
        self._setup_default_alerts()

        logger.info("Performance monitor initialized")

    def _setup_default_alerts(self) -> None:
        """Set up default performance alerts"""
        default_alerts = [
            PerformanceAlert(
                name="high_cpu_usage",
                metric_name="system_cpu_usage",
                threshold=self.thresholds.max_cpu_usage,
                comparison="gt",
                duration=30,
                message="CPU usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="high_memory_usage",
                metric_name="system_memory_usage",
                threshold=self.thresholds.max_memory_usage,
                comparison="gt",
                duration=30,
                message="Memory usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="low_memory_available",
                metric_name="system_memory_available_gb",
                threshold=self.thresholds.min_memory_available,
                comparison="lt",
                duration=60,
                message="Available memory is below {threshold}GB",
                severity="critical"
            ),
            PerformanceAlert(
                name="high_latency",
                metric_name="processing_latency_ms",
                threshold=self.thresholds.max_latency_ms,
                comparison="gt",
                duration=60,
                message="Processing latency is above {threshold}ms",
                severity="warning"
            )
        ]

        for alert in default_alerts:
            self.add_alert(alert)

    def start_monitoring(self) -> None:
        """Start performance monitoring"""
        if self._monitoring:
            logger.warning("Performance monitoring already running")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="PerformanceMonitor",
            daemon=True
        )
        self._monitor_thread.start()
        logger.info("Started performance monitoring")

    def stop_monitoring(self) -> None:
        """Stop performance monitoring"""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)
        logger.info("Stopped performance monitoring")

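    # Illustrative sketch (not part of the module): typical start/stop lifecycle
    # with an alert callback. `notify_ops` is a hypothetical handler supplied by
    # the caller; any callable accepting a PerformanceAlert works.
    #
    #     monitor = PerformanceMonitor()
    #
    #     def notify_ops(alert: PerformanceAlert) -> None:
    #         print(f"[{alert.severity}] {alert.name}: {alert.message}")
    #
    #     monitor.register_alert_callback(notify_ops)
    #     monitor.start_monitoring()   # samples metrics every 5 seconds
    #     ...
    #     monitor.stop_monitoring()
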
    def _monitoring_loop(self) -> None:
        """Main monitoring loop"""
        while self._monitoring:
            try:
                # Collect current performance data
                self._collect_performance_data()

                # Check alerts
                self._check_alerts()

                # Update KPIs
                self._update_kpis()

                time.sleep(self._monitor_interval)

            except Exception as e:
                logger.error(f"Error in performance monitoring loop: {e}")
                time.sleep(self._monitor_interval)

    def _collect_performance_data(self) -> None:
        """Collect current performance data"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()

            if current_metrics is None:
                return

            # Create performance snapshot
            performance_data = {
                'timestamp': get_current_timestamp(),
                'system': current_metrics.get('system', {}),
                'counters': current_metrics.get('counters', {}),
                'gauges': current_metrics.get('gauges', {}),
                'histograms': current_metrics.get('histograms', {})
            }

            self.performance_history.append(performance_data)

        except Exception as e:
            logger.error(f"Error collecting performance data: {e}")

    def _check_alerts(self) -> None:
        """Check all alerts against current metrics"""
        if self.metrics_collector is None:
            return

        current_metrics = self.metrics_collector.get_current_metrics()
        if current_metrics is None:
            return

        current_time = get_current_timestamp()

        for alert_name, alert in self.alerts.items():
            try:
                # Get metric value
                metric_value = self._get_metric_value(alert.metric_name, current_metrics)
                if metric_value is None:
                    continue

                # Check if alert should be triggered
                should_trigger = alert.is_triggered(metric_value)

                if should_trigger and alert_name not in self.active_alerts:
                    # Trigger alert
                    alert.triggered_at = current_time
                    self.active_alerts[alert_name] = alert
                    self.alert_history.append(alert)

                    # Format message
                    message = alert.message.format(
                        threshold=alert.threshold,
                        value=metric_value
                    )

                    logger.warning(f"Performance alert triggered: {alert.name} - {message}")

                    # Notify callbacks
                    for callback in self.alert_callbacks:
                        try:
                            callback(alert)
                        except Exception as e:
                            logger.error(f"Error in alert callback: {e}")

                elif not should_trigger and alert_name in self.active_alerts:
                    # Resolve alert
                    resolved_alert = self.active_alerts.pop(alert_name)
                    resolved_alert.resolved_at = current_time

                    logger.info(f"Performance alert resolved: {alert.name}")

            except Exception as e:
                logger.error(f"Error checking alert {alert_name}: {e}")

    def _get_metric_value(self, metric_name: str, metrics: Dict[str, Any]) -> Optional[float]:
        """Get metric value from metrics data"""
        if not metrics:
            return None

        # Check gauges first
        gauges = metrics.get('gauges', {})
        if gauges and metric_name in gauges:
            return gauges[metric_name]

        # Check counters
        counters = metrics.get('counters', {})
        if counters and metric_name in counters:
            return counters[metric_name]

        # Check histograms (use average)
        histograms = metrics.get('histograms', {})
        if histograms and metric_name in histograms:
            hist_data = histograms[metric_name]
            if hist_data and isinstance(hist_data, dict):
                return hist_data.get('avg', 0)

        # Check system metrics
        system_metrics = metrics.get('system', {})
        if system_metrics and metric_name in system_metrics:
            return system_metrics[metric_name]

        return None

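    # Illustrative sketch (not part of the module): the lookup order used by
    # _get_metric_value. The sample metrics dict below uses made-up names and
    # values; a name present in both 'gauges' and 'counters' resolves from
    # 'gauges' first, and histogram entries resolve to their 'avg' field.
    #
    #     sample_metrics = {
    #         'gauges': {'queue_depth': 12.0},
    #         'counters': {'queue_depth': 340, 'orders_processed': 1500},
    #         'histograms': {'processing_latency_ms': {'avg': 42.5, 'max': 180.0}},
    #         'system': {'cpu_usage': 37.2},
    #     }
    #     monitor._get_metric_value('queue_depth', sample_metrics)            # 12.0 (gauge wins)
    #     monitor._get_metric_value('processing_latency_ms', sample_metrics)  # 42.5 (histogram avg)
    #     monitor._get_metric_value('cpu_usage', sample_metrics)              # 37.2 (system)
    #     monitor._get_metric_value('missing_metric', sample_metrics)         # None
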
    def _update_kpis(self) -> None:
        """Update key performance indicators"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()
            if current_metrics is None:
                return

            timestamp = get_current_timestamp()

            # Calculate throughput (operations per second)
            throughput = self._calculate_throughput()
            self.kpi_history['throughput_ops_per_sec'].append({
                'value': throughput,
                'timestamp': timestamp
            })

            # Calculate error rate
            error_rate = self._calculate_error_rate()
            self.kpi_history['error_rate_percent'].append({
                'value': error_rate,
                'timestamp': timestamp
            })

            # Calculate average latency
            avg_latency = self._calculate_average_latency()
            self.kpi_history['avg_latency_ms'].append({
                'value': avg_latency,
                'timestamp': timestamp
            })

            # Update metrics collector with KPIs
            self.metrics_collector.set_gauge('kpi_throughput_ops_per_sec', throughput)
            self.metrics_collector.set_gauge('kpi_error_rate_percent', error_rate)
            self.metrics_collector.set_gauge('kpi_avg_latency_ms', avg_latency)

        except Exception as e:
            logger.error(f"Error updating KPIs: {e}")

    def _calculate_throughput(self) -> float:
        """Calculate operations-per-second throughput"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            if not current_metrics:
                return 0.0
            counters = current_metrics.get('counters', {})

            # Sum up relevant operation counters
            total_ops = 0
            for name, value in counters.items():
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed']):
                    total_ops += value

            # Calculate rate (simple approximation over the last two snapshots)
            if len(self.performance_history) >= 2:
                prev_data = self.performance_history[-2]
                current_data = self.performance_history[-1]

                time_diff = (current_data['timestamp'] - prev_data['timestamp']).total_seconds()
                if time_diff > 0:
                    prev_ops = sum(
                        value for name, value in prev_data.get('counters', {}).items()
                        if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed'])
                    )
                    return (total_ops - prev_ops) / time_diff

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating throughput: {e}")
            return 0.0

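    # Worked example (illustrative numbers only): if the previous snapshot summed
    # 1,000 'processed'/'handled'/'completed' counter events, the current snapshot
    # sums 1,600, and the snapshots are 5 seconds apart, the reported throughput
    # is (1600 - 1000) / 5 = 120 operations per second.
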
    def _calculate_error_rate(self) -> float:
        """Calculate error rate percentage"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            if not current_metrics:
                return 0.0
            counters = current_metrics.get('counters', {})

            # Count errors and total operations
            total_errors = sum(
                value for name, value in counters.items()
                if 'error' in name.lower() or 'failed' in name.lower()
            )

            total_operations = sum(
                value for name, value in counters.items()
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed', 'total'])
            )

            if total_operations > 0:
                return (total_errors / total_operations) * 100

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating error rate: {e}")
            return 0.0

    def _calculate_average_latency(self) -> float:
        """Calculate average latency across all operations"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            if not current_metrics:
                return 0.0
            histograms = current_metrics.get('histograms', {})

            # Find latency histograms
            latency_values = []
            for name, stats in histograms.items():
                if 'latency' in name.lower():
                    latency_values.append(stats.get('avg', 0))

            if latency_values:
                return sum(latency_values) / len(latency_values)

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating average latency: {e}")
            return 0.0

    def add_alert(self, alert: PerformanceAlert) -> None:
        """Add a performance alert"""
        self.alerts[alert.name] = alert
        logger.info(f"Added performance alert: {alert.name}")

    def remove_alert(self, alert_name: str) -> None:
        """Remove a performance alert"""
        if alert_name in self.alerts:
            del self.alerts[alert_name]
            # Also remove from active alerts if present
            self.active_alerts.pop(alert_name, None)
            logger.info(f"Removed performance alert: {alert_name}")

    def get_active_alerts(self) -> List[PerformanceAlert]:
        """Get currently active alerts"""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[PerformanceAlert]:
        """Get the most recent alert history entries"""
        return list(self.alert_history)[-limit:]

    def get_performance_dashboard_data(self) -> Dict[str, Any]:
        """Get data for the performance dashboard"""
        current_metrics = {}
        if self.metrics_collector:
            current_metrics = self.metrics_collector.get_current_metrics() or {}

        system_metrics = current_metrics.get('system', {}) or {}

        return {
            'timestamp': get_current_timestamp().isoformat(),
            'system_metrics': system_metrics,
            'kpis': {
                name: list(history)[-10:] if history else []  # Last 10 points
                for name, history in self.kpi_history.items()
            },
            'active_alerts': [
                {
                    'name': alert.name,
                    'message': alert.message,
                    'severity': alert.severity.value if hasattr(alert.severity, 'value') else str(alert.severity),
                    'triggered_at': alert.triggered_at.isoformat() if alert.triggered_at else None
                }
                for alert in self.active_alerts.values()
            ],
            'performance_summary': {
                'cpu_usage': system_metrics.get('cpu_usage', 0) if system_metrics else 0,
                'memory_usage': system_metrics.get('memory_usage', 0) if system_metrics else 0,
                'active_connections': system_metrics.get('active_connections', 0) if system_metrics else 0,
                'throughput': self.kpi_history['throughput_ops_per_sec'][-1]['value'] if self.kpi_history['throughput_ops_per_sec'] else 0,
                'error_rate': self.kpi_history['error_rate_percent'][-1]['value'] if self.kpi_history['error_rate_percent'] else 0,
                'avg_latency': self.kpi_history['avg_latency_ms'][-1]['value'] if self.kpi_history['avg_latency_ms'] else 0
            }
        }

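    # Illustrative sketch (not part of the module): shape of the dashboard payload
    # and one way a caller might serialize it. The field values shown are examples;
    # actual numbers come from the metrics collector at call time.
    #
    #     import json
    #     data = monitor.get_performance_dashboard_data()
    #     # data looks roughly like:
    #     # {
    #     #     'timestamp': '2024-01-01T00:00:00+00:00',
    #     #     'system_metrics': {'cpu_usage': 37.2, 'memory_usage': 61.0, ...},
    #     #     'kpis': {'throughput_ops_per_sec': [...last 10 points...], ...},
    #     #     'active_alerts': [{'name': 'high_cpu_usage', 'severity': 'warning', ...}],
    #     #     'performance_summary': {'cpu_usage': 37.2, 'throughput': 120.0, ...}
    #     # }
    #     json.dumps(data, default=str)  # default=str handles datetime objects in KPI points
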
    def register_alert_callback(self, callback: Callable[[PerformanceAlert], None]) -> None:
        """Register callback for alert notifications"""
        self.alert_callbacks.append(callback)
        logger.info(f"Registered alert callback: {callback.__name__}")

    def update_thresholds(self, **kwargs) -> None:
        """Update performance thresholds"""
        for key, value in kwargs.items():
            if hasattr(self.thresholds, key):
                setattr(self.thresholds, key, value)
                logger.info(f"Updated threshold {key} to {value}")

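    # Illustrative sketch (not part of the module): adjusting thresholds at runtime.
    # Note that the default alerts copy their threshold values when
    # _setup_default_alerts() runs, so updating thresholds afterwards affects new
    # alerts and threshold lookups, not the already-registered alert objects.
    #
    #     monitor.update_thresholds(max_cpu_usage=90.0, max_latency_ms=250.0)
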
    def get_performance_trends(self, hours: int = 24) -> Dict[str, Any]:
        """Get performance trends over the specified time period"""
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter performance history
        recent_data = [
            data for data in self.performance_history
            if data and data.get('timestamp') and data['timestamp'] >= cutoff_time
        ]

        if not recent_data:
            return {}

        # Calculate trends
        trends = {}

        # CPU usage trend
        cpu_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                cpu_values.append(system_data.get('cpu_usage', 0))

        if cpu_values:
            trends['cpu_usage'] = {
                'current': cpu_values[-1],
                'average': sum(cpu_values) / len(cpu_values),
                'max': max(cpu_values),
                'trend': 'increasing' if len(cpu_values) > 1 and cpu_values[-1] > cpu_values[0] else 'stable'
            }

        # Memory usage trend
        memory_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                memory_values.append(system_data.get('memory_usage', 0))

        if memory_values:
            trends['memory_usage'] = {
                'current': memory_values[-1],
                'average': sum(memory_values) / len(memory_values),
                'max': max(memory_values),
                'trend': 'increasing' if len(memory_values) > 1 and memory_values[-1] > memory_values[0] else 'stable'
            }

        return trends

    def get_stats(self) -> Dict[str, Any]:
        """Get performance monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'monitor_interval': self._monitor_interval,
            'total_alerts': len(self.alerts),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'performance_history_count': len(self.performance_history),
            'kpi_metrics': list(self.kpi_history.keys()),
            'registered_callbacks': len(self.alert_callbacks),
            'thresholds': {
                'max_cpu_usage': self.thresholds.max_cpu_usage,
                'max_memory_usage': self.thresholds.max_memory_usage,
                'min_memory_available': self.thresholds.min_memory_available,
                'max_latency_ms': self.thresholds.max_latency_ms,
                'max_error_rate': self.thresholds.max_error_rate,
                'min_throughput': self.thresholds.min_throughput
            }
        }


# Global performance monitor instance (initialized lazily)
performance_monitor = None


def get_performance_monitor():
    """Get or create global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
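
# Illustrative sketch (not part of the module): end-to-end usage through the lazy
# module-level accessor. The sleep duration is arbitrary demo timing.
#
#     import time as _time
#
#     monitor = get_performance_monitor()
#     monitor.start_monitoring()
#     _time.sleep(30)                       # let a few 5-second samples accumulate
#     print(monitor.get_stats())
#     print(monitor.get_performance_trends(hours=1))
#     monitor.stop_monitoring()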