16. monitoring

@@ -147,6 +147,10 @@
- Implement Coinbase Pro WebSocket connector with proper authentication
- Create Kraken WebSocket connector with their specific message format
- Add exchange-specific data normalization for both exchanges

- Implement proper error handling for each exchange's quirks
- Write unit tests for both new exchange connectors
- _Requirements: 1.1, 1.2, 1.4_
17 COBY/monitoring/__init__.py Normal file
@@ -0,0 +1,17 @@
"""
Performance monitoring and optimization module.
"""

from .metrics_collector import MetricsCollector
from .performance_monitor import PerformanceMonitor
from .memory_monitor import MemoryMonitor
from .latency_tracker import LatencyTracker
from .alert_manager import AlertManager

__all__ = [
    'MetricsCollector',
    'PerformanceMonitor',
    'MemoryMonitor',
    'LatencyTracker',
    'AlertManager'
]
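For orientation, a minimal consumption sketch of the package above (assuming the COBY package is importable and the sibling metrics_collector / performance_monitor modules from this commit are present; the constructor arguments shown are just the defaults documented in the files below):

    from COBY.monitoring import MemoryMonitor

    # Each class can be instantiated directly, or the module-level singletons
    # (alert_manager, latency_tracker) defined at the bottom of the files below
    # can be shared across the application.
    monitor = MemoryMonitor(enable_tracemalloc=False, snapshot_interval=60.0)
    monitor.start_monitoring()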
671 COBY/monitoring/alert_manager.py Normal file
@@ -0,0 +1,671 @@
"""
Alert management system for performance degradation and system issues.
"""

import smtplib
import json
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timezone, timedelta
from dataclasses import dataclass, field
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from enum import Enum

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class AlertSeverity(Enum):
    """Alert severity levels"""
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"


class AlertStatus(Enum):
    """Alert status"""
    ACTIVE = "active"
    RESOLVED = "resolved"
    ACKNOWLEDGED = "acknowledged"
    SUPPRESSED = "suppressed"


@dataclass
class Alert:
    """Alert definition"""
    id: str
    name: str
    description: str
    severity: AlertSeverity
    metric_name: str
    threshold: float
    comparison: str  # 'gt', 'lt', 'eq', 'ne'
    duration_seconds: int
    status: AlertStatus = AlertStatus.ACTIVE
    triggered_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None
    acknowledged_at: Optional[datetime] = None
    acknowledged_by: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to dictionary"""
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'severity': self.severity.value,
            'metric_name': self.metric_name,
            'threshold': self.threshold,
            'comparison': self.comparison,
            'duration_seconds': self.duration_seconds,
            'status': self.status.value,
            'triggered_at': self.triggered_at.isoformat() if self.triggered_at else None,
            'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
            'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
            'acknowledged_by': self.acknowledged_by,
            'metadata': self.metadata
        }


@dataclass
class AlertRule:
    """Alert rule configuration"""
    name: str
    metric_name: str
    threshold: float
    comparison: str
    duration_seconds: int
    severity: AlertSeverity
    description: str = ""
    enabled: bool = True
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class NotificationChannel:
    """Notification channel configuration"""
    name: str
    type: str  # 'email', 'webhook', 'slack'
    config: Dict[str, Any]
    enabled: bool = True
    severity_filter: List[AlertSeverity] = field(default_factory=list)


class AlertManager:
    """
    Manages alerts, notifications, and alert lifecycle.

    Provides comprehensive alerting with multiple notification channels
    and alert suppression capabilities.
    """

    def __init__(self):
        """Initialize alert manager"""
        # Alert storage
        self.alert_rules: Dict[str, AlertRule] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: deque = deque(maxlen=10000)

        # Notification channels
        self.notification_channels: Dict[str, NotificationChannel] = {}

        # Alert state tracking
        self.metric_values: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
        self.alert_triggers: Dict[str, datetime] = {}

        # Suppression rules
        self.suppression_rules: Dict[str, Dict[str, Any]] = {}

        # Callbacks
        self.alert_callbacks: List[Callable[[Alert], None]] = []

        # Statistics
        self.alerts_triggered = 0
        self.alerts_resolved = 0
        self.notifications_sent = 0

        logger.info("Alert manager initialized")

    def add_alert_rule(self, rule: AlertRule) -> None:
        """Add an alert rule"""
        self.alert_rules[rule.name] = rule
        logger.info(f"Added alert rule: {rule.name}")

    def remove_alert_rule(self, rule_name: str) -> None:
        """Remove an alert rule"""
        if rule_name in self.alert_rules:
            del self.alert_rules[rule_name]
            # Also remove any active alerts for this rule
            alerts_to_remove = [
                alert_id for alert_id, alert in self.active_alerts.items()
                if alert.name == rule_name
            ]
            for alert_id in alerts_to_remove:
                del self.active_alerts[alert_id]
            logger.info(f"Removed alert rule: {rule_name}")

    def add_notification_channel(self, channel: NotificationChannel) -> None:
        """Add a notification channel"""
        self.notification_channels[channel.name] = channel
        logger.info(f"Added notification channel: {channel.name} ({channel.type})")

    def remove_notification_channel(self, channel_name: str) -> None:
        """Remove a notification channel"""
        if channel_name in self.notification_channels:
            del self.notification_channels[channel_name]
            logger.info(f"Removed notification channel: {channel_name}")

    def update_metric_value(self, metric_name: str, value: float) -> None:
        """Update metric value and check alerts"""
        timestamp = get_current_timestamp()
        self.metric_values[metric_name].append((timestamp, value))

        # Check all alert rules for this metric
        for rule_name, rule in self.alert_rules.items():
            if rule.metric_name == metric_name and rule.enabled:
                self._check_alert_rule(rule, value, timestamp)

    def _check_alert_rule(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Check if an alert rule should be triggered"""
        try:
            # Check if condition is met
            condition_met = self._evaluate_condition(rule.comparison, value, rule.threshold)

            alert_id = f"{rule.name}_{rule.metric_name}"

            if condition_met:
                # Check if we need to wait for duration
                if alert_id not in self.alert_triggers:
                    self.alert_triggers[alert_id] = timestamp
                    return

                # Check if duration has passed
                trigger_time = self.alert_triggers[alert_id]
                if (timestamp - trigger_time).total_seconds() >= rule.duration_seconds:
                    # Trigger alert if not already active
                    if alert_id not in self.active_alerts:
                        self._trigger_alert(rule, value, timestamp)
            else:
                # Condition not met - clear trigger time and resolve alert if active
                self.alert_triggers.pop(alert_id, None)
                if alert_id in self.active_alerts:
                    self._resolve_alert(alert_id, timestamp)

        except Exception as e:
            logger.error(f"Error checking alert rule {rule.name}: {e}")

    def _evaluate_condition(self, comparison: str, value: float, threshold: float) -> bool:
        """Evaluate alert condition"""
        if comparison == 'gt':
            return value > threshold
        elif comparison == 'lt':
            return value < threshold
        elif comparison == 'eq':
            return abs(value - threshold) < 0.001
        elif comparison == 'ne':
            return abs(value - threshold) >= 0.001
        elif comparison == 'gte':
            return value >= threshold
        elif comparison == 'lte':
            return value <= threshold
        else:
            logger.warning(f"Unknown comparison operator: {comparison}")
            return False

    def _trigger_alert(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Trigger an alert"""
        try:
            alert_id = f"{rule.name}_{rule.metric_name}"

            # Create alert
            alert = Alert(
                id=alert_id,
                name=rule.name,
                description=rule.description or f"{rule.metric_name} {rule.comparison} {rule.threshold}",
                severity=rule.severity,
                metric_name=rule.metric_name,
                threshold=rule.threshold,
                comparison=rule.comparison,
                duration_seconds=rule.duration_seconds,
                triggered_at=timestamp,
                metadata={
                    'current_value': value,
                    'rule_metadata': rule.metadata
                }
            )

            # Check suppression rules
            if self._is_suppressed(alert):
                alert.status = AlertStatus.SUPPRESSED
                logger.info(f"Alert suppressed: {alert.name}")
                return

            # Store alert
            self.active_alerts[alert_id] = alert
            self.alert_history.append(alert)
            self.alerts_triggered += 1

            logger.warning(f"Alert triggered: {alert.name} - {alert.description}")

            # Send notifications
            self._send_notifications(alert)

            # Call callbacks
            for callback in self.alert_callbacks:
                try:
                    callback(alert)
                except Exception as e:
                    logger.error(f"Error in alert callback: {e}")

        except Exception as e:
            logger.error(f"Error triggering alert: {e}")

    def _resolve_alert(self, alert_id: str, timestamp: datetime) -> None:
        """Resolve an alert"""
        try:
            if alert_id in self.active_alerts:
                alert = self.active_alerts[alert_id]
                alert.status = AlertStatus.RESOLVED
                alert.resolved_at = timestamp

                # Move to history and remove from active
                self.alert_history.append(alert)
                del self.active_alerts[alert_id]
                self.alerts_resolved += 1

                logger.info(f"Alert resolved: {alert.name}")

                # Send resolution notifications
                self._send_resolution_notifications(alert)

        except Exception as e:
            logger.error(f"Error resolving alert {alert_id}: {e}")

    def _is_suppressed(self, alert: Alert) -> bool:
        """Check if alert should be suppressed"""
        for rule_name, rule in self.suppression_rules.items():
            try:
                # Check if suppression rule applies
                if self._matches_suppression_rule(alert, rule):
                    return True
            except Exception as e:
                logger.error(f"Error checking suppression rule {rule_name}: {e}")

        return False

    def _matches_suppression_rule(self, alert: Alert, rule: Dict[str, Any]) -> bool:
        """Check if alert matches suppression rule"""
        # Check alert name pattern
        if 'alert_pattern' in rule:
            import re
            if not re.match(rule['alert_pattern'], alert.name):
                return False

        # Check severity
        if 'severity' in rule:
            if alert.severity.value not in rule['severity']:
                return False

        # Check time window
        if 'time_window' in rule:
            start_time = datetime.fromisoformat(rule['time_window']['start'])
            end_time = datetime.fromisoformat(rule['time_window']['end'])
            current_time = get_current_timestamp()

            if not (start_time <= current_time <= end_time):
                return False

        return True

    def _send_notifications(self, alert: Alert) -> None:
        """Send notifications for an alert"""
        for channel_name, channel in self.notification_channels.items():
            try:
                if not channel.enabled:
                    continue

                # Check severity filter
                if channel.severity_filter and alert.severity not in channel.severity_filter:
                    continue

                # Send notification based on channel type
                if channel.type == 'email':
                    self._send_email_notification(alert, channel)
                elif channel.type == 'webhook':
                    self._send_webhook_notification(alert, channel)
                elif channel.type == 'slack':
                    self._send_slack_notification(alert, channel)
                else:
                    logger.warning(f"Unknown notification channel type: {channel.type}")

                self.notifications_sent += 1

            except Exception as e:
                logger.error(f"Error sending notification via {channel_name}: {e}")

    def _send_resolution_notifications(self, alert: Alert) -> None:
        """Send resolution notifications"""
        for channel_name, channel in self.notification_channels.items():
            try:
                if not channel.enabled:
                    continue

                # Send resolution notification
                if channel.type == 'email':
                    self._send_email_resolution(alert, channel)
                elif channel.type == 'webhook':
                    self._send_webhook_resolution(alert, channel)
                elif channel.type == 'slack':
                    self._send_slack_resolution(alert, channel)

            except Exception as e:
                logger.error(f"Error sending resolution notification via {channel_name}: {e}")

    def _send_email_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email notification"""
        try:
            config = channel.config

            # Create message
            msg = MIMEMultipart()
            msg['From'] = config['from_email']
            msg['To'] = ', '.join(config['to_emails'])
            msg['Subject'] = f"[{alert.severity.value.upper()}] {alert.name}"

            # Create body
            body = f"""
            Alert: {alert.name}
            Severity: {alert.severity.value.upper()}
            Description: {alert.description}
            Metric: {alert.metric_name}
            Current Value: {alert.metadata.get('current_value', 'N/A')}
            Threshold: {alert.threshold}
            Triggered At: {alert.triggered_at.isoformat() if alert.triggered_at else 'N/A'}

            Alert ID: {alert.id}
            """

            msg.attach(MIMEText(body, 'plain'))

            # Send email
            with smtplib.SMTP(config['smtp_server'], config['smtp_port']) as server:
                if config.get('use_tls', True):
                    server.starttls()
                if 'username' in config and 'password' in config:
                    server.login(config['username'], config['password'])
                server.send_message(msg)

            logger.info(f"Email notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending email notification: {e}")

    def _send_webhook_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook notification"""
        try:
            import requests

            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_triggered'
            }

            response = requests.post(
                config['url'],
                json=payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10)
            )
            response.raise_for_status()

            logger.info(f"Webhook notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending webhook notification: {e}")

    def _send_slack_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack notification"""
        try:
            import requests

            config = channel.config

            # Create Slack message
            color = {
                AlertSeverity.INFO: 'good',
                AlertSeverity.WARNING: 'warning',
                AlertSeverity.CRITICAL: 'danger'
            }.get(alert.severity, 'warning')

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': color,
                    'title': f"{alert.severity.value.upper()}: {alert.name}",
                    'text': alert.description,
                    'fields': [
                        {'title': 'Metric', 'value': alert.metric_name, 'short': True},
                        {'title': 'Current Value', 'value': str(alert.metadata.get('current_value', 'N/A')), 'short': True},
                        {'title': 'Threshold', 'value': str(alert.threshold), 'short': True},
                        {'title': 'Triggered At', 'value': alert.triggered_at.isoformat() if alert.triggered_at else 'N/A', 'short': True}
                    ],
                    'footer': f"Alert ID: {alert.id}"
                }]
            }

            response = requests.post(
                config['webhook_url'],
                json=payload,
                timeout=10
            )
            response.raise_for_status()

            logger.info(f"Slack notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending Slack notification: {e}")

    def _send_email_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email resolution notification"""
        try:
            config = channel.config

            # Create message
            msg = MIMEMultipart()
            msg['From'] = config['from_email']
            msg['To'] = ', '.join(config['to_emails'])
            msg['Subject'] = f"[RESOLVED] {alert.name}"

            # Create body
            duration = ""
            if alert.triggered_at and alert.resolved_at:
                duration = str(alert.resolved_at - alert.triggered_at)

            body = f"""
            Alert RESOLVED: {alert.name}
            Severity: {alert.severity.value.upper()}
            Description: {alert.description}
            Metric: {alert.metric_name}
            Threshold: {alert.threshold}
            Triggered At: {alert.triggered_at.isoformat() if alert.triggered_at else 'N/A'}
            Resolved At: {alert.resolved_at.isoformat() if alert.resolved_at else 'N/A'}
            Duration: {duration}

            Alert ID: {alert.id}
            """

            msg.attach(MIMEText(body, 'plain'))

            # Send email
            with smtplib.SMTP(config['smtp_server'], config['smtp_port']) as server:
                if config.get('use_tls', True):
                    server.starttls()
                if 'username' in config and 'password' in config:
                    server.login(config['username'], config['password'])
                server.send_message(msg)

            logger.info(f"Email resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending email resolution notification: {e}")

    def _send_webhook_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook resolution notification"""
        try:
            import requests

            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_resolved'
            }

            response = requests.post(
                config['url'],
                json=payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10)
            )
            response.raise_for_status()

            logger.info(f"Webhook resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending webhook resolution notification: {e}")

    def _send_slack_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack resolution notification"""
        try:
            import requests

            config = channel.config

            duration = ""
            if alert.triggered_at and alert.resolved_at:
                duration = str(alert.resolved_at - alert.triggered_at)

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': 'good',
                    'title': f"RESOLVED: {alert.name}",
                    'text': f"Alert has been resolved: {alert.description}",
                    'fields': [
                        {'title': 'Duration', 'value': duration, 'short': True},
                        {'title': 'Resolved At', 'value': alert.resolved_at.isoformat() if alert.resolved_at else 'N/A', 'short': True}
                    ],
                    'footer': f"Alert ID: {alert.id}"
                }]
            }

            response = requests.post(
                config['webhook_url'],
                json=payload,
                timeout=10
            )
            response.raise_for_status()

            logger.info(f"Slack resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending Slack resolution notification: {e}")

    def acknowledge_alert(self, alert_id: str, acknowledged_by: str) -> bool:
        """Acknowledge an alert"""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.status = AlertStatus.ACKNOWLEDGED
            alert.acknowledged_at = get_current_timestamp()
            alert.acknowledged_by = acknowledged_by

            logger.info(f"Alert acknowledged by {acknowledged_by}: {alert.name}")
            return True

        return False

    def suppress_alert(self, alert_id: str) -> bool:
        """Suppress an alert"""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.status = AlertStatus.SUPPRESSED

            logger.info(f"Alert suppressed: {alert.name}")
            return True

        return False

    def add_suppression_rule(self, name: str, rule: Dict[str, Any]) -> None:
        """Add alert suppression rule"""
        self.suppression_rules[name] = rule
        logger.info(f"Added suppression rule: {name}")

    def remove_suppression_rule(self, name: str) -> None:
        """Remove alert suppression rule"""
        if name in self.suppression_rules:
            del self.suppression_rules[name]
            logger.info(f"Removed suppression rule: {name}")

    def get_active_alerts(self, severity: AlertSeverity = None) -> List[Alert]:
        """Get active alerts, optionally filtered by severity"""
        alerts = list(self.active_alerts.values())

        if severity:
            alerts = [alert for alert in alerts if alert.severity == severity]

        return sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)

    def get_alert_history(self, limit: int = 100, severity: AlertSeverity = None) -> List[Alert]:
        """Get alert history"""
        alerts = list(self.alert_history)

        if severity:
            alerts = [alert for alert in alerts if alert.severity == severity]

        return sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)[:limit]

    def get_alert_summary(self) -> Dict[str, Any]:
        """Get alert summary statistics"""
        active_by_severity = defaultdict(int)
        for alert in self.active_alerts.values():
            active_by_severity[alert.severity.value] += 1

        return {
            'active_alerts': len(self.active_alerts),
            'active_by_severity': dict(active_by_severity),
            'total_triggered': self.alerts_triggered,
            'total_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'alert_rules': len(self.alert_rules),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules)
        }

    def register_callback(self, callback: Callable[[Alert], None]) -> None:
        """Register alert callback"""
        self.alert_callbacks.append(callback)
        logger.info(f"Registered alert callback: {callback.__name__}")

    def get_stats(self) -> Dict[str, Any]:
        """Get alert manager statistics"""
        return {
            'alert_rules': len(self.alert_rules),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules),
            'alerts_triggered': self.alerts_triggered,
            'alerts_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'registered_callbacks': len(self.alert_callbacks)
        }


# Global alert manager instance
alert_manager = AlertManager()
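A usage sketch for the alert flow above (the rule name, metric name, threshold, and webhook URL are illustrative, not part of the commit): an alert fires only after its condition has held for duration_seconds, and it resolves automatically once the condition clears.

    from COBY.monitoring.alert_manager import (
        alert_manager, AlertRule, AlertSeverity, NotificationChannel
    )

    # Fire when p95 ingest latency stays above 500 ms for 60 seconds.
    alert_manager.add_alert_rule(AlertRule(
        name='high_ingest_latency',
        metric_name='ingest_latency_p95_ms',
        threshold=500.0,
        comparison='gt',
        duration_seconds=60,
        severity=AlertSeverity.WARNING,
    ))

    # Route WARNING and CRITICAL alerts to a webhook endpoint.
    alert_manager.add_notification_channel(NotificationChannel(
        name='ops_webhook',
        type='webhook',
        config={'url': 'https://example.com/hooks/alerts'},
        severity_filter=[AlertSeverity.WARNING, AlertSeverity.CRITICAL],
    ))

    # Feed metric samples from the data pipeline; rule evaluation happens here.
    alert_manager.update_metric_value('ingest_latency_p95_ms', 620.0)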
497 COBY/monitoring/latency_tracker.py Normal file
@@ -0,0 +1,497 @@
"""
End-to-end latency tracking for data processing pipeline.
"""

import time
import threading
from typing import Dict, List, Optional, Any, ContextManager
from collections import defaultdict, deque
from datetime import datetime, timezone
from dataclasses import dataclass
from contextlib import contextmanager

from ..utils.logging import get_logger, set_correlation_id
from ..utils.timing import get_current_timestamp
# Import will be done lazily to avoid circular imports

logger = get_logger(__name__)


@dataclass
class LatencyMeasurement:
    """Individual latency measurement"""
    operation: str
    start_time: float
    end_time: float
    duration_ms: float
    correlation_id: Optional[str] = None
    metadata: Dict[str, Any] = None

    @property
    def duration_seconds(self) -> float:
        """Get duration in seconds"""
        return self.duration_ms / 1000.0


class LatencyTracker:
    """
    Tracks end-to-end latency for various operations in the system.

    Provides context managers for easy latency measurement and
    comprehensive latency analytics.
    """

    def __init__(self, max_measurements: int = 10000):
        """
        Initialize latency tracker.

        Args:
            max_measurements: Maximum number of measurements to keep in memory
        """
        self.max_measurements = max_measurements

        # Latency storage
        self.measurements: Dict[str, deque] = defaultdict(
            lambda: deque(maxlen=max_measurements)
        )

        # Active measurements (for tracking ongoing operations)
        self.active_measurements: Dict[str, Dict[str, float]] = defaultdict(dict)

        # Thread safety
        self._lock = threading.RLock()

        # Statistics
        self.total_measurements = 0

        logger.info(f"Latency tracker initialized with max {max_measurements} measurements")

    @contextmanager
    def measure(self, operation: str, correlation_id: str = None,
                metadata: Dict[str, Any] = None) -> ContextManager[None]:
        """
        Context manager for measuring operation latency.

        Args:
            operation: Name of the operation being measured
            correlation_id: Optional correlation ID for tracking
            metadata: Optional metadata to store with measurement

        Usage:
            with latency_tracker.measure('data_processing'):
                # Your code here
                process_data()
        """
        start_time = time.perf_counter()
        measurement_id = f"{operation}_{start_time}_{threading.get_ident()}"

        # Store active measurement
        with self._lock:
            self.active_measurements[operation][measurement_id] = start_time

        try:
            yield
        finally:
            end_time = time.perf_counter()
            duration_ms = (end_time - start_time) * 1000

            # Create measurement
            measurement = LatencyMeasurement(
                operation=operation,
                start_time=start_time,
                end_time=end_time,
                duration_ms=duration_ms,
                correlation_id=correlation_id,
                metadata=metadata or {}
            )

            # Store measurement
            with self._lock:
                self.measurements[operation].append(measurement)
                self.active_measurements[operation].pop(measurement_id, None)
                self.total_measurements += 1

            # Record in metrics collector
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.observe_histogram(
                    f"{operation}_latency_ms",
                    duration_ms,
                    labels={'operation': operation}
                )
            except ImportError:
                pass  # Metrics collector not available

            logger.debug(f"Measured {operation}: {duration_ms:.2f}ms")

    def start_measurement(self, operation: str, measurement_id: str = None,
                          correlation_id: str = None) -> str:
        """
        Start a manual latency measurement.

        Args:
            operation: Name of the operation
            measurement_id: Optional custom measurement ID
            correlation_id: Optional correlation ID

        Returns:
            str: Measurement ID for ending the measurement
        """
        start_time = time.perf_counter()

        if measurement_id is None:
            measurement_id = f"{operation}_{start_time}_{threading.get_ident()}"

        with self._lock:
            self.active_measurements[operation][measurement_id] = start_time

        logger.debug(f"Started measurement {measurement_id} for {operation}")
        return measurement_id

    def end_measurement(self, operation: str, measurement_id: str,
                        metadata: Dict[str, Any] = None) -> Optional[LatencyMeasurement]:
        """
        End a manual latency measurement.

        Args:
            operation: Name of the operation
            measurement_id: Measurement ID from start_measurement
            metadata: Optional metadata to store

        Returns:
            LatencyMeasurement: The completed measurement, or None if not found
        """
        end_time = time.perf_counter()

        with self._lock:
            start_time = self.active_measurements[operation].pop(measurement_id, None)

        if start_time is None:
            logger.warning(f"No active measurement found: {measurement_id}")
            return None

        duration_ms = (end_time - start_time) * 1000

        # Create measurement
        measurement = LatencyMeasurement(
            operation=operation,
            start_time=start_time,
            end_time=end_time,
            duration_ms=duration_ms,
            metadata=metadata or {}
        )

        # Store measurement
        with self._lock:
            self.measurements[operation].append(measurement)
            self.total_measurements += 1

        # Record in metrics collector
        try:
            from .metrics_collector import metrics_collector
            metrics_collector.observe_histogram(
                f"{operation}_latency_ms",
                duration_ms,
                labels={'operation': operation}
            )
        except ImportError:
            pass  # Metrics collector not available

        logger.debug(f"Completed measurement {measurement_id}: {duration_ms:.2f}ms")
        return measurement

    def get_latency_stats(self, operation: str) -> Dict[str, float]:
        """
        Get latency statistics for an operation.

        Args:
            operation: Operation name

        Returns:
            Dict: Latency statistics
        """
        with self._lock:
            measurements = list(self.measurements[operation])

        if not measurements:
            return {
                'count': 0,
                'avg_ms': 0.0,
                'min_ms': 0.0,
                'max_ms': 0.0,
                'p50_ms': 0.0,
                'p95_ms': 0.0,
                'p99_ms': 0.0
            }

        durations = [m.duration_ms for m in measurements]
        durations.sort()

        count = len(durations)
        avg_ms = sum(durations) / count
        min_ms = durations[0]
        max_ms = durations[-1]

        # Calculate percentiles
        p50_ms = durations[int(0.50 * count)]
        p95_ms = durations[int(0.95 * count)]
        p99_ms = durations[int(0.99 * count)]

        return {
            'count': count,
            'avg_ms': avg_ms,
            'min_ms': min_ms,
            'max_ms': max_ms,
            'p50_ms': p50_ms,
            'p95_ms': p95_ms,
            'p99_ms': p99_ms
        }

    def get_all_latency_stats(self) -> Dict[str, Dict[str, float]]:
        """Get latency statistics for all operations"""
        with self._lock:
            operations = list(self.measurements.keys())

        return {
            operation: self.get_latency_stats(operation)
            for operation in operations
        }

    def get_recent_measurements(self, operation: str, limit: int = 100) -> List[LatencyMeasurement]:
        """
        Get recent measurements for an operation.

        Args:
            operation: Operation name
            limit: Maximum number of measurements to return

        Returns:
            List[LatencyMeasurement]: Recent measurements
        """
        with self._lock:
            measurements = list(self.measurements[operation])

        return measurements[-limit:]

    def get_slow_operations(self, threshold_ms: float = 100.0) -> List[Dict[str, Any]]:
        """
        Get operations that are slower than threshold.

        Args:
            threshold_ms: Latency threshold in milliseconds

        Returns:
            List: Slow operations with their stats
        """
        slow_operations = []

        for operation in self.measurements.keys():
            stats = self.get_latency_stats(operation)
            if stats['avg_ms'] > threshold_ms:
                slow_operations.append({
                    'operation': operation,
                    'avg_latency_ms': stats['avg_ms'],
                    'p95_latency_ms': stats['p95_ms'],
                    'count': stats['count']
                })

        # Sort by average latency (descending)
        slow_operations.sort(key=lambda x: x['avg_latency_ms'], reverse=True)
        return slow_operations

    def get_latency_trends(self, operation: str, window_size: int = 100) -> Dict[str, Any]:
        """
        Get latency trends for an operation.

        Args:
            operation: Operation name
            window_size: Number of recent measurements to analyze

        Returns:
            Dict: Trend analysis
        """
        recent_measurements = self.get_recent_measurements(operation, window_size)

        if len(recent_measurements) < 2:
            return {'trend': 'insufficient_data'}

        # Split into two halves for trend analysis
        mid_point = len(recent_measurements) // 2
        first_half = recent_measurements[:mid_point]
        second_half = recent_measurements[mid_point:]

        first_avg = sum(m.duration_ms for m in first_half) / len(first_half)
        second_avg = sum(m.duration_ms for m in second_half) / len(second_half)

        # Calculate trend
        change_percent = ((second_avg - first_avg) / first_avg) * 100

        if abs(change_percent) < 5:
            trend = 'stable'
        elif change_percent > 0:
            trend = 'increasing'
        else:
            trend = 'decreasing'

        return {
            'trend': trend,
            'change_percent': change_percent,
            'first_half_avg_ms': first_avg,
            'second_half_avg_ms': second_avg,
            'sample_size': len(recent_measurements)
        }

    def get_active_measurements_count(self) -> Dict[str, int]:
        """Get count of currently active measurements by operation"""
        with self._lock:
            return {
                operation: len(measurements)
                for operation, measurements in self.active_measurements.items()
                if measurements
            }

    def get_latency_distribution(self, operation: str, bucket_size_ms: float = 10.0) -> Dict[str, int]:
        """
        Get latency distribution in buckets.

        Args:
            operation: Operation name
            bucket_size_ms: Size of each bucket in milliseconds

        Returns:
            Dict: Latency distribution buckets
        """
        with self._lock:
            measurements = list(self.measurements[operation])

        if not measurements:
            return {}

        # Create buckets
        distribution = defaultdict(int)

        for measurement in measurements:
            bucket = int(measurement.duration_ms // bucket_size_ms) * bucket_size_ms
            bucket_label = f"{bucket:.0f}-{bucket + bucket_size_ms:.0f}ms"
            distribution[bucket_label] += 1

        return dict(distribution)

    def export_measurements(self, operation: str = None,
                            format: str = 'json') -> List[Dict[str, Any]]:
        """
        Export measurements for analysis.

        Args:
            operation: Specific operation to export (None for all)
            format: Export format ('json', 'csv')

        Returns:
            List: Exported measurement data
        """
        exported_data = []

        operations = [operation] if operation else list(self.measurements.keys())

        for op in operations:
            with self._lock:
                measurements = list(self.measurements[op])

            for measurement in measurements:
                data = {
                    'operation': measurement.operation,
                    'duration_ms': measurement.duration_ms,
                    'start_time': measurement.start_time,
                    'end_time': measurement.end_time,
                    'correlation_id': measurement.correlation_id,
                    'metadata': measurement.metadata
                }
                exported_data.append(data)

        return exported_data

    def clear_measurements(self, operation: str = None) -> None:
        """
        Clear measurements for an operation or all operations.

        Args:
            operation: Specific operation to clear (None for all)
        """
        with self._lock:
            if operation:
                self.measurements[operation].clear()
                logger.info(f"Cleared measurements for operation: {operation}")
            else:
                self.measurements.clear()
                self.total_measurements = 0
                logger.info("Cleared all measurements")

    def get_performance_impact(self, operation: str) -> Dict[str, Any]:
        """
        Analyze performance impact of an operation.

        Args:
            operation: Operation name

        Returns:
            Dict: Performance impact analysis
        """
        stats = self.get_latency_stats(operation)
        trends = self.get_latency_trends(operation)

        # Determine impact level
        avg_latency = stats['avg_ms']
        if avg_latency < 10:
            impact_level = 'low'
        elif avg_latency < 100:
            impact_level = 'medium'
        else:
            impact_level = 'high'

        # Check for performance degradation
        degradation = trends.get('trend') == 'increasing' and trends.get('change_percent', 0) > 20

        return {
            'operation': operation,
            'impact_level': impact_level,
            'avg_latency_ms': avg_latency,
            'p95_latency_ms': stats['p95_ms'],
            'measurement_count': stats['count'],
            'trend': trends.get('trend', 'unknown'),
            'performance_degradation': degradation,
            'recommendations': self._get_performance_recommendations(stats, trends)
        }

    def _get_performance_recommendations(self, stats: Dict[str, float],
                                         trends: Dict[str, Any]) -> List[str]:
        """Get performance recommendations based on stats and trends"""
        recommendations = []

        if stats['avg_ms'] > 100:
            recommendations.append("Consider optimizing this operation - average latency is high")

        if stats['p95_ms'] > stats['avg_ms'] * 3:
            recommendations.append("High latency variance detected - investigate outliers")

        if trends.get('trend') == 'increasing':
            recommendations.append("Latency is trending upward - monitor for performance degradation")

        if stats['count'] < 10:
            recommendations.append("Insufficient data for reliable analysis - collect more measurements")

        return recommendations

    def get_stats(self) -> Dict[str, Any]:
        """Get latency tracker statistics"""
        with self._lock:
            return {
                'total_measurements': self.total_measurements,
                'operations_tracked': len(self.measurements),
                'active_measurements': sum(len(m) for m in self.active_measurements.values()),
                'max_measurements': self.max_measurements,
                'operations': list(self.measurements.keys())
            }


# Global latency tracker instance
latency_tracker = LatencyTracker()
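A usage sketch for the tracker above, combining the context-manager path and the manual start/end path (the operation names are illustrative, and time.sleep stands in for real pipeline work):

    import time
    from COBY.monitoring.latency_tracker import latency_tracker

    # Context-manager style: the measurement is recorded even if the block raises.
    with latency_tracker.measure('orderbook_normalization', correlation_id='req-123'):
        time.sleep(0.01)  # stand-in for real processing work

    # Manual style, useful when the start and end live in different callbacks.
    mid = latency_tracker.start_measurement('db_write')
    time.sleep(0.01)      # stand-in for real work
    latency_tracker.end_measurement('db_write', mid)

    print(latency_tracker.get_latency_stats('orderbook_normalization'))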
561
COBY/monitoring/memory_monitor.py
Normal file
561
COBY/monitoring/memory_monitor.py
Normal file
@ -0,0 +1,561 @@
|
|||||||
|
"""
|
||||||
|
Memory usage monitoring and garbage collection optimization.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import gc
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import tracemalloc
|
||||||
|
from typing import Dict, List, Optional, Any, Tuple
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from ..utils.logging import get_logger
|
||||||
|
from ..utils.timing import get_current_timestamp
|
||||||
|
# Import will be done lazily to avoid circular imports
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MemorySnapshot:
|
||||||
|
"""Memory usage snapshot"""
|
||||||
|
timestamp: datetime
|
||||||
|
total_memory_mb: float
|
||||||
|
available_memory_mb: float
|
||||||
|
process_memory_mb: float
|
||||||
|
gc_collections: Dict[int, int]
|
||||||
|
gc_objects: int
|
||||||
|
tracemalloc_current_mb: Optional[float] = None
|
||||||
|
tracemalloc_peak_mb: Optional[float] = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MemoryLeak:
|
||||||
|
"""Memory leak detection result"""
|
||||||
|
object_type: str
|
||||||
|
count_increase: int
|
||||||
|
size_increase_mb: float
|
||||||
|
growth_rate_per_hour: float
|
||||||
|
severity: str # 'low', 'medium', 'high'
|
||||||
|
|
||||||
|
|
||||||
|
class MemoryMonitor:
|
||||||
|
"""
|
||||||
|
Monitors memory usage, detects leaks, and optimizes garbage collection.
|
||||||
|
|
||||||
|
Provides detailed memory analytics and automatic GC optimization.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, enable_tracemalloc: bool = True, snapshot_interval: float = 30.0):
|
||||||
|
"""
|
||||||
|
Initialize memory monitor.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
enable_tracemalloc: Whether to enable detailed memory tracing
|
||||||
|
snapshot_interval: How often to take memory snapshots (seconds)
|
||||||
|
"""
|
||||||
|
self.enable_tracemalloc = enable_tracemalloc
|
||||||
|
self.snapshot_interval = snapshot_interval
|
||||||
|
|
||||||
|
# Memory tracking
|
||||||
|
self.memory_snapshots: deque = deque(maxlen=1000)
|
||||||
|
self.object_counts: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
|
||||||
|
|
||||||
|
# GC optimization
|
||||||
|
self.gc_stats: Dict[str, Any] = {}
|
||||||
|
self.gc_thresholds = gc.get_threshold()
|
||||||
|
self.auto_gc_enabled = True
|
||||||
|
|
||||||
|
# Leak detection
|
||||||
|
self.leak_detection_enabled = True
|
||||||
|
self.detected_leaks: List[MemoryLeak] = []
|
||||||
|
|
||||||
|
# Monitoring control
|
||||||
|
self._monitoring = False
|
||||||
|
self._monitor_thread: Optional[threading.Thread] = None
|
||||||
|
|
||||||
|
# Initialize tracemalloc if enabled
|
||||||
|
if self.enable_tracemalloc and not tracemalloc.is_tracing():
|
||||||
|
tracemalloc.start()
|
||||||
|
logger.info("Started tracemalloc for detailed memory tracking")
|
||||||
|
|
||||||
|
logger.info(f"Memory monitor initialized (tracemalloc: {self.enable_tracemalloc})")
|
||||||
|
|
||||||
|
def start_monitoring(self) -> None:
|
||||||
|
"""Start memory monitoring"""
|
||||||
|
if self._monitoring:
|
||||||
|
logger.warning("Memory monitoring already running")
|
||||||
|
return
|
||||||
|
|
||||||
|
self._monitoring = True
|
||||||
|
self._monitor_thread = threading.Thread(
|
||||||
|
target=self._monitoring_loop,
|
||||||
|
name="MemoryMonitor",
|
||||||
|
daemon=True
|
||||||
|
)
|
||||||
|
self._monitor_thread.start()
|
||||||
|
logger.info("Started memory monitoring")
|
||||||
|
|
||||||
|
def stop_monitoring(self) -> None:
|
||||||
|
"""Stop memory monitoring"""
|
||||||
|
if not self._monitoring:
|
||||||
|
return
|
||||||
|
|
||||||
|
self._monitoring = False
|
||||||
|
if self._monitor_thread:
|
||||||
|
self._monitor_thread.join(timeout=5.0)
|
||||||
|
logger.info("Stopped memory monitoring")
|
||||||
|
|
||||||
|
def _monitoring_loop(self) -> None:
|
||||||
|
"""Main monitoring loop"""
|
||||||
|
import time
|
||||||
|
|
||||||
|
while self._monitoring:
|
||||||
|
try:
|
||||||
|
# Take memory snapshot
|
||||||
|
self._take_memory_snapshot()
|
||||||
|
|
||||||
|
# Update object counts
|
||||||
|
self._update_object_counts()
|
||||||
|
|
||||||
|
# Check for memory leaks
|
||||||
|
if self.leak_detection_enabled:
|
||||||
|
self._check_for_leaks()
|
||||||
|
|
||||||
|
# Optimize garbage collection
|
||||||
|
if self.auto_gc_enabled:
|
||||||
|
self._optimize_gc()
|
||||||
|
|
||||||
|
time.sleep(self.snapshot_interval)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in memory monitoring loop: {e}")
|
||||||
|
time.sleep(self.snapshot_interval)
|
||||||
|
|
||||||
|
def _take_memory_snapshot(self) -> None:
|
||||||
|
"""Take a memory usage snapshot"""
|
||||||
|
try:
|
||||||
|
import psutil
|
||||||
|
|
||||||
|
# Get system memory info
|
||||||
|
memory = psutil.virtual_memory()
|
||||||
|
|
||||||
|
# Get process memory info
|
||||||
|
process = psutil.Process()
|
||||||
|
process_memory = process.memory_info()
|
||||||
|
|
||||||
|
# Get GC stats
|
||||||
|
gc_collections = {i: gc.get_count()[i] for i in range(3)}
|
||||||
|
gc_objects = len(gc.get_objects())
|
||||||
|
|
||||||
|
# Get tracemalloc stats if enabled
|
||||||
|
tracemalloc_current_mb = None
|
||||||
|
tracemalloc_peak_mb = None
|
||||||
|
|
||||||
|
if self.enable_tracemalloc and tracemalloc.is_tracing():
|
||||||
|
current, peak = tracemalloc.get_traced_memory()
|
||||||
|
tracemalloc_current_mb = current / (1024 * 1024)
|
||||||
|
tracemalloc_peak_mb = peak / (1024 * 1024)
|
||||||
|
|
||||||
|
# Create snapshot
|
||||||
|
snapshot = MemorySnapshot(
|
||||||
|
timestamp=get_current_timestamp(),
|
||||||
|
total_memory_mb=memory.total / (1024 * 1024),
|
||||||
|
available_memory_mb=memory.available / (1024 * 1024),
|
||||||
|
process_memory_mb=process_memory.rss / (1024 * 1024),
|
||||||
|
gc_collections=gc_collections,
|
||||||
|
gc_objects=gc_objects,
|
||||||
|
tracemalloc_current_mb=tracemalloc_current_mb,
|
||||||
|
tracemalloc_peak_mb=tracemalloc_peak_mb
|
||||||
|
)
|
||||||
|
|
||||||
|
self.memory_snapshots.append(snapshot)
|
||||||
|
|
||||||
|
# Update metrics
|
||||||
|
try:
|
||||||
|
from .metrics_collector import metrics_collector
|
||||||
|
metrics_collector.set_gauge('memory_total_mb', snapshot.total_memory_mb)
|
||||||
|
metrics_collector.set_gauge('memory_available_mb', snapshot.available_memory_mb)
|
||||||
|
metrics_collector.set_gauge('memory_process_mb', snapshot.process_memory_mb)
|
||||||
|
metrics_collector.set_gauge('memory_gc_objects', snapshot.gc_objects)
|
||||||
|
|
||||||
|
if tracemalloc_current_mb is not None:
|
||||||
|
metrics_collector.set_gauge('memory_tracemalloc_current_mb', tracemalloc_current_mb)
|
||||||
|
metrics_collector.set_gauge('memory_tracemalloc_peak_mb', tracemalloc_peak_mb)
|
||||||
|
except ImportError:
|
||||||
|
pass # Metrics collector not available
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error taking memory snapshot: {e}")
|
||||||
|
|
||||||
|
def _update_object_counts(self) -> None:
|
||||||
|
"""Update object counts by type"""
|
||||||
|
try:
|
||||||
|
# Count objects by type
|
||||||
|
object_counts = defaultdict(int)
|
||||||
|
|
||||||
|
for obj in gc.get_objects():
|
||||||
|
obj_type = type(obj).__name__
|
||||||
|
object_counts[obj_type] += 1
|
||||||
|
|
||||||
|
# Store counts with timestamp
|
||||||
|
timestamp = get_current_timestamp()
|
||||||
|
for obj_type, count in object_counts.items():
|
||||||
|
self.object_counts[obj_type].append((timestamp, count))
|
||||||
|
|
||||||
|
# Update metrics for common types
|
||||||
|
try:
|
||||||
|
from .metrics_collector import metrics_collector
|
||||||
|
common_types = ['dict', 'list', 'tuple', 'str', 'function', 'type']
|
||||||
|
for obj_type in common_types:
|
||||||
|
if obj_type in object_counts:
|
||||||
|
metrics_collector.set_gauge(
|
||||||
|
f'memory_objects_{obj_type}',
|
||||||
|
object_counts[obj_type]
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
pass # Metrics collector not available
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error updating object counts: {e}")
|
||||||
|
|
||||||
|
def _check_for_leaks(self) -> None:
|
||||||
|
"""Check for potential memory leaks"""
|
||||||
|
try:
|
||||||
|
if len(self.memory_snapshots) < 10:
|
||||||
|
return # Need more data
|
||||||
|
|
||||||
|
# Check for consistent memory growth
|
||||||
|
recent_snapshots = list(self.memory_snapshots)[-10:]
|
||||||
|
memory_values = [s.process_memory_mb for s in recent_snapshots]
|
||||||
|
|
||||||
|
# Simple linear regression to detect growth trend
|
||||||
|
if self._is_memory_growing(memory_values):
|
||||||
|
# Check object count growth
|
||||||
|
potential_leaks = self._analyze_object_growth()
|
||||||
|
|
||||||
|
for leak in potential_leaks:
|
||||||
|
if leak not in self.detected_leaks:
|
||||||
|
self.detected_leaks.append(leak)
|
||||||
|
logger.warning(f"Potential memory leak detected: {leak.object_type}")
|
||||||
|
|
||||||
|
# Record leak detection
|
||||||
|
try:
|
||||||
|
from .metrics_collector import metrics_collector
|
||||||
|
metrics_collector.increment_counter('memory_leaks_detected')
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error checking for leaks: {e}")
|
||||||
|
|
||||||
|
def _is_memory_growing(self, memory_values: List[float], threshold: float = 5.0) -> bool:
|
||||||
|
"""Check if memory is consistently growing"""
|
||||||
|
if len(memory_values) < 5:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check if memory increased by more than threshold MB
|
||||||
|
growth = memory_values[-1] - memory_values[0]
|
||||||
|
return growth > threshold

    def _analyze_object_growth(self) -> List[MemoryLeak]:
        """Analyze object count growth to identify potential leaks"""
        leaks = []

        for obj_type, counts in self.object_counts.items():
            if len(counts) < 10:
                continue

            # Get recent counts
            recent_counts = list(counts)[-10:]
            timestamps = [item[0] for item in recent_counts]
            count_values = [item[1] for item in recent_counts]

            # Check for growth
            if len(count_values) >= 2:
                growth = count_values[-1] - count_values[0]
                time_diff = (timestamps[-1] - timestamps[0]).total_seconds() / 3600  # hours

                if growth > 100 and time_diff > 0:  # More than 100 objects growth
                    growth_rate = growth / time_diff

                    # Determine severity
                    if growth_rate > 1000:
                        severity = 'high'
                    elif growth_rate > 100:
                        severity = 'medium'
                    else:
                        severity = 'low'

                    leak = MemoryLeak(
                        object_type=obj_type,
                        count_increase=growth,
                        size_increase_mb=growth * 0.001,  # Rough estimate
                        growth_rate_per_hour=growth_rate,
                        severity=severity
                    )
                    leaks.append(leak)

        return leaks
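
    # Illustration (not part of the original commit): if the oldest of the last ten
    # samples counted 1,000 'dict' objects and the newest, one hour later, counts
    # 1,600, then growth = 600 over time_diff = 1.0h, so growth_rate = 600/h and the
    # leak is classified 'medium' (between the 100/h and 1000/h cut-offs above).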

    def _optimize_gc(self) -> None:
        """Optimize garbage collection based on memory usage"""
        try:
            if not self.memory_snapshots:
                return

            latest_snapshot = self.memory_snapshots[-1]
            memory_usage_percent = (
                (latest_snapshot.total_memory_mb - latest_snapshot.available_memory_mb) /
                latest_snapshot.total_memory_mb * 100
            )

            # Adjust GC thresholds based on memory pressure
            if memory_usage_percent > 85:
                # High memory pressure - more aggressive GC
                new_thresholds = (500, 10, 10)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Enabled aggressive garbage collection due to high memory usage")

                # Force collection
                collected = gc.collect()
                try:
                    from .metrics_collector import metrics_collector
                    metrics_collector.increment_counter('memory_gc_forced')
                except ImportError:
                    pass  # Metrics collector not available
                logger.debug(f"Forced GC collected {collected} objects")

            elif memory_usage_percent < 50:
                # Low memory pressure - less aggressive GC
                new_thresholds = (1000, 20, 20)
                if gc.get_threshold() != new_thresholds:
                    gc.set_threshold(*new_thresholds)
                    logger.info("Reduced garbage collection frequency due to low memory usage")

            # Update GC stats
            self.gc_stats = {
                'threshold': gc.get_threshold(),
                'counts': gc.get_count(),
                'collections': gc.get_stats()
            }

        except Exception as e:
            logger.error(f"Error optimizing GC: {e}")

    def force_garbage_collection(self) -> Dict[str, int]:
        """Force garbage collection and return statistics"""
        try:
            # Get counts before collection
            before_counts = gc.get_count()
            before_objects = len(gc.get_objects())

            # Force collection for all generations
            collected = [gc.collect(generation) for generation in range(3)]
            total_collected = sum(collected)

            # Get counts after collection
            after_counts = gc.get_count()
            after_objects = len(gc.get_objects())

            # Update metrics
            try:
                from .metrics_collector import metrics_collector
                metrics_collector.increment_counter('memory_gc_manual')
                metrics_collector.set_gauge('memory_gc_objects_collected', total_collected)
            except ImportError:
                pass

            result = {
                'total_collected': total_collected,
                'by_generation': collected,
                'objects_before': before_objects,
                'objects_after': after_objects,
                'objects_freed': before_objects - after_objects,
                'counts_before': before_counts,
                'counts_after': after_counts
            }

            logger.info(f"Manual GC collected {total_collected} objects, freed {result['objects_freed']} objects")
            return result

        except Exception as e:
            logger.error(f"Error during forced garbage collection: {e}")
            return {}
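
    # Illustrative return value (numbers are made up, keys match the dict built above):
    #   {'total_collected': 312, 'by_generation': [290, 15, 7],
    #    'objects_before': 48210, 'objects_after': 47890, 'objects_freed': 320,
    #    'counts_before': (450, 9, 2), 'counts_after': (0, 0, 0)}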

    def get_memory_usage_summary(self) -> Dict[str, Any]:
        """Get current memory usage summary"""
        if not self.memory_snapshots:
            return {}

        latest = self.memory_snapshots[-1]

        # Calculate memory usage percentage
        memory_usage_percent = (
            (latest.total_memory_mb - latest.available_memory_mb) /
            latest.total_memory_mb * 100
        )

        return {
            'timestamp': latest.timestamp.isoformat(),
            'total_memory_mb': latest.total_memory_mb,
            'available_memory_mb': latest.available_memory_mb,
            'used_memory_mb': latest.total_memory_mb - latest.available_memory_mb,
            'memory_usage_percent': memory_usage_percent,
            'process_memory_mb': latest.process_memory_mb,
            'gc_objects': latest.gc_objects,
            'gc_collections': latest.gc_collections,
            'tracemalloc_current_mb': latest.tracemalloc_current_mb,
            'tracemalloc_peak_mb': latest.tracemalloc_peak_mb
        }

    def get_memory_trends(self, hours: int = 1) -> Dict[str, Any]:
        """Get memory usage trends over specified time period"""
        if not self.memory_snapshots:
            return {}

        from datetime import timedelta
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter snapshots
        recent_snapshots = [
            s for s in self.memory_snapshots
            if s.timestamp >= cutoff_time
        ]

        if len(recent_snapshots) < 2:
            return {'trend': 'insufficient_data'}

        # Calculate trends
        process_memory_values = [s.process_memory_mb for s in recent_snapshots]
        gc_object_values = [s.gc_objects for s in recent_snapshots]

        return {
            'process_memory': {
                'start_mb': process_memory_values[0],
                'end_mb': process_memory_values[-1],
                'change_mb': process_memory_values[-1] - process_memory_values[0],
                'max_mb': max(process_memory_values),
                'min_mb': min(process_memory_values),
                'avg_mb': sum(process_memory_values) / len(process_memory_values)
            },
            'gc_objects': {
                'start_count': gc_object_values[0],
                'end_count': gc_object_values[-1],
                'change_count': gc_object_values[-1] - gc_object_values[0],
                'max_count': max(gc_object_values),
                'min_count': min(gc_object_values),
                'avg_count': sum(gc_object_values) / len(gc_object_values)
            },
            'sample_count': len(recent_snapshots),
            'time_period_hours': hours
        }

    def get_top_memory_consumers(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory consuming object types"""
        if not self.object_counts:
            return []

        # Get latest counts
        latest_counts = {}
        for obj_type, counts in self.object_counts.items():
            if counts:
                latest_counts[obj_type] = counts[-1][1]  # Get count from (timestamp, count) tuple

        # Sort by count
        sorted_types = sorted(
            latest_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return [
            {
                'object_type': obj_type,
                'count': count,
                'estimated_size_mb': count * 0.001  # Rough estimate
            }
            for obj_type, count in sorted_types[:limit]
        ]

    def get_detected_leaks(self) -> List[Dict[str, Any]]:
        """Get detected memory leaks"""
        return [
            {
                'object_type': leak.object_type,
                'count_increase': leak.count_increase,
                'size_increase_mb': leak.size_increase_mb,
                'growth_rate_per_hour': leak.growth_rate_per_hour,
                'severity': leak.severity
            }
            for leak in self.detected_leaks
        ]

    def get_tracemalloc_top(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Get top memory allocations from tracemalloc"""
        if not self.enable_tracemalloc or not tracemalloc.is_tracing():
            return []

        try:
            snapshot = tracemalloc.take_snapshot()
            top_stats = snapshot.statistics('lineno')

            return [
                {
                    'filename': stat.traceback.format()[0],
                    'size_mb': stat.size / (1024 * 1024),
                    'count': stat.count
                }
                for stat in top_stats[:limit]
            ]

        except Exception as e:
            logger.error(f"Error getting tracemalloc top: {e}")
            return []

    def clear_leak_history(self) -> None:
        """Clear detected leak history"""
        self.detected_leaks.clear()
        logger.info("Cleared memory leak history")

    def get_gc_stats(self) -> Dict[str, Any]:
        """Get garbage collection statistics"""
        return {
            'thresholds': gc.get_threshold(),
            'counts': gc.get_count(),
            'stats': gc.get_stats(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'is_enabled': gc.isenabled()
        }

    def set_gc_thresholds(self, gen0: int, gen1: int, gen2: int) -> None:
        """Set garbage collection thresholds"""
        gc.set_threshold(gen0, gen1, gen2)
        logger.info(f"Set GC thresholds to ({gen0}, {gen1}, {gen2})")

    def enable_auto_gc_optimization(self, enabled: bool = True) -> None:
        """Enable or disable automatic GC optimization"""
        self.auto_gc_enabled = enabled
        logger.info(f"Auto GC optimization {'enabled' if enabled else 'disabled'}")

    def enable_leak_detection(self, enabled: bool = True) -> None:
        """Enable or disable memory leak detection"""
        self.leak_detection_enabled = enabled
        logger.info(f"Memory leak detection {'enabled' if enabled else 'disabled'}")

    def get_stats(self) -> Dict[str, Any]:
        """Get memory monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'snapshot_interval': self.snapshot_interval,
            'snapshots_count': len(self.memory_snapshots),
            'object_types_tracked': len(self.object_counts),
            'detected_leaks': len(self.detected_leaks),
            'tracemalloc_enabled': self.enable_tracemalloc and tracemalloc.is_tracing(),
            'auto_gc_enabled': self.auto_gc_enabled,
            'leak_detection_enabled': self.leak_detection_enabled,
            'gc_thresholds': gc.get_threshold()
        }


# Global memory monitor instance
memory_monitor = MemoryMonitor()
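

# Usage sketch (illustrative, not part of the original commit): exercising the
# MemoryMonitor API defined above from a script. Only methods shown in this file
# are used; the printed values depend entirely on the running process.
if __name__ == "__main__":
    memory_monitor.enable_leak_detection(True)

    print("memory summary:", memory_monitor.get_memory_usage_summary())

    gc_result = memory_monitor.force_garbage_collection()
    print("objects freed:", gc_result.get('objects_freed', 0))

    print("monitor stats:", memory_monitor.get_stats())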
395
COBY/monitoring/metrics_collector.py
Normal file
@ -0,0 +1,395 @@
"""
|
||||||
|
Comprehensive metrics collection for all system components.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import psutil
|
||||||
|
import threading
|
||||||
|
from typing import Dict, List, Optional, Any, Callable
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from ..utils.logging import get_logger
|
||||||
|
from ..utils.timing import get_current_timestamp
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MetricPoint:
|
||||||
|
"""Individual metric data point"""
|
||||||
|
name: str
|
||||||
|
value: float
|
||||||
|
timestamp: datetime
|
||||||
|
labels: Dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
|
def to_prometheus_format(self) -> str:
|
||||||
|
"""Convert to Prometheus format"""
|
||||||
|
labels_str = ""
|
||||||
|
if self.labels:
|
||||||
|
label_pairs = [f'{k}="{v}"' for k, v in self.labels.items()]
|
||||||
|
labels_str = "{" + ",".join(label_pairs) + "}"
|
||||||
|
|
||||||
|
return f"{self.name}{labels_str} {self.value} {int(self.timestamp.timestamp() * 1000)}"


@dataclass
class SystemMetrics:
    """System-level metrics"""
    cpu_usage: float
    memory_usage: float
    memory_available: float
    disk_usage: float
    network_bytes_sent: int
    network_bytes_recv: int
    active_connections: int
    timestamp: datetime


class MetricsCollector:
    """
    Collects and manages performance metrics from all system components.

    Provides Prometheus-compatible metrics and real-time monitoring data.
    """

    def __init__(self, collection_interval: float = 1.0, max_history: int = 10000):
        """
        Initialize metrics collector.

        Args:
            collection_interval: How often to collect system metrics (seconds)
            max_history: Maximum number of metric points to keep in memory
        """
        self.collection_interval = collection_interval
        self.max_history = max_history

        # Metric storage
        self.metrics: Dict[str, deque] = defaultdict(lambda: deque(maxlen=max_history))
        self.counters: Dict[str, float] = defaultdict(float)
        self.gauges: Dict[str, float] = defaultdict(float)
        self.histograms: Dict[str, List[float]] = defaultdict(list)

        # System metrics
        self.system_metrics_history: deque = deque(maxlen=max_history)

        # Collection control
        self._collecting = False
        self._collection_thread: Optional[threading.Thread] = None
        self._lock = threading.RLock()

        # Callbacks for custom metrics
        self.metric_callbacks: List[Callable[[], Dict[str, float]]] = []

        logger.info(f"Metrics collector initialized with {collection_interval}s interval")

    def start_collection(self) -> None:
        """Start automatic metrics collection"""
        if self._collecting:
            logger.warning("Metrics collection already running")
            return

        self._collecting = True
        self._collection_thread = threading.Thread(
            target=self._collection_loop,
            name="MetricsCollector",
            daemon=True
        )
        self._collection_thread.start()
        logger.info("Started metrics collection")

    def stop_collection(self) -> None:
        """Stop automatic metrics collection"""
        if not self._collecting:
            return

        self._collecting = False
        if self._collection_thread:
            self._collection_thread.join(timeout=5.0)
        logger.info("Stopped metrics collection")

    def _collection_loop(self) -> None:
        """Main collection loop"""
        while self._collecting:
            try:
                # Collect system metrics
                self._collect_system_metrics()

                # Collect custom metrics from callbacks
                self._collect_custom_metrics()

                time.sleep(self.collection_interval)

            except Exception as e:
                logger.error(f"Error in metrics collection loop: {e}")
                time.sleep(self.collection_interval)

    def _collect_system_metrics(self) -> None:
        """Collect system-level metrics"""
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=None)

            # Memory usage
            memory = psutil.virtual_memory()

            # Disk usage (root partition)
            disk = psutil.disk_usage('/')

            # Network stats
            network = psutil.net_io_counters()

            # Active connections
            connections = len(psutil.net_connections())

            # Create system metrics object
            sys_metrics = SystemMetrics(
                cpu_usage=cpu_percent,
                memory_usage=memory.percent,
                memory_available=memory.available / (1024**3),  # GB
                disk_usage=disk.percent,
                network_bytes_sent=network.bytes_sent,
                network_bytes_recv=network.bytes_recv,
                active_connections=connections,
                timestamp=get_current_timestamp()
            )

            with self._lock:
                self.system_metrics_history.append(sys_metrics)

                # Update gauges
                self.gauges['system_cpu_usage'] = cpu_percent
                self.gauges['system_memory_usage'] = memory.percent
                self.gauges['system_memory_available_gb'] = memory.available / (1024**3)
                self.gauges['system_disk_usage'] = disk.percent
                self.gauges['system_active_connections'] = connections

                # Update counters (cumulative)
                self.counters['system_network_bytes_sent'] = network.bytes_sent
                self.counters['system_network_bytes_recv'] = network.bytes_recv

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _collect_custom_metrics(self) -> None:
        """Collect metrics from registered callbacks"""
        for callback in self.metric_callbacks:
            try:
                custom_metrics = callback()
                if isinstance(custom_metrics, dict):
                    with self._lock:
                        for name, value in custom_metrics.items():
                            self.record_gauge(name, value)
            except Exception as e:
                logger.error(f"Error collecting custom metrics: {e}")

    def record_counter(self, name: str, value: float = 1.0, labels: Dict[str, str] = None) -> None:
        """Record a counter metric (cumulative)"""
        with self._lock:
            self.counters[name] += value

            # Store metric point
            point = MetricPoint(
                name=name,
                value=self.counters[name],
                timestamp=get_current_timestamp(),
                labels=labels or {}
            )
            self.metrics[name].append(point)

    def record_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Record a gauge metric (current value)"""
        with self._lock:
            self.gauges[name] = value

            # Store metric point
            point = MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                labels=labels or {}
            )
            self.metrics[name].append(point)

    def record_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Record a histogram metric (for latency, sizes, etc.)"""
        with self._lock:
            self.histograms[name].append(value)

            # Keep only recent values
            if len(self.histograms[name]) > 1000:
                self.histograms[name] = self.histograms[name][-1000:]

            # Store metric point
            point = MetricPoint(
                name=name,
                value=value,
                timestamp=get_current_timestamp(),
                labels=labels or {}
            )
            self.metrics[name].append(point)

    def increment_counter(self, name: str, labels: Dict[str, str] = None) -> None:
        """Increment a counter by 1"""
        self.record_counter(name, 1.0, labels)

    def set_gauge(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Set a gauge value"""
        self.record_gauge(name, value, labels)

    def observe_histogram(self, name: str, value: float, labels: Dict[str, str] = None) -> None:
        """Observe a value in a histogram"""
        self.record_histogram(name, value, labels)
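
    # Quick reference for the three metric kinds above (metric names are illustrative):
    #   increment_counter('ws_messages_processed')         # monotonically growing total
    #   set_gauge('orderbook_depth', 250)                   # current value, may go up or down
    #   observe_histogram('processing_latency_ms', 12.5)    # per-event samples for percentiles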

    def get_current_metrics(self) -> Dict[str, Any]:
        """Get current metric values"""
        with self._lock:
            return {
                'counters': dict(self.counters),
                'gauges': dict(self.gauges),
                'histograms': {
                    name: {
                        'count': len(values),
                        'sum': sum(values),
                        'avg': sum(values) / len(values) if values else 0,
                        'min': min(values) if values else 0,
                        'max': max(values) if values else 0,
                        'p50': self._percentile(values, 50) if values else 0,
                        'p95': self._percentile(values, 95) if values else 0,
                        'p99': self._percentile(values, 99) if values else 0
                    }
                    for name, values in self.histograms.items()
                },
                'system': self.get_latest_system_metrics()
            }

    def get_latest_system_metrics(self) -> Optional[Dict[str, Any]]:
        """Get the latest system metrics"""
        with self._lock:
            if not self.system_metrics_history:
                return None

            latest = self.system_metrics_history[-1]
            return {
                'cpu_usage': latest.cpu_usage,
                'memory_usage': latest.memory_usage,
                'memory_available_gb': latest.memory_available,
                'disk_usage': latest.disk_usage,
                'network_bytes_sent': latest.network_bytes_sent,
                'network_bytes_recv': latest.network_bytes_recv,
                'active_connections': latest.active_connections,
                'timestamp': latest.timestamp.isoformat()
            }

    def get_metric_history(self, name: str, limit: int = 100) -> List[Dict[str, Any]]:
        """Get historical values for a specific metric"""
        with self._lock:
            if name not in self.metrics:
                return []

            points = list(self.metrics[name])[-limit:]
            return [
                {
                    'value': point.value,
                    'timestamp': point.timestamp.isoformat(),
                    'labels': point.labels
                }
                for point in points
            ]

    def get_prometheus_metrics(self) -> str:
        """Export metrics in Prometheus format"""
        lines = []

        with self._lock:
            # Export counters
            for name, value in self.counters.items():
                lines.append(f"# TYPE {name} counter")
                lines.append(f"{name} {value}")

            # Export gauges
            for name, value in self.gauges.items():
                lines.append(f"# TYPE {name} gauge")
                lines.append(f"{name} {value}")

            # Export histograms
            for name, values in self.histograms.items():
                if values:
                    lines.append(f"# TYPE {name} histogram")
                    lines.append(f"{name}_count {len(values)}")
                    lines.append(f"{name}_sum {sum(values)}")

                    # Add percentiles
                    for percentile in [50, 95, 99]:
                        p_value = self._percentile(values, percentile)
                        lines.append(f"{name}_percentile{{quantile=\"0.{percentile:02d}\"}} {p_value}")

        return "\n".join(lines)

    def register_callback(self, callback: Callable[[], Dict[str, float]]) -> None:
        """Register a callback for custom metrics collection"""
        self.metric_callbacks.append(callback)
        logger.info(f"Registered metrics callback: {callback.__name__}")

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get a performance summary"""
        current_metrics = self.get_current_metrics()

        # Calculate rates and trends
        summary = {
            'timestamp': get_current_timestamp().isoformat(),
            'system': current_metrics.get('system', {}),
            'counters': current_metrics.get('counters', {}),
            'gauges': current_metrics.get('gauges', {}),
            'performance_indicators': {}
        }

        # Add performance indicators
        histograms = current_metrics.get('histograms', {})
        for name, stats in histograms.items():
            if 'latency' in name.lower():
                summary['performance_indicators'][f"{name}_avg_ms"] = stats['avg']
                summary['performance_indicators'][f"{name}_p95_ms"] = stats['p95']

        return summary

    def _percentile(self, values: List[float], percentile: int) -> float:
        """Calculate percentile of values"""
        if not values:
            return 0.0

        sorted_values = sorted(values)
        index = int((percentile / 100.0) * len(sorted_values))
        index = min(index, len(sorted_values) - 1)
        return sorted_values[index]
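
    # Worked example of this nearest-rank-style approximation: for values
    # [5, 1, 3, 2, 4], sorted_values = [1, 2, 3, 4, 5]; p50 -> index int(0.5 * 5) = 2
    # -> 3, and p95 -> index min(int(0.95 * 5), 4) = 4 -> 5.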

    def reset_metrics(self) -> None:
        """Reset all metrics (useful for testing)"""
        with self._lock:
            self.metrics.clear()
            self.counters.clear()
            self.gauges.clear()
            self.histograms.clear()
            self.system_metrics_history.clear()

        logger.info("All metrics reset")

    def get_stats(self) -> Dict[str, Any]:
        """Get collector statistics"""
        with self._lock:
            return {
                'collecting': self._collecting,
                'collection_interval': self.collection_interval,
                'max_history': self.max_history,
                'total_metrics': len(self.metrics),
                'total_counters': len(self.counters),
                'total_gauges': len(self.gauges),
                'total_histograms': len(self.histograms),
                'system_metrics_count': len(self.system_metrics_history),
                'registered_callbacks': len(self.metric_callbacks)
            }


# Global metrics collector instance
metrics_collector = MetricsCollector()
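

# Usage sketch (illustrative, not part of the original commit): recording a few
# metrics by hand and dumping the Prometheus export. The metric names here are
# placeholders, not names the rest of the system necessarily uses.
if __name__ == "__main__":
    metrics_collector.start_collection()

    metrics_collector.increment_counter('demo_events_processed')
    metrics_collector.observe_histogram('demo_processing_latency_ms', 12.5)

    print(metrics_collector.get_prometheus_metrics())
    metrics_collector.stop_collection()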
556
COBY/monitoring/performance_monitor.py
Normal file
@ -0,0 +1,556 @@
"""
|
||||||
|
Performance monitoring dashboard and real-time performance tracking.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import time
|
||||||
|
import asyncio
|
||||||
|
import threading
|
||||||
|
from typing import Dict, List, Optional, Any, Callable
|
||||||
|
from collections import defaultdict, deque
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from ..utils.logging import get_logger
|
||||||
|
from ..utils.timing import get_current_timestamp
|
||||||
|
from .metrics_collector import MetricsCollector
|
||||||
|
|
||||||
|
logger = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class PerformanceAlert:
|
||||||
|
"""Performance alert definition"""
|
||||||
|
name: str
|
||||||
|
metric_name: str
|
||||||
|
threshold: float
|
||||||
|
comparison: str # 'gt', 'lt', 'eq'
|
||||||
|
duration: int # seconds
|
||||||
|
message: str
|
||||||
|
severity: str = 'warning' # 'info', 'warning', 'critical'
|
||||||
|
triggered_at: Optional[datetime] = None
|
||||||
|
resolved_at: Optional[datetime] = None
|
||||||
|
|
||||||
|
def is_triggered(self, value: float) -> bool:
|
||||||
|
"""Check if alert should be triggered"""
|
||||||
|
if self.comparison == 'gt':
|
||||||
|
return value > self.threshold
|
||||||
|
elif self.comparison == 'lt':
|
||||||
|
return value < self.threshold
|
||||||
|
elif self.comparison == 'eq':
|
||||||
|
return abs(value - self.threshold) < 0.001
|
||||||
|
return False
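
    # Illustration: an alert with comparison='gt' and threshold=80.0 triggers for
    # value=91.5 (91.5 > 80.0) and stays quiet for value=72.0; with comparison='eq'
    # the 0.001 tolerance treats 80.0004 as equal to 80.0.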


@dataclass
class PerformanceThresholds:
    """Performance threshold configuration"""
    max_cpu_usage: float = 80.0  # %
    max_memory_usage: float = 85.0  # %
    min_memory_available: float = 1.0  # GB
    max_latency_ms: float = 100.0  # milliseconds
    max_error_rate: float = 5.0  # %
    min_throughput: float = 100.0  # operations/second


class PerformanceMonitor:
    """
    Real-time performance monitoring with alerting and dashboard data.

    Monitors system performance, tracks KPIs, and provides alerts.
    """

    def __init__(self, metrics_collector: MetricsCollector = None):
        """
        Initialize performance monitor.

        Args:
            metrics_collector: Metrics collector instance
        """
        if metrics_collector is None:
            from .metrics_collector import metrics_collector as default_collector
            self.metrics_collector = default_collector
        else:
            self.metrics_collector = metrics_collector
        self.thresholds = PerformanceThresholds()

        # Alert management
        self.alerts: Dict[str, PerformanceAlert] = {}
        self.active_alerts: Dict[str, PerformanceAlert] = {}
        self.alert_history: deque = deque(maxlen=1000)

        # Performance tracking
        self.performance_history: deque = deque(maxlen=10000)
        self.kpi_history: Dict[str, deque] = defaultdict(lambda: deque(maxlen=1000))

        # Monitoring control
        self._monitoring = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._monitor_interval = 5.0  # seconds

        # Alert callbacks
        self.alert_callbacks: List[Callable[[PerformanceAlert], None]] = []

        # Initialize default alerts
        self._setup_default_alerts()

        logger.info("Performance monitor initialized")

    def _setup_default_alerts(self) -> None:
        """Setup default performance alerts"""
        default_alerts = [
            PerformanceAlert(
                name="high_cpu_usage",
                metric_name="system_cpu_usage",
                threshold=self.thresholds.max_cpu_usage,
                comparison="gt",
                duration=30,
                message="CPU usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="high_memory_usage",
                metric_name="system_memory_usage",
                threshold=self.thresholds.max_memory_usage,
                comparison="gt",
                duration=30,
                message="Memory usage is above {threshold}%",
                severity="warning"
            ),
            PerformanceAlert(
                name="low_memory_available",
                metric_name="system_memory_available_gb",
                threshold=self.thresholds.min_memory_available,
                comparison="lt",
                duration=60,
                message="Available memory is below {threshold}GB",
                severity="critical"
            ),
            PerformanceAlert(
                name="high_latency",
                metric_name="processing_latency_ms",
                threshold=self.thresholds.max_latency_ms,
                comparison="gt",
                duration=60,
                message="Processing latency is above {threshold}ms",
                severity="warning"
            )
        ]

        for alert in default_alerts:
            self.add_alert(alert)
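
    # A custom alert can be registered the same way (thresholds here are illustrative):
    #   monitor.add_alert(PerformanceAlert(
    #       name="high_error_rate",
    #       metric_name="kpi_error_rate_percent",
    #       threshold=5.0,
    #       comparison="gt",
    #       duration=60,
    #       message="Error rate is above {threshold}%",
    #       severity="critical",
    #   ))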

    def start_monitoring(self) -> None:
        """Start performance monitoring"""
        if self._monitoring:
            logger.warning("Performance monitoring already running")
            return

        self._monitoring = True
        self._monitor_thread = threading.Thread(
            target=self._monitoring_loop,
            name="PerformanceMonitor",
            daemon=True
        )
        self._monitor_thread.start()
        logger.info("Started performance monitoring")

    def stop_monitoring(self) -> None:
        """Stop performance monitoring"""
        if not self._monitoring:
            return

        self._monitoring = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5.0)
        logger.info("Stopped performance monitoring")

    def _monitoring_loop(self) -> None:
        """Main monitoring loop"""
        while self._monitoring:
            try:
                # Collect current performance data
                self._collect_performance_data()

                # Check alerts
                self._check_alerts()

                # Update KPIs
                self._update_kpis()

                time.sleep(self._monitor_interval)

            except Exception as e:
                logger.error(f"Error in performance monitoring loop: {e}")
                time.sleep(self._monitor_interval)

    def _collect_performance_data(self) -> None:
        """Collect current performance data"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()

            if current_metrics is None:
                return

            # Create performance snapshot
            performance_data = {
                'timestamp': get_current_timestamp(),
                'system': current_metrics.get('system', {}),
                'counters': current_metrics.get('counters', {}),
                'gauges': current_metrics.get('gauges', {}),
                'histograms': current_metrics.get('histograms', {})
            }

            self.performance_history.append(performance_data)

        except Exception as e:
            logger.error(f"Error collecting performance data: {e}")

    def _check_alerts(self) -> None:
        """Check all alerts against current metrics"""
        if self.metrics_collector is None:
            return

        current_metrics = self.metrics_collector.get_current_metrics()
        if current_metrics is None:
            return

        current_time = get_current_timestamp()

        for alert_name, alert in self.alerts.items():
            try:
                # Get metric value
                metric_value = self._get_metric_value(alert.metric_name, current_metrics)
                if metric_value is None:
                    continue

                # Check if alert should be triggered
                should_trigger = alert.is_triggered(metric_value)

                if should_trigger and alert_name not in self.active_alerts:
                    # Trigger alert
                    alert.triggered_at = current_time
                    self.active_alerts[alert_name] = alert
                    self.alert_history.append(alert)

                    # Format message
                    message = alert.message.format(
                        threshold=alert.threshold,
                        value=metric_value
                    )

                    logger.warning(f"Performance alert triggered: {alert.name} - {message}")

                    # Notify callbacks
                    for callback in self.alert_callbacks:
                        try:
                            callback(alert)
                        except Exception as e:
                            logger.error(f"Error in alert callback: {e}")

                elif not should_trigger and alert_name in self.active_alerts:
                    # Resolve alert
                    resolved_alert = self.active_alerts.pop(alert_name)
                    resolved_alert.resolved_at = current_time

                    logger.info(f"Performance alert resolved: {alert.name}")

            except Exception as e:
                logger.error(f"Error checking alert {alert_name}: {e}")

    def _get_metric_value(self, metric_name: str, metrics: Dict[str, Any]) -> Optional[float]:
        """Get metric value from metrics data"""
        if not metrics:
            return None

        # Check gauges first
        gauges = metrics.get('gauges', {})
        if gauges and metric_name in gauges:
            return gauges[metric_name]

        # Check counters
        counters = metrics.get('counters', {})
        if counters and metric_name in counters:
            return counters[metric_name]

        # Check histograms (use average)
        histograms = metrics.get('histograms', {})
        if histograms and metric_name in histograms:
            hist_data = histograms[metric_name]
            if hist_data and isinstance(hist_data, dict):
                return hist_data.get('avg', 0)

        # Check system metrics
        system_metrics = metrics.get('system', {})
        if system_metrics and metric_name in system_metrics:
            return system_metrics[metric_name]

        return None

    def _update_kpis(self) -> None:
        """Update key performance indicators"""
        try:
            if self.metrics_collector is None:
                return

            current_metrics = self.metrics_collector.get_current_metrics()
            if current_metrics is None:
                return

            timestamp = get_current_timestamp()

            # Calculate throughput (operations per second)
            throughput = self._calculate_throughput()
            self.kpi_history['throughput_ops_per_sec'].append({
                'value': throughput,
                'timestamp': timestamp
            })

            # Calculate error rate
            error_rate = self._calculate_error_rate()
            self.kpi_history['error_rate_percent'].append({
                'value': error_rate,
                'timestamp': timestamp
            })

            # Calculate average latency
            avg_latency = self._calculate_average_latency()
            self.kpi_history['avg_latency_ms'].append({
                'value': avg_latency,
                'timestamp': timestamp
            })

            # Update metrics collector with KPIs
            self.metrics_collector.set_gauge('kpi_throughput_ops_per_sec', throughput)
            self.metrics_collector.set_gauge('kpi_error_rate_percent', error_rate)
            self.metrics_collector.set_gauge('kpi_avg_latency_ms', avg_latency)

        except Exception as e:
            logger.error(f"Error updating KPIs: {e}")

    def _calculate_throughput(self) -> float:
        """Calculate operations per second throughput"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            counters = current_metrics.get('counters', {})

            # Sum up relevant operation counters
            total_ops = 0
            for name, value in counters.items():
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed']):
                    total_ops += value

            # Calculate rate (simple approximation)
            if len(self.performance_history) >= 2:
                prev_data = self.performance_history[-2]
                current_data = self.performance_history[-1]

                time_diff = (current_data['timestamp'] - prev_data['timestamp']).total_seconds()
                if time_diff > 0:
                    prev_ops = sum(
                        value for name, value in prev_data.get('counters', {}).items()
                        if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed'])
                    )
                    return (total_ops - prev_ops) / time_diff

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating throughput: {e}")
            return 0.0
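
    # Worked example (numbers are illustrative): if the previous snapshot summed the
    # '...processed/handled/completed' counters to 1000 and the current snapshot,
    # taken 5.0 seconds later, sums them to 1500, the reported throughput is
    # (1500 - 1000) / 5.0 = 100.0 operations per second.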

    def _calculate_error_rate(self) -> float:
        """Calculate error rate percentage"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            counters = current_metrics.get('counters', {})

            # Count errors and total operations
            total_errors = sum(
                value for name, value in counters.items()
                if 'error' in name.lower() or 'failed' in name.lower()
            )

            total_operations = sum(
                value for name, value in counters.items()
                if any(keyword in name.lower() for keyword in ['processed', 'handled', 'completed', 'total'])
            )

            if total_operations > 0:
                return (total_errors / total_operations) * 100

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating error rate: {e}")
            return 0.0

    def _calculate_average_latency(self) -> float:
        """Calculate average latency across all operations"""
        try:
            current_metrics = self.metrics_collector.get_current_metrics()
            histograms = current_metrics.get('histograms', {})

            # Find latency histograms
            latency_values = []
            for name, stats in histograms.items():
                if 'latency' in name.lower():
                    latency_values.append(stats.get('avg', 0))

            if latency_values:
                return sum(latency_values) / len(latency_values)

            return 0.0

        except Exception as e:
            logger.error(f"Error calculating average latency: {e}")
            return 0.0

    def add_alert(self, alert: PerformanceAlert) -> None:
        """Add a performance alert"""
        self.alerts[alert.name] = alert
        logger.info(f"Added performance alert: {alert.name}")

    def remove_alert(self, alert_name: str) -> None:
        """Remove a performance alert"""
        if alert_name in self.alerts:
            del self.alerts[alert_name]
            # Also remove from active alerts if present
            self.active_alerts.pop(alert_name, None)
            logger.info(f"Removed performance alert: {alert_name}")

    def get_active_alerts(self) -> List[PerformanceAlert]:
        """Get currently active alerts"""
        return list(self.active_alerts.values())

    def get_alert_history(self, limit: int = 100) -> List[PerformanceAlert]:
        """Get alert history"""
        return list(self.alert_history)[-limit:]

    def get_performance_dashboard_data(self) -> Dict[str, Any]:
        """Get data for performance dashboard"""
        current_metrics = {}
        if self.metrics_collector:
            current_metrics = self.metrics_collector.get_current_metrics() or {}

        system_metrics = current_metrics.get('system', {}) or {}

        return {
            'timestamp': get_current_timestamp().isoformat(),
            'system_metrics': system_metrics,
            'kpis': {
                name: list(history)[-10:] if history else []  # Last 10 points
                for name, history in self.kpi_history.items()
            },
            'active_alerts': [
                {
                    'name': alert.name,
                    'message': alert.message,
                    'severity': alert.severity.value if hasattr(alert.severity, 'value') else str(alert.severity),
                    'triggered_at': alert.triggered_at.isoformat() if alert.triggered_at else None
                }
                for alert in self.active_alerts.values()
            ],
            'performance_summary': {
                'cpu_usage': system_metrics.get('cpu_usage', 0) if system_metrics else 0,
                'memory_usage': system_metrics.get('memory_usage', 0) if system_metrics else 0,
                'active_connections': system_metrics.get('active_connections', 0) if system_metrics else 0,
                'throughput': self.kpi_history['throughput_ops_per_sec'][-1]['value'] if self.kpi_history['throughput_ops_per_sec'] else 0,
                'error_rate': self.kpi_history['error_rate_percent'][-1]['value'] if self.kpi_history['error_rate_percent'] else 0,
                'avg_latency': self.kpi_history['avg_latency_ms'][-1]['value'] if self.kpi_history['avg_latency_ms'] else 0
            }
        }

    def register_alert_callback(self, callback: Callable[[PerformanceAlert], None]) -> None:
        """Register callback for alert notifications"""
        self.alert_callbacks.append(callback)
        logger.info(f"Registered alert callback: {callback.__name__}")

    def update_thresholds(self, **kwargs) -> None:
        """Update performance thresholds"""
        for key, value in kwargs.items():
            if hasattr(self.thresholds, key):
                setattr(self.thresholds, key, value)
                logger.info(f"Updated threshold {key} to {value}")

    def get_performance_trends(self, hours: int = 24) -> Dict[str, Any]:
        """Get performance trends over specified time period"""
        cutoff_time = get_current_timestamp() - timedelta(hours=hours)

        # Filter performance history
        recent_data = [
            data for data in self.performance_history
            if data and data.get('timestamp') and data['timestamp'] >= cutoff_time
        ]

        if not recent_data:
            return {}

        # Calculate trends
        trends = {}

        # CPU usage trend
        cpu_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                cpu_values.append(system_data.get('cpu_usage', 0))

        if cpu_values:
            trends['cpu_usage'] = {
                'current': cpu_values[-1],
                'average': sum(cpu_values) / len(cpu_values),
                'max': max(cpu_values),
                'trend': 'increasing' if len(cpu_values) > 1 and cpu_values[-1] > cpu_values[0] else 'stable'
            }

        # Memory usage trend
        memory_values = []
        for data in recent_data:
            system_data = data.get('system', {})
            if system_data:
                memory_values.append(system_data.get('memory_usage', 0))

        if memory_values:
            trends['memory_usage'] = {
                'current': memory_values[-1],
                'average': sum(memory_values) / len(memory_values),
                'max': max(memory_values),
                'trend': 'increasing' if len(memory_values) > 1 and memory_values[-1] > memory_values[0] else 'stable'
            }

        return trends

    def get_stats(self) -> Dict[str, Any]:
        """Get performance monitor statistics"""
        return {
            'monitoring': self._monitoring,
            'monitor_interval': self._monitor_interval,
            'total_alerts': len(self.alerts),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'performance_history_count': len(self.performance_history),
            'kpi_metrics': list(self.kpi_history.keys()),
            'registered_callbacks': len(self.alert_callbacks),
            'thresholds': {
                'max_cpu_usage': self.thresholds.max_cpu_usage,
                'max_memory_usage': self.thresholds.max_memory_usage,
                'min_memory_available': self.thresholds.min_memory_available,
                'max_latency_ms': self.thresholds.max_latency_ms,
                'max_error_rate': self.thresholds.max_error_rate,
                'min_throughput': self.thresholds.min_throughput
            }
        }


# Global performance monitor instance (initialized lazily)
performance_monitor = None


def get_performance_monitor():
    """Get or create global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
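

# Usage sketch (illustrative, not part of the original commit): wiring the monitor
# up from application code. Only APIs defined in this file are used; log_alert is a
# hypothetical callback supplied by the caller.
if __name__ == "__main__":
    def log_alert(alert: PerformanceAlert) -> None:
        # Hypothetical handler: a real deployment might page an operator instead.
        print(f"ALERT {alert.severity}: {alert.name}")

    monitor = get_performance_monitor()
    monitor.register_alert_callback(log_alert)
    monitor.update_thresholds(max_cpu_usage=90.0)
    monitor.start_monitoring()

    time.sleep(10)  # let a couple of monitoring cycles run
    print(monitor.get_performance_dashboard_data())
    monitor.stop_monitoring()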