# gogo2/COBY/monitoring/alert_manager.py
"""
Alert management system for performance degradation and system issues.
"""
import smtplib
import json
from typing import Dict, List, Optional, Any, Callable
from collections import defaultdict, deque
from datetime import datetime, timezone, timedelta
from dataclasses import dataclass, field
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from enum import Enum

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)


class AlertSeverity(Enum):
    """Alert severity levels"""
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"


class AlertStatus(Enum):
    """Alert status"""
    ACTIVE = "active"
    RESOLVED = "resolved"
    ACKNOWLEDGED = "acknowledged"
    SUPPRESSED = "suppressed"


@dataclass
class Alert:
    """Alert definition"""
    id: str
    name: str
    description: str
    severity: AlertSeverity
    metric_name: str
    threshold: float
    comparison: str  # 'gt', 'lt', 'gte', 'lte', 'eq', 'ne'
    duration_seconds: int
    status: AlertStatus = AlertStatus.ACTIVE
    triggered_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None
    acknowledged_at: Optional[datetime] = None
    acknowledged_by: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to dictionary"""
        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'severity': self.severity.value,
            'metric_name': self.metric_name,
            'threshold': self.threshold,
            'comparison': self.comparison,
            'duration_seconds': self.duration_seconds,
            'status': self.status.value,
            'triggered_at': self.triggered_at.isoformat() if self.triggered_at else None,
            'resolved_at': self.resolved_at.isoformat() if self.resolved_at else None,
            'acknowledged_at': self.acknowledged_at.isoformat() if self.acknowledged_at else None,
            'acknowledged_by': self.acknowledged_by,
            'metadata': self.metadata
        }


@dataclass
class AlertRule:
    """Alert rule configuration"""
    name: str
    metric_name: str
    threshold: float
    comparison: str
    duration_seconds: int
    severity: AlertSeverity
    description: str = ""
    enabled: bool = True
    metadata: Dict[str, Any] = field(default_factory=dict)
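
# Illustrative rule definition (a sketch only; the metric name and numbers are
# hypothetical, not something this module defines):
#
#   memory_rule = AlertRule(
#       name="high_memory",
#       metric_name="memory_usage_percent",
#       threshold=85.0,
#       comparison="gt",         # see _evaluate_condition for supported operators
#       duration_seconds=120,    # condition must hold this long before triggering
#       severity=AlertSeverity.CRITICAL,
#       description="Memory usage above 85% for two minutes",
#   )
#   alert_manager.add_alert_rule(memory_rule)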


@dataclass
class NotificationChannel:
    """Notification channel configuration"""
    name: str
    type: str  # 'email', 'webhook', 'slack'
    config: Dict[str, Any]
    enabled: bool = True
    severity_filter: List[AlertSeverity] = field(default_factory=list)
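
# Config keys read by the notification senders further down in this module
# (see _send_email_notification / _send_webhook_notification /
# _send_slack_notification for the authoritative usage):
#
#   type 'email':   smtp_server, smtp_port, from_email, to_emails (list),
#                   optional use_tls (default True), username, password
#   type 'webhook': url, optional headers (dict), optional timeout (default 10s)
#   type 'slack':   webhook_url, channel, optional username (default 'AlertBot')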


class AlertManager:
    """
    Manages alerts, notifications, and alert lifecycle.

    Provides comprehensive alerting with multiple notification channels
    and alert suppression capabilities.
    """

    def __init__(self):
        """Initialize alert manager"""
        # Alert storage
        self.alert_rules: Dict[str, AlertRule] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: deque = deque(maxlen=10000)

        # Notification channels
        self.notification_channels: Dict[str, NotificationChannel] = {}

        # Alert state tracking
        self.metric_values: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
        self.alert_triggers: Dict[str, datetime] = {}

        # Suppression rules
        self.suppression_rules: Dict[str, Dict[str, Any]] = {}

        # Callbacks
        self.alert_callbacks: List[Callable[[Alert], None]] = []

        # Statistics
        self.alerts_triggered = 0
        self.alerts_resolved = 0
        self.notifications_sent = 0

        logger.info("Alert manager initialized")

    def add_alert_rule(self, rule: AlertRule) -> None:
        """Add an alert rule"""
        self.alert_rules[rule.name] = rule
        logger.info(f"Added alert rule: {rule.name}")

    def remove_alert_rule(self, rule_name: str) -> None:
        """Remove an alert rule"""
        if rule_name in self.alert_rules:
            del self.alert_rules[rule_name]

            # Also remove any active alerts for this rule
            alerts_to_remove = [
                alert_id for alert_id, alert in self.active_alerts.items()
                if alert.name == rule_name
            ]
            for alert_id in alerts_to_remove:
                del self.active_alerts[alert_id]

            logger.info(f"Removed alert rule: {rule_name}")

    def add_notification_channel(self, channel: NotificationChannel) -> None:
        """Add a notification channel"""
        self.notification_channels[channel.name] = channel
        logger.info(f"Added notification channel: {channel.name} ({channel.type})")

    def remove_notification_channel(self, channel_name: str) -> None:
        """Remove a notification channel"""
        if channel_name in self.notification_channels:
            del self.notification_channels[channel_name]
            logger.info(f"Removed notification channel: {channel_name}")

    def update_metric_value(self, metric_name: str, value: float) -> None:
        """Update metric value and check alerts"""
        timestamp = get_current_timestamp()
        self.metric_values[metric_name].append((timestamp, value))

        # Check all alert rules for this metric
        for rule_name, rule in self.alert_rules.items():
            if rule.metric_name == metric_name and rule.enabled:
                self._check_alert_rule(rule, value, timestamp)

    def _check_alert_rule(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Check if an alert rule should be triggered"""
        try:
            # Check if condition is met
            condition_met = self._evaluate_condition(rule.comparison, value, rule.threshold)
            alert_id = f"{rule.name}_{rule.metric_name}"

            if condition_met:
                # Check if we need to wait for duration
                if alert_id not in self.alert_triggers:
                    self.alert_triggers[alert_id] = timestamp
                    return

                # Check if duration has passed
                trigger_time = self.alert_triggers[alert_id]
                if (timestamp - trigger_time).total_seconds() >= rule.duration_seconds:
                    # Trigger alert if not already active
                    if alert_id not in self.active_alerts:
                        self._trigger_alert(rule, value, timestamp)
            else:
                # Condition not met - clear trigger time and resolve alert if active
                self.alert_triggers.pop(alert_id, None)
                if alert_id in self.active_alerts:
                    self._resolve_alert(alert_id, timestamp)
        except Exception as e:
            logger.error(f"Error checking alert rule {rule.name}: {e}")

    def _evaluate_condition(self, comparison: str, value: float, threshold: float) -> bool:
        """Evaluate alert condition"""
        if comparison == 'gt':
            return value > threshold
        elif comparison == 'lt':
            return value < threshold
        elif comparison == 'eq':
            return abs(value - threshold) < 0.001
        elif comparison == 'ne':
            return abs(value - threshold) >= 0.001
        elif comparison == 'gte':
            return value >= threshold
        elif comparison == 'lte':
            return value <= threshold
        else:
            logger.warning(f"Unknown comparison operator: {comparison}")
            return False

    def _trigger_alert(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Trigger an alert"""
        try:
            alert_id = f"{rule.name}_{rule.metric_name}"

            # Create alert
            alert = Alert(
                id=alert_id,
                name=rule.name,
                description=rule.description or f"{rule.metric_name} {rule.comparison} {rule.threshold}",
                severity=rule.severity,
                metric_name=rule.metric_name,
                threshold=rule.threshold,
                comparison=rule.comparison,
                duration_seconds=rule.duration_seconds,
                triggered_at=timestamp,
                metadata={
                    'current_value': value,
                    'rule_metadata': rule.metadata
                }
            )

            # Check suppression rules
            if self._is_suppressed(alert):
                alert.status = AlertStatus.SUPPRESSED
                logger.info(f"Alert suppressed: {alert.name}")
                return

            # Store alert
            self.active_alerts[alert_id] = alert
            self.alert_history.append(alert)
            self.alerts_triggered += 1

            logger.warning(f"Alert triggered: {alert.name} - {alert.description}")

            # Send notifications
            self._send_notifications(alert)

            # Call callbacks
            for callback in self.alert_callbacks:
                try:
                    callback(alert)
                except Exception as e:
                    logger.error(f"Error in alert callback: {e}")
        except Exception as e:
            logger.error(f"Error triggering alert: {e}")

    def _resolve_alert(self, alert_id: str, timestamp: datetime) -> None:
        """Resolve an alert"""
        try:
            if alert_id in self.active_alerts:
                alert = self.active_alerts[alert_id]
                alert.status = AlertStatus.RESOLVED
                alert.resolved_at = timestamp

                # Move to history and remove from active
                self.alert_history.append(alert)
                del self.active_alerts[alert_id]
                self.alerts_resolved += 1

                logger.info(f"Alert resolved: {alert.name}")

                # Send resolution notifications
                self._send_resolution_notifications(alert)
        except Exception as e:
            logger.error(f"Error resolving alert {alert_id}: {e}")

    def _is_suppressed(self, alert: Alert) -> bool:
        """Check if alert should be suppressed"""
        for rule_name, rule in self.suppression_rules.items():
            try:
                # Check if suppression rule applies
                if self._matches_suppression_rule(alert, rule):
                    return True
            except Exception as e:
                logger.error(f"Error checking suppression rule {rule_name}: {e}")
        return False

    def _matches_suppression_rule(self, alert: Alert, rule: Dict[str, Any]) -> bool:
        """Check if alert matches suppression rule"""
        # Check alert name pattern
        if 'alert_pattern' in rule:
            import re
            if not re.match(rule['alert_pattern'], alert.name):
                return False

        # Check severity
        if 'severity' in rule:
            if alert.severity.value not in rule['severity']:
                return False

        # Check time window
        if 'time_window' in rule:
            start_time = datetime.fromisoformat(rule['time_window']['start'])
            end_time = datetime.fromisoformat(rule['time_window']['end'])
            current_time = get_current_timestamp()
            if not (start_time <= current_time <= end_time):
                return False

        return True

    def _send_notifications(self, alert: Alert) -> None:
        """Send notifications for an alert"""
        for channel_name, channel in self.notification_channels.items():
            try:
                if not channel.enabled:
                    continue

                # Check severity filter
                if channel.severity_filter and alert.severity not in channel.severity_filter:
                    continue

                # Send notification based on channel type
                if channel.type == 'email':
                    self._send_email_notification(alert, channel)
                elif channel.type == 'webhook':
                    self._send_webhook_notification(alert, channel)
                elif channel.type == 'slack':
                    self._send_slack_notification(alert, channel)
                else:
                    logger.warning(f"Unknown notification channel type: {channel.type}")

                self.notifications_sent += 1
            except Exception as e:
                logger.error(f"Error sending notification via {channel_name}: {e}")

    def _send_resolution_notifications(self, alert: Alert) -> None:
        """Send resolution notifications"""
        for channel_name, channel in self.notification_channels.items():
            try:
                if not channel.enabled:
                    continue

                # Send resolution notification
                if channel.type == 'email':
                    self._send_email_resolution(alert, channel)
                elif channel.type == 'webhook':
                    self._send_webhook_resolution(alert, channel)
                elif channel.type == 'slack':
                    self._send_slack_resolution(alert, channel)
            except Exception as e:
                logger.error(f"Error sending resolution notification via {channel_name}: {e}")

    def _send_email_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email notification"""
        try:
            config = channel.config

            # Create message
            msg = MIMEMultipart()
            msg['From'] = config['from_email']
            msg['To'] = ', '.join(config['to_emails'])
            msg['Subject'] = f"[{alert.severity.value.upper()}] {alert.name}"

            # Create body
            body = f"""
            Alert: {alert.name}
            Severity: {alert.severity.value.upper()}
            Description: {alert.description}
            Metric: {alert.metric_name}
            Current Value: {alert.metadata.get('current_value', 'N/A')}
            Threshold: {alert.threshold}
            Triggered At: {alert.triggered_at.isoformat() if alert.triggered_at else 'N/A'}
            Alert ID: {alert.id}
            """
            msg.attach(MIMEText(body, 'plain'))

            # Send email
            with smtplib.SMTP(config['smtp_server'], config['smtp_port']) as server:
                if config.get('use_tls', True):
                    server.starttls()
                if 'username' in config and 'password' in config:
                    server.login(config['username'], config['password'])
                server.send_message(msg)

            logger.info(f"Email notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending email notification: {e}")

    def _send_webhook_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook notification"""
        try:
            import requests

            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_triggered'
            }

            response = requests.post(
                config['url'],
                json=payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10)
            )
            response.raise_for_status()

            logger.info(f"Webhook notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending webhook notification: {e}")

    def _send_slack_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack notification"""
        try:
            import requests

            config = channel.config

            # Create Slack message
            color = {
                AlertSeverity.INFO: 'good',
                AlertSeverity.WARNING: 'warning',
                AlertSeverity.CRITICAL: 'danger'
            }.get(alert.severity, 'warning')

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': color,
                    'title': f"{alert.severity.value.upper()}: {alert.name}",
                    'text': alert.description,
                    'fields': [
                        {'title': 'Metric', 'value': alert.metric_name, 'short': True},
                        {'title': 'Current Value', 'value': str(alert.metadata.get('current_value', 'N/A')), 'short': True},
                        {'title': 'Threshold', 'value': str(alert.threshold), 'short': True},
                        {'title': 'Triggered At', 'value': alert.triggered_at.isoformat() if alert.triggered_at else 'N/A', 'short': True}
                    ],
                    'footer': f"Alert ID: {alert.id}"
                }]
            }

            response = requests.post(
                config['webhook_url'],
                json=payload,
                timeout=10
            )
            response.raise_for_status()

            logger.info(f"Slack notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending Slack notification: {e}")

    def _send_email_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email resolution notification"""
        try:
            config = channel.config

            # Create message
            msg = MIMEMultipart()
            msg['From'] = config['from_email']
            msg['To'] = ', '.join(config['to_emails'])
            msg['Subject'] = f"[RESOLVED] {alert.name}"

            # Create body
            duration = ""
            if alert.triggered_at and alert.resolved_at:
                duration = str(alert.resolved_at - alert.triggered_at)

            body = f"""
            Alert RESOLVED: {alert.name}
            Severity: {alert.severity.value.upper()}
            Description: {alert.description}
            Metric: {alert.metric_name}
            Threshold: {alert.threshold}
            Triggered At: {alert.triggered_at.isoformat() if alert.triggered_at else 'N/A'}
            Resolved At: {alert.resolved_at.isoformat() if alert.resolved_at else 'N/A'}
            Duration: {duration}
            Alert ID: {alert.id}
            """
            msg.attach(MIMEText(body, 'plain'))

            # Send email
            with smtplib.SMTP(config['smtp_server'], config['smtp_port']) as server:
                if config.get('use_tls', True):
                    server.starttls()
                if 'username' in config and 'password' in config:
                    server.login(config['username'], config['password'])
                server.send_message(msg)

            logger.info(f"Email resolution notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending email resolution notification: {e}")

    def _send_webhook_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook resolution notification"""
        try:
            import requests

            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_resolved'
            }

            response = requests.post(
                config['url'],
                json=payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10)
            )
            response.raise_for_status()

            logger.info(f"Webhook resolution notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending webhook resolution notification: {e}")

    def _send_slack_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack resolution notification"""
        try:
            import requests

            config = channel.config

            duration = ""
            if alert.triggered_at and alert.resolved_at:
                duration = str(alert.resolved_at - alert.triggered_at)

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': 'good',
                    'title': f"RESOLVED: {alert.name}",
                    'text': f"Alert has been resolved: {alert.description}",
                    'fields': [
                        {'title': 'Duration', 'value': duration, 'short': True},
                        {'title': 'Resolved At', 'value': alert.resolved_at.isoformat() if alert.resolved_at else 'N/A', 'short': True}
                    ],
                    'footer': f"Alert ID: {alert.id}"
                }]
            }

            response = requests.post(
                config['webhook_url'],
                json=payload,
                timeout=10
            )
            response.raise_for_status()

            logger.info(f"Slack resolution notification sent for alert: {alert.name}")
        except Exception as e:
            logger.error(f"Error sending Slack resolution notification: {e}")

    def acknowledge_alert(self, alert_id: str, acknowledged_by: str) -> bool:
        """Acknowledge an alert"""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.status = AlertStatus.ACKNOWLEDGED
            alert.acknowledged_at = get_current_timestamp()
            alert.acknowledged_by = acknowledged_by

            logger.info(f"Alert acknowledged by {acknowledged_by}: {alert.name}")
            return True
        return False

    def suppress_alert(self, alert_id: str) -> bool:
        """Suppress an alert"""
        if alert_id in self.active_alerts:
            alert = self.active_alerts[alert_id]
            alert.status = AlertStatus.SUPPRESSED

            logger.info(f"Alert suppressed: {alert.name}")
            return True
        return False

    def add_suppression_rule(self, name: str, rule: Dict[str, Any]) -> None:
        """Add alert suppression rule"""
        self.suppression_rules[name] = rule
        logger.info(f"Added suppression rule: {name}")

    def remove_suppression_rule(self, name: str) -> None:
        """Remove alert suppression rule"""
        if name in self.suppression_rules:
            del self.suppression_rules[name]
            logger.info(f"Removed suppression rule: {name}")

    def get_active_alerts(self, severity: Optional[AlertSeverity] = None) -> List[Alert]:
        """Get active alerts, optionally filtered by severity"""
        alerts = list(self.active_alerts.values())
        if severity:
            alerts = [alert for alert in alerts if alert.severity == severity]
        return sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)

    def get_alert_history(self, limit: int = 100, severity: Optional[AlertSeverity] = None) -> List[Alert]:
        """Get alert history"""
        alerts = list(self.alert_history)
        if severity:
            alerts = [alert for alert in alerts if alert.severity == severity]
        return sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)[:limit]

    def get_alert_summary(self) -> Dict[str, Any]:
        """Get alert summary statistics"""
        active_by_severity = defaultdict(int)
        for alert in self.active_alerts.values():
            active_by_severity[alert.severity.value] += 1

        return {
            'active_alerts': len(self.active_alerts),
            'active_by_severity': dict(active_by_severity),
            'total_triggered': self.alerts_triggered,
            'total_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'alert_rules': len(self.alert_rules),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules)
        }

    def register_callback(self, callback: Callable[[Alert], None]) -> None:
        """Register alert callback"""
        self.alert_callbacks.append(callback)
        logger.info(f"Registered alert callback: {callback.__name__}")

    def get_stats(self) -> Dict[str, Any]:
        """Get alert manager statistics"""
        return {
            'alert_rules': len(self.alert_rules),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules),
            'alerts_triggered': self.alerts_triggered,
            'alerts_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'registered_callbacks': len(self.alert_callbacks)
        }


# Global alert manager instance
alert_manager = AlertManager()
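

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module).
# The metric name, threshold, and webhook URL are hypothetical placeholders,
# and because of the relative imports above this demo only runs in package
# context, e.g. `python -m COBY.monitoring.alert_manager` (package root assumed).
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import time

    # Hypothetical rule: warn when "cpu_usage_percent" stays above 90 for 5 seconds.
    alert_manager.add_alert_rule(AlertRule(
        name="high_cpu",
        metric_name="cpu_usage_percent",
        threshold=90.0,
        comparison="gt",
        duration_seconds=5,
        severity=AlertSeverity.WARNING,
        description="CPU usage above 90%",
    ))

    # Hypothetical webhook channel; posting to this placeholder URL will fail,
    # but the failure is caught and logged inside _send_webhook_notification.
    alert_manager.add_notification_channel(NotificationChannel(
        name="ops-webhook",
        type="webhook",
        config={"url": "https://example.invalid/hooks/alerts", "timeout": 5},
        severity_filter=[AlertSeverity.WARNING, AlertSeverity.CRITICAL],
    ))

    # Suppression rule keys mirror _matches_suppression_rule; this one only
    # suppresses INFO alerts, so the WARNING alert below still fires.
    alert_manager.add_suppression_rule("mute_info", {
        "alert_pattern": r"high_.*",
        "severity": ["info"],
    })

    # Callback invoked for every triggered alert.
    alert_manager.register_callback(lambda alert: print("ALERT:", alert.to_dict()))

    # The first breach only starts the duration window; the alert fires once a
    # later breach arrives at least duration_seconds after that first one.
    alert_manager.update_metric_value("cpu_usage_percent", 95.0)
    time.sleep(6)
    alert_manager.update_metric_value("cpu_usage_percent", 96.0)

    # A value back under the threshold resolves the active alert.
    alert_manager.update_metric_value("cpu_usage_percent", 40.0)

    print(alert_manager.get_alert_summary())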