"""
Alert management system for performance degradation and system issues.

The module defines the alert data model (``Alert``, ``AlertRule``,
``NotificationChannel``), the ``AlertManager`` that evaluates rules against
incoming metric samples and dispatches notifications, and a module-level
``alert_manager`` singleton.
"""

import json
import operator
import re
import smtplib
from collections import defaultdict, deque
from dataclasses import dataclass, field
from datetime import datetime, timezone, timedelta
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

from ..utils.logging import get_logger
from ..utils.timing import get_current_timestamp

logger = get_logger(__name__)

# Absolute tolerance used by the 'eq' / 'ne' comparisons on float metrics.
_EQ_TOLERANCE = 0.001

# Seconds to wait on the SMTP connection when the channel config has no
# explicit 'timeout'; without one a dead mail server blocks metric updates.
_DEFAULT_SMTP_TIMEOUT = 30


class AlertSeverity(Enum):
    """Alert severity levels"""
    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"


class AlertStatus(Enum):
    """Alert status"""
    ACTIVE = "active"
    RESOLVED = "resolved"
    ACKNOWLEDGED = "acknowledged"
    SUPPRESSED = "suppressed"


@dataclass
class Alert:
    """Alert definition"""
    id: str
    name: str
    description: str
    severity: AlertSeverity
    metric_name: str
    threshold: float
    comparison: str  # 'gt', 'lt', 'eq', 'ne', 'gte', 'lte'
    duration_seconds: int
    status: AlertStatus = AlertStatus.ACTIVE
    triggered_at: Optional[datetime] = None
    resolved_at: Optional[datetime] = None
    acknowledged_at: Optional[datetime] = None
    acknowledged_by: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Convert alert to a dictionary.

        Enum fields are emitted as their ``.value`` and datetimes as ISO-8601
        strings (``None`` when unset). ``metadata`` is passed through as-is.
        """
        def iso(moment: Optional[datetime]) -> Optional[str]:
            return moment.isoformat() if moment else None

        return {
            'id': self.id,
            'name': self.name,
            'description': self.description,
            'severity': self.severity.value,
            'metric_name': self.metric_name,
            'threshold': self.threshold,
            'comparison': self.comparison,
            'duration_seconds': self.duration_seconds,
            'status': self.status.value,
            'triggered_at': iso(self.triggered_at),
            'resolved_at': iso(self.resolved_at),
            'acknowledged_at': iso(self.acknowledged_at),
            'acknowledged_by': self.acknowledged_by,
            'metadata': self.metadata,
        }


@dataclass
class AlertRule:
    """Alert rule configuration"""
    name: str
    metric_name: str
    threshold: float
    comparison: str  # 'gt', 'lt', 'eq', 'ne', 'gte', 'lte'
    duration_seconds: int
    severity: AlertSeverity
    description: str = ""
    enabled: bool = True
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class NotificationChannel:
    """Notification channel configuration"""
    name: str
    type: str  # 'email', 'webhook', 'slack'
    config: Dict[str, Any]
    enabled: bool = True
    # Empty list means "no filter": every severity is delivered.
    severity_filter: List[AlertSeverity] = field(default_factory=list)


class AlertManager:
    """
    Manages alerts, notifications, and alert lifecycle.

    Provides comprehensive alerting with multiple notification channels
    and alert suppression capabilities.
    """

    # Comparison operator name -> predicate(value, threshold).
    _COMPARATORS: Dict[str, Callable[[float, float], bool]] = {
        'gt': operator.gt,
        'lt': operator.lt,
        'gte': operator.ge,
        'lte': operator.le,
        'eq': lambda value, threshold: abs(value - threshold) < _EQ_TOLERANCE,
        'ne': lambda value, threshold: abs(value - threshold) >= _EQ_TOLERANCE,
    }

    def __init__(self):
        """Initialize alert manager"""
        # Alert storage
        self.alert_rules: Dict[str, AlertRule] = {}
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: deque = deque(maxlen=10000)

        # Notification channels
        self.notification_channels: Dict[str, NotificationChannel] = {}

        # Alert state tracking
        self.metric_values: Dict[str, deque] = defaultdict(lambda: deque(maxlen=100))
        # alert id -> timestamp at which the rule's condition first became true
        self.alert_triggers: Dict[str, datetime] = {}

        # Suppression rules
        self.suppression_rules: Dict[str, Dict[str, Any]] = {}

        # Callbacks
        self.alert_callbacks: List[Callable[[Alert], None]] = []

        # Statistics
        self.alerts_triggered = 0
        self.alerts_resolved = 0
        self.notifications_sent = 0

        logger.info("Alert manager initialized")

    # ------------------------------------------------------------------
    # Rule and channel registration
    # ------------------------------------------------------------------

    @staticmethod
    def _alert_id(rule: AlertRule) -> str:
        """Return the id shared by a rule's pending trigger and its alert."""
        return f"{rule.name}_{rule.metric_name}"

    def add_alert_rule(self, rule: AlertRule) -> None:
        """Add an alert rule (replaces any existing rule with the same name)."""
        self.alert_rules[rule.name] = rule
        logger.info(f"Added alert rule: {rule.name}")

    def remove_alert_rule(self, rule_name: str) -> None:
        """Remove an alert rule together with its active alerts and pending trigger.

        Unknown rule names are ignored.
        """
        rule = self.alert_rules.pop(rule_name, None)
        if rule is None:
            return

        # Also remove any active alerts for this rule
        stale_ids = [
            alert_id for alert_id, alert in self.active_alerts.items()
            if alert.name == rule_name
        ]
        for alert_id in stale_ids:
            del self.active_alerts[alert_id]

        # Drop the pending first-seen time too; otherwise re-adding the rule
        # later would measure its duration from this stale timestamp.
        self.alert_triggers.pop(self._alert_id(rule), None)

        logger.info(f"Removed alert rule: {rule_name}")

    def add_notification_channel(self, channel: NotificationChannel) -> None:
        """Add a notification channel (replaces any channel with the same name)."""
        self.notification_channels[channel.name] = channel
        logger.info(f"Added notification channel: {channel.name} ({channel.type})")

    def remove_notification_channel(self, channel_name: str) -> None:
        """Remove a notification channel; unknown names are ignored."""
        if channel_name in self.notification_channels:
            del self.notification_channels[channel_name]
            logger.info(f"Removed notification channel: {channel_name}")

    # ------------------------------------------------------------------
    # Rule evaluation
    # ------------------------------------------------------------------

    def update_metric_value(self, metric_name: str, value: float) -> None:
        """Record a metric sample and evaluate every enabled rule for that metric."""
        timestamp = get_current_timestamp()
        self.metric_values[metric_name].append((timestamp, value))

        # Iterate over a snapshot: alert callbacks may add or remove rules
        # while we are evaluating, which would break live dict iteration.
        for rule in list(self.alert_rules.values()):
            if rule.metric_name == metric_name and rule.enabled:
                self._check_alert_rule(rule, value, timestamp)

    def _check_alert_rule(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Trigger or resolve the alert for ``rule`` given the latest sample.

        The alert fires once the condition has held for at least
        ``rule.duration_seconds`` since it was first observed; a rule with a
        zero duration fires on the first matching sample. When the condition
        stops holding, the pending trigger is cleared and any active alert is
        resolved. Errors are logged, never raised.
        """
        try:
            condition_met = self._evaluate_condition(rule.comparison, value, rule.threshold)
            alert_id = self._alert_id(rule)

            if not condition_met:
                # Condition not met - clear trigger time and resolve alert if active
                self.alert_triggers.pop(alert_id, None)
                if alert_id in self.active_alerts:
                    self._resolve_alert(alert_id, timestamp)
                return

            # Remember when the condition first became true, then measure how
            # long it has held (0 seconds on the very first matching sample).
            first_seen = self.alert_triggers.setdefault(alert_id, timestamp)
            held_seconds = (timestamp - first_seen).total_seconds()

            if held_seconds >= rule.duration_seconds and alert_id not in self.active_alerts:
                self._trigger_alert(rule, value, timestamp)

        except Exception as e:
            logger.error(f"Error checking alert rule {rule.name}: {e}")

    def _evaluate_condition(self, comparison: str, value: float, threshold: float) -> bool:
        """Evaluate ``value <comparison> threshold``.

        Supported operators: 'gt', 'lt', 'gte', 'lte', 'eq', 'ne'. The 'eq' and
        'ne' forms use an absolute tolerance of 0.001. An unknown operator logs
        a warning and evaluates to ``False``.
        """
        predicate = self._COMPARATORS.get(comparison)
        if predicate is None:
            logger.warning(f"Unknown comparison operator: {comparison}")
            return False
        return predicate(value, threshold)

    # ------------------------------------------------------------------
    # Alert lifecycle
    # ------------------------------------------------------------------

    def _trigger_alert(self, rule: AlertRule, value: float, timestamp: datetime) -> None:
        """Create the alert for ``rule``, store it, notify channels and callbacks.

        A suppressed alert is logged and discarded without being stored,
        counted or notified. Errors are logged, never raised.
        """
        try:
            alert_id = self._alert_id(rule)

            # Create alert
            alert = Alert(
                id=alert_id,
                name=rule.name,
                description=rule.description or f"{rule.metric_name} {rule.comparison} {rule.threshold}",
                severity=rule.severity,
                metric_name=rule.metric_name,
                threshold=rule.threshold,
                comparison=rule.comparison,
                duration_seconds=rule.duration_seconds,
                triggered_at=timestamp,
                metadata={
                    'current_value': value,
                    'rule_metadata': rule.metadata,
                },
            )

            # Check suppression rules
            if self._is_suppressed(alert):
                alert.status = AlertStatus.SUPPRESSED
                logger.info(f"Alert suppressed: {alert.name}")
                return

            # Store alert
            self.active_alerts[alert_id] = alert
            self.alert_history.append(alert)
            self.alerts_triggered += 1

            logger.warning(f"Alert triggered: {alert.name} - {alert.description}")

            # Send notifications
            self._send_notifications(alert)

            # Call callbacks; one failing callback must not block the others.
            for callback in list(self.alert_callbacks):
                try:
                    callback(alert)
                except Exception as e:
                    logger.error(f"Error in alert callback: {e}")

        except Exception as e:
            logger.error(f"Error triggering alert: {e}")

    def _resolve_alert(self, alert_id: str, timestamp: datetime) -> None:
        """Mark an active alert resolved, drop it from the active set and notify.

        Unknown ids are ignored. Errors are logged, never raised.
        """
        try:
            alert = self.active_alerts.get(alert_id)
            if alert is None:
                return

            alert.status = AlertStatus.RESOLVED
            alert.resolved_at = timestamp

            # The same Alert object was appended to the history when it was
            # triggered; re-append only if it has since been evicted from the
            # bounded deque, so the history never holds the alert twice.
            if not any(entry is alert for entry in self.alert_history):
                self.alert_history.append(alert)
            del self.active_alerts[alert_id]

            self.alerts_resolved += 1

            logger.info(f"Alert resolved: {alert.name}")

            # Send resolution notifications
            self._send_resolution_notifications(alert)

        except Exception as e:
            logger.error(f"Error resolving alert {alert_id}: {e}")

    # ------------------------------------------------------------------
    # Suppression
    # ------------------------------------------------------------------

    def _is_suppressed(self, alert: Alert) -> bool:
        """Return True if any suppression rule matches ``alert``.

        A rule that raises while being evaluated is logged and skipped.
        """
        for rule_name, rule in self.suppression_rules.items():
            try:
                if self._matches_suppression_rule(alert, rule):
                    return True
            except Exception as e:
                logger.error(f"Error checking suppression rule {rule_name}: {e}")

        return False

    def _matches_suppression_rule(self, alert: Alert, rule: Dict[str, Any]) -> bool:
        """Check if ``alert`` matches a suppression rule.

        Every criterion present in ``rule`` must hold:

        * ``alert_pattern`` - regex matched against the start of the alert name
        * ``severity`` - container of severity values (e.g. ``['warning']``)
        * ``time_window`` - dict with ISO-8601 ``start`` / ``end`` bounds,
          inclusive, compared with the current timestamp

        A rule with none of these keys matches every alert.
        """
        # Check alert name pattern
        if 'alert_pattern' in rule and not re.match(rule['alert_pattern'], alert.name):
            return False

        # Check severity
        if 'severity' in rule and alert.severity.value not in rule['severity']:
            return False

        # Check time window
        if 'time_window' in rule:
            window = rule['time_window']
            start_time = datetime.fromisoformat(window['start'])
            end_time = datetime.fromisoformat(window['end'])
            # NOTE(review): assumes the window bounds and get_current_timestamp()
            # agree on timezone-awareness; mixing naive and aware values raises.
            current_time = get_current_timestamp()

            if not (start_time <= current_time <= end_time):
                return False

        return True

    # ------------------------------------------------------------------
    # Notification dispatch
    # ------------------------------------------------------------------

    def _send_notifications(self, alert: Alert) -> None:
        """Send a "triggered" notification to every eligible channel.

        Disabled channels and channels whose severity filter excludes the
        alert are skipped. ``notifications_sent`` counts dispatch attempts to
        channels of a known type; unknown types are logged and not counted.
        """
        senders = {
            'email': self._send_email_notification,
            'webhook': self._send_webhook_notification,
            'slack': self._send_slack_notification,
        }

        for channel_name, channel in list(self.notification_channels.items()):
            try:
                if not channel.enabled:
                    continue

                # Check severity filter
                if channel.severity_filter and alert.severity not in channel.severity_filter:
                    continue

                send = senders.get(channel.type)
                if send is None:
                    logger.warning(f"Unknown notification channel type: {channel.type}")
                    continue

                send(alert, channel)
                self.notifications_sent += 1

            except Exception as e:
                logger.error(f"Error sending notification via {channel_name}: {e}")

    def _send_resolution_notifications(self, alert: Alert) -> None:
        """Send a "resolved" notification to every enabled channel.

        No severity filter is applied; unknown channel types are silently
        skipped.
        """
        senders = {
            'email': self._send_email_resolution,
            'webhook': self._send_webhook_resolution,
            'slack': self._send_slack_resolution,
        }

        for channel_name, channel in list(self.notification_channels.items()):
            try:
                if not channel.enabled:
                    continue

                send = senders.get(channel.type)
                if send is not None:
                    send(alert, channel)

            except Exception as e:
                logger.error(f"Error sending resolution notification via {channel_name}: {e}")

    # ------------------------------------------------------------------
    # Transport helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _deliver_email(config: Dict[str, Any], subject: str, body: str) -> None:
        """Build a plain-text message from ``config`` and send it over SMTP.

        Reads ``from_email``, ``to_emails``, ``smtp_server`` and ``smtp_port``
        (required) plus ``use_tls`` (default True), ``username`` / ``password``
        and ``timeout`` (optional). Raises on a missing key or SMTP failure.
        """
        msg = MIMEMultipart()
        msg['From'] = config['from_email']
        msg['To'] = ', '.join(config['to_emails'])
        msg['Subject'] = subject
        msg.attach(MIMEText(body, 'plain'))

        smtp_timeout = config.get('timeout', _DEFAULT_SMTP_TIMEOUT)
        with smtplib.SMTP(config['smtp_server'], config['smtp_port'], timeout=smtp_timeout) as server:
            if config.get('use_tls', True):
                server.starttls()

            if 'username' in config and 'password' in config:
                server.login(config['username'], config['password'])

            server.send_message(msg)

    @staticmethod
    def _post_json(url: str, payload: Dict[str, Any],
                   headers: Optional[Dict[str, str]] = None,
                   timeout: float = 10) -> None:
        """POST ``payload`` as JSON to ``url``; raises on an HTTP error status."""
        # Imported lazily so the module loads without `requests` installed.
        import requests

        response = requests.post(url, json=payload, headers=headers, timeout=timeout)
        response.raise_for_status()

    @staticmethod
    def _format_duration(alert: Alert) -> str:
        """Return ``resolved_at - triggered_at`` as text, or "" if either is unset."""
        if alert.triggered_at and alert.resolved_at:
            return str(alert.resolved_at - alert.triggered_at)
        return ""

    @staticmethod
    def _iso_or_na(moment: Optional[datetime]) -> str:
        """Return the ISO-8601 form of ``moment``, or 'N/A' when it is unset."""
        return moment.isoformat() if moment else 'N/A'

    # ------------------------------------------------------------------
    # "Triggered" notifications
    # ------------------------------------------------------------------

    def _send_email_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email notification; failures are logged, never raised."""
        try:
            body = "\n".join([
                "",
                f"Alert: {alert.name}",
                f"Severity: {alert.severity.value.upper()}",
                f"Description: {alert.description}",
                f"Metric: {alert.metric_name}",
                f"Current Value: {alert.metadata.get('current_value', 'N/A')}",
                f"Threshold: {alert.threshold}",
                f"Triggered At: {self._iso_or_na(alert.triggered_at)}",
                f"Alert ID: {alert.id}",
                "",
            ])
            subject = f"[{alert.severity.value.upper()}] {alert.name}"

            self._deliver_email(channel.config, subject, body)

            logger.info(f"Email notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending email notification: {e}")

    def _send_webhook_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook notification; failures are logged, never raised."""
        try:
            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_triggered',
            }

            self._post_json(
                config['url'],
                payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10),
            )

            logger.info(f"Webhook notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending webhook notification: {e}")

    def _send_slack_notification(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack notification; failures are logged, never raised."""
        try:
            config = channel.config

            # Attachment colour per severity; anything unmapped shows as a warning.
            color = {
                AlertSeverity.INFO: 'good',
                AlertSeverity.WARNING: 'warning',
                AlertSeverity.CRITICAL: 'danger',
            }.get(alert.severity, 'warning')

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': color,
                    'title': f"{alert.severity.value.upper()}: {alert.name}",
                    'text': alert.description,
                    'fields': [
                        {'title': 'Metric', 'value': alert.metric_name, 'short': True},
                        {'title': 'Current Value', 'value': str(alert.metadata.get('current_value', 'N/A')), 'short': True},
                        {'title': 'Threshold', 'value': str(alert.threshold), 'short': True},
                        {'title': 'Triggered At', 'value': self._iso_or_na(alert.triggered_at), 'short': True},
                    ],
                    'footer': f"Alert ID: {alert.id}",
                }],
            }

            self._post_json(config['webhook_url'], payload, timeout=10)

            logger.info(f"Slack notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending Slack notification: {e}")

    # ------------------------------------------------------------------
    # "Resolved" notifications
    # ------------------------------------------------------------------

    def _send_email_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send email resolution notification; failures are logged, never raised."""
        try:
            body = "\n".join([
                "",
                f"Alert RESOLVED: {alert.name}",
                f"Severity: {alert.severity.value.upper()}",
                f"Description: {alert.description}",
                f"Metric: {alert.metric_name}",
                f"Threshold: {alert.threshold}",
                f"Triggered At: {self._iso_or_na(alert.triggered_at)}",
                f"Resolved At: {self._iso_or_na(alert.resolved_at)}",
                f"Duration: {self._format_duration(alert)}",
                f"Alert ID: {alert.id}",
                "",
            ])
            subject = f"[RESOLVED] {alert.name}"

            self._deliver_email(channel.config, subject, body)

            logger.info(f"Email resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending email resolution notification: {e}")

    def _send_webhook_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send webhook resolution notification; failures are logged, never raised."""
        try:
            config = channel.config
            payload = {
                'alert': alert.to_dict(),
                'type': 'alert_resolved',
            }

            self._post_json(
                config['url'],
                payload,
                headers=config.get('headers', {}),
                timeout=config.get('timeout', 10),
            )

            logger.info(f"Webhook resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending webhook resolution notification: {e}")

    def _send_slack_resolution(self, alert: Alert, channel: NotificationChannel) -> None:
        """Send Slack resolution notification; failures are logged, never raised."""
        try:
            config = channel.config

            payload = {
                'channel': config['channel'],
                'username': config.get('username', 'AlertBot'),
                'attachments': [{
                    'color': 'good',
                    'title': f"RESOLVED: {alert.name}",
                    'text': f"Alert has been resolved: {alert.description}",
                    'fields': [
                        {'title': 'Duration', 'value': self._format_duration(alert), 'short': True},
                        {'title': 'Resolved At', 'value': self._iso_or_na(alert.resolved_at), 'short': True},
                    ],
                    'footer': f"Alert ID: {alert.id}",
                }],
            }

            self._post_json(config['webhook_url'], payload, timeout=10)

            logger.info(f"Slack resolution notification sent for alert: {alert.name}")

        except Exception as e:
            logger.error(f"Error sending Slack resolution notification: {e}")

    # ------------------------------------------------------------------
    # Operator actions
    # ------------------------------------------------------------------

    def acknowledge_alert(self, alert_id: str, acknowledged_by: str) -> bool:
        """Acknowledge an active alert.

        Returns True if the alert was active and is now acknowledged, False if
        no active alert has that id.
        """
        alert = self.active_alerts.get(alert_id)
        if alert is None:
            return False

        alert.status = AlertStatus.ACKNOWLEDGED
        alert.acknowledged_at = get_current_timestamp()
        alert.acknowledged_by = acknowledged_by

        logger.info(f"Alert acknowledged by {acknowledged_by}: {alert.name}")
        return True

    def suppress_alert(self, alert_id: str) -> bool:
        """Mark an active alert as suppressed (it stays in the active set).

        Returns True if the alert was active, False otherwise.
        """
        alert = self.active_alerts.get(alert_id)
        if alert is None:
            return False

        alert.status = AlertStatus.SUPPRESSED

        logger.info(f"Alert suppressed: {alert.name}")
        return True

    def add_suppression_rule(self, name: str, rule: Dict[str, Any]) -> None:
        """Add alert suppression rule (replaces any rule with the same name).

        ``rule`` may contain ``alert_pattern``, ``severity`` and/or
        ``time_window``; see ``_matches_suppression_rule`` for their meaning.
        """
        self.suppression_rules[name] = rule
        logger.info(f"Added suppression rule: {name}")

    def remove_suppression_rule(self, name: str) -> None:
        """Remove alert suppression rule; unknown names are ignored."""
        if name in self.suppression_rules:
            del self.suppression_rules[name]
            logger.info(f"Removed suppression rule: {name}")

    # ------------------------------------------------------------------
    # Queries and statistics
    # ------------------------------------------------------------------

    def get_active_alerts(self, severity: Optional[AlertSeverity] = None) -> List[Alert]:
        """Get active alerts, newest first, optionally filtered by severity."""
        alerts = [
            alert for alert in self.active_alerts.values()
            if severity is None or alert.severity == severity
        ]
        return sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)

    def get_alert_history(self, limit: int = 100,
                          severity: Optional[AlertSeverity] = None) -> List[Alert]:
        """Get up to ``limit`` historical alerts, newest first, optionally by severity."""
        alerts = [
            alert for alert in self.alert_history
            if severity is None or alert.severity == severity
        ]
        newest_first = sorted(alerts, key=lambda x: x.triggered_at or datetime.min, reverse=True)
        return newest_first[:limit]

    def get_alert_summary(self) -> Dict[str, Any]:
        """Get alert summary statistics, including active counts per severity."""
        active_by_severity: Dict[str, int] = defaultdict(int)

        for alert in self.active_alerts.values():
            active_by_severity[alert.severity.value] += 1

        return {
            'active_alerts': len(self.active_alerts),
            'active_by_severity': dict(active_by_severity),
            'total_triggered': self.alerts_triggered,
            'total_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'alert_rules': len(self.alert_rules),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules),
        }

    def register_callback(self, callback: Callable[[Alert], None]) -> None:
        """Register a callback invoked with each newly triggered alert."""
        self.alert_callbacks.append(callback)
        # functools.partial objects and callable instances have no __name__.
        callback_name = getattr(callback, '__name__', repr(callback))
        logger.info(f"Registered alert callback: {callback_name}")

    def get_stats(self) -> Dict[str, Any]:
        """Get alert manager statistics (collection sizes and counters)."""
        return {
            'alert_rules': len(self.alert_rules),
            'active_alerts': len(self.active_alerts),
            'alert_history_count': len(self.alert_history),
            'notification_channels': len(self.notification_channels),
            'suppression_rules': len(self.suppression_rules),
            'alerts_triggered': self.alerts_triggered,
            'alerts_resolved': self.alerts_resolved,
            'notifications_sent': self.notifications_sent,
            'registered_callbacks': len(self.alert_callbacks),
        }


# Global alert manager instance
alert_manager = AlertManager()