# Prometheus alert rules for COBY system
#
# Every rule in this group follows the same shape:
#   expr        - PromQL condition that must hold
#   for         - how long the condition must hold before the alert fires
#   severity    - "warning" or "critical" label attached to the alert
#   annotations - human-readable summary/description
groups:
  - name: coby_alerts
    rules:
      # High CPU usage: system_cpu_usage above 80 for 2 minutes.
      - alert: HighCPUUsage
        expr: system_cpu_usage > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 2 minutes"

      # High memory usage: system_memory_usage above 85 for 2 minutes.
      - alert: HighMemoryUsage
        expr: system_memory_usage > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 2 minutes"

      # Low available memory: system_memory_available_gb below 1 for 1 minute.
      - alert: LowAvailableMemory
        expr: system_memory_available_gb < 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low available memory"
          description: "Available memory is below 1GB"

      # High latency: processing_latency_ms above 100 for 5 minutes.
      - alert: HighLatency
        expr: processing_latency_ms > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High processing latency"
          description: "Processing latency is above 100ms for more than 5 minutes"

      # Exchange connection failures: more than 5 new errors within a
      # 5-minute window, sustained for 1 minute.
      - alert: ExchangeConnectionFailure
        expr: increase(exchange_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Exchange connection failures"
          description: "More than 5 exchange connection errors in the last 5 minutes"

      # Database connection issues: any new error within a 5-minute window,
      # sustained for 1 minute.
      # The condition is written with increase() (same window as
      # ExchangeConnectionFailure) rather than comparing the raw series to 0:
      # if database_connection_errors_total is a cumulative counter, as its
      # name suggests, a raw "> 0" comparison would keep firing after the
      # first error ever recorded and only clear on a counter reset.
      # NOTE(review): confirm the metric is exported as a counter.
      - alert: DatabaseConnectionFailure
        expr: increase(database_connection_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failure"
          description: "Database connection errors detected"

      # High error rate: kpi_error_rate_percent above 5 for 5 minutes.
      - alert: HighErrorRate
        expr: kpi_error_rate_percent > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is above 5% for more than 5 minutes"

      # Low throughput: kpi_throughput_ops_per_sec below 10 for 10 minutes.
      - alert: LowThroughput
        expr: kpi_throughput_ops_per_sec < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low system throughput"
          description: "System throughput is below 10 ops/sec for more than 10 minutes"

      # Service down: the "up" series equals 0 for 1 minute. The description
      # is templated with the job label of the affected series.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.job }} service is down"

      # Disk space low: system_disk_usage above 90 for 5 minutes.
      - alert: DiskSpaceLow
        expr: system_disk_usage > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space low"
          description: "Disk usage is above 90%"