---
# Prometheus alert rules for the COBY system.
#
# A single rule group, `coby_alerts`. Every rule carries a `severity` label
# (`warning` or `critical`) plus `summary` and `description` annotations.
# `for` is how long the expression must keep matching before the alert
# moves from pending to firing.
#
# NOTE(review): the units of the `system_*` and `kpi_*` metrics are not
# visible here; the thresholds are read as percentages / GB / ms / ops-per-sec
# based only on the metric names and annotation text -- confirm against the
# exporter.
groups:
  - name: coby_alerts
    rules:
      # High CPU usage: system_cpu_usage above 80 for 2 minutes.
      - alert: HighCPUUsage
        expr: system_cpu_usage > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 2 minutes"

      # High memory usage: system_memory_usage above 85 for 2 minutes.
      - alert: HighMemoryUsage
        expr: system_memory_usage > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 2 minutes"

      # Low available memory: system_memory_available_gb below 1 for 1 minute.
      - alert: LowAvailableMemory
        expr: system_memory_available_gb < 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low available memory"
          description: "Available memory is below 1GB"

      # High latency: processing_latency_ms above 100 for 5 minutes.
      - alert: HighLatency
        expr: processing_latency_ms > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High processing latency"
          description: "Processing latency is above 100ms for more than 5 minutes"

      # Exchange connection failures: the error counter grew by more than 5
      # over the trailing 5-minute window, sustained for 1 minute.
      - alert: ExchangeConnectionFailure
        expr: increase(exchange_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Exchange connection failures"
          description: "More than 5 exchange connection errors in the last 5 minutes"

      # Database connection issues: any growth of the error counter over the
      # trailing 5-minute window, sustained for 1 minute.
      # The `_total` suffix suggests a cumulative counter (confirm against the
      # exporter). Comparing a counter's raw value with 0 would keep this alert
      # firing from the first error until the counter resets, so the rule
      # looks at recent growth instead, as ExchangeConnectionFailure does.
      - alert: DatabaseConnectionFailure
        expr: increase(database_connection_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failure"
          description: "Database connection errors detected in the last 5 minutes"

      # High error rate: kpi_error_rate_percent above 5 for 5 minutes.
      - alert: HighErrorRate
        expr: kpi_error_rate_percent > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is above 5% for more than 5 minutes"

      # Low throughput: kpi_throughput_ops_per_sec below 10 for 10 minutes.
      - alert: LowThroughput
        expr: kpi_throughput_ops_per_sec < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low system throughput"
          description: "System throughput is below 10 ops/sec for more than 10 minutes"

      # Service down: a scrape target has reported `up == 0` for 1 minute.
      # The description is templated with the target's `job` label.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.job }} service is down"

      # Disk space low: system_disk_usage above 90 for 5 minutes.
      - alert: DiskSpaceLow
        expr: system_disk_usage > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space low"
          description: "Disk usage is above 90%"