Files
gogo2/COBY/docker/alert_rules.yml
2025-08-05 01:22:27 +03:00

103 lines
3.0 KiB
YAML

# Prometheus alert rules for the COBY system.
#
# All rules belong to the single `coby_alerts` group. Each rule pairs a PromQL
# expression (`expr`) with a hold time (`for`): the alert fires only after the
# expression has been continuously true for that duration.
groups:
  - name: coby_alerts
    rules:
      # High CPU usage
      - alert: HighCPUUsage
        expr: system_cpu_usage > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 2 minutes"

      # High memory usage
      - alert: HighMemoryUsage
        expr: system_memory_usage > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 2 minutes"

      # Low available memory
      - alert: LowAvailableMemory
        expr: system_memory_available_gb < 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low available memory"
          description: "Available memory is below 1GB"

      # High latency
      - alert: HighLatency
        expr: processing_latency_ms > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High processing latency"
          description: "Processing latency is above 100ms for more than 5 minutes"

      # Exchange connection failures
      # increase() turns the cumulative error counter into "errors added
      # during the last 5 minutes", so the threshold applies to recent errors.
      - alert: ExchangeConnectionFailure
        expr: increase(exchange_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Exchange connection failures"
          description: "More than 5 exchange connection errors in the last 5 minutes"

      # Database connection issues
      # The `_total` suffix marks this metric as a cumulative counter, which
      # never decreases while the process runs. Comparing its raw value to 0
      # would keep the alert firing forever after the first error, so only
      # errors recorded within the last 5 minutes are considered.
      - alert: DatabaseConnectionFailure
        expr: increase(database_connection_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failure"
          description: "Database connection errors detected in the last 5 minutes"

      # High error rate
      - alert: HighErrorRate
        expr: kpi_error_rate_percent > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is above 5% for more than 5 minutes"

      # Low throughput
      - alert: LowThroughput
        expr: kpi_throughput_ops_per_sec < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low system throughput"
          description: "System throughput is below 10 ops/sec for more than 10 minutes"

      # Service down
      # `up` is the per-target health series Prometheus records on every
      # scrape; 0 means the last scrape of that target failed.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          description: "{{ $labels.job }} service is down"

      # Disk space low
      - alert: DiskSpaceLow
        expr: system_disk_usage > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space low"
          description: "Disk usage is above 90%"