17. docker deployment
This commit is contained in:
103
COBY/docker/alert_rules.yml
Normal file
103
COBY/docker/alert_rules.yml
Normal file
@ -0,0 +1,103 @@
|
||||
# Prometheus alert rules for COBY system
#
# One rule group, `coby_alerts`. Each rule fires once its `expr` has been
# continuously true for the `for` duration. `severity` is a plain label;
# how it is routed depends on Alertmanager configuration not shown here.
# Thresholds on the system_* and kpi_* gauges are read as percentages / GB /
# ms / ops-per-sec per the rule descriptions below; confirm against the
# exporter that publishes them.
groups:
  - name: coby_alerts
    rules:
      # High CPU usage
      - alert: HighCPUUsage
        expr: system_cpu_usage > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is above 80% for more than 2 minutes"

      # High memory usage
      - alert: HighMemoryUsage
        expr: system_memory_usage > 85
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is above 85% for more than 2 minutes"

      # Low available memory
      - alert: LowAvailableMemory
        expr: system_memory_available_gb < 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Low available memory"
          description: "Available memory is below 1GB"

      # High latency
      - alert: HighLatency
        expr: processing_latency_ms > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High processing latency"
          description: "Processing latency is above 100ms for more than 5 minutes"

      # Exchange connection failures
      # increase() over a window turns the cumulative error counter into
      # "errors during the last 5 minutes".
      - alert: ExchangeConnectionFailure
        expr: increase(exchange_connection_errors_total[5m]) > 5
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Exchange connection failures"
          description: "More than 5 exchange connection errors in the last 5 minutes"

      # Database connection issues
      # The metric is presumably a cumulative counter (`_total` suffix), so
      # comparing its raw value to 0 would keep this alert firing forever after
      # the first error. Alert on new errors within a window instead, matching
      # the exchange rule above.
      - alert: DatabaseConnectionFailure
        expr: increase(database_connection_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failure"
          description: "Database connection errors detected in the last 5 minutes"

      # High error rate
      - alert: HighErrorRate
        expr: kpi_error_rate_percent > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate"
          description: "Error rate is above 5% for more than 5 minutes"

      # Low throughput
      - alert: LowThroughput
        expr: kpi_throughput_ops_per_sec < 10
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low system throughput"
          description: "System throughput is below 10 ops/sec for more than 10 minutes"

      # Service down
      # `up` is the per-target health series Prometheus records on every
      # scrape; 0 means the last scrape of that target failed.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service is down"
          # Quoted because the value starts with a template expression.
          description: "{{ $labels.job }} service is down"

      # Disk space low
      - alert: DiskSpaceLow
        expr: system_disk_usage > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Disk space low"
          description: "Disk usage is above 90%"
|
Reference in New Issue
Block a user