# File viewer metadata (not part of the rules): 38 lines, 1.6 KiB, YAML
---
# Prometheus alerting rules: one rule group covering target availability,
# CPU, memory, disk space, and TLS certificate expiry.
#
# Each rule fires only after `expr` has held continuously for the `for`
# duration. Annotation strings are templates; `{{ $labels.<name> }}` is
# filled in from the labels of the series that triggered the alert.
groups:
  - name: general.rules
    rules:
      # A target's `up` series has been 0 for 2 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes"

      # Non-idle CPU percentage: 100 minus the per-instance average idle
      # rate (scaled to percent) over a 5m window. Fires above 80 for 5m.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m]))*100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for more than 5 minutes on {{ $labels.instance }}"

      # Used memory as a percentage of total: (total - available) / total.
      # Fires above 90 for 5m.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes*100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}"

      # Available/size ratio below 0.1 (10%) for 10m; tmpfs and overlay
      # filesystems are excluded by the fstype matcher.
      # NOTE(review): the expression is not aggregated, so this presumably
      # fires once per filesystem while the annotations name only the
      # instance - consider adding the mountpoint label to the text.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}/node_filesystem_size_bytes) < 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk space is below 10% for more than 10 minutes on {{ $labels.instance }}"

      # Seconds until the earliest certificate expiry, compared with
      # 604800 s (7 days). `for: 0m` means there is no pending period.
      - alert: CertificateExpiration
        expr: probe_ssl_earliest_cert_expiry - time() < 604800
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate expiring soon on {{ $labels.instance }}"
          description: "SSL certificate for {{ $labels.instance }} expires in less than 7 days"