# Prometheus alerting rules: one group of general host/endpoint health alerts.
# Each rule fires once its `expr` has held continuously for the `for` duration.
groups:
  - name: general.rules
    rules:
      # A target's `up` series has been 0 for 2 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} down >2m"

      # CPU busy percentage: 100 minus the per-instance average idle rate
      # (5m window, scaled to percent) has stayed above 80 for 5 minutes.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m]))*100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU {{ $labels.instance }}"

      # Used memory, (MemTotal - MemAvailable) / MemTotal as a percentage,
      # has stayed above 90 for 5 minutes.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes*100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory {{ $labels.instance }}"

      # Available/size ratio below 0.1 (i.e. under 10% free) for 10 minutes.
      # Series whose fstype is tmpfs or overlay are excluded by the matcher.
      # NOTE(review): the summary names only the instance, so several low
      # filesystems on one host produce identical summaries; consider adding
      # a filesystem-identifying label to the text if one is available.
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}/node_filesystem_size_bytes) < 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Low disk {{ $labels.instance }}"

      # Earliest certificate expiry is less than 604800 seconds (7 days) away.
      # `for: 0m` means there is no hold period before the alert fires.
      - alert: CertificateExpiration
        expr: probe_ssl_earliest_cert_expiry - time() < 604800
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Cert expires soon {{ $labels.instance }}"