# devops/prometheus/prometheus-rules-configmap.... (path truncated in the listing)
# 57 lines, 1.9 KiB, YAML

---
# ConfigMap carrying Prometheus alerting rules.
#
# The single data key, kubernetes-alerts.yml, is a complete Prometheus rule
# file embedded as a literal block scalar. Comments inside that block are part
# of the stored rule file; Prometheus' YAML parser ignores them.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  kubernetes-alerts.yml: |
    groups:
      # Workload and node health, based on kube-state-metrics series.
      - name: kubernetes-alerts
        rules:
          - alert: PodCrashLooping
            # increase() over 15m is rate() * 900s, i.e. the number of
            # container restarts seen in the last 15 minutes.
            # NOTE(review): with "for: 0m" a single restart fires this alert
            # immediately; confirm that is intended for a "crash looping" alert.
            expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: >-
                Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting
                {{ printf "%.2f" $value }} times / 15 minutes.
          - alert: PodNotReady
            # Matches pods whose phase is Pending or Unknown for 5 minutes.
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: >-
                Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a
                non-ready state for more than 5 minutes.
          - alert: NodeNotReady
            # The Ready/true condition series is 0 while the node is not Ready.
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."
      # Self-monitoring of the Prometheus server.
      - name: prometheus-alerts
        rules:
          - alert: PrometheusDown
            # "up == 0" only covers targets that are still being scraped. The
            # absent() clause also fires when the target has vanished from
            # service discovery and no "up" series exists at all.
            # NOTE(review): assumes the scrape job is named "prometheus";
            # confirm against the scrape configuration.
            expr: up{job="prometheus"} == 0 or absent(up{job="prometheus"})
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."
          - alert: PrometheusConfigReloadFailed
            # The gauge is 0 when the last configuration reload failed.
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: >-
                Prometheus configuration reload has been failing for more than
                5 minutes.