---
# ConfigMap holding a single Prometheus rule file, `kubernetes-alerts.yml`,
# with three alert groups: kubernetes-alerts, prometheus-alerts and
# aserver-alerts. How the file is mounted into Prometheus is not defined here.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  # Literal block scalar: every line below (comments included) is stored
  # verbatim as the text of the rule file.
  kubernetes-alerts.yml: |
    groups:
      # Pod- and node-level health.
      - name: kubernetes-alerts
        rules:
          # rate() is per-second; * 60 * 15 rescales it to restarts per
          # 15 minutes, which is the value printed in the description.
          # NOTE(review): `> 0` with `for: 0m` fires as soon as one restart
          # falls inside the 15m window - confirm that sensitivity is intended.
          - alert: PodCrashLooping
            expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."

          # Matches pods whose phase series for Pending or Unknown is set,
          # held for 5 minutes before firing.
          - alert: PodNotReady
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."

          # The Ready/"true" condition series being 0 means the node is not
          # reporting Ready.
          - alert: NodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."

      # Health of the targets scraped under job="prometheus".
      - name: prometheus-alerts
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."

          - alert: PrometheusConfigReloadFailed
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: "Prometheus configuration reload has been failing for more than 5 minutes."

      # Availability of pods whose name matches aserver.*
      - name: aserver-alerts
        rules:
          # `== 1` keeps only healthy targets before absent() is applied, so
          # the alert fires both when no matching series exists and when
          # every matching target reports up == 0. A bare absent(up{...})
          # would miss a pod that is still discovered but failing scrapes,
          # which the description below promises to cover.
          - alert: AServerOffline
            expr: absent(up{kubernetes_pod_name=~"aserver.*"} == 1)
            for: 1m
            labels:
              severity: critical
              service: aserver
              alert_type: offline
            annotations:
              summary: "AServer 服务离线"
              description: "AServer Pod 不存在或已离线超过1分钟"
              action: "请检查 Pod 状态和日志"