---
# ConfigMap that carries a Prometheus alerting-rules file under the key
# `kubernetes-alerts.yml`.
#
# The rules file is stored as a literal block scalar (`|`): every line below
# that key, including the `#` comments inside it, is part of the string value
# and is only interpreted as YAML again by whatever loads the rules file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  kubernetes-alerts.yml: |
    groups:
      # Pod and node health, based on the kube_* series.
      - name: kubernetes-alerts
        rules:
          # rate() over 15m is per-second; multiplying by 60 * 15 scales it to
          # "restarts per 15 minutes", which is the figure printed in the
          # description via $value.
          # NOTE(review): `for: 0m` means there is no pending period, so a
          # single restart is enough to fire - confirm this is intended.
          - alert: PodCrashLooping
            expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."

          # Matches pods whose phase series for Pending or Unknown is non-zero
          # for 5 minutes.
          - alert: PodNotReady
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."

          # Fires when the Ready/true condition series of a node is 0 for
          # 5 minutes.
          - alert: NodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."

      # Self-monitoring of the Prometheus server.
      - name: prometheus-alerts
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."

          # NOTE(review): no job selector here, so any target exporting this
          # metric with value 0 matches - confirm that is the intent.
          - alert: PrometheusConfigReloadFailed
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: "Prometheus configuration reload has been failing for more than 5 minutes."

      # Availability of targets whose kubernetes_pod_name starts with "aserver".
      - name: aserver-alerts
        rules:
          # Two cases are covered: no matching `up` series exists at all
          # (absent), or a matching series exists with value 0.
          # The annotation texts are in Chinese; in English they read roughly:
          #   summary:     "AServer service offline"
          #   description: "AServer Pod does not exist or has been offline
          #                 for more than 1 minute"
          #   action:      "Please check the Pod status and logs"
          - alert: AServerOffline
            expr: absent(up{kubernetes_pod_name=~"aserver.*"}) or up{kubernetes_pod_name=~"aserver.*"} == 0
            for: 1m
            labels:
              severity: critical
              service: aserver
              alert_type: offline
            annotations:
              summary: "AServer 服务离线"
              description: "AServer Pod 不存在或已离线超过1分钟"
              action: "请检查 Pod 状态和日志"