Patch: AServer offline alerting (Alertmanager webhook route, Prometheus rule
ConfigMaps, install script).

NOTE(review): indentation was reconstructed from a whitespace-collapsed copy of
this patch. The three context lines of the prometheus-rules-configmap.yaml hunk
assume rules nested under an indented `groups:` list - confirm against the real
file before applying. Blob ids on the `index` lines are those of the original
patch.

diff --git a/alertmanager/aserver/alertmanager-webhook-test.yaml b/alertmanager/aserver/alertmanager-webhook-test.yaml
new file mode 100644
index 0000000..c9b1414
--- /dev/null
+++ b/alertmanager/aserver/alertmanager-webhook-test.yaml
@@ -0,0 +1,33 @@
+# Alertmanager configuration that forwards AServer alerts to the webhook
+# receiver defined below.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alertmanager-config
+  namespace: default
+data:
+  alertmanager.yml: |
+    route:
+      group_by: ['alertname', 'service']
+      group_wait: 5s
+      group_interval: 5s
+      repeat_interval: 15m
+      # Default receiver for alerts that match no child route.
+      receiver: 'aserver-webhook-receiver'
+      routes:
+        # Must equal the labels set by the AServerOffline rule
+        # (service: aserver, alert_type: offline).
+        - match:
+            service: aserver
+            alert_type: offline
+          receiver: 'aserver-webhook-receiver'
+          group_wait: 5s
+          group_interval: 5s
+          repeat_interval: 15m
+
+    receivers:
+      - name: 'aserver-webhook-receiver'
+        webhook_configs:
+          - url: 'http://alert:8080/webhook/alert'
+            # Also notify the webhook when the alert resolves.
+            send_resolved: true
diff --git a/prometheus/aserver/aserver-offline-alert.yaml b/prometheus/aserver/aserver-offline-alert.yaml
new file mode 100644
index 0000000..ff0d5e8
--- /dev/null
+++ b/prometheus/aserver/aserver-offline-alert.yaml
@@ -0,0 +1,28 @@
+# Prometheus alerting rule: AServerOffline fires when an aserver scrape target
+# has reported up == 0 for one minute.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: aserver-offline-alert
+  namespace: default
+data:
+  aserver-offline-alert.yml: |
+    groups:
+      - name: aserver-offline-alerts
+        rules:
+          - alert: AServerOffline
+            # NOTE(review): `up == 0` needs the target to still be scraped; a
+            # deleted pod presumably drops out of discovery, so that case relies
+            # on the absent()-based rule in prometheus-rules-configmap.yaml.
+            expr: up{kubernetes_pod_name=~"aserver.*"} == 0
+            for: 1m
+            # service and alert_type are matched by the Alertmanager child route.
+            labels:
+              severity: critical
+              service: aserver
+              alert_type: offline
+            annotations:
+              summary: "AServer 服务离线"
+              description: "AServer Pod {{ $labels.kubernetes_pod_name }} 已离线超过1分钟"
+              runbook_url: "https://docs.example.com/aserver-troubleshooting"
+              action: "请检查 Pod 状态和日志"
diff --git a/prometheus/aserver/install-aserver-alerts.sh b/prometheus/aserver/install-aserver-alerts.sh
new file mode 100644
index 0000000..07e6430
--- /dev/null
+++ b/prometheus/aserver/install-aserver-alerts.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+# Installs the AServer offline alert rule ConfigMap and restarts Prometheus so
+# the new rule is loaded.
+set -euo pipefail
+
+# Resolve the manifest relative to this script so it can be run from any
+# working directory.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+echo "开始安装 AServer 告警规则到 Prometheus..."
+
+echo "1. 应用 AServer 离线告警规则..."
+kubectl apply -f "${SCRIPT_DIR}/aserver-offline-alert.yaml"
+
+echo "2. 重启 Prometheus 以加载新的告警规则..."
+kubectl rollout restart deployment/prometheus
+
+echo "等待 Prometheus 重启..."
+# `rollout status` blocks until the restarted pods are rolled out; waiting on
+# condition=available can return at once because the old pods still satisfy it.
+kubectl rollout status deployment/prometheus --timeout=300s
+
+echo ""
+echo "=========================================="
+echo "AServer 告警规则安装完成!"
+echo "=========================================="
+echo ""
+echo "告警规则:"
+echo "- AServerOffline: aserver离线告警 (1分钟后触发)"
+echo " - 严重程度: critical"
+echo " - 触发条件: up{kubernetes_pod_name=~\"aserver.*\"} == 0"
+echo " - 持续时间: 1分钟"
+echo ""
+echo "访问地址:"
+echo "- Prometheus Alerts: https://prom-ops.t-aaron.com/alerts"
+echo ""
+echo "验证方法:"
+echo "1. 访问 Prometheus Alerts 页面"
+echo "2. 查找 'AServerOffline' 告警规则"
+echo "3. 状态应该显示为 'Inactive' (正常状态)"
+echo ""
+echo "测试告警:"
+echo "1. 删除 aserver Pod: kubectl delete pod -l app=aserver"
+echo "2. 等待1分钟后查看告警状态变为 'Pending' 或 'Firing'"
+echo "3. 重新部署 aserver 后告警应自动恢复"
+echo ""
+echo "注意:此脚本只安装 Prometheus 告警规则"
+echo "如需钉钉通知,请运行 Alertmanager 的安装脚本"
+echo "=========================================="
diff --git a/prometheus/prometheus-rules-configmap.yaml b/prometheus/prometheus-rules-configmap.yaml
index 6509f91..ea8a992 100644
--- a/prometheus/prometheus-rules-configmap.yaml
+++ b/prometheus/prometheus-rules-configmap.yaml
@@ -54,3 +54,19 @@ data:
             annotations:
               summary: "Prometheus configuration reload failed"
               description: "Prometheus configuration reload has been failing for more than 5 minutes."
+
+      - name: aserver-alerts
+        rules:
+          - alert: AServerOffline
+            # absent() covers the case where no aserver target is being scraped
+            # at all, which an `up == 0` comparison cannot detect.
+            expr: absent(up{kubernetes_pod_name=~"aserver.*"})
+            for: 1m
+            labels:
+              severity: critical
+              service: aserver
+              alert_type: offline
+            annotations:
+              summary: "AServer 服务离线"
+              description: "AServer Pod 不存在或已离线超过1分钟"
+              action: "请检查 Pod 状态和日志"