This commit is contained in:
parent
2efb905f47
commit
cae9ac2eaa
|
|
@ -0,0 +1,27 @@
|
|||
# ConfigMap carrying the Alertmanager configuration file (alertmanager.yml).
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: default
data:
  alertmanager.yml: |
    route:
      # Alerts that share both of these labels are batched into one notification.
      group_by: ['alertname', 'service']
      group_wait: 5s
      group_interval: 5s
      repeat_interval: 15m
      # Default receiver for every alert that no child route claims.
      receiver: 'aserver-webhook-receiver'
      routes:
        - match:
            service: aserver
            # Must equal the alert_type label set by the AServerOffline rules
            # ("offline"); the previous value "offlines" never matched any alert.
            alert_type: offline
          receiver: 'aserver-webhook-receiver'
          group_wait: 5s
          group_interval: 5s
          repeat_interval: 15m

    receivers:
      - name: 'aserver-webhook-receiver'
        webhook_configs:
          - url: 'http://alert:8080/webhook/alert'
            # Also notify the webhook when the alert resolves.
            send_resolved: true
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
# ConfigMap carrying a Prometheus rule file with the AServer offline alert.
apiVersion: v1
kind: ConfigMap
metadata:
  namespace: default
  name: aserver-offline-alert
data:
  aserver-offline-alert.yml: |
    groups:
      - name: aserver-offline-alerts
        rules:
          - alert: AServerOffline
            # Selects `up` series whose kubernetes_pod_name starts with
            # "aserver" and whose value is 0 (the scrape is failing).
            expr: up{kubernetes_pod_name=~"aserver.*"} == 0
            # The condition has to hold for one minute before the alert fires.
            for: 1m
            labels:
              service: aserver
              alert_type: offline
              severity: critical
            annotations:
              summary: "AServer 服务离线"
              description: "AServer Pod {{ $labels.kubernetes_pod_name }} 已离线超过1分钟"
              runbook_url: "https://docs.example.com/aserver-troubleshooting"
              action: "请检查 Pod 状态和日志"
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
#!/bin/bash
# Installs the AServer offline alert rule into Prometheus:
#   1. applies the manifest aserver-offline-alert.yaml,
#   2. restarts the prometheus Deployment so it picks up the new rule,
#   3. waits for the restart to finish and prints a usage summary.
# kubectl runs against the current context; the manifest path is resolved
# relative to the directory the script is started from.
set -euo pipefail

echo "开始安装 AServer 告警规则到 Prometheus..."

echo "1. 应用 AServer 离线告警规则..."
kubectl apply -f aserver-offline-alert.yaml

echo "2. 重启 Prometheus 以加载新的告警规则..."
kubectl rollout restart deployment/prometheus

echo "等待 Prometheus 重启..."
# `rollout status` blocks until the new ReplicaSet is fully rolled out.
# `kubectl wait --for=condition=available` can return immediately here,
# because the Deployment stays Available while the old pods are still serving.
kubectl rollout status deployment/prometheus --timeout=300s

echo ""
echo "=========================================="
echo "AServer 告警规则安装完成!"
echo "=========================================="
echo ""
echo "告警规则:"
echo "- AServerOffline: aserver离线告警 (1分钟后触发)"
echo " - 严重程度: critical"
echo " - 触发条件: up{kubernetes_pod_name=~\"aserver.*\"} == 0"
echo " - 持续时间: 1分钟"
echo ""
echo "访问地址:"
echo "- Prometheus Alerts: https://prom-ops.t-aaron.com/alerts"
echo ""
echo "验证方法:"
echo "1. 访问 Prometheus Alerts 页面"
echo "2. 查找 'AServerOffline' 告警规则"
echo "3. 状态应该显示为 'Inactive' (正常状态)"
echo ""
echo "测试告警:"
echo "1. 删除 aserver Pod: kubectl delete pod -l app=aserver"
echo "2. 等待1分钟后查看告警状态变为 'Pending' 或 'Firing'"
echo "3. 重新部署 aserver 后告警应自动恢复"
echo ""
echo "注意:此脚本只安装 Prometheus 告警规则"
echo "如需钉钉通知,请运行 Alertmanager 的安装脚本"
echo "=========================================="
|
||||
|
|
@ -54,3 +54,17 @@ data:
|
|||
annotations:
|
||||
summary: "Prometheus configuration reload failed"
|
||||
description: "Prometheus configuration reload has been failing for more than 5 minutes."
|
||||
|
||||
# Rule group for the AServer service.
- name: aserver-alerts
  rules:
    - alert: AServerOffline
      # absent() yields a result only when no `up` series has a
      # kubernetes_pod_name starting with "aserver".
      expr: absent(up{kubernetes_pod_name=~"aserver.*"})
      # The condition has to hold for one minute before the alert fires.
      for: 1m
      labels:
        service: aserver
        alert_type: offline
        severity: critical
      annotations:
        summary: "AServer 服务离线"
        description: "AServer Pod 不存在或已离线超过1分钟"
        action: "请检查 Pod 状态和日志"
|
||||
|
|
|
|||
Loading…
Reference in New Issue