#!/bin/bash
#
# Installs the AServer offline alert rule into Prometheus.
#
# Steps:
#   1. Apply aserver-offline-alert.yaml with kubectl. The path is resolved
#      relative to the current working directory.
#   2. Restart deployment/prometheus so it loads the new rule.
#   3. Block until the restart rollout finishes (fails after 300s).
#   4. Print a summary with verification and test instructions.
#
# Requirements: kubectl on PATH. No namespace/context flags are passed, so
# kubectl's currently configured context and namespace are used.

set -euo pipefail

readonly RULES_MANIFEST="aserver-offline-alert.yaml"
readonly PROMETHEUS_DEPLOYMENT="deployment/prometheus"
readonly ROLLOUT_TIMEOUT="300s"

# Print a message to stderr and abort the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Fail early with a clear message instead of an opaque kubectl error.
command -v kubectl >/dev/null || die "error: kubectl not found in PATH"
[[ -f "${RULES_MANIFEST}" ]] \
  || die "error: ${RULES_MANIFEST} not found in ${PWD}"

echo "开始安装 AServer 告警规则到 Prometheus..."

echo "1. 应用 AServer 离线告警规则..."
kubectl apply -f "${RULES_MANIFEST}"

echo "2. 重启 Prometheus 以加载新的告警规则..."
kubectl rollout restart "${PROMETHEUS_DEPLOYMENT}"

echo "等待 Prometheus 重启..."
# Use `rollout status` rather than `wait --for=condition=available`: right
# after a restart the Deployment can still report Available because of the
# old pods, so `wait` may return before the new pods are actually up.
kubectl rollout status "${PROMETHEUS_DEPLOYMENT}" --timeout="${ROLLOUT_TIMEOUT}"

# Quoted delimiter: the summary is printed literally, with no expansion.
cat <<'EOF'

==========================================
AServer 告警规则安装完成!
==========================================

告警规则:
- AServerOffline: aserver离线告警 (1分钟后触发)
 - 严重程度: critical
 - 触发条件: up{kubernetes_pod_name=~"aserver.*"} == 0
 - 持续时间: 1分钟

访问地址:
- Prometheus Alerts: https://prom-ops.t-aaron.com/alerts

验证方法:
1. 访问 Prometheus Alerts 页面
2. 查找 'AServerOffline' 告警规则
3. 状态应该显示为 'Inactive' (正常状态)

测试告警:
1. 删除 aserver Pod: kubectl delete pod -l app=aserver
2. 等待1分钟后查看告警状态变为 'Pending' 或 'Firing'
3. 重新部署 aserver 后告警应自动恢复

注意:此脚本只安装 Prometheus 告警规则
如需钉钉通知,请运行 Alertmanager 的安装脚本
==========================================
EOF