add connect
This commit is contained in:
parent
718b5bb9a4
commit
33f210b602
|
|
@ -15,4 +15,28 @@ kubectl wait --for=condition=available --timeout=300s deployment/prometheus
|
|||
echo "Prometheus 安装完成!"
echo "查看状态: kubectl get pods -l app=prometheus"

# ---------------------------------------------------------------------------
# Prometheus <-> Alertmanager integration.
# Applies the alert-rule ConfigMap, the updated Prometheus config and the
# updated Deployment, then waits for the resulting rollout to finish.
# The banner below tells the operator that Alertmanager should already be
# installed before this part runs; the script itself does not check that.
# ---------------------------------------------------------------------------
echo ""
echo "=========================================="
echo "Prometheus 与 Alertmanager 集成配置"
echo "注意:此部分需要在安装完 Alertmanager 后执行"
echo "=========================================="

# The rules ConfigMap is applied before the Deployment that mounts it, so the
# volume reference can be resolved when the new pods are created.
echo "应用告警规则配置..."
kubectl apply -f prometheus-rules-configmap.yaml

echo "更新 Prometheus 配置以集成 Alertmanager..."
kubectl apply -f prometheus-configmap.yaml
kubectl apply -f prometheus-deployment.yaml

echo "等待 Prometheus 重新启动..."
# `kubectl wait --for=condition=available` can return immediately here: the
# Deployment is typically still Available through its previous ReplicaSet
# while the updated pods are being created. `kubectl rollout status` blocks
# until the new revision is fully rolled out (or the timeout expires), and a
# failure stops the script instead of printing a misleading success banner.
if ! kubectl rollout status deployment/prometheus --timeout=300s; then
    echo "错误: Prometheus 未能在 300 秒内完成滚动更新" >&2
    exit 1
fi

echo "=========================================="
echo "Prometheus 与 Alertmanager 集成完成!"
echo "=========================================="
echo "访问地址: https://prometheus-ops.t-aaron.com"
echo "告警管理: https://alertmanager-ops.t-aaron.com"
echo "已集成 Alertmanager,告警规则已加载"
echo "=========================================="
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,16 @@ data:
|
|||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
rule_files:
|
||||
- "/etc/prometheus/rules/*.yml"
|
||||
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager-service:9093
|
||||
|
||||
scrape_configs:
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
|
|
|
|||
|
|
@ -42,12 +42,17 @@ spec:
|
|||
volumeMounts:
|
||||
- name: prometheus-config
|
||||
mountPath: /etc/prometheus/
|
||||
- name: prometheus-rules
|
||||
mountPath: /etc/prometheus/rules/
|
||||
- name: prometheus-storage
|
||||
mountPath: /prometheus/
|
||||
volumes:
|
||||
- name: prometheus-config
|
||||
configMap:
|
||||
name: prometheus-config
|
||||
- name: prometheus-rules
|
||||
configMap:
|
||||
name: prometheus-rules
|
||||
- name: prometheus-storage
|
||||
hostPath:
|
||||
path: /opt/prometheus/data
|
||||
|
|
|
|||
|
|
@ -0,0 +1,56 @@
|
|||
---
# ConfigMap carrying the Prometheus alerting rules.
#
# It holds a single rule file, kubernetes-alerts.yml, with two groups:
#   - kubernetes-alerts: pod and node health
#   - prometheus-alerts: health of the Prometheus server itself
#
# NOTE(review): the kube_* metrics used below are presumably exported by
# kube-state-metrics; if that exporter is not scraped, those alerts will
# silently never fire. Confirm it is part of scrape_configs.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  kubernetes-alerts.yml: |
    groups:
      - name: kubernetes-alerts
        rules:
          # Fires as soon as any container restart is counted in the last
          # 15 minutes (for: 0m means no pending period).
          # increase(x[15m]) is the same value as rate(x[15m]) * 60 * 15.
          - alert: PodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."

          # Pod has stayed in phase Pending or Unknown for 5 minutes.
          - alert: PodNotReady
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."

          # The node's Ready condition has not been "true" for 5 minutes.
          - alert: NodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."

      - name: prometheus-alerts
        rules:
          # Fires when a target of job "prometheus" reports up == 0, and also
          # when no `up` series exists for that job at all (target removed or
          # job renamed), which a bare `up == 0` comparison cannot detect.
          # NOTE(review): if this job is only Prometheus scraping itself, a
          # stopped server cannot evaluate this rule; an external check is
          # needed to detect a full outage.
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0 or absent(up{job="prometheus"})
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."

          # The most recent configuration reload has been reported as
          # unsuccessful for 5 minutes.
          - alert: PrometheusConfigReloadFailed
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: "Prometheus configuration reload has been failing for more than 5 minutes."
|
||||
Loading…
Reference in New Issue