add connect

This commit is contained in:
孙小云 2025-09-16 10:10:25 +08:00
parent 718b5bb9a4
commit 33f210b602
4 changed files with 94 additions and 0 deletions

View File

@ -15,4 +15,28 @@ kubectl wait --for=condition=available --timeout=300s deployment/prometheus
echo "Prometheus 安装完成!" echo "Prometheus 安装完成!"
echo "查看状态: kubectl get pods -l app=prometheus" echo "查看状态: kubectl get pods -l app=prometheus"
echo ""
echo "=========================================="
echo "Prometheus 与 Alertmanager 集成配置"
echo "注意:此部分需要在安装完 Alertmanager 后执行"
echo "=========================================="

# Integration step: load the alert rules, then push the updated Prometheus
# config and deployment. Every kubectl call aborts the script on failure so the
# "integration complete" banner below is never printed after a failed step.

# The rules ConfigMap is applied before the deployment; presumably the
# deployment mounts it as a volume, so it has to exist first.
echo "应用告警规则配置..."
kubectl apply -f prometheus-rules-configmap.yaml || exit 1

echo "更新 Prometheus 配置以集成 Alertmanager..."
kubectl apply -f prometheus-configmap.yaml || exit 1
kubectl apply -f prometheus-deployment.yaml || exit 1

# `kubectl wait --for=condition=available` can return immediately here because
# the previous ReplicaSet is still Available when the new rollout has only just
# been requested. `kubectl rollout status` blocks until the rollout of the
# current deployment revision has actually finished (or the timeout expires).
# NOTE(review): when only the ConfigMap changes on a later run, the deployment
# spec is unchanged and no rollout is triggered; a `kubectl rollout restart`
# may be needed in that case — confirm the desired behavior.
echo "等待 Prometheus 重新启动..."
kubectl rollout status deployment/prometheus --timeout=300s || exit 1

echo "=========================================="
echo "Prometheus 与 Alertmanager 集成完成!"
echo "=========================================="
echo "访问地址: https://prometheus-ops.t-aaron.com"
echo "告警管理: https://alertmanager-ops.t-aaron.com"
echo "已集成 Alertmanager告警规则已加载"
echo "=========================================="

View File

@ -8,7 +8,16 @@ data:
global: global:
scrape_interval: 15s scrape_interval: 15s
evaluation_interval: 15s evaluation_interval: 15s
rule_files: rule_files:
- "/etc/prometheus/rules/*.yml"
alerting:
alertmanagers:
- static_configs:
- targets:
- alertmanager-service:9093
scrape_configs: scrape_configs:
- job_name: 'prometheus' - job_name: 'prometheus'
static_configs: static_configs:

View File

@ -42,12 +42,17 @@ spec:
volumeMounts: volumeMounts:
- name: prometheus-config - name: prometheus-config
mountPath: /etc/prometheus/ mountPath: /etc/prometheus/
- name: prometheus-rules
mountPath: /etc/prometheus/rules/
- name: prometheus-storage - name: prometheus-storage
mountPath: /prometheus/ mountPath: /prometheus/
volumes: volumes:
- name: prometheus-config - name: prometheus-config
configMap: configMap:
name: prometheus-config name: prometheus-config
- name: prometheus-rules
configMap:
name: prometheus-rules
- name: prometheus-storage - name: prometheus-storage
hostPath: hostPath:
path: /opt/prometheus/data path: /opt/prometheus/data

View File

@ -0,0 +1,56 @@
---
# ConfigMap holding the Prometheus alerting rules.
# Each key under `data` is one rule file; the key name ends in `.yml` so that a
# `rule_files` glob such as `*.yml` can match it once the ConfigMap is mounted.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  kubernetes-alerts.yml: |
    groups:
      # Alerts derived from kube_* metrics (pod and node state).
      - name: kubernetes-alerts
        rules:
          # Fires as soon as any container restart is observed in the last
          # 15 minutes (`for: 0m` means no pending period).
          # increase(v[15m]) is equivalent to rate(v[15m]) * 60 * 15.
          - alert: PodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."
          # Matches pods whose phase is Pending or Unknown for 5 minutes.
          - alert: PodNotReady
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."
          # The Ready condition's "true" series is 0 while the node is not Ready.
          - alert: NodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."
      # Alerts about the Prometheus server itself.
      - name: prometheus-alerts
        rules:
          # NOTE(review): if this rule is evaluated by the same Prometheus
          # instance it monitors, it cannot fire while that instance is down;
          # confirm that another instance or an external check covers this.
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."
          - alert: PrometheusConfigReloadFailed
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: "Prometheus configuration reload has been failing for more than 5 minutes."