add connect
This commit is contained in:
parent
718b5bb9a4
commit
33f210b602
|
|
@ -15,4 +15,28 @@ kubectl wait --for=condition=available --timeout=300s deployment/prometheus
|
||||||
echo "Prometheus 安装完成!"
|
echo "Prometheus 安装完成!"
|
||||||
echo "查看状态: kubectl get pods -l app=prometheus"
|
echo "查看状态: kubectl get pods -l app=prometheus"
|
||||||
|
|
||||||
|
echo ""
echo "=========================================="
echo "Prometheus 与 Alertmanager 集成配置"
echo "注意：此部分需要在安装完 Alertmanager 后执行"
echo "=========================================="

# Apply the alert-rule manifest before the Deployment is updated.
# NOTE(review): presumably this file defines the `prometheus-rules` ConfigMap
# that the Deployment mounts at /etc/prometheus/rules/ -- confirm.
# Every kubectl step aborts the script on failure so the "integration
# complete" banner below is never printed after a failed apply/rollout.
echo "应用告警规则配置..."
kubectl apply -f prometheus-rules-configmap.yaml || exit 1

# Push the updated Prometheus configuration and the updated Deployment.
echo "更新 Prometheus 配置以集成 Alertmanager..."
kubectl apply -f prometheus-configmap.yaml || exit 1
kubectl apply -f prometheus-deployment.yaml || exit 1

echo "等待 Prometheus 重新启动..."
# `kubectl rollout status` follows the rollout of the revision that was just
# applied. `kubectl wait --for=condition=available` can return immediately
# because the previous ReplicaSet usually still satisfies the Available
# condition while the new pod is starting.
kubectl rollout status deployment/prometheus --timeout=300s || exit 1

echo "=========================================="
echo "Prometheus 与 Alertmanager 集成完成!"
echo "=========================================="
echo "访问地址: https://prometheus-ops.t-aaron.com"
echo "告警管理: https://alertmanager-ops.t-aaron.com"
echo "已集成 Alertmanager，告警规则已加载"
echo "=========================================="
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,16 @@ data:
|
||||||
global:
|
global:
|
||||||
scrape_interval: 15s
|
scrape_interval: 15s
|
||||||
evaluation_interval: 15s
|
evaluation_interval: 15s
|
||||||
|
|
||||||
rule_files:
|
rule_files:
|
||||||
|
- "/etc/prometheus/rules/*.yml"
|
||||||
|
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- static_configs:
|
||||||
|
- targets:
|
||||||
|
- alertmanager-service:9093
|
||||||
|
|
||||||
scrape_configs:
|
scrape_configs:
|
||||||
- job_name: 'prometheus'
|
- job_name: 'prometheus'
|
||||||
static_configs:
|
static_configs:
|
||||||
|
|
|
||||||
|
|
@ -42,12 +42,17 @@ spec:
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: prometheus-config
|
- name: prometheus-config
|
||||||
mountPath: /etc/prometheus/
|
mountPath: /etc/prometheus/
|
||||||
|
- name: prometheus-rules
|
||||||
|
mountPath: /etc/prometheus/rules/
|
||||||
- name: prometheus-storage
|
- name: prometheus-storage
|
||||||
mountPath: /prometheus/
|
mountPath: /prometheus/
|
||||||
volumes:
|
volumes:
|
||||||
- name: prometheus-config
|
- name: prometheus-config
|
||||||
configMap:
|
configMap:
|
||||||
name: prometheus-config
|
name: prometheus-config
|
||||||
|
- name: prometheus-rules
|
||||||
|
configMap:
|
||||||
|
name: prometheus-rules
|
||||||
- name: prometheus-storage
|
- name: prometheus-storage
|
||||||
hostPath:
|
hostPath:
|
||||||
path: /opt/prometheus/data
|
path: /opt/prometheus/data
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,56 @@
|
||||||
|
---
# ConfigMap holding the Prometheus alerting rules.
#
# The single data key is a Prometheus rule file with two groups:
#   * kubernetes-alerts: PodCrashLooping, PodNotReady, NodeNotReady
#   * prometheus-alerts: PrometheusDown, PrometheusConfigReloadFailed
#
# NOTE(review): the kube_* series used by the kubernetes-alerts group are
# presumably exported by kube-state-metrics; confirm that a scrape job for it
# exists, otherwise these rules never match anything.
# NOTE(review): PrometheusDown evaluates up{job="prometheus"}; if the
# Prometheus that loads this file is the only instance behind that job, the
# rule cannot fire while that instance is down -- confirm an external
# watchdog exists or drop the rule.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: default
data:
  kubernetes-alerts.yml: |
    groups:
      - name: kubernetes-alerts
        rules:
          - alert: PodCrashLooping
            expr: increase(kube_pod_container_status_restarts_total[15m]) > 0
            for: 0m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."

          - alert: PodNotReady
            expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."

          - alert: NodeNotReady
            expr: kube_node_status_condition{condition="Ready",status="true"} == 0
            for: 5m
            labels:
              severity: critical
            annotations:
              summary: "Node {{ $labels.node }} is not ready"
              description: "Node {{ $labels.node }} has been unready for more than 5 minutes."

      - name: prometheus-alerts
        rules:
          - alert: PrometheusDown
            expr: up{job="prometheus"} == 0
            for: 1m
            labels:
              severity: critical
            annotations:
              summary: "Prometheus is down"
              description: "Prometheus has been down for more than 1 minute."

          - alert: PrometheusConfigReloadFailed
            expr: prometheus_config_last_reload_successful == 0
            for: 5m
            labels:
              severity: warning
            annotations:
              summary: "Prometheus configuration reload failed"
              description: "Prometheus configuration reload has been failing for more than 5 minutes."
|
||||||
Loading…
Reference in New Issue