diff --git a/prometheus/install-prometheus.sh b/prometheus/install-prometheus.sh
index 79dc344..120b77a 100644
--- a/prometheus/install-prometheus.sh
+++ b/prometheus/install-prometheus.sh
@@ -15,4 +15,33 @@ kubectl wait --for=condition=available --timeout=300s deployment/prometheus
 
 echo "Prometheus 安装完成!"
 echo "查看状态: kubectl get pods -l app=prometheus"
+echo ""
+# Alertmanager integration. As the banner below states, this part is meant to run after Alertmanager is installed.
+echo "=========================================="
+echo "Prometheus 与 Alertmanager 集成配置"
+echo "注意:此部分需要在安装完 Alertmanager 后执行"
+echo "=========================================="
+
+# Create the rules ConfigMap first: the updated Deployment mounts it as a volume.
+echo "应用告警规则配置..."
+kubectl apply -f prometheus-rules-configmap.yaml
+
+# Push the config that references the rule files and Alertmanager, then the Deployment with the rules volume.
+echo "更新 Prometheus 配置以集成 Alertmanager..."
+kubectl apply -f prometheus-configmap.yaml
+kubectl apply -f prometheus-deployment.yaml
+
+# Block until the new rollout has finished. "kubectl wait --for=condition=available" can
+# return immediately while the previous pods are still serving, so use rollout status instead.
+echo "等待 Prometheus 重新启动..."
+kubectl rollout status deployment/prometheus --timeout=300s
+
+echo "=========================================="
+echo "Prometheus 与 Alertmanager 集成完成!"
+echo "=========================================="
+echo "访问地址: https://prometheus-ops.t-aaron.com"
+echo "告警管理: https://alertmanager-ops.t-aaron.com"
+echo "已集成 Alertmanager,告警规则已加载"
+echo "=========================================="
+
 
diff --git a/prometheus/prometheus-configmap.yaml b/prometheus/prometheus-configmap.yaml
index b6d712e..11984b0 100644
--- a/prometheus/prometheus-configmap.yaml
+++ b/prometheus/prometheus-configmap.yaml
@@ -8,7 +8,16 @@ data:
     global:
       scrape_interval: 15s
       evaluation_interval: 15s
 
+    rule_files:
+      - "/etc/prometheus/rules/*.yml"
+
+    alerting:
+      alertmanagers:
+        - static_configs:
+            - targets:
+              - alertmanager-service:9093
+
     scrape_configs:
       - job_name: 'prometheus'
         static_configs:
diff --git a/prometheus/prometheus-deployment.yaml b/prometheus/prometheus-deployment.yaml
index bc0ebb1..5adafe8 100644
--- a/prometheus/prometheus-deployment.yaml
+++ b/prometheus/prometheus-deployment.yaml
@@ -42,12 +42,17 @@ spec:
         volumeMounts:
         - name: prometheus-config
           mountPath: /etc/prometheus/
+        - name: prometheus-rules
+          mountPath: /etc/prometheus/rules/
         - name: prometheus-storage
           mountPath: /prometheus/
       volumes:
       - name: prometheus-config
         configMap:
           name: prometheus-config
+      - name: prometheus-rules
+        configMap:
+          name: prometheus-rules
       - name: prometheus-storage
         hostPath:
           path: /opt/prometheus/data
diff --git a/prometheus/prometheus-rules-configmap.yaml b/prometheus/prometheus-rules-configmap.yaml
new file mode 100644
index 0000000..6509f91
--- /dev/null
+++ b/prometheus/prometheus-rules-configmap.yaml
@@ -0,0 +1,56 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-rules
+  namespace: default
+data:
+  kubernetes-alerts.yml: |
+    groups:
+    - name: kubernetes-alerts
+      rules:
+      - alert: PodCrashLooping
+        expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 0
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping"
+          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting {{ printf \"%.2f\" $value }} times / 15 minutes."
+
+      - alert: PodNotReady
+        expr: kube_pod_status_phase{phase=~"Pending|Unknown"} > 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is not ready"
+          description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for more than 5 minutes."
+
+      - alert: NodeNotReady
+        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Node {{ $labels.node }} is not ready"
+          description: "Node {{ $labels.node }} has been unready for more than 5 minutes."
+
+    - name: prometheus-alerts
+      rules:
+      - alert: PrometheusDown
+        expr: up{job="prometheus"} == 0
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Prometheus is down"
+          description: "Prometheus has been down for more than 1 minute."
+
+      - alert: PrometheusConfigReloadFailed
+        expr: prometheus_config_last_reload_successful == 0
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Prometheus configuration reload failed"
+          description: "Prometheus configuration reload has been failing for more than 5 minutes."