feat: add alerts for flux
This commit is contained in:
parent
0294c45903
commit
f9132bcc01
|
@ -0,0 +1,5 @@
|
|||
---
# yaml-language-server: $schema=https://json.schemastore.org/kustomization.json
# Kustomize entry point for this directory: lists the manifests that are
# built together when the directory is referenced as a resource.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ./podmonitor.yaml
  - ./rules.yaml
|
|
@ -0,0 +1,32 @@
|
|||
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/podmonitor_v1.json
# PodMonitor that selects pods in the flux-system namespace by their `app`
# label and scrapes the `http-prom` port of every match.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: flux-system
  namespace: flux-system
  labels:
    app.kubernetes.io/component: monitoring
    app.kubernetes.io/part-of: flux
spec:
  # Restrict pod discovery to the flux-system namespace.
  namespaceSelector:
    matchNames:
      - flux-system
  # A pod is selected when its `app` label equals one of the values below.
  selector:
    matchExpressions:
      - key: app
        operator: In
        values:
          - helm-controller
          - source-controller
          - kustomize-controller
          - notification-controller
          - image-automation-controller
          - image-reflector-controller
  podMetricsEndpoints:
    - port: http-prom
      relabelings:
        # Keep only targets whose pod phase is Running.
        # Ref: https://github.com/prometheus-operator/prometheus-operator/issues/4816
        - action: keep
          regex: Running
          sourceLabels:
            - __meta_kubernetes_pod_phase
|
|
@ -0,0 +1,32 @@
|
|||
---
# yaml-language-server: $schema=https://kubernetes-schemas.pages.dev/monitoring.coreos.com/prometheusrule_v1.json
# Alerting rules for Flux: one alert for missing scrape targets and one for
# resources whose reconciliation keeps failing.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: flux-rules
  namespace: monitoring
spec:
  groups:
    - name: flux.rules
      rules:
        # Fires after 15m in which no `up` series whose job label contains
        # "flux-system" has the value 1.
        - alert: FluxComponentAbsent
          expr: |
            absent(up{job=~".*flux-system.*"} == 1)
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: >-
              Flux component has disappeared from Prometheus target discovery.
        # Per (namespace, name, kind): the Ready=False value plus twice the
        # Deleted value must equal 1 for 15m. Presumably both series are 0/1
        # gauges, so this means "not ready and not deleted" - verify.
        # NOTE(review): gotk_reconcile_condition is believed to be absent from
        # newer Flux releases (v2.1+) - confirm the installed controllers
        # still export it, otherwise this alert can never fire.
        - alert: FluxReconciliationFailure
          expr: |
            max(gotk_reconcile_condition{status="False",type="Ready"}) by (namespace, name, kind)
            +
            on(namespace, name, kind) (max(gotk_reconcile_condition{status="Deleted"})
            by (namespace, name, kind)) * 2 == 1
          for: 15m
          labels:
            severity: critical
          annotations:
            summary: >-
              {{ $labels.kind }} {{ $labels.namespace }}/{{ $labels.name }}
              reconciliation has been failing for more than 15 minutes.
|
|
@ -11,3 +11,4 @@ resources:
|
|||
- ./varken
|
||||
- ./proxmoxve-exporter
|
||||
- ./external-monitors
|
||||
- ./flux
|
Loading…
Reference in New Issue