From 52b51c396689574308a1af997d55597be58917b7 Mon Sep 17 00:00:00 2001
From: SeanOMik
Date: Tue, 23 Jan 2024 18:51:53 -0500
Subject: [PATCH] feat(longhorn): add alerts and service monitor

---
Review notes (text in this section is ignored when the patch is applied):
 * alerts.yaml: the alert expression now reads
   longhorn_volume_actual_size_bytes. The Longhorn metrics reference does not
   appear to list a longhorn_volume_usage_bytes series for volumes, so the
   previous expression would never match. TODO: confirm against the metrics
   exported by the deployed longhorn-manager version.
 * alerts.yaml, kustomization.yaml and service-monitor.yaml now end with a
   trailing newline.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy of this patch. The indentation of the helm-release.yaml context lines
   is inferred (dashboards assumed to sit under
   spec.values.dashboards.default); verify it against the target file before
   applying.
 * The blob hashes on the `index` lines predate these edits.

 .../apps/monitoring/grafana/helm-release.yaml |  5 +++++
 cluster/core/storage/longhorn/alerts.yaml     | 21 +++++++++++++++++++
 .../core/storage/longhorn/kustomization.yaml  |  4 +++-
 .../storage/longhorn/service-monitor.yaml     | 13 ++++++++++++
 4 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 cluster/core/storage/longhorn/alerts.yaml
 create mode 100644 cluster/core/storage/longhorn/service-monitor.yaml

diff --git a/cluster/apps/monitoring/grafana/helm-release.yaml b/cluster/apps/monitoring/grafana/helm-release.yaml
index 867be10..724f791 100644
--- a/cluster/apps/monitoring/grafana/helm-release.yaml
+++ b/cluster/apps/monitoring/grafana/helm-release.yaml
@@ -129,6 +129,11 @@ spec:
         cert-manager:
           url: https://raw.githubusercontent.com/monitoring-mixins/website/master/assets/cert-manager/dashboards/cert-manager.json
           datasource: Victoria
+        longhorn:
+          # renovate: depName="Longhorn"
+          gnetId: 16888
+          revision: 8
+          datasource: Victoria
         spegel:
           # renovate: depName="Spegel"
           gnetId: 18089
diff --git a/cluster/core/storage/longhorn/alerts.yaml b/cluster/core/storage/longhorn/alerts.yaml
new file mode 100644
index 0000000..f06de80
--- /dev/null
+++ b/cluster/core/storage/longhorn/alerts.yaml
@@ -0,0 +1,21 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: prometheus-longhorn-rules
+  namespace: longhorn-system
+  labels:
+    release: kube-prometheus-stack
+spec:
+  groups:
+  - name: longhorn.rules
+    rules:
+    - alert: LonghornVolumeUsageCritical
+      annotations:
+        description: Longhorn volume {{$labels.volume}} on {{$labels.node}} is at {{$value}}% used for
+          more than 5 minutes.
+        summary: Longhorn volume capacity is over 90% used.
+      expr: 100 * (longhorn_volume_actual_size_bytes / longhorn_volume_capacity_bytes) > 90
+      for: 5m
+      labels:
+        issue: Longhorn volume {{$labels.volume}} usage on {{$labels.node}} is critical.
+        severity: critical
diff --git a/cluster/core/storage/longhorn/kustomization.yaml b/cluster/core/storage/longhorn/kustomization.yaml
index 9e20722..7be259d 100644
--- a/cluster/core/storage/longhorn/kustomization.yaml
+++ b/cluster/core/storage/longhorn/kustomization.yaml
@@ -3,4 +3,6 @@ kind: Kustomization
 resources:
 - ./namespace.yaml
 - ./helm-repository.yaml
-- ./helm-release.yaml
\ No newline at end of file
+- ./helm-release.yaml
+- ./alerts.yaml
+- ./service-monitor.yaml
diff --git a/cluster/core/storage/longhorn/service-monitor.yaml b/cluster/core/storage/longhorn/service-monitor.yaml
new file mode 100644
index 0000000..1685220
--- /dev/null
+++ b/cluster/core/storage/longhorn/service-monitor.yaml
@@ -0,0 +1,13 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: longhorn
+  namespace: longhorn-system
+  labels:
+    release: kube-prometheus-stack
+spec:
+  selector:
+    matchLabels:
+      app: longhorn-manager
+  endpoints:
+  - port: manager