diff --git a/cluster/apps/monitoring/zfs-exporter/alerts.yaml b/cluster/apps/monitoring/zfs-exporter/alerts.yaml
new file mode 100644
index 0000000..a873f28
--- /dev/null
+++ b/cluster/apps/monitoring/zfs-exporter/alerts.yaml
@@ -0,0 +1,59 @@
+# PrometheusRule for the ZFS exporter: alerts on the read, write and checksum
+# error metrics of series selected with device_type="disk".
+# NOTE(review): the `release` label is presumably what the
+# kube-prometheus-stack Prometheus uses to discover rules - confirm against
+# its ruleSelector.
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: zfs-exporter-rules
+  labels:
+    release: kube-prometheus-stack
+spec:
+  groups:
+    - name: zfsexporter.rules
+      rules:
+        # Critical: read + write + checksum errors of a disk exceed 1000.
+        - alert: DriveErrorsCriticalAlert
+          expr: zfs_read_errors{device_type="disk"} + zfs_write_errors{device_type="disk"} + zfs_checksum_errors{device_type="disk"} > 1000
+          annotations:
+            summary: A drive encountered some errors (drive {{ $labels.device_name }})
+            description: "A drive encountered some errors! drive: {{ $labels.device_name }} in vdev {{ $labels.vdev }} in pool {{ $labels.pool }} \n error count: {{ $value }}"
+          labels:
+            severity: critical
+
+        # Warning: read + write + checksum errors of a disk are above zero.
+        - alert: DriveAnyErrorsAlert
+          expr: zfs_read_errors{device_type="disk"} + zfs_write_errors{device_type="disk"} + zfs_checksum_errors{device_type="disk"} > 0
+          annotations:
+            summary: A drive encountered some errors (drive {{ $labels.device_name }})
+            description: "A drive encountered some errors! drive: {{ $labels.device_name }} in vdev {{ $labels.vdev }} in pool {{ $labels.pool }} \n error count: {{ $value }}"
+          labels:
+            severity: warning
+
+        # Warning: read errors only; the annotations name the error type.
+        - alert: DriveReadErrorsAlert
+          expr: zfs_read_errors{device_type="disk"} > 0
+          annotations:
+            summary: A drive encountered a read error (drive {{ $labels.device_name }})
+            description: "A drive encountered a read error! drive: {{ $labels.device_name }} in vdev {{ $labels.vdev }} in pool {{ $labels.pool }} \n error count: {{ $value }}"
+          labels:
+            severity: warning
+
+        # Warning: write errors only; the annotations name the error type.
+        - alert: DriveWriteErrorsAlert
+          expr: zfs_write_errors{device_type="disk"} > 0
+          annotations:
+            summary: A drive encountered a write error (drive {{ $labels.device_name }})
+            description: "A drive encountered a write error! drive: {{ $labels.device_name }} in vdev {{ $labels.vdev }} in pool {{ $labels.pool }} \n error count: {{ $value }}"
+          labels:
+            severity: warning
+
+        # Warning: checksum errors only.
+        - alert: DriveChecksumErrorsAlert
+          expr: zfs_checksum_errors{device_type="disk"} > 0
+          annotations:
+            summary: A drive encountered a checksum error (drive {{ $labels.device_name }})
+            description: "A drive encountered a checksum error! drive: {{ $labels.device_name }} in vdev {{ $labels.vdev }} in pool {{ $labels.pool }} \n error count: {{ $value }}"
+          labels:
+            severity: warning