Compare commits

..

4 Commits

Author SHA1 Message Date
SeanOMik 6b076b2e1d
chore: update readme, add dashboard 2023-08-06 19:12:40 -04:00
SeanOMik 6e79f8965d
chore: bump version 2023-08-06 18:30:53 -04:00
SeanOMik 16ab12d098
fix: use gauges instead of counters 2023-08-06 18:29:09 -04:00
SeanOMik 8580e2fab3
feat: collect frag metrics 2023-08-06 18:09:24 -04:00
6 changed files with 1425 additions and 53 deletions

2
Cargo.lock generated
View File

@ -1854,7 +1854,7 @@ dependencies = [
[[package]]
name = "zfs_promexporter"
version = "0.1.0"
version = "0.0.2"
dependencies = [
"actix-web",
"chrono",

View File

@ -1,6 +1,6 @@
[package]
name = "zfs_promexporter"
version = "0.1.0"
version = "0.0.2"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@ -25,23 +25,28 @@ $ docker run --rm -d --privileged -p 8080:8080 --name zfs_exporter ghcr.io/seano
## What Metrics Are Exported?
* `zfs_health` (`enum`): Represents the device's health, can be `online`, `degraded`, `faulted`, `offline`, `available`, `unavailable` and `removed`. The state is stored in the `state` label, and the value is `0` for not-in-state, and `1` for in-state.
* `zfs_read_errors` (int counter): The amount of read errors for this device.
* `zfs_write_errors` (int counter): The amount of write errors for this device.
* `zfs_checksum_errors` (int counter): The amount of checksum errors for this device.
* `zfs_disk_count` (int counter): The amount of disks in this pool or vdev.
* `zfs_vdev_count` (int counter): The amount of vdevs in the pool.
* `zfs_spare_count` (int counter): The spare amount in the pool.
* `zfs_raw_size` (int counter): The raw size (in bytes) of the device. This is not the actual capacity.
* `zfs_capacity` (int counter): The capacity (in bytes) of the device.
* `zfs_available` (int counter): The available bytes of the device.
* `zfs_read_operations` (int counter): The amount of read operations on this device.
* `zfs_write_operations` (int counter): The amount of write operations on this device.
* `zfs_read_bandwidth` (int counter): The read bandwidth for this device in bytes per second.
* `zfs_write_bandwidth` (int counter): The write bandwidth for this device in bytes per second.
* `zfs_read_errors` (int gauge): The number of read errors for this device.
* `zfs_write_errors` (int gauge): The number of write errors for this device.
* `zfs_checksum_errors` (int gauge): The number of checksum errors for this device.
* `zfs_disk_count` (int gauge): The number of disks in this pool or vdev.
* `zfs_vdev_count` (int gauge): The number of vdevs in the pool.
* `zfs_spare_count` (int gauge): The number of spare drives in the pool.
* `zfs_raw_size` (int gauge): The raw size (in bytes) of the device. This is not the actual capacity.
* `zfs_capacity` (int gauge): The capacity (in bytes) of the device.
* `zfs_available` (int gauge): The available bytes of the device.
* `zfs_read_operations` (int gauge): The number of read operations per second on this device.
* `zfs_write_operations` (int gauge): The number of write operations per second on this device.
* `zfs_read_bandwidth` (int gauge): The read bandwidth for this device in bytes per second.
* `zfs_write_bandwidth` (int gauge): The write bandwidth for this device in bytes per second.
* `zfs_fragmentation` (int gauge): The percentage (0-100) of fragmentation of the device.
**Note: the `zpool status` commands use 1024, not 1000.**
**Note: Sizes output from `zpool status` are in TiB/GiB (1024), not TB/GB (1000).**
There are some common labels for the metrics:
* `device_name`: The name of the device that this metric is related to.
* `device_type`: The type of the device. Can be `pool`, `vdev` or `disk`.
* `pool`: The ZFS pool that this device (`vdev` or `disk`) is a part of.
* `pool`: The ZFS pool that this device (`vdev` or `disk`) is a part of.
## Grafana Dashboard
Import from json: [dashboard.json](./dashboard.json)
![dashboard screenshot](assets/dashboard.png)

BIN
assets/dashboard.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 271 KiB

1334
dashboard.json Normal file

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@ use actix_web::middleware::Logger;
use libzetta::zpool::{ZpoolOpen3, ZpoolEngine, Vdev, Health, vdev::ErrorStatistics, Reason};
use prometheus::{Encoder, IntCounter, Registry};
use prometheus::{Encoder, IntCounter, Registry, IntGauge};
use clap::Parser;
@ -20,10 +20,10 @@ fn encode_metrics(reg: &Registry) -> Result<String, FromUtf8Error> {
String::from_utf8(buffer.clone())
}
fn register_intcounter(reg: &Registry, name: &str, help: &str, val: u64) -> prometheus::Result<()> {
let counter = IntCounter::new(name, help)?;
counter.inc_by(val);
reg.register(Box::new(counter))?;
/// Creates an integer gauge named `name` with help text `help`, sets it to
/// `val`, and registers it with `reg`.
///
/// `IntGauge::set` takes an `i64`, so a `val` larger than `i64::MAX` is
/// clamped to `i64::MAX` rather than wrapping around to a negative number.
///
/// # Errors
/// Returns the `prometheus` error if the gauge cannot be created or if
/// registering it with `reg` fails.
fn register_intguage(reg: &Registry, name: &str, help: &str, val: u64) -> prometheus::Result<()> {
    let gauge = IntGauge::new(name, help)?;
    // A plain `as i64` cast would silently wrap values above `i64::MAX`
    // into negative readings; saturate instead.
    gauge.set(i64::try_from(val).unwrap_or(i64::MAX));
    reg.register(Box::new(gauge))?;
    Ok(())
}
@ -38,7 +38,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Online => 1,
_ => 0,
};
register_intcounter(&online_reg, "health", "The health of the device. This is an enum.", online_val)?;
register_intguage(&online_reg, "health", "The health of the device. This is an enum.", online_val)?;
labels.insert(String::from("state"), String::from("degraded"));
let degraded_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -46,7 +46,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Degraded => 1,
_ => 0,
};
register_intcounter(&degraded_reg, "health", "The health of the device. This is an enum.", degraded_val)?;
register_intguage(&degraded_reg, "health", "The health of the device. This is an enum.", degraded_val)?;
labels.insert(String::from("state"), String::from("faulted"));
let faulted_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -54,7 +54,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Faulted => 1,
_ => 0,
};
register_intcounter(&faulted_reg, "health", "The health of the device. This is an enum.", faulted_val)?;
register_intguage(&faulted_reg, "health", "The health of the device. This is an enum.", faulted_val)?;
labels.insert(String::from("state"), String::from("offline"));
let offline_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -62,7 +62,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Offline => 1,
_ => 0,
};
register_intcounter(&offline_reg, "health", "The health of the device. This is an enum.", offline_val)?;
register_intguage(&offline_reg, "health", "The health of the device. This is an enum.", offline_val)?;
labels.insert(String::from("state"), String::from("available"));
let available_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -70,7 +70,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Available => 1,
_ => 0,
};
register_intcounter(&available_reg, "health", "The health of the device. This is an enum.", available_val)?;
register_intguage(&available_reg, "health", "The health of the device. This is an enum.", available_val)?;
labels.insert(String::from("state"), String::from("unavailable"));
let unavailable_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -78,7 +78,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Unavailable => 1,
_ => 0,
};
register_intcounter(&unavailable_reg, "health", "The health of the device. This is an enum.", unavailable_val)?;
register_intguage(&unavailable_reg, "health", "The health of the device. This is an enum.", unavailable_val)?;
labels.insert(String::from("state"), String::from("removed"));
let removed_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
@ -86,15 +86,15 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
Health::Removed => 1,
_ => 0,
};
register_intcounter(&removed_reg, "health", "The health of the device. This is an enum.", removed_val)?;
register_intguage(&removed_reg, "health", "The health of the device. This is an enum.", removed_val)?;
Ok(vec![online_reg, degraded_reg, faulted_reg, offline_reg, available_reg, unavailable_reg, removed_reg])
}
fn register_error_stats(reg: &Registry, error_stats: ErrorStatistics) -> prometheus::Result<()> {
register_intcounter(reg, "read_errors", "The amount of I/O errors that occurred during reading", error_stats.read)?;
register_intcounter(reg, "write_errors", "The amount of I/O errors that occurred during writing", error_stats.write)?;
register_intcounter(reg, "checksum_errors", "The amount of checksum errors, meaning the device returned corrupted data from a read request", error_stats.checksum)?;
register_intguage(reg, "read_errors", "The amount of I/O errors that occurred during reading", error_stats.read)?;
register_intguage(reg, "write_errors", "The amount of I/O errors that occurred during writing", error_stats.write)?;
register_intguage(reg, "checksum_errors", "The amount of checksum errors, meaning the device returned corrupted data from a read request", error_stats.checksum)?;
Ok(())
}
@ -109,7 +109,7 @@ fn register_vdev_stats(vdev: &Vdev, vdev_device: &Device, vdev_name: String, sta
vdev_device.io_stats.collect_metrics(&vdev_reg)?;
register_error_stats(&vdev_reg, vdev.error_statistics().clone())?;
register_intcounter(&vdev_reg, "disk_count", "Total count of drives in this pool or vdev", vdev.disks().len() as u64)?;
register_intguage(&vdev_reg, "disk_count", "Total count of drives in this pool or vdev", vdev.disks().len() as u64)?;
Ok(vdev_reg)
}
@ -149,8 +149,8 @@ async fn metrics_endpoint() -> impl Responder {
// Create a registry for general pool metrics
let pool_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone())).unwrap();
register_intcounter(&pool_reg, "vdev_count", "Count of vdevs in this pool", pool.vdevs().len() as u64).unwrap();
register_intcounter(&pool_reg, "spare_count", "The amount of spare drives", pool.spares().len() as u64).unwrap();
register_intguage(&pool_reg, "vdev_count", "Count of vdevs in this pool", pool.vdevs().len() as u64).unwrap();
register_intguage(&pool_reg, "spare_count", "The amount of spare drives", pool.spares().len() as u64).unwrap();
// Calculate the total drive count and register it as a metric.
let total_disk_count = IntCounter::new("disk_count", "Total count of drives in this pool or vdev").unwrap();
@ -186,26 +186,55 @@ async fn metrics_endpoint() -> impl Responder {
let output = String::from_utf8(output.stdout)
.expect(&format!("Failure to convert output of `zpool iostat` to utf8."));
let devices = Device::parse_from_stdout(output);
let mut devices = Device::parse_from_stdout(output);
// Get the pool from the devices and collect the io stats
if let Some(pool_dev) = devices.iter().find(|dev| dev.name == pool.name().clone()) {
pool_dev.io_stats.collect_metrics(&pool_reg).unwrap();
if let Some(pool_dev) = devices.iter_mut().find(|dev| dev.name == pool.name().clone()) {
// Get the raw size of the pool.
let output = String::from_utf8(
Command::new("zpool")
.args(["list", "-Hp", pool.name().as_str()])
.args(["list", "-Hpv", pool.name().as_str()])
.output()
.expect(&format!("Failure to execute `zpool iostat {} -v 1 2`", pool.name()))
.stdout).expect(&format!("Failure to convert output of `zpool iostat {} -v 1 2` to utf8.", pool.name()));
// Extract the size from the output
let cols: Vec<&str> = output.split("\t").collect();
if cols.len() == 11 {
let size: u64 = cols[1].parse().unwrap();
register_intcounter(&pool_reg, "raw_size", "The raw size of this device (this is not the usable space)", size).unwrap();
let mut lines = output.split("\n");
{
let line = lines.next().unwrap();
let cols: Vec<&str> = line.split("\t").collect();
// make sure this line is actually a pool
if cols.len() == 11 {
let size: u64 = cols[1].parse().unwrap();
register_intguage(&pool_reg, "raw_size", "The raw size of this device (this is not the usable space)", size).unwrap();
let frag = cols[6].parse::<u64>().unwrap();
pool_dev.io_stats.frag = Some(frag);
} else {
panic!("Failure to parse pool")
}
}
for line in lines {
let cols: Vec<&str> = line.split("\t").collect();
// Check if this line is correct
if cols.len() == 10 {
let name = cols[0];
if let Some(device) = devices.iter_mut()
.find(|dev| dev.name == name && dev.is_pool_or_vdev()) {
let frag = cols[6].parse::<u64>().unwrap();
device.io_stats.frag = Some(frag);
}
}
}
// Collect pool io stats into registry
let pool_dev = devices.iter_mut().find(|dev| dev.name == pool.name().clone()).unwrap();
pool_dev.io_stats.collect_metrics(&pool_reg).unwrap();
}
// Push pool metrics
@ -277,6 +306,7 @@ async fn metrics_endpoint() -> impl Responder {
struct IoStats {
capacity: Option<u64>,
available: Option<u64>,
frag: Option<u64>,
read_op: u64,
write_op: u64,
@ -286,10 +316,11 @@ struct IoStats {
}
impl IoStats {
fn new(capacity: Option<u64>, available: Option<u64>, read_op: u64, write_op: u64, read_band: u64, write_band: u64) -> Self {
fn new(capacity: Option<u64>, available: Option<u64>, frag: Option<u64>, read_op: u64, write_op: u64, read_band: u64, write_band: u64) -> Self {
Self {
capacity,
available,
frag,
read_op,
write_op,
read_band,
@ -298,15 +329,17 @@ impl IoStats {
}
fn collect_metrics(&self, reg: &Registry) -> prometheus::Result<()> {
if let (Some(capacity), Some(available)) = (self.capacity, self.available) {
register_intcounter(&reg, "capacity", "The capacity of the device in bytes", capacity)?;
register_intcounter(&reg, "available", "The available bytes in the device", available)?;
// These will always be Some at the same time, no mix match
if let (Some(capacity), Some(available), Some(frag)) = (self.capacity, self.available, self.frag) {
register_intguage(&reg, "capacity", "The capacity of the device in bytes", capacity)?;
register_intguage(&reg, "available", "The available bytes in the device", available)?;
register_intguage(&reg, "fragmentation", "The percentage (0-100) of fragmentation of the device", frag)?;
}
register_intcounter(&reg, "read_operations", "The read operations for this device per second", self.read_op)?;
register_intcounter(&reg, "write_operations", "The write operations for this device per second", self.write_op)?;
register_intcounter(&reg, "read_bandwidth", "The read bandwidth for this device in bytes per second", self.read_band)?;
register_intcounter(&reg, "write_bandwidth", "The write bandwidth for this device in bytes per second", self.write_band)?;
register_intguage(&reg, "read_operations", "The read operations for this device per second", self.read_op)?;
register_intguage(&reg, "write_operations", "The write operations for this device per second", self.write_op)?;
register_intguage(&reg, "read_bandwidth", "The read bandwidth for this device in bytes per second", self.read_band)?;
register_intguage(&reg, "write_bandwidth", "The write bandwidth for this device in bytes per second", self.write_band)?;
Ok(())
}
@ -362,7 +395,7 @@ impl Device {
};
parsed.push(Device::new(String::from(name),
IoStats::new(alloc, free, read_op, write_op, read_band, write_band)));
IoStats::new(alloc, free, None, read_op, write_op, read_band, write_band)));
}
return parsed;