Compare commits
No commits in common. "6b076b2e1d7f093714f5c841ed11d7040fa6b5d8" and "5f3d54948c268753bd0d728e07f24daa22203b64" have entirely different histories.
6b076b2e1d
...
5f3d54948c
|
@ -1854,7 +1854,7 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "zfs_promexporter"
|
name = "zfs_promexporter"
|
||||||
version = "0.0.2"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix-web",
|
"actix-web",
|
||||||
"chrono",
|
"chrono",
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
[package]
|
[package]
|
||||||
name = "zfs_promexporter"
|
name = "zfs_promexporter"
|
||||||
version = "0.0.2"
|
version = "0.1.0"
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
33
README.md
33
README.md
|
@ -25,28 +25,23 @@ $ docker run --rm -d --privileged -p 8080:8080 --name zfs_exporter ghcr.io/seano
|
||||||
|
|
||||||
## What Metrics Are Exported?
|
## What Metrics Are Exported?
|
||||||
* `zfs_health` (`enum`): Represents the device's health, can be `online`, `degraded`, `faulted`, `offline`, `available`, `unavailable` and `removed`. The state is stored in the `state` label, and the value is `0` for not-in-state, and `1` for in-state.
|
* `zfs_health` (`enum`): Represents the device's health, can be `online`, `degraded`, `faulted`, `offline`, `available`, `unavailable` and `removed`. The state is stored in the `state` label, and the value is `0` for not-in-state, and `1` for in-state.
|
||||||
* `zfs_read_errors` (int gauge): The amount of read errors for this device.
|
* `zfs_read_errors` (int counter): The amount of read errors for this device.
|
||||||
* `zfs_write_errors` (int gauge): The amount of write errors for this device.
|
* `zfs_write_errors` (int counter): The amount of write errors for this device.
|
||||||
* `zfs_checksum_errors` (int gauge): The amount of checksum errors for this device.
|
* `zfs_checksum_errors` (int counter): The amount of checksum errors for this device.
|
||||||
* `zfs_disk_count` (int gauge): The amount of disks in this pool or vdev.
|
* `zfs_disk_count` (int counter): The amount of disks in this pool or vdev.
|
||||||
* `zfs_vdev_count` (int gauge): The amount of vdevs in the pool.
|
* `zfs_vdev_count` (int counter): The amount of vdevs in the pool.
|
||||||
* `zfs_spare_count` (int gauge): The spare amount in the pool.
|
* `zfs_spare_count` (int counter): The spare amount in the pool.
|
||||||
* `zfs_raw_size` (int gauge): The raw size (in bytes) of the device. This is not the actual capacity.
|
* `zfs_raw_size` (int counter): The raw size (in bytes) of the device. This is not the actual capacity.
|
||||||
* `zfs_capacity` (int gauge): The capacity (in bytes) of the device.
|
* `zfs_capacity` (int counter): The capacity (in bytes) of the device.
|
||||||
* `zfs_available` (int gauge): The available bytes of the device.
|
* `zfs_available` (int counter): The available bytes of the device.
|
||||||
* `zfs_read_operations` (int gauge): The amount of read operations on this device.
|
* `zfs_read_operations` (int counter): The amount of read operations on this device.
|
||||||
* `zfs_write_operations` (int gauge): The amount of write operations on this device.
|
* `zfs_write_operations` (int counter): The amount of write operations on this device.
|
||||||
* `zfs_read_bandwidth` (int gauge): The read bandwidth for this device in bytes per second.
|
* `zfs_read_bandwidth` (int counter): The read bandwidth for this device in bytes per second.
|
||||||
* `zfs_write_bandwidth` (int gauge): The write bandwidth for this device in bytes per second.
|
* `zfs_write_bandwidth` (int counter): The write bandwidth for this device in bytes per second.
|
||||||
* `zfs_fragmentation` (int gauge): The percentage (0-100) of fragmentation of the device.
|
|
||||||
|
|
||||||
**Note: Sizes output from `zpool status` are in TiB/GiB (1024), not TB/GB (1000).**
|
**Note: the `zpool status` commands use 1024, not 1000.**
|
||||||
|
|
||||||
There are some common labels for the metrics:
|
There are some common labels for the metrics:
|
||||||
* `device_name`: The name of the device that this metric is related to.
|
* `device_name`: The name of the device that this metric is related to.
|
||||||
* `device_type`: The type of the device. Can be `pool`, `vdev` or `disk`.
|
* `device_type`: The type of the device. Can be `pool`, `vdev` or `disk`.
|
||||||
* `pool`: The ZFS pool that this device (`vdev` or `disk`) is a part of.
|
* `pool`: The ZFS pool that this device (`vdev` or `disk`) is a part of.
|
||||||
|
|
||||||
## Grafana Dashboard
|
|
||||||
Import from json: [dashboard.json](./dashboard.json)
|
|
||||||
![dashboard screenshot](assets/dashboard.png)
|
|
Binary file not shown.
Before Width: | Height: | Size: 271 KiB |
1334
dashboard.json
1334
dashboard.json
File diff suppressed because it is too large
Load Diff
105
src/main.rs
105
src/main.rs
|
@ -3,7 +3,7 @@ use actix_web::middleware::Logger;
|
||||||
|
|
||||||
use libzetta::zpool::{ZpoolOpen3, ZpoolEngine, Vdev, Health, vdev::ErrorStatistics, Reason};
|
use libzetta::zpool::{ZpoolOpen3, ZpoolEngine, Vdev, Health, vdev::ErrorStatistics, Reason};
|
||||||
|
|
||||||
use prometheus::{Encoder, IntCounter, Registry, IntGauge};
|
use prometheus::{Encoder, IntCounter, Registry};
|
||||||
|
|
||||||
use clap::Parser;
|
use clap::Parser;
|
||||||
|
|
||||||
|
@ -20,10 +20,10 @@ fn encode_metrics(reg: &Registry) -> Result<String, FromUtf8Error> {
|
||||||
String::from_utf8(buffer.clone())
|
String::from_utf8(buffer.clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_intguage(reg: &Registry, name: &str, help: &str, val: u64) -> prometheus::Result<()> {
|
fn register_intcounter(reg: &Registry, name: &str, help: &str, val: u64) -> prometheus::Result<()> {
|
||||||
let gauge = IntGauge::new(name, help)?;
|
let counter = IntCounter::new(name, help)?;
|
||||||
gauge.set(val as i64);
|
counter.inc_by(val);
|
||||||
reg.register(Box::new(gauge))?;
|
reg.register(Box::new(counter))?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -38,7 +38,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Online => 1,
|
Health::Online => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&online_reg, "health", "The health of the device. This is an enum.", online_val)?;
|
register_intcounter(&online_reg, "health", "The health of the device. This is an enum.", online_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("degraded"));
|
labels.insert(String::from("state"), String::from("degraded"));
|
||||||
let degraded_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let degraded_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -46,7 +46,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Degraded => 1,
|
Health::Degraded => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&degraded_reg, "health", "The health of the device. This is an enum.", degraded_val)?;
|
register_intcounter(&degraded_reg, "health", "The health of the device. This is an enum.", degraded_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("faulted"));
|
labels.insert(String::from("state"), String::from("faulted"));
|
||||||
let faulted_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let faulted_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -54,7 +54,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Faulted => 1,
|
Health::Faulted => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&faulted_reg, "health", "The health of the device. This is an enum.", faulted_val)?;
|
register_intcounter(&faulted_reg, "health", "The health of the device. This is an enum.", faulted_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("offline"));
|
labels.insert(String::from("state"), String::from("offline"));
|
||||||
let offline_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let offline_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -62,7 +62,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Offline => 1,
|
Health::Offline => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&offline_reg, "health", "The health of the device. This is an enum.", offline_val)?;
|
register_intcounter(&offline_reg, "health", "The health of the device. This is an enum.", offline_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("available"));
|
labels.insert(String::from("state"), String::from("available"));
|
||||||
let available_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let available_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -70,7 +70,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Available => 1,
|
Health::Available => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&available_reg, "health", "The health of the device. This is an enum.", available_val)?;
|
register_intcounter(&available_reg, "health", "The health of the device. This is an enum.", available_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("unavailable"));
|
labels.insert(String::from("state"), String::from("unavailable"));
|
||||||
let unavailable_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let unavailable_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -78,7 +78,7 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Unavailable => 1,
|
Health::Unavailable => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&unavailable_reg, "health", "The health of the device. This is an enum.", unavailable_val)?;
|
register_intcounter(&unavailable_reg, "health", "The health of the device. This is an enum.", unavailable_val)?;
|
||||||
|
|
||||||
labels.insert(String::from("state"), String::from("removed"));
|
labels.insert(String::from("state"), String::from("removed"));
|
||||||
let removed_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
let removed_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone()))?;
|
||||||
|
@ -86,15 +86,15 @@ fn register_health(labels: HashMap<String, String>, health: Health) -> prometheu
|
||||||
Health::Removed => 1,
|
Health::Removed => 1,
|
||||||
_ => 0,
|
_ => 0,
|
||||||
};
|
};
|
||||||
register_intguage(&removed_reg, "health", "The health of the device. This is an enum.", removed_val)?;
|
register_intcounter(&removed_reg, "health", "The health of the device. This is an enum.", removed_val)?;
|
||||||
|
|
||||||
Ok(vec![online_reg, degraded_reg, faulted_reg, offline_reg, available_reg, unavailable_reg, removed_reg])
|
Ok(vec![online_reg, degraded_reg, faulted_reg, offline_reg, available_reg, unavailable_reg, removed_reg])
|
||||||
}
|
}
|
||||||
|
|
||||||
fn register_error_stats(reg: &Registry, error_stats: ErrorStatistics) -> prometheus::Result<()> {
|
fn register_error_stats(reg: &Registry, error_stats: ErrorStatistics) -> prometheus::Result<()> {
|
||||||
register_intguage(reg, "read_errors", "The amount of I/O errors that occurred during reading", error_stats.read)?;
|
register_intcounter(reg, "read_errors", "The amount of I/O errors that occurred during reading", error_stats.read)?;
|
||||||
register_intguage(reg, "write_errors", "The amount of I/O errors that occurred during writing", error_stats.write)?;
|
register_intcounter(reg, "write_errors", "The amount of I/O errors that occurred during writing", error_stats.write)?;
|
||||||
register_intguage(reg, "checksum_errors", "The amount of checksum errors, meaning the device returned corrupted data from a read request", error_stats.checksum)?;
|
register_intcounter(reg, "checksum_errors", "The amount of checksum errors, meaning the device returned corrupted data from a read request", error_stats.checksum)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -109,7 +109,7 @@ fn register_vdev_stats(vdev: &Vdev, vdev_device: &Device, vdev_name: String, sta
|
||||||
vdev_device.io_stats.collect_metrics(&vdev_reg)?;
|
vdev_device.io_stats.collect_metrics(&vdev_reg)?;
|
||||||
register_error_stats(&vdev_reg, vdev.error_statistics().clone())?;
|
register_error_stats(&vdev_reg, vdev.error_statistics().clone())?;
|
||||||
|
|
||||||
register_intguage(&vdev_reg, "disk_count", "Total count of drives in this pool or vdev", vdev.disks().len() as u64)?;
|
register_intcounter(&vdev_reg, "disk_count", "Total count of drives in this pool or vdev", vdev.disks().len() as u64)?;
|
||||||
|
|
||||||
Ok(vdev_reg)
|
Ok(vdev_reg)
|
||||||
}
|
}
|
||||||
|
@ -149,8 +149,8 @@ async fn metrics_endpoint() -> impl Responder {
|
||||||
// Create a registry for general pool metrics
|
// Create a registry for general pool metrics
|
||||||
let pool_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone())).unwrap();
|
let pool_reg = Registry::new_custom(Some("zfs".to_string()), Some(labels.clone())).unwrap();
|
||||||
|
|
||||||
register_intguage(&pool_reg, "vdev_count", "Count of vdevs in this pool", pool.vdevs().len() as u64).unwrap();
|
register_intcounter(&pool_reg, "vdev_count", "Count of vdevs in this pool", pool.vdevs().len() as u64).unwrap();
|
||||||
register_intguage(&pool_reg, "spare_count", "The amount of spare drives", pool.spares().len() as u64).unwrap();
|
register_intcounter(&pool_reg, "spare_count", "The amount of spare drives", pool.spares().len() as u64).unwrap();
|
||||||
|
|
||||||
// Calculate the total drive count and register it as a metric.
|
// Calculate the total drive count and register it as a metric.
|
||||||
let total_disk_count = IntCounter::new("disk_count", "Total count of drives in this pool or vdev").unwrap();
|
let total_disk_count = IntCounter::new("disk_count", "Total count of drives in this pool or vdev").unwrap();
|
||||||
|
@ -186,55 +186,26 @@ async fn metrics_endpoint() -> impl Responder {
|
||||||
let output = String::from_utf8(output.stdout)
|
let output = String::from_utf8(output.stdout)
|
||||||
.expect(&format!("Failure to convert output of `zpool iostat` to utf8."));
|
.expect(&format!("Failure to convert output of `zpool iostat` to utf8."));
|
||||||
|
|
||||||
let mut devices = Device::parse_from_stdout(output);
|
let devices = Device::parse_from_stdout(output);
|
||||||
|
|
||||||
// Get the pool from the devices and collect the io stats
|
// Get the pool from the devices and collect the io stats
|
||||||
if let Some(pool_dev) = devices.iter_mut().find(|dev| dev.name == pool.name().clone()) {
|
if let Some(pool_dev) = devices.iter().find(|dev| dev.name == pool.name().clone()) {
|
||||||
|
pool_dev.io_stats.collect_metrics(&pool_reg).unwrap();
|
||||||
|
|
||||||
// Get the raw size of the pool.
|
// Get the raw size of the pool.
|
||||||
let output = String::from_utf8(
|
let output = String::from_utf8(
|
||||||
Command::new("zpool")
|
Command::new("zpool")
|
||||||
.args(["list", "-Hpv", pool.name().as_str()])
|
.args(["list", "-Hp", pool.name().as_str()])
|
||||||
.output()
|
.output()
|
||||||
.expect(&format!("Failure to execute `zpool iostat {} -v 1 2`", pool.name()))
|
.expect(&format!("Failure to execute `zpool iostat {} -v 1 2`", pool.name()))
|
||||||
.stdout).expect(&format!("Failure to convert output of `zpool iostat {} -v 1 2` to utf8.", pool.name()));
|
.stdout).expect(&format!("Failure to convert output of `zpool iostat {} -v 1 2` to utf8.", pool.name()));
|
||||||
|
|
||||||
// Extract the size from the output
|
// Extract the size from the output
|
||||||
|
let cols: Vec<&str> = output.split("\t").collect();
|
||||||
let mut lines = output.split("\n");
|
if cols.len() == 11 {
|
||||||
{
|
let size: u64 = cols[1].parse().unwrap();
|
||||||
let line = lines.next().unwrap();
|
register_intcounter(&pool_reg, "raw_size", "The raw size of this device (this is not the usable space)", size).unwrap();
|
||||||
let cols: Vec<&str> = line.split("\t").collect();
|
|
||||||
|
|
||||||
// make sure this line is actually a pool
|
|
||||||
if cols.len() == 11 {
|
|
||||||
let size: u64 = cols[1].parse().unwrap();
|
|
||||||
register_intguage(&pool_reg, "raw_size", "The raw size of this device (this is not the usable space)", size).unwrap();
|
|
||||||
|
|
||||||
let frag = cols[6].parse::<u64>().unwrap();
|
|
||||||
pool_dev.io_stats.frag = Some(frag);
|
|
||||||
} else {
|
|
||||||
panic!("Failure to parse pool")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for line in lines {
|
|
||||||
let cols: Vec<&str> = line.split("\t").collect();
|
|
||||||
|
|
||||||
// Check if this line is correct
|
|
||||||
if cols.len() == 10 {
|
|
||||||
let name = cols[0];
|
|
||||||
|
|
||||||
if let Some(device) = devices.iter_mut()
|
|
||||||
.find(|dev| dev.name == name && dev.is_pool_or_vdev()) {
|
|
||||||
let frag = cols[6].parse::<u64>().unwrap();
|
|
||||||
device.io_stats.frag = Some(frag);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Collect pool io stats into registry
|
|
||||||
let pool_dev = devices.iter_mut().find(|dev| dev.name == pool.name().clone()).unwrap();
|
|
||||||
pool_dev.io_stats.collect_metrics(&pool_reg).unwrap();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Push pool metrics
|
// Push pool metrics
|
||||||
|
@ -306,7 +277,6 @@ async fn metrics_endpoint() -> impl Responder {
|
||||||
struct IoStats {
|
struct IoStats {
|
||||||
capacity: Option<u64>,
|
capacity: Option<u64>,
|
||||||
available: Option<u64>,
|
available: Option<u64>,
|
||||||
frag: Option<u64>,
|
|
||||||
|
|
||||||
read_op: u64,
|
read_op: u64,
|
||||||
write_op: u64,
|
write_op: u64,
|
||||||
|
@ -316,11 +286,10 @@ struct IoStats {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IoStats {
|
impl IoStats {
|
||||||
fn new(capacity: Option<u64>, available: Option<u64>, frag: Option<u64>, read_op: u64, write_op: u64, read_band: u64, write_band: u64) -> Self {
|
fn new(capacity: Option<u64>, available: Option<u64>, read_op: u64, write_op: u64, read_band: u64, write_band: u64) -> Self {
|
||||||
Self {
|
Self {
|
||||||
capacity,
|
capacity,
|
||||||
available,
|
available,
|
||||||
frag,
|
|
||||||
read_op,
|
read_op,
|
||||||
write_op,
|
write_op,
|
||||||
read_band,
|
read_band,
|
||||||
|
@ -329,17 +298,15 @@ impl IoStats {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn collect_metrics(&self, reg: &Registry) -> prometheus::Result<()> {
|
fn collect_metrics(&self, reg: &Registry) -> prometheus::Result<()> {
|
||||||
// These will always be Some at the same time, no mix match
|
if let (Some(capacity), Some(available)) = (self.capacity, self.available) {
|
||||||
if let (Some(capacity), Some(available), Some(frag)) = (self.capacity, self.available, self.frag) {
|
register_intcounter(&reg, "capacity", "The capacity of the device in bytes", capacity)?;
|
||||||
register_intguage(&reg, "capacity", "The capacity of the device in bytes", capacity)?;
|
register_intcounter(&reg, "available", "The available bytes in the device", available)?;
|
||||||
register_intguage(&reg, "available", "The available bytes in the device", available)?;
|
|
||||||
register_intguage(&reg, "fragmentation", "The percentage (0-100) of fragmentation of the device", frag)?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
register_intguage(&reg, "read_operations", "The read operations for this device per second", self.read_op)?;
|
register_intcounter(&reg, "read_operations", "The read operations for this device per second", self.read_op)?;
|
||||||
register_intguage(&reg, "write_operations", "The write operations for this device per second", self.write_op)?;
|
register_intcounter(&reg, "write_operations", "The write operations for this device per second", self.write_op)?;
|
||||||
register_intguage(&reg, "read_bandwidth", "The read bandwidth for this device in bytes per second", self.read_band)?;
|
register_intcounter(&reg, "read_bandwidth", "The read bandwidth for this device in bytes per second", self.read_band)?;
|
||||||
register_intguage(&reg, "write_bandwidth", "The write bandwidth for this device in bytes per second", self.write_band)?;
|
register_intcounter(&reg, "write_bandwidth", "The write bandwidth for this device in bytes per second", self.write_band)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -395,7 +362,7 @@ impl Device {
|
||||||
};
|
};
|
||||||
|
|
||||||
parsed.push(Device::new(String::from(name),
|
parsed.push(Device::new(String::from(name),
|
||||||
IoStats::new(alloc, free, None, read_op, write_op, read_band, write_band)));
|
IoStats::new(alloc, free, read_op, write_op, read_band, write_band)));
|
||||||
}
|
}
|
||||||
|
|
||||||
return parsed;
|
return parsed;
|
||||||
|
|
Loading…
Reference in New Issue