- Fix silent failure: script now posts to dedicated zfs-health ntfy topic
instead of grafana-alerts catch-all (pools were offline 12+ hours
undetected because alerts were buried in Grafana noise)
- Three explicit states: ONLINE (silent), DEGRADED (high priority),
MISSING (urgent priority) — empty zpool list output is now a MISSING
alert, not silently ignored
- Textfile metrics written atomically after loop completes only:
zfs_pool_present{pool=X} 0|1 and zfs_health_last_run_seconds
- Added trap cleanup so mid-script crash leaves previous .prom intact
- Logs each pool state to syslog via logger -t zfs-health-check
- Remove duplicate cron entry running as tommy (was firing twice per tick)
- Enable node_exporter textfile collector for Prometheus scraping
Incident: usb1-zfs and usb2-zfs offline since PBS boot (missing cachefile).
Imported and cachefile regenerated in this session. No data errors.
Refs: 2026-05-05 health check CRITICAL C1/C4
68 lines · 2.6 KiB · Bash · Executable File
#!/bin/bash
# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
# Runs every 5 min via root crontab on PBS.
# Three states: ONLINE (silent), DEGRADED/FAULTED (ntfy high), MISSING (ntfy urgent).
# Writes Prometheus textfile metrics after full loop completes only.
#
# NOTE: 'set -e' is deliberately omitted — a missing pool makes 'zpool list'
# exit non-zero, and that is the condition we detect, not a crash.
set -u -o pipefail

NTFY_URL="https://ntfy.goattw.net/zfs-health"
EXPECTED_POOLS=(usb1-zfs usb2-zfs)
TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"

# Create the tmpfile in the SAME directory as the final path: mv(1) is only an
# atomic rename() within one filesystem. A /tmp tmpfile can live on a different
# filesystem, turning the mv into copy+unlink and letting node_exporter scrape
# a half-written file.
TEXTFILE_DIR="$(dirname "$TEXTFILE")"
mkdir -p "$TEXTFILE_DIR" || exit 1
TMPFILE=$(mktemp "$TEXTFILE_DIR/zfs_health.XXXXXX") || exit 1

# On crash, clean up tmpfile — do NOT write partial results to final path
trap 'rm -f "$TMPFILE"' EXIT

logger -t zfs-health-check "starting pool health check"

# Accumulate metric lines; written atomically only after full loop.
# HELP matches actual semantics: DEGRADED/FAULTED pools are imported and
# therefore still report 1 — only non-imported pools report 0.
METRICS=""
METRICS+="# HELP zfs_pool_present 1 if pool imported, 0 if missing\n"
METRICS+="# TYPE zfs_pool_present gauge\n"
METRICS+="# HELP zfs_health_last_run_seconds Unix timestamp of last successful check\n"
METRICS+="# TYPE zfs_health_last_run_seconds gauge\n"

for pool in "${EXPECTED_POOLS[@]}"; do
  # Non-imported pool: zpool exits non-zero and prints nothing -> $status empty
  status=$(zpool list -H -o health "$pool" 2>/dev/null)

  if [[ "$status" == "ONLINE" ]]; then
    logger -t zfs-health-check "pool $pool: ONLINE"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"

  elif [[ "$status" =~ ^(DEGRADED|FAULTED|REMOVED|UNAVAIL)$ ]]; then
    # Pool imported but unhealthy — still "present" for the metric.
    logger -t zfs-health-check "pool $pool: $status — alerting"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"
    # -m 10: never let a stalled ntfy endpoint hang the cron job
    curl -sf -m 10 \
      -H "Priority: high" \
      -H "Tags: warning" \
      -H "Title: PBS ZFS pool degraded: $pool" \
      -d "Pool $pool is $status on PBS — check immediately" \
      "$NTFY_URL" \
      || logger -t zfs-health-check "WARNING: ntfy post failed for $pool"

  else
    # Empty or error — pool not imported / missing
    logger -t zfs-health-check "pool $pool: MISSING — alerting"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 0\n"
    curl -sf -m 10 \
      -H "Priority: urgent" \
      -H "Tags: rotating_light" \
      -H "Title: PBS ZFS pool MISSING: $pool" \
      -d "Pool $pool not imported on PBS. Check cables. Fix: zpool import $pool" \
      "$NTFY_URL" \
      || logger -t zfs-health-check "WARNING: ntfy post failed for $pool"
  fi
done

# Loop completed — write timestamp and atomically replace textfile
METRICS+="zfs_health_last_run_seconds $(date +%s)\n"
printf "%b" "$METRICS" > "$TMPFILE"

# chmod BEFORE the rename: mktemp creates mode 600, and node_exporter must be
# able to read the file from the instant it appears at the final path.
chmod 644 "$TMPFILE"
mv "$TMPFILE" "$TEXTFILE"

# Disarm cleanup trap — mv succeeded
trap - EXIT

logger -t zfs-health-check "completed — metrics written to $TEXTFILE"
exit 0