pbs: rewrite zfs-health-check.sh, enable textfile collector
- Fix silent failure: script now posts to dedicated zfs-health ntfy topic
instead of grafana-alerts catch-all (pools were offline 12+ hours
undetected because alerts were buried in Grafana noise)
- Three explicit states: ONLINE (silent), DEGRADED/FAULTED/REMOVED/UNAVAIL
(high priority), MISSING (urgent priority) — empty zpool list output is
now a MISSING alert, not silently ignored
- Textfile metrics are written atomically, and only after the loop completes:
zfs_pool_present{pool=X} 0|1 and zfs_health_last_run_seconds
- Added trap cleanup so mid-script crash leaves previous .prom intact
- Logs each pool state to syslog via logger -t zfs-health-check
- Remove duplicate cron entry running as tommy (was firing twice per tick)
- Enable node_exporter textfile collector for Prometheus scraping
Incident: usb1-zfs and usb2-zfs offline since PBS boot (missing cachefile).
Imported and cachefile regenerated in this session. No data errors.
Refs: 2026-05-05 health check CRITICAL C1/C4
This commit is contained in:
11
pbs/node_exporter.service
Normal file
11
pbs/node_exporter.service
Normal file
@@ -0,0 +1,11 @@
|
||||
# systemd unit: Prometheus node_exporter with the textfile collector enabled.

[Unit]
Description=Prometheus Node Exporter
# Ordered after network.target (ordering only; no Wants=/Requires= is declared).
After=network.target

[Service]
Type=simple
# The textfile collector directory is passed on the command line; the
# trailing backslash joins the two lines into a single ExecStart value.
ExecStart=/usr/local/bin/node_exporter \
    --collector.textfile.directory=/var/lib/node_exporter/textfile
# Restart the exporter only when it exits unsuccessfully.
Restart=on-failure

[Install]
WantedBy=multi-user.target
|
||||
67
pbs/zfs-health-check.sh
Executable file
67
pbs/zfs-health-check.sh
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/bin/bash
# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
# Runs every 5 min via root crontab on PBS.
#
# For every pool in EXPECTED_POOLS the health reported by `zpool list` is
# classified into one of three states:
#   ONLINE     logged only, no notification
#   unhealthy  any other reported health (DEGRADED, FAULTED, REMOVED,
#              UNAVAIL, ...): ntfy notification with priority "high"
#   MISSING    zpool prints nothing for the pool (not imported / error):
#              ntfy notification with priority "urgent"
#
# Prometheus textfile metrics are written only after the full loop completes:
#   zfs_pool_present{pool="..."}   1 if the pool is imported, 0 if missing
#   zfs_health_last_run_seconds    Unix timestamp of the last completed run
#
# Exit status: 0 when the metrics file was replaced, 1 when the check could
# not run or the metrics file could not be written. In the failure case the
# previous .prom file is left untouched, so zfs_health_last_run_seconds
# going stale is the signal that the check itself is broken.

set -u

# cron may run jobs with a minimal PATH that omits the sbin directories where
# zpool is usually installed; without them every pool would look MISSING.
PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin${PATH:+:$PATH}"
export PATH

readonly NTFY_URL="https://ntfy.goattw.net/zfs-health"
readonly -a EXPECTED_POOLS=(usb1-zfs usb2-zfs)
readonly TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"
readonly LOG_TAG="zfs-health-check"
# Upper bound (seconds) for one ntfy request so a hung server cannot stall
# the job until the next cron tick.
readonly CURL_MAX_TIME=20

# Write a message to syslog under the script's tag.
log() {
  logger -t "$LOG_TAG" "$*"
}

# Log an error to syslog and stderr, then exit 1. The EXIT trap (once armed)
# removes the temp file, so the previous metrics file stays in place.
die() {
  log "ERROR: $*"
  printf '%s: %s\n' "$LOG_TAG" "$*" >&2
  exit 1
}

# Post one notification to ntfy.
# Arguments: $1 priority, $2 tags, $3 title, $4 message body
# Returns:   curl's exit status (non-zero on HTTP error or timeout)
notify() {
  local priority=$1 tags=$2 title=$3 body=$4
  # -o /dev/null: the response body is not needed and would otherwise land
  # on stdout.
  curl -sf --max-time "$CURL_MAX_TIME" -o /dev/null \
    -H "Priority: $priority" \
    -H "Tags: $tags" \
    -H "Title: $title" \
    -d "$body" \
    "$NTFY_URL"
}

log "starting pool health check"

# Without zpool nothing below is meaningful; say so loudly instead of
# reporting every pool as MISSING.
if ! command -v zpool >/dev/null 2>&1; then
  notify urgent rotating_light \
    "PBS ZFS health check broken" \
    "zpool command not found on PBS — pool state unknown" \
    || log "WARNING: ntfy post failed for missing zpool command"
  die "zpool command not found in PATH"
fi

# The temp file lives in the target directory so the final mv is a rename on
# the same filesystem. Its name must not end in .prom: the textfile collector
# reads *.prom files and would otherwise pick up the half-written file.
textfile_dir=$(dirname "$TEXTFILE")
mkdir -p "$textfile_dir" || die "cannot create $textfile_dir"
TMPFILE=$(mktemp "$textfile_dir/.zfs_health.prom.XXXXXX") \
  || die "mktemp failed in $textfile_dir"

# On any early exit, clean up tmpfile — do NOT write partial results to
# the final path.
trap 'rm -f "$TMPFILE"' EXIT

# Metric lines are accumulated here and written in one go after the loop.
metrics=(
  "# HELP zfs_pool_present 1 if pool is imported (any health state), 0 if missing"
  "# TYPE zfs_pool_present gauge"
)

for pool in "${EXPECTED_POOLS[@]}"; do
  # stderr is discarded on purpose: an unknown pool makes zpool print an
  # error and nothing on stdout, which is handled as MISSING below.
  status=$(zpool list -H -o health "$pool" 2>/dev/null)

  case "$status" in
    ONLINE)
      log "pool $pool: ONLINE"
      present=1
      ;;
    "")
      # Empty or error — pool not imported / missing
      log "pool $pool: MISSING — alerting"
      present=0
      notify urgent rotating_light \
        "PBS ZFS pool MISSING: $pool" \
        "Pool $pool not imported on PBS. Check cables. Fix: zpool import $pool" \
        || log "WARNING: ntfy post failed for $pool"
      ;;
    *)
      # zpool reported a health value other than ONLINE, so the pool is
      # imported but unhealthy; it must not be reported as missing.
      log "pool $pool: $status — alerting"
      present=1
      notify high warning \
        "PBS ZFS pool degraded: $pool" \
        "Pool $pool is $status on PBS — check immediately" \
        || log "WARNING: ntfy post failed for $pool"
      ;;
  esac

  metrics+=("zfs_pool_present{pool=\"$pool\"} $present")
done

# Loop completed — append the timestamp metric, keeping each metric's
# HELP/TYPE lines directly in front of its samples.
metrics+=(
  "# HELP zfs_health_last_run_seconds Unix timestamp of last successful check"
  "# TYPE zfs_health_last_run_seconds gauge"
  "zfs_health_last_run_seconds $(date +%s)"
)

printf '%s\n' "${metrics[@]}" > "$TMPFILE" || die "cannot write $TMPFILE"
# mktemp creates the file with mode 0600; fix the mode before the rename so
# the published file never carries the restrictive mode.
chmod 644 "$TMPFILE" || die "cannot chmod $TMPFILE"
mv "$TMPFILE" "$TEXTFILE" || die "cannot replace $TEXTFILE"

# Disarm cleanup trap — mv succeeded
trap - EXIT

log "completed — metrics written to $TEXTFILE"
exit 0
|
||||
Reference in New Issue
Block a user