pbs: rewrite zfs-health-check.sh, enable textfile collector

- Fix silent failure: script now posts to dedicated zfs-health ntfy topic
  instead of grafana-alerts catch-all (pools were offline 12+ hours
  undetected because alerts were buried in Grafana noise)
- Three explicit states: ONLINE (silent), DEGRADED (high priority),
  MISSING (urgent priority) — empty zpool list output is now a MISSING
  alert, not silently ignored
- Textfile metrics written atomically, only after the full loop completes:
  zfs_pool_present{pool=X} 0|1 and zfs_health_last_run_seconds
- Added trap cleanup so mid-script crash leaves previous .prom intact
- Logs each pool state to syslog via logger -t zfs-health-check
- Remove duplicate cron entry running as tommy (was firing twice per tick)
- Enable node_exporter textfile collector for Prometheus scraping
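The accumulate-then-rename pattern described in the bullets can be sketched in isolation (DEST_DIR is a stand-in for the real textfile directory; pool names are from this repo):

```shell
#!/bin/bash
# Sketch of the accumulate-then-rename pattern from the bullets above.
# DEST_DIR stands in for /var/lib/node_exporter/textfile.
set -eu
DEST_DIR=$(mktemp -d)
TEXTFILE="$DEST_DIR/zfs_health.prom"
# tmpfile in the same directory, so mv is an atomic rename on one filesystem
TMPFILE=$(mktemp "$DEST_DIR/zfs_health.XXXXXX")
trap 'rm -f "$TMPFILE"' EXIT   # crash mid-run: tmpfile removed, old .prom untouched
METRICS=""
for pool in usb1-zfs usb2-zfs; do
    METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"
done
METRICS+="zfs_health_last_run_seconds $(date +%s)\n"
printf "%b" "$METRICS" > "$TMPFILE"   # nothing hits the final path until here
mv "$TMPFILE" "$TEXTFILE"
trap - EXIT
grep -c '^zfs_pool_present' "$TEXTFILE"   # prints 2
```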

Incident: usb1-zfs and usb2-zfs offline since PBS boot (missing cachefile).
Imported and cachefile regenerated in this session. No data errors.
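For the record, the recovery amounted to re-importing both pools and re-registering them in the cachefile (a sketch, assuming the default /etc/zfs/zpool.cache path):

```shell
# Re-import the pools that were absent from the cachefile
zpool import usb1-zfs
zpool import usb2-zfs
# Point them at the default cachefile so they auto-import at boot
zpool set cachefile=/etc/zfs/zpool.cache usb1-zfs
zpool set cachefile=/etc/zfs/zpool.cache usb2-zfs
zpool status -x   # sanity check; expect "all pools are healthy"
```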

Refs: 2026-05-05 health check CRITICAL C1/C4
Author: tommy
Date:   2026-05-05 19:44:28 -05:00
parent 331820b8de
commit 23194ed22a

2 changed files with 78 additions and 0 deletions

pbs/node_exporter.service Normal file

@@ -0,0 +1,11 @@
[Unit]
Description=Prometheus Node Exporter
After=network.target

[Service]
Type=simple
ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile
Restart=on-failure

[Install]
WantedBy=multi-user.target
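Activating the unit follows the standard systemd workflow; the grep at the end assumes node_exporter's default port 9100:

```shell
# Reload unit files, then enable and start node_exporter in one step
systemctl daemon-reload
systemctl enable --now node_exporter
# Confirm the textfile metrics are being scraped (9100 is the default port)
curl -s localhost:9100/metrics | grep -E '^zfs_(pool_present|health_last_run)'
```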

pbs/zfs-health-check.sh Executable file

@@ -0,0 +1,67 @@
#!/bin/bash
# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
# Runs every 5 min via root crontab on PBS.
# Three states: ONLINE (silent), DEGRADED/FAULTED (ntfy high), MISSING (ntfy urgent).
# Writes Prometheus textfile metrics after full loop completes only.
NTFY_URL="https://ntfy.goattw.net/zfs-health"
EXPECTED_POOLS=(usb1-zfs usb2-zfs)
TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"
mkdir -p "$(dirname "$TEXTFILE")"
# Create the tmpfile in the target directory so the final mv is an atomic
# rename(2); a tmpfile on /tmp could sit on a different filesystem, making
# mv a non-atomic copy. No .prom suffix, so the collector ignores it.
TMPFILE=$(mktemp "$(dirname "$TEXTFILE")/zfs_health.XXXXXX")
# On crash, clean up tmpfile — do NOT write partial results to final path
trap 'rm -f "$TMPFILE"' EXIT
logger -t zfs-health-check "starting pool health check"
# Accumulate metric lines; written atomically only after full loop
METRICS=""
METRICS+="# HELP zfs_pool_present 1 if pool imported (any health state), 0 if not imported\n"
METRICS+="# TYPE zfs_pool_present gauge\n"
METRICS+="# HELP zfs_health_last_run_seconds Unix timestamp of last successful check\n"
METRICS+="# TYPE zfs_health_last_run_seconds gauge\n"
for pool in "${EXPECTED_POOLS[@]}"; do
status=$(zpool list -H -o health "$pool" 2>/dev/null)
if [[ "$status" == "ONLINE" ]]; then
logger -t zfs-health-check "pool $pool: ONLINE"
METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"
elif [[ "$status" =~ ^(DEGRADED|FAULTED|REMOVED|UNAVAIL)$ ]]; then
logger -t zfs-health-check "pool $pool: $status — alerting"
METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"
curl -sf \
-H "Priority: high" \
-H "Tags: warning" \
-H "Title: PBS ZFS pool degraded: $pool" \
-d "Pool $pool is $status on PBS — check immediately" \
"$NTFY_URL" \
|| logger -t zfs-health-check "WARNING: ntfy post failed for $pool"
else
# Empty or error — pool not imported / missing
logger -t zfs-health-check "pool $pool: MISSING — alerting"
METRICS+="zfs_pool_present{pool=\"$pool\"} 0\n"
curl -sf \
-H "Priority: urgent" \
-H "Tags: rotating_light" \
-H "Title: PBS ZFS pool MISSING: $pool" \
-d "Pool $pool not imported on PBS. Check cables. Fix: zpool import $pool" \
"$NTFY_URL" \
|| logger -t zfs-health-check "WARNING: ntfy post failed for $pool"
fi
done
# Loop completed — write timestamp and atomically replace textfile
METRICS+="zfs_health_last_run_seconds $(date +%s)\n"
printf "%b" "$METRICS" > "$TMPFILE"
chmod 644 "$TMPFILE"   # mktemp creates 0600; node_exporter must be able to read it
mkdir -p "$(dirname "$TEXTFILE")"
mv "$TMPFILE" "$TEXTFILE"
# Disarm cleanup trap — mv succeeded
trap - EXIT
logger -t zfs-health-check "completed — metrics written to $TEXTFILE"
exit 0
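The single remaining root crontab entry looks like this (illustrative; matches the 5-minute cadence stated in the header comment):

```shell
# root crontab (crontab -e as root); the duplicate entry under the
# tommy user was removed so this fires exactly once per tick
*/5 * * * * /usr/local/bin/zfs-health-check.sh
```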