#!/bin/bash
# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
# Runs every 5 min via root crontab on PBS.
#
# Per-pool states, derived from `zpool list -H -o health <pool>`:
#   ONLINE                             -> silent,       zfs_pool_present = 1
#   DEGRADED|FAULTED|REMOVED|UNAVAIL   -> ntfy high,    zfs_pool_present = 1
#   anything else (including empty)    -> ntfy urgent,  zfs_pool_present = 0
#
# Alerts are sent first; Prometheus textfile metrics are written only after
# the full loop completes. The metrics file is staged in the SAME directory as
# the final path and renamed into place, so the collector never sees a
# partially written file.
#
# Exit status: 0 on success, 1 if the metrics file could not be written.

set -u

readonly LOG_TAG="zfs-health-check"
readonly NTFY_URL="https://ntfy.goattw.net/zfs-health"
# Upper bound (seconds) for one ntfy request, so a hung endpoint cannot stall
# the run and make 5-minute cron invocations pile up.
readonly NTFY_TIMEOUT=15
readonly -a EXPECTED_POOLS=(usb1-zfs usb2-zfs)
readonly TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"

# Path of the staged metrics file; empty whenever there is nothing to clean up.
TMPFILE=""

# Remove a leftover staging file on any exit path. Partial results are never
# written to the final path.
cleanup() {
  if [[ -n "$TMPFILE" ]]; then
    rm -f -- "$TMPFILE"
  fi
}
trap cleanup EXIT

# Write one message to syslog under this script's tag.
# Arguments: $* - message text
log() {
  logger -t "$LOG_TAG" "$*"
}

# Post one alert to ntfy. Delivery is best-effort: a failed post is logged
# and the run continues.
# Arguments: $1 pool name, $2 ntfy priority, $3 ntfy tags, $4 title, $5 body
notify() {
  local pool=$1 priority=$2 tags=$3 title=$4 body=$5
  curl -sf --max-time "$NTFY_TIMEOUT" \
    -H "Priority: $priority" \
    -H "Tags: $tags" \
    -H "Title: $title" \
    -d "$body" \
    "$NTFY_URL" \
    || log "WARNING: ntfy post failed for $pool"
}

# Publish the given lines as the textfile, replacing any previous version.
# Arguments: $@ - metric lines, one per argument
# Returns:   0 on success, 1 if any step failed (final path left untouched)
write_metrics() {
  mkdir -p -- "${TEXTFILE%/*}" || return 1

  # Stage next to the final path so that mv is a same-filesystem rename.
  # The staged name ends in ".XXXXXX", not ".prom", so a collector globbing
  # *.prom does not read it.
  if ! TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX"); then
    TMPFILE=""
    return 1
  fi

  printf '%s\n' "$@" > "$TMPFILE" || return 1
  # mktemp creates the file mode 0600; open it up BEFORE publishing so the
  # final path is never visible with restrictive permissions.
  chmod 644 -- "$TMPFILE" || return 1
  mv -f -- "$TMPFILE" "$TEXTFILE" || return 1

  # Staged file has been renamed away — nothing left for cleanup to remove.
  TMPFILE=""
}

main() {
  local pool status
  local -a metric_lines

  log "starting pool health check"

  metric_lines=(
    "# HELP zfs_pool_present 1 if pool is imported (healthy or not), 0 if missing"
    "# TYPE zfs_pool_present gauge"
  )

  for pool in "${EXPECTED_POOLS[@]}"; do
    # stderr is discarded on purpose: a pool that is not imported makes zpool
    # fail with an error, which is handled below as the "missing" case.
    status=$(zpool list -H -o health "$pool" 2>/dev/null)

    case "$status" in
      ONLINE)
        log "pool $pool: ONLINE"
        metric_lines+=("zfs_pool_present{pool=\"$pool\"} 1")
        ;;
      DEGRADED | FAULTED | REMOVED | UNAVAIL)
        log "pool $pool: $status — alerting"
        metric_lines+=("zfs_pool_present{pool=\"$pool\"} 1")
        notify "$pool" "high" "warning" \
          "PBS ZFS pool degraded: $pool" \
          "Pool $pool is $status on PBS — check immediately"
        ;;
      *)
        # Empty or error — pool not imported / missing.
        # NOTE(review): any other non-empty health value zpool may report
        # (e.g. SUSPENDED or OFFLINE) also lands here and is alerted as
        # MISSING — confirm that is the intended handling.
        log "pool $pool: MISSING — alerting"
        metric_lines+=("zfs_pool_present{pool=\"$pool\"} 0")
        notify "$pool" "urgent" "rotating_light" \
          "PBS ZFS pool MISSING: $pool" \
          "Pool $pool not imported on PBS. Check cables. "$'\n'"Fix: zpool import $pool"
        ;;
    esac
  done

  # Loop completed — record the timestamp. HELP/TYPE are emitted directly
  # before the sample so each metric's lines form one contiguous group.
  metric_lines+=(
    "# HELP zfs_health_last_run_seconds Unix timestamp of last successful check"
    "# TYPE zfs_health_last_run_seconds gauge"
    "zfs_health_last_run_seconds $(date +%s)"
  )

  if ! write_metrics "${metric_lines[@]}"; then
    log "ERROR: failed to write metrics to $TEXTFILE"
    exit 1
  fi

  log "completed — metrics written to $TEXTFILE"
  exit 0
}

main "$@"