- Fix silent failure: script now posts to dedicated zfs-health ntfy topic
instead of grafana-alerts catch-all (pools were offline 12+ hours
undetected because alerts were buried in Grafana noise)
- Three explicit states: ONLINE (silent), DEGRADED (high priority),
MISSING (urgent priority) — empty zpool list output is now a MISSING
alert, not silently ignored
- Textfile metrics written atomically after loop completes only:
zfs_pool_present{pool=X} 0|1 and zfs_health_last_run_seconds
- Added trap cleanup so mid-script crash leaves previous .prom intact
- Logs each pool state to syslog via logger -t zfs-health-check
- Remove duplicate cron entry running as tommy (was firing twice per tick)
- Enable node_exporter textfile collector for Prometheus scraping
Incident: usb1-zfs and usb2-zfs offline since PBS boot (missing cachefile).
Imported and cachefile regenerated in this session. No data errors.
Refs: 2026-05-05 health check CRITICAL C1/C4
68 lines · 2.6 KiB · Bash · Executable File
#!/bin/bash
# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
# Runs every 5 min via root crontab on PBS.
# Three states: ONLINE (silent), DEGRADED/FAULTED (ntfy high), MISSING (ntfy urgent).
# Writes Prometheus textfile metrics after full loop completes only.
#
# NOTE: 'set -e' is deliberately omitted — a missing pool makes 'zpool list'
# exit non-zero, and that is the condition we detect, not a crash.
set -u -o pipefail

NTFY_URL="https://ntfy.goattw.net/zfs-health"
EXPECTED_POOLS=(usb1-zfs usb2-zfs)
TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"

# Create the tmpfile in the SAME directory as the final path: mv(1) is only an
# atomic rename() within one filesystem. A /tmp tmpfile can live on a different
# filesystem, turning the mv into copy+unlink and letting node_exporter scrape
# a half-written file.
TEXTFILE_DIR="$(dirname "$TEXTFILE")"
mkdir -p "$TEXTFILE_DIR" || exit 1
TMPFILE=$(mktemp "$TEXTFILE_DIR/zfs_health.XXXXXX") || exit 1

# On crash, clean up tmpfile — do NOT write partial results to final path
trap 'rm -f "$TMPFILE"' EXIT

logger -t zfs-health-check "starting pool health check"

# Accumulate metric lines; written atomically only after full loop.
# HELP matches actual semantics: DEGRADED/FAULTED pools are imported and
# therefore still report 1 — only non-imported pools report 0.
METRICS=""
METRICS+="# HELP zfs_pool_present 1 if pool imported, 0 if missing\n"
METRICS+="# TYPE zfs_pool_present gauge\n"
METRICS+="# HELP zfs_health_last_run_seconds Unix timestamp of last successful check\n"
METRICS+="# TYPE zfs_health_last_run_seconds gauge\n"

for pool in "${EXPECTED_POOLS[@]}"; do
  # Non-imported pool: zpool exits non-zero and prints nothing -> $status empty
  status=$(zpool list -H -o health "$pool" 2>/dev/null)

  if [[ "$status" == "ONLINE" ]]; then
    logger -t zfs-health-check "pool $pool: ONLINE"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"

  elif [[ "$status" =~ ^(DEGRADED|FAULTED|REMOVED|UNAVAIL)$ ]]; then
    # Pool imported but unhealthy — still "present" for the metric.
    logger -t zfs-health-check "pool $pool: $status — alerting"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 1\n"
    # -m 10: never let a stalled ntfy endpoint hang the cron job
    curl -sf -m 10 \
      -H "Priority: high" \
      -H "Tags: warning" \
      -H "Title: PBS ZFS pool degraded: $pool" \
      -d "Pool $pool is $status on PBS — check immediately" \
      "$NTFY_URL" \
      || logger -t zfs-health-check "WARNING: ntfy post failed for $pool"

  else
    # Empty or error — pool not imported / missing
    logger -t zfs-health-check "pool $pool: MISSING — alerting"
    METRICS+="zfs_pool_present{pool=\"$pool\"} 0\n"
    curl -sf -m 10 \
      -H "Priority: urgent" \
      -H "Tags: rotating_light" \
      -H "Title: PBS ZFS pool MISSING: $pool" \
      -d "Pool $pool not imported on PBS. Check cables. Fix: zpool import $pool" \
      "$NTFY_URL" \
      || logger -t zfs-health-check "WARNING: ntfy post failed for $pool"
  fi
done

# Loop completed — write timestamp and atomically replace textfile
METRICS+="zfs_health_last_run_seconds $(date +%s)\n"
printf "%b" "$METRICS" > "$TMPFILE"

# chmod BEFORE the rename: mktemp creates mode 600, and node_exporter must be
# able to read the file from the instant it appears at the final path.
chmod 644 "$TMPFILE"
mv "$TMPFILE" "$TEXTFILE"

# Disarm cleanup trap — mv succeeded
trap - EXIT

logger -t zfs-health-check "completed — metrics written to $TEXTFILE"
exit 0