diff --git a/pbs/node_exporter.service b/pbs/node_exporter.service
new file mode 100644
index 0000000..54375ec
--- /dev/null
+++ b/pbs/node_exporter.service
@@ -0,0 +1,11 @@
+[Unit]
+Description=Prometheus Node Exporter
+After=network.target
+
+[Service]
+Type=simple
+ExecStart=/usr/local/bin/node_exporter --collector.textfile.directory=/var/lib/node_exporter/textfile
+Restart=on-failure
+
+[Install]
+WantedBy=multi-user.target
diff --git a/pbs/zfs-health-check.sh b/pbs/zfs-health-check.sh
new file mode 100755
index 0000000..f1d566e
--- /dev/null
+++ b/pbs/zfs-health-check.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+# ZFS pool health check — /usr/local/bin/zfs-health-check.sh
+# Runs every 5 min via root crontab on PBS.
+#
+# Per-pool outcome:
+#   ONLINE                 -> silent
+#   any other health value -> ntfy "high" alert (pool imported but unhealthy)
+#   no output from zpool   -> ntfy "urgent" alert (pool missing / not imported)
+#
+# Prometheus textfile metrics are written only after the full loop completes.
+# They are staged next to the final path and renamed into place, so a reader
+# never sees a partial file.
+#
+# Exit status: 0 on success, 1 if the metrics file could not be written.
+# A failed ntfy post is logged but does not fail the run.
+
+set -u
+
+NTFY_URL="https://ntfy.goattw.net/zfs-health"
+EXPECTED_POOLS=(usb1-zfs usb2-zfs)
+TEXTFILE="/var/lib/node_exporter/textfile/zfs_health.prom"
+LOG_TAG="zfs-health-check"
+# Upper bound in seconds for one ntfy POST, so a hung endpoint cannot stall
+# the run and delay the metrics write.
+NTFY_TIMEOUT=15
+
+# Write one message to syslog under the script's tag.
+log() {
+    logger -t "$LOG_TAG" "$*"
+}
+
+# Log an error and abort; the EXIT trap removes any staged temp file.
+die() {
+    log "ERROR: $*"
+    exit 1
+}
+
+# Post one alert to ntfy. Best effort: a failed post is only logged.
+# Arguments: $1 priority, $2 tags, $3 title, $4 message body, $5 pool name
+notify() {
+    local priority=$1 tags=$2 title=$3 body=$4 pool=$5
+    curl -sf --max-time "$NTFY_TIMEOUT" \
+        -H "Priority: $priority" \
+        -H "Tags: $tags" \
+        -H "Title: $title" \
+        -d "$body" \
+        "$NTFY_URL" \
+        || log "WARNING: ntfy post failed for $pool"
+}
+
+textfile_dir=$(dirname "$TEXTFILE")
+mkdir -p "$textfile_dir" || die "cannot create $textfile_dir"
+
+# Stage inside the destination directory so the final mv is a same-filesystem
+# rename (a temp file under /tmp may live on another filesystem, where mv
+# degrades to copy + delete). The temp name must not end in .prom, otherwise
+# the textfile collector would pick up the half-written staging file.
+TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX") || die "mktemp failed in $textfile_dir"
+# On any early exit, clean up the temp file — never leave partial results.
+trap 'rm -f "$TMPFILE"' EXIT
+
+log "starting pool health check"
+
+# Accumulate metric lines; each metric's HELP/TYPE/samples stay together as
+# one group, as the Prometheus text exposition format requires.
+metrics=(
+    "# HELP zfs_pool_present 1 if pool is imported (any health state), 0 if missing"
+    "# TYPE zfs_pool_present gauge"
+)
+
+for pool in "${EXPECTED_POOLS[@]}"; do
+    # stderr is discarded on purpose: an unknown pool is reported as empty output
+    status=$(zpool list -H -o health "$pool" 2>/dev/null)
+
+    if [[ "$status" == "ONLINE" ]]; then
+        log "pool $pool: ONLINE"
+        metrics+=("zfs_pool_present{pool=\"$pool\"} 1")
+
+    elif [[ -n "$status" ]]; then
+        # Imported but not healthy: DEGRADED, FAULTED, REMOVED, UNAVAIL, and
+        # also OFFLINE / SUSPENDED, which must not be mistaken for "missing".
+        log "pool $pool: $status — alerting"
+        metrics+=("zfs_pool_present{pool=\"$pool\"} 1")
+        notify high warning \
+            "PBS ZFS pool degraded: $pool" \
+            "Pool $pool is $status on PBS — check immediately" \
+            "$pool"
+
+    else
+        # Empty output — pool not imported / missing
+        log "pool $pool: MISSING — alerting"
+        metrics+=("zfs_pool_present{pool=\"$pool\"} 0")
+        notify urgent rotating_light \
+            "PBS ZFS pool MISSING: $pool" \
+            "Pool $pool not imported on PBS. Check cables. Fix: zpool import $pool" \
+            "$pool"
+    fi
+done
+
+# Loop completed — append the timestamp metric as its own group.
+metrics+=(
+    "# HELP zfs_health_last_run_seconds Unix timestamp of last successful check"
+    "# TYPE zfs_health_last_run_seconds gauge"
+    "zfs_health_last_run_seconds $(date +%s)"
+)
+
+printf '%s\n' "${metrics[@]}" > "$TMPFILE" || die "cannot write $TMPFILE"
+# mktemp creates the file 0600; make it world-readable before it becomes
+# visible under its final name.
+chmod 644 "$TMPFILE" || die "cannot chmod $TMPFILE"
+mv -f "$TMPFILE" "$TEXTFILE" || die "cannot move $TMPFILE to $TEXTFILE"
+
+# Disarm cleanup trap — mv succeeded
+trap - EXIT
+
+log "completed — metrics written to $TEXTFILE"
+exit 0