Files
homelab-configs/traefik/cert-expiry-check.sh
tommy 10b60761ff traefik: add cert-expiry-check.sh with Prometheus textfile output
Reads acme.json hourly on docker-node01, writes:
  traefik_cert_expiry_days{domain=X} N
  traefik_cert_check_last_run_seconds EPOCH

Two Grafana alert thresholds:
  Warning  < 30d: auto-renewal window opened, ntfy high priority
  Critical < 14d: ACME renewal failed, ntfy urgent

Textfile at /var/lib/node_exporter/textfile/cert_expiry.prom
Scraped by existing node-exporter job on 192.168.99.186:9100
Grafana rules: cfl8jqdlhu680d (warning), afl8jqdoepwqod (critical)
Break-tested: 35d threshold fired for vault/pdf/scrutiny/gitea correctly.

Cron: 0 * * * * sudo /usr/local/bin/cert-expiry-check.sh
2026-05-06 05:34:31 -05:00

95 lines
3.7 KiB
Bash
Executable File

#!/bin/bash
# Cert expiry check — parses Traefik acme.json, writes textfile metrics
# Correct Prometheus text format: HELP+TYPE immediately before each family's data
#
# Flow: verify acme.json is readable, build the metrics in a temp file, send
# ntfy notifications for certificates inside the alert windows, then move the
# temp file over $TEXTFILE.
ACME_JSON="/home/tommy/traefik/acme.json"
TEXTFILE="/var/lib/node_exporter/textfile/cert_expiry.prom"
NTFY_URL="https://ntfy.goattw.net/homelab-alerts"

logger -t cert-expiry-check "starting cert expiry check"

# Stage the output next to $TEXTFILE rather than in /tmp: the final mv is then
# a rename within one directory instead of a possible cross-filesystem copy.
# The staged name ends in random characters, not ".prom".
# NOTE(review): relies on the textfile collector only reading *.prom — confirm.
TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX") || {
  logger -t cert-expiry-check "ERROR: cannot create temp file next to $TEXTFILE"
  exit 1
}
# Remove the staged file on every exit path until the trap is cleared.
trap 'rm -f "$TMPFILE"' EXIT

if [[ ! -r "$ACME_JSON" ]]; then
  logger -t cert-expiry-check "ERROR: cannot read $ACME_JSON"
  # Notification is best-effort: a failing ntfy call must not mask the real error.
  curl -sf \
    -H "Priority: high" -H "Tags: warning" \
    -H "Title: Cert expiry check: acme.json unreadable" \
    -d "Cannot read $ACME_JSON on node01" "$NTFY_URL" || true
  exit 1
fi

# Single reference time used for both the day arithmetic and the last-run metric.
NOW=$(date +%s)
# Write cert_expiry_days family: HELP + TYPE + all data lines together.
# The embedded Python receives the acme.json path and the reference epoch as
# argv, and prints one sample per distinct main domain. A certificate that
# cannot be decoded is skipped with a message on stderr; a failure of the
# interpreter itself (unreadable or invalid JSON) aborts the run so that no
# metrics file with a fresh last-run timestamp is published.
{
  echo "# HELP traefik_cert_expiry_days Days until TLS certificate expires (from acme.json)"
  echo "# TYPE traefik_cert_expiry_days gauge"
  python3 - "$ACME_JSON" "$NOW" << 'PYEOF'
import base64
import json
import subprocess
import sys
from datetime import datetime, timezone

acme_path = sys.argv[1]
now = int(sys.argv[2])

with open(acme_path) as f:
    data = json.load(f)

seen = set()
for resolver_name, resolver in data.items():
    if not isinstance(resolver, dict):
        continue
    for c in (resolver.get('Certificates', []) or []):
        domain = c.get('domain', {}).get('main', '')
        if not domain or domain in seen:
            continue
        seen.add(domain)
        try:
            raw = base64.b64decode(c.get('certificate', ''))
            # Feed the certificate on stdin: no temp file to create or leak.
            result = subprocess.run(
                ['openssl', 'x509', '-noout', '-enddate'],
                input=raw, capture_output=True, check=True)
            exp_str = result.stdout.decode().strip().replace('notAfter=', '')
            expiry = datetime.strptime(exp_str, '%b %d %H:%M:%S %Y %Z')
            # strptime yields a naive datetime; a bare .timestamp() would read
            # it as local time and skew the result by the host's UTC offset.
            # NOTE(review): assumes openssl always reports notAfter in GMT.
            exp_ts = int(expiry.replace(tzinfo=timezone.utc).timestamp())
            days_left = round((exp_ts - now) / 86400, 1)
            print(f'traefik_cert_expiry_days{{domain="{domain}"}} {days_left}')
        except Exception as exc:
            # Best effort per certificate: report it and keep going.
            print(f'cert-expiry-check: skipping {domain}: {exc!r}', file=sys.stderr)
PYEOF
} > "$TMPFILE" || {
  logger -t cert-expiry-check "ERROR: failed to parse $ACME_JSON"
  # Notification is best-effort; the non-zero exit is the real signal.
  curl -sf \
    -H "Priority: high" -H "Tags: warning" \
    -H "Title: Cert expiry check: acme.json parse failed" \
    -d "Failed to parse $ACME_JSON on node01" "$NTFY_URL" || true
  exit 1
}
# Second metric family: the epoch captured in $NOW, appended after the
# cert_expiry_days samples with its own HELP and TYPE lines.
{
  printf '%s\n' \
    "# HELP traefik_cert_check_last_run_seconds Unix timestamp of last successful cert check" \
    "# TYPE traefik_cert_check_last_run_seconds gauge" \
    "traefik_cert_check_last_run_seconds $NOW"
} >> "$TMPFILE"
# Parse and alert on thresholds (from what we just wrote)

#######################################
# Classify a days-until-expiry value against the notification thresholds.
# Arguments: $1 - days left as a decimal number (may be negative)
# Outputs:   "critical" when 0 < days < 14, "warning" when 14 <= days < 30,
#            otherwise "none"
# Returns:   0 on success; 1 (printing nothing) if $1 is not a decimal number
# NOTE(review): values <= 0 (already expired) map to "none", exactly as the
# thresholds were written before — confirm that suppressing expired entries
# is intended.
#######################################
cert_alert_level() {
  local days=$1
  local number_re='^-?[0-9]+(\.[0-9]+)?$'
  [[ "$days" =~ $number_re ]] || return 1
  # awk is used only for the floating-point comparison; LC_ALL=C pins "." as
  # the decimal separator.
  LC_ALL=C awk -v d="$days" 'BEGIN {
    if (d > 0 && d < 14)      print "critical"
    else if (d > 0 && d < 30) print "warning"
    else                      print "none"
  }'
}

# One sample line: metric name, a domain label, whitespace, then the value.
sample_re='^traefik_cert_expiry_days\{domain="([^"]*)"\}[[:space:]]+([^[:space:]]+)$'

while IFS= read -r line; do
  # HELP/TYPE lines and the last-run sample do not match and are skipped.
  [[ "$line" =~ $sample_re ]] || continue
  domain=${BASH_REMATCH[1]}
  days=${BASH_REMATCH[2]}
  logger -t cert-expiry-check "domain=$domain days_left=$days"

  if ! level=$(cert_alert_level "$days"); then
    logger -t cert-expiry-check "ERROR: non-numeric days value '$days' for $domain"
    continue
  fi

  # Notifications are best-effort: an unreachable ntfy must not stop the run.
  case "$level" in
    critical)
      logger -t cert-expiry-check "CRITICAL: $domain expires in ${days}d"
      curl -sf -H "Priority: urgent" -H "Tags: rotating_light" \
        -H "Title: TLS cert EXPIRING: $domain" \
        -d "Certificate for $domain expires in ${days} days. ACME renewal FAILED." \
        "$NTFY_URL" || true
      ;;
    warning)
      logger -t cert-expiry-check "WARNING: $domain expires in ${days}d"
      curl -sf -H "Priority: high" -H "Tags: warning" \
        -H "Title: TLS cert renewal window: $domain" \
        -d "Certificate for $domain expires in ${days} days. Auto-renewal should start soon." \
        "$NTFY_URL" || true
      ;;
  esac
done < "$TMPFILE"
# Publish the metrics. Set the final permissions on the staged file first so
# it never appears under its published name with the mode mktemp gave it.
chmod 644 "$TMPFILE"
if ! mv "$TMPFILE" "$TEXTFILE"; then
  logger -t cert-expiry-check "ERROR: could not move metrics into place at $TEXTFILE"
  # The EXIT trap is still armed here, so the staged file is cleaned up.
  exit 1
fi
# The staged file now lives at $TEXTFILE; nothing is left for the trap to remove.
trap - EXIT
logger -t cert-expiry-check "completed — metrics written to $TEXTFILE"
exit 0