From 10b60761ff94a655d2be2279244c791d4d9e01c7 Mon Sep 17 00:00:00 2001
From: tommy
Date: Wed, 6 May 2026 05:34:31 -0500
Subject: [PATCH] traefik: add cert-expiry-check.sh with Prometheus textfile output

Reads acme.json hourly on docker-node01, writes:
  traefik_cert_expiry_days{domain=X} N
  traefik_cert_check_last_run_seconds EPOCH

Two Grafana alert thresholds:
  Warning < 30d: auto-renewal window opened, ntfy high priority
  Critical < 14d: ACME renewal failed, ntfy urgent

Textfile at /var/lib/node_exporter/textfile/cert_expiry.prom
Scraped by existing node-exporter job on 192.168.99.186:9100
Grafana rules: cfl8jqdlhu680d (warning), afl8jqdoepwqod (critical)

Break-tested: 35d threshold fired for vault/pdf/scrutiny/gitea correctly.

Cron: 0 * * * * sudo /usr/local/bin/cert-expiry-check.sh
---
 traefik/cert-expiry-check.sh | 144 ++++++++++++++++++++++++++++++++++++
 1 file changed, 144 insertions(+)
 create mode 100755 traefik/cert-expiry-check.sh

diff --git a/traefik/cert-expiry-check.sh b/traefik/cert-expiry-check.sh
new file mode 100755
index 0000000..5c4dfaf
--- /dev/null
+++ b/traefik/cert-expiry-check.sh
@@ -0,0 +1,144 @@
+#!/bin/bash
+# cert-expiry-check.sh — parses Traefik acme.json, writes textfile metrics.
+#
+# Metrics (Prometheus text format: HELP+TYPE immediately before each family's data):
+#   traefik_cert_expiry_days{domain="X"} N
+#   traefik_cert_check_last_run_seconds EPOCH
+# ntfy pushes: "high" below 30 days left, "urgent" below 14 days left.
+#
+# Exit status: 0 on success; 1 when acme.json is unreadable or unparsable, or
+# the textfile cannot be written. On failure the published textfile is left
+# untouched, so traefik_cert_check_last_run_seconds goes stale.
+
+ACME_JSON="/home/tommy/traefik/acme.json"
+TEXTFILE="/var/lib/node_exporter/textfile/cert_expiry.prom"
+NTFY_URL="https://ntfy.goattw.net/homelab-alerts"
+LOG_TAG="cert-expiry-check"
+
+# notify PRIORITY TAGS TITLE BODY
+# Best-effort ntfy push. The timeout keeps an unreachable server from hanging
+# the cron job; a failed delivery never aborts the check.
+notify() {
+    curl -sf --max-time 10 -o /dev/null \
+        -H "Priority: $1" -H "Tags: $2" \
+        -H "Title: $3" \
+        -d "$4" "$NTFY_URL" || true
+}
+
+# expires_within DAYS LIMIT
+# Succeeds when 0 < DAYS < LIMIT. DAYS is fractional, so the comparison is
+# done in awk (bash arithmetic is integer-only).
+expires_within() {
+    awk -v d="$1" -v limit="$2" 'BEGIN { exit !(d > 0 && d < limit) }'
+}
+
+logger -t "$LOG_TAG" "starting cert expiry check"
+
+if [[ ! -r "$ACME_JSON" ]]; then
+    logger -t "$LOG_TAG" "ERROR: cannot read $ACME_JSON"
+    notify high warning "Cert expiry check: acme.json unreadable" \
+        "Cannot read $ACME_JSON on node01"
+    exit 1
+fi
+
+# Stage the output next to the final file so the closing mv is a same-filesystem
+# rename. The staging name must not end in .prom, or node_exporter would
+# scrape the half-written file.
+TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX") || {
+    logger -t "$LOG_TAG" "ERROR: cannot create staging file for $TEXTFILE"
+    exit 1
+}
+trap 'rm -f "$TMPFILE"' EXIT
+
+NOW=$(date +%s)
+
+# Write cert_expiry_days family: HELP + TYPE + all data lines together.
+# The group's exit status is python3's, so a parse failure aborts the run
+# instead of publishing an empty metric family.
+{
+    echo "# HELP traefik_cert_expiry_days Days until TLS certificate expires (from acme.json)"
+    echo "# TYPE traefik_cert_expiry_days gauge"
+
+    python3 - "$ACME_JSON" "$NOW" << 'PYEOF'
+import base64
+import json
+import subprocess
+import sys
+from datetime import datetime, timezone
+
+acme_path = sys.argv[1]
+now = int(sys.argv[2])
+with open(acme_path) as f:
+    data = json.load(f)
+
+seen = set()
+for resolver_name, resolver in data.items():
+    for c in (resolver.get('Certificates', []) or []):
+        domain = (c.get('domain') or {}).get('main', '')
+        if not domain or domain in seen:
+            continue
+        try:
+            pem = base64.b64decode(c.get('certificate') or '')
+            # Feed the PEM to openssl on stdin: no temp file to leak.
+            result = subprocess.run(
+                ['openssl', 'x509', '-noout', '-enddate'],
+                input=pem, capture_output=True, check=True)
+            exp_str = result.stdout.decode().strip().replace('notAfter=', '')
+            # openssl prints notAfter in GMT but strptime returns a naive
+            # datetime; pin it to UTC, otherwise timestamp() would apply the
+            # host's local UTC offset.
+            expires = datetime.strptime(exp_str, '%b %d %H:%M:%S %Y %Z')
+            exp_ts = int(expires.replace(tzinfo=timezone.utc).timestamp())
+        except (ValueError, OSError, subprocess.CalledProcessError) as err:
+            # Report and skip the bad entry; the remaining certs still count.
+            print(f'cert-expiry-check: skipping {domain} ({resolver_name}): {err}',
+                  file=sys.stderr)
+            continue
+        seen.add(domain)
+        days_left = round((exp_ts - now) / 86400, 1)
+        print(f'traefik_cert_expiry_days{{domain="{domain}"}} {days_left}')
+PYEOF
+} > "$TMPFILE" || {
+    logger -t "$LOG_TAG" "ERROR: failed to parse $ACME_JSON"
+    notify high warning "Cert expiry check: acme.json parse failed" \
+        "Cannot parse $ACME_JSON on node01"
+    exit 1
+}
+
+# Write last_run family separately, appended after cert_expiry_days data
+cat >> "$TMPFILE" << EOF
+# HELP traefik_cert_check_last_run_seconds Unix timestamp of last successful cert check
+# TYPE traefik_cert_check_last_run_seconds gauge
+traefik_cert_check_last_run_seconds $NOW
+EOF
+
+# Parse and alert on thresholds (from what we just wrote).
+# NOTE(review): a cert with days <= 0 (already expired) sends no push —
+# presumably to ignore stale acme.json entries; confirm this is intended.
+while IFS= read -r line; do
+    [[ "$line" =~ ^traefik_cert_expiry_days ]] || continue
+    domain=$(echo "$line" | sed 's/.*domain="\([^"]*\)".*/\1/')
+    days=$(echo "$line" | awk '{print $NF}')
+    logger -t "$LOG_TAG" "domain=$domain days_left=$days"
+    if expires_within "$days" 14; then
+        logger -t "$LOG_TAG" "CRITICAL: $domain expires in ${days}d"
+        notify urgent rotating_light "TLS cert EXPIRING: $domain" \
+            "Certificate for $domain expires in ${days} days. ACME renewal FAILED."
+    elif expires_within "$days" 30; then
+        logger -t "$LOG_TAG" "WARNING: $domain expires in ${days}d"
+        notify high warning "TLS cert renewal window: $domain" \
+            "Certificate for $domain expires in ${days} days. Auto-renewal should start soon."
+    fi
+done < "$TMPFILE"
+
+# mktemp creates the file 0600: open up the mode before the rename so
+# node_exporter never sees an unreadable textfile.
+chmod 644 "$TMPFILE"
+mv "$TMPFILE" "$TEXTFILE" || {
+    logger -t "$LOG_TAG" "ERROR: cannot install $TEXTFILE"
+    exit 1
+}
+trap - EXIT
+
+logger -t "$LOG_TAG" "completed — metrics written to $TEXTFILE"
+exit 0