#!/bin/bash
# Cert expiry check — parses Traefik acme.json and writes Prometheus textfile
# metrics, sending ntfy notifications for certificates that are close to expiry.
#
# Metric families written (HELP + TYPE immediately before each family's data):
#   traefik_cert_expiry_days{domain="..."}  days until notAfter, 1 decimal
#   traefik_cert_check_last_run_seconds     Unix timestamp of this run
#
# Exit status: 0 when the metrics file was replaced, 1 when acme.json could not
# be read/parsed or the metrics file could not be written. On failure the
# previous metrics file is left untouched, so the last_run metric goes stale.
#
# Requires: python3, openssl, awk, curl, logger, mktemp.

# No `set -e`: every failure that matters is checked explicitly below.
set -uo pipefail

ACME_JSON="/home/tommy/traefik/acme.json"
TEXTFILE="/var/lib/node_exporter/textfile/cert_expiry.prom"
NTFY_URL="https://ntfy.goattw.net/homelab-alerts"
readonly ACME_JSON TEXTFILE NTFY_URL

# Write one message to syslog under this script's tag.
log() {
  logger -t cert-expiry-check "$*"
}

# Send a best-effort ntfy notification.
# Arguments: $1 priority, $2 tags, $3 title, $4 message body.
# Delivery failures are deliberately ignored so that an unreachable ntfy server
# never prevents the metrics from being written.
notify() {
  local priority=$1 tags=$2 title=$3 body=$4
  curl -sf --max-time 15 \
    -H "Priority: $priority" -H "Tags: $tags" \
    -H "Title: $title" \
    -d "$body" "$NTFY_URL" > /dev/null || true
}

log "starting cert expiry check"

if [[ ! -r "$ACME_JSON" ]]; then
  log "ERROR: cannot read $ACME_JSON"
  notify high warning "Cert expiry check: acme.json unreadable" \
    "Cannot read $ACME_JSON on node01"
  exit 1
fi

# Create the temp file next to the target so the final mv is a same-filesystem
# rename and a scrape never sees a half-written file. The random suffix comes
# after ".prom", so the temp name itself does not end in ".prom".
TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX") || {
  log "ERROR: cannot create temp file next to $TEXTFILE"
  exit 1
}
trap 'rm -f -- "$TMPFILE"' EXIT

NOW=$(date +%s)

# Write cert_expiry_days family: HELP + TYPE + all data lines together.
# Python diagnostics (stderr) are forwarded to syslog.
{
  echo "# HELP traefik_cert_expiry_days Days until TLS certificate expires (from acme.json)"
  echo "# TYPE traefik_cert_expiry_days gauge"
  python3 - "$ACME_JSON" "$NOW" 2> >(logger -t cert-expiry-check) << 'PYEOF'
import base64
import calendar
import json
import subprocess
import sys
import time

acme_path = sys.argv[1]
now = int(sys.argv[2])


def warn(msg):
    """Report a diagnostic on stderr (stdout is reserved for metric lines)."""
    print(msg, file=sys.stderr)


def escape_label(value):
    """Escape a string for use as a Prometheus text-format label value."""
    return value.replace('\\', '\\\\').replace('"', '\\"').replace('\n', '\\n')


def not_after_epoch(pem):
    """Return the notAfter time of a PEM certificate as a Unix timestamp."""
    # The PEM is fed on stdin, so no temporary file has to be cleaned up.
    result = subprocess.run(
        ['openssl', 'x509', '-noout', '-enddate'],
        input=pem, capture_output=True, check=True)
    exp_str = result.stdout.decode('ascii', 'replace').strip()
    exp_str = exp_str.replace('notAfter=', '')
    # openssl prints e.g. "Jun  1 12:00:00 2025 GMT". The value is in GMT, so
    # convert with timegm(); a naive datetime's .timestamp() would treat it
    # as local time and skew the result by the host's UTC offset.
    return calendar.timegm(time.strptime(exp_str, '%b %d %H:%M:%S %Y %Z'))


try:
    with open(acme_path) as f:
        data = json.load(f)
except (OSError, ValueError) as exc:
    warn(f'ERROR: cannot parse {acme_path}: {exc}')
    sys.exit(1)

if not isinstance(data, dict):
    warn(f'ERROR: {acme_path}: top-level JSON value is not an object')
    sys.exit(1)

seen = set()
for resolver_name, resolver in data.items():
    if not isinstance(resolver, dict):
        continue
    for c in (resolver.get('Certificates') or []):
        if not isinstance(c, dict):
            continue
        domain = (c.get('domain') or {}).get('main', '')
        # Only the first certificate seen for a main domain is reported.
        if not domain or domain in seen:
            continue
        seen.add(domain)
        try:
            pem = base64.b64decode(c.get('certificate') or '')
            exp_ts = not_after_epoch(pem)
        except (ValueError, TypeError, OSError,
                subprocess.SubprocessError) as exc:
            # One bad entry must not hide the others; log it and move on.
            warn(f'WARNING: skipping {domain} ({resolver_name}): {exc}')
            continue
        days_left = round((exp_ts - now) / 86400, 1)
        label = escape_label(domain)
        print(f'traefik_cert_expiry_days{{domain="{label}"}} {days_left}')
PYEOF
  py_status=$?
} > "$TMPFILE"

if (( py_status != 0 )); then
  # Do not publish a fresh "last successful run" timestamp for a failed run.
  log "ERROR: parsing $ACME_JSON failed (python3 exit $py_status); keeping previous metrics"
  notify high warning "Cert expiry check: acme.json parse failed" \
    "Failed to parse $ACME_JSON on node01"
  exit 1
fi

# Write last_run family separately, appended after cert_expiry_days data.
{
  echo "# HELP traefik_cert_check_last_run_seconds Unix timestamp of last successful cert check"
  echo "# TYPE traefik_cert_check_last_run_seconds gauge"
  echo "traefik_cert_check_last_run_seconds $NOW"
} >> "$TMPFILE"

# Parse and alert on thresholds (from what we just wrote).
metric_re='^traefik_cert_expiry_days\{domain="(.*)"\} ([^ ]+)$'
while IFS= read -r line; do
  [[ "$line" =~ $metric_re ]] || continue
  domain=${BASH_REMATCH[1]}
  days=${BASH_REMATCH[2]}
  log "domain=$domain days_left=$days"

  # days is fractional, so classify with awk (bash arithmetic is integer-only).
  level=$(awk -v d="$days" 'BEGIN {
    d += 0
    if (d <= 0) { print "expired" }
    else if (d < 14) { print "critical" }
    else if (d < 30) { print "warning" }
    else { print "ok" }
  }')

  case "$level" in
    critical)
      log "CRITICAL: $domain expires in ${days}d"
      notify urgent rotating_light "TLS cert EXPIRING: $domain" \
        "Certificate for $domain expires in ${days} days. ACME renewal FAILED."
      ;;
    warning)
      log "WARNING: $domain expires in ${days}d"
      notify high warning "TLS cert renewal window: $domain" \
        "Certificate for $domain expires in ${days} days. Auto-renewal should start soon."
      ;;
    expired)
      # Syslog only: no push notification is sent for days_left <= 0.
      log "EXPIRED: $domain days_left=$days (no notification sent)"
      ;;
    *)
      ;;
  esac
done < "$TMPFILE"

# mktemp creates the file 0600; make it world-readable before it becomes
# visible under its final name, then publish it with a rename.
chmod 644 "$TMPFILE"
if ! mv -f -- "$TMPFILE" "$TEXTFILE"; then
  log "ERROR: cannot move metrics into place at $TEXTFILE"
  exit 1
fi
trap - EXIT

log "completed — metrics written to $TEXTFILE"
exit 0