From e3ee020d5322261899b125471216bc0a7a03d663 Mon Sep 17 00:00:00 2001
From: tommy
Date: Wed, 6 May 2026 05:46:07 -0500
Subject: [PATCH] monitoring: add postfix queue + cert expiry scripts, Phase 4D alerts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

postfix-queue-check.sh:
- Reads mailq queue depth, writes postfix_queue_size{host=X} textfile metric
- Deployed on compute3 (systemd node_exporter) and compute5 (Docker)
- Cron: */5 * * * * as root on each host
- Prometheus alert: postfix_queue_size > 10 (uid: efl8kjns461a8f)

node-exporter-compute5-compose.yml:
- Added textfile volume mount /var/lib/node_exporter/textfile:/textfile:ro
- Added --collector.textfile.directory=/textfile flag

cert-expiry-check.sh:
- Also stored here for monitoring/ grouping

Phase 4D Grafana alert rules (all in Infrastructure Alerts folder):
  cfl8jqdlhu680d TLS Cert Expiry Warning (30d) — break-tested ✓
  afl8jqdoepwqod TLS Cert ACME Renewal Failure (14d) — no real certs in window
  ffl8k2ry0nu2od Alertmanager Down — break-tested, fired ✓
  efl8kjns461a8f Postfix Queue Backing Up — metric confirmed, 5m window
  dfl8k2s0xjklcf Authelia Restart Loop — cadvisor-based proxy metric

Rules stored in grafana.db only — not yaml-provisioned (Phase 5 candidate)
---
 monitoring/node-exporter-compute5-compose.yml | 21 +++++++++++
 monitoring/postfix-queue-check.sh             | 37 +++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 monitoring/node-exporter-compute5-compose.yml
 create mode 100755 monitoring/postfix-queue-check.sh

diff --git a/monitoring/node-exporter-compute5-compose.yml b/monitoring/node-exporter-compute5-compose.yml
new file mode 100644
index 0000000..791fcc1
--- /dev/null
+++ b/monitoring/node-exporter-compute5-compose.yml
@@ -0,0 +1,21 @@
+# node-exporter - Host metrics exporter for Prometheus
+# Host: compute5 (192.168.99.196), Port: 9100
+
+services:
+  node-exporter:
+    image: prom/node-exporter:latest
+    container_name: node-exporter
+    restart: unless-stopped
+    pid: host
+    network_mode: host
+    volumes:
+      - /proc:/host/proc:ro
+      - /sys:/host/sys:ro
+      - /:/rootfs:ro
+      - /var/lib/node_exporter/textfile:/textfile:ro
+    command:
+      - '--path.procfs=/host/proc'
+      - '--path.sysfs=/host/sys'
+      - '--path.rootfs=/rootfs'
+      - '--collector.textfile.directory=/textfile'
+      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
diff --git a/monitoring/postfix-queue-check.sh b/monitoring/postfix-queue-check.sh
new file mode 100755
index 0000000..893a011
--- /dev/null
+++ b/monitoring/postfix-queue-check.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+# Postfix queue depth textfile metric — runs every 5 min via root crontab.
+#
+# Emits postfix_queue_size{host=...} and a last-run timestamp as a .prom file
+# for the node_exporter textfile collector.
+
+TEXTFILE="/var/lib/node_exporter/textfile/postfix_queue.prom"
+HOSTNAME=$(hostname -s)
+
+# Stage the temp file beside the target: a same-filesystem mv is an atomic
+# rename, so the collector never reads a half-written file. The staging name
+# must not end in .prom, or the collector would scrape it as well.
+mkdir -p "$(dirname "$TEXTFILE")" || exit 1
+TMPFILE=$(mktemp "${TEXTFILE}.XXXXXX") || exit 1
+trap 'rm -f "$TMPFILE"' EXIT
+
+QUEUE_SIZE=0
+if command -v mailq >/dev/null 2>&1; then
+  # grep -c prints 0 but exits 1 when nothing matches; fall back to 0 then.
+  QUEUE_SIZE=$(mailq 2>/dev/null | grep -c '^[0-9A-F]' 2>/dev/null) || QUEUE_SIZE=0
+fi
+
+cat > "$TMPFILE" << EOF
+# HELP postfix_queue_size Number of messages in postfix mail queue
+# TYPE postfix_queue_size gauge
+postfix_queue_size{host="${HOSTNAME}"} ${QUEUE_SIZE}
+# HELP postfix_queue_check_last_run_seconds Unix timestamp of last queue check
+# TYPE postfix_queue_check_last_run_seconds gauge
+postfix_queue_check_last_run_seconds $(date +%s)
+EOF
+
+# mktemp creates the file mode 0600; widen it BEFORE publishing so a non-root
+# reader never finds an unreadable metrics file in the directory.
+chmod 644 "$TMPFILE" || exit 1
+mv "$TMPFILE" "$TEXTFILE" || exit 1
+trap - EXIT
+exit 0