#!/bin/bash
#
# Graceful Apache reload with forced cleanup of stuck workers.
#
# Apache has no timeout on graceful-restart: if a worker is blocked in a
# mid-request syscall (classically, reading from a mod_fcgid Unix socket
# whose PHP backend has hung), it has SIGUSR1 blocked and never exits.
# Over time these leak scoreboard slots until the instance fails with
# "AH03490 scoreboard is full, not at MaxRequestWorkers".
#
# This wrapper snapshots the pre-reload worker PIDs, runs the graceful,
# polls up to GRACE seconds for them to drain on their own, and SIGKILLs
# any that are still alive and still parented to the same master.
#
# Deploy via a systemd drop-in:
#   /etc/systemd/system/apache2@.service.d/10-reload-cleanup.conf
#     [Service]
#     ExecReload=
#     ExecReload=/usr/local/sbin/apache-graceful-with-cleanup %i
#     TimeoutStartSec=120
#
# Usage: apache-graceful-with-cleanup <instance>
# Env:   APACHE_RELOAD_GRACE  seconds to wait before SIGKILL (default 60)

set -eo pipefail

INSTANCE="${1:?instance name required}"
GRACE="${APACHE_RELOAD_GRACE:-60}"
UNIT="apache2@${INSTANCE}"
CONF="/etc/apache2-${INSTANCE}/apache2.conf"

# Send one message to syslog, tagged and prefixed with the instance name.
# Logging is best-effort: if logger fails, fall back to stderr instead of
# letting 'set -e' abort the reload part-way through.
log() {
  logger -t apache-reload-cleanup -- "${INSTANCE}: $*" \
    || printf 'apache-reload-cleanup: %s: %s\n' "${INSTANCE}" "$*" >&2 \
    || true
}

# GRACE is later used in an arithmetic comparison. A non-numeric value would
# make that comparison false (or an error), skipping the wait entirely and
# going straight to SIGKILL, so reject it here and fall back to the default.
if [[ ! "${GRACE}" =~ ^[0-9]+$ ]]; then
  log "invalid APACHE_RELOAD_GRACE='${GRACE}', using default 60"
  GRACE=60
fi
# Force base 10 so a value such as "08" is not parsed as (invalid) octal.
GRACE=$((10#${GRACE}))

# Find the master PID. Prefer systemd's MainPID, but fall back to process
# introspection if the unit's PID tracking is misconfigured (common on
# Type=forking units with no PIDFile=). The master is the oldest apache2
# process whose cmdline names this instance's config dir.
find_master_pid() {
  # Prints the master PID on stdout, or nothing if no master is found.
  local from_systemd
  # A systemctl failure is tolerated; the pgrep fallback below covers it.
  from_systemd=$(systemctl show -p MainPID --value "${UNIT}" 2>/dev/null) || true
  # Accept only a positive integer; "0" or an empty answer means systemd
  # has no usable PID for the unit.
  if [[ "${from_systemd}" =~ ^[1-9][0-9]*$ ]]; then
    echo "${from_systemd}"
    return
  fi
  # pgrep exits 1 on "no match"; that is an expected outcome, not an error.
  pgrep -o -f "apache2 -d /etc/apache2-${INSTANCE}(/| |$)" || true
}

# Prints "<ppid> <starttime>" for PID $1, read from /proc/<pid>/stat.
# Returns non-zero (printing nothing) if the process is gone or the stat
# line cannot be parsed.
proc_ident() {
  local stat
  local -a fields
  stat=$(cat "/proc/$1/stat" 2>/dev/null) || return 1
  # Field 2 (comm) is parenthesised and may itself contain spaces or ')',
  # so drop everything through the last ") " before splitting on whitespace.
  stat=${stat##*) }
  read -r -a fields <<<"${stat}" || return 1
  # After the strip, index 0 is field 3 (state): ppid (field 4) is index 1
  # and starttime (field 22) is index 19.
  [[ -n "${fields[1]:-}" && -n "${fields[19]:-}" ]] || return 1
  echo "${fields[1]} ${fields[19]}"
}

MAIN=$(find_master_pid)
if [[ -z "${MAIN}" ]]; then
  log "master not found, nothing to do"
  exit 0
fi

# Snapshot the master's current children. pgrep exits 1 when there are no
# children; that must not abort the script (errexit + pipefail would
# otherwise kill it here, before the graceful is ever issued).
pgrep_rc=0
OLD=$(pgrep -P "${MAIN}") || pgrep_rc=$?
if ((pgrep_rc > 1)); then
  log "pgrep -P ${MAIN} failed (rc=${pgrep_rc}), continuing without a snapshot"
  OLD=''
fi
# One PID per line -> single space-separated list.
OLD=${OLD//$'\n'/ }

# Remember each worker's start time next to its PID ("pid:starttime") so a
# recycled PID that lands on a fresh worker of the same master is never
# mistaken for a stuck pre-reload worker.
SNAPSHOT=''
for pid in ${OLD}; do
  if ident=$(proc_ident "${pid}"); then
    SNAPSHOT+="${pid}:${ident#* } "
  fi
done

log "reload master=${MAIN} grace=${GRACE}s snapshot=[${OLD}]"
echo "reload master=${MAIN} grace=${GRACE}s snapshot=[${OLD}]"

# Issue the graceful. On failure, report it and propagate apachectl's status
# so systemd marks the reload as failed.
graceful_rc=0
/usr/sbin/apachectl -f "${CONF}" -k graceful || graceful_rc=$?
if ((graceful_rc != 0)); then
  log "apachectl graceful failed (rc=${graceful_rc}), aborting"
  exit "${graceful_rc}"
fi

if [[ -z "${OLD}" ]]; then
  log "no workers to track, done"
  exit 0
fi

# 'Still a worker' = /proc/<pid> exists, its parent is still our snapshotted
# master, AND its start time matches the snapshot (same process, not a
# reused PID). Prints the surviving PIDs, space-separated, on stdout.
still_alive() {
  local out='' entry pid start ident
  for entry in ${SNAPSHOT}; do
    pid=${entry%%:*}
    start=${entry#*:}
    ident=$(proc_ident "${pid}") || continue
    if [[ "${ident}" == "${MAIN} ${start}" ]]; then
      out+="${pid} "
    fi
  done
  echo "${out% }"
}

elapsed=0
echo "waiting for old workers to die"
while ((elapsed < GRACE)); do
  echo "${elapsed}"
  survivors=$(still_alive)
  if [[ -z "${survivors}" ]]; then
    log "all workers drained in ${elapsed}s, no kill needed"
    exit 0
  fi
  sleep 1
  elapsed=$((elapsed + 1))
done

# Grace elapsed — confirm master is still our snapshot (no intervening
# full restart) before SIGKILL'ing. The cmdline match is a fixed-string
# search so characters in the instance name are never read as a regex.
if [[ ! -d "/proc/${MAIN}" ]] \
  || ! grep -qaF -- "apache2-${INSTANCE}" "/proc/${MAIN}/cmdline" 2>/dev/null; then
  log "skip kill: master ${MAIN} gone or replaced"
  exit 0
fi

# Final re-check: workers may have drained during the last sleep.
survivors=$(still_alive)
if [[ -z "${survivors}" ]]; then
  log "all workers drained at boundary, no kill needed"
  echo "all workers drained at boundary"
  exit 0
fi

# SIGKILL is deliberate: these workers already ignored the graceful signal
# for the whole grace period. A failed kill (process exited in the meantime)
# is silently skipped and not counted.
killed=''
for p in ${survivors}; do
  if kill -9 "${p}" 2>/dev/null; then
    echo "killing ${p}"
    killed+="${p} "
  fi
done
log "SIGKILL after ${GRACE}s: [${killed% }]"