photoncloud-monorepo/baremetal/first-boot/health-check.sh

79 lines
2.5 KiB
Bash
Executable file

#!/usr/bin/env bash
set -euo pipefail
# health-check.sh - Health check wrapper for services
# Usage: health-check.sh <service_name> <health_url> [timeout] [retry_interval]
#
# Arguments:
# service_name - Name of the service (for logging)
# health_url - HTTP/HTTPS URL of the health endpoint
# timeout - Maximum time to wait in seconds (default: 300)
# retry_interval - Time between retries in seconds (default: 5)
#
# Returns:
# 0 - Service is healthy
# 1 - Service did not become healthy before the timeout, or required arguments are missing
# Positional arguments.
#   $1 - service name, used only for logging (required)
#   $2 - health endpoint URL (required)
#   $3 - overall timeout in whole seconds (default: 300)
#   $4 - pause between attempts, handed to sleep(1) as-is (default: 5)
SERVICE_NAME="${1:-}"
HEALTH_URL="${2:-}"
TIMEOUT="${3:-300}"
RETRY_INTERVAL="${4:-5}"

# Optional environment overrides for the curl probe.
#   CURL_CONNECT_TIMEOUT - value for curl --connect-timeout (default: 5)
#   CURL_MAX_TIME        - value for curl --max-time (default: 10)
#   CURL_INSECURE        - "1" adds curl -k, i.e. TLS certificates are NOT
#                          verified (default: 1)
# NOTE(review): insecure TLS is on by default; confirm this is intended for
# every caller before relying on this check across untrusted networks.
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-10}"
CURL_INSECURE="${CURL_INSECURE:-1}"

# Validate arguments
if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" ]]; then
  printf '%s\n' "ERROR: Missing required arguments" >&2
  printf '%s\n' "Usage: $0 <service_name> <health_url> [timeout] [retry_interval]" >&2
  exit 1
fi

# TIMEOUT is later used as an operand of an integer comparison (-ge). Anything
# that is not a plain decimal integer (e.g. "30s", "1.5", "abc") is not a valid
# operand there, so reject it up front instead of letting the wait loop
# misbehave.
if [[ ! "$TIMEOUT" =~ ^[0-9]+$ ]]; then
  printf '%s\n' "ERROR: timeout must be a non-negative integer number of seconds (got '$TIMEOUT')" >&2
  printf '%s\n' "Usage: $0 <service_name> <health_url> [timeout] [retry_interval]" >&2
  exit 1
fi
# Force base 10 so a zero-padded value such as "08" is not parsed as octal.
TIMEOUT=$((10#$TIMEOUT))
# Logging helpers with JSON output

#######################################
# Escape a string so it can be embedded inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_log_json_escape() {
  local text="$1"
  # Backslash must be handled first so the escapes added below are not
  # themselves re-escaped.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Emit one structured log record as a single-line JSON object.
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - level label (e.g. INFO, WARN, ERROR)
#            $2 - message text
# Outputs:   {"timestamp":...,"level":...,"service":...,"message":...}
#            written to stderr, so stdout stays free for the script's result.
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date -Iseconds)
  # All caller-supplied fields are escaped so quotes, backslashes or newlines
  # in a service name or message cannot produce invalid or multi-line JSON.
  printf '{"timestamp":"%s","level":"%s","service":"%s","message":"%s"}\n' \
    "$timestamp" \
    "$(_log_json_escape "$level")" \
    "$(_log_json_escape "$SERVICE_NAME")" \
    "$(_log_json_escape "$message")" >&2
}
# Main health check loop
#
# Polls HEALTH_URL with curl until it answers HTTP 200 (exit 0) or until
# TIMEOUT seconds have elapsed since the first attempt (exit 1). Progress is
# reported through log(); on success a one-line JSON summary is written to
# stdout.
log "INFO" "Starting health check for $SERVICE_NAME at $HEALTH_URL (timeout: ${TIMEOUT}s)"

# The curl options do not change between attempts, so build them once.
# -s/-o /dev/null discard the body; -w prints only the numeric status code.
CURL_FLAGS=(-s -o /dev/null -w '%{http_code}' --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
  # Skip TLS certificate verification when configured to do so.
  CURL_FLAGS+=(-k)
fi

START_TIME=$(date +%s)
ATTEMPT=0
while true; do
  CURRENT_TIME=$(date +%s)
  ELAPSED=$((CURRENT_TIME - START_TIME))
  if [[ $ELAPSED -ge $TIMEOUT ]]; then
    log "ERROR" "Health check timeout reached after ${ELAPSED}s"
    exit 1
  fi

  ATTEMPT=$((ATTEMPT + 1))
  log "INFO" "Health check attempt $ATTEMPT (elapsed: ${ELAPSED}s)"

  # Perform health check.
  # When the request fails curl still prints a status via -w (typically "000")
  # and then exits non-zero. Appending a fallback *inside* the substitution
  # would therefore concatenate two codes ("000000"), so the exit status is
  # captured outside it and the code is normalised afterwards.
  CURL_RC=0
  HTTP_CODE=$(curl "${CURL_FLAGS[@]}" "$HEALTH_URL" 2>/dev/null) || CURL_RC=$?
  if [[ $CURL_RC -ne 0 ]]; then
    HTTP_CODE="000"
  fi

  if [[ "$HTTP_CODE" == "200" ]]; then
    log "INFO" "Health check passed (HTTP $HTTP_CODE)"
    # Escape backslashes and quotes so the service name cannot break the
    # JSON result line consumed from stdout.
    SERVICE_JSON=${SERVICE_NAME//\\/\\\\}
    SERVICE_JSON=${SERVICE_JSON//\"/\\\"}
    printf '{"timestamp":"%s","service":"%s","status":"healthy","attempts":%s,"elapsed":%s}\n' \
      "$(date -Iseconds)" "$SERVICE_JSON" "$ATTEMPT" "$ELAPSED"
    exit 0
  elif [[ "$HTTP_CODE" == "000" ]]; then
    log "WARN" "Health check failed: connection error (attempt $ATTEMPT)"
  else
    log "WARN" "Health check failed: HTTP $HTTP_CODE (attempt $ATTEMPT)"
  fi

  sleep "$RETRY_INTERVAL"
done