photoncloud-monorepo/baremetal/first-boot/cluster-join.sh

181 lines
5.8 KiB
Bash
Executable file

#!/usr/bin/env bash
set -euo pipefail
# cluster-join.sh - Reusable script for cluster join logic
# Usage: cluster-join.sh <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]
#
# Arguments:
# service_name - Name of the service (e.g., chainfire, flaredb)
# health_url - Local health endpoint URL
# leader_url - Leader's cluster management URL
# join_payload - JSON payload for join request
# max_attempts - Maximum number of join attempts (default: 5)
# retry_delay - Delay between retries in seconds (default: 10)
#
# Returns:
# 0 - Successfully joined cluster
# 1 - Local health check failed, or failed to join cluster after max attempts
# 2 - Already joined (join marker file present, or leader answered HTTP 409)
# 3 - Invalid arguments
# Positional arguments (see the usage header); optional ones fall back to defaults.
SERVICE_NAME="${1:-}"
HEALTH_URL="${2:-}"
LEADER_URL="${3:-}"
JOIN_PAYLOAD="${4:-}"
MAX_ATTEMPTS="${5:-5}"
RETRY_DELAY="${6:-10}"

# curl tuning, overridable from the environment.
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}" # passed to curl --connect-timeout
CURL_MAX_TIME="${CURL_MAX_TIME:-15}"              # passed to curl --max-time
CURL_INSECURE="${CURL_INSECURE:-1}"               # "1" adds -k to every curl call

# Marker written after a successful (or already-done) join; its presence
# short-circuits later runs with exit code 2.
FIRST_BOOT_MARKER="/var/lib/first-boot-automation/.${SERVICE_NAME}-joined"

# Validate arguments
if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" || -z "$LEADER_URL" || -z "$JOIN_PAYLOAD" ]]; then
  echo "ERROR: Missing required arguments" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi

# max_attempts bounds the retry loop and is compared numerically; reject
# anything that is not a plain non-negative integer instead of silently
# performing zero attempts later on.
if [[ ! "$MAX_ATTEMPTS" =~ ^[0-9]+$ ]]; then
  echo "ERROR: max_attempts must be a non-negative integer, got: $MAX_ATTEMPTS" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi
# Force base 10 so values such as "08" are not parsed as invalid octal.
MAX_ATTEMPTS=$((10#$MAX_ATTEMPTS))

# retry_delay is handed to sleep(1): a number of seconds, optionally
# fractional and optionally carrying a unit suffix (s/m/h/d).
if [[ ! "$RETRY_DELAY" =~ ^[0-9]+([.][0-9]+)?[smhd]?$ ]]; then
  echo "ERROR: retry_delay must be a non-negative number of seconds, got: $RETRY_DELAY" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi
#######################################
# Escape a string for embedding inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
_json_escape() {
  local text="$1"
  # Backslash must be escaped first so later escapes are not doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Logging function with JSON output.
# Emits one single-line JSON object per call on stderr.
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - level (e.g. INFO, WARN, ERROR)
#            $2 - free-form message; escaped so that quotes, backslashes and
#                 newlines (e.g. an HTTP response body) keep the line valid JSON
# Outputs:   JSON log line on stderr
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date -Iseconds)
  printf '{"timestamp":"%s","level":"%s","service":"%s","operation":"cluster-join","message":"%s"}\n' \
    "$timestamp" \
    "$(_json_escape "$level")" \
    "$(_json_escape "${SERVICE_NAME:-}")" \
    "$(_json_escape "$message")" >&2
}
# Short-circuit when a previous run already left the join marker behind.
# The marker content (a timestamp written at join time) is logged when the
# file can be read; either way the script ends with status 2.
if [[ -f "$FIRST_BOOT_MARKER" ]]; then
  log "INFO" "Cluster join marker found, already joined"

  if [[ -r "$FIRST_BOOT_MARKER" ]]; then
    MARKER_INFO=$(<"$FIRST_BOOT_MARKER")
    log "INFO" "Join timestamp: $MARKER_INFO"
  fi

  exit 2
fi
# Wait for local service to be healthy
log "INFO" "Waiting for local $SERVICE_NAME to be healthy"

# Use health-check.sh script if available, otherwise inline health check
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -x "$SCRIPT_DIR/health-check.sh" ]]; then
  # NOTE(review): 120 and 5 are presumably timeout and poll interval in
  # seconds (they mirror the inline fallback below) - confirm against
  # health-check.sh.
  if ! "$SCRIPT_DIR/health-check.sh" "$SERVICE_NAME" "$HEALTH_URL" 120 5; then
    log "ERROR" "Local $SERVICE_NAME failed health check"
    exit 1
  fi
else
  # Inline health check: poll HEALTH_URL every 5s until it answers HTTP 200
  # or HEALTH_TIMEOUT seconds have elapsed.
  HEALTH_TIMEOUT=120
  HEALTH_START=$(date +%s)

  # The curl options do not change between polls, so build them once.
  CURL_FLAGS=(-s -o /dev/null -w "%{http_code}" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
  if [[ "$CURL_INSECURE" == "1" ]]; then
    CURL_FLAGS+=(-k)
  fi

  while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - HEALTH_START))
    if [[ $ELAPSED -ge $HEALTH_TIMEOUT ]]; then
      log "ERROR" "Health check timeout after ${ELAPSED}s"
      exit 1
    fi

    # On a transport failure curl itself already prints "000" via -w, so
    # overwrite the value instead of appending a second "000" to it.
    HTTP_CODE=$(curl "${CURL_FLAGS[@]}" "$HEALTH_URL" 2>/dev/null) || HTTP_CODE="000"

    if [[ "$HTTP_CODE" == "200" ]]; then
      log "INFO" "Local $SERVICE_NAME is healthy"
      break
    fi

    log "WARN" "Waiting for $SERVICE_NAME health (${ELAPSED}s elapsed)"
    sleep 5
  done
fi
# Parse join payload to extract node info for logging.
# The node id is informational only, so a payload jq cannot process must not
# abort the join under `set -euo pipefail`; fall back to "unknown" instead.
if command -v jq &> /dev/null; then
  if ! NODE_ID=$(printf '%s' "$JOIN_PAYLOAD" | jq -r '.id // .node_id // "unknown"' 2>/dev/null); then
    log "WARN" "Could not parse join payload for node id"
    NODE_ID="unknown"
  fi
  # jq prints nothing for empty/whitespace-only input; keep the log readable.
  NODE_ID="${NODE_ID:-unknown}"
  log "INFO" "Attempting to join cluster as node: $NODE_ID"
else
  log "INFO" "Attempting to join cluster (jq not available for payload parsing)"
fi
# Cluster join loop with retry logic.
# POSTs JOIN_PAYLOAD to "$LEADER_URL/admin/member/add" up to MAX_ATTEMPTS
# times, sleeping RETRY_DELAY between attempts.
#   HTTP 200/201 -> write marker, exit 0
#   HTTP 409     -> already a member: write marker, exit 2
#   anything else (including transport errors, reported as 000) -> retry
log "INFO" "Starting cluster join attempts (max: $MAX_ATTEMPTS, delay: ${RETRY_DELAY}s)"

# Private scratch directory, removed on every exit path (including failures
# under `set -e` and normal `exit` calls below).
# NOTE(review): this installs the script's EXIT trap; merge if another EXIT
# trap is ever added elsewhere in the file.
JOIN_TMPDIR=$(mktemp -d)
trap 'rm -rf -- "${JOIN_TMPDIR:?}"' EXIT
RESPONSE_FILE="$JOIN_TMPDIR/response"
PAYLOAD_FILE="$JOIN_TMPDIR/payload"

# The payload is sent from a file so it does not show up in curl's argv;
# it never changes between attempts, so write it once.
printf '%s' "$JOIN_PAYLOAD" > "$PAYLOAD_FILE"

CURL_FLAGS=(-s -w "%{http_code}" -o "$RESPONSE_FILE" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
  CURL_FLAGS+=(-k)
fi

for ((ATTEMPT = 1; ATTEMPT <= MAX_ATTEMPTS; ATTEMPT++)); do
  log "INFO" "Cluster join attempt $ATTEMPT/$MAX_ATTEMPTS"

  # Start from an empty response file so a failed attempt never reports the
  # body of an earlier one.
  : > "$RESPONSE_FILE"

  # Make join request to leader. When curl fails it has already printed
  # "000" through -w, so replace the captured value rather than appending a
  # second "000" (which would never match the connection-error case below).
  HTTP_CODE=$(curl "${CURL_FLAGS[@]}" \
    -X POST "$LEADER_URL/admin/member/add" \
    -H "Content-Type: application/json" \
    --data-binary "@$PAYLOAD_FILE" 2>/dev/null) || HTTP_CODE="000"
  RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")

  log "INFO" "Join request response: HTTP $HTTP_CODE"

  # Check response
  case "$HTTP_CODE" in
    200 | 201)
      log "INFO" "Successfully joined cluster"
      # Create join marker
      mkdir -p "$(dirname "$FIRST_BOOT_MARKER")"
      date -Iseconds > "$FIRST_BOOT_MARKER"
      # Log response details if available
      if [[ -n "$RESPONSE_BODY" ]]; then
        log "INFO" "Join response: $RESPONSE_BODY"
      fi
      exit 0
      ;;
    409)
      # Already member of cluster
      log "WARN" "Node already member of cluster (HTTP 409)"
      # Create join marker to prevent future attempts
      mkdir -p "$(dirname "$FIRST_BOOT_MARKER")"
      date -Iseconds > "$FIRST_BOOT_MARKER"
      exit 2
      ;;
    000)
      log "ERROR" "Join request failed: connection error to leader $LEADER_URL"
      ;;
    *)
      log "ERROR" "Join request failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
      ;;
  esac

  # Only failures reach this point; pause unless this was the last attempt.
  if ((ATTEMPT < MAX_ATTEMPTS)); then
    log "INFO" "Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  fi
done

# Max attempts exhausted
log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
exit 1