181 lines
5.8 KiB
Bash
Executable file
181 lines
5.8 KiB
Bash
Executable file
#!/usr/bin/env bash

# Strict mode: abort on failing commands, unset variables and failing
# pipeline stages.
set -euo pipefail

# cluster-join.sh - Reusable script for cluster join logic
# Usage: cluster-join.sh <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]
#
# Arguments:
#   service_name - Name of the service (e.g., chainfire, flaredb)
#   health_url   - Local health endpoint URL
#   leader_url   - Leader's cluster management URL
#   join_payload - JSON payload for join request
#   max_attempts - Maximum number of join attempts (default: 5)
#   retry_delay  - Delay between retries in seconds (default: 10)
#
# Environment:
#   CURL_CONNECT_TIMEOUT - value passed to curl --connect-timeout (default: 5)
#   CURL_MAX_TIME        - value passed to curl --max-time (default: 15)
#   CURL_INSECURE        - when "1", curl is run with -k (default: 1)
#
# Returns:
#   0 - Successfully joined cluster
#   1 - Local health check failed, or failed to join cluster after max attempts
#   2 - Already joined (join marker file exists, or leader answered HTTP 409)
#   3 - Invalid arguments
|
|
|
|
# Positional arguments. The first four are required (checked further down);
# the last two fall back to defaults when omitted or empty.
SERVICE_NAME="${1:-}"
HEALTH_URL="${2:-}"
LEADER_URL="${3:-}"
JOIN_PAYLOAD="${4:-}"
MAX_ATTEMPTS="${5:-5}"
RETRY_DELAY="${6:-10}"

# curl tuning, overridable through the environment.
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-15}"
# NOTE(review): the default of "1" makes every curl call in this script pass
# -k, i.e. TLS certificate verification is off unless the caller exports
# CURL_INSECURE=0. Left unchanged for compatibility - confirm this is intended.
CURL_INSECURE="${CURL_INSECURE:-1}"

# Per-service marker file; its presence means this node has already joined.
FIRST_BOOT_MARKER="/var/lib/first-boot-automation/.${SERVICE_NAME}-joined"
|
|
|
|
# Validate arguments
if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" || -z "$LEADER_URL" || -z "$JOIN_PAYLOAD" ]]; then
  echo "ERROR: Missing required arguments" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi

# max_attempts is later handed to `seq` and to integer comparisons. A value
# that is not a plain decimal integer would make the join loop run zero times
# and end in a misleading "failed to join" (exit 1), so reject it here.
# Leading zeros are refused because bash arithmetic would read them as octal.
if [[ ! "$MAX_ATTEMPTS" =~ ^(0|[1-9][0-9]*)$ ]]; then
  echo "ERROR: max_attempts must be a non-negative integer, got: $MAX_ATTEMPTS" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi
|
|
|
|
# Escape a string so it can sit inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab; any other
# control characters are passed through unchanged.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
_json_escape() {
  local text="$1"
  text=${text//\\/\\\\} # must run first so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

# Logging function with JSON output
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - log level (e.g. INFO, WARN, ERROR)
#            $2 - free-form message; escaped before being embedded
# Outputs:   one JSON object per call on stderr
log() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date -Iseconds)

  # Values are escaped so that quotes, backslashes or newlines in a message
  # (for example an HTTP response body) cannot break the JSON line.
  printf '{"timestamp":"%s","level":"%s","service":"%s","operation":"cluster-join","message":"%s"}\n' \
    "$timestamp" \
    "$(_json_escape "$level")" \
    "$(_json_escape "$SERVICE_NAME")" \
    "$(_json_escape "$message")" >&2
}
|
|
|
|
# Check if already joined (marker file exists)
if [[ -f "$FIRST_BOOT_MARKER" ]]; then
  log "INFO" "Cluster join marker found, already joined"

  # The marker's content is logged as the join timestamp when it is readable.
  if [[ -r "$FIRST_BOOT_MARKER" ]]; then
    MARKER_INFO=$(cat "$FIRST_BOOT_MARKER")
    log "INFO" "Join timestamp: $MARKER_INFO"
  fi

  # Exit code 2 signals "already joined" to the caller.
  exit 2
fi
|
|
|
|
# Wait for local service to be healthy
log "INFO" "Waiting for local $SERVICE_NAME to be healthy"

# Use health-check.sh script if available, otherwise inline health check
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -x "$SCRIPT_DIR/health-check.sh" ]]; then
  # NOTE(review): 120 and 5 presumably are timeout and poll interval in
  # seconds, mirroring the inline fallback below - confirm in health-check.sh.
  if ! "$SCRIPT_DIR/health-check.sh" "$SERVICE_NAME" "$HEALTH_URL" 120 5; then
    log "ERROR" "Local $SERVICE_NAME failed health check"
    exit 1
  fi
else
  # Inline health check: poll HEALTH_URL every 5s until it answers HTTP 200,
  # giving up once HEALTH_TIMEOUT seconds have elapsed.
  HEALTH_TIMEOUT=120
  HEALTH_START=$(date +%s)

  # The flags do not change between polls, so build them once.
  CURL_FLAGS=(-s -o /dev/null -w "%{http_code}" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
  if [[ "$CURL_INSECURE" == "1" ]]; then
    CURL_FLAGS+=(-k)
  fi

  while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - HEALTH_START))

    if [[ $ELAPSED -ge $HEALTH_TIMEOUT ]]; then
      log "ERROR" "Health check timeout after ${ELAPSED}s"
      exit 1
    fi

    # curl still prints the -w status (000 when no response arrived) when it
    # fails, so overwrite the value on failure instead of appending a second
    # "000" to whatever curl already printed.
    if ! HTTP_CODE=$(curl "${CURL_FLAGS[@]}" "$HEALTH_URL" 2>/dev/null); then
      HTTP_CODE="000"
    fi

    if [[ "$HTTP_CODE" == "200" ]]; then
      log "INFO" "Local $SERVICE_NAME is healthy"
      break
    fi

    log "WARN" "Waiting for $SERVICE_NAME health (${ELAPSED}s elapsed)"
    sleep 5
  done
fi
|
|
|
|
# Parse join payload to extract node info for logging
if command -v jq &> /dev/null; then
  # The node id is only used in the log line below. Without the fallback a
  # payload jq cannot process would abort the whole script under `set -e`
  # with jq's exit status, before any join attempt is made.
  if ! NODE_ID=$(jq -r '.id // .node_id // "unknown"' <<< "$JOIN_PAYLOAD" 2>/dev/null); then
    NODE_ID="unknown"
    log "WARN" "Could not parse join payload with jq; node id unknown"
  fi
  log "INFO" "Attempting to join cluster as node: $NODE_ID"
else
  log "INFO" "Attempting to join cluster (jq not available for payload parsing)"
fi
|
|
|
|
# Cluster join loop with retry logic
log "INFO" "Starting cluster join attempts (max: $MAX_ATTEMPTS, delay: ${RETRY_DELAY}s)"

# Record that this node has joined so later runs exit early.
# Globals: FIRST_BOOT_MARKER (read); the file receives the current timestamp.
write_join_marker() {
  mkdir -p "$(dirname "$FIRST_BOOT_MARKER")"
  date -Iseconds > "$FIRST_BOOT_MARKER"
}

# Scratch files are created once and removed by the EXIT trap, so they are
# cleaned up on every exit path of the script rather than only after a
# completed attempt.
RESPONSE_FILE=""
PAYLOAD_FILE=""
trap 'rm -f -- "$RESPONSE_FILE" "$PAYLOAD_FILE"' EXIT
RESPONSE_FILE=$(mktemp)
PAYLOAD_FILE=$(mktemp)
# The payload is sent from a file (--data-binary @file) so it is transmitted
# byte-for-byte.
printf '%s' "$JOIN_PAYLOAD" > "$PAYLOAD_FILE"

# Flags shared by every join request; the response body goes to RESPONSE_FILE
# and the HTTP status is printed on stdout via -w.
CURL_FLAGS=(-s -w "%{http_code}" -o "$RESPONSE_FILE" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
  CURL_FLAGS+=(-k)
fi

for ATTEMPT in $(seq 1 "$MAX_ATTEMPTS"); do
  log "INFO" "Cluster join attempt $ATTEMPT/$MAX_ATTEMPTS"

  # Start from an empty body so a failed request cannot report the previous
  # attempt's response.
  : > "$RESPONSE_FILE"

  # Make join request to leader.
  # curl prints the -w status even when it fails (000 if no response was
  # received). Appending another "000" on failure would yield "000000" and
  # the connection-error branch below would never match, so the value is
  # replaced instead.
  if ! HTTP_CODE=$(curl "${CURL_FLAGS[@]}" \
    -X POST "$LEADER_URL/admin/member/add" \
    -H "Content-Type: application/json" \
    --data-binary "@$PAYLOAD_FILE" 2>/dev/null); then
    HTTP_CODE="000"
  fi

  RESPONSE_BODY=$(cat -- "$RESPONSE_FILE" 2>/dev/null) || RESPONSE_BODY=""

  log "INFO" "Join request response: HTTP $HTTP_CODE"

  # Check response
  case "$HTTP_CODE" in
    200 | 201)
      log "INFO" "Successfully joined cluster"

      # Create join marker
      write_join_marker

      # Log response details if available
      if [[ -n "$RESPONSE_BODY" ]]; then
        log "INFO" "Join response: $RESPONSE_BODY"
      fi

      exit 0
      ;;
    409)
      # Already member of cluster
      log "WARN" "Node already member of cluster (HTTP 409)"

      # Create join marker to prevent future attempts
      write_join_marker

      exit 2
      ;;
    000)
      log "ERROR" "Join request failed: connection error to leader $LEADER_URL"
      ;;
    *)
      log "ERROR" "Join request failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
      ;;
  esac

  # Both failure kinds retry the same way; no sleep after the final attempt.
  if [[ $ATTEMPT -lt $MAX_ATTEMPTS ]]; then
    log "INFO" "Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  fi
done

# Max attempts exhausted
log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
exit 1
|