#!/usr/bin/env bash
set -euo pipefail

# cluster-join.sh - Reusable script for cluster join logic
#
# Usage:
#   cluster-join.sh <service_name> <health_url> <leader_url> <join_payload> \
#                   [max_attempts] [retry_delay]
#
# Arguments:
#   service_name - Name of the service (e.g., chainfire, flaredb)
#   health_url   - Local health endpoint URL
#   leader_url   - Leader's cluster management URL
#   join_payload - JSON payload for join request
#   max_attempts - Maximum number of join attempts (default: 5)
#   retry_delay  - Delay between retries in seconds (default: 10)
#
# Environment:
#   CURL_CONNECT_TIMEOUT - value passed to curl --connect-timeout (default: 5)
#   CURL_MAX_TIME        - value passed to curl --max-time (default: 15)
#   CURL_INSECURE        - when "1", curl is run with -k (default: 1)
#
# Returns:
#   0 - Successfully joined cluster
#   1 - Local health check failed, or join failed after max attempts
#   2 - Already joined (join marker file present, or leader answered HTTP 409)
#   3 - Invalid arguments

SERVICE_NAME="${1:-}"
HEALTH_URL="${2:-}"
LEADER_URL="${3:-}"
JOIN_PAYLOAD="${4:-}"
MAX_ATTEMPTS="${5:-5}"
RETRY_DELAY="${6:-10}"

CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-15}"
CURL_INSECURE="${CURL_INSECURE:-1}"

FIRST_BOOT_MARKER="/var/lib/first-boot-automation/.${SERVICE_NAME}-joined"

# Inline health-check budget, used only when health-check.sh is not available.
HEALTH_TIMEOUT=120
HEALTH_INTERVAL=5

# Scratch directory for the join request/response; removed by the EXIT trap.
TMP_DIR=""

#######################################
# Escape a string so it can be embedded inside a JSON string literal.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
json_escape() {
  local text="$1"
  text=${text//\\/\\\\} # backslashes first, so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Emit one JSON log line on stderr.
# Globals:   SERVICE_NAME (read)
# Arguments: $1 - level (INFO, WARN, ERROR), $2 - message
#######################################
log() {
  local level="$1"
  local message="$2"
  local timestamp
  timestamp=$(date -Iseconds)
  printf '{"timestamp":"%s","level":"%s","service":"%s","operation":"cluster-join","message":"%s"}\n' \
    "$timestamp" "$level" "$(json_escape "$SERVICE_NAME")" \
    "$(json_escape "$message")" >&2
}

#######################################
# Run curl with the shared flags plus the given arguments and print the
# HTTP status code.
# Any curl failure is reported as exactly "000". The -w output is discarded
# in that case rather than appended to, so the result is always one code.
# Globals:   CURL_COMMON_FLAGS (read)
# Arguments: extra curl arguments (output target, URL, headers, ...)
# Outputs:   HTTP status code on stdout
#######################################
http_status() {
  local code
  if code=$(curl "${CURL_COMMON_FLAGS[@]}" "$@" 2>/dev/null); then
    printf '%s' "$code"
  else
    printf '000'
  fi
}

#######################################
# Record the join time in the marker file, creating its directory if needed.
# Globals:   FIRST_BOOT_MARKER (read)
# Returns:   non-zero if the directory or the file could not be written
#######################################
write_join_marker() {
  mkdir -p -- "$(dirname -- "$FIRST_BOOT_MARKER")" \
    && date -Iseconds > "$FIRST_BOOT_MARKER"
}

# Remove the scratch directory, if one was created.
cleanup() {
  if [[ -n "$TMP_DIR" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
}
trap cleanup EXIT

# Validate arguments
if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" || -z "$LEADER_URL" || -z "$JOIN_PAYLOAD" ]]; then
  echo "ERROR: Missing required arguments" >&2
  echo "Usage: $0 <service_name> <health_url> <leader_url> <join_payload> [max_attempts] [retry_delay]" >&2
  exit 3
fi

# max_attempts drives an arithmetic loop, so reject anything non-numeric.
if [[ ! "$MAX_ATTEMPTS" =~ ^[0-9]+$ ]]; then
  echo "ERROR: max_attempts must be a non-negative integer, got: $MAX_ATTEMPTS" >&2
  exit 3
fi
# Force base 10 so values such as "08" are not parsed as invalid octal.
MAX_ATTEMPTS=$((10#$MAX_ATTEMPTS))

# curl flags shared by the health probe and the join request.
CURL_COMMON_FLAGS=(
  -s -w '%{http_code}'
  --connect-timeout "$CURL_CONNECT_TIMEOUT"
  --max-time "$CURL_MAX_TIME"
)
if [[ "$CURL_INSECURE" == "1" ]]; then
  CURL_COMMON_FLAGS+=(-k)
fi

# Check if already joined (marker file exists)
if [[ -f "$FIRST_BOOT_MARKER" ]]; then
  log "INFO" "Cluster join marker found, already joined"
  if [[ -r "$FIRST_BOOT_MARKER" ]]; then
    MARKER_INFO=$(<"$FIRST_BOOT_MARKER")
    log "INFO" "Join timestamp: $MARKER_INFO"
  fi
  exit 2
fi

# Wait for local service to be healthy
log "INFO" "Waiting for local $SERVICE_NAME to be healthy"

# Use health-check.sh script if available, otherwise inline health check
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
if [[ -x "$SCRIPT_DIR/health-check.sh" ]]; then
  if ! "$SCRIPT_DIR/health-check.sh" "$SERVICE_NAME" "$HEALTH_URL" 120 5; then
    log "ERROR" "Local $SERVICE_NAME failed health check"
    exit 1
  fi
else
  # Inline health check: poll until HTTP 200 or the timeout elapses.
  HEALTH_START=$(date +%s)
  while true; do
    CURRENT_TIME=$(date +%s)
    ELAPSED=$((CURRENT_TIME - HEALTH_START))

    if ((ELAPSED >= HEALTH_TIMEOUT)); then
      log "ERROR" "Health check timeout after ${ELAPSED}s"
      exit 1
    fi

    HTTP_CODE=$(http_status -o /dev/null "$HEALTH_URL")
    if [[ "$HTTP_CODE" == "200" ]]; then
      log "INFO" "Local $SERVICE_NAME is healthy"
      break
    fi

    log "WARN" "Waiting for $SERVICE_NAME health (${ELAPSED}s elapsed)"
    sleep "$HEALTH_INTERVAL"
  done
fi

# Parse join payload to extract node info for logging.
# This is informational only, so a payload jq cannot parse must not abort
# the script under `set -e`; fall back to "unknown" instead.
if command -v jq &> /dev/null; then
  NODE_ID=$(jq -r '.id // .node_id // "unknown"' 2>/dev/null <<< "$JOIN_PAYLOAD") \
    || NODE_ID="unknown"
  log "INFO" "Attempting to join cluster as node: $NODE_ID"
else
  log "INFO" "Attempting to join cluster (jq not available for payload parsing)"
fi

# Stage the payload once; the response file is reused across attempts.
TMP_DIR=$(mktemp -d)
RESPONSE_FILE="$TMP_DIR/response"
PAYLOAD_FILE="$TMP_DIR/payload"
printf '%s' "$JOIN_PAYLOAD" > "$PAYLOAD_FILE"

# Cluster join loop with retry logic
log "INFO" "Starting cluster join attempts (max: $MAX_ATTEMPTS, delay: ${RETRY_DELAY}s)"

for ((ATTEMPT = 1; ATTEMPT <= MAX_ATTEMPTS; ATTEMPT++)); do
  log "INFO" "Cluster join attempt $ATTEMPT/$MAX_ATTEMPTS"

  # Truncate so a failed request cannot surface the previous attempt's body.
  : > "$RESPONSE_FILE"

  # Make join request to leader
  HTTP_CODE=$(http_status -o "$RESPONSE_FILE" \
    -X POST "$LEADER_URL/admin/member/add" \
    -H "Content-Type: application/json" \
    --data-binary "@$PAYLOAD_FILE")

  RESPONSE_BODY=$(cat -- "$RESPONSE_FILE" 2>/dev/null) || RESPONSE_BODY=""

  log "INFO" "Join request response: HTTP $HTTP_CODE"

  # Check response
  case "$HTTP_CODE" in
    200 | 201)
      log "INFO" "Successfully joined cluster"

      # Create join marker
      if ! write_join_marker; then
        log "ERROR" "Joined cluster but could not write marker $FIRST_BOOT_MARKER"
        exit 1
      fi

      # Log response details if available
      if [[ -n "$RESPONSE_BODY" ]]; then
        log "INFO" "Join response: $RESPONSE_BODY"
      fi

      exit 0
      ;;
    409)
      # Already member of cluster
      log "WARN" "Node already member of cluster (HTTP 409)"

      # Create join marker to prevent future attempts
      if ! write_join_marker; then
        log "ERROR" "Could not write marker $FIRST_BOOT_MARKER"
        exit 1
      fi

      exit 2
      ;;
    000)
      log "ERROR" "Join request failed: connection error to leader $LEADER_URL"
      ;;
    *)
      log "ERROR" "Join request failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
      ;;
  esac

  # Only failed attempts reach this point; wait unless this was the last one.
  if ((ATTEMPT < MAX_ATTEMPTS)); then
    log "INFO" "Retrying in ${RETRY_DELAY}s..."
    sleep "$RETRY_DELAY"
  fi
done

# Max attempts exhausted
log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
exit 1