# NixOS module: first-boot cluster join automation.
#
# On first boot this module joins the node's Chainfire / FlareDB Raft clusters
# and bootstraps an IAM admin user, driven by a JSON config file (either a
# hand-provisioned /etc/nixos/secrets/cluster-config.json or one generated
# from a nix-nos topology when `useNixNOS` is set). Idempotency is tracked via
# marker files under /var/lib/first-boot-automation.
{ config, lib, pkgs, ... }:

let
  cfg = config.services.first-boot-automation;

  configFilePath = toString cfg.configFile;

  # Path relative to /etc (suitable as an environment.etc attribute name),
  # or null when cfg.configFile does not live under /etc.
  configEtcPath =
    if lib.hasPrefix "/etc/" configFilePath
    then lib.removePrefix "/etc/" configFilePath
    else null;

  # True when the (optional) plasmacloud module already generates a node
  # cluster config; in that case we must not also write our own to /etc.
  hasPlasmacloudManagedClusterConfig =
    (config ? plasmacloud)
    && (config.plasmacloud ? cluster)
    && (config.plasmacloud.cluster.generated.nodeClusterConfig or null) != null;

  availableNixNOSClusters = builtins.attrNames (config.nix-nos.clusters or {});

  # Resolve which nix-nos cluster to use, in order of preference:
  #   1. the explicitly configured cfg.nixnosClusterName, if it exists;
  #   2. the plasmacloud cluster name, if that module is enabled and matches;
  #   3. the single defined cluster, if there is exactly one;
  #   4. fall back to cfg.nixnosClusterName (caught by an assertion below).
  resolvedNixNOSClusterName =
    if builtins.elem cfg.nixnosClusterName availableNixNOSClusters
    then cfg.nixnosClusterName
    else if (config ? plasmacloud)
            && (config.plasmacloud ? cluster)
            && (config.plasmacloud.cluster.enable or false)
            && builtins.elem config.plasmacloud.cluster.name availableNixNOSClusters
    then config.plasmacloud.cluster.name
    else if builtins.length availableNixNOSClusters == 1
    then builtins.head availableNixNOSClusters
    else cfg.nixnosClusterName;

  # Effective switch: nix-nos mode requires the nix-nos module to be enabled
  # and at least one cluster to be defined.
  useNixNOS =
    cfg.useNixNOS
    && (config.nix-nos.enable or false)
    && (builtins.length availableNixNOSClusters) > 0;

  # Cluster config JSON (as a Nix value) generated from the nix-nos topology
  # for this host, or null when nix-nos mode is off.
  nixNOSClusterConfig =
    if useNixNOS
    then config.nix-nos.generateClusterConfig {
      hostname = config.networking.hostName;
      clusterName = resolvedNixNOSClusterName;
    }
    else null;

  # Helper function to create a cluster join systemd service.
  #
  # Arguments:
  #   serviceName      - systemd unit to order after, and tag for log/marker files
  #   healthUrl        - local health endpoint polled before attempting a join
  #   leaderUrlKey     - service-specific JSON key for the leader URL
  #   defaultLeaderUrl - fallback leader URL when the config has neither key
  #   joinPath         - leader-relative join API path, or null for services
  #                      that form their cluster from static peers (no join call)
  #   port             - service port; port+1 is the default raft address port
  #   description      - human-readable name used in the unit description
  mkClusterJoinService = { serviceName, healthUrl, leaderUrlKey, defaultLeaderUrl, joinPath ? null, port, description ? "" }: {
    description = "Cluster Join for ${description}";
    after = [ "network-online.target" "${serviceName}.service" ];
    wants = [ "network-online.target" ];
    wantedBy = [ "multi-user.target" ];
    # Only run if first-boot automation is enabled (config file present).
    unitConfig = {
      ConditionPathExists = cfg.configFile;
    };
    serviceConfig = {
      Type = "oneshot";
      RemainAfterExit = true;
      # NOTE(review): systemd restricts Restart= for Type=oneshot units
      # (only allowed on newer systemd, and not together with
      # RemainAfterExit=yes on some versions) — confirm against the systemd
      # version shipped by this NixOS release.
      Restart = "on-failure";
      RestartSec = "30s";
    };
    script = ''
      set -euo pipefail

      # Logging function
      log() {
        echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"${serviceName}-join\",\"level\":\"$1\",\"message\":\"$2\"}"
      }

      log "INFO" "Starting cluster join process for ${serviceName}"

      # Read cluster config at runtime
      CONFIG_FILE="${cfg.configFile}"
      if [ -f "$CONFIG_FILE" ]; then
        IS_BOOTSTRAP=$(${pkgs.jq}/bin/jq -r '.bootstrap // false' "$CONFIG_FILE")
        LEADER_URL=$(${pkgs.jq}/bin/jq -r '.${leaderUrlKey} // .leader_url // "${defaultLeaderUrl}"' "$CONFIG_FILE")
        NODE_ID=$(${pkgs.jq}/bin/jq -r '.node_id // "unknown"' "$CONFIG_FILE")
        RAFT_ADDR=$(${pkgs.jq}/bin/jq -r '.raft_addr // "127.0.0.1:${toString (port + 1)}"' "$CONFIG_FILE")
        log "INFO" "Loaded config: bootstrap=$IS_BOOTSTRAP, node_id=$NODE_ID"
      else
        log "ERROR" "Config file not found: $CONFIG_FILE"
        exit 1
      fi

      # Wait for local service health
      log "INFO" "Waiting for local ${serviceName} to be healthy"
      HEALTH_TIMEOUT=120
      HEALTH_START=$(date +%s)
      while true; do
        CURRENT_TIME=$(date +%s)
        ELAPSED=$((CURRENT_TIME - HEALTH_START))
        if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
          log "ERROR" "Health check timeout after ''${ELAPSED}s"
          exit 1
        fi
        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "${healthUrl}" 2>/dev/null || echo "000")
        if [ "$HTTP_CODE" = "200" ]; then
          log "INFO" "Local ${serviceName} is healthy"
          break
        fi
        log "WARN" "Waiting for ${serviceName} health (''${ELAPSED}s elapsed, HTTP $HTTP_CODE)"
        sleep 5
      done

      # Check if this is a bootstrap node
      if [ "$IS_BOOTSTRAP" = "true" ]; then
        log "INFO" "Bootstrap node detected, cluster already initialized"
        # Create marker to indicate initialization complete
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-initialized"
        exit 0
      fi

      # Check if already joined
      if [ -f "/var/lib/first-boot-automation/.${serviceName}-joined" ]; then
        log "INFO" "Already joined cluster (marker exists)"
        exit 0
      fi

      ${if joinPath == null then ''
        log "INFO" "No join API configured for ${serviceName}; assuming static-peer startup"
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
        exit 0
      '' else ""}

      # Join existing cluster
      log "INFO" "Attempting to join existing cluster"
      log "INFO" "Leader URL: $LEADER_URL, Node ID: $NODE_ID, Raft Addr: $RAFT_ADDR"

      MAX_ATTEMPTS=5
      RETRY_DELAY=10

      for ATTEMPT in $(seq 1 $MAX_ATTEMPTS); do
        log "INFO" "Join attempt $ATTEMPT/$MAX_ATTEMPTS"

        # Make join request
        RESPONSE_FILE=$(mktemp)
        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -w "%{http_code}" -o "$RESPONSE_FILE" \
          -X POST "$LEADER_URL${toString joinPath}" \
          -H "Content-Type: application/json" \
          -d "{\"id\":\"$NODE_ID\",\"raft_addr\":\"$RAFT_ADDR\"}" 2>/dev/null || echo "000")
        # BUGFIX: interpolate via toString — a bare ''${joinPath} is a Nix
        # *evaluation* error when joinPath is null, which defeated the
        # joinPath == null branch above (toString null evaluates to "").
        RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")
        rm -f "$RESPONSE_FILE"

        log "INFO" "Join request response: HTTP $HTTP_CODE"

        if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "201" ]; then
          log "INFO" "Successfully joined cluster"
          # Create join marker
          mkdir -p /var/lib/first-boot-automation
          date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
          exit 0
        elif [ "$HTTP_CODE" = "409" ]; then
          log "WARN" "Already member of cluster (HTTP 409)"
          mkdir -p /var/lib/first-boot-automation
          date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
          exit 0
        else
          log "ERROR" "Join failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
          if [ $ATTEMPT -lt $MAX_ATTEMPTS ]; then
            log "INFO" "Retrying in ''${RETRY_DELAY}s..."
            sleep $RETRY_DELAY
          fi
        fi
      done

      log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
      exit 1
    '';
  };
in {
  options.services.first-boot-automation = {
    enable = lib.mkEnableOption "first-boot cluster join automation";

    useNixNOS = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = "Use nix-nos topology for cluster configuration instead of cluster-config.json";
    };

    nixnosClusterName = lib.mkOption {
      type = lib.types.str;
      default = "plasmacloud";
      description = "Name of the nix-nos cluster to use (only used when useNixNOS is true)";
    };

    configFile = lib.mkOption {
      type = lib.types.path;
      default = "/etc/nixos/secrets/cluster-config.json";
      description = "Path to cluster configuration JSON file (used when useNixNOS is false)";
    };

    enableChainfire = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable Chainfire cluster join automation";
    };

    enableFlareDB = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable FlareDB cluster join automation";
    };

    enableIAM = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable IAM initial setup automation";
    };

    enableHealthCheck = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable cluster health check service";
    };

    chainfirePort = lib.mkOption {
      type = lib.types.port;
      default = 2379;
      description = "Chainfire API port";
    };

    flaredbPort = lib.mkOption {
      type = lib.types.port;
      default = 2479;
      description = "FlareDB API port";
    };

    iamPort = lib.mkOption {
      type = lib.types.port;
      default = config.services.iam.httpPort;
      description = "IAM API port";
    };
  };

  config = lib.mkIf cfg.enable {
    assertions = [
      {
        assertion = (!cfg.useNixNOS) || (config.nix-nos.enable or false);
        message = "services.first-boot-automation.useNixNOS requires nix-nos.enable = true";
      }
      {
        assertion = (!cfg.useNixNOS) || ((builtins.length availableNixNOSClusters) > 0);
        message = "services.first-boot-automation.useNixNOS requires at least one nix-nos.clusters entry";
      }
      {
        assertion = (!cfg.useNixNOS) || (configEtcPath != null);
        message = "services.first-boot-automation.useNixNOS requires services.first-boot-automation.configFile to live under /etc";
      }
      {
        assertion = (!cfg.useNixNOS) || builtins.elem resolvedNixNOSClusterName availableNixNOSClusters;
        message = "services.first-boot-automation.useNixNOS could not resolve nix-nos cluster '${cfg.nixnosClusterName}' (available: ${lib.concatStringsSep ", " availableNixNOSClusters})";
      }
    ];

    # In nix-nos mode, materialize the generated cluster config under /etc —
    # unless plasmacloud already manages an equivalent file.
    environment.etc = lib.mkIf (useNixNOS && !hasPlasmacloudManagedClusterConfig) (
      lib.optionalAttrs (configEtcPath != null) {
        "${configEtcPath}" = {
          text = builtins.toJSON nixNOSClusterConfig;
          mode = "0600";
        };
      }
    );

    # Chainfire cluster join service
    systemd.services.chainfire-cluster-join = lib.mkIf cfg.enableChainfire (
      mkClusterJoinService {
        serviceName = "chainfire";
        healthUrl = "http://localhost:8081/health"; # Health endpoint on admin port
        leaderUrlKey = "chainfire_leader_url";
        defaultLeaderUrl = "http://localhost:8081";
        joinPath = "/admin/member/add";
        port = cfg.chainfirePort;
        description = "Chainfire";
      }
    );

    # FlareDB cluster join service (ordered after the Chainfire join).
    systemd.services.flaredb-cluster-join = lib.mkIf cfg.enableFlareDB (
      mkClusterJoinService {
        serviceName = "flaredb";
        healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
        leaderUrlKey = "flaredb_leader_url";
        defaultLeaderUrl = "http://localhost:8082";
        joinPath = "/admin/member/add";
        port = cfg.flaredbPort;
        description = "FlareDB";
      } // {
        after = [ "network-online.target" "flaredb.service" "chainfire-cluster-join.service" ];
        requires = [ "chainfire-cluster-join.service" ];
      }
    );

    # IAM initial setup service: waits for IAM health, then creates a
    # bootstrap admin user if one does not already exist.
    systemd.services.iam-initial-setup = lib.mkIf cfg.enableIAM {
      description = "IAM Initial Setup";
      after = [ "network-online.target" "iam.service" "flaredb-cluster-join.service" ];
      wants = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      unitConfig = {
        ConditionPathExists = cfg.configFile;
      };
      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
        # NOTE(review): see the oneshot/Restart compatibility note on
        # mkClusterJoinService above.
        Restart = "on-failure";
        RestartSec = "30s";
      };
      script = ''
        set -euo pipefail

        log() {
          echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"iam-setup\",\"level\":\"$1\",\"message\":\"$2\"}"
        }

        log "INFO" "Starting IAM initial setup"

        # Check if already initialized
        if [ -f "/var/lib/first-boot-automation/.iam-initialized" ]; then
          log "INFO" "IAM already initialized (marker exists)"
          exit 0
        fi

        # Wait for IAM health
        log "INFO" "Waiting for IAM to be healthy"
        HEALTH_TIMEOUT=120
        HEALTH_START=$(date +%s)
        while true; do
          CURRENT_TIME=$(date +%s)
          ELAPSED=$((CURRENT_TIME - HEALTH_START))
          if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
            log "ERROR" "Health check timeout"
            exit 1
          fi
          HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/health" 2>/dev/null || echo "000")
          if [ "$HTTP_CODE" = "200" ]; then
            log "INFO" "IAM is healthy"
            break
          fi
          sleep 5
        done

        # Check if admin user exists
        log "INFO" "Checking for existing admin user"
        ADMIN_HEADER=()
        # NOTE(review): the admin token is interpolated into this script at
        # build time, so it lands in the world-readable Nix store — consider
        # a runtime secret file instead.
        ${lib.optionalString (config.services.iam.adminToken != null) ''
          ADMIN_HEADER=(-H "x-iam-admin-token: ${config.services.iam.adminToken}")
        ''}
        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "''${ADMIN_HEADER[@]}" "http://localhost:${toString cfg.iamPort}/api/v1/users/admin" 2>/dev/null || echo "000")
        if [ "$HTTP_CODE" = "200" ]; then
          log "INFO" "Admin user already exists"
          mkdir -p /var/lib/first-boot-automation
          date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"
          exit 0
        fi

        log "INFO" "Creating bootstrap admin user"
        RESPONSE_FILE=$(mktemp)
        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -w "%{http_code}" -o "$RESPONSE_FILE" \
          -X POST "http://localhost:${toString cfg.iamPort}/api/v1/users" \
          "''${ADMIN_HEADER[@]}" \
          -H "Content-Type: application/json" \
          -d '{"id":"admin","name":"Bootstrap Admin"}' 2>/dev/null || echo "000")
        RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")
        rm -f "$RESPONSE_FILE"

        if [ "$HTTP_CODE" != "201" ] && [ "$HTTP_CODE" != "200" ] && [ "$HTTP_CODE" != "409" ]; then
          log "ERROR" "Failed to create admin user: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
          exit 1
        fi

        # Mark as initialized for now
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"
        log "INFO" "IAM setup complete"
      '';
    };

    # Cluster health check service: one post-join pass over all local
    # services; exits non-zero if any active service fails its health check.
    systemd.services.cluster-health-check = lib.mkIf cfg.enableHealthCheck {
      description = "Cluster Health Check";
      after = [ "network-online.target" "chainfire-cluster-join.service" "flaredb-cluster-join.service" "iam-initial-setup.service" ];
      wants = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];
      unitConfig = {
        ConditionPathExists = cfg.configFile;
      };
      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = false;
      };
      script = ''
        set -euo pipefail

        log() {
          echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"health-check\",\"level\":\"$1\",\"message\":\"$2\"}"
        }

        log "INFO" "Starting cluster health check"
        FAILURES=0

        # Check Chainfire
        if systemctl is-active chainfire.service > /dev/null 2>&1; then
          HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:8081/health" 2>/dev/null || echo "000")
          if [ "$HTTP_CODE" = "200" ]; then
            log "INFO" "Chainfire health check: PASSED"
          else
            log "ERROR" "Chainfire health check: FAILED (HTTP $HTTP_CODE)"
            FAILURES=$((FAILURES + 1))
          fi
        fi

        # Check FlareDB
        if systemctl is-active flaredb.service > /dev/null 2>&1; then
          HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:8082/health" 2>/dev/null || echo "000")
          if [ "$HTTP_CODE" = "200" ]; then
            log "INFO" "FlareDB health check: PASSED"
          else
            log "ERROR" "FlareDB health check: FAILED (HTTP $HTTP_CODE)"
            FAILURES=$((FAILURES + 1))
          fi
        fi

        # Check IAM
        if systemctl is-active iam.service > /dev/null 2>&1; then
          HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/health" 2>/dev/null || echo "000")
          if [ "$HTTP_CODE" = "200" ]; then
            log "INFO" "IAM health check: PASSED"
          else
            log "ERROR" "IAM health check: FAILED (HTTP $HTTP_CODE)"
            FAILURES=$((FAILURES + 1))
          fi
        fi

        if [ $FAILURES -eq 0 ]; then
          log "INFO" "All cluster health checks passed"
          exit 0
        else
          log "ERROR" "Cluster health check failed ($FAILURES failures)"
          exit 1
        fi
      '';
    };

    # Create state directory for the idempotency marker files.
    systemd.tmpfiles.rules = [
      "d /var/lib/first-boot-automation 0755 root root -"
    ];
  };
}