# first-boot-automation.nix — jq-based runtime reading of cluster-config.json.
# Companion changes: chainfire.nix (CHAINFIRE__NETWORK__HTTP_ADDR env var) and
# flaredb.nix (FLAREDB_HTTP_ADDR env var).
# Fixes ChainFire crash: "unexpected argument '--http-addr' found"
{ config, lib, pkgs, ... }:

let
  cfg = config.services.first-boot-automation;

  # NOTE: removed dead bindings scriptDir/healthCheckScript/
  # bootstrapDetectorScript/clusterJoinScript — they were never referenced,
  # and "${"$"}{scriptDir}/../../../baremetal/..." escaped the writeTextDir
  # store path, so the resulting paths could never exist.

  # Read cluster config from nix-nos or file.
  # Priority: 1) nix-nos topology, 2) cluster-config.json file, 3) defaults.
  # Evaluated at build time; the join scripts re-read the file at runtime.
  clusterConfigExists = builtins.pathExists cfg.configFile;

  # True when nix-nos is opted into, its module is enabled, and its topology
  # declares at least one cluster.
  useNixNOS = cfg.useNixNOS && (config.nix-nos.enable or false) &&
    (builtins.length (builtins.attrNames (config.nix-nos.clusters or {}))) > 0;
clusterConfig =
|
|
if useNixNOS then
|
|
# Generate config from nix-nos topology
|
|
config.nix-nos.generateClusterConfig {
|
|
hostname = config.networking.hostName;
|
|
clusterName = cfg.nixnosClusterName;
|
|
}
|
|
else if clusterConfigExists && cfg.enable then
|
|
# Read from cluster-config.json file (legacy)
|
|
builtins.fromJSON (builtins.readFile cfg.configFile)
|
|
else
|
|
# Fallback defaults
|
|
{
|
|
node_id = "unknown";
|
|
node_role = "control-plane";
|
|
bootstrap = false;
|
|
cluster_name = "default-cluster";
|
|
leader_url = "https://localhost:2379";
|
|
raft_addr = "127.0.0.1:2380";
|
|
initial_peers = [];
|
|
flaredb_peers = [];
|
|
};
|
|
|
|
# Helper function to create cluster join service
|
|
mkClusterJoinService = { serviceName, healthUrl, leaderUrlPath, port, description ? "" }:
|
|
let
|
|
leaderUrl = clusterConfig.leader_url or "https://localhost:${toString port}";
|
|
nodeId = clusterConfig.node_id or "unknown";
|
|
raftAddr = clusterConfig.raft_addr or "127.0.0.1:${toString (port + 1)}";
|
|
isBootstrap = clusterConfig.bootstrap or false;
|
|
in
|
|
{
|
|
description = "Cluster Join for ${description}";
|
|
after = [ "network-online.target" "${serviceName}.service" ];
|
|
wants = [ "network-online.target" ];
|
|
wantedBy = [ "multi-user.target" ];
|
|
|
|
# Only run if first-boot automation is enabled
|
|
unitConfig = {
|
|
ConditionPathExists = cfg.configFile;
|
|
};
|
|
|
|
serviceConfig = {
|
|
Type = "oneshot";
|
|
RemainAfterExit = true;
|
|
Restart = "on-failure";
|
|
RestartSec = "30s";
|
|
};
|
|
|
|
script = ''
|
|
set -euo pipefail
|
|
|
|
# Logging function
|
|
log() {
|
|
echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"${serviceName}-join\",\"level\":\"$1\",\"message\":\"$2\"}"
|
|
}
|
|
|
|
log "INFO" "Starting cluster join process for ${serviceName}"
|
|
|
|
# Read cluster config at runtime
|
|
CONFIG_FILE="${cfg.configFile}"
|
|
if [ -f "$CONFIG_FILE" ]; then
|
|
IS_BOOTSTRAP=$(${pkgs.jq}/bin/jq -r '.bootstrap // false' "$CONFIG_FILE")
|
|
LEADER_URL=$(${pkgs.jq}/bin/jq -r '.leader_url // "https://localhost:${toString port}"' "$CONFIG_FILE")
|
|
NODE_ID=$(${pkgs.jq}/bin/jq -r '.node_id // "unknown"' "$CONFIG_FILE")
|
|
RAFT_ADDR=$(${pkgs.jq}/bin/jq -r '.raft_addr // "127.0.0.1:${toString (port + 1)}"' "$CONFIG_FILE")
|
|
log "INFO" "Loaded config: bootstrap=$IS_BOOTSTRAP, node_id=$NODE_ID"
|
|
else
|
|
log "ERROR" "Config file not found: $CONFIG_FILE"
|
|
exit 1
|
|
fi
|
|
|
|
# Wait for local service health
|
|
log "INFO" "Waiting for local ${serviceName} to be healthy"
|
|
|
|
HEALTH_TIMEOUT=120
|
|
HEALTH_START=$(date +%s)
|
|
|
|
while true; do
|
|
CURRENT_TIME=$(date +%s)
|
|
ELAPSED=$((CURRENT_TIME - HEALTH_START))
|
|
|
|
if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
|
|
log "ERROR" "Health check timeout after ''${ELAPSED}s"
|
|
exit 1
|
|
fi
|
|
|
|
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "${healthUrl}" 2>/dev/null || echo "000")
|
|
|
|
if [ "$HTTP_CODE" = "200" ]; then
|
|
log "INFO" "Local ${serviceName} is healthy"
|
|
break
|
|
fi
|
|
|
|
log "WARN" "Waiting for ${serviceName} health (''${ELAPSED}s elapsed, HTTP $HTTP_CODE)"
|
|
sleep 5
|
|
done
|
|
|
|
# Check if this is a bootstrap node
|
|
if [ "$IS_BOOTSTRAP" = "true" ]; then
|
|
log "INFO" "Bootstrap node detected, cluster already initialized"
|
|
|
|
# Create marker to indicate initialization complete
|
|
mkdir -p /var/lib/first-boot-automation
|
|
date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-initialized"
|
|
|
|
exit 0
|
|
fi
|
|
|
|
# Check if already joined
|
|
if [ -f "/var/lib/first-boot-automation/.${serviceName}-joined" ]; then
|
|
log "INFO" "Already joined cluster (marker exists)"
|
|
exit 0
|
|
fi
|
|
|
|
# Join existing cluster
|
|
log "INFO" "Attempting to join existing cluster"
|
|
log "INFO" "Leader URL: $LEADER_URL, Node ID: $NODE_ID, Raft Addr: $RAFT_ADDR"
|
|
|
|
MAX_ATTEMPTS=5
|
|
RETRY_DELAY=10
|
|
|
|
for ATTEMPT in $(seq 1 $MAX_ATTEMPTS); do
|
|
log "INFO" "Join attempt $ATTEMPT/$MAX_ATTEMPTS"
|
|
|
|
# Make join request
|
|
RESPONSE_FILE=$(mktemp)
|
|
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -w "%{http_code}" -o "$RESPONSE_FILE" \
|
|
-X POST "$LEADER_URL${leaderUrlPath}" \
|
|
-H "Content-Type: application/json" \
|
|
-d "{\"id\":\"$NODE_ID\",\"raft_addr\":\"$RAFT_ADDR\"}" 2>/dev/null || echo "000")
|
|
|
|
RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")
|
|
rm -f "$RESPONSE_FILE"
|
|
|
|
log "INFO" "Join request response: HTTP $HTTP_CODE"
|
|
|
|
if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "201" ]; then
|
|
log "INFO" "Successfully joined cluster"
|
|
|
|
# Create join marker
|
|
mkdir -p /var/lib/first-boot-automation
|
|
date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
|
|
|
|
exit 0
|
|
elif [ "$HTTP_CODE" = "409" ]; then
|
|
log "WARN" "Already member of cluster (HTTP 409)"
|
|
|
|
mkdir -p /var/lib/first-boot-automation
|
|
date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
|
|
|
|
exit 0
|
|
else
|
|
log "ERROR" "Join failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
|
|
|
|
if [ $ATTEMPT -lt $MAX_ATTEMPTS ]; then
|
|
log "INFO" "Retrying in ''${RETRY_DELAY}s..."
|
|
sleep $RETRY_DELAY
|
|
fi
|
|
fi
|
|
done
|
|
|
|
log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
|
|
exit 1
|
|
'';
|
|
};
|
|
|
|
in
{
  options.services.first-boot-automation = {
    enable = lib.mkEnableOption "first-boot cluster join automation";

    # --- Configuration source --------------------------------------------

    useNixNOS = lib.mkOption {
      type = lib.types.bool;
      default = false;
      description = "Use nix-nos topology for cluster configuration instead of cluster-config.json";
    };

    nixnosClusterName = lib.mkOption {
      type = lib.types.str;
      default = "plasmacloud";
      description = "Name of the nix-nos cluster to use (only used when useNixNOS is true)";
    };

    configFile = lib.mkOption {
      type = lib.types.path;
      default = "/etc/nixos/secrets/cluster-config.json";
      description = "Path to cluster configuration JSON file (used when useNixNOS is false)";
    };

    # --- Per-component toggles -------------------------------------------

    enableChainfire = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable Chainfire cluster join automation";
    };

    enableFlareDB = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable FlareDB cluster join automation";
    };

    enableIAM = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable IAM initial setup automation";
    };

    enableHealthCheck = lib.mkOption {
      type = lib.types.bool;
      default = true;
      description = "Enable cluster health check service";
    };

    # --- Service ports ----------------------------------------------------
    # The join scripts assume the raft address defaults to API port + 1.

    chainfirePort = lib.mkOption {
      type = lib.types.port;
      default = 2379;
      description = "Chainfire API port";
    };

    flaredbPort = lib.mkOption {
      type = lib.types.port;
      default = 2479;
      description = "FlareDB API port";
    };

    iamPort = lib.mkOption {
      type = lib.types.port;
      default = 8080;
      description = "IAM API port";
    };
  };
  config = lib.mkIf cfg.enable {
    # Chainfire cluster join service.
    # NOTE(review): healthUrl hard-codes admin port 8081 rather than deriving
    # it from cfg.chainfirePort (the API port) — confirm this matches the
    # chainfire.nix admin-port setting.
    systemd.services.chainfire-cluster-join = lib.mkIf cfg.enableChainfire (
      mkClusterJoinService {
        serviceName = "chainfire";
        healthUrl = "http://localhost:8081/health"; # Health endpoint on admin port
        leaderUrlPath = "/admin/member/add";
        port = cfg.chainfirePort;
        description = "Chainfire";
      }
    );
    # FlareDB cluster join service.
    # The `//` update replaces the generated `after` list wholesale (re-adding
    # flaredb.service) and adds a hard `requires` on chainfire-cluster-join,
    # so FlareDB only attempts its join after Chainfire membership succeeded.
    # NOTE(review): healthUrl hard-codes admin port 8082 rather than deriving
    # it from cfg.flaredbPort — confirm against flaredb.nix.
    systemd.services.flaredb-cluster-join = lib.mkIf cfg.enableFlareDB (
      mkClusterJoinService {
        serviceName = "flaredb";
        healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
        leaderUrlPath = "/admin/member/add";
        port = cfg.flaredbPort;
        description = "FlareDB";
      } // {
        after = [ "network-online.target" "flaredb.service" "chainfire-cluster-join.service" ];
        requires = [ "chainfire-cluster-join.service" ];
      }
    );
    # IAM initial setup service.
    # Oneshot: waits for the local IAM API to report healthy, then checks for
    # an existing admin user. Actual admin creation is still a TODO below, so
    # today the unit only probes and drops a marker file for idempotence.
    systemd.services.iam-initial-setup = lib.mkIf cfg.enableIAM {
      description = "IAM Initial Setup";
      after = [ "network-online.target" "iam.service" "flaredb-cluster-join.service" ];
      wants = [ "network-online.target" ];
      wantedBy = [ "multi-user.target" ];

      # Skip when no cluster config has been provisioned.
      unitConfig = {
        ConditionPathExists = cfg.configFile;
      };

      serviceConfig = {
        Type = "oneshot";
        RemainAfterExit = true;
        # NOTE(review): older systemd releases reject Restart= values other
        # than "no" for Type=oneshot units — confirm the deployed version.
        Restart = "on-failure";
        RestartSec = "30s";
      };

      script = ''
        set -euo pipefail

        # Emit one JSON log line (timestamp, level, message).
        log() {
          echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"iam-setup\",\"level\":\"$1\",\"message\":\"$2\"}"
        }

        log "INFO" "Starting IAM initial setup"

        # Check if already initialized (idempotence across reboots/restarts)
        if [ -f "/var/lib/first-boot-automation/.iam-initialized" ]; then
          log "INFO" "IAM already initialized (marker exists)"
          exit 0
        fi

        # Wait for IAM health: poll every 5s, give up after HEALTH_TIMEOUT seconds
        log "INFO" "Waiting for IAM to be healthy"

        HEALTH_TIMEOUT=120
        HEALTH_START=$(date +%s)

        while true; do
          CURRENT_TIME=$(date +%s)
          ELAPSED=$((CURRENT_TIME - HEALTH_START))

          if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
            log "ERROR" "Health check timeout"
            exit 1
          fi

          # NOTE(review): -k (insecure TLS) is a no-op for this http:// URL —
          # presumably left over from an https endpoint; confirm.
          HTTP_CODE=$(${pkgs.curl}/bin/curl -k -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/health" 2>/dev/null || echo "000")

          if [ "$HTTP_CODE" = "200" ]; then
            log "INFO" "IAM is healthy"
            break
          fi

          sleep 5
        done

        # Check if admin user exists (HTTP 200 means it does)
        log "INFO" "Checking for existing admin user"

        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/api/users/admin" 2>/dev/null || echo "000")

        if [ "$HTTP_CODE" = "200" ]; then
          log "INFO" "Admin user already exists"
          mkdir -p /var/lib/first-boot-automation
          date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"
          exit 0
        fi

        # TODO: Create admin user (requires IAM API implementation)
        log "WARN" "Admin user creation not yet implemented (waiting for IAM API)"

        # Mark as initialized for now so the unit stays idempotent; delete the
        # marker manually to re-run setup once admin creation is implemented.
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"

        log "INFO" "IAM setup complete"
      '';
    };
# Cluster health check service
|
|
systemd.services.cluster-health-check = lib.mkIf cfg.enableHealthCheck {
|
|
description = "Cluster Health Check";
|
|
after = [
|
|
"network-online.target"
|
|
"chainfire-cluster-join.service"
|
|
"flaredb-cluster-join.service"
|
|
"iam-initial-setup.service"
|
|
];
|
|
wants = [ "network-online.target" ];
|
|
wantedBy = [ "multi-user.target" ];
|
|
|
|
unitConfig = {
|
|
ConditionPathExists = cfg.configFile;
|
|
};
|
|
|
|
serviceConfig = {
|
|
Type = "oneshot";
|
|
RemainAfterExit = false;
|
|
};
|
|
|
|
script = ''
|
|
set -euo pipefail
|
|
|
|
log() {
|
|
echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"health-check\",\"level\":\"$1\",\"message\":\"$2\"}"
|
|
}
|
|
|
|
log "INFO" "Starting cluster health check"
|
|
|
|
FAILURES=0
|
|
|
|
# Check Chainfire
|
|
if systemctl is-active chainfire.service > /dev/null 2>&1; then
|
|
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:8081/health" 2>/dev/null || echo "000")
|
|
|
|
if [ "$HTTP_CODE" = "200" ]; then
|
|
log "INFO" "Chainfire health check: PASSED"
|
|
else
|
|
log "ERROR" "Chainfire health check: FAILED (HTTP $HTTP_CODE)"
|
|
FAILURES=$((FAILURES + 1))
|
|
fi
|
|
fi
|
|
|
|
# Check FlareDB
|
|
if systemctl is-active flaredb.service > /dev/null 2>&1; then
|
|
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:8082/health" 2>/dev/null || echo "000")
|
|
|
|
if [ "$HTTP_CODE" = "200" ]; then
|
|
log "INFO" "FlareDB health check: PASSED"
|
|
else
|
|
log "ERROR" "FlareDB health check: FAILED (HTTP $HTTP_CODE)"
|
|
FAILURES=$((FAILURES + 1))
|
|
fi
|
|
fi
|
|
|
|
# Check IAM
|
|
if systemctl is-active iam.service > /dev/null 2>&1; then
|
|
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/health" 2>/dev/null || echo "000")
|
|
|
|
if [ "$HTTP_CODE" = "200" ]; then
|
|
log "INFO" "IAM health check: PASSED"
|
|
else
|
|
log "ERROR" "IAM health check: FAILED (HTTP $HTTP_CODE)"
|
|
FAILURES=$((FAILURES + 1))
|
|
fi
|
|
fi
|
|
|
|
if [ $FAILURES -eq 0 ]; then
|
|
log "INFO" "All cluster health checks passed"
|
|
exit 0
|
|
else
|
|
log "ERROR" "Cluster health check failed ($FAILURES failures)"
|
|
exit 1
|
|
fi
|
|
'';
|
|
};
|
|
|
|
    # Writable state directory for the join/initialization marker files
    # created by the services above.
    systemd.tmpfiles.rules = [
      "d /var/lib/first-boot-automation 0755 root root -"
    ];
  };
}