photoncloud-monorepo/nix/modules/first-boot-automation.nix
centra 96d46a3603
Some checks failed
Nix CI / filter (push) Successful in 8s
Nix CI / gate (shared crates) (push) Has been skipped
Nix CI / gate () (push) Failing after 5s
Nix CI / build () (push) Has been skipped
Nix CI / ci-status (push) Failing after 1s
Integrate topology-driven bootstrap into nix-nos
2026-03-30 14:39:28 +09:00

480 lines
16 KiB
Nix

{ config, lib, pkgs, ... }:
let
# Shorthand for this module's option set.
cfg = config.services.first-boot-automation;
# Absolute path of the cluster config file, as a plain string.
configFilePath = toString cfg.configFile;
# configFile path relative to /etc when it lives under /etc, else null.
# Needed because environment.etc attribute names are /etc-relative.
configEtcPath =
if lib.hasPrefix "/etc/" configFilePath
then lib.removePrefix "/etc/" configFilePath
else null;
# True when the plasmacloud module already generates a node cluster config;
# in that case this module must not write its own environment.etc entry.
# The `?` and `or` guards keep this safe to evaluate even when the
# plasmacloud options are not declared at all.
hasPlasmacloudManagedClusterConfig =
(config ? plasmacloud)
&& (config.plasmacloud ? cluster)
&& (config.plasmacloud.cluster.generated.nodeClusterConfig or null) != null;
# Names of all clusters declared under nix-nos.clusters ([] when unset).
availableNixNOSClusters = builtins.attrNames (config.nix-nos.clusters or {});
# Cluster-name resolution, in priority order:
#  1. the explicitly configured nixnosClusterName, when it is declared;
#  2. the enabled plasmacloud cluster name, when it is declared;
#  3. the sole declared cluster, when there is exactly one;
#  4. fall back to nixnosClusterName (an assertion below reports failure).
resolvedNixNOSClusterName =
if builtins.elem cfg.nixnosClusterName availableNixNOSClusters then
cfg.nixnosClusterName
else if
(config ? plasmacloud)
&& (config.plasmacloud ? cluster)
&& (config.plasmacloud.cluster.enable or false)
&& builtins.elem config.plasmacloud.cluster.name availableNixNOSClusters
then
config.plasmacloud.cluster.name
else if builtins.length availableNixNOSClusters == 1 then
builtins.head availableNixNOSClusters
else
cfg.nixnosClusterName;
# nix-nos topology is only used when requested, enabled, and non-empty.
useNixNOS = cfg.useNixNOS && (config.nix-nos.enable or false) &&
(builtins.length availableNixNOSClusters) > 0;
# Cluster config derived from the nix-nos topology for this host, or null
# when the pre-existing JSON file at cfg.configFile is used instead.
nixNOSClusterConfig =
if useNixNOS then
config.nix-nos.generateClusterConfig {
hostname = config.networking.hostName;
clusterName = resolvedNixNOSClusterName;
}
else
null;
# Helper producing a oneshot systemd unit that waits for a local service's
# HTTP health endpoint and then joins this node to an existing cluster via
# the leader's join API.
#
# Arguments:
#   serviceName      - base unit name; the unit orders after
#                      "<serviceName>.service" and the name is used for the
#                      state marker files and log records
#   healthUrl        - local HTTP endpoint polled until it returns 200
#   leaderUrlKey     - JSON key in the cluster config holding this service's
#                      leader URL (falls back to .leader_url, then the default)
#   defaultLeaderUrl - leader URL used when the config provides neither key
#   joinPath         - leader API path POSTed to for the join request, or
#                      null when the service bootstraps from static peers
#                      (no join API; the unit just drops its marker)
#   port             - service API port; port + 1 is the default raft address
#   description      - human-readable name used in the unit description
mkClusterJoinService = {
  serviceName,
  healthUrl,
  leaderUrlKey,
  defaultLeaderUrl,
  joinPath ? null,
  port,
  description ? ""
}: {
  description = "Cluster Join for ${description}";
  after = [ "network-online.target" "${serviceName}.service" ];
  wants = [ "network-online.target" ];
  wantedBy = [ "multi-user.target" ];
  # Only run once the first-boot cluster config file has been provisioned.
  unitConfig = {
    ConditionPathExists = cfg.configFile;
  };
  serviceConfig = {
    Type = "oneshot";
    RemainAfterExit = true;
    # NOTE(review): Restart= on a Type=oneshot unit requires a reasonably
    # recent systemd; NixOS ships one, but confirm if targeting older bases.
    Restart = "on-failure";
    RestartSec = "30s";
  };
  script = ''
    set -euo pipefail
    # Structured JSON logging helper: log LEVEL MESSAGE.
    # NOTE(review): MESSAGE is embedded unescaped, so it must not contain
    # raw double quotes or the JSON line becomes invalid.
    log() {
      echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"${serviceName}-join\",\"level\":\"$1\",\"message\":\"$2\"}"
    }
    log "INFO" "Starting cluster join process for ${serviceName}"
    # Read cluster config at runtime
    CONFIG_FILE="${cfg.configFile}"
    if [ -f "$CONFIG_FILE" ]; then
      IS_BOOTSTRAP=$(${pkgs.jq}/bin/jq -r '.bootstrap // false' "$CONFIG_FILE")
      LEADER_URL=$(${pkgs.jq}/bin/jq -r '.${leaderUrlKey} // .leader_url // "${defaultLeaderUrl}"' "$CONFIG_FILE")
      NODE_ID=$(${pkgs.jq}/bin/jq -r '.node_id // "unknown"' "$CONFIG_FILE")
      RAFT_ADDR=$(${pkgs.jq}/bin/jq -r '.raft_addr // "127.0.0.1:${toString (port + 1)}"' "$CONFIG_FILE")
      log "INFO" "Loaded config: bootstrap=$IS_BOOTSTRAP, node_id=$NODE_ID"
    else
      log "ERROR" "Config file not found: $CONFIG_FILE"
      exit 1
    fi
    # Wait (up to HEALTH_TIMEOUT seconds) for the local service to be healthy.
    log "INFO" "Waiting for local ${serviceName} to be healthy"
    HEALTH_TIMEOUT=120
    HEALTH_START=$(date +%s)
    while true; do
      CURRENT_TIME=$(date +%s)
      ELAPSED=$((CURRENT_TIME - HEALTH_START))
      if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
        log "ERROR" "Health check timeout after ''${ELAPSED}s"
        exit 1
      fi
      HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "${healthUrl}" 2>/dev/null || echo "000")
      if [ "$HTTP_CODE" = "200" ]; then
        log "INFO" "Local ${serviceName} is healthy"
        break
      fi
      log "WARN" "Waiting for ${serviceName} health (''${ELAPSED}s elapsed, HTTP $HTTP_CODE)"
      sleep 5
    done
    # Bootstrap nodes initialize the cluster themselves; nothing to join.
    if [ "$IS_BOOTSTRAP" = "true" ]; then
      log "INFO" "Bootstrap node detected, cluster already initialized"
      # Create marker to indicate initialization complete
      mkdir -p /var/lib/first-boot-automation
      date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-initialized"
      exit 0
    fi
    # Idempotency: skip if a previous run already joined.
    if [ -f "/var/lib/first-boot-automation/.${serviceName}-joined" ]; then
      log "INFO" "Already joined cluster (marker exists)"
      exit 0
    fi
    ${lib.optionalString (joinPath == null) ''
      log "INFO" "No join API configured for ${serviceName}; assuming static-peer startup"
      mkdir -p /var/lib/first-boot-automation
      date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
      exit 0
    ''}
    # Join existing cluster
    log "INFO" "Attempting to join existing cluster"
    log "INFO" "Leader URL: $LEADER_URL, Node ID: $NODE_ID, Raft Addr: $RAFT_ADDR"
    MAX_ATTEMPTS=5
    RETRY_DELAY=10
    for ATTEMPT in $(seq 1 $MAX_ATTEMPTS); do
      log "INFO" "Join attempt $ATTEMPT/$MAX_ATTEMPTS"
      # Make join request.
      # BUGFIX: interpolate joinPath via toString so that joinPath == null
      # renders as "" instead of aborting Nix evaluation ("cannot coerce
      # null to a string"). The static-peer branch above exits before this
      # code runs, but Nix evaluates the whole script string at build time.
      RESPONSE_FILE=$(mktemp)
      HTTP_CODE=$(${pkgs.curl}/bin/curl -s -w "%{http_code}" -o "$RESPONSE_FILE" \
        -X POST "$LEADER_URL${toString joinPath}" \
        -H "Content-Type: application/json" \
        -d "{\"id\":\"$NODE_ID\",\"raft_addr\":\"$RAFT_ADDR\"}" 2>/dev/null || echo "000")
      RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")
      rm -f "$RESPONSE_FILE"
      log "INFO" "Join request response: HTTP $HTTP_CODE"
      if [ "$HTTP_CODE" = "200" ] || [ "$HTTP_CODE" = "201" ]; then
        log "INFO" "Successfully joined cluster"
        # Create join marker
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
        exit 0
      elif [ "$HTTP_CODE" = "409" ]; then
        # 409 Conflict means we are already a member: treat as success.
        log "WARN" "Already member of cluster (HTTP 409)"
        mkdir -p /var/lib/first-boot-automation
        date -Iseconds > "/var/lib/first-boot-automation/.${serviceName}-joined"
        exit 0
      else
        log "ERROR" "Join failed: HTTP $HTTP_CODE, response: $RESPONSE_BODY"
        if [ $ATTEMPT -lt $MAX_ATTEMPTS ]; then
          log "INFO" "Retrying in ''${RETRY_DELAY}s..."
          sleep $RETRY_DELAY
        fi
      fi
    done
    log "ERROR" "Failed to join cluster after $MAX_ATTEMPTS attempts"
    exit 1
  '';
};
in
{
  # Option interface for the first-boot cluster join automation. Local
  # helpers keep the many boolean/port declarations uniform.
  options.services.first-boot-automation = let
    # Boolean feature switch with the given default and description.
    mkFlag = default: description: lib.mkOption {
      inherit default description;
      type = lib.types.bool;
    };
    # Port option with the given default and description.
    mkPortOpt = default: description: lib.mkOption {
      inherit default description;
      type = lib.types.port;
    };
  in {
    enable = lib.mkEnableOption "first-boot cluster join automation";
    useNixNOS = mkFlag false "Use nix-nos topology for cluster configuration instead of cluster-config.json";
    nixnosClusterName = lib.mkOption {
      type = lib.types.str;
      default = "plasmacloud";
      description = "Name of the nix-nos cluster to use (only used when useNixNOS is true)";
    };
    configFile = lib.mkOption {
      type = lib.types.path;
      default = "/etc/nixos/secrets/cluster-config.json";
      description = "Path to cluster configuration JSON file (used when useNixNOS is false)";
    };
    enableChainfire = mkFlag true "Enable Chainfire cluster join automation";
    enableFlareDB = mkFlag true "Enable FlareDB cluster join automation";
    enableIAM = mkFlag true "Enable IAM initial setup automation";
    enableHealthCheck = mkFlag true "Enable cluster health check service";
    chainfirePort = mkPortOpt 2379 "Chainfire API port";
    flaredbPort = mkPortOpt 2479 "FlareDB API port";
    iamPort = mkPortOpt 8080 "IAM API port";
  };
config = lib.mkIf cfg.enable {
# Fail evaluation early, with actionable messages, when useNixNOS is set
# but its prerequisites are not met.
assertions = [
{
assertion = (!cfg.useNixNOS) || (config.nix-nos.enable or false);
message = "services.first-boot-automation.useNixNOS requires nix-nos.enable = true";
}
{
assertion = (!cfg.useNixNOS) || ((builtins.length availableNixNOSClusters) > 0);
message = "services.first-boot-automation.useNixNOS requires at least one nix-nos.clusters entry";
}
{
# The generated config is written via environment.etc, which can only
# manage files under /etc (configEtcPath is null otherwise).
assertion = (!cfg.useNixNOS) || (configEtcPath != null);
message = "services.first-boot-automation.useNixNOS requires services.first-boot-automation.configFile to live under /etc";
}
{
assertion = (!cfg.useNixNOS) || builtins.elem resolvedNixNOSClusterName availableNixNOSClusters;
message = "services.first-boot-automation.useNixNOS could not resolve nix-nos cluster '${cfg.nixnosClusterName}' (available: ${lib.concatStringsSep ", " availableNixNOSClusters})";
}
];
# When nix-nos drives the topology (and plasmacloud does not already manage
# the file), materialize the generated cluster config at cfg.configFile so
# the join scripts can read it at runtime. optionalAttrs guards against a
# configFile outside /etc (also enforced by an assertion). Mode 0600: the
# file may carry cluster-sensitive data.
environment.etc = lib.mkIf (useNixNOS && !hasPlasmacloudManagedClusterConfig) (
lib.optionalAttrs (configEtcPath != null) {
"${configEtcPath}" = {
text = builtins.toJSON nixNOSClusterConfig;
mode = "0600";
};
}
);
# First-boot join automation for the Chainfire cluster.
systemd.services.chainfire-cluster-join = lib.mkIf cfg.enableChainfire (
  mkClusterJoinService {
    description = "Chainfire";
    serviceName = "chainfire";
    port = cfg.chainfirePort;
    # Health and leader endpoints live on the admin port, not the API port.
    healthUrl = "http://localhost:8081/health";
    defaultLeaderUrl = "http://localhost:8081";
    leaderUrlKey = "chainfire_leader_url";
    joinPath = "/admin/member/add";
  }
);
# First-boot join automation for the FlareDB cluster. FlareDB must join
# only after the Chainfire join has completed, so the unit produced by
# mkClusterJoinService is extended with extra ordering/requirement edges.
systemd.services.flaredb-cluster-join = lib.mkIf cfg.enableFlareDB (
  let
    base = mkClusterJoinService {
      serviceName = "flaredb";
      healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
      leaderUrlKey = "flaredb_leader_url";
      defaultLeaderUrl = "http://localhost:8082";
      joinPath = "/admin/member/add";
      port = cfg.flaredbPort;
      description = "FlareDB";
    };
  in
  base // {
    # Extend (rather than hand-duplicate) the base `after` list so it
    # cannot silently drift if mkClusterJoinService's ordering changes.
    after = base.after ++ [ "chainfire-cluster-join.service" ];
    requires = [ "chainfire-cluster-join.service" ];
  }
);
# IAM initial setup service
# Runs once after IAM and the FlareDB join: waits for IAM to become
# healthy, then checks for the initial admin user. A marker file under
# /var/lib/first-boot-automation makes the unit idempotent across boots.
systemd.services.iam-initial-setup = lib.mkIf cfg.enableIAM {
description = "IAM Initial Setup";
after = [ "network-online.target" "iam.service" "flaredb-cluster-join.service" ];
wants = [ "network-online.target" ];
wantedBy = [ "multi-user.target" ];
# Only run once the first-boot cluster config file has been provisioned.
unitConfig = {
ConditionPathExists = cfg.configFile;
};
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
Restart = "on-failure";
RestartSec = "30s";
};
script = ''
set -euo pipefail
# Structured JSON logging helper: log LEVEL MESSAGE.
log() {
echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"iam-setup\",\"level\":\"$1\",\"message\":\"$2\"}"
}
log "INFO" "Starting IAM initial setup"
# Check if already initialized
if [ -f "/var/lib/first-boot-automation/.iam-initialized" ]; then
log "INFO" "IAM already initialized (marker exists)"
exit 0
fi
# Wait for IAM health (up to HEALTH_TIMEOUT seconds)
log "INFO" "Waiting for IAM to be healthy"
HEALTH_TIMEOUT=120
HEALTH_START=$(date +%s)
while true; do
CURRENT_TIME=$(date +%s)
ELAPSED=$((CURRENT_TIME - HEALTH_START))
if [ $ELAPSED -ge $HEALTH_TIMEOUT ]; then
log "ERROR" "Health check timeout"
exit 1
fi
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/health" 2>/dev/null || echo "000")
if [ "$HTTP_CODE" = "200" ]; then
log "INFO" "IAM is healthy"
break
fi
sleep 5
done
# Check if admin user exists (HTTP 200 from the users endpoint)
log "INFO" "Checking for existing admin user"
HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "http://localhost:${toString cfg.iamPort}/api/users/admin" 2>/dev/null || echo "000")
if [ "$HTTP_CODE" = "200" ]; then
log "INFO" "Admin user already exists"
mkdir -p /var/lib/first-boot-automation
date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"
exit 0
fi
# TODO: Create admin user (requires IAM API implementation)
log "WARN" "Admin user creation not yet implemented (waiting for IAM API)"
# Mark as initialized for now so the unit does not re-run every boot
mkdir -p /var/lib/first-boot-automation
date -Iseconds > "/var/lib/first-boot-automation/.iam-initialized"
log "INFO" "IAM setup complete"
'';
};
# Cluster health check service: after the join/setup units have run, probe
# each active service's health endpoint once and exit non-zero if any fail.
systemd.services.cluster-health-check = lib.mkIf cfg.enableHealthCheck {
  description = "Cluster Health Check";
  after = [
    "network-online.target"
    "chainfire-cluster-join.service"
    "flaredb-cluster-join.service"
    "iam-initial-setup.service"
  ];
  wants = [ "network-online.target" ];
  wantedBy = [ "multi-user.target" ];
  # Only run once the first-boot cluster config file has been provisioned.
  unitConfig = {
    ConditionPathExists = cfg.configFile;
  };
  serviceConfig = {
    Type = "oneshot";
    # The unit returns to inactive after the check completes.
    RemainAfterExit = false;
  };
  script = ''
    set -euo pipefail
    # Structured JSON logging helper: log LEVEL MESSAGE.
    log() {
      echo "{\"timestamp\":\"$(date -Iseconds)\",\"service\":\"health-check\",\"level\":\"$1\",\"message\":\"$2\"}"
    }
    log "INFO" "Starting cluster health check"
    FAILURES=0
    # check_service LABEL UNIT URL
    # Probes URL when UNIT is active; increments FAILURES on any non-200.
    # Inactive units are skipped entirely (their feature may be disabled).
    check_service() {
      if systemctl is-active "$2" > /dev/null 2>&1; then
        HTTP_CODE=$(${pkgs.curl}/bin/curl -s -o /dev/null -w "%{http_code}" "$3" 2>/dev/null || echo "000")
        if [ "$HTTP_CODE" = "200" ]; then
          log "INFO" "$1 health check: PASSED"
        else
          log "ERROR" "$1 health check: FAILED (HTTP $HTTP_CODE)"
          FAILURES=$((FAILURES + 1))
        fi
      fi
    }
    # Chainfire and FlareDB expose health on their admin ports (8081/8082),
    # matching the healthUrl values used by the join units above.
    check_service "Chainfire" "chainfire.service" "http://localhost:8081/health"
    check_service "FlareDB" "flaredb.service" "http://localhost:8082/health"
    check_service "IAM" "iam.service" "http://localhost:${toString cfg.iamPort}/health"
    if [ $FAILURES -eq 0 ]; then
      log "INFO" "All cluster health checks passed"
      exit 0
    else
      log "ERROR" "Cluster health check failed ($FAILURES failures)"
      exit 1
    fi
  '';
};
# Create state directory for the .joined / .initialized marker files
# written by the join and setup scripts above.
systemd.tmpfiles.rules = [
"d /var/lib/first-boot-automation 0755 root root -"
];
};
}