# photoncloud-monorepo/nix/iso/plasmacloud-iso.nix
#
# 470 lines
# 17 KiB
# Nix

# PlasmaCloud Bootstrap ISO
# Minimal ISO with DHCP + Phone Home to Deployer + Auto-Install
# For VM cluster deployment: boots, phones home, partitions disk, installs NixOS
{ config, lib, pkgs, modulesPath, ... }:
{
imports = [
"${modulesPath}/installer/cd-dvd/installation-cd-minimal.nix"
];
# ISO image settings: output file name, EFI + USB boot support, and a copy of
# the repository (two directories above this file) placed in the image at
# /opt/plasmacloud-src for offline flake installs.
isoImage = {
  isoName = "plasmacloud-bootstrap.iso";
  makeEfiBootable = true;
  makeUsbBootable = true;
  contents = [
    {
      source = ../../.;
      target = "/opt/plasmacloud-src";
    }
  ];
};
# Networking is handled by systemd-networkd (NetworkManager is forced off);
# a single catch-all .network unit requests DHCP on every interface name.
networking = {
  useNetworkd = true;
  networkmanager.enable = lib.mkForce false;
};
systemd.network.networks."10-dhcp".matchConfig.Name = "*";
systemd.network.networks."10-dhcp".DHCP = "yes";
# Phone Home service — registers this machine with the Deployer and stores the
# node config and secrets it returns.
#
# Inputs (first match wins):
#   Deployer URL : $DEPLOYER_URL, kernel arg plasmacloud.deployer_url,
#                  else http://192.168.100.1:8080
#   Token        : /etc/plasmacloud/bootstrap-token, $DEPLOYER_BOOTSTRAP_TOKEN,
#                  kernel arg plasmacloud.bootstrap_token (optional)
#   CA cert      : $DEPLOYER_CA_CERT (path), or downloaded from the URL in
#                  kernel arg plasmacloud.ca_cert_url (optional)
# Outputs on success:
#   /etc/plasmacloud/node-config.json, /root/.ssh/authorized_keys,
#   /etc/ssh/ssh_host_ed25519_key{,.pub}, /etc/plasmacloud/tls/node.{crt,key}
# The unit exits 1 after 5 unsuccessful attempts (2^i second back-off).
systemd.services.plasmacloud-bootstrap = {
  description = "PlasmaCloud Bootstrap via Phone Home";
  wantedBy = [ "multi-user.target" ];
  after = [ "network-online.target" ];
  wants = [ "network-online.target" ];
  serviceConfig = {
    Type = "oneshot";
    RemainAfterExit = true;
  };
  script = ''
    set -euo pipefail

    # Print the value of a key=value kernel command-line argument.
    # Returns 1 when the key is not present.
    cmdline_value() {
      local key="$1"
      local arg
      for arg in $(cat /proc/cmdline); do
        case "$arg" in
          "$key"=*)
            echo "''${arg#*=}"
            return 0
            ;;
        esac
      done
      return 1
    }

    mkdir -p /etc/plasmacloud

    # Discover Deployer via environment, kernel cmdline, or fallback.
    DEPLOYER_URL="''${DEPLOYER_URL:-}"
    if [ -z "$DEPLOYER_URL" ]; then
      DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
    fi
    if [ -z "$DEPLOYER_URL" ]; then
      DEPLOYER_URL="http://192.168.100.1:8080"
    fi

    # Get machine identity
    MACHINE_ID=$(cat /etc/machine-id)
    echo "PlasmaCloud Bootstrap starting..."
    echo "Machine ID: $MACHINE_ID"
    echo "Deployer URL: $DEPLOYER_URL"

    # Optional bootstrap token (from file, environment, or kernel cmdline)
    TOKEN_FILE="/etc/plasmacloud/bootstrap-token"
    DEPLOYER_TOKEN=""
    if [ -s "$TOKEN_FILE" ]; then
      DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
    elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
      DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
    else
      DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
    fi

    # Optional CA certificate: explicit path, or fetched from a cmdline URL.
    # A failed download aborts the unit (set -e).
    DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
    if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
      DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
      if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
        DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
        ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
          "$DEPLOYER_CA_CERT_URL" \
          -o "$DEPLOYER_CA_CERT_PATH"
      fi
    fi

    CURL_ARGS=(-sf --connect-timeout 5 --max-time 15)
    if [ -n "$DEPLOYER_TOKEN" ]; then
      CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
    fi
    if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
      CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
    fi

    # Determine the node's IPv4 address. Each probe ends in "|| true":
    # under "set -euo pipefail" a failing probe (e.g. "ip route get" with no
    # default route) would otherwise abort the script before the next
    # fallback could run.
    NODE_IP=$(${pkgs.iproute2}/bin/ip -4 route get 1.1.1.1 2>/dev/null | ${pkgs.gawk}/bin/awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}' || true)
    if [ -z "$NODE_IP" ]; then
      NODE_IP=$(${pkgs.iproute2}/bin/ip -4 addr show scope global 2>/dev/null | ${pkgs.gawk}/bin/awk '/inet / {sub("/.*","",$2); print $2; exit}' || true)
    fi
    if [ -z "$NODE_IP" ]; then
      NODE_IP=$(hostname -I 2>/dev/null | ${pkgs.gawk}/bin/awk '{print $1}' || true)
    fi

    # "hostname" may not be on the unit's PATH; the kernel exposes the same
    # value under /proc, so fall back to that instead of aborting.
    NODE_HOSTNAME=$(hostname 2>/dev/null || cat /proc/sys/kernel/hostname)

    # CPU and memory facts (best effort; empty values become null below).
    CPU_MODEL=$(${pkgs.gawk}/bin/awk -F: '/model name/ {gsub(/^[ \t]+/, "", $2); print $2; exit}' /proc/cpuinfo 2>/dev/null || true)
    CPU_CORES=$(${pkgs.gawk}/bin/awk '/^cpu cores/ {print $4; exit}' /proc/cpuinfo 2>/dev/null || true)
    CPU_THREADS=$(${pkgs.coreutils}/bin/nproc --all 2>/dev/null || true)
    MEMORY_KIB=$(${pkgs.gawk}/bin/awk '/MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null || true)
    MEMORY_BYTES=""
    if [ -n "$MEMORY_KIB" ]; then
      MEMORY_BYTES=$((MEMORY_KIB * 1024))
    fi

    # Disk inventory. lsblk --json reports ROTA as a JSON boolean on current
    # util-linux (older releases used "1"/"0"), so accept all three forms.
    DISKS_JSON=$(${pkgs.util-linux}/bin/lsblk -J -b -o NAME,PATH,SIZE,MODEL,SERIAL,ROTA,TYPE 2>/dev/null | ${pkgs.jq}/bin/jq '
      [.blockdevices[] | select(.type == "disk") | {
        name: .name,
        path: (.path // null),
        size_bytes: (.size | tonumber?),
        model: ((.model // "") | if . == "" then null else . end),
        serial: ((.serial // "") | if . == "" then null else . end),
        rotational: (if .rota == null then null else (.rota == true or .rota == 1 or .rota == "1") end)
      }]
    ' || true)
    # Hardware facts are informational: report an empty list rather than
    # failing the whole bootstrap when the inventory cannot be collected.
    if [ -z "$DISKS_JSON" ]; then
      DISKS_JSON="[]"
    fi

    # Network interface inventory (loopback excluded).
    NICS_JSON=$(${pkgs.iproute2}/bin/ip -j link 2>/dev/null | ${pkgs.jq}/bin/jq '
      [.[] | select(.ifname != "lo") | {
        name: .ifname,
        mac_address: ((.address // "") | if . == "" or . == "00:00:00:00:00:00" then null else . end),
        oper_state: ((.operstate // "") | ascii_downcase | if . == "" then null else . end)
      }]
    ' || true)
    if [ -z "$NICS_JSON" ]; then
      NICS_JSON="[]"
    fi

    DMI_VENDOR=$(tr -d '\n' </sys/class/dmi/id/sys_vendor 2>/dev/null || true)
    DMI_PRODUCT=$(tr -d '\n' </sys/class/dmi/id/product_name 2>/dev/null || true)
    DMI_SERIAL=$(tr -d '\n' </sys/class/dmi/id/product_serial 2>/dev/null || true)

    # Assemble the hardware_facts object; empty strings become null and the
    # dmi object is dropped entirely when none of its fields are known.
    HARDWARE_FACTS=$(${pkgs.jq}/bin/jq -n \
      --arg architecture "$(${pkgs.coreutils}/bin/uname -m)" \
      --arg cpu_model "$CPU_MODEL" \
      --arg cpu_threads "$CPU_THREADS" \
      --arg cpu_cores "$CPU_CORES" \
      --arg memory_bytes "$MEMORY_BYTES" \
      --arg dmi_vendor "$DMI_VENDOR" \
      --arg dmi_product "$DMI_PRODUCT" \
      --arg dmi_serial "$DMI_SERIAL" \
      --argjson disks "$DISKS_JSON" \
      --argjson nics "$NICS_JSON" '
      {
        architecture: (if $architecture == "" then null else $architecture end),
        cpu_model: (if $cpu_model == "" then null else $cpu_model end),
        cpu_threads: (if $cpu_threads == "" then null else ($cpu_threads | tonumber) end),
        cpu_cores: (if $cpu_cores == "" then null else ($cpu_cores | tonumber) end),
        memory_bytes: (if $memory_bytes == "" then null else ($memory_bytes | tonumber) end),
        disks: $disks,
        nics: $nics,
        dmi: ({
          vendor: (if $dmi_vendor == "" then null else $dmi_vendor end),
          product_name: (if $dmi_product == "" then null else $dmi_product end),
          serial_number: (if $dmi_serial == "" then null else $dmi_serial end)
        } | with_entries(select(.value != null)))
      }
      | if (.dmi | length) == 0 then del(.dmi) else . end
    ')

    REQUEST_JSON=$(${pkgs.jq}/bin/jq -n \
      --arg machine_id "$MACHINE_ID" \
      --arg node_id "$NODE_HOSTNAME" \
      --arg hostname "$NODE_HOSTNAME" \
      --arg ip "$NODE_IP" \
      --argjson hardware_facts "$HARDWARE_FACTS" '
      {
        machine_id: $machine_id,
        node_id: $node_id,
        hostname: $hostname,
        ip: $ip,
        hardware_facts: $hardware_facts
      }
    ')

    # Phone Home request with retry
    for i in 1 2 3 4 5; do
      echo "Attempt $i/5: Contacting Deployer..."
      if RESPONSE=$(${pkgs.curl}/bin/curl "''${CURL_ARGS[@]}" -X POST \
        -H "Content-Type: application/json" \
        -d "$REQUEST_JSON" \
        "$DEPLOYER_URL/api/v1/phone-home"); then
        echo " Phone Home successful"

        # Create directories
        mkdir -p /etc/ssh /etc/plasmacloud /root/.ssh

        # Validate success flag
        SUCCESS=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.success // false' || echo "false")
        if [ "$SUCCESS" != "true" ]; then
          MESSAGE=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.message // empty' || true)
          echo " Phone Home rejected: $MESSAGE"
          sleep $((2 ** i))
          continue
        fi

        # Extract and apply secrets
        NODE_CONFIG=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -c '.node_config // empty' || true)
        if [ -z "$NODE_CONFIG" ] || [ "$NODE_CONFIG" = "null" ]; then
          echo " Phone Home response missing node_config"
          sleep $((2 ** i))
          continue
        fi
        echo "$NODE_CONFIG" > /etc/plasmacloud/node-config.json
        echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.node_config.ssh_authorized_keys[]?' > /root/.ssh/authorized_keys

        # Apply SSH host key if provided (public half is derived from it)
        SSH_HOST_KEY=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.ssh_host_key // empty')
        if [ -n "$SSH_HOST_KEY" ]; then
          umask 077
          echo "$SSH_HOST_KEY" > /etc/ssh/ssh_host_ed25519_key
          ${pkgs.openssh}/bin/ssh-keygen -y -f /etc/ssh/ssh_host_ed25519_key > /etc/ssh/ssh_host_ed25519_key.pub
        fi

        # Apply TLS material if provided (both cert and key are required)
        TLS_CERT=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.tls_cert // empty')
        TLS_KEY=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.tls_key // empty')
        if [ -n "$TLS_CERT" ] && [ -n "$TLS_KEY" ]; then
          umask 077
          mkdir -p /etc/plasmacloud/tls
          echo "$TLS_CERT" > /etc/plasmacloud/tls/node.crt
          echo "$TLS_KEY" > /etc/plasmacloud/tls/node.key
        fi

        # Generate host keys locally if missing
        if [ ! -s /etc/ssh/ssh_host_ed25519_key ]; then
          ${pkgs.openssh}/bin/ssh-keygen -A
        fi

        # Set permissions (files that were not written are skipped silently)
        chmod 644 /etc/plasmacloud/node-config.json 2>/dev/null || true
        chmod 700 /root/.ssh 2>/dev/null || true
        chmod 600 /root/.ssh/authorized_keys 2>/dev/null || true
        chmod 600 /etc/ssh/ssh_host_ed25519_key 2>/dev/null || true
        chmod 644 /etc/ssh/ssh_host_ed25519_key.pub 2>/dev/null || true
        chmod 600 /etc/plasmacloud/tls/node.key 2>/dev/null || true
        chmod 644 /etc/plasmacloud/tls/node.crt 2>/dev/null || true

        # Signal success
        NODE_ID=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.node_id // "unknown"')
        echo " Bootstrap complete: $NODE_ID"
        exit 0
      else
        echo " Phone Home failed, attempt $i/5"
        sleep $((2 ** i))
      fi
    done

    echo " Bootstrap failed after 5 attempts"
    exit 1
  '';
};
# Auto-install service - partitions the target disk with disko and runs
# nixos-install, then reboots.
#
# Runs after (and requires) plasmacloud-bootstrap.service and reads
# /etc/plasmacloud/node-config.json for:
#   .hostname, .ip                      (both mandatory)
#   .install_plan.nixos_configuration   (falls back to .hostname)
#   .install_plan.disko_config_path     (falls back to
#                                        nix/nodes/vm-cluster/<hostname>/disko.nix)
#   .install_plan.target_disk_by_id / .install_plan.target_disk
#                                       (falls back to the first disk lsblk lists)
# The flake source is taken from the Deployer's flake bundle when it can be
# downloaded, otherwise from the tree embedded in the ISO.
# If the second partition of the target disk is ext4 and already contains
# /etc/NIXOS, the install is skipped.
systemd.services.plasmacloud-install = {
  description = "PlasmaCloud Auto-Install to Disk";
  wantedBy = [ "multi-user.target" ];
  after = [ "plasmacloud-bootstrap.service" ];
  requires = [ "plasmacloud-bootstrap.service" ];
  serviceConfig = {
    Type = "oneshot";
    RemainAfterExit = true;
    StandardOutput = "journal+console";
    StandardError = "journal+console";
  };
  script = ''
    set -euo pipefail

    # nix and nixos-install are invoked by bare name below, but a NixOS unit
    # only gets a minimal PATH (coreutils, findutils, grep, sed, systemd).
    # Append the live system profile so those commands resolve.
    export PATH="$PATH:/run/current-system/sw/bin"

    # Flake commands are used from the validation step onwards, so the
    # experimental features must be enabled before the first nix call.
    export NIX_CONFIG="experimental-features = nix-command flakes"

    # Print the value of a key=value kernel command-line argument.
    # Returns 1 when the key is not present.
    cmdline_value() {
      local key="$1"
      local arg
      for arg in $(cat /proc/cmdline); do
        case "$arg" in
          "$key"=*)
            echo "''${arg#*=}"
            return 0
            ;;
        esac
      done
      return 1
    }

    if [ ! -s /etc/plasmacloud/node-config.json ]; then
      echo "ERROR: node-config.json missing (bootstrap not complete?)"
      exit 1
    fi

    NODE_ID=$(${pkgs.jq}/bin/jq -r '.hostname // empty' /etc/plasmacloud/node-config.json)
    NODE_IP=$(${pkgs.jq}/bin/jq -r '.ip // empty' /etc/plasmacloud/node-config.json)
    NIXOS_CONFIGURATION=$(${pkgs.jq}/bin/jq -r '.install_plan.nixos_configuration // .hostname // empty' /etc/plasmacloud/node-config.json)
    DISKO_PATH=$(${pkgs.jq}/bin/jq -r '.install_plan.disko_config_path // empty' /etc/plasmacloud/node-config.json)
    TARGET_DISK=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk // empty' /etc/plasmacloud/node-config.json)
    TARGET_DISK_BY_ID=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk_by_id // empty' /etc/plasmacloud/node-config.json)

    # Discover Deployer via environment, kernel cmdline, or fallback.
    DEPLOYER_URL="''${DEPLOYER_URL:-}"
    if [ -z "$DEPLOYER_URL" ]; then
      DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
    fi
    if [ -z "$DEPLOYER_URL" ]; then
      DEPLOYER_URL="http://192.168.100.1:8080"
    fi

    SRC_ROOT="/opt/plasmacloud-src"

    if [ -z "$NODE_ID" ] || [ -z "$NODE_IP" ]; then
      echo "ERROR: node-config.json missing hostname/ip"
      exit 1
    fi
    if [ -z "$NIXOS_CONFIGURATION" ]; then
      echo "ERROR: node-config.json missing install_plan.nixos_configuration"
      exit 1
    fi

    # Optional bootstrap token (from file, environment, or kernel cmdline)
    TOKEN_FILE="/etc/plasmacloud/bootstrap-token"
    DEPLOYER_TOKEN=""
    if [ -s "$TOKEN_FILE" ]; then
      DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
    elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
      DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
    else
      DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
    fi

    # Optional CA certificate: explicit path, or fetched from a cmdline URL.
    DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
    if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
      DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
      if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
        DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
        ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
          "$DEPLOYER_CA_CERT_URL" \
          -o "$DEPLOYER_CA_CERT_PATH"
      fi
    fi

    CURL_ARGS=(-sfL --connect-timeout 5 --max-time 120)
    if [ -n "$DEPLOYER_TOKEN" ]; then
      CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
    fi
    if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
      CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
    fi

    # Prefer a fresh flake bundle from the Deployer; otherwise use the tree
    # shipped in the ISO.
    BUNDLE_PATH="/run/plasmacloud/flake-bundle.tar.gz"
    mkdir -p /run/plasmacloud
    if ${pkgs.curl}/bin/curl "''${CURL_ARGS[@]}" \
      "$DEPLOYER_URL/api/v1/bootstrap/flake-bundle" \
      -o "$BUNDLE_PATH"; then
      echo "Downloaded bootstrap flake bundle from deployer"
      rm -rf "$SRC_ROOT"
      mkdir -p "$SRC_ROOT"
      ${pkgs.gzip}/bin/gzip -dc "$BUNDLE_PATH" | ${pkgs.gnutar}/bin/tar -xf - -C "$SRC_ROOT"
    else
      echo "No deployer flake bundle available; using embedded source tree"
      # Files added through isoImage.contents live on the ISO filesystem,
      # which the live system mounts at /iso. Use that copy when nothing
      # exists at the plain path.
      if [ ! -d "$SRC_ROOT" ] && [ -d "/iso$SRC_ROOT" ]; then
        SRC_ROOT="/iso$SRC_ROOT"
      fi
    fi

    # Resolve the disko config: explicit path from the install plan, else the
    # per-node default inside the source tree.
    if [ -z "$DISKO_PATH" ]; then
      CANDIDATE_DISKO="nix/nodes/vm-cluster/$NODE_ID/disko.nix"
      if [ -f "$SRC_ROOT/$CANDIDATE_DISKO" ]; then
        DISKO_PATH="$CANDIDATE_DISKO"
      fi
    fi
    if [ -z "$DISKO_PATH" ]; then
      echo "ERROR: node-config.json missing install_plan.disko_config_path and no default Disko path exists for $NODE_ID"
      exit 1
    fi
    if [ ! -f "$SRC_ROOT/$DISKO_PATH" ]; then
      echo "ERROR: Disko config not found: $SRC_ROOT/$DISKO_PATH"
      exit 1
    fi

    echo "PlasmaCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, disko_path=$DISKO_PATH)"

    # Resolve installation target disk.
    if [ -n "$TARGET_DISK_BY_ID" ]; then
      if [ ! -b "$TARGET_DISK_BY_ID" ]; then
        echo "ERROR: target_disk_by_id does not exist: $TARGET_DISK_BY_ID"
        exit 1
      fi
      DISK="$TARGET_DISK_BY_ID"
    elif [ -n "$TARGET_DISK" ]; then
      if [ ! -b "$TARGET_DISK" ]; then
        echo "ERROR: target_disk does not exist: $TARGET_DISK"
        exit 1
      fi
      DISK="$TARGET_DISK"
    else
      # First device of type "disk". awk reads all of its input instead of
      # exiting early, so lsblk cannot be killed by SIGPIPE under pipefail.
      DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk" && !seen {print $1; seen=1}')
    fi
    if [ -z "$DISK" ]; then
      echo "ERROR: No disk found"
      exit 1
    fi

    # The second partition of the target disk is treated as the root
    # partition for the "already installed" check.
    ROOT_PART=$(${pkgs.util-linux}/bin/lsblk -lnpo NAME,TYPE "$DISK" 2>/dev/null | ${pkgs.gawk}/bin/awk '$2=="part"{print $1}' | sed -n '2p')
    mkdir -p /mnt

    # Skip if already installed. mount/umount are called by store path: a
    # bare "mount" that is missing from PATH would fail silently here and the
    # machine would be reinstalled on every boot of the ISO.
    if [ -n "$ROOT_PART" ] && ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then
      ${pkgs.util-linux}/bin/mount "$ROOT_PART" /mnt 2>/dev/null || true
      if [ -e /mnt/etc/NIXOS ]; then
        echo " Existing NixOS detected; skipping install"
        ${pkgs.util-linux}/bin/umount /mnt || true
        exit 0
      fi
      ${pkgs.util-linux}/bin/umount /mnt || true
    fi

    echo "Validating NixOS configuration output..."
    nix eval --raw "$SRC_ROOT#nixosConfigurations.$NIXOS_CONFIGURATION.config.system.build.toplevel.drvPath" >/dev/null

    # Wrap the node's disko config so the resolved disk is injected through
    # plasmacloud.install.diskDevice. DISK is always non-empty at this point.
    cat > /run/plasmacloud/disko-wrapper.nix <<EOF
    { ... }:
    {
      imports = [
        "$SRC_ROOT/nix/modules/install-target.nix"
        "$SRC_ROOT/$DISKO_PATH"
      ];
      plasmacloud.install.diskDevice = "$DISK";
    }
    EOF
    EFFECTIVE_DISKO_PATH="/run/plasmacloud/disko-wrapper.nix"

    echo "Running disko to partition $DISK..."
    # NOTE(review): disko is fetched unpinned from GitHub, so this step needs
    # network access and follows upstream CLI changes - consider pinning.
    nix run github:nix-community/disko -- --mode disko "$EFFECTIVE_DISKO_PATH"

    echo "Running nixos-install..."
    nixos-install --flake "$SRC_ROOT#$NIXOS_CONFIGURATION" --no-root-passwd
    sync
    echo " Install complete; rebooting..."
    ${pkgs.systemd}/bin/systemctl reboot
  '';
};
# Tools installed into the live system for the bootstrap and install steps.
environment.systemPackages = [
  pkgs.curl
  pkgs.jq
  pkgs.vim
  pkgs.htop
  pkgs.gawk
  pkgs.gnugrep
  pkgs.util-linux
  pkgs.parted
  pkgs.dosfstools
  pkgs.e2fsprogs
  pkgs.gnutar
  pkgs.gzip
];
# OpenSSH server for non-interactive access; root login is allowed, but not
# with a password.
services.openssh.enable = true;
services.openssh.settings.PermitRootLogin = "prohibit-password";
# No root keys are baked into the image: they are provisioned dynamically via
# phone-home.
users.users.root.openssh.authorizedKeys = {
  keys = [ ];
};
}