#!/usr/bin/env bash set -euo pipefail ROOT="${ULTRACLOUD_REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}" DEFAULT_WORK_ROOT="${ULTRACLOUD_WORK_ROOT:-}" CLUSTER_ID="${ULTRACLOUD_BAREMETAL_CLUSTER_ID:-baremetal-iso-canonical}" CHAINFIRE_ENDPOINT="http://127.0.0.1:2379" DEPLOYER_ENDPOINT="http://127.0.0.1:8088" BINARY_CACHE_ENDPOINT="http://127.0.0.1:8090" BOOTSTRAP_TOKEN="${ULTRACLOUD_BAREMETAL_BOOTSTRAP_TOKEN:-baremetal-iso-bootstrap-token}" CONTROL_NODE_CLASS="${ULTRACLOUD_BAREMETAL_CONTROL_NODE_CLASS:-iso-control-plane}" WORKER_NODE_CLASS="${ULTRACLOUD_BAREMETAL_WORKER_NODE_CLASS:-iso-worker}" CONTROL_NIXOS_CONFIGURATION="${ULTRACLOUD_BAREMETAL_CONTROL_CONFIGURATION:-baremetal-qemu-control-plane}" WORKER_NIXOS_CONFIGURATION="${ULTRACLOUD_BAREMETAL_WORKER_CONFIGURATION:-baremetal-qemu-worker}" CONTROL_DISKO_CONFIG_PATH="${ULTRACLOUD_BAREMETAL_CONTROL_DISKO_CONFIG_PATH:-nix/nodes/baremetal-qemu/control-plane/disko.nix}" WORKER_DISKO_CONFIG_PATH="${ULTRACLOUD_BAREMETAL_WORKER_DISKO_CONFIG_PATH:-nix/nodes/baremetal-qemu/worker/disko.nix}" CONTROL_TARGET_DISK_BY_ID="${ULTRACLOUD_BAREMETAL_CONTROL_TARGET_DISK_BY_ID:-/dev/disk/by-id/virtio-uc-control-root}" WORKER_TARGET_DISK_BY_ID="${ULTRACLOUD_BAREMETAL_WORKER_TARGET_DISK_BY_ID:-/dev/disk/by-id/virtio-uc-worker-root}" CONTROL_DISK_SERIAL="${ULTRACLOUD_BAREMETAL_CONTROL_DISK_SERIAL:-uc-control-root}" WORKER_DISK_SERIAL="${ULTRACLOUD_BAREMETAL_WORKER_DISK_SERIAL:-uc-worker-root}" CONTROL_HEALTH_CHECK_PATH="/etc/ultracloud-role-control-plane" WORKER_HEALTH_CHECK_PATH="/etc/ultracloud-role-worker" CONTROL_NODE_ID="${ULTRACLOUD_BAREMETAL_CONTROL_NODE_ID:-iso-control-plane-01}" WORKER_NODE_ID="${ULTRACLOUD_BAREMETAL_WORKER_NODE_ID:-iso-worker-01}" CONTROL_SSH_PORT="${ULTRACLOUD_BAREMETAL_CONTROL_SSH_PORT:-22231}" WORKER_SSH_PORT="${ULTRACLOUD_BAREMETAL_WORKER_SSH_PORT:-22232}" CONTROL_DHCP_START="${ULTRACLOUD_BAREMETAL_CONTROL_DHCP_START:-10.0.2.15}" 
WORKER_DHCP_START="${ULTRACLOUD_BAREMETAL_WORKER_DHCP_START:-10.0.2.16}"
# Virtual disk sizes passed to qemu-img (value includes the unit suffix).
CONTROL_DISK_GIB="${ULTRACLOUD_BAREMETAL_CONTROL_DISK_GIB:-18G}"
WORKER_DISK_GIB="${ULTRACLOUD_BAREMETAL_WORKER_DISK_GIB:-18G}"

# Prefixed progress line on stdout.
log() {
  printf '[baremetal-iso-e2e] %s\n' "$*"
}

# Machine-greppable milestone line consumed by outer tooling.
marker() {
  printf 'ULTRACLOUD_MARKER %s\n' "$*"
}

# Fatal error: message to stderr, exit 1.
die() {
  echo "[baremetal-iso-e2e] ERROR: $*" >&2
  exit 1
}

# Abort unless $1 is resolvable as a command.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"
}

# Print the host's online CPU count; falls back to 1 on any failure or
# non-numeric answer.
host_cpu_count() {
  local count
  count="$(getconf _NPROCESSORS_ONLN 2>/dev/null || nproc 2>/dev/null || echo 1)"
  if [[ ! "${count}" =~ ^[0-9]+$ ]] || (( count < 1 )); then
    count=1
  fi
  printf '%s\n' "${count}"
}

# Default Nix max-jobs: 1 on tiny hosts, else roughly half the CPUs (rounded up).
default_local_nix_max_jobs() {
  local cpu_count="$1"
  if (( cpu_count <= 2 )); then
    printf '1\n'
    return 0
  fi
  printf '%s\n' "$(( (cpu_count + 1) / 2 ))"
}

# Default cores per Nix build job: cpu_count / max_jobs, clamped to >= 1.
default_local_nix_build_cores() {
  local cpu_count="$1"
  local max_jobs="$2"
  local build_cores=1
  if (( max_jobs > 0 )); then
    build_cores="$(( cpu_count / max_jobs ))"
  fi
  if (( build_cores < 1 )); then
    build_cores=1
  fi
  printf '%s\n' "${build_cores}"
}

# Guest vCPU count scaled to host size: 4 / 2 / 1.
default_baremetal_vm_vcpus() {
  local cpu_count="$1"
  if (( cpu_count >= 8 )); then
    printf '4\n'
  elif (( cpu_count >= 4 )); then
    printf '2\n'
  else
    printf '1\n'
  fi
}

# Guest RAM in MiB: 3 GiB on large hosts, else 2 GiB.
default_baremetal_vm_memory_mib() {
  local cpu_count="$1"
  if (( cpu_count >= 8 )); then
    printf '3072\n'
  else
    printf '2048\n'
  fi
}

# Append one line to the NIX_CONFIG accumulator (newline-separated).
append_nix_config_line() {
  local line="$1"
  if [[ -n "${NIX_CONFIG:-}" ]]; then
    NIX_CONFIG+=$'\n'
  fi
  NIX_CONFIG+="${line}"
}

# Force purely local Nix builds (no remote builders) with the computed
# parallelism, and export the result for child nix invocations.
configure_local_nix_execution() {
  append_nix_config_line "builders ="
  append_nix_config_line "max-jobs = ${LOCAL_NIX_MAX_JOBS}"
  append_nix_config_line "cores = ${LOCAL_NIX_BUILD_CORES}"
  append_nix_config_line "experimental-features = nix-command flakes"
  append_nix_config_line "warn-dirty = false"
  export NIX_CONFIG
}

# True when /dev/kvm exists and is read/write accessible to this user.
host_kvm_access() {
  [[ -r /dev/kvm && -w /dev/kvm ]]
}

# Print QEMU machine/accelerator arguments, one per line, for the selected
# accelerator mode.  Callers word-split the output; no argument may contain
# whitespace.
qemu_machine_args() {
  if [[ "${BAREMETAL_VM_ACCELERATOR_MODE}" == "kvm" ]]; then
    printf '%s\n' \
      "-machine" "pc,accel=kvm:tcg" \
      "-enable-kvm" \
      "-cpu" "host"
    return 0
  fi
  printf '%s\n' \
    "-machine" "pc" \
    "-accel" "tcg,thread=multi" \
    "-cpu" "max"
}

# nix build wrapper that pins local-only execution and parallelism limits.
nix_build_local() {
  NIX_BUILD_CORES="${LOCAL_NIX_BUILD_CORES}" nix \
    --option builders '' \
    --option warn-dirty false \
    --max-jobs "${LOCAL_NIX_MAX_JOBS}" \
    build "$@"
}

# Pick a writable work root: explicit override > $ROOT/work > $TMPDIR > /tmp.
resolve_default_work_root() {
  if [[ -n "${DEFAULT_WORK_ROOT}" ]]; then
    printf '%s\n' "${DEFAULT_WORK_ROOT}"
    return 0
  fi
  if [[ -w "${ROOT}" ]]; then
    printf '%s\n' "${ROOT}/work"
    return 0
  fi
  if [[ -n "${TMPDIR:-}" ]]; then
    printf '%s\n' "${TMPDIR}/ultracloud"
    return 0
  fi
  printf '%s\n' "/tmp/ultracloud"
}

# Resolve a Nix store path: env-var override ($1, read indirectly) wins,
# otherwise build flake attribute $2 locally and print its out path.
resolve_store_path() {
  local env_name="$1"
  local attr="$2"
  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi
  nix_build_local "$ROOT#$attr" --no-link --print-out-paths
}

# Resolve an executable: env-var override > binary on PATH > nix build of $3.
resolve_binary() {
  local env_name="$1"
  local bin_name="$2"
  local attr="$3"
  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi
  if command -v "$bin_name" >/dev/null 2>&1; then
    command -v "$bin_name"
    return 0
  fi
  local out
  out="$(nix_build_local "$ROOT#$attr" --no-link --print-out-paths)"
  printf '%s/bin/%s\n' "$out" "$bin_name"
}

# Accept either a direct .iso file or a build-output directory containing
# an iso/ subdirectory with exactly the image we want (first match wins).
resolve_iso_image() {
  local candidate="$1"
  if [[ -f "$candidate" ]]; then
    printf '%s\n' "$candidate"
    return 0
  fi
  local iso_dir="$candidate/iso"
  if [[ -d "$iso_dir" ]]; then
    local iso_path
    iso_path="$(find "$iso_dir" -maxdepth 1 -type f -name '*.iso' | head -n 1)"
    if [[ -n "$iso_path" ]]; then
      printf '%s\n' "$iso_path"
      return 0
    fi
  fi
  die "unable to resolve a bootable ISO file from $candidate"
}

# Resolve an OVMF firmware file: env override, else build nixpkgs#OVMF.fd and
# append the requested relative path (e.g. FV/OVMF_CODE.fd).
resolve_ovmf_firmware() {
  local env_name="$1"
  local relative_path="$2"
  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi
  local ovmf_dir
  ovmf_dir="$(nix_build_local nixpkgs#OVMF.fd --no-link --print-out-paths)"
  printf '%s/%s\n' "$ovmf_dir" "$relative_path"
}

# Snapshot the run configuration into $TMP_DIR/environment.txt for debugging.
capture_environment() {
  {
    printf 'started_at=%s\n' "$(date -Is)"
    printf 'pwd=%s\n' "$PWD"
    printf 'user=%s\n' "$(id -un)"
    printf 'uid=%s\n' "$(id -u)"
    printf 'gid=%s\n' "$(id -g)"
    printf 'work_root=%s\n' "${DEFAULT_WORK_ROOT}"
    printf 'state_dir=%s\n' "$TMP_DIR"
    printf 'iso_image=%s\n' "$ISO_IMAGE"
    printf 'flake_bundle=%s\n' "$FLAKE_BUNDLE"
    printf 'bootstrap_token_set=%s\n' "$([[ -n "${BOOTSTRAP_TOKEN}" ]] && echo yes || echo no)"
    printf 'control_node_class=%s\n' "$CONTROL_NODE_CLASS"
    printf 'worker_node_class=%s\n' "$WORKER_NODE_CLASS"
    printf 'control_nixos_configuration=%s\n' "$CONTROL_NIXOS_CONFIGURATION"
    printf 'worker_nixos_configuration=%s\n' "$WORKER_NIXOS_CONFIGURATION"
    printf 'control_disko_config_path=%s\n' "$CONTROL_DISKO_CONFIG_PATH"
    printf 'worker_disko_config_path=%s\n' "$WORKER_DISKO_CONFIG_PATH"
    printf 'control_target_disk_by_id=%s\n' "$CONTROL_TARGET_DISK_BY_ID"
    printf 'worker_target_disk_by_id=%s\n' "$WORKER_TARGET_DISK_BY_ID"
    printf 'control_target=%s\n' "$CONTROL_TARGET_SYSTEM"
    printf 'worker_target=%s\n' "$WORKER_TARGET_SYSTEM"
    printf 'tmpdir=%s\n' "${TMPDIR:-}"
    printf 'host_cpu_count=%s\n' "${HOST_CPU_COUNT}"
    printf 'local_nix_max_jobs=%s\n' "${LOCAL_NIX_MAX_JOBS}"
    printf 'local_nix_build_cores=%s\n' "${LOCAL_NIX_BUILD_CORES}"
    printf 'vm_accelerator_mode=%s\n' "${BAREMETAL_VM_ACCELERATOR_MODE}"
    printf 'vm_vcpus=%s\n' "${BAREMETAL_VM_VCPUS}"
    printf 'vm_memory_mib=%s\n' "${BAREMETAL_VM_MEMORY_MIB}"
    printf 'kvm_present=%s\n' "$([[ -e /dev/kvm ]] && echo yes || echo no)"
    printf 'kvm_access=%s\n' "$([[ -r /dev/kvm && -w /dev/kvm ]] && echo rw || echo no)"
    printf 'nix_builders=%s\n' "$(nix config show builders 2>/dev/null | awk -F' = ' 'NR==1 { print $2 }')"
  } >"$TMP_DIR/environment.txt"
}

# Poll $1 with curl once per second until success or $2 seconds elapse.
wait_for_http() {
  local url="$1"
  local timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}

# Poll a local log file for an extended-regex needle until the timeout.
wait_for_log_marker() {
  local label="$1"
  local log_file="$2"
  local needle="$3"
  local timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if [[ -f "$log_file" ]] && grep -Eq "$needle" "$log_file"; then
      log "${label}: observed ${needle}"
      return 0
    fi
    sleep 2
  done
  return 1
}

# ssh to root@localhost on the given forwarded port with a throwaway identity
# and host-key checking disabled (ephemeral test VMs).
ssh_base() {
  local port="$1"
  shift
  ssh \
    -F /dev/null \
    -i "$SSH_KEY" \
    -o BatchMode=yes \
    -o ConnectTimeout=5 \
    -o ConnectionAttempts=1 \
    -o StrictHostKeyChecking=no \
    -o UserKnownHostsFile=/dev/null \
    -o LogLevel=ERROR \
    -p "$port" \
    root@127.0.0.1 "$@"
}

# Retry `ssh true` until the guest's sshd answers or the timeout expires.
wait_for_ssh() {
  local label="$1"
  local port="$2"
  local timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if ssh_base "$port" true >/dev/null 2>&1; then
      log "${label}: SSH is reachable on port ${port}"
      return 0
    fi
    sleep 2
  done
  return 1
}

# Run a shell snippet remotely via a login bash, %q-quoted to survive ssh.
ssh_shell() {
  local port="$1"
  local script="$2"
  local quoted
  printf -v quoted '%q' "$script"
  ssh_base "$port" "bash -lc $quoted"
}

# Print the guest's resolved /run/current-system store path.
current_system_path() {
  local port="$1"
  ssh_shell "$port" 'readlink -f /run/current-system'
}

# Print the guest's kernel boot_id (changes on every reboot).
remote_boot_id() {
  local port="$1"
  ssh_shell "$port" 'cat /proc/sys/kernel/random/boot_id'
}

# Grep the guest's current-boot journal (optionally limited to the given
# systemd units, passed as trailing args) for a fixed-string needle.
remote_journal_has_marker() {
  local port="$1"
  local needle="$2"
  shift 2
  local remote_cmd="journalctl -b -o cat --no-pager"
  local unit
  for unit in "$@"; do
    printf -v remote_cmd '%s -u %q' "$remote_cmd" "$unit"
  done
  printf -v remote_cmd '%s | grep -Fq %q' "$remote_cmd" "$needle"
  ssh_shell "$port" "$remote_cmd"
}

# Poll the remote journal for a marker until the timeout expires.
wait_for_remote_journal_marker() {
  local label="$1"
  local port="$2"
  local needle="$3"
  local timeout_secs="$4"
  shift 4
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if remote_journal_has_marker "$port" "$needle" "$@" >/dev/null 2>&1; then
      log "${label}: observed ${needle} via remote journal"
      return 0
    fi
    sleep 2
  done
  return 1
}

# Poll `systemctl is-active` on the guest until the unit is active.
wait_for_remote_unit_active() {
  local label="$1"
  local port="$2"
  local unit_name="$3"
  local timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if ssh_shell "$port" "systemctl is-active ${unit_name} >/dev/null" >/dev/null 2>&1; then
      log "${label}: ${unit_name} is active"
      return 0
    fi
    sleep 2
  done
  return 1
}

# Wait until the guest reports a boot_id different from the pre-reboot one.
wait_for_reboot_transition() {
  local label="$1"
  local port="$2"
  local previous_boot_id="$3"
  local timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    local current_boot_id
    if current_boot_id="$(remote_boot_id "$port" 2>/dev/null)"; then
      if [[ -n "$current_boot_id" && "$current_boot_id" != "$previous_boot_id" ]]; then
        log "${label}: reboot completed with boot_id=${current_boot_id}"
        return 0
      fi
    fi
    sleep 2
  done
  return 1
}

# Print the node's observed-system status from deployer-ctl, or "missing" if
# the node cannot be inspected (yet).
observed_status() {
  local node_id="$1"
  local payload
  if ! payload="$(
    "$DEPLOYER_CTL_BIN" \
      --chainfire-endpoint "$CHAINFIRE_ENDPOINT" \
      --cluster-id "$CLUSTER_ID" \
      --cluster-namespace ultracloud \
      --deployer-namespace deployer \
      node inspect \
      --node-id "$node_id" \
      --include-observed-system \
      --format json 2>/dev/null
  )"; then
    printf 'missing\n'
    return 0
  fi
  jq -r '.observed_system.status // "missing"' <<<"$payload"
}

# Poll observed_status every 5s until the node reports "active".
wait_for_observed_active() {
  local node_id="$1"
  local timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if [[ "$(observed_status "$node_id")" == "active" ]]; then
      log "${node_id}: observed-system reached active"
      return 0
    fi
    sleep 5
  done
  return 1
}

# Dump the node record plus desired-system as JSON via deployer-ctl.
inspect_node_payload() {
  local node_id="$1"
  "$DEPLOYER_CTL_BIN" \
    --chainfire-endpoint "$CHAINFIRE_ENDPOINT" \
    --cluster-id "$CLUSTER_ID" \
    --cluster-namespace ultracloud \
    --deployer-namespace deployer \
    node inspect \
    --node-id "$node_id" \
    --include-desired-system \
    --format json
}

# Assert, via jq, that the node's install plan and desired system match the
# expected class/profile defaults exactly; dies on any mismatch.
assert_node_contract() {
  local node_id="$1"
  local expected_node_class="$2"
  local expected_nixos_configuration="$3"
  local expected_disko_config_path="$4"
  local expected_target_disk_by_id="$5"
  local expected_health_check_path="$6"
  local expected_target_system="$7"
  local payload
  payload="$(inspect_node_payload "$node_id")" \
    || die "${node_id} install contract is not inspectable through deployer-ctl"
  jq -e \
    --arg node_id "$node_id" \
    --arg node_class "$expected_node_class" \
    --arg nixos_configuration "$expected_nixos_configuration" \
    --arg disko_config_path "$expected_disko_config_path" \
    --arg target_disk_by_id "$expected_target_disk_by_id" \
    --arg health_check_path "$expected_health_check_path" \
    --arg target_system "$expected_target_system" \
    '
      .node.node_id == $node_id
      and .node.node_class == $node_class
      and .node.install_plan.nixos_configuration == $nixos_configuration
      and .node.install_plan.disko_config_path == $disko_config_path
      and (.node.install_plan.target_disk_by_id // "") == $target_disk_by_id
      and (.node.install_plan.target_disk // "") == ""
      and .desired_system.nixos_configuration == $nixos_configuration
      and (.desired_system.target_system // "") == $target_system
      and (.desired_system.switch_action // "switch") == "switch"
      and (.desired_system.rollback_on_failure // true) == true
      and ((.desired_system.health_check_command | if length == 0 then "" else .[-1] end) == $health_check_path)
    ' <<<"$payload" >/dev/null \
    || die "${node_id} install contract did not resolve to the expected class/profile defaults"
  log "${node_id}: install contract resolved via node class ${expected_node_class}"
}

# Die if any local listener is already bound to TCP port $1.
assert_port_free() {
  local port="$1"
  if ss -ltn "( sport = :$port )" | grep -Fq ":$port"; then
    die "port $port is already in use"
  fi
}

# Write host-side service configs and start Chainfire + Deployer in the
# background, waiting for each health endpoint.
start_host_services() {
  # NOTE(review): the heredoc bodies for chainfire.toml / deployer.toml and
  # the Chainfire launch command appear to have been lost when this file was
  # flattened onto single lines -- the statement below is not runnable as-is.
  # Restore the original heredocs from version control before executing.
  cat >"$TMP_DIR/chainfire.toml" <"$TMP_DIR/deployer.toml" <"$CHAINFIRE_LOG" 2>&1 &
  CHAINFIRE_PID="$!"
  wait_for_http "http://127.0.0.1:8081/health" 120 \
    || die "host Chainfire did not become healthy"
  log "Starting host-side Deployer"
  # Disable colored/styled output so log markers can be grepped reliably.
  NO_COLOR=1 CLICOLOR=0 RUST_LOG_STYLE=never \
    "$DEPLOYER_SERVER_BIN" --config "$TMP_DIR/deployer.toml" >"$DEPLOYER_LOG" 2>&1 &
  DEPLOYER_PID="$!"
  wait_for_http "http://127.0.0.1:8088/health" 120 \
    || die "host Deployer did not become healthy"
}

# Populate a flat-file Nix binary cache (narinfo + nar files) containing the
# full closure of both target systems and both disko scripts.
seed_binary_cache() {
  local path
  local nar_rel
  local nar_path
  local store_base
  local store_hash
  local nar_hash
  local nar_size
  local refs
  local deriver
  mkdir -p "$NIX_CACHE_DIR/nar"
  cat >"$NIX_CACHE_DIR/nix-cache-info" <<'EOF'
StoreDir: /nix/store
WantMassQuery: 1
Priority: 30
EOF
  log "Seeding host-local Nix binary cache"
  # Optionally import store-path registration metadata (validity/db info).
  if [[ -n "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION:-}" && -f "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration" ]]; then
    nix-store --load-db <"${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration"
  fi
  while IFS= read -r path; do
    [[ -n "$path" ]] || continue
    store_base="$(basename "$path")"
    store_hash="${store_base%%-*}"
    nar_rel="nar/${store_base}.nar"
    nar_path="$NIX_CACHE_DIR/$nar_rel"
    if [[ ! -f "$nar_path" ]]; then
      nix-store --dump "$path" >"$nar_path"
    fi
    nar_size="$(stat -c%s "$nar_path")"
    nar_hash="$(nix hash file --type sha256 --base32 "$nar_path")"
    refs="$(nix-store --query --references "$path" | xargs -r -n1 basename | tr '\n' ' ' | sed 's/ $//')"
    deriver="$(nix-store --query --deriver "$path" 2>/dev/null || true)"
    deriver="$(basename "$deriver" 2>/dev/null || true)"
    # NARs are stored uncompressed, so FileHash/FileSize == NarHash/NarSize.
    {
      echo "StorePath: $path"
      echo "URL: $nar_rel"
      echo "Compression: none"
      echo "FileHash: sha256:$nar_hash"
      echo "FileSize: $nar_size"
      echo "NarHash: sha256:$nar_hash"
      echo "NarSize: $nar_size"
      echo "References: $refs"
      if [[ -n "$deriver" && "$deriver" != "unknown-deriver" ]]; then
        echo "Deriver: $deriver"
      fi
    } >"$NIX_CACHE_DIR/${store_hash}.narinfo"
  done < <(
    nix-store --query --requisites \
      "$CONTROL_TARGET_SYSTEM" \
      "$WORKER_TARGET_SYSTEM" \
      "$CONTROL_DISKO_SCRIPT" \
      "$WORKER_DISKO_SCRIPT" \
      | sort -u
  )
}

# Seed then serve the binary cache over HTTP on port 8090.
start_binary_cache() {
  seed_binary_cache
  log "Starting host-local Nix binary cache"
  python3 -m http.server 8090 --bind 0.0.0.0 --directory "$NIX_CACHE_DIR" \
    >"$NIX_CACHE_LOG" 2>&1 &
  NIX_CACHE_PID="$!"
  wait_for_http "${BINARY_CACHE_ENDPOINT}/nix-cache-info" 120 \
    || die "host-local Nix binary cache did not become reachable"
}

# Write and apply the cluster-state manifest, then (in the original source)
# launch an installer VM booted from the ISO.
apply_cluster_state() {
  # NOTE(review): the cluster-state.yaml heredoc body, the deployer-ctl apply
  # invocation, and the entire launch_iso_vm() function header (with its
  # local parameters and the qemu-img disk creation) appear to have been lost
  # in this flattened copy -- main below still calls launch_iso_vm, which is
  # no longer defined here.  Restore from version control before running.
  cat >"$TMP_DIR/cluster-state.yaml" </dev/null
  # From here down this is recognizably the tail of launch_iso_vm: reset the
  # per-VM OVMF vars file from the template, then boot QEMU from the ISO
  # (boot order once=d) with -no-reboot so the installer's reboot exits QEMU.
  rm -f "$ovmf_vars_path"
  cp "$OVMF_VARS_TEMPLATE" "$ovmf_vars_path"
  chmod u+w "$ovmf_vars_path"
  nohup "$QEMU_BIN" \
    -name "$label" \
    -smp "${BAREMETAL_VM_VCPUS}" \
    -m "${BAREMETAL_VM_MEMORY_MIB}" \
    -nographic \
    -no-reboot \
    -boot order=dc,once=d,menu=off \
    $(qemu_machine_args) \
    -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \
    -drive if=pflash,format=raw,file="$ovmf_vars_path" \
    -drive id=systemdisk,if=none,file="$disk_path",format=qcow2 \
    -device virtio-blk-pci,bootindex=1,drive=systemdisk,serial="$disk_serial" \
    -cdrom "$ISO_IMAGE" \
    -netdev user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start} \
    -device virtio-net-pci,netdev=user0,mac="${mac}" \
    -smbios type=1,product=UltraCloudQEMUBaremetal,serial="${node_id}" \
    >"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}

# Relaunch a node from its installed disk (no ISO, no -no-reboot), reusing
# the OVMF vars file created during the installer boot.
launch_installed_vm() {
  local label="$1"
  local ssh_port="$2"
  local dhcp_start="$3"
  local mac="$4"
  local disk_serial="$5"
  local disk_path="$6"
  local log_path="$7"
  local ovmf_vars_path="${disk_path}.ovmf-vars.fd"
  [[ -f "$ovmf_vars_path" ]] || die "missing OVMF vars file for relaunch: $ovmf_vars_path"
  nohup "$QEMU_BIN" \
    -name "$label" \
    -smp "${BAREMETAL_VM_VCPUS}" \
    -m "${BAREMETAL_VM_MEMORY_MIB}" \
    -nographic \
    $(qemu_machine_args) \
    -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \
    -drive if=pflash,format=raw,file="$ovmf_vars_path" \
    -drive id=systemdisk,if=none,file="$disk_path",format=qcow2 \
    -device virtio-blk-pci,bootindex=1,drive=systemdisk,serial="$disk_serial" \
    -netdev user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start} \
    -device virtio-net-pci,netdev=user0,mac="${mac}" \
    >>"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}

# Wait for the process recorded in $2 to exit (installer VMs exit on reboot
# because they run with -no-reboot).
wait_for_pid_exit() {
  local label="$1"
  local pid_file="$2"
  local timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))
  local pid
  [[ -f "$pid_file" ]] || die "${label} is missing pid file $pid_file"
  pid="$(cat "$pid_file")"
  while (( SECONDS < deadline )); do
    if ! kill -0 "$pid" >/dev/null 2>&1; then
      log "${label}: QEMU exited after installer-triggered reboot"
      return 0
    fi
    sleep 2
  done
  return 1
}

# End-to-end verification of one node: phone-home registration, install
# contract, installer-phase journal markers, reboot into the installed disk,
# post-install markers/units, and final desired-system convergence.
verify_node() {
  local node_id="$1"
  local ssh_port="$2"
  local disk_path="$3"
  local log_path="$4"
  local expected_role="$5"
  local expected_system="$6"
  local expected_nixos_configuration="$7"
  local expected_node_class="$8"
  local expected_disko_config_path="$9"
  local expected_target_disk_by_id="${10}"
  local expected_health_check_path="${11}"
  local dhcp_start="${12}"
  local mac="${13}"
  local disk_serial="${14}"
  wait_for_log_marker "$node_id" "$TMP_DIR/deployer.log" "Node registered successfully.*node_id=${node_id}" 900 \
    || die "${node_id} never completed /api/v1/phone-home registration"
  assert_node_contract \
    "$node_id" \
    "$expected_node_class" \
    "$expected_nixos_configuration" \
    "$expected_disko_config_path" \
    "$expected_target_disk_by_id" \
    "$expected_health_check_path" \
    "$expected_system"
  wait_for_ssh "$node_id" "$ssh_port" 900 \
    || die "${node_id} never exposed SSH during the installer boot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.boot.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the pre-install boot marker"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.phone-home.complete.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the phone-home completion marker"
  marker "pre-install.${node_id}"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.bundle-downloaded.${node_id}" 1200 \
    ultracloud-install.service \
    || die "${node_id} never downloaded the flake bundle"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.disko.complete.${node_id}" 2400 \
    ultracloud-install.service \
    || die "${node_id} never completed disko"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.nixos-install.complete.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never finished nixos-install"
  marker "install.${node_id}"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER reboot.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never emitted reboot marker"
  marker "reboot.${node_id}"
  # Installer VMs run with -no-reboot, so the triggered reboot ends QEMU;
  # relaunch from the now-installed disk.
  wait_for_pid_exit "$node_id" "${log_path}.pid" 300 \
    || die "${node_id} installer VM did not exit after the reboot marker"
  launch_installed_vm \
    "ultracloud-baremetal-${node_id}-installed" \
    "$ssh_port" \
    "$dhcp_start" \
    "$mac" \
    "$disk_serial" \
    "$disk_path" \
    "$log_path"
  wait_for_ssh "$node_id" "$ssh_port" 1800 \
    || die "${node_id} did not come back over SSH after reboot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER post-install.boot.${node_id}.${expected_role}" 1800 \
    ultracloud-baremetal-postinstall-marker.service \
    || die "${node_id} never emitted post-install marker"
  marker "post-install.${node_id}"
  ssh_shell "$ssh_port" 'test -f /etc/ultracloud/node-config.json'
  ssh_shell "$ssh_port" 'test -d /var/lib/photon-src/.bundle-inputs/nixpkgs'
  wait_for_remote_unit_active "$node_id" "$ssh_port" "nix-agent.service" 180 \
    || die "${node_id} never started nix-agent.service after install"
  ssh_shell "$ssh_port" "grep -Fx '${expected_role}' /etc/ultracloud-role"
  ssh_shell "$ssh_port" "test -b '${expected_target_disk_by_id}'"
  # Only control-plane nodes run Chainfire after install.
  if [[ "$expected_role" == "control-plane" ]]; then
    wait_for_remote_unit_active "$node_id" "$ssh_port" "chainfire.service" 180 \
      || die "${node_id} never started chainfire.service after install"
  fi
  wait_for_observed_active "$node_id" 1200 \
    || die "${node_id} never reached observed-system active"
  [[ "$(current_system_path "$ssh_port")" == "$expected_system" ]] \
    || die "${node_id} current system does not match expected target"
  marker "desired-system-active.${node_id}"
}

# EXIT trap: record final status, stop all background processes (VMs first,
# then host services), dump log tails on failure, and clean the state dir
# unless KEEP_STATE_DIR=1.
cleanup() {
  local status="$?"
  set +e
  if [[ -n "${TMP_DIR:-}" && -d "${TMP_DIR}" ]]; then
    {
      printf 'finished_at=%s\n' "$(date -Is)"
      printf 'exit_status=%s\n' "$status"
    } >>"$TMP_DIR/environment.txt"
  fi
  for pid_file in "$CONTROL_LOG.pid" "$WORKER_LOG.pid"; do
    if [[ -f "$pid_file" ]]; then
      pid="$(cat "$pid_file")"
      kill "$pid" 2>/dev/null || true
      wait "$pid" 2>/dev/null || true
    fi
  done
  if [[ -n "${DEPLOYER_PID:-}" ]]; then
    kill "$DEPLOYER_PID" 2>/dev/null || true
    wait "$DEPLOYER_PID" 2>/dev/null || true
  fi
  if [[ -n "${CHAINFIRE_PID:-}" ]]; then
    kill "$CHAINFIRE_PID" 2>/dev/null || true
    wait "$CHAINFIRE_PID" 2>/dev/null || true
  fi
  if [[ -n "${NIX_CACHE_PID:-}" ]]; then
    kill "$NIX_CACHE_PID" 2>/dev/null || true
    wait "$NIX_CACHE_PID" 2>/dev/null || true
  fi
  if (( status != 0 )); then
    log "control-plane serial log tail:"
    tail -n 120 "$CONTROL_LOG" 2>/dev/null || true
    log "worker serial log tail:"
    tail -n 120 "$WORKER_LOG" 2>/dev/null || true
    log "deployer log tail:"
    tail -n 120 "$DEPLOYER_LOG" 2>/dev/null || true
    log "chainfire log tail:"
    tail -n 120 "$CHAINFIRE_LOG" 2>/dev/null || true
    log "binary cache log tail:"
    tail -n 120 "$NIX_CACHE_LOG" 2>/dev/null || true
  fi
  if [[ "${KEEP_STATE_DIR:-0}" != "1" ]]; then
    rm -rf "$TMP_DIR"
  fi
  exit "$status"
}

# Entry point: resolve all inputs, prepare state, start host services, then
# run the install-and-verify cycle for the control plane followed by the
# worker.
main() {
  DEFAULT_WORK_ROOT="$(resolve_default_work_root)"
  HOST_CPU_COUNT="$(host_cpu_count)"
  LOCAL_NIX_MAX_JOBS="${ULTRACLOUD_BAREMETAL_NIX_MAX_JOBS:-${ULTRACLOUD_LOCAL_NIX_MAX_JOBS:-$(default_local_nix_max_jobs "${HOST_CPU_COUNT}")}}"
  LOCAL_NIX_BUILD_CORES="${ULTRACLOUD_BAREMETAL_NIX_BUILD_CORES:-${ULTRACLOUD_LOCAL_NIX_BUILD_CORES:-$(default_local_nix_build_cores "${HOST_CPU_COUNT}" "${LOCAL_NIX_MAX_JOBS}")}}"
  BAREMETAL_VM_VCPUS="${ULTRACLOUD_BAREMETAL_VM_VCPUS:-$(default_baremetal_vm_vcpus "${HOST_CPU_COUNT}")}"
  BAREMETAL_VM_MEMORY_MIB="${ULTRACLOUD_BAREMETAL_VM_MEMORY_MIB:-$(default_baremetal_vm_memory_mib "${HOST_CPU_COUNT}")}"
  # Accelerator: forced TCG wins; otherwise KVM when /dev/kvm is usable.
  if [[ "${ULTRACLOUD_BAREMETAL_FORCE_TCG:-0}" == "1" ]]; then
    BAREMETAL_VM_ACCELERATOR_MODE="tcg"
  elif host_kvm_access; then
    BAREMETAL_VM_ACCELERATOR_MODE="kvm"
  else
    BAREMETAL_VM_ACCELERATOR_MODE="tcg"
  fi
  configure_local_nix_execution
  require_cmd curl
  require_cmd jq
  require_cmd nix
  require_cmd python3
  require_cmd qemu-img
  require_cmd qemu-system-x86_64
  require_cmd ssh
  require_cmd ssh-keygen
  require_cmd ss
  # Resolve every artifact and binary (env override > PATH > local nix build).
  ISO_IMAGE="$(resolve_iso_image "$(resolve_store_path ULTRACLOUD_BAREMETAL_ISO_IMAGE 'nixosConfigurations.ultracloud-iso.config.system.build.isoImage')")"
  FLAKE_BUNDLE="$(resolve_store_path ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE 'packages.x86_64-linux.ultracloudFlakeBundle')"
  CONTROL_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_TARGET 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel')"
  WORKER_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_TARGET 'nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel')"
  CONTROL_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount')"
  WORKER_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount')"
  CHAINFIRE_BIN="$(resolve_binary ULTRACLOUD_CHAINFIRE_SERVER_BIN chainfire 'packages.x86_64-linux.chainfire-server')"
  DEPLOYER_SERVER_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_SERVER_BIN deployer-server 'packages.x86_64-linux.deployer-server')"
  DEPLOYER_CTL_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_CTL_BIN deployer-ctl 'packages.x86_64-linux.deployer-ctl')"
  OVMF_CODE_FD="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_CODE 'FV/OVMF_CODE.fd')"
  OVMF_VARS_TEMPLATE="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_VARS 'FV/OVMF_VARS.fd')"
  QEMU_BIN="${ULTRACLOUD_QEMU_BIN:-$(command -v qemu-system-x86_64)}"
  QEMU_IMG_BIN="${ULTRACLOUD_QEMU_IMG_BIN:-$(command -v qemu-img)}"
  # State dir: either the explicit override or <work-root>/baremetal-iso.
  # In both cases it is kept after the run and wiped of everything except the
  # (expensive to rebuild) nix-cache subdirectory.
  if [[ -n "${ULTRACLOUD_BAREMETAL_STATE_DIR:-}" ]]; then
    TMP_DIR="$ULTRACLOUD_BAREMETAL_STATE_DIR"
    KEEP_STATE_DIR=1
    mkdir -p "$TMP_DIR"
    find "$TMP_DIR" -mindepth 1 -maxdepth 1 \
      ! -name nix-cache \
      -exec rm -rf {} +
  else
    TMP_DIR="${DEFAULT_WORK_ROOT}/baremetal-iso"
    KEEP_STATE_DIR=1
    mkdir -p "$TMP_DIR"
    find "$TMP_DIR" -mindepth 1 -maxdepth 1 \
      ! -name nix-cache \
      -exec rm -rf {} +
  fi
  export TMPDIR="${TMPDIR:-${DEFAULT_WORK_ROOT}/tmp}"
  export XDG_CACHE_HOME="${XDG_CACHE_HOME:-${DEFAULT_WORK_ROOT}/xdg-cache}"
  mkdir -p "$TMPDIR"
  mkdir -p "$XDG_CACHE_HOME"
  NIX_CACHE_DIR="$TMP_DIR/nix-cache"
  CONTROL_LOG="$TMP_DIR/control-plane.serial.log"
  WORKER_LOG="$TMP_DIR/worker.serial.log"
  DEPLOYER_LOG="$TMP_DIR/deployer.log"
  CHAINFIRE_LOG="$TMP_DIR/chainfire.log"
  NIX_CACHE_LOG="$TMP_DIR/nix-cache.log"
  trap cleanup EXIT
  # Fresh throwaway SSH identity for this run.
  SSH_KEY="$TMP_DIR/id_ed25519"
  ssh-keygen -q -t ed25519 -N "" -f "$SSH_KEY" >/dev/null
  # NOTE(review): SSH_PUBKEY is presumably consumed by the (truncated)
  # cluster-state.yaml heredoc in apply_cluster_state -- confirm against VCS.
  SSH_PUBKEY="$(tr -d '\n' <"$SSH_KEY.pub")"
  capture_environment
  assert_port_free 2379
  assert_port_free 8081
  assert_port_free 8088
  assert_port_free 8090
  assert_port_free "$CONTROL_SSH_PORT"
  assert_port_free "$WORKER_SSH_PORT"
  start_binary_cache
  start_host_services
  apply_cluster_state
  launch_iso_vm \
    "ultracloud-baremetal-control-plane" \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$CONTROL_DHCP_START" \
    "52:54:00:11:22:31" \
    "$CONTROL_DISK_SERIAL" \
    "$CONTROL_DISK_GIB" \
    "$TMP_DIR/control-plane.qcow2" \
    "$CONTROL_LOG"
  verify_node \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$TMP_DIR/control-plane.qcow2" \
    "$CONTROL_LOG" \
    "control-plane" \
    "$CONTROL_TARGET_SYSTEM" \
    "$CONTROL_NIXOS_CONFIGURATION" \
    "$CONTROL_NODE_CLASS" \
    "$CONTROL_DISKO_CONFIG_PATH" \
    "$CONTROL_TARGET_DISK_BY_ID" \
    "$CONTROL_HEALTH_CHECK_PATH" \
    "$CONTROL_DHCP_START" \
    "52:54:00:11:22:31" \
    "$CONTROL_DISK_SERIAL"
  launch_iso_vm \
    "ultracloud-baremetal-worker" \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$WORKER_DHCP_START" \
    "52:54:00:11:22:32" \
    "$WORKER_DISK_SERIAL" \
    "$WORKER_DISK_GIB" \
    "$TMP_DIR/worker.qcow2" \
    "$WORKER_LOG"
  verify_node \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$TMP_DIR/worker.qcow2" \
    "$WORKER_LOG" \
    "worker" \
    "$WORKER_TARGET_SYSTEM" \
    "$WORKER_NIXOS_CONFIGURATION" \
    "$WORKER_NODE_CLASS" \
    "$WORKER_DISKO_CONFIG_PATH" \
    "$WORKER_TARGET_DISK_BY_ID" \
    "$WORKER_HEALTH_CHECK_PATH" \
    "$WORKER_DHCP_START" \
    "52:54:00:11:22:32" \
    "$WORKER_DISK_SERIAL"
  log "Canonical ISO bare-metal QEMU verification succeeded"
}

main "$@"