#!/usr/bin/env bash
#
# End-to-end check of the canonical bare-metal ISO install flow under QEMU.
#
# For a control-plane node and then a worker node, the script boots the
# installer ISO in QEMU, follows the install through ULTRACLOUD_MARKER journal
# lines read over SSH, relaunches the VM from the installed disk and checks
# that the node reports its expected system as active.
#
# Host-side helpers started by this script: Chainfire, Deployer and a static
# Nix binary cache served by `python3 -m http.server`.
#
# Optional environment overrides (each one skips the matching `nix build` or
# PATH lookup):
#   ULTRACLOUD_REPO_ROOT
#   ULTRACLOUD_BAREMETAL_ISO_IMAGE
#   ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE
#   ULTRACLOUD_BAREMETAL_CONTROL_TARGET
#   ULTRACLOUD_BAREMETAL_WORKER_TARGET
#   ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT
#   ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT
#   ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION
#   ULTRACLOUD_BAREMETAL_STATE_DIR
#   ULTRACLOUD_CHAINFIRE_SERVER_BIN
#   ULTRACLOUD_DEPLOYER_SERVER_BIN
#   ULTRACLOUD_DEPLOYER_CTL_BIN
#   ULTRACLOUD_OVMF_CODE
#   ULTRACLOUD_OVMF_VARS
#   ULTRACLOUD_QEMU_BIN
#   ULTRACLOUD_QEMU_IMG_BIN
#
# NOTE(review): this copy of the script is damaged. The here-documents in
# start_host_services and apply_cluster_state, and the header of
# launch_iso_vm, appear to have been stripped. See the NOTE(review) comments
# on those functions; the script cannot run until they are restored.

set -euo pipefail

ROOT="${ULTRACLOUD_REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
CLUSTER_ID="baremetal-iso-canonical"
CHAINFIRE_ENDPOINT="http://127.0.0.1:2379"
# NOTE(review): DEPLOYER_ENDPOINT and BOOTSTRAP_TOKEN are not referenced in the
# text that survived; presumably the missing here-documents used them.
DEPLOYER_ENDPOINT="http://127.0.0.1:8088"
BINARY_CACHE_ENDPOINT="http://127.0.0.1:8090"
BOOTSTRAP_TOKEN="baremetal-iso-bootstrap-token"
CONTROL_NODE_ID="iso-control-plane-01"
WORKER_NODE_ID="iso-worker-01"
CONTROL_SSH_PORT="22231"
WORKER_SSH_PORT="22232"
CONTROL_DHCP_START="10.0.2.15"
WORKER_DHCP_START="10.0.2.16"
CONTROL_DISK_GIB="18G"
WORKER_DISK_GIB="18G"

# Print a progress line with the script prefix on stdout.
log() {
  printf '[baremetal-iso-e2e] %s\n' "$*"
}

# Print a machine-readable ULTRACLOUD_MARKER line on stdout.
marker() {
  printf 'ULTRACLOUD_MARKER %s\n' "$*"
}

# Print an error on stderr and exit with status 1.
die() {
  printf '[baremetal-iso-e2e] ERROR: %s\n' "$*" >&2
  exit 1
}

# Abort unless command $1 can be found.
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "required command not found: $1"
}

# Print the value of the environment variable named $1 when it is non-empty;
# otherwise build flake attribute $2 under $ROOT and print its output path(s).
resolve_store_path() {
  local env_name="$1"
  local attr="$2"

  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi

  nix build "$ROOT#$attr" --no-link --print-out-paths
}

# Print the path of binary $2. Lookup order: environment variable named $1,
# then PATH, then `<out>/bin/$2` of flake attribute $3 built under $ROOT.
resolve_binary() {
  local env_name="$1"
  local bin_name="$2"
  local attr="$3"

  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi

  if command -v "$bin_name" >/dev/null 2>&1; then
    command -v "$bin_name"
    return 0
  fi

  local out
  out="$(nix build "$ROOT#$attr" --no-link --print-out-paths)"
  printf '%s/bin/%s\n' "$out" "$bin_name"
}

# Print the ISO file for $1. $1 is either the ISO file itself or a directory
# that contains `iso/*.iso`. Dies when neither form yields a file.
resolve_iso_image() {
  local candidate="$1"

  if [[ -f "$candidate" ]]; then
    printf '%s\n' "$candidate"
    return 0
  fi

  local iso_dir="$candidate/iso"
  if [[ -d "$iso_dir" ]]; then
    local iso_path
    # -print -quit stops at the first match. Piping into `head -n 1` could
    # fail the pipeline under pipefail when find is killed by SIGPIPE.
    iso_path="$(find "$iso_dir" -maxdepth 1 -type f -name '*.iso' -print -quit)"
    if [[ -n "$iso_path" ]]; then
      printf '%s\n' "$iso_path"
      return 0
    fi
  fi

  die "unable to resolve a bootable ISO file from $candidate"
}

# Print an OVMF firmware path: the environment variable named $1 when set,
# otherwise relative path $2 inside the `nixpkgs#OVMF.fd` build output.
resolve_ovmf_firmware() {
  local env_name="$1"
  local relative_path="$2"

  if [[ -n "${!env_name:-}" ]]; then
    printf '%s\n' "${!env_name}"
    return 0
  fi

  local ovmf_dir
  ovmf_dir="$(nix build nixpkgs#OVMF.fd --no-link --print-out-paths)"
  printf '%s/%s\n' "$ovmf_dir" "$relative_path"
}

# Poll URL $1 once per second until curl succeeds or $2 seconds have passed.
# Returns 0 on success, 1 on timeout.
wait_for_http() {
  local url="$1"
  local timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if curl -fsS "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done

  return 1
}

# Poll file $2 every 2 seconds for extended regex $3, for at most $4 seconds.
# $1 is only a label for the log line. Returns 0 on match, 1 on timeout.
wait_for_log_marker() {
  local label="$1"
  local log_file="$2"
  local needle="$3"
  local timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if [[ -f "$log_file" ]] && grep -Eq "$needle" "$log_file"; then
      log "${label}: observed ${needle}"
      return 0
    fi
    sleep 2
  done

  return 1
}

# Run ssh as root against 127.0.0.1 on port $1, forwarding remaining arguments.
# Uses $SSH_KEY, ignores user ssh config and skips host-key checks.
ssh_base() {
  local port="$1"
  shift

  ssh \
    -F /dev/null \
    -i "$SSH_KEY" \
    -o BatchMode=yes \
    -o ConnectTimeout=5 \
    -o ConnectionAttempts=1 \
    -o StrictHostKeyChecking=no \
    -o UserKnownHostsFile=/dev/null \
    -o LogLevel=ERROR \
    -p "$port" \
    root@127.0.0.1 "$@"
}

# Poll every 2 seconds until `true` runs over SSH on port $2, for at most $3
# seconds. $1 is a label for the log line. Returns 0 on success, 1 on timeout.
wait_for_ssh() {
  local label="$1"
  local port="$2"
  local timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if ssh_base "$port" true >/dev/null 2>&1; then
      log "${label}: SSH is reachable on port ${port}"
      return 0
    fi
    sleep 2
  done

  return 1
}

# Run script text $2 on the guest at port $1 through `bash -lc`.
ssh_shell() {
  local port="$1"
  local script="$2"
  local quoted

  # %q keeps the script a single word once the remote shell re-parses it.
  printf -v quoted '%q' "$script"
  ssh_base "$port" "bash -lc $quoted"
}

# Print the resolved /run/current-system path of the guest at port $1.
current_system_path() {
  local port="$1"
  ssh_shell "$port" 'readlink -f /run/current-system'
}

# Print the kernel boot_id of the guest at port $1.
remote_boot_id() {
  local port="$1"
  ssh_shell "$port" 'cat /proc/sys/kernel/random/boot_id'
}

# Succeed when the current-boot journal of the guest at port $1 contains the
# fixed string $2. Remaining arguments are unit names, each passed as `-u`.
remote_journal_has_marker() {
  local port="$1"
  local needle="$2"
  shift 2

  local remote_cmd="journalctl -b -o cat --no-pager"
  local unit
  for unit in "$@"; do
    printf -v remote_cmd '%s -u %q' "$remote_cmd" "$unit"
  done
  printf -v remote_cmd '%s | grep -Fq %q' "$remote_cmd" "$needle"

  ssh_shell "$port" "$remote_cmd"
}

# Poll remote_journal_has_marker every 2 seconds for at most $4 seconds.
# Arguments: label, port, needle, timeout, then unit names.
# Returns 0 when the marker is seen, 1 on timeout.
wait_for_remote_journal_marker() {
  local label="$1"
  local port="$2"
  local needle="$3"
  local timeout_secs="$4"
  shift 4
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if remote_journal_has_marker "$port" "$needle" "$@" >/dev/null 2>&1; then
      log "${label}: observed ${needle} via remote journal"
      return 0
    fi
    sleep 2
  done

  return 1
}

# Poll every 2 seconds until the guest at port $2 reports a non-empty boot_id
# different from $3, for at most $4 seconds. Returns 0 on change, 1 on timeout.
wait_for_reboot_transition() {
  local label="$1"
  local port="$2"
  local previous_boot_id="$3"
  local timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    local current_boot_id
    if current_boot_id="$(remote_boot_id "$port" 2>/dev/null)"; then
      if [[ -n "$current_boot_id" && "$current_boot_id" != "$previous_boot_id" ]]; then
        log "${label}: reboot completed with boot_id=${current_boot_id}"
        return 0
      fi
    fi
    sleep 2
  done

  return 1
}

# Print `.observed_system.status` for node $1 as reported by
# `deployer-ctl node inspect`. Prints "missing" when the command fails or the
# field is absent. Always returns 0 unless jq itself fails.
observed_status() {
  local node_id="$1"
  local payload

  if ! payload="$(
    "$DEPLOYER_CTL_BIN" \
      --chainfire-endpoint "$CHAINFIRE_ENDPOINT" \
      --cluster-id "$CLUSTER_ID" \
      --cluster-namespace ultracloud \
      --deployer-namespace deployer \
      node inspect \
      --node-id "$node_id" \
      --include-observed-system \
      --format json 2>/dev/null
  )"; then
    printf 'missing\n'
    return 0
  fi

  jq -r '.observed_system.status // "missing"' <<<"$payload"
}

# Poll observed_status for node $1 every 5 seconds until it prints "active",
# for at most $2 seconds. Returns 0 on success, 1 on timeout.
wait_for_observed_active() {
  local node_id="$1"
  local timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if [[ "$(observed_status "$node_id")" == "active" ]]; then
      log "${node_id}: observed-system reached active"
      return 0
    fi
    sleep 5
  done

  return 1
}

# Die when `ss` lists a TCP listener on port $1.
assert_port_free() {
  local port="$1"
  local listeners

  # Capture first, then match. Piping ss straight into `grep -q` under
  # pipefail can fail the pipeline when grep exits early and ss receives
  # SIGPIPE, which would make a busy port look free. A failing ss is still
  # treated as "nothing listening".
  listeners="$(ss -ltn "( sport = :$port )" || true)"
  if grep -Fq ":$port" <<<"$listeners"; then
    die "port $port is already in use"
  fi
}

# Start host-side Chainfire and Deployer and wait for their health endpoints.
#
# NOTE(review): the first statement below is damaged. It looks like two
# here-documents (the contents of chainfire.toml and deployer.toml) and the
# command that launches "$CHAINFIRE_BIN" were stripped, leaving only their
# redirections fused into a single `cat`. As written no Chainfire process is
# started, so the first health check times out. Restore this section from an
# intact copy; the missing text is not reconstructed here.
start_host_services() {
  cat >"$TMP_DIR/chainfire.toml" <"$TMP_DIR/deployer.toml" <"$CHAINFIRE_LOG" 2>&1 &
  CHAINFIRE_PID="$!"
  wait_for_http "http://127.0.0.1:8081/health" 120 \
    || die "host Chainfire did not become healthy"

  log "Starting host-side Deployer"
  NO_COLOR=1 CLICOLOR=0 RUST_LOG_STYLE=never \
    "$DEPLOYER_SERVER_BIN" --config "$TMP_DIR/deployer.toml" >"$DEPLOYER_LOG" 2>&1 &
  DEPLOYER_PID="$!"
  wait_for_http "http://127.0.0.1:8088/health" 120 \
    || die "host Deployer did not become healthy"
}

# Populate $NIX_CACHE_DIR as a static binary cache: nix-cache-info, one
# uncompressed NAR and one .narinfo per store path in the closure of both
# target systems and both disko scripts.
seed_binary_cache() {
  local path
  local nar_rel
  local nar_path
  local store_base
  local store_hash
  local nar_hash
  local nar_size
  local refs
  local deriver

  mkdir -p "$NIX_CACHE_DIR/nar"
  cat >"$NIX_CACHE_DIR/nix-cache-info" <<'EOF'
StoreDir: /nix/store
WantMassQuery: 1
Priority: 30
EOF

  log "Seeding host-local Nix binary cache"
  if [[ -n "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION:-}" && -f "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration" ]]; then
    nix-store --load-db <"${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration"
  fi

  while IFS= read -r path; do
    [[ -n "$path" ]] || continue
    store_base="$(basename "$path")"
    store_hash="${store_base%%-*}"
    nar_rel="nar/${store_base}.nar"
    nar_path="$NIX_CACHE_DIR/$nar_rel"

    if [[ ! -f "$nar_path" ]]; then
      # Dump to a temporary name and rename afterwards. The nix-cache
      # directory is kept between runs, so an interrupted dump must not leave
      # a truncated NAR that the -f test above would accept next time.
      nix-store --dump "$path" >"${nar_path}.tmp"
      mv -f -- "${nar_path}.tmp" "$nar_path"
    fi

    nar_size="$(stat -c%s "$nar_path")"
    nar_hash="$(nix hash file --type sha256 --base32 "$nar_path")"
    refs="$(nix-store --query --references "$path" | xargs -r -n1 basename | tr '\n' ' ' | sed 's/ $//')"
    # A missing deriver is not an error; the Deriver line is simply omitted.
    deriver="$(nix-store --query --deriver "$path" 2>/dev/null || true)"
    deriver="$(basename "$deriver" 2>/dev/null || true)"

    # The NAR is stored uncompressed, so the File* and Nar* fields are equal.
    {
      echo "StorePath: $path"
      echo "URL: $nar_rel"
      echo "Compression: none"
      echo "FileHash: sha256:$nar_hash"
      echo "FileSize: $nar_size"
      echo "NarHash: sha256:$nar_hash"
      echo "NarSize: $nar_size"
      echo "References: $refs"
      if [[ -n "$deriver" && "$deriver" != "unknown-deriver" ]]; then
        echo "Deriver: $deriver"
      fi
    } >"$NIX_CACHE_DIR/${store_hash}.narinfo"
  done < <(
    nix-store --query --requisites \
      "$CONTROL_TARGET_SYSTEM" \
      "$WORKER_TARGET_SYSTEM" \
      "$CONTROL_DISKO_SCRIPT" \
      "$WORKER_DISKO_SCRIPT" \
      | sort -u
  )
}

# Seed the cache directory, serve it over HTTP on port 8090 and wait until
# nix-cache-info is reachable.
start_binary_cache() {
  seed_binary_cache

  log "Starting host-local Nix binary cache"
  python3 -m http.server 8090 --bind 0.0.0.0 --directory "$NIX_CACHE_DIR" \
    >"$NIX_CACHE_LOG" 2>&1 &
  NIX_CACHE_PID="$!"
  wait_for_http "${BINARY_CACHE_ENDPOINT}/nix-cache-info" 120 \
    || die "host-local Nix binary cache did not become reachable"
}

# NOTE(review): this function is damaged. The here-document that filled
# cluster-state.yaml, the rest of apply_cluster_state, and the header of
# launch_iso_vm appear to have been stripped, fusing two functions into one.
# Evidence in this file: main calls launch_iso_vm, which is defined nowhere,
# and the statements below read label, node_id, ssh_port, dhcp_start, mac,
# disk_path, log_path and ovmf_vars_path, none of which are declared here
# (under `set -u` the first use aborts). QEMU_IMG_BIN and the disk-size
# argument passed by main are also unused, so presumably a disk-creation step
# was lost too. The surviving statements are kept so nothing further is lost;
# restore this section from an intact copy.
apply_cluster_state() {
  cat >"$TMP_DIR/cluster-state.yaml" </dev/null

  rm -f "$ovmf_vars_path"
  cp "$OVMF_VARS_TEMPLATE" "$ovmf_vars_path"
  chmod u+w "$ovmf_vars_path"

  nohup "$QEMU_BIN" \
    -name "$label" \
    -machine accel=tcg \
    -cpu max \
    -smp 2 \
    -m 2048 \
    -nographic \
    -no-reboot \
    -boot order=dc,once=d,menu=off \
    -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \
    -drive if=pflash,format=raw,file="$ovmf_vars_path" \
    -drive file="$disk_path",if=virtio,format=qcow2 \
    -cdrom "$ISO_IMAGE" \
    -netdev "user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start}" \
    -device virtio-net-pci,netdev=user0,mac="${mac}" \
    -smbios type=1,product=UltraCloudQEMUBaremetal,serial="${node_id}" \
    >"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}

# Start QEMU from an already-installed disk, with no ISO attached.
# Arguments: label, ssh_port, dhcp_start, mac, disk_path, log_path.
# Reuses `<disk_path>.ovmf-vars.fd`, appends to log_path and writes the QEMU
# pid to `<log_path>.pid`.
launch_installed_vm() {
  local label="$1"
  local ssh_port="$2"
  local dhcp_start="$3"
  local mac="$4"
  local disk_path="$5"
  local log_path="$6"
  local ovmf_vars_path="${disk_path}.ovmf-vars.fd"

  [[ -f "$ovmf_vars_path" ]] || die "missing OVMF vars file for relaunch: $ovmf_vars_path"

  nohup "$QEMU_BIN" \
    -name "$label" \
    -machine accel=tcg \
    -cpu max \
    -smp 2 \
    -m 2048 \
    -nographic \
    -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \
    -drive if=pflash,format=raw,file="$ovmf_vars_path" \
    -drive file="$disk_path",if=virtio,format=qcow2 \
    -netdev "user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start}" \
    -device virtio-net-pci,netdev=user0,mac="${mac}" \
    >>"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}

# Poll every 2 seconds until the process recorded in pid file $2 is gone, for
# at most $3 seconds. Dies when the pid file is missing.
# Returns 0 once the process has exited, 1 on timeout.
wait_for_pid_exit() {
  local label="$1"
  local pid_file="$2"
  local timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))
  local pid

  [[ -f "$pid_file" ]] || die "${label} is missing pid file $pid_file"
  pid="$(cat "$pid_file")"

  while (( SECONDS < deadline )); do
    if ! kill -0 "$pid" >/dev/null 2>&1; then
      log "${label}: QEMU exited after installer-triggered reboot"
      return 0
    fi
    sleep 2
  done

  return 1
}

# Follow one node from installer boot to an active installed system.
# Arguments: node_id, ssh_port, disk_path, log_path, expected_role,
#            expected_system, dhcp_start, mac.
# Dies with a message at the first stage that fails or times out.
verify_node() {
  local node_id="$1"
  local ssh_port="$2"
  local disk_path="$3"
  local log_path="$4"
  local expected_role="$5"
  local expected_system="$6"
  local dhcp_start="$7"
  local mac="$8"

  # Stage 1: installer boot and phone-home registration.
  wait_for_log_marker "$node_id" "$DEPLOYER_LOG" "Node registered successfully.*node_id=${node_id}" 900 \
    || die "${node_id} never completed /api/v1/phone-home registration"
  wait_for_ssh "$node_id" "$ssh_port" 900 \
    || die "${node_id} never exposed SSH during the installer boot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.boot.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the pre-install boot marker"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.phone-home.complete.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the phone-home completion marker"
  marker "pre-install.${node_id}"

  # Stage 2: install progress as reported by ultracloud-install.service.
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.bundle-downloaded.${node_id}" 1200 \
    ultracloud-install.service \
    || die "${node_id} never downloaded the flake bundle"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.disko.complete.${node_id}" 2400 \
    ultracloud-install.service \
    || die "${node_id} never completed disko"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.nixos-install.complete.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never finished nixos-install"
  marker "install.${node_id}"

  # Stage 3: the installer announces its reboot, the installer QEMU process
  # ends, and the VM is started again from the installed disk.
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER reboot.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never emitted reboot marker"
  marker "reboot.${node_id}"

  wait_for_pid_exit "$node_id" "${log_path}.pid" 300 \
    || die "${node_id} installer VM did not exit after the reboot marker"
  launch_installed_vm \
    "ultracloud-baremetal-${node_id}-installed" \
    "$ssh_port" \
    "$dhcp_start" \
    "$mac" \
    "$disk_path" \
    "$log_path"

  # Stage 4: checks against the installed system.
  wait_for_ssh "$node_id" "$ssh_port" 1800 \
    || die "${node_id} did not come back over SSH after reboot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER post-install.boot.${node_id}.${expected_role}" 1800 \
    ultracloud-baremetal-postinstall-marker.service \
    || die "${node_id} never emitted post-install marker"
  marker "post-install.${node_id}"

  # Each check names what failed instead of relying on a bare `set -e` exit.
  ssh_shell "$ssh_port" 'test -f /etc/ultracloud/node-config.json' \
    || die "${node_id} is missing /etc/ultracloud/node-config.json"
  ssh_shell "$ssh_port" 'test -d /var/lib/photon-src/.bundle-inputs/nixpkgs' \
    || die "${node_id} is missing /var/lib/photon-src/.bundle-inputs/nixpkgs"
  ssh_shell "$ssh_port" 'systemctl is-active nix-agent.service >/dev/null' \
    || die "${node_id} nix-agent.service is not active"
  ssh_shell "$ssh_port" "grep -Fx '${expected_role}' /etc/ultracloud-role" \
    || die "${node_id} /etc/ultracloud-role does not contain ${expected_role}"
  if [[ "$expected_role" == "control-plane" ]]; then
    ssh_shell "$ssh_port" 'systemctl is-active chainfire.service >/dev/null' \
      || die "${node_id} chainfire.service is not active"
  fi

  wait_for_observed_active "$node_id" 1200 \
    || die "${node_id} never reached observed-system active"
  [[ "$(current_system_path "$ssh_port")" == "$expected_system" ]] \
    || die "${node_id} current system does not match expected target"
  marker "desired-system-active.${node_id}"
}

# EXIT trap: stop the VMs and host services, print log tails when the script
# failed, remove the state directory unless KEEP_STATE_DIR is 1, and exit
# with the original status.
cleanup() {
  local status="$?"
  local pid_file
  local pid
  set +e

  for pid_file in "$CONTROL_LOG.pid" "$WORKER_LOG.pid"; do
    if [[ -f "$pid_file" ]]; then
      pid="$(cat "$pid_file")"
      kill "$pid" 2>/dev/null || true
      wait "$pid" 2>/dev/null || true
    fi
  done

  if [[ -n "${DEPLOYER_PID:-}" ]]; then
    kill "$DEPLOYER_PID" 2>/dev/null || true
    wait "$DEPLOYER_PID" 2>/dev/null || true
  fi
  if [[ -n "${CHAINFIRE_PID:-}" ]]; then
    kill "$CHAINFIRE_PID" 2>/dev/null || true
    wait "$CHAINFIRE_PID" 2>/dev/null || true
  fi
  if [[ -n "${NIX_CACHE_PID:-}" ]]; then
    kill "$NIX_CACHE_PID" 2>/dev/null || true
    wait "$NIX_CACHE_PID" 2>/dev/null || true
  fi

  if (( status != 0 )); then
    log "control-plane serial log tail:"
    tail -n 120 "$CONTROL_LOG" 2>/dev/null || true
    log "worker serial log tail:"
    tail -n 120 "$WORKER_LOG" 2>/dev/null || true
    log "deployer log tail:"
    tail -n 120 "$DEPLOYER_LOG" 2>/dev/null || true
    log "chainfire log tail:"
    tail -n 120 "$CHAINFIRE_LOG" 2>/dev/null || true
    log "binary cache log tail:"
    tail -n 120 "$NIX_CACHE_LOG" 2>/dev/null || true
  fi

  if [[ "${KEEP_STATE_DIR:-0}" != "1" ]]; then
    rm -rf "$TMP_DIR"
  fi

  exit "$status"
}

# Entry point: resolve inputs, prepare the state directory, start the host
# services, then install and verify the control-plane node followed by the
# worker node.
main() {
  require_cmd curl
  require_cmd jq
  require_cmd nix
  require_cmd python3
  require_cmd qemu-img
  require_cmd qemu-system-x86_64
  require_cmd ssh
  require_cmd ssh-keygen
  require_cmd ss

  ISO_IMAGE="$(resolve_iso_image "$(resolve_store_path ULTRACLOUD_BAREMETAL_ISO_IMAGE 'nixosConfigurations.ultracloud-iso.config.system.build.isoImage')")"
  # NOTE(review): FLAKE_BUNDLE and SSH_PUBKEY (below) are not referenced in
  # the text that survived; presumably the missing here-documents used them.
  FLAKE_BUNDLE="$(resolve_store_path ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE 'packages.x86_64-linux.ultracloudFlakeBundle')"
  CONTROL_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_TARGET 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel')"
  WORKER_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_TARGET 'nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel')"
  CONTROL_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount')"
  WORKER_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount')"
  CHAINFIRE_BIN="$(resolve_binary ULTRACLOUD_CHAINFIRE_SERVER_BIN chainfire 'packages.x86_64-linux.chainfire-server')"
  DEPLOYER_SERVER_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_SERVER_BIN deployer-server 'packages.x86_64-linux.deployer-server')"
  DEPLOYER_CTL_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_CTL_BIN deployer-ctl 'packages.x86_64-linux.deployer-ctl')"
  OVMF_CODE_FD="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_CODE 'FV/OVMF_CODE.fd')"
  OVMF_VARS_TEMPLATE="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_VARS 'FV/OVMF_VARS.fd')"
  QEMU_BIN="${ULTRACLOUD_QEMU_BIN:-$(command -v qemu-system-x86_64)}"
  QEMU_IMG_BIN="${ULTRACLOUD_QEMU_IMG_BIN:-$(command -v qemu-img)}"

  if [[ -n "${ULTRACLOUD_BAREMETAL_STATE_DIR:-}" ]]; then
    # A caller-supplied state directory is kept on exit. Everything in it
    # except the seeded nix-cache is cleared before the run.
    TMP_DIR="$ULTRACLOUD_BAREMETAL_STATE_DIR"
    KEEP_STATE_DIR=1
    mkdir -p "$TMP_DIR"
    find "$TMP_DIR" -mindepth 1 -maxdepth 1 \
      ! -name nix-cache \
      -exec rm -rf {} +
  else
    TMP_DIR="$(mktemp -d -t ultracloud-baremetal-iso.XXXXXX)"
    KEEP_STATE_DIR=0
  fi

  NIX_CACHE_DIR="$TMP_DIR/nix-cache"
  CONTROL_LOG="$TMP_DIR/control-plane.serial.log"
  WORKER_LOG="$TMP_DIR/worker.serial.log"
  DEPLOYER_LOG="$TMP_DIR/deployer.log"
  CHAINFIRE_LOG="$TMP_DIR/chainfire.log"
  NIX_CACHE_LOG="$TMP_DIR/nix-cache.log"

  # Installed only after every path that cleanup reads has been assigned.
  trap cleanup EXIT

  SSH_KEY="$TMP_DIR/id_ed25519"
  ssh-keygen -q -t ed25519 -N "" -f "$SSH_KEY" >/dev/null
  SSH_PUBKEY="$(tr -d '\n' <"$SSH_KEY.pub")"

  assert_port_free 2379
  assert_port_free 8081
  assert_port_free 8088
  assert_port_free 8090
  assert_port_free "$CONTROL_SSH_PORT"
  assert_port_free "$WORKER_SSH_PORT"

  start_binary_cache
  start_host_services
  apply_cluster_state

  launch_iso_vm \
    "ultracloud-baremetal-control-plane" \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$CONTROL_DHCP_START" \
    "52:54:00:11:22:31" \
    "$CONTROL_DISK_GIB" \
    "$TMP_DIR/control-plane.qcow2" \
    "$CONTROL_LOG"
  verify_node \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$TMP_DIR/control-plane.qcow2" \
    "$CONTROL_LOG" \
    "control-plane" \
    "$CONTROL_TARGET_SYSTEM" \
    "$CONTROL_DHCP_START" \
    "52:54:00:11:22:31"

  launch_iso_vm \
    "ultracloud-baremetal-worker" \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$WORKER_DHCP_START" \
    "52:54:00:11:22:32" \
    "$WORKER_DISK_GIB" \
    "$TMP_DIR/worker.qcow2" \
    "$WORKER_LOG"
  verify_node \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$TMP_DIR/worker.qcow2" \
    "$WORKER_LOG" \
    "worker" \
    "$WORKER_TARGET_SYSTEM" \
    "$WORKER_DHCP_START" \
    "52:54:00:11:22:32"

  log "Canonical ISO bare-metal QEMU verification succeeded"
}

main "$@"