824 lines
23 KiB
Bash
824 lines
23 KiB
Bash
#!/usr/bin/env bash
# Static configuration for the bare-metal ISO end-to-end harness.

# Strict mode: stop on command failure, unset variables and failed pipeline stages.
set -euo pipefail

# Repository root. ULTRACLOUD_REPO_ROOT wins when set; otherwise it is the
# directory two levels above this script.
ROOT="${ULTRACLOUD_REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"

# Cluster identity and host-side endpoints.
CLUSTER_ID="baremetal-iso-canonical"
CHAINFIRE_ENDPOINT="http://127.0.0.1:2379"
DEPLOYER_ENDPOINT="http://127.0.0.1:8088"
BINARY_CACHE_ENDPOINT="http://127.0.0.1:8090"
BOOTSTRAP_TOKEN="baremetal-iso-bootstrap-token"

# Node identifiers.
CONTROL_NODE_ID="iso-control-plane-01"
WORKER_NODE_ID="iso-worker-01"

# Host ports forwarded to each guest's SSH port.
CONTROL_SSH_PORT="22231"
WORKER_SSH_PORT="22232"

# First DHCP address handed out on each guest's user-mode network.
CONTROL_DHCP_START="10.0.2.15"
WORKER_DHCP_START="10.0.2.16"

# Virtual disk sizes passed to qemu-img create.
CONTROL_DISK_GIB="18G"
WORKER_DISK_GIB="18G"
|
|
|
|
# Write an informational line to stdout, prefixed with the harness tag.
# Arguments: $* - message words (joined with single spaces).
log() {
  echo "[baremetal-iso-e2e] $*"
}
|
|
|
|
# Emit a progress marker line on stdout.
# Arguments: $* - marker name (joined with single spaces).
marker() {
  echo "ULTRACLOUD_MARKER $*"
}
|
|
|
|
# Report a fatal error on stderr and terminate the shell with status 1.
# Arguments: $* - error message.
die() {
  printf '[baremetal-iso-e2e] ERROR: %s\n' "$*" >&2
  exit 1
}
|
|
|
|
# Abort via die unless the named command can be resolved by the shell.
# Arguments: $1 - command name to look up.
require_cmd() {
  local tool="$1"
  if ! command -v "$tool" >/dev/null 2>&1; then
    die "required command not found: $tool"
  fi
}
|
|
|
|
# Print a store path for a flake attribute.
# Arguments:
#   $1 - name of an environment variable that may hold a ready-made path
#   $2 - flake attribute (relative to $ROOT) to build when the variable is
#        unset or empty
# Outputs: the override value, or whatever `nix build --print-out-paths` prints.
resolve_store_path() {
  local env_name="$1"
  local attr="$2"
  local override_value="${!env_name:-}"

  if [[ -z "$override_value" ]]; then
    nix build "$ROOT#$attr" --no-link --print-out-paths
  else
    printf '%s\n' "$override_value"
  fi
}
|
|
|
|
# Print the path of an executable, trying three sources in order:
#   1. the environment variable named by $1, when non-empty;
#   2. whatever `command -v` reports for $2;
#   3. <out>/bin/$2 from building flake attribute $3 under $ROOT.
# Arguments: $1 - override variable name, $2 - binary name, $3 - flake attribute.
resolve_binary() {
  local env_name="$1" bin_name="$2" attr="$3"
  local from_env="${!env_name:-}"
  local on_path
  local store_out

  if [[ -n "$from_env" ]]; then
    printf '%s\n' "$from_env"
    return 0
  fi

  if on_path="$(command -v "$bin_name" 2>/dev/null)"; then
    printf '%s\n' "$on_path"
    return 0
  fi

  store_out="$(nix build "$ROOT#$attr" --no-link --print-out-paths)"
  printf '%s/bin/%s\n' "$store_out" "$bin_name"
}
|
|
|
|
# Print the path of an ISO file derived from a candidate path.
# A regular file is returned as-is; otherwise the first *.iso regular file
# directly inside <candidate>/iso (as listed by find) is used.
# Arguments: $1 - candidate file or directory.
# Exits via die when no ISO file can be found.
resolve_iso_image() {
  local candidate="$1"
  local iso_dir="$candidate/iso"
  local iso_path=""

  if [[ -f "$candidate" ]]; then
    printf '%s\n' "$candidate"
    return 0
  fi

  if [[ -d "$iso_dir" ]]; then
    iso_path="$(find "$iso_dir" -maxdepth 1 -type f -name '*.iso' | head -n 1)"
  fi

  [[ -n "$iso_path" ]] || die "unable to resolve a bootable ISO file from $candidate"
  printf '%s\n' "$iso_path"
}
|
|
|
|
# Print the path of an OVMF firmware file.
# Arguments:
#   $1 - name of an environment variable that may hold the full path
#   $2 - path relative to the nixpkgs#OVMF.fd output, used when the variable
#        is unset or empty
resolve_ovmf_firmware() {
  local env_name="$1"
  local relative_path="$2"
  local override_value="${!env_name:-}"
  local ovmf_dir

  if [[ -n "$override_value" ]]; then
    printf '%s\n' "$override_value"
    return 0
  fi

  ovmf_dir="$(nix build nixpkgs#OVMF.fd --no-link --print-out-paths)"
  printf '%s/%s\n' "$ovmf_dir" "$relative_path"
}
|
|
|
|
# Poll a URL once per second until curl reports success or the timeout passes.
# Arguments:
#   $1 - URL to probe
#   $2 - overall timeout in seconds
# Returns: 0 as soon as a probe succeeds, 1 once the deadline is reached.
wait_for_http() {
  local url="$1"
  local timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    # Bound each probe so a listener that accepts the connection but never
    # answers cannot stall the loop far beyond timeout_secs.
    if curl -fsS --connect-timeout 2 --max-time 5 "$url" >/dev/null 2>&1; then
      return 0
    fi
    sleep 1
  done
  return 1
}
|
|
|
|
# Poll a local log file every two seconds for an extended regular expression.
# Arguments:
#   $1 - label used in the success log line
#   $2 - log file path (may not exist yet)
#   $3 - extended regex passed to grep -E
#   $4 - timeout in seconds
# Returns: 0 after logging the match, 1 when the deadline passes.
wait_for_log_marker() {
  local label="$1" log_file="$2" needle="$3" timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))

  until (( SECONDS >= deadline )); do
    if [[ ! -f "$log_file" ]] || ! grep -Eq "$needle" "$log_file"; then
      sleep 2
      continue
    fi
    log "${label}: observed ${needle}"
    return 0
  done
  return 1
}
|
|
|
|
# Run ssh as root@127.0.0.1 on a forwarded port with a fixed option set:
# no user config file, key-only batch mode, short connect timeout, and host
# key checking disabled.
# Globals:   SSH_KEY (read) - private key path
# Arguments: $1 - TCP port; remaining arguments are passed to ssh as the
#            remote command.
ssh_base() {
  local port="$1"
  shift

  local -a ssh_opts=(
    -F /dev/null
    -i "$SSH_KEY"
    -o BatchMode=yes
    -o ConnectTimeout=5
    -o ConnectionAttempts=1
    -o StrictHostKeyChecking=no
    -o UserKnownHostsFile=/dev/null
    -o LogLevel=ERROR
    -p "$port"
  )
  ssh "${ssh_opts[@]}" root@127.0.0.1 "$@"
}
|
|
|
|
# Poll every two seconds until `true` can be run over SSH on the given port.
# Arguments: $1 - label for the log line, $2 - port, $3 - timeout in seconds.
# Returns: 0 after logging success, 1 when the deadline passes.
wait_for_ssh() {
  local label="$1" port="$2" timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))

  until (( SECONDS >= deadline )); do
    if ! ssh_base "$port" true >/dev/null 2>&1; then
      sleep 2
      continue
    fi
    log "${label}: SSH is reachable on port ${port}"
    return 0
  done
  return 1
}
|
|
|
|
# Run a script on the guest through `bash -lc`.
# The script is shell-quoted with printf %q so it reaches the remote bash as
# a single argument.
# Arguments: $1 - SSH port, $2 - script text.
ssh_shell() {
  local port="$1"
  local script="$2"

  ssh_base "$port" "bash -lc $(printf '%q' "$script")"
}
|
|
|
|
# Print the resolved target of /run/current-system on the guest.
# Arguments: $1 - SSH port.
current_system_path() {
  local port="$1"
  local probe='readlink -f /run/current-system'

  ssh_shell "$port" "$probe"
}
|
|
|
|
# Print the guest kernel's boot_id as read from /proc.
# Arguments: $1 - SSH port.
remote_boot_id() {
  local port="$1"
  local probe='cat /proc/sys/kernel/random/boot_id'

  ssh_shell "$port" "$probe"
}
|
|
|
|
# Check the guest's current-boot journal for a fixed string.
# Builds `journalctl -b -o cat --no-pager [-u UNIT]... | grep -Fq NEEDLE`
# with each unit and the needle shell-quoted, then runs it via ssh_shell.
# Arguments: $1 - SSH port, $2 - needle; remaining arguments are unit names.
# Returns: the exit status of the remote command.
remote_journal_has_marker() {
  local port="$1" needle="$2"
  shift 2

  local remote_cmd="journalctl -b -o cat --no-pager"
  local unit quoted
  for unit; do
    printf -v quoted '%q' "$unit"
    remote_cmd+=" -u ${quoted}"
  done
  printf -v quoted '%q' "$needle"
  remote_cmd+=" | grep -Fq ${quoted}"

  ssh_shell "$port" "$remote_cmd"
}
|
|
|
|
# Poll the guest journal every two seconds until a fixed string shows up.
# Arguments:
#   $1 - label for the log line
#   $2 - SSH port
#   $3 - needle
#   $4 - timeout in seconds
#   remaining arguments - unit names forwarded to remote_journal_has_marker
# Returns: 0 after logging the match, 1 when the deadline passes.
wait_for_remote_journal_marker() {
  local label="$1" port="$2" needle="$3" timeout_secs="$4"
  shift 4

  local deadline=$((SECONDS + timeout_secs))
  until (( SECONDS >= deadline )); do
    if ! remote_journal_has_marker "$port" "$needle" "$@" >/dev/null 2>&1; then
      sleep 2
      continue
    fi
    log "${label}: observed ${needle} via remote journal"
    return 0
  done
  return 1
}
|
|
|
|
# Poll every two seconds until the guest reports a boot_id that is non-empty
# and different from the previous one.
# Arguments:
#   $1 - label for the log line
#   $2 - SSH port
#   $3 - boot_id observed before the reboot
#   $4 - timeout in seconds
# Returns: 0 after logging the new boot_id, 1 when the deadline passes.
wait_for_reboot_transition() {
  local label="$1" port="$2" previous_boot_id="$3" timeout_secs="$4"
  local deadline=$((SECONDS + timeout_secs))
  local current_boot_id

  until (( SECONDS >= deadline )); do
    # A failed probe is treated the same as "no boot_id yet".
    current_boot_id="$(remote_boot_id "$port" 2>/dev/null)" || current_boot_id=""
    if [[ -n "$current_boot_id" && "$current_boot_id" != "$previous_boot_id" ]]; then
      log "${label}: reboot completed with boot_id=${current_boot_id}"
      return 0
    fi
    sleep 2
  done
  return 1
}
|
|
|
|
# Print the observed-system status that deployer-ctl reports for a node.
# Prints "missing" when the inspect command fails or the JSON has no
# .observed_system.status value.
# Globals:   DEPLOYER_CTL_BIN, CHAINFIRE_ENDPOINT, CLUSTER_ID (read)
# Arguments: $1 - node id.
observed_status() {
  local node_id="$1"
  local payload
  local -a inspect_cmd=(
    "$DEPLOYER_CTL_BIN"
    --chainfire-endpoint "$CHAINFIRE_ENDPOINT"
    --cluster-id "$CLUSTER_ID"
    --cluster-namespace ultracloud
    --deployer-namespace deployer
    node inspect
    --node-id "$node_id"
    --include-observed-system
    --format json
  )

  if payload="$("${inspect_cmd[@]}" 2>/dev/null)"; then
    jq -r '.observed_system.status // "missing"' <<<"$payload"
  else
    printf 'missing\n'
  fi
}
|
|
|
|
# Poll every five seconds until observed_status prints "active" for a node.
# Arguments: $1 - node id, $2 - timeout in seconds.
# Returns: 0 after logging success, 1 when the deadline passes.
wait_for_observed_active() {
  local node_id="$1" timeout_secs="$2"
  local deadline=$((SECONDS + timeout_secs))
  local status_now

  until (( SECONDS >= deadline )); do
    status_now="$(observed_status "$node_id")"
    if [[ "$status_now" != "active" ]]; then
      sleep 5
      continue
    fi
    log "${node_id}: observed-system reached active"
    return 0
  done
  return 1
}
|
|
|
|
# Abort via die when a TCP listener already occupies the given local port.
# Arguments: $1 - port number.
#
# The ss output is captured before it is searched. Piping ss straight into
# `grep -q` under `set -o pipefail` lets an ss failure (or a SIGPIPE after an
# early grep exit) look like "port free" without any diagnostic. A failed
# query stays non-fatal, as before, but is now reported on stderr.
assert_port_free() {
  local port="$1"
  local listeners

  if ! listeners="$(ss -ltn "( sport = :$port )")"; then
    log "warning: unable to query listening sockets for port $port; skipping check" >&2
    return 0
  fi

  if grep -Fq ":$port" <<<"$listeners"; then
    die "port $port is already in use"
  fi
}
|
|
|
|
# Write the Chainfire and Deployer configuration files under $TMP_DIR, start
# both servers in the background, and wait for each /health endpoint.
# Globals read:    TMP_DIR, CLUSTER_ID, FLAKE_BUNDLE, BOOTSTRAP_TOKEN,
#                  CHAINFIRE_ENDPOINT, CHAINFIRE_BIN, DEPLOYER_SERVER_BIN,
#                  CHAINFIRE_LOG, DEPLOYER_LOG
# Globals written: CHAINFIRE_PID, DEPLOYER_PID
# Exits via die when a service does not answer its health URL within 120 s.
start_host_services() {
  local chainfire_cfg="$TMP_DIR/chainfire.toml"
  local deployer_cfg="$TMP_DIR/deployer.toml"

  cat >"$chainfire_cfg" <<EOF
[node]
id = 1
name = "baremetal-iso-chainfire"
role = "control_plane"

[storage]
data_dir = "$TMP_DIR/chainfire-data"

[network]
api_addr = "0.0.0.0:2379"
http_addr = "0.0.0.0:8081"
raft_addr = "0.0.0.0:2380"
gossip_addr = "0.0.0.0:2381"

[cluster]
id = 1
initial_members = []
bootstrap = true

[raft]
role = "voter"
EOF

  cat >"$deployer_cfg" <<EOF
bind_addr = "0.0.0.0:8088"
cluster_id = "${CLUSTER_ID}"
cluster_namespace = "ultracloud"
heartbeat_timeout_secs = 300
local_state_path = "$TMP_DIR/deployer-state"
bootstrap_flake_bundle_path = "$FLAKE_BUNDLE"
bootstrap_token = "${BOOTSTRAP_TOKEN}"
require_chainfire = true
allow_unknown_nodes = false
allow_unauthenticated = true
allow_test_mappings = false
tls_self_signed = false

[chainfire]
endpoints = ["${CHAINFIRE_ENDPOINT}"]
namespace = "deployer"
EOF

  # Colour output is disabled so the captured logs stay plain text.
  log "Starting host-side Chainfire"
  NO_COLOR=1 CLICOLOR=0 RUST_LOG_STYLE=never \
    "$CHAINFIRE_BIN" --config "$chainfire_cfg" >"$CHAINFIRE_LOG" 2>&1 &
  CHAINFIRE_PID="$!"

  if ! wait_for_http "http://127.0.0.1:8081/health" 120; then
    die "host Chainfire did not become healthy"
  fi

  log "Starting host-side Deployer"
  NO_COLOR=1 CLICOLOR=0 RUST_LOG_STYLE=never \
    "$DEPLOYER_SERVER_BIN" --config "$deployer_cfg" >"$DEPLOYER_LOG" 2>&1 &
  DEPLOYER_PID="$!"

  if ! wait_for_http "http://127.0.0.1:8088/health" 120; then
    die "host Deployer did not become healthy"
  fi
}
|
|
|
|
# Populate $NIX_CACHE_DIR as a static, uncompressed Nix binary cache holding
# the closure of both target systems and both disko scripts.
# Globals read: NIX_CACHE_DIR, CONTROL_TARGET_SYSTEM, WORKER_TARGET_SYSTEM,
#               CONTROL_DISKO_SCRIPT, WORKER_DISKO_SCRIPT,
#               ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION (optional; when it
#               names a directory containing `registration`, that file is
#               loaded into the store database first)
# Outputs: nix-cache-info, nar/<store-base>.nar and <hash>.narinfo files.
# Exits via die when the closure cannot be listed or a NAR dump fails.
seed_binary_cache() {
  local path
  local nar_rel
  local nar_path
  local nar_tmp
  local store_base
  local store_hash
  local nar_hash
  local nar_size
  local refs
  local deriver
  local closure

  mkdir -p "$NIX_CACHE_DIR/nar"
  cat >"$NIX_CACHE_DIR/nix-cache-info" <<'EOF'
StoreDir: /nix/store
WantMassQuery: 1
Priority: 30
EOF

  log "Seeding host-local Nix binary cache"
  if [[ -n "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION:-}" && -f "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration" ]]; then
    nix-store --load-db <"${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration"
  fi

  # List the closure up front: a failure inside a process substitution would
  # go unnoticed and leave the cache silently empty.
  closure="$(
    nix-store --query --requisites \
      "$CONTROL_TARGET_SYSTEM" \
      "$WORKER_TARGET_SYSTEM" \
      "$CONTROL_DISKO_SCRIPT" \
      "$WORKER_DISKO_SCRIPT" \
      | sort -u
  )" || die "failed to enumerate the store closure for the binary cache"

  while IFS= read -r path; do
    [[ -n "$path" ]] || continue

    store_base="$(basename "$path")"
    store_hash="${store_base%%-*}"
    nar_rel="nar/${store_base}.nar"
    nar_path="$NIX_CACHE_DIR/$nar_rel"

    if [[ ! -f "$nar_path" ]]; then
      # Dump to a temporary name and rename afterwards. An interrupted dump
      # must not leave a truncated file at the final path, because existing
      # NAR files are reused when the cache directory is kept between runs.
      nar_tmp="${nar_path}.tmp.$$"
      if ! nix-store --dump "$path" >"$nar_tmp"; then
        rm -f -- "$nar_tmp"
        die "failed to dump $path into the binary cache"
      fi
      mv -f -- "$nar_tmp" "$nar_path"
    fi

    nar_size="$(stat -c%s "$nar_path")"
    nar_hash="$(nix hash file --type sha256 --base32 "$nar_path")"
    refs="$(nix-store --query --references "$path" | xargs -r -n1 basename | tr '\n' ' ' | sed 's/ $//')"
    deriver="$(nix-store --query --deriver "$path" 2>/dev/null || true)"
    deriver="$(basename "$deriver" 2>/dev/null || true)"

    # The NAR is stored uncompressed, so the file hash/size and the NAR
    # hash/size are the same values.
    {
      echo "StorePath: $path"
      echo "URL: $nar_rel"
      echo "Compression: none"
      echo "FileHash: sha256:$nar_hash"
      echo "FileSize: $nar_size"
      echo "NarHash: sha256:$nar_hash"
      echo "NarSize: $nar_size"
      echo "References: $refs"
      if [[ -n "$deriver" && "$deriver" != "unknown-deriver" ]]; then
        echo "Deriver: $deriver"
      fi
    } >"$NIX_CACHE_DIR/${store_hash}.narinfo"
  done <<<"$closure"
}
|
|
|
|
# Seed the cache directory, serve it with Python's http.server on port 8090
# in the background, and wait until nix-cache-info can be fetched.
# Globals read:    NIX_CACHE_DIR, NIX_CACHE_LOG, BINARY_CACHE_ENDPOINT
# Globals written: NIX_CACHE_PID
# Exits via die when the cache is not reachable within 120 s.
start_binary_cache() {
  seed_binary_cache

  log "Starting host-local Nix binary cache"
  python3 -m http.server 8090 --bind 0.0.0.0 --directory "$NIX_CACHE_DIR" \
    >"$NIX_CACHE_LOG" 2>&1 &
  NIX_CACHE_PID="$!"

  if ! wait_for_http "${BINARY_CACHE_ENDPOINT}/nix-cache-info" 120; then
    die "host-local Nix binary cache did not become reachable"
  fi
}
|
|
|
|
# Write $TMP_DIR/cluster-state.yaml (cluster, node_classes, pools, nodes and
# enrollment_rules sections, filled in from the harness globals) and apply it
# with `deployer-ctl apply --prune`.
# Globals read: TMP_DIR, CLUSTER_ID, CONTROL_NODE_ID, WORKER_NODE_ID,
#               CONTROL_DHCP_START, WORKER_DHCP_START, CONTROL_DISKO_SCRIPT,
#               WORKER_DISKO_SCRIPT, CONTROL_TARGET_SYSTEM,
#               WORKER_TARGET_SYSTEM, SSH_PUBKEY, DEPLOYER_CTL_BIN,
#               CHAINFIRE_ENDPOINT
# NOTE(review): in this copy the heredoc body carries no leading indentation
# and is interleaved with `|`-only lines, so the YAML nesting cannot be read
# from here - confirm the structure against a pristine copy of the script.
apply_cluster_state() {
|
|
cat >"$TMP_DIR/cluster-state.yaml" <<EOF
|
|
cluster:
|
|
cluster_id: ${CLUSTER_ID}
|
|
environment: qemu
|
|
|
|
node_classes:
|
|
- name: iso-control-plane
|
|
description: Canonical ISO-installed QEMU control-plane target
|
|
roles:
|
|
- control-plane
|
|
labels:
|
|
tier: control-plane
|
|
canonical_install_path: iso
|
|
install_plan:
|
|
nixos_configuration: baremetal-qemu-control-plane
|
|
disko_config_path: nix/nodes/baremetal-qemu/control-plane/disko.nix
|
|
disko_script_path: ${CONTROL_DISKO_SCRIPT}
|
|
target_disk: /dev/vda
|
|
- name: iso-worker
|
|
description: Canonical ISO-installed QEMU worker target
|
|
roles:
|
|
- worker
|
|
labels:
|
|
tier: worker
|
|
canonical_install_path: iso
|
|
install_plan:
|
|
nixos_configuration: baremetal-qemu-worker
|
|
disko_config_path: nix/nodes/baremetal-qemu/worker/disko.nix
|
|
disko_script_path: ${WORKER_DISKO_SCRIPT}
|
|
target_disk: /dev/vda
|
|
|
|
pools:
|
|
- name: control
|
|
description: ISO bare-metal control-plane pool
|
|
node_class: iso-control-plane
|
|
labels:
|
|
pool.ultracloud.io/name: control
|
|
- name: workers
|
|
description: ISO bare-metal worker pool
|
|
node_class: iso-worker
|
|
labels:
|
|
pool.ultracloud.io/name: workers
|
|
|
|
nodes:
|
|
- node_id: ${CONTROL_NODE_ID}
|
|
hostname: ${CONTROL_NODE_ID}
|
|
ip: ${CONTROL_DHCP_START}
|
|
roles:
|
|
- control-plane
|
|
labels:
|
|
canonical_install_path: iso
|
|
pool: control
|
|
node_class: iso-control-plane
|
|
install_plan:
|
|
nixos_configuration: baremetal-qemu-control-plane
|
|
disko_config_path: nix/nodes/baremetal-qemu/control-plane/disko.nix
|
|
disko_script_path: ${CONTROL_DISKO_SCRIPT}
|
|
target_disk: /dev/vda
|
|
desired_system:
|
|
nixos_configuration: baremetal-qemu-control-plane
|
|
target_system: ${CONTROL_TARGET_SYSTEM}
|
|
health_check_command:
|
|
- test
|
|
- -f
|
|
- /etc/ultracloud-role-control-plane
|
|
rollback_on_failure: true
|
|
state: pending
|
|
- node_id: ${WORKER_NODE_ID}
|
|
hostname: ${WORKER_NODE_ID}
|
|
ip: ${WORKER_DHCP_START}
|
|
roles:
|
|
- worker
|
|
labels:
|
|
canonical_install_path: iso
|
|
pool: workers
|
|
node_class: iso-worker
|
|
install_plan:
|
|
nixos_configuration: baremetal-qemu-worker
|
|
disko_config_path: nix/nodes/baremetal-qemu/worker/disko.nix
|
|
disko_script_path: ${WORKER_DISKO_SCRIPT}
|
|
target_disk: /dev/vda
|
|
desired_system:
|
|
nixos_configuration: baremetal-qemu-worker
|
|
target_system: ${WORKER_TARGET_SYSTEM}
|
|
health_check_command:
|
|
- test
|
|
- -f
|
|
- /etc/ultracloud-role-worker
|
|
rollback_on_failure: true
|
|
state: pending
|
|
|
|
enrollment_rules:
|
|
- name: iso-control-plane
|
|
priority: 200
|
|
match_hostname_prefix: iso-control-plane
|
|
pool: control
|
|
node_class: iso-control-plane
|
|
labels:
|
|
canonical_install_path: iso
|
|
ssh_authorized_keys:
|
|
- ${SSH_PUBKEY}
|
|
- name: iso-worker
|
|
priority: 190
|
|
match_hostname_prefix: iso-worker
|
|
pool: workers
|
|
node_class: iso-worker
|
|
labels:
|
|
canonical_install_path: iso
|
|
ssh_authorized_keys:
|
|
- ${SSH_PUBKEY}
|
|
EOF
|
|
|
|
# Apply the generated file; --prune is passed to `deployer-ctl apply`.
"$DEPLOYER_CTL_BIN" \
|
|
--chainfire-endpoint "$CHAINFIRE_ENDPOINT" \
|
|
--cluster-id "$CLUSTER_ID" \
|
|
--cluster-namespace ultracloud \
|
|
--deployer-namespace deployer \
|
|
apply --config "$TMP_DIR/cluster-state.yaml" --prune
|
|
}
|
|
|
|
# Create a fresh qcow2 disk and a writable OVMF vars copy, then start a
# background QEMU guest that boots the installer ISO.
# Globals read: QEMU_IMG_BIN, QEMU_BIN, OVMF_VARS_TEMPLATE, OVMF_CODE_FD,
#               ISO_IMAGE
# Arguments:
#   $1 - QEMU -name label
#   $2 - node id, exposed to the guest as the SMBIOS type 1 serial
#   $3 - host port forwarded to guest port 22 on 127.0.0.1
#   $4 - dhcpstart address for the user-mode network
#   $5 - MAC address of the virtio NIC
#   $6 - disk size passed to `qemu-img create`
#   $7 - disk image path (OVMF vars live next to it as <path>.ovmf-vars.fd)
#   $8 - file receiving QEMU's stdout/stderr; the PID goes to <path>.pid
launch_iso_vm() {
  local label="$1"
  local node_id="$2"
  local ssh_port="$3"
  local dhcp_start="$4"
  local mac="$5"
  local disk_size="$6"
  local disk_path="$7"
  local log_path="$8"
  local ovmf_vars_path="${disk_path}.ovmf-vars.fd"

  "$QEMU_IMG_BIN" create -f qcow2 "$disk_path" "$disk_size" >/dev/null
  # Start from a pristine copy of the vars template and make it writable
  # (the template may be read-only).
  rm -f "$ovmf_vars_path"
  cp "$OVMF_VARS_TEMPLATE" "$ovmf_vars_path"
  chmod u+w "$ovmf_vars_path"

  # Each option value is a single quoted word so no expansion is subject to
  # word splitting or globbing.
  local -a qemu_args=(
    -name "$label"
    -machine accel=tcg
    -cpu max
    -smp 2
    -m 2048
    -nographic
    -no-reboot
    -boot order=dc,once=d,menu=off
    -drive "if=pflash,format=raw,readonly=on,file=${OVMF_CODE_FD}"
    -drive "if=pflash,format=raw,file=${ovmf_vars_path}"
    -drive "file=${disk_path},if=virtio,format=qcow2"
    -cdrom "$ISO_IMAGE"
    -netdev "user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start}"
    -device "virtio-net-pci,netdev=user0,mac=${mac}"
    -smbios "type=1,product=UltraCloudQEMUBaremetal,serial=${node_id}"
  )

  nohup "$QEMU_BIN" "${qemu_args[@]}" >"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}
|
|
|
|
# Start a background QEMU guest from an already-installed disk, reusing the
# OVMF vars file that sits next to the disk image. No ISO is attached.
# Globals read: QEMU_BIN, OVMF_CODE_FD
# Arguments:
#   $1 - QEMU -name label
#   $2 - host port forwarded to guest port 22 on 127.0.0.1
#   $3 - dhcpstart address for the user-mode network
#   $4 - MAC address of the virtio NIC
#   $5 - disk image path (expects <path>.ovmf-vars.fd to exist)
#   $6 - log file, appended to; the PID is written to <path>.pid
# Exits via die when the OVMF vars file is missing.
launch_installed_vm() {
  local label="$1"
  local ssh_port="$2"
  local dhcp_start="$3"
  local mac="$4"
  local disk_path="$5"
  local log_path="$6"
  local ovmf_vars_path="${disk_path}.ovmf-vars.fd"

  [[ -f "$ovmf_vars_path" ]] || die "missing OVMF vars file for relaunch: $ovmf_vars_path"

  # Each option value is a single quoted word so no expansion is subject to
  # word splitting or globbing.
  local -a qemu_args=(
    -name "$label"
    -machine accel=tcg
    -cpu max
    -smp 2
    -m 2048
    -nographic
    -drive "if=pflash,format=raw,readonly=on,file=${OVMF_CODE_FD}"
    -drive "if=pflash,format=raw,file=${ovmf_vars_path}"
    -drive "file=${disk_path},if=virtio,format=qcow2"
    -netdev "user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start}"
    -device "virtio-net-pci,netdev=user0,mac=${mac}"
  )

  # Append so the installer-boot output stays in the same log file.
  nohup "$QEMU_BIN" "${qemu_args[@]}" >>"$log_path" 2>&1 &
  echo "$!" >"${log_path}.pid"
}
|
|
|
|
# Poll every two seconds until the process recorded in a pid file no longer
# accepts signal 0.
# Arguments: $1 - label for messages, $2 - pid file path, $3 - timeout in
#            seconds.
# Returns: 0 after logging the exit, 1 when the deadline passes.
# Exits via die when the pid file does not exist.
wait_for_pid_exit() {
  local label="$1" pid_file="$2" timeout_secs="$3"
  local deadline=$((SECONDS + timeout_secs))
  local pid

  if [[ ! -f "$pid_file" ]]; then
    die "${label} is missing pid file $pid_file"
  fi
  pid="$(cat "$pid_file")"

  until (( SECONDS >= deadline )); do
    if kill -0 "$pid" >/dev/null 2>&1; then
      sleep 2
      continue
    fi
    log "${label}: QEMU exited after installer-triggered reboot"
    return 0
  done
  return 1
}
|
|
|
|
# Drive one node through the installer boot, the install, the reboot onto
# disk and the post-install checks, emitting a marker after each stage.
# Globals read: TMP_DIR
# Arguments:
#   $1 - node id
#   $2 - host SSH port forwarded to the guest
#   $3 - disk image path
#   $4 - serial log path (its .pid sibling tracks the QEMU process)
#   $5 - expected role string
#   $6 - expected store path of /run/current-system
#   $7 - dhcpstart address used when relaunching the guest
#   $8 - MAC address used when relaunching the guest
# Exits via die, naming the failed step, on any timeout or failed check.
verify_node() {
  local node_id="$1"
  local ssh_port="$2"
  local disk_path="$3"
  local log_path="$4"
  local expected_role="$5"
  local expected_system="$6"
  local dhcp_start="$7"
  local mac="$8"

  # Stage 1: installer boot and phone-home.
  wait_for_log_marker "$node_id" "$TMP_DIR/deployer.log" "Node registered successfully.*node_id=${node_id}" 900 \
    || die "${node_id} never completed /api/v1/phone-home registration"
  wait_for_ssh "$node_id" "$ssh_port" 900 \
    || die "${node_id} never exposed SSH during the installer boot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.boot.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the pre-install boot marker"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.phone-home.complete.${node_id}" 120 \
    ultracloud-bootstrap.service ultracloud-install.service \
    || die "${node_id} never recorded the phone-home completion marker"
  marker "pre-install.${node_id}"

  # Stage 2: bundle download, disko and nixos-install.
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.bundle-downloaded.${node_id}" 1200 \
    ultracloud-install.service \
    || die "${node_id} never downloaded the flake bundle"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.disko.complete.${node_id}" 2400 \
    ultracloud-install.service \
    || die "${node_id} never completed disko"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.nixos-install.complete.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never finished nixos-install"
  marker "install.${node_id}"

  # Stage 3: reboot marker, installer VM exit, relaunch from disk.
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER reboot.${node_id}" 3600 \
    ultracloud-install.service \
    || die "${node_id} never emitted reboot marker"
  marker "reboot.${node_id}"

  wait_for_pid_exit "$node_id" "${log_path}.pid" 300 \
    || die "${node_id} installer VM did not exit after the reboot marker"
  launch_installed_vm \
    "ultracloud-baremetal-${node_id}-installed" \
    "$ssh_port" \
    "$dhcp_start" \
    "$mac" \
    "$disk_path" \
    "$log_path"
  wait_for_ssh "$node_id" "$ssh_port" 1800 \
    || die "${node_id} did not come back over SSH after reboot"
  wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER post-install.boot.${node_id}.${expected_role}" 1800 \
    ultracloud-baremetal-postinstall-marker.service \
    || die "${node_id} never emitted post-install marker"
  marker "post-install.${node_id}"

  # Stage 4: post-install assertions. Each carries its own message so a
  # failure names the check instead of aborting silently under `set -e`.
  ssh_shell "$ssh_port" 'test -f /etc/ultracloud/node-config.json' \
    || die "${node_id} is missing /etc/ultracloud/node-config.json"
  ssh_shell "$ssh_port" 'test -d /var/lib/photon-src/.bundle-inputs/nixpkgs' \
    || die "${node_id} is missing /var/lib/photon-src/.bundle-inputs/nixpkgs"
  ssh_shell "$ssh_port" 'systemctl is-active nix-agent.service >/dev/null' \
    || die "${node_id} nix-agent.service is not active"
  ssh_shell "$ssh_port" "grep -Fx '${expected_role}' /etc/ultracloud-role" \
    || die "${node_id} /etc/ultracloud-role does not contain ${expected_role}"
  if [[ "$expected_role" == "control-plane" ]]; then
    ssh_shell "$ssh_port" 'systemctl is-active chainfire.service >/dev/null' \
      || die "${node_id} chainfire.service is not active"
  fi

  # Stage 5: desired system convergence.
  wait_for_observed_active "$node_id" 1200 \
    || die "${node_id} never reached observed-system active"
  [[ "$(current_system_path "$ssh_port")" == "$expected_system" ]] \
    || die "${node_id} current system does not match expected target"
  marker "desired-system-active.${node_id}"
}
|
|
|
|
# EXIT-trap handler: stop the guests and host services, print log tails when
# the script failed, remove the state directory unless it is to be kept, and
# exit with the status the script was exiting with.
# Globals read: CONTROL_LOG, WORKER_LOG, DEPLOYER_LOG, CHAINFIRE_LOG,
#               NIX_CACHE_LOG, DEPLOYER_PID, CHAINFIRE_PID, NIX_CACHE_PID,
#               KEEP_STATE_DIR, TMP_DIR
cleanup() {
  # Must stay the first command so $? is still the script's exit status.
  local status="$?"
  local pid_file pid service_pid
  # Cleanup is best-effort; a failing step must not cut it short.
  set +e

  for pid_file in "$CONTROL_LOG.pid" "$WORKER_LOG.pid"; do
    [[ -f "$pid_file" ]] || continue
    pid="$(cat "$pid_file")"
    # Skip an empty pid file rather than handing kill an empty argument.
    [[ -n "$pid" ]] || continue
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done

  for service_pid in "${DEPLOYER_PID:-}" "${CHAINFIRE_PID:-}" "${NIX_CACHE_PID:-}"; do
    [[ -n "$service_pid" ]] || continue
    kill "$service_pid" 2>/dev/null || true
    wait "$service_pid" 2>/dev/null || true
  done

  if (( status != 0 )); then
    log "control-plane serial log tail:"
    tail -n 120 "$CONTROL_LOG" 2>/dev/null || true
    log "worker serial log tail:"
    tail -n 120 "$WORKER_LOG" 2>/dev/null || true
    log "deployer log tail:"
    tail -n 120 "$DEPLOYER_LOG" 2>/dev/null || true
    log "chainfire log tail:"
    tail -n 120 "$CHAINFIRE_LOG" 2>/dev/null || true
    log "binary cache log tail:"
    tail -n 120 "$NIX_CACHE_LOG" 2>/dev/null || true
  fi

  # Never run rm -rf with an empty path.
  if [[ "${KEEP_STATE_DIR:-0}" != "1" && -n "${TMP_DIR:-}" ]]; then
    rm -rf -- "$TMP_DIR"
  fi
  exit "$status"
}
|
|
|
|
# Orchestrate the whole run: check tools, resolve artifacts, prepare the
# state directory, start host services, then install and verify the
# control-plane node followed by the worker node.
# Globals written: ISO_IMAGE, FLAKE_BUNDLE, CONTROL_TARGET_SYSTEM,
#   WORKER_TARGET_SYSTEM, CONTROL_DISKO_SCRIPT, WORKER_DISKO_SCRIPT,
#   CHAINFIRE_BIN, DEPLOYER_SERVER_BIN, DEPLOYER_CTL_BIN, OVMF_CODE_FD,
#   OVMF_VARS_TEMPLATE, QEMU_BIN, QEMU_IMG_BIN, TMP_DIR, KEEP_STATE_DIR,
#   NIX_CACHE_DIR, CONTROL_LOG, WORKER_LOG, DEPLOYER_LOG, CHAINFIRE_LOG,
#   NIX_CACHE_LOG, SSH_KEY, SSH_PUBKEY
main() {
  local tool
  for tool in curl jq nix python3 qemu-img qemu-system-x86_64 ssh ssh-keygen ss; do
    require_cmd "$tool"
  done

  # Every artifact can be overridden through its environment variable;
  # otherwise it is built from the flake.
  ISO_IMAGE="$(resolve_iso_image "$(resolve_store_path ULTRACLOUD_BAREMETAL_ISO_IMAGE 'nixosConfigurations.ultracloud-iso.config.system.build.isoImage')")"
  FLAKE_BUNDLE="$(resolve_store_path ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE 'packages.x86_64-linux.ultracloudFlakeBundle')"
  CONTROL_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_TARGET 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel')"
  WORKER_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_TARGET 'nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel')"
  CONTROL_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount')"
  WORKER_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount')"
  CHAINFIRE_BIN="$(resolve_binary ULTRACLOUD_CHAINFIRE_SERVER_BIN chainfire 'packages.x86_64-linux.chainfire-server')"
  DEPLOYER_SERVER_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_SERVER_BIN deployer-server 'packages.x86_64-linux.deployer-server')"
  DEPLOYER_CTL_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_CTL_BIN deployer-ctl 'packages.x86_64-linux.deployer-ctl')"
  OVMF_CODE_FD="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_CODE 'FV/OVMF_CODE.fd')"
  OVMF_VARS_TEMPLATE="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_VARS 'FV/OVMF_VARS.fd')"
  QEMU_BIN="${ULTRACLOUD_QEMU_BIN:-$(command -v qemu-system-x86_64)}"
  QEMU_IMG_BIN="${ULTRACLOUD_QEMU_IMG_BIN:-$(command -v qemu-img)}"

  # State directory: a throwaway temp dir by default, or a caller-provided
  # directory that is kept afterwards. In the latter case everything in it
  # except nix-cache is wiped first.
  if [[ -z "${ULTRACLOUD_BAREMETAL_STATE_DIR:-}" ]]; then
    TMP_DIR="$(mktemp -d -t ultracloud-baremetal-iso.XXXXXX)"
    KEEP_STATE_DIR=0
  else
    TMP_DIR="$ULTRACLOUD_BAREMETAL_STATE_DIR"
    KEEP_STATE_DIR=1
    mkdir -p "$TMP_DIR"
    find "$TMP_DIR" -mindepth 1 -maxdepth 1 \
      ! -name nix-cache \
      -exec rm -rf {} +
  fi
  NIX_CACHE_DIR="$TMP_DIR/nix-cache"
  CONTROL_LOG="$TMP_DIR/control-plane.serial.log"
  WORKER_LOG="$TMP_DIR/worker.serial.log"
  DEPLOYER_LOG="$TMP_DIR/deployer.log"
  CHAINFIRE_LOG="$TMP_DIR/chainfire.log"
  NIX_CACHE_LOG="$TMP_DIR/nix-cache.log"
  trap cleanup EXIT

  # Fresh passphrase-less key pair for this run.
  SSH_KEY="$TMP_DIR/id_ed25519"
  ssh-keygen -q -t ed25519 -N "" -f "$SSH_KEY" >/dev/null
  SSH_PUBKEY="$(tr -d '\n' <"$SSH_KEY.pub")"

  local port
  for port in 2379 8081 8088 8090 "$CONTROL_SSH_PORT" "$WORKER_SSH_PORT"; do
    assert_port_free "$port"
  done

  start_binary_cache
  start_host_services
  apply_cluster_state

  local control_mac="52:54:00:11:22:31"
  local worker_mac="52:54:00:11:22:32"
  local control_disk="$TMP_DIR/control-plane.qcow2"
  local worker_disk="$TMP_DIR/worker.qcow2"

  launch_iso_vm \
    "ultracloud-baremetal-control-plane" \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$CONTROL_DHCP_START" \
    "$control_mac" \
    "$CONTROL_DISK_GIB" \
    "$control_disk" \
    "$CONTROL_LOG"

  verify_node \
    "$CONTROL_NODE_ID" \
    "$CONTROL_SSH_PORT" \
    "$control_disk" \
    "$CONTROL_LOG" \
    "control-plane" \
    "$CONTROL_TARGET_SYSTEM" \
    "$CONTROL_DHCP_START" \
    "$control_mac"

  launch_iso_vm \
    "ultracloud-baremetal-worker" \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$WORKER_DHCP_START" \
    "$worker_mac" \
    "$WORKER_DISK_GIB" \
    "$worker_disk" \
    "$WORKER_LOG"

  verify_node \
    "$WORKER_NODE_ID" \
    "$WORKER_SSH_PORT" \
    "$worker_disk" \
    "$WORKER_LOG" \
    "worker" \
    "$WORKER_TARGET_SYSTEM" \
    "$WORKER_DHCP_START" \
    "$worker_mac"

  log "Canonical ISO bare-metal QEMU verification succeeded"
}
|
|
|
|
# Script entry point: run main with the arguments given to this script.
main "$@"
|