photoncloud-monorepo/nix/test-cluster/run-cluster.sh

6339 lines
230 KiB
Bash
Executable file

#!/usr/bin/env bash
# PhotonCloud VM test-cluster harness
#
# Commands:
# build Build one or more VM derivations
# start Build if needed, start VMs, and wait for SSH
# wait Wait for SSH on running VMs
# validate Run multi-node smoke validation, including nested KVM on workers
# smoke start + validate
# fresh-smoke clean + host-build + start + validate
# fresh-matrix clean + host-build + start + composed-configuration validation
# fresh-bench-storage clean + host-build + start + storage benchmark
# stop Stop running VMs
# status Show VM process status
# ssh Open an interactive SSH session to a node
# logs Show the VM log for a node
# clean Stop VMs and remove local runtime state
#
# Examples:
# ./run-cluster.sh smoke
# ./run-cluster.sh start node01 node02 node03
# ./run-cluster.sh validate
set -euo pipefail
# --- Host paths. All PHOTON_* environment variables are optional overrides.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CLUSTER_DIR="${SCRIPT_DIR}"
CLUSTER_FLAKE_REF="${PHOTON_CLUSTER_FLAKE:-${CLUSTER_DIR}}"
VM_DIR_BASE="${PHOTON_VM_DIR:-${HOME}/.photoncloud-test-cluster}"
VDE_SWITCH_DIR_BASE="${PHOTON_CLUSTER_VDE_SWITCH_DIR:-/tmp/photoncloud-test-cluster-vde.sock}"
CORONAFS_API_PORT="${PHOTON_CORONAFS_API_PORT:-50088}"
CORONAFS_VOLUME_ROOT="/var/lib/coronafs/volumes"
SSH_PASSWORD="${PHOTON_VM_ROOT_PASSWORD:-test}"
# --- Timeouts (seconds) used by the various wait loops below.
SSH_CONNECT_TIMEOUT="${PHOTON_VM_SSH_CONNECT_TIMEOUT:-5}"
SSH_WAIT_TIMEOUT="${PHOTON_VM_SSH_WAIT_TIMEOUT:-300}"
UNIT_WAIT_TIMEOUT="${PHOTON_VM_UNIT_WAIT_TIMEOUT:-240}"
HTTP_WAIT_TIMEOUT="${PHOTON_VM_HTTP_WAIT_TIMEOUT:-180}"
KVM_WAIT_TIMEOUT="${PHOTON_VM_KVM_WAIT_TIMEOUT:-180}"
FLAREDB_WAIT_TIMEOUT="${PHOTON_VM_FLAREDB_WAIT_TIMEOUT:-180}"
GRPCURL_MAX_MSG_SIZE="${PHOTON_VM_GRPCURL_MAX_MSG_SIZE:-1073741824}"
GRPCURL_TIMEOUT_SECS="${PHOTON_VM_GRPCURL_TIMEOUT_SECS:-30}"
TUNNEL_WAIT_TIMEOUT="${PHOTON_VM_TUNNEL_WAIT_TIMEOUT:-30}"
# --- Benchmark / build knobs.
STORAGE_BENCHMARK_COMMAND="${PHOTON_VM_STORAGE_BENCH_COMMAND:-bench-storage}"
LIGHTNINGSTOR_BENCH_CLIENT_NODE="${PHOTON_VM_LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node06}"
STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_IMAGE_BENCH:-0}"
STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH:-0}"
CLUSTER_NIX_MAX_JOBS="${PHOTON_CLUSTER_NIX_MAX_JOBS:-2}"
CLUSTER_NIX_BUILD_CORES="${PHOTON_CLUSTER_NIX_BUILD_CORES:-4}"
BUILD_PROFILE="${PHOTON_CLUSTER_BUILD_PROFILE:-default}"
CLUSTER_SKIP_BUILD="${PHOTON_CLUSTER_SKIP_BUILD:-0}"
# Set to 1 while this process holds the mkdir-based cluster lock.
CLUSTER_LOCK_HELD=0
# --- Cluster topology.
NODES=(node01 node02 node03 node04 node05 node06)
STORAGE_NODES=(node01 node02 node03 node04 node05)
# --- Proto file locations used by grpcurl-based validation.
IAM_PROTO_DIR="${REPO_ROOT}/iam/proto"
IAM_PROTO="${IAM_PROTO_DIR}/iam.proto"
PRISMNET_PROTO_DIR="${REPO_ROOT}/prismnet/crates/prismnet-api/proto"
PRISMNET_PROTO="${PRISMNET_PROTO_DIR}/prismnet.proto"
FLASHDNS_PROTO_DIR="${REPO_ROOT}/flashdns/crates/flashdns-api/proto"
FLASHDNS_PROTO="${FLASHDNS_PROTO_DIR}/flashdns.proto"
FIBERLB_PROTO_DIR="${REPO_ROOT}/fiberlb/crates/fiberlb-api/proto"
FIBERLB_PROTO="${FIBERLB_PROTO_DIR}/fiberlb.proto"
K8SHOST_PROTO_DIR="${REPO_ROOT}/k8shost/crates/k8shost-proto/proto"
K8SHOST_PROTO="${K8SHOST_PROTO_DIR}/k8s.proto"
CREDITSERVICE_PROTO_DIR="${REPO_ROOT}/creditservice/proto"
CREDITSERVICE_PROTO="${CREDITSERVICE_PROTO_DIR}/creditservice.proto"
LIGHTNINGSTOR_PROTO_DIR="${REPO_ROOT}/lightningstor/crates/lightningstor-api/proto"
LIGHTNINGSTOR_PROTO="${LIGHTNINGSTOR_PROTO_DIR}/lightningstor.proto"
PLASMAVMC_PROTO_DIR="${REPO_ROOT}/plasmavmc/proto"
PLASMAVMC_PROTO="${PLASMAVMC_PROTO_DIR}/plasmavmc.proto"
FLAREDB_PROTO_DIR="${REPO_ROOT}/flaredb/crates/flaredb-proto/src"
FLAREDB_PROTO="${FLAREDB_PROTO_DIR}/kvrpc.proto"
# Boot ordering: nodes inside one phase start together; phases run in sequence.
# shellcheck disable=SC2034
NODE_PHASES=(
"node01 node02 node03"
"node04 node05"
"node06"
)
# Host port -> guest :22 SSH forwards (default profile).
declare -A SSH_PORTS=(
[node01]=2201
[node02]=2202
[node03]=2203
[node04]=2204
[node05]=2205
[node06]=2206
)
# Alternate SSH forwards used when BUILD_PROFILE=storage.
declare -A STORAGE_SSH_PORTS=(
[node01]=2301
[node02]=2302
[node03]=2303
[node04]=2304
[node05]=2305
)
# Addresses of each node on the VDE-backed cluster network.
declare -A NODE_IPS=(
[node01]=10.100.0.11
[node02]=10.100.0.12
[node03]=10.100.0.13
[node04]=10.100.0.21
[node05]=10.100.0.22
[node06]=10.100.0.100
)
# systemd units expected on each node during validation.
declare -A NODE_UNITS=(
[node01]="chainfire flaredb iam prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost"
[node02]="chainfire flaredb iam"
[node03]="chainfire flaredb iam"
[node04]="plasmavmc lightningstor node-agent"
[node05]="plasmavmc lightningstor node-agent"
[node06]="apigateway nightlight creditservice deployer fleet-scheduler"
)
# Flake targets for the storage-profile variant of each storage node.
declare -A STORAGE_BUILD_TARGETS=(
[node01]=storage-node01
[node02]=storage-node02
[node03]=storage-node03
[node04]=storage-node04
[node05]=storage-node05
)
# Non-interactive password-auth SSH options for throwaway test VMs.
SSH_OPTS=(
-o StrictHostKeyChecking=no
-o UserKnownHostsFile=/dev/null
-o LogLevel=ERROR
-o ConnectTimeout="${SSH_CONNECT_TIMEOUT}"
-o PreferredAuthentications=password
-o PubkeyAuthentication=no
-o KbdInteractiveAuthentication=no
)
log() {
  # Timestamped progress line on stderr so stdout stays machine-readable.
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "${stamp}" "$*" >&2
}
die() {
  # Fatal: log an ERROR line, then terminate the script with status 1.
  log "ERROR: $*"
  exit 1
}
warn() {
  # Non-fatal warning, prefixed so it stands out in the log stream.
  log "WARN: $*"
}
run_deployer_ctl() {
# Invoke the repo's deployer-ctl binary via `nix run`, passing all
# arguments through. RUST_LOG defaults to "warn" unless the caller set it.
RUST_LOG="${RUST_LOG:-warn}" \
nix --option warn-dirty false run --quiet \
--extra-experimental-features 'nix-command flakes' \
"${REPO_ROOT}#deployer-ctl" -- "$@"
}
release_cluster_lock() {
# Release the mkdir(1)-based cluster lock if this process holds it.
# Safe to call unconditionally (installed as an EXIT trap): the
# CLUSTER_LOCK_HELD flag plus the recorded owner pid prevent deleting a
# lock held by a different run.
local lock_dir
local owner=""
lock_dir="$(cluster_lock_dir)"
if [[ "${CLUSTER_LOCK_HELD}" -ne 1 ]]; then
return 0
fi
if [[ -d "${lock_dir}" ]]; then
if [[ -f "${lock_dir}/pid" ]]; then
owner="$(<"${lock_dir}/pid")"
fi
# Only remove when we can prove ownership (or no owner was recorded).
if [[ -z "${owner}" || "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ]]; then
rm -rf "${lock_dir}"
fi
fi
CLUSTER_LOCK_HELD=0
unset PHOTON_CLUSTER_LOCK_OWNER
}
acquire_cluster_lock() {
# Take the cluster-wide lock using atomic mkdir. Re-entrant: returns
# immediately when already held, adopts the lock when the recorded pid
# matches this process or PHOTON_CLUSTER_LOCK_OWNER, reclaims locks left
# by dead pids, and aborts otherwise.
local lock_dir
local owner=""
lock_dir="$(cluster_lock_dir)"
if [[ "${CLUSTER_LOCK_HELD}" -eq 1 ]]; then
return 0
fi
mkdir -p "$(dirname "${lock_dir}")"
# mkdir is atomic: success means we won the lock.
if mkdir "${lock_dir}" 2>/dev/null; then
printf '%s\n' "$$" >"${lock_dir}/pid"
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="$$"
trap release_cluster_lock EXIT
return 0
fi
if [[ -f "${lock_dir}/pid" ]]; then
owner="$(<"${lock_dir}/pid")"
fi
# Lock already exists: adopt it when it is ours (e.g. a nested re-run
# inheriting PHOTON_CLUSTER_LOCK_OWNER from the parent).
if [[ -n "${owner}" && ( "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ) ]]; then
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="${owner}"
trap release_cluster_lock EXIT
return 0
fi
# Reclaim when the recorded owner process no longer exists.
if [[ -n "${owner}" ]] && ! kill -0 "${owner}" >/dev/null 2>&1; then
warn "reclaiming stale PhotonCloud test-cluster lock from pid ${owner}"
rm -f "${lock_dir}/pid"
rmdir "${lock_dir}" 2>/dev/null || true
if mkdir "${lock_dir}" 2>/dev/null; then
printf '%s\n' "$$" >"${lock_dir}/pid"
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="$$"
trap release_cluster_lock EXIT
return 0
fi
fi
die "another PhotonCloud test-cluster run is active${owner:+ (pid ${owner})}; lock: ${lock_dir}"
}
lightningstor_data_root() {
  # Map a node name to its in-VM LightningStor data directory;
  # any other node has no mapping and is a fatal error.
  local node="$1"
  case "${node}" in
    node01)
      printf '%s\n' /var/lib/lightningstor/node
      ;;
    node04 | node05)
      printf '%s\n' /var/lib/lightningstor
      ;;
    *)
      die "no LightningStor data root mapping for ${node}"
      ;;
  esac
}
profile_slug() {
  # Sanitize BUILD_PROFILE for use as a filesystem suffix: every
  # character outside [A-Za-z0-9._-] becomes '-', then leading and
  # trailing dashes are trimmed. Falls back to "default" when nothing
  # usable remains.
  local slug
  slug="$(printf '%s' "${BUILD_PROFILE}" | tr -c 'A-Za-z0-9._-' '-')"
  # Bug fix: ${slug##-} / ${slug%%-} only ever strip a SINGLE dash, so a
  # profile like "!!foo" produced "-foo". Loop until all edge dashes are
  # gone.
  while [[ "${slug}" == -* ]]; do slug="${slug#-}"; done
  while [[ "${slug}" == *- ]]; do slug="${slug%-}"; done
  if [[ -z "${slug}" ]]; then
    slug="default"
  fi
  printf '%s\n' "${slug}"
}
profile_state_suffix() {
  # Empty string for the default profile, "-<slug>" for everything else;
  # appended to state-directory paths so profiles do not collide.
  local slug
  slug="$(profile_slug)"
  case "${slug}" in
    default) printf '\n' ;;
    *) printf -- '-%s\n' "${slug}" ;;
  esac
}
vm_dir() {
  # Per-profile runtime/state directory on the host.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s\n' "${VM_DIR_BASE}" "${suffix}"
}
cluster_lock_dir() {
  # Directory used as a mkdir-based mutex for this profile.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s.lock\n' "${VM_DIR_BASE}" "${suffix}"
}
vde_switch_dir() {
  # Per-profile VDE switch control-socket directory.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s\n' "${VDE_SWITCH_DIR_BASE}" "${suffix}"
}
vde_switch_pid_file() {
  # PID file of the daemonized VDE switch, kept in the state directory.
  printf '%s/vde-switch.pid\n' "$(vm_dir)"
}
all_build_profiles() {
  # Emit one profile per line: always "default" and "storage", plus the
  # active BUILD_PROFILE, skipping empties and de-duplicating in order.
  local -a emitted=()
  local candidate known
  for candidate in default storage "${BUILD_PROFILE}"; do
    [[ -n "${candidate}" ]] || continue
    for known in "${emitted[@]}"; do
      if [[ "${known}" == "${candidate}" ]]; then
        continue 2
      fi
    done
    emitted+=("${candidate}")
    printf '%s\n' "${candidate}"
  done
}
with_build_profile() {
  # Run "$@" with BUILD_PROFILE temporarily set to $1, always restoring
  # the previous profile and propagating the command's exit status.
  local next_profile="$1"
  local prev_profile="${BUILD_PROFILE}"
  shift
  local rc=0
  BUILD_PROFILE="${next_profile}"
  # Bug fix: the script runs under `set -e`, so a bare `"$@"` that fails
  # would exit before BUILD_PROFILE is restored. `|| rc=$?` suppresses
  # errexit for this command while still capturing its status.
  "$@" || rc=$?
  BUILD_PROFILE="${prev_profile}"
  return "${rc}"
}
lightningstor_data_file_count() {
# Count finished (non-*.tmp) data files under the node's LightningStor
# root, retrying over SSH until success or SSH_WAIT_TIMEOUT elapses.
# Prints the count on stdout.
local node="$1"
local root
root="$(lightningstor_data_root "${node}")"
local deadline=$((SECONDS + SSH_WAIT_TIMEOUT))
local output=""
while true; do
if output="$(ssh_node "${node}" "find ${root} -type f ! -name '*.tmp' | wc -l" 2>/dev/null)"; then
printf '%s\n' "${output}"
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out collecting LightningStor file count from ${node}"
fi
sleep 2
done
}
lightningstor_count_triplet() {
  # Emit a single line: "<node01-count> <node04-count> <node05-count>".
  local c01 c04 c05
  c01="$(lightningstor_data_file_count node01)"
  c04="$(lightningstor_data_file_count node04)"
  c05="$(lightningstor_data_file_count node05)"
  printf '%s %s %s\n' "${c01}" "${c04}" "${c05}"
}
wait_for_lightningstor_counts_greater_than() {
# Poll the three replica nodes until every node's data-file count
# strictly exceeds its "before" baseline, or HTTP_WAIT_TIMEOUT elapses.
# $4 is a human-readable context string for the timeout message.
local before_node01="$1"
local before_node04="$2"
local before_node05="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
local count_node01 count_node04 count_node05
read -r count_node01 count_node04 count_node05 < <(lightningstor_count_triplet)
if (( count_node01 > before_node01 )) && (( count_node04 > before_node04 )) && (( count_node05 > before_node05 )); then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for distributed LightningStor replicas for ${context}"
fi
sleep 2
done
}
wait_for_lightningstor_counts_equal() {
# Poll the three replica nodes until their data-file counts exactly
# match the expected values (e.g. after a delete settles), or
# HTTP_WAIT_TIMEOUT elapses. $4 is a context string for the message.
local expected_node01="$1"
local expected_node04="$2"
local expected_node05="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
local count_node01 count_node04 count_node05
read -r count_node01 count_node04 count_node05 < <(lightningstor_count_triplet)
if (( count_node01 == expected_node01 )) && (( count_node04 == expected_node04 )) && (( count_node05 == expected_node05 )); then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for distributed LightningStor counts to settle for ${context}: expected ${expected_node01}/${expected_node04}/${expected_node05}, got ${count_node01}/${count_node04}/${count_node05}"
fi
sleep 2
done
}
require_cmd() {
  # Abort unless $1 resolves to an executable command on PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}
grpcurl_capture() {
  # Run grpcurl under the configured timeout, capturing stdout+stderr.
  # Echoes the captured output (no trailing newline added) and returns
  # grpcurl's exit status.
  local rc=0
  local captured=""
  captured="$(timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl "$@" 2>&1)" || rc=$?
  printf '%s' "${captured}"
  return "${rc}"
}
build_link() {
  # Symlink path for node $1's built VM derivation (no trailing newline).
  printf '%s/build-%s' "$(vm_dir)" "$1"
}
guest_image_link() {
  # Symlink path for the bootable guest image build output.
  printf '%s/build-vm-guest-image' "$(vm_dir)"
}
guest_bench_image_link() {
  # Symlink path for the benchmark guest image build output.
  printf '%s/build-vm-bench-guest-image' "$(vm_dir)"
}
runtime_dir() {
  # Per-node runtime directory under the profile state dir.
  printf '%s/%s' "$(vm_dir)" "$1"
}
pid_file() {
  # PID file tracking node $1's QEMU process.
  printf '%s/vm.pid' "$(runtime_dir "$1")"
}
log_file() {
  # Console/serial log of node $1's VM.
  printf '%s/vm.log' "$(runtime_dir "$1")"
}
runvm_path() {
  # Locate the generated run-<name>-vm launcher inside node $1's build
  # output; prints the first match (empty when none exists).
  local node="$1"
  local bin_dir
  bin_dir="$(build_link "${node}")/bin"
  find -L "${bin_dir}" -maxdepth 1 -name 'run-*-vm' | head -n1
}
guest_image_path() {
  # Ensure the guest image is built, then print its qcow2 file path.
  local root
  root="$(guest_image_link)"
  build_guest_image
  find -L "${root}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}
guest_bench_image_path() {
  # Ensure the benchmark guest image is built, then print its qcow2 path.
  local root
  root="$(guest_bench_image_link)"
  build_guest_bench_image
  find -L "${root}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}
all_or_requested_nodes() {
  # With no arguments, list every cluster node; otherwise echo the
  # requested names, one per line.
  if (( $# > 0 )); then
    printf '%s\n' "$@"
  else
    printf '%s\n' "${NODES[@]}"
  fi
}
validate_nodes_exist() {
  # Abort on any argument that has no configured SSH forward port.
  local candidate
  for candidate in "$@"; do
    if [[ -z "${SSH_PORTS[${candidate}]:-}" ]]; then
      die "unknown node: ${candidate}"
    fi
  done
}
ssh_port_for_node() {
  # Storage-profile runs use the alternate forward port when one is
  # configured for the node; everything else uses the default table.
  local node="$1"
  local storage_port="${STORAGE_SSH_PORTS[${node}]:-}"
  if [[ "${BUILD_PROFILE}" == "storage" && -n "${storage_port}" ]]; then
    printf '%s\n' "${storage_port}"
  else
    printf '%s\n' "${SSH_PORTS[${node}]}"
  fi
}
host_nested_param_path() {
  # Print the sysfs nested-virtualization parameter path for whichever
  # KVM module (Intel first, then AMD) is loaded; prints nothing and
  # still returns 0 when neither module exposes the parameter.
  local candidate
  for candidate in \
    /sys/module/kvm_intel/parameters/nested \
    /sys/module/kvm_amd/parameters/nested; do
    if [[ -f "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  return 0
}
preflight() {
# Common pre-run checks: take the cluster lock, verify host tooling is
# installed, create the state directory, and sanity-check KVM plus
# nested-virtualization support (warnings only where non-fatal).
acquire_cluster_lock
require_cmd nix
require_cmd qemu-system-x86_64
require_cmd ssh
require_cmd sshpass
require_cmd curl
require_cmd grpcurl
require_cmd vde_switch
mkdir -p "$(vm_dir)"
log "Cluster build profile: ${BUILD_PROFILE} (state dir $(vm_dir))"
[[ -e /dev/kvm ]] || die "/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization"
[[ -r /dev/kvm && -w /dev/kvm ]] || warn "/dev/kvm exists but current user may not have full access"
local nested_path
nested_path="$(host_nested_param_path || true)"
if [[ -n "${nested_path}" ]]; then
log "Host nested virtualization parameter: ${nested_path}=$(<"${nested_path}")"
else
warn "Could not locate host nested virtualization parameter; guest nested-KVM validation may fail"
fi
}
vde_switch_ctl_path() {
  # Control socket created by vde_switch inside its socket directory.
  local sock_dir
  sock_dir="$(vde_switch_dir)"
  printf '%s/ctl\n' "${sock_dir}"
}
vde_switch_running() {
# True when the VDE switch control socket exists.
# NOTE(review): the pid-file branch verifies the daemon is alive but
# then performs the same socket test as the fallthrough, so the pid
# check never changes the result — confirm whether a live-pid-but-no-
# socket (or stale-pid) case was meant to behave differently.
if [[ -f "$(vde_switch_pid_file)" ]] && kill -0 "$(<"$(vde_switch_pid_file)")" 2>/dev/null; then
[[ -S "$(vde_switch_ctl_path)" ]]
return
fi
[[ -S "$(vde_switch_ctl_path)" ]]
}
ensure_vde_switch() {
# Start the shared VDE switch daemon if its control socket is not
# already being served, then wait up to 10s for the socket to appear.
local deadline
local vde_dir
vde_dir="$(vde_switch_dir)"
if vde_switch_running; then
return 0
fi
# Clear any stale socket directory / pid file left by a previous run.
rm -rf "${vde_dir}"
rm -f "$(vde_switch_pid_file)"
log "Starting VDE switch at ${vde_dir}"
vde_switch \
-sock "${vde_dir}" \
-daemon \
-pidfile "$(vde_switch_pid_file)"
deadline=$((SECONDS + 10))
while true; do
if vde_switch_running; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for VDE switch at ${vde_dir}"
fi
sleep 1
done
}
stop_vde_switch() {
# SIGTERM the VDE switch, escalate to SIGKILL after ~10s if it refuses
# to exit, then remove its pid file and socket directory.
local pid=""
local vde_dir
vde_dir="$(vde_switch_dir)"
if [[ -f "$(vde_switch_pid_file)" ]]; then
pid="$(<"$(vde_switch_pid_file)")"
fi
if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
log "Stopping VDE switch (PID ${pid})"
kill "${pid}" || true
for _ in {1..10}; do
if ! kill -0 "${pid}" 2>/dev/null; then
break
fi
sleep 1
done
if kill -0 "${pid}" 2>/dev/null; then
warn "VDE switch did not stop after SIGTERM; sending SIGKILL"
kill -9 "${pid}" || true
fi
fi
rm -f "$(vde_switch_pid_file)"
rm -rf "${vde_dir}"
}
any_vm_running() {
  # Succeed when at least one configured node's VM process is alive.
  local candidate
  for candidate in "${NODES[@]}"; do
    is_running "${candidate}" && return 0
  done
  return 1
}
terminate_pids() {
# SIGTERM the given pids ($1 is a label used only for logging), wait up
# to ~20s for all of them to exit, then SIGKILL any survivors.
# No-op when the pid list is empty.
local context="$1"
shift
local pids=("$@")
local pid
[[ "${#pids[@]}" -gt 0 ]] || return 0
log "Stopping stale ${context}: ${pids[*]}"
kill "${pids[@]}" 2>/dev/null || true
for _ in {1..20}; do
local remaining=0
for pid in "${pids[@]}"; do
if kill -0 "${pid}" 2>/dev/null; then
remaining=1
break
fi
done
if [[ "${remaining}" -eq 0 ]]; then
return 0
fi
sleep 1
done
warn "Force-killing stale ${context}: ${pids[*]}"
kill -9 "${pids[@]}" 2>/dev/null || true
}
stale_vm_pids_for_nodes_current_profile() {
# Print (sorted, unique) pids of processes that look like they belong
# to the given nodes under the current profile. Two detection paths:
# 1) qemu-system command lines mentioning the node's qcow2 path or its
#    SSH hostfwd spec, and 2) whatever is listening on the node's
#    forwarded SSH port (via ss).
local nodes=("$@")
local pid cmd node port runtime_path
declare -A seen=()
while read -r pid cmd; do
[[ -n "${pid:-}" ]] || continue
for node in "${nodes[@]}"; do
port="$(ssh_port_for_node "${node}")"
runtime_path="$(runtime_dir "${node}")/${node}.qcow2"
if [[ "${cmd}" == *"qemu-system"* ]] && {
[[ "${cmd}" == *"file=${runtime_path}"* ]] ||
[[ "${cmd}" == *"hostfwd=tcp::${port}-:22"* ]];
}; then
seen["${pid}"]=1
fi
done
done < <(pgrep -af 'qemu-system[^[:space:]]*|run-.*-vm' || true)
# Also catch listeners bound to each node's SSH forward port.
for node in "${nodes[@]}"; do
port="$(ssh_port_for_node "${node}")"
while read -r pid; do
[[ -n "${pid:-}" ]] || continue
seen["${pid}"]=1
done < <(
ss -H -ltnp "( sport = :${port} )" 2>/dev/null \
| sed -n 's/.*pid=\([0-9]\+\).*/\1/p' \
| sort -u
)
done
printf '%s\n' "${!seen[@]}" | sort -n
}
stop_stale_vm_processes_current_profile() {
  # Collect leftover QEMU pids for the given nodes and terminate them.
  local -a stale=()
  mapfile -t stale < <(stale_vm_pids_for_nodes_current_profile "$@")
  terminate_pids "VM processes" "${stale[@]}"
}
stop_nodes_current_profile() {
  # Stop each requested node, sweep leftover QEMU processes, and shut
  # the VDE switch down once no VM (of any node) remains running.
  local node
  for node in "$@"; do
    stop_vm "${node}"
  done
  stop_stale_vm_processes_current_profile "$@"
  any_vm_running || stop_vde_switch
}
stop_nodes_all_profiles() {
  # Repeat the per-profile stop across every known build profile, since
  # each profile keeps its own state directory and port set.
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" stop_nodes_current_profile "$@"
  done < <(all_build_profiles)
}
remove_runtime_state_current_profile() {
  # Empty the profile's state directory without removing the directory
  # itself; silently a no-op when the directory does not exist.
  local state_dir
  state_dir="$(vm_dir)"
  [[ -d "${state_dir}" ]] || return 0
  log "Removing runtime state under ${state_dir}"
  find "${state_dir}" -mindepth 1 -delete 2>/dev/null || true
}
remove_runtime_state_all_profiles() {
  # Clear runtime state for every known build profile in turn.
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" remove_runtime_state_current_profile
  done < <(all_build_profiles)
}
build_vm() {
# Build one node's NixOS VM derivation and (re)point the build-<node>
# symlink at the resulting store path.
local node="$1"
local target
local out
target="$(build_target_for_node "${node}")"
log "Building ${node} VM derivation (${target})"
# tail -n1 keeps only the final output path if nix prints several.
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM output for ${node}"
ln -sfn "${out}" "$(build_link "${node}")"
}
build_target_for_node() {
  # Storage profile builds dedicated storage-nodeNN flake targets when a
  # mapping exists; otherwise the node name itself is the target.
  local node="$1"
  if [[ "${BUILD_PROFILE}" != "storage" ]]; then
    printf '%s\n' "${node}"
    return 0
  fi
  printf '%s\n' "${STORAGE_BUILD_TARGETS[${node}]:-${node}}"
}
build_vms() {
# Build all requested nodes' VM derivations in a single nix invocation
# (lets Nix share work/scheduling), then symlink each output.
local nodes=("$@")
local targets=()
local outputs=()
local node
local target
local i
for node in "${nodes[@]}"; do
target="$(build_target_for_node "${node}")"
targets+=("${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm")
done
log "Building VM derivations in one Nix invocation: ${nodes[*]}"
mapfile -t outputs < <(
NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${targets[@]}" \
--no-link --print-out-paths
)
# NOTE(review): assumes --print-out-paths emits one path per target in
# the same order as the arguments — confirm against the nix version in
# use; the count check below only catches mismatched totals.
[[ "${#outputs[@]}" -eq "${#nodes[@]}" ]] || die "expected ${#nodes[@]} VM outputs, got ${#outputs[@]}"
for i in "${!nodes[@]}"; do
ln -sfn "${outputs[${i}]}" "$(build_link "${nodes[${i}]}")"
done
}
build_guest_image() {
# Build the bootable guest image flake output on the host and point the
# guest-image symlink at it.
local out
log "Building bootable VM guest image on the host"
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#vmGuestImage" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM guest image output"
ln -sfn "${out}" "$(guest_image_link)"
}
build_guest_bench_image() {
# Build the benchmark guest image flake output on the host and point
# the bench-image symlink at it.
local out
log "Building VM benchmark guest image on the host"
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#vmBenchGuestImage" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM benchmark guest image output"
ln -sfn "${out}" "$(guest_bench_image_link)"
}
build_requested() {
  # Resolve the node list (default: all nodes), validate the names, run
  # host preflight checks, and build everything in one Nix invocation.
  local -a requested=()
  mapfile -t requested < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${requested[@]}"
  preflight
  build_vms "${requested[@]}"
}
is_running() {
  # True when node $1 has a PID file naming a process that is alive.
  local node="$1"
  local pid_path pid
  pid_path="$(pid_file "${node}")"
  [[ -f "${pid_path}" ]] || return 1
  pid="$(<"${pid_path}")"
  kill -0 "${pid}" 2>/dev/null
}
start_vm() {
# Boot one node's VM: ensure the VDE switch is up, build if no build
# symlink exists, refuse to start if the SSH forward port is taken,
# detach QEMU with setsid/nohup, and confirm it survives startup.
local node="$1"
local build_path runvm node_runtime pid_path vm_log ssh_port
ensure_vde_switch
build_path="$(build_link "${node}")"
[[ -L "${build_path}" ]] || build_vm "${node}"
runvm="$(runvm_path "${node}")"
[[ -n "${runvm}" ]] || die "failed to locate run-*-vm for ${node}"
node_runtime="$(runtime_dir "${node}")"
pid_path="$(pid_file "${node}")"
vm_log="$(log_file "${node}")"
mkdir -p "${node_runtime}"
if is_running "${node}"; then
log "${node} already running (PID $(<"${pid_path}"))"
return 0
fi
ssh_port="$(ssh_port_for_node "${node}")"
# Fail fast when something else already listens on the forward port.
if ss -H -ltn "( sport = :${ssh_port} )" | grep -q .; then
warn "port ${ssh_port} is already in use before starting ${node}"
ss -H -ltnp "( sport = :${ssh_port} )" || true
die "SSH forward port ${ssh_port} for ${node} is already in use"
fi
log "Starting ${node}"
(
cd "${node_runtime}"
# setsid + nohup detach the VM from this script's tty and lifetime;
# the backgrounded launcher's pid is recorded as the node's pid.
nohup setsid "${runvm}" </dev/null >"${vm_log}" 2>&1 &
echo $! >"${pid_path}"
)
# Give the launcher a moment to crash on immediate errors.
sleep 2
if ! is_running "${node}"; then
warn "${node} failed to stay running; recent log follows"
tail -n 80 "${vm_log}" || true
die "failed to start ${node}"
fi
}
stop_vm() {
# Stop one node's VM: SIGTERM its recorded pid, wait up to ~20s,
# escalate to SIGKILL, and always remove the pid file afterwards.
local node="$1"
local pid_path pid
pid_path="$(pid_file "${node}")"
if [[ ! -f "${pid_path}" ]]; then
log "${node} is not running"
return 0
fi
pid="$(<"${pid_path}")"
if kill -0 "${pid}" 2>/dev/null; then
log "Stopping ${node} (PID ${pid})"
kill "${pid}" || true
for _ in {1..20}; do
if ! kill -0 "${pid}" 2>/dev/null; then
break
fi
sleep 1
done
if kill -0 "${pid}" 2>/dev/null; then
warn "${node} did not stop after SIGTERM; sending SIGKILL"
kill -9 "${pid}" || true
fi
fi
rm -f "${pid_path}"
}
ssh_node() {
  # Run a command (or open a shell) on node $1 via its forwarded port,
  # authenticating with the shared root password through sshpass.
  local node="$1"
  shift
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${port}" root@127.0.0.1 "$@"
}
ssh_node_script() {
  # Feed a script to `bash -se` on node $1 (stdin supplies the script;
  # remaining args become the script's positional parameters).
  local node="$1"
  shift
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${port}" root@127.0.0.1 bash -se -- "$@"
}
scp_to_node() {
  # Copy a local file ($2) to node $1 at remote path $3.
  local node="$1"
  local local_path="$2"
  local remote_path="$3"
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    scp "${SSH_OPTS[@]}" -P "${port}" "${local_path}" "root@127.0.0.1:${remote_path}"
}
start_ssh_tunnel() {
# Establish a local port-forward to node $1 using an SSH ControlMaster,
# retrying until TUNNEL_WAIT_TIMEOUT. On success prints the control
# socket path (handed later to stop_ssh_tunnel).
#   $1 node, $2 local port, $3 remote port, $4 remote host (default lo)
local node="$1"
local local_port="$2"
local remote_port="$3"
local remote_host="${4:-127.0.0.1}"
local control_socket
control_socket="$(vm_dir)/tunnel-${node}-${local_port}.ctl"
local deadline
local attempt_deadline
local ssh_port
ssh_port="$(ssh_port_for_node "${node}")"
# Tear down any leftover master using the same control socket.
if [[ -e "${control_socket}" ]]; then
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
fi
# If the local port is occupied, try to evict a matching stale forward
# before giving up.
if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
pkill -f -- "ssh .* -L ${local_port}:${remote_host}:${remote_port} " >/dev/null 2>&1 || true
for _ in {1..10}; do
if ! ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
break
fi
sleep 1
done
if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
die "local tunnel port ${local_port} is already in use"
fi
fi
deadline=$((SECONDS + TUNNEL_WAIT_TIMEOUT))
while true; do
# Start a backgrounded master (-M -f -N) with the forward; failures
# are tolerated here and detected by the -O check probe below.
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-o ExitOnForwardFailure=yes \
-S "${control_socket}" \
-M -f -N \
-L "${local_port}:${remote_host}:${remote_port}" \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
# Give this attempt up to 10s to answer a ControlMaster health check.
attempt_deadline=$((SECONDS + 10))
while true; do
if sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O check \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1; then
printf '%s\n' "${control_socket}"
return 0
fi
if (( SECONDS >= attempt_deadline )); then
break
fi
sleep 1
done
# Attempt failed: clean up the half-started master and retry.
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
if (( SECONDS >= deadline )); then
warn "failed to establish ssh tunnel for ${node}:${remote_port} on local port ${local_port}"
ss -H -ltnp "( sport = :${local_port} )" || true
ps -ef | grep -F -- "-L ${local_port}:${remote_host}:${remote_port}" | grep -v grep || true
die "ssh tunnel for ${node}:${remote_host}:${remote_port} did not bind local port ${local_port}"
fi
sleep 1
done
}
stop_ssh_tunnel() {
# Tear down a ControlMaster tunnel created by start_ssh_tunnel and
# remove its control socket. Missing/empty socket path is a no-op.
local node="$1"
local control_socket="$2"
local ssh_port
ssh_port="$(ssh_port_for_node "${node}")"
[[ -n "${control_socket}" ]] || return 0
if [[ -e "${control_socket}" ]]; then
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
fi
}
issue_project_admin_token() {
# Provision a service-account principal with the ProjectAdmin role on
# IAM (reachable at 127.0.0.1:$1) and print a bearer token for it.
# Each step (create principal, create binding, issue token) is retried
# until the shared deadline; "already exists" responses are success.
local iam_port="$1"
local org_id="$2"
local project_id="$3"
local principal_id="$4"
local create_principal_json create_binding_json issue_token_json token deadline output
create_principal_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
)"
# Extra 180s on top of HTTP_WAIT_TIMEOUT: IAM may still be settling.
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT + 180))
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_principal_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
)" && break
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM principal ${principal_id}: ${output}"
fi
sleep 2
done
create_binding_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
)"
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_binding_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreateBinding 2>&1
)" && break
if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM binding for ${principal_id}: ${output}"
fi
sleep 2
done
issue_token_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
)"
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${issue_token_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamToken/IssueToken 2>&1
)" && {
# A successful RPC that yields no .token field still counts as a
# retryable failure.
token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
if [[ -n "${token}" ]]; then
break
fi
}
if (( SECONDS >= deadline )); then
die "timed out issuing IAM token for ${principal_id}: ${output}"
fi
sleep 2
done
# Block until the binding is actually effective before handing the
# token to callers.
wait_for_project_admin_authorization "${iam_port}" "${org_id}" "${project_id}" "${principal_id}"
printf '%s\n' "${token}"
}
issue_project_admin_token_any() {
# Same provisioning flow as issue_project_admin_token, but tries a list
# of candidate IAM ports (args after the first three) and uses
# whichever responds at each step. Prints "<port>\t<token>" where
# <port> is the one that passed the final authorization probe.
local org_id="$1"
local project_id="$2"
local principal_id="$3"
shift 3
local ports=("$@")
local create_principal_json create_binding_json issue_token_json token deadline output
local selected_port="" create_port="" binding_port="" issue_port="" port
create_principal_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
)"
create_binding_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
)"
issue_token_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
)"
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
# Step 1: create the principal on whichever port answers first.
while [[ -z "${create_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_principal_json}" \
127.0.0.1:"${port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
)" && {
create_port="${port}"
break
}
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
create_port="${port}"
break
fi
done
if [[ -n "${create_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM principal ${principal_id}: ${output}"
fi
sleep 2
done
# Step 2: bind ProjectAdmin, again accepting any responsive port.
while [[ -z "${binding_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_binding_json}" \
127.0.0.1:"${port}" iam.v1.IamAdmin/CreateBinding 2>&1
)" && {
binding_port="${port}"
break
}
if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
binding_port="${port}"
break
fi
done
if [[ -n "${binding_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM binding for ${principal_id}: ${output}"
fi
sleep 2
done
# Step 3: issue the token; only a non-empty .token counts as success.
while [[ -z "${issue_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${issue_token_json}" \
127.0.0.1:"${port}" iam.v1.IamToken/IssueToken 2>&1
)" && {
token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
if [[ -n "${token}" ]]; then
issue_port="${port}"
break
fi
}
done
if [[ -n "${issue_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out issuing IAM token for ${principal_id}: ${output}"
fi
sleep 2
done
# Final: report the port on which the binding is provably effective.
selected_port="$(wait_for_project_admin_authorization_any "${org_id}" "${project_id}" "${principal_id}" "${ports[@]}")"
printf '%s\t%s\n' "${selected_port}" "${token}"
}
wait_for_project_admin_authorization() {
# Poll IamAuthz/Authorize with a probe request until the principal's
# ProjectAdmin binding becomes effective (allowed == true), or
# HTTP_WAIT_TIMEOUT elapses.
local iam_port="$1"
local org_id="$2"
local project_id="$3"
local principal_id="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local authorize_json
authorize_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{
principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
action:"storage:buckets:create",
resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
}'
)"
while true; do
if timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${authorize_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null 2>&1; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
fi
sleep 2
done
}
wait_for_project_admin_authorization_any() {
# Like wait_for_project_admin_authorization, but probes a list of
# candidate ports each round and prints the first port whose Authorize
# response says allowed == true.
local org_id="$1"
local project_id="$2"
local principal_id="$3"
shift 3
local ports=("$@")
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local authorize_json port
authorize_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{
principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
action:"storage:buckets:create",
resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
}'
)"
while true; do
for port in "${ports[@]}"; do
if timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${authorize_json}" \
127.0.0.1:"${port}" iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null 2>&1; then
printf '%s\n' "${port}"
return 0
fi
done
if (( SECONDS >= deadline )); then
die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
fi
sleep 2
done
}
ensure_lightningstor_bucket() {
# Idempotently ensure a LightningStor bucket exists: HeadBucket first,
# then CreateBucket (AlreadyExists counts as success), retrying until
# HTTP_WAIT_TIMEOUT elapses.
local ls_port="$1"
local token="$2"
local bucket="$3"
local org_id="$4"
local project_id="$5"
local head_json create_json
head_json="$(jq -cn --arg bucket "${bucket}" '{bucket:$bucket}')"
create_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{bucket:$bucket, region:"default", orgId:$org, projectId:$project}'
)"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local output=""
while true; do
# Fast path: the bucket already exists.
if timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/HeadBucket >/dev/null 2>&1; then
return 0
fi
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${create_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/CreateBucket
)" && return 0
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out ensuring LightningStor bucket ${bucket}: ${output}"
fi
sleep 2
done
}
wait_for_lightningstor_write_quorum() {
# Prove the cluster can take replicated writes: repeatedly PutObject a
# unique probe key until it succeeds, verify all three replica nodes
# gained a data file, then delete the probe and wait for counts to
# settle back to the baseline. Only "Not enough healthy nodes" failures
# are retried; anything else is fatal.
local ls_port="$1"
local token="$2"
local bucket="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local key="write-quorum-probe-$(date +%s)-$RANDOM"
local body="quorum-probe-${key}"
local body_b64 put_json delete_json output status
local before_node01 before_node04 before_node05
# Baseline file counts before the probe write.
read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)
body_b64="$(printf '%s' "${body}" | base64 -w0)"
put_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg key "${key}" \
--arg body "${body_b64}" \
'{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
)"
delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
while true; do
status=0
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${put_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/PutObject
)" || status=$?
if (( status == 0 )); then
wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${delete_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/DeleteObject
)" || die "failed to delete LightningStor write quorum probe for ${context}: ${output}"
wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe cleanup"
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for LightningStor write quorum for ${context}: ${output}"
fi
# NOTE(review): this literal must match the server's quorum error
# message exactly — confirm it tracks the lightningstor service.
if ! grep -q "Not enough healthy nodes" <<<"${output}"; then
die "unexpected LightningStor write quorum failure for ${context}: ${output}"
fi
sleep 2
done
}
# Download an object from LightningStor over gRPC and write its decoded body
# to a local file on the harness host.
#   $1 - local port forwarded to LightningStor
#   $2 - bearer token   $3 - bucket   $4 - key   $5 - destination path
# NOTE(review): GetObject responses are filtered for .bodyChunk fields and the
# concatenated base64 lines are piped into a single `base64 -d`; this
# presumably relies on the server emitting chunks whose base64 concatenates
# cleanly (no mid-stream padding) — confirm against the server's chunking.
download_lightningstor_object_to_file() {
  local ls_port="$1"
  local token="$2"
  local bucket="$3"
  local key="$4"
  local output_path="$5"
  local get_json
  get_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
  timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
    -max-msg-sz "${GRPCURL_MAX_MSG_SIZE}" \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${get_json}" \
    127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/GetObject \
    | jq -r '.bodyChunk? // empty' \
    | base64 -d >"${output_path}"
}
calc_mib_per_s() {
  # Convert a byte count moved over an elapsed-nanosecond interval into a
  # MiB/s figure with two decimals; non-positive intervals print "0.00".
  local byte_count="$1"
  local duration_ns="$2"
  awk -v b="${byte_count}" -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", (b / 1048576.0) / (ns / 1000000000.0)
      }
    }
  '
}
calc_ops_per_s() {
  # Operations per second over an elapsed-nanosecond interval, two decimals;
  # a non-positive interval yields "0.00".
  local op_count="$1"
  local duration_ns="$2"
  awk -v ops="${op_count}" -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", ops / (ns / 1000000000.0)
      }
    }
  '
}
calc_seconds_from_ns() {
  # Render a nanosecond duration as seconds with two decimals; non-positive
  # inputs print "0.00".
  local duration_ns="$1"
  awk -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", ns / 1000000000.0
      }
    }
  '
}
bw_bytes_to_mibps() {
  # Convert a bytes-per-second bandwidth figure into MiB/s, two decimals.
  local rate_bytes="$1"
  awk -v rate="${rate_bytes}" 'BEGIN { printf "%.2f", rate / 1048576.0 }'
}
bps_to_mibps() {
  # Convert a bits-per-second rate (as reported by iperf3) into MiB/s.
  local rate_bits="$1"
  awk -v rate="${rate_bits}" 'BEGIN { printf "%.2f", rate / 8.0 / 1048576.0 }'
}
# Find a TCP port inside the given node that currently has no listener, by
# scanning [start_port, end_port] with `ss`.  Prints the port, or exits
# non-zero if the whole range is busy.
#   $1 - node name    $2 - first port (default 18080)    $3 - last (18999)
allocate_free_listener_port() {
  local node="$1"
  local start_port="${2:-18080}"
  local end_port="${3:-18999}"
  ssh_node_script "${node}" "${start_port}" "${end_port}" <<'EOS'
set -euo pipefail
start_port="$1"
end_port="$2"
# First port with no LISTEN socket wins.
for ((port=start_port; port<=end_port; port++)); do
  if ! ss -ltnH "( sport = :${port} )" | grep -q .; then
    printf '%s\n' "${port}"
    exit 0
  fi
done
exit 1
EOS
}
# Run a file-backed fio workload inside a node and print a compact JSON
# summary {bw_bytes, iops} for the relevant direction.  The target file is
# removed afterwards; for read workloads it is pre-populated with zeros so
# fio has data to read.
#   $1 node  $2 file path  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)  $7 iodepth (1)  $8 ioengine (sync)
run_remote_fio_json() {
  local node="$1"
  local target_path="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local iodepth="${7:-1}"
  local ioengine="${8:-sync}"
  ssh_node_script "${node}" "${target_path}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${iodepth}" "${ioengine}" <<'EOS'
set -euo pipefail
target_path="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
iodepth="$6"
ioengine="$7"
mkdir -p "$(dirname "${target_path}")"
# Read workloads need an existing file of the right size.
if [[ "${rw}" == *read* ]]; then
  dd if=/dev/zero of="${target_path}" bs=1M count="${size_mb}" status=none conv=fsync
fi
fio_args=(
  --name=photon-bench
  --filename="${target_path}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine="${ioengine}"
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
# Force durability per write so write numbers reflect synced I/O.
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
rm -f "${target_path}"
# Report the direction that matters for the chosen rw mode.
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Run fio against a raw block device inside a node and print {bw_bytes, iops}
# for the relevant direction.  Unlike run_remote_fio_json, nothing is created
# or deleted: the caller supplies an existing block target, and libaio+direct
# I/O is always used.
#   $1 node  $2 device/target  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)
run_remote_block_fio_json() {
  local node="$1"
  local target="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  ssh_node_script "${node}" "${target}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" <<'EOS'
set -euo pipefail
target="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
fio_args=(
  --name=photon-bench
  --filename="${target}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Sequentially read an existing file inside a node with dd and print a JSON
# timing record {size_bytes, duration_ns}.  size_mb is trusted for the byte
# count; dd reads the whole file regardless.
#   $1 node   $2 file path   $3 expected size in MiB
run_remote_dd_read_json() {
  local node="$1"
  local target_path="$2"
  local size_mb="$3"
  ssh_node_script "${node}" "${target_path}" "${size_mb}" <<'EOS'
set -euo pipefail
target_path="$1"
size_mb="$2"
# Fail fast if the benchmark file was never created.
[[ -f "${target_path}" ]]
start_ns="$(date +%s%N)"
dd if="${target_path}" of=/dev/null bs=1M status=none
end_ns="$(date +%s%N)"
printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
coronafs_api_url() {
  # Base URL of the CoronaFS API on a local loopback port.
  # NOTE(review): the default here is 15088, while CORONAFS_API_PORT defaults
  # to 50088 — presumably 15088 is the SSH-tunnel local port; confirm.
  local api_port="${1:-15088}"
  printf 'http://127.0.0.1:%s' "${api_port}"
}
coronafs_api_request() {
  # Issue one HTTP request against the local CoronaFS API.
  #   $1 base port, $2 HTTP method, $3 request path, $4 optional JSON payload.
  local base_port="$1"
  local verb="$2"
  local api_path="$3"
  local json_body="${4:-}"
  local endpoint
  endpoint="$(coronafs_api_url "${base_port}")${api_path}"
  if [[ -z "${json_body}" ]]; then
    curl -fsS -X "${verb}" "${endpoint}"
  else
    curl -fsS -X "${verb}" \
      -H 'content-type: application/json' \
      --data "${json_body}" \
      "${endpoint}"
  fi
}
coronafs_create_volume() {
  # Create a CoronaFS volume of the given byte size via PUT.
  local base_port="$1"
  local vol_id="$2"
  local vol_bytes="$3"
  local payload
  payload="$(jq -cn --argjson size_bytes "${vol_bytes}" '{size_bytes:$size_bytes}')"
  coronafs_api_request "${base_port}" PUT "/v1/volumes/${vol_id}" "${payload}"
}
coronafs_export_volume_json() {
  # Ask CoronaFS to export a volume; prints the export descriptor JSON.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" POST "/v1/volumes/${vol_id}/export"
}
coronafs_get_volume_json() {
  # Fetch a volume's metadata JSON from the CoronaFS API.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" GET "/v1/volumes/${vol_id}"
}
coronafs_delete_volume() {
  # Delete a CoronaFS volume, discarding the API response body.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" DELETE "/v1/volumes/${vol_id}" >/dev/null
}
# Attach an NBD export to a device inside a node, benchmark it with fio, and
# print {bw_bytes, iops}.  The NBD device is disconnected on every exit path
# via the remote trap.
#   $1 node  $2 NBD URI  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)  $7 nbd device (/dev/nbd0)  $8 iodepth (1)
run_remote_nbd_fio_json() {
  local node="$1"
  local nbd_uri="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local nbd_device="${7:-/dev/nbd0}"
  local iodepth="${8:-1}"
  ssh_node_script "${node}" "${nbd_uri}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${nbd_device}" "${iodepth}" <<'EOS'
set -euo pipefail
nbd_uri="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
nbd_device="$6"
iodepth="$7"
# Best-effort: load the nbd module and clear any stale attachment.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio=io_uring \
  --connect="${nbd_device}" \
  "${nbd_uri}"
# Always release the device, even if fio fails.
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT
fio_args=(
  --name=photon-bench
  --filename="${nbd_device}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Attach an NBD export inside a node, sequentially read size_mb MiB from it
# with dd, and print a JSON timing record {size_bytes, duration_ns}.  The
# device is disconnected on exit via the remote trap.
#   $1 node   $2 NBD URI   $3 size in MiB   $4 nbd device (/dev/nbd0)
run_remote_nbd_dd_read_json() {
  local node="$1"
  local nbd_uri="$2"
  local size_mb="$3"
  local nbd_device="${4:-/dev/nbd0}"
  ssh_node_script "${node}" "${nbd_uri}" "${size_mb}" "${nbd_device}" <<'EOS'
set -euo pipefail
nbd_uri="$1"
size_mb="$2"
nbd_device="$3"
# Best-effort module load and stale-attachment cleanup.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio=io_uring \
  --connect="${nbd_device}" \
  "${nbd_uri}"
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT
start_ns="$(date +%s%N)"
dd if="${nbd_device}" of=/dev/null bs=1M count="${size_mb}" status=none
end_ns="$(date +%s%N)"
printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
# Measure TCP throughput between two nodes with iperf3 and print a compact
# JSON summary {bits_per_second, retransmits}.  A one-shot (-1) iperf3 server
# is started on the server node on a freshly allocated port, then the client
# node connects to it by IP.
#   $1 client node  $2 server node  $3 server IP  $4 duration secs (10)
run_remote_iperf_json() {
  local client_node="$1"
  local server_node="$2"
  local server_ip="$3"
  local duration_secs="${4:-10}"
  local server_port
  local server_pid
  server_port="$(allocate_free_listener_port "${server_node}" 19000 19100)"
  server_pid="$(ssh_node_script "${server_node}" "${server_port}" <<'EOS'
set -euo pipefail
server_port="$1"
log_path="/tmp/iperf3-server-${server_port}.log"
rm -f "${log_path}"
# -1: serve a single client then exit, so no teardown is needed.
nohup iperf3 -s -1 -p "${server_port}" >"${log_path}" 2>&1 &
printf '%s\n' "$!"
EOS
)"
  # Give the server a moment to start listening.
  sleep 1
  # NOTE(review): server_pid is forwarded to the client script but never used
  # there — apparently dead plumbing; confirm before removing.
  ssh_node_script "${client_node}" "${server_ip}" "${server_port}" "${duration_secs}" "${server_pid}" <<'EOS'
set -euo pipefail
server_ip="$1"
server_port="$2"
duration_secs="$3"
server_pid="$4"
client_json="$(iperf3 -c "${server_ip}" -p "${server_port}" -t "${duration_secs}" -J)"
# Prefer the receiver-side sum; fall back through the other summaries.
printf '%s' "${client_json}" | jq -c '{
  bits_per_second: (
    .end.sum_received.bits_per_second //
    .end.sum.bits_per_second //
    .end.sum_sent.bits_per_second //
    0
  ),
  retransmits: (.end.sum_sent.retransmits // 0)
}'
EOS
}
# Poll the PlasmaVMC control plane (via a local port-forward) until worker
# nodes node04 and node05 both report NODE_STATE_READY, or die after the
# timeout.
#   $1 - local port forwarded to plasmavmc.v1.NodeService
#   $2 - optional timeout in seconds (default HTTP_WAIT_TIMEOUT)
wait_for_plasmavmc_workers_registered() {
  local vm_port="$1"
  local timeout="${2:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for PlasmaVMC workers to register with the control plane"
  # Bound each probe with `timeout`, matching the other grpcurl probes in
  # this harness, so a hung grpcurl cannot stall the loop past the deadline.
  until timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:"${vm_port}" plasmavmc.v1.NodeService/ListNodes \
    | jq -e '
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node04")) != null
      and
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null
    ' >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for PlasmaVMC workers to register"
    fi
    sleep 2
  done
}
# Block until SSH on the node's forwarded port answers with the expected
# hostname (guards against port collisions answering as a different node).
# Dies early if the VM process exits, and dumps the VM log tail on failure.
#   $1 - node name    $2 - optional timeout in seconds (SSH_WAIT_TIMEOUT)
wait_for_ssh() {
  local node="$1"
  local timeout="${2:-${SSH_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local observed_host=""
  log "Waiting for SSH on ${node}"
  while true; do
    observed_host="$(ssh_node "${node}" "hostname" 2>/dev/null || true)"
    # The hostname must match exactly — an answer from a *different* host
    # means the port forwards are crossed, not that the node is up.
    if [[ "${observed_host}" == "${node}" ]]; then
      break
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for SSH"
    fi
    if (( SECONDS >= deadline )); then
      if [[ -n "${observed_host}" ]]; then
        warn "SSH on port $(ssh_port_for_node "${node}") answered as '${observed_host}' while waiting for ${node}"
      fi
      tail -n 100 "$(log_file "${node}")" || true
      die "timed out waiting for SSH on ${node}"
    fi
    sleep 2
  done
}
wait_for_ssh_down() {
  # Block until SSH on the node stops answering, or die after the timeout.
  #   $1 - node name    $2 - optional timeout in seconds (default 60)
  local target_node="$1"
  local limit="${2:-60}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for SSH to stop on ${target_node}"
  while ssh_node "${target_node}" true >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for SSH shutdown on ${target_node}"
    fi
    sleep 2
  done
}
# Wait for a systemd unit on a node to be stably active: it must be seen
# active (running or exited) on three *consecutive* polls, so a unit that is
# crash-looping resets the counter and does not pass.  Dumps unit status and
# journal tail on timeout; dies early if the VM process exits.
#   $1 - node    $2 - unit name (without .service)
#   $3 - optional timeout in seconds (UNIT_WAIT_TIMEOUT)
wait_for_unit() {
  local node="$1"
  local unit="$2"
  local timeout="${3:-${UNIT_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local stable_checks=0
  local required_stable_checks=3
  log "Waiting for ${unit}.service on ${node}"
  while (( stable_checks < required_stable_checks )); do
    if ssh_node "${node}" "state=\$(systemctl show --property=ActiveState --value ${unit}.service); sub=\$(systemctl show --property=SubState --value ${unit}.service); [[ \"\${state}\" == active && (\"\${sub}\" == running || \"\${sub}\" == exited) ]]" >/dev/null 2>&1; then
      stable_checks=$((stable_checks + 1))
    else
      # Any failed poll restarts the consecutive-success requirement.
      stable_checks=0
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for ${unit}.service"
    fi
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "systemctl status --no-pager ${unit}.service || true" || true
      ssh_node "${node}" "journalctl -u ${unit}.service -n 80 --no-pager || true" || true
      die "timed out waiting for ${unit}.service on ${node}"
    fi
    sleep 2
  done
}
wait_for_http() {
  # Poll an HTTP endpoint from *inside* the node until curl succeeds, or die
  # after the timeout.
  #   $1 - node    $2 - URL    $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local endpoint="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for HTTP endpoint on ${target_node}: ${endpoint}"
  while ! ssh_node "${target_node}" "curl -fsS '${endpoint}' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for ${endpoint} on ${target_node}"
    fi
    sleep 2
  done
}
# Poll a URL from inside the node until it returns one of the expected HTTP
# status codes.  expected_codes is a space-separated list (e.g. "200 403");
# the remote `case` matches the observed code against that padded list.
#   $1 - node   $2 - URL   $3 - expected codes   $4 - optional timeout
wait_for_http_status() {
  local node="$1"
  local url="$2"
  local expected_codes="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for HTTP status on ${node}: ${url} (${expected_codes})"
  until ssh_node "${node}" "code=\$(curl -sS -o /dev/null -w '%{http_code}' '${url}' || true); case \" ${expected_codes} \" in *\" \${code} \"*) exit 0 ;; *) exit 1 ;; esac" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for HTTP status ${expected_codes} from ${url} on ${node}"
    fi
    sleep 2
  done
}
# Poll a URL from inside the node until its response body equals the expected
# string exactly.  On timeout, dump whatever the URL currently returns to aid
# debugging, then die.
#   $1 - node   $2 - URL   $3 - expected body   $4 - optional timeout
wait_for_http_body() {
  local node="$1"
  local url="$2"
  local expected="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for HTTP body on ${node}: ${url}"
  until ssh_node_script "${node}" "${url}" "${expected}" <<'EOF' >/dev/null 2>&1
set -euo pipefail
url="$1"
expected="$2"
body="$(curl -fsS "${url}")"
[[ "${body}" == "${expected}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "curl -fsS '${url}' || true" || true
      die "timed out waiting for expected HTTP body from ${url} on ${node}"
    fi
    sleep 2
  done
}
wait_for_host_http() {
  # Poll a URL from the harness host itself (no SSH hop) until it responds,
  # or die after the timeout.
  #   $1 - URL    $2 - optional timeout in seconds (HTTP_WAIT_TIMEOUT)
  local endpoint="$1"
  local limit="${2:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for host HTTP endpoint: ${endpoint}"
  while ! curl -fsS "${endpoint}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for host HTTP endpoint ${endpoint}"
    fi
    sleep 2
  done
}
# Perform an authenticated HTTP request from the harness host and print the
# response body on 2xx.  On any other status (or curl failure) it logs the
# captured curl stderr, response headers, and body, then dies.  Temp files
# are removed on both paths.
#   $1 - stage label for error messages    $2 - HTTP method    $3 - URL
#   $4 - bearer token                      $5 - optional JSON request body
host_api_request() {
  local stage="$1"
  local method="$2"
  local url="$3"
  local token="$4"
  local body="${5:-}"
  local response_file headers_file stderr_file http_code
  response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-response-XXXXXX)"
  headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-headers-XXXXXX)"
  stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-stderr-XXXXXX)"
  if [[ -n "${body}" ]]; then
    # `|| true` keeps curl transport failures from aborting under set -e;
    # the status check below handles them (http_code stays empty/000).
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -H 'Content-Type: application/json' \
        -X "${method}" \
        -d "${body}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  else
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -X "${method}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  fi
  if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
    cat "${response_file}"
    rm -f "${response_file}" "${headers_file}" "${stderr_file}"
    return 0
  fi
  log "Host API request failed during ${stage}: ${method} ${url} (status=${http_code:-curl-error})"
  if [[ -s "${stderr_file}" ]]; then
    sed 's/^/[curl] /' "${stderr_file}" >&2
  fi
  if [[ -s "${headers_file}" ]]; then
    sed 's/^/[headers] /' "${headers_file}" >&2
  fi
  if [[ -s "${response_file}" ]]; then
    sed 's/^/[body] /' "${response_file}" >&2
  fi
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  die "host API request failed during ${stage}"
}
# Perform an authenticated HTTP request against the gateway on node06
# (127.0.0.1:8080 from inside that node) and print the response body on 2xx.
# The request body is base64-encoded for transport through ssh_node_script's
# positional arguments and decoded remotely.  On failure the remote script
# dumps status/headers/body to stderr and this function dies.
#   $1 - stage label   $2 - method   $3 - request path
#   $4 - bearer token  $5 - optional request body
gateway_api_request() {
  local stage="$1"
  local method="$2"
  local request_path="$3"
  local token="$4"
  local body="${5:-}"
  local body_b64=""
  if [[ -n "${body}" ]]; then
    body_b64="$(printf '%s' "${body}" | base64 | tr -d '\n')"
  fi
  if ssh_node_script node06 "${method}" "${request_path}" "${token}" "${body_b64}" <<'EOF'
set -euo pipefail
method="$1"
request_path="$2"
token="$3"
body_b64="${4:-}"
url="http://127.0.0.1:8080${request_path}"
response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-response-XXXXXX)"
headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-headers-XXXXXX)"
stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-stderr-XXXXXX)"
body_file=""
# Remove every temp file on any exit path.
cleanup() {
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  if [[ -n "${body_file}" ]]; then
    rm -f "${body_file}"
  fi
}
trap cleanup EXIT
if [[ -n "${body_b64}" ]]; then
  body_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-body-XXXXXX)"
  printf '%s' "${body_b64}" | base64 -d >"${body_file}"
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -X "${method}" \
      --data-binary @"${body_file}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
else
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -X "${method}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
fi
if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
  cat "${response_file}"
  exit 0
fi
echo "status=${http_code:-curl-error}" >&2
if [[ -s "${stderr_file}" ]]; then
  sed 's/^/[curl] /' "${stderr_file}" >&2
fi
if [[ -s "${headers_file}" ]]; then
  sed 's/^/[headers] /' "${headers_file}" >&2
fi
if [[ -s "${response_file}" ]]; then
  sed 's/^/[body] /' "${response_file}" >&2
fi
exit 1
EOF
  then
    return 0
  fi
  log "Gateway API request failed during ${stage}: ${method} ${request_path}"
  die "gateway API request failed during ${stage}"
}
# Single-shot gRPC health probe executed inside the node: succeeds only when
# grpc.health.v1.Health/Check reports status SERVING for the named service.
#   $1 - node    $2 - gRPC port    $3 - service name ("" = overall server)
grpc_health_check() {
  local node="$1"
  local port="$2"
  local service="$3"
  ssh_node "${node}" \
    "grpcurl -plaintext -d '{\"service\":\"${service}\"}' 127.0.0.1:${port} grpc.health.v1.Health/Check | jq -e '.status == \"SERVING\"' >/dev/null"
}
wait_for_grpc_health() {
  # Poll the standard gRPC health endpoint on node:port until the named
  # service reports SERVING, or die after the timeout.
  #   $1 - node   $2 - port   $3 - service   $4 - optional timeout
  local target_node="$1"
  local grpc_port="$2"
  local svc="$3"
  local limit="${4:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for gRPC health on ${target_node}:${grpc_port} (${svc})"
  while ! grpc_health_check "${target_node}" "${grpc_port}" "${svc}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for gRPC health ${svc} on ${target_node}:${grpc_port}"
    fi
    sleep 2
  done
}
# Succeed iff something is LISTENing on the given TCP port inside the node.
check_tcp_port() {
  local node="$1"
  local port="$2"
  ssh_node "${node}" "ss -H -ltn '( sport = :${port} )' | grep -q ."
}
# Succeed iff something is bound to the given UDP port inside the node.
check_udp_port() {
  local node="$1"
  local port="$2"
  ssh_node "${node}" "ss -H -lun '( sport = :${port} )' | grep -q ."
}
wait_for_tcp_port() {
  # Block until a listener appears on the TCP port inside the node, or die
  # after the timeout.
  #   $1 - node   $2 - port   $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local tcp_port="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for TCP port ${tcp_port} on ${target_node}"
  while ! check_tcp_port "${target_node}" "${tcp_port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for TCP port ${tcp_port} on ${target_node}"
    fi
    sleep 2
  done
}
wait_for_udp_port() {
  # Block until a socket is bound to the UDP port inside the node, or die
  # after the timeout.
  #   $1 - node   $2 - port   $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local udp_port="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for UDP port ${udp_port} on ${target_node}"
  while ! check_udp_port "${target_node}" "${udp_port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for UDP port ${udp_port} on ${target_node}"
    fi
    sleep 2
  done
}
# Wait until FlareDB region 1 on the node has an elected leader and exactly
# peers {1,2,3}, as reported by its local metadata API on port 8082.
#   $1 - node    $2 - optional timeout in seconds (FLAREDB_WAIT_TIMEOUT)
wait_for_flaredb_region() {
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for FlareDB region metadata on ${node}"
  until ssh_node "${node}" "curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -e '(.data.leader_id > 0) and ((.data.peers | sort) == [1,2,3])' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlareDB region metadata on ${node}"
    fi
    sleep 2
  done
}
# Wait until the FlareDB leader recorded in the ChainFire KV store (port
# 8081) agrees with the live leader reported by FlareDB itself (port 8082),
# i.e. route metadata has converged and a leader exists.
#   $1 - node    $2 - optional timeout in seconds (FLAREDB_WAIT_TIMEOUT)
wait_for_flaredb_route_metadata() {
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for FlareDB route metadata on ${node}"
  until ssh_node "${node}" "bash -se" <<'EOF' >/dev/null 2>&1
set -euo pipefail
actual="$(curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -r '.data.leader_id')"
recorded="$(curl -fsS http://127.0.0.1:8081/api/v1/kv/flaredb/regions/1 | jq -r '.data.value | fromjson | .leader_id')"
[[ "${actual}" != "0" ]]
[[ "${actual}" == "${recorded}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlareDB route metadata on ${node}"
    fi
    sleep 2
  done
}
# Copy the FlareDB kvrpc proto to a node so grpcurl can be used there.
#   $1 - node    $2 - optional destination dir (default below)
# NOTE(review): ${proto_root} is unquoted inside the remote install command;
# fine for the default path, would break for paths with spaces.
ensure_flaredb_proto_on_node() {
  local node="$1"
  local proto_root="${2:-/var/lib/photon-test-protos/flaredb}"
  ssh_node "${node}" "install -d -m 0755 ${proto_root}"
  scp_to_node "${node}" "${FLAREDB_PROTO}" "${proto_root}/kvrpc.proto"
}
vm_runtime_dir_path() {
  # Runtime state directory for a PlasmaVMC-managed VM id.
  local vm_id="$1"
  printf '%s/%s\n' /run/libvirt/plasmavmc "${vm_id}"
}
vm_console_path() {
  # Console log file inside a VM's runtime directory.
  local vm_id="$1"
  printf '%s/console.log\n' "$(vm_runtime_dir_path "${vm_id}")"
}
# Wait until a fixed string appears in a nested VM's console log on the host
# node.  Both the path and the pattern are %q-quoted so arbitrary characters
# survive the remote shell; on timeout the console tail is dumped.
#   $1 - node   $2 - VM id   $3 - literal pattern   $4 - optional timeout
wait_for_vm_console_pattern() {
  local node="$1"
  local vm_id="$2"
  local pattern="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local console_path console_q pattern_q
  console_path="$(vm_console_path "${vm_id}")"
  console_q="$(printf '%q' "${console_path}")"
  pattern_q="$(printf '%q' "${pattern}")"
  log "Waiting for VM console output on ${node}: ${pattern}"
  until ssh_node "${node}" "bash -lc 'test -f ${console_q} && grep -F -- ${pattern_q} ${console_q} >/dev/null'" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "bash -lc 'test -f ${console_q} && tail -n 80 ${console_q} || true'" || true
      die "timed out waiting for VM console pattern ${pattern} on ${node}"
    fi
    sleep 2
  done
}
# Print the most recent console-log line containing the literal pattern, for
# the given nested VM on the given node.  Path and pattern are %q-quoted for
# safe transport through the remote shell.
#   $1 - node    $2 - VM id    $3 - literal pattern
read_vm_console_line_matching() {
  local node="$1"
  local vm_id="$2"
  local pattern="$3"
  local console_path console_q pattern_q
  console_path="$(vm_console_path "${vm_id}")"
  console_q="$(printf '%q' "${console_path}")"
  pattern_q="$(printf '%q' "${pattern}")"
  ssh_node "${node}" "bash -lc 'grep -F -- ${pattern_q} ${console_q} | tail -n1'"
}
# Wait until some qemu-system process on the node has the volume path in its
# command line (i.e. the volume is attached).  Dumps the qemu process list on
# timeout.
#   $1 - node    $2 - volume path    $3 - optional timeout
wait_for_qemu_volume_present() {
  local node="$1"
  local volume_path="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  # "[q]emu" keeps pgrep from matching the probe command itself.
  until ssh_node "${node}" "pgrep -fa '[q]emu-system' | grep -F '${volume_path}' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" || true
      die "timed out waiting for qemu to attach ${volume_path} on ${node}"
    fi
    sleep 2
  done
}
# Wait until no qemu-system process on the node references the volume path
# any more (i.e. the volume is detached).  Dumps the qemu process list on
# timeout.
#   $1 - node    $2 - volume path    $3 - optional timeout
wait_for_qemu_volume_absent() {
  local node="$1"
  local volume_path="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  until ssh_node "${node}" "bash -lc '! pgrep -fa \"[q]emu-system\" | grep -F \"${volume_path}\" >/dev/null'" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" || true
      die "timed out waiting for qemu to release ${volume_path} on ${node}"
    fi
    sleep 2
  done
}
# Single attempt to fetch a VM's state from PlasmaVMC via the local forward
# on port 15082; prints the raw JSON, propagates grpcurl's exit status.
#   $1 - bearer token    $2 - GetVm request JSON
try_get_vm_json() {
  local token="$1"
  local get_vm_json="$2"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/GetVm
}
wait_requested() {
  # Implement the `wait` command: resolve the requested node names (all
  # nodes when none given), verify them, run preflight checks, then block
  # until every node answers SSH.
  local requested
  mapfile -t requested < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${requested[@]}"
  preflight
  local vm
  for vm in "${requested[@]}"; do
    wait_for_ssh "${vm}"
  done
}
# Implement the `start` command: build (unless PHOTON_CLUSTER_SKIP_BUILD=1,
# in which case existing build links are required), then boot VMs and wait
# for SSH.  With no explicit node arguments the full cluster is started in
# the phases defined by NODE_PHASES — all VMs of a phase are launched, then
# all are awaited, before the next phase begins.  With explicit arguments
# the given nodes are started in one batch.
start_requested() {
  local nodes
  mapfile -t nodes < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${nodes[@]}"
  preflight
  if [[ "${CLUSTER_SKIP_BUILD}" == "1" ]]; then
    local node
    for node in "${nodes[@]}"; do
      [[ -L "$(build_link "${node}")" ]] || die "missing VM build link for ${node} while PHOTON_CLUSTER_SKIP_BUILD=1"
    done
    log "Skipping VM build because PHOTON_CLUSTER_SKIP_BUILD=1"
  else
    build_vms "${nodes[@]}"
  fi
  if [[ "$#" -eq 0 ]]; then
    # Full-cluster start: honor phase ordering (each phase is a
    # space-separated node list).
    local phase node
    for phase in "${NODE_PHASES[@]}"; do
      for node in ${phase}; do
        start_vm "${node}"
      done
      for node in ${phase}; do
        wait_for_ssh "${node}"
      done
    done
  else
    local node
    for node in "${nodes[@]}"; do
      start_vm "${node}"
    done
    for node in "${nodes[@]}"; do
      wait_for_ssh "${node}"
    done
  fi
}
# Wait for every systemd unit of the full cluster to be stably active, in
# dependency order: ChainFire+FlareDB on the control-plane trio, then region
# metadata, then IAM, then node01's service stack, then the worker and
# gateway units listed per node in NODE_UNITS.
validate_units() {
  local node unit
  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done
  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done
  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done
  for unit in prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost; do
    wait_for_unit node01 "${unit}"
  done
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
  done
  for unit in ${NODE_UNITS[node06]}; do
    wait_for_unit node06 "${unit}"
  done
}
# Storage-profile variant of validate_units: same control-plane ordering, but
# only the storage services on node01 and the worker units on node04/node05
# (no gateway/node06 stack).
validate_storage_units() {
  local node unit
  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done
  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done
  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done
  for unit in plasmavmc lightningstor coronafs; do
    wait_for_unit node01 "${unit}"
  done
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
  done
}
validate_storage_control_plane() {
  # Health-check the storage control plane: the full endpoint set on node01
  # (HTTP health pages, the CoronaFS API, and two raw TCP listeners), then
  # the ChainFire/FlareDB/IAM health pages on node02 and node03.
  local port
  for port in 8081 8082 8083 8084; do
    wait_for_http node01 "http://127.0.0.1:${port}/health"
  done
  wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 9000
  local node
  for node in node02 node03; do
    for port in 8081 8082 8083; do
      wait_for_http "${node}" "http://127.0.0.1:${port}/health"
    done
  done
}
# End-to-end validation of the control plane:
#   1. HTTP health pages, metrics endpoints, and raw TCP/UDP listeners across
#      node01..node03.
#   2. A ChainFire write replicated to all three nodes (any node may be the
#      leader for the PUT).
#   3. A FlareDB write observed via /scan on all three nodes.
#   4. A FlareDB strong-consistency CompareAndSwap over gRPC, read back from
#      the node that accepted the write.
validate_control_plane() {
  wait_for_http node01 http://127.0.0.1:8081/health
  wait_for_http node01 http://127.0.0.1:8082/health
  wait_for_http node01 http://127.0.0.1:8083/health
  wait_for_http node01 http://127.0.0.1:8087/health
  wait_for_http node01 http://127.0.0.1:8084/health
  wait_for_http node01 http://127.0.0.1:8085/health
  wait_for_http node02 http://127.0.0.1:8081/health
  wait_for_http node02 http://127.0.0.1:8082/health
  wait_for_http node02 http://127.0.0.1:8083/health
  wait_for_http node03 http://127.0.0.1:8081/health
  wait_for_http node03 http://127.0.0.1:8082/health
  wait_for_http node03 http://127.0.0.1:8083/health
  wait_for_tcp_port node01 50084
  wait_for_http node01 http://127.0.0.1:9097/metrics
  wait_for_udp_port node01 5353
  wait_for_tcp_port node01 50085
  wait_for_http node01 http://127.0.0.1:9098/metrics
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 50090
  # The S3-style endpoint may legitimately answer 403 before credentials.
  wait_for_http_status node01 http://127.0.0.1:9000 "200 403"
  wait_for_http node01 http://127.0.0.1:9099/metrics
  wait_for_http node01 http://127.0.0.1:9198/metrics
  log "Validating ChainFire replication across control-plane nodes"
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
leader=""
# Only the leader accepts the PUT (HTTP 200); try each node in turn.
for ip in "${nodes[@]}"; do
  code="$(curl -sS -o /tmp/chainfire-put.out -w '%{http_code}' \
    -X PUT "http://${ip}:8081/api/v1/kv/${key}" \
    -H 'Content-Type: application/json' \
    -d "{\"value\":\"${value}\"}" || true)"
  if [[ "${code}" == "200" ]]; then
    leader="${ip}"
    break
  fi
done
[[ -n "${leader}" ]]
curl -fsS http://10.100.0.11:8081/api/v1/cluster/status | jq -e '.data.term >= 1' >/dev/null
# Every node must converge on the written value within 30s.
for ip in "${nodes[@]}"; do
  deadline=$((SECONDS + 30))
  while true; do
    actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
    if [[ "${actual}" == "${value}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      echo "chainfire replication did not converge on ${ip}" >&2
      exit 1
    fi
    sleep 1
  done
done
EOS
  log "Validating FlareDB replication across control-plane nodes"
  wait_for_flaredb_region node01
  wait_for_flaredb_region node02
  wait_for_flaredb_region node03
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-flaredb-$(date +%s)"
value="ok-$RANDOM"
namespace="validation"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
writer=""
# Any node that accepts the PUT (HTTP 200) acts as the writer.
for ip in "${nodes[@]}"; do
  code="$(curl -sS -o /tmp/flaredb-put.out -w '%{http_code}' \
    -X PUT "http://${ip}:8082/api/v1/kv/${key}" \
    -H 'Content-Type: application/json' \
    -d "{\"value\":\"${value}\",\"namespace\":\"${namespace}\"}" || true)"
  if [[ "${code}" == "200" ]]; then
    writer="${ip}"
    break
  fi
done
[[ -n "${writer}" ]]
# Replication is verified via a range scan on each node (120s budget).
for ip in "${nodes[@]}"; do
  deadline=$((SECONDS + 120))
  while true; do
    actual="$(curl -fsS --get "http://${ip}:8082/api/v1/scan" \
      --data-urlencode "start=${key}" \
      --data-urlencode "end=${key}~" \
      --data-urlencode "namespace=${namespace}" 2>/dev/null \
      | jq -r '.data.items[0].value // empty' 2>/dev/null || true)"
    if [[ "${actual}" == "${value}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      echo "flaredb replication did not converge on ${ip}" >&2
      exit 1
    fi
    sleep 1
  done
done
EOS
  log "Validating FlareDB strong-consistency CAS on the control plane"
  local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
  ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
  ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="validation-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
# kvrpc carries keys/values as bytes, hence the base64 encoding.
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"default"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"default"}')"
writer=""
# expectedVersion:0 means create-if-absent; the node that reports success
# with a version >= 1 is the writer.
for ip in "${nodes[@]}"; do
  if grpcurl -plaintext \
    -import-path "${proto_root}" \
    -proto "${proto_root}/kvrpc.proto" \
    -d "${request}" \
    "${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-cas.out 2>/dev/null; then
    if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-cas.out >/dev/null; then
      writer="${ip}"
      break
    fi
  fi
done
[[ -n "${writer}" ]]
# Read-your-write on the accepting node must converge within 90s.
deadline=$((SECONDS + 90))
while true; do
  if grpcurl -plaintext \
    -import-path "${proto_root}" \
    -proto "${proto_root}/kvrpc.proto" \
    -d "${get_request}" \
    "${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-cas-get.out 2>/dev/null; then
    if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-cas-get.out >/dev/null; then
      break
    fi
  fi
  if (( SECONDS >= deadline )); then
    echo "flaredb strong CAS read did not converge on leader ${writer}" >&2
    exit 1
  fi
  sleep 1
done
EOS
}
validate_iam_flow() {
# Exercise the IAM control plane end to end: mint a project-admin token,
# confirm ValidateToken echoes the requested claims, then check that the
# authorizer allows an in-project action and denies a cross-project one.
log "Validating IAM token issuance, validation, and scoped authorization"
local iam_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
# Ensure the tunnel is torn down on any early return (set -e failure path).
trap 'stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="iam-smoke-org"
local project_id="iam-smoke-project"
local principal_id="iam-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Shared grpcurl invocation prefix for every IAM call below.
local -a iam_call=(grpcurl -plaintext -import-path "${IAM_PROTO_DIR}" -proto "${IAM_PROTO}")
# 1) The freshly issued token must validate and carry the expected claims.
"${iam_call[@]}" \
-d "$(jq -cn --arg token "${token}" '{token:$token}')" \
127.0.0.1:15080 iam.v1.IamToken/ValidateToken \
| jq -e --arg org "${org_id}" --arg project "${project_id}" --arg principal "${principal_id}" \
'.valid == true and .claims.orgId == $org and .claims.projectId == $project and .claims.principalId == $principal' >/dev/null
# 2) A bucket-create inside the principal's own project must be allowed.
"${iam_call[@]}" \
-d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"allow-check", orgId:$org, projectId:$project}}')" \
127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null
# 3) The same action against a different project must be denied.
"${iam_call[@]}" \
-d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"deny-check", orgId:$org, projectId:($project + "-other")}}')" \
127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
| jq -e '(.allowed // false) == false' >/dev/null
# Success path: clear the trap and close the tunnel explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_prismnet_flow() {
# Exercise the full PrismNet resource lifecycle over gRPC through SSH
# tunnels to node01: create/get/list a VPC, create/get a subnet inside
# it, create/get/list/update/delete a port, then tear everything down
# in reverse order. All calls carry a freshly issued project-admin token.
log "Validating PrismNet VPC, subnet, and port lifecycle"
local iam_tunnel="" prism_tunnel=""
# Local 15080 -> IAM (50080), local 15081 -> PrismNet (50081) on node01.
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
# Under set -e any failed check returns early; this trap closes both
# tunnels on that path. The happy path clears it at the bottom.
trap 'stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="prismnet-smoke-org"
local project_id="prismnet-smoke-project"
# Timestamped principal so repeated smoke runs never collide.
local principal_id="prismnet-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
local vpc_resp subnet_resp port_resp
local vpc_id subnet_id port_id
# --- VPC: create, then verify via Get and List ---
vpc_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg name "prismnet-smoke-vpc" \
'{orgId:$org, projectId:$project, name:$name, description:"smoke vpc", cidrBlock:"10.44.0.0/16"}')" \
127.0.0.1:15081 prismnet.VpcService/CreateVpc)"
vpc_id="$(printf '%s' "${vpc_resp}" | jq -r '.vpc.id')"
[[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "PrismNet CreateVpc did not return a VPC ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
127.0.0.1:15081 prismnet.VpcService/GetVpc \
| jq -e --arg id "${vpc_id}" '.vpc.id == $id' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, pageSize:100, pageToken:""}')" \
127.0.0.1:15081 prismnet.VpcService/ListVpcs \
| jq -e --arg id "${vpc_id}" '.vpcs | any(.id == $id)' >/dev/null
# --- Subnet: carve 10.44.10.0/24 out of the VPC and verify via Get ---
subnet_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"prismnet-smoke-subnet", description:"smoke subnet", cidrBlock:"10.44.10.0/24", gatewayIp:"10.44.10.1", dhcpEnabled:true}')" \
127.0.0.1:15081 prismnet.SubnetService/CreateSubnet)"
subnet_id="$(printf '%s' "${subnet_resp}" | jq -r '.subnet.id')"
[[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "PrismNet CreateSubnet did not return a subnet ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
127.0.0.1:15081 prismnet.SubnetService/GetSubnet \
| jq -e --arg id "${subnet_id}" '.subnet.id == $id' >/dev/null
# --- Port: empty ipAddress requests an auto-assigned address; the Get
# below asserts one was actually allocated ---
port_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"prismnet-smoke-port", description:"smoke port", ipAddress:""}')" \
127.0.0.1:15081 prismnet.PortService/CreatePort)"
port_id="$(printf '%s' "${port_resp}" | jq -r '.port.id')"
[[ -n "${port_id}" && "${port_id}" != "null" ]] || die "PrismNet CreatePort did not return a port ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
127.0.0.1:15081 prismnet.PortService/GetPort \
| jq -e --arg id "${port_id}" '.port.id == $id and (.port.ipAddress | length) > 0' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, deviceId:"", pageSize:100, pageToken:""}')" \
127.0.0.1:15081 prismnet.PortService/ListPorts \
| jq -e --arg id "${port_id}" '.ports | any(.id == $id)' >/dev/null
# Update must rename the port and flip adminStateUp off.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id, name:"prismnet-smoke-port-updated", description:"updated", securityGroupIds:[], adminStateUp:false}')" \
127.0.0.1:15081 prismnet.PortService/UpdatePort \
| jq -e '.port.name == "prismnet-smoke-port-updated" and (.port.adminStateUp // false) == false' >/dev/null
# --- Cleanup: delete port, subnet, then VPC (reverse creation order) ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${prism_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_flashdns_flow() {
# End-to-end FlashDNS check through SSH tunnels to node01: create a zone
# and an A record via the gRPC management API, poll the authoritative
# resolver until the record is served, then delete the record and
# force-delete the zone. Uses IAM (50080) and FlashDNS (50084).
log "Validating FlashDNS zone, record, and authoritative query flow"
local iam_tunnel="" dns_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
# Close both tunnels on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="flashdns-smoke-org"
local project_id="flashdns-smoke-project"
# Timestamped IDs keep repeated smoke runs from colliding.
local principal_id="flashdns-smoke-$(date +%s)"
local token zone_name zone_resp zone_id record_resp record_id fqdn
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
zone_name="smoke-$(date +%s).cluster.test"
# --- Zone: create, then verify via Get ---
zone_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.smoke.test", adminEmail:"admin@smoke.test"}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone)"
zone_id="$(printf '%s' "${zone_resp}" | jq -r '.zone.id')"
[[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "FlashDNS CreateZone did not return a zone ID"
# GetZone may canonicalize the name with a trailing dot; accept both.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${zone_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/GetZone \
| jq -e --arg id "${zone_id}" --arg name "${zone_name}" \
'.zone.id == $id and (.zone.name == $name or .zone.name == ($name + "."))' >/dev/null
# --- Record: A record "api" -> 10.100.0.11, verified via Get and List ---
record_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"api", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord)"
record_id="$(printf '%s' "${record_resp}" | jq -r '.record.id')"
[[ -n "${record_id}" && "${record_id}" != "null" ]] || die "FlashDNS CreateRecord did not return a record ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/GetRecord \
| jq -e --arg id "${record_id}" '.record.id == $id' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, nameFilter:"", typeFilter:"", pageSize:100, pageToken:""}')" \
127.0.0.1:15084 flashdns.v1.RecordService/ListRecords \
| jq -e --arg id "${record_id}" '.records | any(.id == $id)' >/dev/null
fqdn="api.${zone_name}"
# Poll the authoritative resolver on node01 (port 5353) until the new
# record is answered, bounded by HTTP_WAIT_TIMEOUT seconds.
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for authoritative FlashDNS answer for ${fqdn}"
fi
sleep 2
done
# --- Cleanup: delete the record, then force-delete the zone ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${dns_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_fiberlb_flow() {
# FiberLB end-to-end check via SSH tunnels to node01 (IAM on 50080,
# FiberLB management API on 50085): build LB -> pool -> backend ->
# listener, verify the listener serves /health, measure process CPU
# under a synthetic request burst, verify that disabling the backend
# stops traffic and re-enabling restores it, then delete all resources
# in reverse order of creation.
log "Validating FiberLB management API, runtime listeners, and backend failover behavior"
local iam_tunnel="" lb_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
# Close both tunnels if any check below fails early under set -e.
trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="fiberlb-smoke-org"
local project_id="fiberlb-smoke-project"
local principal_id="fiberlb-smoke-$(date +%s)"
local token lb_id pool_id backend_id listener_id listener_port
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Random port in 18080..18179 so repeated runs do not collide.
listener_port=$((18080 + (RANDOM % 100)))
lb_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-lb" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, description:"smoke lb"}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer \
| jq -r '.loadbalancer.id')"
[[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "FiberLB CreateLoadBalancer did not return an ID"
pool_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-pool" --arg lb "${lb_id}" '{name:$name, loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool \
| jq -r '.pool.id')"
[[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "FiberLB CreatePool did not return an ID"
# Backend targets 10.100.0.11:8081 — NOTE(review): assumes something is
# already answering /health there; the probes below depend on it.
backend_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-backend" --arg pool "${pool_id}" '{name:$name, poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend \
| jq -r '.backend.id')"
[[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "FiberLB CreateBackend did not return an ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/GetBackend \
| jq -e --arg id "${backend_id}" '.backend.id == $id' >/dev/null
listener_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-listener" --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:$name, loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener \
| jq -r '.listener.id')"
[[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "FiberLB CreateListener did not return an ID"
# The runtime must now bind the listener port and proxy /health.
wait_for_tcp_port node01 "${listener_port}"
wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
local fiberlb_pid fiberlb_peak_cpu load_pid settle_ok
# NOTE(review): assumes a single fiberlb process — pidof would return a
# space-separated list (breaking the top -p usage below) if it forked.
fiberlb_pid="$(ssh_node node01 'pidof fiberlb')"
[[ -n "${fiberlb_pid}" ]] || die "FiberLB process is not running on node01"
# Synthetic load: 256 health requests, 32-way parallel, run in the
# background so CPU can be sampled concurrently.
ssh_node node01 \
"bash -lc 'seq 1 256 | xargs -P 32 -I{} curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null'" &
load_pid=$!
sleep 1
# Sample %CPU (top column 9) five times at 1s intervals, keep the peak.
fiberlb_peak_cpu="$(ssh_node node01 "top -b -d 1 -n 5 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0; if (cpu > max) max = cpu } END { print max + 0 }'")"
wait "${load_pid}"
log "FiberLB peak CPU during synthetic load: ${fiberlb_peak_cpu}%"
# After the burst, CPU must drop below 20% within ~10 attempts (~20s+).
settle_ok=0
for _ in {1..10}; do
if ssh_node node01 \
"top -b -d 1 -n 2 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0 } END { exit !(cpu < 20.0) }'"; then
settle_ok=1
break
fi
sleep 2
done
[[ "${settle_ok}" -eq 1 ]] || die "FiberLB CPU did not settle after synthetic load"
# Disable the only backend: the listener must stop serving /health.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_DISABLED"}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
| jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_DISABLED"' >/dev/null
# Poll until the health check starts FAILING (traffic cut off).
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ! ssh_node node01 "curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for disabled FiberLB backend to stop serving traffic"
fi
sleep 2
done
# Re-enable the backend: traffic must flow again.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_ENABLED"}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
| jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_ENABLED"' >/dev/null
wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
# Cleanup: listener, backend, pool, LB (reverse creation order).
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${lb_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_k8shost_flow() {
# Composed-configuration ("matrix") validation for K8sHost: register a
# synthetic node and pod, create a LoadBalancer service, then verify the
# K8sHost controllers integrate with PrismNet (ClusterIP allocation),
# FiberLB (load balancer provisioning), and FlashDNS (service DNS
# record). Five SSH tunnels to node01 expose the services locally.
log "Validating K8sHost node, pod, service, and controller integrations"
local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel="" k8s_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
k8s_tunnel="$(start_ssh_tunnel node01 15087 50087)"
# Close every tunnel if any check below fails early under set -e.
trap 'stop_ssh_tunnel node01 "${k8s_tunnel}"; stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
# Uses the default org/project rather than per-run smoke IDs.
local org_id="default-org"
local project_id="default-project"
local principal_id="k8shost-smoke-$(date +%s)"
local token node_name pod_name service_name service_port
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Timestamped names keep repeated runs from colliding; random port in
# 18180..18279 for the LoadBalancer service.
node_name="smoke-node-$(date +%s)"
pod_name="smoke-pod-$(date +%s)"
service_name="smoke-svc-$(date +%s)"
service_port=$((18180 + (RANDOM % 100)))
# --- Node lifecycle: register, heartbeat, list ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${node_name}" --arg org "${org_id}" --arg project "${project_id}" '{node:{metadata:{name:$name, orgId:$org, projectId:$project}, spec:{podCidr:"10.244.0.0/24"}, status:{addresses:[{type:"InternalIP", address:"10.100.0.21"}], conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4", memory:"8192Mi"}, allocatable:{cpu:"4", memory:"8192Mi"}}}}')" \
127.0.0.1:15087 k8shost.NodeService/RegisterNode >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${node_name}" '{nodeName:$name, status:{conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4"}, allocatable:{cpu:"4"}}}')" \
127.0.0.1:15087 k8shost.NodeService/Heartbeat \
| jq -e '.success == true' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d '{}' \
127.0.0.1:15087 k8shost.NodeService/ListNodes \
| jq -e --arg name "${node_name}" '.items | any(.metadata.name == $name)' >/dev/null
# --- Pod: labeled app=k8shost-smoke so the service selector below
# matches it; reports pod/host IP 10.100.0.11 ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${pod_name}" --arg org "${org_id}" --arg project "${project_id}" '{pod:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project, labels:{app:"k8shost-smoke"}}, spec:{containers:[{name:"backend", image:"smoke", ports:[{containerPort:8081, protocol:"TCP"}]}]}, status:{phase:"Running", podIp:"10.100.0.11", hostIp:"10.100.0.11"}}}')" \
127.0.0.1:15087 k8shost.PodService/CreatePod >/dev/null
log "Matrix case: K8sHost + PrismNet"
# Ensure a ClusterIP service-IP pool exists in PrismNet, creating a
# default one (10.96.42.0/24) only if none is present.
local pools_json
pools_json="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
127.0.0.1:15081 prismnet.IpamService/ListServiceIPPools)"
if ! printf '%s' "${pools_json}" | jq -e '.pools | length > 0' >/dev/null; then
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, name:"default-cluster-ip-pool", description:"smoke-created default ClusterIP pool", cidrBlock:"10.96.42.0/24", poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
127.0.0.1:15081 prismnet.IpamService/CreateServiceIPPool >/dev/null
fi
log "Matrix case: K8sHost + PrismNet + FiberLB + FlashDNS"
# Create a type=LoadBalancer service selecting the smoke pod.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${service_name}" --arg org "${org_id}" --arg project "${project_id}" --argjson port "${service_port}" '{service:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project}, spec:{ports:[{name:"http", port:$port, targetPort:8081, protocol:"TCP"}], selector:{app:"k8shost-smoke"}, type:"LoadBalancer"}}}')" \
127.0.0.1:15087 k8shost.ServiceService/CreateService >/dev/null
local service_json cluster_ip lb_id record_id zone_id
# Poll GetService until the controllers have attached an ingress IP plus
# FiberLB lb-id and FlashDNS record-id annotations (or time out).
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
service_json="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.ServiceService/GetService 2>/dev/null || true)"
if [[ -n "${service_json}" ]] && printf '%s' "${service_json}" | jq -e '
.service.status.loadBalancer.ingress[0].ip != null and
.service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"] != null and
.service.metadata.annotations["flashdns.plasmacloud.io/record-id"] != null' >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for K8sHost controllers to provision service ${service_name}"
fi
sleep 2
done
# Pull the controller-assigned identifiers out of the service object.
# zone_id is captured but not used by the cleanup below.
cluster_ip="$(printf '%s' "${service_json}" | jq -r '.service.spec.clusterIp')"
lb_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"]')"
record_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/record-id"]')"
zone_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/zone-id"]')"
[[ -n "${cluster_ip}" && "${cluster_ip}" != "null" ]] || die "K8sHost service did not get a cluster IP"
[[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "K8sHost service did not get a FiberLB load balancer"
[[ -n "${record_id}" && "${record_id}" != "null" ]] || die "K8sHost service did not get a FlashDNS record"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn '{namespace:"default"}')" \
127.0.0.1:15087 k8shost.ServiceService/ListServices \
| jq -e --arg name "${service_name}" '.items | any(.metadata.name == $name)' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn '{namespace:"default", labelSelector:{app:"k8shost-smoke"}}')" \
127.0.0.1:15087 k8shost.PodService/ListPods \
| jq -e --arg name "${pod_name}" '.items | any(.metadata.name == $name)' >/dev/null
log "Matrix case: K8sHost + FlashDNS"
# The service FQDN must resolve to its ClusterIP via the authoritative
# resolver on node01:5353, within HTTP_WAIT_TIMEOUT seconds.
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_name}.default.svc.cluster.local A | grep -Fx '${cluster_ip}'" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for K8sHost FlashDNS record for ${service_name}"
fi
sleep 2
done
log "Matrix case: K8sHost + FiberLB"
# The provisioned FiberLB listener must serve the service port.
wait_for_http node01 "http://127.0.0.1:${service_port}/health"
# --- Cleanup: service, pod, then the controller-created DNS record and
# load balancer (via their own services' APIs) ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.ServiceService/DeleteService >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${pod_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.PodService/DeletePod >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null
# Happy path: clear the trap and close all tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${k8s_tunnel}"
stop_ssh_tunnel node01 "${lb_tunnel}"
stop_ssh_tunnel node01 "${dns_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_workers() {
# Worker readiness: health endpoint, LightningStor gRPC port, and
# metrics endpoint on both workers, plus the CoronaFS API on node01;
# then prove CoronaFS block exports are reachable from both workers.
local worker
for worker in node04 node05; do
wait_for_http "${worker}" http://127.0.0.1:8084/health
done
for worker in node04 node05; do
wait_for_tcp_port "${worker}" 50086
done
for worker in node04 node05; do
wait_for_http "${worker}" http://127.0.0.1:9098/metrics
done
wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
log "Validating CoronaFS block export accessibility on worker nodes"
local coronafs_tunnel=""
local probe_volume="worker-probe-$(date +%s)"
coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
# Close the tunnel on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${coronafs_tunnel}"' RETURN
# Create a throwaway 64 MiB volume and export it as a block device.
coronafs_create_volume 15088 "${probe_volume}" $((64 * 1024 * 1024)) >/dev/null
local probe_export_json probe_uri
probe_export_json="$(coronafs_export_volume_json 15088 "${probe_volume}")"
probe_uri="$(jq -r '.export.uri' <<<"${probe_export_json}")"
[[ -n "${probe_uri}" && "${probe_uri}" != "null" ]] || die "CoronaFS probe volume did not return an export URI"
# node04 writes to the export, node05 reads it back, then clean up.
run_remote_nbd_fio_json node04 "${probe_uri}" write 1M 32 >/dev/null
run_remote_nbd_dd_read_json node05 "${probe_uri}" 32 >/dev/null
coronafs_delete_volume 15088 "${probe_volume}"
stop_ssh_tunnel node01 "${coronafs_tunnel}"
trap - RETURN
}
validate_nested_kvm_workers() {
# Confirm each worker VM supports nested virtualization: the KVM module
# loads, /dev/kvm exists, the vCPU advertises VMX/SVM, and a minimal
# KVM-accelerated QEMU guest actually boots and stays alive.
log "Validating nested KVM inside worker VMs"
# Fix: declare the loop variable local so it does not leak into the
# caller's scope (sibling validators follow the same convention).
local node
for node in node04 node05; do
ssh_node_script "${node}" <<'EOS'
set -euo pipefail
# Load whichever vendor KVM module matches the virtual CPU; tolerate the
# module already being loaded or built in.
modprobe kvm_intel >/dev/null 2>&1 || modprobe kvm_amd >/dev/null 2>&1 || true
[[ -c /dev/kvm ]]
grep -Eq 'vmx|svm' /proc/cpuinfo
# Boot a throwaway guest with KVM acceleration using the worker's own
# kernel; -daemonize lets us poll the pidfile afterwards.
qemu-system-x86_64 \
-accel kvm \
-cpu host \
-machine q35 \
-m 256 \
-display none \
-nodefaults \
-no-reboot \
-daemonize \
-pidfile /tmp/nested-kvm.pid \
-serial file:/tmp/nested-kvm.log \
-kernel /run/current-system/kernel \
-append 'console=ttyS0' >/tmp/nested-kvm.cmd.log 2>&1
sleep 5
# The guest must still be alive five seconds in, then shut it down.
kill -0 "$(cat /tmp/nested-kvm.pid)"
kill "$(cat /tmp/nested-kvm.pid)"
EOS
done
}
validate_lightningstor_distributed_storage() {
# Prove that a LightningStor object written through node01 is replicated
# to node04/node05 and removed everywhere on delete: PUT/HEAD/GET/DELETE
# one probe object and compare per-node object counts before and after.
log "Validating distributed LightningStor object replication across node01/node04/node05"
local iam_tunnel="" ls_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
# Close both tunnels on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="smoke-org"
local project_id="smoke-project"
local principal_id="lightningstor-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Timestamped bucket; wait until it has write quorum before probing.
local bucket="dist-smoke-$(date +%s)"
ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"
wait_for_lightningstor_write_quorum 15086 "${token}" "${bucket}" "distributed LightningStor validation"
# Baseline per-node object counts (node01/node04/node05), read from the
# triplet helper via process substitution.
local before_node01 before_node04 before_node05
read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)
local key="replication-check-$(date +%s)"
local body="distributed-object-${key}"
local body_b64
body_b64="$(printf '%s' "${body}" | base64 -w0)"
local put_json head_json delete_json output
put_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg key "${key}" \
--arg body "${body_b64}" \
'{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
)"
# PUT the probe object (body is base64-encoded per the API contract).
log "LightningStor distributed replication: PUT ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${put_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject
)" || die "failed to write LightningStor distributed replication probe ${bucket}/${key}: ${output}"
head_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
# HEAD must report the exact byte size of the plaintext body.
log "LightningStor distributed replication: HEAD ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
)" || die "failed to head LightningStor distributed replication probe ${bucket}/${key}: ${output}"
printf '%s\n' "${output}" \
| jq -e --arg size "$(printf '%s' "${body}" | wc -c | awk '{print $1}')" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
|| die "LightningStor distributed replication probe ${bucket}/${key} returned unexpected metadata: ${output}"
local fetched_body
# GET streams the object back; head_json is reused here since GetObject
# takes the same {bucket, key} request shape.
log "LightningStor distributed replication: GET ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/GetObject
)" || die "failed to fetch LightningStor distributed replication probe ${bucket}/${key}: ${output}"
# Slurp the streamed response messages, base64-decode every bodyChunk,
# and join them back into the original payload.
# NOTE(review): the duplicate -r in "-rsr" is redundant but harmless.
fetched_body="$(printf '%s\n' "${output}" | jq -rsr '[.[] | .bodyChunk? | select(. != null) | @base64d] | join("")')" \
|| die "failed to decode LightningStor distributed replication probe ${bucket}/${key}: ${output}"
[[ "${fetched_body}" == "${body}" ]] || die "distributed LightningStor returned unexpected object payload"
# Replication check: all three nodes must now hold more objects than the
# baseline; after DELETE, counts must return to the baseline exactly.
wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "generic object replication"
delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
log "LightningStor distributed replication: DELETE ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${delete_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject
)" || die "failed to delete LightningStor distributed replication probe ${bucket}/${key}: ${output}"
wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "generic object cleanup"
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${ls_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_vm_storage_flow() {
  # End-to-end PlasmaVMC storage validation:
  #   1. import a bootable qcow2 guest image through ImageService (object
  #      backed by the LightningStor "plasmavmc-images" bucket),
  #   2. create and start a VM whose root + data disks are CoronaFS shared
  #      volumes, confirming guest boot markers on the serial console,
  #   3. stop/restart the VM, live-migrate it to the peer worker
  #      (node04 <-> node05), and restart it on the destination,
  #   4. delete the VM and image, asserting every artifact (runtime dir,
  #      CoronaFS volume files + metadata, LightningStor objects, replica
  #      object counts) is cleaned up.
  log "Validating PlasmaVMC image import, shared-volume execution, and live migration"
  # Local forwards into node01: IAM (50080), LightningStor (50086),
  # PlasmaVMC (50082), and the CoronaFS API.
  local iam_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  local image_source_path=""
  local node01_proto_root="/var/lib/plasmavmc/test-protos"
  # Cleanup fires on function RETURN (and is invoked once more explicitly at
  # the end); removing the staged image file is best-effort.
  cleanup_vm_storage_flow() {
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${coronafs_tunnel}"
    stop_ssh_tunnel node01 "${vm_tunnel}"
    stop_ssh_tunnel node01 "${ls_tunnel}"
    stop_ssh_tunnel node01 "${iam_tunnel}"
  }
  trap cleanup_vm_storage_flow RETURN
  wait_for_plasmavmc_workers_registered 15082
  local org_id="vm-smoke-org"
  local project_id="vm-smoke-project"
  local principal_id="plasmavmc-smoke-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC image import"
  local image_name="vm-image-$(date +%s)"
  local image_id=""
  local guest_image_local_path guest_image_sha guest_image_size remote_guest_image_sha
  local image_before_node01 image_before_node04 image_before_node05
  local image_after_node01 image_after_node04 image_after_node05
  # Baseline LightningStor object counts on the three replica nodes; the
  # replication waits below compare against these snapshots.
  read -r image_before_node01 image_before_node04 image_before_node05 < <(lightningstor_count_triplet)
  guest_image_local_path="$(guest_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  guest_image_size="$(stat -c %s "${guest_image_local_path}")"
  # Stage the guest image plus the proto files that remote grpcurl
  # invocations on node01 need.
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  ssh_node node01 "install -d -m 0755 ${node01_proto_root}/iam ${node01_proto_root}/plasmavmc ${node01_proto_root}/lightningstor"
  scp_to_node node01 "${IAM_PROTO}" "${node01_proto_root}/iam/iam.proto"
  scp_to_node node01 "${PLASMAVMC_PROTO}" "${node01_proto_root}/plasmavmc/plasmavmc.proto"
  scp_to_node node01 "${LIGHTNINGSTOR_PROTO}" "${node01_proto_root}/lightningstor/lightningstor.proto"
  # Drop stale imports from previous runs before staging the fresh one.
  ssh_node node01 "find /var/lib/plasmavmc/imports -maxdepth 1 -type f -name 'vm-image-*.qcow2' -delete"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  remote_guest_image_sha="$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")"
  [[ "${remote_guest_image_sha}" == "${guest_image_sha}" ]] || die "bootable VM guest image checksum mismatch after host distribution"
  local create_image_json
  log "Matrix case: PlasmaVMC + LightningStor"
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"smoke",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"smoke", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  local create_image_response
  # CreateImage runs on node01 itself (the file:// source URL refers to a
  # path on node01); the request JSON travels base64-encoded to survive
  # shell quoting across ssh.
  create_image_response="$(
    ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_image_json}" | base64 | tr -d '\n')" <<'EOS'
set -euo pipefail
proto_root="$1"
token="$2"
request_b64="$3"
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${proto_root}/plasmavmc" \
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
-d "${request_json}" \
127.0.0.1:50082 plasmavmc.v1.ImageService/CreateImage
EOS
  )"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "failed to create image through PlasmaVMC"
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE" and .format == "IMAGE_FORMAT_QCOW2"' >/dev/null
  # Object key layout used by the image store inside the bucket.
  local image_key="${org_id}/${project_id}/${image_id}.qcow2"
  local get_image_json
  get_image_json="$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')"
  # The image must be visible through both GetImage and ListImages.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/GetImage \
    | jq -e --arg image "${image_id}" '.id == $image and .status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" '{orgId:$org, pageSize:100, pageToken:"", includePublic:false}')" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/ListImages \
    | jq -e --arg image "${image_id}" '.images | any(.id == $image)' >/dev/null
  # The stored object's size must match the local guest image exactly.
  local head_image_json head_image_response
  head_image_json="$(jq -cn --arg bucket "plasmavmc-images" --arg key "${image_key}" '{bucket:$bucket, key:$key}')"
  head_image_response="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${head_image_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
  )" || die "failed to head imported PlasmaVMC image object ${image_key}: ${head_image_response}"
  printf '%s\n' "${head_image_response}" \
    | jq -e --arg size "${guest_image_size}" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
    || die "imported PlasmaVMC image object ${image_key} returned unexpected size: ${head_image_response}"
  local image_checksum
  image_checksum="$(printf '%s' "${create_image_response}" | jq -r '.checksum')"
  [[ -n "${image_checksum}" && "${image_checksum}" != "null" ]] || die "CreateImage response did not return an imported image checksum"
  # CreateImage computes the checksum from the normalized qcow2 artifact before upload.
  [[ "${image_checksum}" == "${guest_image_sha}" ]] || die "imported PlasmaVMC image checksum mismatch"
  # The staged source file is no longer needed once imported; clearing the
  # variable keeps cleanup from re-deleting it.
  ssh_node node01 "rm -f ${image_source_path}"
  image_source_path=""
  wait_for_lightningstor_counts_greater_than "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image import"
  # Post-import counts: subsequent VM operations must NOT change these
  # (shared-fs volumes never land in object storage).
  read -r image_after_node01 image_after_node04 image_after_node05 < <(lightningstor_count_triplet)
  # REST-shaped spec; within this flow only its generated .name is consumed
  # (fed into the gRPC request below).
  local create_vm_rest_json
  create_vm_rest_json="$(
    jq -cn \
      --arg name "smoke-vm-$(date +%s)" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        org_id:$org,
        project_id:$project,
        hypervisor:"kvm",
        vcpus:1,
        memory_mib:1024,
        disks:[
          {
            id:"root",
            source:{type:"image", image_id:$image_id},
            size_gib:4,
            boot_index:1
          },
          {
            id:"data",
            source:{type:"blank"},
            size_gib:2
          }
        ]
      }'
  )"
  local create_vm_grpc_json
  create_vm_grpc_json="$(
    jq -cn \
      --arg name "$(printf '%s' "${create_vm_rest_json}" | jq -r '.name')" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        orgId:$org,
        projectId:$project,
        hypervisor:"HYPERVISOR_TYPE_KVM",
        spec:{
          cpu:{vcpus:1, coresPerSocket:1, sockets:1},
          memory:{sizeMib:1024},
          disks:[
            {
              id:"root",
              source:{imageId:$image_id},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE",
              bootIndex:1
            },
            {
              id:"data",
              source:{blank:true},
              sizeGib:2,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE"
            }
          ]
        }
      }'
  )"
  local create_response vm_id
  create_response="$(
    ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_vm_grpc_json}" | base64 | tr -d '\n')" <<'EOS'
set -euo pipefail
proto_root="$1"
token="$2"
request_b64="$3"
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${proto_root}/plasmavmc" \
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
-d "${request_json}" \
127.0.0.1:50082 plasmavmc.v1.VmService/CreateVm
EOS
  )"
  vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
  [[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "failed to create VM through PlasmaVMC"
  local get_vm_json
  get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
  # Wait for the scheduler to place the VM on one of the two workers.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local node_id=""
  local peer_node=""
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
      fi
      sleep 2
      continue
    fi
    node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
    if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
    fi
    sleep 2
  done
  # peer_node is the other worker -- the live-migration destination later.
  if [[ "${node_id}" == "node04" ]]; then
    peer_node="node05"
  else
    peer_node="node04"
  fi
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  # Wait for desired AND actual state to converge on RUNNING.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to reach RUNNING"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to reach RUNNING"
    fi
    sleep 2
  done
  log "Matrix case: PlasmaVMC + CoronaFS"
  # Both disks are provisioned as CoronaFS volumes named <vm>-root/<vm>-data,
  # materialized as .raw files under the shared volume root on node01.
  local volume_id="${vm_id}-root"
  local data_volume_id="${vm_id}-data"
  local volume_path="${CORONAFS_VOLUME_ROOT}/${volume_id}.raw"
  local data_volume_path="${CORONAFS_VOLUME_ROOT}/${data_volume_id}.raw"
  local volume_export_json data_volume_export_json volume_uri data_volume_uri
  volume_export_json="$(coronafs_export_volume_json 15088 "${volume_id}")"
  data_volume_export_json="$(coronafs_export_volume_json 15088 "${data_volume_id}")"
  volume_uri="$(printf '%s' "${volume_export_json}" | jq -r '.export.uri')"
  data_volume_uri="$(printf '%s' "${data_volume_export_json}" | jq -r '.export.uri')"
  [[ -n "${volume_uri}" && "${volume_uri}" != "null" ]] || die "CoronaFS root volume export URI missing"
  [[ -n "${data_volume_uri}" && "${data_volume_uri}" != "null" ]] || die "CoronaFS data volume export URI missing"
  ssh_node node01 "test -f ${volume_path}"
  ssh_node node01 "test -f ${data_volume_path}"
  # The running QEMU on the scheduled worker must reference both exports.
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  # Booting from shared volumes must not add objects to LightningStor.
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM startup"
  # Guest boot markers: count=N tracks how many boots the persistent disks
  # have observed.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=1"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=1"
  log "Matrix case: PlasmaVMC + CoronaFS + LightningStor"
  # Stop/restart cycle: state persists across boots on the shared volumes.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop after first boot"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop after first boot"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      # The restart may land on either worker; re-read the placement.
      node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
    fi
    sleep 2
  done
  if [[ "${node_id}" == "node04" ]]; then
    peer_node="node05"
  else
    peer_node="node04"
  fi
  # count=2 proves the disks survived the stop/start cycle.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=2"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=2"
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM restart"
  # Live migration to the peer worker, waiting server-side for completion.
  local migrate_vm_json
  migrate_vm_json="$(
    jq -cn \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg vm "${vm_id}" \
      --arg destination_node "${peer_node}" \
      '{
        orgId:$org,
        projectId:$project,
        vmId:$vm,
        destinationNodeId:$destination_node,
        timeoutSeconds:120,
        wait:true
      }'
  )"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${migrate_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/MigrateVm >/dev/null
  local source_node="${node_id}"
  local destination_node="${peer_node}"
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} live migration to ${destination_node}"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e --arg node "${destination_node}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} live migration to ${destination_node}"
    fi
    sleep 2
  done
  node_id="${destination_node}"
  # Volumes must now be attached on the destination and released on the
  # source.
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  wait_for_qemu_volume_absent "${source_node}" "${volume_uri}"
  wait_for_qemu_volume_absent "${source_node}" "${data_volume_uri}"
  # Heartbeat count unchanged at 2: the guest kept running (live migration,
  # not a reboot).
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_HEARTBEAT count=2"
  # Stop and restart on the migrated worker to prove placement sticks.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop after live migration"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop after live migration"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e --arg node "${node_id}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
    fi
    sleep 2
  done
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  # count=3: third boot, still on the same persistent shared volumes.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=3"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=3"
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM post-migration restart"
  # Teardown: stop, delete, and verify full cleanup everywhere.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/DeleteVm >/dev/null
  # Deletion has propagated once GetVm starts failing.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ! grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PLASMAVMC_PROTO_DIR}" \
      -proto "${PLASMAVMC_PROTO}" \
      -d "${get_vm_json}" \
      127.0.0.1:15082 plasmavmc.v1.VmService/GetVm >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} deletion to propagate"
    fi
    sleep 2
  done
  # Runtime dir gone on the worker; volume files and CoronaFS metadata gone.
  ssh_node "${node_id}" "bash -lc '[[ ! -d $(printf '%q' "$(vm_runtime_dir_path "${vm_id}")") ]]'"
  ssh_node node01 "bash -lc '[[ ! -f ${volume_path} ]]'"
  ssh_node node01 "bash -lc '[[ ! -f ${data_volume_path} ]]'"
  if coronafs_get_volume_json 15088 "${volume_id}" >/dev/null 2>&1; then
    die "CoronaFS root volume metadata still exists after VM deletion"
  fi
  if coronafs_get_volume_json 15088 "${data_volume_id}" >/dev/null 2>&1; then
    die "CoronaFS data volume metadata still exists after VM deletion"
  fi
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM deletion"
  # The image object itself must survive VM deletion...
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${head_image_json}" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject \
    | jq -e '(.object.size | tonumber) > 0' >/dev/null
  # ...while the VM's volumes must never have been written to object storage.
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${volume_id}.raw" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "shared-fs VM volume unexpectedly persisted to LightningStor object storage"
  fi
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${data_volume_id}.raw" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "shared-fs VM data volume unexpectedly persisted to LightningStor object storage"
  fi
  # Deleting the image must remove its backing object and restore the
  # baseline replica counts.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${head_image_json}" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "image object still present after ImageService/DeleteImage"
  fi
  wait_for_lightningstor_counts_equal "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image cleanup"
  trap - RETURN
  cleanup_vm_storage_flow
}
validate_gateway() {
  # Verify gateway endpoints answer inside node06 and via host forwarding,
  # then drive the VPC/VM proxy routes through a create/list/get/delete
  # round-trip using a freshly minted project-admin token.
  wait_for_http node06 http://127.0.0.1:8080/health
  wait_for_http node06 http://127.0.0.1:9090/api/v1/series
  wait_for_tcp_port node06 50089
  wait_for_http node06 http://127.0.0.1:3011/health
  log "Validating host-forwarded gateway endpoints"
  wait_for_host_http http://127.0.0.1:8080/health
  wait_for_host_http http://127.0.0.1:9090/api/v1/series
  log "Validating API Gateway proxy routes"
  local gw_iam_tunnel=""
  gw_iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  trap 'stop_ssh_tunnel node01 "${gw_iam_tunnel}"' RETURN
  local org_id="gateway-smoke-org"
  local project_id="gateway-smoke-project"
  local principal_id="gateway-smoke-$(date +%s)"
  local token create_payload create_response created_vpc_id
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  create_payload="$(jq -cn --arg name "gateway-smoke-vpc" --arg org "${org_id}" --arg project "${project_id}" \
    '{name:$name, org_id:$org, project_id:$project, cidr_block:"10.55.0.0/16", description:"gateway proxy smoke"}')"
  create_response="$(
    curl -fsS \
      -X POST http://127.0.0.1:8080/api/v1/vpcs \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "${create_payload}"
  )"
  created_vpc_id="$(printf '%s' "${create_response}" | jq -r '.data.id')"
  [[ -n "${created_vpc_id}" && "${created_vpc_id}" != "null" ]] || die "API Gateway VPC create did not return an ID"
  # The listing endpoint must include the VPC we just created.
  curl -fsS --get http://127.0.0.1:8080/api/v1/vpcs \
    -H "Authorization: Bearer ${token}" \
    --data-urlencode "org_id=${org_id}" \
    --data-urlencode "project_id=${project_id}" \
    | jq -e --arg id "${created_vpc_id}" '.data.vpcs | any(.id == $id)' >/dev/null
  # A direct GET must return the same VPC by ID.
  curl -fsS http://127.0.0.1:8080/api/v1/vpcs/"${created_vpc_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg id "${created_vpc_id}" '.data.id == $id' >/dev/null
  # VM listing must proxy through and yield a non-null collection.
  curl -fsS http://127.0.0.1:8080/api/v1/vms \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.vms != null' >/dev/null
  curl -fsS -X DELETE http://127.0.0.1:8080/api/v1/vpcs/"${created_vpc_id}" \
    -H "Authorization: Bearer ${token}" >/dev/null
  trap - RETURN
  stop_ssh_tunnel node01 "${gw_iam_tunnel}"
}
validate_nightlight_flow() {
  # Push one uniquely named sample via the remote_write helper, then verify
  # it becomes visible through the query, label-values, and series endpoints
  # within HTTP_WAIT_TIMEOUT seconds.
  log "Validating NightLight remote_write ingestion and query endpoints"
  local metric_name="nightlight_smoke_metric_$(date +%s)"
  local metric_value
  metric_value="$(awk 'BEGIN{srand(); printf "%.3f\n", (rand()*100)+1}')"
  python3 "${REPO_ROOT}/nix/test-cluster/nightlight_remote_write.py" \
    --url http://127.0.0.1:9090/api/v1/write \
    --metric "${metric_name}" \
    --value "${metric_value}" \
    --label source=smoke \
    --label cluster=photoncloud
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    # Prometheus-style query responses encode the sample value .value[1] as
    # a JSON string; convert with tonumber before range-comparing, because
    # under jq's total ordering every string sorts above every number, so a
    # raw string-vs-number `<=` check can never be true. tonumber is the
    # identity for values already emitted as numbers.
    if curl -fsS --get http://127.0.0.1:9090/api/v1/query \
      --data-urlencode "query=${metric_name}{source=\"smoke\"}" \
      | jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
          .status == "success"
          and (.data.result | length) >= 1
          and (.data.result | any(
            .metric.__name__ == $name
            and ((.value[1] | tonumber) >= ($expected - 0.001))
            and ((.value[1] | tonumber) <= ($expected + 0.001))
          ))
        ' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for NightLight query result for ${metric_name}"
    fi
    sleep 2
  done
  # The new metric name must also appear in label values and series listings.
  curl -fsS http://127.0.0.1:9090/api/v1/label/__name__/values \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | index($name)) != null' >/dev/null
  curl -fsS http://127.0.0.1:9090/api/v1/series \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.__name__ == $name))' >/dev/null
}
validate_creditservice_flow() {
  # Exercise CreditService over both transports: wallet lifecycle
  # (create -> topup -> reserve -> commit -> transaction history) via REST
  # on node06:3011, quota set/get/list/check via gRPC on node06:50089, then
  # confirm on node06 that the service is active and logged its IAM
  # connection attempt.
  log "Validating CreditService REST and gRPC quota flows"
  local iam_tunnel="" credit_grpc_tunnel="" credit_http_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  credit_grpc_tunnel="$(start_ssh_tunnel node06 15089 50089)"
  credit_http_tunnel="$(start_ssh_tunnel node06 13011 3011)"
  trap 'stop_ssh_tunnel node06 "${credit_http_tunnel}"; stop_ssh_tunnel node06 "${credit_grpc_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  # Timestamp suffix keeps org/project unique across repeated smoke runs.
  local suffix
  suffix="$(date +%s)"
  local org_id="credit-smoke-org-${suffix}"
  local project_id="credit-smoke-project-${suffix}"
  local principal_id="credit-smoke-$(date +%s)"
  local token reservation_json reservation_id
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  # Create wallet with 1000 credits; balance and available must match.
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/wallets \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{org_id:$org, project_id:$project, initial_balance:1000}')" \
    | jq -e '.data.project_id != null and .data.balance == 1000 and .data.available == 1000' >/dev/null
  curl -fsS http://127.0.0.1:13011/api/v1/wallets/"${project_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg project "${project_id}" '.data.project_id == $project and .data.balance == 1000' >/dev/null
  # Topup 250 -> balance 1250.
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/wallets/"${project_id}"/topup \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"amount":250,"description":"smoke topup"}' \
    | jq -e '.data.balance == 1250 and .data.total_deposited == 1250' >/dev/null
  # Reserve 200 with a short TTL, then commit only 150 of it:
  # 1250 - 150 = 1100, with the reservation fully released.
  reservation_json="$(
    curl -fsS \
      -X POST http://127.0.0.1:13011/api/v1/reservations \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "$(jq -cn --arg project "${project_id}" '{project_id:$project, amount:200, description:"smoke reservation", resource_type:"vm", ttl_seconds:120}')"
  )"
  reservation_id="$(printf '%s' "${reservation_json}" | jq -r '.data.id')"
  [[ -n "${reservation_id}" && "${reservation_id}" != "null" ]] || die "CreditService reservation did not return an ID"
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/reservations/"${reservation_id}"/commit \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"actual_amount":150,"resource_id":"smoke-vm"}' \
    | jq -e '.data.balance == 1100 and .data.reserved == 0 and .data.available == 1100' >/dev/null
  # Create + topup + commit must have produced at least three transactions.
  curl -fsS http://127.0.0.1:13011/api/v1/wallets/"${project_id}"/transactions \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.transactions | length >= 3' >/dev/null
  # Quota flow over gRPC. The `== "2" or == 2` checks tolerate proto JSON
  # rendering int64 fields as either string or number.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", limit:2}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/SetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE"}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/GetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/ListQuotas \
    | jq -e '.quotas | length >= 1' >/dev/null
  # Requesting quantity 3 against a limit of 2 must be denied.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", quantity:3, estimatedCost:0}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/CheckQuota \
    | jq -e '(.allowed // false) == false and (.availableQuota == "2" or .availableQuota == 2)' >/dev/null
  # On-node sanity: unit active and IAM connection attempt in the journal.
  ssh_node_script node06 <<'EOS'
set -euo pipefail
systemctl is-active --quiet creditservice.service
journalctl -u creditservice.service --no-pager | grep -F 'Connecting to IAM server at' >/dev/null
EOS
  trap - RETURN
  stop_ssh_tunnel node06 "${credit_http_tunnel}"
  stop_ssh_tunnel node06 "${credit_grpc_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_deployer_flow() {
  # Drive the Deployer through its three smoke paths: the health probe, an
  # admin-token node registration + listing, and the bootstrap-token
  # phone-home handshake that returns the node's provisioning config.
  log "Validating Deployer health, admin registration, and phone-home flows"
  local dep_tunnel=""
  dep_tunnel="$(start_ssh_tunnel node06 13012 8088)"
  trap 'stop_ssh_tunnel node06 "${dep_tunnel}"' RETURN
  wait_for_http node06 "http://127.0.0.1:8088/health"
  curl -fsS http://127.0.0.1:13012/health | grep -Fx 'OK' >/dev/null
  local smoke_machine smoke_node register_payload phone_home_payload phone_home_response
  smoke_machine="smoke-machine-$(date +%s)"
  smoke_node="smoke-node-$(date +%s)"
  register_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, role:"worker", ip:"10.100.0.250", services:["plasmavmc"], ssh_authorized_keys:["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFiberLBSmokeKey smoke@test"]}')"
  # Registration must echo back both identifiers.
  curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-admin-token' \
    -d "${register_payload}" \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg machine "${smoke_machine}" --arg node "${smoke_node}" '.success == true and .machine_id == $machine and .node_id == $node' >/dev/null
  # The admin listing must show the node with the attributes we registered.
  curl -fsS \
    -H 'x-deployer-token: test-admin-token' \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg node "${smoke_node}" '.nodes | any(.node_id == $node and .ip == "10.100.0.250" and .role == "worker")' >/dev/null
  phone_home_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, hostname:$node, ip:"10.100.0.250", metadata:{rack:"smoke-a1"}}')"
  phone_home_response="$(curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-bootstrap-token' \
    -d "${phone_home_payload}" \
    http://127.0.0.1:13012/api/v1/phone-home)"
  # Phone-home must hand back the provisioning config registered above.
  printf '%s' "${phone_home_response}" | jq -e --arg node "${smoke_node}" '
    .success == true and
    .node_id == $node and
    .state == "provisioning" and
    .node_config.hostname == $node and
    .node_config.role == "worker" and
    (.node_config.services | index("plasmavmc")) != null
  ' >/dev/null
  trap - RETURN
  stop_ssh_tunnel node06 "${dep_tunnel}"
}
# Drive the native (non-VM) runtime orchestration path end-to-end:
# deployer apply -> scheduler placement -> node-agent launching a process
# service (python http.server) and a container service (nginx via podman)
# on the workers, plus DNS + FiberLB publication. Then exercise four
# lifecycle phases: drain node04, restore node04, hard-stop node05
# (failover), and restart node05 (placement stability).
# Globals read: HTTP_WAIT_TIMEOUT, NODE_IPS, FIBERLB_PROTO_DIR, FIBERLB_PROTO, TMPDIR.
validate_native_runtime_flow() {
  log "Validating native deployer + scheduler runtime orchestration"
  # Workers node04/node05 run node-agent; node06 hosts the fleet scheduler.
  wait_for_unit node04 node-agent
  wait_for_unit node05 node-agent
  wait_for_unit node06 fleet-scheduler
  wait_for_http node06 "http://127.0.0.1:8088/health"
  local tmp_dir native_config drained_config restored_config
  local chainfire_tunnel_node01="" chainfire_tunnel_node02="" chainfire_tunnel_node03=""
  # deployer-ctl reaches the three-member chainfire quorum via local
  # port-forwards (12379/12380/12381 -> node01/02/03:2379).
  local chainfire_endpoint="http://127.0.0.1:12379,http://127.0.0.1:12380,http://127.0.0.1:12381"
  local iam_tunnel="" lb_tunnel="" token lb_name
  local native_fresh_healthy_map_expr native_fresh_healthy_count_expr
  tmp_dir="$(mktemp -d -p "${TMPDIR:-/tmp}" photon-native-runtime-XXXXXX)"
  native_config="${tmp_dir}/native-runtime.yaml"
  drained_config="${tmp_dir}/native-runtime-drained.yaml"
  restored_config="${tmp_dir}/native-runtime-restored.yaml"
  # jq filter selecting instances that are state == "healthy" AND whose
  # last_heartbeat (falling back to observed_at) timestamp, normalised to
  # strict ISO-8601 (fractional seconds stripped, "+00:00" rewritten to
  # "Z"), is at most 300 seconds old. Unparseable timestamps coerce to 0
  # and are therefore treated as stale.
  native_fresh_healthy_map_expr='map(select(.state == "healthy" and (((((.last_heartbeat // .observed_at) // "") | sub("\\.[0-9]+"; "") | sub("\\+00:00$"; "Z") | fromdateiso8601?) // 0) >= (now - 300))))'
  native_fresh_healthy_count_expr="${native_fresh_healthy_map_expr} | length"
  chainfire_tunnel_node01="$(start_ssh_tunnel node01 12379 2379 "${NODE_IPS[node01]}")"
  chainfire_tunnel_node02="$(start_ssh_tunnel node02 12380 2379 "${NODE_IPS[node02]}")"
  chainfire_tunnel_node03="$(start_ssh_tunnel node03 12381 2379 "${NODE_IPS[node03]}")"
  # Best-effort cleanup of tunnels and scratch dir if the function returns
  # early; the normal path clears this trap and cleans up explicitly below.
  trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"; stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"; stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"; stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"; rm -rf "${tmp_dir}"' RETURN
  # Dump the raw JSON values stored under a chainfire key prefix, one per line.
  native_dump_values() {
    local prefix="$1"
    run_deployer_ctl \
      --chainfire-endpoint "${chainfire_endpoint}" \
      --cluster-id "test-cluster" \
      --cluster-namespace "photoncloud" \
      --deployer-namespace "deployer" \
      dump --prefix "${prefix}" --format json \
      | jq -rc '.value'
  }
  # Poll a chainfire prefix until the jq aggregate over its values equals
  # the expected string (string comparison), or die after the timeout.
  wait_for_native_dump_count() {
    local prefix="$1"
    local jq_expr="$2"
    local expected="$3"
    local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    while true; do
      local count
      # A failed dump/parse counts as 0 rather than aborting the poll.
      count="$(
        native_dump_values "${prefix}" \
          | sed '/^$/d' \
          | jq -sr "${jq_expr}" 2>/dev/null \
          || printf '0'
      )"
      if [[ "${count}" == "${expected}" ]]; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for prefix ${prefix} to satisfy ${jq_expr} == ${expected} (got ${count})"
      fi
      sleep 2
    done
  }
  # Print the first (sorted by instance_id) fresh healthy instance record
  # for a service, as a JSON object (or "null" if none).
  native_first_healthy_instance() {
    local service="$1"
    native_dump_values "photoncloud/clusters/test-cluster/instances/${service}/" \
      | sed '/^$/d' \
      | jq -sr "${native_fresh_healthy_map_expr} | sort_by(.instance_id) | first"
  }
  # Poll until the first fresh healthy instance of a service reports the
  # expected node_id; prints the winning instance record on success.
  wait_for_native_instance_node() {
    local service="$1"
    local expected_node="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    local instance_value="" node_id=""
    while true; do
      instance_value="$(native_first_healthy_instance "${service}")"
      node_id="$(printf '%s' "${instance_value}" | jq -r '.node_id // empty')"
      if [[ "${node_id}" == "${expected_node}" ]]; then
        printf '%s' "${instance_value}"
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for ${service} to run on ${expected_node}"
      fi
      sleep 2
    done
  }
  # Print the publication record for the native-web service (or "null").
  native_publication_state() {
    native_dump_values "photoncloud/clusters/test-cluster/publications/" \
      | sed '/^$/d' \
      | jq -sr 'map(select(.service == "native-web")) | first'
  }
  # Poll node01's DNS resolver (port 5353) until fqdn resolves to exactly
  # the expected A record.
  wait_for_native_dns_record() {
    local fqdn="$1"
    local expected_ip="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    while true; do
      if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '${expected_ip}'" >/dev/null 2>&1; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native DNS record for ${fqdn}"
      fi
      sleep 2
    done
  }
  # Poll FiberLB ListBackends (via the node01 tunnel on 15085) until the
  # pool holds exactly expected_count backends AND every remaining
  # positional argument appears among the backend addresses.
  # NOTE: relies on ${token} being set by the enclosing function first.
  wait_for_native_lb_backends() {
    local pool_id="$1"
    local expected_count="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    shift 3
    local deadline=$((SECONDS + timeout))
    local response=""
    while true; do
      # Transient RPC failures leave response empty and simply retry.
      response="$(
        grpcurl_capture -plaintext \
          -H "authorization: Bearer ${token}" \
          -import-path "${FIBERLB_PROTO_DIR}" \
          -proto "${FIBERLB_PROTO}" \
          -d "$(jq -cn --arg pool_id "${pool_id}" '{poolId:$pool_id, pageSize:100, pageToken:""}')" \
          127.0.0.1:15085 fiberlb.v1.BackendService/ListBackends
      )" || true
      if printf '%s' "${response}" \
        | jq -e --argjson expected "${expected_count}" '(.backends | length) == $expected' >/dev/null 2>&1; then
        local matched=1
        local expected_ip
        for expected_ip in "$@"; do
          if ! printf '%s' "${response}" | jq -e --arg ip "${expected_ip}" '.backends | any(.address == $ip)' >/dev/null 2>&1; then
            matched=0
            break
          fi
        done
        if [[ "${matched}" == "1" ]]; then
          return 0
        fi
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native FiberLB backends for pool ${pool_id}: ${response}"
      fi
      sleep 2
    done
  }
  # Desired state #1: both workers provisioning (deployer is expected to
  # activate them), native-web at 2 replicas spread by failure_domain,
  # native-container at 1 replica. Heredoc is single-quoted so the
  # ${INSTANCE_*} placeholders reach the runtime unexpanded.
  cat >"${native_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: provisioning
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: provisioning
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # Desired state #2: node04 draining, node05 active, native-web scaled
  # down to 1 replica — everything should consolidate onto node05.
  cat >"${drained_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: draining
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: active
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # Desired state #3: both workers active again, still 1 web replica —
  # the scheduler should NOT reshuffle instances back onto node04.
  cat >"${restored_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: active
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: active
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # --- Phase 1: initial apply; expect both workers active, two fresh
  # healthy web replicas and one container instance. ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${native_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.labels.runtime == "native" and .state == "active")) | length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    360
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    360
  # The process service must answer directly on both workers.
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  local container_value container_node container_ip container_port
  container_value="$(native_first_healthy_instance "native-container")"
  container_node="$(printf '%s' "${container_value}" | jq -r '.node_id')"
  container_ip="$(printf '%s' "${container_value}" | jq -r '.ip')"
  container_port="$(printf '%s' "${container_value}" | jq -r '.port')"
  [[ -n "${container_node}" && "${container_node}" != "null" ]] || die "native-container did not report a healthy instance"
  wait_for_http "${container_node}" "http://${container_ip}:${container_port}/" 360
  # The published LB listener on node01 fronts the web replicas.
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/publications/" \
    'map(select(.service == "native-web")) | length' \
    "1" \
    180
  # IAM/FiberLB tunnels and a project-admin token for publication checks.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  token="$(issue_project_admin_token 15080 "native-services" "test-cluster" "native-runtime-$(date +%s)")"
  lb_name="test-cluster-native-web"
  # The publication must have materialised as a FiberLB load balancer.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn '{orgId:"native-services", projectId:"test-cluster", pageSize:100, pageToken:""}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/ListLoadBalancers \
    | jq -e --arg name "${lb_name}" '.loadbalancers | any(.name == $name)' >/dev/null
  local publication_value publication_fqdn publication_ip publication_pool_id
  publication_value="$(native_publication_state)"
  publication_fqdn="$(printf '%s' "${publication_value}" | jq -r '.dns.fqdn')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  [[ -n "${publication_fqdn}" && "${publication_fqdn}" != "null" ]] || die "native publication missing fqdn"
  [[ -n "${publication_ip}" && "${publication_ip}" != "null" ]] || die "native publication missing dns value"
  [[ -n "${publication_pool_id}" && "${publication_pool_id}" != "null" ]] || die "native publication missing pool id"
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
  # --- Phase 2: drain node04; all instances must consolidate on node05
  # and the LB pool must shrink to the single remaining backend. ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${drained_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local drained_web_value drained_web_node drained_container_value drained_container_node
  drained_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
  drained_web_node="$(printf '%s' "${drained_web_value}" | jq -r '.node_id')"
  [[ "${drained_web_node}" == "node05" ]] || die "native-web did not relocate to node05 after draining node04"
  drained_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  drained_container_node="$(printf '%s' "${drained_container_value}" | jq -r '.node_id')"
  [[ "${drained_container_node}" == "node05" ]] || die "native-container did not relocate to node05 after draining node04"
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18192/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # Publication state may have been re-written; re-read ids before checking.
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  # --- Phase 3: node04 back to active; placement must stay on node05
  # (no gratuitous rescheduling). ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${restored_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local restored_web_value restored_web_node restored_container_value restored_container_node
  restored_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
  restored_web_node="$(printf '%s' "${restored_web_value}" | jq -r '.node_id')"
  [[ "${restored_web_node}" == "node05" ]] || die "native-web unexpectedly moved after node04 returned to service"
  restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')"
  [[ "${restored_container_node}" == "node05" ]] || die "native-container unexpectedly moved after node04 returned to service"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # --- Phase 4: hard-stop node05; instances must fail over to node04 and
  # LB/DNS must follow. ---
  log "Simulating native worker loss and scheduler failover"
  stop_vm node05
  wait_for_ssh_down node05 120
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local failover_web_value failover_web_node failover_container_value failover_container_node
  failover_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
  failover_web_node="$(printf '%s' "${failover_web_value}" | jq -r '.node_id')"
  [[ "${failover_web_node}" == "node04" ]] || die "native-web did not fail over to node04 after node05 stopped"
  failover_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  failover_container_node="$(printf '%s' "${failover_container_value}" | jq -r '.node_id')"
  [[ "${failover_container_node}" == "node04" ]] || die "native-container did not fail over to node04 after node05 stopped"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 240 10.100.0.21
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node04 "http://10.100.0.21:18192/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # --- Phase 5: restart node05; placement must stay pinned to node04
  # (no churn back to the recovered worker). ---
  log "Restarting native worker and ensuring placement stays stable"
  start_vm node05
  wait_for_ssh node05
  wait_for_unit node05 plasmavmc
  wait_for_unit node05 lightningstor
  wait_for_unit node05 node-agent
  local recovered_web_value recovered_web_node recovered_container_value recovered_container_node
  recovered_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
  recovered_web_node="$(printf '%s' "${recovered_web_value}" | jq -r '.node_id')"
  [[ "${recovered_web_node}" == "node04" ]] || die "native-web unexpectedly churned after node05 recovered"
  recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')"
  [[ "${recovered_container_node}" == "node04" ]] || die "native-container unexpectedly churned after node05 recovered"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.21
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # Normal-path cleanup: clear the RETURN trap and tear down explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${lb_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
  stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"
  stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"
  stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"
  rm -rf "${tmp_dir}"
}
# Compose PrismNet (VPC/subnet/port), FlashDNS (zone/records), and FiberLB
# (LB/pool/backend/listener) for one tenant, checking each pairwise
# combination and then the full stack (DNS name -> LB listener -> backend).
# All created resources are deleted again on every exit path.
# Globals read: HTTP_WAIT_TIMEOUT, PRISMNET_PROTO_DIR/PRISMNET_PROTO,
# FLASHDNS_PROTO_DIR/FLASHDNS_PROTO, FIBERLB_PROTO_DIR/FIBERLB_PROTO.
validate_network_provider_matrix() {
  log "Validating component matrix: PrismNet, FlashDNS, and FiberLB in composed tenant scenarios"
  local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel=""
  # Local forwards to the control-plane gRPC services on node01.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
  dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  local org_id="matrix-net-org"
  local project_id="matrix-net-project"
  # Timestamp suffixes keep principal/zone names unique across reruns.
  local principal_id="matrix-net-$(date +%s)"
  local token=""
  local vpc_id="" subnet_id="" port_id="" port_ip=""
  local zone_id="" zone_name="matrix-$(date +%s).cluster.test"
  local workload_record_id="" service_record_id=""
  local lb_id="" pool_id="" backend_id="" listener_id="" listener_port=""
  local workload_fqdn="" service_fqdn=""
  # Best-effort teardown in reverse creation order. Each delete tolerates
  # the resource never having been created (empty id) or the RPC failing.
  cleanup_network_provider_matrix() {
    if [[ -n "${service_record_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${service_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${workload_record_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${workload_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${listener_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null 2>&1 || true
    fi
    if [[ -n "${backend_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null 2>&1 || true
    fi
    if [[ -n "${pool_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null 2>&1 || true
    fi
    if [[ -n "${lb_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null 2>&1 || true
    fi
    if [[ -n "${port_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
        127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null 2>&1 || true
    fi
    if [[ -n "${subnet_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
        127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null 2>&1 || true
    fi
    if [[ -n "${vpc_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
        127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null 2>&1 || true
    fi
    if [[ -n "${zone_id}" ]]; then
      # force:true removes the zone even if records linger.
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
        127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${lb_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${dns_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${prism_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  # EXIT is trapped as well so that die (which exits) still cleans up.
  trap cleanup_network_provider_matrix RETURN EXIT
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  log "Matrix case: PrismNet only"
  vpc_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, name:"matrix-vpc", description:"component matrix", cidrBlock:"10.52.0.0/16"}')" \
    127.0.0.1:15081 prismnet.VpcService/CreateVpc | jq -r '.vpc.id')"
  [[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "component matrix PrismNet VPC creation failed"
  subnet_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"matrix-subnet", description:"component matrix", cidrBlock:"10.52.10.0/24", gatewayIp:"10.52.10.1", dhcpEnabled:true}')" \
    127.0.0.1:15081 prismnet.SubnetService/CreateSubnet | jq -r '.subnet.id')"
  [[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "component matrix PrismNet subnet creation failed"
  local port_response
  # Empty ipAddress asks PrismNet to allocate one from the subnet.
  port_response="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"matrix-port", description:"component matrix", ipAddress:""}')" \
    127.0.0.1:15081 prismnet.PortService/CreatePort)"
  port_id="$(printf '%s' "${port_response}" | jq -r '.port.id')"
  port_ip="$(printf '%s' "${port_response}" | jq -r '.port.ipAddress')"
  [[ -n "${port_id}" && "${port_id}" != "null" && -n "${port_ip}" && "${port_ip}" != "null" ]] || die "component matrix PrismNet port creation failed"
  log "Matrix case: PrismNet + FlashDNS"
  zone_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.matrix.test", adminEmail:"admin@matrix.test"}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone | jq -r '.zone.id')"
  [[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "component matrix FlashDNS zone creation failed"
  # A record pointing at the PrismNet-allocated port IP.
  workload_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" --arg address "${port_ip}" '{zoneId:$zone, name:"workload", recordType:"A", ttl:60, data:{a:{address:$address}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${workload_record_id}" && "${workload_record_id}" != "null" ]] || die "component matrix FlashDNS workload record creation failed"
  workload_fqdn="workload.${zone_name}"
  # Poll node01's resolver until the record is served.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${workload_fqdn} A | grep -Fx '${port_ip}'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${workload_fqdn}"
    fi
    sleep 2
  done
  log "Matrix case: PrismNet + FiberLB"
  listener_port="$(allocate_free_listener_port node01 18180 18999)" || die "failed to allocate a free FiberLB listener port for component matrix"
  lb_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{name:"matrix-lb", orgId:$org, projectId:$project, description:"component matrix"}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer | jq -r '.loadbalancer.id')"
  [[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "component matrix FiberLB creation failed"
  pool_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" '{name:"matrix-pool", loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
    127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool | jq -r '.pool.id')"
  [[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "component matrix FiberLB pool creation failed"
  backend_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg pool "${pool_id}" '{name:"matrix-backend", poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend | jq -r '.backend.id')"
  [[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "component matrix FiberLB backend creation failed"
  listener_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:"matrix-listener", loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
    127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener | jq -r '.listener.id')"
  [[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "component matrix FiberLB listener creation failed"
  # The listener should proxy through to the backend's /health.
  wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
  log "Matrix case: PrismNet + FlashDNS + FiberLB"
  service_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"service", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${service_record_id}" && "${service_record_id}" != "null" ]] || die "component matrix FlashDNS service record creation failed"
  service_fqdn="service.${zone_name}"
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${service_fqdn}"
    fi
    sleep 2
  done
  # Full-stack check: curl the DNS name through the LB listener, pinning
  # resolution with --resolve so no system resolver is involved.
  ssh_node node01 "curl -fsS --max-time 5 --resolve ${service_fqdn}:${listener_port}:10.100.0.11 http://${service_fqdn}:${listener_port}/health >/dev/null"
  # Normal-path cleanup: clear the traps and tear down explicitly.
  trap - RETURN EXIT
  cleanup_network_provider_matrix
}
# Run every component-matrix validator in dependency order, aborting on
# the first failure (set -e), then log overall success.
validate_component_matrix() {
  local validator
  for validator in \
    validate_control_plane \
    validate_iam_flow \
    validate_network_provider_matrix \
    validate_vm_storage_flow \
    validate_k8shost_flow \
    validate_gateway \
    validate_nightlight_flow \
    validate_creditservice_flow \
    validate_deployer_flow \
    validate_native_runtime_flow; do
    "${validator}"
  done
  log "Component matrix validation succeeded"
}
# Benchmark a CoronaFS NBD-exported shared volume from node04 against that
# worker's local disk, plus a cross-worker read of the same export from
# node05. Profiles: 1M sequential read/write, 4k randread, and queued
# (iodepth-32, libaio) variants. Emits one tab-separated result line on
# stdout (13 columns, see the final printf) after logging a human summary.
# Fix vs. previous revision: local_rand_depth_json and
# coronafs_rand_depth_json were assigned but never declared local, leaking
# globals out of the function; they are now declared with the rest.
benchmark_coronafs_performance() {
  log "Benchmarking CoronaFS NBD-backed volume throughput against local worker disk"
  # fio result JSON blobs for every profile, all function-scoped.
  local local_write_json local_read_json local_rand_json local_rand_depth_json
  local coronafs_write_json coronafs_read_json coronafs_rand_json coronafs_rand_depth_json
  local local_depth_write_json local_depth_read_json
  local coronafs_depth_write_json coronafs_depth_read_json
  local cross_worker_read_json
  local coronafs_tunnel=""
  # Declaration split from assignment so the command substitution's exit
  # status is not masked by `local`.
  local bench_volume
  bench_volume="coronafs-bench-$(date +%s)"
  local coronafs_export_json coronafs_uri
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  # Best-effort teardown: drop the benchmark volume and the API tunnel.
  cleanup_coronafs_bench() {
    coronafs_delete_volume 15088 "${bench_volume}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${coronafs_tunnel}"
  }
  trap cleanup_coronafs_bench RETURN
  # 512 MiB volume, exported over NBD for the shared-volume profiles.
  coronafs_create_volume 15088 "${bench_volume}" $((512 * 1024 * 1024)) >/dev/null
  coronafs_export_json="$(coronafs_export_volume_json 15088 "${bench_volume}")"
  coronafs_uri="$(printf '%s' "${coronafs_export_json}" | jq -r '.export.uri')"
  [[ -n "${coronafs_uri}" && "${coronafs_uri}" != "null" ]] || die "CoronaFS benchmark volume did not return an export URI"
  # Local-disk baseline on node04.
  local_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqwrite.dat write 1M 256)"
  local_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqread.dat read 1M 256)"
  local_rand_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread.dat randread 4k 128 10)"
  local_rand_depth_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread-depth.dat randread 4k 512 15 32 libaio)"
  # Same profiles against the NBD-attached CoronaFS volume on node04.
  coronafs_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 256)"
  coronafs_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" read 1M 256)"
  coronafs_rand_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 128 10)"
  coronafs_rand_depth_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 512 15 /dev/nbd0 32)"
  # Queued (depth-32, libaio) sequential profiles, local and shared.
  local_depth_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthwrite.dat write 1M 1024 15 32 libaio)"
  local_depth_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthread.dat read 1M 1024 15 32 libaio)"
  coronafs_depth_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 1024 15 /dev/nbd0 32)"
  coronafs_depth_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" read 1M 1024 15 /dev/nbd0 32)"
  # Cross-worker: node05 reads the same export (written from node04) on a
  # different NBD device.
  cross_worker_read_json="$(run_remote_nbd_fio_json node05 "${coronafs_uri}" read 1M 256 0 /dev/nbd1 1)"
  local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
  local coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps
  local local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps
  local_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_write_json}" | jq -r '.bw_bytes')")"
  local_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_read_json}" | jq -r '.bw_bytes')")"
  local_rand_iops="$(printf '%s' "${local_rand_json}" | jq -r '.iops | floor')"
  local_rand_depth_iops="$(printf '%s' "${local_rand_depth_json}" | jq -r '.iops | floor')"
  coronafs_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_write_json}" | jq -r '.bw_bytes')")"
  coronafs_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_read_json}" | jq -r '.bw_bytes')")"
  coronafs_rand_iops="$(printf '%s' "${coronafs_rand_json}" | jq -r '.iops | floor')"
  coronafs_rand_depth_iops="$(printf '%s' "${coronafs_rand_depth_json}" | jq -r '.iops | floor')"
  local_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_write_json}" | jq -r '.bw_bytes')")"
  local_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_depth_write_json}" | jq -r '.bw_bytes')")"
  coronafs_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_cross_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${cross_worker_read_json}" | jq -r '.bw_bytes')")"
  log "CoronaFS local baseline: write=${local_write_mibps} MiB/s read=${local_read_mibps} MiB/s randread=${local_rand_iops} IOPS queued_randread=${local_rand_depth_iops} IOPS"
  log "CoronaFS shared block volume: write=${coronafs_write_mibps} MiB/s read=${coronafs_read_mibps} MiB/s randread=${coronafs_rand_iops} IOPS queued_randread=${coronafs_rand_depth_iops} IOPS"
  log "CoronaFS queued depth-32 profile: local_write=${local_depth_write_mibps} MiB/s local_read=${local_depth_read_mibps} MiB/s shared_write=${coronafs_depth_write_mibps} MiB/s shared_read=${coronafs_depth_read_mibps} MiB/s"
  log "CoronaFS cross-worker shared read: read=${coronafs_cross_read_mibps} MiB/s (node04 write -> node05 direct read over the same NBD export)"
  # Machine-readable result row (consumed by the bench-storage command).
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
    "${coronafs_write_mibps}" "${coronafs_read_mibps}" "${coronafs_rand_iops}" "${coronafs_rand_depth_iops}" \
    "${coronafs_cross_read_mibps}" \
    "${local_depth_write_mibps}" "${local_depth_read_mibps}" \
    "${coronafs_depth_write_mibps}" "${coronafs_depth_read_mibps}"
  # Normal-path cleanup: clear the trap and tear down explicitly.
  trap - RETURN
  cleanup_coronafs_bench
}
# Benchmark LightningStor's S3-compatible endpoint from a cluster node.
#
# Ships a self-contained shell+python driver to the client node via
# ssh_node_script and measures three profiles against the endpoint on node01
# (10.100.0.11:9000):
#   1. large object:  one 256 MiB upload + download, sha256-verified
#   2. small objects: 32 x 4 MiB, sequential PUTs then sequential GETs
#   3. parallel:      the same 32 x 4 MiB batch with 8 concurrent threads
#
# Globals:   LIGHTNINGSTOR_BENCH_CLIENT_NODE - client node (default node03)
# Outputs:   log lines, plus one tab-separated result record on stdout:
#            upload/download MiB/s, object size (MiB), small-batch count/size/
#            throughput, "put/get" obj/s pairs for the sequential and
#            parallel batches.
# Returns:   0 on success; dies if the remote benchmark fails.
benchmark_lightningstor_performance() {
  local client_node="${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}"
  log "Benchmarking LightningStor S3 throughput from ${client_node}"
  # Unique bucket name per run so repeated benchmark invocations never collide.
  local bucket="ls-bench-$(date +%s)"
  local object_key="bench-object.bin"
  local result_json
  # Remote driver positional args: bucket, key, size_mb=256, small_count=32,
  # small_size_mb=4, parallelism=8.  The heredoc delimiter is quoted, so every
  # expansion inside happens on the remote node, not here.
  # NOTE(review): payload files are created sparse/zero-filled via truncate(),
  # so throughput may flatter any backend that compresses or dedups zeros --
  # confirm LightningStor stores them verbatim.
  # NOTE(review): assumes python3 with boto3/botocore is present on the client
  # node image -- verify against the VM profile.
  if ! result_json="$(ssh_node_script "${client_node}" "${bucket}" "${object_key}" 256 32 4 8 <<'EOS'
set -euo pipefail
bucket="$1"
object_key="$2"
size_mb="$3"
small_count="$4"
small_size_mb="$5"
parallelism="$6"
endpoint="http://10.100.0.11:9000"
workdir="/var/tmp/photon-bench-s3"
src="${workdir}/upload.bin"
dst="${workdir}/download.bin"
mkdir -p "${workdir}"
python3 - "${bucket}" "${object_key}" "${size_mb}" "${small_count}" "${small_size_mb}" "${parallelism}" "${endpoint}" "${workdir}" "${src}" "${dst}" <<'PY'
import concurrent.futures
import hashlib
import json
import os
import pathlib
import threading
import time
import boto3
from botocore.config import Config
bucket, object_key, size_mb, small_count, small_size_mb, parallelism, endpoint, workdir, src, dst = os.sys.argv[1:11]
size_mb = int(size_mb)
small_count = int(small_count)
small_size_mb = int(small_size_mb)
parallelism = int(parallelism)
workdir_path = pathlib.Path(workdir)
src_path = pathlib.Path(src)
dst_path = pathlib.Path(dst)
small_size_bytes = small_size_mb * 1024 * 1024
large_size_bytes = size_mb * 1024 * 1024
thread_local = threading.local()
def ensure_sparse_file(path: pathlib.Path, size_bytes: int) -> None:
    if path.exists() and path.stat().st_size == size_bytes:
        return
    with path.open("wb") as handle:
        handle.truncate(size_bytes)
def sha256_file(path: pathlib.Path) -> str:
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            chunk = handle.read(8 * 1024 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def new_client():
    return boto3.session.Session().client(
        "s3",
        endpoint_url=endpoint,
        region_name="us-east-1",
        aws_access_key_id="photoncloud-test",
        aws_secret_access_key="photoncloud-test-secret",
        use_ssl=False,
        verify=False,
        config=Config(
            retries={"max_attempts": 8, "mode": "standard"},
            s3={"addressing_style": "path"},
            max_pool_connections=max(32, parallelism * 4),
            signature_version="s3v4",
        ),
    )
def client():
    existing = getattr(thread_local, "client", None)
    if existing is None:
        existing = new_client()
        thread_local.client = existing
    return existing
def put_file(key: str, path: pathlib.Path) -> None:
    with path.open("rb") as handle:
        client().put_object(Bucket=bucket, Key=key, Body=handle)
def get_file(key: str, path: pathlib.Path) -> None:
    response = client().get_object(Bucket=bucket, Key=key)
    with path.open("wb") as handle:
        body = response["Body"]
        for chunk in body.iter_chunks(chunk_size=8 * 1024 * 1024):
            if chunk:
                handle.write(chunk)
def delete_key(key: str) -> None:
    client().delete_object(Bucket=bucket, Key=key)
workdir_path.mkdir(parents=True, exist_ok=True)
ensure_sparse_file(src_path, large_size_bytes)
src_sha = sha256_file(src_path)
small_paths = []
for index in range(1, small_count + 1):
    path = workdir_path / f"payload-{index}.bin"
    ensure_sparse_file(path, small_size_bytes)
    small_paths.append(path)
control_client = new_client()
control_client.create_bucket(Bucket=bucket)
upload_start = time.monotonic_ns()
put_file(object_key, src_path)
upload_end = time.monotonic_ns()
if dst_path.exists():
    dst_path.unlink()
download_start = time.monotonic_ns()
get_file(object_key, dst_path)
download_end = time.monotonic_ns()
if sha256_file(dst_path) != src_sha:
    raise SystemExit("large-object checksum mismatch")
head = control_client.head_object(Bucket=bucket, Key=object_key)
if int(head["ContentLength"]) != large_size_bytes:
    raise SystemExit("large-object size mismatch")
delete_key(object_key)
small_total_bytes = small_count * small_size_bytes
small_upload_start = time.monotonic_ns()
for index, path in enumerate(small_paths, start=1):
    put_file(f"small-{index}.bin", path)
small_upload_end = time.monotonic_ns()
small_download_start = time.monotonic_ns()
for index in range(1, small_count + 1):
    small_dst = workdir_path / f"small-download-{index}.bin"
    get_file(f"small-{index}.bin", small_dst)
    if small_dst.stat().st_size != small_size_bytes:
        raise SystemExit(f"small-object size mismatch for {small_dst}")
small_download_end = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"small-{index}.bin" for index in range(1, small_count + 1)]))
parallel_upload_start = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(
        executor.map(
            lambda item: put_file(f"parallel-small-{item[0]}.bin", item[1]),
            list(enumerate(small_paths, start=1)),
        )
    )
parallel_upload_end = time.monotonic_ns()
parallel_download_start = time.monotonic_ns()
def download_parallel(index: int) -> None:
    path = workdir_path / f"parallel-download-{index}.bin"
    get_file(f"parallel-small-{index}.bin", path)
    if path.stat().st_size != small_size_bytes:
        raise SystemExit(f"parallel small-object size mismatch for {path}")
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(download_parallel, range(1, small_count + 1)))
parallel_download_end = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"parallel-small-{index}.bin" for index in range(1, small_count + 1)]))
control_client.delete_bucket(Bucket=bucket)
for pattern in ("payload-*.bin", "small-download-*.bin", "parallel-download-*.bin"):
    for path in workdir_path.glob(pattern):
        path.unlink(missing_ok=True)
src_path.unlink(missing_ok=True)
dst_path.unlink(missing_ok=True)
print(
    json.dumps(
        {
            "size_bytes": large_size_bytes,
            "upload_ns": upload_end - upload_start,
            "download_ns": download_end - download_start,
            "small_object_count": small_count,
            "small_total_bytes": small_total_bytes,
            "small_upload_ns": small_upload_end - small_upload_start,
            "small_download_ns": small_download_end - small_download_start,
            "parallel_small_upload_ns": parallel_upload_end - parallel_upload_start,
            "parallel_small_download_ns": parallel_download_end - parallel_download_start,
            "parallelism": parallelism,
        }
    )
)
PY
EOS
  )"; then
    die "LightningStor S3 benchmark failed"
  fi
  local size_bytes upload_mibps download_mibps
  local small_total_bytes small_object_count small_object_mib
  local small_upload_mibps small_download_mibps small_put_ops small_get_ops
  local parallel_small_upload_mibps parallel_small_download_mibps parallel_small_put_ops parallel_small_get_ops parallelism
  # Convert the remote driver's nanosecond timings into MiB/s and obj/s
  # host-side via the calc_* helpers.
  size_bytes="$(printf '%s' "${result_json}" | jq -r '.size_bytes')"
  [[ -n "${size_bytes}" && "${size_bytes}" != "null" && "${size_bytes}" != "0" ]] || die "LightningStor S3 benchmark returned no object size"
  upload_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.upload_ns')")"
  download_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.download_ns')")"
  small_total_bytes="$(printf '%s' "${result_json}" | jq -r '.small_total_bytes')"
  small_object_count="$(printf '%s' "${result_json}" | jq -r '.small_object_count')"
  small_object_mib="$(awk "BEGIN { printf \"%.0f\", ${small_total_bytes} / 1048576 }")"
  small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  parallel_small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallel_small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallelism="$(printf '%s' "${result_json}" | jq -r '.parallelism')"
  log "LightningStor S3 benchmark: upload=${upload_mibps} MiB/s download=${download_mibps} MiB/s object_size=$((size_bytes / 1048576)) MiB"
  log "LightningStor small-object batch: objects=${small_object_count} size_per_object=4 MiB upload=${small_upload_mibps} MiB/s download=${small_download_mibps} MiB/s put_rate=${small_put_ops} obj/s get_rate=${small_get_ops} obj/s"
  log "LightningStor parallel small-object batch: objects=${small_object_count} size_per_object=4 MiB parallelism=${parallelism} upload=${parallel_small_upload_mibps} MiB/s download=${parallel_small_download_mibps} MiB/s put_rate=${parallel_small_put_ops} obj/s get_rate=${parallel_small_get_ops} obj/s"
  # Tab-separated record consumed downstream by the benchmark report writer;
  # the ops pairs are packed as "put/get" in single fields.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${upload_mibps}" "${download_mibps}" "$((size_bytes / 1048576))" \
    "${small_object_count}" "${small_object_mib}" "${small_upload_mibps}" "${small_download_mibps}" \
    "${small_put_ops}/${small_get_ops}" \
    "${parallel_small_upload_mibps}" "${parallel_small_download_mibps}" \
    "${parallel_small_put_ops}/${parallel_small_get_ops}"
}
# Benchmark the PlasmaVMC image path: import a qcow2 guest image through
# ImageService/CreateImage, then time two image-backed CoronaFS managed-volume
# clones -- a "cold" first clone and a "warm" second clone of the same image.
#
# Globals:   PLASMAVMC_PROTO_DIR / PLASMAVMC_PROTO (grpcurl schema lookup).
#            Talks to node01 services via local SSH tunnels
#            (IAM 50080 -> 15080, LightningStor 50086 -> 15086,
#             PlasmaVMC 50082 -> 15082).
# Outputs:   log lines, plus one tab-separated record on stdout:
#            artifact_mib virtual_mib import_sec cold_clone_sec warm_clone_sec
# Returns:   0 on success; dies on failure, with best-effort teardown via a
#            RETURN trap that is explicitly cleared on the success path.
benchmark_plasmavmc_image_path() {
  log "Benchmarking PlasmaVMC image import plus CoronaFS-backed volume clone latency"
  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  local image_id="" cold_volume_id="" warm_volume_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  # Best-effort teardown: every step is guarded so the trap can fire at any
  # point after the tunnels exist without tripping set -e / set -u.
  cleanup_plasmavmc_image_bench() {
    if [[ -n "${cold_volume_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${cold_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${warm_volume_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${warm_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${vm_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  trap cleanup_plasmavmc_image_bench RETURN
  local org_id="plasmavmc-bench-org-$(date +%s)"
  local project_id="plasmavmc-bench-project"
  local principal_id="plasmavmc-bench-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC benchmark image import"
  local guest_image_local_path guest_image_sha artifact_size_bytes artifact_mib virtual_size_bytes virtual_mib
  guest_image_local_path="$(guest_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image for PlasmaVMC benchmark"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  artifact_size_bytes="$(stat -c %s "${guest_image_local_path}")"
  # qcow2 artifact size vs. virtual disk size -- both reported so the clone
  # latencies can be judged against the amount of data actually moved.
  virtual_size_bytes="$(qemu-img info --output json "${guest_image_local_path}" | jq -r '."virtual-size"')"
  artifact_mib="$(awk "BEGIN { printf \"%.0f\", ${artifact_size_bytes} / 1048576 }")"
  virtual_mib="$(awk "BEGIN { printf \"%.0f\", ${virtual_size_bytes} / 1048576 }")"
  local image_name="bench-image-$(date +%s)"
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC benchmark image checksum mismatch after distribution"
  local create_image_json create_image_response create_image_start_ns create_image_end_ns
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_start_ns="$(date +%s%N)"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/CreateImage)"
  create_image_end_ns="$(date +%s%N)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC benchmark image import did not return an image ID"
  # jq -e fails (and set -e aborts, triggering cleanup) unless the image is
  # already AVAILABLE when CreateImage returns.
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  local cold_request warm_request cold_response warm_response cold_start_ns cold_end_ns warm_start_ns warm_end_ns
  # Cold clone: first image-backed CreateVolume after the import.
  cold_request="$(jq -cn --arg name "bench-cold-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-cold"},
    labels:{}
  }')"
  cold_start_ns="$(date +%s%N)"
  cold_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${cold_request}" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/CreateVolume)"
  cold_end_ns="$(date +%s%N)"
  cold_volume_id="$(printf '%s' "${cold_response}" | jq -r '.id')"
  [[ -n "${cold_volume_id}" && "${cold_volume_id}" != "null" ]] || die "PlasmaVMC cold image-backed volume create did not return a volume ID"
  printf '%s' "${cold_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null
  # Delete the cold volume before the warm pass so both clones create a fresh
  # volume; clear the ID so the trap does not double-delete it.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${cold_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null
  cold_volume_id=""
  # Warm clone: second image-backed CreateVolume over the same image.
  warm_request="$(jq -cn --arg name "bench-warm-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-warm"},
    labels:{}
  }')"
  warm_start_ns="$(date +%s%N)"
  warm_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${warm_request}" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/CreateVolume)"
  warm_end_ns="$(date +%s%N)"
  warm_volume_id="$(printf '%s' "${warm_response}" | jq -r '.id')"
  [[ -n "${warm_volume_id}" && "${warm_volume_id}" != "null" ]] || die "PlasmaVMC warm image-backed volume create did not return a volume ID"
  printf '%s' "${warm_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null
  local image_import_sec cold_clone_sec warm_clone_sec
  image_import_sec="$(calc_seconds_from_ns "$((create_image_end_ns - create_image_start_ns))")"
  cold_clone_sec="$(calc_seconds_from_ns "$((cold_end_ns - cold_start_ns))")"
  warm_clone_sec="$(calc_seconds_from_ns "$((warm_end_ns - warm_start_ns))")"
  log "PlasmaVMC image artifact benchmark: artifact=${artifact_mib} MiB virtual_size=${virtual_mib} MiB import=${image_import_sec}s cold_clone=${cold_clone_sec}s warm_clone=${warm_clone_sec}s"
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${artifact_mib}" "${virtual_mib}" "${image_import_sec}" "${cold_clone_sec}" "${warm_clone_sec}"
  # Disarm the RETURN trap before the explicit cleanup call (same pattern as
  # the CoronaFS benchmark): a RETURN trap set inside a function is global
  # trap state, and leaving it armed would re-run this cleanup -- with its
  # guarded locals out of scope under set -u -- when later functions return.
  trap - RETURN
  cleanup_plasmavmc_image_bench
}
# Benchmark the PlasmaVMC guest runtime path: import a benchmark guest image,
# create a KVM VM with a CoronaFS-backed root disk and blank data disk, start
# it on a worker (node04/node05), and time StartVm -> qemu volume attach and
# StartVm -> guest fio result, parsing the fio numbers off the VM console.
#
# Globals:   PLASMAVMC_PROTO_DIR / PLASMAVMC_PROTO, CORONAFS_API_PORT,
#            HTTP_WAIT_TIMEOUT (scheduling wait deadline).
# Outputs:   log lines, plus one tab-separated record on stdout:
#            attach_sec ready_sec seq_write_mibps seq_read_mibps randread_iops
# Returns:   0 on success; dies on failure, with best-effort teardown via a
#            RETURN trap that is explicitly cleared on the success path.
benchmark_plasmavmc_guest_runtime() {
  log "Benchmarking PlasmaVMC guest-side CoronaFS runtime throughput"
  local iam_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
  local image_id="" vm_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  # Best-effort teardown: force-stop then delete the VM, drop the image and
  # its staged source file, and close the tunnels.  Every step is guarded so
  # the trap can fire at any point after the tunnels exist.
  cleanup_plasmavmc_guest_runtime() {
    if [[ -n "${vm_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:true, timeoutSeconds:30}')" \
        127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null 2>&1 || true
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}' )" \
        127.0.0.1:15082 plasmavmc.v1.VmService/DeleteVm >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${coronafs_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${vm_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  trap cleanup_plasmavmc_guest_runtime RETURN
  wait_for_plasmavmc_workers_registered 15082
  local org_id="plasmavmc-runtime-org-$(date +%s)"
  local project_id="plasmavmc-runtime-project"
  local principal_id="plasmavmc-runtime-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC runtime benchmark image import"
  local guest_image_local_path guest_image_sha image_name create_image_json create_image_response
  guest_image_local_path="$(guest_bench_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate VM benchmark guest image"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  image_name="bench-runtime-image-$(date +%s)"
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC runtime benchmark image checksum mismatch after distribution"
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench-runtime",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench-runtime", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/CreateImage)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC runtime benchmark image import did not return an image ID"
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  local create_vm_json get_vm_json create_response node_id
  # 4 vCPU / 1.5 GiB KVM guest with an image-backed root disk and a blank
  # data disk, both virtio with cache=none.
  create_vm_json="$(
    jq -cn \
      --arg name "bench-runtime-vm-$(date +%s)" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        orgId:$org,
        projectId:$project,
        hypervisor:"HYPERVISOR_TYPE_KVM",
        spec:{
          cpu:{vcpus:4, coresPerSocket:1, sockets:1},
          memory:{sizeMib:1536},
          disks:[
            {
              id:"root",
              source:{imageId:$image_id},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE",
              bootIndex:1
            },
            {
              id:"data",
              source:{blank:true},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE"
            }
          ]
        }
      }'
  )"
  create_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/CreateVm)"
  vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
  [[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "PlasmaVMC runtime benchmark VM create did not return a VM ID"
  get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
  # Poll until the scheduler places the VM on one of the worker nodes.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
      fi
      sleep 2
      continue
    fi
    node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
    if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
    fi
    sleep 2
  done
  local start_ns attach_ns ready_ns attach_sec ready_sec
  # Volume IDs follow the "<vm>-<disk id>" convention used by the CoronaFS
  # export lookup below.
  local root_volume_id="${vm_id}-root"
  local data_volume_id="${vm_id}-data"
  local root_uri data_uri
  start_ns="$(date +%s%N)"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  root_uri="$(coronafs_export_volume_json 15088 "${root_volume_id}" | jq -r '.export.uri')"
  data_uri="$(coronafs_export_volume_json 15088 "${data_volume_id}" | jq -r '.export.uri')"
  [[ -n "${root_uri}" && "${root_uri}" != "null" ]] || die "runtime benchmark root volume export URI missing"
  [[ -n "${data_uri}" && "${data_uri}" != "null" ]] || die "runtime benchmark data volume export URI missing"
  # attach = both CoronaFS exports visible to qemu on the scheduled worker;
  # ready = the guest printed its fio result marker on the console.
  wait_for_qemu_volume_present "${node_id}" "${root_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_uri}"
  attach_ns="$(date +%s%N)"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT"
  ready_ns="$(date +%s%N)"
  local result_line seq_write_mibps seq_read_mibps randread_iops
  result_line="$(read_vm_console_line_matching "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT")"
  seq_write_mibps="$(printf '%s\n' "${result_line}" | sed -n 's/.*seq_write_mibps=\([^ ]*\).*/\1/p')"
  seq_read_mibps="$(printf '%s\n' "${result_line}" | sed -n 's/.*seq_read_mibps=\([^ ]*\).*/\1/p')"
  randread_iops="$(printf '%s\n' "${result_line}" | sed -n 's/.*randread_iops=\([^ ]*\).*/\1/p')"
  [[ -n "${seq_write_mibps}" && -n "${seq_read_mibps}" && -n "${randread_iops}" ]] || die "failed to parse runtime benchmark result line: ${result_line}"
  attach_sec="$(calc_seconds_from_ns "$((attach_ns - start_ns))")"
  ready_sec="$(calc_seconds_from_ns "$((ready_ns - start_ns))")"
  log "PlasmaVMC guest runtime benchmark: attach=${attach_sec}s guest_ready=${ready_sec}s seq_write=${seq_write_mibps} MiB/s seq_read=${seq_read_mibps} MiB/s randread=${randread_iops} IOPS"
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${attach_sec}" "${ready_sec}" "${seq_write_mibps}" "${seq_read_mibps}" "${randread_iops}"
  # Disarm the RETURN trap before the explicit cleanup call (same pattern as
  # the CoronaFS benchmark): a RETURN trap set inside a function is global
  # trap state, and leaving it armed would re-run this cleanup -- with its
  # guarded locals out of scope under set -u -- when later functions return.
  trap - RETURN
  cleanup_plasmavmc_guest_runtime
}
write_storage_benchmark_report() {
local coronafs_network_mibps="$1"
local coronafs_network_retransmits="$2"
local lightningstor_network_mibps="$3"
local lightningstor_network_retransmits="$4"
local local_write_mibps="$5"
local local_read_mibps="$6"
local local_rand_iops="$7"
local local_rand_depth_iops="$8"
local coronafs_write_mibps="$9"
local coronafs_read_mibps="${10}"
local coronafs_rand_iops="${11}"
local coronafs_rand_depth_iops="${12}"
local coronafs_cross_read_mibps="${13}"
local local_depth_write_mibps="${14}"
local local_depth_read_mibps="${15}"
local coronafs_depth_write_mibps="${16}"
local coronafs_depth_read_mibps="${17}"
local lightningstor_upload_mibps="${18}"
local lightningstor_download_mibps="${19}"
local lightningstor_object_mib="${20}"
local lightningstor_small_object_count="${21}"
local lightningstor_small_object_mib="${22}"
local lightningstor_small_upload_mibps="${23}"
local lightningstor_small_download_mibps="${24}"
local lightningstor_small_ops="${25}"
local lightningstor_parallel_small_upload_mibps="${26}"
local lightningstor_parallel_small_download_mibps="${27}"
local lightningstor_parallel_small_ops="${28}"
local plasmavmc_image_artifact_mib="${29}"
local plasmavmc_image_virtual_mib="${30}"
local plasmavmc_image_import_sec="${31}"
local plasmavmc_cold_clone_sec="${32}"
local plasmavmc_warm_clone_sec="${33}"
local plasmavmc_runtime_attach_sec="${34}"
local plasmavmc_runtime_ready_sec="${35}"
local plasmavmc_runtime_seq_write_mibps="${36}"
local plasmavmc_runtime_seq_read_mibps="${37}"
local plasmavmc_runtime_randread_iops="${38}"
local coronafs_read_ratio coronafs_rand_ratio coronafs_rand_depth_ratio coronafs_cross_read_ratio coronafs_vs_network_ratio coronafs_depth_read_ratio lightningstor_vs_network_ratio
local lightningstor_small_put_ops lightningstor_small_get_ops
local lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops
IFS=/ read -r lightningstor_small_put_ops lightningstor_small_get_ops <<<"${lightningstor_small_ops}"
IFS=/ read -r lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops <<<"${lightningstor_parallel_small_ops}"
coronafs_read_ratio="$(awk "BEGIN { if (${local_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_read_mibps} / ${local_read_mibps}) * 100 }")"
coronafs_rand_ratio="$(awk "BEGIN { if (${local_rand_iops} == 0) print 0; else printf \"%.1f\", (${coronafs_rand_iops} / ${local_rand_iops}) * 100 }")"
coronafs_rand_depth_ratio="$(awk "BEGIN { if (${local_rand_depth_iops} == 0) print 0; else printf \"%.1f\", (${coronafs_rand_depth_iops} / ${local_rand_depth_iops}) * 100 }")"
coronafs_cross_read_ratio="$(awk "BEGIN { if (${local_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_cross_read_mibps} / ${local_read_mibps}) * 100 }")"
coronafs_vs_network_ratio="$(awk "BEGIN { if (${coronafs_network_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_read_mibps} / ${coronafs_network_mibps}) * 100 }")"
coronafs_depth_read_ratio="$(awk "BEGIN { if (${local_depth_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_depth_read_mibps} / ${local_depth_read_mibps}) * 100 }")"
lightningstor_vs_network_ratio="$(awk "BEGIN { if (${lightningstor_network_mibps} == 0) print 0; else printf \"%.1f\", (${lightningstor_download_mibps} / ${lightningstor_network_mibps}) * 100 }")"
cat > "${REPO_ROOT}/docs/storage-benchmarks.md" <<EOF
# Storage Benchmarks
Generated on $(date -Iseconds) with:
\`\`\`bash
nix run ./nix/test-cluster#cluster -- ${STORAGE_BENCHMARK_COMMAND}
\`\`\`
## CoronaFS
Cluster network baseline, measured with \`iperf3\` from \`node04\` to \`node01\` before the storage tests:
| Metric | Result |
|---|---:|
| TCP throughput | ${coronafs_network_mibps} MiB/s |
| TCP retransmits | ${coronafs_network_retransmits} |
Measured from \`node04\`.
Local worker disk is the baseline. CoronaFS is the shared block volume path used for mutable VM disks, exported from \`node01\` over NBD.
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Sequential write | ${local_write_mibps} MiB/s | ${coronafs_write_mibps} MiB/s |
| Sequential read | ${local_read_mibps} MiB/s | ${coronafs_read_mibps} MiB/s |
| 4k random read | ${local_rand_iops} IOPS | ${coronafs_rand_iops} IOPS |
| 4k queued random read (\`iodepth=32\`) | ${local_rand_depth_iops} IOPS | ${coronafs_rand_depth_iops} IOPS |
Queue-depth profile (\`libaio\`, \`iodepth=32\`) from the same worker:
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Depth-32 write | ${local_depth_write_mibps} MiB/s | ${coronafs_depth_write_mibps} MiB/s |
| Depth-32 read | ${local_depth_read_mibps} MiB/s | ${coronafs_depth_read_mibps} MiB/s |
Cross-worker shared-volume visibility, measured by writing on \`node04\` and reading from \`node05\` with direct I/O over the same CoronaFS NBD export:
| Metric | Result |
|---|---:|
| Cross-worker sequential read | ${coronafs_cross_read_mibps} MiB/s |
## LightningStor
Measured from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` against the S3-compatible endpoint on \`node01\`.
The object path exercised the distributed backend with replication across the worker storage nodes.
Cluster network baseline for this client, measured with \`iperf3\` from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` to \`node01\` before the storage tests:
| Metric | Result |
|---|---:|
| TCP throughput | ${lightningstor_network_mibps} MiB/s |
| TCP retransmits | ${lightningstor_network_retransmits} |
### Large-object path
| Metric | Result |
|---|---:|
| Object size | ${lightningstor_object_mib} MiB |
| Upload throughput | ${lightningstor_upload_mibps} MiB/s |
| Download throughput | ${lightningstor_download_mibps} MiB/s |
### Small-object batch
Measured as ${lightningstor_small_object_count} objects of 4 MiB each (${lightningstor_small_object_mib} MiB total).
| Metric | Result |
|---|---:|
| Batch upload throughput | ${lightningstor_small_upload_mibps} MiB/s |
| Batch download throughput | ${lightningstor_small_download_mibps} MiB/s |
| PUT rate | ${lightningstor_small_put_ops} objects/s |
| GET rate | ${lightningstor_small_get_ops} objects/s |
### Parallel small-object batch
Measured as the same ${lightningstor_small_object_count} objects of 4 MiB each, but with 8 concurrent client jobs from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\`.
| Metric | Result |
|---|---:|
| Parallel batch upload throughput | ${lightningstor_parallel_small_upload_mibps} MiB/s |
| Parallel batch download throughput | ${lightningstor_parallel_small_download_mibps} MiB/s |
| Parallel PUT rate | ${lightningstor_parallel_small_put_ops} objects/s |
| Parallel GET rate | ${lightningstor_parallel_small_get_ops} objects/s |
## VM Image Path
Measured against the \`PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume\` clone path on \`node01\`.
| Metric | Result |
|---|---:|
| Guest image artifact size | ${plasmavmc_image_artifact_mib} MiB |
| Guest image virtual size | ${plasmavmc_image_virtual_mib} MiB |
| \`CreateImage\` latency | ${plasmavmc_image_import_sec} s |
| First image-backed \`CreateVolume\` latency | ${plasmavmc_cold_clone_sec} s |
| Second image-backed \`CreateVolume\` latency | ${plasmavmc_warm_clone_sec} s |
## VM Runtime Path
Measured against the real \`StartVm -> qemu attach -> guest boot -> guest fio\` path on a worker node, using a CoronaFS-backed root disk and data disk.
| Metric | Result |
|---|---:|
| \`StartVm\` to qemu attach | ${plasmavmc_runtime_attach_sec} s |
| \`StartVm\` to guest benchmark result | ${plasmavmc_runtime_ready_sec} s |
| Guest sequential write | ${plasmavmc_runtime_seq_write_mibps} MiB/s |
| Guest sequential read | ${plasmavmc_runtime_seq_read_mibps} MiB/s |
| Guest 4k random read | ${plasmavmc_runtime_randread_iops} IOPS |
## Assessment
- CoronaFS shared-volume reads are currently ${coronafs_read_ratio}% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS 4k random reads are currently ${coronafs_rand_ratio}% of the measured local-disk baseline.
- CoronaFS queued 4k random reads are currently ${coronafs_rand_depth_ratio}% of the measured local queued-random-read baseline.
- CoronaFS cross-worker reads are currently ${coronafs_cross_read_ratio}% of the measured local-disk sequential-read baseline, which is the more relevant signal for VM restart and migration paths.
- CoronaFS sequential reads are currently ${coronafs_vs_network_ratio}% of the measured node04->node01 TCP baseline, which helps separate NBD/export overhead from raw cluster-network limits.
- CoronaFS depth-32 reads are currently ${coronafs_depth_read_ratio}% of the local depth-32 baseline, which is a better proxy for queued guest I/O than the single-depth path.
- The shared-volume path is functionally correct for mutable VM disks and migration tests, but its read-side throughput is still too low to call production-ready for heavier VM workloads.
- LightningStor's replicated S3 path is working correctly, but ${lightningstor_upload_mibps} MiB/s upload and ${lightningstor_download_mibps} MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently ${lightningstor_vs_network_ratio}% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- LightningStor's small-object batch path is also functional, but ${lightningstor_small_put_ops} PUT/s and ${lightningstor_small_get_ops} GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches ${lightningstor_parallel_small_put_ops} PUT/s and ${lightningstor_parallel_small_get_ops} GET/s.
- The VM image section measures clone/materialization cost, not guest runtime I/O.
- The VM runtime section is the real \`PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel\` path; use it to judge whether QEMU/NBD tuning is helping.
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.
EOF
}
# Run the end-to-end storage benchmark suite and write the markdown report.
#
# Order of measurement:
#   1. Raw TCP baselines (iperf) toward node01 for the CoronaFS client path
#      (node04) and the LightningStor client path (default node03).
#   2. CoronaFS fio results (local vs shared volume, queued depth, cross-worker).
#   3. LightningStor object-path results (large object, small-object batch,
#      parallel small-object batch).
#   4. Optional PlasmaVMC image-clone and guest-runtime paths, each skippable
#      via its STORAGE_SKIP_PLASMAVMC_*_BENCH flag; a zeroed tab-separated
#      record is substituted so the downstream field count stays stable.
#
# Each benchmark helper prints a single tab-separated record on stdout; the
# records are unpacked into named locals and forwarded positionally
# (38 arguments) to write_storage_benchmark_report, so the field order in
# this function must stay in sync with that function's parameter list.
benchmark_storage() {
local coronafs_network_results lightningstor_network_results coronafs_results lightningstor_results plasmavmc_results plasmavmc_runtime_results
local coronafs_network_mibps coronafs_network_retransmits
local lightningstor_network_mibps lightningstor_network_retransmits
local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
local coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps
local local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps
local lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib
local lightningstor_small_object_count lightningstor_small_object_mib
local lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops
local lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops
local plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib
local plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec
local plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec
local plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops
# Raw network baselines first, so storage numbers can be compared against
# what the cluster network itself can sustain.
coronafs_network_results="$(run_remote_iperf_json node04 node01 10.100.0.11)"
lightningstor_network_results="$(run_remote_iperf_json "${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}" node01 10.100.0.11)"
coronafs_results="$(benchmark_coronafs_performance)"
lightningstor_results="$(benchmark_lightningstor_performance)"
# NOTE(review): the skip flags are expanded unguarded, so under `set -u`
# they must be assigned earlier in the script — confirm if refactoring.
if [[ "${STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH}" == "1" ]]; then
plasmavmc_results=$'0\t0\t0\t0\t0'
else
plasmavmc_results="$(benchmark_plasmavmc_image_path)"
fi
if [[ "${STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH}" == "1" ]]; then
plasmavmc_runtime_results=$'0\t0\t0\t0\t0'
else
plasmavmc_runtime_results="$(benchmark_plasmavmc_guest_runtime)"
fi
# Convert iperf bits/sec into MiB/s and pull retransmit counts for the report.
coronafs_network_mibps="$(bps_to_mibps "$(printf '%s' "${coronafs_network_results}" | jq -r '.bits_per_second')")"
coronafs_network_retransmits="$(printf '%s' "${coronafs_network_results}" | jq -r '.retransmits')"
lightningstor_network_mibps="$(bps_to_mibps "$(printf '%s' "${lightningstor_network_results}" | jq -r '.bits_per_second')")"
lightningstor_network_retransmits="$(printf '%s' "${lightningstor_network_results}" | jq -r '.retransmits')"
# Unpack each helper's tab-separated record into named fields, in the exact
# order the helper emits them.
IFS=$'\t' read -r \
local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops \
coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps \
local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps <<<"${coronafs_results}"
IFS=$'\t' read -r \
lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib \
lightningstor_small_object_count lightningstor_small_object_mib lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops \
lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops <<<"${lightningstor_results}"
IFS=$'\t' read -r \
plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec <<<"${plasmavmc_results}"
IFS=$'\t' read -r \
plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops <<<"${plasmavmc_runtime_results}"
# Positional hand-off: argument order here mirrors the parameter order of
# write_storage_benchmark_report and must change in lockstep with it.
write_storage_benchmark_report \
"${coronafs_network_mibps}" "${coronafs_network_retransmits}" \
"${lightningstor_network_mibps}" "${lightningstor_network_retransmits}" \
"${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
"${coronafs_write_mibps}" "${coronafs_read_mibps}" "${coronafs_rand_iops}" "${coronafs_rand_depth_iops}" "${coronafs_cross_read_mibps}" \
"${local_depth_write_mibps}" "${local_depth_read_mibps}" "${coronafs_depth_write_mibps}" "${coronafs_depth_read_mibps}" \
"${lightningstor_upload_mibps}" "${lightningstor_download_mibps}" "${lightningstor_object_mib}" \
"${lightningstor_small_object_count}" "${lightningstor_small_object_mib}" "${lightningstor_small_upload_mibps}" "${lightningstor_small_download_mibps}" "${lightningstor_small_ops}" \
"${lightningstor_parallel_small_upload_mibps}" "${lightningstor_parallel_small_download_mibps}" "${lightningstor_parallel_small_ops}" \
"${plasmavmc_image_artifact_mib}" "${plasmavmc_image_virtual_mib}" "${plasmavmc_image_import_sec}" "${plasmavmc_cold_clone_sec}" "${plasmavmc_warm_clone_sec}" \
"${plasmavmc_runtime_attach_sec}" "${plasmavmc_runtime_ready_sec}" "${plasmavmc_runtime_seq_write_mibps}" "${plasmavmc_runtime_seq_read_mibps}" "${plasmavmc_runtime_randread_iops}"
log "Storage benchmark report written to ${REPO_ROOT}/docs/storage-benchmarks.md"
}
# Control-plane fault injection: stop node02 and verify quorum-backed services
# keep working on the surviving control nodes, then restart node02 and wait
# for it to rejoin.
#
# While node02 is down, the function checks:
#   - chainfire KV over HTTP: a write must be accepted by some survivor and
#     become readable on both node01 and node03 (remote heredoc script),
#   - flaredb strong consistency: gRPC CompareAndSwap then Get against the
#     surviving writer/leader (remote heredoc script),
#   - IAM: issue a project-admin token through either tunnel and validate it.
#
# A RETURN trap restarts node02 and tears down both SSH tunnels on any early
# failure; it is cleared (trap - RETURN) once the normal recovery path has
# restarted node02 and re-verified its units.
validate_control_plane_fault_injection() {
log "Injecting control-plane failure: stopping node02 and validating quorum behavior"
local iam_tunnel="" iam_tunnel_alt=""
# Two IAM tunnels (node01 and node03) so token issuance can fall back if one
# surviving node is not serving.
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
iam_tunnel_alt="$(start_ssh_tunnel node03 15083 50080)"
local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
# Cleanup/recovery on any exit from this function, including failures.
trap 'start_vm node02 >/dev/null 2>&1 || true; wait_for_ssh node02 || true; stop_ssh_tunnel node03 "${iam_tunnel_alt}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
stop_vm node02
wait_for_ssh_down node02 90
# Phase 1: chainfire must accept a quorum write on a survivor and converge.
ssh_node_script node01 <<'EOS'
set -euo pipefail
key="fault-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.13)
writer=""
deadline=$((SECONDS + 60))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
code="$(curl -sS -o /tmp/chainfire-fault.out -w '%{http_code}' \
-X PUT "http://${ip}:8081/api/v1/kv/${key}" \
-H 'Content-Type: application/json' \
-d "{\"value\":\"${value}\"}" || true)"
if [[ "${code}" == "200" ]]; then
writer="${ip}"
break
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
for ip in "${nodes[@]}"; do
deadline=$((SECONDS + 60))
while true; do
actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
if [[ "${actual}" == "${value}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum write did not converge on ${ip}" >&2
exit 1
fi
sleep 1
done
done
EOS
# Phase 2: flaredb strong CAS/Get on the surviving leader via grpcurl.
ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="fault-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"fault"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"fault"}')"
writer=""
deadline=$((SECONDS + 90))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${request}" \
"${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-fault-cas.out 2>/dev/null; then
if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-fault-cas.out >/dev/null; then
writer="${ip}"
break
fi
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "flaredb quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
deadline=$((SECONDS + 90))
while true; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${get_request}" \
"${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-fault-get.out 2>/dev/null; then
if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-fault-get.out >/dev/null; then
break
fi
fi
if (( SECONDS >= deadline )); then
echo "flaredb strong quorum write did not remain readable on leader ${writer}" >&2
exit 1
fi
sleep 1
done
EOS
# Phase 3: IAM must still issue and validate tokens with one node down.
local org_id="fault-iam-org"
local project_id="fault-iam-project"
local principal_id="fault-iam-$(date +%s)"
local token iam_fault_port
# The helper reports which tunnel port succeeded plus the issued token.
read -r iam_fault_port token < <(issue_project_admin_token_any "${org_id}" "${project_id}" "${principal_id}" 15080 15083)
grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "$(jq -cn --arg token "${token}" '{token:$token}')" \
127.0.0.1:"${iam_fault_port}" iam.v1.IamToken/ValidateToken \
| jq -e '.valid == true' >/dev/null
# Recovery: bring node02 back and wait for its services to rejoin.
start_vm node02
wait_for_ssh node02
wait_for_unit node02 chainfire
wait_for_unit node02 flaredb
wait_for_unit node02 iam
wait_for_flaredb_region node02
wait_for_flaredb_route_metadata node01
# Normal path reached: drop the recovery trap and close tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node03 "${iam_tunnel_alt}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
# Worker fault injection: stop node04 and verify the data plane keeps serving
# from the surviving worker (node05), then restart node04 and wait for it to
# re-register.
#
# While node04 is down, the function checks:
#   - node05 health endpoint and LightningStor port remain reachable,
#   - PlasmaVMC still reports node05 in NODE_STATE_READY,
#   - a full LightningStor object round-trip (put/head/get/delete) works
#     under a freshly issued project-admin token.
#
# RETURN traps restart node04, remove temp files, and tear down all three
# SSH tunnels on any early failure; they are cleared once the normal
# recovery path has completed.
validate_worker_fault_injection() {
  log "Injecting worker failure: stopping node04 and validating degraded worker operation"
  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  trap 'start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  stop_vm node04
  wait_for_ssh_down node04 90
  wait_for_http node05 http://127.0.0.1:8084/health
  wait_for_tcp_port node05 50086
  # The scheduler must still consider the surviving worker READY.
  grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:15082 plasmavmc.v1.NodeService/ListNodes \
    | jq -e '([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null' >/dev/null
  local org_id="worker-fault-org"
  local project_id="worker-fault-project"
  local principal_id="worker-fault-$(date +%s)"
  local token bucket key tmpfile
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  bucket="worker-fault-$(date +%s)"
  key="survive-${RANDOM}.txt"
  ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"
  tmpfile="$(mktemp)"
  # Re-arm the trap so it also cleans up BOTH temp files. The previous
  # version removed only "${tmpfile}", leaking "${tmpfile}.downloaded" when
  # a failure occurred after the download but before the explicit rm below.
  trap 'rm -f "${tmpfile}" "${tmpfile}.downloaded"; start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  printf 'worker-fault-check-%s\n' "${RANDOM}" >"${tmpfile}"
  # Put/head/get/delete round-trip through the LightningStor gRPC tunnel.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn \
      --arg bucket "${bucket}" \
      --arg key "${key}" \
      --arg body "$(base64 -w0 "${tmpfile}")" \
      '{bucket:$bucket, key:$key, body:$body, metadata:{contentType:"text/plain"}}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null
  download_lightningstor_object_to_file 15086 "${token}" "${bucket}" "${key}" "${tmpfile}.downloaded"
  # Byte-for-byte verification of the uploaded object.
  cmp -s "${tmpfile}" "${tmpfile}.downloaded"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject >/dev/null
  rm -f "${tmpfile}" "${tmpfile}.downloaded"
  # Recovery: bring node04 back and wait until it is serving and registered.
  start_vm node04
  wait_for_ssh node04
  wait_for_unit node04 plasmavmc
  wait_for_unit node04 lightningstor
  wait_for_http node04 http://127.0.0.1:8084/health
  wait_for_tcp_port node04 50086
  wait_for_plasmavmc_workers_registered 15082
  # Normal path reached: drop the recovery trap and close tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${vm_tunnel}"
  stop_ssh_tunnel node01 "${ls_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
# Run both fault-injection scenarios in order: control-plane node loss
# (node02) first, then worker node loss (node04).
validate_fault_injection() {
validate_control_plane_fault_injection
validate_worker_fault_injection
}
# Full-cluster smoke validation. Runs every flow validation in sequence;
# under `set -e` the first failing check aborts the run. Ordering matters:
# preflight/wait and base units come before the dependent service flows,
# and fault injection runs near the end so the earlier checks observe a
# healthy, undisturbed cluster.
validate_cluster() {
preflight
wait_requested
validate_units
validate_control_plane
validate_iam_flow
validate_prismnet_flow
validate_flashdns_flow
validate_fiberlb_flow
validate_workers
validate_lightningstor_distributed_storage
validate_vm_storage_flow
validate_k8shost_flow
validate_gateway
validate_nightlight_flow
validate_creditservice_flow
validate_deployer_flow
validate_fault_injection
validate_nested_kvm_workers
validate_native_runtime_flow
log "Cluster validation succeeded"
}
# Storage-lab variant of the full validation: waits only for the storage
# nodes (STORAGE_NODES) and runs only the storage-relevant checks
# (workers, LightningStor, VM storage, nested KVM).
validate_storage_cluster() {
preflight
wait_requested "${STORAGE_NODES[@]}"
validate_storage_units
validate_storage_control_plane
validate_workers
validate_lightningstor_distributed_storage
validate_vm_storage_flow
validate_nested_kvm_workers
log "Storage cluster validation succeeded"
}
# `smoke` command: start (build if needed, boot, wait for SSH) the requested
# nodes, then run the full cluster validation.
smoke_requested() {
start_requested "$@"
validate_cluster
}
# `fresh-smoke` command: clean local runtime state first, then run the
# normal smoke flow for the same nodes.
fresh_smoke_requested() {
clean_requested "$@"
smoke_requested "$@"
}
# `storage-smoke` command: start only the storage-lab nodes under the
# "storage" build profile and run the storage-focused validation.
storage_smoke_requested() {
BUILD_PROFILE="storage"
start_requested "${STORAGE_NODES[@]}"
validate_storage_cluster
}
# `fresh-storage-smoke` command: clean the storage-lab nodes' runtime state
# under the "storage" profile, then run the storage smoke flow.
fresh_storage_smoke_requested() {
BUILD_PROFILE="storage"
clean_requested "${STORAGE_NODES[@]}"
storage_smoke_requested
}
# `matrix` command: start the requested nodes, then validate composed
# service configurations against the running VMs.
matrix_requested() {
start_requested "$@"
validate_component_matrix
}
# `fresh-matrix` command: clean local runtime state first, then run the
# normal matrix flow for the same nodes.
fresh_matrix_requested() {
clean_requested "$@"
matrix_requested "$@"
}
# `bench-storage` command: start the requested nodes, verify units, then run
# the storage benchmark.
bench_storage_requested() {
# Defensive re-default. STORAGE_BENCHMARK_COMMAND already defaults to
# "bench-storage" at the top of the script, so this only has an effect if
# the variable was explicitly set to an empty string (e.g. via
# PHOTON_VM_STORAGE_BENCH_COMMAND="").
STORAGE_BENCHMARK_COMMAND="${STORAGE_BENCHMARK_COMMAND:-bench-storage}"
start_requested "$@"
validate_units
benchmark_storage
}
# `fresh-bench-storage` command: record the invoking command name (presumably
# surfaced by the benchmark report — confirm against the report writer),
# clean local runtime state, then run the bench-storage flow.
fresh_bench_storage_requested() {
STORAGE_BENCHMARK_COMMAND="fresh-bench-storage"
clean_requested "$@"
bench_storage_requested "$@"
}
# `storage-bench` command: start the storage lab under the "storage" build
# profile, validate its units and control plane, then benchmark. Pins node03
# as the LightningStor benchmark client for this profile.
storage_bench_requested() {
LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
BUILD_PROFILE="storage"
start_requested "${STORAGE_NODES[@]}"
validate_storage_units
validate_storage_control_plane
benchmark_storage
}
# `fresh-storage-bench` command: record the invoking command name, clean the
# storage-lab nodes under the "storage" profile, then run storage-bench.
fresh_storage_bench_requested() {
STORAGE_BENCHMARK_COMMAND="fresh-storage-bench"
LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
BUILD_PROFILE="storage"
clean_requested "${STORAGE_NODES[@]}"
storage_bench_requested
}
status_requested() {
  # Print a RUNNING/STOPPED status line for each requested node, or for
  # every node when no arguments are given.
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"
  printf 'PhotonCloud test cluster status\n'
  printf '===============================\n'
  local target pidfile_path
  for target in "${targets[@]}"; do
    pidfile_path="$(pid_file "${target}")"
    if ! is_running "${target}"; then
      printf '%s: STOPPED\n' "${target}"
      continue
    fi
    # Running: report the VM pid, its SSH forward port, and runtime dir.
    printf '%s: RUNNING (pid=%s, ssh=%s, runtime=%s)\n' \
      "${target}" "$(<"${pidfile_path}")" "$(ssh_port_for_node "${target}")" "$(runtime_dir "${target}")"
  done
}
stop_requested() {
  # Stop VMs under the cluster lock. With no node arguments, stop every
  # node across all build profiles; otherwise stop only the named nodes
  # in the current profile.
  acquire_cluster_lock
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"
  if (( $# == 0 )); then
    stop_nodes_all_profiles "${targets[@]}"
  else
    stop_nodes_current_profile "${targets[@]}"
  fi
}
clean_requested() {
  # Stop VMs and delete their on-disk runtime state. With no node arguments
  # the whole state tree (all profiles) is removed; with arguments only the
  # named nodes' runtime directories and build links are cleared.
  acquire_cluster_lock
  stop_requested "$@"
  if (( $# == 0 )); then
    remove_runtime_state_all_profiles
    return
  fi
  local target
  for target in "$@"; do
    log "Removing runtime state for ${target}"
    # Empty the runtime dir, then drop it; tolerate missing dirs and races.
    find "$(runtime_dir "${target}")" -mindepth 1 -delete 2>/dev/null || true
    rmdir "$(runtime_dir "${target}")" 2>/dev/null || true
    rm -f "$(build_link "${target}")"
  done
}
ssh_requested() {
  # Open an interactive root SSH session to a node (default: node01),
  # replacing this process with the ssh client.
  local target="${1:-node01}"
  validate_nodes_exist "${target}"
  local forward_port
  forward_port="$(ssh_port_for_node "${target}")"
  # Password auth via sshpass against the VM's forwarded SSH port.
  exec sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${forward_port}" root@127.0.0.1
}
logs_requested() {
  # Show the tail of a node's VM log.
  # $1 - node name (default: node01); $2 - number of lines (default: 120).
  local target="${1:-node01}" line_count="${2:-120}"
  validate_nodes_exist "${target}"
  tail -n "${line_count}" "$(log_file "${target}")"
}
# Print CLI help text to stdout. The unquoted USAGE delimiter means $0 is
# expanded at print time, so the examples show the actual invocation path.
usage() {
cat <<USAGE
PhotonCloud VM test cluster
Usage: $0 <command> [nodes...]
Commands:
build Build one or more VM derivations
start Build if needed, start VMs, and wait for SSH
wait Wait for SSH on running VMs
validate Run the cluster smoke validation
smoke start + validate
fresh-smoke clean local runtime state, rebuild on the host, start, and validate
storage-smoke start the storage lab (node01-05) and validate CoronaFS/LightningStor/PlasmaVMC
fresh-storage-smoke clean local runtime state, rebuild node01-05 on the host, start, and validate the storage lab
matrix Start the cluster and validate composed service configurations against the current running VMs
fresh-matrix clean local runtime state, rebuild on the host, start, and validate composed service configurations
bench-storage start the cluster and benchmark CoronaFS plus LightningStor against the current running VMs
fresh-bench-storage clean local runtime state, rebuild on the host, start, and benchmark CoronaFS plus LightningStor
storage-bench start the storage lab (node01-05) and benchmark CoronaFS plus LightningStor
fresh-storage-bench clean local runtime state, rebuild node01-05 on the host, start, and benchmark the storage lab
stop Stop one or more VMs
status Show VM process status
ssh SSH to a node (default: node01)
logs Show VM log for a node (default: node01)
clean Stop VMs and remove local runtime state
help Show this help
Examples:
$0 smoke
$0 fresh-smoke
$0 storage-smoke
$0 fresh-storage-smoke
$0 matrix
$0 fresh-matrix
$0 bench-storage
$0 fresh-bench-storage
$0 storage-bench
$0 fresh-storage-bench
$0 start node01 node02 node03
$0 validate
$0 ssh node04
USAGE
}
# CLI dispatcher: the first argument selects the command (default: help);
# remaining arguments are forwarded to the handler (typically node names).
main() {
local cmd="${1:-help}"
# `shift` fails when invoked with zero arguments; tolerate that case.
shift || true
case "${cmd}" in
build) build_requested "$@" ;;
start) start_requested "$@" ;;
wait) wait_requested "$@" ;;
validate) validate_cluster ;;
smoke) smoke_requested "$@" ;;
fresh-smoke) fresh_smoke_requested "$@" ;;
storage-smoke) storage_smoke_requested ;;
fresh-storage-smoke) fresh_storage_smoke_requested ;;
matrix) matrix_requested "$@" ;;
fresh-matrix) fresh_matrix_requested "$@" ;;
bench-storage) bench_storage_requested "$@" ;;
fresh-bench-storage) fresh_bench_storage_requested "$@" ;;
storage-bench) storage_bench_requested ;;
fresh-storage-bench) fresh_storage_bench_requested ;;
stop) stop_requested "$@" ;;
status) status_requested "$@" ;;
ssh) ssh_requested "$@" ;;
logs) logs_requested "$@" ;;
clean) clean_requested "$@" ;;
help|--help|-h) usage ;;
*) die "unknown command: ${cmd}" ;;
esac
}
main "$@"