photoncloud-monorepo/nix/test-cluster/run-cluster.sh

6339 lines
230 KiB
Bash
Executable file

#!/usr/bin/env bash
# PhotonCloud VM test-cluster harness
#
# Commands:
# build Build one or more VM derivations
# start Build if needed, start VMs, and wait for SSH
# wait Wait for SSH on running VMs
# validate Run multi-node smoke validation, including nested KVM on workers
# smoke start + validate
# fresh-smoke clean + host-build + start + validate
# fresh-matrix clean + host-build + start + composed-configuration validation
# fresh-bench-storage clean + host-build + start + storage benchmark
# stop Stop running VMs
# status Show VM process status
# ssh Open an interactive SSH session to a node
# logs Show the VM log for a node
# clean Stop VMs and remove local runtime state
#
# Examples:
# ./run-cluster.sh smoke
# ./run-cluster.sh start node01 node02 node03
# ./run-cluster.sh validate
set -euo pipefail
# --- Host paths. All PHOTON_* environment variables are optional overrides.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CLUSTER_DIR="${SCRIPT_DIR}"
CLUSTER_FLAKE_REF="${PHOTON_CLUSTER_FLAKE:-${CLUSTER_DIR}}"
VM_DIR_BASE="${PHOTON_VM_DIR:-${HOME}/.photoncloud-test-cluster}"
VDE_SWITCH_DIR_BASE="${PHOTON_CLUSTER_VDE_SWITCH_DIR:-/tmp/photoncloud-test-cluster-vde.sock}"
CORONAFS_API_PORT="${PHOTON_CORONAFS_API_PORT:-50088}"
CORONAFS_VOLUME_ROOT="/var/lib/coronafs/volumes"
SSH_PASSWORD="${PHOTON_VM_ROOT_PASSWORD:-test}"
# --- Timeouts (seconds) used by the various wait loops below.
SSH_CONNECT_TIMEOUT="${PHOTON_VM_SSH_CONNECT_TIMEOUT:-5}"
SSH_WAIT_TIMEOUT="${PHOTON_VM_SSH_WAIT_TIMEOUT:-300}"
UNIT_WAIT_TIMEOUT="${PHOTON_VM_UNIT_WAIT_TIMEOUT:-240}"
HTTP_WAIT_TIMEOUT="${PHOTON_VM_HTTP_WAIT_TIMEOUT:-180}"
KVM_WAIT_TIMEOUT="${PHOTON_VM_KVM_WAIT_TIMEOUT:-180}"
FLAREDB_WAIT_TIMEOUT="${PHOTON_VM_FLAREDB_WAIT_TIMEOUT:-180}"
GRPCURL_MAX_MSG_SIZE="${PHOTON_VM_GRPCURL_MAX_MSG_SIZE:-1073741824}"
GRPCURL_TIMEOUT_SECS="${PHOTON_VM_GRPCURL_TIMEOUT_SECS:-30}"
TUNNEL_WAIT_TIMEOUT="${PHOTON_VM_TUNNEL_WAIT_TIMEOUT:-30}"
# --- Benchmark / build knobs.
STORAGE_BENCHMARK_COMMAND="${PHOTON_VM_STORAGE_BENCH_COMMAND:-bench-storage}"
LIGHTNINGSTOR_BENCH_CLIENT_NODE="${PHOTON_VM_LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node06}"
STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_IMAGE_BENCH:-0}"
STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH:-0}"
CLUSTER_NIX_MAX_JOBS="${PHOTON_CLUSTER_NIX_MAX_JOBS:-2}"
CLUSTER_NIX_BUILD_CORES="${PHOTON_CLUSTER_NIX_BUILD_CORES:-4}"
BUILD_PROFILE="${PHOTON_CLUSTER_BUILD_PROFILE:-default}"
CLUSTER_SKIP_BUILD="${PHOTON_CLUSTER_SKIP_BUILD:-0}"
# Set to 1 while this process holds the mkdir-based cluster lock.
CLUSTER_LOCK_HELD=0
# --- Cluster topology.
NODES=(node01 node02 node03 node04 node05 node06)
STORAGE_NODES=(node01 node02 node03 node04 node05)
# --- Proto file locations used by grpcurl-based validation.
IAM_PROTO_DIR="${REPO_ROOT}/iam/proto"
IAM_PROTO="${IAM_PROTO_DIR}/iam.proto"
PRISMNET_PROTO_DIR="${REPO_ROOT}/prismnet/crates/prismnet-api/proto"
PRISMNET_PROTO="${PRISMNET_PROTO_DIR}/prismnet.proto"
FLASHDNS_PROTO_DIR="${REPO_ROOT}/flashdns/crates/flashdns-api/proto"
FLASHDNS_PROTO="${FLASHDNS_PROTO_DIR}/flashdns.proto"
FIBERLB_PROTO_DIR="${REPO_ROOT}/fiberlb/crates/fiberlb-api/proto"
FIBERLB_PROTO="${FIBERLB_PROTO_DIR}/fiberlb.proto"
K8SHOST_PROTO_DIR="${REPO_ROOT}/k8shost/crates/k8shost-proto/proto"
K8SHOST_PROTO="${K8SHOST_PROTO_DIR}/k8s.proto"
CREDITSERVICE_PROTO_DIR="${REPO_ROOT}/creditservice/proto"
CREDITSERVICE_PROTO="${CREDITSERVICE_PROTO_DIR}/creditservice.proto"
LIGHTNINGSTOR_PROTO_DIR="${REPO_ROOT}/lightningstor/crates/lightningstor-api/proto"
LIGHTNINGSTOR_PROTO="${LIGHTNINGSTOR_PROTO_DIR}/lightningstor.proto"
PLASMAVMC_PROTO_DIR="${REPO_ROOT}/plasmavmc/proto"
PLASMAVMC_PROTO="${PLASMAVMC_PROTO_DIR}/plasmavmc.proto"
FLAREDB_PROTO_DIR="${REPO_ROOT}/flaredb/crates/flaredb-proto/src"
FLAREDB_PROTO="${FLAREDB_PROTO_DIR}/kvrpc.proto"
# Boot ordering: nodes inside one phase start together; phases run in sequence.
# shellcheck disable=SC2034
NODE_PHASES=(
"node01 node02 node03"
"node04 node05"
"node06"
)
# Host port -> guest :22 SSH forwards (default profile).
declare -A SSH_PORTS=(
[node01]=2201
[node02]=2202
[node03]=2203
[node04]=2204
[node05]=2205
[node06]=2206
)
# Alternate SSH forwards used when BUILD_PROFILE=storage.
declare -A STORAGE_SSH_PORTS=(
[node01]=2301
[node02]=2302
[node03]=2303
[node04]=2304
[node05]=2305
)
# Addresses of each node on the VDE-backed cluster network.
declare -A NODE_IPS=(
[node01]=10.100.0.11
[node02]=10.100.0.12
[node03]=10.100.0.13
[node04]=10.100.0.21
[node05]=10.100.0.22
[node06]=10.100.0.100
)
# systemd units expected on each node during validation.
declare -A NODE_UNITS=(
[node01]="chainfire flaredb iam prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost"
[node02]="chainfire flaredb iam"
[node03]="chainfire flaredb iam"
[node04]="plasmavmc lightningstor node-agent"
[node05]="plasmavmc lightningstor node-agent"
[node06]="apigateway nightlight creditservice deployer fleet-scheduler"
)
# Flake targets for the storage-profile variant of each storage node.
declare -A STORAGE_BUILD_TARGETS=(
[node01]=storage-node01
[node02]=storage-node02
[node03]=storage-node03
[node04]=storage-node04
[node05]=storage-node05
)
# Non-interactive password-auth SSH options for throwaway test VMs.
SSH_OPTS=(
-o StrictHostKeyChecking=no
-o UserKnownHostsFile=/dev/null
-o LogLevel=ERROR
-o ConnectTimeout="${SSH_CONNECT_TIMEOUT}"
-o PreferredAuthentications=password
-o PubkeyAuthentication=no
-o KbdInteractiveAuthentication=no
)
log() {
  # Timestamped progress line on stderr so stdout stays machine-readable.
  local stamp
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  printf '[%s] %s\n' "${stamp}" "$*" >&2
}
die() {
  # Fatal: log an ERROR line, then terminate the script with status 1.
  log "ERROR: $*"
  exit 1
}
warn() {
  # Non-fatal warning, prefixed so it stands out in the log stream.
  log "WARN: $*"
}
run_deployer_ctl() {
# Invoke the repo's deployer-ctl binary via `nix run`, passing all
# arguments through. RUST_LOG defaults to "warn" unless the caller set it.
RUST_LOG="${RUST_LOG:-warn}" \
nix --option warn-dirty false run --quiet \
--extra-experimental-features 'nix-command flakes' \
"${REPO_ROOT}#deployer-ctl" -- "$@"
}
release_cluster_lock() {
# Release the mkdir(1)-based cluster lock if this process holds it.
# Safe to call unconditionally (installed as an EXIT trap): the
# CLUSTER_LOCK_HELD flag plus the recorded owner pid prevent deleting a
# lock held by a different run.
local lock_dir
local owner=""
lock_dir="$(cluster_lock_dir)"
if [[ "${CLUSTER_LOCK_HELD}" -ne 1 ]]; then
return 0
fi
if [[ -d "${lock_dir}" ]]; then
if [[ -f "${lock_dir}/pid" ]]; then
owner="$(<"${lock_dir}/pid")"
fi
# Only remove when we can prove ownership (or no owner was recorded).
if [[ -z "${owner}" || "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ]]; then
rm -rf "${lock_dir}"
fi
fi
CLUSTER_LOCK_HELD=0
unset PHOTON_CLUSTER_LOCK_OWNER
}
acquire_cluster_lock() {
# Take the cluster-wide lock using atomic mkdir. Re-entrant: returns
# immediately when already held, adopts the lock when the recorded pid
# matches this process or PHOTON_CLUSTER_LOCK_OWNER, reclaims locks left
# by dead pids, and aborts otherwise.
local lock_dir
local owner=""
lock_dir="$(cluster_lock_dir)"
if [[ "${CLUSTER_LOCK_HELD}" -eq 1 ]]; then
return 0
fi
mkdir -p "$(dirname "${lock_dir}")"
# mkdir is atomic: success means we won the lock.
if mkdir "${lock_dir}" 2>/dev/null; then
printf '%s\n' "$$" >"${lock_dir}/pid"
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="$$"
trap release_cluster_lock EXIT
return 0
fi
if [[ -f "${lock_dir}/pid" ]]; then
owner="$(<"${lock_dir}/pid")"
fi
# Lock already exists: adopt it when it is ours (e.g. a nested re-run
# inheriting PHOTON_CLUSTER_LOCK_OWNER from the parent).
if [[ -n "${owner}" && ( "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ) ]]; then
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="${owner}"
trap release_cluster_lock EXIT
return 0
fi
# Reclaim when the recorded owner process no longer exists.
if [[ -n "${owner}" ]] && ! kill -0 "${owner}" >/dev/null 2>&1; then
warn "reclaiming stale PhotonCloud test-cluster lock from pid ${owner}"
rm -f "${lock_dir}/pid"
rmdir "${lock_dir}" 2>/dev/null || true
if mkdir "${lock_dir}" 2>/dev/null; then
printf '%s\n' "$$" >"${lock_dir}/pid"
CLUSTER_LOCK_HELD=1
export PHOTON_CLUSTER_LOCK_OWNER="$$"
trap release_cluster_lock EXIT
return 0
fi
fi
die "another PhotonCloud test-cluster run is active${owner:+ (pid ${owner})}; lock: ${lock_dir}"
}
lightningstor_data_root() {
  # Map a node name to its in-VM LightningStor data directory;
  # any other node has no mapping and is a fatal error.
  local node="$1"
  case "${node}" in
    node01)
      printf '%s\n' /var/lib/lightningstor/node
      ;;
    node04 | node05)
      printf '%s\n' /var/lib/lightningstor
      ;;
    *)
      die "no LightningStor data root mapping for ${node}"
      ;;
  esac
}
profile_slug() {
  # Sanitize BUILD_PROFILE for use as a filesystem suffix: every
  # character outside [A-Za-z0-9._-] becomes '-', then leading and
  # trailing dashes are trimmed. Falls back to "default" when nothing
  # usable remains.
  local slug
  slug="$(printf '%s' "${BUILD_PROFILE}" | tr -c 'A-Za-z0-9._-' '-')"
  # Bug fix: ${slug##-} / ${slug%%-} only ever strip a SINGLE dash, so a
  # profile like "!!foo" produced "-foo". Loop until all edge dashes are
  # gone.
  while [[ "${slug}" == -* ]]; do slug="${slug#-}"; done
  while [[ "${slug}" == *- ]]; do slug="${slug%-}"; done
  if [[ -z "${slug}" ]]; then
    slug="default"
  fi
  printf '%s\n' "${slug}"
}
profile_state_suffix() {
  # Empty string for the default profile, "-<slug>" for everything else;
  # appended to state-directory paths so profiles do not collide.
  local slug
  slug="$(profile_slug)"
  case "${slug}" in
    default) printf '\n' ;;
    *) printf -- '-%s\n' "${slug}" ;;
  esac
}
vm_dir() {
  # Per-profile runtime/state directory on the host.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s\n' "${VM_DIR_BASE}" "${suffix}"
}
cluster_lock_dir() {
  # Directory used as a mkdir-based mutex for this profile.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s.lock\n' "${VM_DIR_BASE}" "${suffix}"
}
vde_switch_dir() {
  # Per-profile VDE switch control-socket directory.
  local suffix
  suffix="$(profile_state_suffix)"
  printf '%s%s\n' "${VDE_SWITCH_DIR_BASE}" "${suffix}"
}
vde_switch_pid_file() {
  # PID file of the daemonized VDE switch, kept in the state directory.
  printf '%s/vde-switch.pid\n' "$(vm_dir)"
}
all_build_profiles() {
  # Emit one profile per line: always "default" and "storage", plus the
  # active BUILD_PROFILE, skipping empties and de-duplicating in order.
  local -a emitted=()
  local candidate known
  for candidate in default storage "${BUILD_PROFILE}"; do
    [[ -n "${candidate}" ]] || continue
    for known in "${emitted[@]}"; do
      if [[ "${known}" == "${candidate}" ]]; then
        continue 2
      fi
    done
    emitted+=("${candidate}")
    printf '%s\n' "${candidate}"
  done
}
with_build_profile() {
  # Run "$@" with BUILD_PROFILE temporarily set to $1, always restoring
  # the previous profile and propagating the command's exit status.
  local next_profile="$1"
  local prev_profile="${BUILD_PROFILE}"
  shift
  local rc=0
  BUILD_PROFILE="${next_profile}"
  # Bug fix: the script runs under `set -e`, so a bare `"$@"` that fails
  # would exit before BUILD_PROFILE is restored. `|| rc=$?` suppresses
  # errexit for this command while still capturing its status.
  "$@" || rc=$?
  BUILD_PROFILE="${prev_profile}"
  return "${rc}"
}
lightningstor_data_file_count() {
# Count finished (non-*.tmp) data files under the node's LightningStor
# root, retrying over SSH until success or SSH_WAIT_TIMEOUT elapses.
# Prints the count on stdout.
local node="$1"
local root
root="$(lightningstor_data_root "${node}")"
local deadline=$((SECONDS + SSH_WAIT_TIMEOUT))
local output=""
while true; do
if output="$(ssh_node "${node}" "find ${root} -type f ! -name '*.tmp' | wc -l" 2>/dev/null)"; then
printf '%s\n' "${output}"
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out collecting LightningStor file count from ${node}"
fi
sleep 2
done
}
lightningstor_count_triplet() {
  # Emit a single line: "<node01-count> <node04-count> <node05-count>".
  local c01 c04 c05
  c01="$(lightningstor_data_file_count node01)"
  c04="$(lightningstor_data_file_count node04)"
  c05="$(lightningstor_data_file_count node05)"
  printf '%s %s %s\n' "${c01}" "${c04}" "${c05}"
}
wait_for_lightningstor_counts_greater_than() {
# Poll the three replica nodes until every node's data-file count
# strictly exceeds its "before" baseline, or HTTP_WAIT_TIMEOUT elapses.
# $4 is a human-readable context string for the timeout message.
local before_node01="$1"
local before_node04="$2"
local before_node05="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
local count_node01 count_node04 count_node05
read -r count_node01 count_node04 count_node05 < <(lightningstor_count_triplet)
if (( count_node01 > before_node01 )) && (( count_node04 > before_node04 )) && (( count_node05 > before_node05 )); then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for distributed LightningStor replicas for ${context}"
fi
sleep 2
done
}
wait_for_lightningstor_counts_equal() {
# Poll the three replica nodes until their data-file counts exactly
# match the expected values (e.g. after a delete settles), or
# HTTP_WAIT_TIMEOUT elapses. $4 is a context string for the message.
local expected_node01="$1"
local expected_node04="$2"
local expected_node05="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
local count_node01 count_node04 count_node05
read -r count_node01 count_node04 count_node05 < <(lightningstor_count_triplet)
if (( count_node01 == expected_node01 )) && (( count_node04 == expected_node04 )) && (( count_node05 == expected_node05 )); then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for distributed LightningStor counts to settle for ${context}: expected ${expected_node01}/${expected_node04}/${expected_node05}, got ${count_node01}/${count_node04}/${count_node05}"
fi
sleep 2
done
}
require_cmd() {
  # Abort unless $1 resolves to an executable command on PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}
grpcurl_capture() {
  # Run grpcurl under the configured timeout, capturing stdout+stderr.
  # Echoes the captured output (no trailing newline added) and returns
  # grpcurl's exit status.
  local rc=0
  local captured=""
  captured="$(timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl "$@" 2>&1)" || rc=$?
  printf '%s' "${captured}"
  return "${rc}"
}
build_link() {
  # Symlink path for node $1's built VM derivation (no trailing newline).
  printf '%s/build-%s' "$(vm_dir)" "$1"
}
guest_image_link() {
  # Symlink path for the bootable guest image build output.
  printf '%s/build-vm-guest-image' "$(vm_dir)"
}
guest_bench_image_link() {
  # Symlink path for the benchmark guest image build output.
  printf '%s/build-vm-bench-guest-image' "$(vm_dir)"
}
runtime_dir() {
  # Per-node runtime directory under the profile state dir.
  printf '%s/%s' "$(vm_dir)" "$1"
}
pid_file() {
  # PID file tracking node $1's QEMU process.
  printf '%s/vm.pid' "$(runtime_dir "$1")"
}
log_file() {
  # Console/serial log of node $1's VM.
  printf '%s/vm.log' "$(runtime_dir "$1")"
}
runvm_path() {
  # Locate the generated run-<name>-vm launcher inside node $1's build
  # output; prints the first match (empty when none exists).
  local node="$1"
  local bin_dir
  bin_dir="$(build_link "${node}")/bin"
  find -L "${bin_dir}" -maxdepth 1 -name 'run-*-vm' | head -n1
}
guest_image_path() {
  # Ensure the guest image is built, then print its qcow2 file path.
  local root
  root="$(guest_image_link)"
  build_guest_image
  find -L "${root}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}
guest_bench_image_path() {
  # Ensure the benchmark guest image is built, then print its qcow2 path.
  local root
  root="$(guest_bench_image_link)"
  build_guest_bench_image
  find -L "${root}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}
all_or_requested_nodes() {
  # With no arguments, list every cluster node; otherwise echo the
  # requested names, one per line.
  if (( $# > 0 )); then
    printf '%s\n' "$@"
  else
    printf '%s\n' "${NODES[@]}"
  fi
}
validate_nodes_exist() {
  # Abort on any argument that has no configured SSH forward port.
  local candidate
  for candidate in "$@"; do
    if [[ -z "${SSH_PORTS[${candidate}]:-}" ]]; then
      die "unknown node: ${candidate}"
    fi
  done
}
ssh_port_for_node() {
  # Storage-profile runs use the alternate forward port when one is
  # configured for the node; everything else uses the default table.
  local node="$1"
  local storage_port="${STORAGE_SSH_PORTS[${node}]:-}"
  if [[ "${BUILD_PROFILE}" == "storage" && -n "${storage_port}" ]]; then
    printf '%s\n' "${storage_port}"
  else
    printf '%s\n' "${SSH_PORTS[${node}]}"
  fi
}
host_nested_param_path() {
  # Print the sysfs nested-virtualization parameter path for whichever
  # KVM module (Intel first, then AMD) is loaded; prints nothing and
  # still returns 0 when neither module exposes the parameter.
  local candidate
  for candidate in \
    /sys/module/kvm_intel/parameters/nested \
    /sys/module/kvm_amd/parameters/nested; do
    if [[ -f "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      return 0
    fi
  done
  return 0
}
preflight() {
# Common pre-run checks: take the cluster lock, verify host tooling is
# installed, create the state directory, and sanity-check KVM plus
# nested-virtualization support (warnings only where non-fatal).
acquire_cluster_lock
require_cmd nix
require_cmd qemu-system-x86_64
require_cmd ssh
require_cmd sshpass
require_cmd curl
require_cmd grpcurl
require_cmd vde_switch
mkdir -p "$(vm_dir)"
log "Cluster build profile: ${BUILD_PROFILE} (state dir $(vm_dir))"
[[ -e /dev/kvm ]] || die "/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization"
[[ -r /dev/kvm && -w /dev/kvm ]] || warn "/dev/kvm exists but current user may not have full access"
local nested_path
nested_path="$(host_nested_param_path || true)"
if [[ -n "${nested_path}" ]]; then
log "Host nested virtualization parameter: ${nested_path}=$(<"${nested_path}")"
else
warn "Could not locate host nested virtualization parameter; guest nested-KVM validation may fail"
fi
}
vde_switch_ctl_path() {
  # Control socket created by vde_switch inside its socket directory.
  local sock_dir
  sock_dir="$(vde_switch_dir)"
  printf '%s/ctl\n' "${sock_dir}"
}
vde_switch_running() {
# True when the VDE switch control socket exists.
# NOTE(review): the pid-file branch verifies the daemon is alive but
# then performs the same socket test as the fallthrough, so the pid
# check never changes the result — confirm whether a live-pid-but-no-
# socket (or stale-pid) case was meant to behave differently.
if [[ -f "$(vde_switch_pid_file)" ]] && kill -0 "$(<"$(vde_switch_pid_file)")" 2>/dev/null; then
[[ -S "$(vde_switch_ctl_path)" ]]
return
fi
[[ -S "$(vde_switch_ctl_path)" ]]
}
ensure_vde_switch() {
# Start the shared VDE switch daemon if its control socket is not
# already being served, then wait up to 10s for the socket to appear.
local deadline
local vde_dir
vde_dir="$(vde_switch_dir)"
if vde_switch_running; then
return 0
fi
# Clear any stale socket directory / pid file left by a previous run.
rm -rf "${vde_dir}"
rm -f "$(vde_switch_pid_file)"
log "Starting VDE switch at ${vde_dir}"
vde_switch \
-sock "${vde_dir}" \
-daemon \
-pidfile "$(vde_switch_pid_file)"
deadline=$((SECONDS + 10))
while true; do
if vde_switch_running; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for VDE switch at ${vde_dir}"
fi
sleep 1
done
}
stop_vde_switch() {
# SIGTERM the VDE switch, escalate to SIGKILL after ~10s if it refuses
# to exit, then remove its pid file and socket directory.
local pid=""
local vde_dir
vde_dir="$(vde_switch_dir)"
if [[ -f "$(vde_switch_pid_file)" ]]; then
pid="$(<"$(vde_switch_pid_file)")"
fi
if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
log "Stopping VDE switch (PID ${pid})"
kill "${pid}" || true
for _ in {1..10}; do
if ! kill -0 "${pid}" 2>/dev/null; then
break
fi
sleep 1
done
if kill -0 "${pid}" 2>/dev/null; then
warn "VDE switch did not stop after SIGTERM; sending SIGKILL"
kill -9 "${pid}" || true
fi
fi
rm -f "$(vde_switch_pid_file)"
rm -rf "${vde_dir}"
}
any_vm_running() {
  # Succeed when at least one configured node's VM process is alive.
  local candidate
  for candidate in "${NODES[@]}"; do
    is_running "${candidate}" && return 0
  done
  return 1
}
terminate_pids() {
# SIGTERM the given pids ($1 is a label used only for logging), wait up
# to ~20s for all of them to exit, then SIGKILL any survivors.
# No-op when the pid list is empty.
local context="$1"
shift
local pids=("$@")
local pid
[[ "${#pids[@]}" -gt 0 ]] || return 0
log "Stopping stale ${context}: ${pids[*]}"
kill "${pids[@]}" 2>/dev/null || true
for _ in {1..20}; do
local remaining=0
for pid in "${pids[@]}"; do
if kill -0 "${pid}" 2>/dev/null; then
remaining=1
break
fi
done
if [[ "${remaining}" -eq 0 ]]; then
return 0
fi
sleep 1
done
warn "Force-killing stale ${context}: ${pids[*]}"
kill -9 "${pids[@]}" 2>/dev/null || true
}
stale_vm_pids_for_nodes_current_profile() {
# Print (sorted, unique) pids of processes that look like they belong
# to the given nodes under the current profile. Two detection paths:
# 1) qemu-system command lines mentioning the node's qcow2 path or its
#    SSH hostfwd spec, and 2) whatever is listening on the node's
#    forwarded SSH port (via ss).
local nodes=("$@")
local pid cmd node port runtime_path
declare -A seen=()
while read -r pid cmd; do
[[ -n "${pid:-}" ]] || continue
for node in "${nodes[@]}"; do
port="$(ssh_port_for_node "${node}")"
runtime_path="$(runtime_dir "${node}")/${node}.qcow2"
if [[ "${cmd}" == *"qemu-system"* ]] && {
[[ "${cmd}" == *"file=${runtime_path}"* ]] ||
[[ "${cmd}" == *"hostfwd=tcp::${port}-:22"* ]];
}; then
seen["${pid}"]=1
fi
done
done < <(pgrep -af 'qemu-system[^[:space:]]*|run-.*-vm' || true)
# Also catch listeners bound to each node's SSH forward port.
for node in "${nodes[@]}"; do
port="$(ssh_port_for_node "${node}")"
while read -r pid; do
[[ -n "${pid:-}" ]] || continue
seen["${pid}"]=1
done < <(
ss -H -ltnp "( sport = :${port} )" 2>/dev/null \
| sed -n 's/.*pid=\([0-9]\+\).*/\1/p' \
| sort -u
)
done
printf '%s\n' "${!seen[@]}" | sort -n
}
stop_stale_vm_processes_current_profile() {
  # Collect leftover QEMU pids for the given nodes and terminate them.
  local -a stale=()
  mapfile -t stale < <(stale_vm_pids_for_nodes_current_profile "$@")
  terminate_pids "VM processes" "${stale[@]}"
}
stop_nodes_current_profile() {
  # Stop each requested node, sweep leftover QEMU processes, and shut
  # the VDE switch down once no VM (of any node) remains running.
  local node
  for node in "$@"; do
    stop_vm "${node}"
  done
  stop_stale_vm_processes_current_profile "$@"
  any_vm_running || stop_vde_switch
}
stop_nodes_all_profiles() {
  # Repeat the per-profile stop across every known build profile, since
  # each profile keeps its own state directory and port set.
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" stop_nodes_current_profile "$@"
  done < <(all_build_profiles)
}
remove_runtime_state_current_profile() {
  # Empty the profile's state directory without removing the directory
  # itself; silently a no-op when the directory does not exist.
  local state_dir
  state_dir="$(vm_dir)"
  [[ -d "${state_dir}" ]] || return 0
  log "Removing runtime state under ${state_dir}"
  find "${state_dir}" -mindepth 1 -delete 2>/dev/null || true
}
remove_runtime_state_all_profiles() {
  # Clear runtime state for every known build profile in turn.
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" remove_runtime_state_current_profile
  done < <(all_build_profiles)
}
build_vm() {
# Build one node's NixOS VM derivation and (re)point the build-<node>
# symlink at the resulting store path.
local node="$1"
local target
local out
target="$(build_target_for_node "${node}")"
log "Building ${node} VM derivation (${target})"
# tail -n1 keeps only the final output path if nix prints several.
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM output for ${node}"
ln -sfn "${out}" "$(build_link "${node}")"
}
build_target_for_node() {
  # Storage profile builds dedicated storage-nodeNN flake targets when a
  # mapping exists; otherwise the node name itself is the target.
  local node="$1"
  if [[ "${BUILD_PROFILE}" != "storage" ]]; then
    printf '%s\n' "${node}"
    return 0
  fi
  printf '%s\n' "${STORAGE_BUILD_TARGETS[${node}]:-${node}}"
}
build_vms() {
# Build all requested nodes' VM derivations in a single nix invocation
# (lets Nix share work/scheduling), then symlink each output.
local nodes=("$@")
local targets=()
local outputs=()
local node
local target
local i
for node in "${nodes[@]}"; do
target="$(build_target_for_node "${node}")"
targets+=("${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm")
done
log "Building VM derivations in one Nix invocation: ${nodes[*]}"
mapfile -t outputs < <(
NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${targets[@]}" \
--no-link --print-out-paths
)
# NOTE(review): assumes --print-out-paths emits one path per target in
# the same order as the arguments — confirm against the nix version in
# use; the count check below only catches mismatched totals.
[[ "${#outputs[@]}" -eq "${#nodes[@]}" ]] || die "expected ${#nodes[@]} VM outputs, got ${#outputs[@]}"
for i in "${!nodes[@]}"; do
ln -sfn "${outputs[${i}]}" "$(build_link "${nodes[${i}]}")"
done
}
build_guest_image() {
# Build the bootable guest image flake output on the host and point the
# guest-image symlink at it.
local out
log "Building bootable VM guest image on the host"
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#vmGuestImage" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM guest image output"
ln -sfn "${out}" "$(guest_image_link)"
}
build_guest_bench_image() {
# Build the benchmark guest image flake output on the host and point
# the bench-image symlink at it.
local out
log "Building VM benchmark guest image on the host"
out="$(NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
--max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
--extra-experimental-features 'nix-command flakes' \
"${CLUSTER_FLAKE_REF}#vmBenchGuestImage" \
--no-link --print-out-paths | tail -n1)"
[[ -n "${out}" ]] || die "failed to resolve VM benchmark guest image output"
ln -sfn "${out}" "$(guest_bench_image_link)"
}
build_requested() {
  # Resolve the node list (default: all nodes), validate the names, run
  # host preflight checks, and build everything in one Nix invocation.
  local -a requested=()
  mapfile -t requested < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${requested[@]}"
  preflight
  build_vms "${requested[@]}"
}
is_running() {
  # True when node $1 has a PID file naming a process that is alive.
  local node="$1"
  local pid_path pid
  pid_path="$(pid_file "${node}")"
  [[ -f "${pid_path}" ]] || return 1
  pid="$(<"${pid_path}")"
  kill -0 "${pid}" 2>/dev/null
}
start_vm() {
# Boot one node's VM: ensure the VDE switch is up, build if no build
# symlink exists, refuse to start if the SSH forward port is taken,
# detach QEMU with setsid/nohup, and confirm it survives startup.
local node="$1"
local build_path runvm node_runtime pid_path vm_log ssh_port
ensure_vde_switch
build_path="$(build_link "${node}")"
[[ -L "${build_path}" ]] || build_vm "${node}"
runvm="$(runvm_path "${node}")"
[[ -n "${runvm}" ]] || die "failed to locate run-*-vm for ${node}"
node_runtime="$(runtime_dir "${node}")"
pid_path="$(pid_file "${node}")"
vm_log="$(log_file "${node}")"
mkdir -p "${node_runtime}"
if is_running "${node}"; then
log "${node} already running (PID $(<"${pid_path}"))"
return 0
fi
ssh_port="$(ssh_port_for_node "${node}")"
# Fail fast when something else already listens on the forward port.
if ss -H -ltn "( sport = :${ssh_port} )" | grep -q .; then
warn "port ${ssh_port} is already in use before starting ${node}"
ss -H -ltnp "( sport = :${ssh_port} )" || true
die "SSH forward port ${ssh_port} for ${node} is already in use"
fi
log "Starting ${node}"
(
cd "${node_runtime}"
# setsid + nohup detach the VM from this script's tty and lifetime;
# the backgrounded launcher's pid is recorded as the node's pid.
nohup setsid "${runvm}" </dev/null >"${vm_log}" 2>&1 &
echo $! >"${pid_path}"
)
# Give the launcher a moment to crash on immediate errors.
sleep 2
if ! is_running "${node}"; then
warn "${node} failed to stay running; recent log follows"
tail -n 80 "${vm_log}" || true
die "failed to start ${node}"
fi
}
stop_vm() {
# Stop one node's VM: SIGTERM its recorded pid, wait up to ~20s,
# escalate to SIGKILL, and always remove the pid file afterwards.
local node="$1"
local pid_path pid
pid_path="$(pid_file "${node}")"
if [[ ! -f "${pid_path}" ]]; then
log "${node} is not running"
return 0
fi
pid="$(<"${pid_path}")"
if kill -0 "${pid}" 2>/dev/null; then
log "Stopping ${node} (PID ${pid})"
kill "${pid}" || true
for _ in {1..20}; do
if ! kill -0 "${pid}" 2>/dev/null; then
break
fi
sleep 1
done
if kill -0 "${pid}" 2>/dev/null; then
warn "${node} did not stop after SIGTERM; sending SIGKILL"
kill -9 "${pid}" || true
fi
fi
rm -f "${pid_path}"
}
ssh_node() {
  # Run a command (or open a shell) on node $1 via its forwarded port,
  # authenticating with the shared root password through sshpass.
  local node="$1"
  shift
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${port}" root@127.0.0.1 "$@"
}
ssh_node_script() {
  # Feed a script to `bash -se` on node $1 (stdin supplies the script;
  # remaining args become the script's positional parameters).
  local node="$1"
  shift
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${port}" root@127.0.0.1 bash -se -- "$@"
}
scp_to_node() {
  # Copy a local file ($2) to node $1 at remote path $3.
  local node="$1"
  local local_path="$2"
  local remote_path="$3"
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    scp "${SSH_OPTS[@]}" -P "${port}" "${local_path}" "root@127.0.0.1:${remote_path}"
}
start_ssh_tunnel() {
# Establish a local port-forward to node $1 using an SSH ControlMaster,
# retrying until TUNNEL_WAIT_TIMEOUT. On success prints the control
# socket path (handed later to stop_ssh_tunnel).
#   $1 node, $2 local port, $3 remote port, $4 remote host (default lo)
local node="$1"
local local_port="$2"
local remote_port="$3"
local remote_host="${4:-127.0.0.1}"
local control_socket
control_socket="$(vm_dir)/tunnel-${node}-${local_port}.ctl"
local deadline
local attempt_deadline
local ssh_port
ssh_port="$(ssh_port_for_node "${node}")"
# Tear down any leftover master using the same control socket.
if [[ -e "${control_socket}" ]]; then
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
fi
# If the local port is occupied, try to evict a matching stale forward
# before giving up.
if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
pkill -f -- "ssh .* -L ${local_port}:${remote_host}:${remote_port} " >/dev/null 2>&1 || true
for _ in {1..10}; do
if ! ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
break
fi
sleep 1
done
if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
die "local tunnel port ${local_port} is already in use"
fi
fi
deadline=$((SECONDS + TUNNEL_WAIT_TIMEOUT))
while true; do
# Start a backgrounded master (-M -f -N) with the forward; failures
# are tolerated here and detected by the -O check probe below.
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-o ExitOnForwardFailure=yes \
-S "${control_socket}" \
-M -f -N \
-L "${local_port}:${remote_host}:${remote_port}" \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
# Give this attempt up to 10s to answer a ControlMaster health check.
attempt_deadline=$((SECONDS + 10))
while true; do
if sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O check \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1; then
printf '%s\n' "${control_socket}"
return 0
fi
if (( SECONDS >= attempt_deadline )); then
break
fi
sleep 1
done
# Attempt failed: clean up the half-started master and retry.
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
if (( SECONDS >= deadline )); then
warn "failed to establish ssh tunnel for ${node}:${remote_port} on local port ${local_port}"
ss -H -ltnp "( sport = :${local_port} )" || true
ps -ef | grep -F -- "-L ${local_port}:${remote_host}:${remote_port}" | grep -v grep || true
die "ssh tunnel for ${node}:${remote_host}:${remote_port} did not bind local port ${local_port}"
fi
sleep 1
done
}
stop_ssh_tunnel() {
# Tear down a ControlMaster tunnel created by start_ssh_tunnel and
# remove its control socket. Missing/empty socket path is a no-op.
local node="$1"
local control_socket="$2"
local ssh_port
ssh_port="$(ssh_port_for_node "${node}")"
[[ -n "${control_socket}" ]] || return 0
if [[ -e "${control_socket}" ]]; then
sshpass -p "${SSH_PASSWORD}" \
ssh "${SSH_OPTS[@]}" \
-S "${control_socket}" \
-O exit \
-p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
rm -f "${control_socket}"
fi
}
issue_project_admin_token() {
# Provision a service-account principal with the ProjectAdmin role on
# IAM (reachable at 127.0.0.1:$1) and print a bearer token for it.
# Each step (create principal, create binding, issue token) is retried
# until the shared deadline; "already exists" responses are success.
local iam_port="$1"
local org_id="$2"
local project_id="$3"
local principal_id="$4"
local create_principal_json create_binding_json issue_token_json token deadline output
create_principal_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
)"
# Extra 180s on top of HTTP_WAIT_TIMEOUT: IAM may still be settling.
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT + 180))
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_principal_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
)" && break
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM principal ${principal_id}: ${output}"
fi
sleep 2
done
create_binding_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
)"
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_binding_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreateBinding 2>&1
)" && break
if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM binding for ${principal_id}: ${output}"
fi
sleep 2
done
issue_token_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
)"
while true; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${issue_token_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamToken/IssueToken 2>&1
)" && {
# A successful RPC that yields no .token field still counts as a
# retryable failure.
token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
if [[ -n "${token}" ]]; then
break
fi
}
if (( SECONDS >= deadline )); then
die "timed out issuing IAM token for ${principal_id}: ${output}"
fi
sleep 2
done
# Block until the binding is actually effective before handing the
# token to callers.
wait_for_project_admin_authorization "${iam_port}" "${org_id}" "${project_id}" "${principal_id}"
printf '%s\n' "${token}"
}
issue_project_admin_token_any() {
# Same provisioning flow as issue_project_admin_token, but tries a list
# of candidate IAM ports (args after the first three) and uses
# whichever responds at each step. Prints "<port>\t<token>" where
# <port> is the one that passed the final authorization probe.
local org_id="$1"
local project_id="$2"
local principal_id="$3"
shift 3
local ports=("$@")
local create_principal_json create_binding_json issue_token_json token deadline output
local selected_port="" create_port="" binding_port="" issue_port="" port
create_principal_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
)"
create_binding_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
)"
issue_token_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
)"
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
# Step 1: create the principal on whichever port answers first.
while [[ -z "${create_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_principal_json}" \
127.0.0.1:"${port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
)" && {
create_port="${port}"
break
}
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
create_port="${port}"
break
fi
done
if [[ -n "${create_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM principal ${principal_id}: ${output}"
fi
sleep 2
done
# Step 2: bind ProjectAdmin, again accepting any responsive port.
while [[ -z "${binding_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${create_binding_json}" \
127.0.0.1:"${port}" iam.v1.IamAdmin/CreateBinding 2>&1
)" && {
binding_port="${port}"
break
}
if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
binding_port="${port}"
break
fi
done
if [[ -n "${binding_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out creating IAM binding for ${principal_id}: ${output}"
fi
sleep 2
done
# Step 3: issue the token; only a non-empty .token counts as success.
while [[ -z "${issue_port}" ]]; do
for port in "${ports[@]}"; do
output="$(
timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${issue_token_json}" \
127.0.0.1:"${port}" iam.v1.IamToken/IssueToken 2>&1
)" && {
token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
if [[ -n "${token}" ]]; then
issue_port="${port}"
break
fi
}
done
if [[ -n "${issue_port}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out issuing IAM token for ${principal_id}: ${output}"
fi
sleep 2
done
# Final: report the port on which the binding is provably effective.
selected_port="$(wait_for_project_admin_authorization_any "${org_id}" "${project_id}" "${principal_id}" "${ports[@]}")"
printf '%s\t%s\n' "${selected_port}" "${token}"
}
wait_for_project_admin_authorization() {
# Poll IamAuthz/Authorize with a probe request until the principal's
# ProjectAdmin binding becomes effective (allowed == true), or
# HTTP_WAIT_TIMEOUT elapses.
local iam_port="$1"
local org_id="$2"
local project_id="$3"
local principal_id="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local authorize_json
authorize_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{
principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
action:"storage:buckets:create",
resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
}'
)"
while true; do
if timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${authorize_json}" \
127.0.0.1:"${iam_port}" iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null 2>&1; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
fi
sleep 2
done
}
wait_for_project_admin_authorization_any() {
# Like wait_for_project_admin_authorization, but probes a list of
# candidate ports each round and prints the first port whose Authorize
# response says allowed == true.
local org_id="$1"
local project_id="$2"
local principal_id="$3"
shift 3
local ports=("$@")
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local authorize_json port
authorize_json="$(
jq -cn \
--arg id "${principal_id}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{
principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
action:"storage:buckets:create",
resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
}'
)"
while true; do
for port in "${ports[@]}"; do
if timeout 15 grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "${authorize_json}" \
127.0.0.1:"${port}" iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null 2>&1; then
printf '%s\n' "${port}"
return 0
fi
done
if (( SECONDS >= deadline )); then
die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
fi
sleep 2
done
}
ensure_lightningstor_bucket() {
# Idempotently ensure a LightningStor bucket exists: HeadBucket first,
# then CreateBucket (AlreadyExists counts as success), retrying until
# HTTP_WAIT_TIMEOUT elapses.
local ls_port="$1"
local token="$2"
local bucket="$3"
local org_id="$4"
local project_id="$5"
local head_json create_json
head_json="$(jq -cn --arg bucket "${bucket}" '{bucket:$bucket}')"
create_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg org "${org_id}" \
--arg project "${project_id}" \
'{bucket:$bucket, region:"default", orgId:$org, projectId:$project}'
)"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local output=""
while true; do
# Fast path: the bucket already exists.
if timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/HeadBucket >/dev/null 2>&1; then
return 0
fi
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${create_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/CreateBucket
)" && return 0
if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out ensuring LightningStor bucket ${bucket}: ${output}"
fi
sleep 2
done
}
wait_for_lightningstor_write_quorum() {
# Prove the cluster can take replicated writes: repeatedly PutObject a
# unique probe key until it succeeds, verify all three replica nodes
# gained a data file, then delete the probe and wait for counts to
# settle back to the baseline. Only "Not enough healthy nodes" failures
# are retried; anything else is fatal.
local ls_port="$1"
local token="$2"
local bucket="$3"
local context="$4"
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
local key="write-quorum-probe-$(date +%s)-$RANDOM"
local body="quorum-probe-${key}"
local body_b64 put_json delete_json output status
local before_node01 before_node04 before_node05
# Baseline file counts before the probe write.
read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)
body_b64="$(printf '%s' "${body}" | base64 -w0)"
put_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg key "${key}" \
--arg body "${body_b64}" \
'{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
)"
delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
while true; do
status=0
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${put_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/PutObject
)" || status=$?
if (( status == 0 )); then
wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${delete_json}" \
127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/DeleteObject
)" || die "failed to delete LightningStor write quorum probe for ${context}: ${output}"
wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe cleanup"
return 0
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for LightningStor write quorum for ${context}: ${output}"
fi
# NOTE(review): this literal must match the server's quorum error
# message exactly — confirm it tracks the lightningstor service.
if ! grep -q "Not enough healthy nodes" <<<"${output}"; then
die "unexpected LightningStor write quorum failure for ${context}: ${output}"
fi
sleep 2
done
}
# Download an object from LightningStor over gRPC and write its decoded body
# to a local file on the harness host.
#   $1 - local port forwarded to LightningStor
#   $2 - bearer token   $3 - bucket   $4 - key   $5 - destination path
# NOTE(review): GetObject responses are filtered for .bodyChunk fields and the
# concatenated base64 lines are piped into a single `base64 -d`; this
# presumably relies on the server emitting chunks whose base64 concatenates
# cleanly (no mid-stream padding) — confirm against the server's chunking.
download_lightningstor_object_to_file() {
  local ls_port="$1"
  local token="$2"
  local bucket="$3"
  local key="$4"
  local output_path="$5"
  local get_json
  get_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
  timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
    -max-msg-sz "${GRPCURL_MAX_MSG_SIZE}" \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${get_json}" \
    127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/GetObject \
    | jq -r '.bodyChunk? // empty' \
    | base64 -d >"${output_path}"
}
calc_mib_per_s() {
  # Convert a byte count moved over an elapsed-nanosecond interval into a
  # MiB/s figure with two decimals; non-positive intervals print "0.00".
  local byte_count="$1"
  local duration_ns="$2"
  awk -v b="${byte_count}" -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", (b / 1048576.0) / (ns / 1000000000.0)
      }
    }
  '
}
calc_ops_per_s() {
  # Operations per second over an elapsed-nanosecond interval, two decimals;
  # a non-positive interval yields "0.00".
  local op_count="$1"
  local duration_ns="$2"
  awk -v ops="${op_count}" -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", ops / (ns / 1000000000.0)
      }
    }
  '
}
calc_seconds_from_ns() {
  # Render a nanosecond duration as seconds with two decimals; non-positive
  # inputs print "0.00".
  local duration_ns="$1"
  awk -v ns="${duration_ns}" '
    BEGIN {
      if (ns <= 0) {
        print "0.00"
      } else {
        printf "%.2f", ns / 1000000000.0
      }
    }
  '
}
bw_bytes_to_mibps() {
  # Convert a bytes-per-second bandwidth figure into MiB/s, two decimals.
  local rate_bytes="$1"
  awk -v rate="${rate_bytes}" 'BEGIN { printf "%.2f", rate / 1048576.0 }'
}
bps_to_mibps() {
  # Convert a bits-per-second rate (as reported by iperf3) into MiB/s.
  local rate_bits="$1"
  awk -v rate="${rate_bits}" 'BEGIN { printf "%.2f", rate / 8.0 / 1048576.0 }'
}
# Find a TCP port inside the given node that currently has no listener, by
# scanning [start_port, end_port] with `ss`.  Prints the port, or exits
# non-zero if the whole range is busy.
#   $1 - node name    $2 - first port (default 18080)    $3 - last (18999)
allocate_free_listener_port() {
  local node="$1"
  local start_port="${2:-18080}"
  local end_port="${3:-18999}"
  ssh_node_script "${node}" "${start_port}" "${end_port}" <<'EOS'
set -euo pipefail
start_port="$1"
end_port="$2"
# First port with no LISTEN socket wins.
for ((port=start_port; port<=end_port; port++)); do
  if ! ss -ltnH "( sport = :${port} )" | grep -q .; then
    printf '%s\n' "${port}"
    exit 0
  fi
done
exit 1
EOS
}
# Run a file-backed fio workload inside a node and print a compact JSON
# summary {bw_bytes, iops} for the relevant direction.  The target file is
# removed afterwards; for read workloads it is pre-populated with zeros so
# fio has data to read.
#   $1 node  $2 file path  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)  $7 iodepth (1)  $8 ioengine (sync)
run_remote_fio_json() {
  local node="$1"
  local target_path="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local iodepth="${7:-1}"
  local ioengine="${8:-sync}"
  ssh_node_script "${node}" "${target_path}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${iodepth}" "${ioengine}" <<'EOS'
set -euo pipefail
target_path="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
iodepth="$6"
ioengine="$7"
mkdir -p "$(dirname "${target_path}")"
# Read workloads need an existing file of the right size.
if [[ "${rw}" == *read* ]]; then
  dd if=/dev/zero of="${target_path}" bs=1M count="${size_mb}" status=none conv=fsync
fi
fio_args=(
  --name=photon-bench
  --filename="${target_path}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine="${ioengine}"
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
# Force durability per write so write numbers reflect synced I/O.
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
rm -f "${target_path}"
# Report the direction that matters for the chosen rw mode.
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Run fio against a raw block device inside a node and print {bw_bytes, iops}
# for the relevant direction.  Unlike run_remote_fio_json, nothing is created
# or deleted: the caller supplies an existing block target, and libaio+direct
# I/O is always used.
#   $1 node  $2 device/target  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)
run_remote_block_fio_json() {
  local node="$1"
  local target="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  ssh_node_script "${node}" "${target}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" <<'EOS'
set -euo pipefail
target="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
fio_args=(
  --name=photon-bench
  --filename="${target}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Sequentially read an existing file inside a node with dd and print a JSON
# timing record {size_bytes, duration_ns}.  size_mb is trusted for the byte
# count; dd reads the whole file regardless.
#   $1 node   $2 file path   $3 expected size in MiB
run_remote_dd_read_json() {
  local node="$1"
  local target_path="$2"
  local size_mb="$3"
  ssh_node_script "${node}" "${target_path}" "${size_mb}" <<'EOS'
set -euo pipefail
target_path="$1"
size_mb="$2"
# Fail fast if the benchmark file was never created.
[[ -f "${target_path}" ]]
start_ns="$(date +%s%N)"
dd if="${target_path}" of=/dev/null bs=1M status=none
end_ns="$(date +%s%N)"
printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
coronafs_api_url() {
  # Base URL of the CoronaFS API on a local loopback port.
  # NOTE(review): the default here is 15088, while CORONAFS_API_PORT defaults
  # to 50088 — presumably 15088 is the SSH-tunnel local port; confirm.
  local api_port="${1:-15088}"
  printf 'http://127.0.0.1:%s' "${api_port}"
}
coronafs_api_request() {
  # Issue one HTTP request against the local CoronaFS API.
  #   $1 base port, $2 HTTP method, $3 request path, $4 optional JSON payload.
  local base_port="$1"
  local verb="$2"
  local api_path="$3"
  local json_body="${4:-}"
  local endpoint
  endpoint="$(coronafs_api_url "${base_port}")${api_path}"
  if [[ -z "${json_body}" ]]; then
    curl -fsS -X "${verb}" "${endpoint}"
  else
    curl -fsS -X "${verb}" \
      -H 'content-type: application/json' \
      --data "${json_body}" \
      "${endpoint}"
  fi
}
coronafs_create_volume() {
  # Create a CoronaFS volume of the given byte size via PUT.
  local base_port="$1"
  local vol_id="$2"
  local vol_bytes="$3"
  local payload
  payload="$(jq -cn --argjson size_bytes "${vol_bytes}" '{size_bytes:$size_bytes}')"
  coronafs_api_request "${base_port}" PUT "/v1/volumes/${vol_id}" "${payload}"
}
coronafs_export_volume_json() {
  # Ask CoronaFS to export a volume; prints the export descriptor JSON.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" POST "/v1/volumes/${vol_id}/export"
}
coronafs_get_volume_json() {
  # Fetch a volume's metadata JSON from the CoronaFS API.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" GET "/v1/volumes/${vol_id}"
}
coronafs_delete_volume() {
  # Delete a CoronaFS volume, discarding the API response body.
  local base_port="$1"
  local vol_id="$2"
  coronafs_api_request "${base_port}" DELETE "/v1/volumes/${vol_id}" >/dev/null
}
# Attach an NBD export to a device inside a node, benchmark it with fio, and
# print {bw_bytes, iops}.  The NBD device is disconnected on every exit path
# via the remote trap.
#   $1 node  $2 NBD URI  $3 rw mode  $4 block size  $5 size in MiB
#   $6 runtime secs (0 = size-bound)  $7 nbd device (/dev/nbd0)  $8 iodepth (1)
run_remote_nbd_fio_json() {
  local node="$1"
  local nbd_uri="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local nbd_device="${7:-/dev/nbd0}"
  local iodepth="${8:-1}"
  ssh_node_script "${node}" "${nbd_uri}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${nbd_device}" "${iodepth}" <<'EOS'
set -euo pipefail
nbd_uri="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
nbd_device="$6"
iodepth="$7"
# Best-effort: load the nbd module and clear any stale attachment.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio=io_uring \
  --connect="${nbd_device}" \
  "${nbd_uri}"
# Always release the device, even if fio fails.
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT
fio_args=(
  --name=photon-bench
  --filename="${nbd_device}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)
if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--fdatasync=1)
fi
result_json="$(fio "${fio_args[@]}")"
if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
# Attach an NBD export inside a node, sequentially read size_mb MiB from it
# with dd, and print a JSON timing record {size_bytes, duration_ns}.  The
# device is disconnected on exit via the remote trap.
#   $1 node   $2 NBD URI   $3 size in MiB   $4 nbd device (/dev/nbd0)
run_remote_nbd_dd_read_json() {
  local node="$1"
  local nbd_uri="$2"
  local size_mb="$3"
  local nbd_device="${4:-/dev/nbd0}"
  ssh_node_script "${node}" "${nbd_uri}" "${size_mb}" "${nbd_device}" <<'EOS'
set -euo pipefail
nbd_uri="$1"
size_mb="$2"
nbd_device="$3"
# Best-effort module load and stale-attachment cleanup.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio=io_uring \
  --connect="${nbd_device}" \
  "${nbd_uri}"
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT
start_ns="$(date +%s%N)"
dd if="${nbd_device}" of=/dev/null bs=1M count="${size_mb}" status=none
end_ns="$(date +%s%N)"
printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
# Measure TCP throughput between two nodes with iperf3 and print a compact
# JSON summary {bits_per_second, retransmits}.  A one-shot (-1) iperf3 server
# is started on the server node on a freshly allocated port, then the client
# node connects to it by IP.
#   $1 client node  $2 server node  $3 server IP  $4 duration secs (10)
run_remote_iperf_json() {
  local client_node="$1"
  local server_node="$2"
  local server_ip="$3"
  local duration_secs="${4:-10}"
  local server_port
  local server_pid
  server_port="$(allocate_free_listener_port "${server_node}" 19000 19100)"
  server_pid="$(ssh_node_script "${server_node}" "${server_port}" <<'EOS'
set -euo pipefail
server_port="$1"
log_path="/tmp/iperf3-server-${server_port}.log"
rm -f "${log_path}"
# -1: serve a single client then exit, so no teardown is needed.
nohup iperf3 -s -1 -p "${server_port}" >"${log_path}" 2>&1 &
printf '%s\n' "$!"
EOS
)"
  # Give the server a moment to start listening.
  sleep 1
  # NOTE(review): server_pid is forwarded to the client script but never used
  # there — apparently dead plumbing; confirm before removing.
  ssh_node_script "${client_node}" "${server_ip}" "${server_port}" "${duration_secs}" "${server_pid}" <<'EOS'
set -euo pipefail
server_ip="$1"
server_port="$2"
duration_secs="$3"
server_pid="$4"
client_json="$(iperf3 -c "${server_ip}" -p "${server_port}" -t "${duration_secs}" -J)"
# Prefer the receiver-side sum; fall back through the other summaries.
printf '%s' "${client_json}" | jq -c '{
  bits_per_second: (
    .end.sum_received.bits_per_second //
    .end.sum.bits_per_second //
    .end.sum_sent.bits_per_second //
    0
  ),
  retransmits: (.end.sum_sent.retransmits // 0)
}'
EOS
}
# Poll the PlasmaVMC control plane (via a local port-forward) until worker
# nodes node04 and node05 both report NODE_STATE_READY, or die after the
# timeout.
#   $1 - local port forwarded to plasmavmc.v1.NodeService
#   $2 - optional timeout in seconds (default HTTP_WAIT_TIMEOUT)
wait_for_plasmavmc_workers_registered() {
  local vm_port="$1"
  local timeout="${2:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for PlasmaVMC workers to register with the control plane"
  # Bound each probe with `timeout`, matching the other grpcurl probes in
  # this harness, so a hung grpcurl cannot stall the loop past the deadline.
  until timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:"${vm_port}" plasmavmc.v1.NodeService/ListNodes \
    | jq -e '
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node04")) != null
      and
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null
    ' >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for PlasmaVMC workers to register"
    fi
    sleep 2
  done
}
# Block until SSH on the node's forwarded port answers with the expected
# hostname (guards against port collisions answering as a different node).
# Dies early if the VM process exits, and dumps the VM log tail on failure.
#   $1 - node name    $2 - optional timeout in seconds (SSH_WAIT_TIMEOUT)
wait_for_ssh() {
  local node="$1"
  local timeout="${2:-${SSH_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local observed_host=""
  log "Waiting for SSH on ${node}"
  while true; do
    observed_host="$(ssh_node "${node}" "hostname" 2>/dev/null || true)"
    # The hostname must match exactly — an answer from a *different* host
    # means the port forwards are crossed, not that the node is up.
    if [[ "${observed_host}" == "${node}" ]]; then
      break
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for SSH"
    fi
    if (( SECONDS >= deadline )); then
      if [[ -n "${observed_host}" ]]; then
        warn "SSH on port $(ssh_port_for_node "${node}") answered as '${observed_host}' while waiting for ${node}"
      fi
      tail -n 100 "$(log_file "${node}")" || true
      die "timed out waiting for SSH on ${node}"
    fi
    sleep 2
  done
}
wait_for_ssh_down() {
  # Block until SSH on the node stops answering, or die after the timeout.
  #   $1 - node name    $2 - optional timeout in seconds (default 60)
  local target_node="$1"
  local limit="${2:-60}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for SSH to stop on ${target_node}"
  while ssh_node "${target_node}" true >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for SSH shutdown on ${target_node}"
    fi
    sleep 2
  done
}
# Wait for a systemd unit on a node to be stably active: it must be seen
# active (running or exited) on three *consecutive* polls, so a unit that is
# crash-looping resets the counter and does not pass.  Dumps unit status and
# journal tail on timeout; dies early if the VM process exits.
#   $1 - node    $2 - unit name (without .service)
#   $3 - optional timeout in seconds (UNIT_WAIT_TIMEOUT)
wait_for_unit() {
  local node="$1"
  local unit="$2"
  local timeout="${3:-${UNIT_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local stable_checks=0
  local required_stable_checks=3
  log "Waiting for ${unit}.service on ${node}"
  while (( stable_checks < required_stable_checks )); do
    if ssh_node "${node}" "state=\$(systemctl show --property=ActiveState --value ${unit}.service); sub=\$(systemctl show --property=SubState --value ${unit}.service); [[ \"\${state}\" == active && (\"\${sub}\" == running || \"\${sub}\" == exited) ]]" >/dev/null 2>&1; then
      stable_checks=$((stable_checks + 1))
    else
      # Any failed poll restarts the consecutive-success requirement.
      stable_checks=0
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for ${unit}.service"
    fi
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "systemctl status --no-pager ${unit}.service || true" || true
      ssh_node "${node}" "journalctl -u ${unit}.service -n 80 --no-pager || true" || true
      die "timed out waiting for ${unit}.service on ${node}"
    fi
    sleep 2
  done
}
wait_for_http() {
  # Poll an HTTP endpoint from *inside* the node until curl succeeds, or die
  # after the timeout.
  #   $1 - node    $2 - URL    $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local endpoint="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for HTTP endpoint on ${target_node}: ${endpoint}"
  while ! ssh_node "${target_node}" "curl -fsS '${endpoint}' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for ${endpoint} on ${target_node}"
    fi
    sleep 2
  done
}
# Poll a URL from inside the node until it returns one of the expected HTTP
# status codes.  expected_codes is a space-separated list (e.g. "200 403");
# the remote `case` matches the observed code against that padded list.
#   $1 - node   $2 - URL   $3 - expected codes   $4 - optional timeout
wait_for_http_status() {
  local node="$1"
  local url="$2"
  local expected_codes="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for HTTP status on ${node}: ${url} (${expected_codes})"
  until ssh_node "${node}" "code=\$(curl -sS -o /dev/null -w '%{http_code}' '${url}' || true); case \" ${expected_codes} \" in *\" \${code} \"*) exit 0 ;; *) exit 1 ;; esac" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for HTTP status ${expected_codes} from ${url} on ${node}"
    fi
    sleep 2
  done
}
# Poll a URL from inside the node until its response body equals the expected
# string exactly.  On timeout, dump whatever the URL currently returns to aid
# debugging, then die.
#   $1 - node   $2 - URL   $3 - expected body   $4 - optional timeout
wait_for_http_body() {
  local node="$1"
  local url="$2"
  local expected="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for HTTP body on ${node}: ${url}"
  until ssh_node_script "${node}" "${url}" "${expected}" <<'EOF' >/dev/null 2>&1
set -euo pipefail
url="$1"
expected="$2"
body="$(curl -fsS "${url}")"
[[ "${body}" == "${expected}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "curl -fsS '${url}' || true" || true
      die "timed out waiting for expected HTTP body from ${url} on ${node}"
    fi
    sleep 2
  done
}
wait_for_host_http() {
  # Poll a URL from the harness host itself (no SSH hop) until it responds,
  # or die after the timeout.
  #   $1 - URL    $2 - optional timeout in seconds (HTTP_WAIT_TIMEOUT)
  local endpoint="$1"
  local limit="${2:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for host HTTP endpoint: ${endpoint}"
  while ! curl -fsS "${endpoint}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for host HTTP endpoint ${endpoint}"
    fi
    sleep 2
  done
}
# Perform an authenticated HTTP request from the harness host and print the
# response body on 2xx.  On any other status (or curl failure) it logs the
# captured curl stderr, response headers, and body, then dies.  Temp files
# are removed on both paths.
#   $1 - stage label for error messages    $2 - HTTP method    $3 - URL
#   $4 - bearer token                      $5 - optional JSON request body
host_api_request() {
  local stage="$1"
  local method="$2"
  local url="$3"
  local token="$4"
  local body="${5:-}"
  local response_file headers_file stderr_file http_code
  response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-response-XXXXXX)"
  headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-headers-XXXXXX)"
  stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-stderr-XXXXXX)"
  if [[ -n "${body}" ]]; then
    # `|| true` keeps curl transport failures from aborting under set -e;
    # the status check below handles them (http_code stays empty/000).
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -H 'Content-Type: application/json' \
        -X "${method}" \
        -d "${body}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  else
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -X "${method}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  fi
  if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
    cat "${response_file}"
    rm -f "${response_file}" "${headers_file}" "${stderr_file}"
    return 0
  fi
  log "Host API request failed during ${stage}: ${method} ${url} (status=${http_code:-curl-error})"
  if [[ -s "${stderr_file}" ]]; then
    sed 's/^/[curl] /' "${stderr_file}" >&2
  fi
  if [[ -s "${headers_file}" ]]; then
    sed 's/^/[headers] /' "${headers_file}" >&2
  fi
  if [[ -s "${response_file}" ]]; then
    sed 's/^/[body] /' "${response_file}" >&2
  fi
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  die "host API request failed during ${stage}"
}
# Perform an authenticated HTTP request against the gateway on node06
# (127.0.0.1:8080 from inside that node) and print the response body on 2xx.
# The request body is base64-encoded for transport through ssh_node_script's
# positional arguments and decoded remotely.  On failure the remote script
# dumps status/headers/body to stderr and this function dies.
#   $1 - stage label   $2 - method   $3 - request path
#   $4 - bearer token  $5 - optional request body
gateway_api_request() {
  local stage="$1"
  local method="$2"
  local request_path="$3"
  local token="$4"
  local body="${5:-}"
  local body_b64=""
  if [[ -n "${body}" ]]; then
    body_b64="$(printf '%s' "${body}" | base64 | tr -d '\n')"
  fi
  if ssh_node_script node06 "${method}" "${request_path}" "${token}" "${body_b64}" <<'EOF'
set -euo pipefail
method="$1"
request_path="$2"
token="$3"
body_b64="${4:-}"
url="http://127.0.0.1:8080${request_path}"
response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-response-XXXXXX)"
headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-headers-XXXXXX)"
stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-stderr-XXXXXX)"
body_file=""
# Remove every temp file on any exit path.
cleanup() {
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  if [[ -n "${body_file}" ]]; then
    rm -f "${body_file}"
  fi
}
trap cleanup EXIT
if [[ -n "${body_b64}" ]]; then
  body_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-body-XXXXXX)"
  printf '%s' "${body_b64}" | base64 -d >"${body_file}"
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -X "${method}" \
      --data-binary @"${body_file}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
else
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -X "${method}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
fi
if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
  cat "${response_file}"
  exit 0
fi
echo "status=${http_code:-curl-error}" >&2
if [[ -s "${stderr_file}" ]]; then
  sed 's/^/[curl] /' "${stderr_file}" >&2
fi
if [[ -s "${headers_file}" ]]; then
  sed 's/^/[headers] /' "${headers_file}" >&2
fi
if [[ -s "${response_file}" ]]; then
  sed 's/^/[body] /' "${response_file}" >&2
fi
exit 1
EOF
  then
    return 0
  fi
  log "Gateway API request failed during ${stage}: ${method} ${request_path}"
  die "gateway API request failed during ${stage}"
}
# Single-shot gRPC health probe executed inside the node: succeeds only when
# grpc.health.v1.Health/Check reports status SERVING for the named service.
#   $1 - node    $2 - gRPC port    $3 - service name ("" = overall server)
grpc_health_check() {
  local node="$1"
  local port="$2"
  local service="$3"
  ssh_node "${node}" \
    "grpcurl -plaintext -d '{\"service\":\"${service}\"}' 127.0.0.1:${port} grpc.health.v1.Health/Check | jq -e '.status == \"SERVING\"' >/dev/null"
}
wait_for_grpc_health() {
  # Poll the standard gRPC health endpoint on node:port until the named
  # service reports SERVING, or die after the timeout.
  #   $1 - node   $2 - port   $3 - service   $4 - optional timeout
  local target_node="$1"
  local grpc_port="$2"
  local svc="$3"
  local limit="${4:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for gRPC health on ${target_node}:${grpc_port} (${svc})"
  while ! grpc_health_check "${target_node}" "${grpc_port}" "${svc}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for gRPC health ${svc} on ${target_node}:${grpc_port}"
    fi
    sleep 2
  done
}
# Succeed iff something is LISTENing on the given TCP port inside the node.
check_tcp_port() {
  local node="$1"
  local port="$2"
  ssh_node "${node}" "ss -H -ltn '( sport = :${port} )' | grep -q ."
}
# Succeed iff something is bound to the given UDP port inside the node.
check_udp_port() {
  local node="$1"
  local port="$2"
  ssh_node "${node}" "ss -H -lun '( sport = :${port} )' | grep -q ."
}
wait_for_tcp_port() {
  # Block until a listener appears on the TCP port inside the node, or die
  # after the timeout.
  #   $1 - node   $2 - port   $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local tcp_port="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for TCP port ${tcp_port} on ${target_node}"
  while ! check_tcp_port "${target_node}" "${tcp_port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for TCP port ${tcp_port} on ${target_node}"
    fi
    sleep 2
  done
}
wait_for_udp_port() {
  # Block until a socket is bound to the UDP port inside the node, or die
  # after the timeout.
  #   $1 - node   $2 - port   $3 - optional timeout (HTTP_WAIT_TIMEOUT)
  local target_node="$1"
  local udp_port="$2"
  local limit="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + limit))
  log "Waiting for UDP port ${udp_port} on ${target_node}"
  while ! check_udp_port "${target_node}" "${udp_port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for UDP port ${udp_port} on ${target_node}"
    fi
    sleep 2
  done
}
# Wait until FlareDB region 1 on the node has an elected leader and exactly
# peers {1,2,3}, as reported by its local metadata API on port 8082.
#   $1 - node    $2 - optional timeout in seconds (FLAREDB_WAIT_TIMEOUT)
wait_for_flaredb_region() {
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for FlareDB region metadata on ${node}"
  until ssh_node "${node}" "curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -e '(.data.leader_id > 0) and ((.data.peers | sort) == [1,2,3])' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlareDB region metadata on ${node}"
    fi
    sleep 2
  done
}
# Wait until the FlareDB leader recorded in the ChainFire KV store (port
# 8081) agrees with the live leader reported by FlareDB itself (port 8082),
# i.e. route metadata has converged and a leader exists.
#   $1 - node    $2 - optional timeout in seconds (FLAREDB_WAIT_TIMEOUT)
wait_for_flaredb_route_metadata() {
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  log "Waiting for FlareDB route metadata on ${node}"
  until ssh_node "${node}" "bash -se" <<'EOF' >/dev/null 2>&1
set -euo pipefail
actual="$(curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -r '.data.leader_id')"
recorded="$(curl -fsS http://127.0.0.1:8081/api/v1/kv/flaredb/regions/1 | jq -r '.data.value | fromjson | .leader_id')"
[[ "${actual}" != "0" ]]
[[ "${actual}" == "${recorded}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlareDB route metadata on ${node}"
    fi
    sleep 2
  done
}
# Copy the FlareDB kvrpc proto to a node so grpcurl can be used there.
#   $1 - node    $2 - optional destination dir (default below)
# NOTE(review): ${proto_root} is unquoted inside the remote install command;
# fine for the default path, would break for paths with spaces.
ensure_flaredb_proto_on_node() {
  local node="$1"
  local proto_root="${2:-/var/lib/photon-test-protos/flaredb}"
  ssh_node "${node}" "install -d -m 0755 ${proto_root}"
  scp_to_node "${node}" "${FLAREDB_PROTO}" "${proto_root}/kvrpc.proto"
}
vm_runtime_dir_path() {
  # Runtime state directory for a PlasmaVMC-managed VM id.
  local vm_id="$1"
  printf '%s/%s\n' /run/libvirt/plasmavmc "${vm_id}"
}
vm_console_path() {
  # Console log file inside a VM's runtime directory.
  local vm_id="$1"
  printf '%s/console.log\n' "$(vm_runtime_dir_path "${vm_id}")"
}
# Wait until a fixed string appears in a nested VM's console log on the host
# node.  Both the path and the pattern are %q-quoted so arbitrary characters
# survive the remote shell; on timeout the console tail is dumped.
#   $1 - node   $2 - VM id   $3 - literal pattern   $4 - optional timeout
wait_for_vm_console_pattern() {
  local node="$1"
  local vm_id="$2"
  local pattern="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local console_path console_q pattern_q
  console_path="$(vm_console_path "${vm_id}")"
  console_q="$(printf '%q' "${console_path}")"
  pattern_q="$(printf '%q' "${pattern}")"
  log "Waiting for VM console output on ${node}: ${pattern}"
  until ssh_node "${node}" "bash -lc 'test -f ${console_q} && grep -F -- ${pattern_q} ${console_q} >/dev/null'" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "bash -lc 'test -f ${console_q} && tail -n 80 ${console_q} || true'" || true
      die "timed out waiting for VM console pattern ${pattern} on ${node}"
    fi
    sleep 2
  done
}
# Print the most recent console-log line containing the literal pattern, for
# the given nested VM on the given node.  Path and pattern are %q-quoted for
# safe transport through the remote shell.
#   $1 - node    $2 - VM id    $3 - literal pattern
read_vm_console_line_matching() {
  local node="$1"
  local vm_id="$2"
  local pattern="$3"
  local console_path console_q pattern_q
  console_path="$(vm_console_path "${vm_id}")"
  console_q="$(printf '%q' "${console_path}")"
  pattern_q="$(printf '%q' "${pattern}")"
  ssh_node "${node}" "bash -lc 'grep -F -- ${pattern_q} ${console_q} | tail -n1'"
}
# Wait until some qemu-system process on the node has the volume path in its
# command line (i.e. the volume is attached).  Dumps the qemu process list on
# timeout.
#   $1 - node    $2 - volume path    $3 - optional timeout
wait_for_qemu_volume_present() {
  local node="$1"
  local volume_path="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  # "[q]emu" keeps pgrep from matching the probe command itself.
  until ssh_node "${node}" "pgrep -fa '[q]emu-system' | grep -F '${volume_path}' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" || true
      die "timed out waiting for qemu to attach ${volume_path} on ${node}"
    fi
    sleep 2
  done
}
# Wait until no qemu-system process on the node references the volume path
# any more (i.e. the volume is detached).  Dumps the qemu process list on
# timeout.
#   $1 - node    $2 - volume path    $3 - optional timeout
wait_for_qemu_volume_absent() {
  local node="$1"
  local volume_path="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  until ssh_node "${node}" "bash -lc '! pgrep -fa \"[q]emu-system\" | grep -F \"${volume_path}\" >/dev/null'" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" || true
      die "timed out waiting for qemu to release ${volume_path} on ${node}"
    fi
    sleep 2
  done
}
# Single attempt to fetch a VM's state from PlasmaVMC via the local forward
# on port 15082; prints the raw JSON, propagates grpcurl's exit status.
#   $1 - bearer token    $2 - GetVm request JSON
try_get_vm_json() {
  local token="$1"
  local get_vm_json="$2"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/GetVm
}
wait_requested() {
  # Implement the `wait` command: resolve the requested node names (all
  # nodes when none given), verify them, run preflight checks, then block
  # until every node answers SSH.
  local requested
  mapfile -t requested < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${requested[@]}"
  preflight
  local vm
  for vm in "${requested[@]}"; do
    wait_for_ssh "${vm}"
  done
}
# Implement the `start` command: build (unless PHOTON_CLUSTER_SKIP_BUILD=1,
# in which case existing build links are required), then boot VMs and wait
# for SSH.  With no explicit node arguments the full cluster is started in
# the phases defined by NODE_PHASES — all VMs of a phase are launched, then
# all are awaited, before the next phase begins.  With explicit arguments
# the given nodes are started in one batch.
start_requested() {
  local nodes
  mapfile -t nodes < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${nodes[@]}"
  preflight
  if [[ "${CLUSTER_SKIP_BUILD}" == "1" ]]; then
    local node
    for node in "${nodes[@]}"; do
      [[ -L "$(build_link "${node}")" ]] || die "missing VM build link for ${node} while PHOTON_CLUSTER_SKIP_BUILD=1"
    done
    log "Skipping VM build because PHOTON_CLUSTER_SKIP_BUILD=1"
  else
    build_vms "${nodes[@]}"
  fi
  if [[ "$#" -eq 0 ]]; then
    # Full-cluster start: honor phase ordering (each phase is a
    # space-separated node list).
    local phase node
    for phase in "${NODE_PHASES[@]}"; do
      for node in ${phase}; do
        start_vm "${node}"
      done
      for node in ${phase}; do
        wait_for_ssh "${node}"
      done
    done
  else
    local node
    for node in "${nodes[@]}"; do
      start_vm "${node}"
    done
    for node in "${nodes[@]}"; do
      wait_for_ssh "${node}"
    done
  fi
}
# Wait for every systemd unit of the full cluster to be stably active, in
# dependency order: ChainFire+FlareDB on the control-plane trio, then region
# metadata, then IAM, then node01's service stack, then the worker and
# gateway units listed per node in NODE_UNITS.
validate_units() {
  local node unit
  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done
  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done
  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done
  for unit in prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost; do
    wait_for_unit node01 "${unit}"
  done
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
  done
  for unit in ${NODE_UNITS[node06]}; do
    wait_for_unit node06 "${unit}"
  done
}
# Storage-profile variant of validate_units: same control-plane ordering, but
# only the storage services on node01 and the worker units on node04/node05
# (no gateway/node06 stack).
validate_storage_units() {
  local node unit
  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done
  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done
  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done
  for unit in plasmavmc lightningstor coronafs; do
    wait_for_unit node01 "${unit}"
  done
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
  done
}
validate_storage_control_plane() {
  # Health-check the storage control plane: the full endpoint set on node01
  # (HTTP health pages, the CoronaFS API, and two raw TCP listeners), then
  # the ChainFire/FlareDB/IAM health pages on node02 and node03.
  local port
  for port in 8081 8082 8083 8084; do
    wait_for_http node01 "http://127.0.0.1:${port}/health"
  done
  wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 9000
  local node
  for node in node02 node03; do
    for port in 8081 8082 8083; do
      wait_for_http "${node}" "http://127.0.0.1:${port}/health"
    done
  done
}
# End-to-end validation of the control plane:
#   1. HTTP health pages, metrics endpoints, and raw TCP/UDP listeners across
#      node01..node03.
#   2. A ChainFire write replicated to all three nodes (any node may be the
#      leader for the PUT).
#   3. A FlareDB write observed via /scan on all three nodes.
#   4. A FlareDB strong-consistency CompareAndSwap over gRPC, read back from
#      the node that accepted the write.
validate_control_plane() {
  wait_for_http node01 http://127.0.0.1:8081/health
  wait_for_http node01 http://127.0.0.1:8082/health
  wait_for_http node01 http://127.0.0.1:8083/health
  wait_for_http node01 http://127.0.0.1:8087/health
  wait_for_http node01 http://127.0.0.1:8084/health
  wait_for_http node01 http://127.0.0.1:8085/health
  wait_for_http node02 http://127.0.0.1:8081/health
  wait_for_http node02 http://127.0.0.1:8082/health
  wait_for_http node02 http://127.0.0.1:8083/health
  wait_for_http node03 http://127.0.0.1:8081/health
  wait_for_http node03 http://127.0.0.1:8082/health
  wait_for_http node03 http://127.0.0.1:8083/health
  wait_for_tcp_port node01 50084
  wait_for_http node01 http://127.0.0.1:9097/metrics
  wait_for_udp_port node01 5353
  wait_for_tcp_port node01 50085
  wait_for_http node01 http://127.0.0.1:9098/metrics
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 50090
  # The S3-style endpoint may legitimately answer 403 before credentials.
  wait_for_http_status node01 http://127.0.0.1:9000 "200 403"
  wait_for_http node01 http://127.0.0.1:9099/metrics
  wait_for_http node01 http://127.0.0.1:9198/metrics
  log "Validating ChainFire replication across control-plane nodes"
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
leader=""
# Only the leader accepts the PUT (HTTP 200); try each node in turn.
for ip in "${nodes[@]}"; do
  code="$(curl -sS -o /tmp/chainfire-put.out -w '%{http_code}' \
    -X PUT "http://${ip}:8081/api/v1/kv/${key}" \
    -H 'Content-Type: application/json' \
    -d "{\"value\":\"${value}\"}" || true)"
  if [[ "${code}" == "200" ]]; then
    leader="${ip}"
    break
  fi
done
[[ -n "${leader}" ]]
curl -fsS http://10.100.0.11:8081/api/v1/cluster/status | jq -e '.data.term >= 1' >/dev/null
# Every node must converge on the written value within 30s.
for ip in "${nodes[@]}"; do
  deadline=$((SECONDS + 30))
  while true; do
    actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
    if [[ "${actual}" == "${value}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      echo "chainfire replication did not converge on ${ip}" >&2
      exit 1
    fi
    sleep 1
  done
done
EOS
  log "Validating FlareDB replication across control-plane nodes"
  wait_for_flaredb_region node01
  wait_for_flaredb_region node02
  wait_for_flaredb_region node03
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-flaredb-$(date +%s)"
value="ok-$RANDOM"
namespace="validation"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
writer=""
# Any node that accepts the PUT (HTTP 200) acts as the writer.
for ip in "${nodes[@]}"; do
  code="$(curl -sS -o /tmp/flaredb-put.out -w '%{http_code}' \
    -X PUT "http://${ip}:8082/api/v1/kv/${key}" \
    -H 'Content-Type: application/json' \
    -d "{\"value\":\"${value}\",\"namespace\":\"${namespace}\"}" || true)"
  if [[ "${code}" == "200" ]]; then
    writer="${ip}"
    break
  fi
done
[[ -n "${writer}" ]]
# Replication is verified via a range scan on each node (120s budget).
for ip in "${nodes[@]}"; do
  deadline=$((SECONDS + 120))
  while true; do
    actual="$(curl -fsS --get "http://${ip}:8082/api/v1/scan" \
      --data-urlencode "start=${key}" \
      --data-urlencode "end=${key}~" \
      --data-urlencode "namespace=${namespace}" 2>/dev/null \
      | jq -r '.data.items[0].value // empty' 2>/dev/null || true)"
    if [[ "${actual}" == "${value}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      echo "flaredb replication did not converge on ${ip}" >&2
      exit 1
    fi
    sleep 1
  done
done
EOS
  log "Validating FlareDB strong-consistency CAS on the control plane"
  local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
  ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
  ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="validation-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
# kvrpc carries keys/values as bytes, hence the base64 encoding.
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"default"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"default"}')"
writer=""
# expectedVersion:0 means create-if-absent; the node that reports success
# with a version >= 1 is the writer.
for ip in "${nodes[@]}"; do
  if grpcurl -plaintext \
    -import-path "${proto_root}" \
    -proto "${proto_root}/kvrpc.proto" \
    -d "${request}" \
    "${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-cas.out 2>/dev/null; then
    if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-cas.out >/dev/null; then
      writer="${ip}"
      break
    fi
  fi
done
[[ -n "${writer}" ]]
# Read-your-write on the accepting node must converge within 90s.
deadline=$((SECONDS + 90))
while true; do
  if grpcurl -plaintext \
    -import-path "${proto_root}" \
    -proto "${proto_root}/kvrpc.proto" \
    -d "${get_request}" \
    "${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-cas-get.out 2>/dev/null; then
    if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-cas-get.out >/dev/null; then
      break
    fi
  fi
  if (( SECONDS >= deadline )); then
    echo "flaredb strong CAS read did not converge on leader ${writer}" >&2
    exit 1
  fi
  sleep 1
done
EOS
}
validate_iam_flow() {
# Exercise the IAM control plane end to end: mint a project-admin token,
# confirm ValidateToken echoes the requested claims, then check that the
# authorizer allows an in-project action and denies a cross-project one.
log "Validating IAM token issuance, validation, and scoped authorization"
local iam_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
# Ensure the tunnel is torn down on any early return (set -e failure path).
trap 'stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="iam-smoke-org"
local project_id="iam-smoke-project"
local principal_id="iam-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Shared grpcurl invocation prefix for every IAM call below.
local -a iam_call=(grpcurl -plaintext -import-path "${IAM_PROTO_DIR}" -proto "${IAM_PROTO}")
# 1) The freshly issued token must validate and carry the expected claims.
"${iam_call[@]}" \
-d "$(jq -cn --arg token "${token}" '{token:$token}')" \
127.0.0.1:15080 iam.v1.IamToken/ValidateToken \
| jq -e --arg org "${org_id}" --arg project "${project_id}" --arg principal "${principal_id}" \
'.valid == true and .claims.orgId == $org and .claims.projectId == $project and .claims.principalId == $principal' >/dev/null
# 2) A bucket-create inside the principal's own project must be allowed.
"${iam_call[@]}" \
-d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"allow-check", orgId:$org, projectId:$project}}')" \
127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
| jq -e '.allowed == true' >/dev/null
# 3) The same action against a different project must be denied.
"${iam_call[@]}" \
-d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
'{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"deny-check", orgId:$org, projectId:($project + "-other")}}')" \
127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
| jq -e '(.allowed // false) == false' >/dev/null
# Success path: clear the trap and close the tunnel explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_prismnet_flow() {
# Exercise the full PrismNet resource lifecycle over gRPC through SSH
# tunnels to node01: create/get/list a VPC, create/get a subnet inside
# it, create/get/list/update/delete a port, then tear everything down
# in reverse order. All calls carry a freshly issued project-admin token.
log "Validating PrismNet VPC, subnet, and port lifecycle"
local iam_tunnel="" prism_tunnel=""
# Local 15080 -> IAM (50080), local 15081 -> PrismNet (50081) on node01.
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
# Under set -e any failed check returns early; this trap closes both
# tunnels on that path. The happy path clears it at the bottom.
trap 'stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="prismnet-smoke-org"
local project_id="prismnet-smoke-project"
# Timestamped principal so repeated smoke runs never collide.
local principal_id="prismnet-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
local vpc_resp subnet_resp port_resp
local vpc_id subnet_id port_id
# --- VPC: create, then verify via Get and List ---
vpc_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg name "prismnet-smoke-vpc" \
'{orgId:$org, projectId:$project, name:$name, description:"smoke vpc", cidrBlock:"10.44.0.0/16"}')" \
127.0.0.1:15081 prismnet.VpcService/CreateVpc)"
vpc_id="$(printf '%s' "${vpc_resp}" | jq -r '.vpc.id')"
[[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "PrismNet CreateVpc did not return a VPC ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
127.0.0.1:15081 prismnet.VpcService/GetVpc \
| jq -e --arg id "${vpc_id}" '.vpc.id == $id' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, pageSize:100, pageToken:""}')" \
127.0.0.1:15081 prismnet.VpcService/ListVpcs \
| jq -e --arg id "${vpc_id}" '.vpcs | any(.id == $id)' >/dev/null
# --- Subnet: carve 10.44.10.0/24 out of the VPC and verify via Get ---
subnet_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"prismnet-smoke-subnet", description:"smoke subnet", cidrBlock:"10.44.10.0/24", gatewayIp:"10.44.10.1", dhcpEnabled:true}')" \
127.0.0.1:15081 prismnet.SubnetService/CreateSubnet)"
subnet_id="$(printf '%s' "${subnet_resp}" | jq -r '.subnet.id')"
[[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "PrismNet CreateSubnet did not return a subnet ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
127.0.0.1:15081 prismnet.SubnetService/GetSubnet \
| jq -e --arg id "${subnet_id}" '.subnet.id == $id' >/dev/null
# --- Port: empty ipAddress requests an auto-assigned address; the Get
# below asserts one was actually allocated ---
port_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"prismnet-smoke-port", description:"smoke port", ipAddress:""}')" \
127.0.0.1:15081 prismnet.PortService/CreatePort)"
port_id="$(printf '%s' "${port_resp}" | jq -r '.port.id')"
[[ -n "${port_id}" && "${port_id}" != "null" ]] || die "PrismNet CreatePort did not return a port ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
127.0.0.1:15081 prismnet.PortService/GetPort \
| jq -e --arg id "${port_id}" '.port.id == $id and (.port.ipAddress | length) > 0' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, deviceId:"", pageSize:100, pageToken:""}')" \
127.0.0.1:15081 prismnet.PortService/ListPorts \
| jq -e --arg id "${port_id}" '.ports | any(.id == $id)' >/dev/null
# Update must rename the port and flip adminStateUp off.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id, name:"prismnet-smoke-port-updated", description:"updated", securityGroupIds:[], adminStateUp:false}')" \
127.0.0.1:15081 prismnet.PortService/UpdatePort \
| jq -e '.port.name == "prismnet-smoke-port-updated" and (.port.adminStateUp // false) == false' >/dev/null
# --- Cleanup: delete port, subnet, then VPC (reverse creation order) ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${prism_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_flashdns_flow() {
# End-to-end FlashDNS check through SSH tunnels to node01: create a zone
# and an A record via the gRPC management API, poll the authoritative
# resolver until the record is served, then delete the record and
# force-delete the zone. Uses IAM (50080) and FlashDNS (50084).
log "Validating FlashDNS zone, record, and authoritative query flow"
local iam_tunnel="" dns_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
# Close both tunnels on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="flashdns-smoke-org"
local project_id="flashdns-smoke-project"
# Timestamped IDs keep repeated smoke runs from colliding.
local principal_id="flashdns-smoke-$(date +%s)"
local token zone_name zone_resp zone_id record_resp record_id fqdn
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
zone_name="smoke-$(date +%s).cluster.test"
# --- Zone: create, then verify via Get ---
zone_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.smoke.test", adminEmail:"admin@smoke.test"}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone)"
zone_id="$(printf '%s' "${zone_resp}" | jq -r '.zone.id')"
[[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "FlashDNS CreateZone did not return a zone ID"
# GetZone may canonicalize the name with a trailing dot; accept both.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${zone_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/GetZone \
| jq -e --arg id "${zone_id}" --arg name "${zone_name}" \
'.zone.id == $id and (.zone.name == $name or .zone.name == ($name + "."))' >/dev/null
# --- Record: A record "api" -> 10.100.0.11, verified via Get and List ---
record_resp="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"api", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord)"
record_id="$(printf '%s' "${record_resp}" | jq -r '.record.id')"
[[ -n "${record_id}" && "${record_id}" != "null" ]] || die "FlashDNS CreateRecord did not return a record ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/GetRecord \
| jq -e --arg id "${record_id}" '.record.id == $id' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, nameFilter:"", typeFilter:"", pageSize:100, pageToken:""}')" \
127.0.0.1:15084 flashdns.v1.RecordService/ListRecords \
| jq -e --arg id "${record_id}" '.records | any(.id == $id)' >/dev/null
fqdn="api.${zone_name}"
# Poll the authoritative resolver on node01 (port 5353) until the new
# record is answered, bounded by HTTP_WAIT_TIMEOUT seconds.
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for authoritative FlashDNS answer for ${fqdn}"
fi
sleep 2
done
# --- Cleanup: delete the record, then force-delete the zone ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${dns_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_fiberlb_flow() {
# FiberLB end-to-end check via SSH tunnels to node01 (IAM on 50080,
# FiberLB management API on 50085): build LB -> pool -> backend ->
# listener, verify the listener serves /health, measure process CPU
# under a synthetic request burst, verify that disabling the backend
# stops traffic and re-enabling restores it, then delete all resources
# in reverse order of creation.
log "Validating FiberLB management API, runtime listeners, and backend failover behavior"
local iam_tunnel="" lb_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
# Close both tunnels if any check below fails early under set -e.
trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="fiberlb-smoke-org"
local project_id="fiberlb-smoke-project"
local principal_id="fiberlb-smoke-$(date +%s)"
local token lb_id pool_id backend_id listener_id listener_port
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Random port in 18080..18179 so repeated runs do not collide.
listener_port=$((18080 + (RANDOM % 100)))
lb_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-lb" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, description:"smoke lb"}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer \
| jq -r '.loadbalancer.id')"
[[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "FiberLB CreateLoadBalancer did not return an ID"
pool_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-pool" --arg lb "${lb_id}" '{name:$name, loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool \
| jq -r '.pool.id')"
[[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "FiberLB CreatePool did not return an ID"
# Backend targets 10.100.0.11:8081 — NOTE(review): assumes something is
# already answering /health there; the probes below depend on it.
backend_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-backend" --arg pool "${pool_id}" '{name:$name, poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend \
| jq -r '.backend.id')"
[[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "FiberLB CreateBackend did not return an ID"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/GetBackend \
| jq -e --arg id "${backend_id}" '.backend.id == $id' >/dev/null
listener_id="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg name "fiberlb-smoke-listener" --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:$name, loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener \
| jq -r '.listener.id')"
[[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "FiberLB CreateListener did not return an ID"
# The runtime must now bind the listener port and proxy /health.
wait_for_tcp_port node01 "${listener_port}"
wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
local fiberlb_pid fiberlb_peak_cpu load_pid settle_ok
# NOTE(review): assumes a single fiberlb process — pidof would return a
# space-separated list (breaking the top -p usage below) if it forked.
fiberlb_pid="$(ssh_node node01 'pidof fiberlb')"
[[ -n "${fiberlb_pid}" ]] || die "FiberLB process is not running on node01"
# Synthetic load: 256 health requests, 32-way parallel, run in the
# background so CPU can be sampled concurrently.
ssh_node node01 \
"bash -lc 'seq 1 256 | xargs -P 32 -I{} curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null'" &
load_pid=$!
sleep 1
# Sample %CPU (top column 9) five times at 1s intervals, keep the peak.
fiberlb_peak_cpu="$(ssh_node node01 "top -b -d 1 -n 5 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0; if (cpu > max) max = cpu } END { print max + 0 }'")"
wait "${load_pid}"
log "FiberLB peak CPU during synthetic load: ${fiberlb_peak_cpu}%"
# After the burst, CPU must drop below 20% within ~10 attempts (~20s+).
settle_ok=0
for _ in {1..10}; do
if ssh_node node01 \
"top -b -d 1 -n 2 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0 } END { exit !(cpu < 20.0) }'"; then
settle_ok=1
break
fi
sleep 2
done
[[ "${settle_ok}" -eq 1 ]] || die "FiberLB CPU did not settle after synthetic load"
# Disable the only backend: the listener must stop serving /health.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_DISABLED"}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
| jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_DISABLED"' >/dev/null
# Poll until the health check starts FAILING (traffic cut off).
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ! ssh_node node01 "curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for disabled FiberLB backend to stop serving traffic"
fi
sleep 2
done
# Re-enable the backend: traffic must flow again.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_ENABLED"}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
| jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_ENABLED"' >/dev/null
wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
# Cleanup: listener, backend, pool, LB (reverse creation order).
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${lb_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_k8shost_flow() {
# Composed-configuration ("matrix") validation for K8sHost: register a
# synthetic node and pod, create a LoadBalancer service, then verify the
# K8sHost controllers integrate with PrismNet (ClusterIP allocation),
# FiberLB (load balancer provisioning), and FlashDNS (service DNS
# record). Five SSH tunnels to node01 expose the services locally.
log "Validating K8sHost node, pod, service, and controller integrations"
local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel="" k8s_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
k8s_tunnel="$(start_ssh_tunnel node01 15087 50087)"
# Close every tunnel if any check below fails early under set -e.
trap 'stop_ssh_tunnel node01 "${k8s_tunnel}"; stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
# Uses the default org/project rather than per-run smoke IDs.
local org_id="default-org"
local project_id="default-project"
local principal_id="k8shost-smoke-$(date +%s)"
local token node_name pod_name service_name service_port
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Timestamped names keep repeated runs from colliding; random port in
# 18180..18279 for the LoadBalancer service.
node_name="smoke-node-$(date +%s)"
pod_name="smoke-pod-$(date +%s)"
service_name="smoke-svc-$(date +%s)"
service_port=$((18180 + (RANDOM % 100)))
# --- Node lifecycle: register, heartbeat, list ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${node_name}" --arg org "${org_id}" --arg project "${project_id}" '{node:{metadata:{name:$name, orgId:$org, projectId:$project}, spec:{podCidr:"10.244.0.0/24"}, status:{addresses:[{type:"InternalIP", address:"10.100.0.21"}], conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4", memory:"8192Mi"}, allocatable:{cpu:"4", memory:"8192Mi"}}}}')" \
127.0.0.1:15087 k8shost.NodeService/RegisterNode >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${node_name}" '{nodeName:$name, status:{conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4"}, allocatable:{cpu:"4"}}}')" \
127.0.0.1:15087 k8shost.NodeService/Heartbeat \
| jq -e '.success == true' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d '{}' \
127.0.0.1:15087 k8shost.NodeService/ListNodes \
| jq -e --arg name "${node_name}" '.items | any(.metadata.name == $name)' >/dev/null
# --- Pod: labeled app=k8shost-smoke so the service selector below
# matches it; reports pod/host IP 10.100.0.11 ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${pod_name}" --arg org "${org_id}" --arg project "${project_id}" '{pod:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project, labels:{app:"k8shost-smoke"}}, spec:{containers:[{name:"backend", image:"smoke", ports:[{containerPort:8081, protocol:"TCP"}]}]}, status:{phase:"Running", podIp:"10.100.0.11", hostIp:"10.100.0.11"}}}')" \
127.0.0.1:15087 k8shost.PodService/CreatePod >/dev/null
log "Matrix case: K8sHost + PrismNet"
# Ensure a ClusterIP service-IP pool exists in PrismNet, creating a
# default one (10.96.42.0/24) only if none is present.
local pools_json
pools_json="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
127.0.0.1:15081 prismnet.IpamService/ListServiceIPPools)"
if ! printf '%s' "${pools_json}" | jq -e '.pools | length > 0' >/dev/null; then
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${PRISMNET_PROTO_DIR}" \
-proto "${PRISMNET_PROTO}" \
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, name:"default-cluster-ip-pool", description:"smoke-created default ClusterIP pool", cidrBlock:"10.96.42.0/24", poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
127.0.0.1:15081 prismnet.IpamService/CreateServiceIPPool >/dev/null
fi
log "Matrix case: K8sHost + PrismNet + FiberLB + FlashDNS"
# Create a type=LoadBalancer service selecting the smoke pod.
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg name "${service_name}" --arg org "${org_id}" --arg project "${project_id}" --argjson port "${service_port}" '{service:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project}, spec:{ports:[{name:"http", port:$port, targetPort:8081, protocol:"TCP"}], selector:{app:"k8shost-smoke"}, type:"LoadBalancer"}}}')" \
127.0.0.1:15087 k8shost.ServiceService/CreateService >/dev/null
local service_json cluster_ip lb_id record_id zone_id
# Poll GetService until the controllers have attached an ingress IP plus
# FiberLB lb-id and FlashDNS record-id annotations (or time out).
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
service_json="$(grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.ServiceService/GetService 2>/dev/null || true)"
if [[ -n "${service_json}" ]] && printf '%s' "${service_json}" | jq -e '
.service.status.loadBalancer.ingress[0].ip != null and
.service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"] != null and
.service.metadata.annotations["flashdns.plasmacloud.io/record-id"] != null' >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for K8sHost controllers to provision service ${service_name}"
fi
sleep 2
done
# Pull the controller-assigned identifiers out of the service object.
# zone_id is captured but not used by the cleanup below.
cluster_ip="$(printf '%s' "${service_json}" | jq -r '.service.spec.clusterIp')"
lb_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"]')"
record_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/record-id"]')"
zone_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/zone-id"]')"
[[ -n "${cluster_ip}" && "${cluster_ip}" != "null" ]] || die "K8sHost service did not get a cluster IP"
[[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "K8sHost service did not get a FiberLB load balancer"
[[ -n "${record_id}" && "${record_id}" != "null" ]] || die "K8sHost service did not get a FlashDNS record"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn '{namespace:"default"}')" \
127.0.0.1:15087 k8shost.ServiceService/ListServices \
| jq -e --arg name "${service_name}" '.items | any(.metadata.name == $name)' >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn '{namespace:"default", labelSelector:{app:"k8shost-smoke"}}')" \
127.0.0.1:15087 k8shost.PodService/ListPods \
| jq -e --arg name "${pod_name}" '.items | any(.metadata.name == $name)' >/dev/null
log "Matrix case: K8sHost + FlashDNS"
# The service FQDN must resolve to its ClusterIP via the authoritative
# resolver on node01:5353, within HTTP_WAIT_TIMEOUT seconds.
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
while true; do
if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_name}.default.svc.cluster.local A | grep -Fx '${cluster_ip}'" >/dev/null 2>&1; then
break
fi
if (( SECONDS >= deadline )); then
die "timed out waiting for K8sHost FlashDNS record for ${service_name}"
fi
sleep 2
done
log "Matrix case: K8sHost + FiberLB"
# The provisioned FiberLB listener must serve the service port.
wait_for_http node01 "http://127.0.0.1:${service_port}/health"
# --- Cleanup: service, pod, then the controller-created DNS record and
# load balancer (via their own services' APIs) ---
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.ServiceService/DeleteService >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${K8SHOST_PROTO_DIR}" \
-proto "${K8SHOST_PROTO}" \
-d "$(jq -cn --arg ns "default" --arg name "${pod_name}" '{namespace:$ns, name:$name}')" \
127.0.0.1:15087 k8shost.PodService/DeletePod >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FLASHDNS_PROTO_DIR}" \
-proto "${FLASHDNS_PROTO}" \
-d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${FIBERLB_PROTO_DIR}" \
-proto "${FIBERLB_PROTO}" \
-d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null
# Happy path: clear the trap and close all tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${k8s_tunnel}"
stop_ssh_tunnel node01 "${lb_tunnel}"
stop_ssh_tunnel node01 "${dns_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_workers() {
# Worker readiness: health endpoint, LightningStor gRPC port, and
# metrics endpoint on both workers, plus the CoronaFS API on node01;
# then prove CoronaFS block exports are reachable from both workers.
local worker
for worker in node04 node05; do
wait_for_http "${worker}" http://127.0.0.1:8084/health
done
for worker in node04 node05; do
wait_for_tcp_port "${worker}" 50086
done
for worker in node04 node05; do
wait_for_http "${worker}" http://127.0.0.1:9098/metrics
done
wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
log "Validating CoronaFS block export accessibility on worker nodes"
local coronafs_tunnel=""
local probe_volume="worker-probe-$(date +%s)"
coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
# Close the tunnel on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${coronafs_tunnel}"' RETURN
# Create a throwaway 64 MiB volume and export it as a block device.
coronafs_create_volume 15088 "${probe_volume}" $((64 * 1024 * 1024)) >/dev/null
local probe_export_json probe_uri
probe_export_json="$(coronafs_export_volume_json 15088 "${probe_volume}")"
probe_uri="$(jq -r '.export.uri' <<<"${probe_export_json}")"
[[ -n "${probe_uri}" && "${probe_uri}" != "null" ]] || die "CoronaFS probe volume did not return an export URI"
# node04 writes to the export, node05 reads it back, then clean up.
run_remote_nbd_fio_json node04 "${probe_uri}" write 1M 32 >/dev/null
run_remote_nbd_dd_read_json node05 "${probe_uri}" 32 >/dev/null
coronafs_delete_volume 15088 "${probe_volume}"
stop_ssh_tunnel node01 "${coronafs_tunnel}"
trap - RETURN
}
validate_nested_kvm_workers() {
# Confirm each worker VM supports nested virtualization: the KVM module
# loads, /dev/kvm exists, the vCPU advertises VMX/SVM, and a minimal
# KVM-accelerated QEMU guest actually boots and stays alive.
log "Validating nested KVM inside worker VMs"
# Fix: declare the loop variable local so it does not leak into the
# caller's scope (sibling validators follow the same convention).
local node
for node in node04 node05; do
ssh_node_script "${node}" <<'EOS'
set -euo pipefail
# Load whichever vendor KVM module matches the virtual CPU; tolerate the
# module already being loaded or built in.
modprobe kvm_intel >/dev/null 2>&1 || modprobe kvm_amd >/dev/null 2>&1 || true
[[ -c /dev/kvm ]]
grep -Eq 'vmx|svm' /proc/cpuinfo
# Boot a throwaway guest with KVM acceleration using the worker's own
# kernel; -daemonize lets us poll the pidfile afterwards.
qemu-system-x86_64 \
-accel kvm \
-cpu host \
-machine q35 \
-m 256 \
-display none \
-nodefaults \
-no-reboot \
-daemonize \
-pidfile /tmp/nested-kvm.pid \
-serial file:/tmp/nested-kvm.log \
-kernel /run/current-system/kernel \
-append 'console=ttyS0' >/tmp/nested-kvm.cmd.log 2>&1
sleep 5
# The guest must still be alive five seconds in, then shut it down.
kill -0 "$(cat /tmp/nested-kvm.pid)"
kill "$(cat /tmp/nested-kvm.pid)"
EOS
done
}
validate_lightningstor_distributed_storage() {
# Prove that a LightningStor object written through node01 is replicated
# to node04/node05 and removed everywhere on delete: PUT/HEAD/GET/DELETE
# one probe object and compare per-node object counts before and after.
log "Validating distributed LightningStor object replication across node01/node04/node05"
local iam_tunnel="" ls_tunnel=""
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
# Close both tunnels on any early (failed) return under set -e.
trap 'stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
local org_id="smoke-org"
local project_id="smoke-project"
local principal_id="lightningstor-smoke-$(date +%s)"
local token
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
# Timestamped bucket; wait until it has write quorum before probing.
local bucket="dist-smoke-$(date +%s)"
ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"
wait_for_lightningstor_write_quorum 15086 "${token}" "${bucket}" "distributed LightningStor validation"
# Baseline per-node object counts (node01/node04/node05), read from the
# triplet helper via process substitution.
local before_node01 before_node04 before_node05
read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)
local key="replication-check-$(date +%s)"
local body="distributed-object-${key}"
local body_b64
body_b64="$(printf '%s' "${body}" | base64 -w0)"
local put_json head_json delete_json output
put_json="$(
jq -cn \
--arg bucket "${bucket}" \
--arg key "${key}" \
--arg body "${body_b64}" \
'{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
)"
# PUT the probe object (body is base64-encoded per the API contract).
log "LightningStor distributed replication: PUT ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${put_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject
)" || die "failed to write LightningStor distributed replication probe ${bucket}/${key}: ${output}"
head_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
# HEAD must report the exact byte size of the plaintext body.
log "LightningStor distributed replication: HEAD ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
)" || die "failed to head LightningStor distributed replication probe ${bucket}/${key}: ${output}"
printf '%s\n' "${output}" \
| jq -e --arg size "$(printf '%s' "${body}" | wc -c | awk '{print $1}')" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
|| die "LightningStor distributed replication probe ${bucket}/${key} returned unexpected metadata: ${output}"
local fetched_body
# GET streams the object back; head_json is reused here since GetObject
# takes the same {bucket, key} request shape.
log "LightningStor distributed replication: GET ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${head_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/GetObject
)" || die "failed to fetch LightningStor distributed replication probe ${bucket}/${key}: ${output}"
# Slurp the streamed response messages, base64-decode every bodyChunk,
# and join them back into the original payload.
# NOTE(review): the duplicate -r in "-rsr" is redundant but harmless.
fetched_body="$(printf '%s\n' "${output}" | jq -rsr '[.[] | .bodyChunk? | select(. != null) | @base64d] | join("")')" \
|| die "failed to decode LightningStor distributed replication probe ${bucket}/${key}: ${output}"
[[ "${fetched_body}" == "${body}" ]] || die "distributed LightningStor returned unexpected object payload"
# Replication check: all three nodes must now hold more objects than the
# baseline; after DELETE, counts must return to the baseline exactly.
wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "generic object replication"
delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
log "LightningStor distributed replication: DELETE ${bucket}/${key}"
output="$(
grpcurl_capture -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
-proto "${LIGHTNINGSTOR_PROTO}" \
-d "${delete_json}" \
127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject
)" || die "failed to delete LightningStor distributed replication probe ${bucket}/${key}: ${output}"
wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "generic object cleanup"
# Happy path: clear the trap and close the tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node01 "${ls_tunnel}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_vm_storage_flow() {
  # End-to-end PlasmaVMC storage validation:
  #   1. import a bootable qcow2 guest image through ImageService (object
  #      backed by the LightningStor "plasmavmc-images" bucket),
  #   2. create and start a VM whose root + data disks are CoronaFS shared
  #      volumes, confirming guest boot markers on the serial console,
  #   3. stop/restart the VM, live-migrate it to the peer worker
  #      (node04 <-> node05), and restart it on the destination,
  #   4. delete the VM and image, asserting every artifact (runtime dir,
  #      CoronaFS volume files + metadata, LightningStor objects, replica
  #      object counts) is cleaned up.
  log "Validating PlasmaVMC image import, shared-volume execution, and live migration"
  # Local forwards into node01: IAM (50080), LightningStor (50086),
  # PlasmaVMC (50082), and the CoronaFS API.
  local iam_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  local image_source_path=""
  local node01_proto_root="/var/lib/plasmavmc/test-protos"
  # Cleanup fires on function RETURN (and is invoked once more explicitly at
  # the end); removing the staged image file is best-effort.
  cleanup_vm_storage_flow() {
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${coronafs_tunnel}"
    stop_ssh_tunnel node01 "${vm_tunnel}"
    stop_ssh_tunnel node01 "${ls_tunnel}"
    stop_ssh_tunnel node01 "${iam_tunnel}"
  }
  trap cleanup_vm_storage_flow RETURN
  wait_for_plasmavmc_workers_registered 15082
  local org_id="vm-smoke-org"
  local project_id="vm-smoke-project"
  local principal_id="plasmavmc-smoke-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC image import"
  local image_name="vm-image-$(date +%s)"
  local image_id=""
  local guest_image_local_path guest_image_sha guest_image_size remote_guest_image_sha
  local image_before_node01 image_before_node04 image_before_node05
  local image_after_node01 image_after_node04 image_after_node05
  # Baseline LightningStor object counts on the three replica nodes; the
  # replication waits below compare against these snapshots.
  read -r image_before_node01 image_before_node04 image_before_node05 < <(lightningstor_count_triplet)
  guest_image_local_path="$(guest_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  guest_image_size="$(stat -c %s "${guest_image_local_path}")"
  # Stage the guest image plus the proto files that remote grpcurl
  # invocations on node01 need.
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  ssh_node node01 "install -d -m 0755 ${node01_proto_root}/iam ${node01_proto_root}/plasmavmc ${node01_proto_root}/lightningstor"
  scp_to_node node01 "${IAM_PROTO}" "${node01_proto_root}/iam/iam.proto"
  scp_to_node node01 "${PLASMAVMC_PROTO}" "${node01_proto_root}/plasmavmc/plasmavmc.proto"
  scp_to_node node01 "${LIGHTNINGSTOR_PROTO}" "${node01_proto_root}/lightningstor/lightningstor.proto"
  # Drop stale imports from previous runs before staging the fresh one.
  ssh_node node01 "find /var/lib/plasmavmc/imports -maxdepth 1 -type f -name 'vm-image-*.qcow2' -delete"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  remote_guest_image_sha="$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")"
  [[ "${remote_guest_image_sha}" == "${guest_image_sha}" ]] || die "bootable VM guest image checksum mismatch after host distribution"
  local create_image_json
  log "Matrix case: PlasmaVMC + LightningStor"
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"smoke",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"smoke", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  local create_image_response
  # CreateImage runs on node01 itself (the file:// source URL refers to a
  # path on node01); the request JSON travels base64-encoded to survive
  # shell quoting across ssh.
  create_image_response="$(
    ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_image_json}" | base64 | tr -d '\n')" <<'EOS'
set -euo pipefail
proto_root="$1"
token="$2"
request_b64="$3"
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${proto_root}/plasmavmc" \
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
-d "${request_json}" \
127.0.0.1:50082 plasmavmc.v1.ImageService/CreateImage
EOS
  )"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "failed to create image through PlasmaVMC"
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE" and .format == "IMAGE_FORMAT_QCOW2"' >/dev/null
  # Object key layout used by the image store inside the bucket.
  local image_key="${org_id}/${project_id}/${image_id}.qcow2"
  local get_image_json
  get_image_json="$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')"
  # The image must be visible through both GetImage and ListImages.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/GetImage \
    | jq -e --arg image "${image_id}" '.id == $image and .status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" '{orgId:$org, pageSize:100, pageToken:"", includePublic:false}')" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/ListImages \
    | jq -e --arg image "${image_id}" '.images | any(.id == $image)' >/dev/null
  # The stored object's size must match the local guest image exactly.
  local head_image_json head_image_response
  head_image_json="$(jq -cn --arg bucket "plasmavmc-images" --arg key "${image_key}" '{bucket:$bucket, key:$key}')"
  head_image_response="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${head_image_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
  )" || die "failed to head imported PlasmaVMC image object ${image_key}: ${head_image_response}"
  printf '%s\n' "${head_image_response}" \
    | jq -e --arg size "${guest_image_size}" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
    || die "imported PlasmaVMC image object ${image_key} returned unexpected size: ${head_image_response}"
  local image_checksum
  image_checksum="$(printf '%s' "${create_image_response}" | jq -r '.checksum')"
  [[ -n "${image_checksum}" && "${image_checksum}" != "null" ]] || die "CreateImage response did not return an imported image checksum"
  # CreateImage computes the checksum from the normalized qcow2 artifact before upload.
  [[ "${image_checksum}" == "${guest_image_sha}" ]] || die "imported PlasmaVMC image checksum mismatch"
  # The staged source file is no longer needed once imported; clearing the
  # variable keeps cleanup from re-deleting it.
  ssh_node node01 "rm -f ${image_source_path}"
  image_source_path=""
  wait_for_lightningstor_counts_greater_than "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image import"
  # Post-import counts: subsequent VM operations must NOT change these
  # (shared-fs volumes never land in object storage).
  read -r image_after_node01 image_after_node04 image_after_node05 < <(lightningstor_count_triplet)
  # REST-shaped spec; within this flow only its generated .name is consumed
  # (fed into the gRPC request below).
  local create_vm_rest_json
  create_vm_rest_json="$(
    jq -cn \
      --arg name "smoke-vm-$(date +%s)" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        org_id:$org,
        project_id:$project,
        hypervisor:"kvm",
        vcpus:1,
        memory_mib:1024,
        disks:[
          {
            id:"root",
            source:{type:"image", image_id:$image_id},
            size_gib:4,
            boot_index:1
          },
          {
            id:"data",
            source:{type:"blank"},
            size_gib:2
          }
        ]
      }'
  )"
  local create_vm_grpc_json
  create_vm_grpc_json="$(
    jq -cn \
      --arg name "$(printf '%s' "${create_vm_rest_json}" | jq -r '.name')" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        orgId:$org,
        projectId:$project,
        hypervisor:"HYPERVISOR_TYPE_KVM",
        spec:{
          cpu:{vcpus:1, coresPerSocket:1, sockets:1},
          memory:{sizeMib:1024},
          disks:[
            {
              id:"root",
              source:{imageId:$image_id},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE",
              bootIndex:1
            },
            {
              id:"data",
              source:{blank:true},
              sizeGib:2,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE"
            }
          ]
        }
      }'
  )"
  local create_response vm_id
  create_response="$(
    ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_vm_grpc_json}" | base64 | tr -d '\n')" <<'EOS'
set -euo pipefail
proto_root="$1"
token="$2"
request_b64="$3"
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
grpcurl -plaintext \
-H "authorization: Bearer ${token}" \
-import-path "${proto_root}/plasmavmc" \
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
-d "${request_json}" \
127.0.0.1:50082 plasmavmc.v1.VmService/CreateVm
EOS
  )"
  vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
  [[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "failed to create VM through PlasmaVMC"
  local get_vm_json
  get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
  # Wait for the scheduler to place the VM on one of the two workers.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local node_id=""
  local peer_node=""
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
      fi
      sleep 2
      continue
    fi
    node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
    if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
    fi
    sleep 2
  done
  # peer_node is the other worker -- the live-migration destination later.
  if [[ "${node_id}" == "node04" ]]; then
    peer_node="node05"
  else
    peer_node="node04"
  fi
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  # Wait for desired AND actual state to converge on RUNNING.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to reach RUNNING"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to reach RUNNING"
    fi
    sleep 2
  done
  log "Matrix case: PlasmaVMC + CoronaFS"
  # Both disks are provisioned as CoronaFS volumes named <vm>-root/<vm>-data,
  # materialized as .raw files under the shared volume root on node01.
  local volume_id="${vm_id}-root"
  local data_volume_id="${vm_id}-data"
  local volume_path="${CORONAFS_VOLUME_ROOT}/${volume_id}.raw"
  local data_volume_path="${CORONAFS_VOLUME_ROOT}/${data_volume_id}.raw"
  local volume_export_json data_volume_export_json volume_uri data_volume_uri
  volume_export_json="$(coronafs_export_volume_json 15088 "${volume_id}")"
  data_volume_export_json="$(coronafs_export_volume_json 15088 "${data_volume_id}")"
  volume_uri="$(printf '%s' "${volume_export_json}" | jq -r '.export.uri')"
  data_volume_uri="$(printf '%s' "${data_volume_export_json}" | jq -r '.export.uri')"
  [[ -n "${volume_uri}" && "${volume_uri}" != "null" ]] || die "CoronaFS root volume export URI missing"
  [[ -n "${data_volume_uri}" && "${data_volume_uri}" != "null" ]] || die "CoronaFS data volume export URI missing"
  ssh_node node01 "test -f ${volume_path}"
  ssh_node node01 "test -f ${data_volume_path}"
  # The running QEMU on the scheduled worker must reference both exports.
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  # Booting from shared volumes must not add objects to LightningStor.
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM startup"
  # Guest boot markers: count=N tracks how many boots the persistent disks
  # have observed.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=1"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=1"
  log "Matrix case: PlasmaVMC + CoronaFS + LightningStor"
  # Stop/restart cycle: state persists across boots on the shared volumes.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop after first boot"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop after first boot"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      # The restart may land on either worker; re-read the placement.
      node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
    fi
    sleep 2
  done
  if [[ "${node_id}" == "node04" ]]; then
    peer_node="node05"
  else
    peer_node="node04"
  fi
  # count=2 proves the disks survived the stop/start cycle.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=2"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=2"
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM restart"
  # Live migration to the peer worker, waiting server-side for completion.
  local migrate_vm_json
  migrate_vm_json="$(
    jq -cn \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg vm "${vm_id}" \
      --arg destination_node "${peer_node}" \
      '{
        orgId:$org,
        projectId:$project,
        vmId:$vm,
        destinationNodeId:$destination_node,
        timeoutSeconds:120,
        wait:true
      }'
  )"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${migrate_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/MigrateVm >/dev/null
  local source_node="${node_id}"
  local destination_node="${peer_node}"
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} live migration to ${destination_node}"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e --arg node "${destination_node}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} live migration to ${destination_node}"
    fi
    sleep 2
  done
  node_id="${destination_node}"
  # Volumes must now be attached on the destination and released on the
  # source.
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  wait_for_qemu_volume_absent "${source_node}" "${volume_uri}"
  wait_for_qemu_volume_absent "${source_node}" "${data_volume_uri}"
  # Heartbeat count unchanged at 2: the guest kept running (live migration,
  # not a reboot).
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_HEARTBEAT count=2"
  # Stop and restart on the migrated worker to prove placement sticks.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop after live migration"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop after live migration"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e --arg node "${node_id}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
    fi
    sleep 2
  done
  wait_for_qemu_volume_present "${node_id}" "${volume_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_volume_uri}"
  # count=3: third boot, still on the same persistent shared volumes.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=3"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=3"
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM post-migration restart"
  # Teardown: stop, delete, and verify full cleanup everywhere.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for VM ${vm_id} to stop"
      fi
      sleep 2
      continue
    fi
    if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} to stop"
    fi
    sleep 2
  done
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/DeleteVm >/dev/null
  # Deletion has propagated once GetVm starts failing.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ! grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PLASMAVMC_PROTO_DIR}" \
      -proto "${PLASMAVMC_PROTO}" \
      -d "${get_vm_json}" \
      127.0.0.1:15082 plasmavmc.v1.VmService/GetVm >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM ${vm_id} deletion to propagate"
    fi
    sleep 2
  done
  # Runtime dir gone on the worker; volume files and CoronaFS metadata gone.
  ssh_node "${node_id}" "bash -lc '[[ ! -d $(printf '%q' "$(vm_runtime_dir_path "${vm_id}")") ]]'"
  ssh_node node01 "bash -lc '[[ ! -f ${volume_path} ]]'"
  ssh_node node01 "bash -lc '[[ ! -f ${data_volume_path} ]]'"
  if coronafs_get_volume_json 15088 "${volume_id}" >/dev/null 2>&1; then
    die "CoronaFS root volume metadata still exists after VM deletion"
  fi
  if coronafs_get_volume_json 15088 "${data_volume_id}" >/dev/null 2>&1; then
    die "CoronaFS data volume metadata still exists after VM deletion"
  fi
  wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM deletion"
  # The image object itself must survive VM deletion...
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${head_image_json}" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject \
    | jq -e '(.object.size | tonumber) > 0' >/dev/null
  # ...while the VM's volumes must never have been written to object storage.
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${volume_id}.raw" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "shared-fs VM volume unexpectedly persisted to LightningStor object storage"
  fi
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${data_volume_id}.raw" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "shared-fs VM data volume unexpectedly persisted to LightningStor object storage"
  fi
  # Deleting the image must remove its backing object and restore the
  # baseline replica counts.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${get_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null
  if grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${head_image_json}" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
    die "image object still present after ImageService/DeleteImage"
  fi
  wait_for_lightningstor_counts_equal "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image cleanup"
  trap - RETURN
  cleanup_vm_storage_flow
}
validate_gateway() {
  # Verify gateway endpoints answer inside node06 and via host forwarding,
  # then drive the VPC/VM proxy routes through a create/list/get/delete
  # round-trip using a freshly minted project-admin token.
  wait_for_http node06 http://127.0.0.1:8080/health
  wait_for_http node06 http://127.0.0.1:9090/api/v1/series
  wait_for_tcp_port node06 50089
  wait_for_http node06 http://127.0.0.1:3011/health
  log "Validating host-forwarded gateway endpoints"
  wait_for_host_http http://127.0.0.1:8080/health
  wait_for_host_http http://127.0.0.1:9090/api/v1/series
  log "Validating API Gateway proxy routes"
  local gw_iam_tunnel=""
  gw_iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  trap 'stop_ssh_tunnel node01 "${gw_iam_tunnel}"' RETURN
  local org_id="gateway-smoke-org"
  local project_id="gateway-smoke-project"
  local principal_id="gateway-smoke-$(date +%s)"
  local token create_payload create_response created_vpc_id
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  create_payload="$(jq -cn --arg name "gateway-smoke-vpc" --arg org "${org_id}" --arg project "${project_id}" \
    '{name:$name, org_id:$org, project_id:$project, cidr_block:"10.55.0.0/16", description:"gateway proxy smoke"}')"
  create_response="$(
    curl -fsS \
      -X POST http://127.0.0.1:8080/api/v1/vpcs \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "${create_payload}"
  )"
  created_vpc_id="$(printf '%s' "${create_response}" | jq -r '.data.id')"
  [[ -n "${created_vpc_id}" && "${created_vpc_id}" != "null" ]] || die "API Gateway VPC create did not return an ID"
  # The listing endpoint must include the VPC we just created.
  curl -fsS --get http://127.0.0.1:8080/api/v1/vpcs \
    -H "Authorization: Bearer ${token}" \
    --data-urlencode "org_id=${org_id}" \
    --data-urlencode "project_id=${project_id}" \
    | jq -e --arg id "${created_vpc_id}" '.data.vpcs | any(.id == $id)' >/dev/null
  # A direct GET must return the same VPC by ID.
  curl -fsS http://127.0.0.1:8080/api/v1/vpcs/"${created_vpc_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg id "${created_vpc_id}" '.data.id == $id' >/dev/null
  # VM listing must proxy through and yield a non-null collection.
  curl -fsS http://127.0.0.1:8080/api/v1/vms \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.vms != null' >/dev/null
  curl -fsS -X DELETE http://127.0.0.1:8080/api/v1/vpcs/"${created_vpc_id}" \
    -H "Authorization: Bearer ${token}" >/dev/null
  trap - RETURN
  stop_ssh_tunnel node01 "${gw_iam_tunnel}"
}
validate_nightlight_flow() {
  # Push one uniquely named sample via the remote_write helper, then verify
  # it becomes visible through the query, label-values, and series endpoints
  # within HTTP_WAIT_TIMEOUT seconds.
  log "Validating NightLight remote_write ingestion and query endpoints"
  local metric_name="nightlight_smoke_metric_$(date +%s)"
  local metric_value
  metric_value="$(awk 'BEGIN{srand(); printf "%.3f\n", (rand()*100)+1}')"
  python3 "${REPO_ROOT}/nix/test-cluster/nightlight_remote_write.py" \
    --url http://127.0.0.1:9090/api/v1/write \
    --metric "${metric_name}" \
    --value "${metric_value}" \
    --label source=smoke \
    --label cluster=photoncloud
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    # Prometheus-style query responses encode the sample value .value[1] as
    # a JSON string; convert with tonumber before range-comparing, because
    # under jq's total ordering every string sorts above every number, so a
    # raw string-vs-number `<=` check can never be true. tonumber is the
    # identity for values already emitted as numbers.
    if curl -fsS --get http://127.0.0.1:9090/api/v1/query \
      --data-urlencode "query=${metric_name}{source=\"smoke\"}" \
      | jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
          .status == "success"
          and (.data.result | length) >= 1
          and (.data.result | any(
            .metric.__name__ == $name
            and ((.value[1] | tonumber) >= ($expected - 0.001))
            and ((.value[1] | tonumber) <= ($expected + 0.001))
          ))
        ' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for NightLight query result for ${metric_name}"
    fi
    sleep 2
  done
  # The new metric name must also appear in label values and series listings.
  curl -fsS http://127.0.0.1:9090/api/v1/label/__name__/values \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | index($name)) != null' >/dev/null
  curl -fsS http://127.0.0.1:9090/api/v1/series \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.__name__ == $name))' >/dev/null
}
validate_creditservice_flow() {
  # Exercise CreditService over both transports: wallet lifecycle
  # (create -> topup -> reserve -> commit -> transaction history) via REST
  # on node06:3011, quota set/get/list/check via gRPC on node06:50089, then
  # confirm on node06 that the service is active and logged its IAM
  # connection attempt.
  log "Validating CreditService REST and gRPC quota flows"
  local iam_tunnel="" credit_grpc_tunnel="" credit_http_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  credit_grpc_tunnel="$(start_ssh_tunnel node06 15089 50089)"
  credit_http_tunnel="$(start_ssh_tunnel node06 13011 3011)"
  trap 'stop_ssh_tunnel node06 "${credit_http_tunnel}"; stop_ssh_tunnel node06 "${credit_grpc_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  # Timestamp suffix keeps org/project unique across repeated smoke runs.
  local suffix
  suffix="$(date +%s)"
  local org_id="credit-smoke-org-${suffix}"
  local project_id="credit-smoke-project-${suffix}"
  local principal_id="credit-smoke-$(date +%s)"
  local token reservation_json reservation_id
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  # Create wallet with 1000 credits; balance and available must match.
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/wallets \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{org_id:$org, project_id:$project, initial_balance:1000}')" \
    | jq -e '.data.project_id != null and .data.balance == 1000 and .data.available == 1000' >/dev/null
  curl -fsS http://127.0.0.1:13011/api/v1/wallets/"${project_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg project "${project_id}" '.data.project_id == $project and .data.balance == 1000' >/dev/null
  # Topup 250 -> balance 1250.
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/wallets/"${project_id}"/topup \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"amount":250,"description":"smoke topup"}' \
    | jq -e '.data.balance == 1250 and .data.total_deposited == 1250' >/dev/null
  # Reserve 200 with a short TTL, then commit only 150 of it:
  # 1250 - 150 = 1100, with the reservation fully released.
  reservation_json="$(
    curl -fsS \
      -X POST http://127.0.0.1:13011/api/v1/reservations \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "$(jq -cn --arg project "${project_id}" '{project_id:$project, amount:200, description:"smoke reservation", resource_type:"vm", ttl_seconds:120}')"
  )"
  reservation_id="$(printf '%s' "${reservation_json}" | jq -r '.data.id')"
  [[ -n "${reservation_id}" && "${reservation_id}" != "null" ]] || die "CreditService reservation did not return an ID"
  curl -fsS \
    -X POST http://127.0.0.1:13011/api/v1/reservations/"${reservation_id}"/commit \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"actual_amount":150,"resource_id":"smoke-vm"}' \
    | jq -e '.data.balance == 1100 and .data.reserved == 0 and .data.available == 1100' >/dev/null
  # Create + topup + commit must have produced at least three transactions.
  curl -fsS http://127.0.0.1:13011/api/v1/wallets/"${project_id}"/transactions \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.transactions | length >= 3' >/dev/null
  # Quota flow over gRPC. The `== "2" or == 2` checks tolerate proto JSON
  # rendering int64 fields as either string or number.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", limit:2}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/SetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE"}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/GetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/ListQuotas \
    | jq -e '.quotas | length >= 1' >/dev/null
  # Requesting quantity 3 against a limit of 2 must be denied.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", quantity:3, estimatedCost:0}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/CheckQuota \
    | jq -e '(.allowed // false) == false and (.availableQuota == "2" or .availableQuota == 2)' >/dev/null
  # On-node sanity: unit active and IAM connection attempt in the journal.
  ssh_node_script node06 <<'EOS'
set -euo pipefail
systemctl is-active --quiet creditservice.service
journalctl -u creditservice.service --no-pager | grep -F 'Connecting to IAM server at' >/dev/null
EOS
  trap - RETURN
  stop_ssh_tunnel node06 "${credit_http_tunnel}"
  stop_ssh_tunnel node06 "${credit_grpc_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
validate_deployer_flow() {
  # Drive the Deployer through its three smoke paths: the health probe, an
  # admin-token node registration + listing, and the bootstrap-token
  # phone-home handshake that returns the node's provisioning config.
  log "Validating Deployer health, admin registration, and phone-home flows"
  local dep_tunnel=""
  dep_tunnel="$(start_ssh_tunnel node06 13012 8088)"
  trap 'stop_ssh_tunnel node06 "${dep_tunnel}"' RETURN
  wait_for_http node06 "http://127.0.0.1:8088/health"
  curl -fsS http://127.0.0.1:13012/health | grep -Fx 'OK' >/dev/null
  local smoke_machine smoke_node register_payload phone_home_payload phone_home_response
  smoke_machine="smoke-machine-$(date +%s)"
  smoke_node="smoke-node-$(date +%s)"
  register_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, role:"worker", ip:"10.100.0.250", services:["plasmavmc"], ssh_authorized_keys:["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFiberLBSmokeKey smoke@test"]}')"
  # Registration must echo back both identifiers.
  curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-admin-token' \
    -d "${register_payload}" \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg machine "${smoke_machine}" --arg node "${smoke_node}" '.success == true and .machine_id == $machine and .node_id == $node' >/dev/null
  # The admin listing must show the node with the attributes we registered.
  curl -fsS \
    -H 'x-deployer-token: test-admin-token' \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg node "${smoke_node}" '.nodes | any(.node_id == $node and .ip == "10.100.0.250" and .role == "worker")' >/dev/null
  phone_home_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, hostname:$node, ip:"10.100.0.250", metadata:{rack:"smoke-a1"}}')"
  phone_home_response="$(curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-bootstrap-token' \
    -d "${phone_home_payload}" \
    http://127.0.0.1:13012/api/v1/phone-home)"
  # Phone-home must hand back the provisioning config registered above.
  printf '%s' "${phone_home_response}" | jq -e --arg node "${smoke_node}" '
    .success == true and
    .node_id == $node and
    .state == "provisioning" and
    .node_config.hostname == $node and
    .node_config.role == "worker" and
    (.node_config.services | index("plasmavmc")) != null
  ' >/dev/null
  trap - RETURN
  stop_ssh_tunnel node06 "${dep_tunnel}"
}
# Drive the native (non-VM) runtime orchestration path end-to-end:
# deployer apply -> scheduler placement -> node-agent launching a process
# service (python http.server) and a container service (nginx via podman)
# on the workers, plus DNS + FiberLB publication. Then exercise four
# lifecycle phases: drain node04, restore node04, hard-stop node05
# (failover), and restart node05 (placement stability).
# Globals read: HTTP_WAIT_TIMEOUT, NODE_IPS, FIBERLB_PROTO_DIR, FIBERLB_PROTO, TMPDIR.
validate_native_runtime_flow() {
  log "Validating native deployer + scheduler runtime orchestration"
  # Workers node04/node05 run node-agent; node06 hosts the fleet scheduler.
  wait_for_unit node04 node-agent
  wait_for_unit node05 node-agent
  wait_for_unit node06 fleet-scheduler
  wait_for_http node06 "http://127.0.0.1:8088/health"
  local tmp_dir native_config drained_config restored_config
  local chainfire_tunnel_node01="" chainfire_tunnel_node02="" chainfire_tunnel_node03=""
  # deployer-ctl reaches the three-member chainfire quorum via local
  # port-forwards (12379/12380/12381 -> node01/02/03:2379).
  local chainfire_endpoint="http://127.0.0.1:12379,http://127.0.0.1:12380,http://127.0.0.1:12381"
  local iam_tunnel="" lb_tunnel="" token lb_name
  local native_fresh_healthy_map_expr native_fresh_healthy_count_expr
  tmp_dir="$(mktemp -d -p "${TMPDIR:-/tmp}" photon-native-runtime-XXXXXX)"
  native_config="${tmp_dir}/native-runtime.yaml"
  drained_config="${tmp_dir}/native-runtime-drained.yaml"
  restored_config="${tmp_dir}/native-runtime-restored.yaml"
  # jq filter selecting instances that are state == "healthy" AND whose
  # last_heartbeat (falling back to observed_at) timestamp, normalised to
  # strict ISO-8601 (fractional seconds stripped, "+00:00" rewritten to
  # "Z"), is at most 300 seconds old. Unparseable timestamps coerce to 0
  # and are therefore treated as stale.
  native_fresh_healthy_map_expr='map(select(.state == "healthy" and (((((.last_heartbeat // .observed_at) // "") | sub("\\.[0-9]+"; "") | sub("\\+00:00$"; "Z") | fromdateiso8601?) // 0) >= (now - 300))))'
  native_fresh_healthy_count_expr="${native_fresh_healthy_map_expr} | length"
  chainfire_tunnel_node01="$(start_ssh_tunnel node01 12379 2379 "${NODE_IPS[node01]}")"
  chainfire_tunnel_node02="$(start_ssh_tunnel node02 12380 2379 "${NODE_IPS[node02]}")"
  chainfire_tunnel_node03="$(start_ssh_tunnel node03 12381 2379 "${NODE_IPS[node03]}")"
  # Best-effort cleanup of tunnels and scratch dir if the function returns
  # early; the normal path clears this trap and cleans up explicitly below.
  trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"; stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"; stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"; stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"; rm -rf "${tmp_dir}"' RETURN
  # Dump the raw JSON values stored under a chainfire key prefix, one per line.
  native_dump_values() {
    local prefix="$1"
    run_deployer_ctl \
      --chainfire-endpoint "${chainfire_endpoint}" \
      --cluster-id "test-cluster" \
      --cluster-namespace "photoncloud" \
      --deployer-namespace "deployer" \
      dump --prefix "${prefix}" --format json \
      | jq -rc '.value'
  }
  # Poll a chainfire prefix until the jq aggregate over its values equals
  # the expected string (string comparison), or die after the timeout.
  wait_for_native_dump_count() {
    local prefix="$1"
    local jq_expr="$2"
    local expected="$3"
    local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    while true; do
      local count
      # A failed dump/parse counts as 0 rather than aborting the poll.
      count="$(
        native_dump_values "${prefix}" \
          | sed '/^$/d' \
          | jq -sr "${jq_expr}" 2>/dev/null \
          || printf '0'
      )"
      if [[ "${count}" == "${expected}" ]]; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for prefix ${prefix} to satisfy ${jq_expr} == ${expected} (got ${count})"
      fi
      sleep 2
    done
  }
  # Print the first (sorted by instance_id) fresh healthy instance record
  # for a service, as a JSON object (or "null" if none).
  native_first_healthy_instance() {
    local service="$1"
    native_dump_values "photoncloud/clusters/test-cluster/instances/${service}/" \
      | sed '/^$/d' \
      | jq -sr "${native_fresh_healthy_map_expr} | sort_by(.instance_id) | first"
  }
  # Poll until the first fresh healthy instance of a service reports the
  # expected node_id; prints the winning instance record on success.
  wait_for_native_instance_node() {
    local service="$1"
    local expected_node="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    local instance_value="" node_id=""
    while true; do
      instance_value="$(native_first_healthy_instance "${service}")"
      node_id="$(printf '%s' "${instance_value}" | jq -r '.node_id // empty')"
      if [[ "${node_id}" == "${expected_node}" ]]; then
        printf '%s' "${instance_value}"
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for ${service} to run on ${expected_node}"
      fi
      sleep 2
    done
  }
  # Print the publication record for the native-web service (or "null").
  native_publication_state() {
    native_dump_values "photoncloud/clusters/test-cluster/publications/" \
      | sed '/^$/d' \
      | jq -sr 'map(select(.service == "native-web")) | first'
  }
  # Poll node01's DNS resolver (port 5353) until fqdn resolves to exactly
  # the expected A record.
  wait_for_native_dns_record() {
    local fqdn="$1"
    local expected_ip="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    while true; do
      if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '${expected_ip}'" >/dev/null 2>&1; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native DNS record for ${fqdn}"
      fi
      sleep 2
    done
  }
  # Poll FiberLB ListBackends (via the node01 tunnel on 15085) until the
  # pool holds exactly expected_count backends AND every remaining
  # positional argument appears among the backend addresses.
  # NOTE: relies on ${token} being set by the enclosing function first.
  wait_for_native_lb_backends() {
    local pool_id="$1"
    local expected_count="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    shift 3
    local deadline=$((SECONDS + timeout))
    local response=""
    while true; do
      # Transient RPC failures leave response empty and simply retry.
      response="$(
        grpcurl_capture -plaintext \
          -H "authorization: Bearer ${token}" \
          -import-path "${FIBERLB_PROTO_DIR}" \
          -proto "${FIBERLB_PROTO}" \
          -d "$(jq -cn --arg pool_id "${pool_id}" '{poolId:$pool_id, pageSize:100, pageToken:""}')" \
          127.0.0.1:15085 fiberlb.v1.BackendService/ListBackends
      )" || true
      if printf '%s' "${response}" \
        | jq -e --argjson expected "${expected_count}" '(.backends | length) == $expected' >/dev/null 2>&1; then
        local matched=1
        local expected_ip
        for expected_ip in "$@"; do
          if ! printf '%s' "${response}" | jq -e --arg ip "${expected_ip}" '.backends | any(.address == $ip)' >/dev/null 2>&1; then
            matched=0
            break
          fi
        done
        if [[ "${matched}" == "1" ]]; then
          return 0
        fi
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native FiberLB backends for pool ${pool_id}: ${response}"
      fi
      sleep 2
    done
  }
  # Desired state #1: both workers provisioning (deployer is expected to
  # activate them), native-web at 2 replicas spread by failure_domain,
  # native-container at 1 replica. Heredoc is single-quoted so the
  # ${INSTANCE_*} placeholders reach the runtime unexpanded.
  cat >"${native_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: provisioning
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: provisioning
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # Desired state #2: node04 draining, node05 active, native-web scaled
  # down to 1 replica — everything should consolidate onto node05.
  cat >"${drained_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: draining
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: active
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # Desired state #3: both workers active again, still 1 web replica —
  # the scheduler should NOT reshuffle instances back onto node04.
  cat >"${restored_config}" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: test
node_classes:
  - name: worker-linux
    description: Native runtime worker
    roles:
      - worker
    labels:
      tier: general
      runtime: native
pools:
  - name: general
    description: General-purpose native worker pool
    node_class: worker-linux
    labels:
      pool.photoncloud.io/name: general
nodes:
  - node_id: node04
    hostname: node04
    ip: 10.100.0.21
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-b
    state: active
  - node_id: node05
    hostname: node05
    ip: 10.100.0.22
    roles:
      - worker
    labels:
      runtime: native
      pool: general
      node_class: worker-linux
      failure_domain: zone-c
    state: active
services:
  - name: native-web
    protocol: http
    ports:
      http: 18190
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        spread_by_label: failure_domain
        max_instances_per_node: 1
    instance_port: 18190
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 3
    publish:
      dns:
        zone: native.cluster.test
        name: web
        ttl: 30
      mode: load_balancer
      load_balancer:
        org_id: native-services
        project_id: test-cluster
        listener_port: 18191
        protocol: http
        pool_protocol: http
  - name: native-container
    protocol: http
    ports:
      http: 18192
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          runtime: native
        max_instances_per_node: 1
    instance_port: 18192
    container:
      image: docker.io/library/nginx:1.27-alpine
      runtime: podman
      pull_policy: if-not-present
      ports:
        - container_port: 80
          host_port: 18192
          protocol: tcp
    health_check:
      type: http
      path: /
      interval_secs: 5
      timeout_secs: 5
      startup_grace_secs: 120
instances: []
mtls_policies: []
EOF
  # --- Phase 1: initial apply; expect both workers active, two fresh
  # healthy web replicas and one container instance. ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${native_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.labels.runtime == "native" and .state == "active")) | length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    360
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    360
  # The process service must answer directly on both workers.
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  local container_value container_node container_ip container_port
  container_value="$(native_first_healthy_instance "native-container")"
  container_node="$(printf '%s' "${container_value}" | jq -r '.node_id')"
  container_ip="$(printf '%s' "${container_value}" | jq -r '.ip')"
  container_port="$(printf '%s' "${container_value}" | jq -r '.port')"
  [[ -n "${container_node}" && "${container_node}" != "null" ]] || die "native-container did not report a healthy instance"
  wait_for_http "${container_node}" "http://${container_ip}:${container_port}/" 360
  # The published LB listener on node01 fronts the web replicas.
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/publications/" \
    'map(select(.service == "native-web")) | length' \
    "1" \
    180
  # IAM/FiberLB tunnels and a project-admin token for publication checks.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  token="$(issue_project_admin_token 15080 "native-services" "test-cluster" "native-runtime-$(date +%s)")"
  lb_name="test-cluster-native-web"
  # The publication must have materialised as a FiberLB load balancer.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn '{orgId:"native-services", projectId:"test-cluster", pageSize:100, pageToken:""}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/ListLoadBalancers \
    | jq -e --arg name "${lb_name}" '.loadbalancers | any(.name == $name)' >/dev/null
  local publication_value publication_fqdn publication_ip publication_pool_id
  publication_value="$(native_publication_state)"
  publication_fqdn="$(printf '%s' "${publication_value}" | jq -r '.dns.fqdn')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  [[ -n "${publication_fqdn}" && "${publication_fqdn}" != "null" ]] || die "native publication missing fqdn"
  [[ -n "${publication_ip}" && "${publication_ip}" != "null" ]] || die "native publication missing dns value"
  [[ -n "${publication_pool_id}" && "${publication_pool_id}" != "null" ]] || die "native publication missing pool id"
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
  # --- Phase 2: drain node04; all instances must consolidate on node05
  # and the LB pool must shrink to the single remaining backend. ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${drained_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local drained_web_value drained_web_node drained_container_value drained_container_node
  drained_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
  drained_web_node="$(printf '%s' "${drained_web_value}" | jq -r '.node_id')"
  [[ "${drained_web_node}" == "node05" ]] || die "native-web did not relocate to node05 after draining node04"
  drained_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  drained_container_node="$(printf '%s' "${drained_container_value}" | jq -r '.node_id')"
  [[ "${drained_container_node}" == "node05" ]] || die "native-container did not relocate to node05 after draining node04"
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18192/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # Publication state may have been re-written; re-read ids before checking.
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  # --- Phase 3: node04 back to active; placement must stay on node05
  # (no gratuitous rescheduling). ---
  run_deployer_ctl \
    --chainfire-endpoint "${chainfire_endpoint}" \
    --cluster-id "test-cluster" \
    --cluster-namespace "photoncloud" \
    --deployer-namespace "deployer" \
    apply --config "${restored_config}"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local restored_web_value restored_web_node restored_container_value restored_container_node
  restored_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
  restored_web_node="$(printf '%s' "${restored_web_value}" | jq -r '.node_id')"
  [[ "${restored_web_node}" == "node05" ]] || die "native-web unexpectedly moved after node04 returned to service"
  restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')"
  [[ "${restored_container_node}" == "node05" ]] || die "native-container unexpectedly moved after node04 returned to service"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # --- Phase 4: hard-stop node05; instances must fail over to node04 and
  # LB/DNS must follow. ---
  log "Simulating native worker loss and scheduler failover"
  stop_vm node05
  wait_for_ssh_down node05 120
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local failover_web_value failover_web_node failover_container_value failover_container_node
  failover_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
  failover_web_node="$(printf '%s' "${failover_web_value}" | jq -r '.node_id')"
  [[ "${failover_web_node}" == "node04" ]] || die "native-web did not fail over to node04 after node05 stopped"
  failover_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  failover_container_node="$(printf '%s' "${failover_container_value}" | jq -r '.node_id')"
  [[ "${failover_container_node}" == "node04" ]] || die "native-container did not fail over to node04 after node05 stopped"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 240 10.100.0.21
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node04 "http://10.100.0.21:18192/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # --- Phase 5: restart node05; placement must stay pinned to node04
  # (no churn back to the recovered worker). ---
  log "Restarting native worker and ensuring placement stays stable"
  start_vm node05
  wait_for_ssh node05
  wait_for_unit node05 plasmavmc
  wait_for_unit node05 lightningstor
  wait_for_unit node05 node-agent
  local recovered_web_value recovered_web_node recovered_container_value recovered_container_node
  recovered_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
  recovered_web_node="$(printf '%s' "${recovered_web_value}" | jq -r '.node_id')"
  [[ "${recovered_web_node}" == "node04" ]] || die "native-web unexpectedly churned after node05 recovered"
  recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')"
  [[ "${recovered_container_node}" == "node04" ]] || die "native-container unexpectedly churned after node05 recovered"
  publication_value="$(native_publication_state)"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.21
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # Normal-path cleanup: clear the RETURN trap and tear down explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${lb_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
  stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"
  stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"
  stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"
  rm -rf "${tmp_dir}"
}
# Compose PrismNet (VPC/subnet/port), FlashDNS (zone/records), and FiberLB
# (LB/pool/backend/listener) for one tenant, checking each pairwise
# combination and then the full stack (DNS name -> LB listener -> backend).
# All created resources are deleted again on every exit path.
# Globals read: HTTP_WAIT_TIMEOUT, PRISMNET_PROTO_DIR/PRISMNET_PROTO,
# FLASHDNS_PROTO_DIR/FLASHDNS_PROTO, FIBERLB_PROTO_DIR/FIBERLB_PROTO.
validate_network_provider_matrix() {
  log "Validating component matrix: PrismNet, FlashDNS, and FiberLB in composed tenant scenarios"
  local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel=""
  # Local forwards to the control-plane gRPC services on node01.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
  dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  local org_id="matrix-net-org"
  local project_id="matrix-net-project"
  # Timestamp suffixes keep principal/zone names unique across reruns.
  local principal_id="matrix-net-$(date +%s)"
  local token=""
  local vpc_id="" subnet_id="" port_id="" port_ip=""
  local zone_id="" zone_name="matrix-$(date +%s).cluster.test"
  local workload_record_id="" service_record_id=""
  local lb_id="" pool_id="" backend_id="" listener_id="" listener_port=""
  local workload_fqdn="" service_fqdn=""
  # Best-effort teardown in reverse creation order. Each delete tolerates
  # the resource never having been created (empty id) or the RPC failing.
  cleanup_network_provider_matrix() {
    if [[ -n "${service_record_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${service_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${workload_record_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${workload_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${listener_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null 2>&1 || true
    fi
    if [[ -n "${backend_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null 2>&1 || true
    fi
    if [[ -n "${pool_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null 2>&1 || true
    fi
    if [[ -n "${lb_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null 2>&1 || true
    fi
    if [[ -n "${port_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
        127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null 2>&1 || true
    fi
    if [[ -n "${subnet_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
        127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null 2>&1 || true
    fi
    if [[ -n "${vpc_id}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
        127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null 2>&1 || true
    fi
    if [[ -n "${zone_id}" ]]; then
      # force:true removes the zone even if records linger.
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
        127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${lb_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${dns_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${prism_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  # EXIT is trapped as well so that die (which exits) still cleans up.
  trap cleanup_network_provider_matrix RETURN EXIT
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  log "Matrix case: PrismNet only"
  vpc_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, name:"matrix-vpc", description:"component matrix", cidrBlock:"10.52.0.0/16"}')" \
    127.0.0.1:15081 prismnet.VpcService/CreateVpc | jq -r '.vpc.id')"
  [[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "component matrix PrismNet VPC creation failed"
  subnet_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"matrix-subnet", description:"component matrix", cidrBlock:"10.52.10.0/24", gatewayIp:"10.52.10.1", dhcpEnabled:true}')" \
    127.0.0.1:15081 prismnet.SubnetService/CreateSubnet | jq -r '.subnet.id')"
  [[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "component matrix PrismNet subnet creation failed"
  local port_response
  # Empty ipAddress asks PrismNet to allocate one from the subnet.
  port_response="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"matrix-port", description:"component matrix", ipAddress:""}')" \
    127.0.0.1:15081 prismnet.PortService/CreatePort)"
  port_id="$(printf '%s' "${port_response}" | jq -r '.port.id')"
  port_ip="$(printf '%s' "${port_response}" | jq -r '.port.ipAddress')"
  [[ -n "${port_id}" && "${port_id}" != "null" && -n "${port_ip}" && "${port_ip}" != "null" ]] || die "component matrix PrismNet port creation failed"
  log "Matrix case: PrismNet + FlashDNS"
  zone_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.matrix.test", adminEmail:"admin@matrix.test"}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone | jq -r '.zone.id')"
  [[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "component matrix FlashDNS zone creation failed"
  # A record pointing at the PrismNet-allocated port IP.
  workload_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" --arg address "${port_ip}" '{zoneId:$zone, name:"workload", recordType:"A", ttl:60, data:{a:{address:$address}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${workload_record_id}" && "${workload_record_id}" != "null" ]] || die "component matrix FlashDNS workload record creation failed"
  workload_fqdn="workload.${zone_name}"
  # Poll node01's resolver until the record is served.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${workload_fqdn} A | grep -Fx '${port_ip}'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${workload_fqdn}"
    fi
    sleep 2
  done
  log "Matrix case: PrismNet + FiberLB"
  listener_port="$(allocate_free_listener_port node01 18180 18999)" || die "failed to allocate a free FiberLB listener port for component matrix"
  lb_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{name:"matrix-lb", orgId:$org, projectId:$project, description:"component matrix"}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer | jq -r '.loadbalancer.id')"
  [[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "component matrix FiberLB creation failed"
  pool_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" '{name:"matrix-pool", loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
    127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool | jq -r '.pool.id')"
  [[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "component matrix FiberLB pool creation failed"
  backend_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg pool "${pool_id}" '{name:"matrix-backend", poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend | jq -r '.backend.id')"
  [[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "component matrix FiberLB backend creation failed"
  listener_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:"matrix-listener", loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
    127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener | jq -r '.listener.id')"
  [[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "component matrix FiberLB listener creation failed"
  # The listener should proxy through to the backend's /health.
  wait_for_http node01 "http://127.0.0.1:${listener_port}/health"
  log "Matrix case: PrismNet + FlashDNS + FiberLB"
  service_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"service", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${service_record_id}" && "${service_record_id}" != "null" ]] || die "component matrix FlashDNS service record creation failed"
  service_fqdn="service.${zone_name}"
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${service_fqdn}"
    fi
    sleep 2
  done
  # Full-stack check: curl the DNS name through the LB listener, pinning
  # resolution with --resolve so no system resolver is involved.
  ssh_node node01 "curl -fsS --max-time 5 --resolve ${service_fqdn}:${listener_port}:10.100.0.11 http://${service_fqdn}:${listener_port}/health >/dev/null"
  # Normal-path cleanup: clear the traps and tear down explicitly.
  trap - RETURN EXIT
  cleanup_network_provider_matrix
}
# Run every component-matrix validator in dependency order, aborting on
# the first failure (set -e), then log overall success.
validate_component_matrix() {
  local validator
  for validator in \
    validate_control_plane \
    validate_iam_flow \
    validate_network_provider_matrix \
    validate_vm_storage_flow \
    validate_k8shost_flow \
    validate_gateway \
    validate_nightlight_flow \
    validate_creditservice_flow \
    validate_deployer_flow \
    validate_native_runtime_flow; do
    "${validator}"
  done
  log "Component matrix validation succeeded"
}
# Benchmark a CoronaFS NBD-exported shared volume from node04 against that
# worker's local disk, plus a cross-worker read of the same export from
# node05. Profiles: 1M sequential read/write, 4k randread, and queued
# (iodepth-32, libaio) variants. Emits one tab-separated result line on
# stdout (13 columns, see the final printf) after logging a human summary.
# Fix vs. previous revision: local_rand_depth_json and
# coronafs_rand_depth_json were assigned but never declared local, leaking
# globals out of the function; they are now declared with the rest.
benchmark_coronafs_performance() {
  log "Benchmarking CoronaFS NBD-backed volume throughput against local worker disk"
  # fio result JSON blobs for every profile, all function-scoped.
  local local_write_json local_read_json local_rand_json local_rand_depth_json
  local coronafs_write_json coronafs_read_json coronafs_rand_json coronafs_rand_depth_json
  local local_depth_write_json local_depth_read_json
  local coronafs_depth_write_json coronafs_depth_read_json
  local cross_worker_read_json
  local coronafs_tunnel=""
  # Declaration split from assignment so the command substitution's exit
  # status is not masked by `local`.
  local bench_volume
  bench_volume="coronafs-bench-$(date +%s)"
  local coronafs_export_json coronafs_uri
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  # Best-effort teardown: drop the benchmark volume and the API tunnel.
  cleanup_coronafs_bench() {
    coronafs_delete_volume 15088 "${bench_volume}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${coronafs_tunnel}"
  }
  trap cleanup_coronafs_bench RETURN
  # 512 MiB volume, exported over NBD for the shared-volume profiles.
  coronafs_create_volume 15088 "${bench_volume}" $((512 * 1024 * 1024)) >/dev/null
  coronafs_export_json="$(coronafs_export_volume_json 15088 "${bench_volume}")"
  coronafs_uri="$(printf '%s' "${coronafs_export_json}" | jq -r '.export.uri')"
  [[ -n "${coronafs_uri}" && "${coronafs_uri}" != "null" ]] || die "CoronaFS benchmark volume did not return an export URI"
  # Local-disk baseline on node04.
  local_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqwrite.dat write 1M 256)"
  local_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqread.dat read 1M 256)"
  local_rand_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread.dat randread 4k 128 10)"
  local_rand_depth_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread-depth.dat randread 4k 512 15 32 libaio)"
  # Same profiles against the NBD-attached CoronaFS volume on node04.
  coronafs_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 256)"
  coronafs_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" read 1M 256)"
  coronafs_rand_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 128 10)"
  coronafs_rand_depth_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 512 15 /dev/nbd0 32)"
  # Queued (depth-32, libaio) sequential profiles, local and shared.
  local_depth_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthwrite.dat write 1M 1024 15 32 libaio)"
  local_depth_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthread.dat read 1M 1024 15 32 libaio)"
  coronafs_depth_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 1024 15 /dev/nbd0 32)"
  coronafs_depth_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" read 1M 1024 15 /dev/nbd0 32)"
  # Cross-worker: node05 reads the same export (written from node04) on a
  # different NBD device.
  cross_worker_read_json="$(run_remote_nbd_fio_json node05 "${coronafs_uri}" read 1M 256 0 /dev/nbd1 1)"
  local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
  local coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps
  local local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps
  local_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_write_json}" | jq -r '.bw_bytes')")"
  local_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_read_json}" | jq -r '.bw_bytes')")"
  local_rand_iops="$(printf '%s' "${local_rand_json}" | jq -r '.iops | floor')"
  local_rand_depth_iops="$(printf '%s' "${local_rand_depth_json}" | jq -r '.iops | floor')"
  coronafs_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_write_json}" | jq -r '.bw_bytes')")"
  coronafs_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_read_json}" | jq -r '.bw_bytes')")"
  coronafs_rand_iops="$(printf '%s' "${coronafs_rand_json}" | jq -r '.iops | floor')"
  coronafs_rand_depth_iops="$(printf '%s' "${coronafs_rand_depth_json}" | jq -r '.iops | floor')"
  local_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_write_json}" | jq -r '.bw_bytes')")"
  local_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_depth_write_json}" | jq -r '.bw_bytes')")"
  coronafs_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_cross_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${cross_worker_read_json}" | jq -r '.bw_bytes')")"
  log "CoronaFS local baseline: write=${local_write_mibps} MiB/s read=${local_read_mibps} MiB/s randread=${local_rand_iops} IOPS queued_randread=${local_rand_depth_iops} IOPS"
  log "CoronaFS shared block volume: write=${coronafs_write_mibps} MiB/s read=${coronafs_read_mibps} MiB/s randread=${coronafs_rand_iops} IOPS queued_randread=${coronafs_rand_depth_iops} IOPS"
  log "CoronaFS queued depth-32 profile: local_write=${local_depth_write_mibps} MiB/s local_read=${local_depth_read_mibps} MiB/s shared_write=${coronafs_depth_write_mibps} MiB/s shared_read=${coronafs_depth_read_mibps} MiB/s"
  log "CoronaFS cross-worker shared read: read=${coronafs_cross_read_mibps} MiB/s (node04 write -> node05 direct read over the same NBD export)"
  # Machine-readable result row (consumed by the bench-storage command).
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
    "${coronafs_write_mibps}" "${coronafs_read_mibps}" "${coronafs_rand_iops}" "${coronafs_rand_depth_iops}" \
    "${coronafs_cross_read_mibps}" \
    "${local_depth_write_mibps}" "${local_depth_read_mibps}" \
    "${coronafs_depth_write_mibps}" "${coronafs_depth_read_mibps}"
  # Normal-path cleanup: clear the trap and tear down explicitly.
  trap - RETURN
  cleanup_coronafs_bench
}
# Benchmark LightningStor's S3-compatible endpoint from a cluster node.
#
# Ships a self-contained shell+python driver to the client node via
# ssh_node_script and measures three profiles against the endpoint on node01
# (10.100.0.11:9000):
#   1. large object:  one 256 MiB upload + download, sha256-verified
#   2. small objects: 32 x 4 MiB, sequential PUTs then sequential GETs
#   3. parallel:      the same 32 x 4 MiB batch with 8 concurrent threads
#
# Globals:   LIGHTNINGSTOR_BENCH_CLIENT_NODE - client node (default node03)
# Outputs:   log lines, plus one tab-separated result record on stdout:
#            upload/download MiB/s, object size (MiB), small-batch count/size/
#            throughput, "put/get" obj/s pairs for the sequential and
#            parallel batches.
# Returns:   0 on success; dies if the remote benchmark fails.
benchmark_lightningstor_performance() {
  local client_node="${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}"
  log "Benchmarking LightningStor S3 throughput from ${client_node}"
  # Unique bucket name per run so repeated benchmark invocations never collide.
  local bucket="ls-bench-$(date +%s)"
  local object_key="bench-object.bin"
  local result_json
  # Remote driver positional args: bucket, key, size_mb=256, small_count=32,
  # small_size_mb=4, parallelism=8.  The heredoc delimiter is quoted, so every
  # expansion inside happens on the remote node, not here.
  # NOTE(review): payload files are created sparse/zero-filled via truncate(),
  # so throughput may flatter any backend that compresses or dedups zeros --
  # confirm LightningStor stores them verbatim.
  # NOTE(review): assumes python3 with boto3/botocore is present on the client
  # node image -- verify against the VM profile.
  if ! result_json="$(ssh_node_script "${client_node}" "${bucket}" "${object_key}" 256 32 4 8 <<'EOS'
set -euo pipefail
bucket="$1"
object_key="$2"
size_mb="$3"
small_count="$4"
small_size_mb="$5"
parallelism="$6"
endpoint="http://10.100.0.11:9000"
workdir="/var/tmp/photon-bench-s3"
src="${workdir}/upload.bin"
dst="${workdir}/download.bin"
mkdir -p "${workdir}"
python3 - "${bucket}" "${object_key}" "${size_mb}" "${small_count}" "${small_size_mb}" "${parallelism}" "${endpoint}" "${workdir}" "${src}" "${dst}" <<'PY'
import concurrent.futures
import hashlib
import json
import os
import pathlib
import threading
import time
import boto3
from botocore.config import Config
bucket, object_key, size_mb, small_count, small_size_mb, parallelism, endpoint, workdir, src, dst = os.sys.argv[1:11]
size_mb = int(size_mb)
small_count = int(small_count)
small_size_mb = int(small_size_mb)
parallelism = int(parallelism)
workdir_path = pathlib.Path(workdir)
src_path = pathlib.Path(src)
dst_path = pathlib.Path(dst)
small_size_bytes = small_size_mb * 1024 * 1024
large_size_bytes = size_mb * 1024 * 1024
thread_local = threading.local()
def ensure_sparse_file(path: pathlib.Path, size_bytes: int) -> None:
    if path.exists() and path.stat().st_size == size_bytes:
        return
    with path.open("wb") as handle:
        handle.truncate(size_bytes)
def sha256_file(path: pathlib.Path) -> str:
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            chunk = handle.read(8 * 1024 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def new_client():
    return boto3.session.Session().client(
        "s3",
        endpoint_url=endpoint,
        region_name="us-east-1",
        aws_access_key_id="photoncloud-test",
        aws_secret_access_key="photoncloud-test-secret",
        use_ssl=False,
        verify=False,
        config=Config(
            retries={"max_attempts": 8, "mode": "standard"},
            s3={"addressing_style": "path"},
            max_pool_connections=max(32, parallelism * 4),
            signature_version="s3v4",
        ),
    )
def client():
    existing = getattr(thread_local, "client", None)
    if existing is None:
        existing = new_client()
        thread_local.client = existing
    return existing
def put_file(key: str, path: pathlib.Path) -> None:
    with path.open("rb") as handle:
        client().put_object(Bucket=bucket, Key=key, Body=handle)
def get_file(key: str, path: pathlib.Path) -> None:
    response = client().get_object(Bucket=bucket, Key=key)
    with path.open("wb") as handle:
        body = response["Body"]
        for chunk in body.iter_chunks(chunk_size=8 * 1024 * 1024):
            if chunk:
                handle.write(chunk)
def delete_key(key: str) -> None:
    client().delete_object(Bucket=bucket, Key=key)
workdir_path.mkdir(parents=True, exist_ok=True)
ensure_sparse_file(src_path, large_size_bytes)
src_sha = sha256_file(src_path)
small_paths = []
for index in range(1, small_count + 1):
    path = workdir_path / f"payload-{index}.bin"
    ensure_sparse_file(path, small_size_bytes)
    small_paths.append(path)
control_client = new_client()
control_client.create_bucket(Bucket=bucket)
upload_start = time.monotonic_ns()
put_file(object_key, src_path)
upload_end = time.monotonic_ns()
if dst_path.exists():
    dst_path.unlink()
download_start = time.monotonic_ns()
get_file(object_key, dst_path)
download_end = time.monotonic_ns()
if sha256_file(dst_path) != src_sha:
    raise SystemExit("large-object checksum mismatch")
head = control_client.head_object(Bucket=bucket, Key=object_key)
if int(head["ContentLength"]) != large_size_bytes:
    raise SystemExit("large-object size mismatch")
delete_key(object_key)
small_total_bytes = small_count * small_size_bytes
small_upload_start = time.monotonic_ns()
for index, path in enumerate(small_paths, start=1):
    put_file(f"small-{index}.bin", path)
small_upload_end = time.monotonic_ns()
small_download_start = time.monotonic_ns()
for index in range(1, small_count + 1):
    small_dst = workdir_path / f"small-download-{index}.bin"
    get_file(f"small-{index}.bin", small_dst)
    if small_dst.stat().st_size != small_size_bytes:
        raise SystemExit(f"small-object size mismatch for {small_dst}")
small_download_end = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"small-{index}.bin" for index in range(1, small_count + 1)]))
parallel_upload_start = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(
        executor.map(
            lambda item: put_file(f"parallel-small-{item[0]}.bin", item[1]),
            list(enumerate(small_paths, start=1)),
        )
    )
parallel_upload_end = time.monotonic_ns()
parallel_download_start = time.monotonic_ns()
def download_parallel(index: int) -> None:
    path = workdir_path / f"parallel-download-{index}.bin"
    get_file(f"parallel-small-{index}.bin", path)
    if path.stat().st_size != small_size_bytes:
        raise SystemExit(f"parallel small-object size mismatch for {path}")
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(download_parallel, range(1, small_count + 1)))
parallel_download_end = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"parallel-small-{index}.bin" for index in range(1, small_count + 1)]))
control_client.delete_bucket(Bucket=bucket)
for pattern in ("payload-*.bin", "small-download-*.bin", "parallel-download-*.bin"):
    for path in workdir_path.glob(pattern):
        path.unlink(missing_ok=True)
src_path.unlink(missing_ok=True)
dst_path.unlink(missing_ok=True)
print(
    json.dumps(
        {
            "size_bytes": large_size_bytes,
            "upload_ns": upload_end - upload_start,
            "download_ns": download_end - download_start,
            "small_object_count": small_count,
            "small_total_bytes": small_total_bytes,
            "small_upload_ns": small_upload_end - small_upload_start,
            "small_download_ns": small_download_end - small_download_start,
            "parallel_small_upload_ns": parallel_upload_end - parallel_upload_start,
            "parallel_small_download_ns": parallel_download_end - parallel_download_start,
            "parallelism": parallelism,
        }
    )
)
PY
EOS
  )"; then
    die "LightningStor S3 benchmark failed"
  fi
  local size_bytes upload_mibps download_mibps
  local small_total_bytes small_object_count small_object_mib
  local small_upload_mibps small_download_mibps small_put_ops small_get_ops
  local parallel_small_upload_mibps parallel_small_download_mibps parallel_small_put_ops parallel_small_get_ops parallelism
  # Convert the remote driver's nanosecond timings into MiB/s and obj/s
  # host-side via the calc_* helpers.
  size_bytes="$(printf '%s' "${result_json}" | jq -r '.size_bytes')"
  [[ -n "${size_bytes}" && "${size_bytes}" != "null" && "${size_bytes}" != "0" ]] || die "LightningStor S3 benchmark returned no object size"
  upload_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.upload_ns')")"
  download_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.download_ns')")"
  small_total_bytes="$(printf '%s' "${result_json}" | jq -r '.small_total_bytes')"
  small_object_count="$(printf '%s' "${result_json}" | jq -r '.small_object_count')"
  small_object_mib="$(awk "BEGIN { printf \"%.0f\", ${small_total_bytes} / 1048576 }")"
  small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  parallel_small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallel_small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallelism="$(printf '%s' "${result_json}" | jq -r '.parallelism')"
  log "LightningStor S3 benchmark: upload=${upload_mibps} MiB/s download=${download_mibps} MiB/s object_size=$((size_bytes / 1048576)) MiB"
  log "LightningStor small-object batch: objects=${small_object_count} size_per_object=4 MiB upload=${small_upload_mibps} MiB/s download=${small_download_mibps} MiB/s put_rate=${small_put_ops} obj/s get_rate=${small_get_ops} obj/s"
  log "LightningStor parallel small-object batch: objects=${small_object_count} size_per_object=4 MiB parallelism=${parallelism} upload=${parallel_small_upload_mibps} MiB/s download=${parallel_small_download_mibps} MiB/s put_rate=${parallel_small_put_ops} obj/s get_rate=${parallel_small_get_ops} obj/s"
  # Tab-separated record consumed downstream by the benchmark report writer;
  # the ops pairs are packed as "put/get" in single fields.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${upload_mibps}" "${download_mibps}" "$((size_bytes / 1048576))" \
    "${small_object_count}" "${small_object_mib}" "${small_upload_mibps}" "${small_download_mibps}" \
    "${small_put_ops}/${small_get_ops}" \
    "${parallel_small_upload_mibps}" "${parallel_small_download_mibps}" \
    "${parallel_small_put_ops}/${parallel_small_get_ops}"
}
# Benchmark the PlasmaVMC image path: import a qcow2 guest image through
# ImageService/CreateImage, then time two image-backed CoronaFS managed-volume
# clones -- a "cold" first clone and a "warm" second clone of the same image.
#
# Globals:   PLASMAVMC_PROTO_DIR / PLASMAVMC_PROTO (grpcurl schema lookup).
#            Talks to node01 services via local SSH tunnels
#            (IAM 50080 -> 15080, LightningStor 50086 -> 15086,
#             PlasmaVMC 50082 -> 15082).
# Outputs:   log lines, plus one tab-separated record on stdout:
#            artifact_mib virtual_mib import_sec cold_clone_sec warm_clone_sec
# Returns:   0 on success; dies on failure, with best-effort teardown via a
#            RETURN trap that is explicitly cleared on the success path.
benchmark_plasmavmc_image_path() {
  log "Benchmarking PlasmaVMC image import plus CoronaFS-backed volume clone latency"
  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  local image_id="" cold_volume_id="" warm_volume_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  # Best-effort teardown: every step is guarded so the trap can fire at any
  # point after the tunnels exist without tripping set -e / set -u.
  cleanup_plasmavmc_image_bench() {
    if [[ -n "${cold_volume_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${cold_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${warm_volume_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${warm_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${vm_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  trap cleanup_plasmavmc_image_bench RETURN
  local org_id="plasmavmc-bench-org-$(date +%s)"
  local project_id="plasmavmc-bench-project"
  local principal_id="plasmavmc-bench-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC benchmark image import"
  local guest_image_local_path guest_image_sha artifact_size_bytes artifact_mib virtual_size_bytes virtual_mib
  guest_image_local_path="$(guest_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image for PlasmaVMC benchmark"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  artifact_size_bytes="$(stat -c %s "${guest_image_local_path}")"
  # qcow2 artifact size vs. virtual disk size -- both reported so the clone
  # latencies can be judged against the amount of data actually moved.
  virtual_size_bytes="$(qemu-img info --output json "${guest_image_local_path}" | jq -r '."virtual-size"')"
  artifact_mib="$(awk "BEGIN { printf \"%.0f\", ${artifact_size_bytes} / 1048576 }")"
  virtual_mib="$(awk "BEGIN { printf \"%.0f\", ${virtual_size_bytes} / 1048576 }")"
  local image_name="bench-image-$(date +%s)"
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC benchmark image checksum mismatch after distribution"
  local create_image_json create_image_response create_image_start_ns create_image_end_ns
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_start_ns="$(date +%s%N)"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/CreateImage)"
  create_image_end_ns="$(date +%s%N)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC benchmark image import did not return an image ID"
  # jq -e fails (and set -e aborts, triggering cleanup) unless the image is
  # already AVAILABLE when CreateImage returns.
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  local cold_request warm_request cold_response warm_response cold_start_ns cold_end_ns warm_start_ns warm_end_ns
  # Cold clone: first image-backed CreateVolume after the import.
  cold_request="$(jq -cn --arg name "bench-cold-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-cold"},
    labels:{}
  }')"
  cold_start_ns="$(date +%s%N)"
  cold_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${cold_request}" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/CreateVolume)"
  cold_end_ns="$(date +%s%N)"
  cold_volume_id="$(printf '%s' "${cold_response}" | jq -r '.id')"
  [[ -n "${cold_volume_id}" && "${cold_volume_id}" != "null" ]] || die "PlasmaVMC cold image-backed volume create did not return a volume ID"
  printf '%s' "${cold_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null
  # Delete the cold volume before the warm pass so both clones create a fresh
  # volume; clear the ID so the trap does not double-delete it.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${cold_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/DeleteVolume >/dev/null
  cold_volume_id=""
  # Warm clone: second image-backed CreateVolume over the same image.
  warm_request="$(jq -cn --arg name "bench-warm-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-warm"},
    labels:{}
  }')"
  warm_start_ns="$(date +%s%N)"
  warm_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${warm_request}" \
    127.0.0.1:15082 plasmavmc.v1.VolumeService/CreateVolume)"
  warm_end_ns="$(date +%s%N)"
  warm_volume_id="$(printf '%s' "${warm_response}" | jq -r '.id')"
  [[ -n "${warm_volume_id}" && "${warm_volume_id}" != "null" ]] || die "PlasmaVMC warm image-backed volume create did not return a volume ID"
  printf '%s' "${warm_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null
  local image_import_sec cold_clone_sec warm_clone_sec
  image_import_sec="$(calc_seconds_from_ns "$((create_image_end_ns - create_image_start_ns))")"
  cold_clone_sec="$(calc_seconds_from_ns "$((cold_end_ns - cold_start_ns))")"
  warm_clone_sec="$(calc_seconds_from_ns "$((warm_end_ns - warm_start_ns))")"
  log "PlasmaVMC image artifact benchmark: artifact=${artifact_mib} MiB virtual_size=${virtual_mib} MiB import=${image_import_sec}s cold_clone=${cold_clone_sec}s warm_clone=${warm_clone_sec}s"
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${artifact_mib}" "${virtual_mib}" "${image_import_sec}" "${cold_clone_sec}" "${warm_clone_sec}"
  # Disarm the RETURN trap before the explicit cleanup call (same pattern as
  # the CoronaFS benchmark): a RETURN trap set inside a function is global
  # trap state, and leaving it armed would re-run this cleanup -- with its
  # guarded locals out of scope under set -u -- when later functions return.
  trap - RETURN
  cleanup_plasmavmc_image_bench
}
# Benchmark the PlasmaVMC guest runtime path: import a benchmark guest image,
# create a KVM VM with a CoronaFS-backed root disk and blank data disk, start
# it on a worker (node04/node05), and time StartVm -> qemu volume attach and
# StartVm -> guest fio result, parsing the fio numbers off the VM console.
#
# Globals:   PLASMAVMC_PROTO_DIR / PLASMAVMC_PROTO, CORONAFS_API_PORT,
#            HTTP_WAIT_TIMEOUT (scheduling wait deadline).
# Outputs:   log lines, plus one tab-separated record on stdout:
#            attach_sec ready_sec seq_write_mibps seq_read_mibps randread_iops
# Returns:   0 on success; dies on failure, with best-effort teardown via a
#            RETURN trap that is explicitly cleared on the success path.
benchmark_plasmavmc_guest_runtime() {
  log "Benchmarking PlasmaVMC guest-side CoronaFS runtime throughput"
  local iam_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
  local image_id="" vm_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  # Best-effort teardown: force-stop then delete the VM, drop the image and
  # its staged source file, and close the tunnels.  Every step is guarded so
  # the trap can fire at any point after the tunnels exist.
  cleanup_plasmavmc_guest_runtime() {
    if [[ -n "${vm_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:true, timeoutSeconds:30}')" \
        127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null 2>&1 || true
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}' )" \
        127.0.0.1:15082 plasmavmc.v1.VmService/DeleteVm >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_source_path}" ]]; then
      ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${coronafs_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${vm_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  trap cleanup_plasmavmc_guest_runtime RETURN
  wait_for_plasmavmc_workers_registered 15082
  local org_id="plasmavmc-runtime-org-$(date +%s)"
  local project_id="plasmavmc-runtime-project"
  local principal_id="plasmavmc-runtime-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC runtime benchmark image import"
  local guest_image_local_path guest_image_sha image_name create_image_json create_image_response
  guest_image_local_path="$(guest_bench_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate VM benchmark guest image"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  image_name="bench-runtime-image-$(date +%s)"
  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  image_source_path="/var/lib/plasmavmc/imports/${image_name}.qcow2"
  scp_to_node node01 "${guest_image_local_path}" "${image_source_path}"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC runtime benchmark image checksum mismatch after distribution"
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench-runtime",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench-runtime", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:15082 plasmavmc.v1.ImageService/CreateImage)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC runtime benchmark image import did not return an image ID"
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
  local create_vm_json get_vm_json create_response node_id
  # 4 vCPU / 1.5 GiB KVM guest with an image-backed root disk and a blank
  # data disk, both virtio with cache=none.
  create_vm_json="$(
    jq -cn \
      --arg name "bench-runtime-vm-$(date +%s)" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        orgId:$org,
        projectId:$project,
        hypervisor:"HYPERVISOR_TYPE_KVM",
        spec:{
          cpu:{vcpus:4, coresPerSocket:1, sockets:1},
          memory:{sizeMib:1536},
          disks:[
            {
              id:"root",
              source:{imageId:$image_id},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE",
              bootIndex:1
            },
            {
              id:"data",
              source:{blank:true},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE"
            }
          ]
        }
      }'
  )"
  create_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_vm_json}" \
    127.0.0.1:15082 plasmavmc.v1.VmService/CreateVm)"
  vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
  [[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "PlasmaVMC runtime benchmark VM create did not return a VM ID"
  get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
  # Poll until the scheduler places the VM on one of the worker nodes.
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
      fi
      sleep 2
      continue
    fi
    node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
    if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
    fi
    sleep 2
  done
  local start_ns attach_ns ready_ns attach_sec ready_sec
  # Volume IDs follow the "<vm>-<disk id>" convention used by the CoronaFS
  # export lookup below.
  local root_volume_id="${vm_id}-root"
  local data_volume_id="${vm_id}-data"
  local root_uri data_uri
  start_ns="$(date +%s%N)"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
  root_uri="$(coronafs_export_volume_json 15088 "${root_volume_id}" | jq -r '.export.uri')"
  data_uri="$(coronafs_export_volume_json 15088 "${data_volume_id}" | jq -r '.export.uri')"
  [[ -n "${root_uri}" && "${root_uri}" != "null" ]] || die "runtime benchmark root volume export URI missing"
  [[ -n "${data_uri}" && "${data_uri}" != "null" ]] || die "runtime benchmark data volume export URI missing"
  # attach = both CoronaFS exports visible to qemu on the scheduled worker;
  # ready = the guest printed its fio result marker on the console.
  wait_for_qemu_volume_present "${node_id}" "${root_uri}"
  wait_for_qemu_volume_present "${node_id}" "${data_uri}"
  attach_ns="$(date +%s%N)"
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT"
  ready_ns="$(date +%s%N)"
  local result_line seq_write_mibps seq_read_mibps randread_iops
  result_line="$(read_vm_console_line_matching "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT")"
  seq_write_mibps="$(printf '%s\n' "${result_line}" | sed -n 's/.*seq_write_mibps=\([^ ]*\).*/\1/p')"
  seq_read_mibps="$(printf '%s\n' "${result_line}" | sed -n 's/.*seq_read_mibps=\([^ ]*\).*/\1/p')"
  randread_iops="$(printf '%s\n' "${result_line}" | sed -n 's/.*randread_iops=\([^ ]*\).*/\1/p')"
  [[ -n "${seq_write_mibps}" && -n "${seq_read_mibps}" && -n "${randread_iops}" ]] || die "failed to parse runtime benchmark result line: ${result_line}"
  attach_sec="$(calc_seconds_from_ns "$((attach_ns - start_ns))")"
  ready_sec="$(calc_seconds_from_ns "$((ready_ns - start_ns))")"
  log "PlasmaVMC guest runtime benchmark: attach=${attach_sec}s guest_ready=${ready_sec}s seq_write=${seq_write_mibps} MiB/s seq_read=${seq_read_mibps} MiB/s randread=${randread_iops} IOPS"
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${attach_sec}" "${ready_sec}" "${seq_write_mibps}" "${seq_read_mibps}" "${randread_iops}"
  # Disarm the RETURN trap before the explicit cleanup call (same pattern as
  # the CoronaFS benchmark): a RETURN trap set inside a function is global
  # trap state, and leaving it armed would re-run this cleanup -- with its
  # guarded locals out of scope under set -u -- when later functions return.
  trap - RETURN
  cleanup_plasmavmc_guest_runtime
}
write_storage_benchmark_report() {
local coronafs_network_mibps="$1"
local coronafs_network_retransmits="$2"
local lightningstor_network_mibps="$3"
local lightningstor_network_retransmits="$4"
local local_write_mibps="$5"
local local_read_mibps="$6"
local local_rand_iops="$7"
local local_rand_depth_iops="$8"
local coronafs_write_mibps="$9"
local coronafs_read_mibps="${10}"
local coronafs_rand_iops="${11}"
local coronafs_rand_depth_iops="${12}"
local coronafs_cross_read_mibps="${13}"
local local_depth_write_mibps="${14}"
local local_depth_read_mibps="${15}"
local coronafs_depth_write_mibps="${16}"
local coronafs_depth_read_mibps="${17}"
local lightningstor_upload_mibps="${18}"
local lightningstor_download_mibps="${19}"
local lightningstor_object_mib="${20}"
local lightningstor_small_object_count="${21}"
local lightningstor_small_object_mib="${22}"
local lightningstor_small_upload_mibps="${23}"
local lightningstor_small_download_mibps="${24}"
local lightningstor_small_ops="${25}"
local lightningstor_parallel_small_upload_mibps="${26}"
local lightningstor_parallel_small_download_mibps="${27}"
local lightningstor_parallel_small_ops="${28}"
local plasmavmc_image_artifact_mib="${29}"
local plasmavmc_image_virtual_mib="${30}"
local plasmavmc_image_import_sec="${31}"
local plasmavmc_cold_clone_sec="${32}"
local plasmavmc_warm_clone_sec="${33}"
local plasmavmc_runtime_attach_sec="${34}"
local plasmavmc_runtime_ready_sec="${35}"
local plasmavmc_runtime_seq_write_mibps="${36}"
local plasmavmc_runtime_seq_read_mibps="${37}"
local plasmavmc_runtime_randread_iops="${38}"
local coronafs_read_ratio coronafs_rand_ratio coronafs_rand_depth_ratio coronafs_cross_read_ratio coronafs_vs_network_ratio coronafs_depth_read_ratio lightningstor_vs_network_ratio
local lightningstor_small_put_ops lightningstor_small_get_ops
local lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops
IFS=/ read -r lightningstor_small_put_ops lightningstor_small_get_ops <<<"${lightningstor_small_ops}"
IFS=/ read -r lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops <<<"${lightningstor_parallel_small_ops}"
coronafs_read_ratio="$(awk "BEGIN { if (${local_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_read_mibps} / ${local_read_mibps}) * 100 }")"
coronafs_rand_ratio="$(awk "BEGIN { if (${local_rand_iops} == 0) print 0; else printf \"%.1f\", (${coronafs_rand_iops} / ${local_rand_iops}) * 100 }")"
coronafs_rand_depth_ratio="$(awk "BEGIN { if (${local_rand_depth_iops} == 0) print 0; else printf \"%.1f\", (${coronafs_rand_depth_iops} / ${local_rand_depth_iops}) * 100 }")"
coronafs_cross_read_ratio="$(awk "BEGIN { if (${local_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_cross_read_mibps} / ${local_read_mibps}) * 100 }")"
coronafs_vs_network_ratio="$(awk "BEGIN { if (${coronafs_network_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_read_mibps} / ${coronafs_network_mibps}) * 100 }")"
coronafs_depth_read_ratio="$(awk "BEGIN { if (${local_depth_read_mibps} == 0) print 0; else printf \"%.1f\", (${coronafs_depth_read_mibps} / ${local_depth_read_mibps}) * 100 }")"
lightningstor_vs_network_ratio="$(awk "BEGIN { if (${lightningstor_network_mibps} == 0) print 0; else printf \"%.1f\", (${lightningstor_download_mibps} / ${lightningstor_network_mibps}) * 100 }")"
cat > "${REPO_ROOT}/docs/storage-benchmarks.md" <<EOF
# Storage Benchmarks
Generated on $(date -Iseconds) with:
\`\`\`bash
nix run ./nix/test-cluster#cluster -- ${STORAGE_BENCHMARK_COMMAND}
\`\`\`
## CoronaFS
Cluster network baseline, measured with \`iperf3\` from \`node04\` to \`node01\` before the storage tests:
| Metric | Result |
|---|---:|
| TCP throughput | ${coronafs_network_mibps} MiB/s |
| TCP retransmits | ${coronafs_network_retransmits} |
Measured from \`node04\`.
Local worker disk is the baseline. CoronaFS is the shared block volume path used for mutable VM disks, exported from \`node01\` over NBD.
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Sequential write | ${local_write_mibps} MiB/s | ${coronafs_write_mibps} MiB/s |
| Sequential read | ${local_read_mibps} MiB/s | ${coronafs_read_mibps} MiB/s |
| 4k random read | ${local_rand_iops} IOPS | ${coronafs_rand_iops} IOPS |
| 4k queued random read (\`iodepth=32\`) | ${local_rand_depth_iops} IOPS | ${coronafs_rand_depth_iops} IOPS |
Queue-depth profile (\`libaio\`, \`iodepth=32\`) from the same worker:
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Depth-32 write | ${local_depth_write_mibps} MiB/s | ${coronafs_depth_write_mibps} MiB/s |
| Depth-32 read | ${local_depth_read_mibps} MiB/s | ${coronafs_depth_read_mibps} MiB/s |
Cross-worker shared-volume visibility, measured by writing on \`node04\` and reading from \`node05\` with direct I/O over the same CoronaFS NBD export:
| Metric | Result |
|---|---:|
| Cross-worker sequential read | ${coronafs_cross_read_mibps} MiB/s |
## LightningStor
Measured from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` against the S3-compatible endpoint on \`node01\`.
The object path exercised the distributed backend with replication across the worker storage nodes.
Cluster network baseline for this client, measured with \`iperf3\` from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` to \`node01\` before the storage tests:
| Metric | Result |
|---|---:|
| TCP throughput | ${lightningstor_network_mibps} MiB/s |
| TCP retransmits | ${lightningstor_network_retransmits} |
### Large-object path
| Metric | Result |
|---|---:|
| Object size | ${lightningstor_object_mib} MiB |
| Upload throughput | ${lightningstor_upload_mibps} MiB/s |
| Download throughput | ${lightningstor_download_mibps} MiB/s |
### Small-object batch
Measured as ${lightningstor_small_object_count} objects of 4 MiB each (${lightningstor_small_object_mib} MiB total).
| Metric | Result |
|---|---:|
| Batch upload throughput | ${lightningstor_small_upload_mibps} MiB/s |
| Batch download throughput | ${lightningstor_small_download_mibps} MiB/s |
| PUT rate | ${lightningstor_small_put_ops} objects/s |
| GET rate | ${lightningstor_small_get_ops} objects/s |
### Parallel small-object batch
Measured as the same ${lightningstor_small_object_count} objects of 4 MiB each, but with 8 concurrent client jobs from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\`.
| Metric | Result |
|---|---:|
| Parallel batch upload throughput | ${lightningstor_parallel_small_upload_mibps} MiB/s |
| Parallel batch download throughput | ${lightningstor_parallel_small_download_mibps} MiB/s |
| Parallel PUT rate | ${lightningstor_parallel_small_put_ops} objects/s |
| Parallel GET rate | ${lightningstor_parallel_small_get_ops} objects/s |
## VM Image Path
Measured against the \`PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume\` clone path on \`node01\`.
| Metric | Result |
|---|---:|
| Guest image artifact size | ${plasmavmc_image_artifact_mib} MiB |
| Guest image virtual size | ${plasmavmc_image_virtual_mib} MiB |
| \`CreateImage\` latency | ${plasmavmc_image_import_sec} s |
| First image-backed \`CreateVolume\` latency | ${plasmavmc_cold_clone_sec} s |
| Second image-backed \`CreateVolume\` latency | ${plasmavmc_warm_clone_sec} s |
## VM Runtime Path
Measured against the real \`StartVm -> qemu attach -> guest boot -> guest fio\` path on a worker node, using a CoronaFS-backed root disk and data disk.
| Metric | Result |
|---|---:|
| \`StartVm\` to qemu attach | ${plasmavmc_runtime_attach_sec} s |
| \`StartVm\` to guest benchmark result | ${plasmavmc_runtime_ready_sec} s |
| Guest sequential write | ${plasmavmc_runtime_seq_write_mibps} MiB/s |
| Guest sequential read | ${plasmavmc_runtime_seq_read_mibps} MiB/s |
| Guest 4k random read | ${plasmavmc_runtime_randread_iops} IOPS |
## Assessment
- CoronaFS shared-volume reads are currently ${coronafs_read_ratio}% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS 4k random reads are currently ${coronafs_rand_ratio}% of the measured local-disk baseline.
- CoronaFS queued 4k random reads are currently ${coronafs_rand_depth_ratio}% of the measured local queued-random-read baseline.
- CoronaFS cross-worker reads are currently ${coronafs_cross_read_ratio}% of the measured local-disk sequential-read baseline, which is the more relevant signal for VM restart and migration paths.
- CoronaFS sequential reads are currently ${coronafs_vs_network_ratio}% of the measured node04->node01 TCP baseline, which helps separate NBD/export overhead from raw cluster-network limits.
- CoronaFS depth-32 reads are currently ${coronafs_depth_read_ratio}% of the local depth-32 baseline, which is a better proxy for queued guest I/O than the single-depth path.
- The shared-volume path is functionally correct for mutable VM disks and migration tests, but its read-side throughput is still too low to call production-ready for heavier VM workloads.
- LightningStor's replicated S3 path is working correctly, but ${lightningstor_upload_mibps} MiB/s upload and ${lightningstor_download_mibps} MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently ${lightningstor_vs_network_ratio}% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- LightningStor's small-object batch path is also functional, but ${lightningstor_small_put_ops} PUT/s and ${lightningstor_small_get_ops} GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches ${lightningstor_parallel_small_put_ops} PUT/s and ${lightningstor_parallel_small_get_ops} GET/s.
- The VM image section measures clone/materialization cost, not guest runtime I/O.
- The VM runtime section is the real \`PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel\` path; use it to judge whether QEMU/NBD tuning is helping.
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.
EOF
}
# Run the end-to-end storage benchmark suite and write the markdown report.
#
# Order of measurement:
#   1. Raw TCP baselines (iperf) toward node01 for the CoronaFS client path
#      (node04) and the LightningStor client path (default node03).
#   2. CoronaFS fio results (local vs shared volume, queued depth, cross-worker).
#   3. LightningStor object-path results (large object, small-object batch,
#      parallel small-object batch).
#   4. Optional PlasmaVMC image-clone and guest-runtime paths, each skippable
#      via its STORAGE_SKIP_PLASMAVMC_*_BENCH flag; a zeroed tab-separated
#      record is substituted so the downstream field count stays stable.
#
# Each benchmark helper prints a single tab-separated record on stdout; the
# records are unpacked into named locals and forwarded positionally
# (38 arguments) to write_storage_benchmark_report, so the field order in
# this function must stay in sync with that function's parameter list.
benchmark_storage() {
local coronafs_network_results lightningstor_network_results coronafs_results lightningstor_results plasmavmc_results plasmavmc_runtime_results
local coronafs_network_mibps coronafs_network_retransmits
local lightningstor_network_mibps lightningstor_network_retransmits
local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
local coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps
local local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps
local lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib
local lightningstor_small_object_count lightningstor_small_object_mib
local lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops
local lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops
local plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib
local plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec
local plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec
local plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops
# Raw network baselines first, so storage numbers can be compared against
# what the cluster network itself can sustain.
coronafs_network_results="$(run_remote_iperf_json node04 node01 10.100.0.11)"
lightningstor_network_results="$(run_remote_iperf_json "${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}" node01 10.100.0.11)"
coronafs_results="$(benchmark_coronafs_performance)"
lightningstor_results="$(benchmark_lightningstor_performance)"
# NOTE(review): the skip flags are expanded unguarded, so under `set -u`
# they must be assigned earlier in the script — confirm if refactoring.
if [[ "${STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH}" == "1" ]]; then
plasmavmc_results=$'0\t0\t0\t0\t0'
else
plasmavmc_results="$(benchmark_plasmavmc_image_path)"
fi
if [[ "${STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH}" == "1" ]]; then
plasmavmc_runtime_results=$'0\t0\t0\t0\t0'
else
plasmavmc_runtime_results="$(benchmark_plasmavmc_guest_runtime)"
fi
# Convert iperf bits/sec into MiB/s and pull retransmit counts for the report.
coronafs_network_mibps="$(bps_to_mibps "$(printf '%s' "${coronafs_network_results}" | jq -r '.bits_per_second')")"
coronafs_network_retransmits="$(printf '%s' "${coronafs_network_results}" | jq -r '.retransmits')"
lightningstor_network_mibps="$(bps_to_mibps "$(printf '%s' "${lightningstor_network_results}" | jq -r '.bits_per_second')")"
lightningstor_network_retransmits="$(printf '%s' "${lightningstor_network_results}" | jq -r '.retransmits')"
# Unpack each helper's tab-separated record into named fields, in the exact
# order the helper emits them.
IFS=$'\t' read -r \
local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops \
coronafs_write_mibps coronafs_read_mibps coronafs_rand_iops coronafs_rand_depth_iops coronafs_cross_read_mibps \
local_depth_write_mibps local_depth_read_mibps coronafs_depth_write_mibps coronafs_depth_read_mibps <<<"${coronafs_results}"
IFS=$'\t' read -r \
lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib \
lightningstor_small_object_count lightningstor_small_object_mib lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops \
lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops <<<"${lightningstor_results}"
IFS=$'\t' read -r \
plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec <<<"${plasmavmc_results}"
IFS=$'\t' read -r \
plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops <<<"${plasmavmc_runtime_results}"
# Positional hand-off: argument order here mirrors the parameter order of
# write_storage_benchmark_report and must change in lockstep with it.
write_storage_benchmark_report \
"${coronafs_network_mibps}" "${coronafs_network_retransmits}" \
"${lightningstor_network_mibps}" "${lightningstor_network_retransmits}" \
"${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
"${coronafs_write_mibps}" "${coronafs_read_mibps}" "${coronafs_rand_iops}" "${coronafs_rand_depth_iops}" "${coronafs_cross_read_mibps}" \
"${local_depth_write_mibps}" "${local_depth_read_mibps}" "${coronafs_depth_write_mibps}" "${coronafs_depth_read_mibps}" \
"${lightningstor_upload_mibps}" "${lightningstor_download_mibps}" "${lightningstor_object_mib}" \
"${lightningstor_small_object_count}" "${lightningstor_small_object_mib}" "${lightningstor_small_upload_mibps}" "${lightningstor_small_download_mibps}" "${lightningstor_small_ops}" \
"${lightningstor_parallel_small_upload_mibps}" "${lightningstor_parallel_small_download_mibps}" "${lightningstor_parallel_small_ops}" \
"${plasmavmc_image_artifact_mib}" "${plasmavmc_image_virtual_mib}" "${plasmavmc_image_import_sec}" "${plasmavmc_cold_clone_sec}" "${plasmavmc_warm_clone_sec}" \
"${plasmavmc_runtime_attach_sec}" "${plasmavmc_runtime_ready_sec}" "${plasmavmc_runtime_seq_write_mibps}" "${plasmavmc_runtime_seq_read_mibps}" "${plasmavmc_runtime_randread_iops}"
log "Storage benchmark report written to ${REPO_ROOT}/docs/storage-benchmarks.md"
}
# Control-plane fault injection: stop node02 and verify quorum-backed services
# keep working on the surviving control nodes, then restart node02 and wait
# for it to rejoin.
#
# While node02 is down, the function checks:
#   - chainfire KV over HTTP: a write must be accepted by some survivor and
#     become readable on both node01 and node03 (remote heredoc script),
#   - flaredb strong consistency: gRPC CompareAndSwap then Get against the
#     surviving writer/leader (remote heredoc script),
#   - IAM: issue a project-admin token through either tunnel and validate it.
#
# A RETURN trap restarts node02 and tears down both SSH tunnels on any early
# failure; it is cleared (trap - RETURN) once the normal recovery path has
# restarted node02 and re-verified its units.
validate_control_plane_fault_injection() {
log "Injecting control-plane failure: stopping node02 and validating quorum behavior"
local iam_tunnel="" iam_tunnel_alt=""
# Two IAM tunnels (node01 and node03) so token issuance can fall back if one
# surviving node is not serving.
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
iam_tunnel_alt="$(start_ssh_tunnel node03 15083 50080)"
local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
# Cleanup/recovery on any exit from this function, including failures.
trap 'start_vm node02 >/dev/null 2>&1 || true; wait_for_ssh node02 || true; stop_ssh_tunnel node03 "${iam_tunnel_alt}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
stop_vm node02
wait_for_ssh_down node02 90
# Phase 1: chainfire must accept a quorum write on a survivor and converge.
ssh_node_script node01 <<'EOS'
set -euo pipefail
key="fault-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.13)
writer=""
deadline=$((SECONDS + 60))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
code="$(curl -sS -o /tmp/chainfire-fault.out -w '%{http_code}' \
-X PUT "http://${ip}:8081/api/v1/kv/${key}" \
-H 'Content-Type: application/json' \
-d "{\"value\":\"${value}\"}" || true)"
if [[ "${code}" == "200" ]]; then
writer="${ip}"
break
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
for ip in "${nodes[@]}"; do
deadline=$((SECONDS + 60))
while true; do
actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
if [[ "${actual}" == "${value}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum write did not converge on ${ip}" >&2
exit 1
fi
sleep 1
done
done
EOS
# Phase 2: flaredb strong CAS/Get on the surviving leader via grpcurl.
ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="fault-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"fault"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"fault"}')"
writer=""
deadline=$((SECONDS + 90))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${request}" \
"${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-fault-cas.out 2>/dev/null; then
if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-fault-cas.out >/dev/null; then
writer="${ip}"
break
fi
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "flaredb quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
deadline=$((SECONDS + 90))
while true; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${get_request}" \
"${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-fault-get.out 2>/dev/null; then
if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-fault-get.out >/dev/null; then
break
fi
fi
if (( SECONDS >= deadline )); then
echo "flaredb strong quorum write did not remain readable on leader ${writer}" >&2
exit 1
fi
sleep 1
done
EOS
# Phase 3: IAM must still issue and validate tokens with one node down.
local org_id="fault-iam-org"
local project_id="fault-iam-project"
local principal_id="fault-iam-$(date +%s)"
local token iam_fault_port
# The helper reports which tunnel port succeeded plus the issued token.
read -r iam_fault_port token < <(issue_project_admin_token_any "${org_id}" "${project_id}" "${principal_id}" 15080 15083)
grpcurl -plaintext \
-import-path "${IAM_PROTO_DIR}" \
-proto "${IAM_PROTO}" \
-d "$(jq -cn --arg token "${token}" '{token:$token}')" \
127.0.0.1:"${iam_fault_port}" iam.v1.IamToken/ValidateToken \
| jq -e '.valid == true' >/dev/null
# Recovery: bring node02 back and wait for its services to rejoin.
start_vm node02
wait_for_ssh node02
wait_for_unit node02 chainfire
wait_for_unit node02 flaredb
wait_for_unit node02 iam
wait_for_flaredb_region node02
wait_for_flaredb_route_metadata node01
# Normal path reached: drop the recovery trap and close tunnels explicitly.
trap - RETURN
stop_ssh_tunnel node03 "${iam_tunnel_alt}"
stop_ssh_tunnel node01 "${iam_tunnel}"
}
# Worker fault injection: stop node04 and verify the data plane keeps serving
# from the surviving worker (node05), then restart node04 and wait for it to
# re-register.
#
# While node04 is down, the function checks:
#   - node05 health endpoint and LightningStor port remain reachable,
#   - PlasmaVMC still reports node05 in NODE_STATE_READY,
#   - a full LightningStor object round-trip (put/head/get/delete) works
#     under a freshly issued project-admin token.
#
# RETURN traps restart node04, remove temp files, and tear down all three
# SSH tunnels on any early failure; they are cleared once the normal
# recovery path has completed.
validate_worker_fault_injection() {
  log "Injecting worker failure: stopping node04 and validating degraded worker operation"
  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  trap 'start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  stop_vm node04
  wait_for_ssh_down node04 90
  wait_for_http node05 http://127.0.0.1:8084/health
  wait_for_tcp_port node05 50086
  # The scheduler must still consider the surviving worker READY.
  grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:15082 plasmavmc.v1.NodeService/ListNodes \
    | jq -e '([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null' >/dev/null
  local org_id="worker-fault-org"
  local project_id="worker-fault-project"
  local principal_id="worker-fault-$(date +%s)"
  local token bucket key tmpfile
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  bucket="worker-fault-$(date +%s)"
  key="survive-${RANDOM}.txt"
  ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"
  tmpfile="$(mktemp)"
  # Re-arm the trap so it also cleans up BOTH temp files. The previous
  # version removed only "${tmpfile}", leaking "${tmpfile}.downloaded" when
  # a failure occurred after the download but before the explicit rm below.
  trap 'rm -f "${tmpfile}" "${tmpfile}.downloaded"; start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  printf 'worker-fault-check-%s\n' "${RANDOM}" >"${tmpfile}"
  # Put/head/get/delete round-trip through the LightningStor gRPC tunnel.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn \
      --arg bucket "${bucket}" \
      --arg key "${key}" \
      --arg body "$(base64 -w0 "${tmpfile}")" \
      '{bucket:$bucket, key:$key, body:$body, metadata:{contentType:"text/plain"}}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null
  download_lightningstor_object_to_file 15086 "${token}" "${bucket}" "${key}" "${tmpfile}.downloaded"
  # Byte-for-byte verification of the uploaded object.
  cmp -s "${tmpfile}" "${tmpfile}.downloaded"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject >/dev/null
  rm -f "${tmpfile}" "${tmpfile}.downloaded"
  # Recovery: bring node04 back and wait until it is serving and registered.
  start_vm node04
  wait_for_ssh node04
  wait_for_unit node04 plasmavmc
  wait_for_unit node04 lightningstor
  wait_for_http node04 http://127.0.0.1:8084/health
  wait_for_tcp_port node04 50086
  wait_for_plasmavmc_workers_registered 15082
  # Normal path reached: drop the recovery trap and close tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${vm_tunnel}"
  stop_ssh_tunnel node01 "${ls_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
# Run both fault-injection scenarios in order: control-plane node loss
# (node02) first, then worker node loss (node04).
validate_fault_injection() {
validate_control_plane_fault_injection
validate_worker_fault_injection
}
# Full-cluster smoke validation. Runs every flow validation in sequence;
# under `set -e` the first failing check aborts the run. Ordering matters:
# preflight/wait and base units come before the dependent service flows,
# and fault injection runs near the end so the earlier checks observe a
# healthy, undisturbed cluster.
validate_cluster() {
preflight
wait_requested
validate_units
validate_control_plane
validate_iam_flow
validate_prismnet_flow
validate_flashdns_flow
validate_fiberlb_flow
validate_workers
validate_lightningstor_distributed_storage
validate_vm_storage_flow
validate_k8shost_flow
validate_gateway
validate_nightlight_flow
validate_creditservice_flow
validate_deployer_flow
validate_fault_injection
validate_nested_kvm_workers
validate_native_runtime_flow
log "Cluster validation succeeded"
}
# Storage-lab variant of the full validation: waits only for the storage
# nodes (STORAGE_NODES) and runs only the storage-relevant checks
# (workers, LightningStor, VM storage, nested KVM).
validate_storage_cluster() {
preflight
wait_requested "${STORAGE_NODES[@]}"
validate_storage_units
validate_storage_control_plane
validate_workers
validate_lightningstor_distributed_storage
validate_vm_storage_flow
validate_nested_kvm_workers
log "Storage cluster validation succeeded"
}
# `smoke` command: start (build if needed, boot, wait for SSH) the requested
# nodes, then run the full cluster validation.
smoke_requested() {
start_requested "$@"
validate_cluster
}
# `fresh-smoke` command: clean local runtime state first, then run the
# normal smoke flow for the same nodes.
fresh_smoke_requested() {
clean_requested "$@"
smoke_requested "$@"
}
# `storage-smoke` command: start only the storage-lab nodes under the
# "storage" build profile and run the storage-focused validation.
storage_smoke_requested() {
BUILD_PROFILE="storage"
start_requested "${STORAGE_NODES[@]}"
validate_storage_cluster
}
# `fresh-storage-smoke` command: clean the storage-lab nodes' runtime state
# under the "storage" profile, then run the storage smoke flow.
fresh_storage_smoke_requested() {
BUILD_PROFILE="storage"
clean_requested "${STORAGE_NODES[@]}"
storage_smoke_requested
}
# `matrix` command: start the requested nodes, then validate composed
# service configurations against the running VMs.
matrix_requested() {
start_requested "$@"
validate_component_matrix
}
# `fresh-matrix` command: clean local runtime state first, then run the
# normal matrix flow for the same nodes.
fresh_matrix_requested() {
clean_requested "$@"
matrix_requested "$@"
}
# `bench-storage` command: start the requested nodes, verify units, then run
# the storage benchmark.
bench_storage_requested() {
# Defensive re-default. STORAGE_BENCHMARK_COMMAND already defaults to
# "bench-storage" at the top of the script, so this only has an effect if
# the variable was explicitly set to an empty string (e.g. via
# PHOTON_VM_STORAGE_BENCH_COMMAND="").
STORAGE_BENCHMARK_COMMAND="${STORAGE_BENCHMARK_COMMAND:-bench-storage}"
start_requested "$@"
validate_units
benchmark_storage
}
# `fresh-bench-storage` command: record the invoking command name (presumably
# surfaced by the benchmark report — confirm against the report writer),
# clean local runtime state, then run the bench-storage flow.
fresh_bench_storage_requested() {
STORAGE_BENCHMARK_COMMAND="fresh-bench-storage"
clean_requested "$@"
bench_storage_requested "$@"
}
# `storage-bench` command: start the storage lab under the "storage" build
# profile, validate its units and control plane, then benchmark. Pins node03
# as the LightningStor benchmark client for this profile.
storage_bench_requested() {
LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
BUILD_PROFILE="storage"
start_requested "${STORAGE_NODES[@]}"
validate_storage_units
validate_storage_control_plane
benchmark_storage
}
# `fresh-storage-bench` command: record the invoking command name, clean the
# storage-lab nodes under the "storage" profile, then run storage-bench.
fresh_storage_bench_requested() {
STORAGE_BENCHMARK_COMMAND="fresh-storage-bench"
LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
BUILD_PROFILE="storage"
clean_requested "${STORAGE_NODES[@]}"
storage_bench_requested
}
status_requested() {
  # Print a RUNNING/STOPPED status line for each requested node, or for
  # every node when no arguments are given.
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"
  printf 'PhotonCloud test cluster status\n'
  printf '===============================\n'
  local target pidfile_path
  for target in "${targets[@]}"; do
    pidfile_path="$(pid_file "${target}")"
    if ! is_running "${target}"; then
      printf '%s: STOPPED\n' "${target}"
      continue
    fi
    # Running: report the VM pid, its SSH forward port, and runtime dir.
    printf '%s: RUNNING (pid=%s, ssh=%s, runtime=%s)\n' \
      "${target}" "$(<"${pidfile_path}")" "$(ssh_port_for_node "${target}")" "$(runtime_dir "${target}")"
  done
}
stop_requested() {
  # Stop VMs under the cluster lock. With no node arguments, stop every
  # node across all build profiles; otherwise stop only the named nodes
  # in the current profile.
  acquire_cluster_lock
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"
  if (( $# == 0 )); then
    stop_nodes_all_profiles "${targets[@]}"
  else
    stop_nodes_current_profile "${targets[@]}"
  fi
}
clean_requested() {
  # Stop VMs and delete their on-disk runtime state. With no node arguments
  # the whole state tree (all profiles) is removed; with arguments only the
  # named nodes' runtime directories and build links are cleared.
  acquire_cluster_lock
  stop_requested "$@"
  if (( $# == 0 )); then
    remove_runtime_state_all_profiles
    return
  fi
  local target
  for target in "$@"; do
    log "Removing runtime state for ${target}"
    # Empty the runtime dir, then drop it; tolerate missing dirs and races.
    find "$(runtime_dir "${target}")" -mindepth 1 -delete 2>/dev/null || true
    rmdir "$(runtime_dir "${target}")" 2>/dev/null || true
    rm -f "$(build_link "${target}")"
  done
}
ssh_requested() {
  # Open an interactive root SSH session to a node (default: node01),
  # replacing this process with the ssh client.
  local target="${1:-node01}"
  validate_nodes_exist "${target}"
  local forward_port
  forward_port="$(ssh_port_for_node "${target}")"
  # Password auth via sshpass against the VM's forwarded SSH port.
  exec sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${forward_port}" root@127.0.0.1
}
logs_requested() {
  # Show the tail of a node's VM log.
  # $1 - node name (default: node01); $2 - number of lines (default: 120).
  local target="${1:-node01}" line_count="${2:-120}"
  validate_nodes_exist "${target}"
  tail -n "${line_count}" "$(log_file "${target}")"
}
# Print CLI help text to stdout. The unquoted USAGE delimiter means $0 is
# expanded at print time, so the examples show the actual invocation path.
usage() {
cat <<USAGE
PhotonCloud VM test cluster
Usage: $0 <command> [nodes...]
Commands:
build Build one or more VM derivations
start Build if needed, start VMs, and wait for SSH
wait Wait for SSH on running VMs
validate Run the cluster smoke validation
smoke start + validate
fresh-smoke clean local runtime state, rebuild on the host, start, and validate
storage-smoke start the storage lab (node01-05) and validate CoronaFS/LightningStor/PlasmaVMC
fresh-storage-smoke clean local runtime state, rebuild node01-05 on the host, start, and validate the storage lab
matrix Start the cluster and validate composed service configurations against the current running VMs
fresh-matrix clean local runtime state, rebuild on the host, start, and validate composed service configurations
bench-storage start the cluster and benchmark CoronaFS plus LightningStor against the current running VMs
fresh-bench-storage clean local runtime state, rebuild on the host, start, and benchmark CoronaFS plus LightningStor
storage-bench start the storage lab (node01-05) and benchmark CoronaFS plus LightningStor
fresh-storage-bench clean local runtime state, rebuild node01-05 on the host, start, and benchmark the storage lab
stop Stop one or more VMs
status Show VM process status
ssh SSH to a node (default: node01)
logs Show VM log for a node (default: node01)
clean Stop VMs and remove local runtime state
help Show this help
Examples:
$0 smoke
$0 fresh-smoke
$0 storage-smoke
$0 fresh-storage-smoke
$0 matrix
$0 fresh-matrix
$0 bench-storage
$0 fresh-bench-storage
$0 storage-bench
$0 fresh-storage-bench
$0 start node01 node02 node03
$0 validate
$0 ssh node04
USAGE
}
# CLI dispatcher: the first argument selects the command (default: help);
# remaining arguments are forwarded to the handler (typically node names).
main() {
local cmd="${1:-help}"
# `shift` fails when invoked with zero arguments; tolerate that case.
shift || true
case "${cmd}" in
build) build_requested "$@" ;;
start) start_requested "$@" ;;
wait) wait_requested "$@" ;;
validate) validate_cluster ;;
smoke) smoke_requested "$@" ;;
fresh-smoke) fresh_smoke_requested "$@" ;;
storage-smoke) storage_smoke_requested ;;
fresh-storage-smoke) fresh_storage_smoke_requested ;;
matrix) matrix_requested "$@" ;;
fresh-matrix) fresh_matrix_requested "$@" ;;
bench-storage) bench_storage_requested "$@" ;;
fresh-bench-storage) fresh_bench_storage_requested "$@" ;;
storage-bench) storage_bench_requested ;;
fresh-storage-bench) fresh_storage_bench_requested ;;
stop) stop_requested "$@" ;;
status) status_requested "$@" ;;
ssh) ssh_requested "$@" ;;
logs) logs_requested "$@" ;;
clean) clean_requested "$@" ;;
help|--help|-h) usage ;;
*) die "unknown command: ${cmd}" ;;
esac
}
main "$@"