# (viewer metadata removed from extraction: "7476 lines, 295 KiB, Bash, Executable file")
#!/usr/bin/env bash
#
# PhotonCloud VM test-cluster harness
#
# Commands:
#   build                Build one or more VM derivations
#   start                Build if needed, start VMs, and wait for SSH
#   wait                 Wait for SSH on running VMs
#   validate             Run multi-node smoke validation, including nested KVM on workers
#   smoke                start + validate
#   fresh-smoke          clean + host-build + start + validate
#   fresh-matrix         clean + host-build + start + composed-configuration validation
#   fresh-bench-storage  clean + host-build + start + storage benchmark
#   stop                 Stop running VMs
#   status               Show VM process status
#   ssh                  Open an interactive SSH session to a node
#   logs                 Show the VM log for a node
#   clean                Stop VMs and remove local runtime state
#
# Examples:
#   ./run-cluster.sh smoke
#   ./run-cluster.sh start node01 node02 node03
#   ./run-cluster.sh validate
set -euo pipefail

# --- Path and flake locations ---------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
CLUSTER_DIR="${SCRIPT_DIR}"
CLUSTER_FLAKE_REF="${PHOTON_CLUSTER_FLAKE:-${CLUSTER_DIR}}"

# --- Host-side state locations (overridable via PHOTON_* env vars) --------
VM_DIR_BASE="${PHOTON_VM_DIR:-${HOME}/.photoncloud-test-cluster}"
VDE_SWITCH_DIR_BASE="${PHOTON_CLUSTER_VDE_SWITCH_DIR:-/tmp/photoncloud-test-cluster-vde.sock}"

# --- Service endpoints and credentials ------------------------------------
CORONAFS_API_PORT="${PHOTON_CORONAFS_API_PORT:-50088}"
CORONAFS_VOLUME_ROOT="/var/lib/coronafs/volumes"
SSH_PASSWORD="${PHOTON_VM_ROOT_PASSWORD:-test}"

# --- Timeouts (seconds) and limits ----------------------------------------
SSH_CONNECT_TIMEOUT="${PHOTON_VM_SSH_CONNECT_TIMEOUT:-5}"
SSH_WAIT_TIMEOUT="${PHOTON_VM_SSH_WAIT_TIMEOUT:-300}"
UNIT_WAIT_TIMEOUT="${PHOTON_VM_UNIT_WAIT_TIMEOUT:-240}"
HTTP_WAIT_TIMEOUT="${PHOTON_VM_HTTP_WAIT_TIMEOUT:-180}"
KVM_WAIT_TIMEOUT="${PHOTON_VM_KVM_WAIT_TIMEOUT:-180}"
FLAREDB_WAIT_TIMEOUT="${PHOTON_VM_FLAREDB_WAIT_TIMEOUT:-180}"
GRPCURL_MAX_MSG_SIZE="${PHOTON_VM_GRPCURL_MAX_MSG_SIZE:-1073741824}"
GRPCURL_TIMEOUT_SECS="${PHOTON_VM_GRPCURL_TIMEOUT_SECS:-30}"
TUNNEL_WAIT_TIMEOUT="${PHOTON_VM_TUNNEL_WAIT_TIMEOUT:-30}"

# --- Storage benchmark knobs ----------------------------------------------
STORAGE_BENCHMARK_COMMAND="${PHOTON_VM_STORAGE_BENCH_COMMAND:-bench-storage}"
LIGHTNINGSTOR_BENCH_CLIENT_NODE="${PHOTON_VM_LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node06}"
STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_IMAGE_BENCH:-0}"
STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH="${PHOTON_VM_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH:-0}"

# --- Nix parallelism: scale up a little on hosts with >= 12 CPUs ----------
HOST_CPU_COUNT="$(getconf _NPROCESSORS_ONLN 2>/dev/null || nproc 2>/dev/null || echo 4)"
DEFAULT_CLUSTER_NIX_MAX_JOBS=2
DEFAULT_CLUSTER_NIX_BUILD_CORES=4
if [[ "${HOST_CPU_COUNT}" =~ ^[0-9]+$ ]] && (( HOST_CPU_COUNT >= 12 )); then
  DEFAULT_CLUSTER_NIX_MAX_JOBS=3
  DEFAULT_CLUSTER_NIX_BUILD_CORES=6
fi
CLUSTER_NIX_MAX_JOBS="${PHOTON_CLUSTER_NIX_MAX_JOBS:-${DEFAULT_CLUSTER_NIX_MAX_JOBS}}"
CLUSTER_NIX_BUILD_CORES="${PHOTON_CLUSTER_NIX_BUILD_CORES:-${DEFAULT_CLUSTER_NIX_BUILD_CORES}}"

# --- Build profile and lock state -----------------------------------------
BUILD_PROFILE="${PHOTON_CLUSTER_BUILD_PROFILE:-default}"
CLUSTER_SKIP_BUILD="${PHOTON_CLUSTER_SKIP_BUILD:-0}"
CLUSTER_LOCK_HELD=0  # 1 while this process holds the cluster lock directory

# --- Cluster topology ------------------------------------------------------
NODES=(node01 node02 node03 node04 node05 node06)
STORAGE_NODES=(node01 node02 node03 node04 node05)

# --- Proto file locations used by grpcurl calls ---------------------------
IAM_PROTO_DIR="${REPO_ROOT}/iam/proto"
IAM_PROTO="${IAM_PROTO_DIR}/iam.proto"
PRISMNET_PROTO_DIR="${REPO_ROOT}/prismnet/crates/prismnet-api/proto"
PRISMNET_PROTO="${PRISMNET_PROTO_DIR}/prismnet.proto"
FLASHDNS_PROTO_DIR="${REPO_ROOT}/flashdns/crates/flashdns-api/proto"
FLASHDNS_PROTO="${FLASHDNS_PROTO_DIR}/flashdns.proto"
FIBERLB_PROTO_DIR="${REPO_ROOT}/fiberlb/crates/fiberlb-api/proto"
FIBERLB_PROTO="${FIBERLB_PROTO_DIR}/fiberlb.proto"
K8SHOST_PROTO_DIR="${REPO_ROOT}/k8shost/crates/k8shost-proto/proto"
K8SHOST_PROTO="${K8SHOST_PROTO_DIR}/k8s.proto"
CREDITSERVICE_PROTO_DIR="${REPO_ROOT}/creditservice/proto"
CREDITSERVICE_PROTO="${CREDITSERVICE_PROTO_DIR}/creditservice.proto"
LIGHTNINGSTOR_PROTO_DIR="${REPO_ROOT}/lightningstor/crates/lightningstor-api/proto"
LIGHTNINGSTOR_PROTO="${LIGHTNINGSTOR_PROTO_DIR}/lightningstor.proto"
NIGHTLIGHT_PROTO_DIR="${REPO_ROOT}/nightlight/crates/nightlight-api/proto"
NIGHTLIGHT_QUERY_PROTO="${NIGHTLIGHT_PROTO_DIR}/query.proto"
NIGHTLIGHT_ADMIN_PROTO="${NIGHTLIGHT_PROTO_DIR}/admin.proto"
PLASMAVMC_PROTO_DIR="${REPO_ROOT}/plasmavmc/proto"
PLASMAVMC_PROTO="${PLASMAVMC_PROTO_DIR}/plasmavmc.proto"
FLAREDB_PROTO_DIR="${REPO_ROOT}/flaredb/crates/flaredb-proto/src"
FLAREDB_PROTO="${FLAREDB_PROTO_DIR}/kvrpc.proto"

# Start-up phases: each entry is a space-separated group of nodes brought
# up together, in order.
# shellcheck disable=SC2034
NODE_PHASES=(
  "node01 node02 node03"
  "node04 node05"
  "node06"
)

# Host-side SSH forward ports (default profile).
declare -A SSH_PORTS=(
  [node01]=2201
  [node02]=2202
  [node03]=2203
  [node04]=2204
  [node05]=2205
  [node06]=2206
)

# Host-side SSH forward ports for the storage profile (storage nodes only).
declare -A STORAGE_SSH_PORTS=(
  [node01]=2301
  [node02]=2302
  [node03]=2303
  [node04]=2304
  [node05]=2305
)

# In-cluster addresses on the VDE network.
declare -A NODE_IPS=(
  [node01]=10.100.0.11
  [node02]=10.100.0.12
  [node03]=10.100.0.13
  [node04]=10.100.0.21
  [node05]=10.100.0.22
  [node06]=10.100.0.100
)

# Services each node is expected to run (unit base names).
declare -A NODE_UNITS=(
  [node01]="chainfire flaredb iam prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost"
  [node02]="chainfire flaredb iam"
  [node03]="chainfire flaredb iam"
  [node04]="plasmavmc lightningstor coronafs node-agent"
  [node05]="plasmavmc lightningstor coronafs node-agent"
  [node06]="apigateway nightlight creditservice deployer fleet-scheduler"
)

# Flake nixosConfiguration names used instead of the node name under the
# storage profile.
declare -A STORAGE_BUILD_TARGETS=(
  [node01]=storage-node01
  [node02]=storage-node02
  [node03]=storage-node03
  [node04]=storage-node04
  [node05]=storage-node05
)

# Common SSH options: password auth against throwaway test VMs, so host-key
# checking is intentionally disabled.
SSH_OPTS=(
  -o StrictHostKeyChecking=no
  -o UserKnownHostsFile=/dev/null
  -o LogLevel=ERROR
  -o ConnectTimeout="${SSH_CONNECT_TIMEOUT}"
  -o PreferredAuthentications=password
  -o PubkeyAuthentication=no
  -o KbdInteractiveAuthentication=no
)
|
|
|
|
# --- Logging helpers: all diagnostics go to stderr -------------------------

# Emit a timestamped message on stderr.
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$*" >&2
}

# Log an error and abort the whole script.
die() {
  log "ERROR: $*"
  exit 1
}

# Log a non-fatal warning.
warn() {
  log "WARN: $*"
}
|
|
|
|
# Invoke the repo's deployer-ctl via `nix run`, forwarding all arguments.
# RUST_LOG defaults to "warn" unless the caller already set it.
run_deployer_ctl() {
  local -a nix_args=(
    --option warn-dirty false
    run --quiet
    --extra-experimental-features 'nix-command flakes'
    "${REPO_ROOT}#deployer-ctl"
  )
  RUST_LOG="${RUST_LOG:-warn}" nix "${nix_args[@]}" -- "$@"
}
|
|
|
|
# Release the mkdir-based cluster lock if this process (or the recorded
# owner) holds it.  Safe to call repeatedly; also installed as the EXIT
# trap by acquire_cluster_lock.
release_cluster_lock() {
  local lock_dir
  local owner=""
  lock_dir="$(cluster_lock_dir)"

  # Nothing to do unless this shell actually acquired the lock.
  if [[ "${CLUSTER_LOCK_HELD}" -ne 1 ]]; then
    return 0
  fi

  if [[ -d "${lock_dir}" ]]; then
    if [[ -f "${lock_dir}/pid" ]]; then
      owner="$(<"${lock_dir}/pid")"
    fi

    # Only remove the lock when it is unowned, owned by this pid, or owned
    # by the pid recorded in PHOTON_CLUSTER_LOCK_OWNER (exported on acquire
    # so child shells can release on the original owner's behalf).
    if [[ -z "${owner}" || "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ]]; then
      rm -rf "${lock_dir}"
    fi
  fi

  CLUSTER_LOCK_HELD=0
  unset PHOTON_CLUSTER_LOCK_OWNER
}
|
|
|
|
# Acquire the per-profile cluster lock (a lock *directory*, created
# atomically with mkdir).  On success: records $$ in <lock>/pid, sets
# CLUSTER_LOCK_HELD, exports PHOTON_CLUSTER_LOCK_OWNER so child scripts
# can recognise the owner, and installs release_cluster_lock on EXIT.
# Re-entrant for the owning pid; reclaims a lock whose recorded owner pid
# is no longer alive; otherwise dies.
acquire_cluster_lock() {
  local lock_dir
  local owner=""
  lock_dir="$(cluster_lock_dir)"

  # Already locked by this shell: nothing to do.
  if [[ "${CLUSTER_LOCK_HELD}" -eq 1 ]]; then
    return 0
  fi

  mkdir -p "$(dirname "${lock_dir}")"

  # Fast path: mkdir succeeds only when no one else holds the lock.
  if mkdir "${lock_dir}" 2>/dev/null; then
    printf '%s\n' "$$" >"${lock_dir}/pid"
    CLUSTER_LOCK_HELD=1
    export PHOTON_CLUSTER_LOCK_OWNER="$$"
    trap release_cluster_lock EXIT
    return 0
  fi

  # Lock directory exists: read the recorded owner pid, if any.
  if [[ -f "${lock_dir}/pid" ]]; then
    owner="$(<"${lock_dir}/pid")"
  fi

  # Re-entry: the lock already belongs to this pid, or to the owner this
  # environment inherited via PHOTON_CLUSTER_LOCK_OWNER.
  if [[ -n "${owner}" && ( "${owner}" == "$$" || "${owner}" == "${PHOTON_CLUSTER_LOCK_OWNER:-}" ) ]]; then
    CLUSTER_LOCK_HELD=1
    export PHOTON_CLUSTER_LOCK_OWNER="${owner}"
    trap release_cluster_lock EXIT
    return 0
  fi

  # Stale lock: the recorded owner is dead, so remove the lock and try
  # once more to take it ourselves.
  if [[ -n "${owner}" ]] && ! kill -0 "${owner}" >/dev/null 2>&1; then
    warn "reclaiming stale PhotonCloud test-cluster lock from pid ${owner}"
    rm -f "${lock_dir}/pid"
    rmdir "${lock_dir}" 2>/dev/null || true
    if mkdir "${lock_dir}" 2>/dev/null; then
      printf '%s\n' "$$" >"${lock_dir}/pid"
      CLUSTER_LOCK_HELD=1
      export PHOTON_CLUSTER_LOCK_OWNER="$$"
      trap release_cluster_lock EXIT
      return 0
    fi
  fi

  die "another PhotonCloud test-cluster run is active${owner:+ (pid ${owner})}; lock: ${lock_dir}"
}
|
|
|
|
# Map a node name to the on-VM LightningStor data directory.
# Dies for nodes that do not host LightningStor data.
lightningstor_data_root() {
  local node="$1"
  case "${node}" in
    node01)
      printf '%s\n' /var/lib/lightningstor/node
      ;;
    node04 | node05)
      printf '%s\n' /var/lib/lightningstor
      ;;
    *)
      die "no LightningStor data root mapping for ${node}"
      ;;
  esac
}
|
|
|
|
# Sanitize BUILD_PROFILE into a filesystem-safe slug: every character
# outside [A-Za-z0-9._-] becomes '-', then leading/trailing dashes are
# trimmed; falls back to "default" when nothing survives.
#
# Fix: the original used ${slug##-} / ${slug%%-}, whose pattern matches
# exactly one '-', so inputs like "profile!!" kept a trailing dash
# ("profile-").  Trim whole runs of dashes instead.
profile_slug() {
  local slug
  slug="$(printf '%s' "${BUILD_PROFILE}" | tr -c 'A-Za-z0-9._-' '-')"
  while [[ "${slug}" == -* ]]; do
    slug="${slug#-}"
  done
  while [[ "${slug}" == *- ]]; do
    slug="${slug%-}"
  done
  if [[ -z "${slug}" ]]; then
    slug="default"
  fi
  printf '%s\n' "${slug}"
}
|
|
|
|
# "-<slug>" suffix appended to per-profile state paths; empty for the
# default profile so legacy default paths are unchanged.
profile_state_suffix() {
  local slug
  slug="$(profile_slug)"
  case "${slug}" in
    default) printf '\n' ;;
    *) printf -- '-%s\n' "${slug}" ;;
  esac
}

# Per-profile VM state directory.
vm_dir() {
  printf '%s%s\n' "${VM_DIR_BASE}" "$(profile_state_suffix)"
}

# Per-profile cluster lock directory (mkdir-based lock).
cluster_lock_dir() {
  printf '%s%s.lock\n' "${VM_DIR_BASE}" "$(profile_state_suffix)"
}

# Per-profile VDE switch socket directory.
vde_switch_dir() {
  printf '%s%s\n' "${VDE_SWITCH_DIR_BASE}" "$(profile_state_suffix)"
}

# PID file of the daemonized vde_switch for this profile.
vde_switch_pid_file() {
  printf '%s/vde-switch.pid\n' "$(vm_dir)"
}
|
|
|
|
# Print the de-duplicated list of build profiles to operate on: always
# "default" and "storage", plus the active BUILD_PROFILE, one per line.
all_build_profiles() {
  local -a emitted=()
  local candidate known duplicate

  for candidate in default storage "${BUILD_PROFILE}"; do
    [[ -n "${candidate}" ]] || continue
    duplicate=0
    for known in "${emitted[@]}"; do
      if [[ "${known}" == "${candidate}" ]]; then
        duplicate=1
        break
      fi
    done
    if [[ "${duplicate}" -eq 0 ]]; then
      emitted+=("${candidate}")
      printf '%s\n' "${candidate}"
    fi
  done
}
|
|
|
|
# Run a command with BUILD_PROFILE temporarily switched to another
# profile, restoring the previous profile afterwards and propagating the
# command's exit status.
#
# Fix: under `set -e` a failing command aborted the script before
# BUILD_PROFILE could be restored and before the captured status was
# returned.  Guarding the invocation with `|| rc=$?` makes the restore
# unconditional and preserves the real exit status for the caller.
with_build_profile() {
  local next_profile="$1"
  local prev_profile="${BUILD_PROFILE}"
  shift

  local rc=0
  BUILD_PROFILE="${next_profile}"
  "$@" || rc=$?
  BUILD_PROFILE="${prev_profile}"
  return "${rc}"
}
|
|
|
|
# Count non-temporary files under a node's LightningStor data root,
# retrying over SSH until it succeeds or SSH_WAIT_TIMEOUT elapses.
lightningstor_data_file_count() {
  local node="$1"
  local data_root count
  data_root="$(lightningstor_data_root "${node}")"
  local give_up_at=$((SECONDS + SSH_WAIT_TIMEOUT))

  while ! count="$(ssh_node "${node}" "find ${data_root} -type f ! -name '*.tmp' | wc -l" 2>/dev/null)"; do
    if (( SECONDS >= give_up_at )); then
      die "timed out collecting LightningStor file count from ${node}"
    fi
    sleep 2
  done
  printf '%s\n' "${count}"
}

# Emit the node01/node04/node05 LightningStor file counts on one line.
lightningstor_count_triplet() {
  local c1 c4 c5
  c1="$(lightningstor_data_file_count node01)"
  c4="$(lightningstor_data_file_count node04)"
  c5="$(lightningstor_data_file_count node05)"
  printf '%s %s %s\n' "${c1}" "${c4}" "${c5}"
}
|
|
|
|
# Wait until the three LightningStor file counts (node01/node04/node05)
# simultaneously meet the per-node minimums AND stay unchanged for
# `settle_secs` seconds, then print the settled "c1 c4 c5" triplet.
# Dies when `timeout` (default HTTP_WAIT_TIMEOUT) elapses first.
capture_stable_lightningstor_count_triplet() {
  local min_node01="${1:-0}"
  local min_node04="${2:-0}"
  local min_node05="${3:-0}"
  local settle_secs="${4:-6}"
  local timeout="${5:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local stable_since=0
  local last_triplet=""

  while true; do
    local count_node01 count_node04 count_node05 triplet
    read -r count_node01 count_node04 count_node05 < <(lightningstor_count_triplet)
    if (( count_node01 >= min_node01 )) && (( count_node04 >= min_node04 )) && (( count_node05 >= min_node05 )); then
      triplet="${count_node01} ${count_node04} ${count_node05}"
      if [[ "${triplet}" == "${last_triplet}" ]]; then
        # Same reading as the previous poll: settled once it has held long
        # enough.  stable_since == 0 means the timer is not armed yet.
        if (( stable_since > 0 )) && (( SECONDS - stable_since >= settle_secs )); then
          printf '%s\n' "${triplet}"
          return 0
        fi
      else
        # New reading: restart the settle timer from now.
        last_triplet="${triplet}"
        stable_since="${SECONDS}"
      fi
    else
      # Below the minimums: discard any progress toward settling.
      last_triplet=""
      stable_since=0
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for distributed LightningStor counts to settle: minimum ${min_node01}/${min_node04}/${min_node05}, last ${count_node01:-?}/${count_node04:-?}/${count_node05:-?}"
    fi
    sleep 2
  done
}
|
|
|
|
# Block until every node's LightningStor file count is strictly greater
# than the supplied baseline; dies after HTTP_WAIT_TIMEOUT seconds.
wait_for_lightningstor_counts_greater_than() {
  local before_node01="$1" before_node04="$2" before_node05="$3" context="$4"
  local give_up_at=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local c1 c4 c5

  while :; do
    read -r c1 c4 c5 < <(lightningstor_count_triplet)
    if (( c1 > before_node01 && c4 > before_node04 && c5 > before_node05 )); then
      return 0
    fi
    if (( SECONDS >= give_up_at )); then
      die "timed out waiting for distributed LightningStor replicas for ${context}"
    fi
    sleep 2
  done
}

# Block until the three counts exactly match the expected values; dies
# after HTTP_WAIT_TIMEOUT seconds, reporting the last observed counts.
wait_for_lightningstor_counts_equal() {
  local expected_node01="$1" expected_node04="$2" expected_node05="$3" context="$4"
  local give_up_at=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local c1 c4 c5

  while :; do
    read -r c1 c4 c5 < <(lightningstor_count_triplet)
    if (( c1 == expected_node01 && c4 == expected_node04 && c5 == expected_node05 )); then
      return 0
    fi
    if (( SECONDS >= give_up_at )); then
      die "timed out waiting for distributed LightningStor counts to settle for ${context}: expected ${expected_node01}/${expected_node04}/${expected_node05}, got ${c1}/${c4}/${c5}"
    fi
    sleep 2
  done
}
|
|
|
|
# Abort unless the given executable is resolvable on PATH.
require_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    die "required command not found: $1"
  fi
}

# Run grpcurl under a timeout, capturing combined stdout+stderr.
# Prints the captured output and returns grpcurl's (or timeout's) status.
grpcurl_capture() {
  local rc=0
  local captured=""

  captured="$(timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl "$@" 2>&1)" || rc=$?
  printf '%s' "${captured}"
  return "${rc}"
}
|
|
|
|
# Create a PrismNet VPC via grpcurl, retrying the write until it succeeds
# or `timeout` (default HTTP_WAIT_TIMEOUT) elapses; prints the CreateVpc
# response on success.  Waits for FlareDB route metadata on node01 first
# so early writes do not race cluster bootstrap.
create_prismnet_vpc_with_retry() {
  local token="$1"
  local org_id="$2"
  local project_id="$3"
  local name="$4"
  local description="$5"
  local cidr_block="$6"
  local timeout="${7:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local request_json output=""

  # Helper defined elsewhere in this script.
  wait_for_flaredb_route_metadata node01

  # Build the request body once; only the RPC itself is retried.
  request_json="$(
    jq -cn \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg name "${name}" \
      --arg description "${description}" \
      --arg cidr "${cidr_block}" \
      '{orgId:$org, projectId:$project, name:$name, description:$description, cidrBlock:$cidr}'
  )"

  while true; do
    output="$(
      grpcurl_capture -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" \
        -proto "${PRISMNET_PROTO}" \
        -d "${request_json}" \
        127.0.0.1:15081 prismnet.VpcService/CreateVpc
    )" && {
      printf '%s' "${output}"
      return 0
    }

    if (( SECONDS >= deadline )); then
      # Include the last captured grpcurl output for diagnosis.
      die "timed out waiting for PrismNet VPC writes to succeed: ${output}"
    fi
    sleep 2
  done
}
|
|
|
|
# Fetch one PrismNet port as JSON via grpcurl (single attempt, no retry).
prismnet_get_port_json() {
  local token="$1" org_id="$2" project_id="$3" subnet_id="$4" port_id="$5"
  local request_json
  request_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "${request_json}" \
    127.0.0.1:15081 prismnet.PortService/GetPort
}

# Poll until the port reports the given VM as its bound device, then print
# the port JSON.  Dies after `timeout` (default HTTP_WAIT_TIMEOUT) seconds.
wait_for_prismnet_port_binding() {
  local token="$1" org_id="$2" project_id="$3" subnet_id="$4" port_id="$5" vm_id="$6"
  local timeout="${7:-${HTTP_WAIT_TIMEOUT}}"
  local give_up_at=$((SECONDS + timeout))
  local port_json=""

  while :; do
    port_json="$(prismnet_get_port_json "${token}" "${org_id}" "${project_id}" "${subnet_id}" "${port_id}" 2>/dev/null || true)"
    if [[ -n "${port_json}" ]] && printf '%s' "${port_json}" | jq -e --arg vm "${vm_id}" '
      .port.deviceId == $vm and .port.deviceType == "DEVICE_TYPE_VM"
    ' >/dev/null 2>&1; then
      printf '%s\n' "${port_json}"
      return 0
    fi
    if (( SECONDS >= give_up_at )); then
      die "timed out waiting for PrismNet port ${port_id} to bind to VM ${vm_id}"
    fi
    sleep 2
  done
}

# Poll until the port reports no bound device, then print the port JSON.
# Dies after `timeout` (default HTTP_WAIT_TIMEOUT) seconds.
wait_for_prismnet_port_detachment() {
  local token="$1" org_id="$2" project_id="$3" subnet_id="$4" port_id="$5"
  local timeout="${6:-${HTTP_WAIT_TIMEOUT}}"
  local give_up_at=$((SECONDS + timeout))
  local port_json=""

  while :; do
    port_json="$(prismnet_get_port_json "${token}" "${org_id}" "${project_id}" "${subnet_id}" "${port_id}" 2>/dev/null || true)"
    if [[ -n "${port_json}" ]] && printf '%s' "${port_json}" | jq -e '
      (.port.deviceId // "") == "" and
      ((.port.deviceType // "") == "DEVICE_TYPE_NONE" or (.port.deviceType // "") == "DEVICE_TYPE_UNSPECIFIED")
    ' >/dev/null 2>&1; then
      printf '%s\n' "${port_json}"
      return 0
    fi
    if (( SECONDS >= give_up_at )); then
      die "timed out waiting for PrismNet port ${port_id} to detach"
    fi
    sleep 2
  done
}
|
|
|
|
# Poll the VM record (via try_get_vm_json, defined elsewhere in this
# script) until .spec.network contains an interface matching the given
# port, subnet, MAC and IP, then print the VM JSON.  Dies on timeout.
wait_for_vm_network_spec() {
  local token="$1"
  local get_vm_json="$2"        # opaque request payload forwarded to try_get_vm_json
  local port_id="$3"
  local subnet_id="$4"
  local mac_address="$5"
  local ip_address="$6"
  local vm_port="${7:-15082}"   # NOTE(review): presumably the VM service gRPC port — confirm
  local timeout="${8:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local vm_json=""

  while true; do
    # `|| true` inside the substitution: a failed fetch yields an empty
    # string rather than aborting under `set -e`.
    if vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" "${vm_port}" 2>/dev/null || true)"; then
      if [[ -n "${vm_json}" ]] && printf '%s' "${vm_json}" | jq -e \
        --arg port "${port_id}" \
        --arg subnet "${subnet_id}" \
        --arg mac "${mac_address}" \
        --arg ip "${ip_address}" '
        (.spec.network // []) | any(
          .portId == $port and
          .subnetId == $subnet and
          .macAddress == $mac and
          .ipAddress == $ip
        )
      ' >/dev/null 2>&1; then
        printf '%s\n' "${vm_json}"
        return 0
      fi
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for VM network spec to reflect PrismNet port ${port_id}"
    fi
    sleep 2
  done
}
|
|
|
|
# --- State-path helpers (all rooted at the per-profile vm_dir) ------------

# Symlink to a node's built VM derivation.
build_link() {
  printf '%s/build-%s' "$(vm_dir)" "$1"
}

# Symlink to the bootable guest image derivation.
guest_image_link() {
  printf '%s/build-vm-guest-image' "$(vm_dir)"
}

# Symlink to the benchmark guest image derivation.
guest_bench_image_link() {
  printf '%s/build-vm-bench-guest-image' "$(vm_dir)"
}

# True when the caller opted into reusing cached guest images.
reuse_guest_images_requested() {
  [[ "${PHOTON_CLUSTER_REUSE_GUEST_IMAGES:-0}" == "1" ]]
}

# True when the caller wants `clean` to keep build-* result symlinks.
preserve_build_links_requested() {
  [[ "${PHOTON_CLUSTER_PRESERVE_BUILD_LINKS:-0}" == "1" ]]
}

# Per-node runtime directory.
runtime_dir() {
  printf '%s/%s' "$(vm_dir)" "$1"
}

# Per-node VM PID file.
pid_file() {
  printf '%s/%s/vm.pid' "$(vm_dir)" "$1"
}

# Per-node VM log file.
log_file() {
  printf '%s/%s/vm.log' "$(vm_dir)" "$1"
}
|
|
|
|
# Locate the run-*-vm launcher script inside a node's build output.
runvm_path() {
  local node="$1"
  find -L "$(build_link "${node}")/bin" -maxdepth 1 -name 'run-*-vm' | head -n1
}

# Ensure the bootable guest image is built, then print its qcow2 path.
guest_image_path() {
  local link_path
  link_path="$(guest_image_link)"
  build_guest_image
  find -L "${link_path}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}

# Ensure the benchmark guest image is built, then print its qcow2 path.
guest_bench_image_path() {
  local link_path
  link_path="$(guest_bench_image_link)"
  build_guest_bench_image
  find -L "${link_path}" -maxdepth 2 -type f -name '*.qcow2' | head -n1
}

# Make a local qcow2 available to node01's plasmavmc import directory and
# print the path node01 should use.  Nix-store paths already readable
# inside the VM are used as-is; anything else is copied over scp.
prepare_node01_image_source() {
  local local_path="$1"
  local remote_name="$2"

  if [[ "${local_path}" == /nix/store/* ]] && ssh_node node01 "test -r ${local_path}" >/dev/null 2>&1; then
    printf '%s\n' "${local_path}"
    return 0
  fi

  ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports"
  local dest="/var/lib/plasmavmc/imports/${remote_name}.qcow2"
  scp_to_node node01 "${local_path}" "${dest}"
  printf '%s\n' "${dest}"
}
|
|
|
|
# Print the requested node names, or every node when none are given.
all_or_requested_nodes() {
  if (( $# == 0 )); then
    printf '%s\n' "${NODES[@]}"
  else
    printf '%s\n' "$@"
  fi
}

# Die if any argument is not a known node name.
validate_nodes_exist() {
  local candidate
  for candidate in "$@"; do
    if [[ -z "${SSH_PORTS[${candidate}]:-}" ]]; then
      die "unknown node: ${candidate}"
    fi
  done
}

# Print the host-side SSH forward port for a node; the storage profile
# substitutes its alternate port map where defined.
ssh_port_for_node() {
  local node="$1"
  local port="${SSH_PORTS[${node}]}"

  if [[ "${BUILD_PROFILE}" == "storage" ]]; then
    port="${STORAGE_SSH_PORTS[${node}]:-${port}}"
  fi
  printf '%s\n' "${port}"
}
|
|
|
|
# Print the sysfs path of the host's nested-virtualization module
# parameter (Intel first, then AMD); prints nothing when neither module
# exposes one.  Exit status is 0 either way, matching the original
# if/elif form with no else branch.
host_nested_param_path() {
  local candidate
  for candidate in /sys/module/kvm_intel/parameters/nested /sys/module/kvm_amd/parameters/nested; do
    if [[ -f "${candidate}" ]]; then
      printf '%s\n' "${candidate}"
      break
    fi
  done
}
|
|
|
|
# Take the cluster lock, verify host tooling and /dev/kvm, create the
# state directory, and report the nested-virtualization parameter.
preflight() {
  acquire_cluster_lock

  local tool
  for tool in nix qemu-system-x86_64 ssh sshpass curl grpcurl vde_switch; do
    require_cmd "${tool}"
  done

  mkdir -p "$(vm_dir)"
  log "Cluster build profile: ${BUILD_PROFILE} (state dir $(vm_dir))"

  [[ -e /dev/kvm ]] || die "/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization"
  [[ -r /dev/kvm && -w /dev/kvm ]] || warn "/dev/kvm exists but current user may not have full access"

  local nested_path
  nested_path="$(host_nested_param_path || true)"
  if [[ -n "${nested_path}" ]]; then
    log "Host nested virtualization parameter: ${nested_path}=$(<"${nested_path}")"
  else
    warn "Could not locate host nested virtualization parameter; guest nested-KVM validation may fail"
  fi
}

# Path of the VDE switch control socket for this profile.
vde_switch_ctl_path() {
  printf '%s/ctl\n' "$(vde_switch_dir)"
}

# Success when the VDE switch control socket exists.  The recorded PID is
# probed first, but note that both branches ultimately test the same
# control socket, so the socket check is decisive either way.
vde_switch_running() {
  local pid_path ctl_path
  pid_path="$(vde_switch_pid_file)"
  ctl_path="$(vde_switch_ctl_path)"

  if [[ -f "${pid_path}" ]] && kill -0 "$(<"${pid_path}")" 2>/dev/null; then
    [[ -S "${ctl_path}" ]]
    return
  fi

  [[ -S "${ctl_path}" ]]
}
|
|
|
|
# Start (or reuse) the daemonized VDE switch for this profile, waiting up
# to 10 seconds for its control socket to appear.
ensure_vde_switch() {
  local sock_dir
  sock_dir="$(vde_switch_dir)"

  if vde_switch_running; then
    return 0
  fi

  # Clear any stale socket directory / pid file from a previous run.
  rm -rf "${sock_dir}"
  rm -f "$(vde_switch_pid_file)"

  log "Starting VDE switch at ${sock_dir}"
  vde_switch \
    -sock "${sock_dir}" \
    -daemon \
    -pidfile "$(vde_switch_pid_file)"

  local give_up_at=$((SECONDS + 10))
  until vde_switch_running; do
    if (( SECONDS >= give_up_at )); then
      die "timed out waiting for VDE switch at ${sock_dir}"
    fi
    sleep 1
  done
}

# Stop the VDE switch: SIGTERM, wait up to 10s, SIGKILL survivors, then
# remove its pid file and socket directory.
stop_vde_switch() {
  local switch_pid=""
  local sock_dir
  sock_dir="$(vde_switch_dir)"

  if [[ -f "$(vde_switch_pid_file)" ]]; then
    switch_pid="$(<"$(vde_switch_pid_file)")"
  fi

  if [[ -n "${switch_pid}" ]] && kill -0 "${switch_pid}" 2>/dev/null; then
    log "Stopping VDE switch (PID ${switch_pid})"
    kill "${switch_pid}" || true
    for _ in {1..10}; do
      kill -0 "${switch_pid}" 2>/dev/null || break
      sleep 1
    done
    if kill -0 "${switch_pid}" 2>/dev/null; then
      warn "VDE switch did not stop after SIGTERM; sending SIGKILL"
      kill -9 "${switch_pid}" || true
    fi
  fi

  rm -f "$(vde_switch_pid_file)"
  rm -rf "${sock_dir}"
}

# Success when at least one cluster VM is still running.
any_vm_running() {
  local candidate
  for candidate in "${NODES[@]}"; do
    if is_running "${candidate}"; then
      return 0
    fi
  done
  return 1
}
|
|
|
|
# Terminate the given PIDs: SIGTERM first, wait up to 20 seconds for them
# all to exit, then SIGKILL any survivors.
#
# Fix: empty-string arguments (which loose pid-capture paths such as
# mapfile over command output can produce) previously reached `kill ""`
# and polluted the log message; filter them out up front.
terminate_pids() {
  local context="$1"
  shift
  local pids=()
  local pid

  for pid in "$@"; do
    if [[ -n "${pid}" ]]; then
      pids+=("${pid}")
    fi
  done

  [[ "${#pids[@]}" -gt 0 ]] || return 0

  log "Stopping stale ${context}: ${pids[*]}"
  kill "${pids[@]}" 2>/dev/null || true
  for _ in {1..20}; do
    local remaining=0
    for pid in "${pids[@]}"; do
      if kill -0 "${pid}" 2>/dev/null; then
        remaining=1
        break
      fi
    done
    if [[ "${remaining}" -eq 0 ]]; then
      return 0
    fi
    sleep 1
  done

  warn "Force-killing stale ${context}: ${pids[*]}"
  kill -9 "${pids[@]}" 2>/dev/null || true
}
|
|
|
|
# Print (sorted, unique) PIDs of QEMU/run-vm processes that belong to the
# given nodes under the current profile, matched by runtime disk path,
# SSH hostfwd spec, or an actual listener on the node's SSH forward port.
#
# Fix: the original unconditionally ran `printf '%s\n' "${!seen[@]}"`,
# which emits a single blank line when no PIDs were found, so downstream
# mapfile callers saw one bogus empty "pid".  Print nothing instead.
stale_vm_pids_for_nodes_current_profile() {
  local nodes=("$@")
  local pid cmd node port runtime_path
  declare -A seen=()

  # Pass 1: scan process command lines for our disk image or hostfwd spec.
  while read -r pid cmd; do
    [[ -n "${pid:-}" ]] || continue
    for node in "${nodes[@]}"; do
      port="$(ssh_port_for_node "${node}")"
      runtime_path="$(runtime_dir "${node}")/${node}.qcow2"
      if [[ "${cmd}" == *"qemu-system"* ]] && {
        [[ "${cmd}" == *"file=${runtime_path}"* ]] ||
          [[ "${cmd}" == *"hostfwd=tcp::${port}-:22"* ]];
      }; then
        seen["${pid}"]=1
      fi
    done
  done < <(pgrep -af 'qemu-system[^[:space:]]*|run-.*-vm' || true)

  # Pass 2: anything actually listening on a node's SSH forward port.
  for node in "${nodes[@]}"; do
    port="$(ssh_port_for_node "${node}")"
    while read -r pid; do
      [[ -n "${pid:-}" ]] || continue
      seen["${pid}"]=1
    done < <(
      ss -H -ltnp "( sport = :${port} )" 2>/dev/null \
        | sed -n 's/.*pid=\([0-9]\+\).*/\1/p' \
        | sort -u
    )
  done

  if (( ${#seen[@]} > 0 )); then
    printf '%s\n' "${!seen[@]}" | sort -n
  fi
}
|
|
|
|
# Kill any stale VM processes belonging to the given nodes (current profile).
stop_stale_vm_processes_current_profile() {
  local stale=()
  mapfile -t stale < <(stale_vm_pids_for_nodes_current_profile "$@")
  terminate_pids "VM processes" "${stale[@]}"
}

# Stop the given nodes under the current profile, sweep stragglers, and
# shut the VDE switch down once nothing is left running.
stop_nodes_current_profile() {
  local node
  for node in "$@"; do
    stop_vm "${node}"
  done

  stop_stale_vm_processes_current_profile "$@"

  if ! any_vm_running; then
    stop_vde_switch
  fi
}

# Stop the given nodes across every known build profile.
stop_nodes_all_profiles() {
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" stop_nodes_current_profile "$@"
  done < <(all_build_profiles)
}
|
|
|
|
# Delete runtime state for the current profile, optionally keeping the
# build-* result symlinks when PHOTON_CLUSTER_PRESERVE_BUILD_LINKS=1.
remove_runtime_state_current_profile() {
  local state_dir
  state_dir="$(vm_dir)"

  [[ -d "${state_dir}" ]] || return 0

  log "Removing runtime state under ${state_dir}"
  if preserve_build_links_requested; then
    find "${state_dir}" -mindepth 1 ! -name 'build-*' -delete 2>/dev/null || true
  else
    find "${state_dir}" -mindepth 1 -delete 2>/dev/null || true
  fi
}

# Delete runtime state for every known build profile.
remove_runtime_state_all_profiles() {
  local profile
  while IFS= read -r profile; do
    with_build_profile "${profile}" remove_runtime_state_current_profile
  done < <(all_build_profiles)
}
|
|
|
|
# Build one node's NixOS VM derivation and refresh its build-* symlink.
build_vm() {
  local node="$1"
  local target store_path

  target="$(build_target_for_node "${node}")"
  log "Building ${node} VM derivation (${target})"
  store_path="$(
    NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
      --max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
      --extra-experimental-features 'nix-command flakes' \
      "${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm" \
      --no-link --print-out-paths | tail -n1
  )"
  [[ -n "${store_path}" ]] || die "failed to resolve VM output for ${node}"
  ln -sfn "${store_path}" "$(build_link "${node}")"
}
|
|
|
|
# Map a node to its flake nixosConfiguration name; the storage profile
# substitutes its storage-* targets where defined.
build_target_for_node() {
  local node="$1"
  local target="${node}"

  if [[ "${BUILD_PROFILE}" == "storage" ]]; then
    target="${STORAGE_BUILD_TARGETS[${node}]:-${node}}"
  fi
  printf '%s\n' "${target}"
}
|
|
|
|
# Build all requested nodes' VM derivations in a single nix invocation
# and point each node's build-* symlink at its output.
build_vms() {
  local nodes=("$@")
  local node target idx
  local flake_targets=()
  local store_paths=()

  for node in "${nodes[@]}"; do
    target="$(build_target_for_node "${node}")"
    flake_targets+=("${CLUSTER_FLAKE_REF}#nixosConfigurations.${target}.config.system.build.vm")
  done

  log "Building VM derivations in one Nix invocation: ${nodes[*]}"
  mapfile -t store_paths < <(
    NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
      --max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
      --extra-experimental-features 'nix-command flakes' \
      "${flake_targets[@]}" \
      --no-link --print-out-paths
  )

  # nix prints one store path per requested installable, in request order.
  [[ "${#store_paths[@]}" -eq "${#nodes[@]}" ]] || die "expected ${#nodes[@]} VM outputs, got ${#store_paths[@]}"

  for idx in "${!nodes[@]}"; do
    ln -sfn "${store_paths[${idx}]}" "$(build_link "${nodes[${idx}]}")"
  done
}
|
|
|
|
# Build the bootable guest image on the host (or reuse a cached one when
# PHOTON_CLUSTER_REUSE_GUEST_IMAGES=1) and refresh its symlink.
build_guest_image() {
  local link store_path
  link="$(guest_image_link)"

  if reuse_guest_images_requested && [[ -L "${link}" ]] && [[ -e "$(readlink -f "${link}")" ]]; then
    log "Reusing cached bootable VM guest image from $(readlink -f "${link}")"
    return 0
  fi

  log "Building bootable VM guest image on the host"
  store_path="$(
    NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
      --max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
      --extra-experimental-features 'nix-command flakes' \
      "${CLUSTER_FLAKE_REF}#vmGuestImage" \
      --no-link --print-out-paths | tail -n1
  )"
  [[ -n "${store_path}" ]] || die "failed to resolve VM guest image output"
  ln -sfn "${store_path}" "${link}"
}

# Same as build_guest_image, but for the benchmark guest image.
build_guest_bench_image() {
  local link store_path
  link="$(guest_bench_image_link)"

  if reuse_guest_images_requested && [[ -L "${link}" ]] && [[ -e "$(readlink -f "${link}")" ]]; then
    log "Reusing cached VM benchmark guest image from $(readlink -f "${link}")"
    return 0
  fi

  log "Building VM benchmark guest image on the host"
  store_path="$(
    NIX_BUILD_CORES="${CLUSTER_NIX_BUILD_CORES}" nix build -L \
      --max-jobs "${CLUSTER_NIX_MAX_JOBS}" \
      --extra-experimental-features 'nix-command flakes' \
      "${CLUSTER_FLAKE_REF}#vmBenchGuestImage" \
      --no-link --print-out-paths | tail -n1
  )"
  [[ -n "${store_path}" ]] || die "failed to resolve VM benchmark guest image output"
  ln -sfn "${store_path}" "${link}"
}
|
|
|
|
# `build` subcommand: validate and build the requested (or all) nodes.
build_requested() {
  local nodes
  mapfile -t nodes < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${nodes[@]}"
  preflight

  build_vms "${nodes[@]}"
}

# Success when the node's recorded VM PID exists and is alive.
is_running() {
  local pid_path
  pid_path="$(pid_file "$1")"
  [[ -f "${pid_path}" ]] || return 1
  kill -0 "$(<"${pid_path}")" 2>/dev/null
}
|
|
|
|
# Build (if needed) and launch a node's VM in the background, recording
# its PID and log file.  Dies when the SSH forward port is taken or the
# VM exits within the first two seconds.
start_vm() {
  local node="$1"
  local build_path runvm node_runtime pid_path vm_log ssh_port

  ensure_vde_switch

  # Build on demand if the result symlink is missing.
  build_path="$(build_link "${node}")"
  [[ -L "${build_path}" ]] || build_vm "${node}"
  runvm="$(runvm_path "${node}")"
  [[ -n "${runvm}" ]] || die "failed to locate run-*-vm for ${node}"

  node_runtime="$(runtime_dir "${node}")"
  pid_path="$(pid_file "${node}")"
  vm_log="$(log_file "${node}")"
  mkdir -p "${node_runtime}"

  if is_running "${node}"; then
    log "${node} already running (PID $(<"${pid_path}"))"
    return 0
  fi

  # Refuse to start when something already listens on the SSH forward
  # port; otherwise QEMU's hostfwd would fail in non-obvious ways.
  ssh_port="$(ssh_port_for_node "${node}")"
  if ss -H -ltn "( sport = :${ssh_port} )" | grep -q .; then
    warn "port ${ssh_port} is already in use before starting ${node}"
    ss -H -ltnp "( sport = :${ssh_port} )" || true
    die "SSH forward port ${ssh_port} for ${node} is already in use"
  fi

  log "Starting ${node}"
  (
    # Run from the node's runtime dir so the launcher keeps its disk state
    # there; setsid detaches the VM from this script's session.
    cd "${node_runtime}"
    nohup setsid "${runvm}" </dev/null >"${vm_log}" 2>&1 &
    # NOTE(review): $! is the nohup/setsid wrapper's pid; assumes the VM
    # launcher keeps running under this pid (setsid can fork when it is
    # already a process-group leader) — confirm if stop_vm misses VMs.
    echo $! >"${pid_path}"
  )
  # Give the VM a moment to crash early so we can surface its log.
  sleep 2

  if ! is_running "${node}"; then
    warn "${node} failed to stay running; recent log follows"
    tail -n 80 "${vm_log}" || true
    die "failed to start ${node}"
  fi
}
|
|
|
|
# Stop a node's VM: SIGTERM, wait up to 20 seconds, then SIGKILL; the
# PID file is always removed.
stop_vm() {
  local node="$1"
  local pid_path vm_pid

  pid_path="$(pid_file "${node}")"
  if [[ ! -f "${pid_path}" ]]; then
    log "${node} is not running"
    return 0
  fi

  vm_pid="$(<"${pid_path}")"
  if kill -0 "${vm_pid}" 2>/dev/null; then
    log "Stopping ${node} (PID ${vm_pid})"
    kill "${vm_pid}" || true
    for _ in {1..20}; do
      kill -0 "${vm_pid}" 2>/dev/null || break
      sleep 1
    done
    if kill -0 "${vm_pid}" 2>/dev/null; then
      warn "${node} did not stop after SIGTERM; sending SIGKILL"
      kill -9 "${vm_pid}" || true
    fi
  fi

  rm -f "${pid_path}"
}
|
|
|
|
ssh_node() {
|
|
local node="$1"
|
|
shift
|
|
local ssh_port
|
|
ssh_port="$(ssh_port_for_node "${node}")"
|
|
sshpass -p "${SSH_PASSWORD}" \
|
|
ssh "${SSH_OPTS[@]}" -p "${ssh_port}" root@127.0.0.1 "$@"
|
|
}
|
|
|
|
ssh_node_script() {
  # Run a bash script (fed on stdin by the caller, typically a heredoc)
  # on a node; remaining arguments become the script's positional args.
  # `bash -se --` reads from stdin with errexit and stops option parsing.
  local node="$1"
  shift
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${port}" root@127.0.0.1 bash -se -- "$@"
}
|
|
|
|
scp_to_node() {
  # Copy a local file onto a node as root (note scp uses -P for port).
  local node="$1"
  local src="$2"
  local dest="$3"
  local port
  port="$(ssh_port_for_node "${node}")"
  sshpass -p "${SSH_PASSWORD}" \
    scp "${SSH_OPTS[@]}" -P "${port}" "${src}" "root@127.0.0.1:${dest}"
}
|
|
|
|
start_ssh_tunnel() {
  # Establish a background SSH local port-forward from 127.0.0.1:${local_port}
  # to ${remote_host}:${remote_port} as seen from inside the node, using a
  # master-mode connection with a control socket. On success the control
  # socket path is printed so the caller can hand it to stop_ssh_tunnel.
  #
  # $1 - node name
  # $2 - local port to bind
  # $3 - remote port to forward to
  # $4 - remote host from the node's point of view (default 127.0.0.1)
  local node="$1"
  local local_port="$2"
  local remote_port="$3"
  local remote_host="${4:-127.0.0.1}"
  local control_socket
  control_socket="$(vm_dir)/tunnel-${node}-${local_port}.ctl"
  local deadline
  local attempt_deadline
  local ssh_port
  ssh_port="$(ssh_port_for_node "${node}")"

  # A leftover control socket from an earlier run: politely ask its
  # master to exit, then remove the socket so we start fresh.
  if [[ -e "${control_socket}" ]]; then
    sshpass -p "${SSH_PASSWORD}" \
      ssh "${SSH_OPTS[@]}" \
        -S "${control_socket}" \
        -O exit \
        -p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
    rm -f "${control_socket}"
  fi

  # If the local port is still occupied, try to kill a stale forwarder
  # matching our -L spec and give it up to 10 s to release the port.
  if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
    pkill -f -- "ssh .* -L ${local_port}:${remote_host}:${remote_port} " >/dev/null 2>&1 || true
    for _ in {1..10}; do
      if ! ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
        break
      fi
      sleep 1
    done
    if ss -H -ltn "( sport = :${local_port} )" | grep -q .; then
      die "local tunnel port ${local_port} is already in use"
    fi
  fi

  deadline=$((SECONDS + TUNNEL_WAIT_TIMEOUT))
  while true; do
    # Launch a detached master (-M -f -N). ExitOnForwardFailure makes a
    # failed port bind kill the master instead of leaving a silent no-op.
    sshpass -p "${SSH_PASSWORD}" \
      ssh "${SSH_OPTS[@]}" \
        -o ExitOnForwardFailure=yes \
        -S "${control_socket}" \
        -M -f -N \
        -L "${local_port}:${remote_host}:${remote_port}" \
        -p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true

    # Give the master up to 10 s to answer a control-channel check;
    # success means the forward is live and we can return.
    attempt_deadline=$((SECONDS + 10))
    while true; do
      if sshpass -p "${SSH_PASSWORD}" \
        ssh "${SSH_OPTS[@]}" \
          -S "${control_socket}" \
          -O check \
          -p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1; then
        printf '%s\n' "${control_socket}"
        return 0
      fi
      if (( SECONDS >= attempt_deadline )); then
        break
      fi
      sleep 1
    done

    # This attempt failed: tear down any half-started master and retry.
    sshpass -p "${SSH_PASSWORD}" \
      ssh "${SSH_OPTS[@]}" \
        -S "${control_socket}" \
        -O exit \
        -p "${ssh_port}" root@127.0.0.1 >/dev/null 2>&1 || true
    rm -f "${control_socket}"

    if (( SECONDS >= deadline )); then
      # Dump port/process diagnostics before dying to aid debugging.
      warn "failed to establish ssh tunnel for ${node}:${remote_port} on local port ${local_port}"
      ss -H -ltnp "( sport = :${local_port} )" || true
      ps -ef | grep -F -- "-L ${local_port}:${remote_host}:${remote_port}" | grep -v grep || true
      die "ssh tunnel for ${node}:${remote_host}:${remote_port} did not bind local port ${local_port}"
    fi
    sleep 1
  done
}
|
|
|
|
stop_ssh_tunnel() {
  # Tear down a master-mode SSH tunnel identified by its control socket.
  # An empty socket argument means "nothing to stop"; a missing socket
  # file is likewise treated as already stopped.
  local node="$1"
  local control_socket="$2"
  local port
  port="$(ssh_port_for_node "${node}")"

  [[ -n "${control_socket}" ]] || return 0
  if [[ -e "${control_socket}" ]]; then
    sshpass -p "${SSH_PASSWORD}" \
      ssh "${SSH_OPTS[@]}" \
        -S "${control_socket}" \
        -O exit \
        -p "${port}" root@127.0.0.1 >/dev/null 2>&1 || true
    rm -f "${control_socket}"
  fi
}
|
|
|
|
issue_project_admin_token() {
  # Provision a service-account principal with the ProjectAdmin role and
  # print a bearer token for it. Each gRPC step is retried until a
  # deadline and treats AlreadyExists-style errors as success, so the
  # function is idempotent across reruns.
  #
  # $1 - local port forwarded to the IAM gRPC endpoint
  # $2 - org id, $3 - project id, $4 - principal id
  local iam_port="$1"
  local org_id="$2"
  local project_id="$3"
  local principal_id="$4"
  local create_principal_json create_binding_json issue_token_json token deadline output

  create_principal_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
  )"
  # Extra slack over HTTP_WAIT_TIMEOUT: IAM may still be coming up.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT + 180))
  # Step 1: create the principal (ok if it already exists).
  while true; do
    output="$(
      timeout 15 grpcurl -plaintext \
        -import-path "${IAM_PROTO_DIR}" \
        -proto "${IAM_PROTO}" \
        -d "${create_principal_json}" \
        127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
    )" && break
    if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out creating IAM principal ${principal_id}: ${output}"
    fi
    sleep 2
  done

  create_binding_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
  )"
  # Step 2: bind the ProjectAdmin role (ok if the binding already exists).
  while true; do
    output="$(
      timeout 15 grpcurl -plaintext \
        -import-path "${IAM_PROTO_DIR}" \
        -proto "${IAM_PROTO}" \
        -d "${create_binding_json}" \
        127.0.0.1:"${iam_port}" iam.v1.IamAdmin/CreateBinding 2>&1
    )" && break
    if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out creating IAM binding for ${principal_id}: ${output}"
    fi
    sleep 2
  done

  issue_token_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
  )"
  # Step 3: issue a 1-hour token; retry until a non-empty token arrives.
  while true; do
    output="$(
      timeout 15 grpcurl -plaintext \
        -import-path "${IAM_PROTO_DIR}" \
        -proto "${IAM_PROTO}" \
        -d "${issue_token_json}" \
        127.0.0.1:"${iam_port}" iam.v1.IamToken/IssueToken 2>&1
    )" && {
      token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
      if [[ -n "${token}" ]]; then
        break
      fi
    }
    if (( SECONDS >= deadline )); then
      die "timed out issuing IAM token for ${principal_id}: ${output}"
    fi
    sleep 2
  done

  # Step 4: block until the binding is actually effective before
  # handing the token to callers.
  wait_for_project_admin_authorization "${iam_port}" "${org_id}" "${project_id}" "${principal_id}"
  printf '%s\n' "${token}"
}
|
|
|
|
issue_s3_credential() {
  # Create an S3 credential for a service-account principal via IAM and
  # print "ACCESS_KEY_ID<TAB>SECRET_KEY" on success. Retries until both
  # fields come back non-empty or the deadline elapses.
  #
  # $1 - local port forwarded to the IAM gRPC endpoint
  # $2 - principal id, $3 - org id, $4 - project id
  # $5 - optional credential description (default: storage-bench)
  local iam_port="$1"
  local principal_id="$2"
  local org_id="$3"
  local project_id="$4"
  local description="${5:-storage-bench}"
  local create_credential_json access_key_id secret_key deadline output

  create_credential_json="$(
    jq -cn \
      --arg principal "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg description "${description}" \
      '{principalId:$principal, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", orgId:$org, projectId:$project, description:$description}'
  )"

  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT + 120))
  while true; do
    output="$(
      timeout 15 grpcurl -plaintext \
        -import-path "${IAM_PROTO_DIR}" \
        -proto "${IAM_PROTO}" \
        -d "${create_credential_json}" \
        127.0.0.1:"${iam_port}" iam.v1.IamCredential/CreateS3Credential 2>&1
    )" && {
      # Only accept responses where both halves of the credential are set.
      access_key_id="$(printf '%s\n' "${output}" | jq -r '.accessKeyId // empty' 2>/dev/null || true)"
      secret_key="$(printf '%s\n' "${output}" | jq -r '.secretKey // empty' 2>/dev/null || true)"
      if [[ -n "${access_key_id}" && -n "${secret_key}" ]]; then
        printf '%s\t%s\n' "${access_key_id}" "${secret_key}"
        return 0
      fi
    }

    if (( SECONDS >= deadline )); then
      die "timed out issuing IAM S3 credential for ${principal_id}: ${output}"
    fi
    sleep 2
  done
}
|
|
|
|
issue_project_admin_token_any() {
  # Multi-endpoint variant of issue_project_admin_token: cycle through a
  # list of local IAM ports until each provisioning step succeeds on any
  # of them, then print "PORT<TAB>TOKEN" where PORT is the endpoint that
  # confirmed the authorization. Steps tolerate AlreadyExists-style
  # errors, so reruns are idempotent.
  #
  # $1 - org id, $2 - project id, $3 - principal id
  # $4.. - candidate local IAM ports
  local org_id="$1"
  local project_id="$2"
  local principal_id="$3"
  shift 3
  local ports=("$@")
  local create_principal_json create_binding_json issue_token_json token deadline output
  local selected_port="" create_port="" binding_port="" issue_port="" port

  create_principal_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{id:$id, kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", name:$id, orgId:$org, projectId:$project}'
  )"
  create_binding_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, role:"roles/ProjectAdmin", scope:{project:{id:$project, orgId:$org}}}'
  )"
  issue_token_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{principalId:$id, principalKind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", scope:{project:{id:$project, orgId:$org}}, ttlSeconds:3600}'
  )"

  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  # Step 1: create the principal on whichever port answers first.
  while [[ -z "${create_port}" ]]; do
    for port in "${ports[@]}"; do
      output="$(
        timeout 15 grpcurl -plaintext \
          -import-path "${IAM_PROTO_DIR}" \
          -proto "${IAM_PROTO}" \
          -d "${create_principal_json}" \
          127.0.0.1:"${port}" iam.v1.IamAdmin/CreatePrincipal 2>&1
      )" && {
        create_port="${port}"
        break
      }
      if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
        create_port="${port}"
        break
      fi
    done
    if [[ -n "${create_port}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out creating IAM principal ${principal_id}: ${output}"
    fi
    sleep 2
  done

  # Step 2: bind the ProjectAdmin role on whichever port answers first.
  while [[ -z "${binding_port}" ]]; do
    for port in "${ports[@]}"; do
      output="$(
        timeout 15 grpcurl -plaintext \
          -import-path "${IAM_PROTO_DIR}" \
          -proto "${IAM_PROTO}" \
          -d "${create_binding_json}" \
          127.0.0.1:"${port}" iam.v1.IamAdmin/CreateBinding 2>&1
      )" && {
        binding_port="${port}"
        break
      }
      if grep -Eq 'AlreadyExists|already exists|duplicate' <<<"${output}"; then
        binding_port="${port}"
        break
      fi
    done
    if [[ -n "${binding_port}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out creating IAM binding for ${principal_id}: ${output}"
    fi
    sleep 2
  done

  # Step 3: issue the token; a port only counts once it returns a
  # non-empty token field.
  while [[ -z "${issue_port}" ]]; do
    for port in "${ports[@]}"; do
      output="$(
        timeout 15 grpcurl -plaintext \
          -import-path "${IAM_PROTO_DIR}" \
          -proto "${IAM_PROTO}" \
          -d "${issue_token_json}" \
          127.0.0.1:"${port}" iam.v1.IamToken/IssueToken 2>&1
      )" && {
        token="$(printf '%s\n' "${output}" | jq -r '.token // empty' 2>/dev/null || true)"
        if [[ -n "${token}" ]]; then
          issue_port="${port}"
          break
        fi
      }
    done
    if [[ -n "${issue_port}" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out issuing IAM token for ${principal_id}: ${output}"
    fi
    sleep 2
  done

  # Step 4: wait until some endpoint confirms the binding is effective;
  # that endpoint's port is reported back to the caller with the token.
  selected_port="$(wait_for_project_admin_authorization_any "${org_id}" "${project_id}" "${principal_id}" "${ports[@]}")"
  printf '%s\t%s\n' "${selected_port}" "${token}"
}
|
|
|
|
wait_for_project_admin_authorization() {
  # Poll IamAuthz/Authorize with a synthetic bucket-create probe until it
  # reports allowed:true for the principal, proving the ProjectAdmin
  # binding has propagated. Dies on timeout.
  #
  # $1 - local IAM port, $2 - org id, $3 - project id, $4 - principal id
  local iam_port="$1"
  local org_id="$2"
  local project_id="$3"
  local principal_id="$4"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local authorize_json

  authorize_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{
        principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
        action:"storage:buckets:create",
        resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
      }'
  )"

  while true; do
    if timeout 15 grpcurl -plaintext \
      -import-path "${IAM_PROTO_DIR}" \
      -proto "${IAM_PROTO}" \
      -d "${authorize_json}" \
      127.0.0.1:"${iam_port}" iam.v1.IamAuthz/Authorize \
      | jq -e '.allowed == true' >/dev/null 2>&1; then
      return 0
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_project_admin_authorization_any() {
  # Like wait_for_project_admin_authorization, but probes a list of IAM
  # ports each round and prints the first port that answers allowed:true.
  # Dies on timeout.
  #
  # $1 - org id, $2 - project id, $3 - principal id, $4.. - ports
  local org_id="$1"
  local project_id="$2"
  local principal_id="$3"
  shift 3
  local ports=("$@")
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local authorize_json port

  authorize_json="$(
    jq -cn \
      --arg id "${principal_id}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{
        principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id},
        action:"storage:buckets:create",
        resource:{kind:"bucket", id:"authz-probe", orgId:$org, projectId:$project}
      }'
  )"

  while true; do
    for port in "${ports[@]}"; do
      if timeout 15 grpcurl -plaintext \
        -import-path "${IAM_PROTO_DIR}" \
        -proto "${IAM_PROTO}" \
        -d "${authorize_json}" \
        127.0.0.1:"${port}" iam.v1.IamAuthz/Authorize \
        | jq -e '.allowed == true' >/dev/null 2>&1; then
        # Report which endpoint confirmed; callers pin later calls to it.
        printf '%s\n' "${port}"
        return 0
      fi
    done
    if (( SECONDS >= deadline )); then
      die "timed out waiting for IAM ProjectAdmin binding to become effective for ${principal_id}"
    fi
    sleep 2
  done
}
|
|
|
|
ensure_lightningstor_bucket() {
  # Idempotently ensure a LightningStor bucket exists: HeadBucket first,
  # otherwise CreateBucket (AlreadyExists counts as success). Retries
  # both calls until HTTP_WAIT_TIMEOUT elapses.
  #
  # $1 - local LightningStor port, $2 - bearer token, $3 - bucket name,
  # $4 - org id, $5 - project id
  local ls_port="$1"
  local token="$2"
  local bucket="$3"
  local org_id="$4"
  local project_id="$5"
  local head_json create_json

  head_json="$(jq -cn --arg bucket "${bucket}" '{bucket:$bucket}')"
  create_json="$(
    jq -cn \
      --arg bucket "${bucket}" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      '{bucket:$bucket, region:"default", orgId:$org, projectId:$project}'
  )"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  local output=""

  while true; do
    # Fast path: the bucket already exists.
    if timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${head_json}" \
      127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/HeadBucket >/dev/null 2>&1; then
      return 0
    fi

    output="$(
      grpcurl_capture -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
        -proto "${LIGHTNINGSTOR_PROTO}" \
        -d "${create_json}" \
        127.0.0.1:"${ls_port}" lightningstor.v1.BucketService/CreateBucket
    )" && return 0

    # A concurrent creator beat us to it; that is still success.
    if grep -Eq 'AlreadyExists|already exists' <<<"${output}"; then
      return 0
    fi
    if (( SECONDS >= deadline )); then
      die "timed out ensuring LightningStor bucket ${bucket}: ${output}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_lightningstor_write_quorum() {
  # Prove LightningStor can accept writes by putting a throwaway probe
  # object, confirming replica object counts rise on the storage nodes,
  # then deleting the probe and confirming counts return to baseline.
  # Retries only the specific "Not enough healthy nodes" failure; any
  # other PutObject error is treated as fatal.
  #
  # $1 - local LightningStor port, $2 - bearer token, $3 - bucket,
  # $4 - context string used in log/error messages
  local ls_port="$1"
  local token="$2"
  local bucket="$3"
  local context="$4"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  # Unique key so parallel/repeated probes never collide.
  local key="write-quorum-probe-$(date +%s)-$RANDOM"
  local body="quorum-probe-${key}"
  local body_b64 put_json delete_json output status
  local before_node01 before_node04 before_node05

  # Baseline per-node object counts (node01/node04/node05) before the probe.
  read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)
  body_b64="$(printf '%s' "${body}" | base64 -w0)"
  put_json="$(
    jq -cn \
      --arg bucket "${bucket}" \
      --arg key "${key}" \
      --arg body "${body_b64}" \
      '{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
  )"
  delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"

  while true; do
    status=0
    output="$(
      grpcurl_capture -plaintext \
        -H "authorization: Bearer ${token}" \
        -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
        -proto "${LIGHTNINGSTOR_PROTO}" \
        -d "${put_json}" \
        127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/PutObject
    )" || status=$?

    if (( status == 0 )); then
      # Write accepted: verify it actually landed on the replicas, then
      # clean up and verify the delete propagated too.
      wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe"
      output="$(
        grpcurl_capture -plaintext \
          -H "authorization: Bearer ${token}" \
          -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
          -proto "${LIGHTNINGSTOR_PROTO}" \
          -d "${delete_json}" \
          127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/DeleteObject
      )" || die "failed to delete LightningStor write quorum probe for ${context}: ${output}"
      wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "${context} write quorum probe cleanup"
      return 0
    fi

    if (( SECONDS >= deadline )); then
      die "timed out waiting for LightningStor write quorum for ${context}: ${output}"
    fi

    # Only the quorum-not-yet-formed error is retryable.
    if ! grep -q "Not enough healthy nodes" <<<"${output}"; then
      die "unexpected LightningStor write quorum failure for ${context}: ${output}"
    fi

    sleep 2
  done
}
|
|
|
|
download_lightningstor_object_to_file() {
  # Stream a LightningStor object to a local file: GetObject responses are
  # reduced to their bodyChunk fields and base64-decoded into the output.
  # NOTE(review): multiple response messages would feed `base64 -d` a
  # concatenation of independently padded base64 strings — presumably each
  # response carries the whole body or unpadded chunks; verify for
  # multi-chunk objects.
  #
  # $1 - local LightningStor port, $2 - bearer token,
  # $3 - bucket, $4 - object key, $5 - destination file path
  local ls_port="$1"
  local token="$2"
  local bucket="$3"
  local key="$4"
  local output_path="$5"
  local get_json

  get_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
  # Large -max-msg-sz so big objects are not rejected client-side.
  timeout "${GRPCURL_TIMEOUT_SECS}" grpcurl -plaintext \
    -max-msg-sz "${GRPCURL_MAX_MSG_SIZE}" \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "${get_json}" \
    127.0.0.1:"${ls_port}" lightningstor.v1.ObjectService/GetObject \
    | jq -r '.bodyChunk? // empty' \
    | base64 -d >"${output_path}"
}
|
|
|
|
calc_mib_per_s() {
  # Convert a byte count over an elapsed time in nanoseconds to MiB/s,
  # printed with two decimals. Non-positive elapsed time yields "0.00"
  # instead of a division by zero.
  local bytes="$1"
  local elapsed_ns="$2"
  awk -v b="${bytes}" -v ns="${elapsed_ns}" 'BEGIN {
    if (ns <= 0) { print "0.00"; exit }
    printf "%.2f", (b / 1048576.0) / (ns / 1000000000.0)
  }'
}
|
|
|
|
calc_ops_per_s() {
  # Convert an operation count over an elapsed time in nanoseconds to
  # operations/second with two decimals; "0.00" for non-positive time.
  local ops="$1"
  local elapsed_ns="$2"
  awk -v n="${ops}" -v ns="${elapsed_ns}" 'BEGIN {
    if (ns <= 0) { print "0.00"; exit }
    printf "%.2f", n / (ns / 1000000000.0)
  }'
}
|
|
|
|
calc_seconds_from_ns() {
  # Render a nanosecond duration as seconds with two decimals;
  # non-positive input prints "0.00".
  local elapsed_ns="$1"
  awk -v ns="${elapsed_ns}" 'BEGIN {
    if (ns <= 0) { print "0.00"; exit }
    printf "%.2f", ns / 1000000000.0
  }'
}
|
|
|
|
bw_bytes_to_mibps() {
  # Convert a bytes/second rate (as reported by fio) to MiB/s.
  local rate_bytes="$1"
  awk -v v="${rate_bytes}" 'BEGIN { printf "%.2f", v / 1048576.0 }'
}
|
|
|
|
bps_to_mibps() {
  # Convert a bits/second rate (as reported by iperf3) to MiB/s.
  local rate_bits="$1"
  awk -v v="${rate_bits}" 'BEGIN { printf "%.2f", v / 8.0 / 1048576.0 }'
}
|
|
|
|
allocate_free_listener_port() {
  # On the node, scan [start_port, end_port] and print the first TCP port
  # with no active listener (per `ss`). Exits non-zero if the whole range
  # is occupied.
  #
  # $1 - node, $2 - first port to try (default 18080),
  # $3 - last port to try (default 18999)
  local node="$1"
  local start_port="${2:-18080}"
  local end_port="${3:-18999}"

  ssh_node_script "${node}" "${start_port}" "${end_port}" <<'EOS'
set -euo pipefail

start_port="$1"
end_port="$2"

for ((port=start_port; port<=end_port; port++)); do
  # No -H output means nothing is listening on this port.
  if ! ss -ltnH "( sport = :${port} )" | grep -q .; then
    printf '%s\n' "${port}"
    exit 0
  fi
done

exit 1
EOS
}
|
|
|
|
run_remote_fio_json() {
  # Run fio against a regular file on the node and print a one-line JSON
  # summary {bw_bytes, iops} from the first fio job (read stats for read
  # workloads, write stats otherwise). The target file is removed after
  # the run.
  #
  # $1 - node, $2 - target file path, $3 - fio rw mode, $4 - block size,
  # $5 - size in MiB, $6 - runtime secs (0 = size-bound, default),
  # $7 - iodepth (default 1), $8 - ioengine (default sync)
  local node="$1"
  local target_path="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local iodepth="${7:-1}"
  local ioengine="${8:-sync}"

  ssh_node_script "${node}" "${target_path}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${iodepth}" "${ioengine}" <<'EOS'
set -euo pipefail

target_path="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
iodepth="$6"
ioengine="$7"

mkdir -p "$(dirname "${target_path}")"

# Read workloads need a pre-populated file to read from.
if [[ "${rw}" == *read* ]]; then
  dd if=/dev/zero of="${target_path}" bs=1M count="${size_mb}" status=none conv=fsync
fi

fio_args=(
  --name=photon-bench
  --filename="${target_path}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine="${ioengine}"
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)

if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi

# Include the final flush cost in write-bandwidth numbers.
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--end_fsync=1)
fi

result_json="$(fio "${fio_args[@]}")"
rm -f "${target_path}"

if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
|
|
|
|
run_remote_block_fio_json() {
  # Run fio directly against a block device (libaio, O_DIRECT) on the
  # node and print {bw_bytes, iops} from the first job. Unlike the file
  # variant, nothing is created or deleted on the node.
  #
  # $1 - node, $2 - block device path, $3 - fio rw mode, $4 - block size,
  # $5 - size in MiB, $6 - runtime secs (0 = size-bound, default)
  local node="$1"
  local target="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"

  ssh_node_script "${node}" "${target}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" <<'EOS'
set -euo pipefail

target="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"

fio_args=(
  --name=photon-bench
  --filename="${target}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --output-format=json
)

if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi

# Include the final flush cost in write-bandwidth numbers.
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--end_fsync=1)
fi

result_json="$(fio "${fio_args[@]}")"

if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
|
|
|
|
run_remote_dd_read_json() {
  # Time a sequential dd read of an existing file on the node and print
  # {"size_bytes", "duration_ns"}. Fails if the file does not exist.
  # NOTE(review): size_bytes is derived from $3, not measured — assumes
  # the file is exactly ${size_mb} MiB.
  #
  # $1 - node, $2 - file to read, $3 - expected size in MiB
  local node="$1"
  local target_path="$2"
  local size_mb="$3"

  ssh_node_script "${node}" "${target_path}" "${size_mb}" <<'EOS'
set -euo pipefail

target_path="$1"
size_mb="$2"

[[ -f "${target_path}" ]]
start_ns="$(date +%s%N)"
dd if="${target_path}" of=/dev/null bs=1M status=none
end_ns="$(date +%s%N)"

printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
|
|
|
|
coronafs_api_url() {
  # Print the base URL of the CoronaFS HTTP API on localhost.
  # $1 - local port (default 15088). No trailing slash or newline.
  local port="${1:-15088}"
  printf 'http://127.0.0.1:%s' "${port}"
}
|
|
|
|
coronafs_api_request() {
  # Issue a request against the CoronaFS HTTP API and print the body.
  # $1 - local port, $2 - HTTP method, $3 - path (leading slash),
  # $4 - optional JSON payload; when present it is sent with a JSON
  # content-type header.
  local base_port="$1"
  local method="$2"
  local path="$3"
  local payload="${4:-}"
  local url
  url="$(coronafs_api_url "${base_port}")${path}"

  if [[ -z "${payload}" ]]; then
    curl -fsS -X "${method}" "${url}"
  else
    curl -fsS -X "${method}" \
      -H 'content-type: application/json' \
      --data "${payload}" \
      "${url}"
  fi
}
|
|
|
|
coronafs_create_volume() {
  # PUT a new CoronaFS volume of the requested size (bytes).
  local base_port="$1"
  local volume_id="$2"
  local size_bytes="$3"
  local payload
  payload="$(jq -cn --argjson size_bytes "${size_bytes}" '{size_bytes:$size_bytes}')"
  coronafs_api_request "${base_port}" PUT "/v1/volumes/${volume_id}" "${payload}"
}
|
|
|
|
coronafs_export_volume_json() {
  # Ask CoronaFS to export a volume; prints the API's JSON response.
  local base_port="$1"
  local volume_id="$2"
  coronafs_api_request "${base_port}" POST "/v1/volumes/${volume_id}/export"
}
|
|
|
|
coronafs_materialize_volume() {
  # Materialize a volume from a source URI at a given size (bytes).
  local base_port="$1"
  local volume_id="$2"
  local source_uri="$3"
  local size_bytes="$4"
  local payload
  payload="$(
    jq -cn \
      --arg source_uri "${source_uri}" \
      --argjson size_bytes "${size_bytes}" \
      '{source_uri:$source_uri,size_bytes:$size_bytes}'
  )"
  coronafs_api_request "${base_port}" POST "/v1/volumes/${volume_id}/materialize" "${payload}"
}
|
|
|
|
coronafs_get_volume_json() {
  # Fetch a volume's JSON description from the CoronaFS API.
  local base_port="$1"
  local volume_id="$2"
  coronafs_api_request "${base_port}" GET "/v1/volumes/${volume_id}"
}
|
|
|
|
assert_coronafs_materialized_volume() {
  # Succeed only when the volume reports node_local == true and carries
  # a non-null backing path; any other shape makes jq -e fail.
  local base_port="$1"
  local volume_id="$2"
  local volume_json
  volume_json="$(coronafs_get_volume_json "${base_port}" "${volume_id}")"
  jq -e '
    .node_local == true and
    .path != null
  ' >/dev/null <<<"${volume_json}"
}
|
|
|
|
coronafs_volume_export_uri() {
  # Print the volume's export URI, or nothing when it is not exported.
  local base_port="$1"
  local volume_id="$2"
  local volume_json
  volume_json="$(coronafs_get_volume_json "${base_port}" "${volume_id}")"
  jq -r '.export.uri // empty' <<<"${volume_json}"
}
|
|
|
|
coronafs_volume_qemu_ref() {
  # Print a reference QEMU can open for the volume: the export URI when
  # present, otherwise the local backing path, otherwise nothing.
  local base_port="$1"
  local volume_id="$2"
  local volume_json
  volume_json="$(coronafs_get_volume_json "${base_port}" "${volume_id}")"
  jq -r '.export.uri // .path // empty' <<<"${volume_json}"
}
|
|
|
|
coronafs_delete_volume() {
  # Delete a volume; the API response body is deliberately discarded.
  local base_port="$1"
  local volume_id="$2"
  coronafs_api_request "${base_port}" DELETE "/v1/volumes/${volume_id}" >/dev/null
}
|
|
|
|
run_remote_nbd_fio_json() {
  # Attach an NBD export to a /dev/nbdN device on the node via qemu-nbd,
  # run fio (libaio, O_DIRECT) against it, and print {bw_bytes, iops}
  # from the first job. The device is disconnected on exit via a trap.
  #
  # $1 - node, $2 - NBD URI, $3 - fio rw mode, $4 - block size,
  # $5 - size in MiB, $6 - runtime secs (0 = size-bound),
  # $7 - nbd device (default /dev/nbd0), $8 - iodepth (default 1)
  local node="$1"
  local nbd_uri="$2"
  local rw="$3"
  local bs="$4"
  local size_mb="$5"
  local runtime_secs="${6:-0}"
  local nbd_device="${7:-/dev/nbd0}"
  local iodepth="${8:-1}"

  ssh_node_script "${node}" "${nbd_uri}" "${rw}" "${bs}" "${size_mb}" "${runtime_secs}" "${nbd_device}" "${iodepth}" <<'EOS'
set -euo pipefail

nbd_uri="$1"
rw="$2"
bs="$3"
size_mb="$4"
runtime_secs="$5"
nbd_device="$6"
iodepth="$7"

# Pick the qemu-nbd --aio mode: honor PHOTON_TEST_CLUSTER_NBD_AIO_MODE
# when this qemu-nbd build advertises io_uring support; otherwise fall
# back to the universally available "threads".
resolve_qemu_nbd_aio_mode() {
  local requested_mode="${PHOTON_TEST_CLUSTER_NBD_AIO_MODE:-io_uring}"
  if qemu-nbd --help 2>&1 | grep -q "io_uring"; then
    case "${requested_mode}" in
      io_uring|threads|native)
        printf '%s\n' "${requested_mode}"
        return
        ;;
    esac
  fi
  printf 'threads\n'
}

# Make sure the nbd module is loaded and the device is free.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
aio_mode="$(resolve_qemu_nbd_aio_mode)"
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio="${aio_mode}" \
  --connect="${nbd_device}" \
  "${nbd_uri}"
# Always detach the device, even if fio fails.
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT

fio_args=(
  --name=photon-bench
  --filename="${nbd_device}"
  --rw="${rw}"
  --bs="${bs}"
  --size="${size_mb}M"
  --ioengine=libaio
  --direct=1
  --iodepth="${iodepth}"
  --output-format=json
)

if [[ "${runtime_secs}" != "0" ]]; then
  fio_args+=(--runtime="${runtime_secs}" --time_based=1)
fi

# Include the final flush cost in write-bandwidth numbers.
if [[ "${rw}" == *write* ]]; then
  fio_args+=(--end_fsync=1)
fi

result_json="$(fio "${fio_args[@]}")"

if [[ "${rw}" == *read* ]]; then
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].read.bw_bytes // 0), iops:(.jobs[0].read.iops // 0)}'
else
  printf '%s' "${result_json}" | jq -c '{bw_bytes:(.jobs[0].write.bw_bytes // 0), iops:(.jobs[0].write.iops // 0)}'
fi
EOS
}
|
|
|
|
run_remote_nbd_dd_read_json() {
  # Attach an NBD export on the node via qemu-nbd, time a sequential dd
  # read of ${size_mb} MiB from the device, and print
  # {"size_bytes", "duration_ns"}. The device is detached on exit.
  #
  # $1 - node, $2 - NBD URI, $3 - MiB to read,
  # $4 - nbd device (default /dev/nbd0)
  local node="$1"
  local nbd_uri="$2"
  local size_mb="$3"
  local nbd_device="${4:-/dev/nbd0}"

  ssh_node_script "${node}" "${nbd_uri}" "${size_mb}" "${nbd_device}" <<'EOS'
set -euo pipefail

nbd_uri="$1"
size_mb="$2"
nbd_device="$3"

# Same --aio selection policy as run_remote_nbd_fio_json: honor the
# requested mode only when this qemu-nbd build supports io_uring,
# otherwise fall back to "threads".
resolve_qemu_nbd_aio_mode() {
  local requested_mode="${PHOTON_TEST_CLUSTER_NBD_AIO_MODE:-io_uring}"
  if qemu-nbd --help 2>&1 | grep -q "io_uring"; then
    case "${requested_mode}" in
      io_uring|threads|native)
        printf '%s\n' "${requested_mode}"
        return
        ;;
    esac
  fi
  printf 'threads\n'
}

# Make sure the nbd module is loaded and the device is free.
modprobe nbd nbds_max=16 max_part=8 >/dev/null 2>&1 || true
qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true
aio_mode="$(resolve_qemu_nbd_aio_mode)"
qemu-nbd \
  --format=raw \
  --cache=none \
  --aio="${aio_mode}" \
  --connect="${nbd_device}" \
  "${nbd_uri}"
# Always detach the device, even if dd fails.
trap 'qemu-nbd --disconnect "${nbd_device}" >/dev/null 2>&1 || true' EXIT

start_ns="$(date +%s%N)"
dd if="${nbd_device}" of=/dev/null bs=1M count="${size_mb}" status=none
end_ns="$(date +%s%N)"

printf '{"size_bytes":%s,"duration_ns":%s}\n' \
  "$((size_mb * 1024 * 1024))" \
  "$((end_ns - start_ns))"
EOS
}
|
|
|
|
run_remote_iperf_json() {
  # Measure TCP throughput between two nodes with iperf3 and print a
  # compact JSON summary {bits_per_second, retransmits}.
  #
  # A one-shot (-1) iperf3 server is started on ${server_node}, then the
  # client connects from ${client_node} to ${server_ip}. Fix over the
  # previous version: if the client run fails, the one-shot server never
  # serves a test and would linger forever, keeping the allocated port
  # busy for later runs — so on client failure the server is explicitly
  # killed on its node. (The old code captured the server PID but never
  # used it.)
  #
  # $1 - client node, $2 - server node,
  # $3 - server IP as reachable from the client,
  # $4 - test duration in seconds (default 10)
  local client_node="$1"
  local server_node="$2"
  local server_ip="$3"
  local duration_secs="${4:-10}"
  local server_port
  local server_pid
  local client_status=0

  server_port="$(allocate_free_listener_port "${server_node}" 19000 19100)"
  server_pid="$(ssh_node_script "${server_node}" "${server_port}" <<'EOS'
set -euo pipefail

server_port="$1"
log_path="/tmp/iperf3-server-${server_port}.log"
rm -f "${log_path}"
nohup iperf3 -s -1 -p "${server_port}" >"${log_path}" 2>&1 &
printf '%s\n' "$!"
EOS
)"

  # Give the server a moment to bind before the client connects.
  sleep 1

  ssh_node_script "${client_node}" "${server_ip}" "${server_port}" "${duration_secs}" <<'EOS' || client_status=$?
set -euo pipefail

server_ip="$1"
server_port="$2"
duration_secs="$3"

client_json="$(iperf3 -c "${server_ip}" -p "${server_port}" -t "${duration_secs}" -J)"
printf '%s' "${client_json}" | jq -c '{
  bits_per_second: (
    .end.sum_received.bits_per_second //
    .end.sum.bits_per_second //
    .end.sum_sent.bits_per_second //
    0
  ),
  retransmits: (.end.sum_sent.retransmits // 0)
}'
EOS

  if (( client_status != 0 )); then
    # The -1 server only exits after serving one test; reap it so the
    # port is not pinned by a failed run, then propagate the failure.
    ssh_node "${server_node}" "kill ${server_pid} >/dev/null 2>&1 || true" || true
    return "${client_status}"
  fi
}
|
|
|
|
wait_for_plasmavmc_workers_registered() {
  # Poll the PlasmaVMC control plane until both worker nodes (node04 and
  # node05) appear in ListNodes with state NODE_STATE_READY, then return.
  # Dies once the timeout elapses.
  #
  # Fix over the previous version: every other grpcurl poll in this file
  # bounds each probe with `timeout`; this one did not, so a hung
  # connection could block forever and the deadline was never re-checked.
  #
  # $1 - local port forwarded to the control-plane gRPC endpoint
  # $2 - optional timeout in seconds (default HTTP_WAIT_TIMEOUT)
  local vm_port="$1"
  local timeout="${2:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))

  log "Waiting for PlasmaVMC workers to register with the control plane"
  until timeout 15 grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:"${vm_port}" plasmavmc.v1.NodeService/ListNodes \
    | jq -e '
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node04")) != null
      and
      ([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null
    ' >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for PlasmaVMC workers to register"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_ssh() {
  # Poll until `hostname` over SSH returns the node's own name — this
  # proves the *right* VM answers on the forwarded port, not a stale
  # listener. Dies (after dumping the VM log tail) if the VM process
  # exits or the timeout elapses.
  local node="$1"
  local timeout="${2:-${SSH_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local observed_host=""

  log "Waiting for SSH on ${node}"
  while true; do
    observed_host="$(ssh_node "${node}" "hostname" 2>/dev/null || true)"
    if [[ "${observed_host}" == "${node}" ]]; then
      return 0
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for SSH"
    fi
    if (( SECONDS >= deadline )); then
      # If *something* answered under a different hostname, say so.
      if [[ -n "${observed_host}" ]]; then
        warn "SSH on port $(ssh_port_for_node "${node}") answered as '${observed_host}' while waiting for ${node}"
      fi
      tail -n 100 "$(log_file "${node}")" || true
      die "timed out waiting for SSH on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_ssh_down() {
  # Poll until SSH stops answering on the node (e.g. during shutdown);
  # dies if it is still reachable when the timeout elapses.
  local node="$1"
  local timeout="${2:-60}"
  local deadline=$((SECONDS + timeout))

  log "Waiting for SSH to stop on ${node}"
  while ssh_node "${node}" true >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for SSH shutdown on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_unit() {
  # Wait until ${unit}.service on the node reports ActiveState=active
  # with SubState running or exited, and stays that way for 3 consecutive
  # probes (2 s apart) so a briefly-flapping unit is not declared ready.
  # On timeout, dumps the unit's status and journal before dying.
  #
  # $1 - node, $2 - unit name (without .service),
  # $3 - optional timeout seconds (default UNIT_WAIT_TIMEOUT)
  local node="$1"
  local unit="$2"
  local timeout="${3:-${UNIT_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local stable_checks=0
  local required_stable_checks=3

  log "Waiting for ${unit}.service on ${node}"
  while (( stable_checks < required_stable_checks )); do
    # The \$ escapes keep ${state}/${sub} expanding on the VM, not here.
    if ssh_node "${node}" "state=\$(systemctl show --property=ActiveState --value ${unit}.service); sub=\$(systemctl show --property=SubState --value ${unit}.service); [[ \"\${state}\" == active && (\"\${sub}\" == running || \"\${sub}\" == exited) ]]" >/dev/null 2>&1; then
      stable_checks=$((stable_checks + 1))
    else
      # Any failed probe resets the stability streak.
      stable_checks=0
    fi
    if ! is_running "${node}"; then
      tail -n 100 "$(log_file "${node}")" || true
      die "${node} VM process exited while waiting for ${unit}.service"
    fi
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "systemctl status --no-pager ${unit}.service || true" || true
      ssh_node "${node}" "journalctl -u ${unit}.service -n 80 --no-pager || true" || true
      die "timed out waiting for ${unit}.service on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
assert_unit_clean_boot() {
  # Fail if systemd restarted the unit at all since boot (NRestarts != 0),
  # dumping status/journal excerpts first to aid debugging. A value that
  # is not a plain integer (e.g. SSH failure output) is also fatal.
  local node="$1"
  local unit="$2"
  local restart_count

  restart_count="$(ssh_node "${node}" "systemctl show --property=NRestarts --value ${unit}.service" 2>/dev/null || true)"
  restart_count="${restart_count:-0}"

  if [[ ! "${restart_count}" =~ ^[0-9]+$ ]]; then
    ssh_node "${node}" "systemctl status --no-pager ${unit}.service || true" || true
    die "could not determine restart count for ${unit}.service on ${node}: ${restart_count}"
  fi

  if (( restart_count != 0 )); then
    ssh_node "${node}" "systemctl status --no-pager ${unit}.service || true" || true
    ssh_node "${node}" "journalctl -u ${unit}.service -n 120 --no-pager || true" || true
    die "${unit}.service on ${node} restarted ${restart_count} times during boot"
  fi
}
|
|
|
|
wait_for_http() {
  # Poll an HTTP endpoint from inside the node until curl succeeds.
  # $1 node name, $2 URL as seen from the node, $3 optional timeout secs.
  local node="$1"
  local url="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for HTTP endpoint on ${node}: ${url}"
  while ! ssh_node "${node}" "curl -fsS '${url}' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for ${url} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_http_status() {
  # Poll a URL from inside the node until curl reports one of the expected
  # HTTP status codes.
  # $1 node, $2 url, $3 space-separated list of acceptable codes (e.g.
  # "200 403"), $4 optional timeout seconds. Dies on timeout.
  local node="$1"
  local url="$2"
  local expected_codes="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))

  log "Waiting for HTTP status on ${node}: ${url} (${expected_codes})"
  # The remote case-pattern match only matches whole codes because both the
  # observed code and the expected list are padded with spaces.
  until ssh_node "${node}" "code=\$(curl -sS -o /dev/null -w '%{http_code}' '${url}' || true); case \" ${expected_codes} \" in *\" \${code} \"*) exit 0 ;; *) exit 1 ;; esac" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for HTTP status ${expected_codes} from ${url} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_http_body() {
  # Poll a URL from inside the node until the response body is exactly the
  # expected string. On timeout, dump the last body fetched and die.
  # $1 node, $2 url, $3 expected body, $4 optional timeout seconds.
  local node="$1"
  local url="$2"
  local expected="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))

  log "Waiting for HTTP body on ${node}: ${url}"
  # The comparison runs remotely via ssh_node_script so the expected body
  # never has to be shell-quoted into a command string.
  until ssh_node_script "${node}" "${url}" "${expected}" <<'EOF' >/dev/null 2>&1
set -euo pipefail
url="$1"
expected="$2"
body="$(curl -fsS "${url}")"
[[ "${body}" == "${expected}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "curl -fsS '${url}' || true" || true
      die "timed out waiting for expected HTTP body from ${url} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_host_http() {
  # Poll an HTTP endpoint directly from the host (no SSH hop) until curl
  # succeeds. $1 URL, $2 optional timeout seconds.
  local url="$1"
  local timeout="${2:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for host HTTP endpoint: ${url}"
  while ! curl -fsS "${url}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for host HTTP endpoint ${url}"
    fi
    sleep 2
  done
}
|
|
|
|
host_api_request() {
  # Issue an authenticated JSON API request from the host and print the
  # response body on any 2xx status. On every other outcome, dump curl
  # stderr, the response headers, and the body to stderr, then die.
  # $1 stage label (used in error messages), $2 HTTP method, $3 url,
  # $4 bearer token, $5 optional JSON request body.
  local stage="$1"
  local method="$2"
  local url="$3"
  local token="$4"
  local body="${5:-}"
  local response_file headers_file stderr_file http_code

  response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-response-XXXXXX)"
  headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-headers-XXXXXX)"
  stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-host-api-stderr-XXXXXX)"

  # '|| true' keeps a curl transport failure from aborting under set -e;
  # http_code is then empty/000 and falls through to the error path below.
  if [[ -n "${body}" ]]; then
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -H 'Content-Type: application/json' \
        -X "${method}" \
        -d "${body}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  else
    # Same request without a body or Content-Type header.
    http_code="$(
      curl -sS \
        -D "${headers_file}" \
        -o "${response_file}" \
        -w '%{http_code}' \
        -H "Authorization: Bearer ${token}" \
        -X "${method}" \
        "${url}" \
        2>"${stderr_file}" || true
    )"
  fi

  if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
    cat "${response_file}"
    rm -f "${response_file}" "${headers_file}" "${stderr_file}"
    return 0
  fi

  # Failure path: emit everything we captured, clean up, then die.
  log "Host API request failed during ${stage}: ${method} ${url} (status=${http_code:-curl-error})"
  if [[ -s "${stderr_file}" ]]; then
    sed 's/^/[curl] /' "${stderr_file}" >&2
  fi
  if [[ -s "${headers_file}" ]]; then
    sed 's/^/[headers] /' "${headers_file}" >&2
  fi
  if [[ -s "${response_file}" ]]; then
    sed 's/^/[body] /' "${response_file}" >&2
  fi
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  die "host API request failed during ${stage}"
}
|
|
|
|
gateway_api_request() {
  # Issue an authenticated JSON API request against the gateway on node06
  # (reached as http://127.0.0.1:8080 from inside that node) and print the
  # response body on 2xx; otherwise dump diagnostics and die.
  # $1 stage label, $2 HTTP method, $3 request path, $4 bearer token,
  # $5 optional JSON request body.
  local stage="$1"
  local method="$2"
  local request_path="$3"
  local token="$4"
  local body="${5:-}"
  local body_b64=""

  # The body is base64-encoded so it survives being passed as an SSH
  # argument regardless of quoting/special characters.
  if [[ -n "${body}" ]]; then
    body_b64="$(printf '%s' "${body}" | base64 | tr -d '\n')"
  fi

  if ssh_node_script node06 "${method}" "${request_path}" "${token}" "${body_b64}" <<'EOF'
set -euo pipefail

method="$1"
request_path="$2"
token="$3"
body_b64="${4:-}"
url="http://127.0.0.1:8080${request_path}"
response_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-response-XXXXXX)"
headers_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-headers-XXXXXX)"
stderr_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-stderr-XXXXXX)"
body_file=""

cleanup() {
  rm -f "${response_file}" "${headers_file}" "${stderr_file}"
  if [[ -n "${body_file}" ]]; then
    rm -f "${body_file}"
  fi
}
trap cleanup EXIT

if [[ -n "${body_b64}" ]]; then
  body_file="$(mktemp -p "${TMPDIR:-/tmp}" photon-gateway-api-body-XXXXXX)"
  printf '%s' "${body_b64}" | base64 -d >"${body_file}"
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -X "${method}" \
      --data-binary @"${body_file}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
else
  http_code="$(
    curl -sS \
      -D "${headers_file}" \
      -o "${response_file}" \
      -w '%{http_code}' \
      -H "Authorization: Bearer ${token}" \
      -X "${method}" \
      "${url}" \
      2>"${stderr_file}" || true
  )"
fi

if [[ "${http_code}" =~ ^2[0-9][0-9]$ ]]; then
  cat "${response_file}"
  exit 0
fi

echo "status=${http_code:-curl-error}" >&2
if [[ -s "${stderr_file}" ]]; then
  sed 's/^/[curl] /' "${stderr_file}" >&2
fi
if [[ -s "${headers_file}" ]]; then
  sed 's/^/[headers] /' "${headers_file}" >&2
fi
if [[ -s "${response_file}" ]]; then
  sed 's/^/[body] /' "${response_file}" >&2
fi
exit 1
EOF
  then
    return 0
  fi

  # The remote script already printed the diagnostics to our stderr.
  log "Gateway API request failed during ${stage}: ${method} ${request_path}"
  die "gateway API request failed during ${stage}"
}
|
|
|
|
grpc_health_check() {
  # One-shot gRPC health probe on the node; succeeds only when the named
  # service reports SERVING.
  local node="$1" port="$2" service="$3"

  ssh_node "${node}" \
    "grpcurl -plaintext -d '{\"service\":\"${service}\"}' 127.0.0.1:${port} grpc.health.v1.Health/Check | jq -e '.status == \"SERVING\"' >/dev/null"
}
|
|
|
|
wait_for_grpc_health() {
  # Poll the gRPC health endpoint until the named service reports SERVING.
  # $1 node, $2 port, $3 service name, $4 optional timeout seconds.
  local node="$1" port="$2" service="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for gRPC health on ${node}:${port} (${service})"
  while ! grpc_health_check "${node}" "${port}" "${service}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for gRPC health ${service} on ${node}:${port}"
    fi
    sleep 2
  done
}
|
|
|
|
check_tcp_port() {
  # Succeed when the node has a TCP listener bound to the given port.
  local node="$1" port="$2"

  ssh_node "${node}" "ss -H -ltn '( sport = :${port} )' | grep -q ."
}
|
|
|
|
check_udp_port() {
  # Succeed when the node has a UDP socket bound to the given port.
  local node="$1" port="$2"

  ssh_node "${node}" "ss -H -lun '( sport = :${port} )' | grep -q ."
}
|
|
|
|
wait_for_tcp_port() {
  # Poll until something is listening on the given TCP port inside the node.
  # $1 node, $2 port, $3 optional timeout seconds.
  local node="$1" port="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for TCP port ${port} on ${node}"
  while ! check_tcp_port "${node}" "${port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for TCP port ${port} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_udp_port() {
  # Poll until a UDP socket is bound to the given port inside the node.
  # $1 node, $2 port, $3 optional timeout seconds.
  local node="$1" port="$2"
  local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for UDP port ${port} on ${node}"
  while ! check_udp_port "${node}" "${port}" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for UDP port ${port} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_flaredb_region() {
  # Poll FlareDB's region-1 metadata on the node until a leader is elected
  # and all three peers (1,2,3) are registered.
  # $1 node, $2 optional timeout seconds.
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))

  log "Waiting for FlareDB region metadata on ${node}"
  while ! ssh_node "${node}" "curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -e '(.data.leader_id > 0) and ((.data.peers | sort) == [1,2,3])' >/dev/null" >/dev/null 2>&1; do
    if (( SECONDS >= stop_at )); then
      die "timed out waiting for FlareDB region metadata on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_flaredb_route_metadata() {
  # Wait until the FlareDB leader recorded in ChainFire (port 8081) matches
  # the actual region-1 leader reported by FlareDB itself (port 8082), i.e.
  # the published route metadata is current. $1 node, $2 optional timeout.
  local node="$1"
  local timeout="${2:-${FLAREDB_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))

  log "Waiting for FlareDB route metadata on ${node}"
  # The check runs remotely as a small bash script fed on stdin.
  until ssh_node "${node}" "bash -se" <<'EOF' >/dev/null 2>&1
set -euo pipefail
actual="$(curl -fsS http://127.0.0.1:8082/api/v1/regions/1 | jq -r '.data.leader_id')"
recorded="$(curl -fsS http://127.0.0.1:8081/api/v1/kv/flaredb/regions/1 | jq -r '.data.value | fromjson | .leader_id')"
[[ "${actual}" != "0" ]]
[[ "${actual}" == "${recorded}" ]]
EOF
  do
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlareDB route metadata on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
ensure_flaredb_proto_on_node() {
  # Copy the FlareDB kvrpc proto definition onto the node so grpcurl can
  # use it there. $1 node, $2 optional destination directory.
  local node="$1"
  local dest="${2:-/var/lib/photon-test-protos/flaredb}"

  ssh_node "${node}" "install -d -m 0755 ${dest}"
  scp_to_node "${node}" "${FLAREDB_PROTO}" "${dest}/kvrpc.proto"
}
|
|
|
|
vm_runtime_dir_path() {
  # Print the per-VM runtime directory under the plasmavmc libvirt run dir.
  printf '%s\n' "/run/libvirt/plasmavmc/$1"
}
|
|
|
|
vm_console_path() {
  # Print the console log path inside a VM's runtime directory.
  printf '%s\n' "$(vm_runtime_dir_path "$1")/console.log"
}
|
|
|
|
wait_for_vm_console_pattern() {
  # Wait until a nested VM's console log on <node> contains the pattern.
  # A pattern ending in "count=<N>" is treated specially: it matches once a
  # console line with the same prefix reports count >= N, so monotonically
  # increasing counters don't have to hit N exactly.
  # $1 node, $2 nested VM id, $3 pattern, $4 optional timeout seconds.
  local node="$1"
  local vm_id="$2"
  local pattern="$3"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local console_path console_q pattern_q prefix prefix_q target_count

  console_path="$(vm_console_path "${vm_id}")"
  # %q-quote everything interpolated into the remote `bash -lc` string.
  console_q="$(printf '%q' "${console_path}")"
  pattern_q="$(printf '%q' "${pattern}")"

  log "Waiting for VM console output on ${node}: ${pattern}"
  if [[ "${pattern}" =~ ^(.*count=)([0-9]+)$ ]]; then
    prefix="${BASH_REMATCH[1]}"
    target_count="${BASH_REMATCH[2]}"
    prefix_q="$(printf '%q' "${prefix}")"
    # NOTE(review): the three-argument match($0, re, m) used here is a GNU
    # awk extension — assumes gawk is the awk on the node; confirm.
    until ssh_node "${node}" "bash -lc 'test -f ${console_q} && awk -v prefix=${prefix_q} -v target=${target_count} '\''index(\$0, prefix) { if (match(\$0, /count=([0-9]+)/, m) && (m[1] + 0) >= target) found = 1 } END { exit(found ? 0 : 1) }'\'' ${console_q}'" >/dev/null 2>&1; do
      if (( SECONDS >= deadline )); then
        ssh_node "${node}" "bash -lc 'test -f ${console_q} && tail -n 80 ${console_q} || true'" || true
        die "timed out waiting for VM console pattern ${pattern} on ${node}"
      fi
      sleep 2
    done
    return 0
  fi

  # Plain fixed-string search for every other pattern.
  until ssh_node "${node}" "bash -lc 'test -f ${console_q} && grep -F -- ${pattern_q} ${console_q} >/dev/null'" >/dev/null 2>&1; do
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "bash -lc 'test -f ${console_q} && tail -n 80 ${console_q} || true'" || true
      die "timed out waiting for VM console pattern ${pattern} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
read_vm_console_line_matching() {
  # Print the most recent console-log line on the node that contains the
  # given fixed-string pattern. $1 node, $2 nested VM id, $3 pattern.
  local node="$1" vm_id="$2" pattern="$3"
  local console_q pattern_q

  console_q="$(printf '%q' "$(vm_console_path "${vm_id}")")"
  pattern_q="$(printf '%q' "${pattern}")"
  ssh_node "${node}" "bash -lc 'grep -F -- ${pattern_q} ${console_q} | tail -n1'"
}
|
|
|
|
wait_for_qemu_volume_present() {
  # Wait until some qemu-system process on the node references the volume
  # (or, when given, the alternate reference). On timeout, dump the process
  # list and die. $1 node, $2 volume ref, $3 optional alternate ref,
  # $4 optional timeout seconds.
  local node="$1"
  local volume_ref="$2"
  local alternate_ref="${3:-}"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local stop_at=$((SECONDS + timeout))
  local procs

  while :; do
    # The [q] trick keeps pgrep from matching itself over SSH.
    procs="$(ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" 2>/dev/null || true)"
    case "${procs}" in
      *"${volume_ref}"*) return 0 ;;
    esac
    if [[ -n "${alternate_ref}" ]]; then
      case "${procs}" in
        *"${alternate_ref}"*) return 0 ;;
      esac
    fi
    if (( SECONDS >= stop_at )); then
      printf '%s\n' "${procs}" >&2
      die "timed out waiting for qemu to attach ${volume_ref}${alternate_ref:+ or ${alternate_ref}} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
wait_for_qemu_volume_absent() {
  # Wait until no qemu-system process on the node references the volume
  # (and, when given, the alternate reference). Counterpart of
  # wait_for_qemu_volume_present. $1 node, $2 volume ref, $3 optional
  # alternate ref, $4 optional timeout seconds.
  local node="$1"
  local volume_ref="$2"
  local alternate_ref="${3:-}"
  local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
  local deadline=$((SECONDS + timeout))
  local qemu_processes

  while true; do
    # The [q] trick keeps pgrep from matching itself over SSH.
    qemu_processes="$(ssh_node "${node}" "pgrep -fa '[q]emu-system' || true" 2>/dev/null || true)"
    if [[ "${qemu_processes}" != *"${volume_ref}"* ]] && [[ -z "${alternate_ref}" || "${qemu_processes}" != *"${alternate_ref}"* ]]; then
      return 0
    fi
    if (( SECONDS >= deadline )); then
      printf '%s\n' "${qemu_processes}" >&2
      # BOTH references must be released before we return, so the timeout
      # message says "and" (the old "or" misstated the condition).
      die "timed out waiting for qemu to release ${volume_ref}${alternate_ref:+ and ${alternate_ref}} on ${node}"
    fi
    sleep 2
  done
}
|
|
|
|
try_get_vm_json() {
  # Fetch a VM record from the local PlasmaVMC gRPC API; prints the JSON
  # response on success. $1 bearer token, $2 GetVm request JSON,
  # $3 optional port (default 15082).
  local token="$1"
  local request_json="$2"
  local vm_port="${3:-15082}"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${request_json}" \
    "127.0.0.1:${vm_port}" plasmavmc.v1.VmService/GetVm
}
|
|
|
|
try_get_volume_json() {
  # Fetch a volume record from the local PlasmaVMC gRPC API; prints the
  # JSON response on success. $1 bearer token, $2 GetVolume request JSON,
  # $3 optional port (default 15082).
  local token="$1"
  local request_json="$2"
  local vm_port="${3:-15082}"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${request_json}" \
    "127.0.0.1:${vm_port}" plasmavmc.v1.VolumeService/GetVolume
}
|
|
|
|
start_plasmavmc_vm_watch() {
  # Start a background grpcurl WatchVm stream on the node, writing events
  # to <output_path>, stderr to <output_path>.stderr and the PID to
  # <output_path>.pid. timeout(1) caps the stream at 600s so it can never
  # hang the harness forever.
  # $1 node, $2 proto root dir on the node, $3 bearer token, $4 org id,
  # $5 project id, $6 vm id, $7 output path on the node.
  local node="$1"
  local proto_root="$2"
  local token="$3"
  local org_id="$4"
  local project_id="$5"
  local vm_id="$6"
  local output_path="$7"

  ssh_node_script "${node}" "${proto_root}" "${token}" "${org_id}" "${project_id}" "${vm_id}" "${output_path}" <<'EOS'
set -euo pipefail
proto_root="$1"
token="$2"
org_id="$3"
project_id="$4"
vm_id="$5"
output_path="$6"
rm -f "${output_path}" "${output_path}.pid" "${output_path}.stderr"
nohup timeout 600 grpcurl -plaintext \
  -H "authorization: Bearer ${token}" \
  -import-path "${proto_root}/plasmavmc" \
  -proto "${proto_root}/plasmavmc/plasmavmc.proto" \
  -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
  127.0.0.1:50082 plasmavmc.v1.VmService/WatchVm \
  >"${output_path}" 2>"${output_path}.stderr" &
echo $! >"${output_path}.pid"
EOS
}
|
|
|
|
wait_for_plasmavmc_vm_watch_completion() {
  # Wait for a watch stream started by start_plasmavmc_vm_watch to exit:
  # either its PID file is gone or the recorded PID is no longer alive.
  # On timeout, dump the stream's stderr and captured output before dying.
  # $1 node, $2 output path used when starting the watch, $3 optional
  # timeout seconds (default 60).
  local node="$1"
  local output_path="$2"
  local timeout="${3:-60}"
  local deadline=$((SECONDS + timeout))

  while true; do
    # Remote probe: exit 0 = stream finished, exit 1 = still running.
    if ssh_node_script "${node}" "${output_path}" <<'EOS'
set -euo pipefail
output_path="$1"
if [[ ! -f "${output_path}.pid" ]]; then
  exit 0
fi
pid="$(cat "${output_path}.pid")"
if kill -0 "${pid}" >/dev/null 2>&1; then
  exit 1
fi
EOS
    then
      return 0
    fi
    if (( SECONDS >= deadline )); then
      ssh_node "${node}" "test -f ${output_path}.stderr && cat ${output_path}.stderr || true" >&2 || true
      ssh_node "${node}" "test -f ${output_path} && cat ${output_path} || true" >&2 || true
      die "timed out waiting for PlasmaVMC watch stream to exit"
    fi
    sleep 1
  done
}
|
|
|
|
assert_plasmavmc_vm_watch_events() {
  # Assert that the captured WatchVm stream contains, for the given VM, a
  # RUNNING state-change, a STOPPED state-change, and a DELETED event.
  # On failure, the stream output and its stderr are dumped before exiting
  # non-zero. $1 node, $2 output path of the watch stream, $3 vm id.
  local node="$1"
  local output_path="$2"
  local vm_id="$3"

  ssh_node_script "${node}" "${output_path}" "${vm_id}" <<'EOS'
set -euo pipefail
output_path="$1"
vm_id="$2"
[[ -s "${output_path}" ]] || {
echo "PlasmaVMC watch output is empty" >&2
test -f "${output_path}.stderr" && cat "${output_path}.stderr" >&2 || true
exit 1
}
jq -s --arg vm "${vm_id}" '
any(.vmId == $vm and .eventType == "VM_EVENT_TYPE_STATE_CHANGED" and .vm.state == "VM_STATE_RUNNING") and
any(.vmId == $vm and .eventType == "VM_EVENT_TYPE_STATE_CHANGED" and .vm.state == "VM_STATE_STOPPED") and
any(.vmId == $vm and .eventType == "VM_EVENT_TYPE_DELETED")
' "${output_path}" >/dev/null || {
cat "${output_path}" >&2
test -f "${output_path}.stderr" && cat "${output_path}.stderr" >&2 || true
exit 1
}
EOS
}
|
|
|
|
wait_requested() {
  # 'wait' subcommand: wait for SSH on the requested nodes (or all nodes
  # when no arguments are given).
  local nodes node
  mapfile -t nodes < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${nodes[@]}"
  preflight

  for node in "${nodes[@]}"; do
    wait_for_ssh "${node}"
  done
}
|
|
|
|
start_requested() {
  # 'start' subcommand: build the VM derivations (unless skipped), start
  # the requested VMs, and wait for SSH. With no explicit node arguments,
  # nodes are started phase by phase (NODE_PHASES) so each phase is fully
  # reachable before the next begins.
  local nodes
  mapfile -t nodes < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${nodes[@]}"
  preflight
  if [[ "${CLUSTER_SKIP_BUILD}" == "1" ]]; then
    local node
    for node in "${nodes[@]}"; do
      # Skipping the build is only valid when a previous build left a link.
      [[ -L "$(build_link "${node}")" ]] || die "missing VM build link for ${node} while PHOTON_CLUSTER_SKIP_BUILD=1"
    done
    log "Skipping VM build because PHOTON_CLUSTER_SKIP_BUILD=1"
  else
    build_vms "${nodes[@]}"
  fi

  if [[ "$#" -eq 0 ]]; then
    local phase node
    for phase in "${NODE_PHASES[@]}"; do
      # Each phase is a space-separated list of node names, so the
      # expansion is intentionally unquoted.
      for node in ${phase}; do
        start_vm "${node}"
      done
      for node in ${phase}; do
        wait_for_ssh "${node}"
      done
    done
  else
    local node
    for node in "${nodes[@]}"; do
      start_vm "${node}"
    done
    for node in "${nodes[@]}"; do
      wait_for_ssh "${node}"
    done
  fi
}
|
|
|
|
validate_units() {
  # Wait for the expected systemd units across the full cluster topology:
  # control plane (node01-03), workers (node04-05), and gateway (node06).
  local node unit

  # Consensus stores come up first on every control-plane node.
  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done

  # FlareDB must have elected a region leader before dependent services.
  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done

  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done

  # node01 additionally runs the full service stack.
  for unit in prismnet flashdns fiberlb plasmavmc lightningstor coronafs k8shost; do
    wait_for_unit node01 "${unit}"
  done

  # NODE_UNITS values are space-separated lists of unit names, so the
  # expansion is intentionally unquoted.
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
  done

  for unit in ${NODE_UNITS[node06]}; do
    wait_for_unit node06 "${unit}"
  done
}
|
|
|
|
validate_storage_units() {
  # Storage-benchmark variant of validate_units: waits only for the units
  # the storage path needs and additionally asserts that plasmavmc and
  # lightningstor booted without any restarts.
  local node unit

  for node in node01 node02 node03; do
    wait_for_unit "${node}" chainfire
    wait_for_unit "${node}" flaredb
  done

  for node in node01 node02 node03; do
    wait_for_flaredb_region "${node}"
  done

  for node in node01 node02 node03; do
    wait_for_unit "${node}" iam
  done

  for unit in plasmavmc lightningstor coronafs; do
    wait_for_unit node01 "${unit}"
  done
  assert_unit_clean_boot node01 plasmavmc
  assert_unit_clean_boot node01 lightningstor

  # NODE_UNITS values are space-separated lists, hence the intentionally
  # unquoted expansion.
  for node in node04 node05; do
    for unit in ${NODE_UNITS[${node}]}; do
      wait_for_unit "${node}" "${unit}"
    done
    assert_unit_clean_boot "${node}" plasmavmc
    assert_unit_clean_boot "${node}" lightningstor
  done
}
|
|
|
|
validate_storage_control_plane() {
  # Smoke-validate the storage control plane: health endpoints, CoronaFS
  # capability modes (combined on node01, node-only on node04/05), and a
  # full CoronaFS volume lifecycle including metadata persistence in
  # ChainFire across a coronafs restart.
  wait_for_http node01 http://127.0.0.1:8081/health
  wait_for_http node01 http://127.0.0.1:8082/health
  wait_for_http node01 http://127.0.0.1:8083/health
  wait_for_http node01 http://127.0.0.1:8084/health
  wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 9000
  # node01 runs CoronaFS in combined mode: both node and controller APIs.
  ssh_node_script node01 <<'EOS'
set -euo pipefail
capabilities="$(curl -fsS http://127.0.0.1:50088/v1/capabilities)"
printf '%s' "${capabilities}" | grep -q '"mode":"combined"'
printf '%s' "${capabilities}" | grep -q '"supports_node_api":true'
printf '%s' "${capabilities}" | grep -q '"supports_controller_api":true'
EOS
  wait_for_http node04 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
  wait_for_http node05 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"
  # Workers run node mode only: node API yes, controller API no.
  ssh_node_script node04 <<'EOS'
set -euo pipefail
capabilities="$(curl -fsS http://127.0.0.1:50088/v1/capabilities)"
printf '%s' "${capabilities}" | grep -q '"mode":"node"'
printf '%s' "${capabilities}" | grep -q '"supports_node_api":true'
printf '%s' "${capabilities}" | grep -q '"supports_controller_api":false'
EOS
  ssh_node_script node05 <<'EOS'
set -euo pipefail
capabilities="$(curl -fsS http://127.0.0.1:50088/v1/capabilities)"
printf '%s' "${capabilities}" | grep -q '"mode":"node"'
printf '%s' "${capabilities}" | grep -q '"supports_node_api":true'
printf '%s' "${capabilities}" | grep -q '"supports_controller_api":false'
EOS
  # Volume lifecycle on node01: create, export (rw -> ro -> rw), verify the
  # ChainFire metadata record, restart coronafs and confirm the volume
  # survives without its export, re-export, then delete and confirm the
  # metadata is removed.
  ssh_node_script node01 <<'EOS'
set -euo pipefail
volume="coronafs-chainfire-smoke-$(date +%s)"
prefix="/coronafs/test-cluster/storage/volumes"
cleanup() {
curl -fsS -X DELETE "http://127.0.0.1:50088/v1/volumes/${volume}" >/dev/null 2>&1 || true
}
trap cleanup EXIT

create_response="$(curl -fsS -X PUT \
-H 'content-type: application/json' \
-d '{"size_bytes":67108864}' \
"http://127.0.0.1:50088/v1/volumes/${volume}")"
printf '%s' "${create_response}" | jq -e --arg id "${volume}" '.id == $id' >/dev/null

export_response="$(curl -fsS -X POST "http://127.0.0.1:50088/v1/volumes/${volume}/export")"
printf '%s' "${export_response}" | jq -e '.export.uri != null and .export.port != null and .export.read_only == false' >/dev/null

readonly_response="$(curl -fsS -X POST "http://127.0.0.1:50088/v1/volumes/${volume}/export?read_only=true")"
printf '%s' "${readonly_response}" | jq -e '.export.uri != null and .export.port != null and .export.read_only == true' >/dev/null

rewritable_response="$(curl -fsS -X POST "http://127.0.0.1:50088/v1/volumes/${volume}/export")"
printf '%s' "${rewritable_response}" | jq -e '.export.uri != null and .export.port != null and .export.read_only == false' >/dev/null

curl -fsS --get http://127.0.0.1:8081/api/v1/kv \
--data-urlencode "prefix=${prefix}/${volume}" \
| jq -e --arg key "${prefix}/${volume}" --arg id "${volume}" '
.data.items | length == 1 and
.[0].key == $key and
((.[0].value | fromjson).id == $id)
' >/dev/null

systemctl restart coronafs
for _ in $(seq 1 30); do
if curl -fsS http://127.0.0.1:50088/healthz >/dev/null 2>&1; then
break
fi
sleep 1
done
curl -fsS http://127.0.0.1:50088/healthz >/dev/null

after_restart="$(curl -fsS "http://127.0.0.1:50088/v1/volumes/${volume}")"
printf '%s' "${after_restart}" | jq -e --arg id "${volume}" '.id == $id and (.export == null)' >/dev/null

reexport_response="$(curl -fsS -X POST "http://127.0.0.1:50088/v1/volumes/${volume}/export")"
printf '%s' "${reexport_response}" | jq -e '.export.uri != null and .export.port != null and .export.read_only == false' >/dev/null

curl -fsS -X DELETE "http://127.0.0.1:50088/v1/volumes/${volume}" >/dev/null
if curl -fsS "http://127.0.0.1:8081/api/v1/kv/${prefix#"/"}/${volume}" >/tmp/coronafs-chainfire-delete.out 2>/dev/null; then
echo "ChainFire metadata still exists for deleted CoronaFS volume ${volume}" >&2
cat /tmp/coronafs-chainfire-delete.out >&2 || true
exit 1
fi
trap - EXIT
EOS
  wait_for_http node02 http://127.0.0.1:8081/health
  wait_for_http node02 http://127.0.0.1:8082/health
  wait_for_http node02 http://127.0.0.1:8083/health
  wait_for_http node03 http://127.0.0.1:8081/health
  wait_for_http node03 http://127.0.0.1:8082/health
  wait_for_http node03 http://127.0.0.1:8083/health
}
|
|
|
|
validate_control_plane() {
  # Full control-plane validation: health/port checks on node01-03, then
  # end-to-end replication checks for ChainFire (8081) and FlareDB (8082),
  # and a strong-consistency CAS round trip over FlareDB's gRPC port.
  wait_for_http node01 http://127.0.0.1:8081/health
  wait_for_http node01 http://127.0.0.1:8082/health
  wait_for_http node01 http://127.0.0.1:8083/health
  wait_for_http node01 http://127.0.0.1:8087/health
  wait_for_http node01 http://127.0.0.1:8084/health
  wait_for_http node01 http://127.0.0.1:8085/health
  wait_for_http node02 http://127.0.0.1:8081/health
  wait_for_http node02 http://127.0.0.1:8082/health
  wait_for_http node02 http://127.0.0.1:8083/health
  wait_for_http node03 http://127.0.0.1:8081/health
  wait_for_http node03 http://127.0.0.1:8082/health
  wait_for_http node03 http://127.0.0.1:8083/health

  # Service listeners and metrics endpoints on node01.
  wait_for_tcp_port node01 50084
  wait_for_http node01 http://127.0.0.1:9097/metrics
  wait_for_udp_port node01 5353
  wait_for_tcp_port node01 50085
  wait_for_http node01 http://127.0.0.1:9098/metrics
  wait_for_tcp_port node01 50086
  wait_for_tcp_port node01 50090
  # Port 9000 may legitimately answer 403 before credentials exist.
  wait_for_http_status node01 http://127.0.0.1:9000 "200 403"
  wait_for_http node01 http://127.0.0.1:9099/metrics
  wait_for_http node01 http://127.0.0.1:9198/metrics

  # Write a key via whichever ChainFire node accepts the PUT (the leader)
  # and wait for the value to become readable on all three nodes.
  log "Validating ChainFire replication across control-plane nodes"
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
leader=""
for ip in "${nodes[@]}"; do
code="$(curl -sS -o /tmp/chainfire-put.out -w '%{http_code}' \
-X PUT "http://${ip}:8081/api/v1/kv/${key}" \
-H 'Content-Type: application/json' \
-d "{\"value\":\"${value}\"}" || true)"
if [[ "${code}" == "200" ]]; then
leader="${ip}"
break
fi
done
[[ -n "${leader}" ]]
curl -fsS http://10.100.0.11:8081/api/v1/cluster/status | jq -e '.data.term >= 1' >/dev/null
for ip in "${nodes[@]}"; do
deadline=$((SECONDS + 30))
while true; do
actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
if [[ "${actual}" == "${value}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire replication did not converge on ${ip}" >&2
exit 1
fi
sleep 1
done
done
EOS

  # Same pattern for FlareDB: write via whichever node accepts the PUT,
  # then scan for the value on all three nodes until it converges.
  log "Validating FlareDB replication across control-plane nodes"
  wait_for_flaredb_region node01
  wait_for_flaredb_region node02
  wait_for_flaredb_region node03
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="validation-flaredb-$(date +%s)"
value="ok-$RANDOM"
namespace="validation"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
writer=""
for ip in "${nodes[@]}"; do
code="$(curl -sS -o /tmp/flaredb-put.out -w '%{http_code}' \
-X PUT "http://${ip}:8082/api/v1/kv/${key}" \
-H 'Content-Type: application/json' \
-d "{\"value\":\"${value}\",\"namespace\":\"${namespace}\"}" || true)"
if [[ "${code}" == "200" ]]; then
writer="${ip}"
break
fi
done
[[ -n "${writer}" ]]
for ip in "${nodes[@]}"; do
deadline=$((SECONDS + 120))
while true; do
actual="$(curl -fsS --get "http://${ip}:8082/api/v1/scan" \
--data-urlencode "start=${key}" \
--data-urlencode "end=${key}~" \
--data-urlencode "namespace=${namespace}" 2>/dev/null \
| jq -r '.data.items[0].value // empty' 2>/dev/null || true)"
if [[ "${actual}" == "${value}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "flaredb replication did not converge on ${ip}" >&2
exit 1
fi
sleep 1
done
done
EOS
  wait_for_flaredb_route_metadata node01

  # Exercise the gRPC CompareAndSwap path (port 2479) with base64-encoded
  # key/value, then read the value back from the node that accepted the CAS.
  log "Validating FlareDB strong-consistency CAS on the control plane"
  local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
  ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
  ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="validation-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.12 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"default"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"default"}')"
writer=""
for ip in "${nodes[@]}"; do
if grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${request}" \
"${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-cas.out 2>/dev/null; then
if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-cas.out >/dev/null; then
writer="${ip}"
break
fi
fi
done
[[ -n "${writer}" ]]
deadline=$((SECONDS + 90))
while true; do
if grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${get_request}" \
"${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-cas-get.out 2>/dev/null; then
if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-cas-get.out >/dev/null; then
break
fi
fi
if (( SECONDS >= deadline )); then
echo "flaredb strong CAS read did not converge on leader ${writer}" >&2
exit 1
fi
sleep 1
done
EOS
}
|
|
|
|
validate_iam_flow() {
  # Validate the IAM service end to end through an SSH tunnel to node01:
  # issue a project-admin token, validate its claims, then check that
  # authorization allows an in-project action and denies a cross-project
  # one.
  log "Validating IAM token issuance, validation, and scoped authorization"

  local iam_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  # Ensure the tunnel is torn down on any early return/die path.
  trap 'stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="iam-smoke-org"
  local project_id="iam-smoke-project"
  local principal_id="iam-smoke-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  # Token claims must round-trip through ValidateToken.
  grpcurl -plaintext \
    -import-path "${IAM_PROTO_DIR}" \
    -proto "${IAM_PROTO}" \
    -d "$(jq -cn --arg token "${token}" '{token:$token}')" \
    127.0.0.1:15080 iam.v1.IamToken/ValidateToken \
    | jq -e --arg org "${org_id}" --arg project "${project_id}" --arg principal "${principal_id}" \
      '.valid == true and .claims.orgId == $org and .claims.projectId == $project and .claims.principalId == $principal' >/dev/null

  # A same-project action must be allowed...
  grpcurl -plaintext \
    -import-path "${IAM_PROTO_DIR}" \
    -proto "${IAM_PROTO}" \
    -d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
      '{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"allow-check", orgId:$org, projectId:$project}}')" \
    127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
    | jq -e '.allowed == true' >/dev/null

  # ...and the same action against a different project must be denied.
  grpcurl -plaintext \
    -import-path "${IAM_PROTO_DIR}" \
    -proto "${IAM_PROTO}" \
    -d "$(jq -cn --arg id "${principal_id}" --arg org "${org_id}" --arg project "${project_id}" \
      '{principal:{kind:"PRINCIPAL_KIND_SERVICE_ACCOUNT", id:$id}, action:"storage:buckets:create", resource:{kind:"bucket", id:"deny-check", orgId:$org, projectId:($project + "-other")}}')" \
    127.0.0.1:15080 iam.v1.IamAuthz/Authorize \
    | jq -e '(.allowed // false) == false' >/dev/null

  trap - RETURN
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_prismnet_flow() {
  # End-to-end lifecycle validation of the PrismNet SDN control plane:
  # VPC create/get/list, subnet create/get, port create/get/list/update,
  # then delete port -> subnet -> VPC in reverse order. All RPCs go through
  # local SSH tunnels to node01 (IAM on 15080, PrismNet on 15081).
  log "Validating PrismNet VPC, subnet, and port lifecycle"

  local iam_tunnel="" prism_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
  # Guarantee tunnel teardown even if a validation step dies mid-function.
  trap 'stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="prismnet-smoke-org"
  local project_id="prismnet-smoke-project"
  # Timestamp suffix keeps the principal unique across repeated smoke runs.
  local principal_id="prismnet-smoke-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  local vpc_resp subnet_resp port_resp
  local vpc_id subnet_id port_id

  # VPC creation uses a retry helper (service may still be warming up).
  vpc_resp="$(create_prismnet_vpc_with_retry \
    "${token}" \
    "${org_id}" \
    "${project_id}" \
    "prismnet-smoke-vpc" \
    "smoke vpc" \
    "10.44.0.0/16")"
  vpc_id="$(printf '%s' "${vpc_resp}" | jq -r '.vpc.id')"
  [[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "PrismNet CreateVpc did not return a VPC ID"

  # The new VPC must be visible through both GetVpc and ListVpcs.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
    127.0.0.1:15081 prismnet.VpcService/GetVpc \
    | jq -e --arg id "${vpc_id}" '.vpc.id == $id' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, pageSize:100, pageToken:""}')" \
    127.0.0.1:15081 prismnet.VpcService/ListVpcs \
    | jq -e --arg id "${vpc_id}" '.vpcs | any(.id == $id)' >/dev/null

  # Subnet inside the VPC, with DHCP and an explicit gateway.
  subnet_resp="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"prismnet-smoke-subnet", description:"smoke subnet", cidrBlock:"10.44.10.0/24", gatewayIp:"10.44.10.1", dhcpEnabled:true}')" \
    127.0.0.1:15081 prismnet.SubnetService/CreateSubnet)"
  subnet_id="$(printf '%s' "${subnet_resp}" | jq -r '.subnet.id')"
  [[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "PrismNet CreateSubnet did not return a subnet ID"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
    127.0.0.1:15081 prismnet.SubnetService/GetSubnet \
    | jq -e --arg id "${subnet_id}" '.subnet.id == $id' >/dev/null

  # Port with ipAddress:"" — the service is expected to allocate an address
  # (verified by the length check on .port.ipAddress below).
  port_resp="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"prismnet-smoke-port", description:"smoke port", ipAddress:""}')" \
    127.0.0.1:15081 prismnet.PortService/CreatePort)"
  port_id="$(printf '%s' "${port_resp}" | jq -r '.port.id')"
  [[ -n "${port_id}" && "${port_id}" != "null" ]] || die "PrismNet CreatePort did not return a port ID"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
    127.0.0.1:15081 prismnet.PortService/GetPort \
    | jq -e --arg id "${port_id}" '.port.id == $id and (.port.ipAddress | length) > 0' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, deviceId:"", pageSize:100, pageToken:""}')" \
    127.0.0.1:15081 prismnet.PortService/ListPorts \
    | jq -e --arg id "${port_id}" '.ports | any(.id == $id)' >/dev/null
  # UpdatePort: rename and set adminStateUp=false; `// false` tolerates the
  # proto3 behavior of omitting false booleans from the JSON response.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id, name:"prismnet-smoke-port-updated", description:"updated", securityGroupIds:[], adminStateUp:false}')" \
    127.0.0.1:15081 prismnet.PortService/UpdatePort \
    | jq -e '.port.name == "prismnet-smoke-port-updated" and (.port.adminStateUp // false) == false' >/dev/null

  # Teardown in dependency order: port, then subnet, then VPC.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
    127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
    127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
    127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null

  # Success path: clear the trap, then close the tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${prism_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_flashdns_flow() {
  # Validates the FlashDNS control plane and authoritative data path:
  # create a zone and an A record over gRPC, poll the on-node resolver
  # (dig on port 5353) until the answer propagates, then delete both.
  # Uses SSH tunnels to node01: IAM on 15080, FlashDNS on 15084.
  log "Validating FlashDNS zone, record, and authoritative query flow"

  local iam_tunnel="" dns_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
  # Guarantee tunnel teardown even if a validation step dies mid-function.
  trap 'stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="flashdns-smoke-org"
  local project_id="flashdns-smoke-project"
  # Timestamped principal/zone names keep repeated runs from colliding.
  local principal_id="flashdns-smoke-$(date +%s)"
  local token zone_name zone_resp zone_id record_resp record_id fqdn
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  zone_name="smoke-$(date +%s).cluster.test"

  zone_resp="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.smoke.test", adminEmail:"admin@smoke.test"}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone)"
  zone_id="$(printf '%s' "${zone_resp}" | jq -r '.zone.id')"
  [[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "FlashDNS CreateZone did not return a zone ID"

  # GetZone must round-trip; accept the name with or without the trailing
  # dot since the service may canonicalize it to FQDN form.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg id "${zone_id}" '{id:$id}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/GetZone \
    | jq -e --arg id "${zone_id}" --arg name "${zone_name}" \
      '.zone.id == $id and (.zone.name == $name or .zone.name == ($name + "."))' >/dev/null

  # A record "api" -> 10.100.0.11, TTL 60.
  record_resp="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"api", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord)"
  record_id="$(printf '%s' "${record_resp}" | jq -r '.record.id')"
  [[ -n "${record_id}" && "${record_id}" != "null" ]] || die "FlashDNS CreateRecord did not return a record ID"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/GetRecord \
    | jq -e --arg id "${record_id}" '.record.id == $id' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, nameFilter:"", typeFilter:"", pageSize:100, pageToken:""}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/ListRecords \
    | jq -e --arg id "${record_id}" '.records | any(.id == $id)' >/dev/null

  # Poll the authoritative resolver on node01 until the record is served
  # (control-plane write -> data-plane serving may lag).
  fqdn="api.${zone_name}"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for authoritative FlashDNS answer for ${fqdn}"
    fi
    sleep 2
  done

  # Cleanup: record first, then force-delete the zone.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null

  # Success path: clear the trap, then close the tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${dns_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_fiberlb_flow() {
  # Validates the FiberLB load balancer end to end:
  #   1. management API lifecycle: LB -> pool -> backend -> listener
  #   2. the listener actually serves /health on its TCP port
  #   3. CPU behavior under synthetic load (peak measured, then must settle)
  #   4. failover: disabling the only backend stops traffic, re-enabling
  #      restores it
  # Uses SSH tunnels to node01: IAM on 15080, FiberLB on 15085.
  log "Validating FiberLB management API, runtime listeners, and backend failover behavior"

  local iam_tunnel="" lb_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  # Guarantee tunnel teardown even if a validation step dies mid-function.
  trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="fiberlb-smoke-org"
  local project_id="fiberlb-smoke-project"
  local principal_id="fiberlb-smoke-$(date +%s)"
  local token lb_id pool_id backend_id listener_id listener_port
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  # Random port in 18080-18179 to avoid collisions between runs.
  listener_port=$((18080 + (RANDOM % 100)))

  lb_id="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg name "fiberlb-smoke-lb" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, description:"smoke lb"}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer \
    | jq -r '.loadbalancer.id')"
  [[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "FiberLB CreateLoadBalancer did not return an ID"

  pool_id="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg name "fiberlb-smoke-pool" --arg lb "${lb_id}" '{name:$name, loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
    127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool \
    | jq -r '.pool.id')"
  [[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "FiberLB CreatePool did not return an ID"

  # Single backend: node01's echo/health server at 10.100.0.11:8081.
  backend_id="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg name "fiberlb-smoke-backend" --arg pool "${pool_id}" '{name:$name, poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend \
    | jq -r '.backend.id')"
  [[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "FiberLB CreateBackend did not return an ID"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/GetBackend \
    | jq -e --arg id "${backend_id}" '.backend.id == $id' >/dev/null

  listener_id="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg name "fiberlb-smoke-listener" --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:$name, loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
    127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener \
    | jq -r '.listener.id')"
  [[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "FiberLB CreateListener did not return an ID"

  # The data plane must pick up the listener: port open, /health proxied.
  wait_for_tcp_port node01 "${listener_port}"
  wait_for_http node01 "http://127.0.0.1:${listener_port}/health"

  local fiberlb_pid fiberlb_peak_cpu load_pid settle_ok
  # NOTE(review): pidof prints every matching PID; this assumes a single
  # fiberlb process on node01 — multiple PIDs would break the top/awk
  # commands below. Confirm the service runs as one process.
  fiberlb_pid="$(ssh_node node01 'pidof fiberlb')"
  [[ -n "${fiberlb_pid}" ]] || die "FiberLB process is not running on node01"

  # Fan out 256 health requests (32 in parallel) in the background while the
  # foreground samples fiberlb's CPU with top (5 one-second samples, max %).
  ssh_node node01 \
    "bash -lc 'seq 1 256 | xargs -P 32 -I{} curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null'" &
  load_pid=$!
  sleep 1
  fiberlb_peak_cpu="$(ssh_node node01 "top -b -d 1 -n 5 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0; if (cpu > max) max = cpu } END { print max + 0 }'")"
  # wait propagates the load generator's exit status (fails under set -e).
  wait "${load_pid}"
  log "FiberLB peak CPU during synthetic load: ${fiberlb_peak_cpu}%"

  # After load stops, CPU must drop below 20% within ~20s (10 x 2s probes);
  # a busy-looping proxy would fail this check.
  settle_ok=0
  for _ in {1..10}; do
    if ssh_node node01 \
      "top -b -d 1 -n 2 -p ${fiberlb_pid} | awk -v pid=${fiberlb_pid} '\$1 == pid { cpu = \$9 + 0 } END { exit !(cpu < 20.0) }'"; then
      settle_ok=1
      break
    fi
    sleep 2
  done
  [[ "${settle_ok}" -eq 1 ]] || die "FiberLB CPU did not settle after synthetic load"

  # Failover: disable the only backend and expect /health to start failing.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_DISABLED"}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
    | jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_DISABLED"' >/dev/null

  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ! ssh_node node01 "curl -fsS --max-time 2 http://127.0.0.1:${listener_port}/health >/dev/null" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for disabled FiberLB backend to stop serving traffic"
    fi
    sleep 2
  done

  # Re-enable and require /health to come back.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${backend_id}" '{id:$id, adminState:"BACKEND_ADMIN_STATE_ENABLED"}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/UpdateBackend \
    | jq -e '.backend.adminState == "BACKEND_ADMIN_STATE_ENABLED"' >/dev/null
  wait_for_http node01 "http://127.0.0.1:${listener_port}/health"

  # Teardown in dependency order: listener, backend, pool, load balancer.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null

  # Success path: clear the trap, then close the tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${lb_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_k8shost_flow() {
  # Validates the K8sHost orchestration layer and its controller
  # integrations with PrismNet (service IP pools), FiberLB (LoadBalancer
  # services), and FlashDNS (service records):
  #   1. node registration, heartbeat, and listing over gRPC
  #   2. Deployment lifecycle over the HTTP API (create, scale, delete),
  #      polling PodService until the controller converges each time
  #   3. a directly-created pod plus a LoadBalancer service, waiting for
  #      the controllers to attach a FiberLB LB and a FlashDNS record
  #   4. DNS resolution of the service name and HTTP reachability through
  #      the provisioned load balancer
  # Fix vs. previous revision: the explicit success-path cleanup closed
  # only four of the six SSH tunnels, leaking k8s_http_tunnel and
  # prism_tunnel after the RETURN trap had been cleared; `deadline` is now
  # declared local once instead of first leaking into global scope; loop
  # JSON scratch vars are declared once up front; URLs containing `?` or
  # interpolated names are quoted.
  log "Validating K8sHost node, pod, service, and controller integrations"

  local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel="" k8s_tunnel="" k8s_http_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
  dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"
  k8s_tunnel="$(start_ssh_tunnel node01 15087 50087)"
  k8s_http_tunnel="$(start_ssh_tunnel node01 18087 8085)"
  # Guarantee teardown of all six tunnels if any validation step dies.
  trap 'stop_ssh_tunnel node01 "${k8s_http_tunnel}"; stop_ssh_tunnel node01 "${k8s_tunnel}"; stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${dns_tunnel}"; stop_ssh_tunnel node01 "${prism_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  # K8sHost controllers operate in the default org/project.
  local org_id="default-org"
  local project_id="default-project"
  local principal_id="k8shost-smoke-$(date +%s)"
  local token node_name deployment_name pod_name service_name service_port
  local deadline
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  node_name="smoke-node-$(date +%s)"
  deployment_name="smoke-deploy-$(date +%s)"
  pod_name="smoke-pod-$(date +%s)"
  service_name="smoke-svc-$(date +%s)"
  # Random port in 18180-18279 to avoid collisions between runs.
  service_port=$((18180 + (RANDOM % 100)))

  # Register a synthetic Ready node so the scheduler has a target.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg name "${node_name}" --arg org "${org_id}" --arg project "${project_id}" '{node:{metadata:{name:$name, orgId:$org, projectId:$project}, spec:{podCidr:"10.244.0.0/24"}, status:{addresses:[{type:"InternalIP", address:"10.100.0.21"}], conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4", memory:"8192Mi"}, allocatable:{cpu:"4", memory:"8192Mi"}}}}')" \
    127.0.0.1:15087 k8shost.NodeService/RegisterNode >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg name "${node_name}" '{nodeName:$name, status:{conditions:[{type:"Ready", status:"True"}], capacity:{cpu:"4"}, allocatable:{cpu:"4"}}}')" \
    127.0.0.1:15087 k8shost.NodeService/Heartbeat \
    | jq -e '.success == true' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d '{}' \
    127.0.0.1:15087 k8shost.NodeService/ListNodes \
    | jq -e --arg name "${node_name}" '.items | any(.metadata.name == $name)' >/dev/null

  # Deployment lifecycle via the HTTP API (tunneled on 18087).
  curl -fsS \
    -H "Authorization: Bearer ${token}" \
    -H "Content-Type: application/json" \
    -d "$(jq -cn --arg name "${deployment_name}" '{name:$name, namespace:"default", replicas:2, selector:{app:"k8shost-deployment-smoke", deployment:$name}, containers:[{name:"backend", image:"smoke", ports:[{container_port:8082, protocol:"TCP"}]}]}')" \
    http://127.0.0.1:18087/api/v1/deployments \
    | jq -e --arg name "${deployment_name}" '.data.name == $name and .data.replicas == 2' >/dev/null
  curl -fsS \
    -H "Authorization: Bearer ${token}" \
    "http://127.0.0.1:18087/api/v1/deployments?namespace=default" \
    | jq -e --arg name "${deployment_name}" '.data.deployments | any(.name == $name)' >/dev/null

  # Scratch variables reused by the convergence-polling loops below.
  local deployment_pods_json scaled_pods_json deleted_pods_json

  # Wait for the controller to create 2 pods and schedule both on our node.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    deployment_pods_json="$(grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${K8SHOST_PROTO_DIR}" \
      -proto "${K8SHOST_PROTO}" \
      -d "$(jq -cn --arg ns "default" --arg deploy "${deployment_name}" '{namespace:$ns, labelSelector:{deployment:$deploy}}')" \
      127.0.0.1:15087 k8shost.PodService/ListPods 2>/dev/null || true)"
    if [[ -n "${deployment_pods_json}" ]] && printf '%s' "${deployment_pods_json}" | jq -e --arg node "${node_name}" '
      (.items | length) == 2 and
      all(.items[]; .spec.nodeName == $node)' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for K8sHost Deployment ${deployment_name} to create and schedule pods"
    fi
    sleep 2
  done

  # Read back the deployment, then scale it from 2 replicas to 1.
  curl -fsS \
    -H "Authorization: Bearer ${token}" \
    "http://127.0.0.1:18087/api/v1/deployments/default/${deployment_name}" \
    | jq -e --arg name "${deployment_name}" '.data.name == $name and .data.ready_replicas >= 0' >/dev/null
  curl -fsS \
    -X PUT \
    -H "Authorization: Bearer ${token}" \
    -H "Content-Type: application/json" \
    -d '{"replicas":1}' \
    "http://127.0.0.1:18087/api/v1/deployments/default/${deployment_name}" \
    | jq -e '.data.replicas == 1' >/dev/null

  # Wait for scale-down to converge to exactly one pod.
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    scaled_pods_json="$(grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${K8SHOST_PROTO_DIR}" \
      -proto "${K8SHOST_PROTO}" \
      -d "$(jq -cn --arg ns "default" --arg deploy "${deployment_name}" '{namespace:$ns, labelSelector:{deployment:$deploy}}')" \
      127.0.0.1:15087 k8shost.PodService/ListPods 2>/dev/null || true)"
    if [[ -n "${scaled_pods_json}" ]] && printf '%s' "${scaled_pods_json}" | jq -e '.items | length == 1' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for K8sHost Deployment ${deployment_name} to scale down"
    fi
    sleep 2
  done

  # Delete the deployment and wait for its managed pods to be reaped.
  curl -fsS \
    -X DELETE \
    -H "Authorization: Bearer ${token}" \
    "http://127.0.0.1:18087/api/v1/deployments/default/${deployment_name}" \
    | jq -e '.data.deleted == true' >/dev/null

  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    deleted_pods_json="$(grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${K8SHOST_PROTO_DIR}" \
      -proto "${K8SHOST_PROTO}" \
      -d "$(jq -cn --arg ns "default" --arg deploy "${deployment_name}" '{namespace:$ns, labelSelector:{deployment:$deploy}}')" \
      127.0.0.1:15087 k8shost.PodService/ListPods 2>/dev/null || true)"
    if [[ -n "${deleted_pods_json}" ]] && printf '%s' "${deleted_pods_json}" | jq -e '.items | length == 0' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for K8sHost Deployment ${deployment_name} to delete managed pods"
    fi
    sleep 2
  done

  # A directly-created Running pod; its app label is the backend target for
  # the LoadBalancer service created below.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg name "${pod_name}" --arg org "${org_id}" --arg project "${project_id}" '{pod:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project, labels:{app:"k8shost-smoke"}}, spec:{containers:[{name:"backend", image:"smoke", ports:[{containerPort:8081, protocol:"TCP"}]}]}, status:{phase:"Running", podIp:"10.100.0.11", hostIp:"10.100.0.11"}}}')" \
    127.0.0.1:15087 k8shost.PodService/CreatePod >/dev/null

  log "Matrix case: K8sHost + PrismNet"
  # Ensure a ClusterIP service-IP pool exists; create a default one if not.
  local pools_json
  pools_json="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" \
    -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
    127.0.0.1:15081 prismnet.IpamService/ListServiceIPPools)"
  if ! printf '%s' "${pools_json}" | jq -e '.pools | length > 0' >/dev/null; then
    grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PRISMNET_PROTO_DIR}" \
      -proto "${PRISMNET_PROTO}" \
      -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{orgId:$org, projectId:$project, name:"default-cluster-ip-pool", description:"smoke-created default ClusterIP pool", cidrBlock:"10.96.42.0/24", poolType:"SERVICE_IP_POOL_TYPE_CLUSTER_IP"}')" \
      127.0.0.1:15081 prismnet.IpamService/CreateServiceIPPool >/dev/null
  fi

  log "Matrix case: K8sHost + PrismNet + FiberLB + FlashDNS"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg name "${service_name}" --arg org "${org_id}" --arg project "${project_id}" --argjson port "${service_port}" '{service:{metadata:{name:$name, namespace:"default", orgId:$org, projectId:$project}, spec:{ports:[{name:"http", port:$port, targetPort:8081, protocol:"TCP"}], selector:{app:"k8shost-smoke"}, type:"LoadBalancer"}}}')" \
    127.0.0.1:15087 k8shost.ServiceService/CreateService >/dev/null

  # Wait until the controllers annotate the service with an ingress IP, a
  # FiberLB LB id, and a FlashDNS record id.
  local service_json cluster_ip lb_id record_id zone_id
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    service_json="$(grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${K8SHOST_PROTO_DIR}" \
      -proto "${K8SHOST_PROTO}" \
      -d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
      127.0.0.1:15087 k8shost.ServiceService/GetService 2>/dev/null || true)"
    if [[ -n "${service_json}" ]] && printf '%s' "${service_json}" | jq -e '
      .service.status.loadBalancer.ingress[0].ip != null and
      .service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"] != null and
      .service.metadata.annotations["flashdns.plasmacloud.io/record-id"] != null' >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for K8sHost controllers to provision service ${service_name}"
    fi
    sleep 2
  done

  cluster_ip="$(printf '%s' "${service_json}" | jq -r '.service.spec.clusterIp')"
  lb_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["fiberlb.plasmacloud.io/lb-id"]')"
  record_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/record-id"]')"
  # zone_id is captured for parity with the other annotations but is not
  # asserted on or used in cleanup (the record delete below suffices).
  zone_id="$(printf '%s' "${service_json}" | jq -r '.service.metadata.annotations["flashdns.plasmacloud.io/zone-id"]')"
  [[ -n "${cluster_ip}" && "${cluster_ip}" != "null" ]] || die "K8sHost service did not get a cluster IP"
  [[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "K8sHost service did not get a FiberLB load balancer"
  [[ -n "${record_id}" && "${record_id}" != "null" ]] || die "K8sHost service did not get a FlashDNS record"

  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn '{namespace:"default"}')" \
    127.0.0.1:15087 k8shost.ServiceService/ListServices \
    | jq -e --arg name "${service_name}" '.items | any(.metadata.name == $name)' >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn '{namespace:"default", labelSelector:{app:"k8shost-smoke"}}')" \
    127.0.0.1:15087 k8shost.PodService/ListPods \
    | jq -e --arg name "${pod_name}" '.items | any(.metadata.name == $name)' >/dev/null

  log "Matrix case: K8sHost + FlashDNS"
  # The service's cluster DNS name must resolve to its ClusterIP via the
  # authoritative resolver on node01 (port 5353).
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_name}.default.svc.cluster.local A | grep -Fx '${cluster_ip}'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for K8sHost FlashDNS record for ${service_name}"
    fi
    sleep 2
  done

  log "Matrix case: K8sHost + FiberLB"
  # The auto-provisioned FiberLB listener must serve /health on the
  # service port.
  wait_for_http node01 "http://127.0.0.1:${service_port}/health"

  # Teardown: service, pod, DNS record, load balancer.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg ns "default" --arg name "${service_name}" '{namespace:$ns, name:$name}')" \
    127.0.0.1:15087 k8shost.ServiceService/DeleteService >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${K8SHOST_PROTO_DIR}" \
    -proto "${K8SHOST_PROTO}" \
    -d "$(jq -cn --arg ns "default" --arg name "${pod_name}" '{namespace:$ns, name:$name}')" \
    127.0.0.1:15087 k8shost.PodService/DeletePod >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" \
    -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg id "${record_id}" '{id:$id}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null

  # Success path: clear the trap, then close ALL six tunnels explicitly
  # (previously k8s_http_tunnel and prism_tunnel were leaked here).
  trap - RETURN
  stop_ssh_tunnel node01 "${k8s_http_tunnel}"
  stop_ssh_tunnel node01 "${k8s_tunnel}"
  stop_ssh_tunnel node01 "${lb_tunnel}"
  stop_ssh_tunnel node01 "${dns_tunnel}"
  stop_ssh_tunnel node01 "${prism_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_workers() {
  # Validates worker nodes (node04/node05) and the CoronaFS block store:
  #   1. each worker exposes its health endpoint (8084), gRPC port (50086),
  #      and metrics endpoint (9098); node01's CoronaFS API is healthy
  #   2. a probe volume created on node01 is writable/readable over NBD
  #      from both workers
  #   3. the volume can be materialized onto node04's local CoronaFS and
  #      the copy is readable over NBD from both workers
  wait_for_http node04 http://127.0.0.1:8084/health
  wait_for_http node05 http://127.0.0.1:8084/health
  wait_for_tcp_port node04 50086
  wait_for_tcp_port node05 50086
  wait_for_http node04 http://127.0.0.1:9098/metrics
  wait_for_http node05 http://127.0.0.1:9098/metrics
  wait_for_http node01 "http://127.0.0.1:${CORONAFS_API_PORT}/healthz"

  log "Validating CoronaFS block export accessibility on worker nodes"
  local coronafs_tunnel="" worker_coronafs_tunnel="" probe_volume="" worker_probe_volume=""
  # Timestamped volume names keep repeated smoke runs from colliding.
  probe_volume="worker-probe-$(date +%s)"
  worker_probe_volume="${probe_volume}-node04"
  # Tunnels: node01's CoronaFS API on local 15088, node04's on local 25088.
  coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  worker_coronafs_tunnel="$(start_ssh_tunnel node04 25088 "${CORONAFS_API_PORT}")"
  # Volume names and tunnel ids are spliced into the trap string at set
  # time (single-quote/double-quote alternation) so cleanup still sees the
  # right values when the trap fires on RETURN; volume deletes are
  # best-effort (|| true) since the volumes may already be gone.
  trap 'coronafs_delete_volume 25088 "'"${worker_probe_volume}"'" >/dev/null 2>&1 || true; coronafs_delete_volume 15088 "'"${probe_volume}"'" >/dev/null 2>&1 || true; stop_ssh_tunnel node04 "'"${worker_coronafs_tunnel}"'"; stop_ssh_tunnel node01 "'"${coronafs_tunnel}"'"' RETURN
  # 64 MiB probe volume on node01.
  coronafs_create_volume 15088 "${probe_volume}" $((64 * 1024 * 1024)) >/dev/null
  local probe_export_json probe_uri worker_export_json worker_probe_uri
  probe_export_json="$(coronafs_export_volume_json 15088 "${probe_volume}")"
  probe_uri="$(printf '%s' "${probe_export_json}" | jq -r '.export.uri')"
  [[ -n "${probe_uri}" && "${probe_uri}" != "null" ]] || die "CoronaFS probe volume did not return an export URI"
  # Both workers must reach the node01 export: fio write from node04,
  # dd read from node05.
  run_remote_nbd_fio_json node04 "${probe_uri}" write 1M 32 >/dev/null
  run_remote_nbd_dd_read_json node05 "${probe_uri}" 32 >/dev/null
  # Materialize a local copy of the export onto node04's CoronaFS, then
  # verify the copy is itself exportable and readable from both workers.
  coronafs_materialize_volume 25088 "${worker_probe_volume}" "${probe_uri}" $((64 * 1024 * 1024)) >/dev/null
  worker_export_json="$(coronafs_export_volume_json 25088 "${worker_probe_volume}")"
  worker_probe_uri="$(printf '%s' "${worker_export_json}" | jq -r '.export.uri')"
  [[ -n "${worker_probe_uri}" && "${worker_probe_uri}" != "null" ]] || die "worker-local CoronaFS materialization did not return an export URI"
  run_remote_nbd_fio_json node04 "${worker_probe_uri}" read 1M 32 >/dev/null
  run_remote_nbd_dd_read_json node05 "${worker_probe_uri}" 32 >/dev/null
  # Success path: delete volumes and close tunnels explicitly, then clear
  # the (now redundant) cleanup trap.
  coronafs_delete_volume 25088 "${worker_probe_volume}"
  coronafs_delete_volume 15088 "${probe_volume}"
  stop_ssh_tunnel node04 "${worker_coronafs_tunnel}"
  stop_ssh_tunnel node01 "${coronafs_tunnel}"
  trap - RETURN
}
|
|
|
|
validate_nested_kvm_workers() {
  # Verifies each worker VM supports nested virtualization: the kvm module
  # loads, /dev/kvm exists, the CPU advertises vmx/svm, and a minimal QEMU
  # guest (booting the worker's own current-system kernel) starts under KVM
  # acceleration, survives 5 seconds, and can be killed cleanly.
  # Fix vs. previous revision: `node` is declared local so the loop
  # variable no longer leaks into the caller's (global) scope.
  log "Validating nested KVM inside worker VMs"
  local node
  for node in node04 node05; do
    # Quoted heredoc: the script is sent verbatim; remote failures
    # propagate through ssh_node_script's exit status under set -e.
    ssh_node_script "${node}" <<'EOS'
set -euo pipefail
# Load whichever KVM module matches the CPU vendor; harmless if both fail
# (the /dev/kvm check below is the real gate).
modprobe kvm_intel >/dev/null 2>&1 || modprobe kvm_amd >/dev/null 2>&1 || true
[[ -c /dev/kvm ]]
grep -Eq 'vmx|svm' /proc/cpuinfo
# Boot a throwaway 256 MiB guest with KVM acceleration; -daemonize lets us
# poll the pidfile, and serial output goes to a log for debugging.
qemu-system-x86_64 \
  -accel kvm \
  -cpu host \
  -machine q35 \
  -m 256 \
  -display none \
  -nodefaults \
  -no-reboot \
  -daemonize \
  -pidfile /tmp/nested-kvm.pid \
  -serial file:/tmp/nested-kvm.log \
  -kernel /run/current-system/kernel \
  -append 'console=ttyS0' >/tmp/nested-kvm.cmd.log 2>&1
sleep 5
# Guest process must still be alive after 5s, then terminate it.
kill -0 "$(cat /tmp/nested-kvm.pid)"
kill "$(cat /tmp/nested-kvm.pid)"
EOS
  done
}
|
|
|
|
# Validate that an object written through the LightningStor gRPC API on the
# control node (node01) replicates to both workers (node04/node05) and that
# a delete removes it everywhere.
#
# Flow:
#   1. open local tunnels to node01's IAM (50080) and LightningStor (50086),
#   2. mint a project-admin token and create/quorum-check a throwaway bucket,
#   3. snapshot per-node object counts, then PUT / HEAD / GET a probe object
#      and verify the payload round-trips byte-for-byte,
#   4. wait for all three per-node counts to rise above baseline, DELETE the
#      object, and wait for the counts to return to baseline.
# A RETURN trap tears the tunnels down on every failure path; on success the
# trap is disarmed and the tunnels are closed explicitly.
validate_lightningstor_distributed_storage() {
  log "Validating distributed LightningStor object replication across node01/node04/node05"

  local iam_tunnel="" ls_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  # Single-quoted on purpose: the tunnel handles are expanded only when the
  # trap fires (the locals are still in scope during a RETURN trap).
  trap 'stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="smoke-org"
  local project_id="smoke-project"
  # Timestamped principal so repeated smoke runs never collide.
  local principal_id="lightningstor-smoke-$(date +%s)"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  local bucket="dist-smoke-$(date +%s)"
  ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum 15086 "${token}" "${bucket}" "distributed LightningStor validation"

  # Baseline object counts on each node; replication/cleanup are detected as
  # deltas against these.
  local before_node01 before_node04 before_node05
  read -r before_node01 before_node04 before_node05 < <(lightningstor_count_triplet)

  local key="replication-check-$(date +%s)"
  local body="distributed-object-${key}"
  local body_b64
  # PutObject carries the body as base64 in JSON; -w0 keeps it on one line.
  body_b64="$(printf '%s' "${body}" | base64 -w0)"

  local put_json head_json delete_json output
  put_json="$(
    jq -cn \
      --arg bucket "${bucket}" \
      --arg key "${key}" \
      --arg body "${body_b64}" \
      '{bucket:$bucket, key:$key, body:$body, contentMd5:"", ifNoneMatch:""}'
  )"
  log "LightningStor distributed replication: PUT ${bucket}/${key}"
  output="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${put_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject
  )" || die "failed to write LightningStor distributed replication probe ${bucket}/${key}: ${output}"

  head_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
  log "LightningStor distributed replication: HEAD ${bucket}/${key}"
  output="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${head_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
  )" || die "failed to head LightningStor distributed replication probe ${bucket}/${key}: ${output}"
  # HEAD must report exactly the byte length of the probe body.
  printf '%s\n' "${output}" \
    | jq -e --arg size "$(printf '%s' "${body}" | wc -c | awk '{print $1}')" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
    || die "LightningStor distributed replication probe ${bucket}/${key} returned unexpected metadata: ${output}"
  local fetched_body
  log "LightningStor distributed replication: GET ${bucket}/${key}"
  # GetObject takes the same {bucket,key} request shape as HeadObject, so
  # head_json is reused as the request payload here.
  output="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${head_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/GetObject
  )" || die "failed to fetch LightningStor distributed replication probe ${bucket}/${key}: ${output}"
  # GetObject streams multiple JSON messages; slurp (-s) them into an array
  # and join the base64-decoded bodyChunk fields back into the payload.
  fetched_body="$(printf '%s\n' "${output}" | jq -rsr '[.[] | .bodyChunk? | select(. != null) | @base64d] | join("")')" \
    || die "failed to decode LightningStor distributed replication probe ${bucket}/${key}: ${output}"
  [[ "${fetched_body}" == "${body}" ]] || die "distributed LightningStor returned unexpected object payload"

  # Replication check: every node's object count must exceed its baseline.
  wait_for_lightningstor_counts_greater_than "${before_node01}" "${before_node04}" "${before_node05}" "generic object replication"

  delete_json="$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')"
  log "LightningStor distributed replication: DELETE ${bucket}/${key}"
  output="$(
    grpcurl_capture -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
      -proto "${LIGHTNINGSTOR_PROTO}" \
      -d "${delete_json}" \
      127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject
  )" || die "failed to delete LightningStor distributed replication probe ${bucket}/${key}: ${output}"

  # Cleanup check: counts must drop back to the pre-PUT baseline everywhere.
  wait_for_lightningstor_counts_equal "${before_node01}" "${before_node04}" "${before_node05}" "generic object cleanup"

  # Happy path: disarm the failure trap, then close tunnels in reverse order.
  trap - RETURN
  stop_ssh_tunnel node01 "${ls_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
validate_vm_storage_flow() {
|
|
log "Validating PlasmaVMC image import, shared-volume execution, and cross-node migration"
|
|
|
|
local iam_tunnel="" prism_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
|
|
local node04_coronafs_tunnel="" node05_coronafs_tunnel=""
|
|
local current_worker_coronafs_port="" peer_worker_coronafs_port=""
|
|
local vm_port=15082
|
|
iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
|
|
prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
|
|
ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
|
|
vm_tunnel="$(start_ssh_tunnel node01 "${vm_port}" 50082)"
|
|
coronafs_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
|
|
node04_coronafs_tunnel="$(start_ssh_tunnel node04 25088 "${CORONAFS_API_PORT}")"
|
|
node05_coronafs_tunnel="$(start_ssh_tunnel node05 35088 "${CORONAFS_API_PORT}")"
|
|
local image_source_path=""
|
|
local vm_watch_output=""
|
|
local node01_proto_root="/var/lib/plasmavmc/test-protos"
|
|
local vpc_id="" subnet_id="" port_id="" port_ip="" port_mac=""
|
|
# Best-effort teardown for the VM storage-flow validation; installed as a
# RETURN trap so it runs on both success and failure paths.  Deletes the
# PrismNet resources created by the flow (child-first: port -> subnet ->
# vpc), removes scratch files left on node01, and closes every SSH tunnel.
# Every deletion is guarded and error-suppressed so cleanup never masks the
# real validation failure.
#
# Reads (from the enclosing function's scope, via dynamic scoping): token,
# org_id, project_id, port_id, subnet_id, vpc_id, image_source_path,
# vm_watch_output, and the *_tunnel handles.  ${var:-} defaults guard
# variables that may not have been assigned yet when the trap fires early.
cleanup_vm_storage_flow() {
  # Delete the tenant port first; it depends on the subnet.
  if [[ -n "${token:-}" && -n "${port_id:-}" && -n "${subnet_id:-}" ]]; then
    grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PRISMNET_PROTO_DIR}" \
      -proto "${PRISMNET_PROTO}" \
      -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
      127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null 2>&1 || true
  fi
  # Then the subnet, which depends on the VPC.
  if [[ -n "${token:-}" && -n "${subnet_id:-}" && -n "${vpc_id:-}" ]]; then
    grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PRISMNET_PROTO_DIR}" \
      -proto "${PRISMNET_PROTO}" \
      -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
      127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null 2>&1 || true
  fi
  # Finally the VPC itself.
  if [[ -n "${token:-}" && -n "${vpc_id:-}" ]]; then
    grpcurl -plaintext \
      -H "authorization: Bearer ${token}" \
      -import-path "${PRISMNET_PROTO_DIR}" \
      -proto "${PRISMNET_PROTO}" \
      -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
      127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null 2>&1 || true
  fi
  # Remove the staged image copy, but never touch the read-only Nix store.
  if [[ -n "${image_source_path}" && "${image_source_path}" != /nix/store/* ]]; then
    ssh_node node01 "rm -f ${image_source_path}" >/dev/null 2>&1 || true
  fi
  # Drop the VM-watch output file plus its .pid/.stderr side files.
  if [[ -n "${vm_watch_output}" ]]; then
    ssh_node node01 "rm -f ${vm_watch_output} ${vm_watch_output}.pid ${vm_watch_output}.stderr" >/dev/null 2>&1 || true
  fi
  # Close tunnels in reverse order of creation.
  stop_ssh_tunnel node05 "${node05_coronafs_tunnel}"
  stop_ssh_tunnel node04 "${node04_coronafs_tunnel}"
  stop_ssh_tunnel node01 "${coronafs_tunnel}"
  stop_ssh_tunnel node01 "${vm_tunnel}"
  stop_ssh_tunnel node01 "${ls_tunnel}"
  stop_ssh_tunnel node01 "${prism_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
trap cleanup_vm_storage_flow RETURN
|
|
|
|
wait_for_plasmavmc_workers_registered 15082
|
|
|
|
local org_id="vm-smoke-org"
|
|
local project_id="vm-smoke-project"
|
|
local principal_id="plasmavmc-smoke-$(date +%s)"
|
|
local token
|
|
token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
|
|
|
|
log "Matrix case: PlasmaVMC + PrismNet"
|
|
vpc_id="$(create_prismnet_vpc_with_retry \
|
|
"${token}" \
|
|
"${org_id}" \
|
|
"${project_id}" \
|
|
"vm-network-vpc" \
|
|
"vm storage matrix networking" \
|
|
"10.62.0.0/16" | jq -r '.vpc.id')"
|
|
[[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "failed to create PrismNet VPC for PlasmaVMC matrix"
|
|
|
|
subnet_id="$(grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PRISMNET_PROTO_DIR}" \
|
|
-proto "${PRISMNET_PROTO}" \
|
|
-d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"vm-network-subnet", description:"vm storage matrix subnet", cidrBlock:"10.62.10.0/24", gatewayIp:"10.62.10.1", dhcpEnabled:true}')" \
|
|
127.0.0.1:15081 prismnet.SubnetService/CreateSubnet | jq -r '.subnet.id')"
|
|
[[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "failed to create PrismNet subnet for PlasmaVMC matrix"
|
|
|
|
local prismnet_port_response
|
|
prismnet_port_response="$(grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PRISMNET_PROTO_DIR}" \
|
|
-proto "${PRISMNET_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"vm-network-port", description:"vm storage matrix port", ipAddress:""}')" \
|
|
127.0.0.1:15081 prismnet.PortService/CreatePort)"
|
|
port_id="$(printf '%s' "${prismnet_port_response}" | jq -r '.port.id')"
|
|
port_ip="$(printf '%s' "${prismnet_port_response}" | jq -r '.port.ipAddress')"
|
|
port_mac="$(printf '%s' "${prismnet_port_response}" | jq -r '.port.macAddress')"
|
|
[[ -n "${port_id}" && "${port_id}" != "null" ]] || die "failed to create PrismNet port for PlasmaVMC matrix"
|
|
[[ -n "${port_ip}" && "${port_ip}" != "null" ]] || die "PrismNet port ${port_id} did not return an IP address"
|
|
[[ -n "${port_mac}" && "${port_mac}" != "null" ]] || die "PrismNet port ${port_id} did not return a MAC address"
|
|
|
|
ensure_lightningstor_bucket 15086 "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
|
|
wait_for_lightningstor_write_quorum 15086 "${token}" "plasmavmc-images" "PlasmaVMC image import"
|
|
|
|
local image_name="vm-image-$(date +%s)"
|
|
local image_id=""
|
|
local guest_image_local_path guest_image_sha guest_image_size remote_guest_image_sha
|
|
local image_before_node01 image_before_node04 image_before_node05
|
|
local image_after_node01 image_after_node04 image_after_node05
|
|
read -r image_before_node01 image_before_node04 image_before_node05 < <(lightningstor_count_triplet)
|
|
guest_image_local_path="$(guest_image_path)"
|
|
[[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image"
|
|
guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
|
|
guest_image_size="$(stat -c %s "${guest_image_local_path}")"
|
|
ssh_node node01 "install -d -m 0755 ${node01_proto_root}/iam ${node01_proto_root}/plasmavmc ${node01_proto_root}/lightningstor"
|
|
scp_to_node node01 "${IAM_PROTO}" "${node01_proto_root}/iam/iam.proto"
|
|
scp_to_node node01 "${PLASMAVMC_PROTO}" "${node01_proto_root}/plasmavmc/plasmavmc.proto"
|
|
scp_to_node node01 "${LIGHTNINGSTOR_PROTO}" "${node01_proto_root}/lightningstor/lightningstor.proto"
|
|
if [[ "${guest_image_local_path}" != /nix/store/* ]]; then
|
|
ssh_node node01 "install -d -m 0755 /var/lib/plasmavmc/imports && find /var/lib/plasmavmc/imports -maxdepth 1 -type f -name 'vm-image-*.qcow2' -delete"
|
|
fi
|
|
image_source_path="$(prepare_node01_image_source "${guest_image_local_path}" "${image_name}")"
|
|
remote_guest_image_sha="$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")"
|
|
[[ "${remote_guest_image_sha}" == "${guest_image_sha}" ]] || die "bootable VM guest image checksum mismatch after host distribution"
|
|
|
|
local create_image_json
|
|
log "Matrix case: PlasmaVMC + LightningStor"
|
|
create_image_json="$(
|
|
jq -cn \
|
|
--arg name "${image_name}" \
|
|
--arg org "${org_id}" \
|
|
--arg sha "${guest_image_sha}" \
|
|
--arg source_url "file://${image_source_path}" \
|
|
'{
|
|
name:$name,
|
|
orgId:$org,
|
|
visibility:"VISIBILITY_PRIVATE",
|
|
format:"IMAGE_FORMAT_QCOW2",
|
|
osType:"OS_TYPE_LINUX",
|
|
osVersion:"smoke",
|
|
architecture:"ARCHITECTURE_X86_64",
|
|
minDiskGib:1,
|
|
minMemoryMib:512,
|
|
metadata:{purpose:"smoke", sourceSha256:$sha},
|
|
sourceUrl:$source_url
|
|
}'
|
|
)"
|
|
local create_image_response
|
|
create_image_response="$(
|
|
ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_image_json}" | base64 | tr -d '\n')" <<'EOS'
|
|
set -euo pipefail
|
|
proto_root="$1"
|
|
token="$2"
|
|
request_b64="$3"
|
|
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${proto_root}/plasmavmc" \
|
|
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
|
|
-d "${request_json}" \
|
|
127.0.0.1:50082 plasmavmc.v1.ImageService/CreateImage
|
|
EOS
|
|
)"
|
|
image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
|
|
[[ -n "${image_id}" && "${image_id}" != "null" ]] || die "failed to create image through PlasmaVMC"
|
|
printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE" and .format == "IMAGE_FORMAT_QCOW2"' >/dev/null
|
|
|
|
local image_key="${org_id}/${project_id}/${image_id}.qcow2"
|
|
local get_image_json
|
|
get_image_json="$(jq -cn --arg org "${org_id}" --arg image "${image_id}" '{orgId:$org, imageId:$image}')"
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "${get_image_json}" \
|
|
127.0.0.1:15082 plasmavmc.v1.ImageService/GetImage \
|
|
| jq -e --arg image "${image_id}" '.id == $image and .status == "IMAGE_STATUS_AVAILABLE"' >/dev/null
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" '{orgId:$org, pageSize:100, pageToken:"", includePublic:false}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.ImageService/ListImages \
|
|
| jq -e --arg image "${image_id}" '.images | any(.id == $image)' >/dev/null
|
|
|
|
local head_image_json head_image_response
|
|
head_image_json="$(jq -cn --arg bucket "plasmavmc-images" --arg key "${image_key}" '{bucket:$bucket, key:$key}')"
|
|
head_image_response="$(
|
|
grpcurl_capture -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
|
|
-proto "${LIGHTNINGSTOR_PROTO}" \
|
|
-d "${head_image_json}" \
|
|
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject
|
|
)" || die "failed to head imported PlasmaVMC image object ${image_key}: ${head_image_response}"
|
|
printf '%s\n' "${head_image_response}" \
|
|
| jq -e --arg size "${guest_image_size}" '(.object.size | tonumber) == ($size | tonumber)' >/dev/null \
|
|
|| die "imported PlasmaVMC image object ${image_key} returned unexpected size: ${head_image_response}"
|
|
local image_checksum
|
|
image_checksum="$(printf '%s' "${create_image_response}" | jq -r '.checksum')"
|
|
[[ -n "${image_checksum}" && "${image_checksum}" != "null" ]] || die "CreateImage response did not return an imported image checksum"
|
|
# CreateImage computes the checksum from the normalized qcow2 artifact before upload.
|
|
[[ "${image_checksum}" == "${guest_image_sha}" ]] || die "imported PlasmaVMC image checksum mismatch"
|
|
ssh_node node01 "rm -f ${image_source_path}"
|
|
image_source_path=""
|
|
wait_for_lightningstor_counts_greater_than "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image import"
|
|
read -r image_after_node01 image_after_node04 image_after_node05 < <(
|
|
capture_stable_lightningstor_count_triplet \
|
|
"$((image_before_node01 + 1))" \
|
|
"$((image_before_node04 + 1))" \
|
|
"$((image_before_node05 + 1))"
|
|
)
|
|
|
|
local create_vm_rest_json
|
|
create_vm_rest_json="$(
|
|
jq -cn \
|
|
--arg name "smoke-vm-$(date +%s)" \
|
|
--arg org "${org_id}" \
|
|
--arg project "${project_id}" \
|
|
--arg image_id "${image_id}" \
|
|
'{
|
|
name:$name,
|
|
org_id:$org,
|
|
project_id:$project,
|
|
hypervisor:"kvm",
|
|
vcpus:1,
|
|
memory_mib:1024,
|
|
disks:[
|
|
{
|
|
id:"root",
|
|
source:{type:"image", image_id:$image_id},
|
|
size_gib:4,
|
|
boot_index:1
|
|
},
|
|
{
|
|
id:"data",
|
|
source:{type:"blank"},
|
|
size_gib:2
|
|
}
|
|
]
|
|
}'
|
|
)"
|
|
|
|
local create_vm_grpc_json
|
|
create_vm_grpc_json="$(
|
|
jq -cn \
|
|
--arg name "$(printf '%s' "${create_vm_rest_json}" | jq -r '.name')" \
|
|
--arg org "${org_id}" \
|
|
--arg project "${project_id}" \
|
|
--arg image_id "${image_id}" \
|
|
--arg subnet_id "${subnet_id}" \
|
|
--arg port_id "${port_id}" \
|
|
'{
|
|
name:$name,
|
|
orgId:$org,
|
|
projectId:$project,
|
|
hypervisor:"HYPERVISOR_TYPE_KVM",
|
|
spec:{
|
|
cpu:{vcpus:1, coresPerSocket:1, sockets:1},
|
|
memory:{sizeMib:1024},
|
|
disks:[
|
|
{
|
|
id:"root",
|
|
source:{imageId:$image_id},
|
|
sizeGib:4,
|
|
bus:"DISK_BUS_VIRTIO",
|
|
cache:"DISK_CACHE_WRITEBACK",
|
|
bootIndex:1
|
|
},
|
|
{
|
|
id:"data",
|
|
source:{blank:true},
|
|
sizeGib:2,
|
|
bus:"DISK_BUS_VIRTIO",
|
|
cache:"DISK_CACHE_WRITEBACK"
|
|
}
|
|
],
|
|
network:[
|
|
{
|
|
id:"tenant0",
|
|
subnetId:$subnet_id,
|
|
portId:$port_id,
|
|
model:"NIC_MODEL_VIRTIO_NET"
|
|
}
|
|
]
|
|
}
|
|
}'
|
|
)"
|
|
|
|
local create_response vm_id
|
|
create_response="$(
|
|
ssh_node_script node01 "${node01_proto_root}" "${token}" "$(printf '%s' "${create_vm_grpc_json}" | base64 | tr -d '\n')" <<'EOS'
|
|
set -euo pipefail
|
|
proto_root="$1"
|
|
token="$2"
|
|
request_b64="$3"
|
|
request_json="$(printf '%s' "${request_b64}" | base64 -d)"
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${proto_root}/plasmavmc" \
|
|
-proto "${proto_root}/plasmavmc/plasmavmc.proto" \
|
|
-d "${request_json}" \
|
|
127.0.0.1:50082 plasmavmc.v1.VmService/CreateVm
|
|
EOS
|
|
)"
|
|
vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
|
|
[[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "failed to create VM through PlasmaVMC"
|
|
vm_watch_output="/tmp/plasmavmc-watch-${vm_id}.json"
|
|
start_plasmavmc_vm_watch node01 "${node01_proto_root}" "${token}" "${org_id}" "${project_id}" "${vm_id}" "${vm_watch_output}"
|
|
sleep 2
|
|
|
|
local get_vm_json
|
|
get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
|
|
|
|
local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
local node_id=""
|
|
local peer_node=""
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" "${vm_port}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
|
|
if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to be scheduled onto a worker"
|
|
fi
|
|
sleep 2
|
|
done
|
|
if [[ "${node_id}" == "node04" ]]; then
|
|
peer_node="node05"
|
|
current_worker_coronafs_port=25088
|
|
peer_worker_coronafs_port=35088
|
|
else
|
|
peer_node="node04"
|
|
current_worker_coronafs_port=35088
|
|
peer_worker_coronafs_port=25088
|
|
fi
|
|
wait_for_vm_network_spec "${token}" "${get_vm_json}" "${port_id}" "${subnet_id}" "${port_mac}" "${port_ip}" "${vm_port}" >/dev/null
|
|
wait_for_prismnet_port_binding "${token}" "${org_id}" "${project_id}" "${subnet_id}" "${port_id}" "${vm_id}" >/dev/null
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" "${vm_port}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to reach RUNNING"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to reach RUNNING"
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
log "Matrix case: PlasmaVMC + PrismNet + CoronaFS + LightningStor"
|
|
local volume_id="${vm_id}-root"
|
|
local data_volume_id="${vm_id}-data"
|
|
local volume_path="${CORONAFS_VOLUME_ROOT}/${volume_id}.raw"
|
|
local data_volume_path="${CORONAFS_VOLUME_ROOT}/${data_volume_id}.raw"
|
|
local volume_export_json data_volume_export_json volume_uri data_volume_uri
|
|
local current_volume_qemu_ref current_data_volume_qemu_ref
|
|
volume_export_json="$(coronafs_export_volume_json 15088 "${volume_id}")"
|
|
data_volume_export_json="$(coronafs_export_volume_json 15088 "${data_volume_id}")"
|
|
volume_uri="$(printf '%s' "${volume_export_json}" | jq -r '.export.uri')"
|
|
data_volume_uri="$(printf '%s' "${data_volume_export_json}" | jq -r '.export.uri')"
|
|
[[ -n "${volume_uri}" && "${volume_uri}" != "null" ]] || die "CoronaFS root volume export URI missing"
|
|
[[ -n "${data_volume_uri}" && "${data_volume_uri}" != "null" ]] || die "CoronaFS data volume export URI missing"
|
|
ssh_node node01 "test -f ${volume_path}"
|
|
ssh_node node01 "test -f ${data_volume_path}"
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${volume_id}"
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${data_volume_id}"
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable root volume ${volume_id}"
|
|
fi
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable data volume ${data_volume_id}"
|
|
fi
|
|
ssh_node "${node_id}" "test -f ${volume_path}"
|
|
ssh_node "${node_id}" "test -f ${data_volume_path}"
|
|
current_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${volume_id}")"
|
|
current_data_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${data_volume_id}")"
|
|
[[ -n "${current_volume_qemu_ref}" ]] || die "worker ${node_id} did not expose an attachable local ref for ${volume_id}"
|
|
[[ -n "${current_data_volume_qemu_ref}" ]] || die "worker ${node_id} did not expose an attachable local ref for ${data_volume_id}"
|
|
wait_for_qemu_volume_present "${node_id}" "${volume_path}" "${current_volume_qemu_ref}"
|
|
wait_for_qemu_volume_present "${node_id}" "${data_volume_path}" "${current_data_volume_qemu_ref}"
|
|
wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM startup"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=1"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=1"
|
|
local get_root_volume_json get_data_volume_json
|
|
local root_volume_state_json data_volume_state_json
|
|
local root_attachment_generation data_attachment_generation
|
|
get_root_volume_json="$(
|
|
jq -cn \
|
|
--arg org "${org_id}" \
|
|
--arg project "${project_id}" \
|
|
--arg volume "${volume_id}" \
|
|
'{orgId:$org, projectId:$project, volumeId:$volume}'
|
|
)"
|
|
get_data_volume_json="$(
|
|
jq -cn \
|
|
--arg org "${org_id}" \
|
|
--arg project "${project_id}" \
|
|
--arg volume "${data_volume_id}" \
|
|
'{orgId:$org, projectId:$project, volumeId:$volume}'
|
|
)"
|
|
root_volume_state_json="$(try_get_volume_json "${token}" "${get_root_volume_json}")"
|
|
data_volume_state_json="$(try_get_volume_json "${token}" "${get_data_volume_json}")"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachedToVm // empty')" == "${vm_id}" ]] || die "root volume ${volume_id} is not attached to VM ${vm_id}"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "root volume ${volume_id} is not owned by node ${node_id}"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachedToVm // empty')" == "${vm_id}" ]] || die "data volume ${data_volume_id} is not attached to VM ${vm_id}"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "data volume ${data_volume_id} is not owned by node ${node_id}"
|
|
root_attachment_generation="$(printf '%s' "${root_volume_state_json}" | jq -r '.attachmentGeneration // 0')"
|
|
data_attachment_generation="$(printf '%s' "${data_volume_state_json}" | jq -r '.attachmentGeneration // 0')"
|
|
(( root_attachment_generation >= 1 )) || die "root volume ${volume_id} did not report a positive attachment generation"
|
|
(( data_attachment_generation >= 1 )) || die "data volume ${data_volume_id} did not report a positive attachment generation"
|
|
|
|
log "Matrix case: PlasmaVMC + CoronaFS + LightningStor"
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop after first boot"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop after first boot"
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
|
|
node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to reach RUNNING after restart"
|
|
fi
|
|
sleep 2
|
|
done
|
|
if [[ "${node_id}" == "node04" ]]; then
|
|
peer_node="node05"
|
|
current_worker_coronafs_port=25088
|
|
peer_worker_coronafs_port=35088
|
|
else
|
|
peer_node="node04"
|
|
current_worker_coronafs_port=35088
|
|
peer_worker_coronafs_port=25088
|
|
fi
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${volume_id}"
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${data_volume_id}"
|
|
current_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${volume_id}")"
|
|
current_data_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${data_volume_id}")"
|
|
[[ -n "${current_volume_qemu_ref}" ]] || die "worker ${node_id} did not republish an attachable local ref for ${volume_id} after restart"
|
|
[[ -n "${current_data_volume_qemu_ref}" ]] || die "worker ${node_id} did not republish an attachable local ref for ${data_volume_id} after restart"
|
|
wait_for_qemu_volume_present "${node_id}" "${volume_path}" "${current_volume_qemu_ref}"
|
|
wait_for_qemu_volume_present "${node_id}" "${data_volume_path}" "${current_data_volume_qemu_ref}"
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable root volume ${volume_id} after restart"
|
|
fi
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable data volume ${data_volume_id} after restart"
|
|
fi
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=2"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=2"
|
|
wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM restart"
|
|
root_volume_state_json="$(try_get_volume_json "${token}" "${get_root_volume_json}")"
|
|
data_volume_state_json="$(try_get_volume_json "${token}" "${get_data_volume_json}")"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "root volume ${volume_id} drifted away from node ${node_id} after restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "data volume ${data_volume_id} drifted away from node ${node_id} after restart"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachmentGeneration // 0')" == "${root_attachment_generation}" ]] || die "root volume ${volume_id} attachment generation changed across same-node restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachmentGeneration // 0')" == "${data_attachment_generation}" ]] || die "data volume ${data_volume_id} attachment generation changed across same-node restart"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0')" == "${root_attachment_generation}" ]] || die "root volume ${volume_id} was not flushed before same-node restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0')" == "${data_attachment_generation}" ]] || die "data volume ${data_volume_id} was not flushed before same-node restart"
|
|
|
|
local migrate_vm_json
|
|
migrate_vm_json="$(
|
|
jq -cn \
|
|
--arg org "${org_id}" \
|
|
--arg project "${project_id}" \
|
|
--arg vm "${vm_id}" \
|
|
--arg destination_node "${peer_node}" \
|
|
'{
|
|
orgId:$org,
|
|
projectId:$project,
|
|
vmId:$vm,
|
|
destinationNodeId:$destination_node,
|
|
timeoutSeconds:120,
|
|
wait:true
|
|
}'
|
|
)"
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "${migrate_vm_json}" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/MigrateVm >/dev/null
|
|
|
|
local source_node="${node_id}"
|
|
local destination_node="${peer_node}"
|
|
local source_worker_coronafs_port="${current_worker_coronafs_port}"
|
|
local source_volume_qemu_ref="${current_volume_qemu_ref}"
|
|
local source_data_volume_qemu_ref="${current_data_volume_qemu_ref}"
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} migration to ${destination_node}"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e --arg node "${destination_node}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} migration to ${destination_node}"
|
|
fi
|
|
sleep 2
|
|
done
|
|
node_id="${destination_node}"
|
|
if [[ "${node_id}" == "node04" ]]; then
|
|
current_worker_coronafs_port=25088
|
|
peer_worker_coronafs_port=35088
|
|
else
|
|
current_worker_coronafs_port=35088
|
|
peer_worker_coronafs_port=25088
|
|
fi
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${volume_id}"
|
|
assert_coronafs_materialized_volume "${current_worker_coronafs_port}" "${data_volume_id}"
|
|
current_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${volume_id}")"
|
|
current_data_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${data_volume_id}")"
|
|
[[ -n "${current_volume_qemu_ref}" ]] || die "destination worker ${node_id} did not expose an attachable local ref for ${volume_id}"
|
|
[[ -n "${current_data_volume_qemu_ref}" ]] || die "destination worker ${node_id} did not expose an attachable local ref for ${data_volume_id}"
|
|
if coronafs_get_volume_json "${source_worker_coronafs_port}" "${volume_id}" >/dev/null 2>&1; then
|
|
die "source worker ${source_node} unexpectedly retained mutable root volume ${volume_id} after migration"
|
|
fi
|
|
if coronafs_get_volume_json "${source_worker_coronafs_port}" "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "source worker ${source_node} unexpectedly retained mutable data volume ${data_volume_id} after migration"
|
|
fi
|
|
ssh_node "${node_id}" "test -f ${volume_path}"
|
|
ssh_node "${node_id}" "test -f ${data_volume_path}"
|
|
wait_for_qemu_volume_present "${node_id}" "${volume_path}" "${current_volume_qemu_ref}"
|
|
wait_for_qemu_volume_present "${node_id}" "${data_volume_path}" "${current_data_volume_qemu_ref}"
|
|
wait_for_qemu_volume_absent "${source_node}" "${volume_path}" "${source_volume_qemu_ref}"
|
|
wait_for_qemu_volume_absent "${source_node}" "${data_volume_path}" "${source_data_volume_qemu_ref}"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_HEARTBEAT count=2"
|
|
root_volume_state_json="$(try_get_volume_json "${token}" "${get_root_volume_json}")"
|
|
data_volume_state_json="$(try_get_volume_json "${token}" "${get_data_volume_json}")"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "root volume ${volume_id} is not owned by migrated node ${node_id}"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "data volume ${data_volume_id} is not owned by migrated node ${node_id}"
|
|
local next_root_attachment_generation next_data_attachment_generation
|
|
next_root_attachment_generation="$(printf '%s' "${root_volume_state_json}" | jq -r '.attachmentGeneration // 0')"
|
|
next_data_attachment_generation="$(printf '%s' "${data_volume_state_json}" | jq -r '.attachmentGeneration // 0')"
|
|
(( next_root_attachment_generation > root_attachment_generation )) || die "root volume ${volume_id} attachment generation did not advance after migration"
|
|
(( next_data_attachment_generation > data_attachment_generation )) || die "data volume ${data_volume_id} attachment generation did not advance after migration"
|
|
(( $(printf '%s' "${root_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0') < next_root_attachment_generation )) || die "root volume ${volume_id} unexpectedly reported destination flush before post-migration stop"
|
|
(( $(printf '%s' "${data_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0') < next_data_attachment_generation )) || die "data volume ${data_volume_id} unexpectedly reported destination flush before post-migration stop"
|
|
root_attachment_generation="${next_root_attachment_generation}"
|
|
data_attachment_generation="${next_data_attachment_generation}"
|
|
wait_for_prismnet_port_binding "${token}" "${org_id}" "${project_id}" "${subnet_id}" "${port_id}" "${vm_id}" >/dev/null
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop after migration"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop after migration"
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StartVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e --arg node "${node_id}" '.nodeId == $node and .state == "VM_STATE_RUNNING" and .status.actualState == "VM_STATE_RUNNING"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to restart on migrated worker ${node_id}"
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
coronafs_get_volume_json "${current_worker_coronafs_port}" "${volume_id}" >/dev/null
|
|
coronafs_get_volume_json "${current_worker_coronafs_port}" "${data_volume_id}" >/dev/null
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable root volume ${volume_id} after post-migration restart"
|
|
fi
|
|
if coronafs_get_volume_json "${peer_worker_coronafs_port}" "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "peer worker ${peer_node} unexpectedly materialized mutable data volume ${data_volume_id} after post-migration restart"
|
|
fi
|
|
current_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${volume_id}")"
|
|
current_data_volume_qemu_ref="$(coronafs_volume_qemu_ref "${current_worker_coronafs_port}" "${data_volume_id}")"
|
|
[[ -n "${current_volume_qemu_ref}" ]] || die "worker ${node_id} did not republish an attachable local ref for ${volume_id} after post-migration restart"
|
|
[[ -n "${current_data_volume_qemu_ref}" ]] || die "worker ${node_id} did not republish an attachable local ref for ${data_volume_id} after post-migration restart"
|
|
wait_for_qemu_volume_present "${node_id}" "${volume_path}" "${current_volume_qemu_ref}"
|
|
wait_for_qemu_volume_present "${node_id}" "${data_volume_path}" "${current_data_volume_qemu_ref}"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_READY count=3"
|
|
wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_SMOKE_DATA_READY count=3"
|
|
wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM post-migration restart"
|
|
root_volume_state_json="$(try_get_volume_json "${token}" "${get_root_volume_json}")"
|
|
data_volume_state_json="$(try_get_volume_json "${token}" "${get_data_volume_json}")"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "root volume ${volume_id} drifted away from migrated node ${node_id} after restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachedToNode // empty')" == "${node_id}" ]] || die "data volume ${data_volume_id} drifted away from migrated node ${node_id} after restart"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.attachmentGeneration // 0')" == "${root_attachment_generation}" ]] || die "root volume ${volume_id} attachment generation changed across migrated-node restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.attachmentGeneration // 0')" == "${data_attachment_generation}" ]] || die "data volume ${data_volume_id} attachment generation changed across migrated-node restart"
|
|
[[ "$(printf '%s' "${root_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0')" == "${root_attachment_generation}" ]] || die "root volume ${volume_id} was not flushed before migrated-node restart"
|
|
[[ "$(printf '%s' "${data_volume_state_json}" | jq -r '.lastFlushedAttachmentGeneration // 0')" == "${data_attachment_generation}" ]] || die "data volume ${data_volume_id} was not flushed before migrated-node restart"
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false, timeoutSeconds:30}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/StopVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
local vm_json
|
|
if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" 2>/dev/null)"; then
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop"
|
|
fi
|
|
sleep 2
|
|
continue
|
|
fi
|
|
if printf '%s' "${vm_json}" | jq -e '.state == "VM_STATE_STOPPED" and .status.actualState == "VM_STATE_STOPPED"' >/dev/null; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} to stop"
|
|
fi
|
|
sleep 2
|
|
done
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm, force:false}')" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/DeleteVm >/dev/null
|
|
|
|
deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
|
|
while true; do
|
|
if ! grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "${get_vm_json}" \
|
|
127.0.0.1:15082 plasmavmc.v1.VmService/GetVm >/dev/null 2>&1; then
|
|
break
|
|
fi
|
|
if (( SECONDS >= deadline )); then
|
|
die "timed out waiting for VM ${vm_id} deletion to propagate"
|
|
fi
|
|
sleep 2
|
|
done
|
|
wait_for_plasmavmc_vm_watch_completion node01 "${vm_watch_output}" 60
|
|
assert_plasmavmc_vm_watch_events node01 "${vm_watch_output}" "${vm_id}"
|
|
wait_for_prismnet_port_detachment "${token}" "${org_id}" "${project_id}" "${subnet_id}" "${port_id}" >/dev/null
|
|
|
|
ssh_node "${node_id}" "bash -lc '[[ ! -d $(printf '%q' "$(vm_runtime_dir_path "${vm_id}")") ]]'"
|
|
ssh_node node01 "bash -lc '[[ ! -f ${volume_path} ]]'"
|
|
ssh_node node01 "bash -lc '[[ ! -f ${data_volume_path} ]]'"
|
|
if coronafs_get_volume_json 15088 "${volume_id}" >/dev/null 2>&1; then
|
|
die "CoronaFS root volume metadata still exists after VM deletion"
|
|
fi
|
|
if coronafs_get_volume_json 15088 "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "CoronaFS data volume metadata still exists after VM deletion"
|
|
fi
|
|
if coronafs_get_volume_json 25088 "${volume_id}" >/dev/null 2>&1; then
|
|
die "worker node04 retained mutable root volume metadata after VM deletion"
|
|
fi
|
|
if coronafs_get_volume_json 25088 "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "worker node04 retained mutable data volume metadata after VM deletion"
|
|
fi
|
|
if coronafs_get_volume_json 35088 "${volume_id}" >/dev/null 2>&1; then
|
|
die "worker node05 retained mutable root volume metadata after VM deletion"
|
|
fi
|
|
if coronafs_get_volume_json 35088 "${data_volume_id}" >/dev/null 2>&1; then
|
|
die "worker node05 retained mutable data volume metadata after VM deletion"
|
|
fi
|
|
wait_for_lightningstor_counts_equal "${image_after_node01}" "${image_after_node04}" "${image_after_node05}" "shared-fs VM deletion"
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
|
|
-proto "${LIGHTNINGSTOR_PROTO}" \
|
|
-d "${head_image_json}" \
|
|
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject \
|
|
| jq -e '(.object.size | tonumber) > 0' >/dev/null
|
|
if grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
|
|
-proto "${LIGHTNINGSTOR_PROTO}" \
|
|
-d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${volume_id}.raw" '{bucket:$bucket, key:$key}')" \
|
|
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
|
|
die "shared-fs VM volume unexpectedly persisted to LightningStor object storage"
|
|
fi
|
|
if grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
|
|
-proto "${LIGHTNINGSTOR_PROTO}" \
|
|
-d "$(jq -cn --arg bucket "plasmavmc-volumes" --arg key "${org_id}/${project_id}/${data_volume_id}.raw" '{bucket:$bucket, key:$key}')" \
|
|
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
|
|
die "shared-fs VM data volume unexpectedly persisted to LightningStor object storage"
|
|
fi
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PRISMNET_PROTO_DIR}" \
|
|
-proto "${PRISMNET_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
|
|
127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null
|
|
port_id=""
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PRISMNET_PROTO_DIR}" \
|
|
-proto "${PRISMNET_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
|
|
127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null
|
|
subnet_id=""
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PRISMNET_PROTO_DIR}" \
|
|
-proto "${PRISMNET_PROTO}" \
|
|
-d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
|
|
127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null
|
|
vpc_id=""
|
|
|
|
grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${PLASMAVMC_PROTO_DIR}" \
|
|
-proto "${PLASMAVMC_PROTO}" \
|
|
-d "${get_image_json}" \
|
|
127.0.0.1:15082 plasmavmc.v1.ImageService/DeleteImage >/dev/null
|
|
if grpcurl -plaintext \
|
|
-H "authorization: Bearer ${token}" \
|
|
-import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
|
|
-proto "${LIGHTNINGSTOR_PROTO}" \
|
|
-d "${head_image_json}" \
|
|
127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null 2>&1; then
|
|
die "image object still present after ImageService/DeleteImage"
|
|
fi
|
|
wait_for_lightningstor_counts_equal "${image_before_node01}" "${image_before_node04}" "${image_before_node05}" "PlasmaVMC image cleanup"
|
|
|
|
trap - RETURN
|
|
cleanup_vm_storage_flow
|
|
}
|
|
|
|
#######################################
# Validate the API Gateway node (node06): local health endpoints, the
# host-forwarded ports, and proxying of VPC/VM REST routes to the backends.
# Globals:   HTTP_WAIT_TIMEOUT (consumed by the wait_* helpers)
# Outputs:   progress via log; dies on any failed assertion
# Side effects: opens/closes an SSH tunnel to node01's IAM port; creates and
#               deletes a throwaway smoke VPC through the gateway proxy.
#######################################
validate_gateway() {
  wait_for_http node06 http://127.0.0.1:8080/health
  wait_for_http node06 http://127.0.0.1:9090/api/v1/series
  wait_for_tcp_port node06 50089
  wait_for_http node06 http://127.0.0.1:3011/health

  log "Validating host-forwarded gateway endpoints"
  wait_for_host_http http://127.0.0.1:8080/health
  wait_for_host_http http://127.0.0.1:9090/api/v1/series

  log "Validating API Gateway proxy routes"

  # Tunnel to IAM on node01 so a project-admin token can be minted locally.
  local iam_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  trap 'stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local org_id="gateway-smoke-org"
  local project_id="gateway-smoke-project"
  # Split declaration from assignment so a failing command substitution is
  # not masked by `local`'s own exit status (ShellCheck SC2155).
  local principal_id
  principal_id="gateway-smoke-$(date +%s)"
  local token vpc_json vpc_id
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  # Create a VPC through the gateway proxy; the response must carry an ID.
  vpc_json="$(
    curl -fsS \
      -X POST http://127.0.0.1:8080/api/v1/vpcs \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "$(jq -cn --arg name "gateway-smoke-vpc" --arg org "${org_id}" --arg project "${project_id}" \
        '{name:$name, org_id:$org, project_id:$project, cidr_block:"10.55.0.0/16", description:"gateway proxy smoke"}')"
  )"
  vpc_id="$(printf '%s' "${vpc_json}" | jq -r '.data.id')"
  [[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "API Gateway VPC create did not return an ID"

  # List route must include the freshly created VPC.
  curl -fsS --get http://127.0.0.1:8080/api/v1/vpcs \
    -H "Authorization: Bearer ${token}" \
    --data-urlencode "org_id=${org_id}" \
    --data-urlencode "project_id=${project_id}" \
    | jq -e --arg id "${vpc_id}" '.data.vpcs | any(.id == $id)' >/dev/null

  # Get-by-ID route must round-trip the same ID.
  curl -fsS http://127.0.0.1:8080/api/v1/vpcs/"${vpc_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg id "${vpc_id}" '.data.id == $id' >/dev/null

  # VM list route must proxy through (list may be empty, but never null).
  curl -fsS http://127.0.0.1:8080/api/v1/vms \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.vms != null' >/dev/null

  # Clean up the smoke VPC.
  curl -fsS -X DELETE http://127.0.0.1:8080/api/v1/vpcs/"${vpc_id}" \
    -H "Authorization: Bearer ${token}" >/dev/null

  trap - RETURN
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
#######################################
# Exercise NightLight's remote_write ingestion and query endpoints through an
# arbitrary API base URL (direct service or gateway proxy).
# Arguments:
#   $1 - base_url   API prefix to hit, e.g. http://127.0.0.1:9090/api/v1
#   $2 - flow_name  label used in log / timeout messages
# Outputs: dies on any failed assertion
#######################################
validate_nightlight_flow_with_base() {
  local base_url="$1"
  local flow_name="$2"

  log "Validating ${flow_name} remote_write ingestion and query endpoints"

  # Split declaration from assignment so a failing command substitution is
  # not masked by `local`'s own exit status (ShellCheck SC2155).
  local metric_name
  metric_name="nightlight_smoke_metric_$(date +%s)"
  local metric_value
  # Random value in (1, 101) with millidecimal precision, unique per run.
  metric_value="$(awk 'BEGIN{srand(); printf "%.3f\n", (rand()*100)+1}')"

  # Push one sample via the Prometheus remote_write shim.
  python3 "${REPO_ROOT}/nix/test-cluster/nightlight_remote_write.py" \
    --url "${base_url}/write" \
    --metric "${metric_name}" \
    --value "${metric_value}" \
    --label source=smoke \
    --label cluster=photoncloud

  # Ingestion is asynchronous; poll until the sample is queryable.
  wait_for_nightlight_query_result "${base_url}" "${flow_name}" "${metric_name}" "${metric_value}" "source=\"smoke\""

  # The metric name must be visible through label-values and series lookups.
  curl -fsS "${base_url}/label/__name__/values" \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | index($name)) != null' >/dev/null
  curl -fsS "${base_url}/series" \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.__name__ == $name))' >/dev/null
}
|
|
|
|
#######################################
# Poll NightLight's instant-query endpoint until the expected sample appears
# (matched within +/-0.001) or the HTTP wait budget runs out.
# Globals:   HTTP_WAIT_TIMEOUT (read)
# Arguments:
#   $1 - base_url         API prefix
#   $2 - flow_name        label used in the timeout message
#   $3 - metric_name      series __name__ to match
#   $4 - metric_value     expected sample value (numeric string)
#   $5 - selector_suffix  optional label selector body, e.g. source="smoke"
# Returns:   0 once the sample is visible; dies on timeout
#######################################
wait_for_nightlight_query_result() {
  local base_url="$1"
  local flow_name="$2"
  local metric_name="$3"
  local metric_value="$4"
  local selector_suffix="${5:-}"
  local give_up_at=$((SECONDS + HTTP_WAIT_TIMEOUT))

  while :; do
    # Success path: query answers and one result matches name + value band.
    curl -fsS --get "${base_url}/query" \
      --data-urlencode "query=${metric_name}{${selector_suffix}}" \
      | jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
        .status == "success"
        and (.data.result | length) >= 1
        and (.data.result | any(.metric.__name__ == $name and (.value[1] >= ($expected - 0.001)) and (.value[1] <= ($expected + 0.001))))
      ' >/dev/null 2>&1 && return 0
    (( SECONDS < give_up_at )) || die "timed out waiting for ${flow_name} query result for ${metric_name}"
    sleep 2
  done
}
|
|
|
|
# End-to-end NightLight validation: HTTP remote_write/query smoke against the
# host-forwarded API on 9090, followed by gRPC query/admin checks and a
# restart-persistence pass.
validate_nightlight_flow() {
  validate_nightlight_flow_with_base "http://127.0.0.1:9090/api/v1" "NightLight"
  validate_nightlight_grpc_and_persistence
}
|
|
|
|
# Matrix case: NightLight reached through the API Gateway's /api/v1/metrics
# proxy route instead of the direct service port. Waits for the gateway's
# host-forwarded health endpoint first so the proxy smoke does not race boot.
validate_apigateway_nightlight_flow() {
  log "Matrix case: NightLight + APIGateway"
  wait_for_host_http http://127.0.0.1:8080/health
  validate_nightlight_flow_with_base "http://127.0.0.1:8080/api/v1/metrics" "API Gateway -> NightLight"
}
|
|
|
|
#######################################
# Validate NightLight's gRPC query/admin surface and that an ingested sample
# survives a service restart.
# Flow: write one sample via remote_write -> confirm over HTTP and gRPC
# (InstantQuery/SeriesQuery/LabelValuesQuery) -> check Admin Health/Stats ->
# restart nightlight.service on node06 -> confirm the sample is still
# queryable over both HTTP and gRPC.
# Globals:   REPO_ROOT, NIGHTLIGHT_PROTO_DIR, NIGHTLIGHT_QUERY_PROTO,
#            NIGHTLIGHT_ADMIN_PROTO (read)
# Side effects: opens/closes an SSH tunnel to node06's gRPC port; restarts
#               nightlight.service on node06.
#######################################
validate_nightlight_grpc_and_persistence() {
  log "Validating NightLight gRPC query/admin APIs and restart persistence"

  local base_url="http://127.0.0.1:9090/api/v1"
  local grpc_tunnel=""
  # Split declaration from assignment so a failing command substitution is
  # not masked by `local`'s own exit status (ShellCheck SC2155).
  local metric_name
  metric_name="nightlight_persist_metric_$(date +%s)"
  local metric_value
  metric_value="$(awk 'BEGIN{srand(); printf "%.3f\n", (rand()*100)+1}')"

  grpc_tunnel="$(start_ssh_tunnel node06 15090 50088)"
  trap 'stop_ssh_tunnel node06 "${grpc_tunnel}"' RETURN

  # Ingest one uniquely named sample tagged source=grpc.
  python3 "${REPO_ROOT}/nix/test-cluster/nightlight_remote_write.py" \
    --url "${base_url}/write" \
    --metric "${metric_name}" \
    --value "${metric_value}" \
    --label source=grpc \
    --label cluster=photoncloud

  wait_for_nightlight_query_result "${base_url}" "NightLight persistence pre-restart" "${metric_name}" "${metric_value}" "source=\"grpc\""

  # gRPC InstantQuery must see the same sample (value matched within 0.001).
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_QUERY_PROTO}" \
    -d "$(jq -cn --arg query "${metric_name}{source=\"grpc\"}" '{query:$query, time:0, timeout:5000}')" \
    127.0.0.1:15090 nightlight.MetricQuery/InstantQuery \
    | jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
      .status == "success"
      and (.data.result | any(.metric.__name__ == $name and (.value.value >= ($expected - 0.001)) and (.value.value <= ($expected + 0.001))))
    ' >/dev/null

  # SeriesQuery must list the series by __name__ matcher.
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_QUERY_PROTO}" \
    -d "$(jq -cn --arg match "__name__=${metric_name}" '{match:[$match]}')" \
    127.0.0.1:15090 nightlight.MetricQuery/SeriesQuery \
    | jq -e --arg name "${metric_name}" '.status == "success" and (.data | any(.labels.__name__ == $name))' >/dev/null

  # LabelValuesQuery must report the 'grpc' value for the source label.
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_QUERY_PROTO}" \
    -d "$(jq -cn --arg label "source" --arg match "__name__=${metric_name}" '{labelName:$label, match:[$match]}')" \
    127.0.0.1:15090 nightlight.MetricQuery/LabelValuesQuery \
    | jq -e '.status == "success" and (.data | index("grpc")) != null' >/dev/null

  # Admin Health must be ok overall and for the storage component.
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_ADMIN_PROTO}" \
    -d '{}' \
    127.0.0.1:15090 nightlight.Admin/Health \
    | jq -e '.status == "ok" and (.components | any(.name == "storage" and .status == "ok"))' >/dev/null

  # Admin Stats must show non-zero storage/ingestion/query counters.
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_ADMIN_PROTO}" \
    -d '{}' \
    127.0.0.1:15090 nightlight.Admin/Stats \
    | jq -e '.storage.totalSamples >= 1 and .ingestion.samplesIngestedTotal >= 1 and .query.queriesTotal >= 1' >/dev/null

  # Restart the service and wait for HTTP + gRPC ports to come back.
  ssh_node node06 "systemctl restart nightlight.service"
  wait_for_host_http http://127.0.0.1:9090/healthz
  wait_for_tcp_port node06 50088

  # The pre-restart sample must still be queryable over HTTP...
  wait_for_nightlight_query_result "${base_url}" "NightLight persistence post-restart" "${metric_name}" "${metric_value}" "source=\"grpc\""

  # ...and over gRPC.
  grpcurl -plaintext \
    -import-path "${NIGHTLIGHT_PROTO_DIR}" \
    -proto "${NIGHTLIGHT_QUERY_PROTO}" \
    -d "$(jq -cn --arg query "${metric_name}{source=\"grpc\"}" '{query:$query, time:0, timeout:5000}')" \
    127.0.0.1:15090 nightlight.MetricQuery/InstantQuery \
    | jq -e --arg name "${metric_name}" --argjson expected "${metric_value}" '
      .status == "success"
      and (.data.result | any(.metric.__name__ == $name and (.value.value >= ($expected - 0.001)) and (.value.value <= ($expected + 0.001))))
    ' >/dev/null

  trap - RETURN
  stop_ssh_tunnel node06 "${grpc_tunnel}"
}
|
|
|
|
#######################################
# Exercise the CreditService REST wallet + reservation lifecycle through a
# given base URL (direct service or API Gateway proxy).
# Arguments:
#   $1 - base_url    REST prefix (e.g. http://127.0.0.1:13011/api/v1)
#   $2 - token       bearer token for the project
#   $3 - org_id      organization owning the wallet
#   $4 - project_id  project whose wallet is created/mutated
#   $5 - flow_name   label used in log/die messages
# Returns:   dies on any failed assertion
# NOTE: the balance assertions are strictly order-dependent:
#   create(1000) -> topup(+250)=1250 -> reserve(200) -> commit(150)=1100.
#######################################
validate_creditservice_rest_flow() {
  local base_url="$1"
  local token="$2"
  local org_id="$3"
  local project_id="$4"
  local flow_name="$5"
  local reservation_json reservation_id

  log "Validating ${flow_name} REST wallet and reservation flows"

  # Create a wallet with 1000 credits; balance and available must both start
  # at the initial amount.
  curl -fsS \
    -X POST "${base_url}/wallets" \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{org_id:$org, project_id:$project, initial_balance:1000}')" \
    | jq -e '.data.project_id != null and .data.balance == 1000 and .data.available == 1000' >/dev/null

  # Wallet lookup by project ID must return the same balance.
  curl -fsS "${base_url}/wallets/${project_id}" \
    -H "Authorization: Bearer ${token}" \
    | jq -e --arg project "${project_id}" '.data.project_id == $project and .data.balance == 1000' >/dev/null

  # Top up 250; per the assertion, total_deposited includes the initial 1000.
  curl -fsS \
    -X POST "${base_url}/wallets/${project_id}/topup" \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"amount":250,"description":"smoke topup"}' \
    | jq -e '.data.balance == 1250 and .data.total_deposited == 1250' >/dev/null

  # Reserve 200 credits for a VM with a short TTL; the response must carry
  # a reservation ID we can commit against.
  reservation_json="$(
    curl -fsS \
      -X POST "${base_url}/reservations" \
      -H "Authorization: Bearer ${token}" \
      -H 'Content-Type: application/json' \
      -d "$(jq -cn --arg project "${project_id}" '{project_id:$project, amount:200, description:"smoke reservation", resource_type:"vm", ttl_seconds:120}')"
  )"
  reservation_id="$(printf '%s' "${reservation_json}" | jq -r '.data.id')"
  [[ -n "${reservation_id}" && "${reservation_id}" != "null" ]] || die "${flow_name} reservation did not return an ID"

  # Commit at 150 (less than reserved): balance 1250-150=1100, nothing left
  # reserved, and the unspent reservation margin is released.
  curl -fsS \
    -X POST "${base_url}/reservations/${reservation_id}/commit" \
    -H "Authorization: Bearer ${token}" \
    -H 'Content-Type: application/json' \
    -d '{"actual_amount":150,"resource_id":"smoke-vm"}' \
    | jq -e '.data.balance == 1100 and .data.reserved == 0 and .data.available == 1100' >/dev/null

  # The ledger must record at least the deposit, top-up, and commit entries.
  curl -fsS "${base_url}/wallets/${project_id}/transactions" \
    -H "Authorization: Bearer ${token}" \
    | jq -e '.data.transactions | length >= 3' >/dev/null
}
|
|
|
|
#######################################
# Validate CreditService end-to-end: REST wallet/reservation flow through the
# service's own HTTP port, gRPC quota CRUD + enforcement, and confirm the
# service connected to IAM at startup.
# Globals:   CREDITSERVICE_PROTO_DIR, CREDITSERVICE_PROTO (read)
# Side effects: opens/closes SSH tunnels to node01 (IAM) and node06
#               (CreditService gRPC + HTTP).
#######################################
validate_creditservice_flow() {
  log "Validating CreditService REST and gRPC quota flows"

  local iam_tunnel="" credit_grpc_tunnel="" credit_http_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  credit_grpc_tunnel="$(start_ssh_tunnel node06 15089 50089)"
  credit_http_tunnel="$(start_ssh_tunnel node06 13011 3011)"
  trap 'stop_ssh_tunnel node06 "${credit_http_tunnel}"; stop_ssh_tunnel node06 "${credit_grpc_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  local suffix
  suffix="$(date +%s)"
  local org_id="credit-smoke-org-${suffix}"
  local project_id="credit-smoke-project-${suffix}"
  # Reuse the single timestamp suffix (the original called date(1) a second
  # time, which could straddle a second boundary) and split declaration from
  # assignment (ShellCheck SC2155).
  local principal_id
  principal_id="credit-smoke-${suffix}"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  # Wallet + reservation lifecycle over the tunneled HTTP port.
  validate_creditservice_rest_flow \
    "http://127.0.0.1:13011/api/v1" \
    "${token}" \
    "${org_id}" \
    "${project_id}" \
    "CreditService"

  # Quota CRUD over gRPC: set a VM-instance quota of 2...
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", limit:2}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/SetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null

  # ...read it back...
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE"}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/GetQuota \
    | jq -e '.quota.limit == "2" or .quota.limit == 2' >/dev/null

  # ...and confirm it appears in the project's quota list.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/ListQuotas \
    | jq -e '.quotas | length >= 1' >/dev/null

  # Enforcement: requesting 3 against a limit of 2 must be denied with the
  # full quota still reported as available.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${CREDITSERVICE_PROTO_DIR}" \
    -proto "${CREDITSERVICE_PROTO}" \
    -d "$(jq -cn --arg project "${project_id}" '{projectId:$project, resourceType:"RESOURCE_TYPE_VM_INSTANCE", quantity:3, estimatedCost:0}')" \
    127.0.0.1:15089 creditservice.v1.CreditService/CheckQuota \
    | jq -e '(.allowed // false) == false and (.availableQuota == "2" or .availableQuota == 2)' >/dev/null

  # The service must be active and its journal must show the IAM handshake.
  ssh_node_script node06 <<'EOS'
set -euo pipefail
systemctl is-active --quiet creditservice.service
journalctl -u creditservice.service --no-pager | grep -F 'Connecting to IAM server at' >/dev/null
EOS

  trap - RETURN
  stop_ssh_tunnel node06 "${credit_http_tunnel}"
  stop_ssh_tunnel node06 "${credit_grpc_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
#######################################
# Matrix case: the CreditService REST flow driven through the API Gateway's
# /api/v1/credits proxy route instead of the direct service port.
# Side effects: opens/closes an SSH tunnel to node01's IAM port.
#######################################
validate_apigateway_creditservice_flow() {
  log "Matrix case: CreditService + IAM + APIGateway"

  local iam_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  trap 'stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  wait_for_host_http http://127.0.0.1:8080/health

  local suffix
  suffix="$(date +%s)"
  local org_id="credit-gateway-org-${suffix}"
  local project_id="credit-gateway-project-${suffix}"
  # Reuse the single timestamp suffix (the original called date(1) a second
  # time, which could straddle a second boundary) and split declaration from
  # assignment (ShellCheck SC2155).
  local principal_id
  principal_id="credit-gateway-${suffix}"
  local token
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  validate_creditservice_rest_flow \
    "http://127.0.0.1:8080/api/v1/credits" \
    "${token}" \
    "${org_id}" \
    "${project_id}" \
    "API Gateway -> CreditService"

  trap - RETURN
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
#######################################
# Validate the Deployer on node06: health endpoint, admin node registration,
# admin node listing, and the bootstrap phone-home handshake.
# Side effects: opens/closes an SSH tunnel to node06's Deployer port;
#               registers a throwaway smoke machine/node record.
#######################################
validate_deployer_flow() {
  log "Validating Deployer health, admin registration, and phone-home flows"

  local fwd_tunnel=""
  fwd_tunnel="$(start_ssh_tunnel node06 13012 8088)"
  trap 'stop_ssh_tunnel node06 "${fwd_tunnel}"' RETURN

  # Health must answer on the node and through the tunnel (exact body "OK").
  wait_for_http node06 "http://127.0.0.1:8088/health"
  curl -fsS http://127.0.0.1:13012/health | grep -Fx 'OK' >/dev/null

  local smoke_machine smoke_node phone_home_body register_payload phone_payload
  smoke_machine="smoke-machine-$(date +%s)"
  smoke_node="smoke-node-$(date +%s)"

  # Register the node through the admin API using the admin token; the reply
  # must echo both identifiers back.
  register_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, role:"worker", ip:"10.100.0.250", services:["plasmavmc"], ssh_authorized_keys:["ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFiberLBSmokeKey smoke@test"]}')"
  curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-admin-token' \
    -d "${register_payload}" \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg machine "${smoke_machine}" --arg node "${smoke_node}" '.success == true and .machine_id == $machine and .node_id == $node' >/dev/null

  # The admin listing must now contain the node with the registered role/IP.
  curl -fsS \
    -H 'x-deployer-token: test-admin-token' \
    http://127.0.0.1:13012/api/v1/admin/nodes \
    | jq -e --arg node "${smoke_node}" '.nodes | any(.node_id == $node and .ip == "10.100.0.250" and .role == "worker")' >/dev/null

  # Phone home with the bootstrap token, as a newly provisioned machine would.
  phone_payload="$(jq -cn \
    --arg machine "${smoke_machine}" \
    --arg node "${smoke_node}" \
    '{machine_id:$machine, node_id:$node, hostname:$node, ip:"10.100.0.250", metadata:{rack:"smoke-a1"}}')"
  phone_home_body="$(curl -fsS \
    -H 'content-type: application/json' \
    -H 'x-deployer-token: test-bootstrap-token' \
    -d "${phone_payload}" \
    http://127.0.0.1:13012/api/v1/phone-home)"
  # The reply must hand back the node config seeded at registration time.
  printf '%s' "${phone_home_body}" | jq -e --arg node "${smoke_node}" '
    .success == true and
    .node_id == $node and
    .state == "provisioning" and
    .node_config.hostname == $node and
    .node_config.role == "worker" and
    (.node_config.services | index("plasmavmc")) != null
  ' >/dev/null

  trap - RETURN
  stop_ssh_tunnel node06 "${fwd_tunnel}"
}
|
|
|
|
validate_native_runtime_flow() {
  # End-to-end test of the native (non-VM) runtime: the deployer/scheduler
  # must place three declaratively seeded services (native-web,
  # native-container, native-daemon) on worker nodes node04/node05, publish
  # them through DNS and FiberLB, and survive drain, restore, node loss, and
  # node recovery while converging back to the declared replica counts.
  log "Validating native deployer + scheduler runtime orchestration from declarative Nix seed"

  wait_for_unit node04 node-agent
  wait_for_unit node05 node-agent
  wait_for_unit node06 fleet-scheduler
  wait_for_http node06 "http://127.0.0.1:8088/health"

  # Host-side tunnels to the per-node chainfire stores (guest port 2379) and,
  # later, IAM/FiberLB on node01. All are torn down by the RETURN trap.
  local chainfire_tunnel_node01="" chainfire_tunnel_node02="" chainfire_tunnel_node03=""
  local chainfire_endpoint="http://127.0.0.1:12379,http://127.0.0.1:12380,http://127.0.0.1:12381"
  local iam_tunnel="" lb_tunnel="" token lb_name
  local native_fresh_healthy_map_expr native_fresh_healthy_count_expr
  # jq filter: keep instances whose state is "healthy" AND whose
  # last_heartbeat/observed_at timestamp — normalized by stripping fractional
  # seconds and rewriting a trailing "+00:00" to "Z" — parses as ISO-8601 and
  # is no older than 300 s. This guards against counting stale records.
  native_fresh_healthy_map_expr='map(select(.state == "healthy" and (((((.last_heartbeat // .observed_at) // "") | sub("\\.[0-9]+"; "") | sub("\\+00:00$"; "Z") | fromdateiso8601?) // 0) >= (now - 300))))'
  native_fresh_healthy_count_expr="${native_fresh_healthy_map_expr} | length"
  chainfire_tunnel_node01="$(start_ssh_tunnel node01 12379 2379 "${NODE_IPS[node01]}")"
  chainfire_tunnel_node02="$(start_ssh_tunnel node02 12380 2379 "${NODE_IPS[node02]}")"
  chainfire_tunnel_node03="$(start_ssh_tunnel node03 12381 2379 "${NODE_IPS[node03]}")"
  trap 'stop_ssh_tunnel node01 "${lb_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"; stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"; stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"; stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"' RETURN

  # Dump all values under a chainfire key prefix, one JSON document per line.
  # (Nested helper: closes over chainfire_endpoint above.)
  native_dump_values() {
    local prefix="$1"
    run_deployer_ctl \
      --chainfire-endpoint "${chainfire_endpoint}" \
      --cluster-id "test-cluster" \
      --cluster-namespace "photoncloud" \
      --deployer-namespace "deployer" \
      dump --prefix "${prefix}" --format json \
      | jq -rc '.value'
  }

  # Poll a key prefix until the given jq expression (applied to the slurped
  # array of values) equals the expected string, or die on timeout.
  wait_for_native_dump_count() {
    local prefix="$1"
    local jq_expr="$2"
    local expected="$3"
    local timeout="${4:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))

    while true; do
      local count
      # A dump/jq failure yields '0' instead of aborting, so the poll retries.
      count="$(
        native_dump_values "${prefix}" \
          | sed '/^$/d' \
          | jq -sr "${jq_expr}" 2>/dev/null \
          || printf '0'
      )"
      if [[ "${count}" == "${expected}" ]]; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for prefix ${prefix} to satisfy ${jq_expr} == ${expected} (got ${count})"
      fi
      sleep 2
    done
  }

  # Print the lexicographically-first fresh healthy instance record for a
  # service (JSON on stdout; "null" if none).
  native_first_healthy_instance() {
    local service="$1"
    native_dump_values "photoncloud/clusters/test-cluster/instances/${service}/" \
      | sed '/^$/d' \
      | jq -sr "${native_fresh_healthy_map_expr} | sort_by(.instance_id) | first"
  }

  # Poll until a fresh healthy instance of the service is reported on the
  # expected node; prints that instance record and returns 0, or dies.
  wait_for_native_instance_node() {
    local service="$1"
    local expected_node="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))
    local instance_value="" node_id=""

    while true; do
      instance_value="$(
        native_dump_values "photoncloud/clusters/test-cluster/instances/${service}/" \
          | sed '/^$/d' \
          | jq -sr \
            --arg node "${expected_node}" \
            "${native_fresh_healthy_map_expr} | map(select(.node_id == \$node)) | sort_by(.instance_id) | first"
      )"
      node_id="$(printf '%s' "${instance_value}" | jq -r '.node_id // empty')"
      if [[ "${node_id}" == "${expected_node}" ]]; then
        printf '%s' "${instance_value}"
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for ${service} to run on ${expected_node}"
      fi
      sleep 2
    done
  }

  # Print the publication record (DNS + LB wiring) for a service, or "null".
  native_publication_state() {
    local service="$1"
    native_dump_values "photoncloud/clusters/test-cluster/publications/" \
      | sed '/^$/d' \
      | jq -sr --arg service "${service}" 'map(select(.service == $service)) | first'
  }

  # Poll the cluster DNS (port 5353 on node01) until the fqdn's A records
  # include the expected IP.
  wait_for_native_dns_record() {
    local fqdn="$1"
    local expected_ip="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    local deadline=$((SECONDS + timeout))

    while true; do
      if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A | grep -Fx '${expected_ip}'" >/dev/null 2>&1; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native DNS record for ${fqdn}"
      fi
      sleep 2
    done
  }

  # Poll until the fqdn's A record SET equals exactly the given IPs
  # (order-insensitive; both sides normalized to sorted JSON arrays).
  wait_for_native_dns_records() {
    local fqdn="$1"
    local timeout="$2"
    shift 2
    local expected_json actual_json
    local deadline=$((SECONDS + timeout))

    expected_json="$(printf '%s\n' "$@" | sed '/^$/d' | sort -u | jq -R . | jq -cs 'sort')"

    while true; do
      actual_json="$(
        ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${fqdn} A" 2>/dev/null \
          | sed '/^$/d' \
          | sort -u \
          | jq -R . \
          | jq -cs 'sort'
      )" || actual_json="[]"
      if [[ "${actual_json}" == "${expected_json}" ]]; then
        return 0
      fi
      if (( SECONDS >= deadline )); then
        die "timed out waiting for native DNS record set for ${fqdn}: expected ${expected_json}, got ${actual_json}"
      fi
      sleep 2
    done
  }

  # Poll FiberLB (via the tunnel on 15085, using the token set later in this
  # function) until the pool has exactly expected_count backends AND every
  # remaining positional argument appears as a backend address.
  wait_for_native_lb_backends() {
    local pool_id="$1"
    local expected_count="$2"
    local timeout="${3:-${HTTP_WAIT_TIMEOUT}}"
    shift 3
    local deadline=$((SECONDS + timeout))
    local response=""

    while true; do
      response="$(
        grpcurl_capture -plaintext \
          -H "authorization: Bearer ${token}" \
          -import-path "${FIBERLB_PROTO_DIR}" \
          -proto "${FIBERLB_PROTO}" \
          -d "$(jq -cn --arg pool_id "${pool_id}" '{poolId:$pool_id, pageSize:100, pageToken:""}')" \
          127.0.0.1:15085 fiberlb.v1.BackendService/ListBackends
      )" || true

      if printf '%s' "${response}" \
        | jq -e --argjson expected "${expected_count}" '(.backends | length) == $expected' >/dev/null 2>&1; then
        local matched=1
        local expected_ip
        for expected_ip in "$@"; do
          if ! printf '%s' "${response}" | jq -e --arg ip "${expected_ip}" '.backends | any(.address == $ip)' >/dev/null 2>&1; then
            matched=0
            break
          fi
        done
        if [[ "${matched}" == "1" ]]; then
          return 0
        fi
      fi

      if (( SECONDS >= deadline )); then
        die "timed out waiting for native FiberLB backends for pool ${pool_id}: ${response}"
      fi
      sleep 2
    done
  }

  # Flip a node's deployer lifecycle state (e.g. draining/active).
  set_native_node_state() {
    local node_id="$1"
    local state="$2"
    run_deployer_ctl \
      --chainfire-endpoint "${chainfire_endpoint}" \
      --cluster-id "test-cluster" \
      --cluster-namespace "photoncloud" \
      --deployer-namespace "deployer" \
      node set-state --node-id "${node_id}" --state "${state}"
  }

  # --- Phase 1: baseline convergence from the declarative seed. ---
  # Three services registered; two active native nodes; declared replica
  # counts: native-web=2, native-container=1, native-daemon=2 (daemon runs
  # one instance per native node).
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/services/" \
    'map(select(.name == "native-web" or .name == "native-container" or .name == "native-daemon")) | length' \
    "3" \
    180
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.labels.runtime == "native" and .state == "active")) | length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    360
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    360
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    'length' \
    "2" \
    300
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    300

  # HTTP reachability of the service endpoints on both workers.
  # NOTE(review): ports 18190/18191/18192/18193 are presumably seeded by the
  # cluster's declarative service config — confirm against the Nix seed.
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  wait_for_http node04 "http://10.100.0.21:18193/" 240
  wait_for_http node05 "http://10.100.0.22:18193/" 240
  # The container workload is a singleton: locate whichever node got it and
  # probe the ip:port it advertised.
  local container_value container_node container_ip container_port
  container_value="$(native_first_healthy_instance "native-container")"
  container_node="$(printf '%s' "${container_value}" | jq -r '.node_id')"
  container_ip="$(printf '%s' "${container_value}" | jq -r '.ip')"
  container_port="$(printf '%s' "${container_value}" | jq -r '.port')"
  [[ -n "${container_node}" && "${container_node}" != "null" ]] || die "native-container did not report a healthy instance"
  wait_for_http "${container_node}" "http://${container_ip}:${container_port}/" 360
  wait_for_http node01 "http://127.0.0.1:18191/" 240

  # native-web and native-daemon must be published (DNS + LB records).
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/publications/" \
    'map(select(.service == "native-web" or .service == "native-daemon")) | length' \
    "2" \
    180

  # Open IAM + FiberLB tunnels now that publications exist.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"

  token="$(issue_project_admin_token 15080 "native-services" "test-cluster" "native-runtime-$(date +%s)")"
  lb_name="test-cluster-native-web"
  # The scheduler must have created a load balancer named for the web service.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" \
    -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn '{orgId:"native-services", projectId:"test-cluster", pageSize:100, pageToken:""}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/ListLoadBalancers \
    | jq -e --arg name "${lb_name}" '.loadbalancers | any(.name == $name)' >/dev/null

  # Pull the publication records and assert the DNS/LB fields are populated.
  local publication_value publication_fqdn publication_ip publication_pool_id
  local daemon_publication_value daemon_publication_fqdn
  publication_value="$(native_publication_state "native-web")"
  publication_fqdn="$(printf '%s' "${publication_value}" | jq -r '.dns.fqdn')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.values[0]')"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  daemon_publication_value="$(native_publication_state "native-daemon")"
  daemon_publication_fqdn="$(printf '%s' "${daemon_publication_value}" | jq -r '.dns.fqdn')"
  [[ -n "${publication_fqdn}" && "${publication_fqdn}" != "null" ]] || die "native-web publication missing fqdn"
  [[ -n "${publication_ip}" && "${publication_ip}" != "null" ]] || die "native-web publication missing dns value"
  [[ -n "${publication_pool_id}" && "${publication_pool_id}" != "null" ]] || die "native-web publication missing pool id"
  [[ -n "${daemon_publication_fqdn}" && "${daemon_publication_fqdn}" != "null" ]] || die "native-daemon publication missing fqdn"

  # Baseline: daemon resolves to both workers, web pool has both backends.
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_dns_records "${daemon_publication_fqdn}" 180 10.100.0.21 10.100.0.22
  wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22

  # --- Phase 2: drain node04; everything must consolidate onto node05. ---
  log "Draining node04 through deployer lifecycle state"
  set_native_node_state "node04" "draining"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.node_id == "node04" and .state == "draining")) | length' \
    "1" \
    120
  # With one schedulable node, native-web drops to one replica (anti-affinity
  # presumably prevents doubling up — TODO confirm against scheduler policy).
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    'length' \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  local drained_web_value drained_web_node drained_container_value drained_container_node
  drained_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
  drained_web_node="$(printf '%s' "${drained_web_value}" | jq -r '.node_id')"
  [[ "${drained_web_node}" == "node05" ]] || die "native-web did not relocate to node05 after draining node04"
  drained_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  drained_container_node="$(printf '%s' "${drained_container_value}" | jq -r '.node_id')"
  [[ "${drained_container_node}" == "node05" ]] || die "native-container did not relocate to node05 after draining node04"
  wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
  wait_for_http node05 "http://10.100.0.22:18190/" 240
  wait_for_http node05 "http://10.100.0.22:18192/" 240
  wait_for_http node05 "http://10.100.0.22:18193/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240
  # Re-read publications (pool id / ip may have been rewritten) and confirm
  # DNS + LB now point only at node05.
  publication_value="$(native_publication_state "native-web")"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.values[0]')"
  daemon_publication_value="$(native_publication_state "native-daemon")"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 180 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_dns_records "${daemon_publication_fqdn}" 180 10.100.0.22

  # --- Phase 3: restore node04; capacity returns, but the healthy singleton
  # container must NOT be churned back. ---
  log "Restoring node04 and ensuring capacity returns without moving healthy singleton work"
  set_native_node_state "node04" "active"
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.node_id == "node04" and .state == "active")) | length' \
    "1" \
    120
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    'length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    240
  wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
  wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
  wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
  wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
  local restored_container_value restored_container_node
  restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
  restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')"
  [[ "${restored_container_node}" == "node05" ]] || die "native-container unexpectedly moved after node04 returned to service"
  publication_value="$(native_publication_state "native-web")"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.values[0]')"
  daemon_publication_value="$(native_publication_state "native-daemon")"
  wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_dns_records "${daemon_publication_fqdn}" 180 10.100.0.21 10.100.0.22
  wait_for_http node01 "http://127.0.0.1:18191/" 240

  # --- Phase 4: hard-stop node05; scheduler must fail work over to node04. ---
  log "Simulating native worker loss and scheduler failover"
  stop_vm node05
  wait_for_ssh_down node05 120

  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240

  local failover_web_value failover_web_node failover_container_value failover_container_node
  failover_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
  failover_web_node="$(printf '%s' "${failover_web_value}" | jq -r '.node_id')"
  [[ "${failover_web_node}" == "node04" ]] || die "native-web did not fail over to node04 after node05 stopped"
  failover_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  failover_container_node="$(printf '%s' "${failover_container_value}" | jq -r '.node_id')"
  [[ "${failover_container_node}" == "node04" ]] || die "native-container did not fail over to node04 after node05 stopped"
  wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
  publication_value="$(native_publication_state "native-web")"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.values[0]')"
  daemon_publication_value="$(native_publication_state "native-daemon")"
  wait_for_native_lb_backends "${publication_pool_id}" "1" 240 10.100.0.21
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_dns_records "${daemon_publication_fqdn}" 180 10.100.0.21
  wait_for_http node04 "http://10.100.0.21:18190/" 240
  wait_for_http node04 "http://10.100.0.21:18192/" 240
  wait_for_http node04 "http://10.100.0.21:18193/" 240
  wait_for_http node01 "http://127.0.0.1:18191/" 240

  # --- Phase 5: bring node05 back; declared replica counts must be restored
  # and the failed-over singleton must stay put on node04. ---
  log "Restarting native worker and ensuring declarative replica count is restored"
  start_vm node05
  wait_for_ssh node05
  wait_for_unit node05 plasmavmc
  wait_for_unit node05 lightningstor
  wait_for_unit node05 node-agent
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/nodes/" \
    'map(select(.labels.runtime == "native" and .state == "active")) | length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    'length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-web/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-container/" \
    "${native_fresh_healthy_count_expr}" \
    "1" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    'length' \
    "2" \
    240
  wait_for_native_dump_count \
    "photoncloud/clusters/test-cluster/instances/native-daemon/" \
    "${native_fresh_healthy_count_expr}" \
    "2" \
    240
  wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
  wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
  wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
  wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
  local recovered_container_value recovered_container_node
  recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
  recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')"
  [[ "${recovered_container_node}" == "node04" ]] || die "native-container unexpectedly churned after node05 recovered"
  publication_value="$(native_publication_state "native-web")"
  publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
  publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.values[0]')"
  daemon_publication_value="$(native_publication_state "native-daemon")"
  wait_for_native_lb_backends "${publication_pool_id}" "2" 180 10.100.0.21 10.100.0.22
  wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
  wait_for_native_dns_records "${daemon_publication_fqdn}" 180 10.100.0.21 10.100.0.22
  wait_for_http node01 "http://127.0.0.1:18191/" 240

  # Success path: clear the trap, then tear down each tunnel exactly once.
  trap - RETURN
  stop_ssh_tunnel node01 "${lb_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
  stop_ssh_tunnel node01 "${chainfire_tunnel_node01}"
  stop_ssh_tunnel node02 "${chainfire_tunnel_node02}"
  stop_ssh_tunnel node03 "${chainfire_tunnel_node03}"
}
|
|
|
|
validate_network_provider_matrix() {
  # Composed-tenant scenario: provision PrismNet network primitives, layer a
  # FlashDNS zone/records on top, then a FiberLB stack, and finally verify
  # all three together by curling a DNS name through the LB listener.
  log "Validating component matrix: PrismNet, FlashDNS, and FiberLB in composed tenant scenarios"

  # Host-side tunnels to the control-plane gRPC services on node01:
  # IAM (50080), PrismNet (50081), FlashDNS (50084), FiberLB (50085).
  local iam_tunnel="" prism_tunnel="" dns_tunnel="" lb_tunnel=""
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  prism_tunnel="$(start_ssh_tunnel node01 15081 50081)"
  dns_tunnel="$(start_ssh_tunnel node01 15084 50084)"
  lb_tunnel="$(start_ssh_tunnel node01 15085 50085)"

  local org_id="matrix-net-org"
  local project_id="matrix-net-project"
  local principal_id="matrix-net-$(date +%s)"
  local token=""
  # Resource IDs start empty so the cleanup handler can tell which resources
  # were actually created before any mid-flow failure.
  local vpc_id="" subnet_id="" port_id="" port_ip=""
  local zone_id="" zone_name="matrix-$(date +%s).cluster.test"
  local workload_record_id="" service_record_id=""
  local lb_id="" pool_id="" backend_id="" listener_id="" listener_port=""
  local workload_fqdn="" service_fqdn=""

  # Best-effort teardown in reverse creation order (records, listener,
  # backend, pool, LB, port, subnet, VPC, zone), then the tunnels. Every
  # step is `|| true` so cleanup never aborts partway.
  cleanup_network_provider_matrix() {
    if [[ -n "${service_record_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${service_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${workload_record_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${workload_record_id}" '{id:$id}')" \
        127.0.0.1:15084 flashdns.v1.RecordService/DeleteRecord >/dev/null 2>&1 || true
    fi
    if [[ -n "${listener_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${listener_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.ListenerService/DeleteListener >/dev/null 2>&1 || true
    fi
    if [[ -n "${backend_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${backend_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.BackendService/DeleteBackend >/dev/null 2>&1 || true
    fi
    if [[ -n "${pool_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${pool_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.PoolService/DeletePool >/dev/null 2>&1 || true
    fi
    if [[ -n "${lb_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
        -d "$(jq -cn --arg id "${lb_id}" '{id:$id}')" \
        127.0.0.1:15085 fiberlb.v1.LoadBalancerService/DeleteLoadBalancer >/dev/null 2>&1 || true
    fi
    if [[ -n "${port_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" --arg id "${port_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, id:$id}')" \
        127.0.0.1:15081 prismnet.PortService/DeletePort >/dev/null 2>&1 || true
    fi
    if [[ -n "${subnet_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vpc "${vpc_id}" --arg id "${subnet_id}" '{orgId:$org, projectId:$project, vpcId:$vpc, id:$id}')" \
        127.0.0.1:15081 prismnet.SubnetService/DeleteSubnet >/dev/null 2>&1 || true
    fi
    if [[ -n "${vpc_id:-}" ]]; then
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
        -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg id "${vpc_id}" '{orgId:$org, projectId:$project, id:$id}')" \
        127.0.0.1:15081 prismnet.VpcService/DeleteVpc >/dev/null 2>&1 || true
    fi
    if [[ -n "${zone_id:-}" ]]; then
      # force:true deletes the zone even if stray records remain.
      grpcurl -plaintext -H "authorization: Bearer ${token}" \
        -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
        -d "$(jq -cn --arg id "${zone_id}" '{id:$id, force:true}')" \
        127.0.0.1:15084 flashdns.v1.ZoneService/DeleteZone >/dev/null 2>&1 || true
    fi

    stop_ssh_tunnel node01 "${lb_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${dns_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${prism_tunnel}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel}" >/dev/null 2>&1 || true
  }
  trap cleanup_network_provider_matrix RETURN

  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"

  # Case 1: PrismNet alone — VPC, subnet, and an auto-assigned port.
  log "Matrix case: PrismNet only"
  vpc_id="$(create_prismnet_vpc_with_retry \
    "${token}" \
    "${org_id}" \
    "${project_id}" \
    "matrix-vpc" \
    "component matrix" \
    "10.52.0.0/16" \
    240 | jq -r '.vpc.id')"
  [[ -n "${vpc_id}" && "${vpc_id}" != "null" ]] || die "component matrix PrismNet VPC creation failed"

  subnet_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg vpc "${vpc_id}" '{vpcId:$vpc, name:"matrix-subnet", description:"component matrix", cidrBlock:"10.52.10.0/24", gatewayIp:"10.52.10.1", dhcpEnabled:true}')" \
    127.0.0.1:15081 prismnet.SubnetService/CreateSubnet | jq -r '.subnet.id')"
  [[ -n "${subnet_id}" && "${subnet_id}" != "null" ]] || die "component matrix PrismNet subnet creation failed"

  # Empty ipAddress lets PrismNet pick the port IP; capture both id and ip.
  local port_response
  port_response="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${PRISMNET_PROTO_DIR}" -proto "${PRISMNET_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg subnet "${subnet_id}" '{orgId:$org, projectId:$project, subnetId:$subnet, name:"matrix-port", description:"component matrix", ipAddress:""}')" \
    127.0.0.1:15081 prismnet.PortService/CreatePort)"
  port_id="$(printf '%s' "${port_response}" | jq -r '.port.id')"
  port_ip="$(printf '%s' "${port_response}" | jq -r '.port.ipAddress')"
  [[ -n "${port_id}" && "${port_id}" != "null" && -n "${port_ip}" && "${port_ip}" != "null" ]] || die "component matrix PrismNet port creation failed"

  # Case 2: PrismNet + FlashDNS — a zone plus an A record for the port IP.
  log "Matrix case: PrismNet + FlashDNS"
  zone_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg name "${zone_name}" --arg org "${org_id}" --arg project "${project_id}" '{name:$name, orgId:$org, projectId:$project, primaryNs:"ns1.matrix.test", adminEmail:"admin@matrix.test"}')" \
    127.0.0.1:15084 flashdns.v1.ZoneService/CreateZone | jq -r '.zone.id')"
  [[ -n "${zone_id}" && "${zone_id}" != "null" ]] || die "component matrix FlashDNS zone creation failed"

  workload_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" --arg address "${port_ip}" '{zoneId:$zone, name:"workload", recordType:"A", ttl:60, data:{a:{address:$address}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${workload_record_id}" && "${workload_record_id}" != "null" ]] || die "component matrix FlashDNS workload record creation failed"

  # Poll the resolver on node01 (port 5353) until the record is served.
  workload_fqdn="workload.${zone_name}"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${workload_fqdn} A | grep -Fx '${port_ip}'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${workload_fqdn}"
    fi
    sleep 2
  done

  # Case 3: PrismNet + FiberLB — LB, pool, backend (10.100.0.11:8081), and a
  # TCP listener on a dynamically allocated free port.
  log "Matrix case: PrismNet + FiberLB"
  listener_port="$(allocate_free_listener_port node01 18180 18999)" || die "failed to allocate a free FiberLB listener port for component matrix"
  lb_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" '{name:"matrix-lb", orgId:$org, projectId:$project, description:"component matrix"}')" \
    127.0.0.1:15085 fiberlb.v1.LoadBalancerService/CreateLoadBalancer | jq -r '.loadbalancer.id')"
  [[ -n "${lb_id}" && "${lb_id}" != "null" ]] || die "component matrix FiberLB creation failed"

  pool_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" '{name:"matrix-pool", loadbalancerId:$lb, algorithm:"POOL_ALGORITHM_ROUND_ROBIN", protocol:"POOL_PROTOCOL_TCP"}')" \
    127.0.0.1:15085 fiberlb.v1.PoolService/CreatePool | jq -r '.pool.id')"
  [[ -n "${pool_id}" && "${pool_id}" != "null" ]] || die "component matrix FiberLB pool creation failed"

  backend_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg pool "${pool_id}" '{name:"matrix-backend", poolId:$pool, address:"10.100.0.11", port:8081, weight:1}')" \
    127.0.0.1:15085 fiberlb.v1.BackendService/CreateBackend | jq -r '.backend.id')"
  [[ -n "${backend_id}" && "${backend_id}" != "null" ]] || die "component matrix FiberLB backend creation failed"

  listener_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FIBERLB_PROTO_DIR}" -proto "${FIBERLB_PROTO}" \
    -d "$(jq -cn --arg lb "${lb_id}" --arg pool "${pool_id}" --argjson port "${listener_port}" '{name:"matrix-listener", loadbalancerId:$lb, protocol:"LISTENER_PROTOCOL_TCP", port:$port, defaultPoolId:$pool, connectionLimit:0}')" \
    127.0.0.1:15085 fiberlb.v1.ListenerService/CreateListener | jq -r '.listener.id')"
  [[ -n "${listener_id}" && "${listener_id}" != "null" ]] || die "component matrix FiberLB listener creation failed"
  # The listener should proxy the backend's /health endpoint.
  wait_for_http node01 "http://127.0.0.1:${listener_port}/health"

  # Case 4: all three together — a "service" A record pointing at the LB
  # backend host, resolved via FlashDNS and fetched through the listener.
  log "Matrix case: PrismNet + FlashDNS + FiberLB"
  service_record_id="$(grpcurl -plaintext -H "authorization: Bearer ${token}" \
    -import-path "${FLASHDNS_PROTO_DIR}" -proto "${FLASHDNS_PROTO}" \
    -d "$(jq -cn --arg zone "${zone_id}" '{zoneId:$zone, name:"service", recordType:"A", ttl:60, data:{a:{address:"10.100.0.11"}}}')" \
    127.0.0.1:15084 flashdns.v1.RecordService/CreateRecord | jq -r '.record.id')"
  [[ -n "${service_record_id}" && "${service_record_id}" != "null" ]] || die "component matrix FlashDNS service record creation failed"

  service_fqdn="service.${zone_name}"
  deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    if ssh_node node01 "dig @127.0.0.1 -p 5353 +short ${service_fqdn} A | grep -Fx '10.100.0.11'" >/dev/null 2>&1; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for FlashDNS answer for ${service_fqdn}"
    fi
    sleep 2
  done

  # curl --resolve pins the fqdn to the listener host so the request exercises
  # the DNS name AND the FiberLB listener in one hop.
  ssh_node node01 "curl -fsS --max-time 5 --resolve ${service_fqdn}:${listener_port}:10.100.0.11 http://${service_fqdn}:${listener_port}/health >/dev/null"

  # Success path: clear the trap, then run cleanup exactly once.
  trap - RETURN
  cleanup_network_provider_matrix
}
|
|
|
|
# Run the composed-configuration ("component matrix") validation suite.
# Each phase is a sibling validate_* function; they run in the listed order
# and abort the whole run themselves (die / set -e) on failure.
validate_component_matrix() {
  local phase
  for phase in \
    validate_control_plane \
    validate_iam_flow \
    validate_network_provider_matrix \
    validate_vm_storage_flow \
    validate_k8shost_flow \
    validate_gateway \
    validate_nightlight_flow \
    validate_apigateway_nightlight_flow \
    validate_creditservice_flow \
    validate_apigateway_creditservice_flow \
    validate_deployer_flow \
    validate_native_runtime_flow; do
    "${phase}"
  done
  log "Component matrix validation succeeded"
}
|
|
|
|
# Benchmark CoronaFS volume throughput on the test cluster.
#
# Compares three data paths on worker node04 using fio (plus one cross-node
# read from node05):
#   1. local worker disk (baseline, plain-file fio via run_remote_fio_json),
#   2. the controller-hosted export (NBD URI from node01's CoronaFS API),
#   3. a node-local materialized copy of the same volume.
# Also times how long materializing the controller volume onto node04 and
# node05 takes.
#
# Globals read: CORONAFS_API_PORT.
# Outputs: human-readable `log` lines plus one tab-separated metrics row on
#   stdout (consumed by the caller; keep field order stable).
# Cleanup: volumes and SSH tunnels are torn down via a RETURN trap so a
#   failure mid-benchmark still releases resources; on success the trap is
#   cleared and cleanup runs explicitly.
benchmark_coronafs_performance() {
  log "Benchmarking CoronaFS controller-export and node-local volume throughput against local worker disk"

  # Fix: local_rand_depth_json was assigned below but never declared local,
  # leaking into the global scope unlike every sibling result variable.
  local local_write_json local_read_json local_rand_json local_rand_depth_json
  local coronafs_controller_write_json coronafs_controller_read_json coronafs_controller_rand_json
  local local_depth_write_json local_depth_read_json
  local coronafs_controller_depth_write_json coronafs_controller_depth_read_json
  local coronafs_local_write_json coronafs_local_read_json coronafs_local_rand_json
  local coronafs_local_depth_write_json coronafs_local_depth_read_json
  local coronafs_target_local_read_json
  local coronafs_controller_tunnel="" coronafs_node04_tunnel="" coronafs_node05_tunnel=""
  # Timestamp suffix keeps volume names unique across repeated runs.
  local bench_volume="coronafs-bench-$(date +%s)"
  local node04_local_volume="${bench_volume}-node04-local"
  local node05_local_volume="${bench_volume}-node05-local"
  local coronafs_export_json coronafs_uri node04_export_json node04_local_uri node05_export_json node05_local_uri
  local node04_materialize_ns_start node04_materialize_ns_end node05_materialize_ns_start node05_materialize_ns_end
  local node04_materialize_sec node05_materialize_sec

  # Local forward ports: 15088 -> node01 (controller), 25088 -> node04,
  # 26088 -> node05, all targeting the CoronaFS API port on each node.
  coronafs_controller_tunnel="$(start_ssh_tunnel node01 15088 "${CORONAFS_API_PORT}")"
  coronafs_node04_tunnel="$(start_ssh_tunnel node04 25088 "${CORONAFS_API_PORT}")"
  coronafs_node05_tunnel="$(start_ssh_tunnel node05 26088 "${CORONAFS_API_PORT}")"
  # Best-effort teardown; each step is allowed to fail independently so a
  # partially-created benchmark still gets everything it did create removed.
  cleanup_coronafs_bench() {
    coronafs_delete_volume 25088 "${node04_local_volume}" >/dev/null 2>&1 || true
    coronafs_delete_volume 26088 "${node05_local_volume}" >/dev/null 2>&1 || true
    coronafs_delete_volume 15088 "${bench_volume}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node05 "${coronafs_node05_tunnel}"
    stop_ssh_tunnel node04 "${coronafs_node04_tunnel}"
    stop_ssh_tunnel node01 "${coronafs_controller_tunnel}"
  }
  trap cleanup_coronafs_bench RETURN

  # 512 MiB controller-hosted benchmark volume, exported as an NBD-style URI.
  coronafs_create_volume 15088 "${bench_volume}" $((512 * 1024 * 1024)) >/dev/null
  coronafs_export_json="$(coronafs_export_volume_json 15088 "${bench_volume}")"
  coronafs_uri="$(printf '%s' "${coronafs_export_json}" | jq -r '.export.uri')"
  [[ -n "${coronafs_uri}" && "${coronafs_uri}" != "null" ]] || die "CoronaFS benchmark volume did not return an export URI"

  # Baseline: plain-file fio on node04's local disk (seq write/read 1M x256,
  # 4k random read, and a queued randread with iodepth 32 / libaio).
  local_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqwrite.dat write 1M 256)"
  local_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-seqread.dat read 1M 256)"
  local_rand_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread.dat randread 4k 128 10)"
  local_rand_depth_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-randread-depth.dat randread 4k 512 15 32 libaio)"

  # Controller-export write warms the volume; then time materializing it onto
  # node04 and node05 as node-local copies (nanosecond stamps via date +%s%N).
  coronafs_controller_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 256)"
  node04_materialize_ns_start="$(date +%s%N)"
  coronafs_materialize_volume 25088 "${node04_local_volume}" "${coronafs_uri}" $((512 * 1024 * 1024)) >/dev/null
  node04_materialize_ns_end="$(date +%s%N)"
  node05_materialize_ns_start="$(date +%s%N)"
  coronafs_materialize_volume 26088 "${node05_local_volume}" "${coronafs_uri}" $((512 * 1024 * 1024)) >/dev/null
  node05_materialize_ns_end="$(date +%s%N)"

  node04_export_json="$(coronafs_export_volume_json 25088 "${node04_local_volume}")"
  node04_local_uri="$(printf '%s' "${node04_export_json}" | jq -r '.export.uri')"
  [[ -n "${node04_local_uri}" && "${node04_local_uri}" != "null" ]] || die "node04 local CoronaFS benchmark volume did not return an export URI"
  node05_export_json="$(coronafs_export_volume_json 26088 "${node05_local_volume}")"
  node05_local_uri="$(printf '%s' "${node05_export_json}" | jq -r '.export.uri')"
  [[ -n "${node05_local_uri}" && "${node05_local_uri}" != "null" ]] || die "node05 local CoronaFS benchmark volume did not return an export URI"

  node04_materialize_sec="$(calc_seconds_from_ns "$((node04_materialize_ns_end - node04_materialize_ns_start))")"
  node05_materialize_sec="$(calc_seconds_from_ns "$((node05_materialize_ns_end - node05_materialize_ns_start))")"
  # Controller-export reads (depth runs pin /dev/nbd0 with iodepth 32),
  # then the same profile against node04's node-local copy, and finally one
  # cross-check read of node05's copy from node05 itself on /dev/nbd1.
  coronafs_controller_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" read 1M 256)"
  coronafs_controller_rand_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 128 10)"
  coronafs_controller_depth_read_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" randread 4k 512 15 /dev/nbd0 32)"
  local_depth_write_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthwrite.dat write 1M 1024 15 32 libaio)"
  local_depth_read_json="$(run_remote_fio_json node04 /var/tmp/photon-bench/local-depthread.dat read 1M 1024 15 32 libaio)"
  coronafs_controller_depth_write_json="$(run_remote_nbd_fio_json node04 "${coronafs_uri}" write 1M 1024 15 /dev/nbd0 32)"
  coronafs_local_write_json="$(run_remote_nbd_fio_json node04 "${node04_local_uri}" write 1M 256)"
  coronafs_local_read_json="$(run_remote_nbd_fio_json node04 "${node04_local_uri}" read 1M 256)"
  coronafs_local_rand_json="$(run_remote_nbd_fio_json node04 "${node04_local_uri}" randread 4k 128 10)"
  coronafs_local_depth_read_json="$(run_remote_nbd_fio_json node04 "${node04_local_uri}" randread 4k 512 15 /dev/nbd0 32)"
  coronafs_local_depth_write_json="$(run_remote_nbd_fio_json node04 "${node04_local_uri}" write 1M 1024 15 /dev/nbd0 32)"
  coronafs_target_local_read_json="$(run_remote_nbd_fio_json node05 "${node05_local_uri}" read 1M 256 0 /dev/nbd1 1)"

  local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
  local coronafs_controller_write_mibps coronafs_controller_read_mibps coronafs_controller_rand_iops coronafs_controller_rand_depth_iops
  local local_depth_write_mibps local_depth_read_mibps coronafs_controller_depth_write_mibps coronafs_controller_depth_read_mibps
  local coronafs_local_write_mibps coronafs_local_read_mibps coronafs_local_rand_iops coronafs_local_depth_read_iops
  local coronafs_local_depth_write_mibps coronafs_local_depth_read_mibps coronafs_target_local_read_mibps

  # Reduce each fio JSON result to MiB/s (from .bw_bytes) or whole IOPS
  # (from .iops, floored).
  local_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_write_json}" | jq -r '.bw_bytes')")"
  local_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_read_json}" | jq -r '.bw_bytes')")"
  local_rand_iops="$(printf '%s' "${local_rand_json}" | jq -r '.iops | floor')"
  local_rand_depth_iops="$(printf '%s' "${local_rand_depth_json}" | jq -r '.iops | floor')"

  coronafs_controller_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_controller_write_json}" | jq -r '.bw_bytes')")"
  coronafs_controller_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_controller_read_json}" | jq -r '.bw_bytes')")"
  coronafs_controller_rand_iops="$(printf '%s' "${coronafs_controller_rand_json}" | jq -r '.iops | floor')"
  coronafs_controller_rand_depth_iops="$(printf '%s' "${coronafs_controller_depth_read_json}" | jq -r '.iops | floor')"
  local_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_write_json}" | jq -r '.bw_bytes')")"
  local_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${local_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_controller_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_controller_depth_write_json}" | jq -r '.bw_bytes')")"
  coronafs_controller_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_controller_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_local_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_local_write_json}" | jq -r '.bw_bytes')")"
  coronafs_local_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_local_read_json}" | jq -r '.bw_bytes')")"
  coronafs_local_rand_iops="$(printf '%s' "${coronafs_local_rand_json}" | jq -r '.iops | floor')"
  coronafs_local_depth_read_iops="$(printf '%s' "${coronafs_local_depth_read_json}" | jq -r '.iops | floor')"
  coronafs_local_depth_write_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_local_depth_write_json}" | jq -r '.bw_bytes')")"
  coronafs_local_depth_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_local_depth_read_json}" | jq -r '.bw_bytes')")"
  coronafs_target_local_read_mibps="$(bw_bytes_to_mibps "$(printf '%s' "${coronafs_target_local_read_json}" | jq -r '.bw_bytes')")"

  log "CoronaFS local baseline: write=${local_write_mibps} MiB/s read=${local_read_mibps} MiB/s randread=${local_rand_iops} IOPS queued_randread=${local_rand_depth_iops} IOPS"
  log "CoronaFS controller export path: write=${coronafs_controller_write_mibps} MiB/s read=${coronafs_controller_read_mibps} MiB/s randread=${coronafs_controller_rand_iops} IOPS queued_randread=${coronafs_controller_rand_depth_iops} IOPS"
  log "CoronaFS node-local export path: write=${coronafs_local_write_mibps} MiB/s read=${coronafs_local_read_mibps} MiB/s randread=${coronafs_local_rand_iops} IOPS queued_randread=${coronafs_local_depth_read_iops} IOPS"
  log "CoronaFS depth-32 profile: local_write=${local_depth_write_mibps} MiB/s local_read=${local_depth_read_mibps} MiB/s controller_write=${coronafs_controller_depth_write_mibps} MiB/s controller_read=${coronafs_controller_depth_read_mibps} MiB/s node_local_write=${coronafs_local_depth_write_mibps} MiB/s node_local_read=${coronafs_local_depth_read_mibps} MiB/s"
  log "CoronaFS materialize latency: node04=${node04_materialize_sec}s node05=${node05_materialize_sec}s target_local_read=${coronafs_target_local_read_mibps} MiB/s"

  # Machine-readable result row; keep field order in sync with consumers.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
    "${coronafs_controller_write_mibps}" "${coronafs_controller_read_mibps}" "${coronafs_controller_rand_iops}" "${coronafs_controller_rand_depth_iops}" \
    "${local_depth_write_mibps}" "${local_depth_read_mibps}" \
    "${coronafs_controller_depth_write_mibps}" "${coronafs_controller_depth_read_mibps}" \
    "${coronafs_local_write_mibps}" "${coronafs_local_read_mibps}" "${coronafs_local_rand_iops}" "${coronafs_local_depth_read_iops}" \
    "${coronafs_local_depth_write_mibps}" "${coronafs_local_depth_read_mibps}" \
    "${node04_materialize_sec}" "${node05_materialize_sec}" "${coronafs_target_local_read_mibps}"

  # Success path: disarm the trap and run cleanup exactly once.
  trap - RETURN
  cleanup_coronafs_bench
}
|
|
|
|
# Benchmark LightningStor S3 throughput from a worker node.
#
# Issues an IAM project-admin token plus S3 credentials through a tunnel to
# node01, then ships a bash+Python (boto3) script to the client node that
# measures: one large-object upload/download, a sequential small-object
# batch, and a parallel small-object batch. The remote script prints a JSON
# result blob which is reduced here to MiB/s and obj/s figures.
#
# Tunables (env): LIGHTNINGSTOR_BENCH_CLIENT_NODE, _SIZE_MB, _SMALL_COUNT,
#   _SMALL_SIZE_MB, _PARALLELISM.
# Outputs: `log` lines plus one tab-separated metrics row on stdout.
benchmark_lightningstor_performance() {
  local client_node="${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}"
  local large_object_size_mb="${LIGHTNINGSTOR_BENCH_SIZE_MB:-256}"
  local small_object_count="${LIGHTNINGSTOR_BENCH_SMALL_COUNT:-32}"
  local small_object_size_mb="${LIGHTNINGSTOR_BENCH_SMALL_SIZE_MB:-4}"
  local parallelism="${LIGHTNINGSTOR_BENCH_PARALLELISM:-8}"
  log "Benchmarking LightningStor S3 throughput from ${client_node}"

  local org_id="bench-org"
  local project_id="bench-project"
  # Timestamp suffixes keep principal and bucket names unique across runs.
  local principal_id="lightningstor-s3-bench-$(date +%s)"
  local iam_tunnel=""
  local s3_access_key="" s3_secret_key=""
  local bucket="ls-bench-$(date +%s)"
  local object_key="bench-object.bin"
  local result_json
  # Teardown only owns the IAM tunnel; remote-side objects are cleaned up by
  # the benchmark script itself.
  cleanup_lightningstor_bench_auth() {
    stop_ssh_tunnel node01 "${iam_tunnel}"
  }
  trap cleanup_lightningstor_bench_auth RETURN

  # 15080 (local) -> 50080 (IAM on node01); token goes to /dev/null because
  # only the S3 credential pair is needed here.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}" >/dev/null
  # Helper emits "access_key<TAB>secret_key"; split on tab.
  IFS=$'\t' read -r s3_access_key s3_secret_key < <(issue_s3_credential 15080 "${principal_id}" "${org_id}" "${project_id}" "lightningstor-storage-benchmark")

  # Run the benchmark on the client node. The quoted EOS heredoc is passed
  # verbatim; positional args carry all parameters (incl. the credentials).
  if ! result_json="$(ssh_node_script "${client_node}" "${bucket}" "${object_key}" "${large_object_size_mb}" "${small_object_count}" "${small_object_size_mb}" "${parallelism}" "${s3_access_key}" "${s3_secret_key}" <<'EOS'
set -euo pipefail

bucket="$1"
object_key="$2"
size_mb="$3"
small_count="$4"
small_size_mb="$5"
parallelism="$6"
access_key_id="$7"
secret_key="$8"
# S3 endpoint is the cluster-internal LightningStor address on node01.
endpoint="http://10.100.0.11:9000"
workdir="/var/tmp/photon-bench-s3"
src="${workdir}/upload.bin"
dst="${workdir}/download.bin"
mkdir -p "${workdir}"
# Hand everything to Python/boto3; args via argv, script via quoted heredoc.
python3 - "${bucket}" "${object_key}" "${size_mb}" "${small_count}" "${small_size_mb}" "${parallelism}" "${endpoint}" "${workdir}" "${src}" "${dst}" "${access_key_id}" "${secret_key}" <<'PY'
import concurrent.futures
import hashlib
import json
import os
import pathlib
import threading
import time

import boto3
from botocore.config import Config


bucket, object_key, size_mb, small_count, small_size_mb, parallelism, endpoint, workdir, src, dst, access_key_id, secret_key = os.sys.argv[1:13]
size_mb = int(size_mb)
small_count = int(small_count)
small_size_mb = int(small_size_mb)
parallelism = int(parallelism)
workdir_path = pathlib.Path(workdir)
src_path = pathlib.Path(src)
dst_path = pathlib.Path(dst)
small_size_bytes = small_size_mb * 1024 * 1024
large_size_bytes = size_mb * 1024 * 1024
# One boto3 client per thread (clients are cached in thread-local storage).
thread_local = threading.local()


def ensure_sparse_file(path: pathlib.Path, size_bytes: int) -> None:
    # Create (or reuse) a sparse file of exactly size_bytes.
    if path.exists() and path.stat().st_size == size_bytes:
        return
    with path.open("wb") as handle:
        handle.truncate(size_bytes)


def sha256_file(path: pathlib.Path) -> str:
    # Stream the file in 8 MiB chunks to bound memory use.
    digest = hashlib.sha256()
    with path.open("rb") as handle:
        while True:
            chunk = handle.read(8 * 1024 * 1024)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()


def new_client():
    # Path-style addressing + sigv4 against the plain-HTTP endpoint; pool is
    # sized for the parallel phase.
    return boto3.session.Session().client(
        "s3",
        endpoint_url=endpoint,
        region_name="us-east-1",
        aws_access_key_id=access_key_id,
        aws_secret_access_key=secret_key,
        use_ssl=False,
        verify=False,
        config=Config(
            retries={"max_attempts": 8, "mode": "standard"},
            s3={"addressing_style": "path"},
            max_pool_connections=max(32, parallelism * 4),
            signature_version="s3v4",
        ),
    )


def client():
    # Lazily create and cache a per-thread client.
    existing = getattr(thread_local, "client", None)
    if existing is None:
        existing = new_client()
        thread_local.client = existing
    return existing


def put_file(key: str, path: pathlib.Path) -> None:
    with path.open("rb") as handle:
        client().put_object(Bucket=bucket, Key=key, Body=handle)


def get_file(key: str, path: pathlib.Path) -> None:
    response = client().get_object(Bucket=bucket, Key=key)
    with path.open("wb") as handle:
        body = response["Body"]
        for chunk in body.iter_chunks(chunk_size=8 * 1024 * 1024):
            if chunk:
                handle.write(chunk)


def delete_key(key: str) -> None:
    client().delete_object(Bucket=bucket, Key=key)


workdir_path.mkdir(parents=True, exist_ok=True)
ensure_sparse_file(src_path, large_size_bytes)
src_sha = sha256_file(src_path)
small_paths = []
for index in range(1, small_count + 1):
    path = workdir_path / f"payload-{index}.bin"
    ensure_sparse_file(path, small_size_bytes)
    small_paths.append(path)

control_client = new_client()
control_client.create_bucket(Bucket=bucket)

# Phase 1: single large object, timed with monotonic ns and verified by
# sha256 + ContentLength.
upload_start = time.monotonic_ns()
put_file(object_key, src_path)
upload_end = time.monotonic_ns()

if dst_path.exists():
    dst_path.unlink()
download_start = time.monotonic_ns()
get_file(object_key, dst_path)
download_end = time.monotonic_ns()

if sha256_file(dst_path) != src_sha:
    raise SystemExit("large-object checksum mismatch")

head = control_client.head_object(Bucket=bucket, Key=object_key)
if int(head["ContentLength"]) != large_size_bytes:
    raise SystemExit("large-object size mismatch")

delete_key(object_key)

small_total_bytes = small_count * small_size_bytes

# Phase 2: sequential small-object batch (size-checked on download).
small_upload_start = time.monotonic_ns()
for index, path in enumerate(small_paths, start=1):
    put_file(f"small-{index}.bin", path)
small_upload_end = time.monotonic_ns()

small_download_start = time.monotonic_ns()
for index in range(1, small_count + 1):
    small_dst = workdir_path / f"small-download-{index}.bin"
    get_file(f"small-{index}.bin", small_dst)
    if small_dst.stat().st_size != small_size_bytes:
        raise SystemExit(f"small-object size mismatch for {small_dst}")
small_download_end = time.monotonic_ns()

with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"small-{index}.bin" for index in range(1, small_count + 1)]))

# Phase 3: same batch driven through a thread pool of `parallelism` workers.
parallel_upload_start = time.monotonic_ns()
with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(
        executor.map(
            lambda item: put_file(f"parallel-small-{item[0]}.bin", item[1]),
            list(enumerate(small_paths, start=1)),
        )
    )
parallel_upload_end = time.monotonic_ns()

parallel_download_start = time.monotonic_ns()


def download_parallel(index: int) -> None:
    path = workdir_path / f"parallel-download-{index}.bin"
    get_file(f"parallel-small-{index}.bin", path)
    if path.stat().st_size != small_size_bytes:
        raise SystemExit(f"parallel small-object size mismatch for {path}")


with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(download_parallel, range(1, small_count + 1)))
parallel_download_end = time.monotonic_ns()

with concurrent.futures.ThreadPoolExecutor(max_workers=parallelism) as executor:
    list(executor.map(delete_key, [f"parallel-small-{index}.bin" for index in range(1, small_count + 1)]))

control_client.delete_bucket(Bucket=bucket)

# Remove local scratch files so repeated runs start clean.
for pattern in ("payload-*.bin", "small-download-*.bin", "parallel-download-*.bin"):
    for path in workdir_path.glob(pattern):
        path.unlink(missing_ok=True)
src_path.unlink(missing_ok=True)
dst_path.unlink(missing_ok=True)

# Single JSON result blob on stdout; captured into result_json by the harness.
print(
    json.dumps(
        {
            "size_bytes": large_size_bytes,
            "upload_ns": upload_end - upload_start,
            "download_ns": download_end - download_start,
            "small_object_count": small_count,
            "small_total_bytes": small_total_bytes,
            "small_upload_ns": small_upload_end - small_upload_start,
            "small_download_ns": small_download_end - small_download_start,
            "parallel_small_upload_ns": parallel_upload_end - parallel_upload_start,
            "parallel_small_download_ns": parallel_download_end - parallel_download_start,
            "parallelism": parallelism,
        }
    )
)
PY
EOS
)"; then
    die "LightningStor S3 benchmark failed"
  fi

  # Reduce the JSON result to throughput (MiB/s) and object-rate (obj/s)
  # figures via the calc_* helpers.
  local size_bytes upload_mibps download_mibps
  local small_total_bytes small_object_count small_object_mib
  local small_upload_mibps small_download_mibps small_put_ops small_get_ops
  local parallel_small_upload_mibps parallel_small_download_mibps parallel_small_put_ops parallel_small_get_ops parallelism
  size_bytes="$(printf '%s' "${result_json}" | jq -r '.size_bytes')"
  [[ -n "${size_bytes}" && "${size_bytes}" != "null" && "${size_bytes}" != "0" ]] || die "LightningStor S3 benchmark returned no object size"
  upload_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.upload_ns')")"
  download_mibps="$(calc_mib_per_s "${size_bytes}" "$(printf '%s' "${result_json}" | jq -r '.download_ns')")"
  small_total_bytes="$(printf '%s' "${result_json}" | jq -r '.small_total_bytes')"
  small_object_count="$(printf '%s' "${result_json}" | jq -r '.small_object_count')"
  small_object_mib="$(awk "BEGIN { printf \"%.0f\", ${small_total_bytes} / 1048576 }")"
  small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_upload_ns')")"
  small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.small_download_ns')")"
  parallel_small_upload_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_download_mibps="$(calc_mib_per_s "${small_total_bytes}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallel_small_put_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_upload_ns')")"
  parallel_small_get_ops="$(calc_ops_per_s "${small_object_count}" "$(printf '%s' "${result_json}" | jq -r '.parallel_small_download_ns')")"
  parallelism="$(printf '%s' "${result_json}" | jq -r '.parallelism')"

  log "LightningStor S3 benchmark: upload=${upload_mibps} MiB/s download=${download_mibps} MiB/s object_size=$((size_bytes / 1048576)) MiB"
  # NOTE(review): these two log lines hard-code "size_per_object=4 MiB" while
  # the size is configurable via LIGHTNINGSTOR_BENCH_SMALL_SIZE_MB (default 4)
  # -- the text is misleading for non-default sizes; consider interpolating
  # ${small_object_size_mb}.
  log "LightningStor small-object batch: objects=${small_object_count} size_per_object=4 MiB upload=${small_upload_mibps} MiB/s download=${small_download_mibps} MiB/s put_rate=${small_put_ops} obj/s get_rate=${small_get_ops} obj/s"
  log "LightningStor parallel small-object batch: objects=${small_object_count} size_per_object=4 MiB parallelism=${parallelism} upload=${parallel_small_upload_mibps} MiB/s download=${parallel_small_download_mibps} MiB/s put_rate=${parallel_small_put_ops} obj/s get_rate=${parallel_small_get_ops} obj/s"

  # Machine-readable result row; keep field order in sync with consumers.
  printf '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n' \
    "${upload_mibps}" "${download_mibps}" "$((size_bytes / 1048576))" \
    "${small_object_count}" "${small_object_mib}" "${small_upload_mibps}" "${small_download_mibps}" \
    "${small_put_ops}/${small_get_ops}" \
    "${parallel_small_upload_mibps}" "${parallel_small_download_mibps}" \
    "${parallel_small_put_ops}/${parallel_small_get_ops}"

  # Success path: disarm the trap and run cleanup exactly once.
  trap - RETURN
  cleanup_lightningstor_bench_auth
}
|
|
|
|
# Benchmark PlasmaVMC image import plus image-backed volume clone latency.
#
# Through SSH tunnels to node01 (IAM 50080, LightningStor 50086, PlasmaVMC
# 50082) this: imports the bootable guest image via ImageService/CreateImage,
# then creates two image-backed CoronaFS-managed volumes -- a "cold" clone
# (first clone after import) and a "warm" clone (second clone, presumably
# benefiting from cached image data -- TODO confirm against PlasmaVMC) --
# timing each step with nanosecond date stamps.
#
# Outputs: a `log` summary line plus one tab-separated metrics row on stdout
#   (artifact MiB, virtual MiB, import s, cold clone s, warm clone s).
# Cleanup: a RETURN trap deletes any created volumes/image, removes the
#   staged image source, and closes all tunnels; unlike sibling benchmarks
#   the trap is not cleared at the end, so cleanup runs when the function
#   returns (success or failure).
benchmark_plasmavmc_image_path() {
  log "Benchmarking PlasmaVMC image import plus CoronaFS-backed volume clone latency"

  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  local iam_port=15180
  local ls_port=15186
  local vm_port=15182
  local image_id="" cold_volume_id="" warm_volume_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 "${iam_port}" 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 "${ls_port}" 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 "${vm_port}" 50082)"

  # Best-effort teardown: every delete/stop tolerates failure so a partial
  # run still releases whatever it managed to create. Uses ${var:-} because
  # token/org/project may not be set yet when the trap fires early.
  cleanup_plasmavmc_image_bench() {
    if [[ -n "${cold_volume_id:-}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg volume "${cold_volume_id:-}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:${vm_port} plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${warm_volume_id:-}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg volume "${warm_volume_id:-}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
        127.0.0.1:${vm_port} plasmavmc.v1.VolumeService/DeleteVolume >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id:-}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg image "${image_id:-}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:${vm_port} plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    # Never rm anything under /nix/store (the image may come straight from
    # the store rather than a staged copy).
    if [[ -n "${image_source_path:-}" && "${image_source_path}" != /nix/store/* ]]; then
      ssh_node node01 "rm -f ${image_source_path:-}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node01 "${vm_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel:-}" >/dev/null 2>&1 || true
  }
  trap cleanup_plasmavmc_image_bench RETURN

  # Timestamp suffixes keep org/principal names unique across runs.
  local org_id="plasmavmc-bench-org-$(date +%s)"
  local project_id="plasmavmc-bench-project"
  local principal_id="plasmavmc-bench-$(date +%s)"
  local token
  token="$(issue_project_admin_token "${iam_port}" "${org_id}" "${project_id}" "${principal_id}")"

  # Image bytes land in a LightningStor bucket; wait for write quorum first
  # so the import measurement is not skewed by replica catch-up.
  ensure_lightningstor_bucket "${ls_port}" "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum "${ls_port}" "${token}" "plasmavmc-images" "PlasmaVMC benchmark image import"

  # Collect artifact facts: checksum for end-to-end integrity, on-disk size,
  # and the qcow2 virtual size (qemu-img info JSON).
  local guest_image_local_path guest_image_sha artifact_size_bytes artifact_mib virtual_size_bytes virtual_mib
  guest_image_local_path="$(guest_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate bootable VM guest image for PlasmaVMC benchmark"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  artifact_size_bytes="$(stat -c %s "${guest_image_local_path}")"
  virtual_size_bytes="$(qemu-img info --output json "${guest_image_local_path}" | jq -r '."virtual-size"')"
  artifact_mib="$(awk "BEGIN { printf \"%.0f\", ${artifact_size_bytes} / 1048576 }")"
  virtual_mib="$(awk "BEGIN { printf \"%.0f\", ${virtual_size_bytes} / 1048576 }")"

  # Stage the image onto node01 and verify the copy arrived bit-for-bit.
  local image_name="bench-image-$(date +%s)"
  image_source_path="$(prepare_node01_image_source "${guest_image_local_path}" "${image_name}")"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC benchmark image checksum mismatch after distribution"

  # Timed step 1: ImageService/CreateImage from the staged file:// source.
  local create_image_json create_image_response create_image_start_ns create_image_end_ns
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_start_ns="$(date +%s%N)"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:${vm_port} plasmavmc.v1.ImageService/CreateImage)"
  create_image_end_ns="$(date +%s%N)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC benchmark image import did not return an image ID"
  # jq -e exits non-zero (failing the run under set -e) unless AVAILABLE.
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null

  # Timed step 2: "cold" clone -- first image-backed volume after import.
  local cold_request warm_request cold_response warm_response cold_start_ns cold_end_ns warm_start_ns warm_end_ns
  cold_request="$(jq -cn --arg name "bench-cold-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-cold"},
    labels:{}
  }')"
  cold_start_ns="$(date +%s%N)"
  cold_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${cold_request}" \
    127.0.0.1:${vm_port} plasmavmc.v1.VolumeService/CreateVolume)"
  cold_end_ns="$(date +%s%N)"
  cold_volume_id="$(printf '%s' "${cold_response}" | jq -r '.id')"
  [[ -n "${cold_volume_id}" && "${cold_volume_id}" != "null" ]] || die "PlasmaVMC cold image-backed volume create did not return a volume ID"
  # Status enum name must end in AVAILABLE (tolerates prefixed enum forms).
  printf '%s' "${cold_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null
  # Delete the cold volume now and blank its id so the RETURN-trap cleanup
  # does not try to delete it again.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg volume "${cold_volume_id}" '{orgId:$org, projectId:$project, volumeId:$volume}')" \
    127.0.0.1:${vm_port} plasmavmc.v1.VolumeService/DeleteVolume >/dev/null
  cold_volume_id=""

  # Timed step 3: "warm" clone -- identical request, second clone of the
  # same image (cleanup via the RETURN trap).
  warm_request="$(jq -cn --arg name "bench-warm-$(date +%s)" --arg org "${org_id}" --arg project "${project_id}" --arg image "${image_id}" '{
    name:$name,
    orgId:$org,
    projectId:$project,
    sizeGib:4,
    driver:"VOLUME_DRIVER_KIND_MANAGED",
    storageClass:"coronafs-managed",
    imageId:$image,
    metadata:{purpose:"bench-warm"},
    labels:{}
  }')"
  warm_start_ns="$(date +%s%N)"
  warm_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${warm_request}" \
    127.0.0.1:${vm_port} plasmavmc.v1.VolumeService/CreateVolume)"
  warm_end_ns="$(date +%s%N)"
  warm_volume_id="$(printf '%s' "${warm_response}" | jq -r '.id')"
  [[ -n "${warm_volume_id}" && "${warm_volume_id}" != "null" ]] || die "PlasmaVMC warm image-backed volume create did not return a volume ID"
  printf '%s' "${warm_response}" | jq -e '.status | tostring | test("AVAILABLE$")' >/dev/null

  local image_import_sec cold_clone_sec warm_clone_sec
  image_import_sec="$(calc_seconds_from_ns "$((create_image_end_ns - create_image_start_ns))")"
  cold_clone_sec="$(calc_seconds_from_ns "$((cold_end_ns - cold_start_ns))")"
  warm_clone_sec="$(calc_seconds_from_ns "$((warm_end_ns - warm_start_ns))")"

  log "PlasmaVMC image artifact benchmark: artifact=${artifact_mib} MiB virtual_size=${virtual_mib} MiB import=${image_import_sec}s cold_clone=${cold_clone_sec}s warm_clone=${warm_clone_sec}s"

  # Machine-readable result row; keep field order in sync with consumers.
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${artifact_mib}" "${virtual_mib}" "${image_import_sec}" "${cold_clone_sec}" "${warm_clone_sec}"
}
|
|
|
|
# Measure the end-to-end PlasmaVMC guest runtime path: import a benchmark
# guest image, create a two-disk KVM VM that the scheduler places on a worker
# (node04 or node05), start it, and harvest the fio results the guest prints
# on its serial console ("PHOTON_VM_BENCH_RESULT ..." line).
#
# Outputs one tab-separated row on stdout:
#   attach_sec  ready_sec  seq_write_mibps  seq_read_mibps  randread_iops
# Globals read: CORONAFS_API_PORT, CORONAFS_VOLUME_ROOT, PLASMAVMC_PROTO_DIR,
#   PLASMAVMC_PROTO, HTTP_WAIT_TIMEOUT.
# Exits via die on unrecoverable failure; cleanup runs via a RETURN trap.
benchmark_plasmavmc_guest_runtime() {
  log "Benchmarking PlasmaVMC guest-side CoronaFS runtime throughput"

  # Tunnel handles and fixed local forward ports: IAM/LightningStor/PlasmaVMC
  # and CoronaFS on node01, plus the per-worker CoronaFS APIs on node04/node05.
  local iam_tunnel="" ls_tunnel="" vm_tunnel="" coronafs_tunnel=""
  local node04_coronafs_tunnel="" node05_coronafs_tunnel="" current_worker_coronafs_port=""
  local iam_port=15280
  local ls_port=15286
  local vm_port=15282
  local coronafs_port=15288
  local image_id="" vm_id="" image_source_path=""
  iam_tunnel="$(start_ssh_tunnel node01 "${iam_port}" 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 "${ls_port}" 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 "${vm_port}" 50082)"
  coronafs_tunnel="$(start_ssh_tunnel node01 "${coronafs_port}" "${CORONAFS_API_PORT}")"
  node04_coronafs_tunnel="$(start_ssh_tunnel node04 25288 "${CORONAFS_API_PORT}")"
  node05_coronafs_tunnel="$(start_ssh_tunnel node05 35288 "${CORONAFS_API_PORT}")"

  # Best-effort teardown: stop and delete the VM (force-stop first), delete
  # the imported image, remove the staged image file on node01 (but never a
  # /nix/store path), then close every tunnel in reverse open order. Each
  # step is individually suppressed so cleanup always runs to completion.
  cleanup_plasmavmc_guest_runtime() {
    if [[ -n "${vm_id:-}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg vm "${vm_id:-}" '{orgId:$org, projectId:$project, vmId:$vm, force:true, timeoutSeconds:30}')" \
        127.0.0.1:${vm_port} plasmavmc.v1.VmService/StopVm >/dev/null 2>&1 || true
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg project "${project_id:-}" --arg vm "${vm_id:-}" '{orgId:$org, projectId:$project, vmId:$vm}' )" \
        127.0.0.1:${vm_port} plasmavmc.v1.VmService/DeleteVm >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_id:-}" ]]; then
      grpcurl -plaintext \
        -H "authorization: Bearer ${token:-}" \
        -import-path "${PLASMAVMC_PROTO_DIR}" \
        -proto "${PLASMAVMC_PROTO}" \
        -d "$(jq -cn --arg org "${org_id:-}" --arg image "${image_id:-}" '{orgId:$org, imageId:$image}')" \
        127.0.0.1:${vm_port} plasmavmc.v1.ImageService/DeleteImage >/dev/null 2>&1 || true
    fi
    if [[ -n "${image_source_path:-}" && "${image_source_path}" != /nix/store/* ]]; then
      ssh_node node01 "rm -f ${image_source_path:-}" >/dev/null 2>&1 || true
    fi
    stop_ssh_tunnel node05 "${node05_coronafs_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node04 "${node04_coronafs_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${coronafs_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${vm_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${ls_tunnel:-}" >/dev/null 2>&1 || true
    stop_ssh_tunnel node01 "${iam_tunnel:-}" >/dev/null 2>&1 || true
  }
  # RETURN trap: fires when this function returns on any path, so the
  # resources above are released even when die aborts mid-benchmark.
  trap cleanup_plasmavmc_guest_runtime RETURN

  # Don't create anything until the workers have registered with PlasmaVMC.
  wait_for_plasmavmc_workers_registered "${vm_port}"

  # Fresh tenant identity per run; timestamps keep IDs unique across runs.
  local org_id="plasmavmc-runtime-org-$(date +%s)"
  local project_id="plasmavmc-runtime-project"
  local principal_id="plasmavmc-runtime-$(date +%s)"
  local token
  token="$(issue_project_admin_token "${iam_port}" "${org_id}" "${project_id}" "${principal_id}")"

  # Image artifacts land in LightningStor; require the bucket and its write
  # quorum before importing.
  ensure_lightningstor_bucket "${ls_port}" "${token}" "plasmavmc-images" "${org_id}" "${project_id}"
  wait_for_lightningstor_write_quorum "${ls_port}" "${token}" "plasmavmc-images" "PlasmaVMC runtime benchmark image import"

  # Stage the benchmark guest image onto node01 and verify it arrived intact
  # by comparing sha256 on both ends.
  local guest_image_local_path guest_image_sha image_name create_image_json create_image_response
  guest_image_local_path="$(guest_bench_image_path)"
  [[ -n "${guest_image_local_path}" ]] || die "failed to locate VM benchmark guest image"
  guest_image_sha="$(sha256sum "${guest_image_local_path}" | awk '{print $1}')"
  image_name="bench-runtime-image-$(date +%s)"
  image_source_path="$(prepare_node01_image_source "${guest_image_local_path}" "${image_name}")"
  [[ "$(ssh_node node01 "sha256sum ${image_source_path} | awk '{print \$1}'")" == "${guest_image_sha}" ]] || die "PlasmaVMC runtime benchmark image checksum mismatch after distribution"

  # Import the staged file (file:// URL resolved on node01) as a private
  # qcow2 image; the import must come back AVAILABLE synchronously.
  create_image_json="$(
    jq -cn \
      --arg name "${image_name}" \
      --arg org "${org_id}" \
      --arg sha "${guest_image_sha}" \
      --arg source_url "file://${image_source_path}" \
      '{
        name:$name,
        orgId:$org,
        visibility:"VISIBILITY_PRIVATE",
        format:"IMAGE_FORMAT_QCOW2",
        osType:"OS_TYPE_LINUX",
        osVersion:"bench-runtime",
        architecture:"ARCHITECTURE_X86_64",
        minDiskGib:1,
        minMemoryMib:512,
        metadata:{purpose:"bench-runtime", sourceSha256:$sha},
        sourceUrl:$source_url
      }'
  )"
  create_image_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_image_json}" \
    127.0.0.1:${vm_port} plasmavmc.v1.ImageService/CreateImage)"
  image_id="$(printf '%s' "${create_image_response}" | jq -r '.id')"
  [[ -n "${image_id}" && "${image_id}" != "null" ]] || die "PlasmaVMC runtime benchmark image import did not return an image ID"
  printf '%s' "${create_image_response}" | jq -e '.status == "IMAGE_STATUS_AVAILABLE"' >/dev/null

  # Create the VM: image-backed root disk plus a blank data disk, both
  # virtio with cache=none so guest I/O is not masked by host page cache.
  local create_vm_json get_vm_json create_response node_id peer_node
  create_vm_json="$(
    jq -cn \
      --arg name "bench-runtime-vm-$(date +%s)" \
      --arg org "${org_id}" \
      --arg project "${project_id}" \
      --arg image_id "${image_id}" \
      '{
        name:$name,
        orgId:$org,
        projectId:$project,
        hypervisor:"HYPERVISOR_TYPE_KVM",
        spec:{
          cpu:{vcpus:4, coresPerSocket:1, sockets:1},
          memory:{sizeMib:1536},
          disks:[
            {
              id:"root",
              source:{imageId:$image_id},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE",
              bootIndex:1
            },
            {
              id:"data",
              source:{blank:true},
              sizeGib:4,
              bus:"DISK_BUS_VIRTIO",
              cache:"DISK_CACHE_NONE"
            }
          ]
        }
      }'
  )"
  create_response="$(grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "${create_vm_json}" \
    127.0.0.1:${vm_port} plasmavmc.v1.VmService/CreateVm)"
  vm_id="$(printf '%s' "${create_response}" | jq -r '.id')"
  [[ -n "${vm_id}" && "${vm_id}" != "null" ]] || die "PlasmaVMC runtime benchmark VM create did not return a VM ID"

  # Poll until the scheduler assigns the VM to one of the worker nodes,
  # bounded by HTTP_WAIT_TIMEOUT. try_get_vm_json may fail transiently
  # while the VM record is still settling, so failures also just retry.
  get_vm_json="$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')"
  local deadline=$((SECONDS + HTTP_WAIT_TIMEOUT))
  while true; do
    local vm_json
    if ! vm_json="$(try_get_vm_json "${token}" "${get_vm_json}" "${vm_port}" 2>/dev/null)"; then
      if (( SECONDS >= deadline )); then
        die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
      fi
      sleep 2
      continue
    fi
    node_id="$(printf '%s' "${vm_json}" | jq -r '.nodeId // empty')"
    if [[ "${node_id}" == "node04" || "${node_id}" == "node05" ]]; then
      break
    fi
    if (( SECONDS >= deadline )); then
      die "timed out waiting for runtime benchmark VM ${vm_id} scheduling"
    fi
    sleep 2
  done
  # Select the scheduled worker's local CoronaFS forward port (25288 ->
  # node04, 35288 -> node05). NOTE(review): peer_node is assigned here but
  # never referenced later in this function — confirm whether it can go.
  if [[ "${node_id}" == "node04" ]]; then
    peer_node="node05"
    current_worker_coronafs_port=25288
  else
    peer_node="node04"
    current_worker_coronafs_port=35288
  fi

  local start_ns attach_ns ready_ns attach_sec ready_sec
  # Volume IDs follow PlasmaVMC's "<vm-id>-<disk-id>" convention (see the
  # "root"/"data" disk IDs in the create request above).
  local root_volume_id="${vm_id}-root"
  local data_volume_id="${vm_id}-data"
  local root_uri data_uri

  # Timing: start_ns before StartVm; attach_ns once both raw volume files
  # are visible to qemu on the worker; ready_ns when the guest prints its
  # benchmark result marker on the console.
  start_ns="$(date +%s%N)"
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d "$(jq -cn --arg org "${org_id}" --arg project "${project_id}" --arg vm "${vm_id}" '{orgId:$org, projectId:$project, vmId:$vm}')" \
    127.0.0.1:${vm_port} plasmavmc.v1.VmService/StartVm >/dev/null

  # Verify both volumes are exported from the controller, then confirm the
  # scheduled worker's local CoronaFS can see them too.
  root_uri="$(coronafs_export_volume_json "${coronafs_port}" "${root_volume_id}" | jq -r '.export.uri')"
  data_uri="$(coronafs_export_volume_json "${coronafs_port}" "${data_volume_id}" | jq -r '.export.uri')"
  [[ -n "${root_uri}" && "${root_uri}" != "null" ]] || die "runtime benchmark root volume export URI missing"
  [[ -n "${data_uri}" && "${data_uri}" != "null" ]] || die "runtime benchmark data volume export URI missing"
  coronafs_get_volume_json "${current_worker_coronafs_port}" "${root_volume_id}" >/dev/null
  coronafs_get_volume_json "${current_worker_coronafs_port}" "${data_volume_id}" >/dev/null

  # Wait for the worker-local materialized volume files to be attached to
  # qemu; this bounds the "attach" half of the measurement.
  local root_local_export_uri data_local_export_uri
  root_local_export_uri="$(coronafs_volume_export_uri "${current_worker_coronafs_port}" "${root_volume_id}")"
  data_local_export_uri="$(coronafs_volume_export_uri "${current_worker_coronafs_port}" "${data_volume_id}")"
  wait_for_qemu_volume_present "${node_id}" "${CORONAFS_VOLUME_ROOT}/${root_volume_id}.raw" "${root_local_export_uri}"
  wait_for_qemu_volume_present "${node_id}" "${CORONAFS_VOLUME_ROOT}/${data_volume_id}.raw" "${data_local_export_uri}"
  attach_ns="$(date +%s%N)"

  # The guest image runs fio at boot and prints a single result line.
  wait_for_vm_console_pattern "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT"
  ready_ns="$(date +%s%N)"

  # Parse "key=value" fields from the console result line; strip CRs first
  # since serial console output is CRLF-terminated.
  local result_line seq_write_mibps seq_read_mibps randread_iops
  result_line="$(read_vm_console_line_matching "${node_id}" "${vm_id}" "PHOTON_VM_BENCH_RESULT")"
  seq_write_mibps="$(printf '%s\n' "${result_line}" | tr -d '\r' | sed -n 's/.*seq_write_mibps=\([^ ]*\).*/\1/p')"
  seq_read_mibps="$(printf '%s\n' "${result_line}" | tr -d '\r' | sed -n 's/.*seq_read_mibps=\([^ ]*\).*/\1/p')"
  randread_iops="$(printf '%s\n' "${result_line}" | tr -d '\r' | sed -n 's/.*randread_iops=\([^ ]*\).*/\1/p')"
  [[ -n "${seq_write_mibps}" && -n "${seq_read_mibps}" && -n "${randread_iops}" ]] || die "failed to parse runtime benchmark result line: ${result_line}"

  attach_sec="$(calc_seconds_from_ns "$((attach_ns - start_ns))")"
  ready_sec="$(calc_seconds_from_ns "$((ready_ns - start_ns))")"

  log "PlasmaVMC guest runtime benchmark: attach=${attach_sec}s guest_ready=${ready_sec}s seq_write=${seq_write_mibps} MiB/s seq_read=${seq_read_mibps} MiB/s randread=${randread_iops} IOPS"
  # Machine-readable result row consumed by benchmark_storage.
  printf '%s\t%s\t%s\t%s\t%s\n' \
    "${attach_sec}" "${ready_sec}" "${seq_write_mibps}" "${seq_read_mibps}" "${randread_iops}"
}
|
|
|
|
# Render docs/storage-benchmarks.md under ${REPO_ROOT} from the raw benchmark
# measurements. Takes 46 positional arguments in this order:
#   $1-$4   network baselines (CoronaFS MiB/s + retransmits, LightningStor
#           MiB/s + retransmits)
#   $5-$25  CoronaFS/local-disk figures (sequential, random, queued-depth,
#           node-local export, materialization latencies)
#   $26-$36 LightningStor large-object, small-object, and parallel
#           small-object figures ($33/$36 are "PUT/GET" rate pairs)
#   $37-$41 PlasmaVMC image-path figures
#   $42-$46 PlasmaVMC guest-runtime figures
# Globals read: REPO_ROOT, STORAGE_BENCHMARK_COMMAND,
#   LIGHTNINGSTOR_BENCH_CLIENT_NODE (expanded into the report text).
write_storage_benchmark_report() {
  local coronafs_network_mibps="$1"
  local coronafs_network_retransmits="$2"
  local lightningstor_network_mibps="$3"
  local lightningstor_network_retransmits="$4"
  local local_write_mibps="$5"
  local local_read_mibps="$6"
  local local_rand_iops="$7"
  local local_rand_depth_iops="$8"
  local coronafs_controller_write_mibps="$9"
  local coronafs_controller_read_mibps="${10}"
  local coronafs_controller_rand_iops="${11}"
  local coronafs_controller_rand_depth_iops="${12}"
  local local_depth_write_mibps="${13}"
  local local_depth_read_mibps="${14}"
  local coronafs_controller_depth_write_mibps="${15}"
  local coronafs_controller_depth_read_mibps="${16}"
  local coronafs_local_write_mibps="${17}"
  local coronafs_local_read_mibps="${18}"
  local coronafs_local_rand_iops="${19}"
  local coronafs_local_rand_depth_iops="${20}"
  local coronafs_local_depth_write_mibps="${21}"
  local coronafs_local_depth_read_mibps="${22}"
  local coronafs_node04_materialize_sec="${23}"
  local coronafs_node05_materialize_sec="${24}"
  local coronafs_target_local_read_mibps="${25}"
  local lightningstor_upload_mibps="${26}"
  local lightningstor_download_mibps="${27}"
  local lightningstor_object_mib="${28}"
  local lightningstor_small_object_count="${29}"
  local lightningstor_small_object_mib="${30}"
  local lightningstor_small_upload_mibps="${31}"
  local lightningstor_small_download_mibps="${32}"
  local lightningstor_small_ops="${33}"
  local lightningstor_parallel_small_upload_mibps="${34}"
  local lightningstor_parallel_small_download_mibps="${35}"
  local lightningstor_parallel_small_ops="${36}"
  local plasmavmc_image_artifact_mib="${37}"
  local plasmavmc_image_virtual_mib="${38}"
  local plasmavmc_image_import_sec="${39}"
  local plasmavmc_cold_clone_sec="${40}"
  local plasmavmc_warm_clone_sec="${41}"
  local plasmavmc_runtime_attach_sec="${42}"
  local plasmavmc_runtime_ready_sec="${43}"
  local plasmavmc_runtime_seq_write_mibps="${44}"
  local plasmavmc_runtime_seq_read_mibps="${45}"
  local plasmavmc_runtime_randread_iops="${46}"
  local coronafs_controller_read_ratio coronafs_controller_rand_ratio coronafs_controller_rand_depth_ratio coronafs_controller_vs_network_ratio coronafs_controller_depth_read_ratio
  local coronafs_local_read_ratio coronafs_local_rand_ratio coronafs_local_rand_depth_ratio coronafs_local_depth_read_ratio coronafs_target_local_read_ratio
  local lightningstor_vs_network_ratio
  local lightningstor_small_put_ops lightningstor_small_get_ops
  local lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops

  # Print (numerator / denominator) * 100 with one decimal place, or a bare
  # "0" when the denominator is zero (same output as the previous inline
  # awk). Values are passed via `awk -v` instead of being interpolated into
  # the awk program text, so empty or malformed inputs can no longer become
  # awk syntax errors.
  _report_pct_ratio() {
    awk -v num="${1}" -v den="${2}" 'BEGIN { if (den == 0) print 0; else printf "%.1f", (num / den) * 100 }'
  }

  # The small-object rate fields arrive as "PUT/GET" pairs.
  IFS=/ read -r lightningstor_small_put_ops lightningstor_small_get_ops <<<"${lightningstor_small_ops}"
  IFS=/ read -r lightningstor_parallel_small_put_ops lightningstor_parallel_small_get_ops <<<"${lightningstor_parallel_small_ops}"

  coronafs_controller_read_ratio="$(_report_pct_ratio "${coronafs_controller_read_mibps}" "${local_read_mibps}")"
  coronafs_controller_rand_ratio="$(_report_pct_ratio "${coronafs_controller_rand_iops}" "${local_rand_iops}")"
  coronafs_controller_rand_depth_ratio="$(_report_pct_ratio "${coronafs_controller_rand_depth_iops}" "${local_rand_depth_iops}")"
  coronafs_controller_vs_network_ratio="$(_report_pct_ratio "${coronafs_controller_read_mibps}" "${coronafs_network_mibps}")"
  coronafs_controller_depth_read_ratio="$(_report_pct_ratio "${coronafs_controller_depth_read_mibps}" "${local_depth_read_mibps}")"
  coronafs_local_read_ratio="$(_report_pct_ratio "${coronafs_local_read_mibps}" "${local_read_mibps}")"
  coronafs_local_rand_ratio="$(_report_pct_ratio "${coronafs_local_rand_iops}" "${local_rand_iops}")"
  coronafs_local_rand_depth_ratio="$(_report_pct_ratio "${coronafs_local_rand_depth_iops}" "${local_rand_depth_iops}")"
  coronafs_local_depth_read_ratio="$(_report_pct_ratio "${coronafs_local_depth_read_mibps}" "${local_depth_read_mibps}")"
  coronafs_target_local_read_ratio="$(_report_pct_ratio "${coronafs_target_local_read_mibps}" "${local_read_mibps}")"
  lightningstor_vs_network_ratio="$(_report_pct_ratio "${lightningstor_download_mibps}" "${lightningstor_network_mibps}")"

  # Unquoted delimiter: the heredoc expands $(...) and ${...} at call time.
  cat > "${REPO_ROOT}/docs/storage-benchmarks.md" <<EOF
# Storage Benchmarks

Generated on $(date -Iseconds) with:

\`\`\`bash
nix run ./nix/test-cluster#cluster -- ${STORAGE_BENCHMARK_COMMAND}
\`\`\`

## CoronaFS

Cluster network baseline, measured with \`iperf3\` from \`node04\` to \`node01\` before the storage tests:

| Metric | Result |
|---|---:|
| TCP throughput | ${coronafs_network_mibps} MiB/s |
| TCP retransmits | ${coronafs_network_retransmits} |

Measured from \`node04\`.
Local worker disk is the baseline. CoronaFS now has two relevant data paths in the lab: the controller export sourced from \`node01\`, and the node-local export materialized onto the worker that actually attaches the mutable VM disk.

| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Sequential write | ${local_write_mibps} MiB/s | ${coronafs_controller_write_mibps} MiB/s | ${coronafs_local_write_mibps} MiB/s |
| Sequential read | ${local_read_mibps} MiB/s | ${coronafs_controller_read_mibps} MiB/s | ${coronafs_local_read_mibps} MiB/s |
| 4k random read | ${local_rand_iops} IOPS | ${coronafs_controller_rand_iops} IOPS | ${coronafs_local_rand_iops} IOPS |
| 4k queued random read (\`iodepth=32\`) | ${local_rand_depth_iops} IOPS | ${coronafs_controller_rand_depth_iops} IOPS | ${coronafs_local_rand_depth_iops} IOPS |

Queue-depth profile (\`libaio\`, \`iodepth=32\`) from the same worker:

| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Depth-32 write | ${local_depth_write_mibps} MiB/s | ${coronafs_controller_depth_write_mibps} MiB/s | ${coronafs_local_depth_write_mibps} MiB/s |
| Depth-32 read | ${local_depth_read_mibps} MiB/s | ${coronafs_controller_depth_read_mibps} MiB/s | ${coronafs_local_depth_read_mibps} MiB/s |

Node-local materialization timing and target-node steady-state read path:

| Metric | Result |
|---|---:|
| Node04 materialize latency | ${coronafs_node04_materialize_sec} s |
| Node05 materialize latency | ${coronafs_node05_materialize_sec} s |
| Node05 node-local sequential read | ${coronafs_target_local_read_mibps} MiB/s |

PlasmaVMC now prefers the worker-local CoronaFS export for mutable node-local volumes, even when the underlying materialization is a qcow2 overlay. The VM runtime section below is therefore the closest end-to-end proxy for real local-attach VM I/O, while the node-local export numbers remain useful for CoronaFS service consumers and for diagnosing exporter overhead.

## LightningStor

Measured from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` against the S3-compatible endpoint on \`node01\`.
The object path exercised the distributed backend with replication across the worker storage nodes.

Cluster network baseline for this client, measured with \`iperf3\` from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\` to \`node01\` before the storage tests:

| Metric | Result |
|---|---:|
| TCP throughput | ${lightningstor_network_mibps} MiB/s |
| TCP retransmits | ${lightningstor_network_retransmits} |

### Large-object path

| Metric | Result |
|---|---:|
| Object size | ${lightningstor_object_mib} MiB |
| Upload throughput | ${lightningstor_upload_mibps} MiB/s |
| Download throughput | ${lightningstor_download_mibps} MiB/s |

### Small-object batch

Measured as ${lightningstor_small_object_count} objects of 4 MiB each (${lightningstor_small_object_mib} MiB total).

| Metric | Result |
|---|---:|
| Batch upload throughput | ${lightningstor_small_upload_mibps} MiB/s |
| Batch download throughput | ${lightningstor_small_download_mibps} MiB/s |
| PUT rate | ${lightningstor_small_put_ops} objects/s |
| GET rate | ${lightningstor_small_get_ops} objects/s |

### Parallel small-object batch

Measured as the same ${lightningstor_small_object_count} objects of 4 MiB each, but with 8 concurrent client jobs from \`${LIGHTNINGSTOR_BENCH_CLIENT_NODE}\`.

| Metric | Result |
|---|---:|
| Parallel batch upload throughput | ${lightningstor_parallel_small_upload_mibps} MiB/s |
| Parallel batch download throughput | ${lightningstor_parallel_small_download_mibps} MiB/s |
| Parallel PUT rate | ${lightningstor_parallel_small_put_ops} objects/s |
| Parallel GET rate | ${lightningstor_parallel_small_get_ops} objects/s |

## VM Image Path

Measured against the \`PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume\` clone path on \`node01\`.

| Metric | Result |
|---|---:|
| Guest image artifact size | ${plasmavmc_image_artifact_mib} MiB |
| Guest image virtual size | ${plasmavmc_image_virtual_mib} MiB |
| \`CreateImage\` latency | ${plasmavmc_image_import_sec} s |
| First image-backed \`CreateVolume\` latency | ${plasmavmc_cold_clone_sec} s |
| Second image-backed \`CreateVolume\` latency | ${plasmavmc_warm_clone_sec} s |

## VM Runtime Path

Measured against the real \`StartVm -> qemu attach -> guest boot -> guest fio\` path on a worker node, using a CoronaFS-backed root disk and data disk.

| Metric | Result |
|---|---:|
| \`StartVm\` to qemu attach | ${plasmavmc_runtime_attach_sec} s |
| \`StartVm\` to guest benchmark result | ${plasmavmc_runtime_ready_sec} s |
| Guest sequential write | ${plasmavmc_runtime_seq_write_mibps} MiB/s |
| Guest sequential read | ${plasmavmc_runtime_seq_read_mibps} MiB/s |
| Guest 4k random read | ${plasmavmc_runtime_randread_iops} IOPS |

## Assessment

- CoronaFS controller-export reads are currently ${coronafs_controller_read_ratio}% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS controller-export 4k random reads are currently ${coronafs_controller_rand_ratio}% of the measured local-disk baseline.
- CoronaFS controller-export queued 4k random reads are currently ${coronafs_controller_rand_depth_ratio}% of the measured local queued-random-read baseline.
- CoronaFS controller-export sequential reads are currently ${coronafs_controller_vs_network_ratio}% of the measured node04->node01 TCP baseline, which isolates the centralized source path from raw cluster-network limits.
- CoronaFS controller-export depth-32 reads are currently ${coronafs_controller_depth_read_ratio}% of the local depth-32 baseline.
- CoronaFS node-local reads are currently ${coronafs_local_read_ratio}% of the measured local-disk baseline, which is the more relevant steady-state signal for mutable VM disks after attachment.
- CoronaFS node-local 4k random reads are currently ${coronafs_local_rand_ratio}% of the measured local-disk baseline.
- CoronaFS node-local queued 4k random reads are currently ${coronafs_local_rand_depth_ratio}% of the measured local queued-random-read baseline.
- CoronaFS node-local depth-32 reads are currently ${coronafs_local_depth_read_ratio}% of the local depth-32 baseline.
- The target worker's node-local read path is ${coronafs_target_local_read_ratio}% of the measured local sequential-read baseline after materialization, which is the better proxy for restart and migration steady state than the old shared-export read.
- PlasmaVMC now attaches writable node-local volumes through the worker-local CoronaFS export, so the guest-runtime section should be treated as the real local VM steady-state path rather than the node-local export numbers alone.
- CoronaFS single-depth writes remain sensitive to the nested-QEMU/VDE lab transport, so the queued-depth and guest-runtime numbers are still the more reliable proxy for real VM workload behavior than the single-stream write figure alone.
- The central export path is now best understood as a source/materialization path; the worker-local export is the path that should determine VM-disk readiness going forward.
- LightningStor's replicated S3 path is working correctly, but ${lightningstor_upload_mibps} MiB/s upload and ${lightningstor_download_mibps} MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently ${lightningstor_vs_network_ratio}% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- The current S3 frontend tuning baseline is the built-in 16 MiB streaming threshold with multipart PUT/FETCH concurrency of 8; that combination is the best default observed on this lab cluster so far.
- LightningStor uploads should be read against the replication write quorum and the same ~${lightningstor_network_mibps} MiB/s lab network ceiling; this environment still limits end-to-end throughput well before modern bare-metal NICs would.
- LightningStor's small-object batch path is also functional, but ${lightningstor_small_put_ops} PUT/s and ${lightningstor_small_get_ops} GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches ${lightningstor_parallel_small_put_ops} PUT/s and ${lightningstor_parallel_small_get_ops} GET/s.
- The VM image section measures clone/materialization cost, not guest runtime I/O.
- The PlasmaVMC local image-backed clone fast path is now active again; a ${plasmavmc_warm_clone_sec} s second clone indicates the CoronaFS qcow2 backing-file path is being hit on node01 rather than falling back to eager raw materialization.
- The VM runtime section is the real \`PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel\` path; use it to judge whether QEMU/NBD tuning is helping.
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.
EOF
}
|
|
|
|
# Run the full storage benchmark suite and write the consolidated report.
#
# Pipeline:
#   1. iperf3 TCP baselines toward node01 from the CoronaFS worker (node04)
#      and the LightningStor benchmark client.
#   2. CoronaFS and LightningStor benchmark suites (each prints one
#      tab-separated result row on stdout).
#   3. Optional PlasmaVMC image-path and guest-runtime benchmarks, skippable
#      via STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH=1 and
#      STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH=1; when skipped, an
#      all-zero row is substituted so the report still renders.
#   4. Unpack the rows and pass all 46 fields to
#      write_storage_benchmark_report, which writes docs/storage-benchmarks.md.
benchmark_storage() {
  local coronafs_network_results lightningstor_network_results coronafs_results lightningstor_results plasmavmc_results plasmavmc_runtime_results
  local coronafs_network_mibps coronafs_network_retransmits
  local lightningstor_network_mibps lightningstor_network_retransmits
  local local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops
  local coronafs_controller_write_mibps coronafs_controller_read_mibps coronafs_controller_rand_iops coronafs_controller_rand_depth_iops
  local local_depth_write_mibps local_depth_read_mibps coronafs_controller_depth_write_mibps coronafs_controller_depth_read_mibps
  local coronafs_local_write_mibps coronafs_local_read_mibps coronafs_local_rand_iops coronafs_local_rand_depth_iops
  local coronafs_local_depth_write_mibps coronafs_local_depth_read_mibps coronafs_node04_materialize_sec coronafs_node05_materialize_sec coronafs_target_local_read_mibps
  local lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib
  local lightningstor_small_object_count lightningstor_small_object_mib
  local lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops
  local lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops
  local plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib
  local plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec
  local plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec
  local plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops

  # Capture the network baselines first, before the storage tests perturb
  # the cluster.
  coronafs_network_results="$(run_remote_iperf_json node04 node01 10.100.0.11)"
  lightningstor_network_results="$(run_remote_iperf_json "${LIGHTNINGSTOR_BENCH_CLIENT_NODE:-node03}" node01 10.100.0.11)"
  coronafs_results="$(benchmark_coronafs_performance)"
  lightningstor_results="$(benchmark_lightningstor_performance)"
  # The skip flags default to 0 so an unset/unexported flag cannot abort the
  # run under the script's `set -u` (these expansions were previously
  # unguarded).
  if [[ "${STORAGE_SKIP_PLASMAVMC_IMAGE_BENCH:-0}" == "1" ]]; then
    plasmavmc_results=$'0\t0\t0\t0\t0'  # placeholder row for the skipped benchmark
  else
    plasmavmc_results="$(benchmark_plasmavmc_image_path)"
  fi
  if [[ "${STORAGE_SKIP_PLASMAVMC_GUEST_RUNTIME_BENCH:-0}" == "1" ]]; then
    plasmavmc_runtime_results=$'0\t0\t0\t0\t0'  # placeholder row for the skipped benchmark
  else
    plasmavmc_runtime_results="$(benchmark_plasmavmc_guest_runtime)"
  fi

  # Convert the iperf3 JSON into MiB/s plus the retransmit counters.
  coronafs_network_mibps="$(bps_to_mibps "$(printf '%s' "${coronafs_network_results}" | jq -r '.bits_per_second')")"
  coronafs_network_retransmits="$(printf '%s' "${coronafs_network_results}" | jq -r '.retransmits')"
  lightningstor_network_mibps="$(bps_to_mibps "$(printf '%s' "${lightningstor_network_results}" | jq -r '.bits_per_second')")"
  lightningstor_network_retransmits="$(printf '%s' "${lightningstor_network_results}" | jq -r '.retransmits')"
  # Unpack the tab-separated result rows; the field order here must match
  # exactly what each benchmark_* function prints.
  IFS=$'\t' read -r \
    local_write_mibps local_read_mibps local_rand_iops local_rand_depth_iops \
    coronafs_controller_write_mibps coronafs_controller_read_mibps coronafs_controller_rand_iops coronafs_controller_rand_depth_iops \
    local_depth_write_mibps local_depth_read_mibps coronafs_controller_depth_write_mibps coronafs_controller_depth_read_mibps \
    coronafs_local_write_mibps coronafs_local_read_mibps coronafs_local_rand_iops coronafs_local_rand_depth_iops \
    coronafs_local_depth_write_mibps coronafs_local_depth_read_mibps coronafs_node04_materialize_sec coronafs_node05_materialize_sec coronafs_target_local_read_mibps <<<"${coronafs_results}"
  IFS=$'\t' read -r \
    lightningstor_upload_mibps lightningstor_download_mibps lightningstor_object_mib \
    lightningstor_small_object_count lightningstor_small_object_mib lightningstor_small_upload_mibps lightningstor_small_download_mibps lightningstor_small_ops \
    lightningstor_parallel_small_upload_mibps lightningstor_parallel_small_download_mibps lightningstor_parallel_small_ops <<<"${lightningstor_results}"
  IFS=$'\t' read -r \
    plasmavmc_image_artifact_mib plasmavmc_image_virtual_mib plasmavmc_image_import_sec plasmavmc_cold_clone_sec plasmavmc_warm_clone_sec <<<"${plasmavmc_results}"
  IFS=$'\t' read -r \
    plasmavmc_runtime_attach_sec plasmavmc_runtime_ready_sec plasmavmc_runtime_seq_write_mibps plasmavmc_runtime_seq_read_mibps plasmavmc_runtime_randread_iops <<<"${plasmavmc_runtime_results}"

  write_storage_benchmark_report \
    "${coronafs_network_mibps}" "${coronafs_network_retransmits}" \
    "${lightningstor_network_mibps}" "${lightningstor_network_retransmits}" \
    "${local_write_mibps}" "${local_read_mibps}" "${local_rand_iops}" "${local_rand_depth_iops}" \
    "${coronafs_controller_write_mibps}" "${coronafs_controller_read_mibps}" "${coronafs_controller_rand_iops}" "${coronafs_controller_rand_depth_iops}" \
    "${local_depth_write_mibps}" "${local_depth_read_mibps}" "${coronafs_controller_depth_write_mibps}" "${coronafs_controller_depth_read_mibps}" \
    "${coronafs_local_write_mibps}" "${coronafs_local_read_mibps}" "${coronafs_local_rand_iops}" "${coronafs_local_rand_depth_iops}" \
    "${coronafs_local_depth_write_mibps}" "${coronafs_local_depth_read_mibps}" "${coronafs_node04_materialize_sec}" "${coronafs_node05_materialize_sec}" "${coronafs_target_local_read_mibps}" \
    "${lightningstor_upload_mibps}" "${lightningstor_download_mibps}" "${lightningstor_object_mib}" \
    "${lightningstor_small_object_count}" "${lightningstor_small_object_mib}" "${lightningstor_small_upload_mibps}" "${lightningstor_small_download_mibps}" "${lightningstor_small_ops}" \
    "${lightningstor_parallel_small_upload_mibps}" "${lightningstor_parallel_small_download_mibps}" "${lightningstor_parallel_small_ops}" \
    "${plasmavmc_image_artifact_mib}" "${plasmavmc_image_virtual_mib}" "${plasmavmc_image_import_sec}" "${plasmavmc_cold_clone_sec}" "${plasmavmc_warm_clone_sec}" \
    "${plasmavmc_runtime_attach_sec}" "${plasmavmc_runtime_ready_sec}" "${plasmavmc_runtime_seq_write_mibps}" "${plasmavmc_runtime_seq_read_mibps}" "${plasmavmc_runtime_randread_iops}"

  log "Storage benchmark report written to ${REPO_ROOT}/docs/storage-benchmarks.md"
}
|
|
|
|
# Fault-injection drill for the control plane: stop node02 (one of the three
# control nodes), prove that chainfire, flaredb, and IAM keep serving from the
# surviving quorum (node01/node03), then restart node02 and wait for it to
# rejoin.  Uses a RETURN trap so node02 is always brought back and the SSH
# tunnels are torn down even if a check fails mid-way.
validate_control_plane_fault_injection() {
  log "Injecting control-plane failure: stopping node02 and validating quorum behavior"

  local iam_tunnel="" iam_tunnel_alt=""
  # Tunnels to the IAM gRPC port (50080) on both survivors, so a token can be
  # issued from whichever one is currently reachable.
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  iam_tunnel_alt="$(start_ssh_tunnel node03 15083 50080)"
  local flaredb_proto_root="/var/lib/photon-test-protos/flaredb"
  # Cleanup on function return: restart node02 best-effort and close tunnels.
  trap 'start_vm node02 >/dev/null 2>&1 || true; wait_for_ssh node02 || true; stop_ssh_tunnel node03 "${iam_tunnel_alt}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  stop_vm node02
  wait_for_ssh_down node02 90

  # chainfire check (run on node01): find a surviving node that accepts a KV
  # write within 60s, then require the written value to converge on both
  # survivors (10.100.0.11 / 10.100.0.13) within 60s each.
  ssh_node_script node01 <<'EOS'
set -euo pipefail
key="fault-chainfire-$(date +%s)"
value="ok-$RANDOM"
nodes=(10.100.0.11 10.100.0.13)
writer=""
deadline=$((SECONDS + 60))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
code="$(curl -sS -o /tmp/chainfire-fault.out -w '%{http_code}' \
-X PUT "http://${ip}:8081/api/v1/kv/${key}" \
-H 'Content-Type: application/json' \
-d "{\"value\":\"${value}\"}" || true)"
if [[ "${code}" == "200" ]]; then
writer="${ip}"
break
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
for ip in "${nodes[@]}"; do
deadline=$((SECONDS + 60))
while true; do
actual="$(curl -fsS "http://${ip}:8081/api/v1/kv/${key}" 2>/dev/null | jq -r '.data.value' 2>/dev/null || true)"
if [[ "${actual}" == "${value}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "chainfire quorum write did not converge on ${ip}" >&2
exit 1
fi
sleep 1
done
done
EOS

  # flaredb check (run on node01): CAS a fresh key via gRPC against whichever
  # survivor accepts it, then require a strong read of the same key from that
  # leader.  Keys/values are base64-encoded per the kvrpc proto contract.
  ensure_flaredb_proto_on_node node01 "${flaredb_proto_root}"
  ssh_node_script node01 "${flaredb_proto_root}" <<'EOS'
set -euo pipefail
proto_root="$1"
key="fault-flaredb-strong-$(date +%s)"
value="ok-$RANDOM"
key_b64="$(printf '%s' "${key}" | base64 | tr -d '\n')"
value_b64="$(printf '%s' "${value}" | base64 | tr -d '\n')"
nodes=(10.100.0.11 10.100.0.13)
request="$(jq -cn --arg key "${key_b64}" --arg value "${value_b64}" '{key:$key, value:$value, expectedVersion:0, namespace:"fault"}')"
get_request="$(jq -cn --arg key "${key_b64}" '{key:$key, namespace:"fault"}')"
writer=""
deadline=$((SECONDS + 90))
while [[ -z "${writer}" ]]; do
for ip in "${nodes[@]}"; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${request}" \
"${ip}:2479" kvrpc.KvCas/CompareAndSwap >/tmp/flaredb-fault-cas.out 2>/dev/null; then
if jq -e '.success == true and (.newVersion | tonumber) >= 1' /tmp/flaredb-fault-cas.out >/dev/null; then
writer="${ip}"
break
fi
fi
done
if [[ -n "${writer}" ]]; then
break
fi
if (( SECONDS >= deadline )); then
echo "flaredb quorum writer did not become available after node02 stop" >&2
exit 1
fi
sleep 1
done
deadline=$((SECONDS + 90))
while true; do
if timeout 15 grpcurl -plaintext \
-import-path "${proto_root}" \
-proto "${proto_root}/kvrpc.proto" \
-d "${get_request}" \
"${writer}:2479" kvrpc.KvCas/Get >/tmp/flaredb-fault-get.out 2>/dev/null; then
if jq -e --arg value "${value_b64}" '.found == true and .value == $value and (.version | tonumber) >= 1' /tmp/flaredb-fault-get.out >/dev/null; then
break
fi
fi
if (( SECONDS >= deadline )); then
echo "flaredb strong quorum write did not remain readable on leader ${writer}" >&2
exit 1
fi
sleep 1
done
EOS

  # IAM check: issue a project-admin token from whichever survivor responds
  # (issue_project_admin_token_any echoes "<port> <token>"), then validate it
  # against the same endpoint.
  local org_id="fault-iam-org"
  local project_id="fault-iam-project"
  local principal_id="fault-iam-$(date +%s)"
  local token iam_fault_port
  read -r iam_fault_port token < <(issue_project_admin_token_any "${org_id}" "${project_id}" "${principal_id}" 15080 15083)
  grpcurl -plaintext \
    -import-path "${IAM_PROTO_DIR}" \
    -proto "${IAM_PROTO}" \
    -d "$(jq -cn --arg token "${token}" '{token:$token}')" \
    127.0.0.1:"${iam_fault_port}" iam.v1.IamToken/ValidateToken \
    | jq -e '.valid == true' >/dev/null

  # Recovery: bring node02 back and wait for its control-plane units plus
  # flaredb region/route metadata before declaring the drill complete.
  start_vm node02
  wait_for_ssh node02
  wait_for_unit node02 chainfire
  wait_for_unit node02 flaredb
  wait_for_unit node02 iam
  wait_for_flaredb_region node02
  wait_for_flaredb_route_metadata node01

  # Recovery succeeded on the happy path; disarm the trap and close tunnels.
  trap - RETURN
  stop_ssh_tunnel node03 "${iam_tunnel_alt}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
# Fault-injection drill for the worker tier: stop node04 and prove that the
# cluster keeps operating in degraded mode (node05 stays healthy, PlasmaVMC
# still reports a READY worker, and LightningStor object put/head/get/delete
# still works end to end), then restart node04 and wait for it to re-register.
# Two RETURN traps are installed in sequence; the second one replaces the
# first and additionally removes the temp file.
validate_worker_fault_injection() {
  log "Injecting worker failure: stopping node04 and validating degraded worker operation"

  local iam_tunnel="" ls_tunnel="" vm_tunnel=""
  # Tunnels via node01: IAM (50080), LightningStor (50086), PlasmaVMC (50082).
  iam_tunnel="$(start_ssh_tunnel node01 15080 50080)"
  ls_tunnel="$(start_ssh_tunnel node01 15086 50086)"
  vm_tunnel="$(start_ssh_tunnel node01 15082 50082)"
  # Cleanup on return: restart node04 best-effort, close all tunnels.
  trap 'start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN

  stop_vm node04
  wait_for_ssh_down node04 90

  # node05 must remain healthy while node04 is down.
  wait_for_http node05 http://127.0.0.1:8084/health
  wait_for_tcp_port node05 50086

  # PlasmaVMC must still list node05 as a READY worker.
  grpcurl -plaintext \
    -import-path "${PLASMAVMC_PROTO_DIR}" \
    -proto "${PLASMAVMC_PROTO}" \
    -d '{}' \
    127.0.0.1:15082 plasmavmc.v1.NodeService/ListNodes \
    | jq -e '([.nodes[] | select(.state == "NODE_STATE_READY") | .id] | index("node05")) != null' >/dev/null

  # LightningStor smoke under degraded capacity: create a bucket and push a
  # small object through the full put/head/download/delete cycle.
  local org_id="worker-fault-org"
  local project_id="worker-fault-project"
  local principal_id="worker-fault-$(date +%s)"
  local token bucket key tmpfile
  token="$(issue_project_admin_token 15080 "${org_id}" "${project_id}" "${principal_id}")"
  bucket="worker-fault-$(date +%s)"
  key="survive-${RANDOM}.txt"
  ensure_lightningstor_bucket 15086 "${token}" "${bucket}" "${org_id}" "${project_id}"

  tmpfile="$(mktemp)"
  # Re-arm the RETURN trap now that tmpfile exists so it is removed too.
  trap 'rm -f "${tmpfile}"; start_vm node04 >/dev/null 2>&1 || true; wait_for_ssh node04 || true; stop_ssh_tunnel node01 "${vm_tunnel}"; stop_ssh_tunnel node01 "${ls_tunnel}"; stop_ssh_tunnel node01 "${iam_tunnel}"' RETURN
  printf 'worker-fault-check-%s\n' "${RANDOM}" >"${tmpfile}"

  # PutObject with the file's content base64-encoded into the request body.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn \
      --arg bucket "${bucket}" \
      --arg key "${key}" \
      --arg body "$(base64 -w0 "${tmpfile}")" \
      '{bucket:$bucket, key:$key, body:$body, metadata:{contentType:"text/plain"}}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/PutObject >/dev/null

  # HeadObject must succeed for the freshly written key.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/HeadObject >/dev/null

  # Round-trip integrity: downloaded bytes must match what was uploaded.
  download_lightningstor_object_to_file 15086 "${token}" "${bucket}" "${key}" "${tmpfile}.downloaded"
  cmp -s "${tmpfile}" "${tmpfile}.downloaded"

  # DeleteObject cleans up the test key.
  grpcurl -plaintext \
    -H "authorization: Bearer ${token}" \
    -import-path "${LIGHTNINGSTOR_PROTO_DIR}" \
    -proto "${LIGHTNINGSTOR_PROTO}" \
    -d "$(jq -cn --arg bucket "${bucket}" --arg key "${key}" '{bucket:$bucket, key:$key}')" \
    127.0.0.1:15086 lightningstor.v1.ObjectService/DeleteObject >/dev/null

  # Recovery: restart node04 and wait until its services and worker
  # registration are healthy again.
  rm -f "${tmpfile}" "${tmpfile}.downloaded"
  start_vm node04
  wait_for_ssh node04
  wait_for_unit node04 plasmavmc
  wait_for_unit node04 lightningstor
  wait_for_http node04 http://127.0.0.1:8084/health
  wait_for_tcp_port node04 50086
  wait_for_plasmavmc_workers_registered 15082

  # Happy path complete; disarm the trap and close tunnels explicitly.
  trap - RETURN
  stop_ssh_tunnel node01 "${vm_tunnel}"
  stop_ssh_tunnel node01 "${ls_tunnel}"
  stop_ssh_tunnel node01 "${iam_tunnel}"
}
|
|
|
|
# Run both fault drills in order: control-plane quorum loss first, then
# worker loss.  Under `set -e` the first failing drill aborts validation.
validate_fault_injection() {
  local drill
  for drill in validate_control_plane_fault_injection validate_worker_fault_injection; do
    "${drill}"
  done
}
|
|
|
|
# Full cluster smoke validation.  The steps below run in order because later
# checks assume the earlier ones passed; under `set -e` the first failing
# step aborts the run before the success message is logged.
validate_cluster() {
  local step
  for step in \
    preflight \
    wait_requested \
    validate_units \
    validate_control_plane \
    validate_iam_flow \
    validate_prismnet_flow \
    validate_flashdns_flow \
    validate_fiberlb_flow \
    validate_workers \
    validate_lightningstor_distributed_storage \
    validate_vm_storage_flow \
    validate_k8shost_flow \
    validate_gateway \
    validate_nightlight_flow \
    validate_creditservice_flow \
    validate_deployer_flow \
    validate_fault_injection \
    validate_nested_kvm_workers \
    validate_native_runtime_flow; do
    "${step}"
  done
  log "Cluster validation succeeded"
}
|
|
|
|
# Storage-lab validation: waits on the storage node set, then runs the
# storage-focused subset of the smoke checks in order.
validate_storage_cluster() {
  preflight
  wait_requested "${STORAGE_NODES[@]}"
  local check
  for check in \
    validate_storage_units \
    validate_storage_control_plane \
    validate_workers \
    validate_lightningstor_distributed_storage \
    validate_vm_storage_flow \
    validate_nested_kvm_workers; do
    "${check}"
  done
  log "Storage cluster validation succeeded"
}
|
|
|
|
# smoke = start + validate: bring the requested nodes up (building if
# needed), then run the full validation suite.
smoke_requested() {
  start_requested "$@"
  validate_cluster
}
|
|
|
|
# fresh-smoke = clean + smoke: wipe local runtime state for the requested
# nodes first, then run the regular smoke path.
fresh_smoke_requested() {
  clean_requested "$@"
  smoke_requested "$@"
}
|
|
|
|
# Storage-lab smoke: force the storage build profile, boot the storage node
# set, and run the storage-focused validation suite.
storage_smoke_requested() {
  BUILD_PROFILE="storage"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_cluster
}
|
|
|
|
# Clean variant of storage-smoke: wipe the storage nodes' runtime state
# before starting and validating the lab.
fresh_storage_smoke_requested() {
  BUILD_PROFILE="storage"
  clean_requested "${STORAGE_NODES[@]}"
  storage_smoke_requested
}
|
|
|
|
# matrix = start + composed-configuration validation on the running VMs.
matrix_requested() {
  start_requested "$@"
  validate_component_matrix
}
|
|
|
|
# fresh-matrix = clean + matrix for the requested nodes.
fresh_matrix_requested() {
  clean_requested "$@"
  matrix_requested "$@"
}
|
|
|
|
# Combined storage benchmark entry point: tag the report with the invoking
# command (unless the caller already set one), pin the benchmark client to
# node03, then start the storage lab and run the full benchmark.
bench_storage_requested() {
  : "${STORAGE_BENCHMARK_COMMAND:=bench-storage}"
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_storage
}
|
|
|
|
# Clean variant of bench-storage: wipe the storage nodes' runtime state,
# then run the benchmark with the report tagged as "fresh-bench-storage".
fresh_bench_storage_requested() {
  STORAGE_BENCHMARK_COMMAND="fresh-bench-storage"
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  clean_requested "${STORAGE_NODES[@]}"
  bench_storage_requested "$@"
}
|
|
|
|
# Run the lightweight pre-benchmark checks (units + control plane), unless
# the PHOTON_CLUSTER_SKIP_VALIDATE=1 escape hatch is set.
validate_storage_bench_prereqs() {
  if [[ "${PHOTON_CLUSTER_SKIP_VALIDATE:-0}" != "1" ]]; then
    validate_storage_units
    validate_storage_control_plane
  else
    log "Skipping storage validation because PHOTON_CLUSTER_SKIP_VALIDATE=1"
  fi
}
|
|
|
|
# Benchmark the PlasmaVMC image import/clone path against the storage lab.
plasmavmc_image_bench_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_plasmavmc_image_path
}
|
|
|
|
# Benchmark the PlasmaVMC guest runtime path against the storage lab.
plasmavmc_runtime_bench_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_plasmavmc_guest_runtime
}
|
|
|
|
# Benchmark CoronaFS against the running storage lab.
coronafs_bench_requested() {
  BUILD_PROFILE="storage"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_coronafs_performance
}
|
|
|
|
# Run the single-process CoronaFS export benchmark locally; no VM lab is
# started for this path.
coronafs_local_bench_requested() {
  BUILD_PROFILE="storage"
  "${REPO_ROOT}/coronafs/scripts/benchmark-local-export.sh"
}
|
|
|
|
# Sweep the local CoronaFS export benchmark over every cache-mode/aio-mode
# pairing (none/writeback x io_uring/threads), exporting each combination to
# the benchmark script via environment variables.
coronafs_local_matrix_requested() {
  BUILD_PROFILE="storage"
  local bench_script="${REPO_ROOT}/coronafs/scripts/benchmark-local-export.sh"
  local cache_mode aio_mode
  for cache_mode in none writeback; do
    for aio_mode in io_uring threads; do
      log "Running CoronaFS local export matrix case: cache=${cache_mode} aio=${aio_mode}"
      CORONAFS_BENCH_EXPORT_CACHE_MODE="${cache_mode}" \
        CORONAFS_BENCH_EXPORT_AIO_MODE="${aio_mode}" \
        "${bench_script}"
    done
  done
}
|
|
|
|
# Benchmark LightningStor against the running storage lab, driving the
# client workload from node03.
lightningstor_bench_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_lightningstor_performance
}
|
|
|
|
# Apply (or reset) LightningStor S3 tuning knobs on node01 at runtime via the
# systemd manager environment, then restart lightningstor.service and wait up
# to ~30s for it to become active again.
#
# $1 streaming PUT threshold in bytes, or the literal "default" to clear all
#    four overrides (defaults to "default")
# $2 inline PUT max bytes          (default 134217728)
# $3 multipart PUT concurrency     (default 8)
# $4 multipart fetch concurrency   (default 8)
set_lightningstor_runtime_s3_tuning() {
  local streaming_threshold_bytes="${1:-default}"
  local inline_put_max_bytes="${2:-134217728}"
  local multipart_put_concurrency="${3:-8}"
  local multipart_fetch_concurrency="${4:-8}"

  # The remote script always clears any previous overrides first, then
  # re-applies them only when a non-default threshold was requested.
  ssh_node_script node01 "${streaming_threshold_bytes}" "${inline_put_max_bytes}" "${multipart_put_concurrency}" "${multipart_fetch_concurrency}" <<'EOS'
set -euo pipefail
streaming_threshold_bytes="$1"
inline_put_max_bytes="$2"
multipart_put_concurrency="$3"
multipart_fetch_concurrency="$4"
systemctl unset-environment \
LIGHTNINGSTOR_S3_STREAMING_PUT_THRESHOLD_BYTES \
LIGHTNINGSTOR_S3_INLINE_PUT_MAX_BYTES \
LIGHTNINGSTOR_S3_MULTIPART_PUT_CONCURRENCY \
LIGHTNINGSTOR_S3_MULTIPART_FETCH_CONCURRENCY
if [[ "${streaming_threshold_bytes}" != "default" ]]; then
systemctl set-environment \
LIGHTNINGSTOR_S3_STREAMING_PUT_THRESHOLD_BYTES="${streaming_threshold_bytes}" \
LIGHTNINGSTOR_S3_INLINE_PUT_MAX_BYTES="${inline_put_max_bytes}" \
LIGHTNINGSTOR_S3_MULTIPART_PUT_CONCURRENCY="${multipart_put_concurrency}" \
LIGHTNINGSTOR_S3_MULTIPART_FETCH_CONCURRENCY="${multipart_fetch_concurrency}"
fi
systemctl restart lightningstor.service
for _ in $(seq 1 30); do
if systemctl is-active --quiet lightningstor.service; then
exit 0
fi
sleep 1
done
echo "timed out waiting for lightningstor.service to restart" >&2
exit 1
EOS
}
|
|
|
|
# Benchmark LightningStor across a sweep of multipart streaming-PUT
# thresholds (8, 16, default, 32, 64, 128 MiB — in that run order), restore
# the default tuning afterwards, and emit a TSV table with one row per case.
benchmark_lightningstor_threshold_matrix() {
  local -a thresholds=(8388608 16777216 default 33554432 67108864 134217728)
  local -a labels=('8MiB' '16MiB' 'default(16MiB)' '32MiB' '64MiB' '128MiB')
  local -a results=()
  local idx

  for idx in "${!thresholds[@]}"; do
    set_lightningstor_runtime_s3_tuning "${thresholds[idx]}"
    results[idx]="$(benchmark_lightningstor_performance)"
  done
  set_lightningstor_runtime_s3_tuning default

  printf 'threshold\tlarge_up\tlarge_down\tobject_mib\tsmall_count\tsmall_total_mib\tsmall_up\tsmall_down\tsmall_ops\tparallel_up\tparallel_down\tparallel_ops\n'
  for idx in "${!labels[@]}"; do
    printf '%s\t%s\n' "${labels[idx]}" "${results[idx]}"
  done
}
|
|
|
|
# Start the storage lab and run the multipart-threshold benchmark sweep.
lightningstor_threshold_matrix_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_lightningstor_threshold_matrix
}
|
|
|
|
# Benchmark LightningStor at three multipart put/fetch concurrency levels
# (4, 8 = default, 16) with a fixed streaming threshold (overridable via
# LIGHTNINGSTOR_MATRIX_THRESHOLD_BYTES), restore the default tuning, then
# emit one TSV row per level.
benchmark_lightningstor_concurrency_matrix() {
  local threshold_bytes="${LIGHTNINGSTOR_MATRIX_THRESHOLD_BYTES:-16777216}"
  local -a levels=(4 8 16)
  local -a labels=('4' '8(default)' '16')
  local -a results=()
  local idx

  for idx in "${!levels[@]}"; do
    set_lightningstor_runtime_s3_tuning "${threshold_bytes}" 134217728 "${levels[idx]}" "${levels[idx]}"
    results[idx]="$(benchmark_lightningstor_performance)"
  done
  set_lightningstor_runtime_s3_tuning default

  printf 'concurrency\tlarge_up\tlarge_down\tobject_mib\tsmall_count\tsmall_total_mib\tsmall_up\tsmall_down\tsmall_ops\tparallel_up\tparallel_down\tparallel_ops\tthreshold_bytes\n'
  for idx in "${!labels[@]}"; do
    printf '%s\t%s\t%s\n' "${labels[idx]}" "${results[idx]}" "${threshold_bytes}"
  done
}
|
|
|
|
# Start the storage lab and run the multipart-concurrency benchmark sweep.
lightningstor_concurrency_matrix_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_lightningstor_concurrency_matrix
}
|
|
|
|
# Start the storage lab and run the combined CoronaFS + LightningStor
# benchmark (report tag is whatever STORAGE_BENCHMARK_COMMAND already holds).
storage_bench_requested() {
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  start_requested "${STORAGE_NODES[@]}"
  validate_storage_bench_prereqs
  benchmark_storage
}
|
|
|
|
# Clean variant of storage-bench: wipe the storage nodes' runtime state and
# tag the report as "fresh-storage-bench" before benchmarking.
fresh_storage_bench_requested() {
  STORAGE_BENCHMARK_COMMAND="fresh-storage-bench"
  BUILD_PROFILE="storage"
  LIGHTNINGSTOR_BENCH_CLIENT_NODE="node03"
  clean_requested "${STORAGE_NODES[@]}"
  storage_bench_requested
}
|
|
|
|
# Print a one-line RUNNING/STOPPED status for each requested node (or all
# nodes when none are named).
status_requested() {
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"

  printf 'PhotonCloud test cluster status\n'
  printf '===============================\n'

  local name pid_path
  for name in "${targets[@]}"; do
    if ! is_running "${name}"; then
      printf '%s: STOPPED\n' "${name}"
      continue
    fi
    pid_path="$(pid_file "${name}")"
    printf '%s: RUNNING (pid=%s, ssh=%s, runtime=%s)\n' \
      "${name}" "$(<"${pid_path}")" "$(ssh_port_for_node "${name}")" "$(runtime_dir "${name}")"
  done
}
|
|
|
|
# Stop VMs.  With no node arguments, every node across all build profiles is
# stopped; with explicit nodes, only the current profile's instances are.
stop_requested() {
  acquire_cluster_lock
  local -a targets=()
  mapfile -t targets < <(all_or_requested_nodes "$@")
  validate_nodes_exist "${targets[@]}"

  if (( $# == 0 )); then
    stop_nodes_all_profiles "${targets[@]}"
  else
    stop_nodes_current_profile "${targets[@]}"
  fi
}
|
|
|
|
# Stop the requested VMs and delete their local runtime state.  With no
# arguments, all profiles are cleaned wholesale; with explicit node names,
# each node's runtime directory is emptied and removed, and its build link is
# dropped unless build links are being preserved.
clean_requested() {
  acquire_cluster_lock
  stop_requested "$@"
  if (( $# == 0 )); then
    remove_runtime_state_all_profiles
    return
  fi

  local target node_dir
  for target in "$@"; do
    log "Removing runtime state for ${target}"
    node_dir="$(runtime_dir "${target}")"
    find "${node_dir}" -mindepth 1 -delete 2>/dev/null || true
    rmdir "${node_dir}" 2>/dev/null || true
    if ! preserve_build_links_requested; then
      rm -f "$(build_link "${target}")"
    fi
  done
}
|
|
|
|
# Open an interactive root SSH session to a node (default node01), replacing
# this process via exec so the shell owns the terminal directly.
ssh_requested() {
  local target="${1:-node01}"
  validate_nodes_exist "${target}"
  local forwarded_port
  forwarded_port="$(ssh_port_for_node "${target}")"
  exec sshpass -p "${SSH_PASSWORD}" \
    ssh "${SSH_OPTS[@]}" -p "${forwarded_port}" root@127.0.0.1
}
|
|
|
|
# Show the tail of a node's VM log: $1 = node (default node01),
# $2 = line count (default 120).
logs_requested() {
  local target="${1:-node01}"
  local count="${2:-120}"
  validate_nodes_exist "${target}"
  tail -n "${count}" "$(log_file "${target}")"
}
|
|
|
|
# Print CLI help to stdout.  The heredoc delimiter is unquoted on purpose so
# $0 expands to the invoked script path in the usage text.
usage() {
  cat <<USAGE
PhotonCloud VM test cluster

Usage: $0 <command> [nodes...]

Commands:
  build                            Build one or more VM derivations
  start                            Build if needed, start VMs, and wait for SSH
  wait                             Wait for SSH on running VMs
  validate                         Run the cluster smoke validation
  smoke                            start + validate
  fresh-smoke                      clean local runtime state, rebuild on the host, start, and validate
  storage-smoke                    start the storage lab (node01-05) and validate CoronaFS/LightningStor/PlasmaVMC
  fresh-storage-smoke              clean local runtime state, rebuild node01-05 on the host, start, and validate the storage lab
  matrix                           Start the cluster and validate composed service configurations against the current running VMs
  fresh-matrix                     clean local runtime state, rebuild on the host, start, and validate composed service configurations
  bench-storage                    start the cluster and benchmark CoronaFS plus LightningStor against the current running VMs
  fresh-bench-storage              clean local runtime state, rebuild on the host, start, and benchmark CoronaFS plus LightningStor
  bench-coronafs                   start the storage lab and benchmark CoronaFS against the current running VMs
  bench-coronafs-local             run the local single-process CoronaFS export benchmark without starting the VM lab
  bench-coronafs-local-matrix      run the local CoronaFS export benchmark across cache/aio combinations
  bench-lightningstor              start the storage lab and benchmark LightningStor against the current running VMs
  bench-lightningstor-thresholds   start the storage lab and benchmark LightningStor with 8/16/default/64/128 MiB multipart thresholds
  bench-lightningstor-concurrency  start the storage lab and benchmark LightningStor with 4/default/16 multipart fetch+put concurrency
  bench-plasmavmc-image            start the storage lab and benchmark the PlasmaVMC image import and clone path
  bench-plasmavmc-runtime          start the storage lab and benchmark the PlasmaVMC guest runtime path
  storage-bench                    start the storage lab (node01-05) and benchmark CoronaFS plus LightningStor
  fresh-storage-bench              clean local runtime state, rebuild node01-05 on the host, start, and benchmark the storage lab
  stop                             Stop one or more VMs
  status                           Show VM process status
  ssh                              SSH to a node (default: node01)
  logs                             Show VM log for a node (default: node01)
  clean                            Stop VMs and remove local runtime state
  help                             Show this help

Examples:
  $0 smoke
  $0 fresh-smoke
  $0 storage-smoke
  $0 fresh-storage-smoke
  $0 matrix
  $0 fresh-matrix
  $0 bench-storage
  $0 fresh-bench-storage
  $0 bench-coronafs
  $0 bench-coronafs-local
  $0 bench-coronafs-local-matrix
  $0 bench-lightningstor
  $0 bench-lightningstor-thresholds
  $0 bench-lightningstor-concurrency
  $0 bench-plasmavmc-image
  $0 bench-plasmavmc-runtime
  $0 storage-bench
  $0 fresh-storage-bench
  $0 start node01 node02 node03
  $0 validate
  $0 ssh node04
USAGE
}
|
|
|
|
# Command dispatcher.  The first CLI word selects the handler; remaining
# words are forwarded to handlers that accept node names or extra arguments.
main() {
  local command="${1:-help}"
  shift || true

  case "${command}" in
    help | --help | -h)
      usage
      ;;
    build)
      build_requested "$@"
      ;;
    start)
      start_requested "$@"
      ;;
    wait)
      wait_requested "$@"
      ;;
    validate)
      validate_cluster
      ;;
    smoke)
      smoke_requested "$@"
      ;;
    fresh-smoke)
      fresh_smoke_requested "$@"
      ;;
    storage-smoke)
      storage_smoke_requested
      ;;
    fresh-storage-smoke)
      fresh_storage_smoke_requested
      ;;
    matrix)
      matrix_requested "$@"
      ;;
    fresh-matrix)
      fresh_matrix_requested "$@"
      ;;
    bench-storage)
      bench_storage_requested "$@"
      ;;
    fresh-bench-storage)
      fresh_bench_storage_requested "$@"
      ;;
    bench-coronafs)
      coronafs_bench_requested
      ;;
    bench-coronafs-local)
      coronafs_local_bench_requested
      ;;
    bench-coronafs-local-matrix)
      coronafs_local_matrix_requested
      ;;
    bench-lightningstor)
      lightningstor_bench_requested
      ;;
    bench-lightningstor-thresholds)
      lightningstor_threshold_matrix_requested
      ;;
    bench-lightningstor-concurrency)
      lightningstor_concurrency_matrix_requested
      ;;
    bench-plasmavmc-image)
      plasmavmc_image_bench_requested
      ;;
    bench-plasmavmc-runtime)
      plasmavmc_runtime_bench_requested
      ;;
    storage-bench)
      storage_bench_requested
      ;;
    fresh-storage-bench)
      fresh_storage_bench_requested
      ;;
    stop)
      stop_requested "$@"
      ;;
    status)
      status_requested "$@"
      ;;
    ssh)
      ssh_requested "$@"
      ;;
    logs)
      logs_requested "$@"
      ;;
    clean)
      clean_requested "$@"
      ;;
    *)
      die "unknown command: ${command}"
      ;;
  esac
}
|
|
|
|
# Entry point: dispatch all CLI arguments to main.
main "$@"
|