photoncloud-monorepo/deployer/scripts/verify-deployer-bootstrap-e2e.sh

477 lines
14 KiB
Bash
Executable file

#!/usr/bin/env bash
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Repository root: two directory levels above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# When PHOTONCLOUD_E2E_IN_NIX is unset or empty, replace this process with the
# same script running under `nix develop`; the marker variable set on the
# re-executed copy keeps it from re-entering this branch.
[[ -n "${PHOTONCLOUD_E2E_IN_NIX:-}" ]] \
  || exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
# Scratch directory for generated configs, service logs and dump files.
tmp_dir="$(mktemp -d)"
# PIDs of the background services; empty until the service has been started.
cf_pid=""
deployer_pid=""

#######################################
# EXIT handler: stop the background services, surface their logs when the
# script is exiting with a failure, then delete the scratch directory.
# Globals:   deployer_pid, cf_pid, tmp_dir (read)
# Outputs:   on a non-zero exit status, the last lines of chainfire.log and
#            deployer.log on stderr (they would otherwise be deleted unseen)
#######################################
cleanup() {
  # Must be the first statement: $? still holds the status the script is
  # exiting with (or of the command that preceded a direct call).
  local exit_status=$?
  local -r tail_lines=50
  local pid log

  # Best-effort teardown: nothing below may abort the handler.
  set +e

  # Deployer first, then ChainFire, matching reverse start order.
  for pid in "$deployer_pid" "$cf_pid"; do
    [[ -n "$pid" ]] || continue
    kill "$pid" 2>/dev/null
    wait "$pid" 2>/dev/null
  done

  if (( exit_status != 0 )); then
    for log in "$tmp_dir/chainfire.log" "$tmp_dir/deployer.log"; do
      # Skip logs that were never created or are empty.
      [[ -s "$log" ]] || continue
      printf '==> %s (last %d lines) <==\n' "${log##*/}" "$tail_lines" >&2
      tail -n "$tail_lines" "$log" >&2
    done
  fi

  rm -rf "$tmp_dir"
}
trap cleanup EXIT
#######################################
# Print a TCP port number obtained by binding a socket to port 0 on 127.0.0.1.
# The socket is closed before returning, so the port is only known to have
# been free at the moment of the call.
# Outputs: the port number on stdout
#######################################
free_port() {
  python3 -c '
import socket

with socket.socket() as probe:
    probe.bind(("127.0.0.1", 0))
    print(probe.getsockname()[1])
'
}
#######################################
# Poll until a TCP connection to host:port succeeds or the timeout elapses.
# Arguments: $1 - host name or address
#            $2 - TCP port
#            $3 - timeout in seconds (default 60)
# Outputs:   a "timed out" message on stderr when the deadline passes
# Returns:   0 once a connection succeeds, 1 on timeout
#######################################
wait_for_port() {
  local host="$1"
  local port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    # create_connection resolves the host and tries every returned address,
    # so IPv6 literals and host names work as well as IPv4 addresses.
    if python3 - "$host" "$port" <<'PY'
import socket
import sys

host = sys.argv[1]
port = int(sys.argv[2])
try:
    socket.create_connection((host, port), timeout=0.5).close()
except OSError:
    raise SystemExit(1)
raise SystemExit(0)
PY
    then
      return 0
    fi
    sleep 1
  done

  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}
#######################################
# Poll a URL until an HTTP request to it completes without raising, or the
# timeout elapses. Any exception from urlopen counts as "not ready yet".
# Arguments: $1 - URL to request
#            $2 - timeout in seconds (default 60)
# Outputs:   a "timed out" message on stderr when the deadline passes
# Returns:   0 once a request succeeds, 1 on timeout
#######################################
wait_for_http() {
  local url="$1"
  local timeout_secs="${2:-60}"
  local give_up_at=$((SECONDS + timeout_secs))

  until (( SECONDS >= give_up_at )); do
    python3 - "$url" <<'PY' && return 0
import sys
import urllib.request

try:
    with urllib.request.urlopen(sys.argv[1], timeout=2):
        pass
except Exception:
    sys.exit(1)
sys.exit(0)
PY
    sleep 1
  done

  echo "timed out waiting for $url" >&2
  return 1
}
# One loopback port per ChainFire listener, plus one for the Deployer.
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"
deployer_port="$(free_port)"

# Token written into deployer.toml and sent as X-Deployer-Token by the checks.
bootstrap_token="bootstrap-secret"

# Stand-in bundle; the flake-bundle check compares the served bytes to this.
printf '%s' 'bundle-bytes' >"$tmp_dir/flake-bundle.tar.gz"

# ChainFire configuration; the unquoted delimiter lets the shell substitute
# the scratch directory and the allocated ports.
cat >"$tmp_dir/chainfire.toml" <<CHAINFIRE_TOML
[node]
id = 1
name = "chainfire-1"
role = "control_plane"
[storage]
data_dir = "$tmp_dir/chainfire-data"
[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
[cluster]
id = 1
initial_members = []
bootstrap = true
[raft]
role = "voter"
CHAINFIRE_TOML

printf 'Starting ChainFire on 127.0.0.1:%s\n' "$api_port"
# Run in the background with stdout and stderr captured in the scratch dir.
cargo run \
  --manifest-path "$ROOT/chainfire/Cargo.toml" \
  -p chainfire-server \
  -- --config "$tmp_dir/chainfire.toml" \
  >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid=$!

# Block until the API port accepts connections (up to 120 seconds).
wait_for_port 127.0.0.1 "$api_port" 120
# Deployer configuration: binds to the allocated loopback port, serves the
# placeholder flake bundle, uses the bootstrap token, and points at the
# ChainFire API endpoint started earlier.
cat >"$tmp_dir/deployer.toml" <<DEPLOYER_TOML
bind_addr = "127.0.0.1:${deployer_port}"
cluster_id = "test-cluster"
cluster_namespace = "photoncloud"
heartbeat_timeout_secs = 300
local_state_path = "$tmp_dir/deployer-state"
bootstrap_flake_bundle_path = "$tmp_dir/flake-bundle.tar.gz"
bootstrap_token = "${bootstrap_token}"
require_chainfire = true
allow_unknown_nodes = false
allow_unauthenticated = false
allow_test_mappings = false
tls_self_signed = false
[chainfire]
endpoints = ["http://127.0.0.1:${api_port}"]
namespace = "deployer"
DEPLOYER_TOML

printf 'Starting Deployer on 127.0.0.1:%s\n' "$deployer_port"
# Run in the background with stdout and stderr captured in the scratch dir.
cargo run --quiet \
  --manifest-path "$ROOT/deployer/Cargo.toml" \
  -p deployer-server \
  -- --config "$tmp_dir/deployer.toml" \
  >"$tmp_dir/deployer.log" 2>&1 &
deployer_pid=$!

# Block until the health endpoint answers (up to 120 seconds).
wait_for_http "http://127.0.0.1:${deployer_port}/health" 120
# Declarative cluster description applied through deployer-ctl: two node
# classes, two pools, one pre-seeded node and one enrollment rule. The quoted
# delimiter keeps the document literal (no shell expansion).
# NOTE(review): the nesting below was reconstructed from key semantics and
# from the values the later checks assert (e.g. pool label `lane: edge`,
# desired-system `rollback_on_failure`); confirm in particular that `state`
# belongs to the node entry rather than to `desired_system`.
cat >"$tmp_dir/cluster.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev
node_classes:
  - name: general-worker
    nix_profile: profiles/worker-linux
    install_plan:
      nixos_configuration: worker-golden
      disko_config_path: profiles/worker-linux/disko.nix
      target_disk_by_id: /dev/disk/by-id/worker-default
    roles:
      - worker
    labels:
      tier: general
  - name: edge-metal
    nix_profile: profiles/edge-metal
    install_plan:
      nixos_configuration: edge-metal
      disko_config_path: profiles/edge-metal/disko.nix
      target_disk_by_id: /dev/disk/by-id/edge-default
    roles:
      - edge
    labels:
      tier: edge
pools:
  - name: general
    node_class: general-worker
    labels:
      env: dev
  - name: edge
    node_class: edge-metal
    labels:
      env: dev
      lane: edge
nodes:
  - node_id: node-seeded
    machine_id: known-machine-01
    hostname: node-seeded
    ip: 10.0.0.11
    pool: general
    failure_domain: rack-a
    install_plan:
      nixos_configuration: node01
      disko_config_path: nix/nodes/vm-cluster/node01/disko.nix
      target_disk: /dev/vda
    desired_system:
      flake_ref: "github:centra/cloud"
      health_check_command:
        - systemctl
        - is-system-running
        - "--wait"
      rollback_on_failure: true
    state: pending
enrollment_rules:
  - name: edge-metal-auto
    priority: 100
    match_labels:
      rack: edge
      sku: metal
    pool: edge
    labels:
      managed-by: deployer
    services:
      - prismnet
    ssh_authorized_keys:
      - ssh-ed25519 AAAATEST edge@test
    node_id_prefix: edge
EOF
# Base URLs of the two locally started services.
chainfire_endpoint="http://127.0.0.1:${api_port}"
deployer_endpoint="http://127.0.0.1:${deployer_port}"

#######################################
# Run deployer-ctl through cargo with the connection and namespace flags used
# by this test.
# Globals:   ROOT, chainfire_endpoint (read)
# Arguments: the deployer-ctl subcommand and its options, passed through as-is
# Returns:   the exit status of the cargo invocation
#######################################
run_deployer_ctl() {
  local -a ctl=(
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl --
    --chainfire-endpoint "$chainfire_endpoint"
    --cluster-id test-cluster
    --cluster-namespace photoncloud
    --deployer-namespace deployer
  )
  "${ctl[@]}" "$@"
}
echo "Applying declarative cluster/bootstrap config"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune

echo "Validating seeded machine bootstrap mapping"
# Phone home as the pre-seeded machine and check that the reply carries the
# node id, pool/class data and install plan declared in cluster.yaml.
python3 - "$deployer_endpoint" "$bootstrap_token" <<'PY'
import json
import sys
import urllib.request

endpoint, token = sys.argv[1], sys.argv[2]

body = json.dumps({"machine_id": "known-machine-01", "ip": "10.0.0.11"}).encode()
req = urllib.request.Request(
    endpoint + "/api/v1/phone-home",
    data=body,
    headers={
        "Content-Type": "application/json",
        "X-Deployer-Token": token,
    },
)
with urllib.request.urlopen(req, timeout=5) as resp:
    reply = json.loads(resp.read().decode("utf-8"))

assert reply["success"] is True
assert reply["node_id"] == "node-seeded"

node_config = reply["node_config"]
assert node_config["pool"] == "general"
assert node_config["node_class"] == "general-worker"
assert node_config["nix_profile"] == "profiles/worker-linux"

install_plan = node_config["install_plan"]
assert install_plan["nixos_configuration"] == "node01"
assert install_plan["disko_config_path"] == "nix/nodes/vm-cluster/node01/disko.nix"
assert install_plan["target_disk"] == "/dev/vda"

assert node_config["failure_domain"] == "rack-a"
print("Seeded mapping validated")
PY
echo "Validating cloud-init metadata endpoints"
# Fetch meta-data and user-data for the seeded machine and check that the
# expected markers appear in each document.
python3 - "$deployer_endpoint" "$bootstrap_token" <<'PY'
import sys
import urllib.request

endpoint, token = sys.argv[1], sys.argv[2]


def fetch(path):
    """Return the decoded body of an authenticated GET on endpoint + path."""
    req = urllib.request.Request(
        endpoint + path,
        headers={"X-Deployer-Token": token},
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return resp.read().decode("utf-8")


meta_data = fetch("/api/v1/cloud-init/known-machine-01/meta-data")
assert "instance-id: node-seeded" in meta_data

user_data = fetch("/api/v1/cloud-init/known-machine-01/user-data")
assert "#cloud-config" in user_data
assert "/etc/plasmacloud/node-config.json" in user_data
assert "profiles/worker-linux" in user_data
assert "\"nixos_configuration\": \"node01\"" in user_data

print("cloud-init endpoints validated")
PY
echo "Validating bootstrap flake bundle endpoint"
# The served bundle must match the placeholder bytes written to
# flake-bundle.tar.gz earlier in this script.
python3 - "$deployer_endpoint" "$bootstrap_token" <<'PY'
import sys
import urllib.request

endpoint, token = sys.argv[1], sys.argv[2]

req = urllib.request.Request(
    endpoint + "/api/v1/bootstrap/flake-bundle",
    headers={"X-Deployer-Token": token},
)
with urllib.request.urlopen(req, timeout=5) as resp:
    bundle = resp.read()

assert bundle == b"bundle-bytes"
print("bootstrap flake bundle endpoint validated")
PY
echo "Validating enrollment-rule bootstrap path"
# Phone home as a machine that is not listed in cluster.yaml but whose
# metadata matches the enrollment rule's match_labels; the node id printed by
# the Python check is captured for the record inspection that follows.
dynamic_node_id="$(
  python3 - "$deployer_endpoint" "$bootstrap_token" <<'PY'
import json
import sys
import urllib.request

endpoint, token = sys.argv[1], sys.argv[2]

hardware_facts = {
    "architecture": "x86_64",
    "cpu_model": "Example CPU",
    "cpu_threads": 32,
    "cpu_cores": 16,
    "memory_bytes": 137438953472,
    "disks": [
        {
            "name": "nvme0n1",
            "path": "/dev/nvme0n1",
            "by_id": "/dev/disk/by-id/nvme-dynamic-metal-01",
            "size_bytes": 2000398934016,
            "model": "Example NVMe",
            "serial": "disk-serial-01",
            "rotational": False,
        }
    ],
    "nics": [
        {
            "name": "eno1",
            "mac_address": "52:54:00:aa:bb:cc",
            "oper_state": "up",
        }
    ],
    "dmi": {
        "vendor": "ExampleVendor",
        "product_name": "ExampleMetal",
        "serial_number": "dynamic-metal-serial",
    },
}
phone_home = {
    "machine_id": "dynamic-metal-01",
    "ip": "10.0.9.25",
    "metadata": {
        "rack": "edge",
        "sku": "metal",
        "topology.kubernetes.io/zone": "rack-z",
    },
    "hardware_facts": hardware_facts,
}

req = urllib.request.Request(
    endpoint + "/api/v1/phone-home",
    data=json.dumps(phone_home).encode(),
    headers={
        "Content-Type": "application/json",
        "X-Deployer-Token": token,
    },
)
with urllib.request.urlopen(req, timeout=5) as resp:
    reply = json.loads(resp.read().decode("utf-8"))

assert reply["success"] is True
assert reply["node_id"].startswith("edge-")

node_config = reply["node_config"]
assert node_config["role"] == "edge"
assert node_config["pool"] == "edge"
assert node_config["node_class"] == "edge-metal"
assert node_config["nix_profile"] == "profiles/edge-metal"

install_plan = node_config["install_plan"]
assert install_plan["nixos_configuration"] == "edge-metal"
assert install_plan["disko_config_path"] == "profiles/edge-metal/disko.nix"
assert install_plan["target_disk_by_id"] == "/dev/disk/by-id/edge-default"

assert "prismnet" in node_config["services"]
assert node_config["labels"]["managed-by"] == "deployer"

# The only stdout output: the shell captures it as dynamic_node_id.
print(reply["node_id"])
PY
)"
echo "Inspecting stored cluster node records"
# Dump everything stored under the cluster's nodes/ prefix and verify the
# seeded and the dynamically enrolled node records.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes.dump"
python3 - "$tmp_dir/nodes.dump" "$dynamic_node_id" <<'PY'
import json
import sys

path = sys.argv[1]
dynamic_id = sys.argv[2]

# Index node records by node_id. Only lines containing both " key=" and
# " value=" are considered; desired-system entries are checked separately.
records = {}
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if key.endswith("/desired-system"):
            continue
        value = line.split(" value=", 1)[1]
        record = json.loads(value)
        records[record["node_id"]] = record

seeded = records.get("node-seeded")
dynamic = records.get(dynamic_id)
if seeded is None:
    raise SystemExit("missing seeded node record")
if dynamic is None:
    raise SystemExit("missing dynamic node record")

# `or` fallbacks: a key that is absent, null or empty must lead to the
# descriptive SystemExit below rather than an AttributeError/IndexError.
seeded_plan = seeded.get("install_plan") or {}
dynamic_plan = dynamic.get("install_plan") or {}
dynamic_labels = dynamic.get("labels") or {}
facts = dynamic.get("hardware_facts") or {}
first_disk = (facts.get("disks") or [{}])[0]

if seeded.get("pool") != "general" or seeded.get("node_class") != "general-worker":
    raise SystemExit(f"unexpected seeded node record: {seeded}")
if dynamic.get("pool") != "edge" or dynamic.get("node_class") != "edge-metal":
    raise SystemExit(f"unexpected dynamic node record: {dynamic}")
if dynamic.get("failure_domain") != "rack-z":
    raise SystemExit(f"unexpected dynamic failure domain: {dynamic}")
if dynamic_labels.get("lane") != "edge":
    raise SystemExit(f"missing pool label propagation: {dynamic}")
if seeded_plan.get("target_disk") != "/dev/vda":
    raise SystemExit(f"missing seeded target disk: {seeded}")
if dynamic_plan.get("target_disk_by_id") != "/dev/disk/by-id/edge-default":
    raise SystemExit(f"missing dynamic target disk by-id: {dynamic}")
if facts.get("architecture") != "x86_64":
    raise SystemExit(f"missing dynamic hardware architecture: {dynamic}")
if first_disk.get("by_id") != "/dev/disk/by-id/nvme-dynamic-metal-01":
    raise SystemExit(f"missing dynamic hardware disk facts: {dynamic}")
if dynamic_labels.get("hardware.architecture") != "x86_64":
    raise SystemExit(f"missing hardware metadata labels: {dynamic}")
if dynamic_labels.get("hardware.disk_count") != "1":
    raise SystemExit(f"missing hardware disk count label: {dynamic}")
print("Deployer bootstrap records validated")
PY
echo "Inspecting desired-system state"
# Dump the seeded node's desired-system key and compare the stored JSON with
# the desired_system section declared in cluster.yaml.
run_deployer_ctl dump \
  --prefix "photoncloud/clusters/test-cluster/nodes/node-seeded/desired-system" \
  >"$tmp_dir/desired-system.dump"
python3 - "$tmp_dir/desired-system.dump" <<'PY'
import json
import sys

dump_path = sys.argv[1]

# Keep only the dump lines that carry a value; exactly one is expected.
entries = []
with open(dump_path, "r", encoding="utf-8") as dump:
    for raw in dump:
        if " value=" in raw:
            entries.append(raw.strip())
if len(entries) != 1:
    raise SystemExit(f"unexpected desired-system dump: {entries}")

desired = json.loads(entries[0].split(" value=", 1)[1])
assert desired["node_id"] == "node-seeded"
assert desired["nixos_configuration"] == "node01"
assert desired["flake_ref"] == "github:centra/cloud"
assert desired["health_check_command"] == ["systemctl", "is-system-running", "--wait"]
assert desired["rollback_on_failure"] is True
print("desired-system state validated")
PY

echo "Deployer bootstrap E2E verification passed"