#!/usr/bin/env bash
#
# Host lifecycle end-to-end check: setup half.
#
# Starts a ChainFire server and a mock Redfish endpoint on freshly allocated
# loopback ports, inside a throwaway temp directory that is removed on exit.
#
# Environment:
#   PHOTONCLOUD_E2E_IN_NIX                   set by this script after it has
#                                            re-executed itself in `nix develop`
#   PHOTONCLOUD_CHAINFIRE_SERVER_BIN         optional prebuilt chainfire-server
#   PHOTONCLOUD_DEPLOYER_CTL_BIN             optional prebuilt deployer-ctl
#   PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN   optional prebuilt reconciler
set -euo pipefail

# Two directory levels above the directory that contains this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Re-run the script inside `nix develop`. The marker variable is set for the
# child so the second invocation skips this branch instead of looping.
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
  exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi

#######################################
# Run chainfire-server with the given arguments.
# Uses $PHOTONCLOUD_CHAINFIRE_SERVER_BIN when set and non-empty, otherwise
# falls back to `cargo run` against the chainfire workspace.
# Arguments: forwarded verbatim to the server.
#######################################
run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}

#######################################
# Run deployer-ctl with the given arguments.
# Uses $PHOTONCLOUD_DEPLOYER_CTL_BIN when set and non-empty, otherwise
# falls back to a quiet `cargo run` against the deployer workspace.
# Arguments: forwarded verbatim to deployer-ctl.
#######################################
run_deployer_ctl_bin() {
  if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
    "$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
  fi
}

#######################################
# Run plasmacloud-reconciler with the given arguments.
# Uses $PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN when set and non-empty,
# otherwise falls back to a quiet `cargo run` against the deployer workspace.
# Arguments: forwarded verbatim to the reconciler.
#######################################
run_plasmacloud_reconciler_bin() {
  if [[ -n "${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}" ]]; then
    "$PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
  fi
}

# Scratch directory for configs, logs and captured command output.
tmp_dir="$(mktemp -d)"
# PIDs of the background services; empty until the service has been started.
cf_pid=""
redfish_pid=""

#######################################
# EXIT handler: stop any background service that was started, reap it, and
# delete the scratch directory.
# Globals: redfish_pid, cf_pid, tmp_dir (read)
#######################################
cleanup() {
  # Best effort from here on: a failing kill/wait must not abort the cleanup.
  set +e
  if [[ -n "$redfish_pid" ]]; then
    kill "$redfish_pid" 2>/dev/null || true
    wait "$redfish_pid" 2>/dev/null || true
  fi
  if [[ -n "$cf_pid" ]]; then
    kill "$cf_pid" 2>/dev/null || true
    wait "$cf_pid" 2>/dev/null || true
  fi
  rm -rf "$tmp_dir"
}

trap cleanup EXIT

#######################################
# Print a TCP port number obtained by binding 127.0.0.1:0 and reading back
# the port the OS assigned. The socket is closed before returning, so the
# port is only free at the moment of the call, not reserved.
# Outputs: the port number on stdout.
#######################################
free_port() {
  python3 - <<'PY'
import socket

s = socket.socket()
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
s.close()
PY
}

#######################################
# Poll until a TCP connection to host:port succeeds.
# Arguments:
#   $1 - host
#   $2 - port
#   $3 - timeout in seconds (optional, default 60)
# Outputs: a timeout message on stderr when the deadline passes.
# Returns: 0 once a connection succeeds, 1 on timeout.
#######################################
wait_for_port() {
  local host="$1"
  local port="$2"
  local timeout_secs="${3:-60}"
  # SECONDS is bash's built-in elapsed-time counter.
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    # The probe exits 0 when the connect succeeds and 1 on any OSError.
    if python3 - "$host" "$port" <<'PY'
import socket
import sys

host = sys.argv[1]
port = int(sys.argv[2])

with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
    then
      return 0
    fi
    sleep 1
  done

  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}

# One loopback port per listener used by the test.
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"
redfish_port="$(free_port)"

# NOTE(review): the next command looks truncated in the text under review. It
# reads like two merged commands: a heredoc writing chainfire.toml whose body
# is missing, followed by a `cat >"$tmp_dir/mock-redfish.py" <<'PY'`. As
# written, the `<` redirection names a file that nothing above creates, so the
# command fails under `set -e`, and the Python below would land in
# chainfire.toml. Restore the original config heredoc before relying on this.
cat >"$tmp_dir/chainfire.toml" <"$tmp_dir/mock-redfish.py" <<'PY'
import http.server
import json
import sys

port = int(sys.argv[1])
log_path = sys.argv[2]


class Handler(http.server.BaseHTTPRequestHandler):
    def log_message(self, format, *args):
        pass

    def do_GET(self):
        if self.path == "/redfish/v1/Systems/node01":
            body = json.dumps({"PowerState": "On"}).encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            return
        self.send_error(404)

    def do_POST(self):
        if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", "0"))
        payload = self.rfile.read(length).decode("utf-8")
        with open(log_path, "a", encoding="utf-8") as handle:
            handle.write(payload + "\n")
        self.send_response(204)
        self.end_headers()


server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
server.serve_forever()
PY

echo "Starting ChainFire on 127.0.0.1:${api_port}"
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
# Generous timeout: without a prebuilt binary this goes through `cargo run`.
wait_for_port "127.0.0.1" "$api_port" 120
wait_for_port "127.0.0.1" "$http_port" 120

echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
# The mock appends every Reset POST body to redfish.log.
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
redfish_pid="$!"
# Block until the mock Redfish endpoint accepts connections.
wait_for_port "127.0.0.1" "$redfish_port" 30

# NOTE(review): the next command looks truncated in the text under review. It
# reads like a heredoc that wrote cluster.yaml, followed by further setup
# ending in a `... >"$tmp_dir/deployment-1.json"` capture, with everything in
# between missing. `run_deployer_ctl` and `run_hosts_once` are called below but
# are not defined anywhere in the visible text; presumably they were defined in
# the missing span. Restore that span before relying on this section.
cat >"$tmp_dir/cluster.yaml" <"$tmp_dir/deployment-1.json"

# First wave: only node01 may be in progress and nothing may have failed.
python3 - "$tmp_dir/deployment-1.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["phase"] == "running", payload
assert status["in_progress_nodes"] == ["node01"], payload
assert status["failed_nodes"] == [], payload
print("initial rollout wave validated")
PY

# The dump is parsed as lines carrying " key=<key> value=<json>"; only keys
# ending in /desired-system are inspected.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
import json
import sys

desired = {}
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        desired[payload["node_id"]] = payload

assert sorted(desired) == ["node01"], desired
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
print("desired-system first wave validated")
PY

echo "Pausing and resuming deployment via CLI"
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
python3 - "$tmp_dir/pause.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is True, payload
assert payload["paused_by_operator"] is True, payload
print("pause command validated")
PY

run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
python3 - "$tmp_dir/resume.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is False, payload
assert payload["paused_by_operator"] is False, payload
print("resume command validated")
PY

echo "Marking node01 rollout complete and reconciling next wave"
run_deployer_ctl node set-observed \
  --node-id node01 \
  --status active \
  --nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
python3 - "$tmp_dir/deployment-2.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["completed_nodes"] == ["node01"], payload
assert status["in_progress_nodes"] == ["node02"], payload
print("second rollout wave validated")
PY

echo "Marking node02 rollout failed and validating auto-pause"
run_deployer_ctl node set-observed \
  --node-id node02 \
  --status rolled-back \
  --nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
python3 - "$tmp_dir/deployment-3.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["paused"] is True, payload
assert status["failed_nodes"] == ["node02"], payload
print("auto-pause on failure validated")
PY

echo "Refreshing power state through Redfish"
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
python3 - "$tmp_dir/node-power.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["power_state"] == "on", payload
print("power refresh validated")
PY

echo "Requesting reinstall with power cycle"
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
# The second argument is the mock Redfish request log; it must contain a
# PowerCycle reset request.
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["state"] == "provisioning", payload
assert payload["install_state"] == "reinstall_requested", payload
assert payload["power_state"] == "cycling", payload

lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
print("reinstall orchestration validated")
PY

# After the reinstall request, node01 must have neither a desired-system nor
# an observed-system entry left in the dump.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
import sys

lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
assert not any("/desired-system" in line for line in lines), lines
assert not any("/observed-system" in line for line in lines), lines
print("reinstall state cleanup validated")
PY

echo "Aborting deployment and clearing desired-system"
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
python3 - "$tmp_dir/abort.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["phase"] == "aborted", payload
assert payload["paused"] is True, payload
print("abort command validated")
PY

# After the abort, no desired-system entry may still reference worker-rollout.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
import json
import sys

desired_nodes = []
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        if payload.get("deployment_id") == "worker-rollout":
            desired_nodes.append(payload["node_id"])

assert desired_nodes == [], desired_nodes
print("desired-system cleanup validated")
PY

echo "Host lifecycle E2E verification passed"