photoncloud-monorepo/deployer/scripts/verify-host-lifecycle-e2e.sh

431 lines
12 KiB
Bash

#!/usr/bin/env bash
# End-to-end verification of the host lifecycle flow (ChainFire + deployer).
set -euo pipefail

# Repository root: two levels above this script's directory.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT="$(cd "$script_dir/../.." && pwd)"

# Re-exec inside the nix dev shell exactly once; the marker variable stops
# the recursion on the second pass.
[[ -n "${PHOTONCLOUD_E2E_IN_NIX:-}" ]] || exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
# Run the chainfire-server binary: prefer a prebuilt binary supplied via the
# environment, otherwise build-and-run through cargo.
# Arguments: passed straight through to the server.
run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    # --quiet keeps cargo's build progress out of the captured server log,
    # matching the other run_*_bin helpers in this script.
    cargo run --quiet --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}
# Run deployer-ctl, preferring a prebuilt binary when one is provided via
# the environment; falls back to building with cargo.
run_deployer_ctl_bin() {
  local prebuilt="${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}"
  if [[ -n "$prebuilt" ]]; then
    "$prebuilt" "$@"
    return
  fi
  cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
}
# Run the plasmacloud-reconciler, preferring a prebuilt binary from the
# environment over a cargo build.
run_plasmacloud_reconciler_bin() {
  case "${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}" in
    "")
      cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
      ;;
    *)
      "$PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN" "$@"
      ;;
  esac
}
# Scratch state shared with cleanup(); the pids are filled in once the
# background services have been started.
tmp_dir="$(mktemp -d)"
cf_pid=""
redfish_pid=""

# Tear down background processes and scratch files on any exit path.
cleanup() {
  set +e  # best-effort teardown: keep going even if a process already died
  if [[ -n "$redfish_pid" ]]; then
    kill "$redfish_pid" 2>/dev/null || true
    wait "$redfish_pid" 2>/dev/null || true
  fi
  if [[ -n "$cf_pid" ]]; then
    kill "$cf_pid" 2>/dev/null || true
    wait "$cf_pid" 2>/dev/null || true
  fi
  # ${tmp_dir:?} aborts if the var is somehow empty, so this can never
  # expand to a bare "rm -rf" of an unintended path; -- guards odd names.
  rm -rf -- "${tmp_dir:?}"
}
trap cleanup EXIT
# Print an unused localhost TCP port; binding to port 0 makes the kernel
# pick a free ephemeral port.
free_port() {
  python3 -c 'import socket
with socket.socket() as s:
    s.bind(("127.0.0.1", 0))
    print(s.getsockname()[1])'
}
# Block until host:port accepts TCP connections, retrying once per second.
# $1 host, $2 port, $3 timeout in seconds (default 60).
# Returns 0 on success, 1 (with a message on stderr) on timeout.
wait_for_port() {
  local host="$1" port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if python3 -c '
import socket
import sys

with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((sys.argv[1], int(sys.argv[2])))
    except OSError:
        raise SystemExit(1)
' "$host" "$port"; then
      return 0
    fi
    sleep 1
  done
  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}
# Pick a free ephemeral port for every service endpoint the test stands up:
# api_port, http_port, raft_port, gossip_port, redfish_port.
for _svc in api http raft gossip redfish; do
  _p="$(free_port)"
  printf -v "${_svc}_port" '%s' "$_p"
done
unset _svc _p
# Render a standalone single-node ChainFire config; bootstrap = true with no
# initial members so the one-node cluster starts on its own.
cat >"$tmp_dir/chainfire.toml" <<EOF
[node]
id = 1
name = "chainfire-1"
role = "control_plane"
[storage]
data_dir = "$tmp_dir/chainfire-data"
[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
[cluster]
id = 1
initial_members = []
bootstrap = true
[raft]
role = "voter"
EOF
# Write a minimal mock Redfish BMC: GET on the system resource reports
# PowerState "On"; POST to the ComputerSystem.Reset action appends the JSON
# payload to a log file that later assertions read.
# (Python indentation restored here; the quoted 'PY' delimiter means the
# heredoc body is written verbatim with no shell expansion.)
cat >"$tmp_dir/mock-redfish.py" <<'PY'
import http.server
import json
import sys

port = int(sys.argv[1])
log_path = sys.argv[2]


class Handler(http.server.BaseHTTPRequestHandler):
    def log_message(self, format, *args):
        # Silence per-request logging.
        pass

    def do_GET(self):
        if self.path == "/redfish/v1/Systems/node01":
            body = json.dumps({"PowerState": "On"}).encode("utf-8")
            self.send_response(200)
            self.send_header("Content-Type", "application/json")
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)
            return
        self.send_error(404)

    def do_POST(self):
        if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
            self.send_error(404)
            return
        length = int(self.headers.get("Content-Length", "0"))
        payload = self.rfile.read(length).decode("utf-8")
        with open(log_path, "a", encoding="utf-8") as handle:
            handle.write(payload + "\n")
        self.send_response(204)
        self.end_headers()


server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
server.serve_forever()
PY
# Launch ChainFire in the background and wait for both its API and HTTP
# ports to accept connections before proceeding.
echo "Starting ChainFire on 127.0.0.1:${api_port}"
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
wait_for_port "127.0.0.1" "$api_port" 120
wait_for_port "127.0.0.1" "$http_port" 120
# Launch the mock Redfish BMC; its POST log ($tmp_dir/redfish.log) is
# asserted on later in the power-cycle check.
echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
redfish_pid="$!"
wait_for_port "127.0.0.1" "$redfish_port" 30
# Cluster config under test: node01 carries a bmc_ref pointing at the mock
# Redfish server while node02 deliberately has none; worker-rollout selects
# both nodes via role/pool/node-class/label and rolls one node at a time
# (batch_size: 1) with rollback_on_failure enabled.
# NOTE(review): the YAML indentation below was reconstructed from a
# whitespace-mangled copy -- verify the nesting (cluster metadata vs.
# top-level sections) against the deployer-ctl config schema.
cat >"$tmp_dir/cluster.yaml" <<EOF
cluster:
  cluster_id: test-cluster
  environment: dev
node_classes:
  - name: worker-linux
    roles:
      - worker
    labels:
      tier: general
pools:
  - name: general
    node_class: worker-linux
    labels:
      env: dev
nodes:
  - node_id: node01
    hostname: node01
    ip: 10.0.0.11
    roles:
      - worker
    labels:
      tier: general
    pool: general
    node_class: worker-linux
    state: active
    commission_state: commissioned
    install_state: installed
    bmc_ref: "redfish+http://127.0.0.1:${redfish_port}/redfish/v1/Systems/node01"
  - node_id: node02
    hostname: node02
    ip: 10.0.0.12
    roles:
      - worker
    labels:
      tier: general
    pool: general
    node_class: worker-linux
    state: active
    commission_state: commissioned
    install_state: installed
host_deployments:
  - name: worker-rollout
    selector:
      roles:
        - worker
      pools:
        - general
      node_classes:
        - worker-linux
      match_labels:
        tier: general
    nixos_configuration: worker-next
    flake_ref: "github:centra/cloud"
    batch_size: 1
    max_unavailable: 1
    health_check_command:
      - "true"
    switch_action: switch
    rollback_on_failure: true
EOF
chainfire_endpoint="http://127.0.0.1:${api_port}"

# deployer-ctl invocation with the shared test-cluster flags pre-applied;
# extra arguments are appended after the common set.
run_deployer_ctl() {
  local common=(
    --chainfire-endpoint "$chainfire_endpoint"
    --cluster-id test-cluster
    --cluster-namespace photoncloud
    --deployer-namespace deployer
  )
  run_deployer_ctl_bin "${common[@]}" "$@"
}
# Run a single reconciliation pass of the host rollout controller.
run_hosts_once() {
  local flags=(
    --endpoint "$chainfire_endpoint"
    --cluster-namespace photoncloud
    --cluster-id test-cluster
    --heartbeat-timeout-secs 300
    --once
  )
  run_plasmacloud_reconciler_bin hosts "${flags[@]}"
}
# Seed the store with the cluster config, then run one reconcile pass;
# with batch_size 1 exactly one node (node01) should enter the first wave.
echo "Applying host lifecycle cluster config"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune
echo "Running host rollout controller"
run_hosts_once
# Assert the rollout is running with node01 in progress and no failures.
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-1.json"
python3 - "$tmp_dir/deployment-1.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["phase"] == "running", payload
assert status["in_progress_nodes"] == ["node01"], payload
assert status["failed_nodes"] == [], payload
print("initial rollout wave validated")
PY
# Dump the node KV entries and confirm a desired-system record exists for
# node01 only, attributed to the worker-rollout deployment.
# (Python indentation inside the heredoc restored from a mangled copy.)
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
import json
import sys

desired = {}
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        # Dump lines look like "... key=<k> value=<json>"; skip the rest.
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        desired[payload["node_id"]] = payload
assert sorted(desired) == ["node01"], desired
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
print("desired-system first wave validated")
PY
# Exercise operator pause/resume and check both paused flags round-trip.
echo "Pausing and resuming deployment via CLI"
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
python3 - "$tmp_dir/pause.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is True, payload
assert payload["paused_by_operator"] is True, payload
print("pause command validated")
PY
run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
python3 - "$tmp_dir/resume.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is False, payload
assert payload["paused_by_operator"] is False, payload
print("resume command validated")
PY
# Report node01 as successfully switched to worker-next; the next reconcile
# pass should advance the rollout to node02 (batch_size is 1).
echo "Marking node01 rollout complete and reconciling next wave"
run_deployer_ctl node set-observed \
--node-id node01 \
--status active \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
python3 - "$tmp_dir/deployment-2.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["completed_nodes"] == ["node01"], payload
assert status["in_progress_nodes"] == ["node02"], payload
print("second rollout wave validated")
PY
# Report node02 as rolled-back; the reconcile pass should record it as a
# failed node and pause the deployment (rollback_on_failure is set in the
# cluster config).
echo "Marking node02 rollout failed and validating auto-pause"
run_deployer_ctl node set-observed \
--node-id node02 \
--status rolled-back \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
python3 - "$tmp_dir/deployment-3.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["paused"] is True, payload
assert status["failed_nodes"] == ["node02"], payload
print("auto-pause on failure validated")
PY
# node01's bmc_ref points at the mock Redfish server, which reports
# PowerState "On"; a refresh should surface that as power_state "on".
echo "Refreshing power state through Redfish"
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
python3 - "$tmp_dir/node-power.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["power_state"] == "on", payload
print("power refresh validated")
PY
# Request a reinstall with --power-cycle: the node should transition to
# provisioning/reinstall_requested/cycling, and the mock Redfish log must
# show a ComputerSystem.Reset POST carrying ResetType PowerCycle.
echo "Requesting reinstall with power cycle"
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["state"] == "provisioning", payload
assert payload["install_state"] == "reinstall_requested", payload
assert payload["power_state"] == "cycling", payload
lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
print("reinstall orchestration validated")
PY
# A reinstall must clear node01's desired-system and observed-system keys.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
import sys
lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
assert not any("/desired-system" in line for line in lines), lines
assert not any("/observed-system" in line for line in lines), lines
print("reinstall state cleanup validated")
PY
# Abort the deployment; it should report phase "aborted" and stay paused.
echo "Aborting deployment and clearing desired-system"
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
python3 - "$tmp_dir/abort.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["phase"] == "aborted", payload
assert payload["paused"] is True, payload
print("abort command validated")
PY
# After abort, no node may retain a desired-system record attributed to the
# worker-rollout deployment.
# (Python indentation inside the heredoc restored from a mangled copy.)
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
import json
import sys

desired_nodes = []
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        # Dump lines look like "... key=<k> value=<json>"; skip the rest.
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        if payload.get("deployment_id") == "worker-rollout":
            desired_nodes.append(payload["node_id"])
assert desired_nodes == [], desired_nodes
print("desired-system cleanup validated")
PY
echo "Host lifecycle E2E verification passed"