#!/usr/bin/env bash
# End-to-end test for the PhotonCloud host lifecycle: boots a single-node
# ChainFire control plane plus a mock Redfish BMC, then drives a host
# deployment rollout (waves, pause/resume, failure auto-pause, power ops,
# reinstall, abort) through deployer-ctl and the plasmacloud reconciler.
set -euo pipefail

# Repository root; this script lives two directories below it.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Re-exec inside the nix dev shell exactly once; the sentinel env var
# prevents an infinite re-exec loop.
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
  exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
# Run the chainfire-server binary: prefer a prebuilt binary supplied via
# PHOTONCLOUD_CHAINFIRE_SERVER_BIN, otherwise build and run through cargo.
# Arguments are forwarded verbatim to the server.
run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    # --quiet keeps cargo's build chatter out of the captured server log,
    # matching the other cargo-run wrappers in this script.
    cargo run --quiet --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}
# Invoke deployer-ctl: a prebuilt binary from PHOTONCLOUD_DEPLOYER_CTL_BIN
# wins; otherwise fall back to building it with cargo.
run_deployer_ctl_bin() {
  local prebuilt="${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}"
  if [[ -n "$prebuilt" ]]; then
    "$prebuilt" "$@"
    return
  fi
  cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
}
# Invoke the plasmacloud reconciler: a prebuilt binary from
# PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN wins; otherwise build via cargo.
run_plasmacloud_reconciler_bin() {
  local prebuilt="${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}"
  if [[ -n "$prebuilt" ]]; then
    "$prebuilt" "$@"
    return
  fi
  cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
}
# Scratch directory for configs, logs, and dumps; removed by cleanup().
tmp_dir="$(mktemp -d)"

# PIDs of the background processes we spawn; empty until each is started.
cf_pid="" redfish_pid=""
# Tear down background processes and scratch files. Registered on EXIT so
# it runs on every exit path; teardown errors are deliberately ignored.
cleanup() {
  set +e
  local pid
  # Same order as startup teardown in the original: mock Redfish first,
  # then the ChainFire server.
  for pid in "$redfish_pid" "$cf_pid"; do
    [[ -z "$pid" ]] && continue
    kill "$pid" 2>/dev/null || true
    wait "$pid" 2>/dev/null || true
  done
  rm -rf "$tmp_dir"
}

trap cleanup EXIT
# Print an unused TCP port on 127.0.0.1. The kernel picks the port (bind
# to port 0); note the socket is closed before the caller binds, so a
# rare reuse race with other processes is possible.
free_port() {
  python3 - <<'PY'
import socket

with socket.socket() as s:
    s.bind(("127.0.0.1", 0))
    print(s.getsockname()[1])
PY
}
# Poll until a TCP connect to host:port succeeds; give up after
# timeout_secs (default 60) with a diagnostic on stderr.
# Returns 0 once connected, 1 on timeout.
wait_for_port() {
  local host="$1" port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    if python3 - "$host" "$port" <<'PY'
import socket
import sys

with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((sys.argv[1], int(sys.argv[2])))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
    then
      return 0
    fi
    sleep 1
  done

  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}
# Allocate ephemeral loopback ports for every service the test brings up.
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"
redfish_port="$(free_port)"
# Single-node ChainFire config: a self-bootstrapping one-member cluster
# listening only on loopback, with all state under the scratch directory.
# Unquoted EOF so the $tmp_dir/*_port variables expand.
cat >"$tmp_dir/chainfire.toml" <<EOF
[node]
id = 1
name = "chainfire-1"
role = "control_plane"

[storage]
data_dir = "$tmp_dir/chainfire-data"

[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"

[cluster]
id = 1
initial_members = []
bootstrap = true

[raft]
role = "voter"
EOF
cat >"$tmp_dir/mock-redfish.py" <<'PY'
|
|
import http.server
|
|
import json
|
|
import sys
|
|
|
|
port = int(sys.argv[1])
|
|
log_path = sys.argv[2]
|
|
|
|
class Handler(http.server.BaseHTTPRequestHandler):
|
|
def log_message(self, format, *args):
|
|
pass
|
|
|
|
def do_GET(self):
|
|
if self.path == "/redfish/v1/Systems/node01":
|
|
body = json.dumps({"PowerState": "On"}).encode("utf-8")
|
|
self.send_response(200)
|
|
self.send_header("Content-Type", "application/json")
|
|
self.send_header("Content-Length", str(len(body)))
|
|
self.end_headers()
|
|
self.wfile.write(body)
|
|
return
|
|
self.send_error(404)
|
|
|
|
def do_POST(self):
|
|
if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
|
|
self.send_error(404)
|
|
return
|
|
length = int(self.headers.get("Content-Length", "0"))
|
|
payload = self.rfile.read(length).decode("utf-8")
|
|
with open(log_path, "a", encoding="utf-8") as handle:
|
|
handle.write(payload + "\n")
|
|
self.send_response(204)
|
|
self.end_headers()
|
|
|
|
server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
|
|
server.serve_forever()
|
|
PY
|
|
|
|
echo "Starting ChainFire on 127.0.0.1:${api_port}"
# Server output is captured for post-mortem debugging; cleanup() kills it.
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
# Generous timeouts: the cargo fallback may need to compile first.
wait_for_port "127.0.0.1" "$api_port" 120
wait_for_port "127.0.0.1" "$http_port" 120

echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
redfish_pid="$!"
wait_for_port "127.0.0.1" "$redfish_port" 30
# Cluster definition under test: two worker nodes (only node01 has a BMC,
# pointed at the mock Redfish server) and one rolling host deployment with
# batch_size=1 so nodes are upgraded one wave at a time.
# Unquoted EOF so ${redfish_port} expands into node01's bmc_ref.
cat >"$tmp_dir/cluster.yaml" <<EOF
cluster:
  cluster_id: test-cluster
  environment: dev

node_classes:
  - name: worker-linux
    roles:
      - worker
    labels:
      tier: general

pools:
  - name: general
    node_class: worker-linux
    labels:
      env: dev

nodes:
  - node_id: node01
    hostname: node01
    ip: 10.0.0.11
    roles:
      - worker
    labels:
      tier: general
    pool: general
    node_class: worker-linux
    state: active
    commission_state: commissioned
    install_state: installed
    bmc_ref: "redfish+http://127.0.0.1:${redfish_port}/redfish/v1/Systems/node01"
  - node_id: node02
    hostname: node02
    ip: 10.0.0.12
    roles:
      - worker
    labels:
      tier: general
    pool: general
    node_class: worker-linux
    state: active
    commission_state: commissioned
    install_state: installed

host_deployments:
  - name: worker-rollout
    selector:
      roles:
        - worker
      pools:
        - general
      node_classes:
        - worker-linux
      match_labels:
        tier: general
    nixos_configuration: worker-next
    flake_ref: "github:centra/cloud"
    batch_size: 1
    max_unavailable: 1
    health_check_command:
      - "true"
    switch_action: switch
    rollback_on_failure: true
EOF
chainfire_endpoint="http://127.0.0.1:${api_port}"

# deployer-ctl wrapper pinning the endpoint/cluster flags shared by every
# invocation below; remaining arguments are appended verbatim.
run_deployer_ctl() {
  local common=(
    --chainfire-endpoint "$chainfire_endpoint"
    --cluster-id test-cluster
    --cluster-namespace photoncloud
    --deployer-namespace deployer
  )
  run_deployer_ctl_bin "${common[@]}" "$@"
}
# Run one reconciliation pass of the host rollout controller against the
# test cluster, then exit (--once).
run_hosts_once() {
  run_plasmacloud_reconciler_bin hosts \
    --endpoint "$chainfire_endpoint" \
    --cluster-namespace photoncloud \
    --cluster-id test-cluster \
    --heartbeat-timeout-secs 300 \
    --once
}
echo "Applying host lifecycle cluster config"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune

echo "Running host rollout controller"
run_hosts_once

# Wave 1: with batch_size=1 exactly node01 should be in progress and no
# node should have failed yet.
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-1.json"
python3 - "$tmp_dir/deployment-1.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["phase"] == "running", payload
assert status["in_progress_nodes"] == ["node01"], payload
assert status["failed_nodes"] == [], payload
print("initial rollout wave validated")
PY
# Confirm a desired-system record exists for node01 only, attributed to
# the worker-rollout deployment. Dump lines look like
# "... key=<path> value=<json>".
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
import json
import sys

desired = {}
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        desired[payload["node_id"]] = payload

assert sorted(desired) == ["node01"], desired
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
print("desired-system first wave validated")
PY
echo "Pausing and resuming deployment via CLI"
# Operator-initiated pause: both the paused flag and the operator
# attribution must be set.
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
python3 - "$tmp_dir/pause.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is True, payload
assert payload["paused_by_operator"] is True, payload
print("pause command validated")
PY
# Resume clears both flags.
run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
python3 - "$tmp_dir/resume.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is False, payload
assert payload["paused_by_operator"] is False, payload
print("resume command validated")
PY
echo "Marking node01 rollout complete and reconciling next wave"
# Simulate node01 reporting the target configuration as active, then run
# the controller again: the rollout should advance to node02.
run_deployer_ctl node set-observed \
  --node-id node01 \
  --status active \
  --nixos-configuration worker-next >/dev/null
run_hosts_once

run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
python3 - "$tmp_dir/deployment-2.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["completed_nodes"] == ["node01"], payload
assert status["in_progress_nodes"] == ["node02"], payload
print("second rollout wave validated")
PY
echo "Marking node02 rollout failed and validating auto-pause"
# Simulate node02 rolling back: the controller should record the failure
# and automatically pause the deployment.
run_deployer_ctl node set-observed \
  --node-id node02 \
  --status rolled-back \
  --nixos-configuration worker-next >/dev/null
run_hosts_once

run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
python3 - "$tmp_dir/deployment-3.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["paused"] is True, payload
assert status["failed_nodes"] == ["node02"], payload
print("auto-pause on failure validated")
PY
echo "Refreshing power state through Redfish"
# The mock BMC always reports PowerState=On; a refresh should surface
# that as power_state "on" in deployer-ctl's output.
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
python3 - "$tmp_dir/node-power.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["power_state"] == "on", payload
print("power refresh validated")
PY
echo "Requesting reinstall with power cycle"
# Reinstall with --power-cycle must move the node into provisioning /
# reinstall_requested and actually issue a PowerCycle reset to the BMC
# (verified via the mock's POST payload log).
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["state"] == "provisioning", payload
assert payload["install_state"] == "reinstall_requested", payload
assert payload["power_state"] == "cycling", payload

lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
print("reinstall orchestration validated")
PY
# A reinstall must clear the node's desired-system and observed-system
# records so the node starts from a clean slate.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
import sys

lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
assert not any("/desired-system" in line for line in lines), lines
assert not any("/observed-system" in line for line in lines), lines
print("reinstall state cleanup validated")
PY
echo "Aborting deployment and clearing desired-system"
# Abort terminates the rollout (phase "aborted") and leaves it paused.
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
python3 - "$tmp_dir/abort.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["phase"] == "aborted", payload
assert payload["paused"] is True, payload
print("abort command validated")
PY
# After the abort, no node may retain a desired-system record owned by
# the worker-rollout deployment.
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
import json
import sys

desired_nodes = []
with open(sys.argv[1], "r", encoding="utf-8") as handle:
    for line in handle:
        if " key=" not in line or " value=" not in line:
            continue
        key = line.split(" key=", 1)[1].split(" value=", 1)[0]
        if not key.endswith("/desired-system"):
            continue
        payload = json.loads(line.split(" value=", 1)[1])
        if payload.get("deployment_id") == "worker-rollout":
            desired_nodes.append(payload["node_id"])

assert desired_nodes == [], desired_nodes
print("desired-system cleanup validated")
PY

echo "Host lifecycle E2E verification passed"