#!/usr/bin/env bash
#
# End-to-end verification of the PhotonCloud fleet scheduler.
#
# Flow:
#   1. boot a ChainFire server on freshly allocated ports,
#   2. apply a cluster declaration through deployer-ctl,
#   3. activate nodes via node-agent and schedule instances via fleet-scheduler,
#   4. validate HTTP health and the instance records stored in ChainFire,
#   5. apply a scaled-down declaration and validate convergence.
#
# Component binaries may be supplied via PHOTONCLOUD_*_BIN env vars;
# otherwise each is built and run with `cargo run` from the workspace.

set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Re-exec inside the Nix dev shell exactly once; the marker variable
# prevents infinite recursion.
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
  exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi

# Each run_*_bin helper prefers a prebuilt binary from the environment and
# falls back to `cargo run` against the relevant workspace manifest.

run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}

run_deployer_ctl_bin() {
  if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
    "$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
  fi
}

run_node_agent_bin() {
  if [[ -n "${PHOTONCLOUD_NODE_AGENT_BIN:-}" ]]; then
    "$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
  fi
}

run_fleet_scheduler_bin() {
  if [[ -n "${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}" ]]; then
    "$PHOTONCLOUD_FLEET_SCHEDULER_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
  fi
}

tmp_dir="$(mktemp -d)"
cf_pid=""

# Kill every process the node-agent started (tracked via *.pid files), then
# the ChainFire server, then remove the scratch directory. Runs on all exit
# paths via the EXIT trap; errors are deliberately tolerated (set +e).
cleanup() {
  set +e
  if [[ -d "$tmp_dir/pids" ]]; then
    while IFS= read -r -d '' pid_file; do
      [[ -f "$pid_file" ]] || continue
      kill "$(cat "$pid_file")" 2>/dev/null || true
    done < <(find "$tmp_dir/pids" -type f -name '*.pid' -print0 2>/dev/null)
  fi
  if [[ -n "$cf_pid" ]]; then
    kill "$cf_pid" 2>/dev/null || true
    wait "$cf_pid" 2>/dev/null || true
  fi
  rm -rf "$tmp_dir"
}
trap cleanup EXIT

# Ask the kernel for a free TCP port. Inherently racy (the port is released
# before the server binds it), but acceptable for a local E2E run.
free_port() {
  python3 - <<'PY'
import socket
s = socket.socket()
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
s.close()
PY
}

# Poll until host:port accepts a TCP connection or timeout_secs elapses.
# Usage: wait_for_port HOST PORT [TIMEOUT_SECS]
wait_for_port() {
  local host="$1"
  local port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if python3 - "$host" "$port" <<'PY'
import socket
import sys

host = sys.argv[1]
port = int(sys.argv[2])
with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
    then
      return 0
    fi
    sleep 1
  done
  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}

api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"

# NOTE(review): the original body of chainfire.toml was lost when this file
# was reflowed onto single lines. The fields below are reconstructed from the
# four ports allocated above — confirm the key names against the
# chainfire-server config schema before relying on this.
cat >"$tmp_dir/chainfire.toml" <<EOF
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
data_dir = "${tmp_dir}/chainfire-data"
EOF

# NOTE(review): the server launch command was also lost in the reflow; only
# the log redirection and pid capture survived. Confirm the flag name
# (--config) against chainfire-server's CLI.
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
wait_for_port "127.0.0.1" "$api_port" 120

# NOTE(review): YAML nesting below was reconstructed after the reflow
# flattened all indentation; confirm the section layout against the deployer
# configuration schema.
cat >"$tmp_dir/cluster.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev
node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general
pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev
nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: pending
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: pending
services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

# Same cluster, but nodes already active and the api service scaled from
# replicas: 2 down to 1. Applied later with --prune to exercise scale-down.
cat >"$tmp_dir/cluster-scaled.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev
node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general
pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev
nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: active
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: active
services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

endpoint="http://127.0.0.1:${api_port}"

# deployer-ctl with the endpoint/cluster flags this run always needs.
run_deployer_ctl() {
  run_deployer_ctl_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    "$@"
}

# One reconciliation pass of node-agent for the given node, applying changes.
# PID files land under $tmp_dir/pids/<node> so cleanup() can reap processes.
run_node_agent_once() {
  local node_id="$1"
  local pid_dir="$tmp_dir/pids/$node_id"
  mkdir -p "$pid_dir"
  run_node_agent_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    --node-id "$node_id" \
    --pid-dir "$pid_dir" \
    --interval-secs 1 \
    --apply \
    --once
}

# One scheduling pass of the fleet scheduler.
run_scheduler_once() {
  run_fleet_scheduler_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    --interval-secs 1 \
    --once
}

echo "Applying cluster declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml"

echo "Activating nodes through node-agent"
run_node_agent_once node01
run_node_agent_once node02

echo "Scheduling managed instances"
run_scheduler_once

echo "Reconciling processes and health"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

echo "Validating HTTP endpoints"
python3 - <<'PY'
import urllib.request

for address in ("http://127.0.0.2:18080/", "http://127.0.0.3:18080/"):
    with urllib.request.urlopen(address, timeout=5) as response:
        body = response.read().decode("utf-8")
        if response.status != 200:
            raise SystemExit(f"{address} returned {response.status}")
        # python3 -m http.server serves a directory listing (or an index page
        # with a DOCTYPE); anything else means the wrong process answered.
        if "Directory listing" not in body and "DOCTYPE" not in body:
            raise SystemExit(f"{address} returned unexpected body")
print("HTTP endpoints are healthy")
PY

echo "Inspecting instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances.dump"
python3 - "$tmp_dir/instances.dump" <<'PY'
import json
import sys

# Parse "key=... value=<json>" dump lines and assert two healthy instances
# spread across node01 and node02.
path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))

if len(instances) != 2:
    raise SystemExit(f"expected 2 scheduled instances, found {len(instances)}")
node_ids = sorted(instance["node_id"] for instance in instances)
states = sorted(instance.get("state") for instance in instances)
if node_ids != ["node01", "node02"]:
    raise SystemExit(f"unexpected node placement: {node_ids}")
if states != ["healthy", "healthy"]:
    raise SystemExit(f"unexpected health states: {states}")
print("Observed two healthy scheduled instances across node01 and node02")
PY

echo "Applying scaled declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster-scaled.yaml" --prune

echo "Re-running scheduler after scale-down"
run_scheduler_once

echo "Reconciling processes and health after scale-down"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

echo "Inspecting scaled instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances-scaled.dump"
python3 - "$tmp_dir/instances-scaled.dump" <<'PY'
import json
import sys

# After scale-down, exactly one healthy instance must remain, on node01.
path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))

if len(instances) != 1:
    raise SystemExit(f"expected 1 scheduled instance after scale-down, found {len(instances)}")
instance = instances[0]
if instance["node_id"] != "node01":
    raise SystemExit(f"expected remaining instance on node01, found {instance['node_id']}")
if instance.get("state") != "healthy":
    raise SystemExit(f"expected remaining instance to be healthy, found {instance.get('state')}")
print("Observed one healthy scheduled instance on node01 after scale-down")
PY

echo "Validating endpoint convergence after scale-down"
python3 - <<'PY'
import socket
import urllib.request

# node01's endpoint must still answer; node02's must refuse connections.
with urllib.request.urlopen("http://127.0.0.2:18080/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 endpoint returned {response.status}")

sock = socket.socket()
sock.settimeout(1.5)
try:
    sock.connect(("127.0.0.3", 18080))
except OSError:
    pass
else:
    raise SystemExit("node02 endpoint still accepts connections after scale-down")
finally:
    sock.close()
print("Endpoint convergence validated")
PY

echo "Fleet scheduler E2E verification passed"