#!/usr/bin/env bash
#
# End-to-end verification of the PhotonCloud fleet scheduler.
#
# Flow:
#   1. Boot a single-node chainfire server on freshly allocated ports.
#   2. Apply a cluster declaration with two nodes and two services, where
#      `worker` depends on `api` being healthy.
#   3. Step node-agent and fleet-scheduler with --once and assert, via
#      deployer-ctl dumps and `service inspect`, that:
#        - worker is blocked while api has no healthy instances,
#        - both services converge to healthy across node01/node02,
#        - a pruned scale-down declaration converges to one instance on node01
#          and node02's endpoints stop accepting connections.
#
# Binaries are run via `cargo run` unless a PHOTONCLOUD_*_BIN override points
# at a prebuilt binary. The script re-execs itself inside `nix develop` once.
#
# NOTE(review): this file was recovered from a whitespace-mangled copy; the
# chainfire config heredoc and server invocation were garbled in the original
# and have been reconstructed below — see the TODO near "chainfire.toml".

set -euo pipefail

ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Re-exec inside the nix dev shell exactly once; the sentinel env var stops
# the recursion on the second pass.
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
  exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi

# Each run_*_bin helper prefers a prebuilt binary from the environment and
# falls back to `cargo run` against the workspace manifest.
run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}

run_deployer_ctl_bin() {
  if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
    "$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
  fi
}

run_node_agent_bin() {
  if [[ -n "${PHOTONCLOUD_NODE_AGENT_BIN:-}" ]]; then
    "$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
  fi
}

run_fleet_scheduler_bin() {
  if [[ -n "${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}" ]]; then
    "$PHOTONCLOUD_FLEET_SCHEDULER_BIN" "$@"
  else
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
  fi
}

tmp_dir="$(mktemp -d)"
cf_pid=""

# Kill every process the node-agents recorded under $tmp_dir/pids, then the
# chainfire server, then remove the scratch directory. Runs on any exit path.
cleanup() {
  set +e
  if [[ -d "$tmp_dir/pids" ]]; then
    while IFS= read -r -d '' pid_file; do
      [[ -f "$pid_file" ]] || continue
      kill "$(cat "$pid_file")" 2>/dev/null || true
    done < <(find "$tmp_dir/pids" -type f -name '*.pid' -print0 2>/dev/null)
  fi
  if [[ -n "$cf_pid" ]]; then
    kill "$cf_pid" 2>/dev/null || true
    wait "$cf_pid" 2>/dev/null || true
  fi
  rm -rf "$tmp_dir"
}
trap cleanup EXIT

# Print an OS-assigned free TCP port on 127.0.0.1.
free_port() {
  python3 - <<'PY'
import socket
s = socket.socket()
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
s.close()
PY
}

# Poll until host:port accepts TCP connections or the timeout (default 60s)
# elapses. Returns 0 on success, 1 on timeout.
wait_for_port() {
  local host="$1"
  local port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))
  while (( SECONDS < deadline )); do
    if python3 - "$host" "$port" <<'PY'
import socket
import sys

host = sys.argv[1]
port = int(sys.argv[2])
with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
    then
      return 0
    fi
    sleep 1
  done
  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}

api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"

# NOTE(review): the original heredoc body for chainfire.toml and the server
# launch were lost in the mangled source (it read
# `cat >.../chainfire.toml <.../chainfire.log 2>&1 &`, which fills the config
# from the empty log file and records cat's PID). The field names below are a
# reconstruction — TODO confirm against chainfire-server's config schema.
cat >"$tmp_dir/chainfire.toml" <<EOF
node_id = "cf-node-1"
data_dir = "${tmp_dir}/chainfire-data"
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
bootstrap = true
EOF

run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"

wait_for_port "127.0.0.1" "$api_port" 120

# Initial declaration: both nodes pending, api and worker at 2 replicas each.
# The heredoc delimiter is quoted so ${INSTANCE_PORT}/${INSTANCE_IP} reach the
# deployer literally for per-instance substitution.
# NOTE(review): YAML indentation below was reconstructed from a flattened
# copy; nesting of placement/instance_port follows the most plausible schema
# — TODO confirm against the deployer's cluster-config schema.
cat >"$tmp_dir/cluster.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev

node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general

pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev

nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: pending
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: pending

services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
  - name: worker
    ports:
      http: 18081
    protocol: http
    depends_on:
      - service: api
        condition: healthy
        min_ready: 2
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18081
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

# Scale-down declaration: nodes already active, one replica of each service,
# min_ready relaxed to 1. Applied later with --prune.
cat >"$tmp_dir/cluster-scaled.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev

node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general

pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev

nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: active
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: active

services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
  - name: worker
    ports:
      http: 18081
    protocol: http
    depends_on:
      - service: api
        condition: healthy
        min_ready: 1
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18081
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

endpoint="http://127.0.0.1:${api_port}"

# deployer-ctl pre-bound to our chainfire endpoint and cluster id.
run_deployer_ctl() {
  run_deployer_ctl_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    "$@"
}

# Single reconcile pass of the node-agent for one node; spawned process PIDs
# land under $tmp_dir/pids/<node> so cleanup() can reap them.
run_node_agent_once() {
  local node_id="$1"
  local pid_dir="$tmp_dir/pids/$node_id"
  mkdir -p "$pid_dir"
  run_node_agent_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    --node-id "$node_id" \
    --pid-dir "$pid_dir" \
    --interval-secs 1 \
    --apply \
    --once
}

# Single scheduling pass of the fleet scheduler.
run_scheduler_once() {
  run_fleet_scheduler_bin \
    --chainfire-endpoint "$endpoint" \
    --cluster-id test-cluster \
    --interval-secs 1 \
    --once
}

echo "Applying cluster declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml"

echo "Activating nodes through node-agent"
run_node_agent_once node01
run_node_agent_once node02

echo "Scheduling managed instances"
run_scheduler_once

echo "Validating dependency block before api is healthy"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-blocked.dump"
python3 - "$tmp_dir/worker-blocked.dump" <<'PY'
import sys

path = sys.argv[1]
lines = [line.strip() for line in open(path, "r", encoding="utf-8") if line.strip()]
if lines:
    raise SystemExit(f"expected no worker instances before api is healthy, found {len(lines)}")
print("worker instances correctly blocked before dependency becomes healthy")
PY

run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/service-statuses/worker" >"$tmp_dir/worker-status-blocked.dump"
python3 - "$tmp_dir/worker-status-blocked.dump" <<'PY'
import json
import sys

path = sys.argv[1]
statuses = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        statuses.append(json.loads(line.split(marker, 1)[1]))
if len(statuses) != 1:
    raise SystemExit(f"expected exactly one worker service status, found {len(statuses)}")
status = statuses[0]
if status.get("phase") != "blocked":
    raise SystemExit(f"expected worker phase=blocked, found {status.get('phase')}")
blockers = status.get("blockers") or []
if not blockers or "dependency api has 0/2 healthy instance(s)" not in blockers[0]:
    raise SystemExit(f"unexpected blockers: {blockers}")
print("worker service status reports dependency block")
PY

run_deployer_ctl service inspect --name worker >"$tmp_dir/worker-inspect-blocked.json"
python3 - "$tmp_dir/worker-inspect-blocked.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
if payload.get("spec", {}).get("name") != "worker":
    raise SystemExit("service inspect did not return worker spec")
if status.get("phase") != "blocked":
    raise SystemExit(f"expected worker inspect phase=blocked, found {status.get('phase')}")
print("service inspect reports blocked dependency state")
PY

echo "Reconciling processes and health for api"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

run_deployer_ctl service inspect --name api --include-instances >"$tmp_dir/api-inspect-healthy.json"
python3 - "$tmp_dir/api-inspect-healthy.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
instances = payload.get("instances") or []
if status.get("phase") != "healthy":
    raise SystemExit(f"expected api inspect phase=healthy before scheduler rerun, found {status.get('phase')}")
if len(instances) != 2:
    raise SystemExit(f"expected 2 api instances from service inspect, found {len(instances)}")
print("api service inspect refreshed to healthy from node-agent updates")
PY

echo "Re-running scheduler after api became healthy"
run_scheduler_once

echo "Reconciling processes and health for dependent worker service"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

echo "Validating HTTP endpoints"
python3 - <<'PY'
import urllib.request

for address in (
    "http://127.0.0.2:18080/",
    "http://127.0.0.3:18080/",
    "http://127.0.0.2:18081/",
    "http://127.0.0.3:18081/",
):
    with urllib.request.urlopen(address, timeout=5) as response:
        body = response.read().decode("utf-8")
        if response.status != 200:
            raise SystemExit(f"{address} returned {response.status}")
        if "Directory listing" not in body and "DOCTYPE" not in body:
            raise SystemExit(f"{address} returned unexpected body")
print("HTTP endpoints are healthy")
PY

echo "Inspecting instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances.dump"
python3 - "$tmp_dir/instances.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
if len(instances) != 2:
    raise SystemExit(f"expected 2 scheduled instances, found {len(instances)}")
node_ids = sorted(instance["node_id"] for instance in instances)
states = sorted(instance.get("state") for instance in instances)
if node_ids != ["node01", "node02"]:
    raise SystemExit(f"unexpected node placement: {node_ids}")
if states != ["healthy", "healthy"]:
    raise SystemExit(f"unexpected health states: {states}")
print("Observed two healthy scheduled instances across node01 and node02")
PY

run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-instances.dump"
python3 - "$tmp_dir/worker-instances.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
if len(instances) != 2:
    raise SystemExit(f"expected 2 worker instances, found {len(instances)}")
node_ids = sorted(instance["node_id"] for instance in instances)
states = sorted(instance.get("state") for instance in instances)
if node_ids != ["node01", "node02"]:
    raise SystemExit(f"unexpected worker placement: {node_ids}")
if states != ["healthy", "healthy"]:
    raise SystemExit(f"unexpected worker states: {states}")
print("Observed two healthy dependent worker instances across node01 and node02")
PY

run_deployer_ctl service inspect --name worker --include-instances >"$tmp_dir/worker-inspect-healthy.json"
python3 - "$tmp_dir/worker-inspect-healthy.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
instances = payload.get("instances") or []
if status.get("phase") != "healthy":
    raise SystemExit(f"expected worker inspect phase=healthy, found {status.get('phase')}")
if len(instances) != 2:
    raise SystemExit(f"expected 2 worker instances from service inspect, found {len(instances)}")
print("service inspect reports healthy dependent instances")
PY

echo "Applying scaled declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster-scaled.yaml" --prune

echo "Re-running scheduler after scale-down"
run_scheduler_once

echo "Reconciling api after scale-down"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

echo "Re-running scheduler after scaled api became healthy"
run_scheduler_once

echo "Reconciling dependent worker service after scale-down"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

echo "Inspecting scaled instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances-scaled.dump"
python3 - "$tmp_dir/instances-scaled.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
if len(instances) != 1:
    raise SystemExit(f"expected 1 scheduled instance after scale-down, found {len(instances)}")
instance = instances[0]
if instance["node_id"] != "node01":
    raise SystemExit(f"expected remaining instance on node01, found {instance['node_id']}")
if instance.get("state") != "healthy":
    raise SystemExit(f"expected remaining instance to be healthy, found {instance.get('state')}")
print("Observed one healthy scheduled instance on node01 after scale-down")
PY

run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-instances-scaled.dump"
python3 - "$tmp_dir/worker-instances-scaled.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []
with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
if len(instances) != 1:
    raise SystemExit(f"expected 1 worker instance after scale-down, found {len(instances)}")
instance = instances[0]
if instance["node_id"] != "node01":
    raise SystemExit(f"expected remaining worker instance on node01, found {instance['node_id']}")
if instance.get("state") != "healthy":
    raise SystemExit(f"expected remaining worker instance to be healthy, found {instance.get('state')}")
print("Observed one healthy dependent worker instance on node01 after scale-down")
PY

run_deployer_ctl service inspect --name worker --include-instances >"$tmp_dir/worker-inspect-scaled.json"
python3 - "$tmp_dir/worker-inspect-scaled.json" <<'PY'
import json
import sys

payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
instances = payload.get("instances") or []
if status.get("phase") != "healthy":
    raise SystemExit(f"expected scaled worker inspect phase=healthy, found {status.get('phase')}")
if len(instances) != 1:
    raise SystemExit(f"expected 1 scaled worker instance from service inspect, found {len(instances)}")
print("service inspect reports scaled healthy worker state without waiting for scheduler status refresh")
PY

echo "Validating endpoint convergence after scale-down"
python3 - <<'PY'
import socket
import urllib.request

with urllib.request.urlopen("http://127.0.0.2:18080/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 endpoint returned {response.status}")
with urllib.request.urlopen("http://127.0.0.2:18081/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 worker endpoint returned {response.status}")
for port, label in ((18080, "api"), (18081, "worker")):
    sock = socket.socket()
    sock.settimeout(1.5)
    try:
        sock.connect(("127.0.0.3", port))
    except OSError:
        pass
    else:
        raise SystemExit(f"node02 {label} endpoint still accepts connections after scale-down")
    finally:
        sock.close()
print("Endpoint convergence validated")
PY

echo "Fleet scheduler E2E verification passed"