#!/usr/bin/env bash
#
# End-to-end verification of the fleet scheduler: boots a single-node
# ChainFire server, applies a declarative cluster spec, runs node agents
# and the scheduler, and checks that service instances converge.
set -euo pipefail

# Repository root: two directories above this script's location.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Re-exec inside the nix dev shell exactly once; the guard variable
# prevents infinite re-exec recursion.
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
  exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi

# Run the ChainFire server binary, forwarding all arguments.
#
# When $PHOTONCLOUD_CHAINFIRE_SERVER_BIN is set, that prebuilt binary is
# executed directly; otherwise the crate is built and run via cargo.
run_chainfire_server_bin() {
  if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
    "$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
  else
    # --quiet keeps cargo's build output out of the captured server log,
    # matching the other run_*_bin helpers in this script.
    cargo run --quiet --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
  fi
}

# Invoke deployer-ctl, forwarding all arguments.
#
# Prefers a prebuilt binary named by $PHOTONCLOUD_DEPLOYER_CTL_BIN;
# falls back to building and running the crate with cargo.
run_deployer_ctl_bin() {
  local prebuilt="${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}"
  if [[ -z "$prebuilt" ]]; then
    cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
  else
    "$prebuilt" "$@"
  fi
}

# Invoke the node agent, forwarding all arguments.
#
# Uses the prebuilt binary from $PHOTONCLOUD_NODE_AGENT_BIN when set;
# otherwise builds and runs the crate with cargo.
run_node_agent_bin() {
  case "${PHOTONCLOUD_NODE_AGENT_BIN:-}" in
    '')
      cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
      ;;
    *)
      "$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
      ;;
  esac
}

# Invoke the fleet scheduler, forwarding all arguments.
#
# Uses the prebuilt binary from $PHOTONCLOUD_FLEET_SCHEDULER_BIN when
# set; otherwise builds and runs the crate with cargo.
run_fleet_scheduler_bin() {
  local prebuilt="${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}"
  if [[ -n "$prebuilt" ]]; then
    "$prebuilt" "$@"
    return
  fi
  cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
}

# Scratch directory for configs, logs, and pid files; removed by cleanup().
tmp_dir="$(mktemp -d)"
# PID of the background ChainFire server once it has been started.
cf_pid=""

# Exit-trap handler: stop every tracked process and delete the scratch
# directory. Runs with errexit disabled so a partially started test is
# still torn down fully.
cleanup() {
  set +e

  # Kill any service instances whose pids were recorded as *.pid files
  # under the (possibly nested) pid directory tree.
  if [[ -d "$tmp_dir/pids" ]]; then
    find "$tmp_dir/pids" -type f -name '*.pid' -print0 2>/dev/null |
      while IFS= read -r -d '' pid_file; do
        [[ -f "$pid_file" ]] || continue
        kill "$(cat "$pid_file")" 2>/dev/null || true
      done
  fi

  # Stop the ChainFire server if it was launched.
  if [[ -n "$cf_pid" ]]; then
    kill "$cf_pid" 2>/dev/null || true
    wait "$cf_pid" 2>/dev/null || true
  fi

  rm -rf "$tmp_dir"
}

# Ensure cleanup runs on every exit path, including failures under -e.
trap cleanup EXIT

# Print a TCP port that was free on 127.0.0.1 at bind time (binding to
# port 0 lets the kernel pick one; the socket is closed right away, so a
# small reuse race is possible).
free_port() {
  python3 -c 'import socket
with socket.socket() as s:
    s.bind(("127.0.0.1", 0))
    print(s.getsockname()[1])'
}

# Poll until a TCP connection to host:port succeeds, or give up.
#
# $1 - host, $2 - port, $3 - timeout in seconds (default 60).
# Returns 0 once the port accepts a connection; prints a diagnostic to
# stderr and returns 1 on timeout.
wait_for_port() {
  local host="$1"
  local port="$2"
  local timeout_secs="${3:-60}"
  local deadline=$((SECONDS + timeout_secs))

  while (( SECONDS < deadline )); do
    # A short per-attempt timeout keeps each probe cheap.
    if python3 -c '
import socket
import sys

host = sys.argv[1]
port = int(sys.argv[2])

with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
' "$host" "$port"; then
      return 0
    fi
    sleep 1
  done

  echo "timed out waiting for ${host}:${port}" >&2
  return 1
}

# Allocate non-conflicting ports for each ChainFire listener.
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"

# Single-node, self-bootstrapping ChainFire configuration. Unquoted EOF:
# $tmp_dir and the port variables expand when the file is written.
cat >"$tmp_dir/chainfire.toml" <<EOF
[node]
id = 1
name = "chainfire-1"
role = "control_plane"

[storage]
data_dir = "$tmp_dir/chainfire-data"

[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"

[cluster]
id = 1
initial_members = []
bootstrap = true

[raft]
role = "voter"
EOF

# Per-node pid directories are created under here by run_node_agent_once.
mkdir -p "$tmp_dir/pids"

echo "Starting ChainFire on 127.0.0.1:${api_port}"
# Launch the server in the background; output is captured for post-mortem
# debugging, and cleanup() terminates it via cf_pid.
run_chainfire_server_bin \
  --config "$tmp_dir/chainfire.toml" \
  >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"

# Generous timeout: the first run may need to compile the server first.
wait_for_port "127.0.0.1" "$api_port" 120

# Desired cluster state: two pending nodes in one pool, plus an "api"
# service with two replicas spread across failure domains. The quoted
# 'EOF' delimiter keeps the ${INSTANCE_*} placeholders literal in the
# written file.
# NOTE(review): YAML nesting below was reconstructed from a
# whitespace-mangled copy — verify against the original file.
cat >"$tmp_dir/cluster.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev

node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general

pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev

nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: pending
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: pending

services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
      max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

# Scaled-down variant of the declaration: nodes are already active and
# the "api" service drops from two replicas to one.
# NOTE(review): YAML nesting below was reconstructed from a
# whitespace-mangled copy — verify against the original file.
cat >"$tmp_dir/cluster-scaled.yaml" <<'EOF'
cluster:
  cluster_id: test-cluster
  environment: dev

node_classes:
  - name: worker-linux
    description: Standard worker nodes
    nix_profile: profiles/worker-linux
    roles:
      - worker
    labels:
      tier: general

pools:
  - name: general
    description: Default capacity pool
    node_class: worker-linux
    min_size: 2
    max_size: 10
    labels:
      env: dev

nodes:
  - node_id: node01
    hostname: node01
    ip: 127.0.0.2
    pool: general
    failure_domain: rack-a
    state: active
  - node_id: node02
    hostname: node02
    ip: 127.0.0.3
    pool: general
    failure_domain: rack-b
    state: active

services:
  - name: api
    ports:
      http: 18080
    protocol: http
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
      max_instances_per_node: 1
      instance_port: 18080
    process:
      command: python3
      args:
        - -m
        - http.server
        - ${INSTANCE_PORT}
        - --bind
        - ${INSTANCE_IP}
    health_check:
      type: http
      path: /
      interval_secs: 1
      timeout_secs: 2
EOF

# Base URL of the ChainFire API, shared by all helper invocations below.
endpoint="http://127.0.0.1:${api_port}"

# deployer-ctl against the test cluster; extra arguments pass through.
run_deployer_ctl() {
  local common=(
    --chainfire-endpoint "$endpoint"
    --cluster-id test-cluster
  )
  run_deployer_ctl_bin "${common[@]}" "$@"
}

# Run one apply pass of the node agent for a single node.
#
# $1 - node id; process pids are recorded under $tmp_dir/pids/<node id>
#      so cleanup() can terminate them.
run_node_agent_once() {
  local node_id="$1"
  local pid_dir="$tmp_dir/pids/$node_id"

  mkdir -p "$pid_dir"

  local agent_args=(
    --chainfire-endpoint "$endpoint"
    --cluster-id test-cluster
    --node-id "$node_id"
    --pid-dir "$pid_dir"
    --interval-secs 1
    --apply
    --once
  )
  run_node_agent_bin "${agent_args[@]}"
}

# Run a single pass of the fleet scheduler against the test cluster.
run_scheduler_once() {
  local sched_args=(
    --chainfire-endpoint "$endpoint"
    --cluster-id test-cluster
    --interval-secs 1
    --once
  )
  run_fleet_scheduler_bin "${sched_args[@]}"
}

# Phase 1: push the declarative cluster spec into ChainFire.
echo "Applying cluster declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml"

# Phase 2: one agent pass per node (declared nodes start as "pending").
echo "Activating nodes through node-agent"
run_node_agent_once node01
run_node_agent_once node02

# Phase 3: place the service instances onto nodes.
echo "Scheduling managed instances"
run_scheduler_once

# Phase 4: a few agent passes to start processes and report health;
# the 1s sleeps line up with the service's 1s health-check interval.
echo "Reconciling processes and health"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

# Both instances should be serving python http.server directory listings
# on their per-node loopback addresses.
echo "Validating HTTP endpoints"
python3 - <<'PY'
import urllib.request

for address in ("http://127.0.0.2:18080/", "http://127.0.0.3:18080/"):
    with urllib.request.urlopen(address, timeout=5) as response:
        body = response.read().decode("utf-8")
        if response.status != 200:
            raise SystemExit(f"{address} returned {response.status}")
        if "Directory listing" not in body and "DOCTYPE" not in body:
            raise SystemExit(f"{address} returned unexpected body")

print("HTTP endpoints are healthy")
PY

# Phase 5: assert ChainFire's stored view — exactly two healthy "api"
# instances, one per node (dump lines carry a JSON payload after
# " value=").
echo "Inspecting instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances.dump"
python3 - "$tmp_dir/instances.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []

with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))

if len(instances) != 2:
    raise SystemExit(f"expected 2 scheduled instances, found {len(instances)}")

node_ids = sorted(instance["node_id"] for instance in instances)
states = sorted(instance.get("state") for instance in instances)

if node_ids != ["node01", "node02"]:
    raise SystemExit(f"unexpected node placement: {node_ids}")
if states != ["healthy", "healthy"]:
    raise SystemExit(f"unexpected health states: {states}")

print("Observed two healthy scheduled instances across node01 and node02")
PY

# Phase 6: apply the replicas=1 spec. (--prune presumably removes state
# absent from the new declaration — confirm against deployer-ctl docs.)
echo "Applying scaled declaration"
run_deployer_ctl apply --config "$tmp_dir/cluster-scaled.yaml" --prune

echo "Re-running scheduler after scale-down"
run_scheduler_once

# Let the agents stop the surplus instance and refresh health state.
echo "Reconciling processes and health after scale-down"
for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
done

# Phase 7: after scale-down exactly one healthy instance should remain,
# and it should be the one on node01.
echo "Inspecting scaled instance state in ChainFire"
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/api/" >"$tmp_dir/instances-scaled.dump"
python3 - "$tmp_dir/instances-scaled.dump" <<'PY'
import json
import sys

path = sys.argv[1]
instances = []

with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))

if len(instances) != 1:
    raise SystemExit(f"expected 1 scheduled instance after scale-down, found {len(instances)}")

instance = instances[0]
if instance["node_id"] != "node01":
    raise SystemExit(f"expected remaining instance on node01, found {instance['node_id']}")
if instance.get("state") != "healthy":
    raise SystemExit(f"expected remaining instance to be healthy, found {instance.get('state')}")

print("Observed one healthy scheduled instance on node01 after scale-down")
PY

# Phase 8: node01 must still serve HTTP; node02's port must no longer
# accept connections (its instance was stopped).
echo "Validating endpoint convergence after scale-down"
python3 - <<'PY'
import socket
import urllib.request

with urllib.request.urlopen("http://127.0.0.2:18080/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 endpoint returned {response.status}")

sock = socket.socket()
sock.settimeout(1.5)
try:
    sock.connect(("127.0.0.3", 18080))
except OSError:
    pass
else:
    raise SystemExit("node02 endpoint still accepts connections after scale-down")
finally:
    sock.close()

print("Endpoint convergence validated")
PY

echo "Fleet scheduler E2E verification passed"