From c1d4178a52df7e2e1192d3eecd68c961dceaf024 Mon Sep 17 00:00:00 2001
From: centra
Date: Fri, 10 Apr 2026 19:28:44 +0900
Subject: [PATCH] Establish baseline product surface and proof lanes
---
.github/workflows/nix.yml | 28 +-
.gitignore | 3 +
README.md | 210 +-
TODO.md | 411 ++++
apigateway/README.md | 17 +
.../crates/apigateway-server/src/main.rs | 153 +-
chainfire/Cargo.lock | 11 -
.../baremetal/pxe-server/assets/.gitkeep | 2 +-
chainfire/baremetal/pxe-server/ipxe/boot.ipxe | 4 +-
chainfire/chainfire-client/src/client.rs | 51 +-
chainfire/chainfire-client/src/watch.rs | 7 +-
chainfire/crates/chainfire-api/build.rs | 5 +-
.../chainfire-api/src/cluster_service.rs | 145 +-
.../chainfire-api/src/internal_service.rs | 41 +-
.../crates/chainfire-api/src/kv_service.rs | 40 +-
.../crates/chainfire-api/src/lease_service.rs | 3 +-
chainfire/crates/chainfire-api/src/lib.rs | 14 +-
.../crates/chainfire-api/src/raft_client.rs | 21 +-
.../crates/chainfire-api/src/watch_service.rs | 15 +-
chainfire/crates/chainfire-core/Cargo.toml | 27 +-
.../crates/chainfire-core/src/builder.rs | 238 ---
.../crates/chainfire-core/src/callbacks.rs | 103 -
.../crates/chainfire-core/src/cluster.rs | 313 ---
chainfire/crates/chainfire-core/src/config.rs | 162 --
chainfire/crates/chainfire-core/src/events.rs | 198 --
chainfire/crates/chainfire-core/src/kvs.rs | 290 ---
chainfire/crates/chainfire-core/src/lib.rs | 60 +-
chainfire/crates/chainfire-core/src/traits.rs | 60 -
.../crates/chainfire-gossip/src/broadcast.rs | 6 +-
.../crates/chainfire-gossip/src/identity.rs | 8 +-
.../crates/chainfire-gossip/src/membership.rs | 10 +-
chainfire/crates/chainfire-raft/src/core.rs | 396 ++--
chainfire/crates/chainfire-raft/src/lib.rs | 5 +-
.../crates/chainfire-raft/src/network.rs | 22 +-
.../chainfire-server/benches/kv_bench.rs | 21 +-
chainfire/crates/chainfire-server/src/main.rs | 23 +-
chainfire/crates/chainfire-server/src/node.rs | 18 +-
chainfire/crates/chainfire-server/src/rest.rs | 127 +-
.../crates/chainfire-server/src/server.rs | 21 +-
.../benches/storage_bench.rs | 8 +-
.../crates/chainfire-storage/src/kv_store.rs | 4 +-
.../chainfire-storage/src/lease_store.rs | 22 +-
chainfire/crates/chainfire-storage/src/lib.rs | 2 +-
.../chainfire-storage/src/log_storage.rs | 21 +-
.../crates/chainfire-storage/src/snapshot.rs | 14 +-
.../chainfire-storage/src/state_machine.rs | 8 +-
.../crates/chainfire-storage/src/store.rs | 5 +-
.../crates/chainfire-types/src/command.rs | 8 +-
chainfire/crates/chainfire-types/src/kv.rs | 4 +-
chainfire/crates/chainfire-types/src/node.rs | 4 +-
.../crates/chainfire-watch/src/registry.rs | 15 +-
chainfire/proto/chainfire.proto | 90 +-
chainfire/proto/internal.proto | 25 -
creditservice/README.md | 37 +-
.../crates/creditservice-api/src/billing.rs | 40 +-
.../creditservice-api/src/credit_service.rs | 39 +-
.../creditservice-api/src/flaredb_storage.rs | 16 +-
.../crates/creditservice-api/src/lib.rs | 16 +-
.../creditservice-api/src/nightlight.rs | 10 +-
.../creditservice-api/src/sql_storage.rs | 48 +-
.../crates/creditservice-api/src/storage.rs | 10 +-
.../crates/creditservice-server/src/config.rs | 2 +-
.../crates/creditservice-server/src/main.rs | 15 +-
.../crates/creditservice-types/src/lib.rs | 16 +-
.../creditservice-types/src/reservation.rs | 4 +-
.../crates/creditservice-types/src/wallet.rs | 4 +-
creditservice/creditservice-client/src/lib.rs | 5 +-
deployer/crates/deployer-ctl/src/chainfire.rs | 2 +-
deployer/crates/deployer-ctl/src/main.rs | 4 +-
deployer/crates/fleet-scheduler/src/main.rs | 4 +-
deployer/crates/nix-agent/src/main.rs | 2 +-
deployer/crates/node-agent/src/agent.rs | 2 +-
deployer/crates/node-agent/src/main.rs | 2 +-
deployer/crates/node-agent/src/process.rs | 192 +-
.../crates/ultracloud-reconciler/src/hosts.rs | 2 +-
docs/README.md | 48 +-
docs/component-matrix.md | 116 +-
docs/control-plane-ops.md | 77 +
docs/edge-trial-surface.md | 83 +
docs/hardware-bringup.md | 135 ++
docs/provider-vm-reality.md | 37 +
docs/rollout-bundle.md | 103 +
docs/testing.md | 247 ++-
fiberlb/Cargo.lock | 111 +
fiberlb/crates/fiberlb-server/Cargo.toml | 1 +
.../fiberlb-server/proto/api/attribute.proto | 4 +-
.../crates/fiberlb-server/src/dataplane.rs | 335 ++-
.../crates/fiberlb-server/src/healthcheck.rs | 5 +-
.../crates/fiberlb-server/src/l7_dataplane.rs | 283 ++-
.../src/services/certificate.rs | 40 +-
fiberlb/crates/fiberlb-server/src/tls.rs | 15 +-
.../crates/fiberlb-types/src/certificate.rs | 4 +-
fiberlb/crates/fiberlb-types/src/listener.rs | 2 +-
flake.nix | 993 ++++++++-
flaredb/crates/flaredb-client/src/client.rs | 13 +-
.../crates/flaredb-proto/src/chainfire.proto | 36 +-
flaredb/crates/flaredb-raft/src/network.rs | 11 +-
.../flaredb-raft/src/persistent_storage.rs | 52 +-
flaredb/crates/flaredb-raft/src/raft_node.rs | 11 +-
flaredb/crates/flaredb-raft/src/storage.rs | 52 +-
flaredb/crates/flaredb-raft/src/types.rs | 4 +-
.../flaredb-server/benches/storage_bench.rs | 6 +-
.../crates/flaredb-server/src/config/mod.rs | 5 +-
.../crates/flaredb-server/src/heartbeat.rs | 3 +-
.../crates/flaredb-server/src/raft_service.rs | 5 +-
flaredb/crates/flaredb-server/src/rest.rs | 169 +-
flaredb/crates/flaredb-sql/src/error.rs | 5 +-
flaredb/crates/flaredb-sql/src/metadata.rs | 36 +-
flaredb/crates/flaredb-sql/src/types.rs | 5 +-
.../flaredb-storage/src/rocks_engine.rs | 52 +-
iam/crates/iam-api/src/credential_service.rs | 71 +
iam/crates/iam-api/src/lib.rs | 6 +-
iam/crates/iam-authn/src/jwt.rs | 4 +-
iam/crates/iam-authn/src/mtls.rs | 41 +
iam/crates/iam-authz/src/evaluator.rs | 5 +-
iam/crates/iam-server/src/main.rs | 2 +-
iam/crates/iam-store/src/backend.rs | 101 +-
iam/crates/iam-store/src/org_store.rs | 36 +-
iam/crates/iam-store/src/project_store.rs | 40 +-
iam/crates/iam-types/src/tenant.rs | 6 +-
k8shost/Cargo.toml | 7 +
k8shost/README.md | 20 +
k8shost/crates/k8shost-cni/src/main.rs | 42 +-
.../crates/k8shost-controllers/src/main.rs | 73 +-
k8shost/crates/k8shost-csi/src/main.rs | 38 +-
.../crates/k8shost-server/src/services/pod.rs | 138 +-
k8shost/crates/k8shost-server/src/storage.rs | 4 +-
.../src/backends/erasure_coded.rs | 74 +-
.../src/backends/replicated.rs | 52 +-
.../src/chunk/mod.rs | 21 +-
.../src/erasure/mod.rs | 6 +-
.../src/node/client.rs | 28 +-
.../src/node/mock.rs | 4 +-
.../src/node/registry.rs | 12 +-
.../lightningstor-distributed/src/repair.rs | 1 -
.../crates/lightningstor-node/src/main.rs | 12 +-
.../crates/lightningstor-node/src/service.rs | 44 +-
.../crates/lightningstor-node/src/storage.rs | 17 +-
.../src/bucket_service.rs | 283 ++-
.../crates/lightningstor-server/src/lib.rs | 2 +-
.../src/object_service.rs | 393 +++-
.../lightningstor-server/src/s3/auth.rs | 2 +-
.../lightningstor-server/src/s3/router.rs | 54 +
.../crates/lightningstor-server/src/tenant.rs | 8 +-
.../lightningstor-storage/src/backend.rs | 49 +-
.../lightningstor-storage/src/local_fs.rs | 145 +-
.../crates/lightningstor-types/src/bucket.rs | 14 +-
.../crates/lightningstor-types/src/object.rs | 1 -
nightlight/README.md | 19 +
nix-nos/flake.nix | 4 +-
nix-nos/modules/default.nix | 9 +-
nix/ci/flake.nix | 7 +
nix/images/netboot-all-in-one.nix | 249 +--
nix/iso/ultracloud-iso.nix | 244 ++-
nix/modules/creditservice.nix | 8 +-
nix/modules/default.nix | 1 +
nix/modules/deployer.nix | 10 +-
nix/modules/fleet-scheduler.nix | 12 +-
nix/modules/k8shost.nix | 62 +-
nix/modules/lightningstor.nix | 23 +-
nix/modules/nix-agent.nix | 19 +-
nix/modules/node-agent.nix | 16 +-
nix/modules/plasmavmc.nix | 41 +-
nix/modules/ultracloud-cluster.nix | 10 +-
nix/modules/ultracloud-resources.nix | 2 +-
nix/nodes/baremetal-qemu/common.nix | 87 +
.../control-plane/configuration.nix | 46 +
.../baremetal-qemu/control-plane/disko.nix | 5 +
.../baremetal-qemu/worker/configuration.nix | 36 +
nix/nodes/baremetal-qemu/worker/disko.nix | 5 +
nix/nodes/vm-cluster/common-disko.nix | 17 +-
nix/single-node/base.nix | 421 ++++
nix/single-node/qemu-vm.nix | 24 +
nix/single-node/surface.nix | 240 +++
nix/test-cluster/README.md | 84 +-
nix/test-cluster/common.nix | 2 +-
nix/test-cluster/flake.nix | 43 +-
nix/test-cluster/hardware-smoke.sh | 615 ++++++
nix/test-cluster/node01.nix | 38 +
nix/test-cluster/node06.nix | 2 +-
nix/test-cluster/run-baremetal-iso-e2e.sh | 199 ++
nix/test-cluster/run-cluster.sh | 1881 ++++++++++++++++-
.../run-core-control-plane-ops-proof.sh | 124 ++
nix/test-cluster/run-local-baseline.sh | 198 ++
nix/test-cluster/run-publishable-kvm-suite.sh | 231 ++
.../run-supported-surface-final-proof.sh | 196 ++
nix/test-cluster/verify-baremetal-iso.sh | 1098 ++++++++++
nix/test-cluster/vm-guest-image.nix | 4 +-
nix/test-cluster/work-root-budget.sh | 238 +++
.../verify-fleet-scheduler-e2e-stable.sh | 284 +++
.../main-reaggregation-2026-04-06.md | 43 +
plasmavmc/Cargo.lock | 1 -
plasmavmc/Cargo.toml | 8 +-
.../crates/plasmavmc-firecracker/src/lib.rs | 25 +-
plasmavmc/crates/plasmavmc-kvm/src/lib.rs | 8 +-
plasmavmc/crates/plasmavmc-server/Cargo.toml | 1 -
plasmavmc/crates/plasmavmc-server/src/main.rs | 34 +-
plasmavmc/crates/plasmavmc-server/src/rest.rs | 72 +-
.../crates/plasmavmc-server/src/vm_service.rs | 49 +-
.../crates/prismnet-server/src/ovn/client.rs | 126 +-
.../src/services/security_group.rs | 16 +-
201 files changed, 12545 insertions(+), 3643 deletions(-)
create mode 100644 TODO.md
create mode 100644 apigateway/README.md
delete mode 100644 chainfire/crates/chainfire-core/src/builder.rs
delete mode 100644 chainfire/crates/chainfire-core/src/callbacks.rs
delete mode 100644 chainfire/crates/chainfire-core/src/cluster.rs
delete mode 100644 chainfire/crates/chainfire-core/src/config.rs
delete mode 100644 chainfire/crates/chainfire-core/src/events.rs
delete mode 100644 chainfire/crates/chainfire-core/src/kvs.rs
delete mode 100644 chainfire/crates/chainfire-core/src/traits.rs
create mode 100644 docs/control-plane-ops.md
create mode 100644 docs/edge-trial-surface.md
create mode 100644 docs/hardware-bringup.md
create mode 100644 docs/provider-vm-reality.md
create mode 100644 docs/rollout-bundle.md
create mode 100644 k8shost/README.md
create mode 100644 nightlight/README.md
create mode 100644 nix/nodes/baremetal-qemu/common.nix
create mode 100644 nix/nodes/baremetal-qemu/control-plane/configuration.nix
create mode 100644 nix/nodes/baremetal-qemu/control-plane/disko.nix
create mode 100644 nix/nodes/baremetal-qemu/worker/configuration.nix
create mode 100644 nix/nodes/baremetal-qemu/worker/disko.nix
create mode 100644 nix/single-node/base.nix
create mode 100644 nix/single-node/qemu-vm.nix
create mode 100644 nix/single-node/surface.nix
create mode 100755 nix/test-cluster/hardware-smoke.sh
create mode 100755 nix/test-cluster/run-baremetal-iso-e2e.sh
create mode 100755 nix/test-cluster/run-core-control-plane-ops-proof.sh
create mode 100755 nix/test-cluster/run-local-baseline.sh
create mode 100755 nix/test-cluster/run-publishable-kvm-suite.sh
create mode 100755 nix/test-cluster/run-supported-surface-final-proof.sh
create mode 100644 nix/test-cluster/verify-baremetal-iso.sh
create mode 100755 nix/test-cluster/work-root-budget.sh
create mode 100644 nix/tests/verify-fleet-scheduler-e2e-stable.sh
create mode 100644 plans/baselines/main-reaggregation-2026-04-06.md
diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml
index 3b96a4d..7ef2f93 100644
--- a/.github/workflows/nix.yml
+++ b/.github/workflows/nix.yml
@@ -96,6 +96,23 @@ jobs:
run: |
nix run ./nix/ci#gate-ci -- --shared-crate ${{ matrix.crate }} --tier 0 --no-logs
+ portable-regressions:
+ needs: filter
+ if: ${{ needs.filter.outputs.any_changed == 'true' || needs.filter.outputs.global_changed == 'true' || needs.filter.outputs.shared_crates_changed == 'true' }}
+ runs-on: ubuntu-latest
+ name: portable regressions
+ steps:
+ - uses: actions/checkout@v4
+ - uses: DeterminateSystems/nix-installer-action@v11
+ - uses: DeterminateSystems/magic-nix-cache-action@v8
+
+ - name: Run portable canonical profile regressions
+ run: |
+ nix build \
+ .#checks.x86_64-linux.canonical-profile-eval-guards \
+ .#checks.x86_64-linux.portable-control-plane-regressions \
+ --accept-flake-config
+
# Build server packages (tier 1+)
build:
needs: [filter, gate]
@@ -116,7 +133,7 @@ jobs:
# Summary job for PR status checks
ci-status:
- needs: [filter, gate, shared-crates-gate]
+ needs: [filter, gate, shared-crates-gate, portable-regressions]
if: always()
runs-on: ubuntu-latest
steps:
@@ -128,11 +145,18 @@ jobs:
if [[ "${{ needs.shared-crates-gate.result }}" == "failure" ]]; then
exit 1
fi
- if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" ]]; then
+ if [[ "${{ needs.portable-regressions.result }}" == "failure" ]]; then
+ exit 1
+ fi
+ if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" || "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then
if [[ "${{ needs.gate.result }}" == "skipped" ]]; then
echo "Gate was skipped despite changes. This is unexpected."
exit 1
fi
+ if [[ "${{ needs.portable-regressions.result }}" == "skipped" ]]; then
+ echo "Portable regressions were skipped despite changes. This is unexpected."
+ exit 1
+ fi
fi
if [[ "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then
if [[ "${{ needs.shared-crates-gate.result }}" == "skipped" ]]; then
diff --git a/.gitignore b/.gitignore
index fa11ae8..38cb842 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
.code/
.codex/
.claude.json
+.agent-r/
+agent-r.config.toml
.ralphrc
.sisyphus/
@@ -39,6 +41,7 @@ Thumbs.db
# Logs
*.log
+nohup.out
quanta/test_output_renamed.log
plasmavmc/kvm_test_output.log
diff --git a/README.md b/README.md
index 1a2f63a..646aeb6 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,8 @@
UltraCloud is a Nix-first cloud platform workspace that assembles a small control plane, network services, VM hosting, shared storage, object storage, and gateway services into one reproducible repository.
-The canonical local proof path is the six-node VM cluster under [`nix/test-cluster`](/home/centra/cloud/nix/test-cluster/README.md). It builds all guest images on the host, boots them as hardware-like QEMU nodes, and validates real multi-node behavior.
+The fastest public entrypoint is the one-command single-node quickstart. The `3-node HA control plane` profile lives in `nixosConfigurations.node01`, `nixosConfigurations.node02`, and `nixosConfigurations.node03`; the six-node VM cluster under [`nix/test-cluster`](nix/test-cluster/README.md) is the publishable harness that extends that HA baseline with worker and optional service bundles on host-built QEMU guests.
+The canonical bare-metal bootstrap proof is the ISO-on-QEMU path under [`nix/test-cluster`](nix/test-cluster/README.md), which drives phone-home, Disko install, reboot, and desired-system convergence for one control-plane node and one worker-equivalent node.
## Components
@@ -15,38 +16,217 @@ The canonical local proof path is the six-node VM cluster under [`nix/test-clust
- `plasmavmc`: VM control plane and worker agents
- `coronafs`: shared filesystem for mutable VM volumes
- `lightningstor`: object storage and VM image backing
-- `k8shost`: Kubernetes-style hosting control plane
+- `k8shost`: Kubernetes-style hosting control plane for tenant pods and services
- `apigateway`: external API and proxy surface
- `nightlight`: metrics ingestion and query service
-- `creditservice`: minimal reference quota/credit service
-- `deployer`: bootstrap and phone-home deployment service
+- `creditservice`: quota, reservation, and admission-control service
+- `deployer`: bootstrap and phone-home deployment service that owns install plans and desired-system intent
- `fleet-scheduler`: non-Kubernetes service scheduler for bare-metal cluster services
+## Core API Notes
+
+- `chainfire` ships a fixed-membership cluster API on the supported surface. Public cluster management is `MemberList` plus `Status`, and the internal Raft transport surface is `Vote` plus `AppendEntries`. `chainfire-core` is workspace-internal only; the old embeddable builder and distributed-KV scaffold are not part of the supported product contract.
+- `flaredb` ships SQL on both gRPC and REST. The supported REST SQL surface is `POST /api/v1/sql` for statement execution and `GET /api/v1/tables` for table discovery, alongside the existing KV and scan endpoints; an illustrative request sketch follows this list.
+- `plasmavmc` ships a KVM-only public VM backend contract. The supported create and recovery surface is the KVM path exercised in `single-node-quickstart`, `fresh-smoke`, and `fresh-matrix`; Firecracker and mvisor remain archived non-product backends outside the supported surface until they have real tenant-network coverage.
+- `lightningstor` keeps its optional gRPC surface live: bucket versioning, bucket policy, bucket tagging, and explicit object version listing are part of the supported contract for the canonical optional bundle.
+- `fiberlb` backend `Https` health checks currently do not verify backend TLS certificates. Supported scope is limited to TCP reachability plus HTTP status for the backend endpoint until CA-aware verification is wired through config, server code, and the canonical harness.
+- `k8shost` keeps `WatchPods` on the supported surface as a bounded snapshot stream for the current matching pod set. The published contract is the tenant workload API, not a separate long-lived controller event bus.
+- `k8shost` is fixed as an API/control-plane product surface; runtime dataplane helpers stay archived non-product until they have their own published contract and proof.
+- `k8shost-cni`, `k8shost-controllers`, `lightningstor-csi`, `nixosConfigurations.netboot-worker`, and the older scripts under `baremetal/vm-cluster` are archived internal scaffolds or `legacy/manual` debugging paths outside the supported surface.
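+
+As an illustration of the FlareDB REST SQL surface noted above, the sketch below assumes a local REST listener on `127.0.0.1:8082` and a JSON body carrying a single `sql` field; the port and the body shape are placeholders, not part of the documented contract — `flaredb/crates/flaredb-server/src/rest.rs` owns the authoritative request schema.
+
+```bash
+# Hypothetical endpoint address and payload shape; only the paths /api/v1/sql and /api/v1/tables are fixed above.
+curl -sS -X POST "http://127.0.0.1:8082/api/v1/sql" \
+  -H 'Content-Type: application/json' \
+  -d '{"sql": "SELECT 1"}'
+
+# Table discovery on the same supported surface.
+curl -sS "http://127.0.0.1:8082/api/v1/tables"
+```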
+
+## Core Control Plane Operations
+
+The control-plane operator contract is fixed in [docs/control-plane-ops.md](docs/control-plane-ops.md).
+
+- ChainFire dynamic membership, replace-node, and scale-out are outside the supported surface; the supported operator path is fixed-membership restore or whole-cluster replacement backed by the `durability-proof` backup/restore baseline.
+- FlareDB online migration and schema evolution must start from the durability-proof backup/restore baseline and stay additive-first until a later destructive cleanup window. FlareDB destructive DDL and fully automated online migration remain outside the supported product contract for this release.
+- IAM bootstrap hardening requires an explicit admin token, an explicit signing key, and a 32-byte `IAM_CRED_MASTER_KEY`. Signing-key rotation, credential overlap-and-revoke rotation, and mTLS overlap-and-cutover rotation are part of the supported operator contract; multi-node IAM failover remains outside the supported product contract. The standalone proof is `./nix/test-cluster/run-core-control-plane-ops-proof.sh`. A secret-generation sketch follows this list.
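+
+A minimal sketch of generating those bootstrap secrets locally, assuming the admin token and signing key are opaque random strings and that `IAM_CRED_MASTER_KEY` carries 32 random bytes; the encodings and the config or environment keys that deliver the admin token and signing key are not fixed here — follow [docs/control-plane-ops.md](docs/control-plane-ops.md) for the authoritative wiring.
+
+```bash
+# Illustrative only: generate three independent secrets for IAM bootstrap hardening.
+admin_token="$(head -c 32 /dev/urandom | base64 | tr -d '\n')"            # explicit admin token (placeholder encoding)
+signing_key="$(head -c 32 /dev/urandom | base64 | tr -d '\n')"            # explicit signing key (placeholder encoding)
+IAM_CRED_MASTER_KEY="$(head -c 32 /dev/urandom | base64 | tr -d '\n')"    # 32 random bytes, base64-encoded here for transport
+export IAM_CRED_MASTER_KEY
+```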
+
+## Edge And Trial Surface
+
+The edge-bundle and trial-surface contract is fixed in [docs/edge-trial-surface.md](docs/edge-trial-surface.md).
+
+- APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer; live in-process reload is not part of the product contract.
+- NightLight is supported as a single-node WAL/snapshot service; replicated HA metrics storage is not part of the product contract.
+- CreditService export and backend migration are supported as offline export/import or backend-native snapshot workflows, not live mixed-writer migration.
+- OCI/Docker artifact is intentionally not the public trial surface.
+- Use `./nix/test-cluster/work-root-budget.sh status` for disk budget, GC, and cleanup guidance, `./nix/test-cluster/work-root-budget.sh enforce` for a stronger local budget gate, and `./nix/test-cluster/work-root-budget.sh prune-proof-logs 2` for safer dated-proof cleanup.
+
## Quick Start
+Single-node quickstart:
+
+```bash
+nix run .#single-node-quickstart
+```
+
+This app is also the automated smoke check for the smallest realistic trial surface. It builds the minimal VM stack, boots a QEMU VM, waits for `chainfire`, `flaredb`, `iam`, `prismnet`, and `plasmavmc`, checks their health endpoints, and verifies the in-guest VM runtime prerequisites. For an interactive session, keep the VM running:
+
+```bash
+ULTRACLOUD_QUICKSTART_KEEP_VM=1 nix run .#single-node-quickstart
+```
+
+Buildable trial artifact:
+
+```bash
+nix build .#single-node-trial-vm
+nix run .#single-node-trial
+```
+
+`single-node-trial-vm` is the lightest supported artifact for local use: a host-built NixOS VM appliance for the VM-platform core. OCI/Docker artifact is intentionally not the public trial surface here, because the supported scope needs a guest kernel plus host KVM, `/dev/net/tun`, and OVS/libvirt semantics. A privileged container would be host-coupled and would not prove the same contract.
+
+The legacy name `.#all-in-one-quickstart` is kept as an alias, and `.#single-node-trial` is a friendlier alias for the same smoke launcher.
+
+Portable local proof on hosts without `/dev/kvm`:
+
+```bash
+nix build .#checks.x86_64-linux.canonical-profile-eval-guards
+nix build .#checks.x86_64-linux.portable-control-plane-regressions
+```
+
+This TCG-safe lane keeps canonical profile drift, the core `chainfire` / `deployer` control-plane path, the `deployer -> nix-agent` boundary, and the `fleet-scheduler -> node-agent` boundary under regression coverage without requiring nested virtualization.
+
+Publishable nested-KVM suite:
+
```bash
nix develop
nix run ./nix/test-cluster#cluster -- fresh-smoke
+nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp
+nix run ./nix/test-cluster#cluster -- fresh-matrix
+./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
```
+The checked-in entrypoint for the publishable nested-KVM suite is the local wrapper `./nix/test-cluster/run-publishable-kvm-suite.sh`. Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally not part of this re-aggregated baseline.
+For the full supported-surface proof on a local AMD/KVM host, use `./nix/test-cluster/run-supported-surface-final-proof.sh ./work/final-proofs/latest`; it keeps builders local, builds `single-node-trial-vm`, runs `single-node-quickstart`, and captures the publishable KVM suite logs in one place.
+`nix run ./nix/test-cluster#cluster -- durability-proof` is the canonical `chainfire` / `flaredb` / `deployer` backup/restore lane. It persists artifacts under `./work/durability-proof/latest`, proves logical backup/restore for ChainFire keys and FlareDB SQL rows, uses the canonical Deployer admin pre-register request itself as the backup artifact, verifies that the pre-registered node survives a `deployer.service` restart, replays the same request idempotently, and injects CoronaFS plus LightningStor failures against the same live KVM cluster.
+`nix run ./nix/test-cluster#cluster -- rollout-soak` is the longer-running control-plane and rollout companion lane. It rebuilds from clean local KVM runtime state, persists artifacts under `./work/rollout-soak/latest`, validates exactly one planned `draining` maintenance cycle and one fail-stop worker-loss cycle on the two native-runtime workers, holds each degraded state for the configured soak window, then restarts `deployer`, `fleet-scheduler`, `node-agent`, `chainfire`, and `flaredb` before revalidating the cluster. The soak root also carries explicit scope markers so the supported boundary is encoded in the proof artifacts rather than only in docs. The steady-state KVM nodes do not run `nix-agent.service`, so the soak lane records explicit `nix-agent` scope markers instead of pretending a live-cluster `nix-agent` restart happened.
+`nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof` is the focused local-KVM reality lane for the provider and VM-hosting bundles. It stores artifacts under `./work/provider-vm-reality-proof/latest`, captures authoritative FlashDNS answers, FiberLB backend drain and restore evidence, and PlasmaVMC KVM shared-storage migration plus post-migration restart state.
+The 2026-04-10 local AMD/KVM proof logs are in `./work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final` for `supported-surface-guard`, `single-node-trial-vm`, and `single-node-quickstart`, and in `./work/publishable-kvm-suite` for the final passing `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix` run through `./nix/test-cluster/run-publishable-kvm-suite.sh`.
+The exact bare-metal check-runner proof from `2026-04-10` is in `./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; its outer `environment.txt` records `execution_model=materialized-check-runner`, and `state/environment.txt` records `vm_accelerator_mode=kvm`.
+The 2026-04-10 durability and failure-injection proof logs are in `./work/durability-proof/20260410T120618+0900`; `result.json` records `success=true`, `deployer_restore_mode="admin pre-register request replay with pre/post-restart list verification"`, and the artifact set includes `chainfire-backup-response.json`, `flaredb-restored.json`, `deployer-post-restart-list.json`, `coronafs-node04-local-state.json`, and `lightningstor-head-during-node05-outage.json`.
+The 2026-04-10 longer-running rollout and control-plane soak is in `./work/rollout-soak/20260410T164549+0900`; `result.json` records `success=true`, `fleet_supported_native_runtime_nodes=2`, `validated_maintenance_cycles=1`, `validated_power_loss_cycles=1`, and `soak_hold_secs=30`, while the artifact set includes `maintenance-held.json`, `power-loss-held.json`, `deployer-post-restart-nodes.json`, `chainfire-post-restart-put.json`, `flaredb-post-restart.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, `fleet-scheduler-scope-fixed.txt`, and the `node01-nix-agent-scope.txt` / `node04-nix-agent-scope.txt` boundary markers.
+The 2026-04-10 provider and VM-hosting reality proof logs are in `./work/provider-vm-reality-proof/20260410T135827+0900`; `result.json` records `success=true`, and the artifact set includes `network-provider/fiberlb-drain-summary.txt`, `network-provider/flashdns-service-authoritative-answer.txt`, `vm-hosting/migration-summary.json`, and `vm-hosting/root-volume-after-post-migration-restart.json`.
+Physical-node bring-up now has a canonical preflight wrapper as well: `nix run ./nix/test-cluster#hardware-smoke -- preflight`. It writes `kernel-params.txt`, expected markers, failure markers, and a machine-readable blocked or ready state under `./work/hardware-smoke/latest`, and the same entrypoint can later be rerun as `run` or `capture` when USB or BMC/Redfish transport is actually present.
+
+Within that suite, `fresh-matrix` is the public provider-bundle proof: it exercises PrismNet VPC/subnet/port flows plus security-group ACL add/remove, FlashDNS record publication, and FiberLB TCP plus TLS-terminated `Https` / `TerminatedHttps` listeners in one tenant-scoped composition run. The published FiberLB L4 algorithms are kept honest with targeted server unit tests in-tree. `provider-vm-reality-proof` is the artifact-producing companion lane for the same bundle and for the VM-hosting path.
+PrismNet real OVS/OVN dataplane validation remains outside the supported local KVM surface. FiberLB native BGP or BFD peer interop plus hardware VIP ownership also remain outside the supported local KVM surface. PlasmaVMC real-hardware migration or storage handoff remains a later hardware proof; the current local-KVM proof fixes the release surface to KVM shared-storage migration on the worker pair.
+
+Project-done release proof now requires both halves of the public validation surface to be green:
+
+- `baremetal-iso` and `baremetal-iso-e2e` for the canonical `deployer -> installer -> nix-agent` bare-metal bootstrap path
+- the KVM publishable suite (`fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`) for the nested-KVM multi-node VM-hosting path
+
+Canonical bare-metal bootstrap proof:
+
+```bash
+nix run ./nix/test-cluster#cluster -- baremetal-iso
+nix build .#checks.x86_64-linux.baremetal-iso-e2e
+./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/latest
+```
+
+`baremetal-iso-e2e` now materializes the exact local-KVM proof runner instead of trying to boot QEMU inside a sandboxed `nixbld` build. That older build-time execution model degraded to `TCG`; the built runner keeps the canonical attr name but executes the same `verify-baremetal-iso.sh` harness as the direct QEMU proof, with host KVM and persistent logs under `./work`.
+
+The QEMU ISO proof is a stand-in for the real install route, not a separate workflow. Build `nixosConfigurations.ultracloud-iso`, boot it under KVM locally or write the same ISO to USB or BMC virtual media on hardware, and pass the same bootstrap inputs that the installer consumes in the harness: `ultracloud.deployer_url=`, `ultracloud.bootstrap_token=` for authenticated bootstrap or a lab-only `deployer` configured with `allow_unauthenticated=true`, optional `ultracloud.ca_cert_url=`, optional `ultracloud.binary_cache_url=`, and optional `ultracloud.node_id=` / `ultracloud.hostname=` overrides when DMI serials or DHCP names are not the desired identity.
+
+The networking contract is the same in QEMU and on hardware: the live ISO needs DHCP or equivalent L3 reachability to `deployer` before Disko starts, and it needs reachability to the optional binary cache if you want it to pull prebuilt closures instead of compiling locally. The local QEMU proof relies on the `10.0.2.2` fallback addresses from user-mode NAT; real hardware should set `ultracloud.deployer_url` and, when used, `ultracloud.binary_cache_url` to routable control-plane endpoints. USB media and BMC virtual media are only transport differences for the same ISO and kernel parameters. For the local proof keep `./work` or `ULTRACLOUD_WORK_ROOT` on a large disk; the checked-in wrappers force local builders and derive Nix parallelism from the host CPU count unless you override it explicitly.
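+
+An illustrative rendering of those kernel parameters, with placeholder endpoints and identity overrides; only the `ultracloud.*` parameter names are fixed by the contract above, and the authoritative per-run values are what `nix run ./nix/test-cluster#hardware-smoke -- preflight` writes to `kernel-params.txt`. A lab-only `deployer` with `allow_unauthenticated=true` would simply drop `ultracloud.bootstrap_token=`.
+
+```bash
+# Hypothetical helper variable and placeholder values; the ultracloud.* keys are the supported parameter names.
+ULTRACLOUD_KERNEL_PARAMS='ultracloud.deployer_url=http://deployer.example.internal ultracloud.bootstrap_token=REPLACE_WITH_ISSUED_TOKEN ultracloud.binary_cache_url=http://cache.example.internal ultracloud.ca_cert_url=http://deployer.example.internal/ca.pem ultracloud.node_id=worker-01 ultracloud.hostname=worker-01'
+echo "${ULTRACLOUD_KERNEL_PARAMS}"   # append this string to the ISO boot entry on USB, BMC virtual media, or the QEMU harness
+```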
+
+Canonical hardware preflight and handoff for the same path:
+
+```bash
+nix run ./nix/test-cluster#hardware-smoke -- preflight
+nix run ./nix/test-cluster#hardware-smoke -- run
+nix run ./nix/test-cluster#hardware-smoke -- capture
+```
+
+That wrapper keeps the QEMU proof and the physical-node proof on one contract by writing the exact kernel parameters, expected `ULTRACLOUD_MARKER` sequence, failure markers, and artifact root under `./work/hardware-smoke/latest`.
+
+Canonical hardware handoff for that path:
+
+1. Build `nixosConfigurations.ultracloud-iso` plus the target role configs (`baremetal-qemu-control-plane`, `baremetal-qemu-worker`, or their hardware-specific successors) and expose `deployer` plus an optional HTTP Nix cache on addresses the installer can reach.
+2. Publish cluster state so that the reusable node class owns the install contract: `install_plan.nixos_configuration`, `install_plan.disko_config_path`, and preferably `install_plan.target_disk_by_id`. Node entries should only bind identity, pool, and any desired-system override that truly differs per host. When you expose a binary cache, prefer setting `desired_system.target_system` to the prebuilt class-owned closure as well so post-install convergence does not rebuild a dirty local variant on each node. A field-level sketch of this split follows the list.
+3. Boot the same ISO through USB or BMC virtual media and pass `ultracloud.deployer_url=...`, `ultracloud.bootstrap_token=...`, and, when used, `ultracloud.binary_cache_url=...` on the kernel command line.
+4. Watch the canonical marker sequence from the installer journal: `pre-install.boot`, `pre-install.phone-home.complete`, `install.bundle-downloaded`, `install.disko.complete`, `install.nixos-install.complete`, `reboot`, `post-install.boot`.
+5. Treat `nix-agent` reporting the desired system as `active` as the final convergence gate. The QEMU harness proves the same sequence, only with virtio disks and host-local endpoints standing in for the real chassis.
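+
+A field-level sketch of the class-versus-node split described in step 2, assuming JSON generated from `ultracloud.cluster`; the wrapper keys and example values are illustrative, and only the `install_plan.*` and `desired_system.target_system` field names are fixed above — `nix/lib/cluster-schema.nix` owns the authoritative schema.
+
+```bash
+# Hypothetical shape of the generated cluster-state JSON (wrapper keys and values are illustrative).
+cat <<'EOF'
+{
+  "classes": {
+    "worker": {
+      "install_plan": {
+        "nixos_configuration": "baremetal-qemu-worker",
+        "disko_config_path": "nix/nodes/baremetal-qemu/worker/disko.nix",
+        "target_disk_by_id": "/dev/disk/by-id/virtio-uc-worker-root"
+      },
+      "desired_system": { "target_system": "/nix/store/<hash>-worker-system" }
+    }
+  },
+  "nodes": {
+    "worker-01": { "class": "worker", "pool": "default" }
+  }
+}
+EOF
+```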
+
+The checked-in QEMU proof now mirrors the disk-selection contract that hardware should use. Its node classes install by stable `/dev/disk/by-id/virtio-uc-control-root` and `/dev/disk/by-id/virtio-uc-worker-root` selectors, backed by explicit QEMU disk serials, while the ISO resolves the prebuilt Disko script and target system from the install profile name embedded into the ISO. Hardware should keep the same class/profile structure and swap only the disk selector, routable URLs, and physical media transport.
+
+## Canonical Profiles
+
+UltraCloud now fixes the public support surface to three canonical profiles:
+
+| Profile | Canonical entrypoints | Required components | Optional components |
+| --- | --- | --- | --- |
+| `single-node dev` | `nix run .#single-node-quickstart`, `nix run .#single-node-trial`, `nix build .#single-node-trial-vm`, `nixosConfigurations.single-node-quickstart`, companion install image `nixosConfigurations.netboot-all-in-one` | `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` | `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost` |
+| `3-node HA control plane` | `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion install image `nixosConfigurations.netboot-control-plane` | `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node | `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` |
+| `bare-metal bootstrap` | `nix run ./nix/test-cluster#cluster -- baremetal-iso`, `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` | `deployer`, `first-boot-automation`, `install-target`, `nix-agent` | `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after bootstrap |
+
+`nixosConfigurations.netboot-all-in-one` and `nixosConfigurations.netboot-control-plane` are canonical companion images for the supported `single-node dev` and `3-node HA control plane` profiles. `packages.single-node-trial-vm` is the low-friction trial artifact for the minimal VM-platform core. `nixosConfigurations.netboot-worker`, `netboot-base`, `pxe-server`, `vm-smoke-target`, and older launch flows under `baremetal/vm-cluster` are archived helpers or `legacy/manual` debugging paths outside the canonical profiles and their guard set.
+
+## Cluster Authoring
+
+`ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is the only supported cluster authoring source. It is the canonical place to define nodes, reusable deployer classes and pools, rollout objects, service placement intent, and the generated per-node bootstrap metadata consumed by `deployer`, `fleet-scheduler`, `nix-agent`, and `node-agent`.
+
+`nix-nos` is limited to legacy compatibility and low-level network primitives such as interfaces, VLANs, BGP, and static routing. It is not the canonical source for cluster topology, rollout intent, scheduler state, or bootstrap inventory.
+
+## Responsibility Boundaries
+
+- `plasmavmc` owns tenant VM lifecycle plus KVM worker registration. It can run against explicit remote IAM, PrismNet, and FlareDB endpoints, but it does not own machine enrollment, desired-system rollout, or host-native service placement.
+- `k8shost` owns Kubernetes-style pod and service APIs for tenant workloads, then translates them into `prismnet`, `flashdns`, and `fiberlb` objects. It does not place host-native cluster daemons, and its runtime dataplane helpers remain archived non-product.
+- `fleet-scheduler` owns placement and failover of host-native service instances from declarative cluster state derived from `ultracloud.cluster`. It consumes `node-agent` heartbeats and writes instance placement, but it does not expose tenant-facing Kubernetes semantics.
+- `deployer` owns machine enrollment, `/api/v1/phone-home`, install plans, cluster metadata, and desired-system references. The supported declarative input for that state is the JSON generated from `ultracloud.cluster`; it decides what a node should become, but it does not execute the host-local switch.
+- `nix-agent` owns host-local NixOS convergence only. It reads desired-system state from `deployer` or `chainfire`, activates the target closure, and rolls back on failed health checks.
+- `node-agent` owns host-local runtime execution only. It reports heartbeats and applies scheduled service-instance state, but it does not install the base OS or rewrite desired-system targets.
+
+The single-node quickstart deliberately stops below that rollout stack: it ships only the VM-platform core plus optional add-ons, not `deployer`, `nix-agent`, `node-agent`, or `fleet-scheduler`.
+
+## Standalone Stories
+
+- `single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story. They keep the minimal KVM-backed VM surface light and intentionally exclude `deployer`, `nix-agent`, `fleet-scheduler`, and `node-agent`.
+- `deployer-vm-smoke`, `portable-control-plane-regressions`, and `baremetal-iso` are the standalone rollout-stack story. They validate `deployer -> nix-agent` and `deployer -> fleet-scheduler -> node-agent` without requiring the full VM-hosting bundle.
+
+## Rollout Bundle Operations
+
+The rollout-bundle operator contract is fixed in [docs/rollout-bundle.md](docs/rollout-bundle.md). As of 2026-04-10 the supported `deployer` recovery model is scope-fixed to one active writer plus optional cold-standby restore that reuses the same ChainFire namespace, credentials, bootstrap bundle, and local state backup; automatic ChainFire-backed multi-instance failover is outside the supported product contract for this release.
+
+The same operator doc also fixes the `nix-agent` health-check and rollback contract, the `node-agent` logs/secrets/volume/upgrade contract, and the `fleet-scheduler` supported upper limit: `fleet-scheduler` is scope-fixed to the two native-runtime worker lab with one planned drain cycle, one fail-stop worker-loss cycle, and 30-second held degraded states in `rollout-soak`. The canonical proofs are `nix build .#checks.x86_64-linux.deployer-vm-rollback`, `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`, `nix build .#checks.x86_64-linux.portable-control-plane-regressions`, `nix run ./nix/test-cluster#cluster -- fresh-smoke`, `nix run ./nix/test-cluster#cluster -- rollout-soak`, and `nix run ./nix/test-cluster#cluster -- durability-proof`.
+
## Main Entrypoints
-- workspace flake: [flake.nix](/home/centra/cloud/flake.nix)
-- VM validation harness: [nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md)
-- shared volume notes: [coronafs/README.md](/home/centra/cloud/coronafs/README.md)
-- minimal quota-service rationale: [creditservice/README.md](/home/centra/cloud/creditservice/README.md)
-- archived manual VM launch scripts: [baremetal/vm-cluster/README.md](/home/centra/cloud/baremetal/vm-cluster/README.md)
+- workspace flake: [flake.nix](flake.nix)
+- single-node quickstart smoke: [`nix run .#single-node-quickstart`](docs/testing.md)
+- single-node trial artifact: [`nix build .#single-node-trial-vm`](docs/testing.md), [`nix run .#single-node-trial`](docs/testing.md)
+- smallest rollback proof for `deployer -> nix-agent`: [`nix build .#checks.x86_64-linux.deployer-vm-rollback`](docs/rollout-bundle.md)
+- `3-node HA control plane` configs: `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion image `nixosConfigurations.netboot-control-plane`
+- portable local proof: [`nix build .#checks.x86_64-linux.portable-control-plane-regressions`](docs/testing.md)
+- longer-running control-plane and rollout soak: [`nix run ./nix/test-cluster#cluster -- rollout-soak`](docs/testing.md)
+- canonical bare-metal bootstrap smoke: [`nix run ./nix/test-cluster#cluster -- baremetal-iso`](docs/testing.md)
+- canonical bare-metal exact proof runner: [`nix build .#checks.x86_64-linux.baremetal-iso-e2e`](docs/testing.md) then `./result/bin/baremetal-iso-e2e`
+- canonical physical-node preflight and handoff: [`nix run ./nix/test-cluster#hardware-smoke -- preflight`](docs/hardware-bringup.md), then `run` or `capture`
+- canonical profile guards: [`nix build .#checks.x86_64-linux.canonical-profile-eval-guards`](docs/testing.md), [`nix build .#checks.x86_64-linux.canonical-profile-build-guards`](docs/testing.md)
+- supported surface guard: [`nix build .#checks.x86_64-linux.supported-surface-guard`](docs/testing.md) for public docs wording, shipped server API completeness, and high-signal TODO or best-effort markers in the supported provider/backend servers
+- VM validation harness: [nix/test-cluster/README.md](nix/test-cluster/README.md)
+- work-root budget helper: [`./nix/test-cluster/work-root-budget.sh status`](docs/testing.md), `enforce`, and `prune-proof-logs`
+- shared volume notes: [coronafs/README.md](coronafs/README.md)
+- apigateway supported scope: [apigateway/README.md](apigateway/README.md)
+- nightlight supported scope: [nightlight/README.md](nightlight/README.md)
+- creditservice supported scope: [creditservice/README.md](creditservice/README.md)
+- k8shost supported scope: [k8shost/README.md](k8shost/README.md)
## Repository Guide
-- [docs/README.md](/home/centra/cloud/docs/README.md): documentation entrypoint
-- [docs/testing.md](/home/centra/cloud/docs/testing.md): validation path summary
-- [docs/component-matrix.md](/home/centra/cloud/docs/component-matrix.md): supported multi-component compositions
-- [docs/storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers
+- [docs/README.md](docs/README.md): documentation entrypoint
+- [docs/testing.md](docs/testing.md): validation path summary
+- [docs/component-matrix.md](docs/component-matrix.md): canonical profiles and optional bundles
+- [docs/rollout-bundle.md](docs/rollout-bundle.md): rollout-bundle HA, rollback, drain, logs, secrets, and volume contract
+- [docs/control-plane-ops.md](docs/control-plane-ops.md): ChainFire membership boundary, FlareDB schema or destructive-DDL boundary, and IAM bootstrap hardening plus signing-key, credential, and mTLS rotation
+- [docs/edge-trial-surface.md](docs/edge-trial-surface.md): APIGateway, NightLight, CreditService, trial-surface, and work-root budget contract
+- [docs/provider-vm-reality.md](docs/provider-vm-reality.md): PrismNet, FlashDNS, FiberLB, and PlasmaVMC local-KVM proof scope plus artifact contract
+- [docs/hardware-bringup.md](docs/hardware-bringup.md): USB/BMC/Redfish preflight, artifact capture, and hardware-smoke handoff
+- [docs/storage-benchmarks.md](docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers
- `plans/`: design notes and exploration documents
## Scope
-UltraCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products.
+UltraCloud is centered on reproducible infrastructure behavior. Optional add-ons such as `creditservice` and `k8shost` remain part of the supported surface only when the documented scope, harness coverage, and public contract stay aligned with what the repository actually ships.
-Host-level NixOS rollout validation is also expected to stay reproducible: the `deployer-vm-smoke` VM test now proves that `nix-agent` can activate a prebuilt target system closure directly, without recompiling the stack inside the guest.
+Host-level NixOS rollout validation is also expected to stay reproducible. `baremetal-iso-e2e` is now the materialized exact proof runner for the full install path. `canonical-profile-eval-guards` and `canonical-profile-build-guards` fail fast when supported outputs drift. `supported-surface-guard` rejects unfinished public wording, shipped server API stubs, high-signal completeness markers such as `TODO:` or `best-effort` in the supported network or backend servers, and archived-helper regressions such as worker netboot or backend scaffolds re-entering the default product surface. `portable-control-plane-regressions` remains the non-KVM developer lane that keeps the main control-plane and rollout boundaries green on TCG-only hosts before the publishable nested-KVM suite is rerun.
diff --git a/TODO.md b/TODO.md
new file mode 100644
index 0000000..335fc84
--- /dev/null
+++ b/TODO.md
@@ -0,0 +1,411 @@
+# UltraCloud Baseline TODO (2026-04-10)
+
+- Task: `0fe10731-bdbc-4f8f-8bcc-5f5a16903200`
+- Created branch: `task/0fe10731-baseline-todo`
+- Base: `origin/main` at `b8ebd24d4e9b2dbe71e34ba09b77092dfa7dd43c`
+- Handover policy: the dirty worktree from `task/343c8c57-main-reaggregate` was carried onto the new branch as-is, without reset or revert.
+- Purpose of this ticket: fix each component's responsibilities, canonical entrypoints, current evidence, unproven items, prioritized issue tickets, and dependencies on one page, and use it as the baseline ticket for subsequent autonomous implementation.
+- Survey inputs: `README.md`, `docs/component-matrix.md`, `docs/testing.md`, `nix/test-cluster/README.md`, `plans/cluster-investigation-2026-03-02/*`, the current `nix/modules/*`, `nix/single-node/*`, `nix/nodes/baremetal-qemu/*`, `nix/test-cluster/*`, and each component's `src/main.rs` / API definitions.
+
+## Canonical Boundary Snapshot
+
+- There are three canonical profiles: `single-node dev`, `3-node HA control plane`, `bare-metal bootstrap`.
+- The minimal core is `chainfire + flaredb + iam + prismnet + plasmavmc`.
+- The network provider bundle is `prismnet + flashdns + fiberlb`.
+- The VM hosting bundle is `plasmavmc + prismnet + coronafs + lightningstor`.
+- The edge/tenant bundle is `apigateway + nightlight + creditservice`.
+- The rollout bundle is `deployer + nix-agent + fleet-scheduler + node-agent`.
+- On the 2026-04-10 current branch, QEMU/KVM is the canonical local proof, and the bare-metal proof is structured as `QEMU as hardware` under the same ISO contract.
+
+## 2026-03-02 Failure Split
+
+### 2026-03-02 failures already resolved at the file level on the 2026-04-10 current branch
+
+- `ARCH-001`: resolved; `flake.nix` no longer references the missing `docs/.../configuration.nix`. The current canonical sources are `nix/nodes/vm-cluster/node01`, `node02`, `node03`, and `canonical-profile-eval-guards`.
+- `ARCH-002`: the missing `disko.nix` reference in the ISO install is resolved. `verify-baremetal-iso.sh` now uses `nix/nodes/baremetal-qemu/control-plane/disko.nix` and `.../worker/disko.nix` directly.
+- `ARCH-003`: the missing Nix wiring for `deployer` is resolved. `nix/modules/deployer.nix`, the package/app/check definitions in `flake.nix`, and `/api/v1/phone-home` in `deployer-server` all exist.
+- `TC-001`: the `joinAddr` inconsistency is resolved. The current `chainfire` / `flaredb` modules are aligned on the `initialPeers` contract.
+- `TC-002`: the `creditservice` evaluation failure on `node06` is resolved. The current `nix/test-cluster/node06.nix` imports `creditservice.nix` and also supplies `flaredbAddr`.
+- `COMP-001` through `COMP-004`: the IAM endpoint injection mismatches are resolved. The `prismnet`, `plasmavmc`, `fiberlb`, `lightningstor`, `flashdns`, and `creditservice` modules now translate to the config keys the binaries actually read.
+- `ARCH-004`: the first-boot `leader_url` contract inconsistency is resolved. `nix/modules/first-boot-automation.nix` assumes `http://localhost:8081` / `8082` and `/admin/member/add`.
+- `ARCH-005`: the missing first-boot join API in FlareDB is resolved. `flaredb/crates/flaredb-server/src/rest.rs` provides `POST /admin/member/add`.
+- `3.1 NightLight grpcPort mismatch`: resolved. `nightlight-server` now binds both HTTP and gRPC.
+- `ARCH-006` / `cluster-config` duplicate-implementation issue: the duplication rooted in `nix-nos/topology.nix` that existed on 2026-03-02 is no longer present in the current tree; the canonical sources have converged on `nix/lib/cluster-schema.nix` and `nix/modules/ultracloud-cluster.nix`.
+- `QLT-001`: the mass of `doCheck = false` entries in `flake.nix` no longer remains as-is, at least at the current file level.
+
+### Items split off from the 2026-03-02 failures that, as of 2026-04-10, have structural fixes but lack runtime re-proof
+
+- `VERIFY-001`: on the 2026-04-10 local AMD/KVM host, `supported-surface-guard`, `single-node-trial-vm`, `single-node-quickstart`, `fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`, `./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite`, `canonical-profile-eval-guards`, `portable-control-plane-regressions`, `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `fleet-scheduler-e2e`, `baremetal-iso`, `nix build .#checks.x86_64-linux.baremetal-iso-e2e`, and the built `./result/bin/baremetal-iso-e2e` exact runner have all been rerun and pass. The only item not yet re-proven is the physical bare-metal smoke.
+- `VERIFY-002`: the bare-metal bootstrap is closed up through the QEMU ISO proof, but re-proving the same contract against USB, BMC, or physical hardware is still missing. However, `nix run ./nix/test-cluster#hardware-smoke -- preflight` was added on 2026-04-10, so a blocked state when transport is absent can now be recorded mechanically in `./work/hardware-smoke/latest/status.env` and `missing-requirements.txt`.
+- `VERIFY-003`: the config-contract fixes have been re-confirmed via `run-publishable-kvm-suite.sh` up through the all-add-ons-enabled profile. `baremetal-iso-e2e` has also migrated to the materialized host-KVM runner, narrowing the remaining work to hardware bring-up.
+
+## First Tranche Backlog
+
+- `TRANCHE-01`: done. The optional-bundle health gating for `single-node dev` was fixed on 2026-04-10, resolving the `coronafs` port mismatch and the unmonitored health of `flashdns` / `fiberlb` / `lightningstor`.
+- `TRANCHE-02`: `baremetal-iso` and the `baremetal-iso-e2e` exact runner were rerun on the 2026-04-10 local AMD/KVM host. The next stage adds a smoke run on one USB/BMC/physical node.
+- `TRANCHE-03`: done. `nix run ./nix/test-cluster#cluster -- durability-proof` was added on 2026-04-10, fixing the `chainfire` / `flaredb` logical backup/restore and the `deployer` admin pre-register request replay plus restart persistence proof into the product docs and harness.
+- `TRANCHE-04`: done. The default local `chainfire` endpoint for `fleet-scheduler`, `nix-agent`, `node-agent`, and `deployer-ctl` was normalized to the canonical `http://127.0.0.1:2379` on 2026-04-10.
+- `TRANCHE-05`: done. The supported scope of the `fiberlb` HTTPS health check was spelled out on 2026-04-10, fixing in docs, guard, and source comments that the current product contract is only `TCP reachability + HTTP status` without backend TLS certificate verification.
+- `TRANCHE-06`: done. `k8shost` was fixed as an API/control-plane product on 2026-04-10, and docs, guard, and TODO were aligned on the runtime dataplane helpers being archived non-product.
+- `TRANCHE-07`: done. The 2026-04-10 `durability-proof` saves `lightningstor` distributed-backend node-loss / repair and the `coronafs` controller/node split outage as the canonical failure-injection proof.
+- `TRANCHE-08`: done. The `hardware-smoke` preflight/handoff wrapper was added on 2026-04-10 so that physical bring-up of `deployer -> ISO -> first-boot -> nix-agent` can be prepared through a common USB/BMC/Redfish entrypoint. The blocked artifact for the transport-absent case is also fixed under `./work/hardware-smoke`.
+- `TRANCHE-10`: done. `nix run ./nix/test-cluster#cluster -- rollout-soak` was fixed as the longer-run KVM operator lane on 2026-04-10, saving `draining` maintenance, worker power-loss, `deployer` / `fleet-scheduler` / `node-agent` restarts, and fixed-membership `chainfire` / `flaredb` restarts under one artifact root. The fact that the steady-state `test-cluster` does not carry `nix-agent.service` is also spelled out in a scope-marker artifact.
+- `TRANCHE-11`: done. `DEPLOYER-P1-01` and `FLEET-P1-01` were updated to their scope-fixed final state on 2026-04-10, and `rollout-soak` now saves `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `fleet-scheduler-scope-fixed.txt` to `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900`. The release boundary is fixed to one active writer plus optional cold-standby restore for `deployer`, and to one drain plus one fail-stop cycle with a 30-second hold on two native-runtime workers for `fleet-scheduler`.
+- `TRANCHE-12`: done. `FDB-P1-01`, `IAM-P1-01`, and `HARNESS-P2-01` were handled as the next stage on 2026-04-10. `run-core-control-plane-ops-proof.sh` saves `scope-fixed-contract.json`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, and `result.json` to `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`; FlareDB destructive DDL and fully automated online migration are fixed as scope-fixed unsupported; and IAM is fixed to signing-key, credential, and mTLS overlap rotation as the supported lifecycle, with multi-node failover unsupported. `work-root-budget.sh` gained `enforce` and `prune-proof-logs`, moving it from a disk-budget advisory to a stronger local gate and a safer cleanup workflow.
+
+## 2026-04-10 Physical Hardware Bring-Up Pack
+
+- `Task:` `3dba03d3-525b-4079-8c93-90af6a89d32b`
+- `Canonical entrypoint:` `nix run ./nix/test-cluster#hardware-smoke -- preflight`, then `run` or `capture`
+- `Current preflight artifact root:` `./work/hardware-smoke/latest`
+- `Artifact contract:` `status.env`, `missing-requirements.txt`, `kernel-params.txt`, `expected-markers.txt`, `failure-markers.txt`, `operator-handoff.md`, `environment.txt`
+- `Bridge to QEMU proof:` hardware wrapper reuses `nixosConfigurations.ultracloud-iso` and the same `ULTRACLOUD_MARKER pre-install.boot.*`, `pre-install.phone-home.complete.*`, `install.disko.complete.*`, `reboot.*`, `post-install.boot.*`, `desired-system-active.*` markers that `verify-baremetal-iso.sh` enforces in the QEMU harness.
+- `Blocked-state recording:` when USB device or BMC/Redfish transport is missing, `preflight` records `status=blocked` and the missing transport, kernel-parameter, and capture inputs in `missing-requirements.txt` without pretending the hardware proof ran.
+- `Still open:` an actual physical-node execution remains pending until a removable USB target or BMC/Redfish endpoint plus credentials are supplied.
+- `TRANCHE-09`: done. `docs/rollout-bundle.md` was added on 2026-04-10, fixing the product contract and proof commands for `deployer` single-writer DR, `nix-agent` health-check/rollback, `node-agent` logs/secrets/volume/upgrade, and `fleet-scheduler` drain/maintenance/failover.
+
+## 2026-04-10 Long-Run Control Plane And Rollout Soak
+
+- `Task:` `07d6137e-6e4c-4158-9142-8920f4f70a76`
+- `Canonical entrypoint:` `nix run ./nix/test-cluster#cluster -- rollout-soak`
+- `Artifact root:` `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900`
+- `Scenario proof:` one planned `node04 -> draining -> active` cycle, one `node05` power-loss and recovery cycle, restart of `deployer.service`, `fleet-scheduler.service`, `node-agent.service` on both worker nodes, and fixed-membership restart of `chainfire.service` plus `flaredb.service` on `node02`.
+- `Saved evidence:` `maintenance-during.json`, `maintenance-held.json`, `maintenance-restored.json`, `power-loss-during.json`, `power-loss-held.json`, `power-loss-restored.json`, `deployer-post-restart-nodes.json`, `fleet-scheduler-post-restart.json`, `node04-node-agent-post-restart.json`, `node05-node-agent-post-restart.json`, `chainfire-post-restart-put.json`, `flaredb-post-restart.json`, `post-control-plane-restarts.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, `fleet-scheduler-scope-fixed.txt`, `result.json`.
+- `Long-run nix-agent boundary:` steady-state `nix/test-cluster` nodes do not ship `nix-agent.service`, so this soak records `node01-nix-agent-scope.txt` and `node04-nix-agent-scope.txt` instead of pretending a live-cluster `nix-agent` restart happened. The executable `nix-agent` proofs remain `deployer-vm-rollback`, `baremetal-iso`, and `baremetal-iso-e2e`.
+- `Result:` PASS on the local AMD/KVM host. `result.json` records `success=true`, `fleet_supported_native_runtime_nodes=2`, `validated_maintenance_cycles=1`, `validated_power_loss_cycles=1`, `soak_hold_secs=30`, and the summary `validated one planned drain cycle and one fail-stop worker-loss cycle on the two-node native-runtime lab, held each degraded state for the configured soak window, restarted deployer or scheduler or agent services, and revalidated fixed-membership control-plane restarts while keeping deployer HA scope-fixed to single-writer recovery`.
+
+## 2026-04-10 Local Executable Baseline
+
+- `Task:` `b1e811fb-158f-415c-a011-64c724e84c5c`
+- `Runner:` `nix/test-cluster/run-local-baseline.sh`
+- `Log root:` `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c`
+- `Local execution policy:` `ULTRACLOUD_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work`, `TMPDIR=/mnt/d2/centra/photoncloud-monorepo/work/tmp`, `XDG_CACHE_HOME=/mnt/d2/centra/photoncloud-monorepo/work/xdg-cache`, `PHOTON_CLUSTER_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster`, `PHOTON_VM_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/state`, `PHOTON_CLUSTER_VDE_SWITCH_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/vde-switch`, with remote builders disabled via an empty `NIX_CONFIG builders =`.
+- `Host evidence:` `environment.txt` records `host_cpu_count=12`, `ultracloud_local_nix_max_jobs=6`, `ultracloud_local_nix_build_cores=2`, `photon_cluster_nix_max_jobs=6`, `photon_cluster_nix_build_cores=2`, `nix_builders=` (empty), `kvm_access=rw`, and `nested_param_value=1`.
+- `Guard/build checks:`
+ - `canonical-profile-eval-guards`: PASS. command `nix build .#checks.x86_64-linux.canonical-profile-eval-guards --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/canonical-profile-eval-guards.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/canonical-profile-eval-guards.log`.
+ - `supported-surface-guard`: PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/supported-surface-guard.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/supported-surface-guard.log`.
+ - `portable-control-plane-regressions`: PASS. command `nix build .#checks.x86_64-linux.portable-control-plane-regressions`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/portable-control-plane-regressions.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/portable-control-plane-regressions.log`.
+ - `deployer-bootstrap-e2e`: PASS. command `nix build .#checks.x86_64-linux.deployer-bootstrap-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/deployer-bootstrap-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/deployer-bootstrap-e2e.log`.
+ - `host-lifecycle-e2e`: PASS. command `nix build .#checks.x86_64-linux.host-lifecycle-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/host-lifecycle-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/host-lifecycle-e2e.log`.
+ - `fleet-scheduler-e2e`: PASS. command `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fleet-scheduler-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fleet-scheduler-e2e.log`.
+- `Runtime path checks:`
+ - `single-node-quickstart`: PASS. command `nix run .#single-node-quickstart`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/single-node-quickstart.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/single-node-quickstart.log`; success marker `single-node quickstart smoke passed`.
+ - `baremetal-iso`: PASS. command `nix run ./nix/test-cluster#cluster -- baremetal-iso`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/baremetal-iso.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/baremetal-iso.log`; success markers `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, `Canonical ISO bare-metal QEMU verification succeeded`.
+ - `fresh-smoke`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-smoke`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fresh-smoke.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fresh-smoke.log`; success marker `Cluster validation succeeded`.
+- `2026-04-10 execution failures:` none. The 2026-03-02 historical failure split stands as documented in the section above; this local AMD/KVM baseline did not reproduce any of the required commands as failures.
+- `2026-04-10 observed non-failure risk:`
+ - `HARNESS-OBS-20260410-01`: resolved on 2026-04-10. The stale VM cleanup in `nix/test-cluster/run-cluster.sh` now collects only PIDs whose cmdline confirms the current `vm_dir` / `vde_switch_dir`, and the path-independent `hostfwd=tcp::${port}-:22` fallback was removed.
+
+## 2026-04-10 Bare-Metal Canonical Path
+
+- `Task:` `6d9f45e4-1954-4a0b-b886-c61482db6c3c`
+- `QEMU-as-hardware runtime proof:` PASS. command `nix run ./nix/test-cluster#cluster -- baremetal-iso`; log root `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso`; evidence files `environment.txt`, `deployer.log`, `chainfire.log`, `control-plane.serial.log`, `worker.serial.log`.
+- `Runtime PASS markers:` `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, `Canonical ISO bare-metal QEMU verification succeeded`.
+- `Runtime contract now proven:`
+ - reusable node classes own `install_plan.nixos_configuration`, `install_plan.disko_config_path`, and stable `install_plan.target_disk_by_id`
+ - nodes carry identity plus desired-system overrides only; when a cache-backed prebuilt closure is available they now publish `desired_system.target_system` to converge to the exact shipped system instead of a dirty local rebuild
+ - installed nodes now keep `nix-agent` alive across their own `switch-to-configuration` transaction long enough for activation to finish, which restored post-install `chainfire` and `nix-agent` convergence
+- `Historical blocker (resolved on 2026-04-10):` direct build-time execution of `nix build .#checks.x86_64-linux.baremetal-iso-e2e` ran under sandboxed `nixbld1` and fell back to `TCG`. The exact lane is now a materialized runner: the check build succeeds quickly and emits `./result/bin/baremetal-iso-e2e`, and that runner executes the same `verify-baremetal-iso.sh` harness with host KVM and logs under `./work`.
+
+## 2026-04-10 Responsibility And Minimal-Surface Alignment
+
+- `Task:` `65a13e46-1376-4f37-a5c1-e520b5b376ec`
+- `Authoring source decision:` `ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is now documented in `README.md`, `docs/README.md`, and `docs/testing.md` as the only supported cluster authoring source. `nix-nos` is explicitly reduced to legacy compatibility plus low-level network primitives.
+- `Module boundary alignment:` `services.deployer`, `services.fleet-scheduler`, `services.nix-agent`, and `services.node-agent` descriptions now agree on the canonical layering `ultracloud.cluster -> deployer -> (nix-agent | fleet-scheduler -> node-agent)`.
+- `Minimal-surface friction reduction:` `services.plasmavmc` and `services.k8shost` now wait only for local backing services that they actually use. When explicit remote endpoints are configured, they no longer hard-wire unrelated local control-plane units into startup ordering, which preserves a lighter standalone story for the VM-platform core and remote-provider deployments.
+- `Validation alignment:` `supported-surface-guard` now requires contract markers for the supported authoring source, the constrained `nix-nos` role, and the standalone VM-platform story so docs drift becomes a failing regression.
+- `Still open:` the rollout-stack default port mismatch is resolved. Remaining items are hardware bring-up and a longer-duration durability proof.
+
+## 2026-04-10 Supported Surface Final Proof
+
+- `Task:` `32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0`
+- `Guard + minimal-trial proof root:` `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final`
+ - `supported-surface-guard`: PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/supported-surface-guard.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/supported-surface-guard.log`.
+ - `single-node-trial-vm`: PASS. command `nix build .#single-node-trial-vm --no-link --print-out-paths`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-trial-vm.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-trial-vm.log`; output path `/nix/store/1nq4pkadm3lbxmhkr54iz7lgjd6vm7z3-nixos-vm`.
+ - `single-node-quickstart`: PASS. command `nix run .#single-node-quickstart`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-quickstart.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-quickstart.log`; success marker `single-node quickstart smoke passed`.
+- `Publishable KVM suite root:` `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite`
+ - `environment.txt` captures `host_cpu_count=12`, `local_nix_max_jobs=6`, `local_nix_build_cores=2`, `photon_cluster_nix_max_jobs=6`, `photon_cluster_nix_build_cores=2`, `kvm_present=yes`, `kvm_access=rw`, `kvm_amd_nested=1`, `nix_builders=`, `finished_at=2026-04-10T09:36:09+09:00`, `exit_status=0`.
+ - `fresh-smoke`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-smoke`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-smoke.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-smoke.log`; success marker `Cluster validation succeeded`.
+ - `fresh-demo-vm-webapp`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-demo-vm-webapp.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-demo-vm-webapp.log`; success markers include `PHOTON_VM_DEMO_WEB_READY` and the guest web health check on `http://10.62.10.10:8080/health`.
+ - `fresh-matrix`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-matrix`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-matrix.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-matrix.log`; success marker `Component matrix validation succeeded`.
+ - `run-publishable-kvm-suite`: PASS. command `./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite`; environment `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/environment.txt`; final stdout marker `publishable KVM suite passed; logs in ./work/publishable-kvm-suite`.
+- `Fixed while proving the surface:`
+ - `NODEAGENT-FIX-20260410-01`: reboot-time PID reuse could make `node-agent` treat `native-daemon` as the resurrected `native-web` instance after worker reboot, stalling `fresh-smoke` at native runtime recovery. `deployer/crates/node-agent/src/process.rs` now persists argv + boot-id metadata, validates the live `/proc/<pid>/cmdline`, and refuses to signal or reuse mismatched processes from stale pidfiles (a validation sketch follows this list).
+ - `HARNESS-FIX-20260410-01`: `run-publishable-kvm-suite` exposed a control-plane LightningStor bootstrap race that was not consistently hit by ad-hoc reruns. `nix/test-cluster/node01.nix` now holds `lightningstor.service` behind explicit local control-plane and worker-replica TCP readiness with a longer start timeout, and `nix/test-cluster/run-cluster.sh` now waits the worker storage agents before gating the control-plane LightningStor unit.
+- `Still open after the final supported-surface proof:` real hardware `baremetal-iso` smoke.
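+
+A minimal sketch of the stale-pidfile guard described in `NODEAGENT-FIX-20260410-01` above, assuming illustrative names (`PidMeta`, `pidfile_matches_live_process`) rather than the actual `node-agent` API:
+
+```rust
+use std::fs;
+
+/// Metadata persisted alongside the pidfile; the shape is illustrative only.
+struct PidMeta {
+    pid: u32,
+    argv: Vec<String>,
+    boot_id: String,
+}
+
+fn current_boot_id() -> std::io::Result<String> {
+    Ok(fs::read_to_string("/proc/sys/kernel/random/boot_id")?
+        .trim()
+        .to_string())
+}
+
+/// True only if the recorded metadata still describes the live process: same
+/// boot, same PID, same argv. Anything else is a stale pidfile and must not be
+/// signalled or reused.
+fn pidfile_matches_live_process(meta: &PidMeta) -> std::io::Result<bool> {
+    if current_boot_id()? != meta.boot_id {
+        // Written before a reboot: the PID may have been reused since then.
+        return Ok(false);
+    }
+    let raw = match fs::read(format!("/proc/{}/cmdline", meta.pid)) {
+        Ok(bytes) => bytes,
+        Err(_) => return Ok(false), // process already gone
+    };
+    let live_argv: Vec<String> = raw
+        .split(|b| *b == 0)
+        .filter(|part| !part.is_empty())
+        .map(|part| String::from_utf8_lossy(part).into_owned())
+        .collect();
+    Ok(live_argv == meta.argv)
+}
+```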
+
+## 2026-04-10 baremetal-iso-e2e Local-KVM Exact Lane
+
+- `Task:` `0de75570-dabd-471b-95fe-5898c54e2e8c`
+- `Check build output:` `nix build .#checks.x86_64-linux.baremetal-iso-e2e` now materializes `./result/bin/baremetal-iso-e2e` instead of trying to execute QEMU inside the daemon sandbox.
+- `Exact proof root:` `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`
+- `Outer runner evidence:` `environment.txt` records `execution_model=materialized-check-runner`, `nix_builders=` (empty), `kvm_present=yes`, `kvm_access=rw`, and the local CPU-derived Nix parallelism.
+- `Exact check build:` PASS. command `nix build .#checks.x86_64-linux.baremetal-iso-e2e`; output path is a runner package that ships `bin/baremetal-iso-e2e` plus `share/ultracloud/README.txt` documenting the sandbox/TCG reason for the materialized execution model.
+- `Exact runner:` PASS. command `./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/baremetal-iso-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/baremetal-iso-e2e.log`.
+- `Inner runtime evidence:` state dir `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/state`; `state/environment.txt` records `vm_accelerator_mode=kvm`; success markers in `baremetal-iso-e2e.log` include `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, and `Canonical ISO bare-metal QEMU verification succeeded`.
+- `Remaining delta vs direct runtime proof:` the harness is now identical because both `nix run ./nix/test-cluster#cluster -- baremetal-iso` and `./result/bin/baremetal-iso-e2e` call `nix/test-cluster/verify-baremetal-iso.sh`. The only intentional difference is execution entrypoint: `nix build` materializes the runner because daemon-sandboxed `nixbld` builds would otherwise lose host KVM and degrade to `TCG`.
+
+## 2026-04-10 Durability And Product-Boundary Hardening
+
+- `Task:` `541356be-b289-4583-ba40-cbf46b0f9680`
+- `Guard rerun:` PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`.
+- `Runtime rerun:` PASS. command `nix run ./nix/test-cluster#cluster -- fresh-matrix`; success marker `Component matrix validation succeeded`.
+- `Durability proof:` PASS. command `nix run ./nix/test-cluster#cluster -- durability-proof`; artifact root `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/20260410T120618+0900`; convenience symlink `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/latest`.
+- `ChainFire proof:` `chainfire-backup-response.json` and `chainfire-restored-response.json` return the same logical payload, and `chainfire-after-delete.out` returns 404 after the DELETE (an offline recheck sketch follows this list).
+- `FlareDB proof:` `flaredb-backup.json` and `flaredb-restored.json` return the same SQL row, and `flaredb-after-delete.json` returns an empty set.
+- `Deployer proof:` `deployer-pre-register-request.json` serves as the backup artifact, `deployer-backup-list.json` observes the pre-registered node, `deployer-post-restart-list.json` confirms it survives a `deployer.service` restart, and `deployer-replayed-list.json` confirms the summary is unchanged after replaying the same request. `deployer_restore_mode` in `result.json` is `admin pre-register request replay with pre/post-restart list verification`.
+- `CoronaFS failure injection:` `coronafs-node04-local-state.json` keeps `node_local=true` and the materialized path even while the controller is stopped, and `coronafs-node04-capabilities.json` keeps the node-only capability split (`supports_controller_api=false`, `supports_node_api=true`).
+- `LightningStor failure injection:` `lightningstor-put-during-node05-outage.json`, `lightningstor-head-during-node05-outage.json`, `lightningstor-object-during-node05-outage.txt`, and `lightningstor-object-after-repair.txt` record the write during the node05 outage and the read-back after repair.
+- `FiberLB supported limitation:` `fiberlb/crates/fiberlb-server/src/healthcheck.rs`, `README.md`, `docs/testing.md`, `docs/component-matrix.md`, and `flake.nix` fix HTTPS backend health as a limited contract without TLS certificate verification.
+- `k8shost boundary:` `README.md`, `docs/testing.md`, `docs/component-matrix.md`, `k8shost/README.md`, `nix/test-cluster/README.md`, and `flake.nix` fix `k8shost` to the API/control-plane product surface only and align `k8shost-cni`, `k8shost-controllers`, and `lightningstor-csi` as archived non-product.
+- `Proof-lane hardening done during this tranche:` the first `durability-proof` run failed on an unsupported `DROP TABLE` in the FlareDB cleanup tail, so the lane now assumes unique namespaces; the next run failed on an unbound local in the cleanup trap, so trap cleanup was fixed with `${var:-}` defaults and a guarded tunnel shutdown. The current lane exits zero and leaves artifacts.
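+
+As a hedged illustration of what the saved ChainFire artifacts are expected to show, an operator could re-check them offline roughly like this (artifact names are taken from the list above; `recheck_chainfire_artifacts` is an illustrative helper, not part of the proof lane):
+
+```rust
+use std::fs;
+
+/// Offline recheck of saved durability artifacts: the restored payload must
+/// equal the backup, and the post-DELETE probe must have recorded a 404.
+fn recheck_chainfire_artifacts(root: &str) -> Result<(), Box<dyn std::error::Error>> {
+    let backup: serde_json::Value = serde_json::from_str(&fs::read_to_string(format!(
+        "{root}/chainfire-backup-response.json"
+    ))?)?;
+    let restored: serde_json::Value = serde_json::from_str(&fs::read_to_string(format!(
+        "{root}/chainfire-restored-response.json"
+    ))?)?;
+    assert_eq!(backup, restored, "restored payload must match the backup");
+
+    let after_delete = fs::read_to_string(format!("{root}/chainfire-after-delete.out"))?;
+    assert!(after_delete.contains("404"), "deleted key must have returned 404");
+    Ok(())
+}
+```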
+
+## 2026-04-10 Rollout Bundle HA And DR Hardening
+
+- `Task:` `a41343c5-116e-4313-8751-b333472f931c`
+- `Operator doc:` `docs/rollout-bundle.md`
+- `Verification reruns:` `nix build .#checks.x86_64-linux.portable-control-plane-regressions`, `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`, and `nix build .#checks.x86_64-linux.deployer-vm-rollback` all passed on 2026-04-10 with local-only Nix settings.
+- `Durability rerun:` `nix run ./nix/test-cluster#cluster -- durability-proof` passed again from a clean KVM cluster and wrote artifacts under `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/20260410T123535+0900`.
+- `Supported deployer boundary:` single-writer deployer with restart-in-place or cold-standby restore. ChainFire-backed multi-instance failover is explicitly unsupported for now and the restore runbook is fixed to `cluster-state apply + preserved pre-register request replay + admin verification`.
+- `Nix-agent proof:` `nix build .#checks.x86_64-linux.deployer-vm-rollback` passed on 2026-04-10 and is now the canonical reproducible proof for `health_check_command`, rollback, and `rolled-back` partial failure recovery semantics.
+- `Fleet-scheduler semantics:` `fresh-smoke` and `fleet-scheduler-e2e` remain the release proofs for short-lived `draining` maintenance, fail-stop worker loss, and replica restoration. Long-duration maintenance and large-cluster drain choreography stay scope-limited rather than silently implied.
+- `Node-agent contract:` product docs now fix `${stateDir}/pids/*.log` as the per-instance log location, `${stateDir}/pids/*.meta.json` as stale-pid metadata, secret delivery as caller-provided env or mounted files only, host-path volumes as pass-through only, and upgrades as replace-and-reconcile rather than in-place patching.
+
+## 2026-04-10 Core Control Plane Operator Lifecycle Proofs
+
+- `Task:` `dcdc961a-0aa6-47c3-aeba-a1c67bca27b7`
+- `Operator doc:` `docs/control-plane-ops.md`
+- `Focused proof:` `./nix/test-cluster/run-core-control-plane-ops-proof.sh /mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`
+- `Focused proof result:` passed on 2026-04-10 and wrote `result.json`, `scope-fixed-contract.json`, `iam-key-rotation-tests.log`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, and the contract-marker logs under `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`.
+- `Supported-surface guard:` rerun after the doc and proof updates so the public lifecycle contract is now guarded alongside the existing supported-surface wording.
+- `ChainFire boundary:` dynamic membership, replace-node, and scale-out are now explicit non-supported actions on the product surface. The supported path is fixed-membership restore or whole-cluster replacement anchored by the existing `durability-proof` backup/restore lane.
+- `FlareDB boundary:` online migration and schema evolution are now fixed to an additive-first, backup/restore-gated operator contract. Destructive DDL and fully automated online migration are explicit non-supported boundaries for this release rather than implied future promises.
+- `IAM boundary:` bootstrap hardening now requires explicit admin token, signing key, and 32-byte `IAM_CRED_MASTER_KEY` inputs in docs. The standalone proof reruns signing-key rotation, credential overlap-and-revoke rotation, and mTLS overlap-and-cutover rotation tests while checking the hardening markers in `iam-server`; multi-node IAM failover remains unsupported.
+
+## 2026-04-10 Edge And Trial-Surface Productization
+
+- `Task:` `cc24ac5a-b940-4a32-9136-d706ecadf875`
+- `Operator doc:` `docs/edge-trial-surface.md`
+- `Component docs:` `apigateway/README.md`, `nightlight/README.md`, and `creditservice/README.md`
+- `Helper:` `./nix/test-cluster/work-root-budget.sh status` now reports `./work` disk usage, soft budgets, and cleanup plus `nix store gc` guidance without mutating state by default.
+- `Edge bundle boundary:` APIGateway is now documented as stateless replicated behind external L4 or VIP distribution, but restart-based rollout remains the only supported config distribution or reload model proven on this branch. NightLight is fixed to a single-node WAL/snapshot product shape with process-wide retention, and CreditService export plus migration is fixed to offline export/import or backend-native snapshots instead of live mixed-writer migration.
+- `Trial boundary:` `single-node-trial-vm` and `single-node-quickstart` remain the only supported lightweight trial surface. OCI/Docker remains intentionally unsupported because it would not prove the same guest-kernel, KVM, `/dev/net/tun`, and OVS/libvirt contract.
+
+## 2026-04-10 Provider And VM-Hosting Reality Proof
+
+- `Task:` `41a074a3-dc5c-42fc-979e-c8ebf9919d55`
+- `Focused proof lane:` `nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof`
+- `Focused proof result:` passed on 2026-04-10 and wrote `result.json`, `meta.json`, journals, and provider or VM-hosting artifacts under `/mnt/d2/centra/photoncloud-monorepo/work/provider-vm-reality-proof/20260410T135827+0900`.
+- `Provider artifacts:` `network-provider/prismnet-port-create.json`, `network-provider/prismnet-security-group-after-add.json`, `network-provider/flashdns-workload-authoritative-answer.txt`, `network-provider/flashdns-service-authoritative-answer.txt`, `network-provider/fiberlb-drain-summary.txt`, `network-provider/fiberlb-tcp-health-before-drain.txt`, and `network-provider/fiberlb-tcp-health-after-restore.txt` fix the current local-KVM proof to tenant network lifecycle, authoritative DNS answers, and listener drain or re-convergence.
+- `VM-hosting artifacts:` `vm-hosting/vm-create-response.json`, `vm-hosting/root-volume-before-migration.json`, `vm-hosting/root-volume-after-migration.json`, `vm-hosting/data-volume-after-migration.json`, `vm-hosting/migration-summary.json`, `vm-hosting/prismnet-port-after-migration.json`, and `vm-hosting/demo-state-after-post-migration-restart.json` fix the current release proof to KVM shared-storage migration, CoronaFS handoff, and post-migration restart on the worker pair.
+- `Scope-fixed gaps:` real OVS/OVN dataplane validation, native BGP or BFD peer interop with hardware VIP ownership, and real-hardware VM migration or storage handoff remain outside the supported local-KVM surface and are now explicit docs or guard limits rather than implied release claims.
+
+## chainfire
+
+- `Responsibility:` the replicated coordination store for all of UltraCloud. Holds KV, leases, watch, the cluster membership view, and the state anchor for the rollout stack.
+- `Canonical entrypoint:` `nix/modules/chainfire.nix`; `chainfire/crates/chainfire-server/src/main.rs`; the supported API is `chainfire/proto/chainfire.proto`.
+- `Current evidence:` `README.md` names `MemberList` / `Status` as the supported surface; `chainfire/crates/chainfire-server/src/rest.rs` has health and member add; `docs/testing.md` defines the quickstart and HA proofs; `nix/single-node/base.nix` and `nix/nodes/vm-cluster/*` are the canonical wiring; the 2026-04-10 `durability-proof` saved logical KV backup/restore in `chainfire-backup-response.json` / `chainfire-restored-response.json`, and `rollout-soak` saved the live proof after fixed-membership restart in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/chainfire-post-restart-put.json` and `post-control-plane-restarts.json`.
+- `Unproven:` a rolling upgrade procedure; membership changes on real three-node hardware; a recovery runbook after power loss.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `CF-P1-01` advanced on 2026-04-10 from a scope freeze to carrying a live-restart proof. Dynamic membership / scale-out / replace-node remain explicitly unsupported on the supported surface, but fixed-membership restart itself was promoted to a live KVM proof by `rollout-soak`. The only remaining next step is a dedicated KVM proof, and only if live membership mutation is ever productized.
+- `P2:` `CF-P2-01` internal pruning of `chainfire-core` is in progress on the current branch, so the public boundary and the workspace-internal boundary still need a final cleanup.
+- `Dependencies:` local disk; host networking; referenced by `flaredb`, `iam`, `deployer`, `fleet-scheduler`, `nix-agent`, `node-agent`, and `coronafs`.
+
+## flaredb
+
+- `Responsibility:` the replicated KV/SQL metadata store. Receives each service's metadata, quota state, object metadata, and tenant network state.
+- `Canonical entrypoint:` `nix/modules/flaredb.nix`; `flaredb/crates/flaredb-server/src/main.rs`; REST is in `flaredb/crates/flaredb-server/src/rest.rs`.
+- `Current evidence:` `README.md` names `POST /api/v1/sql` and `GET /api/v1/tables` as supported; `flaredb/crates/flaredb-server/src/rest.rs` has SQL/KV/scan/member add; `docs/testing.md` describes the control-plane proof and the `fresh-matrix` dependency; `nix/modules/flaredb.nix` generates `pdAddr` and the namespace mode; the 2026-04-10 `rollout-soak` saved additive SQL after a member restart in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/flaredb-post-restart-create.json`, `flaredb-post-restart-insert.json`, and `flaredb-post-restart.json`, and `run-core-control-plane-ops-proof.sh` fixed destructive DDL / fully automated online migration as outside the supported surface in `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00/scope-fixed-contract.json` and `flaredb-migration-contract.log`.
+- `Unproven:` storage pressure and multi-node repair on real hardware. Fully automated online migration and destructive DDL online cutover are intentionally unsupported in this release.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `FDB-P1-01` is scope-fixed final as of 2026-04-10. Logical backup/restore of the supported SQL/KV surface is fixed by `durability-proof` and the docs, and online migration / schema evolution were settled as additive-first, premised on the backup/restore baseline. `rollout-soak` saved the post-member-restart additive SQL as a live KVM artifact, and `run-core-control-plane-ops-proof.sh` fixed destructive DDL and fully automated online migration as outside the supported surface in `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00/scope-fixed-contract.json`. Any future work would treat a destructive online-migration proof as a scope extension in a separate tranche.
+- `P2:` `FDB-P2-01` the per-namespace `strong` / `eventual` policy is buried in module defaults and is still weak as an operator-facing contract.
+- `Dependencies:` uses `chainfire` for placement/coordination; local disk; referenced by `iam`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `creditservice`, and `k8shost`.
+
+## iam
+
+- `Responsibility:` identity, token issuance, authn, authz, and tenant principal management.
+- `Canonical entrypoint:` `nix/modules/iam.nix`; `iam/crates/iam-server/src/main.rs`; the API package is `iam/crates/iam-api/src/lib.rs`.
+- `Current evidence:` `README.md` and `docs/component-matrix.md` treat it as a core component; `nix/modules/iam.nix` canonically generates the `chainfire` / `flaredb` connections; the `iam-authn`, `iam-authz`, and `iam-store` crates are separated; `fresh-matrix` and the gateway path assume IAM via credit/k8shost/plasmavmc; `run-core-control-plane-ops-proof.sh` saved `iam-key-rotation-tests.log`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, `scope-fixed-contract.json`, and `result.json` under `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`, fixing bootstrap hardening, signing-key rotation, credential overlap rotation, and mTLS overlap rotation as standalone proofs.
+- `Unproven:` multi-node IAM failover; a same-lane lifecycle proof across the full backend matrix.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `IAM-P1-01` is scope-fixed final as of 2026-04-10. Bootstrap hardening and token/signing-key rotation are fixed standalone by `docs/control-plane-ops.md` and `run-core-control-plane-ops-proof.sh`, and the same proof root now also saves the credential overlap-and-revoke rotation and the mTLS overlap-and-cutover rotation (a rotation sketch follows this list). Multi-node IAM failover was explicitly moved outside the supported surface. Any future work would treat a clustered IAM failover proof as a scope extension in a separate tranche.
+- `P2:` `IAM-P2-01` the harness does not yet cover the full `flaredb` / `postgres` / `sqlite` / `memory` backend matrix.
+- `Dependencies:` `flaredb` is the primary storage; optional `chainfire`; consumers are `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `creditservice`, `k8shost`, and `apigateway`.
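+
+A minimal sketch of the overlap-and-revoke rotation shape proven above, using illustrative names (`KeyRing`, `begin_rotation`, `finish_rotation`) rather than the actual `iam` types:
+
+```rust
+/// New tokens sign with the incoming key immediately, both keys verify during
+/// the overlap window, and the outgoing key is dropped when the window closes.
+struct KeyRing {
+    active: String,        // key id used to sign newly issued tokens
+    accepted: Vec<String>, // key ids still accepted for verification
+}
+
+impl KeyRing {
+    fn begin_rotation(&mut self, new_key_id: String) {
+        self.accepted.push(new_key_id.clone());
+        self.active = new_key_id;
+    }
+
+    fn finish_rotation(&mut self) {
+        // After the overlap window only the active key remains accepted.
+        let active = self.active.clone();
+        self.accepted.retain(|key| *key == active);
+    }
+}
+```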
+
+## prismnet
+
+- `Responsibility:` the tenant network control plane. Handles VPCs, subnets, ports, routers, security groups, and service IP pools.
+- `Canonical entrypoint:` `nix/modules/prismnet.nix`; `prismnet/crates/prismnet-server/src/main.rs`; the API is `prismnet/crates/prismnet-api/proto/prismnet.proto`.
+- `Current evidence:` `docs/testing.md` and `README.md` name VPC/subnet/port and security-group ACL add/remove in `fresh-matrix` as the canonical proof; service implementations live in `prismnet/crates/prismnet-server/src/services/*`; `prismnet/crates/prismnet-server/src/ovn/client.rs` holds the OVN client; `nix/modules/prismnet.nix` generates the binary-consumed config.
+- `Unproven:` a real OVS/OVN dataplane; real-hardware proof of the DHCP/metadata service; multi-rack network integration.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `PRISMNET-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves VPC/subnet/port lifecycle, security-group ACL add/remove, and attached-VM networking artifacts to a dated root in the local KVM lab. The unresolved next step is promoting the real OVS/OVN dataplane and hardware-switch integration into release proofs.
+- `P2:` `PRISMNET-P2-01` `ovn/mock.rs` remains close by, so the boundary between the supported path and the archived/test path needs continued monitoring.
+- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; consumers are `flashdns`, `fiberlb`, `plasmavmc`, and `k8shost`.
+
+## flashdns
+
+- `Responsibility:` authoritative DNS publication. Holds tenant records, reverse zones, and the DNS handlers.
+- `Canonical entrypoint:` `nix/modules/flashdns.nix`; `flashdns/crates/flashdns-server/src/main.rs`; `flashdns/crates/flashdns-server/src/dns/*`.
+- `Current evidence:` `docs/testing.md` and `README.md` name record publication in `fresh-matrix` as the canonical proof; the `flashdns` server has record/zone/reverse-zone services; `nix/modules/flashdns.nix` generates the binary-consumed config.
+- `Unproven:` real port 53 exposure; upstream/secondary integration; failover with real network gear.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `FLASHDNS-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves authoritative workload/service answers to a dated root, so local-KVM publication evidence is in the release lane. The unresolved next step is extending real port 53 exposure and upstream/secondary interop into a hardware or external-network proof.
+- `P2:` `FLASHDNS-P2-01` resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
+- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; publication sources are `k8shost` and `fleet-scheduler`.
+
+## fiberlb
+
+- `Responsibility:` service publication / VIP / L4-L7 load balancing / native BGP advertisement.
+- `Canonical entrypoint:` `nix/modules/fiberlb.nix`; `fiberlb/crates/fiberlb-server/src/main.rs`; the dataplane lives in `dataplane.rs`, `l7_dataplane.rs`, `vip_manager.rs`, and `bgp_client.rs`.
+- `Current evidence:` `README.md` and `docs/testing.md` name TCP and TLS-terminated `Https` / `TerminatedHttps` listeners in `fresh-matrix` as the canonical proof; the server code contains native BGP/BFD, VIP ownership, the TLS store, and the L7 dataplane; the L4 algorithms have in-tree tests.
+- `Unproven:` interop with real BGP peers; a hardware proof of L2/VIP ownership; IPv6 and mixed peer topologies.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `FIBERLB-P1-01` is scope-fixed as of 2026-04-10. The HTTPS health check in `fiberlb/crates/fiberlb-server/src/healthcheck.rs` still does not verify the backend TLS certificate, but the reason and the supported scope (`TCP reachability + HTTP status`) are fixed in the docs, the guard, and a source comment (a probe sketch follows this list). CA-aware verification is a separate future tranche.
+- `P1:` `FIBERLB-P1-02` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves listener publication, backend disable, drain, restore, and re-convergence artifacts to a dated root. The unresolved next step is extending native BGP/BFD peer interop and hardware VIP ownership into a real-network proof.
+- `P2:` `FIBERLB-P2-01` resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
+- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; publication consumers are `k8shost` and `fleet-scheduler`; real network peers are required.
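+
+A hedged sketch of the scope-fixed HTTPS health probe from `FIBERLB-P1-01`: it proves TCP reachability plus an HTTP status and deliberately skips certificate verification. The use of `reqwest` and the two-second timeout are illustrative assumptions, not the `healthcheck.rs` implementation:
+
+```rust
+use std::time::Duration;
+
+/// Probe an HTTPS backend under the limited contract: reachability and status
+/// only, with no backend TLS certificate verification.
+async fn https_backend_healthy(url: &str) -> Result<bool, reqwest::Error> {
+    let client = reqwest::Client::builder()
+        .danger_accept_invalid_certs(true) // supported limitation: no cert verification
+        .timeout(Duration::from_secs(2))
+        .build()?;
+    Ok(client.get(url).send().await?.status().is_success())
+}
+```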
+
+## plasmavmc
+
+- `Responsibility:` the tenant VM control plane and worker agent. Holds VM lifecycle, image/materialization, worker registration, and hypervisor integration.
+- `Canonical entrypoint:` `nix/modules/plasmavmc.nix`; `plasmavmc/crates/plasmavmc-server/src/main.rs`; the supported public backend is `plasmavmc-kvm`.
+- `Current evidence:` `README.md` states the KVM-only public contract; `docs/testing.md` makes `HYPERVISOR_TYPE_KVM` the canonical proof via `single-node-quickstart`, `fresh-smoke`, and `fresh-matrix`; `vm_service.rs` treats anything other than `HYPERVISOR_TYPE_KVM` as outside the public surface; `volume_manager.rs` carries the `coronafs` / `lightningstor` integration.
+- `Unproven:` migration / storage handoff on real hardware; long-running guest upgrade; recovery under combined network and storage faults.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `PLASMAVMC-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves shared-storage migration, PrismNet-attached post-migration networking, CoronaFS handoff, and post-migration restart state to a dated root. The unresolved next step is adding release proofs for real-hardware migration and storage handoff.
+- `P2:` `PLASMAVMC-P2-01` archived Firecracker / mvisor code remains in-tree, so backflow into the supported surface must stay guarded.
+- `Dependencies:` `iam`, `flaredb`, `prismnet`, optional `chainfire`, `lightningstor`, `coronafs`, and host KVM/QEMU.
+
+## coronafs
+
+- `Responsibility:` the mutable VM volume layer. Manages raw volumes and exports them to workers via `qemu-nbd`.
+- `Canonical entrypoint:` `nix/modules/coronafs.nix`; `coronafs/crates/coronafs-server/src/main.rs`; the product description is `coronafs/README.md`.
+- `Current evidence:` `coronafs/README.md` states the split as the mutable VM-volume layer; `coronafs-server` has `/healthz` and the volume/export APIs; `docs/testing.md` puts `plasmavmc + coronafs + lightningstor` under proof in `fresh-matrix`; `plasmavmc/volume_manager.rs` has deep integration.
+- `Unproven:` long-duration endurance of recovery after export interruption; latency budgets on real disks and real networks.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `CORONAFS-P1-01` resolved on 2026-04-10. The quickstart health URL in `nix/single-node/surface.nix` was corrected to `http://127.0.0.1:50088/healthz`.
+- `P1:` `CORONAFS-P1-02` resolved on 2026-04-10. `durability-proof` has a canonical failure-injection lane that verifies node-local materialized volume reads and the node-only capability split even during a controller outage.
+- `P2:` `CORONAFS-P2-01` storage benchmarks exist, but the recovery path still carries too little weight in the canonical publish gate.
+- `Dependencies:` `qemu-nbd`, `qemu-img`, local disk; optional `chainfire` metadata backend; the primary consumer is `plasmavmc`.
+
+## lightningstor
+
+- `Responsibility:` object storage and VM image backing. Has a metadata plane and a data-node plane.
+- `Canonical entrypoint:` `nix/modules/lightningstor.nix`; `lightningstor/crates/lightningstor-server/src/main.rs`; `lightningstor/crates/lightningstor-node/src/main.rs`; the S3 path is `src/s3/*`.
+- `Current evidence:` `README.md` names bucket versioning / policy / tagging / object version listing as the supported surface; `docs/testing.md` puts bucket metadata and object-version APIs under proof in `fresh-matrix`; the server has S3 auth, the distributed backend, and the repair queue; the module has metadata/data/all-in-one modes.
+- `Unproven:` real-hardware failover of the distributed backend; S3 compatibility breadth; cold-start image distribution on hardware.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `LIGHTNINGSTOR-P1-01` resolved on 2026-04-10. `durability-proof` saves write/head/read during the node05 outage and repair/read-back after service restore as canonical failure-injection artifacts.
+- `P2:` `LIGHTNINGSTOR-P2-01` resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
+- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; optional `lightningstor-node`; consumers are `plasmavmc` and tenant object clients.
+
+## k8shost
+
+- `Responsibility:` the tenant workload API surface. Handles pods/deployments/services and projects them onto `prismnet`, `flashdns`, `fiberlb`, and optional `creditservice`.
+- `Canonical entrypoint:` `nix/modules/k8shost.nix`; `k8shost/crates/k8shost-server/src/main.rs`; the API protobuf is `k8shost/crates/k8shost-proto/proto/k8s.proto`.
+- `Current evidence:` `k8shost/README.md` defines the supported scope; `README.md` names `WatchPods` as a bounded snapshot stream (a minimal stream sketch follows this list); `k8shost-server/src/services/pod.rs` implements `WatchPods` on top of `ReceiverStream`; `docs/testing.md` puts the API contract under proof in `fresh-smoke` / `fresh-matrix`; on 2026-04-10 the docs, guard, and TODO fixed it to the API/control-plane product surface only.
+- `Unproven:` a real workload runtime; a tenant networking dataplane with real CNI/CSI; node-level execution semantics.
+- `P0:` `K8SHOST-P0-01` resolved on 2026-04-10. The real workload dataplane (`k8shost-cni`, `k8shost-controllers`, `lightningstor-csi`) is fixed as archived non-product, and the product narrative is aligned to the API/control-plane scope only.
+- `P1:` `K8SHOST-P1-01` was scope-resolved on 2026-04-10. The fact that the canonical proof centers on the API contract is itself documented as the product boundary, and a real pod runtime was removed from the product claims.
+- `P2:` `K8SHOST-P2-01` resolved on 2026-04-10. The non-canonical status of the archived scaffolds is continuously monitored via `supported-surface-guard` contract markers.
+- `Dependencies:` `iam`, `flaredb`, `chainfire`, `prismnet`, `flashdns`, `fiberlb`, and optional `creditservice`.
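+
+A minimal sketch of the bounded `WatchPods` snapshot-stream shape, assuming an illustrative `PodSummary` type rather than the generated protobuf types:
+
+```rust
+use tokio::sync::mpsc;
+use tokio_stream::wrappers::ReceiverStream;
+
+#[derive(Clone, Debug)]
+struct PodSummary {
+    name: String,
+}
+
+/// Send the current snapshot, then drop the sender; dropping it is what makes
+/// the resulting stream bounded instead of a long-lived watch.
+fn watch_pods_snapshot(snapshot: Vec<PodSummary>) -> ReceiverStream<PodSummary> {
+    let (tx, rx) = mpsc::channel(16);
+    tokio::spawn(async move {
+        for pod in snapshot {
+            if tx.send(pod).await.is_err() {
+                break; // client hung up; stop early
+            }
+        }
+    });
+    ReceiverStream::new(rx)
+}
+```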
+
+## apigateway
+
+- `Responsibility:` the external API/proxy surface. Holds routes, auth providers, credit providers, and request mediation.
+- `Canonical entrypoint:` `nix/modules/apigateway.nix`; `apigateway/crates/apigateway-server/src/main.rs`.
+- `Current evidence:` `node06` starts `apigateway` as the canonical gateway node; `docs/testing.md` and `nix/test-cluster/README.md` include API-gateway-mediated flows in `fresh-matrix`; the server code has routes, auth, credit providers, upstream timeouts, and request IDs.
+- `Unproven:` multi-node HA; config distribution / reload; a TLS termination strategy; gateway-as-product docs.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `APIGW-P1-01` is scope-fixed as of 2026-04-10. The docs fix APIGateway as supported when run stateless and replicated behind external L4/VIP, config distribution as rendered config plus restart-based rollout, and live in-process reload as unsupported. The remaining next step is adding a dedicated multi-gateway HA proof.
+- `P2:` `APIGW-P2-01` the release proof relies mostly indirectly on `node06` and `fresh-matrix`; there is no dedicated smoke gate.
+- `Dependencies:` upstream services; optional `iam` / `creditservice` providers; external clients.
+
+## nightlight
+
+- `Responsibility:` metrics ingestion and query. Has the Prometheus remote_write / query APIs plus gRPC query/admin.
+- `Canonical entrypoint:` `nix/modules/nightlight.nix`; `nightlight/crates/nightlight-server/src/main.rs`; the API protos are `nightlight/crates/nightlight-api/proto/*`.
+- `Current evidence:` `nightlight-server` binds both HTTP and gRPC; it starts on the `node06` gateway node; `docs/testing.md` and `nix/test-cluster/README.md` describe the host-forward proof of the NightLight HTTP surface; there is a local WAL/snapshot/retention loop.
+- `Unproven:` a replicated metrics topology; large retention; sustained remote_write load; tenant isolation.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `NIGHTLIGHT-P1-01` is scope-fixed as of 2026-04-10. The product shape is fixed to a single-node WAL/snapshot service, and the docs and guard reflect that a replicated / HA metrics path is unsupported.
+- `P2:` `NIGHTLIGHT-P2-01` was narrowed on 2026-04-10. The tenant boundary is deployment-scoped or upstream-auth-scoped, and the docs fix that in-process hard multi-tenant auth and per-tenant retention are not part of the current product contract. The next step is adding an auth- or quota-aware multi-tenant proof.
+- `Dependencies:` local disk; optional `apigateway`; external metric writers/readers.
+
+## creditservice
+
+- `Responsibility:` quota, wallets, reservations, and admission control.
+- `Canonical entrypoint:` `nix/modules/creditservice.nix`; `creditservice/crates/creditservice-server/src/main.rs`; the product scope is `creditservice/README.md`.
+- `Current evidence:` `creditservice/README.md` states the supported scope and non-goals; `docs/testing.md` puts the quota/wallet/reservation/API-gateway path under proof in `fresh-matrix`; the module has `iamAddr`, `flaredbAddr`, and an optional SQL backend; it starts on the canonical `node06` gateway node.
+- `Unproven:` backend migration; operating separately from a finance system; the export/reporting path.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `CREDIT-P1-01` the product narrative must be kept from ballooning into a finance ledger beyond the README's non-goals.
+- `P2:` `CREDIT-P2-01` was narrowed on 2026-04-10. Export and backend migration are fixed in the README as offline export/import or backend-native snapshot workflows, and live mixed-writer migration is explicitly unsupported. The next step is adding a dedicated export proof.
+- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; consumers are `apigateway`, `k8shost`, and the tenant admission flow.
+
+## deployer
+
+- `Responsibility:` the bootstrap and rollout-intent authority. Holds `/api/v1/phone-home`, install plans, the desired-system reference, and the cluster inventory.
+- `Canonical entrypoint:` `nix/modules/deployer.nix`; `deployer/crates/deployer-server/src/main.rs`; route wiring is in `deployer/crates/deployer-server/src/lib.rs`.
+- `Current evidence:` `/api/v1/phone-home` exists on the server routes; `nix/modules/deployer.nix` carries the package/service/cluster-state seeds; `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `baremetal-iso`, `baremetal-iso-e2e`, `deployer-vm-smoke`, `deployer-bootstrap-e2e`, `durability-proof`, and `rollout-soak` as the canonical proofs; `verify-baremetal-iso.sh` walks the install path end-to-end; the 2026-04-10 `durability-proof` saved `deployer-pre-register-request.json`, `deployer-backup-list.json`, `deployer-post-restart-list.json`, and `deployer-replayed-list.json`, and `rollout-soak` saved the longer-run live restart and release boundary markers in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/deployer-post-restart-nodes.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `deployer-journal.log`.
+- `Unproven:` real-hardware USB/BMC install; true HA for the deployer itself; an implementation of ChainFire-backed multi-instance active failover; real-hardware verification of operator disaster recovery.
+- `P0:` `DEPLOYER-P0-01` the current canonical bare-metal proof stops at QEMU-as-hardware; there is still no real-hardware regression lane.
+- `P1:` `DEPLOYER-P1-01` is scope-fixed final as of 2026-04-10. The release contract is fixed to one active writer plus optional cold-standby restore with `ultracloud.cluster` state re-apply and preserved admin request replay, and automatic ChainFire-backed multi-instance failover was explicitly moved outside the supported surface. `rollout-soak` saved the live restart proof and boundary markers in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/deployer-post-restart-nodes.json`, `scope-fixed-contract.json`, and `deployer-scope-fixed.txt`. Any future work would treat a true HA implementation as a scope extension in a separate ticket.
+- `P2:` `DEPLOYER-P2-01` the standard production procedure for supplying `bootstrapFlakeBundle` and the optional binary cache is still under-documented.
+- `Dependencies:` `chainfire`; `nix-agent`; `install-target`; the ISO/first-boot path; an optional binary cache.
+
+## fleet-scheduler
+
+- `Responsibility:` the non-Kubernetes native service scheduler. Holds cluster-native service placement, failover, and publication reconciliation.
+- `Canonical entrypoint:` `nix/modules/fleet-scheduler.nix`; `deployer/crates/fleet-scheduler/src/main.rs`; publication code is in `publish.rs`.
+- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `fresh-smoke`, `fresh-matrix`, `fleet-scheduler-e2e`, and `rollout-soak` as the proofs for this boundary; the module has `iamEndpoint`, `fiberlbEndpoint`, `flashdnsEndpoint`, and `heartbeatTimeoutSecs` (a heartbeat-timeout sketch follows this list); the scheduler code has the `chainfire` watch, dependency summaries, and publication reconciliation; `fresh-smoke` exercises `node04 -> draining`, `node05` fail-stop, and replica restore after the worker returns, and `rollout-soak` saved the scope-fixed longer-run proof in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/maintenance-held.json`, `power-loss-held.json`, `fleet-scheduler-post-restart.json`, `scope-fixed-contract.json`, and `fleet-scheduler-scope-fixed.txt`.
+- `Unproven:` large clusters; multi-hour maintenance windows; drain choreography with an operator approval workflow.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `FLEET-P1-01` is scope-fixed final as of 2026-04-10. The release contract is fixed to one planned drain cycle plus one fail-stop worker-loss cycle with 30-second held degraded states on two native-runtime workers, and `rollout-soak` saved that upper bound as live KVM artifacts in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/maintenance-held.json`, `power-loss-held.json`, `fleet-scheduler-post-restart.json`, `scope-fixed-contract.json`, and `fleet-scheduler-scope-fixed.txt`. Multi-hour maintenance windows, pinned singleton policies, operator approval workflows, and larger-cluster drain storms were explicitly moved outside the supported surface.
+- `P2:` `FLEET-P2-01` resolved on 2026-04-10. The module/binary default `chainfireEndpoint` is aligned to the canonical `http://127.0.0.1:2379`.
+- `Dependencies:` `chainfire`; `node-agent`; optional `iam`, `fiberlb`, and `flashdns`.
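+
+A minimal sketch of the fail-stop detection that `heartbeatTimeoutSecs` configures, using illustrative names rather than the scheduler's internal types:
+
+```rust
+use std::time::{Duration, Instant};
+
+/// A worker whose last heartbeat is older than the configured timeout is
+/// treated as fail-stop lost, which is what triggers replica restoration
+/// in the proofs above.
+struct WorkerHeartbeat {
+    last_seen: Instant,
+}
+
+fn is_fail_stop_lost(heartbeat: &WorkerHeartbeat, heartbeat_timeout_secs: u64) -> bool {
+    heartbeat.last_seen.elapsed() > Duration::from_secs(heartbeat_timeout_secs)
+}
+```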
+
+## nix-agent
+
+- `Responsibility:` host-local NixOS convergence only. Builds and applies the desired system and owns health checks and rollback.
+- `Canonical entrypoint:` `nix/modules/nix-agent.nix`; `deployer/crates/nix-agent/src/main.rs`.
+- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `baremetal-iso`, `baremetal-iso-e2e`, `deployer-vm-smoke`, `deployer-vm-rollback`, and `portable-control-plane-regressions` as the proofs; the code has desired-system, observed-system, rollback-on-failure, and health-check-command; `nix/modules/nix-agent.nix` canonically generates that CLI contract; the 2026-04-10 `rollout-soak` saved `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T154744+0900/node01-nix-agent-scope.txt` and `node04-nix-agent-scope.txt`, fixing in artifacts and docs the boundary that the steady-state `test-cluster` does not pretend to restart a live `nix-agent.service`.
+- `Unproven:` rollback under kernel/network failure; multi-node wave rollout; real-hardware recovery after a partial switch.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `NIXAGENT-P1-01` resolved on 2026-04-10. The `healthCheckCommand` argv contract, the `rolled-back` semantics of `rollbackOnFailure`, the `deployer-vm-rollback` proof, and the partial-failure recovery procedure are fixed in `docs/rollout-bundle.md` and `docs/testing.md` (a check-then-rollback sketch follows this list).
+- `P2:` `NIXAGENT-P2-01` resolved on 2026-04-10. The module/binary default `chainfireEndpoint` is aligned to the canonical `http://127.0.0.1:2379`.
+- `Dependencies:` `chainfire`; the desired-system published by the deployer; the local NixOS flake / switch-to-configuration.
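+
+A hedged sketch of the check-then-rollback shape behind `healthCheckCommand` / `rollbackOnFailure`; the function and the direct `switch-to-configuration` call are illustrative, not the `nix-agent` implementation:
+
+```rust
+use std::process::Command;
+
+/// Run the configured health-check argv after switching; on failure, switch
+/// back to the previous system closure and report `rolled-back` instead of
+/// `active`.
+fn converge_with_rollback(health_check_argv: &[String], previous_system: &str) -> &'static str {
+    let healthy = health_check_argv
+        .split_first()
+        .and_then(|(cmd, args)| Command::new(cmd).args(args).status().ok())
+        .map(|status| status.success())
+        .unwrap_or(false);
+    if healthy {
+        return "active";
+    }
+    // Best-effort switch back to the previously active system closure.
+    let _ = Command::new(format!("{previous_system}/bin/switch-to-configuration"))
+        .arg("switch")
+        .status();
+    "rolled-back"
+}
+```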
+
+## node-agent
+
+- `Responsibility:` host-local runtime reconciliation only. Owns native service instance heartbeats, process/container execution, and local observed state.
+- `Canonical entrypoint:` `nix/modules/node-agent.nix`; `deployer/crates/node-agent/src/main.rs`.
+- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `fresh-smoke`, `fresh-matrix`, `fleet-scheduler-e2e`, and `portable-control-plane-regressions` as the proofs; the code has `watcher`, `agent`, and `process`; the module has Podman enable, stateDir, pidDir, and `allowLocalInstanceUpsert`; `process.rs` implements the `${stateDir}/pids/*.log` and `${stateDir}/pids/*.meta.json` contract (a path-layout sketch follows this list).
+- `Unproven:` heterogeneous runtime support; fine-grained SLOs for crash-looping host services; the secret-rotation workflow itself.
+- `P0:` no show-stopping file-level breakage detected in the current static survey.
+- `P1:` `NODEAGENT-P1-01` resolved on 2026-04-10. The logs / secrets / volume / upgrade contract is fixed in `docs/rollout-bundle.md` and the module description.
+- `P2:` `NODEAGENT-P2-01` resolved on 2026-04-10. The module/binary default `chainfireEndpoint` is aligned to the canonical `http://127.0.0.1:2379`.
+- `Dependencies:` `chainfire`; `fleet-scheduler`; optional Podman; the host systemd/process model.
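+
+A small sketch of the documented per-instance file layout under `${stateDir}/pids`; `instance_paths` is an illustrative helper, not the `process.rs` API:
+
+```rust
+use std::path::{Path, PathBuf};
+
+/// Per-instance files under the product contract:
+/// `${stateDir}/pids/<instance>.log` for output and
+/// `${stateDir}/pids/<instance>.meta.json` for stale-pid metadata.
+fn instance_paths(state_dir: &Path, instance: &str) -> (PathBuf, PathBuf) {
+    let pids = state_dir.join("pids");
+    (
+        pids.join(format!("{instance}.log")),
+        pids.join(format!("{instance}.meta.json")),
+    )
+}
+```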
+
+## Nix/bootstrap/harness
+
+- `Responsibility:` defines the product surface and canonicalizes the NixOS outputs and the VM/QEMU harness for `single-node dev`, `3-node HA control plane`, and `bare-metal bootstrap`.
+- `Canonical entrypoint:` `flake.nix`; `nix/modules/default.nix`; `nix/single-node/base.nix`; `nix/test-cluster/run-publishable-kvm-suite.sh`; `nix/test-cluster/run-local-baseline.sh`; `nix/test-cluster/verify-baremetal-iso.sh`; `nix/nodes/baremetal-qemu/*`.
+- `Current evidence:` `flake.nix` contains `single-node-quickstart`, `single-node-trial-vm`, `canonical-profile-eval-guards`, `portable-control-plane-regressions`, and `baremetal-iso-e2e`; `nix/modules/default.nix` imports the current module surface in one place; `nix/single-node/base.nix` composes the minimal VM platform core and the optional bundles; `run-publishable-kvm-suite.sh` and `run-local-baseline.sh` pin local CPU parallelism and local-only builders; `verify-baremetal-iso.sh` walks ISO -> phone-home -> bundle fetch -> Disko -> reboot -> `nix-agent active`; `run-cluster.sh` gained `durability-proof` and `rollout-soak`, saving `chainfire`, `flaredb`, `deployer`, `coronafs`, and `lightningstor` backup/restore and failure-injection artifacts under `/work/durability-proof` and the longer-run rollout/control-plane maintenance artifacts under `/work/rollout-soak`; on the 2026-04-10 local AMD/KVM baseline, the six required checks plus `single-node-quickstart`, `baremetal-iso`, and `fresh-smoke` all passed.
+- `Unproven:` real-hardware USB/BMC install; an automatic guard for `/nix/store` capacity control; a release proof of a quickstart with every optional bundle enabled; a non-Nix easy-trial artifact.
+- `P0:` `HARNESS-P0-01` there is still no real-hardware regression lane; the canonical bare-metal proof remains a QEMU stand-in.
+- `P1:` `HARNESS-P1-01` resolved on 2026-04-10. Health gating of the quickstart optional bundles is aligned to TCP probes for `lightningstor`, `flashdns`, and `fiberlb` plus `50088/healthz` for `coronafs`.
+- `P1:` `HARNESS-P1-02` is scope-fixed as of 2026-04-10. Easy trial is served by the Nix VM appliance via `single-node-trial-vm`, and the reasoning for not supporting a lighter Docker/OCI-style trial path is aligned across `docs/edge-trial-surface.md`, `README.md`, `docs/testing.md`, `docs/component-matrix.md`, `nix/single-node/surface.nix`, and `supported-surface-guard`.
+- `P1:` `HARNESS-P1-03` resolved on 2026-04-10. The `fresh-smoke` stale VM cleanup is restricted to PIDs under the current profile's `vm_dir` / `vde_switch_dir`, so same-named cluster VMs from other checkouts are no longer swept up.
+- `P2:` `HARNESS-P2-01` resolved on 2026-04-10. In addition to the `./work` and local-builder parallelism controls, `./nix/test-cluster/work-root-budget.sh` now has `enforce` and `prune-proof-logs` alongside `status`, providing a stronger local budget gate and a safer dated-proof cleanup workflow rather than only a disk-budget advisory.
+- `Dependencies:` `nix`, `nixpkgs`, QEMU/KVM, host disk under `./work`, local CPU parallelism, and the full set of component modules.
+
+## Notes For The Next Implementation Agent
+
+- Tackling `DEPLOYER-P0-01` / `HARNESS-P0-01` first reduces the remaining hardware-proof and real-hardware operator-path work at low cost.
+- To reproduce the baseline, use `nix/test-cluster/run-local-baseline.sh`; it reruns the same path while keeping local-only builders and logs under `./work` fixed.
+- After that, advancing `DEPLOYER-P0-01` / `HARNESS-P0-01` to a real-hardware smoke moves the project from QEMU-only to the hardware path.
+- `DEPLOYER-P1-01` and `FLEET-P1-01` are now scope-fixed final. If they are reopened, treat true deployer HA and larger-cluster scheduler maintenance proofs as a separate tranche that extends the current release boundary.
+- `FIBERLB-P1-01` is scope-fixed, but productizing backend certificate verification later will require rewriting the limited contract in the docs and the guard.
diff --git a/apigateway/README.md b/apigateway/README.md
new file mode 100644
index 0000000..e297321
--- /dev/null
+++ b/apigateway/README.md
@@ -0,0 +1,17 @@
+# APIGateway
+
+`apigateway` is UltraCloud's supported external API and proxy surface for auth-aware and credit-aware upstream traffic.
+
+## Supported product shape
+
+APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer; live in-process reload is not part of the product contract.
+
+- Config distribution is restart-based. Render routes, auth providers, and credit providers from Nix or generated cluster state, then replace or restart the process.
+- Scale-out is supported by running multiple identical instances behind FiberLB or another L4 or VIP distribution layer.
+- The release-facing proof remains `nix run ./nix/test-cluster#cluster -- fresh-matrix`, which validates the shipped single gateway-node composition on `node06`.
+
+## Explicit non-goals
+
+- hot route reload through an admin API or `SIGHUP`
+- in-process config gossip or leader election between gateway replicas
+- a claim that every HA layout is directly release-proven in the current harness
diff --git a/apigateway/crates/apigateway-server/src/main.rs b/apigateway/crates/apigateway-server/src/main.rs
index 3905c40..b233d50 100644
--- a/apigateway/crates/apigateway-server/src/main.rs
+++ b/apigateway/crates/apigateway-server/src/main.rs
@@ -366,7 +366,10 @@ async fn main() -> Result<(), Box> {
.init();
if used_default_config {
- info!("Config file not found: {}, using defaults", args.config.display());
+ info!(
+ "Config file not found: {}, using defaults",
+ args.config.display()
+ );
}
let routes = build_routes(config.routes)?;
@@ -412,7 +415,11 @@ async fn main() -> Result<(), Box> {
.with_state(state);
let listener = tokio::net::TcpListener::bind(config.http_addr).await?;
- axum::serve(listener, app.into_make_service_with_connect_info::()).await?;
+ axum::serve(
+ listener,
+ app.into_make_service_with_connect_info::(),
+ )
+ .await?;
Ok(())
}
@@ -426,7 +433,13 @@ async fn health() -> Json {
}
async fn list_routes(State(state): State>) -> Json> {
- Json(state.routes.iter().map(|route| route.config.clone()).collect())
+ Json(
+ state
+ .routes
+ .iter()
+ .map(|route| route.config.clone())
+ .collect(),
+ )
}
async fn proxy(
@@ -463,8 +476,12 @@ async fn proxy(
let target_url = build_upstream_url(&route, request.uri())?;
- let request_timeout =
- Duration::from_millis(route.config.timeout_ms.unwrap_or(state.upstream_timeout.as_millis() as u64));
+ let request_timeout = Duration::from_millis(
+ route
+ .config
+ .timeout_ms
+ .unwrap_or(state.upstream_timeout.as_millis() as u64),
+ );
let mut builder = state
.client
.request(request.method().clone(), target_url)
@@ -630,13 +647,12 @@ async fn enforce_credit(
credit_subject.as_ref().expect("credit subject resolved"),
)
.await;
- apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision)
- .map(|decision| {
- decision.map(|decision| CreditReservation {
- provider: credit_cfg.provider.clone(),
- reservation_id: decision.reservation_id,
- })
+ apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision).map(|decision| {
+ decision.map(|decision| CreditReservation {
+ provider: credit_cfg.provider.clone(),
+ reservation_id: decision.reservation_id,
})
+ })
}
fn apply_credit_mode(
@@ -837,13 +853,19 @@ async fn finalize_credit(
CommitPolicy::Never => return,
CommitPolicy::Always => {
if let Err(err) = commit_credit(state, credit_cfg, &reservation).await {
- warn!("Failed to commit credit reservation {}: {}", reservation.reservation_id, err);
+ warn!(
+ "Failed to commit credit reservation {}: {}",
+ reservation.reservation_id, err
+ );
}
}
CommitPolicy::Success => {
if status.is_success() || status.is_redirection() {
if let Err(err) = commit_credit(state, credit_cfg, &reservation).await {
- warn!("Failed to commit credit reservation {}: {}", reservation.reservation_id, err);
+ warn!(
+ "Failed to commit credit reservation {}: {}",
+ reservation.reservation_id, err
+ );
}
} else if let Err(err) = rollback_credit(state, credit_cfg, &reservation).await {
warn!(
@@ -1010,11 +1032,9 @@ async fn build_auth_providers(
for config in configs {
let provider_type = normalize_name(&config.provider_type);
if providers.contains_key(&config.name) {
- return Err(config_error(format!(
- "duplicate auth provider name {}",
- config.name
- ))
- .into());
+ return Err(
+ config_error(format!("duplicate auth provider name {}", config.name)).into(),
+ );
}
match provider_type.as_str() {
@@ -1034,10 +1054,7 @@ async fn build_auth_providers(
Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS));
providers.insert(
config.name.clone(),
- AuthProvider::Grpc(GrpcAuthProvider {
- channel,
- timeout,
- }),
+ AuthProvider::Grpc(GrpcAuthProvider { channel, timeout }),
);
}
_ => {
@@ -1061,25 +1078,19 @@ async fn build_credit_providers(
for config in configs {
let provider_type = normalize_name(&config.provider_type);
if providers.contains_key(&config.name) {
- return Err(config_error(format!(
- "duplicate credit provider name {}",
- config.name
- ))
- .into());
+ return Err(
+ config_error(format!("duplicate credit provider name {}", config.name)).into(),
+ );
}
match provider_type.as_str() {
"grpc" => {
let mut endpoint = Endpoint::from_shared(config.endpoint.clone())?
.connect_timeout(Duration::from_millis(
- config
- .timeout_ms
- .unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
+ config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
))
.timeout(Duration::from_millis(
- config
- .timeout_ms
- .unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
+ config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
));
if let Some(tls) = build_client_tls_config(&config.tls).await? {
@@ -1087,17 +1098,11 @@ async fn build_credit_providers(
}
let channel = endpoint.connect().await?;
- let timeout = Duration::from_millis(
- config
- .timeout_ms
- .unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
- );
+ let timeout =
+ Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS));
providers.insert(
config.name.clone(),
- CreditProvider::Grpc(GrpcCreditProvider {
- channel,
- timeout,
- }),
+ CreditProvider::Grpc(GrpcCreditProvider { channel, timeout }),
);
}
_ => {
@@ -1132,11 +1137,9 @@ fn build_routes(configs: Vec) -> Result, Box String {
}
if path == "/" {
let trimmed = base.trim_end_matches('/');
- return if trimmed.is_empty() { "/".to_string() } else { trimmed.to_string() };
+ return if trimmed.is_empty() {
+ "/".to_string()
+ } else {
+ trimmed.to_string()
+ };
}
format!(
@@ -1385,9 +1392,9 @@ fn build_upstream_url(route: &Route, uri: &Uri) -> Result {
#[cfg(test)]
mod tests {
use super::*;
+ use apigateway_api::GatewayCreditServiceServer;
use axum::routing::get;
use creditservice_api::{CreditServiceImpl, CreditStorage, GatewayCreditServiceImpl};
- use apigateway_api::GatewayCreditServiceServer;
use creditservice_types::Wallet;
use iam_api::{GatewayAuthServiceImpl, GatewayAuthServiceServer};
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
@@ -1470,7 +1477,11 @@ mod tests {
}
async fn start_iam_gateway() -> (SocketAddr, String) {
- let backend = Arc::new(Backend::new(BackendConfig::Memory).await.expect("iam backend"));
+ let backend = Arc::new(
+ Backend::new(BackendConfig::Memory)
+ .await
+ .expect("iam backend"),
+ );
let principal_store = Arc::new(PrincipalStore::new(backend.clone()));
let role_store = Arc::new(RoleStore::new(backend.clone()));
let binding_store = Arc::new(BindingStore::new(backend.clone()));
@@ -1516,12 +1527,8 @@ mod tests {
role_store.clone(),
cache,
));
- let gateway_auth = GatewayAuthServiceImpl::new(
- token_service,
- principal_store,
- token_store,
- evaluator,
- );
+ let gateway_auth =
+ GatewayAuthServiceImpl::new(token_service, principal_store, token_store, evaluator);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0")
.await
@@ -1542,10 +1549,7 @@ mod tests {
async fn start_credit_gateway(iam_addr: &SocketAddr) -> SocketAddr {
let storage = creditservice_api::InMemoryStorage::new();
let wallet = Wallet::new("proj-1".into(), "org-1".into(), 100);
- storage
- .create_wallet(wallet)
- .await
- .expect("wallet create");
+ storage.create_wallet(wallet).await.expect("wallet create");
let auth_service = Arc::new(
iam_service_auth::AuthService::new(&format!("http://{}", iam_addr))
@@ -1636,7 +1640,10 @@ mod tests {
let route = routes.first().unwrap();
let uri: Uri = "/api/v1/users?debug=true".parse().unwrap();
let url = build_upstream_url(route, &uri).unwrap();
- assert_eq!(url.as_str(), "http://example.com/base/api/v1/users?debug=true");
+ assert_eq!(
+ url.as_str(),
+ "http://example.com/base/api/v1/users?debug=true"
+ );
}
#[test]
@@ -1671,7 +1678,8 @@ mod tests {
let outcome = apply_auth_mode(PolicyMode::Optional, false, decision).unwrap();
assert!(outcome.subject.is_none());
- let outcome = apply_auth_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
+ let outcome =
+ apply_auth_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
assert!(outcome.subject.is_none());
}
@@ -1692,7 +1700,8 @@ mod tests {
let outcome = apply_credit_mode(PolicyMode::Optional, false, decision).unwrap();
assert!(outcome.is_none());
- let outcome = apply_credit_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
+ let outcome =
+ apply_credit_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
assert!(outcome.is_none());
}
@@ -1783,7 +1792,8 @@ mod tests {
Err(status) => panic!("unexpected proxy status: {}", status),
}
}
- let response = response.expect("gateway auth+credit test timed out waiting for ready backends");
+ let response =
+ response.expect("gateway auth+credit test timed out waiting for ready backends");
assert_eq!(response.status(), StatusCode::OK);
}
@@ -1812,7 +1822,10 @@ mod tests {
let request = Request::builder()
.method("GET")
.uri("/v1/echo-auth")
- .header(axum::http::header::AUTHORIZATION, "Bearer passthrough-token")
+ .header(
+ axum::http::header::AUTHORIZATION,
+ "Bearer passthrough-token",
+ )
.header(PHOTON_AUTH_TOKEN_HEADER, "photon-token")
.body(Body::empty())
.expect("request build");
@@ -1828,8 +1841,14 @@ mod tests {
let body = to_bytes(response.into_body(), 1024 * 1024).await.unwrap();
let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
- assert_eq!(json.get("authorization").and_then(|v| v.as_str()), Some("Bearer passthrough-token"));
- assert_eq!(json.get("photon_token").and_then(|v| v.as_str()), Some("photon-token"));
+ assert_eq!(
+ json.get("authorization").and_then(|v| v.as_str()),
+ Some("Bearer passthrough-token")
+ );
+ assert_eq!(
+ json.get("photon_token").and_then(|v| v.as_str()),
+ Some("photon-token")
+ );
}
#[test]
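A minimal standalone sketch of the commit-policy decision exercised by the finalize_credit hunk above, assuming nothing beyond what the diff shows: the CommitPolicy name and the Never/Always/Success semantics come from the patch, while CommitAction and decide are illustrative stand-ins rather than gateway symbols, and upstream_ok corresponds to status.is_success() || status.is_redirection().

    // Sketch only: CommitAction and decide are hypothetical names, not gateway code.
    #[derive(Debug, Clone, Copy, PartialEq)]
    enum CommitPolicy {
        Never,
        Always,
        Success,
    }

    #[derive(Debug, PartialEq)]
    enum CommitAction {
        Skip,
        Commit,
        Rollback,
    }

    // Never skips finalization, Always commits unconditionally, and Success
    // commits only when the upstream answered 2xx/3xx, rolling back otherwise.
    fn decide(policy: CommitPolicy, upstream_ok: bool) -> CommitAction {
        match policy {
            CommitPolicy::Never => CommitAction::Skip,
            CommitPolicy::Always => CommitAction::Commit,
            CommitPolicy::Success if upstream_ok => CommitAction::Commit,
            CommitPolicy::Success => CommitAction::Rollback,
        }
    }

    fn main() {
        assert_eq!(decide(CommitPolicy::Success, false), CommitAction::Rollback);
        assert_eq!(decide(CommitPolicy::Always, false), CommitAction::Commit);
    }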
diff --git a/chainfire/Cargo.lock b/chainfire/Cargo.lock
index dfd5a17..4ac4082 100644
--- a/chainfire/Cargo.lock
+++ b/chainfire/Cargo.lock
@@ -388,18 +388,7 @@ dependencies = [
name = "chainfire-core"
version = "0.1.0"
dependencies = [
- "async-trait",
- "bytes",
- "chainfire-gossip",
- "chainfire-types",
- "dashmap",
- "futures",
- "parking_lot",
- "tempfile",
"thiserror 1.0.69",
- "tokio",
- "tokio-stream",
- "tracing",
]
[[package]]
diff --git a/chainfire/baremetal/pxe-server/assets/.gitkeep b/chainfire/baremetal/pxe-server/assets/.gitkeep
index b27a78c..352bf44 100644
--- a/chainfire/baremetal/pxe-server/assets/.gitkeep
+++ b/chainfire/baremetal/pxe-server/assets/.gitkeep
@@ -1,4 +1,4 @@
-# This directory is a placeholder for runtime assets
+# This directory is reserved for runtime assets
#
# Actual boot assets will be created at: /var/lib/pxe-boot/
# when the PXE server is deployed.
diff --git a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
index 397b6ea..6db3a62 100644
--- a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
+++ b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
@@ -190,8 +190,8 @@ set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8
# set kernel-params ${kernel-params} systemd.log_level=debug
echo Loading NixOS kernel...
-# NOTE: These paths will be populated by the S3 image builder (T032.S3)
-# For now, they point to placeholder paths that need to be updated
+# NOTE: These paths are populated by the S3 image builder (T032.S3)
+# and must resolve to the generated kernel/initrd objects at deploy time.
kernel ${nixos-url}/bzImage ${kernel-params} || goto failed
echo Loading NixOS initrd...
diff --git a/chainfire/chainfire-client/src/client.rs b/chainfire/chainfire-client/src/client.rs
index 55f0d08..edc2d83 100644
--- a/chainfire/chainfire-client/src/client.rs
+++ b/chainfire/chainfire-client/src/client.rs
@@ -4,8 +4,8 @@ use crate::error::{ClientError, Result};
use crate::watch::WatchHandle;
use chainfire_proto::proto::{
cluster_client::ClusterClient, compare, kv_client::KvClient, request_op, response_op,
- watch_client::WatchClient, Compare, DeleteRangeRequest, MemberAddRequest, PutRequest,
- RangeRequest, RequestOp, StatusRequest, TxnRequest,
+ watch_client::WatchClient, Compare, DeleteRangeRequest, PutRequest, RangeRequest, RequestOp,
+ StatusRequest, TxnRequest,
};
use std::time::Duration;
use tonic::transport::Channel;
@@ -616,53 +616,6 @@ impl Client {
raft_term: resp.raft_term,
})
}
-
- /// Add a member to the cluster
- ///
- /// # Arguments
- /// * `peer_url` - The Raft address of the new member (e.g., "127.0.0.1:2380")
- /// * `is_learner` - Whether to add as learner (true) or voter (false)
- ///
- /// # Returns
- /// The node ID of the added member
- pub async fn member_add(
- &mut self,
- node_id: u64,
-        peer_url: impl AsRef<str>,
- is_learner: bool,
-    ) -> Result<u64> {
- let peer_url = peer_url.as_ref().to_string();
- let resp = self
- .with_cluster_retry(|mut cluster| {
- let peer_url = peer_url.clone();
- async move {
- cluster
- .member_add(MemberAddRequest {
- node_id,
- peer_urls: vec![peer_url],
- is_learner,
- })
- .await
- .map(|resp| resp.into_inner())
- }
- })
- .await?;
-
- // Extract the member ID from the response
- let member_id = resp
- .member
- .map(|m| m.id)
- .ok_or_else(|| ClientError::Internal("No member in response".to_string()))?;
-
- debug!(
- member_id = member_id,
- peer_url = peer_url.as_str(),
- is_learner = is_learner,
- "Added member to cluster"
- );
-
- Ok(member_id)
- }
}
/// Cluster status
diff --git a/chainfire/chainfire-client/src/watch.rs b/chainfire/chainfire-client/src/watch.rs
index d1a338f..43d91d8 100644
--- a/chainfire/chainfire-client/src/watch.rs
+++ b/chainfire/chainfire-client/src/watch.rs
@@ -136,9 +136,10 @@ fn convert_event(event: Event) -> WatchEvent {
EventType::Delete
};
- let (key, value, revision) = event.kv.map(|kv| {
- (kv.key, kv.value, kv.mod_revision as u64)
- }).unwrap_or_default();
+ let (key, value, revision) = event
+ .kv
+ .map(|kv| (kv.key, kv.value, kv.mod_revision as u64))
+ .unwrap_or_default();
WatchEvent {
event_type,
diff --git a/chainfire/crates/chainfire-api/build.rs b/chainfire/crates/chainfire-api/build.rs
index 1b77c9b..9550202 100644
--- a/chainfire/crates/chainfire-api/build.rs
+++ b/chainfire/crates/chainfire-api/build.rs
@@ -4,10 +4,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.build_server(true)
.build_client(true)
.compile_protos(
- &[
- "../../proto/chainfire.proto",
- "../../proto/internal.proto",
- ],
+ &["../../proto/chainfire.proto", "../../proto/internal.proto"],
&["../../proto"],
)?;
diff --git a/chainfire/crates/chainfire-api/src/cluster_service.rs b/chainfire/crates/chainfire-api/src/cluster_service.rs
index 9f83685..504294c 100644
--- a/chainfire/crates/chainfire-api/src/cluster_service.rs
+++ b/chainfire/crates/chainfire-api/src/cluster_service.rs
@@ -1,30 +1,22 @@
//! Cluster management service implementation
//!
//! This service handles cluster operations and status queries.
-//!
-//! NOTE: Custom RaftCore does not yet support dynamic membership changes.
-//! Member add/remove operations are disabled for now.
+//! The supported surface reports the fixed membership that the node booted with.
use crate::conversions::make_header;
use crate::proto::{
- cluster_server::Cluster, GetSnapshotRequest, GetSnapshotResponse, Member, MemberAddRequest,
- MemberAddResponse, MemberListRequest, MemberListResponse, MemberRemoveRequest,
- MemberRemoveResponse, SnapshotMeta, StatusRequest, StatusResponse, TransferSnapshotRequest,
- TransferSnapshotResponse,
+ cluster_server::Cluster, Member, MemberListRequest, MemberListResponse, StatusRequest,
+ StatusResponse,
};
use chainfire_raft::core::RaftCore;
use std::sync::Arc;
-use tokio::sync::mpsc;
-use tokio_stream::wrappers::ReceiverStream;
use tonic::{Request, Response, Status};
-use tracing::{debug, info, warn};
+use tracing::debug;
/// Cluster service implementation
pub struct ClusterServiceImpl {
/// Raft core
    raft: Arc<RaftCore>,
- /// gRPC Raft client for managing node addresses
-    rpc_client: Arc<GrpcRaftClient>,
/// Cluster ID
cluster_id: u64,
/// Configured members with client and peer URLs
@@ -37,13 +29,11 @@ impl ClusterServiceImpl {
/// Create a new cluster service
pub fn new(
        raft: Arc<RaftCore>,
-        rpc_client: Arc<GrpcRaftClient>,
cluster_id: u64,
members: Vec,
) -> Self {
Self {
raft,
- rpc_client,
cluster_id,
members,
version: env!("CARGO_PKG_VERSION").to_string(),
@@ -55,8 +45,7 @@ impl ClusterServiceImpl {
}
/// Get current members as proto Member list
- /// NOTE: Custom RaftCore doesn't track membership dynamically yet, so this returns
- /// the configured static membership that the server was booted with.
+ /// Return the configured static membership that the server was booted with.
    async fn get_member_list(&self) -> Vec<Member> {
if self.members.is_empty() {
return vec![Member {
@@ -73,35 +62,6 @@ impl ClusterServiceImpl {
#[tonic::async_trait]
impl Cluster for ClusterServiceImpl {
- async fn member_add(
- &self,
-        request: Request<MemberAddRequest>,
-    ) -> Result<Response<MemberAddResponse>, Status> {
- let req = request.into_inner();
- debug!(node_id = req.node_id, peer_urls = ?req.peer_urls, is_learner = req.is_learner, "Member add request");
-
- // Custom RaftCore doesn't support dynamic membership changes yet
- warn!("Member add not supported in custom Raft implementation");
- Err(Status::unimplemented(
- "Dynamic membership changes not supported in custom Raft implementation. \
- All cluster members must be configured at startup via initial_members."
- ))
- }
-
- async fn member_remove(
- &self,
-        request: Request<MemberRemoveRequest>,
-    ) -> Result<Response<MemberRemoveResponse>, Status> {
- let req = request.into_inner();
- debug!(member_id = req.id, "Member remove request");
-
- // Custom RaftCore doesn't support dynamic membership changes yet
- warn!("Member remove not supported in custom Raft implementation");
- Err(Status::unimplemented(
- "Dynamic membership changes not supported in custom Raft implementation"
- ))
- }
-
async fn member_list(
&self,
        _request: Request<MemberListRequest>,
@@ -128,104 +88,11 @@ impl Cluster for ClusterServiceImpl {
Ok(Response::new(StatusResponse {
header: Some(self.make_header(last_applied)),
version: self.version.clone(),
- db_size: 0, // TODO: get actual RocksDB size
+ db_size: 0,
leader: leader.unwrap_or(0),
raft_index: commit_index,
raft_term: term,
raft_applied_index: last_applied,
}))
}
-
- /// Transfer snapshot to a target node for pre-seeding (T041 Option C)
- ///
- /// This is a workaround for OpenRaft 0.9.x learner replication bug.
- /// By pre-seeding learners with a snapshot, we avoid the assertion failure
- /// during log replication.
- ///
- /// TODO(T041.S5): Full implementation pending - currently returns placeholder
- async fn transfer_snapshot(
- &self,
-        request: Request<TransferSnapshotRequest>,
-    ) -> Result<Response<TransferSnapshotResponse>, Status> {
- let req = request.into_inner();
- info!(
- target_node_id = req.target_node_id,
- target_addr = %req.target_addr,
- "Snapshot transfer request (T041 Option C)"
- );
-
- // Get current state from state machine
- let sm = self.raft.state_machine();
- let revision = sm.current_revision();
- let term = self.raft.current_term().await;
- let membership = self.raft.membership().await;
-
- let meta = SnapshotMeta {
- last_log_index: revision,
- last_log_term: term,
- membership: membership.clone(),
- size: 0, // Will be set when full impl is done
- };
-
- // TODO(T041.S5): Implement full snapshot transfer
- // 1. Serialize KV data using chainfire_storage::snapshot::SnapshotBuilder
- // 2. Stream snapshot to target via InstallSnapshot RPC
- // 3. Wait for target to apply snapshot
- //
- // For now, return success placeholder - the actual workaround can use
- // data directory copy (Option C1) until this API is complete.
-
- warn!(
- target = %req.target_addr,
- "TransferSnapshot not yet fully implemented - use data dir copy workaround"
- );
-
- Ok(Response::new(TransferSnapshotResponse {
- header: Some(self.make_header(revision)),
- success: false,
- error: "TransferSnapshot API not yet implemented - use data directory copy".to_string(),
- meta: Some(meta),
- }))
- }
-
-    type GetSnapshotStream = ReceiverStream<Result<GetSnapshotResponse, Status>>;
-
- /// Get snapshot from this node as a stream of chunks
- ///
- /// TODO(T041.S5): Full implementation pending - currently returns empty snapshot
- async fn get_snapshot(
- &self,
-        _request: Request<GetSnapshotRequest>,
-    ) -> Result<Response<Self::GetSnapshotStream>, Status> {
- debug!("Get snapshot request (T041 Option C)");
-
- // Get current state from state machine
- let sm = self.raft.state_machine();
- let revision = sm.current_revision();
- let term = self.raft.current_term().await;
- let membership = self.raft.membership().await;
-
- let meta = SnapshotMeta {
- last_log_index: revision,
- last_log_term: term,
- membership,
- size: 0,
- };
-
- // Create channel for streaming response
- let (tx, rx) = mpsc::channel(4);
-
- // TODO(T041.S5): Stream actual KV data
- // For now, just send metadata with empty data
- tokio::spawn(async move {
- let response = GetSnapshotResponse {
- meta: Some(meta),
- chunk: vec![],
- done: true,
- };
- let _ = tx.send(Ok(response)).await;
- });
-
- Ok(Response::new(ReceiverStream::new(rx)))
- }
}
diff --git a/chainfire/crates/chainfire-api/src/internal_service.rs b/chainfire/crates/chainfire-api/src/internal_service.rs
index d6e26e0..ab77877 100644
--- a/chainfire/crates/chainfire-api/src/internal_service.rs
+++ b/chainfire/crates/chainfire-api/src/internal_service.rs
@@ -4,22 +4,17 @@
//! It bridges the gRPC layer with the custom Raft implementation.
use crate::internal_proto::{
- raft_service_server::RaftService,
- AppendEntriesRequest as ProtoAppendEntriesRequest,
- AppendEntriesResponse as ProtoAppendEntriesResponse,
- InstallSnapshotRequest, InstallSnapshotResponse,
- VoteRequest as ProtoVoteRequest,
+ raft_service_server::RaftService, AppendEntriesRequest as ProtoAppendEntriesRequest,
+ AppendEntriesResponse as ProtoAppendEntriesResponse, VoteRequest as ProtoVoteRequest,
VoteResponse as ProtoVoteResponse,
};
-use chainfire_raft::core::{
- RaftCore, VoteRequest, AppendEntriesRequest,
-};
-use chainfire_storage::{LogId, LogEntry as RaftLogEntry, EntryPayload};
+use chainfire_raft::core::{AppendEntriesRequest, RaftCore, VoteRequest};
+use chainfire_storage::{EntryPayload, LogEntry as RaftLogEntry, LogId};
use chainfire_types::command::RaftCommand;
use std::sync::Arc;
use tokio::sync::oneshot;
-use tonic::{Request, Response, Status, Streaming};
-use tracing::{debug, info, trace, warn};
+use tonic::{Request, Response, Status};
+use tracing::{info, trace, warn};
/// Internal Raft RPC service implementation
///
@@ -67,7 +62,11 @@ impl RaftService for RaftServiceImpl {
Status::internal("Vote request failed: channel closed")
})?;
- trace!(term = resp.term, granted = resp.vote_granted, "Vote response");
+ trace!(
+ term = resp.term,
+ granted = resp.vote_granted,
+ "Vote response"
+ );
Ok(Response::new(ProtoVoteResponse {
term: resp.term,
vote_granted: resp.vote_granted,
@@ -141,22 +140,4 @@ impl RaftService for RaftServiceImpl {
}))
}
- async fn install_snapshot(
- &self,
-        request: Request<Streaming<InstallSnapshotRequest>>,
-    ) -> Result<Response<InstallSnapshotResponse>, Status> {
- let mut stream = request.into_inner();
- debug!("InstallSnapshot stream started");
-
- // Collect all chunks (for compatibility)
- while let Some(chunk) = stream.message().await? {
- if chunk.done {
- break;
- }
- }
-
- // Custom Raft doesn't support snapshots yet
- warn!("InstallSnapshot not supported in custom Raft implementation");
- Err(Status::unimplemented("Snapshots not supported in custom Raft implementation"))
- }
}
diff --git a/chainfire/crates/chainfire-api/src/kv_service.rs b/chainfire/crates/chainfire-api/src/kv_service.rs
index ff8d718..63d1b00 100644
--- a/chainfire/crates/chainfire-api/src/kv_service.rs
+++ b/chainfire/crates/chainfire-api/src/kv_service.rs
@@ -45,7 +45,9 @@ impl Kv for KvServiceImpl {
// NOTE: Custom RaftCore doesn't yet support linearizable_read() method
// For now, just warn if non-serializable read is requested
if !req.serializable {
- warn!("Linearizable reads not yet supported in custom Raft, performing serializable read");
+ warn!(
+ "Linearizable reads not yet supported in custom Raft, performing serializable read"
+ );
}
// Get state machine from Raft core
@@ -84,7 +86,11 @@ impl Kv for KvServiceImpl {
let command = RaftCommand::Put {
key: req.key,
value: req.value,
- lease_id: if req.lease != 0 { Some(req.lease) } else { None },
+ lease_id: if req.lease != 0 {
+ Some(req.lease)
+ } else {
+ None
+ },
prev_kv: req.prev_kv,
};
@@ -115,19 +121,25 @@ impl Kv for KvServiceImpl {
let req = request.into_inner();
debug!(key = ?String::from_utf8_lossy(&req.key), "Delete request");
- // Workaround: Pre-check key existence to determine deleted count
- // TODO: Replace with proper RaftResponse.deleted once client_write returns full response
+ // Pre-check key existence because the current client_write path does not
+ // return a delete count in the write response.
let sm = self.raft.state_machine();
let deleted_count = if req.range_end.is_empty() {
// Single key delete - check if exists
- let exists = sm.kv()
+ let exists = sm
+ .kv()
.get(&req.key)
.map_err(|e| Status::internal(e.to_string()))?
.is_some();
- if exists { 1 } else { 0 }
+ if exists {
+ 1
+ } else {
+ 0
+ }
} else {
// Range delete - count keys in range
- let kvs = sm.kv()
+ let kvs = sm
+ .kv()
.range(&req.key, Some(&req.range_end))
.map_err(|e| Status::internal(e.to_string()))?;
kvs.len() as i64
@@ -231,7 +243,7 @@ impl Kv for KvServiceImpl {
Ok(Response::new(TxnResponse {
header: Some(self.make_header(revision).await),
- succeeded: true, // Assume success if no error
+ succeeded: true, // Assume success if no error
responses: vec![], // Not supported yet
}))
}
@@ -276,9 +288,7 @@ fn convert_txn_responses(
.collect()
}
-fn convert_ops(
- ops: &[crate::proto::RequestOp],
-) -> Vec<chainfire_types::command::TxnOp> {
+fn convert_ops(ops: &[crate::proto::RequestOp]) -> Vec<chainfire_types::command::TxnOp> {
use chainfire_types::command::TxnOp;
ops.iter()
@@ -287,7 +297,11 @@ fn convert_ops(
crate::proto::request_op::Request::RequestPut(put) => TxnOp::Put {
key: put.key.clone(),
value: put.value.clone(),
- lease_id: if put.lease != 0 { Some(put.lease) } else { None },
+ lease_id: if put.lease != 0 {
+ Some(put.lease)
+ } else {
+ None
+ },
},
crate::proto::request_op::Request::RequestDeleteRange(del) => {
if del.range_end.is_empty() {
@@ -307,7 +321,7 @@ fn convert_ops(
limit: range.limit,
keys_only: range.keys_only,
count_only: range.count_only,
- }
+ },
})
})
.collect()
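The delete pre-check above computes the deleted count before the command is proposed, since the write response does not yet carry one. A minimal sketch of that counting strategy, assuming only the behaviour visible in the hunk and using a plain BTreeMap as a stand-in for the state machine's KV view (not the actual store API):

    use std::collections::BTreeMap;

    // Sketch: a single-key delete counts 0 or 1 by existence; a range delete
    // counts every key in [key, range_end), matching the pre-check above.
    fn pre_count_deleted(kv: &BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], range_end: &[u8]) -> i64 {
        if range_end.is_empty() {
            kv.contains_key(key) as i64
        } else {
            kv.range(key.to_vec()..range_end.to_vec()).count() as i64
        }
    }

    fn main() {
        let mut kv = BTreeMap::new();
        kv.insert(b"a".to_vec(), b"1".to_vec());
        kv.insert(b"b".to_vec(), b"2".to_vec());
        assert_eq!(pre_count_deleted(&kv, b"a", b""), 1);
        assert_eq!(pre_count_deleted(&kv, b"a", b"c"), 2);
    }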
diff --git a/chainfire/crates/chainfire-api/src/lease_service.rs b/chainfire/crates/chainfire-api/src/lease_service.rs
index 8eb166b..777a9fb 100644
--- a/chainfire/crates/chainfire-api/src/lease_service.rs
+++ b/chainfire/crates/chainfire-api/src/lease_service.rs
@@ -182,7 +182,8 @@ impl Lease for LeaseServiceImpl {
let leases = sm.leases();
let lease_ids = leases.list();
-        let statuses: Vec<LeaseStatus> = lease_ids.into_iter().map(|id| LeaseStatus { id }).collect();
+        let statuses: Vec<LeaseStatus> =
+            lease_ids.into_iter().map(|id| LeaseStatus { id }).collect();
Ok(Response::new(LeaseLeasesResponse {
header: Some(self.make_header(revision)),
diff --git a/chainfire/crates/chainfire-api/src/lib.rs b/chainfire/crates/chainfire-api/src/lib.rs
index 06a71ad..30c3d52 100644
--- a/chainfire/crates/chainfire-api/src/lib.rs
+++ b/chainfire/crates/chainfire-api/src/lib.rs
@@ -5,25 +5,25 @@
//! - gRPC service implementations
//! - Client and server components
+pub mod cluster_service;
+pub mod conversions;
pub mod generated;
+pub mod internal_service;
pub mod kv_service;
pub mod lease_service;
-pub mod watch_service;
-pub mod cluster_service;
-pub mod internal_service;
pub mod raft_client;
-pub mod conversions;
+pub mod watch_service;
// Re-export generated types
-pub use generated::chainfire::v1 as proto;
pub use generated::chainfire::internal as internal_proto;
+pub use generated::chainfire::v1 as proto;
// Re-export services
+pub use cluster_service::ClusterServiceImpl;
+pub use internal_service::RaftServiceImpl;
pub use kv_service::KvServiceImpl;
pub use lease_service::LeaseServiceImpl;
pub use watch_service::WatchServiceImpl;
-pub use cluster_service::ClusterServiceImpl;
-pub use internal_service::RaftServiceImpl;
// Re-export Raft client and config
pub use raft_client::{GrpcRaftClient, RetryConfig};
diff --git a/chainfire/crates/chainfire-api/src/raft_client.rs b/chainfire/crates/chainfire-api/src/raft_client.rs
index edb15e9..36a5f3a 100644
--- a/chainfire/crates/chainfire-api/src/raft_client.rs
+++ b/chainfire/crates/chainfire-api/src/raft_client.rs
@@ -112,7 +112,10 @@ impl GrpcRaftClient {
}
/// Get or create a gRPC client for the target node
-    async fn get_client(&self, target: NodeId) -> Result<RaftServiceClient<Channel>, RaftNetworkError> {
+ async fn get_client(
+ &self,
+ target: NodeId,
+    ) -> Result<RaftServiceClient<Channel>, RaftNetworkError> {
// Check cache first
{
let clients = self.clients.read().await;
@@ -290,9 +293,7 @@ impl RaftRpcClient for GrpcRaftClient {
use chainfire_storage::EntryPayload;
let data = match &e.payload {
EntryPayload::Blank => vec![],
- EntryPayload::Normal(cmd) => {
- bincode::serialize(cmd).unwrap_or_default()
- }
+ EntryPayload::Normal(cmd) => bincode::serialize(cmd).unwrap_or_default(),
EntryPayload::Membership(_) => vec![],
};
(e.log_id.index, e.log_id.term, data)
@@ -333,8 +334,16 @@ impl RaftRpcClient for GrpcRaftClient {
Ok(AppendEntriesResponse {
term: resp.term,
success: resp.success,
- conflict_index: if resp.conflict_index > 0 { Some(resp.conflict_index) } else { None },
- conflict_term: if resp.conflict_term > 0 { Some(resp.conflict_term) } else { None },
+ conflict_index: if resp.conflict_index > 0 {
+ Some(resp.conflict_index)
+ } else {
+ None
+ },
+ conflict_term: if resp.conflict_term > 0 {
+ Some(resp.conflict_term)
+ } else {
+ None
+ },
})
}
})
diff --git a/chainfire/crates/chainfire-api/src/watch_service.rs b/chainfire/crates/chainfire-api/src/watch_service.rs
index 71aec81..449192a 100644
--- a/chainfire/crates/chainfire-api/src/watch_service.rs
+++ b/chainfire/crates/chainfire-api/src/watch_service.rs
@@ -1,9 +1,7 @@
//! Watch service implementation
use crate::conversions::make_header;
-use crate::proto::{
- watch_server::Watch, WatchRequest, WatchResponse,
-};
+use crate::proto::{watch_server::Watch, WatchRequest, WatchResponse};
use chainfire_watch::{WatchRegistry, WatchStream};
use std::pin::Pin;
use std::sync::Arc;
@@ -39,7 +37,8 @@ impl WatchServiceImpl {
#[tonic::async_trait]
impl Watch for WatchServiceImpl {
-    type WatchStream = Pin<Box<dyn Stream<Item = Result<WatchResponse, Status>> + Send>>;
+    type WatchStream =
+        Pin<Box<dyn Stream<Item = Result<WatchResponse, Status>> + Send>>;
async fn watch(
&self,
@@ -81,13 +80,17 @@ impl Watch for WatchServiceImpl {
Ok(req) => {
if let Some(request_union) = req.request_union {
let response = match request_union {
- crate::proto::watch_request::RequestUnion::CreateRequest(create) => {
+ crate::proto::watch_request::RequestUnion::CreateRequest(
+ create,
+ ) => {
let internal_req: chainfire_types::watch::WatchRequest =
create.into();
let resp = stream.create_watch(internal_req);
internal_to_proto_response(resp, cluster_id, member_id)
}
- crate::proto::watch_request::RequestUnion::CancelRequest(cancel) => {
+ crate::proto::watch_request::RequestUnion::CancelRequest(
+ cancel,
+ ) => {
let resp = stream.cancel_watch(cancel.watch_id);
internal_to_proto_response(resp, cluster_id, member_id)
}
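The WatchStream alias above follows the usual tonic server-streaming shape: a pinned, boxed, Send stream of responses. A small self-contained sketch of that shape, assuming the tokio, tokio-stream, and futures crates already used by the workspace, with String standing in for Result<WatchResponse, Status>:

    use std::pin::Pin;

    use futures::{Stream, StreamExt};
    use tokio_stream::wrappers::ReceiverStream;

    // Same shape as WatchStream, with a plain String item for illustration.
    type BoxedStream = Pin<Box<dyn Stream<Item = String> + Send>>;

    #[tokio::main]
    async fn main() {
        let (tx, rx) = tokio::sync::mpsc::channel(4);
        tx.send("event-1".to_string()).await.unwrap();
        drop(tx); // closing the sender ends the stream
        let mut stream: BoxedStream = Box::pin(ReceiverStream::new(rx));
        while let Some(item) = stream.next().await {
            println!("{item}");
        }
    }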
diff --git a/chainfire/crates/chainfire-core/Cargo.toml b/chainfire/crates/chainfire-core/Cargo.toml
index db4ed3a..f076a5b 100644
--- a/chainfire/crates/chainfire-core/Cargo.toml
+++ b/chainfire/crates/chainfire-core/Cargo.toml
@@ -3,35 +3,12 @@ name = "chainfire-core"
version.workspace = true
edition.workspace = true
license.workspace = true
-description = "Embeddable distributed cluster library with Raft consensus and SWIM gossip"
+description = "Internal compatibility crate for non-public ChainFire workspace types"
rust-version.workspace = true
+publish = false
[dependencies]
-# Internal crates
-chainfire-types = { workspace = true }
-chainfire-gossip = { workspace = true }
-# Note: chainfire-storage, chainfire-raft, chainfire-watch
-# will be added as implementation progresses
-# chainfire-storage = { workspace = true }
-# chainfire-raft = { workspace = true }
-# chainfire-watch = { workspace = true }
-
-# Async runtime
-tokio = { workspace = true }
-tokio-stream = { workspace = true }
-futures = { workspace = true }
-async-trait = { workspace = true }
-
-# Utilities
thiserror = { workspace = true }
-tracing = { workspace = true }
-bytes = { workspace = true }
-parking_lot = { workspace = true }
-dashmap = { workspace = true }
-
-[dev-dependencies]
-tokio = { workspace = true, features = ["test-util"] }
-tempfile = { workspace = true }
[lints]
workspace = true
diff --git a/chainfire/crates/chainfire-core/src/builder.rs b/chainfire/crates/chainfire-core/src/builder.rs
deleted file mode 100644
index 2d911a4..0000000
--- a/chainfire/crates/chainfire-core/src/builder.rs
+++ /dev/null
@@ -1,238 +0,0 @@
-//! Builder pattern for cluster creation
-
-use std::net::SocketAddr;
-use std::path::PathBuf;
-use std::sync::Arc;
-
-use chainfire_gossip::{GossipAgent, GossipId};
-use chainfire_types::node::NodeRole;
-use chainfire_types::RaftRole;
-
-use crate::callbacks::{ClusterEventHandler, KvEventHandler};
-use crate::cluster::Cluster;
-use crate::config::{ClusterConfig, MemberConfig, StorageBackendConfig, TimeoutConfig};
-use crate::error::{ClusterError, Result};
-use crate::events::EventDispatcher;
-
-/// Builder for creating a Chainfire cluster instance
-///
-/// # Example
-///
-/// ```ignore
-/// use chainfire_core::ClusterBuilder;
-///
-/// let cluster = ClusterBuilder::new(1)
-/// .name("node-1")
-/// .gossip_addr("0.0.0.0:7946".parse()?)
-/// .raft_addr("0.0.0.0:2380".parse()?)
-/// .bootstrap(true)
-/// .build()
-/// .await?;
-/// ```
-pub struct ClusterBuilder {
- config: ClusterConfig,
-    cluster_handlers: Vec<Arc<dyn ClusterEventHandler>>,
-    kv_handlers: Vec<Arc<dyn KvEventHandler>>,
-}
-
-impl ClusterBuilder {
- /// Create a new cluster builder with the given node ID
- pub fn new(node_id: u64) -> Self {
- Self {
- config: ClusterConfig {
- node_id,
- ..Default::default()
- },
- cluster_handlers: Vec::new(),
- kv_handlers: Vec::new(),
- }
- }
-
- /// Set the node name
-    pub fn name(mut self, name: impl Into<String>) -> Self {
- self.config.node_name = name.into();
- self
- }
-
- /// Set the node role (ControlPlane or Worker)
- pub fn role(mut self, role: NodeRole) -> Self {
- self.config.node_role = role;
- self
- }
-
- /// Set the Raft participation role (Voter, Learner, or None)
- pub fn raft_role(mut self, role: RaftRole) -> Self {
- self.config.raft_role = role;
- self
- }
-
- /// Set the API listen address
- pub fn api_addr(mut self, addr: SocketAddr) -> Self {
- self.config.api_addr = Some(addr);
- self
- }
-
- /// Set the Raft listen address (for control plane nodes)
- pub fn raft_addr(mut self, addr: SocketAddr) -> Self {
- self.config.raft_addr = Some(addr);
- self
- }
-
- /// Set the gossip listen address
- pub fn gossip_addr(mut self, addr: SocketAddr) -> Self {
- self.config.gossip_addr = addr;
- self
- }
-
- /// Set the storage backend
- pub fn storage(mut self, backend: StorageBackendConfig) -> Self {
- self.config.storage = backend;
- self
- }
-
- /// Set the data directory (convenience method for RocksDB storage)
-    pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
- self.config.storage = StorageBackendConfig::RocksDb { path: path.into() };
- self
- }
-
- /// Use in-memory storage
- pub fn memory_storage(mut self) -> Self {
- self.config.storage = StorageBackendConfig::Memory;
- self
- }
-
- /// Add initial cluster members (for bootstrap)
- pub fn initial_members(mut self, members: Vec) -> Self {
- self.config.initial_members = members;
- self
- }
-
- /// Add a single initial member
- pub fn add_member(mut self, member: MemberConfig) -> Self {
- self.config.initial_members.push(member);
- self
- }
-
- /// Enable cluster bootstrap (first node)
- pub fn bootstrap(mut self, bootstrap: bool) -> Self {
- self.config.bootstrap = bootstrap;
- self
- }
-
- /// Set the cluster ID
- pub fn cluster_id(mut self, id: u64) -> Self {
- self.config.cluster_id = id;
- self
- }
-
- /// Enable gRPC API server
- pub fn with_grpc_api(mut self, enabled: bool) -> Self {
- self.config.enable_grpc_api = enabled;
- self
- }
-
- /// Set timeout configuration
- pub fn timeouts(mut self, timeouts: TimeoutConfig) -> Self {
- self.config.timeouts = timeouts;
- self
- }
-
- /// Register a cluster event handler
- ///
- /// Multiple handlers can be registered. They will all be called
- /// when cluster events occur.
-    pub fn on_cluster_event<H>(mut self, handler: H) -> Self
- where
- H: ClusterEventHandler + 'static,
- {
- self.cluster_handlers.push(Arc::new(handler));
- self
- }
-
- /// Register a cluster event handler (Arc version)
-    pub fn on_cluster_event_arc(mut self, handler: Arc<dyn ClusterEventHandler>) -> Self {
- self.cluster_handlers.push(handler);
- self
- }
-
- /// Register a KV event handler
- ///
- /// Multiple handlers can be registered. They will all be called
- /// when KV events occur.
-    pub fn on_kv_event<H>(mut self, handler: H) -> Self
- where
- H: KvEventHandler + 'static,
- {
- self.kv_handlers.push(Arc::new(handler));
- self
- }
-
- /// Register a KV event handler (Arc version)
-    pub fn on_kv_event_arc(mut self, handler: Arc<dyn KvEventHandler>) -> Self {
- self.kv_handlers.push(handler);
- self
- }
-
- /// Validate the configuration
- fn validate(&self) -> Result<()> {
- if self.config.node_id == 0 {
- return Err(ClusterError::Config("node_id must be non-zero".into()));
- }
-
- if self.config.node_name.is_empty() {
- return Err(ClusterError::Config("node_name is required".into()));
- }
-
- // Raft-participating nodes need a Raft address
- if self.config.raft_role.participates_in_raft() && self.config.raft_addr.is_none() {
- return Err(ClusterError::Config(
- "raft_addr is required for Raft-participating nodes".into(),
- ));
- }
-
- Ok(())
- }
-
- /// Build the cluster instance
- ///
- /// This initializes the storage backend, Raft (if applicable), and gossip.
-    pub async fn build(self) -> Result<Cluster> {
- self.validate()?;
-
- // Create event dispatcher with registered handlers
- let mut event_dispatcher = EventDispatcher::new();
- for handler in self.cluster_handlers {
- event_dispatcher.add_cluster_handler(handler);
- }
- for handler in self.kv_handlers {
- event_dispatcher.add_kv_handler(handler);
- }
-
- // Initialize gossip agent
- let gossip_identity = GossipId::new(
- self.config.node_id,
- self.config.gossip_addr,
- self.config.node_role,
- );
-
- let gossip_agent = GossipAgent::new(gossip_identity, chainfire_gossip::agent::default_config())
- .await
- .map_err(|e| ClusterError::Gossip(e.to_string()))?;
-
- tracing::info!(
- node_id = self.config.node_id,
- gossip_addr = %self.config.gossip_addr,
- "Gossip agent initialized"
- );
-
- // Create the cluster
- let cluster = Cluster::new(self.config, Some(gossip_agent), event_dispatcher);
-
- // TODO: Initialize storage backend
- // TODO: Initialize Raft if role participates
- // TODO: Start background tasks
-
- Ok(cluster)
- }
-}
diff --git a/chainfire/crates/chainfire-core/src/callbacks.rs b/chainfire/crates/chainfire-core/src/callbacks.rs
deleted file mode 100644
index 1dcf8a1..0000000
--- a/chainfire/crates/chainfire-core/src/callbacks.rs
+++ /dev/null
@@ -1,103 +0,0 @@
-//! Callback traits for cluster events
-
-use async_trait::async_trait;
-
-use chainfire_types::node::NodeInfo;
-
-use crate::kvs::KvEntry;
-
-/// Handler for cluster lifecycle events
-///
-/// Implement this trait to receive notifications about cluster membership
-/// and leadership changes.
-#[async_trait]
-pub trait ClusterEventHandler: Send + Sync {
- /// Called when a node joins the cluster
- async fn on_node_joined(&self, _node: &NodeInfo) {}
-
- /// Called when a node leaves the cluster
- async fn on_node_left(&self, _node_id: u64, _reason: LeaveReason) {}
-
- /// Called when leadership changes
-    async fn on_leader_changed(&self, _old_leader: Option<u64>, _new_leader: u64) {}
-
- /// Called when this node becomes leader
- async fn on_became_leader(&self) {}
-
- /// Called when this node loses leadership
- async fn on_lost_leadership(&self) {}
-
- /// Called when cluster membership changes
- async fn on_membership_changed(&self, _members: &[NodeInfo]) {}
-
- /// Called when a network partition is detected
- async fn on_partition_detected(&self, _reachable: &[u64], _unreachable: &[u64]) {}
-
- /// Called when cluster is ready (initial leader elected, etc.)
- async fn on_cluster_ready(&self) {}
-}
-
-/// Handler for KV store events
-///
-/// Implement this trait to receive notifications about key-value changes.
-#[async_trait]
-pub trait KvEventHandler: Send + Sync {
- /// Called when a key is created or updated
- async fn on_key_changed(
- &self,
- _namespace: &str,
- _key: &[u8],
- _value: &[u8],
- _revision: u64,
- ) {
- }
-
- /// Called when a key is deleted
- async fn on_key_deleted(&self, _namespace: &str, _key: &[u8], _revision: u64) {}
-
- /// Called when multiple keys with a prefix are changed
- async fn on_prefix_changed(&self, _namespace: &str, _prefix: &[u8], _entries: &[KvEntry]) {}
-}
-
-/// Reason for node departure from the cluster
-#[derive(Debug, Clone, Copy, PartialEq, Eq)]
-pub enum LeaveReason {
- /// Node left gracefully
- Graceful,
-
- /// Node timed out (failed to respond)
- Timeout,
-
- /// Network partition detected
- NetworkPartition,
-
- /// Node was explicitly evicted
- Evicted,
-
- /// Unknown reason
- Unknown,
-}
-
-impl std::fmt::Display for LeaveReason {
- fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
- match self {
- LeaveReason::Graceful => write!(f, "graceful"),
- LeaveReason::Timeout => write!(f, "timeout"),
- LeaveReason::NetworkPartition => write!(f, "network_partition"),
- LeaveReason::Evicted => write!(f, "evicted"),
- LeaveReason::Unknown => write!(f, "unknown"),
- }
- }
-}
-
-/// A no-op event handler for when callbacks are not needed
-pub struct NoOpClusterEventHandler;
-
-#[async_trait]
-impl ClusterEventHandler for NoOpClusterEventHandler {}
-
-/// A no-op KV event handler
-pub struct NoOpKvEventHandler;
-
-#[async_trait]
-impl KvEventHandler for NoOpKvEventHandler {}
diff --git a/chainfire/crates/chainfire-core/src/cluster.rs b/chainfire/crates/chainfire-core/src/cluster.rs
deleted file mode 100644
index 5a2e669..0000000
--- a/chainfire/crates/chainfire-core/src/cluster.rs
+++ /dev/null
@@ -1,313 +0,0 @@
-//! Cluster management
-
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::Arc;
-
-use parking_lot::RwLock;
-use tokio::sync::broadcast;
-
-use chainfire_gossip::{GossipAgent, MembershipChange};
-use chainfire_types::node::NodeInfo;
-
-use crate::config::ClusterConfig;
-use crate::error::{ClusterError, Result};
-use crate::events::EventDispatcher;
-use crate::kvs::{Kv, KvHandle};
-
-/// Current state of the cluster
-#[derive(Debug, Clone)]
-#[derive(Default)]
-pub struct ClusterState {
- /// Whether this node is the leader
- pub is_leader: bool,
-
- /// Current leader's node ID
-    pub leader_id: Option<u64>,
-
- /// Current term (Raft)
- pub term: u64,
-
- /// All known cluster members
-    pub members: Vec<NodeInfo>,
-
- /// Whether the cluster is ready (initial leader elected)
- pub ready: bool,
-}
-
-
-/// Main cluster instance
-///
-/// This is the primary interface for interacting with a Chainfire cluster.
-/// It manages Raft consensus, gossip membership, and the distributed KV store.
-pub struct Cluster {
- /// Node configuration
- config: ClusterConfig,
-
- /// Current cluster state
-    state: Arc<RwLock<ClusterState>>,
-
- /// KV store
-    kv: Arc<Kv>,
-
- /// Gossip agent for cluster membership
-    gossip_agent: Option<GossipAgent>,
-
- /// Event dispatcher
-    event_dispatcher: Arc<EventDispatcher>,
-
- /// Shutdown flag
- shutdown: AtomicBool,
-
- /// Shutdown signal sender
- shutdown_tx: broadcast::Sender<()>,
-}
-
-impl Cluster {
- /// Create a new cluster instance
- pub(crate) fn new(
- config: ClusterConfig,
-        gossip_agent: Option<GossipAgent>,
- event_dispatcher: EventDispatcher,
- ) -> Self {
- let (shutdown_tx, _) = broadcast::channel(1);
-
- Self {
- config,
- state: Arc::new(RwLock::new(ClusterState::default())),
- kv: Arc::new(Kv::new()),
- gossip_agent,
- event_dispatcher: Arc::new(event_dispatcher),
- shutdown: AtomicBool::new(false),
- shutdown_tx,
- }
- }
-
- /// Get this node's ID
- pub fn node_id(&self) -> u64 {
- self.config.node_id
- }
-
- /// Get this node's name
- pub fn node_name(&self) -> &str {
- &self.config.node_name
- }
-
- /// Get a handle for interacting with the cluster
- ///
- /// Handles are lightweight and can be cloned freely.
- pub fn handle(&self) -> ClusterHandle {
- ClusterHandle {
- node_id: self.config.node_id,
- state: self.state.clone(),
- kv: self.kv.clone(),
- shutdown_tx: self.shutdown_tx.clone(),
- }
- }
-
- /// Get the KV store interface
-    pub fn kv(&self) -> &Arc<Kv> {
- &self.kv
- }
-
- /// Get current cluster state
- pub fn state(&self) -> ClusterState {
- self.state.read().clone()
- }
-
- /// Check if this node is the leader
- pub fn is_leader(&self) -> bool {
- self.state.read().is_leader
- }
-
- /// Get current leader ID
-    pub fn leader(&self) -> Option<u64> {
- self.state.read().leader_id
- }
-
- /// Get all cluster members
-    pub fn members(&self) -> Vec<NodeInfo> {
- self.state.read().members.clone()
- }
-
- /// Check if the cluster is ready
- pub fn is_ready(&self) -> bool {
- self.state.read().ready
- }
-
- /// Join an existing cluster
- ///
- /// Connects to seed nodes and joins the cluster via gossip.
- pub async fn join(&mut self, seed_addrs: &[std::net::SocketAddr]) -> Result<()> {
- if seed_addrs.is_empty() {
- return Err(ClusterError::Config("No seed addresses provided".into()));
- }
-
- let gossip_agent = self.gossip_agent.as_mut().ok_or_else(|| {
- ClusterError::Config("Gossip agent not initialized".into())
- })?;
-
- // Announce to all seed nodes to discover the cluster
- for &addr in seed_addrs {
- tracing::info!(%addr, "Announcing to seed node");
- gossip_agent
- .announce(addr)
- .map_err(|e| ClusterError::Gossip(e.to_string()))?;
- }
-
- tracing::info!(seeds = seed_addrs.len(), "Joined cluster via gossip");
- Ok(())
- }
-
- /// Leave the cluster gracefully
- pub async fn leave(&self) -> Result<()> {
- // TODO: Implement graceful leave
- self.shutdown();
- Ok(())
- }
-
- /// Add a new node to the cluster (leader only)
- pub async fn add_node(&self, _node: NodeInfo, _as_learner: bool) -> Result<()> {
- if !self.is_leader() {
- return Err(ClusterError::NotLeader {
- leader_id: self.leader(),
- });
- }
-
- // TODO: Implement node addition via Raft
- Ok(())
- }
-
- /// Remove a node from the cluster (leader only)
- pub async fn remove_node(&self, _node_id: u64) -> Result<()> {
- if !self.is_leader() {
- return Err(ClusterError::NotLeader {
- leader_id: self.leader(),
- });
- }
-
- // TODO: Implement node removal via Raft
- Ok(())
- }
-
- /// Promote a learner to voter (leader only)
- pub async fn promote_learner(&self, _node_id: u64) -> Result<()> {
- if !self.is_leader() {
- return Err(ClusterError::NotLeader {
- leader_id: self.leader(),
- });
- }
-
- // TODO: Implement learner promotion via Raft
- Ok(())
- }
-
- /// Run the cluster (blocks until shutdown)
- pub async fn run(self) -> Result<()> {
- self.run_until_shutdown(std::future::pending()).await
- }
-
- /// Run with graceful shutdown signal
-    pub async fn run_until_shutdown<F>(mut self, shutdown_signal: F) -> Result<()>
- where
- F: std::future::Future