Establish baseline product surface and proof lanes

This commit is contained in:
centra 2026-04-10 19:28:44 +09:00
parent b8ebd24d4e
commit c1d4178a52
201 changed files with 12545 additions and 3643 deletions


@@ -96,6 +96,23 @@ jobs:
run: |
nix run ./nix/ci#gate-ci -- --shared-crate ${{ matrix.crate }} --tier 0 --no-logs
portable-regressions:
needs: filter
if: ${{ needs.filter.outputs.any_changed == 'true' || needs.filter.outputs.global_changed == 'true' || needs.filter.outputs.shared_crates_changed == 'true' }}
runs-on: ubuntu-latest
name: portable regressions
steps:
- uses: actions/checkout@v4
- uses: DeterminateSystems/nix-installer-action@v11
- uses: DeterminateSystems/magic-nix-cache-action@v8
- name: Run portable canonical profile regressions
run: |
nix build \
.#checks.x86_64-linux.canonical-profile-eval-guards \
.#checks.x86_64-linux.portable-control-plane-regressions \
--accept-flake-config
# Build server packages (tier 1+)
build:
needs: [filter, gate]
@@ -116,7 +133,7 @@ jobs:
# Summary job for PR status checks
ci-status:
needs: [filter, gate, shared-crates-gate]
needs: [filter, gate, shared-crates-gate, portable-regressions]
if: always()
runs-on: ubuntu-latest
steps:
@@ -128,11 +145,18 @@ jobs:
if [[ "${{ needs.shared-crates-gate.result }}" == "failure" ]]; then
exit 1
fi
if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" ]]; then
if [[ "${{ needs.portable-regressions.result }}" == "failure" ]]; then
exit 1
fi
if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" || "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then
if [[ "${{ needs.gate.result }}" == "skipped" ]]; then
echo "Gate was skipped despite changes. This is unexpected."
exit 1
fi
if [[ "${{ needs.portable-regressions.result }}" == "skipped" ]]; then
echo "Portable regressions were skipped despite changes. This is unexpected."
exit 1
fi
fi
if [[ "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then
if [[ "${{ needs.shared-crates-gate.result }}" == "skipped" ]]; then

3
.gitignore vendored

@@ -3,6 +3,8 @@
.code/
.codex/
.claude.json
.agent-r/
agent-r.config.toml
.ralphrc
.sisyphus/
@@ -39,6 +41,7 @@ Thumbs.db
# Logs
*.log
nohup.out
quanta/test_output_renamed.log
plasmavmc/kvm_test_output.log

210
README.md

@@ -2,7 +2,8 @@
UltraCloud is a Nix-first cloud platform workspace that assembles a small control plane, network services, VM hosting, shared storage, object storage, and gateway services into one reproducible repository.
The canonical local proof path is the six-node VM cluster under [`nix/test-cluster`](/home/centra/cloud/nix/test-cluster/README.md). It builds all guest images on the host, boots them as hardware-like QEMU nodes, and validates real multi-node behavior.
The fastest public entrypoint is the one-command single-node quickstart. The `3-node HA control plane` profile lives in `nixosConfigurations.node01`, `nixosConfigurations.node02`, and `nixosConfigurations.node03`; the six-node VM cluster under [`nix/test-cluster`](nix/test-cluster/README.md) is the publishable harness that extends that HA baseline with worker and optional service bundles on host-built QEMU guests.
The canonical bare-metal bootstrap proof is the ISO-on-QEMU path under [`nix/test-cluster`](nix/test-cluster/README.md), which drives phone-home, Disko install, reboot, and desired-system convergence for one control-plane node and one worker-equivalent node.
## Components
@@ -15,38 +16,217 @@ The canonical local proof path is the six-node VM cluster under [`nix/test-clust
- `plasmavmc`: VM control plane and worker agents
- `coronafs`: shared filesystem for mutable VM volumes
- `lightningstor`: object storage and VM image backing
- `k8shost`: Kubernetes-style hosting control plane
- `k8shost`: Kubernetes-style hosting control plane for tenant pods and services
- `apigateway`: external API and proxy surface
- `nightlight`: metrics ingestion and query service
- `creditservice`: minimal reference quota/credit service
- `deployer`: bootstrap and phone-home deployment service
- `creditservice`: quota, reservation, and admission-control service
- `deployer`: bootstrap and phone-home deployment service that owns install plans and desired-system intent
- `fleet-scheduler`: non-Kubernetes service scheduler for bare-metal cluster services
## Core API Notes
- `chainfire` ships a fixed-membership cluster API on the supported surface. Public cluster management is `MemberList` plus `Status`, and the internal Raft transport surface is `Vote` plus `AppendEntries`. `chainfire-core` is workspace-internal only; the old embeddable builder and distributed-KV scaffold are not part of the supported product contract.
- `flaredb` ships SQL on both gRPC and REST. The supported REST SQL surface is `POST /api/v1/sql` for statement execution and `GET /api/v1/tables` for table discovery, alongside the existing KV and scan endpoints; a hedged request sketch follows this list.
- `plasmavmc` ships a KVM-only public VM backend contract. The supported create and recovery surface is the KVM path exercised in `single-node-quickstart`, `fresh-smoke`, and `fresh-matrix`; Firecracker and mvisor remain archived non-product backends outside the supported surface until they have real tenant-network coverage.
- `lightningstor` keeps its optional gRPC surface live: bucket versioning, bucket policy, bucket tagging, and explicit object version listing are part of the supported contract for the canonical optional bundle.
- `fiberlb` backend `Https` health checks currently do not verify backend TLS certificates. Supported scope is limited to TCP reachability plus HTTP status for the backend endpoint until CA-aware verification is wired through config, server code, and the canonical harness.
- `k8shost` keeps `WatchPods` on the supported surface as a bounded snapshot stream for the current matching pod set. The published contract is the tenant workload API, not a separate long-lived controller event bus.
- `k8shost` is fixed as an API/control-plane product surface; runtime dataplane helpers stay archived non-product until they have their own published contract and proof.
- `k8shost-cni`, `k8shost-controllers`, `lightningstor-csi`, `nixosConfigurations.netboot-worker`, and the older scripts under `baremetal/vm-cluster` are archived internal scaffolds or `legacy/manual` debugging paths outside the supported surface.
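A hedged sketch of the supported FlareDB REST SQL calls named in the list above. The endpoint paths are taken from this README; the host, port, and request-body shape are illustrative assumptions, not part of the documented contract.
```bash
# Illustrative only: 127.0.0.1:8080 and the JSON body shape are assumptions;
# the two endpoint paths are the supported REST SQL surface named above.
curl -sS -X POST "http://127.0.0.1:8080/api/v1/sql" \
  -H 'Content-Type: application/json' \
  -d '{"sql": "SELECT 1"}'
curl -sS "http://127.0.0.1:8080/api/v1/tables"
```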
## Core Control Plane Operations
The control-plane operator contract is fixed in [docs/control-plane-ops.md](docs/control-plane-ops.md).
- ChainFire dynamic membership, replace-node, and scale-out are unsupported on the supported surface; the supported operator path is fixed-membership restore or whole-cluster replacement backed by the `durability-proof` backup/restore baseline.
- FlareDB online migration and schema evolution must start from the durability-proof backup/restore baseline and stay additive-first until a later destructive cleanup window. FlareDB destructive DDL and fully automated online migration remain outside the supported product contract for this release.
- IAM bootstrap hardening requires an explicit admin token, an explicit signing key, and a 32-byte IAM_CRED_MASTER_KEY. Signing-key rotation, credential overlap-and-revoke rotation, and mTLS overlap-and-cutover rotation are part of the supported operator contract; multi-node IAM failover remains outside the supported product contract. The standalone proof is `./nix/test-cluster/run-core-control-plane-ops-proof.sh`.
## Edge And Trial Surface
The edge-bundle and trial-surface contract is fixed in [docs/edge-trial-surface.md](docs/edge-trial-surface.md).
- APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer; live in-process reload is not part of the product contract.
- NightLight is supported as a single-node WAL/snapshot service; replicated HA metrics storage is not part of the product contract.
- CreditService export and backend migration are supported as offline export/import or backend-native snapshot workflows, not live mixed-writer migration.
- OCI/Docker artifact is intentionally not the public trial surface.
- Use `./nix/test-cluster/work-root-budget.sh status` for disk budget, GC, and cleanup guidance, `./nix/test-cluster/work-root-budget.sh enforce` for a stronger local budget gate, and `./nix/test-cluster/work-root-budget.sh prune-proof-logs 2` for safer dated-proof cleanup.
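The three budget-helper invocations from the list above, verbatim:
```bash
# Disk budget status, the stronger local budget gate, and safer dated-proof cleanup.
./nix/test-cluster/work-root-budget.sh status
./nix/test-cluster/work-root-budget.sh enforce
./nix/test-cluster/work-root-budget.sh prune-proof-logs 2
```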
## Quick Start
Single-node quickstart:
```bash
nix run .#single-node-quickstart
```
This app is also the automated smoke check for the smallest realistic trial surface. It builds the minimal VM stack, boots a QEMU VM, waits for `chainfire`, `flaredb`, `iam`, `prismnet`, and `plasmavmc`, checks their health endpoints, and verifies the in-guest VM runtime prerequisites. For an interactive session, keep the VM running:
```bash
ULTRACLOUD_QUICKSTART_KEEP_VM=1 nix run .#single-node-quickstart
```
Buildable trial artifact:
```bash
nix build .#single-node-trial-vm
nix run .#single-node-trial
```
`single-node-trial-vm` is the lightest supported artifact for local use: a host-built NixOS VM appliance for the VM-platform core. OCI/Docker artifact is intentionally not the public trial surface here, because the supported scope needs a guest kernel plus host KVM, `/dev/net/tun`, and OVS/libvirt semantics. A privileged container would be host-coupled and would not prove the same contract.
The legacy name `.#all-in-one-quickstart` is kept as an alias, and `.#single-node-trial` is a friendlier alias for the same smoke launcher.
Portable local proof on hosts without `/dev/kvm`:
```bash
nix build .#checks.x86_64-linux.canonical-profile-eval-guards
nix build .#checks.x86_64-linux.portable-control-plane-regressions
```
This TCG-safe lane keeps canonical profile drift, the core `chainfire` / `deployer` control-plane path, the `deployer -> nix-agent` boundary, and the `fleet-scheduler -> node-agent` boundary under regression coverage without requiring nested virtualization.
Publishable nested-KVM suite:
```bash
nix develop
nix run ./nix/test-cluster#cluster -- fresh-smoke
nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp
nix run ./nix/test-cluster#cluster -- fresh-matrix
./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
```
The checked-in entrypoint for the publishable nested-KVM suite is the local wrapper `./nix/test-cluster/run-publishable-kvm-suite.sh`. Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally not part of this re-aggregated baseline.
For the full supported-surface proof on a local AMD/KVM host, use `./nix/test-cluster/run-supported-surface-final-proof.sh ./work/final-proofs/latest`; it keeps builders local, builds `single-node-trial-vm`, runs `single-node-quickstart`, and captures the publishable KVM suite logs in one place.
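The same wrapper invocation as a copy-paste block, using the example artifact directory from the paragraph above:
```bash
# Keeps builders local, builds single-node-trial-vm, runs single-node-quickstart,
# and captures the publishable KVM suite logs under one directory.
./nix/test-cluster/run-supported-surface-final-proof.sh ./work/final-proofs/latest
```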
`nix run ./nix/test-cluster#cluster -- durability-proof` is the canonical chainfire flaredb deployer backup/restore lane. It persists artifacts under `./work/durability-proof/latest`, proves logical backup/restore for ChainFire keys and FlareDB SQL rows, uses the canonical Deployer admin pre-register request itself as the backup artifact, verifies that the pre-registered node survives a `deployer.service` restart, replays the same request idempotently, and injects CoronaFS plus LightningStor failures against the same live KVM cluster.
`nix run ./nix/test-cluster#cluster -- rollout-soak` is the longer-running control-plane and rollout companion lane. It rebuilds from clean local KVM runtime state, persists artifacts under `./work/rollout-soak/latest`, validates exactly one planned `draining` maintenance cycle and one fail-stop worker-loss cycle on the two native-runtime workers, holds each degraded state for the configured soak window, then restarts `deployer`, `fleet-scheduler`, `node-agent`, `chainfire`, and `flaredb` before revalidating the cluster. The soak root also carries explicit scope markers so the supported boundary is encoded in the proof artifacts rather than only in docs. The steady-state KVM nodes do not run `nix-agent.service`, so the soak lane records explicit `nix-agent` scope markers instead of pretending a live-cluster `nix-agent` restart happened.
`nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof` is the focused local-KVM reality lane for the provider and VM-hosting bundles. It stores artifacts under `./work/provider-vm-reality-proof/latest`, captures authoritative FlashDNS answers, FiberLB backend drain and restore evidence, and PlasmaVMC KVM shared-storage migration plus post-migration restart state.
The 2026-04-10 local AMD/KVM proof logs are in `./work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final` for `supported-surface-guard`, `single-node-trial-vm`, and `single-node-quickstart`, and in `./work/publishable-kvm-suite` for the final passing `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix` run through `./nix/test-cluster/run-publishable-kvm-suite.sh`.
The exact bare-metal check-runner proof from `2026-04-10` is in `./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; its outer `environment.txt` records `execution_model=materialized-check-runner`, and `state/environment.txt` records `vm_accelerator_mode=kvm`.
The 2026-04-10 durability and failure-injection proof logs are in `./work/durability-proof/20260410T120618+0900`; `result.json` records `success=true`, `deployer_restore_mode="admin pre-register request replay with pre/post-restart list verification"`, and the artifact set includes `chainfire-backup-response.json`, `flaredb-restored.json`, `deployer-post-restart-list.json`, `coronafs-node04-local-state.json`, and `lightningstor-head-during-node05-outage.json`.
The 2026-04-10 longer-running rollout and control-plane soak is in `./work/rollout-soak/20260410T164549+0900`; `result.json` records `success=true`, `fleet_supported_native_runtime_nodes=2`, `validated_maintenance_cycles=1`, `validated_power_loss_cycles=1`, and `soak_hold_secs=30`, while the artifact set includes `maintenance-held.json`, `power-loss-held.json`, `deployer-post-restart-nodes.json`, `chainfire-post-restart-put.json`, `flaredb-post-restart.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, `fleet-scheduler-scope-fixed.txt`, and the `node01-nix-agent-scope.txt` / `node04-nix-agent-scope.txt` boundary markers.
The 2026-04-10 provider and VM-hosting reality proof logs are in `./work/provider-vm-reality-proof/20260410T135827+0900`; `result.json` records `success=true`, and the artifact set includes `network-provider/fiberlb-drain-summary.txt`, `network-provider/flashdns-service-authoritative-answer.txt`, `vm-hosting/migration-summary.json`, and `vm-hosting/root-volume-after-post-migration-restart.json`.
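Each of these proof roots records a machine-readable `result.json`. A minimal hedged way to confirm a lane locally, assuming `jq` is available and the `latest` symlink exists:
```bash
# success and deployer_restore_mode are fields this README says result.json records;
# jq availability and the latest symlink are assumptions of this sketch.
jq '{success, deployer_restore_mode}' ./work/durability-proof/latest/result.json
```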
Physical-node bring-up now has a canonical preflight wrapper as well: `nix run ./nix/test-cluster#hardware-smoke -- preflight`. It writes `kernel-params.txt`, expected markers, failure markers, and a machine-readable blocked or ready state under `./work/hardware-smoke/latest`, and the same entrypoint can later be rerun as `run` or `capture` when USB or BMC/Redfish transport is actually present.
Within that suite, `fresh-matrix` is the public provider-bundle proof: it exercises PrismNet VPC/subnet/port flows plus security-group ACL add/remove, FlashDNS record publication, and FiberLB TCP plus TLS-terminated `Https` / `TerminatedHttps` listeners in one tenant-scoped composition run. The published FiberLB L4 algorithms are kept honest with targeted server unit tests in-tree. `provider-vm-reality-proof` is the artifact-producing companion lane for the same bundle and for the VM-hosting path.
PrismNet real OVS/OVN dataplane validation remains outside the supported local KVM surface. FiberLB native BGP or BFD peer interop plus hardware VIP ownership also remain outside the supported local KVM surface. PlasmaVMC real-hardware migration or storage handoff remains a later hardware proof; the current local-KVM proof fixes the release surface to KVM shared-storage migration on the worker pair.
Project-done release proof now requires both halves of the public validation surface to be green:
- `baremetal-iso` and `baremetal-iso-e2e` for the canonical `deployer -> installer -> nix-agent` bare-metal bootstrap path
- the KVM publishable suite (`fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`) for the nested-KVM multi-node VM-hosting path
Canonical bare-metal bootstrap proof:
```bash
nix run ./nix/test-cluster#cluster -- baremetal-iso
nix build .#checks.x86_64-linux.baremetal-iso-e2e
./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/latest
```
`baremetal-iso-e2e` now materializes the exact local-KVM proof runner instead of trying to boot QEMU inside a sandboxed `nixbld` build. That older build-time execution model degraded to `TCG`; the built runner keeps the canonical attr name but executes the same `verify-baremetal-iso.sh` harness as the direct QEMU proof, with host KVM and persistent logs under `./work`.
The QEMU ISO proof is a stand-in for the real install route, not a separate workflow. Build `nixosConfigurations.ultracloud-iso`, boot it under KVM locally or write the same ISO to USB or BMC virtual media on hardware, and pass the same bootstrap inputs that the installer consumes in the harness: `ultracloud.deployer_url=<scheme://host:port>`, `ultracloud.bootstrap_token=<token>` for authenticated bootstrap or a lab-only `deployer` configured with `allow_unauthenticated=true`, optional `ultracloud.ca_cert_url=<https://.../ca.crt>`, optional `ultracloud.binary_cache_url=<http://cache:8090>`, and optional `ultracloud.node_id=` / `ultracloud.hostname=` overrides when DMI serials or DHCP names are not the desired identity.
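A hedged sketch of the bootstrap kernel parameters listed above. The parameter names come from this README; every value below is a placeholder, and on real hardware the URLs must point at routable control-plane endpoints:
```bash
# Placeholder values only; substitute your deployer, cache, token, and node identity.
ULTRACLOUD_KERNEL_PARAMS="ultracloud.deployer_url=http://192.0.2.10:8080 \
ultracloud.bootstrap_token=REPLACE_ME \
ultracloud.binary_cache_url=http://192.0.2.10:8090 \
ultracloud.node_id=node-example ultracloud.hostname=node-example"
```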
The networking contract is the same in QEMU and on hardware: the live ISO needs DHCP or equivalent L3 reachability to `deployer` before Disko starts, and it needs reachability to the optional binary cache if you want it to pull prebuilt closures instead of compiling locally. The local QEMU proof relies on the `10.0.2.2` fallback addresses from user-mode NAT; real hardware should set `ultracloud.deployer_url` and, when used, `ultracloud.binary_cache_url` to routable control-plane endpoints. USB media and BMC virtual media are only transport differences for the same ISO and kernel parameters. For the local proof keep `./work` or `ULTRACLOUD_WORK_ROOT` on a large disk; the checked-in wrappers force local builders and derive Nix parallelism from the host CPU count unless you override it explicitly.
Canonical hardware preflight and handoff for the same path:
```bash
nix run ./nix/test-cluster#hardware-smoke -- preflight
nix run ./nix/test-cluster#hardware-smoke -- run
nix run ./nix/test-cluster#hardware-smoke -- capture
```
That wrapper keeps the QEMU proof and the physical-node proof on one contract by writing the exact kernel parameters, expected `ULTRACLOUD_MARKER` sequence, failure markers, and artifact root under `./work/hardware-smoke/latest`.
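A hedged way to read the machine-readable state the wrapper writes; `status.env` and `missing-requirements.txt` are the artifact names recorded for this path, while the shell below is only an illustration:
```bash
# Inspect the preflight outcome and, when blocked, the missing transport inputs.
cat ./work/hardware-smoke/latest/status.env
# missing-requirements.txt is only expected when preflight records a blocked state.
[ -f ./work/hardware-smoke/latest/missing-requirements.txt ] && \
  cat ./work/hardware-smoke/latest/missing-requirements.txt
```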
Canonical hardware handoff for that path:
1. Build `nixosConfigurations.ultracloud-iso` plus the target role configs (`baremetal-qemu-control-plane`, `baremetal-qemu-worker`, or their hardware-specific successors) and expose `deployer` plus an optional HTTP Nix cache on addresses the installer can reach.
2. Publish cluster state so that the reusable node class owns the install contract: `install_plan.nixos_configuration`, `install_plan.disko_config_path`, and preferably `install_plan.target_disk_by_id`. Node entries should only bind identity, pool, and any desired-system override that truly differs per host. When you expose a binary cache, prefer setting `desired_system.target_system` to the prebuilt class-owned closure as well so post-install convergence does not rebuild a dirty local variant on each node.
3. Boot the same ISO through USB or BMC virtual media and pass `ultracloud.deployer_url=...`, `ultracloud.bootstrap_token=...`, and, when used, `ultracloud.binary_cache_url=...` on the kernel command line.
4. Watch the canonical marker sequence from the installer journal: `pre-install.boot`, `pre-install.phone-home.complete`, `install.bundle-downloaded`, `install.disko.complete`, `install.nixos-install.complete`, `reboot`, `post-install.boot` (a journal-follow sketch appears after this list).
5. Treat `nix-agent` reporting the desired system as `active` as the final convergence gate. The QEMU harness proves the same sequence, only with virtio disks and host-local endpoints standing in for the real chassis.
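A hedged sketch for step 4 of the list above: follow the canonical markers from the journal on the booting node. The marker prefix comes from this README; the only assumption is that the installer emits the markers to the system journal:
```bash
# Follow installer progress by marker; stop once post-install.boot appears.
journalctl -b -f | grep --line-buffered -F 'ULTRACLOUD_MARKER'
```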
The checked-in QEMU proof now mirrors the disk-selection contract that hardware should use. Its node classes install by stable `/dev/disk/by-id/virtio-uc-control-root` and `/dev/disk/by-id/virtio-uc-worker-root` selectors, backed by explicit QEMU disk serials, while the ISO resolves the prebuilt Disko script and target system from the install profile name embedded into the ISO. Hardware should keep the same class/profile structure and swap only the disk selector, routable URLs, and physical media transport.
## Canonical Profiles
UltraCloud now fixes the public support surface to three canonical profiles:
| Profile | Canonical entrypoints | Required components | Optional components |
| --- | --- | --- | --- |
| `single-node dev` | `nix run .#single-node-quickstart`, `nix run .#single-node-trial`, `nix build .#single-node-trial-vm`, `nixosConfigurations.single-node-quickstart`, companion install image `nixosConfigurations.netboot-all-in-one` | `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` | `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost` |
| `3-node HA control plane` | `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion install image `nixosConfigurations.netboot-control-plane` | `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node | `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` |
| `bare-metal bootstrap` | `nix run ./nix/test-cluster#cluster -- baremetal-iso`, `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` | `deployer`, `first-boot-automation`, `install-target`, `nix-agent` | `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after bootstrap |
`nixosConfigurations.netboot-all-in-one` and `nixosConfigurations.netboot-control-plane` are canonical companion images for the supported `single-node dev` and `3-node HA control plane` profiles. `packages.single-node-trial-vm` is the low-friction trial artifact for the minimal VM-platform core. `nixosConfigurations.netboot-worker`, `netboot-base`, `pxe-server`, `vm-smoke-target`, and older launch flows under `baremetal/vm-cluster` are archived helpers or `legacy/manual` debugging paths outside the canonical profiles and their guard set.
## Cluster Authoring
`ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is the only supported cluster authoring source. It is the canonical place to define nodes, reusable deployer classes and pools, rollout objects, service placement intent, and the generated per-node bootstrap metadata consumed by `deployer`, `fleet-scheduler`, `nix-agent`, and `node-agent`.
`nix-nos` is limited to legacy compatibility and low-level network primitives such as interfaces, VLANs, BGP, and static routing. It is not the canonical source for cluster topology, rollout intent, scheduler state, or bootstrap inventory.
## Responsibility Boundaries
- `plasmavmc` owns tenant VM lifecycle plus KVM worker registration. It can run against explicit remote IAM, PrismNet, and FlareDB endpoints, but it does not own machine enrollment, desired-system rollout, or host-native service placement.
- `k8shost` owns Kubernetes-style pod and service APIs for tenant workloads, then translates them into `prismnet`, `flashdns`, and `fiberlb` objects. It does not place host-native cluster daemons, and its runtime dataplane helpers remain archived non-product.
- `fleet-scheduler` owns placement and failover of host-native service instances from declarative cluster state derived from `ultracloud.cluster`. It consumes `node-agent` heartbeats and writes instance placement, but it does not expose tenant-facing Kubernetes semantics.
- `deployer` owns machine enrollment, `/api/v1/phone-home`, install plans, cluster metadata, and desired-system references. The supported declarative input for that state is the JSON generated from `ultracloud.cluster`; it decides what a node should become, but it does not execute the host-local switch.
- `nix-agent` owns host-local NixOS convergence only. It reads desired-system state from `deployer` or `chainfire`, activates the target closure, and rolls back on failed health checks.
- `node-agent` owns host-local runtime execution only. It reports heartbeats and applies scheduled service-instance state, but it does not install the base OS or rewrite desired-system targets.
The single-node quickstart deliberately stops below that rollout stack: it ships only the VM-platform core plus optional add-ons, not `deployer`, `nix-agent`, `node-agent`, or `fleet-scheduler`.
## Standalone Stories
- `single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story. They keep the minimal KVM-backed VM surface light and intentionally exclude `deployer`, `nix-agent`, `fleet-scheduler`, and `node-agent`.
- `deployer-vm-smoke`, `portable-control-plane-regressions`, and `baremetal-iso` are the standalone rollout-stack story. They validate `deployer -> nix-agent` and `deployer -> fleet-scheduler -> node-agent` without requiring the full VM-hosting bundle.
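A hedged sketch of the standalone rollout-stack story above. The last two commands are verbatim from this README; the first attribute path is an assumption, since `deployer-vm-smoke` is referenced as a VM test here but its exact flake attribute is not spelled out:
```bash
# The deployer-vm-smoke attribute path is assumed; the other two commands are verbatim.
nix build .#checks.x86_64-linux.deployer-vm-smoke
nix build .#checks.x86_64-linux.portable-control-plane-regressions
nix run ./nix/test-cluster#cluster -- baremetal-iso
```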
## Rollout Bundle Operations
The rollout-bundle operator contract is fixed in [docs/rollout-bundle.md](docs/rollout-bundle.md). As of 2026-04-10, the supported `deployer` recovery model is scope-fixed to one active writer plus optional cold-standby restore that reuses the same ChainFire namespace, credentials, bootstrap bundle, and local state backup; automatic ChainFire-backed multi-instance failover is outside the supported product contract for this release.
The same operator doc also fixes the `nix-agent` health-check and rollback contract, the `node-agent` logs/secrets/volume/upgrade contract, and the `fleet-scheduler` supported upper limit: the two native-runtime worker lab with one planned drain cycle, one fail-stop worker-loss cycle, and 30-second held degraded states in `rollout-soak`. The canonical proofs are `nix build .#checks.x86_64-linux.deployer-vm-rollback`, `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`, `nix build .#checks.x86_64-linux.portable-control-plane-regressions`, `nix run ./nix/test-cluster#cluster -- fresh-smoke`, `nix run ./nix/test-cluster#cluster -- rollout-soak`, and `nix run ./nix/test-cluster#cluster -- durability-proof`.
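The canonical proofs from that contract as one copy-paste block, verbatim:
```bash
# Rollback, scheduler, and portable regression checks plus the three KVM lanes.
nix build .#checks.x86_64-linux.deployer-vm-rollback
nix build .#checks.x86_64-linux.fleet-scheduler-e2e
nix build .#checks.x86_64-linux.portable-control-plane-regressions
nix run ./nix/test-cluster#cluster -- fresh-smoke
nix run ./nix/test-cluster#cluster -- rollout-soak
nix run ./nix/test-cluster#cluster -- durability-proof
```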
## Main Entrypoints
- workspace flake: [flake.nix](/home/centra/cloud/flake.nix)
- VM validation harness: [nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md)
- shared volume notes: [coronafs/README.md](/home/centra/cloud/coronafs/README.md)
- minimal quota-service rationale: [creditservice/README.md](/home/centra/cloud/creditservice/README.md)
- archived manual VM launch scripts: [baremetal/vm-cluster/README.md](/home/centra/cloud/baremetal/vm-cluster/README.md)
- workspace flake: [flake.nix](flake.nix)
- single-node quickstart smoke: [`nix run .#single-node-quickstart`](docs/testing.md)
- single-node trial artifact: [`nix build .#single-node-trial-vm`](docs/testing.md), [`nix run .#single-node-trial`](docs/testing.md)
- smallest rollback proof for `deployer -> nix-agent`: [`nix build .#checks.x86_64-linux.deployer-vm-rollback`](docs/rollout-bundle.md)
- `3-node HA control plane` configs: `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion image `nixosConfigurations.netboot-control-plane`
- portable local proof: [`nix build .#checks.x86_64-linux.portable-control-plane-regressions`](docs/testing.md)
- longer-running control-plane and rollout soak: [`nix run ./nix/test-cluster#cluster -- rollout-soak`](docs/testing.md)
- canonical bare-metal bootstrap smoke: [`nix run ./nix/test-cluster#cluster -- baremetal-iso`](docs/testing.md)
- canonical bare-metal exact proof runner: [`nix build .#checks.x86_64-linux.baremetal-iso-e2e`](docs/testing.md) then `./result/bin/baremetal-iso-e2e`
- canonical physical-node preflight and handoff: [`nix run ./nix/test-cluster#hardware-smoke -- preflight`](docs/hardware-bringup.md), then `run` or `capture`
- canonical profile guards: [`nix build .#checks.x86_64-linux.canonical-profile-eval-guards`](docs/testing.md), [`nix build .#checks.x86_64-linux.canonical-profile-build-guards`](docs/testing.md)
- supported surface guard: [`nix build .#checks.x86_64-linux.supported-surface-guard`](docs/testing.md) for public docs wording, shipped server API completeness, and high-signal TODO or best-effort markers in the supported provider/backend servers
- VM validation harness: [nix/test-cluster/README.md](nix/test-cluster/README.md)
- work-root budget helper: [`./nix/test-cluster/work-root-budget.sh status`](docs/testing.md), `enforce`, and `prune-proof-logs`
- shared volume notes: [coronafs/README.md](coronafs/README.md)
- apigateway supported scope: [apigateway/README.md](apigateway/README.md)
- nightlight supported scope: [nightlight/README.md](nightlight/README.md)
- creditservice supported scope: [creditservice/README.md](creditservice/README.md)
- k8shost supported scope: [k8shost/README.md](k8shost/README.md)
## Repository Guide
- [docs/README.md](/home/centra/cloud/docs/README.md): documentation entrypoint
- [docs/testing.md](/home/centra/cloud/docs/testing.md): validation path summary
- [docs/component-matrix.md](/home/centra/cloud/docs/component-matrix.md): supported multi-component compositions
- [docs/storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers
- [docs/README.md](docs/README.md): documentation entrypoint
- [docs/testing.md](docs/testing.md): validation path summary
- [docs/component-matrix.md](docs/component-matrix.md): canonical profiles and optional bundles
- [docs/rollout-bundle.md](docs/rollout-bundle.md): rollout-bundle HA, rollback, drain, logs, secrets, and volume contract
- [docs/control-plane-ops.md](docs/control-plane-ops.md): ChainFire membership boundary, FlareDB schema or destructive-DDL boundary, and IAM bootstrap hardening plus signing-key, credential, and mTLS rotation
- [docs/edge-trial-surface.md](docs/edge-trial-surface.md): APIGateway, NightLight, CreditService, trial-surface, and work-root budget contract
- [docs/provider-vm-reality.md](docs/provider-vm-reality.md): PrismNet, FlashDNS, FiberLB, and PlasmaVMC local-KVM proof scope plus artifact contract
- [docs/hardware-bringup.md](docs/hardware-bringup.md): USB/BMC/Redfish preflight, artifact capture, and hardware-smoke handoff
- [docs/storage-benchmarks.md](docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers
- `plans/`: design notes and exploration documents
## Scope
UltraCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products.
UltraCloud is centered on reproducible infrastructure behavior. Optional add-ons such as `creditservice` and `k8shost` remain part of the supported surface only when the documented scope, harness coverage, and public contract stay aligned with what the repository actually ships.
Host-level NixOS rollout validation is also expected to stay reproducible: the `deployer-vm-smoke` VM test now proves that `nix-agent` can activate a prebuilt target system closure directly, without recompiling the stack inside the guest.
Host-level NixOS rollout validation is also expected to stay reproducible: `baremetal-iso-e2e` is now the materialized exact proof runner for the full install path, and `canonical-profile-eval-guards` plus `canonical-profile-build-guards` fail fast when supported outputs drift. `supported-surface-guard` now rejects unfinished public wording, shipped server API stubs, high-signal completeness markers such as `TODO:` or `best-effort` in the supported network or backend servers, and archived helper regressions such as worker netboot or backend scaffolds re-entering the default product surface. `portable-control-plane-regressions` remains the non-KVM developer lane that keeps the main control-plane and rollout boundaries green on TCG-only hosts before the publishable nested-KVM suite is rerun.

411
TODO.md Normal file

@@ -0,0 +1,411 @@
# UltraCloud Baseline TODO (2026-04-10)
- Task: `0fe10731-bdbc-4f8f-8bcc-5f5a16903200`
- Created branch: `task/0fe10731-baseline-todo`
- Base: `origin/main` at `b8ebd24d4e9b2dbe71e34ba09b77092dfa7dd43c`
- Handover policy: the dirty worktree from `task/343c8c57-main-reaggregate` was not reset or reverted; it was carried onto the new branch as-is.
- Purpose of this ticket: pin down, on one sheet, each component's responsibilities, canonical entrypoints, current evidence, unproven items, prioritized issue tickets, and dependencies, and use it as the baseline ticket for subsequent autonomous implementation.
- Investigation inputs: `README.md`, `docs/component-matrix.md`, `docs/testing.md`, `nix/test-cluster/README.md`, `plans/cluster-investigation-2026-03-02/*`, the current `nix/modules/*`, `nix/single-node/*`, `nix/nodes/baremetal-qemu/*`, `nix/test-cluster/*`, and each component's `src/main.rs` / API definitions.
## Canonical Boundary Snapshot
- There are three canonical profiles: `single-node dev`, `3-node HA control plane`, `bare-metal bootstrap`
- The minimal core is `chainfire + flaredb + iam + prismnet + plasmavmc`
- The network provider bundle is `prismnet + flashdns + fiberlb`
- The VM hosting bundle is `plasmavmc + prismnet + coronafs + lightningstor`
- The edge/tenant bundle is `apigateway + nightlight + creditservice`
- The rollout bundle is `deployer + nix-agent + fleet-scheduler + node-agent`
- On the current branch as of 2026-04-10, QEMU/KVM is the canonical local proof, and the bare-metal proof is also handled as `QEMU as hardware` under the same ISO contract.
## 2026-03-02 Failure Split
### Failures from 2026-03-02 that are resolved at the file level on the 2026-04-10 current branch
- `ARCH-001`: the issue where `flake.nix` referenced a missing `docs/.../configuration.nix` is resolved. The canonical sources are now `nix/nodes/vm-cluster/node01`, `node02`, `node03` and `canonical-profile-eval-guards`.
- `ARCH-002`: the missing `disko.nix` reference in the ISO install is resolved. `nix/nodes/baremetal-qemu/control-plane/disko.nix` and `.../worker/disko.nix` are now used directly by `verify-baremetal-iso.sh`.
- `ARCH-003`: the missing Nix wiring for `deployer` is resolved. `nix/modules/deployer.nix`, the package/app/check definitions in `flake.nix`, and `/api/v1/phone-home` on `deployer-server` all exist.
- `TC-001`: the `joinAddr` inconsistency is resolved. The current `chainfire` / `flaredb` modules agree on the `initialPeers` contract.
- `TC-002`: the `creditservice` evaluation failure on `node06` is resolved. The current `nix/test-cluster/node06.nix` imports `creditservice.nix` and also passes `flaredbAddr`.
- `COMP-001` through `COMP-004`: the IAM endpoint injection mismatches are resolved. The `prismnet`, `plasmavmc`, `fiberlb`, `lightningstor`, `flashdns`, and `creditservice` modules now translate to the config keys the binaries actually read.
- `ARCH-004`: the first-boot `leader_url` contract inconsistency is resolved. `nix/modules/first-boot-automation.nix` assumes `http://localhost:8081` / `8082` and `/admin/member/add`.
- `ARCH-005`: the missing first-boot join API in FlareDB is resolved. `flaredb/crates/flaredb-server/src/rest.rs` has `POST /admin/member/add`.
- `3.1 NightLight grpcPort mismatch`: resolved. `nightlight-server` now binds both HTTP and gRPC.
- `ARCH-006` / duplicate `cluster-config` implementations: the `nix-nos/topology.nix`-based duplication seen on 2026-03-02 is no longer present in the current tree; the canonical sources are `nix/lib/cluster-schema.nix` and `nix/modules/ultracloud-cluster.nix`.
- `QLT-001`: the large group of `doCheck = false` entries in `flake.nix` is, at least at the current file level, no longer present.
### Items split off from the 2026-03-02 failures that, as of 2026-04-10, have structural fixes but still lack runtime re-proof
- `VERIFY-001`: on the local AMD/KVM host on 2026-04-10, `supported-surface-guard`, `single-node-trial-vm`, `single-node-quickstart`, `fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`, `./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite`, `canonical-profile-eval-guards`, `portable-control-plane-regressions`, `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `fleet-scheduler-e2e`, `baremetal-iso`, `nix build .#checks.x86_64-linux.baremetal-iso-e2e`, and the built `./result/bin/baremetal-iso-e2e` exact runner were rerun and pass. The only thing not yet re-proven is the physical bare-metal smoke.
- `VERIFY-002`: the bare-metal bootstrap is closed out up to the QEMU ISO proof, but the same contract has not yet been re-proven on USB, BMC, or physical hardware. However, `nix run ./nix/test-cluster#hardware-smoke -- preflight` was added on 2026-04-10, so when transport is absent the blocked state can now be recorded mechanically in `./work/hardware-smoke/latest/status.env` and `missing-requirements.txt`.
- `VERIFY-003`: the config-contract fixes have been re-verified through `run-publishable-kvm-suite.sh` up to the profile with all add-ons enabled. `baremetal-iso-e2e` has also moved to the materialized host-KVM runner, so the remaining work narrows to hardware bring-up.
## First Tranche Backlog
- `TRANCHE-01`: done. The optional bundle health gating for `single-node dev` was fixed on 2026-04-10, resolving the `coronafs` port mismatch and the unmonitored health of `flashdns` / `fiberlb` / `lightningstor`.
- `TRANCHE-02`: `baremetal-iso` and the `baremetal-iso-e2e` exact runner were rerun on the local AMD/KVM host on 2026-04-10. The next stage adds a smoke run on one USB/BMC/physical machine.
- `TRANCHE-03`: done. `nix run ./nix/test-cluster#cluster -- durability-proof` was added on 2026-04-10, fixing the `chainfire` / `flaredb` logical backup/restore and the `deployer` admin pre-register request replay plus restart persistence proof into the product docs and harness.
- `TRANCHE-04`: done. The local `chainfire` default endpoints for `fleet-scheduler`, `nix-agent`, `node-agent`, and `deployer-ctl` were normalized to the canonical `http://127.0.0.1:2379` on 2026-04-10.
- `TRANCHE-05`: done. The supported scope of the `fiberlb` HTTPS health check was spelled out on 2026-04-10: for now only `TCP reachability + HTTP status` without backend TLS certificate verification is the product contract, fixed in docs, guard, and source comments.
- `TRANCHE-06`: done. `k8shost` was fixed as an API/control-plane product on 2026-04-10, with docs, guard, and TODO aligned on the runtime dataplane helpers staying archived non-product.
- `TRANCHE-07`: done. The 2026-04-10 `durability-proof` preserves `lightningstor` distributed-backend node-loss / repair and the `coronafs` controller/node split outage as canonical failure-injection proofs.
- `TRANCHE-08`: done. The `hardware-smoke` preflight/handoff wrapper was added on 2026-04-10 so the physical bring-up of `deployer -> ISO -> first-boot -> nix-agent` can be prepared through a shared USB/BMC/Redfish entrypoint. The blocked-state artifacts for missing transport are also pinned under `./work/hardware-smoke`.
- `TRANCHE-10`: done. `nix run ./nix/test-cluster#cluster -- rollout-soak` was fixed as the longer-run KVM operator lane on 2026-04-10, saving the `draining` maintenance cycle, worker power loss, `deployer` / `fleet-scheduler` / `node-agent` restarts, and fixed-membership `chainfire` / `flaredb` restarts under one artifact root. The fact that steady-state `test-cluster` nodes do not carry `nix-agent.service` is also made explicit through scope marker artifacts.
- `TRANCHE-11`: done. `DEPLOYER-P1-01` and `FLEET-P1-01` were updated to their scope-fixed final state on 2026-04-10, and `rollout-soak` now saves `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `fleet-scheduler-scope-fixed.txt` under `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900`. `deployer` is fixed to one active writer plus optional cold-standby restore, and `fleet-scheduler` is fixed to one drain cycle plus one fail-stop cycle with a 30-second hold on two native-runtime workers as the release boundary.
- `TRANCHE-12`: done. `FDB-P1-01`, `IAM-P1-01`, and `HARNESS-P2-01` were handled as the next stage on 2026-04-10. `run-core-control-plane-ops-proof.sh` saves `scope-fixed-contract.json`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, and `result.json` under `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`; FlareDB destructive DDL and fully automated online migration are scope-fixed as unsupported, and IAM is fixed to signing-key, credential, and mTLS overlap rotation as the supported lifecycle, with multi-node failover unsupported. `work-root-budget.sh` gained `enforce` and `prune-proof-logs`, moving from a disk budget advisory to a stronger local gate and a safer cleanup workflow.
## 2026-04-10 Physical Hardware Bring-Up Pack
- `Task:` `3dba03d3-525b-4079-8c93-90af6a89d32b`
- `Canonical entrypoint:` `nix run ./nix/test-cluster#hardware-smoke -- preflight`, then `run` or `capture`
- `Current preflight artifact root:` `./work/hardware-smoke/latest`
- `Artifact contract:` `status.env`, `missing-requirements.txt`, `kernel-params.txt`, `expected-markers.txt`, `failure-markers.txt`, `operator-handoff.md`, `environment.txt`
- `Bridge to QEMU proof:` hardware wrapper reuses `nixosConfigurations.ultracloud-iso` and the same `ULTRACLOUD_MARKER pre-install.boot.*`, `pre-install.phone-home.complete.*`, `install.disko.complete.*`, `reboot.*`, `post-install.boot.*`, `desired-system-active.*` markers that `verify-baremetal-iso.sh` enforces in the QEMU harness.
- `Blocked-state recording:` when USB device or BMC/Redfish transport is missing, `preflight` records `status=blocked` and the missing transport, kernel-parameter, and capture inputs in `missing-requirements.txt` without pretending the hardware proof ran.
- `Still open:` an actual physical-node execution remains pending until a removable USB target or BMC/Redfish endpoint plus credentials are supplied.
- `TRANCHE-09`: done. `docs/rollout-bundle.md` was added on 2026-04-10, fixing the product contract and proof commands for `deployer` single-writer DR, `nix-agent` health-check/rollback, `node-agent` logs/secrets/volume/upgrade, and `fleet-scheduler` drain/maintenance/failover.
## 2026-04-10 Long-Run Control Plane And Rollout Soak
- `Task:` `07d6137e-6e4c-4158-9142-8920f4f70a76`
- `Canonical entrypoint:` `nix run ./nix/test-cluster#cluster -- rollout-soak`
- `Artifact root:` `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900`
- `Scenario proof:` one planned `node04 -> draining -> active` cycle, one `node05` power-loss and recovery cycle, restart of `deployer.service`, `fleet-scheduler.service`, `node-agent.service` on both worker nodes, and fixed-membership restart of `chainfire.service` plus `flaredb.service` on `node02`.
- `Saved evidence:` `maintenance-during.json`, `maintenance-held.json`, `maintenance-restored.json`, `power-loss-during.json`, `power-loss-held.json`, `power-loss-restored.json`, `deployer-post-restart-nodes.json`, `fleet-scheduler-post-restart.json`, `node04-node-agent-post-restart.json`, `node05-node-agent-post-restart.json`, `chainfire-post-restart-put.json`, `flaredb-post-restart.json`, `post-control-plane-restarts.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, `fleet-scheduler-scope-fixed.txt`, `result.json`.
- `Long-run nix-agent boundary:` steady-state `nix/test-cluster` nodes do not ship `nix-agent.service`, so this soak records `node01-nix-agent-scope.txt` and `node04-nix-agent-scope.txt` instead of pretending a live-cluster `nix-agent` restart happened. The executable `nix-agent` proofs remain `deployer-vm-rollback`, `baremetal-iso`, and `baremetal-iso-e2e`.
- `Result:` PASS on the local AMD/KVM host. `result.json` records `success=true`, `fleet_supported_native_runtime_nodes=2`, `validated_maintenance_cycles=1`, `validated_power_loss_cycles=1`, `soak_hold_secs=30`, and the summary `validated one planned drain cycle and one fail-stop worker-loss cycle on the two-node native-runtime lab, held each degraded state for the configured soak window, restarted deployer or scheduler or agent services, and revalidated fixed-membership control-plane restarts while keeping deployer HA scope-fixed to single-writer recovery`.
## 2026-04-10 Local Executable Baseline
- `Task:` `b1e811fb-158f-415c-a011-64c724e84c5c`
- `Runner:` `nix/test-cluster/run-local-baseline.sh`
- `Log root:` `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c`
- `Local execution policy:` `ULTRACLOUD_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work`, `TMPDIR=/mnt/d2/centra/photoncloud-monorepo/work/tmp`, `XDG_CACHE_HOME=/mnt/d2/centra/photoncloud-monorepo/work/xdg-cache`, `PHOTON_CLUSTER_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster`, `PHOTON_VM_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/state`, `PHOTON_CLUSTER_VDE_SWITCH_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/vde-switch`, and remote builders disabled via `NIX_CONFIG builders =` (a sketch reproducing this policy follows at the end of this section).
- `Host evidence:` `environment.txt` records `host_cpu_count=12`, `ultracloud_local_nix_max_jobs=6`, `ultracloud_local_nix_build_cores=2`, `photon_cluster_nix_max_jobs=6`, `photon_cluster_nix_build_cores=2`, `nix_builders=` (empty), `kvm_access=rw`, and `nested_param_value=1`.
- `Guard/build checks:`
- `canonical-profile-eval-guards`: PASS. command `nix build .#checks.x86_64-linux.canonical-profile-eval-guards --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/canonical-profile-eval-guards.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/canonical-profile-eval-guards.log`.
- `supported-surface-guard`: PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/supported-surface-guard.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/supported-surface-guard.log`.
- `portable-control-plane-regressions`: PASS. command `nix build .#checks.x86_64-linux.portable-control-plane-regressions`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/portable-control-plane-regressions.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/portable-control-plane-regressions.log`.
- `deployer-bootstrap-e2e`: PASS. command `nix build .#checks.x86_64-linux.deployer-bootstrap-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/deployer-bootstrap-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/deployer-bootstrap-e2e.log`.
- `host-lifecycle-e2e`: PASS. command `nix build .#checks.x86_64-linux.host-lifecycle-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/host-lifecycle-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/host-lifecycle-e2e.log`.
- `fleet-scheduler-e2e`: PASS. command `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fleet-scheduler-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fleet-scheduler-e2e.log`.
- `Runtime path checks:`
- `single-node-quickstart`: PASS. command `nix run .#single-node-quickstart`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/single-node-quickstart.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/single-node-quickstart.log`; success marker `single-node quickstart smoke passed`.
- `baremetal-iso`: PASS. command `nix run ./nix/test-cluster#cluster -- baremetal-iso`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/baremetal-iso.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/baremetal-iso.log`; success markers `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, `Canonical ISO bare-metal QEMU verification succeeded`.
- `fresh-smoke`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-smoke`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fresh-smoke.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baselines/b1e811fb-158f-415c-a011-64c724e84c5c/fresh-smoke.log`; success marker `Cluster validation succeeded`.
- `2026-04-10 execution failures:` none. The 2026-03-02 historical failure split stands as described in the section above; this local AMD/KVM baseline did not reproduce any of the required commands as failures.
- `2026-04-10 observed non-failure risk:`
- `HARNESS-OBS-20260410-01`: resolved on 2026-04-10. The stale VM cleanup in `nix/test-cluster/run-cluster.sh` was changed to collect only PIDs whose cmdline confirms the current `vm_dir` / `vde_switch_dir`, and the path-independent `hostfwd=tcp::${port}-:22` fallback was removed.
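A hedged sketch reproducing the local execution policy recorded in this section. The variable names and paths are copied from the baseline record above; pinning `builders =` through `NIX_CONFIG` is the documented way remote builders were disabled for this run:
```bash
# Keep all build and VM state under the large work disk and forbid remote builders.
export ULTRACLOUD_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work
export TMPDIR=/mnt/d2/centra/photoncloud-monorepo/work/tmp
export XDG_CACHE_HOME=/mnt/d2/centra/photoncloud-monorepo/work/xdg-cache
export PHOTON_CLUSTER_WORK_ROOT=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster
export PHOTON_VM_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/state
export PHOTON_CLUSTER_VDE_SWITCH_DIR=/mnt/d2/centra/photoncloud-monorepo/work/test-cluster/vde-switch
export NIX_CONFIG='builders ='
```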
## 2026-04-10 Bare-Metal Canonical Path
- `Task:` `6d9f45e4-1954-4a0b-b886-c61482db6c3c`
- `QEMU-as-hardware runtime proof:` PASS. command `nix run ./nix/test-cluster#cluster -- baremetal-iso`; log root `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso`; evidence files `environment.txt`, `deployer.log`, `chainfire.log`, `control-plane.serial.log`, `worker.serial.log`.
- `Runtime PASS markers:` `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, `Canonical ISO bare-metal QEMU verification succeeded`.
- `Runtime contract now proven:`
- reusable node classes own `install_plan.nixos_configuration`, `install_plan.disko_config_path`, and stable `install_plan.target_disk_by_id`
- nodes carry identity plus desired-system overrides only; when a cache-backed prebuilt closure is available they now publish `desired_system.target_system` to converge to the exact shipped system instead of a dirty local rebuild
- installed nodes now keep `nix-agent` alive across their own `switch-to-configuration` transaction long enough for activation to finish, which restored post-install `chainfire` and `nix-agent` convergence
- `Historical blocker (resolved on 2026-04-10):` direct build-time execution of `nix build .#checks.x86_64-linux.baremetal-iso-e2e` ran under sandboxed `nixbld1` and fell back to `TCG`. The exact lane is now a materialized runner: the check build succeeds quickly and emits `./result/bin/baremetal-iso-e2e`, and that runner executes the same `verify-baremetal-iso.sh` harness with host KVM and logs under `./work`.
## 2026-04-10 Responsibility And Minimal-Surface Alignment
- `Task:` `65a13e46-1376-4f37-a5c1-e520b5b376ec`
- `Authoring source decision:` `ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is now documented in `README.md`, `docs/README.md`, and `docs/testing.md` as the only supported cluster authoring source. `nix-nos` is explicitly reduced to legacy compatibility plus low-level network primitives.
- `Module boundary alignment:` `services.deployer`, `services.fleet-scheduler`, `services.nix-agent`, and `services.node-agent` descriptions now agree on the canonical layering `ultracloud.cluster -> deployer -> (nix-agent | fleet-scheduler -> node-agent)`.
- `Minimal-surface friction reduction:` `services.plasmavmc` and `services.k8shost` now wait only for local backing services that they actually use. When explicit remote endpoints are configured, they no longer hard-wire unrelated local control-plane units into startup ordering, which preserves a lighter standalone story for the VM-platform core and remote-provider deployments.
- `Validation alignment:` `supported-surface-guard` now requires contract markers for the supported authoring source, the constrained `nix-nos` role, and the standalone VM-platform story so docs drift becomes a failing regression.
- `Still open:` the rollout-stack default port mismatch is resolved; the remaining items are hardware bring-up and a longer-duration durability proof.
## 2026-04-10 Supported Surface Final Proof
- `Task:` `32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0`
- `Guard + minimal-trial proof root:` `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final`
- `supported-surface-guard`: PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/supported-surface-guard.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/supported-surface-guard.log`.
- `single-node-trial-vm`: PASS. command `nix build .#single-node-trial-vm --no-link --print-out-paths`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-trial-vm.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-trial-vm.log`; output path `/nix/store/1nq4pkadm3lbxmhkr54iz7lgjd6vm7z3-nixos-vm`.
- `single-node-quickstart`: PASS. command `nix run .#single-node-quickstart`; meta `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-quickstart.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final/single-node-quickstart.log`; success marker `single-node quickstart smoke passed`.
- `Publishable KVM suite root:` `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite`
- `environment.txt` captures `host_cpu_count=12`, `local_nix_max_jobs=6`, `local_nix_build_cores=2`, `photon_cluster_nix_max_jobs=6`, `photon_cluster_nix_build_cores=2`, `kvm_present=yes`, `kvm_access=rw`, `kvm_amd_nested=1`, `nix_builders=`, `finished_at=2026-04-10T09:36:09+09:00`, `exit_status=0`.
- `fresh-smoke`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-smoke`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-smoke.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-smoke.log`; success marker `Cluster validation succeeded`.
- `fresh-demo-vm-webapp`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-demo-vm-webapp.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-demo-vm-webapp.log`; success markers include `PHOTON_VM_DEMO_WEB_READY` and the guest web health check on `http://10.62.10.10:8080/health`.
- `fresh-matrix`: PASS. command `nix run ./nix/test-cluster#cluster -- fresh-matrix`; meta `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-matrix.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/fresh-matrix.log`; success marker `Component matrix validation succeeded`.
- `run-publishable-kvm-suite`: PASS. command `./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite`; environment `/mnt/d2/centra/photoncloud-monorepo/work/publishable-kvm-suite/environment.txt`; final stdout marker `publishable KVM suite passed; logs in ./work/publishable-kvm-suite`.
- `Fixed while proving the surface:`
- `NODEAGENT-FIX-20260410-01`: reboot-time PID reuse could make `node-agent` treat `native-daemon` as the resurrected `native-web` instance after worker reboot, stalling `fresh-smoke` at native runtime recovery. `deployer/crates/node-agent/src/process.rs` now persists argv + boot-id metadata, validates the live `/proc/<pid>/cmdline`, and refuses to signal or reuse mismatched processes from stale pidfiles.
- `HARNESS-FIX-20260410-01`: `run-publishable-kvm-suite` exposed a control-plane LightningStor bootstrap race that was not consistently hit by ad-hoc reruns. `nix/test-cluster/node01.nix` now holds `lightningstor.service` behind explicit local control-plane and worker-replica TCP readiness with a longer start timeout, and `nix/test-cluster/run-cluster.sh` now waits for the worker storage agents before gating the control-plane LightningStor unit.
- `Still open after the final supported-surface proof:` real hardware `baremetal-iso` smoke.
## 2026-04-10 baremetal-iso-e2e Local-KVM Exact Lane
- `Task:` `0de75570-dabd-471b-95fe-5898c54e2e8c`
- `Check build output:` `nix build .#checks.x86_64-linux.baremetal-iso-e2e` now materializes `./result/bin/baremetal-iso-e2e` instead of trying to execute QEMU inside the daemon sandbox.
- `Exact proof root:` `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`
- `Outer runner evidence:` `environment.txt` records `execution_model=materialized-check-runner`, `nix_builders=` (empty), `kvm_present=yes`, `kvm_access=rw`, and the local CPU-derived Nix parallelism.
- `Exact check build:` PASS. command `nix build .#checks.x86_64-linux.baremetal-iso-e2e`; output path is a runner package that ships `bin/baremetal-iso-e2e` plus `share/ultracloud/README.txt` documenting the sandbox/TCG reason for the materialized execution model.
- `Exact runner:` PASS. command `./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; meta `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/baremetal-iso-e2e.meta`; log `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/baremetal-iso-e2e.log`.
- `Inner runtime evidence:` state dir `/mnt/d2/centra/photoncloud-monorepo/work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c/state`; `state/environment.txt` records `vm_accelerator_mode=kvm`; success markers in `baremetal-iso-e2e.log` include `ULTRACLOUD_MARKER desired-system-active.iso-control-plane-01`, `ULTRACLOUD_MARKER desired-system-active.iso-worker-01`, and `Canonical ISO bare-metal QEMU verification succeeded`.
- `Remaining delta vs direct runtime proof:` the harness is now identical because both `nix run ./nix/test-cluster#cluster -- baremetal-iso` and `./result/bin/baremetal-iso-e2e` call `nix/test-cluster/verify-baremetal-iso.sh`. The only intentional difference is execution entrypoint: `nix build` materializes the runner because daemon-sandboxed `nixbld` builds would otherwise lose host KVM and degrade to `TCG`.
## 2026-04-10 Durability And Product-Boundary Hardening
- `Task:` `541356be-b289-4583-ba40-cbf46b0f9680`
- `Guard rerun:` PASS. command `nix build .#checks.x86_64-linux.supported-surface-guard --no-link`.
- `Runtime rerun:` PASS. command `nix run ./nix/test-cluster#cluster -- fresh-matrix`; success marker `Component matrix validation succeeded`.
- `Durability proof:` PASS. command `nix run ./nix/test-cluster#cluster -- durability-proof`; artifact root `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/20260410T120618+0900`; convenience symlink `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/latest`.
- `ChainFire proof:` `chainfire-backup-response.json` and `chainfire-restored-response.json` return the same logical payload, and `chainfire-after-delete.out` returns 404 after the DELETE.
- `FlareDB proof:` `flaredb-backup.json` and `flaredb-restored.json` return the same SQL row, and `flaredb-after-delete.json` returns an empty set.
- `Deployer proof:` `deployer-pre-register-request.json` serves as the backup artifact; `deployer-backup-list.json` observes the pre-registered node, `deployer-post-restart-list.json` confirms it survives a `deployer.service` restart, and `deployer-replayed-list.json` confirms the summary is unchanged after replaying the same request. `deployer_restore_mode` in `result.json` is `admin pre-register request replay with pre/post-restart list verification`.
- `CoronaFS failure injection:` `coronafs-node04-local-state.json` keeps `node_local=true` and the materialized path while the controller is down, and `coronafs-node04-capabilities.json` preserves the node-only capability split (`supports_controller_api=false`, `supports_node_api=true`).
- `LightningStor failure injection:` `lightningstor-put-during-node05-outage.json`, `lightningstor-head-during-node05-outage.json`, `lightningstor-object-during-node05-outage.txt`, and `lightningstor-object-after-repair.txt` capture the write during the node05 outage and the read-back after repair.
- `FiberLB supported limitation:` `fiberlb/crates/fiberlb-server/src/healthcheck.rs`, `README.md`, `docs/testing.md`, `docs/component-matrix.md`, and `flake.nix` now fix HTTPS backend health checking as a limited contract without TLS certificate verification.
- `k8shost boundary:` `README.md`, `docs/testing.md`, `docs/component-matrix.md`, `k8shost/README.md`, `nix/test-cluster/README.md`, and `flake.nix` now pin `k8shost` to the API/control-plane product surface only and consistently mark `k8shost-cni`, `k8shost-controllers`, and `lightningstor-csi` as archived non-product.
- `Proof-lane hardening done during this tranche:` the first `durability-proof` run failed on an unsupported `DROP TABLE` in the FlareDB cleanup tail, so the lane was reworked around unique namespaces; the next run failed on an unbound local in the cleanup trap, so trap cleanup was fixed with `${var:-}` defaults and a guarded tunnel shutdown. The current lane exits zero and leaves its artifacts.
## 2026-04-10 Rollout Bundle HA And DR Hardening
- `Task:` `a41343c5-116e-4313-8751-b333472f931c`
- `Operator doc:` `docs/rollout-bundle.md`
- `Verification reruns:` `nix build .#checks.x86_64-linux.portable-control-plane-regressions`, `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`, and `nix build .#checks.x86_64-linux.deployer-vm-rollback` all passed on 2026-04-10 with local-only Nix settings.
- `Durability rerun:` `nix run ./nix/test-cluster#cluster -- durability-proof` passed again from a clean KVM cluster and wrote artifacts under `/mnt/d2/centra/photoncloud-monorepo/work/durability-proof/20260410T123535+0900`.
- `Supported deployer boundary:` single-writer deployer with restart-in-place or cold-standby restore. ChainFire-backed multi-instance failover is explicitly unsupported for now and the restore runbook is fixed to `cluster-state apply + preserved pre-register request replay + admin verification`.
- `Nix-agent proof:` `nix build .#checks.x86_64-linux.deployer-vm-rollback` passed on 2026-04-10 and is now the canonical reproducible proof for `health_check_command`, rollback, and `rolled-back` partial failure recovery semantics.
- `Fleet-scheduler semantics:` `fresh-smoke` and `fleet-scheduler-e2e` remain the release proofs for short-lived `draining` maintenance, fail-stop worker loss, and replica restoration. Long-duration maintenance and large-cluster drain choreography stay scope-limited rather than silently implied.
- `Node-agent contract:` product docs now fix `${stateDir}/pids/*.log` as the per-instance log location, `${stateDir}/pids/*.meta.json` as stale-pid metadata, secret delivery as caller-provided env or mounted files only, host-path volumes as pass-through only, and upgrades as replace-and-reconcile rather than in-place patching.
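
As a hedged illustration of that node-agent contract, the sketch below shows where an operator would look for per-instance logs and stale-pid metadata on a worker. The `stateDir` value is an assumption, not a verified default; substitute whatever the module configuration sets.

```bash
# Minimal inspection sketch for the node-agent contract described above.
# ASSUMPTION: stateDir is /var/lib/node-agent; replace it with the value
# configured through nix/modules/node-agent.nix on your hosts.
state_dir=/var/lib/node-agent

# Per-instance logs live next to the pidfiles.
ls -l "${state_dir}/pids/"*.log

# Stale-pid metadata (argv + boot-id) used to refuse PID reuse after reboot.
cat "${state_dir}/pids/"*.meta.json

# Secrets are delivered only as caller-provided env or mounted files, so
# nothing secret should ever appear inside the pid directory itself.
```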
## 2026-04-10 Core Control Plane Operator Lifecycle Proofs
- `Task:` `dcdc961a-0aa6-47c3-aeba-a1c67bca27b7`
- `Operator doc:` `docs/control-plane-ops.md`
- `Focused proof:` `./nix/test-cluster/run-core-control-plane-ops-proof.sh /mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`
- `Focused proof result:` passed on 2026-04-10 and wrote `result.json`, `scope-fixed-contract.json`, `iam-key-rotation-tests.log`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, and the contract-marker logs under `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`.
- `Supported-surface guard:` rerun after the doc and proof updates so the public lifecycle contract is now guarded alongside the existing supported-surface wording.
- `ChainFire boundary:` dynamic membership, replace-node, and scale-out are now explicit non-supported actions on the product surface. The supported path is fixed-membership restore or whole-cluster replacement anchored by the existing `durability-proof` backup/restore lane.
- `FlareDB boundary:` online migration and schema evolution are now fixed to an additive-first, backup/restore-gated operator contract. Destructive DDL and fully automated online migration are explicit non-supported boundaries for this release rather than implied future promises.
- `IAM boundary:` bootstrap hardening now requires explicit admin token, signing key, and 32-byte `IAM_CRED_MASTER_KEY` inputs in docs. The standalone proof reruns signing-key rotation, credential overlap-and-revoke rotation, and mTLS overlap-and-cutover rotation tests while checking the hardening markers in `iam-server`; multi-node IAM failover remains unsupported.
## 2026-04-10 Edge And Trial-Surface Productization
- `Task:` `cc24ac5a-b940-4a32-9136-d706ecadf875`
- `Operator doc:` `docs/edge-trial-surface.md`
- `Component docs:` `apigateway/README.md`, `nightlight/README.md`, and `creditservice/README.md`
- `Helper:` `./nix/test-cluster/work-root-budget.sh status` now reports `./work` disk usage, soft budgets, and cleanup plus `nix store gc` guidance without mutating state by default (usage is sketched after this list).
- `Edge bundle boundary:` APIGateway is now documented as stateless replicated instances behind external L4 or VIP distribution, but restart-based rollout remains the only supported config distribution or reload model proven on this branch. NightLight is fixed to a single-node WAL/snapshot product shape with process-wide retention, and CreditService export plus migration is fixed to offline export/import or backend-native snapshots instead of live mixed-writer migration.
- `Trial boundary:` `single-node-trial-vm` and `single-node-quickstart` remain the only supported lightweight trial surface. OCI/Docker remains intentionally unsupported because it would not prove the same guest-kernel, KVM, `/dev/net/tun`, and OVS/libvirt contract.
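
A minimal usage sketch for the helper named above, run from the repository root. The `enforce` and `prune-proof-logs` subcommands are taken from the harness notes later in this survey and are assumed to behave as described there.

```bash
# Report ./work disk usage against the soft budgets without mutating state.
./nix/test-cluster/work-root-budget.sh status

# Stronger local budget gate and dated-proof cleanup; run only after
# reviewing the status output.
./nix/test-cluster/work-root-budget.sh enforce
./nix/test-cluster/work-root-budget.sh prune-proof-logs

# The helper only advises on Nix store growth; reclaiming store space is manual.
nix store gc
```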
## 2026-04-10 Provider And VM-Hosting Reality Proof
- `Task:` `41a074a3-dc5c-42fc-979e-c8ebf9919d55`
- `Focused proof lane:` `nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof`
- `Focused proof result:` passed on 2026-04-10 and wrote `result.json`, `meta.json`, journals, and provider or VM-hosting artifacts under `/mnt/d2/centra/photoncloud-monorepo/work/provider-vm-reality-proof/20260410T135827+0900`.
- `Provider artifacts:` `network-provider/prismnet-port-create.json`, `network-provider/prismnet-security-group-after-add.json`, `network-provider/flashdns-workload-authoritative-answer.txt`, `network-provider/flashdns-service-authoritative-answer.txt`, `network-provider/fiberlb-drain-summary.txt`, `network-provider/fiberlb-tcp-health-before-drain.txt`, and `network-provider/fiberlb-tcp-health-after-restore.txt` fix the current local-KVM proof to tenant network lifecycle, authoritative DNS answers, and listener drain or re-convergence.
- `VM-hosting artifacts:` `vm-hosting/vm-create-response.json`, `vm-hosting/root-volume-before-migration.json`, `vm-hosting/root-volume-after-migration.json`, `vm-hosting/data-volume-after-migration.json`, `vm-hosting/migration-summary.json`, `vm-hosting/prismnet-port-after-migration.json`, and `vm-hosting/demo-state-after-post-migration-restart.json` fix the current release proof to KVM shared-storage migration, CoronaFS handoff, and post-migration restart on the worker pair.
- `Scope-fixed gaps:` real OVS/OVN dataplane validation, native BGP or BFD peer interop with hardware VIP ownership, and real-hardware VM migration or storage handoff remain outside the supported local-KVM surface and are now explicit docs or guard limits rather than implied release claims.
## chainfire
- `Responsibility:` the replicated coordination store for all of UltraCloud. Holds KV, leases, watches, the cluster membership view, and the state anchor for the rollout stack.
- `Canonical entrypoint:` `nix/modules/chainfire.nix`; `chainfire/crates/chainfire-server/src/main.rs`; the supported API is `chainfire/proto/chainfire.proto`.
- `Current evidence:` `README.md` names `MemberList` / `Status` as the supported surface (see the sketch after this list); `chainfire/crates/chainfire-server/src/rest.rs` carries health and member add; `docs/testing.md` defines the quickstart and HA proofs; `nix/single-node/base.nix` and `nix/nodes/vm-cluster/*` are the canonical wiring; the 2026-04-10 `durability-proof` captured logical KV backup/restore in `chainfire-backup-response.json` / `chainfire-restored-response.json`, and `rollout-soak` captured the live fixed-membership post-restart proof in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/chainfire-post-restart-put.json` and `post-control-plane-restarts.json`.
- `Not yet proven:` the rolling upgrade procedure; membership changes on real 3-node hardware; the post-power-loss recovery runbook.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `CF-P1-01` moved on 2026-04-10 from a scope freeze to carrying a live restart proof. Dynamic membership, scale-out, and replace-node remain explicitly unsupported on the supported surface, but fixed-membership restart itself was promoted to a live KVM proof by `rollout-soak`. The only remaining next step is a dedicated KVM proof if live membership mutation is ever productized.
- `P2:` `CF-P2-01` internal pruning of `chainfire-core` is in progress on the current branch, so the public boundary and the workspace-internal boundary still need a final cleanup.
- `Dependencies:` local disk; host networking; referenced by `flaredb`, `iam`, `deployer`, `fleet-scheduler`, `nix-agent`, `node-agent`, and `coronafs`.
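
A hedged sketch of querying the supported cluster surface with `grpcurl`. The client port follows the canonical `http://127.0.0.1:2379` endpoint used elsewhere in this survey; the fully qualified service name `chainfire.Cluster` is an assumption and should be confirmed against `chainfire/proto/chainfire.proto` before use.

```bash
# ASSUMPTION: the proto package resolves the Cluster service as chainfire.Cluster.
grpcurl -plaintext \
  -import-path chainfire/proto -proto chainfire.proto \
  127.0.0.1:2379 chainfire.Cluster/MemberList

grpcurl -plaintext \
  -import-path chainfire/proto -proto chainfire.proto \
  127.0.0.1:2379 chainfire.Cluster/Status
```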
## flaredb
- `Responsibility:` the replicated KV/SQL metadata store. Receives each service's metadata, quota state, object metadata, and tenant network state.
- `Canonical entrypoint:` `nix/modules/flaredb.nix`; `flaredb/crates/flaredb-server/src/main.rs`; REST lives in `flaredb/crates/flaredb-server/src/rest.rs`.
- `Current evidence:` `README.md` states that `POST /api/v1/sql` and `GET /api/v1/tables` are supported (see the sketch after this list); `flaredb/crates/flaredb-server/src/rest.rs` carries SQL/KV/scan/member add; `docs/testing.md` explains the control-plane proof and the `fresh-matrix` dependency; `nix/modules/flaredb.nix` generates `pdAddr` and the namespace mode; the 2026-04-10 `rollout-soak` captured additive SQL after a member restart in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/flaredb-post-restart-create.json`, `flaredb-post-restart-insert.json`, and `flaredb-post-restart.json`, and `run-core-control-plane-ops-proof.sh` fixed destructive DDL and fully automated online migration outside the supported surface in `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00/scope-fixed-contract.json` and `flaredb-migration-contract.log`.
- `Not yet proven:` storage pressure and multi-node repair on real hardware. Fully automated online migration and destructive-DDL online cutover are intentionally unsupported in this release.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `FDB-P1-01` became scope-fixed final on 2026-04-10. Logical backup/restore of the supported SQL/KV surface is already fixed by `durability-proof` and the docs, and online migration / schema evolution was settled as additive-first with the backup/restore baseline as a precondition. `rollout-soak` captures additive SQL after a member restart as a live KVM artifact, and `run-core-control-plane-ops-proof.sh` fixed destructive DDL and fully automated online migration outside the supported surface in `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00/scope-fixed-contract.json`. Any future work would treat a destructive online migration proof as a scope extension in a separate tranche.
- `P2:` `FDB-P2-01` the per-namespace `strong` / `eventual` policy is buried in module defaults and is still weak as an operator-facing contract.
- `Dependencies:` uses `chainfire` for placement/coordination; local disk; referenced by `iam`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `creditservice`, and `k8shost`.
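
A hedged sketch against the supported FlareDB REST surface. The HTTP port and the exact JSON field name for the statement are assumptions for illustration; confirm both against `flaredb/crates/flaredb-server/src/rest.rs` and `nix/modules/flaredb.nix` before running.

```bash
FLAREDB=http://127.0.0.1:8000   # ASSUMPTION: placeholder listen address

# Additive, supported-style SQL (destructive DDL stays off the product surface).
curl -sS -X POST "$FLAREDB/api/v1/sql" \
  -H 'Content-Type: application/json' \
  -d '{"sql": "CREATE TABLE IF NOT EXISTS survey_demo (id INT, note TEXT)"}'

# List tables to confirm the additive change landed.
curl -sS "$FLAREDB/api/v1/tables"
```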
## iam
- `Responsibility:` identity, token issuance, authn, authz, and tenant principal management.
- `Canonical entrypoint:` `nix/modules/iam.nix`; `iam/crates/iam-server/src/main.rs`; the API package is `iam/crates/iam-api/src/lib.rs`.
- `Current evidence:` `README.md` and `docs/component-matrix.md` treat it as a core component; `nix/modules/iam.nix` canonically generates the `chainfire` / `flaredb` connections; the `iam-authn`, `iam-authz`, and `iam-store` crates are split out; `fresh-matrix` and the gateway path assume IAM via credit/k8shost/plasmavmc; `run-core-control-plane-ops-proof.sh` saved `iam-key-rotation-tests.log`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, `scope-fixed-contract.json`, and `result.json` under `/mnt/d2/centra/photoncloud-monorepo/work/core-control-plane-ops-proof/20260410T172148+09:00`, fixing bootstrap hardening, signing-key rotation, credential overlap rotation, and mTLS overlap rotation as standalone proofs.
- `Not yet proven:` multi-node IAM failover; a same-lane lifecycle proof across the full backend matrix.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `IAM-P1-01` became scope-fixed final on 2026-04-10. Bootstrap hardening and token/signing-key rotation are fixed standalone in `docs/control-plane-ops.md` and `run-core-control-plane-ops-proof.sh`, and the same proof root now also captures credential overlap-and-revoke rotation and mTLS overlap-and-cutover rotation. Multi-node IAM failover was moved explicitly outside the supported surface. Any future work would treat a clustered IAM failover proof as a scope extension in a separate tranche.
- `P2:` `IAM-P2-01` the harness does not yet cover the full `flaredb` / `postgres` / `sqlite` / `memory` backend matrix.
- `Dependencies:` `flaredb` is the primary storage; optional `chainfire`; consumers are `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `creditservice`, `k8shost`, and `apigateway`.
## prismnet
- `Responsibility:` the tenant network control plane. Handles VPCs, subnets, ports, routers, security groups, and service IP pools.
- `Canonical entrypoint:` `nix/modules/prismnet.nix`; `prismnet/crates/prismnet-server/src/main.rs`; the API is `prismnet/crates/prismnet-api/proto/prismnet.proto`.
- `Current evidence:` `docs/testing.md` and `README.md` name VPC/subnet/port plus security-group ACL add/remove under `fresh-matrix` as the canonical proof; `prismnet/crates/prismnet-server/src/services/*` holds the service implementations; `prismnet/crates/prismnet-server/src/ovn/client.rs` carries the OVN client; `nix/modules/prismnet.nix` generates the binary-consumed config.
- `Not yet proven:` the real-hardware OVS/OVN dataplane; a real-hardware proof for the DHCP/metadata service; multi-rack network integration.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `PRISMNET-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves VPC/subnet/port lifecycle, security-group ACL add/remove, and attached-VM networking artifacts to a dated root in the local KVM lab. The unresolved next step is promoting the real OVS/OVN dataplane and hardware-switch integration to a release proof.
- `P2:` `PRISMNET-P2-01` `ovn/mock.rs` remains close by, so the boundary between the supported path and the archived/test path needs continued monitoring.
- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; consumers are `flashdns`, `fiberlb`, `plasmavmc`, and `k8shost`.
## flashdns
- `Responsibility:` authoritative DNS publication. Holds tenant records, reverse zones, and the DNS handlers.
- `Canonical entrypoint:` `nix/modules/flashdns.nix`; `flashdns/crates/flashdns-server/src/main.rs`; `flashdns/crates/flashdns-server/src/dns/*`
- `Current evidence:` `docs/testing.md` and `README.md` name record publication under `fresh-matrix` as the canonical proof (see the sketch after this list); the `flashdns` server holds the record/zone/reverse-zone services; `nix/modules/flashdns.nix` generates the binary-consumed config.
- `Not yet proven:` real port 53 exposure; upstream/secondary integration; failover with real network gear.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `FLASHDNS-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves authoritative workload/service answers to a dated root, so local-KVM publication evidence entered the release lane. The unresolved next step is extending real port 53 exposure and upstream/secondary interop to a hardware or external-network proof.
- `P2:` `FLASHDNS-P2-01` was resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; publication sources are `k8shost` and `fleet-scheduler`.
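
A hedged sketch of checking an authoritative answer from FlashDNS inside the KVM cluster. The listen address, port, and record name are placeholders, not verified values; the proof lane records the real answers in `flashdns-workload-authoritative-answer.txt` under the dated proof root.

```bash
# ASSUMPTIONS: 10.62.10.2 / port 10053 / the record name are illustrative only.
dig @10.62.10.2 -p 10053 +norecurse demo-workload.cluster.internal A
```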
## fiberlb
- `Responsibility:` service publication, VIPs, L4-L7 load balancing, and native BGP advertisement.
- `Canonical entrypoint:` `nix/modules/fiberlb.nix`; `fiberlb/crates/fiberlb-server/src/main.rs`; the dataplane is `dataplane.rs`, `l7_dataplane.rs`, `vip_manager.rs`, and `bgp_client.rs`.
- `Current evidence:` `README.md` and `docs/testing.md` name TCP plus TLS-terminated `Https` / `TerminatedHttps` listeners under `fresh-matrix` as the canonical proof; the server code carries native BGP/BFD, VIP ownership, the TLS store, and the L7 dataplane; the L4 algorithms have in-tree tests.
- `Not yet proven:` interop with real-hardware BGP peers; a hardware proof of L2/VIP ownership; IPv6 and mixed peer topologies.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `FIBERLB-P1-01` became scope-fixed on 2026-04-10. The HTTPS health check in `fiberlb/crates/fiberlb-server/src/healthcheck.rs` still skips backend TLS certificate verification, but the reason and the supported scope (`TCP reachability + HTTP status`) are fixed in the docs, the guard, and a source comment (see the sketch after this list). CA-aware verification stays in a future tranche.
- `P1:` `FIBERLB-P1-02` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves listener publication, backend disable, drain, restore, and re-convergence artifacts to a dated root. The unresolved next step is extending native BGP/BFD peer interop and hardware VIP ownership to a real network proof.
- `P2:` `FIBERLB-P2-01` was resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; publication consumers are `k8shost` and `fleet-scheduler`; real network peers are required.
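
The limited HTTPS health contract can be illustrated with a curl analogy. This is not the `fiberlb-server` implementation; it only mirrors the documented semantics: reach the backend over TCP/TLS without certificate verification and inspect the HTTP status code alone.

```bash
backend=https://127.0.0.1:8443/healthz   # placeholder backend URL
# -k skips certificate verification, matching the scoped contract; only the
# status code is inspected.
code=$(curl -sk -o /dev/null -w '%{http_code}' "$backend")
[ "$code" = "200" ] && echo healthy || echo unhealthy
```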
## plasmavmc
- `Responsibility:` the tenant VM control plane and worker agent. Holds VM lifecycle, image materialization, worker registration, and hypervisor integration.
- `Canonical entrypoint:` `nix/modules/plasmavmc.nix`; `plasmavmc/crates/plasmavmc-server/src/main.rs`; the supported public backend is `plasmavmc-kvm`.
- `Current evidence:` `README.md` states the KVM-only public contract; `docs/testing.md` makes `HYPERVISOR_TYPE_KVM` the canonical proof via `single-node-quickstart`, `fresh-smoke`, and `fresh-matrix`; `vm_service.rs` places anything other than `HYPERVISOR_TYPE_KVM` outside the public surface; `volume_manager.rs` carries the `coronafs` / `lightningstor` integration.
- `Not yet proven:` migration / storage handoff on real hardware; long-running guest upgrades; recovery under combined network and storage faults.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `PLASMAVMC-P1-01` was narrowed on 2026-04-10. `provider-vm-reality-proof` now saves shared-storage migration, PrismNet-attached post-migration networking, CoronaFS handoff, and post-migration restart state to a dated root. The unresolved next step is adding a release proof for real-hardware migration and storage handoff.
- `P2:` `PLASMAVMC-P2-01` archived Firecracker / mvisor code remains in-tree, so leakage back into the supported surface must stay guarded.
- `Dependencies:` `iam`, `flaredb`, `prismnet`, optional `chainfire`, `lightningstor`, `coronafs`, and host KVM/QEMU.
## coronafs
- `Responsibility:` the mutable VM volume layer. Manages raw volumes and exports them to workers with `qemu-nbd` (see the sketch after this list).
- `Canonical entrypoint:` `nix/modules/coronafs.nix`; `coronafs/crates/coronafs-server/src/main.rs`; the product description is `coronafs/README.md`.
- `Current evidence:` `coronafs/README.md` states the split as the mutable VM-volume layer; `coronafs-server` exposes `/healthz` and the volume/export API; `docs/testing.md` covers `plasmavmc + coronafs + lightningstor` under `fresh-matrix`; `plasmavmc/volume_manager.rs` carries a deep integration.
- `Not yet proven:` long-duration durability of recovery after export interruption; the latency budget on real disks and real networks.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `CORONAFS-P1-01` was resolved on 2026-04-10. The quickstart health URL in `nix/single-node/surface.nix` was corrected to `http://127.0.0.1:50088/healthz`.
- `P1:` `CORONAFS-P1-02` was resolved on 2026-04-10. `durability-proof` now has a canonical failure-injection lane that verifies node-local materialized volume reads and the node-only capability split during a controller outage.
- `P2:` `CORONAFS-P2-01` a storage benchmark exists, but the recovery path still carries too little weight in the canonical publish gate.
- `Dependencies:` `qemu-nbd`, `qemu-img`, local disk; optional `chainfire` metadata backend; the primary consumer is `plasmavmc`.
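
A mechanism-only sketch of the raw `qemu-nbd` export pattern this section attributes to CoronaFS. It is not the `coronafs-server` API; paths, export names, and the port are placeholders.

```bash
# Create a throwaway raw volume to stand in for a managed CoronaFS volume.
qemu-img create -f raw /tmp/demo-volume.img 1G

# Export the raw volume over NBD (default port 10809) and background the server.
qemu-nbd --format raw --export-name demo-volume --fork /tmp/demo-volume.img

# A worker-side consumer can then inspect or attach the export, for example:
qemu-img info nbd://127.0.0.1:10809/demo-volume
```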
## lightningstor
- `Responsibility:` object storage and VM image backing. Holds a metadata plane and a data node plane.
- `Canonical entrypoint:` `nix/modules/lightningstor.nix`; `lightningstor/crates/lightningstor-server/src/main.rs`; `lightningstor/crates/lightningstor-node/src/main.rs`; the S3 path is `src/s3/*`.
- `Current evidence:` `README.md` names bucket versioning, policy, tagging, and object version listing as the supported surface (see the sketch after this list); `docs/testing.md` covers bucket metadata and the object-version APIs under `fresh-matrix`; the server carries S3 auth, the distributed backend, and the repair queue; the module has metadata, data, and all-in-one modes.
- `Not yet proven:` real-hardware failover of the distributed backend; S3 compatibility breadth; cold-start image distribution on hardware.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `LIGHTNINGSTOR-P1-01` was resolved on 2026-04-10. `durability-proof` saves write/head/read during the node05 outage and repair/read-back after service restore as canonical failure-injection artifacts.
- `P2:` `LIGHTNINGSTOR-P2-01` was resolved on 2026-04-10. The `single-node dev` optional bundle now has TCP health gating in `nix/single-node/surface.nix`.
- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; optional `lightningstor-node`; consumers are `plasmavmc` and tenant object clients.
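
A hedged sketch against the S3 surface named above, using the stock AWS CLI. The endpoint URL and credentials are placeholders; use the values wired by `nix/modules/lightningstor.nix` and your IAM-issued keys, and confirm which bucket operations the README actually lists as supported.

```bash
export AWS_ACCESS_KEY_ID=demo-key AWS_SECRET_ACCESS_KEY=demo-secret
endpoint=http://127.0.0.1:9000   # ASSUMPTION: placeholder S3 endpoint

aws --endpoint-url "$endpoint" s3api create-bucket --bucket survey-demo
aws --endpoint-url "$endpoint" s3api put-bucket-versioning \
  --bucket survey-demo --versioning-configuration Status=Enabled
aws --endpoint-url "$endpoint" s3api put-object \
  --bucket survey-demo --key hello.txt --body ./hello.txt
aws --endpoint-url "$endpoint" s3api list-object-versions --bucket survey-demo
```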
## k8shost
- `Responsibility:` the tenant workload API surface. Handles pods, deployments, and services and projects them onto `prismnet`, `flashdns`, `fiberlb`, and optional `creditservice`.
- `Canonical entrypoint:` `nix/modules/k8shost.nix`; `k8shost/crates/k8shost-server/src/main.rs`; the API protobuf is `k8shost/crates/k8shost-proto/proto/k8s.proto`.
- `Current evidence:` `k8shost/README.md` defines the supported scope; `README.md` states that `WatchPods` is a bounded snapshot stream; `k8shost-server/src/services/pod.rs` implements `WatchPods` on top of `ReceiverStream`; `docs/testing.md` covers the API contract via `fresh-smoke` / `fresh-matrix`; on 2026-04-10 the docs, guard, and TODO pinned it to the API/control-plane product surface only.
- `Not yet proven:` a real workload runtime; the tenant networking dataplane with real CNI/CSI; node-level execution semantics.
- `P0:` `K8SHOST-P0-01` was resolved on 2026-04-10. The real workload dataplane (`k8shost-cni`, `k8shost-controllers`, `lightningstor-csi`) was fixed as archived non-product and the product narrative was aligned to the API/control-plane scope only.
- `P1:` `K8SHOST-P1-01` was scope-resolved on 2026-04-10. The fact that the canonical proof centers on the API contract is now written down as the product boundary, and the real pod runtime was removed from product claims.
- `P2:` `K8SHOST-P2-01` was resolved on 2026-04-10. The non-canonical status of the archived scaffolds stays monitored by the `supported-surface-guard` contract markers.
- `Dependencies:` `iam`, `flaredb`, `chainfire`, `prismnet`, `flashdns`, `fiberlb`, and optional `creditservice`.
## apigateway
- `Responsibility:` the external API/proxy surface. Holds routes, auth providers, credit providers, and request mediation.
- `Canonical entrypoint:` `nix/modules/apigateway.nix`; `apigateway/crates/apigateway-server/src/main.rs`
- `Current evidence:` `node06` starts `apigateway` as the canonical gateway node; `docs/testing.md` and `nix/test-cluster/README.md` include API-gateway-mediated flows in `fresh-matrix`; the server code carries routes, auth, credit providers, upstream timeouts, and request IDs.
- `Not yet proven:` multi-node HA; config distribution / reload; the TLS termination strategy; gateway-as-product docs.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `APIGW-P1-01` became scope-fixed on 2026-04-10. The docs now fix APIGateway as supported when stateless and replicated behind an external L4/VIP layer, config distribution as rendered config plus restart-based rollout, and live in-process reload as unsupported (see the sketch after this list). What remains for the next step is adding a dedicated multi-gateway HA proof.
- `P2:` `APIGW-P2-01` the release proof mostly rides indirectly on `node06` and `fresh-matrix`; there is no dedicated smoke gate.
- `Dependencies:` upstream services; optional `iam` / `creditservice` providers; external clients.
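
A hedged sketch of the restart-based config rollout fixed by `APIGW-P1-01`. The systemd unit name and the rendered config path are placeholders; the real values come from `nix/modules/apigateway.nix` or the generated cluster state.

```bash
rendered=/run/rendered/apigateway.toml   # output of the Nix/cluster-state render
target=/etc/apigateway/config.toml       # ASSUMPTION: placeholder config path

# Replace the config atomically, then restart; there is no live in-process reload.
sudo install -m 0644 "$rendered" "$target"
sudo systemctl restart apigateway.service

# Repeat per replica behind the external L4/VIP layer, one instance at a time.
```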
## nightlight
- `Responsibility:` metrics ingestion and query. Holds the Prometheus remote_write / query APIs and gRPC query/admin (see the sketch after this list).
- `Canonical entrypoint:` `nix/modules/nightlight.nix`; `nightlight/crates/nightlight-server/src/main.rs`; the API protos are `nightlight/crates/nightlight-api/proto/*`.
- `Current evidence:` `nightlight-server` binds both HTTP and gRPC; `node06` starts it on the gateway node; `docs/testing.md` and `nix/test-cluster/README.md` describe the host-forward proof for the NightLight HTTP surface; there is a local WAL/snapshot/retention loop.
- `Not yet proven:` a replicated metrics topology; large retention; sustained remote_write load; tenant isolation.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `NIGHTLIGHT-P1-01` became scope-fixed on 2026-04-10. NightLight's product shape is fixed as a single-node WAL/snapshot service, and the docs and guard now reflect that the replicated / HA metrics path is unsupported.
- `P2:` `NIGHTLIGHT-P2-01` was narrowed on 2026-04-10. The tenant boundary is deployment-scoped or upstream-auth-scoped, and the docs now fix that in-process hard multi-tenant auth and per-tenant retention are not part of the current product contract. The next step is adding an auth- or quota-aware multi-tenant proof.
- `Dependencies:` local disk; optional `apigateway`; external metric writers/readers.
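
A hedged sketch of pointing an external Prometheus at the NightLight ingestion surface. The listen address and the `/api/v1/write` path are assumptions following the common remote_write convention; confirm the real HTTP surface in `nix/modules/nightlight.nix` and the server docs.

```bash
# Write a remote_write stanza for an external Prometheus writer.
cat > prometheus-remote-write.yml <<'EOF'
remote_write:
  - url: http://127.0.0.1:9201/api/v1/write   # ASSUMPTION: placeholder endpoint
EOF
# Merge this stanza into the writer's prometheus.yml and reload Prometheus.
```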
## creditservice
- `Responsibility:` quota, wallets, reservations, and admission control.
- `Canonical entrypoint:` `nix/modules/creditservice.nix`; `creditservice/crates/creditservice-server/src/main.rs`; the product scope is `creditservice/README.md`.
- `Current evidence:` `creditservice/README.md` states the supported scope and non-goals; `docs/testing.md` covers the quota/wallet/reservation/API-gateway path under `fresh-matrix`; the module has `iamAddr`, `flaredbAddr`, and an optional SQL backend; `node06` starts it on the canonical gateway node.
- `Not yet proven:` backend migration; operating it separately from a finance system; the export/reporting path.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `CREDIT-P1-01` the boundary still needs holding so the product narrative does not swell past the README non-goals into a finance ledger.
- `P2:` `CREDIT-P2-01` was narrowed on 2026-04-10. Export and backend migration are fixed in the README as offline export/import or backend-native snapshot workflows, and live mixed-writer migration is explicitly unsupported. The next step is adding a dedicated export proof.
- `Dependencies:` `iam`, `flaredb`, optional `chainfire`; consumers are `apigateway`, `k8shost`, and the tenant admission flow.
## deployer
- `Responsibility:` the bootstrap and rollout-intent authority. Holds `/api/v1/phone-home`, install plans, the desired-system reference, and the cluster inventory.
- `Canonical entrypoint:` `nix/modules/deployer.nix`; `deployer/crates/deployer-server/src/main.rs`; the route wiring is `deployer/crates/deployer-server/src/lib.rs`.
- `Current evidence:` `/api/v1/phone-home` exists as a server route; `nix/modules/deployer.nix` holds the package, service, and cluster-state seed; `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `baremetal-iso`, `baremetal-iso-e2e`, `deployer-vm-smoke`, `deployer-bootstrap-e2e`, `durability-proof`, and `rollout-soak` as the canonical proofs; `verify-baremetal-iso.sh` walks the install path end to end; the 2026-04-10 `durability-proof` saved `deployer-pre-register-request.json`, `deployer-backup-list.json`, `deployer-post-restart-list.json`, and `deployer-replayed-list.json`, and `rollout-soak` saved the longer-run live restart and release boundary markers in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/deployer-post-restart-nodes.json`, `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `deployer-journal.log`.
- `Not yet proven:` real-hardware USB/BMC install; true HA for the deployer itself; a ChainFire-backed multi-instance active failover implementation; real-hardware verification of operator disaster recovery.
- `P0:` `DEPLOYER-P0-01` the current canonical bare-metal proof stops at QEMU-as-hardware; there is no real-hardware regression lane yet.
- `P1:` `DEPLOYER-P1-01` became scope-fixed final on 2026-04-10. The release contract is fixed as one active writer plus optional cold-standby restore with `ultracloud.cluster` state re-apply and preserved admin request replay, and automatic ChainFire-backed multi-instance failover was moved explicitly outside the supported surface. `rollout-soak` saved the live restart proof and boundary markers in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/deployer-post-restart-nodes.json`, `scope-fixed-contract.json`, and `deployer-scope-fixed.txt`. Any future work would treat a true HA implementation as a scope extension in a separate ticket.
- `P2:` `DEPLOYER-P2-01` the standard way to supply `bootstrapFlakeBundle` and the optional binary cache in production is still under-documented.
- `Dependencies:` `chainfire`; `nix-agent`; `install-target`; the ISO/first-boot path; an optional binary cache.
## fleet-scheduler
- `Responsibility:` the non-Kubernetes native service scheduler. Holds cluster-native service placement, failover, and publication reconciliation.
- `Canonical entrypoint:` `nix/modules/fleet-scheduler.nix`; `deployer/crates/fleet-scheduler/src/main.rs`; the publication code is `publish.rs`.
- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `fresh-smoke`, `fresh-matrix`, `fleet-scheduler-e2e`, and `rollout-soak` as the proofs for this boundary; the module has `iamEndpoint`, `fiberlbEndpoint`, `flashdnsEndpoint`, and `heartbeatTimeoutSecs`; the scheduler code carries the `chainfire` watch, dependency summaries, and publication reconciliation; `fresh-smoke` walks `node04 -> draining`, a `node05` fail-stop, and replica restoration after the worker returns, and `rollout-soak` saved the scope-fixed longer-run proof in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/maintenance-held.json`, `power-loss-held.json`, `fleet-scheduler-post-restart.json`, `scope-fixed-contract.json`, and `fleet-scheduler-scope-fixed.txt`.
- `Not yet proven:` large clusters; multi-hour maintenance windows; drain choreography with an operator approval workflow.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `FLEET-P1-01` became scope-fixed final on 2026-04-10. The release contract is fixed as one planned drain cycle, one fail-stop worker-loss cycle, and 30-second held degraded states across two native-runtime workers, and `rollout-soak` captured that upper bound as live KVM artifacts in `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T164549+0900/maintenance-held.json`, `power-loss-held.json`, `fleet-scheduler-post-restart.json`, `scope-fixed-contract.json`, and `fleet-scheduler-scope-fixed.txt`. Multi-hour maintenance windows, pinned singleton policies, operator approval workflows, and larger-cluster drain storms were moved explicitly outside the supported surface.
- `P2:` `FLEET-P2-01` was resolved on 2026-04-10. The module/binary default `chainfireEndpoint` was aligned to the canonical `http://127.0.0.1:2379`.
- `Dependencies:` `chainfire`; `node-agent`; optional `iam`, `fiberlb`, and `flashdns`.
## nix-agent
- `Responsibility:` host-local NixOS convergence only. Builds and applies the desired system and handles health checks and rollback.
- `Canonical entrypoint:` `nix/modules/nix-agent.nix`; `deployer/crates/nix-agent/src/main.rs`
- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `baremetal-iso`, `baremetal-iso-e2e`, `deployer-vm-smoke`, `deployer-vm-rollback`, and `portable-control-plane-regressions` as the proofs; the code carries desired-system, observed-system, rollback-on-failure, and health-check-command; `nix/modules/nix-agent.nix` canonically generates that CLI contract; the 2026-04-10 `rollout-soak` saved `/mnt/d2/centra/photoncloud-monorepo/work/rollout-soak/20260410T154744+0900/node01-nix-agent-scope.txt` and `node04-nix-agent-scope.txt`, fixing in artifacts and docs the boundary that the steady-state `test-cluster` does not pretend to exercise a live `nix-agent.service` restart.
- `Not yet proven:` rollback under kernel/network failure; multi-node wave rollouts; real-hardware recovery after a partial switch.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `NIXAGENT-P1-01` was resolved on 2026-04-10. The `healthCheckCommand` argv contract, the `rollbackOnFailure` and `rolled-back` semantics, the `deployer-vm-rollback` proof, and the partial-failure recovery procedure are fixed in `docs/rollout-bundle.md` and `docs/testing.md`.
- `P2:` `NIXAGENT-P2-01` was resolved on 2026-04-10. The module/binary default `chainfireEndpoint` was aligned to the canonical `http://127.0.0.1:2379`.
- `Dependencies:` `chainfire`; the desired system published by the deployer; the local NixOS flake / switch-to-configuration.
## node-agent
- `Responsibility:` host-local runtime reconciliation only. Handles native service instance heartbeats, process/container execution, and local observed state.
- `Canonical entrypoint:` `nix/modules/node-agent.nix`; `deployer/crates/node-agent/src/main.rs`
- `Current evidence:` `docs/testing.md`, `docs/rollout-bundle.md`, and `nix/test-cluster/README.md` name `fresh-smoke`, `fresh-matrix`, `fleet-scheduler-e2e`, and `portable-control-plane-regressions` as the proofs; the code carries `watcher`, `agent`, and `process`; the module has Podman enable, stateDir, pidDir, and `allowLocalInstanceUpsert`; `process.rs` implements the `${stateDir}/pids/*.log` and `${stateDir}/pids/*.meta.json` contract.
- `Not yet proven:` heterogeneous runtime support; fine-grained SLOs for crash-looping host services; the secret-rotation workflow itself.
- `P0:` no fatal file-level breakage found in the current static survey.
- `P1:` `NODEAGENT-P1-01` was resolved on 2026-04-10. The logs / secrets / volume / upgrade contract is fixed in `docs/rollout-bundle.md` and the module description.
- `P2:` `NODEAGENT-P2-01` was resolved on 2026-04-10. The module/binary default `chainfireEndpoint` was aligned to the canonical `http://127.0.0.1:2379`.
- `Dependencies:` `chainfire`; `fleet-scheduler`; optional Podman; the host systemd/process model.
## Nix/bootstrap/harness
- `Responsibility:` defines the product surface and canonicalizes the NixOS outputs and VM/QEMU harness for `single-node dev`, the `3-node HA control plane`, and `bare-metal bootstrap`.
- `Canonical entrypoint:` `flake.nix`; `nix/modules/default.nix`; `nix/single-node/base.nix`; `nix/test-cluster/run-publishable-kvm-suite.sh`; `nix/test-cluster/run-local-baseline.sh`; `nix/test-cluster/verify-baremetal-iso.sh`; `nix/nodes/baremetal-qemu/*`
- `Current evidence:` `flake.nix` has `single-node-quickstart`, `single-node-trial-vm`, `canonical-profile-eval-guards`, `portable-control-plane-regressions`, and `baremetal-iso-e2e`; `nix/modules/default.nix` imports the current module surface in one place; `nix/single-node/base.nix` composes the minimal VM platform core and the optional bundles; `run-publishable-kvm-suite.sh` and `run-local-baseline.sh` pin local CPU parallelism and local-only builders (see the sketch after this list); `verify-baremetal-iso.sh` walks ISO -> phone-home -> bundle fetch -> Disko -> reboot -> `nix-agent active`; `run-cluster.sh` gained `durability-proof` and `rollout-soak`, saving the `chainfire`, `flaredb`, `deployer`, `coronafs`, and `lightningstor` backup/restore and failure-injection artifacts under `/work/durability-proof` and the longer-run rollout and control-plane maintenance artifacts under `/work/rollout-soak`; the 2026-04-10 local AMD/KVM baseline passed all six required checks plus `single-node-quickstart`, `baremetal-iso`, and `fresh-smoke`.
- `Not yet proven:` real-hardware USB/BMC install; an automatic guard for `/nix/store` capacity control; a release proof for a quickstart with every optional bundle enabled; a non-Nix easy-trial artifact.
- `P0:` `HARNESS-P0-01` there is still no real-hardware regression lane, and the canonical bare-metal proof remains a QEMU stand-in.
- `P1:` `HARNESS-P1-01` was resolved on 2026-04-10. Health gating for the quickstart optional bundles was aligned to TCP probes for `lightningstor`, `flashdns`, and `fiberlb` plus `50088/healthz` for `coronafs`.
- `P1:` `HARNESS-P1-02` became scope-fixed on 2026-04-10. The easy trial is served by the `single-node-trial-vm` Nix VM appliance, and the reason a lighter Docker/OCI-style trial path is not supported is stated consistently in `docs/edge-trial-surface.md`, `README.md`, `docs/testing.md`, `docs/component-matrix.md`, `nix/single-node/surface.nix`, and `supported-surface-guard`.
- `P1:` `HARNESS-P1-03` was resolved on 2026-04-10. The `fresh-smoke` stale-VM cleanup is now limited to PIDs under the current profile's `vm_dir` / `vde_switch_dir`, so it no longer sweeps up same-named cluster VMs from other checkouts.
- `P2:` `HARNESS-P2-01` was resolved on 2026-04-10. On top of the `./work` layout and local builder parallelism, `./nix/test-cluster/work-root-budget.sh` now has `enforce` and `prune-proof-logs` in addition to `status`, providing a stronger local budget gate and a safer dated-proof cleanup workflow rather than only a disk budget advisory.
- `Dependencies:` `nix`, `nixpkgs`, QEMU/KVM, host disk under `./work`, local CPU parallelism, and all component modules.
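
A local reproduction sketch built only from commands already recorded in this survey; a KVM-capable host with the local-only Nix settings is assumed, and argument conventions follow the invocations captured above.

```bash
# Rerun the local baseline with local-only builders and logs pinned under ./work.
./nix/test-cluster/run-local-baseline.sh

# The publishable KVM suite chains fresh-smoke, fresh-demo-vm-webapp, and
# fresh-matrix and leaves logs plus .meta files under the given directory.
./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite

# Check ./work growth against the soft budgets before and after long runs.
./nix/test-cluster/work-root-budget.sh status
```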
## Notes For The Next Implementation Agent
- Handling `DEPLOYER-P0-01` / `HARNESS-P0-01` first reduces the remaining hardware-proof and real-hardware operator-path work at low cost.
- For baseline reproduction, `nix/test-cluster/run-local-baseline.sh` reruns the same path while keeping local-only builders and logs under `./work` pinned.
- After that, advancing `DEPLOYER-P0-01` / `HARNESS-P0-01` to a real-hardware smoke moves the proof from QEMU-only onto the hardware path.
- `DEPLOYER-P1-01` and `FLEET-P1-01` are now scope-fixed final. If they are ever reopened, treat true deployer HA and a larger-cluster scheduler maintenance proof as a separate tranche that extends the current release boundary.
- `FIBERLB-P1-01` is scope-fixed, but productizing backend certificate verification later would require rewriting the limited contract in the docs and guard.

17
apigateway/README.md Normal file
View file

@ -0,0 +1,17 @@
# APIGateway
`apigateway` is UltraCloud's supported external API and proxy surface for auth-aware and credit-aware upstream traffic.
## Supported product shape
APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer; live in-process reload is not part of the product contract.
- Config distribution is restart-based. Render routes, auth providers, and credit providers from Nix or generated cluster state, then replace or restart the process.
- Scale-out is supported by running multiple identical instances behind FiberLB or another L4 or VIP distribution layer.
- The release-facing proof remains `nix run ./nix/test-cluster#cluster -- fresh-matrix`, which validates the shipped single gateway-node composition on `node06`.
## Explicit non-goals
- hot route reload through an admin API or `SIGHUP`
- in-process config gossip or leader election between gateway replicas
- a claim that every HA layout is directly release-proven in the current harness

View file

@ -366,7 +366,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.init();
if used_default_config {
info!("Config file not found: {}, using defaults", args.config.display());
info!(
"Config file not found: {}, using defaults",
args.config.display()
);
}
let routes = build_routes(config.routes)?;
@ -412,7 +415,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.with_state(state);
let listener = tokio::net::TcpListener::bind(config.http_addr).await?;
axum::serve(listener, app.into_make_service_with_connect_info::<SocketAddr>()).await?;
axum::serve(
listener,
app.into_make_service_with_connect_info::<SocketAddr>(),
)
.await?;
Ok(())
}
@ -426,7 +433,13 @@ async fn health() -> Json<serde_json::Value> {
}
async fn list_routes(State(state): State<Arc<ServerState>>) -> Json<Vec<RouteConfig>> {
Json(state.routes.iter().map(|route| route.config.clone()).collect())
Json(
state
.routes
.iter()
.map(|route| route.config.clone())
.collect(),
)
}
async fn proxy(
@ -463,8 +476,12 @@ async fn proxy(
let target_url = build_upstream_url(&route, request.uri())?;
let request_timeout =
Duration::from_millis(route.config.timeout_ms.unwrap_or(state.upstream_timeout.as_millis() as u64));
let request_timeout = Duration::from_millis(
route
.config
.timeout_ms
.unwrap_or(state.upstream_timeout.as_millis() as u64),
);
let mut builder = state
.client
.request(request.method().clone(), target_url)
@ -630,13 +647,12 @@ async fn enforce_credit(
credit_subject.as_ref().expect("credit subject resolved"),
)
.await;
apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision)
.map(|decision| {
decision.map(|decision| CreditReservation {
provider: credit_cfg.provider.clone(),
reservation_id: decision.reservation_id,
})
apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision).map(|decision| {
decision.map(|decision| CreditReservation {
provider: credit_cfg.provider.clone(),
reservation_id: decision.reservation_id,
})
})
}
fn apply_credit_mode(
@ -837,13 +853,19 @@ async fn finalize_credit(
CommitPolicy::Never => return,
CommitPolicy::Always => {
if let Err(err) = commit_credit(state, credit_cfg, &reservation).await {
warn!("Failed to commit credit reservation {}: {}", reservation.reservation_id, err);
warn!(
"Failed to commit credit reservation {}: {}",
reservation.reservation_id, err
);
}
}
CommitPolicy::Success => {
if status.is_success() || status.is_redirection() {
if let Err(err) = commit_credit(state, credit_cfg, &reservation).await {
warn!("Failed to commit credit reservation {}: {}", reservation.reservation_id, err);
warn!(
"Failed to commit credit reservation {}: {}",
reservation.reservation_id, err
);
}
} else if let Err(err) = rollback_credit(state, credit_cfg, &reservation).await {
warn!(
@ -1010,11 +1032,9 @@ async fn build_auth_providers(
for config in configs {
let provider_type = normalize_name(&config.provider_type);
if providers.contains_key(&config.name) {
return Err(config_error(format!(
"duplicate auth provider name {}",
config.name
))
.into());
return Err(
config_error(format!("duplicate auth provider name {}", config.name)).into(),
);
}
match provider_type.as_str() {
@ -1034,10 +1054,7 @@ async fn build_auth_providers(
Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS));
providers.insert(
config.name.clone(),
AuthProvider::Grpc(GrpcAuthProvider {
channel,
timeout,
}),
AuthProvider::Grpc(GrpcAuthProvider { channel, timeout }),
);
}
_ => {
@ -1061,25 +1078,19 @@ async fn build_credit_providers(
for config in configs {
let provider_type = normalize_name(&config.provider_type);
if providers.contains_key(&config.name) {
return Err(config_error(format!(
"duplicate credit provider name {}",
config.name
))
.into());
return Err(
config_error(format!("duplicate credit provider name {}", config.name)).into(),
);
}
match provider_type.as_str() {
"grpc" => {
let mut endpoint = Endpoint::from_shared(config.endpoint.clone())?
.connect_timeout(Duration::from_millis(
config
.timeout_ms
.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
))
.timeout(Duration::from_millis(
config
.timeout_ms
.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
));
if let Some(tls) = build_client_tls_config(&config.tls).await? {
@ -1087,17 +1098,11 @@ async fn build_credit_providers(
}
let channel = endpoint.connect().await?;
let timeout = Duration::from_millis(
config
.timeout_ms
.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
);
let timeout =
Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS));
providers.insert(
config.name.clone(),
CreditProvider::Grpc(GrpcCreditProvider {
channel,
timeout,
}),
CreditProvider::Grpc(GrpcCreditProvider { channel, timeout }),
);
}
_ => {
@ -1132,11 +1137,9 @@ fn build_routes(configs: Vec<RouteConfig>) -> Result<Vec<Route>, Box<dyn std::er
.into());
}
if upstream.host_str().is_none() {
return Err(config_error(format!(
"route {} upstream must include host",
config.name
))
.into());
return Err(
config_error(format!("route {} upstream must include host", config.name)).into(),
);
}
let upstream_base_path = normalize_upstream_base_path(upstream.path());
@ -1357,7 +1360,11 @@ fn join_paths(base: &str, path: &str) -> String {
}
if path == "/" {
let trimmed = base.trim_end_matches('/');
return if trimmed.is_empty() { "/".to_string() } else { trimmed.to_string() };
return if trimmed.is_empty() {
"/".to_string()
} else {
trimmed.to_string()
};
}
format!(
@ -1385,9 +1392,9 @@ fn build_upstream_url(route: &Route, uri: &Uri) -> Result<Url, StatusCode> {
#[cfg(test)]
mod tests {
use super::*;
use apigateway_api::GatewayCreditServiceServer;
use axum::routing::get;
use creditservice_api::{CreditServiceImpl, CreditStorage, GatewayCreditServiceImpl};
use apigateway_api::GatewayCreditServiceServer;
use creditservice_types::Wallet;
use iam_api::{GatewayAuthServiceImpl, GatewayAuthServiceServer};
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
@ -1470,7 +1477,11 @@ mod tests {
}
async fn start_iam_gateway() -> (SocketAddr, String) {
let backend = Arc::new(Backend::new(BackendConfig::Memory).await.expect("iam backend"));
let backend = Arc::new(
Backend::new(BackendConfig::Memory)
.await
.expect("iam backend"),
);
let principal_store = Arc::new(PrincipalStore::new(backend.clone()));
let role_store = Arc::new(RoleStore::new(backend.clone()));
let binding_store = Arc::new(BindingStore::new(backend.clone()));
@ -1516,12 +1527,8 @@ mod tests {
role_store.clone(),
cache,
));
let gateway_auth = GatewayAuthServiceImpl::new(
token_service,
principal_store,
token_store,
evaluator,
);
let gateway_auth =
GatewayAuthServiceImpl::new(token_service, principal_store, token_store, evaluator);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0")
.await
@ -1542,10 +1549,7 @@ mod tests {
async fn start_credit_gateway(iam_addr: &SocketAddr) -> SocketAddr {
let storage = creditservice_api::InMemoryStorage::new();
let wallet = Wallet::new("proj-1".into(), "org-1".into(), 100);
storage
.create_wallet(wallet)
.await
.expect("wallet create");
storage.create_wallet(wallet).await.expect("wallet create");
let auth_service = Arc::new(
iam_service_auth::AuthService::new(&format!("http://{}", iam_addr))
@ -1636,7 +1640,10 @@ mod tests {
let route = routes.first().unwrap();
let uri: Uri = "/api/v1/users?debug=true".parse().unwrap();
let url = build_upstream_url(route, &uri).unwrap();
assert_eq!(url.as_str(), "http://example.com/base/api/v1/users?debug=true");
assert_eq!(
url.as_str(),
"http://example.com/base/api/v1/users?debug=true"
);
}
#[test]
@ -1671,7 +1678,8 @@ mod tests {
let outcome = apply_auth_mode(PolicyMode::Optional, false, decision).unwrap();
assert!(outcome.subject.is_none());
let outcome = apply_auth_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
let outcome =
apply_auth_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
assert!(outcome.subject.is_none());
}
@ -1692,7 +1700,8 @@ mod tests {
let outcome = apply_credit_mode(PolicyMode::Optional, false, decision).unwrap();
assert!(outcome.is_none());
let outcome = apply_credit_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
let outcome =
apply_credit_mode(PolicyMode::Optional, false, Err(StatusCode::BAD_GATEWAY)).unwrap();
assert!(outcome.is_none());
}
@ -1783,7 +1792,8 @@ mod tests {
Err(status) => panic!("unexpected proxy status: {}", status),
}
}
let response = response.expect("gateway auth+credit test timed out waiting for ready backends");
let response =
response.expect("gateway auth+credit test timed out waiting for ready backends");
assert_eq!(response.status(), StatusCode::OK);
}
@ -1812,7 +1822,10 @@ mod tests {
let request = Request::builder()
.method("GET")
.uri("/v1/echo-auth")
.header(axum::http::header::AUTHORIZATION, "Bearer passthrough-token")
.header(
axum::http::header::AUTHORIZATION,
"Bearer passthrough-token",
)
.header(PHOTON_AUTH_TOKEN_HEADER, "photon-token")
.body(Body::empty())
.expect("request build");
@ -1828,8 +1841,14 @@ mod tests {
let body = to_bytes(response.into_body(), 1024 * 1024).await.unwrap();
let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert_eq!(json.get("authorization").and_then(|v| v.as_str()), Some("Bearer passthrough-token"));
assert_eq!(json.get("photon_token").and_then(|v| v.as_str()), Some("photon-token"));
assert_eq!(
json.get("authorization").and_then(|v| v.as_str()),
Some("Bearer passthrough-token")
);
assert_eq!(
json.get("photon_token").and_then(|v| v.as_str()),
Some("photon-token")
);
}
#[test]

11
chainfire/Cargo.lock generated
View file

@ -388,18 +388,7 @@ dependencies = [
name = "chainfire-core"
version = "0.1.0"
dependencies = [
"async-trait",
"bytes",
"chainfire-gossip",
"chainfire-types",
"dashmap",
"futures",
"parking_lot",
"tempfile",
"thiserror 1.0.69",
"tokio",
"tokio-stream",
"tracing",
]
[[package]]

View file

@ -1,4 +1,4 @@
# This directory is a placeholder for runtime assets
# This directory is reserved for runtime assets
#
# Actual boot assets will be created at: /var/lib/pxe-boot/
# when the PXE server is deployed.

View file

@ -190,8 +190,8 @@ set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8
# set kernel-params ${kernel-params} systemd.log_level=debug
echo Loading NixOS kernel...
# NOTE: These paths will be populated by the S3 image builder (T032.S3)
# For now, they point to placeholder paths that need to be updated
# NOTE: These paths are populated by the S3 image builder (T032.S3)
# and must resolve to the generated kernel/initrd objects at deploy time.
kernel ${nixos-url}/bzImage ${kernel-params} || goto failed
echo Loading NixOS initrd...

View file

@ -4,8 +4,8 @@ use crate::error::{ClientError, Result};
use crate::watch::WatchHandle;
use chainfire_proto::proto::{
cluster_client::ClusterClient, compare, kv_client::KvClient, request_op, response_op,
watch_client::WatchClient, Compare, DeleteRangeRequest, MemberAddRequest, PutRequest,
RangeRequest, RequestOp, StatusRequest, TxnRequest,
watch_client::WatchClient, Compare, DeleteRangeRequest, PutRequest, RangeRequest, RequestOp,
StatusRequest, TxnRequest,
};
use std::time::Duration;
use tonic::transport::Channel;
@ -616,53 +616,6 @@ impl Client {
raft_term: resp.raft_term,
})
}
/// Add a member to the cluster
///
/// # Arguments
/// * `peer_url` - The Raft address of the new member (e.g., "127.0.0.1:2380")
/// * `is_learner` - Whether to add as learner (true) or voter (false)
///
/// # Returns
/// The node ID of the added member
pub async fn member_add(
&mut self,
node_id: u64,
peer_url: impl AsRef<str>,
is_learner: bool,
) -> Result<u64> {
let peer_url = peer_url.as_ref().to_string();
let resp = self
.with_cluster_retry(|mut cluster| {
let peer_url = peer_url.clone();
async move {
cluster
.member_add(MemberAddRequest {
node_id,
peer_urls: vec![peer_url],
is_learner,
})
.await
.map(|resp| resp.into_inner())
}
})
.await?;
// Extract the member ID from the response
let member_id = resp
.member
.map(|m| m.id)
.ok_or_else(|| ClientError::Internal("No member in response".to_string()))?;
debug!(
member_id = member_id,
peer_url = peer_url.as_str(),
is_learner = is_learner,
"Added member to cluster"
);
Ok(member_id)
}
}
/// Cluster status

View file

@ -136,9 +136,10 @@ fn convert_event(event: Event) -> WatchEvent {
EventType::Delete
};
let (key, value, revision) = event.kv.map(|kv| {
(kv.key, kv.value, kv.mod_revision as u64)
}).unwrap_or_default();
let (key, value, revision) = event
.kv
.map(|kv| (kv.key, kv.value, kv.mod_revision as u64))
.unwrap_or_default();
WatchEvent {
event_type,

View file

@ -4,10 +4,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
.build_server(true)
.build_client(true)
.compile_protos(
&[
"../../proto/chainfire.proto",
"../../proto/internal.proto",
],
&["../../proto/chainfire.proto", "../../proto/internal.proto"],
&["../../proto"],
)?;

View file

@ -1,30 +1,22 @@
//! Cluster management service implementation
//!
//! This service handles cluster operations and status queries.
//!
//! NOTE: Custom RaftCore does not yet support dynamic membership changes.
//! Member add/remove operations are disabled for now.
//! The supported surface reports the fixed membership that the node booted with.
use crate::conversions::make_header;
use crate::proto::{
cluster_server::Cluster, GetSnapshotRequest, GetSnapshotResponse, Member, MemberAddRequest,
MemberAddResponse, MemberListRequest, MemberListResponse, MemberRemoveRequest,
MemberRemoveResponse, SnapshotMeta, StatusRequest, StatusResponse, TransferSnapshotRequest,
TransferSnapshotResponse,
cluster_server::Cluster, Member, MemberListRequest, MemberListResponse, StatusRequest,
StatusResponse,
};
use chainfire_raft::core::RaftCore;
use std::sync::Arc;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;
use tonic::{Request, Response, Status};
use tracing::{debug, info, warn};
use tracing::debug;
/// Cluster service implementation
pub struct ClusterServiceImpl {
/// Raft core
raft: Arc<RaftCore>,
/// gRPC Raft client for managing node addresses
rpc_client: Arc<crate::GrpcRaftClient>,
/// Cluster ID
cluster_id: u64,
/// Configured members with client and peer URLs
@ -37,13 +29,11 @@ impl ClusterServiceImpl {
/// Create a new cluster service
pub fn new(
raft: Arc<RaftCore>,
rpc_client: Arc<crate::GrpcRaftClient>,
cluster_id: u64,
members: Vec<Member>,
) -> Self {
Self {
raft,
rpc_client,
cluster_id,
members,
version: env!("CARGO_PKG_VERSION").to_string(),
@ -55,8 +45,7 @@ impl ClusterServiceImpl {
}
/// Get current members as proto Member list
/// NOTE: Custom RaftCore doesn't track membership dynamically yet, so this returns
/// the configured static membership that the server was booted with.
/// Return the configured static membership that the server was booted with.
async fn get_member_list(&self) -> Vec<Member> {
if self.members.is_empty() {
return vec![Member {
@ -73,35 +62,6 @@ impl ClusterServiceImpl {
#[tonic::async_trait]
impl Cluster for ClusterServiceImpl {
async fn member_add(
&self,
request: Request<MemberAddRequest>,
) -> Result<Response<MemberAddResponse>, Status> {
let req = request.into_inner();
debug!(node_id = req.node_id, peer_urls = ?req.peer_urls, is_learner = req.is_learner, "Member add request");
// Custom RaftCore doesn't support dynamic membership changes yet
warn!("Member add not supported in custom Raft implementation");
Err(Status::unimplemented(
"Dynamic membership changes not supported in custom Raft implementation. \
All cluster members must be configured at startup via initial_members."
))
}
async fn member_remove(
&self,
request: Request<MemberRemoveRequest>,
) -> Result<Response<MemberRemoveResponse>, Status> {
let req = request.into_inner();
debug!(member_id = req.id, "Member remove request");
// Custom RaftCore doesn't support dynamic membership changes yet
warn!("Member remove not supported in custom Raft implementation");
Err(Status::unimplemented(
"Dynamic membership changes not supported in custom Raft implementation"
))
}
async fn member_list(
&self,
_request: Request<MemberListRequest>,
@ -128,104 +88,11 @@ impl Cluster for ClusterServiceImpl {
Ok(Response::new(StatusResponse {
header: Some(self.make_header(last_applied)),
version: self.version.clone(),
db_size: 0, // TODO: get actual RocksDB size
db_size: 0,
leader: leader.unwrap_or(0),
raft_index: commit_index,
raft_term: term,
raft_applied_index: last_applied,
}))
}
/// Transfer snapshot to a target node for pre-seeding (T041 Option C)
///
/// This is a workaround for OpenRaft 0.9.x learner replication bug.
/// By pre-seeding learners with a snapshot, we avoid the assertion failure
/// during log replication.
///
/// TODO(T041.S5): Full implementation pending - currently returns placeholder
async fn transfer_snapshot(
&self,
request: Request<TransferSnapshotRequest>,
) -> Result<Response<TransferSnapshotResponse>, Status> {
let req = request.into_inner();
info!(
target_node_id = req.target_node_id,
target_addr = %req.target_addr,
"Snapshot transfer request (T041 Option C)"
);
// Get current state from state machine
let sm = self.raft.state_machine();
let revision = sm.current_revision();
let term = self.raft.current_term().await;
let membership = self.raft.membership().await;
let meta = SnapshotMeta {
last_log_index: revision,
last_log_term: term,
membership: membership.clone(),
size: 0, // Will be set when full impl is done
};
// TODO(T041.S5): Implement full snapshot transfer
// 1. Serialize KV data using chainfire_storage::snapshot::SnapshotBuilder
// 2. Stream snapshot to target via InstallSnapshot RPC
// 3. Wait for target to apply snapshot
//
// For now, return success placeholder - the actual workaround can use
// data directory copy (Option C1) until this API is complete.
warn!(
target = %req.target_addr,
"TransferSnapshot not yet fully implemented - use data dir copy workaround"
);
Ok(Response::new(TransferSnapshotResponse {
header: Some(self.make_header(revision)),
success: false,
error: "TransferSnapshot API not yet implemented - use data directory copy".to_string(),
meta: Some(meta),
}))
}
type GetSnapshotStream = ReceiverStream<Result<GetSnapshotResponse, Status>>;
/// Get snapshot from this node as a stream of chunks
///
/// TODO(T041.S5): Full implementation pending - currently returns empty snapshot
async fn get_snapshot(
&self,
_request: Request<GetSnapshotRequest>,
) -> Result<Response<Self::GetSnapshotStream>, Status> {
debug!("Get snapshot request (T041 Option C)");
// Get current state from state machine
let sm = self.raft.state_machine();
let revision = sm.current_revision();
let term = self.raft.current_term().await;
let membership = self.raft.membership().await;
let meta = SnapshotMeta {
last_log_index: revision,
last_log_term: term,
membership,
size: 0,
};
// Create channel for streaming response
let (tx, rx) = mpsc::channel(4);
// TODO(T041.S5): Stream actual KV data
// For now, just send metadata with empty data
tokio::spawn(async move {
let response = GetSnapshotResponse {
meta: Some(meta),
chunk: vec![],
done: true,
};
let _ = tx.send(Ok(response)).await;
});
Ok(Response::new(ReceiverStream::new(rx)))
}
}
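Reviewer note: member_add and member_remove above return Unimplemented because this build fixes the voter set at startup (initial_members) and derives quorum from it. The sketch below illustrates that fixed-membership arithmetic only; StaticMember and StaticMembership are made-up names for illustration, not the crate's configuration API.

```rust
// Hypothetical sketch: membership is decided once at boot, so the voter set and
// quorum size are constants for the life of the process. Names are illustrative.
struct StaticMember {
    id: u64,
    raft_addr: String,
}

struct StaticMembership {
    members: Vec<StaticMember>,
}

impl StaticMembership {
    fn new(members: Vec<StaticMember>) -> Self {
        Self { members }
    }

    /// Quorum size for a fixed voter set: floor(n / 2) + 1.
    fn quorum(&self) -> usize {
        self.members.len() / 2 + 1
    }
}

fn main() {
    let membership = StaticMembership::new(vec![
        StaticMember { id: 1, raft_addr: "10.0.0.1:2380".into() },
        StaticMember { id: 2, raft_addr: "10.0.0.2:2380".into() },
        StaticMember { id: 3, raft_addr: "10.0.0.3:2380".into() },
    ]);
    for m in &membership.members {
        println!("voter {} at {}", m.id, m.raft_addr);
    }
    assert_eq!(membership.quorum(), 2); // 3 voters -> any 2 form a quorum
    println!("quorum = {}", membership.quorum());
}
```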

View file

@ -4,22 +4,17 @@
//! It bridges the gRPC layer with the custom Raft implementation.
use crate::internal_proto::{
raft_service_server::RaftService,
AppendEntriesRequest as ProtoAppendEntriesRequest,
AppendEntriesResponse as ProtoAppendEntriesResponse,
InstallSnapshotRequest, InstallSnapshotResponse,
VoteRequest as ProtoVoteRequest,
raft_service_server::RaftService, AppendEntriesRequest as ProtoAppendEntriesRequest,
AppendEntriesResponse as ProtoAppendEntriesResponse, VoteRequest as ProtoVoteRequest,
VoteResponse as ProtoVoteResponse,
};
use chainfire_raft::core::{
RaftCore, VoteRequest, AppendEntriesRequest,
};
use chainfire_storage::{LogId, LogEntry as RaftLogEntry, EntryPayload};
use chainfire_raft::core::{AppendEntriesRequest, RaftCore, VoteRequest};
use chainfire_storage::{EntryPayload, LogEntry as RaftLogEntry, LogId};
use chainfire_types::command::RaftCommand;
use std::sync::Arc;
use tokio::sync::oneshot;
use tonic::{Request, Response, Status, Streaming};
use tracing::{debug, info, trace, warn};
use tonic::{Request, Response, Status};
use tracing::{info, trace, warn};
/// Internal Raft RPC service implementation
///
@ -67,7 +62,11 @@ impl RaftService for RaftServiceImpl {
Status::internal("Vote request failed: channel closed")
})?;
trace!(term = resp.term, granted = resp.vote_granted, "Vote response");
trace!(
term = resp.term,
granted = resp.vote_granted,
"Vote response"
);
Ok(Response::new(ProtoVoteResponse {
term: resp.term,
vote_granted: resp.vote_granted,
@ -141,22 +140,4 @@ impl RaftService for RaftServiceImpl {
}))
}
async fn install_snapshot(
&self,
request: Request<Streaming<InstallSnapshotRequest>>,
) -> Result<Response<InstallSnapshotResponse>, Status> {
let mut stream = request.into_inner();
debug!("InstallSnapshot stream started");
// Collect all chunks (for compatibility)
while let Some(chunk) = stream.message().await? {
if chunk.done {
break;
}
}
// Custom Raft doesn't support snapshots yet
warn!("InstallSnapshot not supported in custom Raft implementation");
Err(Status::unimplemented("Snapshots not supported in custom Raft implementation"))
}
}
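Reviewer note: each handler in this service forwards the RPC into the Raft event loop by enqueuing an event that carries a oneshot reply channel and then awaiting it. A minimal stand-alone sketch of that bridge follows, assuming a tokio dependency with the rt, macros, and sync features; VoteReq, VoteResp, and Event are simplified stand-ins, not the crate's real request and response types.

```rust
// Minimal sketch of the RPC -> event-loop bridge used above: the handler enqueues an
// event carrying a oneshot sender, and the core loop answers through it.
use tokio::sync::{mpsc, oneshot};

struct VoteReq { term: u64 }
struct VoteResp { term: u64, granted: bool }

enum Event {
    Vote { req: VoteReq, reply: oneshot::Sender<VoteResp> },
}

async fn core_loop(mut rx: mpsc::UnboundedReceiver<Event>) {
    while let Some(event) = rx.recv().await {
        match event {
            Event::Vote { req, reply } => {
                // Real logic would consult persistent term/vote state here.
                let _ = reply.send(VoteResp { term: req.term, granted: true });
            }
        }
    }
}

#[tokio::main]
async fn main() {
    let (tx, rx) = mpsc::unbounded_channel();
    tokio::spawn(core_loop(rx));

    // What a gRPC handler does: enqueue the event, then await the oneshot reply.
    let (reply_tx, reply_rx) = oneshot::channel();
    tx.send(Event::Vote { req: VoteReq { term: 7 }, reply: reply_tx }).unwrap();
    let resp = reply_rx.await.expect("core loop dropped the reply channel");
    assert!(resp.granted);
    println!("vote response: term={} granted={}", resp.term, resp.granted);
}
```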

View file

@ -45,7 +45,9 @@ impl Kv for KvServiceImpl {
// NOTE: Custom RaftCore doesn't yet support linearizable_read() method
// For now, just warn if non-serializable read is requested
if !req.serializable {
warn!("Linearizable reads not yet supported in custom Raft, performing serializable read");
warn!(
"Linearizable reads not yet supported in custom Raft, performing serializable read"
);
}
// Get state machine from Raft core
@ -84,7 +86,11 @@ impl Kv for KvServiceImpl {
let command = RaftCommand::Put {
key: req.key,
value: req.value,
lease_id: if req.lease != 0 { Some(req.lease) } else { None },
lease_id: if req.lease != 0 {
Some(req.lease)
} else {
None
},
prev_kv: req.prev_kv,
};
@ -115,19 +121,25 @@ impl Kv for KvServiceImpl {
let req = request.into_inner();
debug!(key = ?String::from_utf8_lossy(&req.key), "Delete request");
// Workaround: Pre-check key existence to determine deleted count
// TODO: Replace with proper RaftResponse.deleted once client_write returns full response
// Pre-check key existence because the current client_write path does not
// return a delete count in the write response.
let sm = self.raft.state_machine();
let deleted_count = if req.range_end.is_empty() {
// Single key delete - check if exists
let exists = sm.kv()
let exists = sm
.kv()
.get(&req.key)
.map_err(|e| Status::internal(e.to_string()))?
.is_some();
if exists { 1 } else { 0 }
if exists {
1
} else {
0
}
} else {
// Range delete - count keys in range
let kvs = sm.kv()
let kvs = sm
.kv()
.range(&req.key, Some(&req.range_end))
.map_err(|e| Status::internal(e.to_string()))?;
kvs.len() as i64
@ -231,7 +243,7 @@ impl Kv for KvServiceImpl {
Ok(Response::new(TxnResponse {
header: Some(self.make_header(revision).await),
succeeded: true, // Assume success if no error
succeeded: true, // Assume success if no error
responses: vec![], // Not supported yet
}))
}
@ -276,9 +288,7 @@ fn convert_txn_responses(
.collect()
}
fn convert_ops(
ops: &[crate::proto::RequestOp],
) -> Vec<chainfire_types::command::TxnOp> {
fn convert_ops(ops: &[crate::proto::RequestOp]) -> Vec<chainfire_types::command::TxnOp> {
use chainfire_types::command::TxnOp;
ops.iter()
@ -287,7 +297,11 @@ fn convert_ops(
crate::proto::request_op::Request::RequestPut(put) => TxnOp::Put {
key: put.key.clone(),
value: put.value.clone(),
lease_id: if put.lease != 0 { Some(put.lease) } else { None },
lease_id: if put.lease != 0 {
Some(put.lease)
} else {
None
},
},
crate::proto::request_op::Request::RequestDeleteRange(del) => {
if del.range_end.is_empty() {
@ -307,7 +321,7 @@ fn convert_ops(
limit: range.limit,
keys_only: range.keys_only,
count_only: range.count_only,
}
},
})
})
.collect()
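Reviewer note: the delete handler above pre-computes the deleted count from the state machine because the current write path does not report it. Below is a sketch of the same single-key versus range counting over an in-memory BTreeMap; count_deleted is a hypothetical helper, and the real state machine's range semantics (revisions, tombstones) may differ.

```rust
// Sketch of the deleted-count pre-check above, over an in-memory ordered map instead
// of the real state machine: single key -> 0 or 1, range -> keys in [key, range_end).
use std::collections::BTreeMap;
use std::ops::Bound;

fn count_deleted(kv: &BTreeMap<Vec<u8>, Vec<u8>>, key: &[u8], range_end: &[u8]) -> i64 {
    if range_end.is_empty() {
        // Single-key delete: 1 if the key currently exists, otherwise 0.
        if kv.contains_key(key) { 1 } else { 0 }
    } else {
        // Range delete: count keys in the half-open range [key, range_end).
        kv.range::<[u8], _>((Bound::Included(key), Bound::Excluded(range_end)))
            .count() as i64
    }
}

fn main() {
    let mut kv = BTreeMap::new();
    kv.insert(b"a".to_vec(), b"1".to_vec());
    kv.insert(b"b".to_vec(), b"2".to_vec());
    kv.insert(b"c".to_vec(), b"3".to_vec());

    assert_eq!(count_deleted(&kv, b"b", b""), 1);  // single existing key
    assert_eq!(count_deleted(&kv, b"z", b""), 0);  // single missing key
    assert_eq!(count_deleted(&kv, b"a", b"c"), 2); // range [a, c) -> a, b
    println!("pre-checks ok");
}
```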

View file

@ -182,7 +182,8 @@ impl Lease for LeaseServiceImpl {
let leases = sm.leases();
let lease_ids = leases.list();
let statuses: Vec<LeaseStatus> = lease_ids.into_iter().map(|id| LeaseStatus { id }).collect();
let statuses: Vec<LeaseStatus> =
lease_ids.into_iter().map(|id| LeaseStatus { id }).collect();
Ok(Response::new(LeaseLeasesResponse {
header: Some(self.make_header(revision)),

View file

@ -5,25 +5,25 @@
//! - gRPC service implementations
//! - Client and server components
pub mod cluster_service;
pub mod conversions;
pub mod generated;
pub mod internal_service;
pub mod kv_service;
pub mod lease_service;
pub mod watch_service;
pub mod cluster_service;
pub mod internal_service;
pub mod raft_client;
pub mod conversions;
pub mod watch_service;
// Re-export generated types
pub use generated::chainfire::v1 as proto;
pub use generated::chainfire::internal as internal_proto;
pub use generated::chainfire::v1 as proto;
// Re-export services
pub use cluster_service::ClusterServiceImpl;
pub use internal_service::RaftServiceImpl;
pub use kv_service::KvServiceImpl;
pub use lease_service::LeaseServiceImpl;
pub use watch_service::WatchServiceImpl;
pub use cluster_service::ClusterServiceImpl;
pub use internal_service::RaftServiceImpl;
// Re-export Raft client and config
pub use raft_client::{GrpcRaftClient, RetryConfig};

View file

@ -112,7 +112,10 @@ impl GrpcRaftClient {
}
/// Get or create a gRPC client for the target node
async fn get_client(&self, target: NodeId) -> Result<RaftServiceClient<Channel>, RaftNetworkError> {
async fn get_client(
&self,
target: NodeId,
) -> Result<RaftServiceClient<Channel>, RaftNetworkError> {
// Check cache first
{
let clients = self.clients.read().await;
@ -290,9 +293,7 @@ impl RaftRpcClient for GrpcRaftClient {
use chainfire_storage::EntryPayload;
let data = match &e.payload {
EntryPayload::Blank => vec![],
EntryPayload::Normal(cmd) => {
bincode::serialize(cmd).unwrap_or_default()
}
EntryPayload::Normal(cmd) => bincode::serialize(cmd).unwrap_or_default(),
EntryPayload::Membership(_) => vec![],
};
(e.log_id.index, e.log_id.term, data)
@ -333,8 +334,16 @@ impl RaftRpcClient for GrpcRaftClient {
Ok(AppendEntriesResponse {
term: resp.term,
success: resp.success,
conflict_index: if resp.conflict_index > 0 { Some(resp.conflict_index) } else { None },
conflict_term: if resp.conflict_term > 0 { Some(resp.conflict_term) } else { None },
conflict_index: if resp.conflict_index > 0 {
Some(resp.conflict_index)
} else {
None
},
conflict_term: if resp.conflict_term > 0 {
Some(resp.conflict_term)
} else {
None
},
})
}
})
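Reviewer note: get_client above keeps one channel per peer, checking the shared map under a read lock before taking the write lock to insert on a miss. A minimal sketch of that read-then-write cache follows, assuming a tokio dependency; Endpoint and ClientCache are illustrative stand-ins for the real tonic channel and client map.

```rust
// Minimal sketch of the connection-cache pattern in get_client: fast path under a
// read lock, slow path that builds and caches the client under a write lock.
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;

type NodeId = u64;

#[derive(Clone)]
struct Endpoint(String);

#[derive(Default)]
struct ClientCache {
    clients: Arc<RwLock<HashMap<NodeId, Endpoint>>>,
}

impl ClientCache {
    async fn get_client(&self, target: NodeId, addr: &str) -> Endpoint {
        // Fast path: return a cached client under a read lock.
        {
            let clients = self.clients.read().await;
            if let Some(client) = clients.get(&target) {
                return client.clone();
            }
        }
        // Slow path: build the client (the real code connects here), then cache it.
        let client = Endpoint(addr.to_string());
        self.clients.write().await.insert(target, client.clone());
        client
    }
}

#[tokio::main]
async fn main() {
    let cache = ClientCache::default();
    let a = cache.get_client(2, "http://10.0.0.2:2380").await;
    let b = cache.get_client(2, "http://10.0.0.2:2380").await;
    assert_eq!(a.0, b.0); // second call is served from the cache
    println!("cached endpoint: {}", a.0);
}
```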

View file

@ -1,9 +1,7 @@
//! Watch service implementation
use crate::conversions::make_header;
use crate::proto::{
watch_server::Watch, WatchRequest, WatchResponse,
};
use crate::proto::{watch_server::Watch, WatchRequest, WatchResponse};
use chainfire_watch::{WatchRegistry, WatchStream};
use std::pin::Pin;
use std::sync::Arc;
@ -39,7 +37,8 @@ impl WatchServiceImpl {
#[tonic::async_trait]
impl Watch for WatchServiceImpl {
type WatchStream = Pin<Box<dyn tokio_stream::Stream<Item = Result<WatchResponse, Status>> + Send>>;
type WatchStream =
Pin<Box<dyn tokio_stream::Stream<Item = Result<WatchResponse, Status>> + Send>>;
async fn watch(
&self,
@ -81,13 +80,17 @@ impl Watch for WatchServiceImpl {
Ok(req) => {
if let Some(request_union) = req.request_union {
let response = match request_union {
crate::proto::watch_request::RequestUnion::CreateRequest(create) => {
crate::proto::watch_request::RequestUnion::CreateRequest(
create,
) => {
let internal_req: chainfire_types::watch::WatchRequest =
create.into();
let resp = stream.create_watch(internal_req);
internal_to_proto_response(resp, cluster_id, member_id)
}
crate::proto::watch_request::RequestUnion::CancelRequest(cancel) => {
crate::proto::watch_request::RequestUnion::CancelRequest(
cancel,
) => {
let resp = stream.cancel_watch(cancel.watch_id);
internal_to_proto_response(resp, cluster_id, member_id)
}

View file

@ -3,35 +3,12 @@ name = "chainfire-core"
version.workspace = true
edition.workspace = true
license.workspace = true
description = "Embeddable distributed cluster library with Raft consensus and SWIM gossip"
description = "Internal compatibility crate for non-public ChainFire workspace types"
rust-version.workspace = true
publish = false
[dependencies]
# Internal crates
chainfire-types = { workspace = true }
chainfire-gossip = { workspace = true }
# Note: chainfire-storage, chainfire-raft, chainfire-watch
# will be added as implementation progresses
# chainfire-storage = { workspace = true }
# chainfire-raft = { workspace = true }
# chainfire-watch = { workspace = true }
# Async runtime
tokio = { workspace = true }
tokio-stream = { workspace = true }
futures = { workspace = true }
async-trait = { workspace = true }
# Utilities
thiserror = { workspace = true }
tracing = { workspace = true }
bytes = { workspace = true }
parking_lot = { workspace = true }
dashmap = { workspace = true }
[dev-dependencies]
tokio = { workspace = true, features = ["test-util"] }
tempfile = { workspace = true }
[lints]
workspace = true

View file

@ -1,238 +0,0 @@
//! Builder pattern for cluster creation
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use chainfire_gossip::{GossipAgent, GossipId};
use chainfire_types::node::NodeRole;
use chainfire_types::RaftRole;
use crate::callbacks::{ClusterEventHandler, KvEventHandler};
use crate::cluster::Cluster;
use crate::config::{ClusterConfig, MemberConfig, StorageBackendConfig, TimeoutConfig};
use crate::error::{ClusterError, Result};
use crate::events::EventDispatcher;
/// Builder for creating a Chainfire cluster instance
///
/// # Example
///
/// ```ignore
/// use chainfire_core::ClusterBuilder;
///
/// let cluster = ClusterBuilder::new(1)
/// .name("node-1")
/// .gossip_addr("0.0.0.0:7946".parse()?)
/// .raft_addr("0.0.0.0:2380".parse()?)
/// .bootstrap(true)
/// .build()
/// .await?;
/// ```
pub struct ClusterBuilder {
config: ClusterConfig,
cluster_handlers: Vec<Arc<dyn ClusterEventHandler>>,
kv_handlers: Vec<Arc<dyn KvEventHandler>>,
}
impl ClusterBuilder {
/// Create a new cluster builder with the given node ID
pub fn new(node_id: u64) -> Self {
Self {
config: ClusterConfig {
node_id,
..Default::default()
},
cluster_handlers: Vec::new(),
kv_handlers: Vec::new(),
}
}
/// Set the node name
pub fn name(mut self, name: impl Into<String>) -> Self {
self.config.node_name = name.into();
self
}
/// Set the node role (ControlPlane or Worker)
pub fn role(mut self, role: NodeRole) -> Self {
self.config.node_role = role;
self
}
/// Set the Raft participation role (Voter, Learner, or None)
pub fn raft_role(mut self, role: RaftRole) -> Self {
self.config.raft_role = role;
self
}
/// Set the API listen address
pub fn api_addr(mut self, addr: SocketAddr) -> Self {
self.config.api_addr = Some(addr);
self
}
/// Set the Raft listen address (for control plane nodes)
pub fn raft_addr(mut self, addr: SocketAddr) -> Self {
self.config.raft_addr = Some(addr);
self
}
/// Set the gossip listen address
pub fn gossip_addr(mut self, addr: SocketAddr) -> Self {
self.config.gossip_addr = addr;
self
}
/// Set the storage backend
pub fn storage(mut self, backend: StorageBackendConfig) -> Self {
self.config.storage = backend;
self
}
/// Set the data directory (convenience method for RocksDB storage)
pub fn data_dir(mut self, path: impl Into<PathBuf>) -> Self {
self.config.storage = StorageBackendConfig::RocksDb { path: path.into() };
self
}
/// Use in-memory storage
pub fn memory_storage(mut self) -> Self {
self.config.storage = StorageBackendConfig::Memory;
self
}
/// Add initial cluster members (for bootstrap)
pub fn initial_members(mut self, members: Vec<MemberConfig>) -> Self {
self.config.initial_members = members;
self
}
/// Add a single initial member
pub fn add_member(mut self, member: MemberConfig) -> Self {
self.config.initial_members.push(member);
self
}
/// Enable cluster bootstrap (first node)
pub fn bootstrap(mut self, bootstrap: bool) -> Self {
self.config.bootstrap = bootstrap;
self
}
/// Set the cluster ID
pub fn cluster_id(mut self, id: u64) -> Self {
self.config.cluster_id = id;
self
}
/// Enable gRPC API server
pub fn with_grpc_api(mut self, enabled: bool) -> Self {
self.config.enable_grpc_api = enabled;
self
}
/// Set timeout configuration
pub fn timeouts(mut self, timeouts: TimeoutConfig) -> Self {
self.config.timeouts = timeouts;
self
}
/// Register a cluster event handler
///
/// Multiple handlers can be registered. They will all be called
/// when cluster events occur.
pub fn on_cluster_event<H>(mut self, handler: H) -> Self
where
H: ClusterEventHandler + 'static,
{
self.cluster_handlers.push(Arc::new(handler));
self
}
/// Register a cluster event handler (Arc version)
pub fn on_cluster_event_arc(mut self, handler: Arc<dyn ClusterEventHandler>) -> Self {
self.cluster_handlers.push(handler);
self
}
/// Register a KV event handler
///
/// Multiple handlers can be registered. They will all be called
/// when KV events occur.
pub fn on_kv_event<H>(mut self, handler: H) -> Self
where
H: KvEventHandler + 'static,
{
self.kv_handlers.push(Arc::new(handler));
self
}
/// Register a KV event handler (Arc version)
pub fn on_kv_event_arc(mut self, handler: Arc<dyn KvEventHandler>) -> Self {
self.kv_handlers.push(handler);
self
}
/// Validate the configuration
fn validate(&self) -> Result<()> {
if self.config.node_id == 0 {
return Err(ClusterError::Config("node_id must be non-zero".into()));
}
if self.config.node_name.is_empty() {
return Err(ClusterError::Config("node_name is required".into()));
}
// Raft-participating nodes need a Raft address
if self.config.raft_role.participates_in_raft() && self.config.raft_addr.is_none() {
return Err(ClusterError::Config(
"raft_addr is required for Raft-participating nodes".into(),
));
}
Ok(())
}
/// Build the cluster instance
///
/// This initializes the storage backend, Raft (if applicable), and gossip.
pub async fn build(self) -> Result<Cluster> {
self.validate()?;
// Create event dispatcher with registered handlers
let mut event_dispatcher = EventDispatcher::new();
for handler in self.cluster_handlers {
event_dispatcher.add_cluster_handler(handler);
}
for handler in self.kv_handlers {
event_dispatcher.add_kv_handler(handler);
}
// Initialize gossip agent
let gossip_identity = GossipId::new(
self.config.node_id,
self.config.gossip_addr,
self.config.node_role,
);
let gossip_agent = GossipAgent::new(gossip_identity, chainfire_gossip::agent::default_config())
.await
.map_err(|e| ClusterError::Gossip(e.to_string()))?;
tracing::info!(
node_id = self.config.node_id,
gossip_addr = %self.config.gossip_addr,
"Gossip agent initialized"
);
// Create the cluster
let cluster = Cluster::new(self.config, Some(gossip_agent), event_dispatcher);
// TODO: Initialize storage backend
// TODO: Initialize Raft if role participates
// TODO: Start background tasks
Ok(cluster)
}
}

View file

@ -1,103 +0,0 @@
//! Callback traits for cluster events
use async_trait::async_trait;
use chainfire_types::node::NodeInfo;
use crate::kvs::KvEntry;
/// Handler for cluster lifecycle events
///
/// Implement this trait to receive notifications about cluster membership
/// and leadership changes.
#[async_trait]
pub trait ClusterEventHandler: Send + Sync {
/// Called when a node joins the cluster
async fn on_node_joined(&self, _node: &NodeInfo) {}
/// Called when a node leaves the cluster
async fn on_node_left(&self, _node_id: u64, _reason: LeaveReason) {}
/// Called when leadership changes
async fn on_leader_changed(&self, _old_leader: Option<u64>, _new_leader: u64) {}
/// Called when this node becomes leader
async fn on_became_leader(&self) {}
/// Called when this node loses leadership
async fn on_lost_leadership(&self) {}
/// Called when cluster membership changes
async fn on_membership_changed(&self, _members: &[NodeInfo]) {}
/// Called when a network partition is detected
async fn on_partition_detected(&self, _reachable: &[u64], _unreachable: &[u64]) {}
/// Called when the cluster is ready (initial leader elected, etc.)
async fn on_cluster_ready(&self) {}
}
/// Handler for KV store events
///
/// Implement this trait to receive notifications about key-value changes.
#[async_trait]
pub trait KvEventHandler: Send + Sync {
/// Called when a key is created or updated
async fn on_key_changed(
&self,
_namespace: &str,
_key: &[u8],
_value: &[u8],
_revision: u64,
) {
}
/// Called when a key is deleted
async fn on_key_deleted(&self, _namespace: &str, _key: &[u8], _revision: u64) {}
/// Called when multiple keys with a prefix are changed
async fn on_prefix_changed(&self, _namespace: &str, _prefix: &[u8], _entries: &[KvEntry]) {}
}
/// Reason for node departure from the cluster
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum LeaveReason {
/// Node left gracefully
Graceful,
/// Node timed out (failed to respond)
Timeout,
/// Network partition detected
NetworkPartition,
/// Node was explicitly evicted
Evicted,
/// Unknown reason
Unknown,
}
impl std::fmt::Display for LeaveReason {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
LeaveReason::Graceful => write!(f, "graceful"),
LeaveReason::Timeout => write!(f, "timeout"),
LeaveReason::NetworkPartition => write!(f, "network_partition"),
LeaveReason::Evicted => write!(f, "evicted"),
LeaveReason::Unknown => write!(f, "unknown"),
}
}
}
/// A no-op event handler for when callbacks are not needed
pub struct NoOpClusterEventHandler;
#[async_trait]
impl ClusterEventHandler for NoOpClusterEventHandler {}
/// A no-op KV event handler
pub struct NoOpKvEventHandler;
#[async_trait]
impl KvEventHandler for NoOpKvEventHandler {}

View file

@ -1,313 +0,0 @@
//! Cluster management
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;
use parking_lot::RwLock;
use tokio::sync::broadcast;
use chainfire_gossip::{GossipAgent, MembershipChange};
use chainfire_types::node::NodeInfo;
use crate::config::ClusterConfig;
use crate::error::{ClusterError, Result};
use crate::events::EventDispatcher;
use crate::kvs::{Kv, KvHandle};
/// Current state of the cluster
#[derive(Debug, Clone)]
#[derive(Default)]
pub struct ClusterState {
/// Whether this node is the leader
pub is_leader: bool,
/// Current leader's node ID
pub leader_id: Option<u64>,
/// Current term (Raft)
pub term: u64,
/// All known cluster members
pub members: Vec<NodeInfo>,
/// Whether the cluster is ready (initial leader elected)
pub ready: bool,
}
/// Main cluster instance
///
/// This is the primary interface for interacting with a Chainfire cluster.
/// It manages Raft consensus, gossip membership, and the distributed KV store.
pub struct Cluster {
/// Node configuration
config: ClusterConfig,
/// Current cluster state
state: Arc<RwLock<ClusterState>>,
/// KV store
kv: Arc<Kv>,
/// Gossip agent for cluster membership
gossip_agent: Option<GossipAgent>,
/// Event dispatcher
event_dispatcher: Arc<EventDispatcher>,
/// Shutdown flag
shutdown: AtomicBool,
/// Shutdown signal sender
shutdown_tx: broadcast::Sender<()>,
}
impl Cluster {
/// Create a new cluster instance
pub(crate) fn new(
config: ClusterConfig,
gossip_agent: Option<GossipAgent>,
event_dispatcher: EventDispatcher,
) -> Self {
let (shutdown_tx, _) = broadcast::channel(1);
Self {
config,
state: Arc::new(RwLock::new(ClusterState::default())),
kv: Arc::new(Kv::new()),
gossip_agent,
event_dispatcher: Arc::new(event_dispatcher),
shutdown: AtomicBool::new(false),
shutdown_tx,
}
}
/// Get this node's ID
pub fn node_id(&self) -> u64 {
self.config.node_id
}
/// Get this node's name
pub fn node_name(&self) -> &str {
&self.config.node_name
}
/// Get a handle for interacting with the cluster
///
/// Handles are lightweight and can be cloned freely.
pub fn handle(&self) -> ClusterHandle {
ClusterHandle {
node_id: self.config.node_id,
state: self.state.clone(),
kv: self.kv.clone(),
shutdown_tx: self.shutdown_tx.clone(),
}
}
/// Get the KV store interface
pub fn kv(&self) -> &Arc<Kv> {
&self.kv
}
/// Get current cluster state
pub fn state(&self) -> ClusterState {
self.state.read().clone()
}
/// Check if this node is the leader
pub fn is_leader(&self) -> bool {
self.state.read().is_leader
}
/// Get current leader ID
pub fn leader(&self) -> Option<u64> {
self.state.read().leader_id
}
/// Get all cluster members
pub fn members(&self) -> Vec<NodeInfo> {
self.state.read().members.clone()
}
/// Check if the cluster is ready
pub fn is_ready(&self) -> bool {
self.state.read().ready
}
/// Join an existing cluster
///
/// Connects to seed nodes and joins the cluster via gossip.
pub async fn join(&mut self, seed_addrs: &[std::net::SocketAddr]) -> Result<()> {
if seed_addrs.is_empty() {
return Err(ClusterError::Config("No seed addresses provided".into()));
}
let gossip_agent = self.gossip_agent.as_mut().ok_or_else(|| {
ClusterError::Config("Gossip agent not initialized".into())
})?;
// Announce to all seed nodes to discover the cluster
for &addr in seed_addrs {
tracing::info!(%addr, "Announcing to seed node");
gossip_agent
.announce(addr)
.map_err(|e| ClusterError::Gossip(e.to_string()))?;
}
tracing::info!(seeds = seed_addrs.len(), "Joined cluster via gossip");
Ok(())
}
/// Leave the cluster gracefully
pub async fn leave(&self) -> Result<()> {
// TODO: Implement graceful leave
self.shutdown();
Ok(())
}
/// Add a new node to the cluster (leader only)
pub async fn add_node(&self, _node: NodeInfo, _as_learner: bool) -> Result<()> {
if !self.is_leader() {
return Err(ClusterError::NotLeader {
leader_id: self.leader(),
});
}
// TODO: Implement node addition via Raft
Ok(())
}
/// Remove a node from the cluster (leader only)
pub async fn remove_node(&self, _node_id: u64) -> Result<()> {
if !self.is_leader() {
return Err(ClusterError::NotLeader {
leader_id: self.leader(),
});
}
// TODO: Implement node removal via Raft
Ok(())
}
/// Promote a learner to voter (leader only)
pub async fn promote_learner(&self, _node_id: u64) -> Result<()> {
if !self.is_leader() {
return Err(ClusterError::NotLeader {
leader_id: self.leader(),
});
}
// TODO: Implement learner promotion via Raft
Ok(())
}
/// Run the cluster (blocks until shutdown)
pub async fn run(self) -> Result<()> {
self.run_until_shutdown(std::future::pending()).await
}
/// Run with graceful shutdown signal
pub async fn run_until_shutdown<F>(mut self, shutdown_signal: F) -> Result<()>
where
F: std::future::Future<Output = ()>,
{
let mut shutdown_rx = self.shutdown_tx.subscribe();
// Start gossip agent if present
let gossip_task = if let Some(mut gossip_agent) = self.gossip_agent.take() {
let state = self.state.clone();
let shutdown_rx_gossip = self.shutdown_tx.subscribe();
// Spawn task to handle gossip membership changes
Some(tokio::spawn(async move {
// Run the gossip agent with shutdown signal
if let Err(e) = gossip_agent.run_until_shutdown(shutdown_rx_gossip).await {
tracing::error!(error = %e, "Gossip agent error");
}
}))
} else {
None
};
tokio::select! {
_ = shutdown_signal => {
tracing::info!("Received shutdown signal");
}
_ = shutdown_rx.recv() => {
tracing::info!("Received internal shutdown");
}
}
// Wait for gossip task to finish
if let Some(task) = gossip_task {
let _ = task.await;
}
Ok(())
}
/// Trigger shutdown
pub fn shutdown(&self) {
self.shutdown.store(true, Ordering::SeqCst);
let _ = self.shutdown_tx.send(());
}
/// Check if shutdown was requested
pub fn is_shutting_down(&self) -> bool {
self.shutdown.load(Ordering::SeqCst)
}
/// Get the event dispatcher
pub(crate) fn event_dispatcher(&self) -> &Arc<EventDispatcher> {
&self.event_dispatcher
}
}
/// Lightweight handle for cluster operations
///
/// This handle can be cloned and passed around cheaply. It provides
/// access to cluster state and the KV store without owning the cluster.
#[derive(Clone)]
pub struct ClusterHandle {
node_id: u64,
state: Arc<RwLock<ClusterState>>,
kv: Arc<Kv>,
shutdown_tx: broadcast::Sender<()>,
}
impl ClusterHandle {
/// Get this node's ID
pub fn node_id(&self) -> u64 {
self.node_id
}
/// Get a KV handle
pub fn kv(&self) -> KvHandle {
KvHandle::new(self.kv.clone())
}
/// Check if this node is the leader
pub fn is_leader(&self) -> bool {
self.state.read().is_leader
}
/// Get current leader ID
pub fn leader(&self) -> Option<u64> {
self.state.read().leader_id
}
/// Get all cluster members
pub fn members(&self) -> Vec<NodeInfo> {
self.state.read().members.clone()
}
/// Get current cluster state
pub fn state(&self) -> ClusterState {
self.state.read().clone()
}
/// Trigger cluster shutdown
pub fn shutdown(&self) {
let _ = self.shutdown_tx.send(());
}
}

View file

@ -1,162 +0,0 @@
//! Configuration types for chainfire-core
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use chainfire_types::node::NodeRole;
use chainfire_types::RaftRole;
// Forward declaration - will be implemented in chainfire-storage
// For now, use a placeholder trait
use async_trait::async_trait;
/// Storage backend trait for pluggable storage
#[async_trait]
pub trait StorageBackend: Send + Sync {
/// Get a value by key
async fn get(&self, key: &[u8]) -> std::io::Result<Option<Vec<u8>>>;
/// Put a value
async fn put(&self, key: &[u8], value: &[u8]) -> std::io::Result<()>;
/// Delete a key
async fn delete(&self, key: &[u8]) -> std::io::Result<bool>;
}
/// Configuration for a cluster node
#[derive(Debug, Clone)]
pub struct ClusterConfig {
/// Unique node ID
pub node_id: u64,
/// Human-readable node name
pub node_name: String,
/// Node role (ControlPlane or Worker)
pub node_role: NodeRole,
/// Raft participation role (Voter, Learner, or None)
pub raft_role: RaftRole,
/// API listen address for client connections
pub api_addr: Option<SocketAddr>,
/// Raft listen address for peer-to-peer Raft communication
pub raft_addr: Option<SocketAddr>,
/// Gossip listen address for membership discovery
pub gossip_addr: SocketAddr,
/// Storage backend configuration
pub storage: StorageBackendConfig,
/// Initial cluster members for bootstrap
pub initial_members: Vec<MemberConfig>,
/// Whether to bootstrap the cluster (first node)
pub bootstrap: bool,
/// Cluster ID
pub cluster_id: u64,
/// Enable gRPC API server
pub enable_grpc_api: bool,
/// Timeouts
pub timeouts: TimeoutConfig,
}
impl Default for ClusterConfig {
fn default() -> Self {
Self {
node_id: 0,
node_name: String::new(),
node_role: NodeRole::ControlPlane,
raft_role: RaftRole::Voter,
api_addr: None,
raft_addr: None,
gossip_addr: "0.0.0.0:7946".parse().unwrap(),
storage: StorageBackendConfig::Memory,
initial_members: Vec::new(),
bootstrap: false,
cluster_id: 1,
enable_grpc_api: false,
timeouts: TimeoutConfig::default(),
}
}
}
/// Storage backend configuration
#[derive(Clone)]
pub enum StorageBackendConfig {
/// In-memory storage (for testing/simple deployments)
Memory,
/// RocksDB storage
RocksDb {
/// Data directory path
path: PathBuf,
},
/// Custom storage backend
Custom(Arc<dyn StorageBackend>),
}
impl std::fmt::Debug for StorageBackendConfig {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
StorageBackendConfig::Memory => write!(f, "Memory"),
StorageBackendConfig::RocksDb { path } => {
f.debug_struct("RocksDb").field("path", path).finish()
}
StorageBackendConfig::Custom(_) => write!(f, "Custom(...)"),
}
}
}
/// Configuration for a cluster member
#[derive(Debug, Clone)]
pub struct MemberConfig {
/// Node ID
pub id: u64,
/// Node name
pub name: String,
/// Raft address
pub raft_addr: String,
/// Client API address
pub client_addr: String,
}
/// Timeout configuration
#[derive(Debug, Clone)]
pub struct TimeoutConfig {
/// Raft heartbeat interval
pub heartbeat_interval: Duration,
/// Raft election timeout range (min)
pub election_timeout_min: Duration,
/// Raft election timeout range (max)
pub election_timeout_max: Duration,
/// Connection timeout
pub connection_timeout: Duration,
/// Request timeout
pub request_timeout: Duration,
}
impl Default for TimeoutConfig {
fn default() -> Self {
Self {
heartbeat_interval: Duration::from_millis(150),
election_timeout_min: Duration::from_millis(300),
election_timeout_max: Duration::from_millis(600),
connection_timeout: Duration::from_secs(5),
request_timeout: Duration::from_secs(10),
}
}
}

View file

@ -1,198 +0,0 @@
//! Event types and dispatcher
use std::sync::Arc;
use tokio::sync::broadcast;
use chainfire_types::node::NodeInfo;
use crate::callbacks::{ClusterEventHandler, KvEventHandler, LeaveReason};
/// Cluster-level events
#[derive(Debug, Clone)]
pub enum ClusterEvent {
/// A node joined the cluster
NodeJoined(NodeInfo),
/// A node left the cluster
NodeLeft {
/// The node ID that left
node_id: u64,
/// Why the node left
reason: LeaveReason,
},
/// Leadership changed
LeaderChanged {
/// Previous leader (None if no previous leader)
old: Option<u64>,
/// New leader
new: u64,
},
/// This node became the leader
BecameLeader,
/// This node lost leadership
LostLeadership,
/// Cluster membership changed
MembershipChanged(Vec<NodeInfo>),
/// Network partition detected
PartitionDetected {
/// Nodes that are reachable
reachable: Vec<u64>,
/// Nodes that are unreachable
unreachable: Vec<u64>,
},
/// Cluster is ready
ClusterReady,
}
/// KV store events
#[derive(Debug, Clone)]
pub enum KvEvent {
/// A key was created or updated
KeyChanged {
/// Namespace of the key
namespace: String,
/// The key that changed
key: Vec<u8>,
/// New value
value: Vec<u8>,
/// Revision number
revision: u64,
},
/// A key was deleted
KeyDeleted {
/// Namespace of the key
namespace: String,
/// The key that was deleted
key: Vec<u8>,
/// Revision number
revision: u64,
},
}
/// Event dispatcher that manages callbacks and event broadcasting
pub struct EventDispatcher {
cluster_handlers: Vec<Arc<dyn ClusterEventHandler>>,
kv_handlers: Vec<Arc<dyn KvEventHandler>>,
event_tx: broadcast::Sender<ClusterEvent>,
}
impl EventDispatcher {
/// Create a new event dispatcher
pub fn new() -> Self {
let (event_tx, _) = broadcast::channel(1024);
Self {
cluster_handlers: Vec::new(),
kv_handlers: Vec::new(),
event_tx,
}
}
/// Add a cluster event handler
pub fn add_cluster_handler(&mut self, handler: Arc<dyn ClusterEventHandler>) {
self.cluster_handlers.push(handler);
}
/// Add a KV event handler
pub fn add_kv_handler(&mut self, handler: Arc<dyn KvEventHandler>) {
self.kv_handlers.push(handler);
}
/// Get a subscriber for cluster events
pub fn subscribe(&self) -> broadcast::Receiver<ClusterEvent> {
self.event_tx.subscribe()
}
/// Dispatch a cluster event to all handlers
pub async fn dispatch_cluster_event(&self, event: ClusterEvent) {
// Broadcast to channel subscribers
let _ = self.event_tx.send(event.clone());
// Call registered handlers
match &event {
ClusterEvent::NodeJoined(node) => {
for handler in &self.cluster_handlers {
handler.on_node_joined(node).await;
}
}
ClusterEvent::NodeLeft { node_id, reason } => {
for handler in &self.cluster_handlers {
handler.on_node_left(*node_id, *reason).await;
}
}
ClusterEvent::LeaderChanged { old, new } => {
for handler in &self.cluster_handlers {
handler.on_leader_changed(*old, *new).await;
}
}
ClusterEvent::BecameLeader => {
for handler in &self.cluster_handlers {
handler.on_became_leader().await;
}
}
ClusterEvent::LostLeadership => {
for handler in &self.cluster_handlers {
handler.on_lost_leadership().await;
}
}
ClusterEvent::MembershipChanged(members) => {
for handler in &self.cluster_handlers {
handler.on_membership_changed(members).await;
}
}
ClusterEvent::PartitionDetected {
reachable,
unreachable,
} => {
for handler in &self.cluster_handlers {
handler.on_partition_detected(reachable, unreachable).await;
}
}
ClusterEvent::ClusterReady => {
for handler in &self.cluster_handlers {
handler.on_cluster_ready().await;
}
}
}
}
/// Dispatch a KV event to all handlers
pub async fn dispatch_kv_event(&self, event: KvEvent) {
match &event {
KvEvent::KeyChanged {
namespace,
key,
value,
revision,
} => {
for handler in &self.kv_handlers {
handler
.on_key_changed(namespace, key, value, *revision)
.await;
}
}
KvEvent::KeyDeleted {
namespace,
key,
revision,
} => {
for handler in &self.kv_handlers {
handler.on_key_deleted(namespace, key, *revision).await;
}
}
}
}
}
impl Default for EventDispatcher {
fn default() -> Self {
Self::new()
}
}

View file

@ -1,290 +0,0 @@
//! Key-Value store abstraction
use std::sync::Arc;
use std::time::Duration;
use dashmap::DashMap;
use crate::error::{ClusterError, Result};
/// KV store interface
///
/// Provides access to distributed key-value storage with namespace isolation.
pub struct Kv {
namespaces: DashMap<String, Arc<KvNamespace>>,
default_namespace: Arc<KvNamespace>,
}
impl Kv {
/// Create a new KV store
pub(crate) fn new() -> Self {
let default_namespace = Arc::new(KvNamespace::new("default".to_string()));
Self {
namespaces: DashMap::new(),
default_namespace,
}
}
/// Get or create a namespace
pub fn namespace(&self, name: &str) -> Arc<KvNamespace> {
if name == "default" {
return self.default_namespace.clone();
}
self.namespaces
.entry(name.to_string())
.or_insert_with(|| Arc::new(KvNamespace::new(name.to_string())))
.clone()
}
/// Get the default namespace
pub fn default_namespace(&self) -> &Arc<KvNamespace> {
&self.default_namespace
}
// Convenience methods on default namespace
/// Get a value by key from the default namespace
pub async fn get(&self, key: impl AsRef<[u8]>) -> Result<Option<Vec<u8>>> {
self.default_namespace.get(key).await
}
/// Put a value in the default namespace
pub async fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result<u64> {
self.default_namespace.put(key, value).await
}
/// Delete a key from the default namespace
pub async fn delete(&self, key: impl AsRef<[u8]>) -> Result<bool> {
self.default_namespace.delete(key).await
}
/// Compare-and-swap in the default namespace
pub async fn compare_and_swap(
&self,
key: impl AsRef<[u8]>,
expected_version: u64,
value: impl AsRef<[u8]>,
) -> Result<CasResult> {
self.default_namespace
.compare_and_swap(key, expected_version, value)
.await
}
}
/// KV namespace for data isolation
pub struct KvNamespace {
name: String,
// TODO: Add storage backend and raft reference
}
impl KvNamespace {
pub(crate) fn new(name: String) -> Self {
Self { name }
}
/// Get the namespace name
pub fn name(&self) -> &str {
&self.name
}
/// Get a value by key
pub async fn get(&self, _key: impl AsRef<[u8]>) -> Result<Option<Vec<u8>>> {
// TODO: Implement with storage backend
Ok(None)
}
/// Get with revision
pub async fn get_with_revision(
&self,
_key: impl AsRef<[u8]>,
) -> Result<Option<(Vec<u8>, u64)>> {
// TODO: Implement with storage backend
Ok(None)
}
/// Put a value (goes through Raft if available)
pub async fn put(&self, _key: impl AsRef<[u8]>, _value: impl AsRef<[u8]>) -> Result<u64> {
// TODO: Implement with Raft
Ok(0)
}
/// Put with options
pub async fn put_with_options(
&self,
_key: impl AsRef<[u8]>,
_value: impl AsRef<[u8]>,
_options: KvOptions,
) -> Result<KvPutResult> {
// TODO: Implement with Raft
Ok(KvPutResult {
revision: 0,
prev_value: None,
})
}
/// Delete a key
pub async fn delete(&self, _key: impl AsRef<[u8]>) -> Result<bool> {
// TODO: Implement with Raft
Ok(false)
}
/// Compare-and-swap
pub async fn compare_and_swap(
&self,
_key: impl AsRef<[u8]>,
expected_version: u64,
_value: impl AsRef<[u8]>,
) -> Result<CasResult> {
// TODO: Implement with storage backend
Err(ClusterError::VersionMismatch {
expected: expected_version,
actual: 0,
})
}
/// Scan keys with prefix
pub async fn scan_prefix(
&self,
_prefix: impl AsRef<[u8]>,
_limit: u32,
) -> Result<Vec<KvEntry>> {
// TODO: Implement with storage backend
Ok(Vec::new())
}
/// Scan keys in a range
pub async fn scan_range(
&self,
_start: impl AsRef<[u8]>,
_end: impl AsRef<[u8]>,
_limit: u32,
) -> Result<Vec<KvEntry>> {
// TODO: Implement with storage backend
Ok(Vec::new())
}
/// Get with specified consistency level
pub async fn get_with_consistency(
&self,
_key: impl AsRef<[u8]>,
_consistency: ReadConsistency,
) -> Result<Option<Vec<u8>>> {
// TODO: Implement with consistency options
Ok(None)
}
}
/// Options for KV operations
#[derive(Debug, Clone, Default)]
pub struct KvOptions {
/// Lease ID for TTL-based expiration
pub lease_id: Option<u64>,
/// Return previous value
pub prev_kv: bool,
/// Time-to-live for the key
pub ttl: Option<Duration>,
}
/// Result of a put operation
#[derive(Debug, Clone)]
pub struct KvPutResult {
/// New revision after the put
pub revision: u64,
/// Previous value, if requested and existed
pub prev_value: Option<Vec<u8>>,
}
/// A key-value entry with metadata
#[derive(Debug, Clone)]
pub struct KvEntry {
/// The key
pub key: Vec<u8>,
/// The value
pub value: Vec<u8>,
/// Revision when the key was created
pub create_revision: u64,
/// Revision when the key was last modified
pub mod_revision: u64,
/// Version number (increments on each update)
pub version: u64,
/// Lease ID if the key is attached to a lease
pub lease_id: Option<u64>,
}
/// Result of a compare-and-swap operation
#[derive(Debug, Clone)]
pub enum CasResult {
/// CAS succeeded, contains new revision
Success(u64),
/// CAS failed due to version mismatch
Conflict {
/// Expected version
expected: u64,
/// Actual version found
actual: u64,
},
/// Key did not exist
NotFound,
}
/// Read consistency level
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ReadConsistency {
/// Read from local storage (may be stale)
Local,
/// Read from any node, but verify with leader's committed index
Serializable,
/// Read only from leader (linearizable, strongest guarantee)
#[default]
Linearizable,
}
/// Lightweight handle for KV operations
#[derive(Clone)]
pub struct KvHandle {
kv: Arc<Kv>,
}
impl KvHandle {
pub(crate) fn new(kv: Arc<Kv>) -> Self {
Self { kv }
}
/// Get the underlying KV store
pub fn inner(&self) -> &Arc<Kv> {
&self.kv
}
/// Get a value by key
pub async fn get(&self, key: impl AsRef<[u8]>) -> Result<Option<Vec<u8>>> {
self.kv.get(key).await
}
/// Put a value
pub async fn put(&self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result<u64> {
self.kv.put(key, value).await
}
/// Delete a key
pub async fn delete(&self, key: impl AsRef<[u8]>) -> Result<bool> {
self.kv.delete(key).await
}
/// Get a namespace
pub fn namespace(&self, name: &str) -> Arc<KvNamespace> {
self.kv.namespace(name)
}
}

View file

@ -1,58 +1,10 @@
//! Chainfire Core - Embeddable distributed cluster library
//! Internal compatibility crate for workspace-local ChainFire types.
//!
//! This crate provides cluster management, distributed KVS, and event callbacks
//! for embedding Raft consensus and SWIM gossip into applications.
//!
//! # Example
//!
//! ```ignore
//! use chainfire_core::{ClusterBuilder, ClusterEventHandler};
//! use std::net::SocketAddr;
//!
//! struct MyHandler;
//!
//! impl ClusterEventHandler for MyHandler {
//! async fn on_leader_changed(&self, old: Option<u64>, new: u64) {
//! println!("Leader changed: {:?} -> {}", old, new);
//! }
//! }
//!
//! #[tokio::main]
//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
//! let cluster = ClusterBuilder::new(1)
//! .name("node-1")
//! .gossip_addr("0.0.0.0:7946".parse()?)
//! .raft_addr("0.0.0.0:2380".parse()?)
//! .on_cluster_event(MyHandler)
//! .build()
//! .await?;
//!
//! // Use the KVS
//! cluster.kv().put("key", b"value").await?;
//!
//! Ok(())
//! }
//! ```
//! The supported ChainFire product surface is the fixed-membership
//! `chainfire-server` / `chainfire-api` contract documented in the repository
//! root. This crate intentionally does not export an embeddable cluster,
//! membership-mutation, or distributed-KV API.
pub mod builder;
pub mod callbacks;
pub mod cluster;
pub mod config;
pub mod error;
pub mod events;
pub mod kvs;
mod error;
// Re-exports from chainfire-types
pub use chainfire_types::{
node::{NodeId, NodeInfo, NodeRole},
RaftRole,
};
// Re-exports from this crate
pub use builder::ClusterBuilder;
pub use callbacks::{ClusterEventHandler, KvEventHandler, LeaveReason};
pub use cluster::{Cluster, ClusterHandle, ClusterState};
pub use config::{ClusterConfig, StorageBackend, StorageBackendConfig};
pub use error::{ClusterError, Result};
pub use events::{ClusterEvent, EventDispatcher, KvEvent};
pub use kvs::{CasResult, Kv, KvEntry, KvHandle, KvNamespace, KvOptions, ReadConsistency};

View file

@ -1,60 +0,0 @@
use async_trait::async_trait;
use chainfire_types::node::NodeInfo;
use crate::error::Result;
use std::net::SocketAddr;
/// Abstract interface for Gossip protocol
#[async_trait]
pub trait Gossip: Send + Sync {
/// Start the gossip agent
async fn start(&self) -> Result<()>;
/// Join a cluster via seed nodes
async fn join(&self, seeds: &[SocketAddr]) -> Result<()>;
/// Announce presence to a specific node
async fn announce(&self, addr: SocketAddr) -> Result<()>;
/// Get list of known members
fn members(&self) -> Vec<NodeInfo>;
/// Shutdown the gossip agent
async fn shutdown(&self) -> Result<()>;
}
/// Abstract interface for Consensus protocol (Raft)
#[async_trait]
pub trait Consensus: Send + Sync {
/// Initialize the consensus module
async fn initialize(&self) -> Result<()>;
/// Start the event loop
async fn run(&self) -> Result<()>;
/// Propose a command to the state machine
async fn propose(&self, data: Vec<u8>) -> Result<u64>;
/// Add a node to the consensus group
async fn add_node(&self, node_id: u64, addr: String, as_learner: bool) -> Result<()>;
/// Remove a node from the consensus group
async fn remove_node(&self, node_id: u64) -> Result<()>;
/// Check if this node is the leader
fn is_leader(&self) -> bool;
/// Get the current leader ID
fn leader_id(&self) -> Option<u64>;
}
/// Abstract interface for State Machine
pub trait StateMachine: Send + Sync {
/// Apply a committed entry
fn apply(&self, index: u64, data: &[u8]) -> Result<Vec<u8>>;
/// Take a snapshot of current state
fn snapshot(&self) -> Result<Vec<u8>>;
/// Restore state from a snapshot
fn restore(&self, snapshot: &[u8]) -> Result<()>;
}

View file

@ -141,7 +141,11 @@ impl ActualStateBroadcast {
}
}
debug!(node_id, timestamp = state.timestamp, "Received actual state");
debug!(
node_id,
timestamp = state.timestamp,
"Received actual state"
);
self.cluster_state.insert(node_id, state);
true
}

View file

@ -77,13 +77,7 @@ impl Identity for GossipId {
impl std::fmt::Display for GossipId {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
"{}@{}:{}",
self.node_id,
self.addr,
self.incarnation
)
write!(f, "{}@{}:{}", self.node_id, self.addr, self.incarnation)
}
}

View file

@ -129,8 +129,14 @@ mod tests {
fn test_role_filtering() {
let state = MembershipState::new();
state.handle_change(MembershipChange::MemberUp(create_id(1, NodeRole::ControlPlane)));
state.handle_change(MembershipChange::MemberUp(create_id(2, NodeRole::ControlPlane)));
state.handle_change(MembershipChange::MemberUp(create_id(
1,
NodeRole::ControlPlane,
)));
state.handle_change(MembershipChange::MemberUp(create_id(
2,
NodeRole::ControlPlane,
)));
state.handle_change(MembershipChange::MemberUp(create_id(3, NodeRole::Worker)));
state.handle_change(MembershipChange::MemberUp(create_id(4, NodeRole::Worker)));
state.handle_change(MembershipChange::MemberUp(create_id(5, NodeRole::Worker)));

View file

@ -12,12 +12,12 @@
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::{mpsc, oneshot, RwLock, Mutex};
use tokio::sync::{mpsc, oneshot, Mutex, RwLock};
use tokio::time;
use chainfire_storage::{LogStorage, StateMachine, LogEntry, EntryPayload, LogId};
use chainfire_types::command::RaftCommand;
use crate::network::RaftRpcClient;
use chainfire_storage::{EntryPayload, LogEntry, LogId, LogStorage, StateMachine};
use chainfire_types::command::RaftCommand;
use tracing::{debug, trace};
pub type NodeId = u64;
@ -155,10 +155,7 @@ pub enum RaftEvent {
response_tx: oneshot::Sender<AppendEntriesResponse>,
},
/// RequestVote RPC response received
VoteResponse {
from: NodeId,
resp: VoteResponse,
},
VoteResponse { from: NodeId, resp: VoteResponse },
/// AppendEntries RPC response received
AppendEntriesResponse {
from: NodeId,
@ -305,7 +302,10 @@ impl RaftCore {
tracing::info!("No persistent state found, starting fresh");
}
Err(e) => {
return Err(RaftError::StorageError(format!("Failed to load vote: {}", e)));
return Err(RaftError::StorageError(format!(
"Failed to load vote: {}",
e
)));
}
}
Ok(())
@ -339,7 +339,10 @@ impl RaftCore {
// Main event loop
let mut event_rx = self.event_rx.lock().await;
eprintln!("[Node {}] EVENT LOOP acquired event_rx, starting recv loop", self.node_id);
eprintln!(
"[Node {}] EVENT LOOP acquired event_rx, starting recv loop",
self.node_id
);
loop {
tokio::select! {
@ -379,7 +382,10 @@ impl RaftCore {
RaftEvent::HeartbeatTimeout => {
self.handle_heartbeat_timeout().await?;
}
RaftEvent::ClientWrite { command, response_tx } => {
RaftEvent::ClientWrite {
command,
response_tx,
} => {
let result = self.handle_client_write(command).await;
let _ = response_tx.send(result);
}
@ -388,8 +394,10 @@ impl RaftCore {
let _ = response_tx.send(resp);
}
RaftEvent::AppendEntries { req, response_tx } => {
eprintln!("[Node {}] EVENT LOOP processing AppendEntries from {} term={}",
self.node_id, req.leader_id, req.term);
eprintln!(
"[Node {}] EVENT LOOP processing AppendEntries from {} term={}",
self.node_id, req.leader_id, req.term
);
let resp = self.handle_append_entries(req).await?;
let _ = response_tx.send(resp);
}
@ -411,11 +419,17 @@ impl RaftCore {
async fn handle_election_timeout(&self) -> Result<(), RaftError> {
let role = *self.role.read().await;
eprintln!("[Node {}] handle_election_timeout: role={:?}", self.node_id, role);
eprintln!(
"[Node {}] handle_election_timeout: role={:?}",
self.node_id, role
);
// Only followers and candidates start elections
if role == RaftRole::Leader {
eprintln!("[Node {}] Already leader, ignoring election timeout", self.node_id);
eprintln!(
"[Node {}] Already leader, ignoring election timeout",
self.node_id
);
return Ok(());
}
@ -433,7 +447,10 @@ impl RaftCore {
let current_term = persistent.current_term;
drop(persistent);
eprintln!("[Node {}] Starting election for term {}", self.node_id, current_term);
eprintln!(
"[Node {}] Starting election for term {}",
self.node_id, current_term
);
// Persist vote to storage before sending RPCs (Raft safety)
self.persist_vote().await?;
@ -448,11 +465,16 @@ impl RaftCore {
// Check if already have majority (single-node case)
let cluster_size = self.peers.len() + 1;
let majority = cluster_size / 2 + 1;
eprintln!("[Node {}] Cluster size={}, majority={}, peers={:?}",
self.node_id, cluster_size, majority, self.peers);
eprintln!(
"[Node {}] Cluster size={}, majority={}, peers={:?}",
self.node_id, cluster_size, majority, self.peers
);
if 1 >= majority {
// For single-node cluster, immediately become leader
eprintln!("[Node {}] Single-node cluster, becoming leader immediately", self.node_id);
eprintln!(
"[Node {}] Single-node cluster, becoming leader immediately",
self.node_id
);
self.become_leader().await?;
return Ok(());
}
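Reviewer note: the election path computes cluster_size = peers + 1 and majority = cluster_size / 2 + 1, so a single-node cluster wins the election with its own vote. A tiny worked sketch of that arithmetic; majority here is a free function added for illustration, not part of the crate.

```rust
// Worked sketch of the quorum arithmetic above: majority = cluster_size / 2 + 1
// (integer division), counting the candidate's own vote.
fn majority(cluster_size: usize) -> usize {
    cluster_size / 2 + 1
}

fn main() {
    assert_eq!(majority(1), 1); // single node: own vote wins immediately
    assert_eq!(majority(3), 2); // 3 voters: need self + 1 peer
    assert_eq!(majority(5), 3); // 5 voters: need self + 2 peers
    // The single-node fast path above is exactly the `1 >= majority` case.
    assert!(1 >= majority(1));
    println!("quorum math ok");
}
```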
@ -477,14 +499,16 @@ impl RaftCore {
tokio::spawn(async move {
// Send vote request via network (using real RaftRpcClient - GrpcRaftClient in production)
let resp = network.vote(peer_id, req).await
.unwrap_or(VoteResponse {
term: current_term,
vote_granted: false,
});
let resp = network.vote(peer_id, req).await.unwrap_or(VoteResponse {
term: current_term,
vote_granted: false,
});
// Send response back to main event loop
let _ = event_tx.send(RaftEvent::VoteResponse { from: peer_id, resp });
let _ = event_tx.send(RaftEvent::VoteResponse {
from: peer_id,
resp,
});
});
}
@ -515,8 +539,8 @@ impl RaftCore {
}
// Check if we can grant vote
let can_vote = persistent.voted_for.is_none()
|| persistent.voted_for == Some(req.candidate_id);
let can_vote =
persistent.voted_for.is_none() || persistent.voted_for == Some(req.candidate_id);
if !can_vote {
return Ok(VoteResponse {
@ -554,7 +578,11 @@ impl RaftCore {
}
/// Handle VoteResponse from a peer
async fn handle_vote_response(&self, from: NodeId, resp: VoteResponse) -> Result<(), RaftError> {
async fn handle_vote_response(
&self,
from: NodeId,
resp: VoteResponse,
) -> Result<(), RaftError> {
let role = *self.role.read().await;
let persistent = self.persistent.read().await;
@ -625,7 +653,8 @@ impl RaftCore {
*self.leader_state.write().await = Some(leader_state);
// Start sending heartbeats immediately
self.event_tx.send(RaftEvent::HeartbeatTimeout)
self.event_tx
.send(RaftEvent::HeartbeatTimeout)
.map_err(|e| RaftError::NetworkError(format!("Failed to send heartbeat: {}", e)))?;
Ok(())
@ -665,8 +694,10 @@ impl RaftCore {
let term = self.persistent.read().await.current_term;
let (last_log_index, _) = self.get_last_log_info().await?;
eprintln!("[Node {}] Sending heartbeat to peers: {:?} (term={})",
self.node_id, self.peers, term);
eprintln!(
"[Node {}] Sending heartbeat to peers: {:?} (term={})",
self.node_id, self.peers, term
);
// Send AppendEntries (with entries if available) to all peers
for peer_id in &self.peers {
@ -677,7 +708,8 @@ impl RaftCore {
// Get prevLogIndex and prevLogTerm for this peer
let leader_state = self.leader_state.read().await;
let next_index = leader_state.as_ref()
let next_index = leader_state
.as_ref()
.and_then(|ls| ls.next_index.get(&peer_id).copied())
.unwrap_or(1);
drop(leader_state);
@ -685,7 +717,8 @@ impl RaftCore {
let prev_log_index = next_index.saturating_sub(1);
let prev_log_term = if prev_log_index > 0 {
// Read as Vec<u8> since that's how it's stored
let entries: Vec<LogEntry<Vec<u8>>> = self.storage
let entries: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries(prev_log_index..=prev_log_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
@ -701,36 +734,55 @@ impl RaftCore {
// Get entries to send (if any)
let entries: Vec<LogEntry<RaftCommand>> = if next_index <= last_log_index {
// Read entries from storage (stored as Vec<u8>)
let stored_entries: Vec<LogEntry<Vec<u8>>> = self.storage
let stored_entries: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries(next_index..=last_log_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read log entries: {}", e)))?;
.map_err(|e| {
RaftError::StorageError(format!("Failed to read log entries: {}", e))
})?;
// Convert Vec<u8> back to RaftCommand
stored_entries.into_iter().map(|entry| {
let command = bincode::deserialize(match &entry.payload {
EntryPayload::Normal(data) => data,
EntryPayload::Blank => return Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Blank,
}),
EntryPayload::Membership(nodes) => return Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Membership(nodes.clone()),
}),
}).map_err(|e| RaftError::StorageError(format!("Failed to deserialize command: {}", e)))?;
stored_entries
.into_iter()
.map(|entry| {
let command = bincode::deserialize(match &entry.payload {
EntryPayload::Normal(data) => data,
EntryPayload::Blank => {
return Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Blank,
})
}
EntryPayload::Membership(nodes) => {
return Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Membership(nodes.clone()),
})
}
})
.map_err(|e| {
RaftError::StorageError(format!("Failed to deserialize command: {}", e))
})?;
Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Normal(command),
Ok(LogEntry {
log_id: entry.log_id,
payload: EntryPayload::Normal(command),
})
})
}).collect::<Result<Vec<_>, RaftError>>()?
.collect::<Result<Vec<_>, RaftError>>()?
} else {
// No entries to send, just heartbeat
vec![]
};
eprintln!("[Node {}] HEARTBEAT to {}: entries.len()={} next_index={} last_log_index={}",
self.node_id, peer_id, entries.len(), next_index, last_log_index);
eprintln!(
"[Node {}] HEARTBEAT to {}: entries.len()={} next_index={} last_log_index={}",
self.node_id,
peer_id,
entries.len(),
next_index,
last_log_index
);
let req = AppendEntriesRequest {
term,
@ -741,8 +793,10 @@ impl RaftCore {
leader_commit: commit_index,
};
eprintln!("[Node {}] LEADER sending to {}: leader_commit={}",
self.node_id, peer_id, commit_index);
eprintln!(
"[Node {}] LEADER sending to {}: leader_commit={}",
self.node_id, peer_id, commit_index
);
let network = Arc::clone(&self.network);
let event_tx = self.event_tx.clone();
@ -761,18 +815,25 @@ impl RaftCore {
Ok(())
}
async fn handle_append_entries(&self, req: AppendEntriesRequest) -> Result<AppendEntriesResponse, RaftError> {
async fn handle_append_entries(
&self,
req: AppendEntriesRequest,
) -> Result<AppendEntriesResponse, RaftError> {
let mut persistent = self.persistent.write().await;
let current_term = persistent.current_term;
// DIAGNOSTIC: Log all AppendEntries received
eprintln!("[Node {}] Received AppendEntries from {} term={} (my term={})",
self.node_id, req.leader_id, req.term, current_term);
eprintln!(
"[Node {}] Received AppendEntries from {} term={} (my term={})",
self.node_id, req.leader_id, req.term, current_term
);
// If RPC request contains term T > currentTerm: set currentTerm = T, convert to follower
if req.term > current_term {
eprintln!("[Node {}] STEPPING DOWN: req.term={} > my term={}",
self.node_id, req.term, current_term);
eprintln!(
"[Node {}] STEPPING DOWN: req.term={} > my term={}",
self.node_id, req.term, current_term
);
persistent.current_term = req.term;
persistent.voted_for = None;
drop(persistent);
@ -780,8 +841,10 @@ impl RaftCore {
*self.role.write().await = RaftRole::Follower;
*self.candidate_state.write().await = None;
*self.leader_state.write().await = None;
eprintln!("[Node {}] Stepped down to Follower (now term={})",
self.node_id, req.term);
eprintln!(
"[Node {}] Stepped down to Follower (now term={})",
self.node_id, req.term
);
} else {
drop(persistent);
}
@ -810,7 +873,8 @@ impl RaftCore {
// Reply false if log doesn't contain an entry at prevLogIndex whose term matches prevLogTerm
if req.prev_log_index > 0 {
// Try to get the entry at prevLogIndex (stored as Vec<u8>)
let prev_entries: Vec<LogEntry<Vec<u8>>> = self.storage
let prev_entries: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries(req.prev_log_index..=req.prev_log_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
@ -835,9 +899,10 @@ impl RaftCore {
// Search backwards to find first entry of this term
let mut conflict_index = req.prev_log_index;
for idx in (1..req.prev_log_index).rev() {
let entries: Vec<LogEntry<Vec<u8>>> = self.storage
.get_log_entries(idx..=idx)
.map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
let entries: Vec<LogEntry<Vec<u8>>> =
self.storage.get_log_entries(idx..=idx).map_err(|e| {
RaftError::StorageError(format!("Failed to read log: {}", e))
})?;
if !entries.is_empty() && entries[0].log_id.term != conflict_term {
conflict_index = idx + 1;
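Reviewer note: on a prev-log mismatch the follower walks backwards to the first index that still carries the conflicting term, letting the leader rewind next_index past the whole term in one step. A self-contained sketch over a plain slice of terms; first_index_of_conflict_term is a hypothetical helper, it assumes a contiguous log starting at index 1, and the in-tree loop (which reads entries from storage) may handle edge cases differently.

```rust
// Sketch of the conflict backtracking above: given the term found at prev_log_index,
// report the first index that carries that term. Slot i-1 holds log index i.
fn first_index_of_conflict_term(terms: &[u64], prev_log_index: u64) -> u64 {
    let conflict_term = terms[(prev_log_index - 1) as usize];
    let mut conflict_index = prev_log_index;
    for idx in (1..prev_log_index).rev() {
        if terms[(idx - 1) as usize] != conflict_term {
            // The previous index was the first entry of the conflicting term.
            conflict_index = idx + 1;
            break;
        }
        conflict_index = idx;
    }
    conflict_index
}

fn main() {
    // Log terms at indexes 1..=6: [1, 1, 2, 2, 2, 3]
    let terms = [1u64, 1, 2, 2, 2, 3];
    // A mismatch at index 5 (term 2) rewinds to index 3, the first entry of term 2.
    assert_eq!(first_index_of_conflict_term(&terms, 5), 3);
    // A mismatch at index 2 (term 1) rewinds to index 1.
    assert_eq!(first_index_of_conflict_term(&terms, 2), 1);
    println!("conflict backtracking ok");
}
```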
@ -861,33 +926,39 @@ impl RaftCore {
let first_new_index = req.entries[0].log_id.index;
// Check if there's a conflict (stored as Vec<u8>)
let existing: Vec<LogEntry<Vec<u8>>> = self.storage
let existing: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries(first_new_index..=first_new_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read log: {}", e)))?;
if !existing.is_empty() && existing[0].log_id.term != req.entries[0].log_id.term {
// Conflict detected - truncate from this index
self.storage
.truncate(first_new_index)
.map_err(|e| RaftError::StorageError(format!("Failed to truncate log: {}", e)))?;
self.storage.truncate(first_new_index).map_err(|e| {
RaftError::StorageError(format!("Failed to truncate log: {}", e))
})?;
}
// Convert RaftCommand entries to Vec<u8> before storing
let entries_to_store: Vec<LogEntry<Vec<u8>>> = req.entries.iter().map(|entry| {
let payload = match &entry.payload {
EntryPayload::Normal(cmd) => {
let bytes = bincode::serialize(cmd)
.map_err(|e| RaftError::StorageError(format!("Serialize failed: {}", e)))?;
EntryPayload::Normal(bytes)
}
EntryPayload::Blank => EntryPayload::Blank,
EntryPayload::Membership(nodes) => EntryPayload::Membership(nodes.clone()),
};
Ok(LogEntry {
log_id: entry.log_id,
payload,
let entries_to_store: Vec<LogEntry<Vec<u8>>> = req
.entries
.iter()
.map(|entry| {
let payload = match &entry.payload {
EntryPayload::Normal(cmd) => {
let bytes = bincode::serialize(cmd).map_err(|e| {
RaftError::StorageError(format!("Serialize failed: {}", e))
})?;
EntryPayload::Normal(bytes)
}
EntryPayload::Blank => EntryPayload::Blank,
EntryPayload::Membership(nodes) => EntryPayload::Membership(nodes.clone()),
};
Ok(LogEntry {
log_id: entry.log_id,
payload,
})
})
}).collect::<Result<Vec<_>, RaftError>>()?;
.collect::<Result<Vec<_>, RaftError>>()?;
// Append converted entries
self.storage
@ -895,14 +966,22 @@ impl RaftCore {
.map_err(|e| RaftError::StorageError(format!("Failed to append entries: {}", e)))?;
let (last_log_index, _) = self.get_last_log_info().await?;
eprintln!("[Node {}] FOLLOWER appended {} entries, last_index_now={}",
self.node_id, req.entries.len(), last_log_index);
eprintln!(
"[Node {}] FOLLOWER appended {} entries, last_index_now={}",
self.node_id,
req.entries.len(),
last_log_index
);
}
// P2: Update commit index
// If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)
eprintln!("[Node {}] FOLLOWER commit check: req.leader_commit={} my_commit={}",
self.node_id, req.leader_commit, self.volatile.read().await.commit_index);
eprintln!(
"[Node {}] FOLLOWER commit check: req.leader_commit={} my_commit={}",
self.node_id,
req.leader_commit,
self.volatile.read().await.commit_index
);
if req.leader_commit > 0 {
let mut volatile = self.volatile.write().await;
if req.leader_commit > volatile.commit_index {
@ -913,8 +992,10 @@ impl RaftCore {
};
let new_commit = std::cmp::min(req.leader_commit, last_new_index);
eprintln!("[Node {}] FOLLOWER updating commit: {} -> {}",
self.node_id, volatile.commit_index, new_commit);
eprintln!(
"[Node {}] FOLLOWER updating commit: {} -> {}",
self.node_id, volatile.commit_index, new_commit
);
volatile.commit_index = new_commit;
debug!(
@ -939,7 +1020,11 @@ impl RaftCore {
})
}
async fn handle_append_entries_response(&self, from: NodeId, resp: AppendEntriesResponse) -> Result<(), RaftError> {
async fn handle_append_entries_response(
&self,
from: NodeId,
resp: AppendEntriesResponse,
) -> Result<(), RaftError> {
// Only leaders process AppendEntries responses
let role = *self.role.read().await;
if role != RaftRole::Leader {
@ -984,8 +1069,14 @@ impl RaftCore {
leader_state.match_index.insert(from, new_match_index);
leader_state.next_index.insert(from, new_match_index + 1);
eprintln!("[Node {}] RESP from {}: success={} match_index={} next_index={}",
self.node_id, from, resp.success, new_match_index, new_match_index + 1);
eprintln!(
"[Node {}] RESP from {}: success={} match_index={} next_index={}",
self.node_id,
from,
resp.success,
new_match_index,
new_match_index + 1
);
trace!(
peer = from,
@ -1040,11 +1131,7 @@ impl RaftCore {
// Collect all match_index values plus leader's own log
let (last_log_index, _) = self.get_last_log_info().await?;
let mut match_indices: Vec<LogIndex> = leader_state
.match_index
.values()
.copied()
.collect();
let mut match_indices: Vec<LogIndex> = leader_state.match_index.values().copied().collect();
// Add leader's own index
match_indices.push(last_log_index);
@ -1056,8 +1143,10 @@ impl RaftCore {
let majority_index = match_indices.len() / 2;
let new_commit_index = match_indices[majority_index];
eprintln!("[Node {}] COMMIT CHECK: match_indices={:?} majority_idx={} new_commit={}",
self.node_id, match_indices, majority_index, new_commit_index);
eprintln!(
"[Node {}] COMMIT CHECK: match_indices={:?} majority_idx={} new_commit={}",
self.node_id, match_indices, majority_index, new_commit_index
);
let current_term = self.persistent.read().await.current_term;
let old_commit_index = self.volatile.read().await.commit_index;
@ -1067,9 +1156,12 @@ impl RaftCore {
// 2. The entry at new_commit_index is from current term (Raft safety)
if new_commit_index > old_commit_index {
// Check term of entry at new_commit_index (stored as Vec<u8>)
let entries: Vec<LogEntry<Vec<u8>>> = self.storage
let entries: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries(new_commit_index..=new_commit_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read log for commit: {}", e)))?;
.map_err(|e| {
RaftError::StorageError(format!("Failed to read log for commit: {}", e))
})?;
if !entries.is_empty() && entries[0].log_id.term == current_term {
// Safe to commit
@ -1100,20 +1192,24 @@ impl RaftCore {
}
// Get entries to apply (stored as Vec<u8>)
let stored_entries: Vec<LogEntry<Vec<u8>>> = self.storage
let stored_entries: Vec<LogEntry<Vec<u8>>> = self
.storage
.get_log_entries((last_applied + 1)..=commit_index)
.map_err(|e| RaftError::StorageError(format!("Failed to read entries for apply: {}", e)))?;
.map_err(|e| {
RaftError::StorageError(format!("Failed to read entries for apply: {}", e))
})?;
// Apply each entry to state machine
for entry in &stored_entries {
if let EntryPayload::Normal(data) = &entry.payload {
// Deserialize the command
let command: RaftCommand = bincode::deserialize(data)
.map_err(|e| RaftError::StorageError(format!("Failed to deserialize for apply: {}", e)))?;
let command: RaftCommand = bincode::deserialize(data).map_err(|e| {
RaftError::StorageError(format!("Failed to deserialize for apply: {}", e))
})?;
self.state_machine
.apply(command)
.map_err(|e| RaftError::StorageError(format!("Failed to apply to state machine: {}", e)))?;
self.state_machine.apply(command).map_err(|e| {
RaftError::StorageError(format!("Failed to apply to state machine: {}", e))
})?;
debug!(
index = entry.log_id.index,
@ -1148,14 +1244,23 @@ impl RaftCore {
// Get current term and last log index
let term = self.persistent.read().await.current_term;
eprintln!("[Node {}] handle_client_write: getting last_log_info...", self.node_id);
eprintln!(
"[Node {}] handle_client_write: getting last_log_info...",
self.node_id
);
let (last_log_index, _) = match self.get_last_log_info().await {
Ok(info) => {
eprintln!("[Node {}] handle_client_write: last_log_index={}", self.node_id, info.0);
eprintln!(
"[Node {}] handle_client_write: last_log_index={}",
self.node_id, info.0
);
info
}
Err(e) => {
eprintln!("[Node {}] handle_client_write: ERROR getting last_log_info: {:?}", self.node_id, e);
eprintln!(
"[Node {}] handle_client_write: ERROR getting last_log_info: {:?}",
self.node_id, e
);
return Err(e);
}
};
@ -1177,14 +1282,26 @@ impl RaftCore {
};
// Append to leader's log
eprintln!("[Node {}] handle_client_write: appending entry index={} term={}...", self.node_id, new_index, term);
eprintln!(
"[Node {}] handle_client_write: appending entry index={} term={}...",
self.node_id, new_index, term
);
match self.storage.append(&[entry.clone()]) {
Ok(()) => {
eprintln!("[Node {}] handle_client_write: append SUCCESS index={}", self.node_id, new_index);
eprintln!(
"[Node {}] handle_client_write: append SUCCESS index={}",
self.node_id, new_index
);
}
Err(e) => {
eprintln!("[Node {}] handle_client_write: append FAILED: {:?}", self.node_id, e);
return Err(RaftError::StorageError(format!("Failed to append entry: {}", e)));
eprintln!(
"[Node {}] handle_client_write: append FAILED: {:?}",
self.node_id, e
);
return Err(RaftError::StorageError(format!(
"Failed to append entry: {}",
e
)));
}
}
@ -1198,7 +1315,9 @@ impl RaftCore {
// Send AppendEntries with the new entry to all peers
self.event_tx
.send(RaftEvent::HeartbeatTimeout)
.map_err(|e| RaftError::NetworkError(format!("Failed to trigger replication: {}", e)))?;
.map_err(|e| {
RaftError::NetworkError(format!("Failed to trigger replication: {}", e))
})?;
// Single-node cluster: immediately commit since we're the only voter
if self.peers.is_empty() {
@ -1218,7 +1337,8 @@ impl RaftCore {
/// Get last log index and term
async fn get_last_log_info(&self) -> Result<(LogIndex, Term), RaftError> {
let log_state = self.storage
let log_state = self
.storage
.get_log_state()
.map_err(|e| RaftError::StorageError(format!("Failed to get log state: {}", e)))?;
@ -1238,9 +1358,9 @@ impl RaftCore {
tokio::spawn(async move {
eprintln!("[ELECTION TIMER] Spawned");
loop {
let timeout = rand::random::<u64>() %
(config.election_timeout_max - config.election_timeout_min) +
config.election_timeout_min;
let timeout = rand::random::<u64>()
% (config.election_timeout_max - config.election_timeout_min)
+ config.election_timeout_min;
eprintln!("[ELECTION TIMER] Waiting {}ms", timeout);
tokio::select! {
@ -1275,7 +1395,8 @@ impl RaftCore {
let config = self.config.clone();
tokio::spawn(async move {
let mut interval = tokio::time::interval(Duration::from_millis(config.heartbeat_interval));
let mut interval =
tokio::time::interval(Duration::from_millis(config.heartbeat_interval));
// Skip the first tick (fires immediately)
interval.tick().await;
@ -1313,12 +1434,11 @@ impl RaftCore {
}
/// Inject RequestVote RPC (for testing)
pub async fn request_vote_rpc(
&self,
req: VoteRequest,
resp_tx: oneshot::Sender<VoteResponse>,
) {
let _ = self.event_tx.send(RaftEvent::VoteRequest { req, response_tx: resp_tx });
pub async fn request_vote_rpc(&self, req: VoteRequest, resp_tx: oneshot::Sender<VoteResponse>) {
let _ = self.event_tx.send(RaftEvent::VoteRequest {
req,
response_tx: resp_tx,
});
}
/// Inject AppendEntries RPC (for testing)
@ -1327,12 +1447,19 @@ impl RaftCore {
req: AppendEntriesRequest,
resp_tx: oneshot::Sender<AppendEntriesResponse>,
) {
eprintln!("[Node {}] append_entries_rpc: from {} term={}",
self.node_id, req.leader_id, req.term);
let result = self.event_tx.send(RaftEvent::AppendEntries { req, response_tx: resp_tx });
eprintln!(
"[Node {}] append_entries_rpc: from {} term={}",
self.node_id, req.leader_id, req.term
);
let result = self.event_tx.send(RaftEvent::AppendEntries {
req,
response_tx: resp_tx,
});
if let Err(e) = result {
eprintln!("[Node {}] ERROR: Failed to send AppendEntries event: channel closed",
self.node_id);
eprintln!(
"[Node {}] ERROR: Failed to send AppendEntries event: channel closed",
self.node_id
);
}
}
@ -1357,7 +1484,10 @@ impl RaftCore {
/// Submit a client write and wait for commit (blocking version)
/// Returns RaftResponse after the command is committed and applied
pub async fn write(&self, command: RaftCommand) -> Result<chainfire_types::command::RaftResponse, RaftError> {
pub async fn write(
&self,
command: RaftCommand,
) -> Result<chainfire_types::command::RaftResponse, RaftError> {
use chainfire_types::command::RaftResponse;
// Get current commit index before write

View file

@ -10,5 +10,8 @@ pub mod core;
// Common modules
pub mod network;
pub use core::{RaftCore, RaftConfig, RaftRole, VoteRequest, VoteResponse, AppendEntriesRequest, AppendEntriesResponse};
pub use core::{
AppendEntriesRequest, AppendEntriesResponse, RaftConfig, RaftCore, RaftRole, VoteRequest,
VoteResponse,
};
pub use network::RaftNetworkError;

View file

@ -2,8 +2,8 @@
//!
//! This module provides network adapters for Raft to communicate between nodes.
use crate::core::{AppendEntriesRequest, AppendEntriesResponse, VoteRequest, VoteResponse};
use chainfire_types::NodeId;
use crate::core::{VoteRequest, VoteResponse, AppendEntriesRequest, AppendEntriesResponse};
use std::sync::Arc;
use thiserror::Error;
@ -54,10 +54,7 @@ pub mod test_client {
}
pub enum RpcMessage {
Vote(
VoteRequest,
tokio::sync::oneshot::Sender<VoteResponse>,
),
Vote(VoteRequest, tokio::sync::oneshot::Sender<VoteResponse>),
AppendEntries(
AppendEntriesRequest,
tokio::sync::oneshot::Sender<AppendEntriesResponse>,
@ -109,13 +106,14 @@ pub mod test_client {
req: AppendEntriesRequest,
) -> Result<AppendEntriesResponse, RaftNetworkError> {
let channels = self.channels.read().await;
let tx = channels
.get(&target)
.ok_or_else(|| {
eprintln!("[RPC] NodeNotFound: target={}, registered={:?}",
target, channels.keys().collect::<Vec<_>>());
RaftNetworkError::NodeNotFound(target)
})?;
let tx = channels.get(&target).ok_or_else(|| {
eprintln!(
"[RPC] NodeNotFound: target={}, registered={:?}",
target,
channels.keys().collect::<Vec<_>>()
);
RaftNetworkError::NodeNotFound(target)
})?;
let (resp_tx, resp_rx) = tokio::sync::oneshot::channel();
let send_result = tx.send(RpcMessage::AppendEntries(req.clone(), resp_tx));

View file

@ -1,5 +1,7 @@
use chainfire_client::ChainFireClient;
use chainfire_server::config::{ClusterConfig, NetworkConfig, NodeConfig, RaftConfig, ServerConfig, StorageConfig};
use chainfire_server::config::{
ClusterConfig, NetworkConfig, NodeConfig, RaftConfig, ServerConfig, StorageConfig,
};
use chainfire_server::node::Node;
use chainfire_types::RaftRole;
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
@ -84,7 +86,10 @@ fn bench_put_throughput(c: &mut Criterion) {
rt.block_on(async {
for i in 0..NUM_KEYS_THROUGHPUT {
let key = format!("bench_key_{}", i);
client.put(black_box(&key), black_box(&value)).await.unwrap();
client
.put(black_box(&key), black_box(&value))
.await
.unwrap();
}
})
});
@ -180,7 +185,10 @@ fn bench_put_latency(c: &mut Criterion) {
let key = format!("latency_key_{}", key_counter);
key_counter += 1;
rt.block_on(async {
client.put(black_box(&key), black_box(&value)).await.unwrap();
client
.put(black_box(&key), black_box(&value))
.await
.unwrap();
})
});
});
@ -192,5 +200,10 @@ fn bench_put_latency(c: &mut Criterion) {
drop(rt);
}
criterion_group!(benches, bench_put_throughput, bench_get_throughput, bench_put_latency);
criterion_group!(
benches,
bench_put_throughput,
bench_get_throughput,
bench_put_latency
);
criterion_main!(benches);

View file

@ -85,10 +85,7 @@ async fn main() -> Result<()> {
"chainfire_kv_requests_total",
"Total number of KV requests by operation type"
);
metrics::describe_counter!(
"chainfire_kv_bytes_read",
"Total bytes read from KV store"
);
metrics::describe_counter!("chainfire_kv_bytes_read", "Total bytes read from KV store");
metrics::describe_counter!(
"chainfire_kv_bytes_written",
"Total bytes written to KV store"
@ -97,10 +94,7 @@ async fn main() -> Result<()> {
"chainfire_kv_request_duration_seconds",
"KV request duration in seconds"
);
metrics::describe_gauge!(
"chainfire_raft_term",
"Current Raft term"
);
metrics::describe_gauge!("chainfire_raft_term", "Current Raft term");
metrics::describe_gauge!(
"chainfire_raft_is_leader",
"Whether this node is the Raft leader (1=yes, 0=no)"
@ -110,10 +104,10 @@ async fn main() -> Result<()> {
"Total number of watch events emitted"
);
use config::{Config as Cfg, Environment, File, FileFormat};
use toml; // Import toml for serializing defaults
use config::{Config as Cfg, Environment, File, FileFormat};
use toml; // Import toml for serializing defaults
// ... (rest of existing imports)
// ... (rest of existing imports)
// Load configuration using config-rs
let mut settings = Cfg::builder()
@ -124,8 +118,7 @@ use toml; // Import toml for serializing defaults
))
// Layer 2: Environment variables (e.g., CHAINFIRE_NODE__ID, CHAINFIRE_NETWORK__API_ADDR)
.add_source(
Environment::with_prefix("CHAINFIRE")
.separator("__") // Use double underscore for nested fields
Environment::with_prefix("CHAINFIRE").separator("__"), // Use double underscore for nested fields
);
// Layer 3: Configuration file (if specified)
@ -136,9 +129,7 @@ use toml; // Import toml for serializing defaults
info!("Config file not found, using defaults and environment variables.");
}
let mut config: ServerConfig = settings
.build()?
.try_deserialize()?;
let mut config: ServerConfig = settings.build()?.try_deserialize()?;
// Apply command line overrides (Layer 4: highest precedence)
if let Some(node_id) = args.node_id {

View file

@ -6,9 +6,9 @@ use crate::config::ServerConfig;
use anyhow::Result;
use chainfire_api::GrpcRaftClient;
use chainfire_gossip::{GossipAgent, GossipId};
use chainfire_raft::core::{RaftCore, RaftConfig};
use chainfire_raft::core::{RaftConfig, RaftCore};
use chainfire_raft::network::RaftRpcClient;
use chainfire_storage::{RocksStore, LogStorage, StateMachine};
use chainfire_storage::{LogStorage, RocksStore, StateMachine};
use chainfire_types::node::NodeRole;
use chainfire_types::RaftRole;
use chainfire_watch::{stream::WatchEventHandler, WatchRegistry};
@ -58,12 +58,16 @@ impl Node {
// Create gRPC Raft client and register peer addresses
let rpc_client = Arc::new(GrpcRaftClient::new());
for member in &config.cluster.initial_members {
rpc_client.add_node(member.id, member.raft_addr.clone()).await;
rpc_client
.add_node(member.id, member.raft_addr.clone())
.await;
info!(node_id = member.id, addr = %member.raft_addr, "Registered peer");
}
// Extract peer node IDs (excluding self)
let peers: Vec<u64> = config.cluster.initial_members
let peers: Vec<u64> = config
.cluster
.initial_members
.iter()
.map(|m| m.id)
.filter(|&id| id != config.node.id)
@ -115,10 +119,8 @@ impl Node {
let gossip_id = GossipId::new(config.node.id, config.network.gossip_addr, gossip_role);
let gossip = Some(
GossipAgent::new(gossip_id, chainfire_gossip::agent::default_config())
.await?,
);
let gossip =
Some(GossipAgent::new(gossip_id, chainfire_gossip::agent::default_config()).await?);
info!(
addr = %config.network.gossip_addr,
gossip_role = ?gossip_role,

View file

@ -145,7 +145,12 @@ pub struct ReadQuery {
pub fn build_router(state: RestApiState) -> Router {
Router::new()
// Wildcard route handles all keys (with or without slashes)
.route("/api/v1/kv/*key", get(get_kv_wildcard).put(put_kv_wildcard).delete(delete_kv_wildcard))
.route(
"/api/v1/kv/*key",
get(get_kv_wildcard)
.put(put_kv_wildcard)
.delete(delete_kv_wildcard),
)
.route("/api/v1/kv", get(list_kv))
.route("/api/v1/cluster/status", get(cluster_status))
.route("/api/v1/cluster/members", post(add_member))
@ -159,7 +164,9 @@ pub fn build_router(state: RestApiState) -> Router {
async fn health_check() -> (StatusCode, Json<SuccessResponse<serde_json::Value>>) {
(
StatusCode::OK,
Json(SuccessResponse::new(serde_json::json!({ "status": "healthy" }))),
Json(SuccessResponse::new(
serde_json::json!({ "status": "healthy" }),
)),
)
}
@ -187,9 +194,13 @@ async fn get_kv_wildcard(
let sm = state.raft.state_machine();
let key_bytes = full_key.as_bytes().to_vec();
let results = sm.kv()
.get(&key_bytes)
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
let results = sm.kv().get(&key_bytes).map_err(|e| {
error_response(
StatusCode::INTERNAL_SERVER_ERROR,
"INTERNAL_ERROR",
&e.to_string(),
)
})?;
let value = results
.into_iter()
@ -207,7 +218,8 @@ async fn put_kv_wildcard(
State(state): State<RestApiState>,
Path(key): Path<String>,
Json(req): Json<PutRequest>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)>
{
// Use key as-is for simple keys, prepend / for namespaced keys
let full_key = if key.contains('/') {
format!("/{}", key)
@ -225,7 +237,9 @@ async fn put_kv_wildcard(
Ok((
StatusCode::OK,
Json(SuccessResponse::new(serde_json::json!({ "key": full_key, "success": true }))),
Json(SuccessResponse::new(
serde_json::json!({ "key": full_key, "success": true }),
)),
))
}
@ -233,7 +247,8 @@ async fn put_kv_wildcard(
async fn delete_kv_wildcard(
State(state): State<RestApiState>,
Path(key): Path<String>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)>
{
// Use key as-is for simple keys, prepend / for namespaced keys
let full_key = if key.contains('/') {
format!("/{}", key)
@ -249,7 +264,9 @@ async fn delete_kv_wildcard(
Ok((
StatusCode::OK,
Json(SuccessResponse::new(serde_json::json!({ "key": full_key, "success": true }))),
Json(SuccessResponse::new(
serde_json::json!({ "key": full_key, "success": true }),
)),
))
}
@ -271,9 +288,13 @@ async fn list_kv(
let start_key = prefix.as_bytes().to_vec();
let end_key = format!("{}~", prefix).as_bytes().to_vec();
let results = sm.kv()
.range(&start_key, Some(&end_key))
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
let results = sm.kv().range(&start_key, Some(&end_key)).map_err(|e| {
error_response(
StatusCode::INTERNAL_SERVER_ERROR,
"INTERNAL_ERROR",
&e.to_string(),
)
})?;
let items: Vec<KvItem> = results
.into_iter()
@ -325,14 +346,20 @@ fn string_to_node_id(s: &str) -> u64 {
async fn add_member(
State(state): State<RestApiState>,
Json(req): Json<AddMemberRequest>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
let rpc_client = state
.rpc_client
.as_ref()
.ok_or_else(|| error_response(StatusCode::SERVICE_UNAVAILABLE, "SERVICE_UNAVAILABLE", "RPC client not available"))?;
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)>
{
let rpc_client = state.rpc_client.as_ref().ok_or_else(|| {
error_response(
StatusCode::SERVICE_UNAVAILABLE,
"SERVICE_UNAVAILABLE",
"RPC client not available",
)
})?;
// Add node to RPC client's routing table
rpc_client.add_node(req.node_id, req.raft_addr.clone()).await;
rpc_client
.add_node(req.node_id, req.raft_addr.clone())
.await;
// Note: RaftCore doesn't have add_peer() - members are managed via configuration
// For now, we just register the node in the RPC client
@ -353,13 +380,17 @@ async fn add_member(
async fn add_member_legacy(
State(state): State<RestApiState>,
Json(req): Json<AddMemberRequestLegacy>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)>
{
let node_id = string_to_node_id(&req.id);
let rpc_client = state
.rpc_client
.as_ref()
.ok_or_else(|| error_response(StatusCode::SERVICE_UNAVAILABLE, "SERVICE_UNAVAILABLE", "RPC client not available"))?;
let rpc_client = state.rpc_client.as_ref().ok_or_else(|| {
error_response(
StatusCode::SERVICE_UNAVAILABLE,
"SERVICE_UNAVAILABLE",
"RPC client not available",
)
})?;
// Add node to RPC client's routing table
rpc_client.add_node(node_id, req.raft_addr.clone()).await;
@ -459,15 +490,19 @@ async fn proxy_write_to_leader(
if response.status().is_success() {
return Ok(());
}
let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
error: ErrorDetail {
code: "LEADER_PROXY_FAILED".to_string(),
message: format!("leader {leader_id} returned {status}: {err}"),
details: None,
},
meta: ResponseMeta::new(),
});
let status =
StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let payload = response
.json::<ErrorResponse>()
.await
.unwrap_or_else(|err| ErrorResponse {
error: ErrorDetail {
code: "LEADER_PROXY_FAILED".to_string(),
message: format!("leader {leader_id} returned {status}: {err}"),
details: None,
},
meta: ResponseMeta::new(),
});
Err((status, Json(payload)))
}
@ -510,11 +545,7 @@ where
&format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
)
})?;
let url = format!(
"{}{}",
leader_http_addr.trim_end_matches('/'),
path
);
let url = format!("{}{}", leader_http_addr.trim_end_matches('/'), path);
let mut request = state.http_client.get(&url);
if let Some(query) = query {
request = request.query(query);
@ -536,15 +567,19 @@ where
})?;
return Ok(Json(payload));
}
let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
error: ErrorDetail {
code: "LEADER_PROXY_FAILED".to_string(),
message: format!("leader {leader_id} returned {status}: {err}"),
details: None,
},
meta: ResponseMeta::new(),
});
let status =
StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
let payload = response
.json::<ErrorResponse>()
.await
.unwrap_or_else(|err| ErrorResponse {
error: ErrorDetail {
code: "LEADER_PROXY_FAILED".to_string(),
message: format!("leader {leader_id} returned {status}: {err}"),
details: None,
},
meta: ResponseMeta::new(),
});
Err((status, Json(payload)))
}

View file

@ -36,10 +36,7 @@ impl Server {
}
/// Apply TLS configuration to a server builder
async fn apply_tls_config(
&self,
builder: TonicServer,
) -> Result<TonicServer> {
async fn apply_tls_config(&self, builder: TonicServer) -> Result<TonicServer> {
if let Some(tls_config) = &self.config.network.tls {
info!("TLS enabled, loading certificates...");
let cert = tokio::fs::read(&tls_config.cert_file).await?;
@ -48,12 +45,9 @@ impl Server {
let tls = if tls_config.require_client_cert {
info!("mTLS enabled, requiring client certificates");
let ca_cert = tokio::fs::read(
tls_config
.ca_file
.as_ref()
.ok_or_else(|| anyhow::anyhow!("ca_file required when require_client_cert=true"))?,
)
let ca_cert = tokio::fs::read(tls_config.ca_file.as_ref().ok_or_else(|| {
anyhow::anyhow!("ca_file required when require_client_cert=true")
})?)
.await?;
let ca = Certificate::from_pem(ca_cert);
@ -100,15 +94,8 @@ impl Server {
raft.node_id(),
);
let rpc_client = self
.node
.rpc_client()
.expect("rpc_client should exist in full mode")
.clone();
let cluster_service = ClusterServiceImpl::new(
Arc::clone(&raft),
rpc_client,
self.node.cluster_id(),
configured_members(&self.config),
);

View file

@ -23,7 +23,9 @@ fn bench_write_throughput(c: &mut Criterion) {
b.iter(|| {
for i in 0..NUM_KEYS_THROUGHPUT {
let key = format!("bench_key_{:08}", i).into_bytes();
store.put(black_box(key), black_box(value.clone()), None).unwrap();
store
.put(black_box(key), black_box(value.clone()), None)
.unwrap();
}
});
});
@ -77,7 +79,9 @@ fn bench_write_latency(c: &mut Criterion) {
b.iter(|| {
let key = format!("latency_key_{:08}", key_counter).into_bytes();
key_counter += 1;
store.put(black_box(key), black_box(value.clone()), None).unwrap();
store
.put(black_box(key), black_box(value.clone()), None)
.unwrap();
});
});

View file

@ -62,8 +62,8 @@ impl KvStore {
.cf_handle(cf::META)
.ok_or_else(|| StorageError::RocksDb("META cf not found".into()))?;
let bytes =
bincode::serialize(&revision).map_err(|e| StorageError::Serialization(e.to_string()))?;
let bytes = bincode::serialize(&revision)
.map_err(|e| StorageError::Serialization(e.to_string()))?;
self.store
.db()

View file

@ -43,7 +43,10 @@ impl LeaseStore {
} else {
// Check if ID is already in use
if self.leases.contains_key(&id) {
return Err(StorageError::LeaseError(format!("Lease {} already exists", id)));
return Err(StorageError::LeaseError(format!(
"Lease {} already exists",
id
)));
}
// Update next_id if necessary
let _ = self.next_id.fetch_max(id + 1, Ordering::SeqCst);
@ -61,7 +64,11 @@ impl LeaseStore {
pub fn revoke(&self, id: LeaseId) -> Result<Vec<Vec<u8>>, StorageError> {
match self.leases.remove(&id) {
Some((_, lease)) => {
info!(lease_id = id, keys_count = lease.keys.len(), "Lease revoked");
info!(
lease_id = id,
keys_count = lease.keys.len(),
"Lease revoked"
);
Ok(lease.keys)
}
None => Err(StorageError::LeaseError(format!("Lease {} not found", id))),
@ -88,9 +95,9 @@ impl LeaseStore {
/// Get remaining TTL for a lease
pub fn time_to_live(&self, id: LeaseId) -> Option<(i64, i64, Vec<Vec<u8>>)> {
self.leases.get(&id).map(|lease| {
(lease.remaining(), lease.ttl, lease.keys.clone())
})
self.leases
.get(&id)
.map(|lease| (lease.remaining(), lease.ttl, lease.keys.clone()))
}
/// List all lease IDs
@ -105,7 +112,10 @@ impl LeaseStore {
lease.attach_key(key);
Ok(())
}
None => Err(StorageError::LeaseError(format!("Lease {} not found", lease_id))),
None => Err(StorageError::LeaseError(format!(
"Lease {} not found",
lease_id
))),
}
}

View file

@ -17,7 +17,7 @@ pub mod store;
pub use kv_store::KvStore;
pub use lease_store::{LeaseExpirationWorker, LeaseStore};
pub use log_storage::{LogStorage, LogEntry, EntryPayload, LogId, Vote, LogState};
pub use log_storage::{EntryPayload, LogEntry, LogId, LogState, LogStorage, Vote};
pub use snapshot::{Snapshot, SnapshotBuilder, SnapshotMeta};
pub use state_machine::StateMachine;
pub use store::RocksStore;

View file

@ -16,8 +16,9 @@ pub type LogIndex = u64;
pub type Term = u64;
/// Log ID combining term and index
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)]
#[derive(Default)]
#[derive(
Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize, Default,
)]
pub struct LogId {
pub term: Term,
pub index: LogIndex,
@ -29,7 +30,6 @@ impl LogId {
}
}
/// A log entry stored in the Raft log
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LogEntry<D> {
@ -120,10 +120,7 @@ impl LogStorage {
let last_purged_log_id = self.get_last_purged_log_id()?;
// Get last log ID
let mut last_iter = self
.store
.db()
.iterator_cf(&cf, rocksdb::IteratorMode::End);
let mut last_iter = self.store.db().iterator_cf(&cf, rocksdb::IteratorMode::End);
let last_log_id = if let Some(Ok((_, value))) = last_iter.next() {
// Skip empty or corrupt entries - treat as empty log
@ -133,7 +130,10 @@ impl LogStorage {
match bincode::deserialize::<LogEntry<Vec<u8>>>(&value) {
Ok(entry) => Some(entry.log_id),
Err(e) => {
eprintln!("Warning: Failed to deserialize log entry: {}, treating as empty log", e);
eprintln!(
"Warning: Failed to deserialize log entry: {}, treating as empty log",
e
);
last_purged_log_id
}
}
@ -369,7 +369,10 @@ impl LogStorage {
match bincode::deserialize::<LogId>(&bytes) {
Ok(log_id) => Ok(Some(log_id)),
Err(e) => {
eprintln!("Warning: Failed to deserialize last_purged: {}, treating as None", e);
eprintln!(
"Warning: Failed to deserialize last_purged: {}, treating as None",
e
);
Ok(None)
}
}

View file

@ -38,8 +38,8 @@ impl Snapshot {
/// Serialize snapshot to bytes
pub fn to_bytes(&self) -> Result<Vec<u8>, StorageError> {
// Format: [meta_len: u32][meta][data]
let meta_bytes =
bincode::serialize(&self.meta).map_err(|e| StorageError::Serialization(e.to_string()))?;
let meta_bytes = bincode::serialize(&self.meta)
.map_err(|e| StorageError::Serialization(e.to_string()))?;
let mut result = Vec::with_capacity(4 + meta_bytes.len() + self.data.len());
result.extend_from_slice(&(meta_bytes.len() as u32).to_le_bytes());
@ -108,8 +108,8 @@ impl SnapshotBuilder {
}
// Serialize entries
let data = bincode::serialize(&entries)
.map_err(|e| StorageError::Serialization(e.to_string()))?;
let data =
bincode::serialize(&entries).map_err(|e| StorageError::Serialization(e.to_string()))?;
let meta = SnapshotMeta {
last_log_index,
@ -277,10 +277,8 @@ mod tests {
// Add data to store1
let kv1 = KvStore::new(store1.clone()).unwrap();
kv1.put(b"key1".to_vec(), b"value1".to_vec(), None)
.unwrap();
kv1.put(b"key2".to_vec(), b"value2".to_vec(), None)
.unwrap();
kv1.put(b"key1".to_vec(), b"value1".to_vec(), None).unwrap();
kv1.put(b"key2".to_vec(), b"value2".to_vec(), None).unwrap();
// Build snapshot from store1
let builder1 = SnapshotBuilder::new(store1.clone());

View file

@ -277,7 +277,7 @@ impl StateMachine {
txn_responses.push(TxnOpResponse::Range {
kvs,
count,
more: false, // TODO: handle pagination
more: false,
});
}
}
@ -341,7 +341,11 @@ impl StateMachine {
/// Apply a lease grant command
fn apply_lease_grant(&self, id: i64, ttl: i64) -> Result<RaftResponse, StorageError> {
let lease = self.leases.grant(id, ttl)?;
Ok(RaftResponse::lease(self.current_revision(), lease.id, lease.ttl))
Ok(RaftResponse::lease(
self.current_revision(),
lease.id,
lease.ttl,
))
}
/// Apply a lease revoke command

View file

@ -115,10 +115,7 @@ mod tests {
{
let store = RocksStore::new(dir.path()).unwrap();
let cf = store.cf_handle(cf::META).unwrap();
store
.db()
.put_cf(&cf, b"test_key", b"test_value")
.unwrap();
store.db().put_cf(&cf, b"test_key", b"test_value").unwrap();
}
// Reopen and verify data persisted

View file

@ -7,8 +7,7 @@ use crate::Revision;
use serde::{Deserialize, Serialize};
/// Commands submitted to Raft consensus
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Default)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum RaftCommand {
/// Put a key-value pair
Put {
@ -69,7 +68,6 @@ pub enum RaftCommand {
Noop,
}
/// Comparison for transaction conditions
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Compare {
@ -129,9 +127,7 @@ pub enum TxnOp {
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum TxnOpResponse {
/// Response from a Put operation
Put {
prev_kv: Option<KvEntry>,
},
Put { prev_kv: Option<KvEntry> },
/// Response from a Delete/DeleteRange operation
Delete {
deleted: u64,

View file

@ -7,8 +7,7 @@ use serde::{Deserialize, Serialize};
pub type Revision = u64;
/// A key-value entry with metadata
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Default)]
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct KvEntry {
/// The key
pub key: Vec<u8>,
@ -77,7 +76,6 @@ impl KvEntry {
}
}
/// Range of keys for scan operations
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct KeyRange {

View file

@ -7,8 +7,7 @@ use std::net::SocketAddr;
pub type NodeId = u64;
/// Role of a node in the cluster
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[derive(Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub enum NodeRole {
/// Control Plane node - participates in Raft consensus
ControlPlane,
@ -17,7 +16,6 @@ pub enum NodeRole {
Worker,
}
/// Raft participation role for a node.
///
/// This determines whether and how a node participates in the Raft consensus protocol.

View file

@ -51,11 +51,7 @@ impl WatchRegistry {
}
/// Create a new watch subscription
pub fn create_watch(
&self,
req: WatchRequest,
sender: mpsc::Sender<WatchResponse>,
) -> i64 {
pub fn create_watch(&self, req: WatchRequest, sender: mpsc::Sender<WatchResponse>) -> i64 {
let watch_id = if req.watch_id != 0 {
req.watch_id
} else {
@ -72,7 +68,9 @@ impl WatchRegistry {
watch_id,
matcher,
prev_kv: req.prev_kv,
created_revision: req.start_revision.unwrap_or_else(|| self.current_revision()),
created_revision: req
.start_revision
.unwrap_or_else(|| self.current_revision()),
sender,
};
@ -82,10 +80,7 @@ impl WatchRegistry {
// Add to prefix index
{
let mut index = self.prefix_index.write();
index
.entry(req.key.clone())
.or_default()
.insert(watch_id);
index.entry(req.key.clone()).or_default().insert(watch_id);
}
debug!(watch_id, key = ?String::from_utf8_lossy(&req.key), "Created watch");

View file

@ -23,26 +23,14 @@ service Watch {
rpc Watch(stream WatchRequest) returns (stream WatchResponse);
}
// Cluster management service
// Cluster management service for fixed-membership clusters.
service Cluster {
// MemberAdd adds a member into the cluster
rpc MemberAdd(MemberAddRequest) returns (MemberAddResponse);
// MemberRemove removes an existing member from the cluster
rpc MemberRemove(MemberRemoveRequest) returns (MemberRemoveResponse);
// MemberList lists all the members in the cluster
// MemberList lists the members configured at cluster bootstrap time
rpc MemberList(MemberListRequest) returns (MemberListResponse);
// Status gets the status of the cluster
rpc Status(StatusRequest) returns (StatusResponse);
// TransferSnapshot transfers a snapshot to a target node for pre-seeding
// This is used as a workaround for OpenRaft 0.9.x learner replication bug
rpc TransferSnapshot(TransferSnapshotRequest) returns (TransferSnapshotResponse);
// GetSnapshot returns the current snapshot from this node
rpc GetSnapshot(GetSnapshotRequest) returns (stream GetSnapshotResponse);
}
// Lease service for TTL-based key expiration
@ -295,34 +283,6 @@ message Member {
bool is_learner = 5;
}
message MemberAddRequest {
// node_id is the joining node's actual ID
uint64 node_id = 1;
// peer_urls are the URLs to reach the new member
repeated string peer_urls = 2;
// is_learner indicates if the member is a learner
bool is_learner = 3;
}
message MemberAddResponse {
ResponseHeader header = 1;
// member is the member information for the added member
Member member = 2;
// members is the list of all members after adding
repeated Member members = 3;
}
message MemberRemoveRequest {
// ID is the member ID to remove
uint64 id = 1;
}
message MemberRemoveResponse {
ResponseHeader header = 1;
// members is the list of all members after removing
repeated Member members = 2;
}
message MemberListRequest {}
message MemberListResponse {
@ -421,49 +381,3 @@ message LeaseStatus {
// ID is the lease ID
int64 id = 1;
}
// ========== Snapshot Transfer (T041 Option C workaround) ==========
// Snapshot metadata
message SnapshotMeta {
// last_log_index is the last log index included in the snapshot
uint64 last_log_index = 1;
// last_log_term is the term of the last log entry included
uint64 last_log_term = 2;
// membership is the cluster membership at snapshot time
repeated uint64 membership = 3;
// size is the size of snapshot data in bytes
uint64 size = 4;
}
// Request to transfer snapshot to a target node
message TransferSnapshotRequest {
// target_node_id is the ID of the node to receive the snapshot
uint64 target_node_id = 1;
// target_addr is the gRPC address of the target node
string target_addr = 2;
}
// Response from snapshot transfer
message TransferSnapshotResponse {
ResponseHeader header = 1;
// success indicates if the transfer completed successfully
bool success = 2;
// error is the error message if transfer failed
string error = 3;
// meta is the metadata of the transferred snapshot
SnapshotMeta meta = 4;
}
// Request to get snapshot from this node
message GetSnapshotRequest {}
// Streaming response containing snapshot chunks
message GetSnapshotResponse {
// meta is the snapshot metadata (only in first chunk)
SnapshotMeta meta = 1;
// chunk is the snapshot data chunk
bytes chunk = 2;
// done indicates if this is the last chunk
bool done = 3;
}

View file

@ -9,9 +9,6 @@ service RaftService {
// AppendEntries sends log entries to followers
rpc AppendEntries(AppendEntriesRequest) returns (AppendEntriesResponse);
// InstallSnapshot sends a snapshot to a follower
rpc InstallSnapshot(stream InstallSnapshotRequest) returns (InstallSnapshotResponse);
}
message VoteRequest {
@ -69,25 +66,3 @@ message AppendEntriesResponse {
// conflict_term is the term of the conflicting entry
uint64 conflict_term = 4;
}
message InstallSnapshotRequest {
// term is the leader's term
uint64 term = 1;
// leader_id is the leader's ID
uint64 leader_id = 2;
// last_included_index is the snapshot replaces all entries up through and including this index
uint64 last_included_index = 3;
// last_included_term is term of last_included_index
uint64 last_included_term = 4;
// offset is byte offset where chunk is positioned in the snapshot file
uint64 offset = 5;
// data is raw bytes of the snapshot chunk
bytes data = 6;
// done is true if this is the last chunk
bool done = 7;
}
message InstallSnapshotResponse {
// term is the current term
uint64 term = 1;
}

View file

@ -1,22 +1,25 @@
# CreditService
`creditservice` is a minimal reference service that proves UltraCloud can integrate vendor-specific quota and credit control with platform auth and gateway admission.
`creditservice` is UltraCloud's supported quota, reservation, wallet, and admission-control service. It integrates with platform auth, persists state in the platform data plane, and sits behind the same gateway and VM-cluster validation used for the rest of the published surface.
It is intentionally not a full billing product.
## What this proves
- a vendor-specific credit or quota service can be built in-tree
- the service can authenticate against Photon IAM
- the service can participate in gateway and control-plane admission flows
- the service can persist state in Photon-supported backends
It is intentionally scoped to real-time control and admission, not finance-system ownership.
## Supported scope
- quota checks
- credit reservations, commits, and releases
- tenant-aware auth integration
- quota creation, lookup, and enforcement
- credit reservations, commits, releases, and wallet mutations
- tenant-aware auth integration through IAM
- gateway-facing admission control hooks
- persistent state in FlareDB, PostgreSQL, or SQLite depending on deployment mode (see the startup sketch after this list)
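A minimal startup sketch for selecting a backend is shown below. The `creditservice-server` binary name, the `creditservice.toml` default path, and the `database_url` and `chainfire_endpoint` settings appear in this commit; the `--config` flag spelling and the `state_backend` field name are assumptions made for illustration only.
```bash
# Illustrative sketch: state_backend naming and the --config flag spelling are assumptions.
cat > creditservice.toml <<'EOF'
state_backend      = "postgres"        # assumed field; FlareDB and SQLite are the other modes
database_url       = "postgres://credit@db.internal:5432/creditservice"
chainfire_endpoint = "http://127.0.0.1:2379"
EOF

# The config path defaults to creditservice.toml; the flag is shown here only for clarity.
creditservice-server --config ./creditservice.toml
```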
## Export and migration
CreditService export and backend migration are supported as offline export/import or backend-native snapshot workflows, not live mixed-writer migration.
- Use backend-native snapshots or logical API replay as the export baseline.
- Drain or quiesce writes before moving between FlareDB, PostgreSQL, and SQLite backends.
- Rehydrate the target backend, then cut APIGateway or direct callers over to the new endpoint, as sketched below.
- Treat finance-grade reporting, settlement, or ledger history export as out of scope for the product boundary.
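A minimal cutover sketch for one such move, assuming a PostgreSQL-backed deployment relocating to a new PostgreSQL endpoint, a systemd-managed `creditservice-server` unit, and the `database_url` key in `creditservice.toml`; the unit name, file paths, and DSNs are illustrative assumptions, not values defined by this repository.
```bash
# Illustrative only: unit name, paths, and DSNs are assumptions for this sketch.
SOURCE_DSN="postgres://credit@old-db.internal:5432/creditservice"
TARGET_DSN="postgres://credit@new-db.internal:5432/creditservice"

# 1. Quiesce writers so the export is a consistent baseline.
systemctl stop creditservice-server

# 2. Take a backend-native snapshot of the source.
pg_dump --format=custom --file=creditservice.dump "$SOURCE_DSN"

# 3. Rehydrate the target backend from that snapshot.
pg_restore --no-owner --dbname="$TARGET_DSN" creditservice.dump

# 4. Repoint the service and cut callers (APIGateway or direct) over to it.
sed -i "s|^database_url.*|database_url = \"$TARGET_DSN\"|" creditservice.toml
systemctl start creditservice-server
```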
## Explicit non-goals
@ -26,14 +29,12 @@ It is intentionally not a full billing product.
- finance-grade ledger completeness
- full metering platform ownership
## Test expectation
## Release proof
The main proof should come from cluster-level VM validation in `nix/test-cluster`, not from expanding `creditservice` into a larger product surface.
Concrete proof path:
The release-facing proof comes from the publishable VM-cluster harness:
```bash
nix run ./nix/test-cluster#cluster -- fresh-smoke
nix run ./nix/test-cluster#cluster -- fresh-matrix
```
That flow boots node06 with `apigateway`, `nightlight`, and `creditservice`, and validates that `creditservice` starts in the IAM-integrated cluster path.
That flow boots node06 with `apigateway`, `nightlight`, and `creditservice`, then validates REST and gRPC quota flows, wallet and reservation mutations, IAM integration, and API-gateway-mediated admission traffic.

View file

@ -53,16 +53,16 @@ impl Default for PricingRules {
fn default() -> Self {
let mut prices = HashMap::new();
// Default pricing (credits per hour/GB)
prices.insert(ResourceType::VmInstance, 100); // 100 credits/hour
prices.insert(ResourceType::VmCpu, 10); // 10 credits/CPU-hour
prices.insert(ResourceType::VmMemoryGb, 5); // 5 credits/GB-hour
prices.insert(ResourceType::StorageGb, 1); // 1 credit/GB-hour
prices.insert(ResourceType::NetworkPort, 2); // 2 credits/port-hour
prices.insert(ResourceType::LoadBalancer, 50); // 50 credits/hour
prices.insert(ResourceType::DnsZone, 10); // 10 credits/zone-hour
prices.insert(ResourceType::DnsRecord, 1); // 1 credit/record-hour
prices.insert(ResourceType::K8sCluster, 200); // 200 credits/hour
prices.insert(ResourceType::K8sNode, 100); // 100 credits/node-hour
prices.insert(ResourceType::VmInstance, 100); // 100 credits/hour
prices.insert(ResourceType::VmCpu, 10); // 10 credits/CPU-hour
prices.insert(ResourceType::VmMemoryGb, 5); // 5 credits/GB-hour
prices.insert(ResourceType::StorageGb, 1); // 1 credit/GB-hour
prices.insert(ResourceType::NetworkPort, 2); // 2 credits/port-hour
prices.insert(ResourceType::LoadBalancer, 50); // 50 credits/hour
prices.insert(ResourceType::DnsZone, 10); // 10 credits/zone-hour
prices.insert(ResourceType::DnsRecord, 1); // 1 credit/record-hour
prices.insert(ResourceType::K8sCluster, 200); // 200 credits/hour
prices.insert(ResourceType::K8sNode, 100); // 100 credits/node-hour
Self { prices }
}
}
@ -128,12 +128,16 @@ impl UsageMetricsProvider for MockUsageMetricsProvider {
period_start: DateTime<Utc>,
period_end: DateTime<Utc>,
) -> Result<UsageMetrics> {
Ok(self.mock_data.get(project_id).cloned().unwrap_or_else(|| UsageMetrics {
project_id: project_id.to_string(),
resource_usage: HashMap::new(),
period_start,
period_end,
}))
Ok(self
.mock_data
.get(project_id)
.cloned()
.unwrap_or_else(|| UsageMetrics {
project_id: project_id.to_string(),
resource_usage: HashMap::new(),
period_start,
period_end,
}))
}
async fn list_projects_with_usage(
@ -199,6 +203,8 @@ mod tests {
.unwrap();
assert_eq!(metrics.project_id, "proj-1");
assert!(metrics.resource_usage.contains_key(&ResourceType::VmInstance));
assert!(metrics
.resource_usage
.contains_key(&ResourceType::VmInstance));
}
}

View file

@ -120,7 +120,10 @@ impl CreditServiceImpl {
let org_id = req_org_id.unwrap_or("");
resolve_tenant_ids_from_context(tenant, org_id, req_project_id)
}
None => Ok((req_org_id.unwrap_or("").to_string(), req_project_id.to_string())),
None => Ok((
req_org_id.unwrap_or("").to_string(),
req_project_id.to_string(),
)),
}
}
@ -377,7 +380,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
@ -424,8 +428,11 @@ impl CreditService for CreditServiceImpl {
));
}
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), Some(req.org_id.as_str()), &req.project_id)?;
let (org_id, project_id) = self.resolve_project_scope(
tenant.as_ref(),
Some(req.org_id.as_str()),
&req.project_id,
)?;
self.authorize_project_action(
tenant.as_ref(),
@ -481,7 +488,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
@ -559,7 +567,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
@ -629,7 +638,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
@ -716,7 +726,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
@ -981,7 +992,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
ACTION_BILLING_PROCESS,
@ -1080,7 +1092,8 @@ impl CreditService for CreditServiceImpl {
}
let resource_type = proto_to_resource_type(req.resource_type)?;
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
ACTION_QUOTA_SET,
@ -1121,7 +1134,8 @@ impl CreditService for CreditServiceImpl {
}
let resource_type = proto_to_resource_type(req.resource_type)?;
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
ACTION_QUOTA_READ,
@ -1165,7 +1179,8 @@ impl CreditService for CreditServiceImpl {
return Err(Status::invalid_argument("project_id is required"));
}
let (org_id, project_id) = self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
let (org_id, project_id) =
self.resolve_project_scope(tenant.as_ref(), None, &req.project_id)?;
self.authorize_project_action(
tenant.as_ref(),
ACTION_QUOTA_READ,

View file

@ -36,8 +36,8 @@ impl FlareDbStorage {
.unwrap_or_else(|| endpoint.clone());
debug!(endpoint = %endpoint, "Connecting to FlareDB");
let client = RdbClient::connect_with_pd_namespace(endpoint, pd_endpoint, "creditservice")
.await
.map_err(|e| Error::Storage(format!("Failed to connect to FlareDB: {}", e)))?;
.await
.map_err(|e| Error::Storage(format!("Failed to connect to FlareDB: {}", e)))?;
Ok(Arc::new(Self {
client: Arc::new(Mutex::new(client)),
@ -60,7 +60,11 @@ impl FlareDbStorage {
}
fn quota_key(project_id: &str, resource_type: ResourceType) -> String {
format!("/creditservice/quotas/{}/{}", project_id, resource_type.as_str())
format!(
"/creditservice/quotas/{}/{}",
project_id,
resource_type.as_str()
)
}
fn transactions_prefix(project_id: &str) -> String {
@ -273,7 +277,11 @@ impl CreditStorage for FlareDbStorage {
Ok(reservations)
}
async fn get_quota(&self, project_id: &str, resource_type: ResourceType) -> Result<Option<Quota>> {
async fn get_quota(
&self,
project_id: &str,
resource_type: ResourceType,
) -> Result<Option<Quota>> {
let key = Self::quota_key(project_id, resource_type);
self.get_value_with_version(&key)
.await?

View file

@ -1,24 +1,24 @@
//! gRPC service implementations for the Photon credit-control reference service.
//! gRPC service implementations for the supported credit-control service.
//!
//! The goal is to prove quota and admission control can be integrated with
//! Photon IAM and gateway flows without turning this crate into a full billing
//! product.
//! The product boundary is quota, reservation, wallet, and admission control.
//! This crate intentionally avoids turning that scope into a finance-grade
//! billing or settlement system.
mod billing;
mod flaredb_storage;
mod sql_storage;
mod credit_service;
mod flaredb_storage;
mod gateway_credit_service;
mod nightlight;
mod sql_storage;
mod storage;
pub use billing::{
MockUsageMetricsProvider, PricingRules, ProjectBillingResult, ResourceUsage, UsageMetrics,
UsageMetricsProvider,
};
pub use flaredb_storage::FlareDbStorage;
pub use sql_storage::SqlStorage;
pub use credit_service::CreditServiceImpl;
pub use flaredb_storage::FlareDbStorage;
pub use gateway_credit_service::GatewayCreditServiceImpl;
pub use nightlight::NightLightClient;
pub use sql_storage::SqlStorage;
pub use storage::{CreditStorage, InMemoryStorage};

View file

@ -181,17 +181,11 @@ impl NightLightClient {
(query, "lb-hours".to_string())
}
ResourceType::DnsZone => {
let query = format!(
r#"count(dns_zone_active{{project_id="{}"}})"#,
project_id
);
let query = format!(r#"count(dns_zone_active{{project_id="{}"}})"#, project_id);
(query, "zones".to_string())
}
ResourceType::DnsRecord => {
let query = format!(
r#"count(dns_record_active{{project_id="{}"}})"#,
project_id
);
let query = format!(r#"count(dns_record_active{{project_id="{}"}})"#, project_id);
(query, "records".to_string())
}
ResourceType::K8sCluster => {

View file

@ -46,7 +46,9 @@ impl SqlStorage {
));
}
if url.contains(":memory:") {
return Err(Error::Storage("In-memory SQLite is not allowed".to_string()));
return Err(Error::Storage(
"In-memory SQLite is not allowed".to_string(),
));
}
let pool = PoolOptions::<Sqlite>::new()
.max_connections(1)
@ -114,7 +116,11 @@ impl SqlStorage {
}
fn quota_key(project_id: &str, resource_type: ResourceType) -> String {
format!("/creditservice/quotas/{}/{}", project_id, resource_type.as_str())
format!(
"/creditservice/quotas/{}/{}",
project_id,
resource_type.as_str()
)
}
fn transactions_prefix(project_id: &str) -> String {
@ -171,15 +177,15 @@ impl SqlStorage {
async fn put_if_absent(&self, key: &str, value: &[u8]) -> Result<bool> {
let rows_affected = match &self.backend {
SqlBackend::Postgres(pool) => {
sqlx::query("INSERT INTO creditservice_kv (key, value) VALUES ($1, $2) ON CONFLICT DO NOTHING")
.bind(key)
.bind(value)
.execute(pool.as_ref())
.await
.map_err(|e| Error::Storage(format!("Postgres insert failed: {}", e)))?
.rows_affected()
}
SqlBackend::Postgres(pool) => sqlx::query(
"INSERT INTO creditservice_kv (key, value) VALUES ($1, $2) ON CONFLICT DO NOTHING",
)
.bind(key)
.bind(value)
.execute(pool.as_ref())
.await
.map_err(|e| Error::Storage(format!("Postgres insert failed: {}", e)))?
.rows_affected(),
SqlBackend::Sqlite(pool) => {
sqlx::query("INSERT OR IGNORE INTO creditservice_kv (key, value) VALUES (?1, ?2)")
.bind(key)
@ -226,14 +232,12 @@ impl SqlStorage {
.map_err(|e| Error::Storage(format!("Postgres delete failed: {}", e)))?
.rows_affected()
}
SqlBackend::Sqlite(pool) => {
sqlx::query("DELETE FROM creditservice_kv WHERE key = ?1")
.bind(key)
.execute(pool.as_ref())
.await
.map_err(|e| Error::Storage(format!("SQLite delete failed: {}", e)))?
.rows_affected()
}
SqlBackend::Sqlite(pool) => sqlx::query("DELETE FROM creditservice_kv WHERE key = ?1")
.bind(key)
.execute(pool.as_ref())
.await
.map_err(|e| Error::Storage(format!("SQLite delete failed: {}", e)))?
.rows_affected(),
};
Ok(rows_affected > 0)
}
@ -368,7 +372,11 @@ impl CreditStorage for SqlStorage {
Ok(reservations)
}
async fn get_quota(&self, project_id: &str, resource_type: ResourceType) -> Result<Option<Quota>> {
async fn get_quota(
&self,
project_id: &str,
resource_type: ResourceType,
) -> Result<Option<Quota>> {
let key = Self::quota_key(project_id, resource_type);
self.get(&key)
.await?

View file

@ -34,7 +34,11 @@ pub trait CreditStorage: Send + Sync {
async fn get_pending_reservations(&self, project_id: &str) -> Result<Vec<Reservation>>;
// Quota operations
async fn get_quota(&self, project_id: &str, resource_type: ResourceType) -> Result<Option<Quota>>;
async fn get_quota(
&self,
project_id: &str,
resource_type: ResourceType,
) -> Result<Option<Quota>>;
async fn set_quota(&self, quota: Quota) -> Result<Quota>;
async fn list_quotas(&self, project_id: &str) -> Result<Vec<Quota>>;
}
@ -161,7 +165,9 @@ impl CreditStorage for InMemoryStorage {
resource_type: ResourceType,
) -> Result<Option<Quota>> {
let quotas = self.quotas.read().await;
Ok(quotas.get(&(project_id.to_string(), resource_type)).cloned())
Ok(quotas
.get(&(project_id.to_string(), resource_type))
.cloned())
}
async fn set_quota(&self, quota: Quota) -> Result<Quota> {

View file

@ -1,4 +1,4 @@
//! File-first configuration for the minimal credit-control reference service.
//! File-first configuration for the supported credit-control service.
use photon_config::load_toml_config;
use photon_state::StateBackend;

View file

@ -1,7 +1,7 @@
//! CreditService reference server.
//! CreditService server.
//!
//! Main entry point for the minimal auth-integrated quota and credit-control
//! service used to prove vendor-replaceable integration.
//! Main entry point for the auth-integrated quota, reservation, wallet, and
//! admission-control service shipped in the supported UltraCloud add-on surface.
mod config;
mod rest;
@ -24,7 +24,7 @@ use tracing::info;
#[derive(Parser, Debug)]
#[command(name = "creditservice-server")]
#[command(about = "Minimal auth-integrated credit and quota control reference service")]
#[command(about = "Auth-integrated credit, quota, wallet, and admission-control service")]
struct Args {
/// Configuration file path
#[arg(short, long, default_value = "creditservice.toml")]
@ -86,11 +86,8 @@ async fn main() -> anyhow::Result<()> {
.as_deref()
.unwrap_or("127.0.0.1:2479");
info!("Using FlareDB for persistent storage: {}", flaredb_endpoint);
FlareDbStorage::new_with_pd(
flaredb_endpoint,
config.chainfire_endpoint.as_deref(),
)
.await?
FlareDbStorage::new_with_pd(flaredb_endpoint, config.chainfire_endpoint.as_deref())
.await?
}
StateBackend::Postgres | StateBackend::Sqlite => {
let database_url = config.database_url.as_deref().ok_or_else(|| {

View file

@ -2,14 +2,14 @@
//!
//! This crate defines the domain types used throughout the CreditService.
mod wallet;
mod transaction;
mod reservation;
mod quota;
mod error;
mod quota;
mod reservation;
mod transaction;
mod wallet;
pub use wallet::{Wallet, WalletStatus};
pub use transaction::{Transaction, TransactionType};
pub use reservation::{Reservation, ReservationStatus};
pub use quota::{Quota, ResourceType};
pub use error::{Error, Result};
pub use quota::{Quota, ResourceType};
pub use reservation::{Reservation, ReservationStatus};
pub use transaction::{Transaction, TransactionType};
pub use wallet::{Wallet, WalletStatus};

View file

@ -50,8 +50,7 @@ impl Reservation {
}
/// Reservation status
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum ReservationStatus {
/// Reservation is pending
#[default]
@ -63,4 +62,3 @@ pub enum ReservationStatus {
/// Reservation has expired
Expired,
}

View file

@ -61,8 +61,7 @@ impl Wallet {
}
/// Wallet status
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Default)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum WalletStatus {
/// Wallet is active and can be used
#[default]
@ -73,7 +72,6 @@ pub enum WalletStatus {
Closed,
}
#[cfg(test)]
mod tests {
use super::*;

View file

@ -70,7 +70,10 @@ impl Client {
quantity,
estimated_cost,
};
self.inner.check_quota(request).await.map(|r| r.into_inner())
self.inner
.check_quota(request)
.await
.map(|r| r.into_inner())
}
/// Reserve credits for a resource creation

View file

@ -443,7 +443,7 @@ where
Fut: Future<Output = Result<T>>,
{
let endpoints = if endpoints.is_empty() {
vec!["http://127.0.0.1:7000".to_string()]
vec!["http://127.0.0.1:2379".to_string()]
} else {
endpoints.to_vec()
};

View file

@ -16,8 +16,8 @@ mod remote;
#[derive(Parser, Debug)]
#[command(author, version, about)]
struct Cli {
/// Chainfire API endpoint (e.g. http://127.0.0.1:7000)
#[arg(long, global = true, default_value = "http://127.0.0.1:7000")]
/// Chainfire API endpoint (e.g. http://127.0.0.1:2379)
#[arg(long, global = true, default_value = "http://127.0.0.1:2379")]
chainfire_endpoint: String,
/// UltraCloud Cluster ID (logical name)

View file

@ -48,7 +48,7 @@ fn instances_prefix(cluster_namespace: &str, cluster_id: &str) -> Vec<u8> {
#[derive(Debug, Parser)]
#[command(author, version, about = "UltraCloud non-Kubernetes fleet scheduler")]
struct Cli {
#[arg(long, default_value = "http://127.0.0.1:7000")]
#[arg(long, default_value = "http://127.0.0.1:2379")]
chainfire_endpoint: String,
#[arg(long, default_value = "ultracloud")]
@ -1506,7 +1506,7 @@ mod tests {
fn test_scheduler() -> Scheduler {
Scheduler::new(Cli {
chainfire_endpoint: "http://127.0.0.1:7000".to_string(),
chainfire_endpoint: "http://127.0.0.1:2379".to_string(),
cluster_namespace: "ultracloud".to_string(),
cluster_id: "test-cluster".to_string(),
interval_secs: 1,

View file

@ -48,7 +48,7 @@ fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str)
#[derive(Parser, Debug)]
#[command(author, version, about)]
struct Cli {
#[arg(long, default_value = "http://127.0.0.1:7000")]
#[arg(long, default_value = "http://127.0.0.1:2379")]
chainfire_endpoint: String,
#[arg(long, default_value = "ultracloud")]

View file

@ -1138,7 +1138,7 @@ mod tests {
fn test_agent() -> Agent {
Agent::new(
"http://127.0.0.1:7000".to_string(),
"http://127.0.0.1:2379".to_string(),
"ultracloud".to_string(),
"test-cluster".to_string(),
"node01".to_string(),

View file

@ -18,7 +18,7 @@ mod watcher;
#[command(author, version, about)]
struct Cli {
/// Chainfire API endpoint
#[arg(long, default_value = "http://127.0.0.1:7000")]
#[arg(long, default_value = "http://127.0.0.1:2379")]
chainfire_endpoint: String,
/// UltraCloud cluster namespace (default: ultracloud)

View file

@ -142,6 +142,10 @@ struct ManagedProcessMetadata {
instance_id: String,
#[serde(default)]
command: Option<String>,
#[serde(default)]
args: Vec<String>,
#[serde(default)]
boot_id: Option<String>,
}
fn metadata_file_path(pid_file: &PathBuf) -> PathBuf {
@ -152,6 +156,50 @@ fn log_file_path(pid_file: &PathBuf) -> PathBuf {
PathBuf::from(format!("{}.log", pid_file.display()))
}
fn current_boot_id() -> Option<String> {
fs::read_to_string("/proc/sys/kernel/random/boot_id")
.ok()
.map(|value| value.trim().to_string())
.filter(|value| !value.is_empty())
}
fn read_process_argv(pid: u32) -> Option<Vec<String>> {
let bytes = fs::read(format!("/proc/{pid}/cmdline")).ok()?;
let argv = bytes
.split(|byte| *byte == 0)
.filter(|part| !part.is_empty())
.map(|part| String::from_utf8_lossy(part).into_owned())
.collect::<Vec<_>>();
(!argv.is_empty()).then_some(argv)
}
fn process_argv_matches(spec_command: &str, spec_args: &[String], argv: &[String]) -> bool {
if argv.is_empty() || argv.len() != spec_args.len() + 1 {
return false;
}
let actual_command = Path::new(&argv[0])
.file_name()
.and_then(|value| value.to_str())
.unwrap_or(argv[0].as_str());
let expected_command = Path::new(spec_command)
.file_name()
.and_then(|value| value.to_str())
.unwrap_or(spec_command);
if actual_command != expected_command && argv[0] != spec_command {
return false;
}
argv[1..] == *spec_args
}
fn read_process_metadata(path: &Path) -> Option<ManagedProcessMetadata> {
fs::read(path)
.ok()
.and_then(|bytes| serde_json::from_slice(&bytes).ok())
}
const FALLBACK_EXEC_PATHS: &[&str] = &[
"/run/current-system/sw/bin",
"/run/current-system/sw/sbin",
@ -307,6 +355,8 @@ impl ManagedProcess {
service: self.service.clone(),
instance_id: self.instance_id.clone(),
command: Some(self.spec.command.clone()),
args: self.spec.args.clone(),
boot_id: current_boot_id(),
};
fs::write(&self.metadata_file, serde_json::to_vec(&metadata)?).with_context(|| {
format!("failed to write process metadata {:?}", self.metadata_file)
@ -349,12 +399,35 @@ impl ManagedProcess {
// Read the PID from the pid file and stop the process
if let Ok(pid_str) = fs::read_to_string(&self.pid_file) {
if let Ok(pid) = pid_str.trim().parse::<u32>() {
Command::new("kill")
.arg(pid.to_string())
.output()
.await
.ok();
for _ in 0..10 {
let metadata = read_process_metadata(&self.metadata_file);
let boot_matches = metadata
.as_ref()
.and_then(|value| value.boot_id.as_deref())
.map(|expected| current_boot_id().as_deref() == Some(expected))
.unwrap_or(true);
let argv_matches = read_process_argv(pid)
.map(|argv| process_argv_matches(&self.spec.command, &self.spec.args, &argv))
.unwrap_or(false);
if boot_matches && argv_matches {
Command::new("kill")
.arg(pid.to_string())
.output()
.await
.ok();
for _ in 0..10 {
let still_running = Command::new("kill")
.arg("-0")
.arg(pid.to_string())
.output()
.await
.map(|output| output.status.success())
.unwrap_or(false);
if !still_running {
break;
}
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
}
let still_running = Command::new("kill")
.arg("-0")
.arg(pid.to_string())
@ -362,25 +435,21 @@ impl ManagedProcess {
.await
.map(|output| output.status.success())
.unwrap_or(false);
if !still_running {
break;
if still_running {
Command::new("kill")
.arg("-9")
.arg(pid.to_string())
.output()
.await
.ok();
}
tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
}
let still_running = Command::new("kill")
.arg("-0")
.arg(pid.to_string())
.output()
.await
.map(|output| output.status.success())
.unwrap_or(false);
if still_running {
Command::new("kill")
.arg("-9")
.arg(pid.to_string())
.output()
.await
.ok();
} else {
warn!(
service = %self.service,
instance_id = %self.instance_id,
pid = pid,
"pid file points to a different process; removing stale pid-dir entry without sending a signal"
);
}
}
}
@ -410,6 +479,7 @@ impl ManagedProcess {
})? {
self.child = None;
fs::remove_file(&self.pid_file).ok();
fs::remove_file(&self.metadata_file).ok();
return Ok(false);
}
return Ok(true);
@ -437,18 +507,30 @@ impl ManagedProcess {
.with_context(|| format!("failed to check process {}", pid))?;
if !output.status.success() {
fs::remove_file(&self.pid_file).ok();
fs::remove_file(&self.metadata_file).ok();
return Ok(false);
}
// Guard against PID reuse: check the command line via /proc
let cmdline_path = format!("/proc/{}/cmdline", pid);
if let Ok(cmdline) = fs::read_to_string(&cmdline_path) {
let cmdline = cmdline.replace('\0', " ");
if !cmdline.contains(&self.spec.command) {
return Ok(false);
if let Some(metadata) = read_process_metadata(&self.metadata_file) {
if let Some(expected_boot_id) = metadata.boot_id.as_deref() {
if current_boot_id().as_deref() != Some(expected_boot_id) {
fs::remove_file(&self.pid_file).ok();
fs::remove_file(&self.metadata_file).ok();
return Ok(false);
}
}
}
if !read_process_argv(pid)
.map(|argv| process_argv_matches(&self.spec.command, &self.spec.args, &argv))
.unwrap_or(false)
{
fs::remove_file(&self.pid_file).ok();
fs::remove_file(&self.metadata_file).ok();
return Ok(false);
}
if self.started_at.is_none() {
self.started_at = Some(Utc::now());
}
@ -561,6 +643,54 @@ mod tests {
let _ = fs::remove_dir_all(&temp);
}
#[test]
fn test_process_argv_matches_exact_command_and_args() {
let argv = vec![
"/run/current-system/sw/bin/python3".to_string(),
"-m".to_string(),
"http.server".to_string(),
"18190".to_string(),
"--bind".to_string(),
"10.100.0.22".to_string(),
];
assert!(process_argv_matches(
"python3",
&[
"-m".to_string(),
"http.server".to_string(),
"18190".to_string(),
"--bind".to_string(),
"10.100.0.22".to_string()
],
&argv
));
}
#[test]
fn test_process_argv_rejects_same_binary_with_different_args() {
let argv = vec![
"/run/current-system/sw/bin/python3".to_string(),
"-m".to_string(),
"http.server".to_string(),
"18193".to_string(),
"--bind".to_string(),
"10.100.0.22".to_string(),
];
assert!(!process_argv_matches(
"python3",
&[
"-m".to_string(),
"http.server".to_string(),
"18190".to_string(),
"--bind".to_string(),
"10.100.0.22".to_string()
],
&argv
));
}
}
impl ProcessManager {
@ -663,7 +793,7 @@ impl ProcessManager {
instance_id: metadata.instance_id,
spec: ProcessSpec {
command: metadata.command.unwrap_or_default(),
args: Vec::new(),
args: metadata.args.clone(),
working_dir: None,
env: Default::default(),
},

View file

@ -1218,7 +1218,7 @@ mod tests {
fn test_controller() -> HostDeploymentController {
HostDeploymentController::new(HostsCommand {
endpoint: "http://127.0.0.1:7000".to_string(),
endpoint: "http://127.0.0.1:2379".to_string(),
cluster_namespace: "ultracloud".to_string(),
cluster_id: "test-cluster".to_string(),
interval_secs: 1,

View file

@ -4,16 +4,50 @@ This directory is the public documentation entrypoint for UltraCloud.
## Read First
- [../README.md](/home/centra/cloud/README.md)
- [testing.md](/home/centra/cloud/docs/testing.md)
- [component-matrix.md](/home/centra/cloud/docs/component-matrix.md)
- [storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md)
- [../README.md](../README.md)
- [testing.md](testing.md)
- [component-matrix.md](component-matrix.md)
- [rollout-bundle.md](rollout-bundle.md)
- [control-plane-ops.md](control-plane-ops.md)
- [edge-trial-surface.md](edge-trial-surface.md)
- [provider-vm-reality.md](provider-vm-reality.md)
- [hardware-bringup.md](hardware-bringup.md)
- [storage-benchmarks.md](storage-benchmarks.md)
## Canonical Profiles
- `single-node dev`: `nix run .#single-node-quickstart`, `nix run .#single-node-trial`, `nix build .#single-node-trial-vm`, `nixosConfigurations.single-node-quickstart`, companion image `nixosConfigurations.netboot-all-in-one`
- `3-node HA control plane`: `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion image `nixosConfigurations.netboot-control-plane`
- `bare-metal bootstrap`: `nix run ./nix/test-cluster#cluster -- baremetal-iso`, `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` followed by `./result/bin/baremetal-iso-e2e` for the exact host-KVM proof
`nixosConfigurations.netboot-worker` is an archived helper outside the canonical profiles and their guard set. `baremetal/vm-cluster`, `k8shost-cni`, `k8shost-controllers`, `lightningstor-csi`, Firecracker, and mvisor remain in-tree only as non-product scaffolds or `legacy/manual` debugging paths.
`single-node-trial-vm` is the low-friction trial artifact for local use. An OCI/Docker artifact is intentionally not the public trial surface because a privileged container would not exercise the same KVM, `/dev/net/tun`, and guest-kernel contract.
`ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is the only supported cluster authoring source.
`nix-nos` is limited to legacy compatibility and low-level network primitives.
`single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story.
## Key References
- VM validation harness: [../nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md)
- CoronaFS storage role: [../coronafs/README.md](/home/centra/cloud/coronafs/README.md)
- CreditService scope note: [../creditservice/README.md](/home/centra/cloud/creditservice/README.md)
- VM validation harness: [../nix/test-cluster/README.md](../nix/test-cluster/README.md)
- Hardware bring-up bridge: [hardware-bringup.md](hardware-bringup.md)
- Provider and VM-hosting reality proof: [provider-vm-reality.md](provider-vm-reality.md)
- Rollout bundle operator contract: [rollout-bundle.md](rollout-bundle.md)
- Core control-plane operator contract: [control-plane-ops.md](control-plane-ops.md)
- Edge and trial-surface contract: [edge-trial-surface.md](edge-trial-surface.md)
- APIGateway supported scope: [../apigateway/README.md](../apigateway/README.md)
- NightLight supported scope: [../nightlight/README.md](../nightlight/README.md)
- CoronaFS storage role: [../coronafs/README.md](../coronafs/README.md)
- CreditService supported scope: [../creditservice/README.md](../creditservice/README.md)
- K8sHost supported scope: [../k8shost/README.md](../k8shost/README.md)
## Core API Notes
- `chainfire` supports fixed-membership cluster introspection: `MemberList` and `Status` on the public surface, plus the internal `Vote` and `AppendEntries` Raft transport. `chainfire-core` remains a workspace-internal compatibility crate rather than a supported embeddable API.
- `flaredb` supports SQL over both gRPC and REST. The public REST endpoints are `POST /api/v1/sql` and `GET /api/v1/tables`; a minimal request sketch follows this list.
- `lightningstor` keeps bucket versioning, bucket policy, bucket tagging, and explicit object version listing on the supported optional surface.
- `k8shost` keeps `WatchPods` on the supported surface as a bounded snapshot stream of the current matching pods.
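As a concrete illustration of the `flaredb` REST item above, the sketch below hits both documented endpoints with `curl`. The host, port, and the JSON field name in the request body are assumptions for illustration, not a published request contract.

```bash
# Hypothetical host/port; the "sql" body field name is an assumption -- confirm the exact
# request shape against the flaredb crate before scripting against it.
curl -fsS "http://<flaredb-host>:<rest-port>/api/v1/tables" | jq

curl -fsS -X POST "http://<flaredb-host>:<rest-port>/api/v1/sql" \
  -H 'Content-Type: application/json' \
  -d '{"sql": "SELECT 1"}' | jq
```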
## Design Notes

View file

@ -1,54 +1,98 @@
# Component Matrix
UltraCloud is intended to validate meaningful service combinations, not only a single all-on deployment.
This page summarizes the compositions that are exercised by the VM-cluster harness today.
UltraCloud now fixes the public support surface to three canonical profiles. This page defines the required and optional component bundles for each profile and keeps everything else explicitly outside the core contract.
## Validated Control Plane
## Canonical Profiles
- `chainfire + flaredb + iam`
### `single-node dev`
## Validated Network Provider Layer
- Required components: `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet`
- Optional components: `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost`
- Canonical entrypoints: `nix run .#single-node-quickstart`, `nix run .#single-node-trial`, `nix build .#single-node-trial-vm`, `nixosConfigurations.single-node-quickstart`, and companion install image `nixosConfigurations.netboot-all-in-one`
- Optional component toggles: `ultracloud.quickstart.enableLightningStor`, `enableCoronafs`, `enableFlashDNS`, `enableFiberLB`, `enableApiGateway`, `enableNightlight`, `enableCreditService`, `enableK8sHost`
- Primary use: one-command local bring-up, API development, and one-box VM experimentation without the HA control-plane or rollout-stack overhead
- Trial artifact: `single-node-trial-vm` is the supported buildable VM appliance for local use; the `single-node-quickstart` or `single-node-trial` app is the smoke launcher for that same minimal surface
- `prismnet`
- `prismnet + flashdns`
- `prismnet + fiberlb`
- `prismnet + flashdns + fiberlb`
### `3-node HA control plane`
These combinations justify the existence of the network services as composable providers rather than hidden internal subsystems.
- Required components: `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node
- Optional components: `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice`
- Canonical entrypoints: `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, and companion install image `nixosConfigurations.netboot-control-plane`
- Primary use: stable replicated control plane that can later accept worker, storage, and edge bundles without redefining the bootstrap path
## Validated VM Hosting Layer
### `bare-metal bootstrap`
- `plasmavmc + prismnet`
- `plasmavmc + lightningstor`
- `plasmavmc + coronafs`
- `plasmavmc + coronafs + lightningstor`
- `plasmavmc + prismnet + coronafs + lightningstor`
- Required components: `deployer`, `first-boot-automation`, `install-target`, `nix-agent`
- Optional components: `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after the first successful rollout
- Canonical entrypoints: `nix run ./nix/test-cluster#cluster -- baremetal-iso`, `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e`, and the built runner `./result/bin/baremetal-iso-e2e` for the exact host-KVM proof
- Primary use: boot the installer ISO, phone home to `deployer`, fetch the flake bundle, run Disko, reboot, and converge QEMU-emulated or real machines into either the single-node or HA profile
This split keeps mutable VM volumes on CoronaFS and immutable VM images on LightningStor object storage.
## Companion And Helper Outputs
## Validated Kubernetes-Style Hosting Layer
- `nixosConfigurations.netboot-all-in-one`: canonical companion install image for `single-node dev`
- `nixosConfigurations.netboot-control-plane`: canonical companion install image for `3-node HA control plane`
- `packages.single-node-trial-vm`: low-friction buildable VM appliance for the minimal VM-platform core
- `nixosConfigurations.netboot-worker`: archived/non-product worker helper kept in-tree for manual lab debugging only
- `k8shost + prismnet`
- `k8shost + flashdns`
- `k8shost + fiberlb`
- `k8shost + prismnet + flashdns + fiberlb`
## Cluster Authoring Source
## Validated Edge And Tenant Services
`ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is the only supported cluster authoring source. It is the canonical input for deployer classes and pools, service placement state, rollout objects, and per-node bootstrap metadata.
- `apigateway + iam + prismnet`
- `nightlight + apigateway`
- `nightlight`
- `creditservice + iam + apigateway`
- `creditservice + iam`
- `deployer + iam + chainfire`
`nix-nos` is limited to legacy compatibility and low-level network primitives such as interfaces, VLANs, BGP, and static routing. It is not the canonical source for cluster topology, rollout intent, or scheduler state.
## Validation Direction
## Optional Composition Bundles
The VM cluster harness now exposes:
The optional bundles below remain important, but they are layered on top of the canonical profiles rather than treated as separate top-level products:
```bash
nix run ./nix/test-cluster#cluster -- matrix
nix run ./nix/test-cluster#cluster -- fresh-matrix
```
- control-plane core: `chainfire + flaredb + iam`
- network provider bundle: `prismnet + flashdns + fiberlb`
- VM hosting bundle: `plasmavmc + prismnet + coronafs + lightningstor`
- Kubernetes-style hosting bundle: `k8shost + prismnet + flashdns + fiberlb`
- edge and tenant bundle: `apigateway + iam + nightlight + creditservice`
- native rollout bundle: `deployer + chainfire + nix-agent + fleet-scheduler + node-agent`
`fresh-matrix` is the publishable path because it rebuilds the host-side VM images before validating the composed service scenarios, including PrismNet-backed PlasmaVMC guests.
`fresh-matrix` is the publishable composition proof because it rebuilds the host-side VM images before validating these bundles on the VM cluster.
For the edge and tenant bundle, the published contract now means: APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer, but config rollout is restart-based and live in-process reload is not promised; NightLight is supported as a single-node WAL/snapshot service with instance-wide retention rather than replicated HA metrics storage; and CreditService stays scoped to quota, wallet, reservation, and admission control, with export or backend migration handled as offline export/import or backend-native snapshot workflows instead of live mixed-writer migration.
For the network provider bundle specifically, the published contract now means: PrismNet can create tenant VPC/subnet/port state and add then delete security-group ACLs deterministically, FlashDNS can publish records for those workloads, and FiberLB can front them with TCP plus TLS-terminated `Https` / `TerminatedHttps` listeners. `provider-vm-reality-proof` is the artifact-producing companion lane for that surface; it records authoritative DNS answers plus FiberLB backend drain and re-convergence under `./work/provider-vm-reality-proof/latest`. The shipped FiberLB L4 algorithms stay under targeted server tests in-tree.
PrismNet real OVS/OVN dataplane validation remains outside the supported local KVM surface.
FiberLB native BGP or BFD peer interop and hardware VIP ownership remain outside the supported local KVM surface.
FiberLB HTTPS health checks currently do not verify backend TLS certificates. Supported scope is limited to TCP reachability plus HTTP status for the backend endpoint until CA-aware verification is wired through config, server code, and the canonical harness.
For the VM hosting bundle, the published PlasmaVMC contract is the KVM-backed VM lifecycle path plus PrismNet-attached guest networking. `provider-vm-reality-proof` records KVM shared-storage migration and post-migration restart artifacts on the worker pair. Real-hardware migration or storage handoff remains a later hardware proof. Firecracker and mvisor code stays in-tree only as archived non-product backend scaffolding until it has end-to-end tenant-network coverage and publishable suite proof.
## Responsibility Boundaries
- `k8shost`: tenant workload API surface. It manages pod, deployment, and service semantics, then delegates network publication to `prismnet`, `flashdns`, and `fiberlb`.
- `k8shost` is fixed as an API/control-plane product surface. Supported binaries stop at `k8shost-server`, and `k8shost-cni`, `lightningstor-csi`, plus `k8shost-controllers` stay archived non-product until they have their own published coverage and a real network or storage dataplane contract.
- `plasmavmc`: tenant VM API surface. The supported public backend is KVM; it can run against explicit remote IAM, PrismNet, and FlareDB endpoints, and other backend implementations stay outside the canonical contract until they have end-to-end runtime and tenant-network coverage.
- `creditservice`: tenant quota, wallet, reservation, and admission-control surface. It stays in the supported bundle because `fresh-matrix` exercises both its direct APIs and the API-gateway path.
- `fleet-scheduler`: bare-metal service placement surface. It schedules host-native service instances from declarative cluster state generated from `ultracloud.cluster` plus `node-agent` heartbeats, without exposing Kubernetes APIs.
- `deployer`: enrollment and rollout authority. It serves `/api/v1/phone-home`, stores install plans and desired-system references, and seeds cluster metadata from the generated `ultracloud.cluster` state.
- `nix-agent`: host OS reconciler. It turns `deployer` desired-system references into `switch-to-configuration` actions plus rollback and health-check handling.
- `node-agent`: host runtime reconciler. It applies scheduled service-instance state, keeps runtime heartbeats fresh, and reports host-local execution status back to the scheduler.
The intended layering is `deployer -> nix-agent` for machine image or NixOS generation changes, and `deployer -> fleet-scheduler -> node-agent` for host-native service placement changes. `k8shost` stays separate because it is the tenant workload control plane, not the native service scheduler. The `single-node dev` profile intentionally stops before that rollout stack and keeps only the VM-platform core plus explicit add-ons.
## Standalone Stories
- `single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story for the minimal KVM-backed surface.
- `deployer-vm-smoke`, `portable-control-plane-regressions`, and `baremetal-iso` are the standalone rollout-stack story for `deployer -> nix-agent` and `deployer -> fleet-scheduler -> node-agent`.
- An OCI/Docker artifact is intentionally not the public trial surface because the supported VM-platform contract depends on a guest kernel plus host KVM, `/dev/net/tun`, and OVS/libvirt semantics.
## Archived Scaffolds
- `k8shost-cni`: internal scaffold for old tenant-network experiments; excluded from default workspace members and canonical docs
- `k8shost-controllers`: controller prototype scaffold; excluded from default workspace members and canonical docs
- `lightningstor-csi`: storage helper prototype; excluded from default workspace members and canonical docs
- Firecracker and mvisor: archived PlasmaVMC backend scaffolds outside the supported KVM-only contract and excluded from the default PlasmaVMC workspace members
- `nixosConfigurations.netboot-worker`: archived worker helper image outside canonical profile guards
- `baremetal/vm-cluster`: `legacy/manual` debugging path outside the main product surface
## Non-Canonical Paths
- `baremetal/vm-cluster` remains `legacy/manual`
- standalone use of `netboot-control-plane` or `netboot-all-in-one` outside the documented profiles is a debugging path, not a fourth supported profile
- `netboot-worker`, Firecracker, mvisor, `k8shost-cni`, `lightningstor-csi`, and `k8shost-controllers` are archived non-product scaffolds rather than canonical entrypoints
- `netboot-base`, `pxe-server`, and `vm-smoke-target` are internal or legacy helpers, not supported profiles by themselves
- ad hoc shell-driven cluster bring-up is for debugging only and should not be presented as the canonical public path

77
docs/control-plane-ops.md Normal file
View file

@ -0,0 +1,77 @@
# Core Control Plane Operations
This document fixes the supported operator lifecycle for the core control-plane services: `chainfire`, `flaredb`, and `iam`.
## ChainFire Membership And Node Replacement
ChainFire dynamic membership, replace-node, and scale-out are not part of the supported surface.
The supported public surface is the fixed-membership cluster API already documented in `chainfire-api`: `MemberList` and `Status` report the membership that the node booted with, and operators should treat that membership as immutable for a release branch.
Supported operator actions today:
1. Keep the canonical control plane at the documented fixed membership for the branch.
2. Use the canonical `durability-proof` backup/restore lane before disruptive maintenance.
3. Use `nix run ./nix/test-cluster#cluster -- rollout-soak` when you need a longer-running fixed-membership restart proof after maintenance or rollout work.
4. Recover failed nodes by restoring the same fixed-membership cluster shape or by rebuilding the whole cluster with a freshly published static membership and then restoring data.
Unsupported operator actions today:
1. Live `replace-node` through a public ChainFire API.
2. Live `scale-out` by adding new voters on the supported surface.
3. Relying on internal membership helpers as a published product contract.
The focused boundary proof is `./nix/test-cluster/run-core-control-plane-ops-proof.sh`, which records the fixed-membership source marker from `chainfire-api` and the public docs markers under `./work/core-control-plane-ops-proof`. The live-operations companion is `nix run ./nix/test-cluster#cluster -- rollout-soak`, which on 2026-04-10 recorded `chainfire-post-restart-put.json`, `chainfire-post-restart.json`, and `post-control-plane-restarts.json` under `./work/rollout-soak/20260410T164549+0900` after repeated maintenance and worker power-loss, without promoting dynamic membership to supported scope.
## FlareDB Online Migration And Schema Evolution
FlareDB online migration and schema evolution must start from the durability-proof backup/restore baseline.
The supported operator contract is additive-first schema evolution:
1. Run `nix run ./nix/test-cluster#cluster -- durability-proof` or keep an equivalent logical backup artifact before changing schema.
2. Apply additive changes first: new tables, new nullable columns, new indexes, and code paths that tolerate both old and new shapes.
3. Backfill data and cut read traffic to the new schema before deleting or rewriting old state.
4. Treat destructive cleanup, `DROP TABLE`, and incompatible column rewrites as a later maintenance step after a fresh backup.
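A minimal sketch of the additive-first step, assuming a generic SQL dialect; the table, column, and index names are placeholders, and the transport (CLI, gRPC, or the documented `POST /api/v1/sql` REST endpoint) is left to the operator.

```bash
# Additive-first migration sketch -- illustrative SQL only, not a FlareDB-dialect guarantee.
cat <<'SQL' > additive-migration.sql
CREATE TABLE IF NOT EXISTS tenant_usage_v2 (tenant_id TEXT, used_credits BIGINT);
ALTER TABLE tenant_usage ADD COLUMN region TEXT;            -- nullable, tolerated by old readers
CREATE INDEX IF NOT EXISTS idx_tenant_usage_region ON tenant_usage (region);
SQL
# DROP TABLE and incompatible rewrites stay a later maintenance step after a fresh backup.
```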
This keeps the migration runbook consistent with the current product proof: the durability lane proves logical SQL backup/restore, and the 2026-04-10 `rollout-soak` artifact root `./work/rollout-soak/20260410T164549+0900` rechecks additive SQL operations through `flaredb-post-restart-create.json`, `flaredb-post-restart-insert.json`, and `flaredb-post-restart.json` after a FlareDB member restart. The operator contract for live changes stays additive schema evolution rather than destructive in-place rewrites.
FlareDB destructive DDL and fully automated online migration remain outside the supported product contract for this release. When you need `DROP TABLE`, incompatible column rewrites, or automated destructive cutover, stop at the additive-first boundary above, take a fresh logical backup, and treat the destructive step as an explicit offline maintenance action rather than a release-proven online behavior.
Internal raft membership helpers in `flaredb-raft` exist for implementation work, but they are not the published operator API for schema migration.
## IAM Bootstrap Hardening And Rotation
IAM bootstrap hardening requires an explicit admin token, an explicit signing key, and a 32-byte IAM_CRED_MASTER_KEY; signing-key rotation, credential rotation, and mTLS overlap-and-cutover rotation are the supported recovery paths.
Production bootstrap contract:
1. Set `IAM_ADMIN_TOKEN` or `PHOTON_IAM_ADMIN_TOKEN`.
2. Set `authn.internal_token.signing_key` in config or provide the equivalent environment-backed configuration.
3. Set `IAM_CRED_MASTER_KEY` to a 32-byte value before enabling credential issuance.
4. Keep `admin.allow_unauthenticated=true`, `IAM_ALLOW_UNAUTHENTICATED_ADMIN=true`, and random signing keys limited to local development or lab proof environments.
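A minimal bootstrap-hardening sketch. Whether `IAM_CRED_MASTER_KEY` expects raw bytes, hex, or base64 is an assumption to confirm against `iam-server`, and the secret-delivery mechanism (plain environment, systemd credentials, or a secrets manager) is a deployment choice rather than part of this contract.

```bash
# Explicit admin token and 32-byte credential master key; hex encoding is assumed here.
export IAM_ADMIN_TOKEN="$(openssl rand -hex 32)"
export IAM_CRED_MASTER_KEY="$(openssl rand -hex 32)"
# authn.internal_token.signing_key is set in the IAM config file (or its environment-backed
# equivalent) rather than left to a random per-boot key outside lab environments.
```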
Supported token and key rotation flow:
1. Add the new signing key and keep the old key available for verification during the overlap window.
2. Issue new tokens from the new active key.
3. Wait for the maximum supported token TTL or explicitly revoke the old population before retiring the old key.
4. Purge retired keys only after the overlap and retirement windows are complete.
Supported credential rotation flow:
1. Keep `IAM_CRED_MASTER_KEY` explicit and stable across the overlap window.
2. Mint a new credential for the same principal before revoking the old one.
3. Move clients to the new access key and verify it can still read back its secret material.
4. Revoke the old credential only after cutover is complete.
Supported mTLS overlap-and-cutover rotation flow:
1. Configure IAM to trust both the old and new service identity mapping or trust roots during the overlap window.
2. Issue or install the new client certificate and cut traffic over to it.
3. Remove the old mapping or trust root only after the new certificate is serving traffic successfully.
4. Verify the old certificate is rejected once the overlap window closes.
Multi-node IAM failover remains outside the supported product contract for this release. The current release proof is lifecycle-oriented rather than HA-oriented: bootstrap hardening, signing-key rotation, credential overlap-and-revoke rotation, and mTLS overlap-and-cutover rotation are supported; clustered IAM failover is future scope expansion.
The standalone proof is `./nix/test-cluster/run-core-control-plane-ops-proof.sh`. It runs the `iam-authn` signing-key and mTLS rotation tests plus the `iam-api` credential rotation test, records the bootstrap hardening source markers from `iam-server`, and persists logs plus `result.json` and `scope-fixed-contract.json` under `./work/core-control-plane-ops-proof`. The dated 2026-04-10 artifact root is `./work/core-control-plane-ops-proof/20260410T172148+09:00`.

83
docs/edge-trial-surface.md Normal file
View file

@ -0,0 +1,83 @@
# Edge And Trial Surface
This document fixes the supported product boundary for the edge bundle and the lightest trial surface.
## APIGateway
APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer; live in-process reload is not part of the product contract.
Supported operator contract:
1. Render gateway config from Nix or `ultracloud.cluster` generated inputs and restart or replace the process when routes, auth providers, or credit providers change.
2. Scale out by running multiple identical gateway instances behind FiberLB, an external load balancer, or another L4 or VIP distribution layer.
3. Treat route distribution as configuration rollout, not as a dynamic control-plane API.
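A minimal restart-based rollout sketch for one gateway instance; the `apigateway.service` unit name and the flake attribute are assumptions for illustration, so substitute whatever unit or process supervisor your deployment actually renders.

```bash
# Re-render the gateway config from Nix inputs on the gateway node, then restart the process.
nixos-rebuild switch --flake .#<gateway-node>
systemctl restart apigateway.service   # restart-based rollout; no hot reload is promised
```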
Explicit non-supported behavior:
1. Hot route reload through an admin API or `SIGHUP`.
2. Stateful leader election or in-process config distribution between gateway replicas.
3. A release promise that every HA topology is directly exercised by `fresh-matrix`.
Current proof scope:
1. `fresh-matrix` proves the shipped single gateway-node composition on `node06`.
2. The HA story is a supported operator shape, but the release-facing proof remains one stateless gateway instance plus restart-based rollout.
## NightLight
NightLight is supported as a single-node WAL/snapshot service; replicated HA metrics storage is not part of the product contract.
Supported operator contract:
1. Use one NightLight instance per edge bundle, per lab, or per tenant environment when you need a hard operational boundary.
2. Use `retention_days`, the WAL, and periodic snapshots as the retention contract for that instance.
3. Put shared access control in front of NightLight with APIGateway or another authenticated front door when multiple writers or readers share the same endpoint.
Explicit non-supported behavior:
1. Multi-node or quorum-backed NightLight replication.
2. Per-tenant retention enforcement inside NightLight itself.
3. Treating NightLight labels as a hard security boundary.
The supported tenant contract is therefore deployment-scoped: one NightLight instance can serve one environment or a carefully trusted shared bundle, but tenant isolation is not enforced inside the process.
## CreditService
CreditService export and backend migration are supported as offline export/import or backend-native snapshot workflows, not live mixed-writer migration.
Supported operator contract:
1. Keep CreditService scoped to quota, wallet, reservation, and admission-control behavior.
2. Use backend-native snapshots or logical API replay as the export baseline.
3. Drain or quiesce writes before moving between FlareDB, PostgreSQL, or SQLite backends.
4. Rehydrate the target backend, then cut APIGateway or callers over to the new endpoint.
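A minimal export/import sketch using backend-native snapshots; database URLs and file paths are placeholders, and writes are assumed to be drained per step 3 before the export runs.

```bash
# PostgreSQL source: logical dump, then restore into the target backend.
pg_dump --format=custom --dbname="$SOURCE_DATABASE_URL" --file=creditservice.dump
pg_restore --dbname="$TARGET_DATABASE_URL" creditservice.dump
# SQLite source: consistent file-level backup of the service database.
sqlite3 /path/to/creditservice.db ".backup creditservice-backup.db"
```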
Explicit non-supported behavior:
1. Finance-grade ledger ownership.
2. Live mixed-writer backend migration.
3. Turning the service into a pricing, invoicing, or settlement platform.
## Trial Surface
An OCI/Docker artifact is intentionally not the public trial surface.
The supported lightweight trial remains:
1. `nix build .#single-node-trial-vm`
2. `nix run .#single-node-trial`
3. `nix run .#single-node-quickstart`
That boundary exists because the supported VM-platform contract needs a guest kernel plus host KVM, `/dev/net/tun`, and OVS or libvirt semantics. A Docker or OCI image would either be host-coupled and privileged or prove a different, weaker contract.
## Work Root Budget
Use `./nix/test-cluster/work-root-budget.sh status` for reporting, `./nix/test-cluster/work-root-budget.sh enforce` for a stronger local budget gate, and `./nix/test-cluster/work-root-budget.sh prune-proof-logs 2` for safer dated-proof cleanup.
Recommended soft budgets on a local AMD/KVM proof host:
1. Keep `./work/test-cluster/state` under roughly 35 GiB.
2. Keep disposable runtime state such as `./work/tmp` and `./work/publishable-kvm-runtime` under roughly 10 GiB combined.
3. Keep dated proof roots trimmed so combined proof logs stay under roughly 20 GiB unless you are intentionally archiving a release snapshot.
The helper prints current sizes, highlights budget overruns, and prints safe cleanup steps such as stopping the cluster, cleaning runtime state, deleting disposable log roots, and then running a Nix store GC after old result symlinks are no longer needed. The `enforce` mode lets local proof lanes fail fast when the operator has let `./work` drift beyond the documented soft budget, and `prune-proof-logs` gives a dry-run-first workflow for trimming dated proof roots.

135
docs/hardware-bringup.md Normal file
View file

@ -0,0 +1,135 @@
# Hardware Bring-Up
This document is the operator bridge between the canonical QEMU ISO proof and a real USB or BMC/Redfish install smoke.
## Canonical entrypoint
```bash
nix run ./nix/test-cluster#hardware-smoke -- preflight
nix run ./nix/test-cluster#hardware-smoke -- run
nix run ./nix/test-cluster#hardware-smoke -- capture
```
The wrapper always writes artifacts under `./work/hardware-smoke/<run-id>` and refreshes `./work/hardware-smoke/latest`.
## What it fixes
- kernel parameters are emitted once in `kernel-params.txt`
- expected success markers are emitted once in `expected-markers.txt`
- failure markers are emitted once in `failure-markers.txt`
- operator instructions are emitted once in `operator-handoff.md`
- missing transport inputs are emitted once in `missing-requirements.txt`
When transport is absent, `preflight` exits successfully but records `status=blocked` in `status.env`.
## Shared ISO contract
The physical-node wrapper uses the same ISO attr and the same success markers as the QEMU proof:
- ISO attr: `.#nixosConfigurations.ultracloud-iso.config.system.build.isoImage`
- QEMU proof: `nix run ./nix/test-cluster#cluster -- baremetal-iso`
- exact local-KVM proof: `nix build .#checks.x86_64-linux.baremetal-iso-e2e && ./result/bin/baremetal-iso-e2e`
The bridge is intentional: QEMU stands in for the chassis only. The install sequence stays `phone-home -> bundle download -> Disko -> reboot -> post-install boot -> desired-system active`.
## Required kernel parameters
`hardware-smoke.sh` writes the exact kernel parameter set to `kernel-params.txt`:
- `ultracloud.deployer_url=<scheme://host:port>`
- `ultracloud.bootstrap_token=<token>` or a deliberate unauthenticated lab deployer with `ULTRACLOUD_HARDWARE_ALLOW_UNAUTHENTICATED=1`
- optional `ultracloud.ca_cert_url=<https://.../ca.crt>`
- optional `ultracloud.binary_cache_url=<http://cache:8090>`
- optional `ultracloud.node_id=<node-id>`
- optional `ultracloud.hostname=<hostname>`
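A worked parameter line, reusing the lab deployer address and bootstrap token from the USB example below; every value is a placeholder to replace with your own lab addresses, node name, and token.

```bash
# Appended at the ISO boot prompt (or baked into the boot entry); values are placeholders.
KERNEL_PARAMS='ultracloud.deployer_url=http://10.0.0.10:8088 ultracloud.bootstrap_token=lab-bootstrap-token ultracloud.node_id=node21 ultracloud.hostname=node21 ultracloud.binary_cache_url=http://10.0.0.10:8090'
```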
## Expected success markers
The wrapper records the canonical marker list in `expected-markers.txt`:
- `ULTRACLOUD_MARKER pre-install.boot.<node-id>`
- `ULTRACLOUD_MARKER pre-install.phone-home.complete.<node-id>`
- `ULTRACLOUD_MARKER install.bundle-downloaded.<node-id>`
- `ULTRACLOUD_MARKER install.disko.complete.<node-id>`
- `ULTRACLOUD_MARKER install.nixos-install.complete.<node-id>`
- `ULTRACLOUD_MARKER reboot.<node-id>`
- `ULTRACLOUD_MARKER post-install.boot.<node-id>.<role>`
- `ULTRACLOUD_MARKER desired-system-active.<node-id>`
The wrapper also expects `nix-agent.service` to be active after install, and `chainfire.service` to be active when the node role is `control-plane`.
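A quick post-install spot check, assuming root SSH access to the freshly installed node (the host matches the USB example below):

```bash
# Verify the post-install service expectations directly on the node.
ssh root@10.0.0.21 systemctl is-active nix-agent.service
ssh root@10.0.0.21 systemctl is-active chainfire.service   # control-plane role only
```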
## USB path
Provide:
- `ULTRACLOUD_HARDWARE_TRANSPORT=usb`
- `ULTRACLOUD_HARDWARE_USB_DEVICE=/dev/sdX`
- `ULTRACLOUD_HARDWARE_ALLOW_DESTRUCTIVE=YES`
- `ULTRACLOUD_HARDWARE_DEPLOYER_URL=...`
- `ULTRACLOUD_HARDWARE_BOOTSTRAP_TOKEN=...` or `ULTRACLOUD_HARDWARE_ALLOW_UNAUTHENTICATED=1`
- `ULTRACLOUD_HARDWARE_SSH_HOST=...` or `ULTRACLOUD_HARDWARE_SERIAL_LOG=...`
Example:
```bash
ULTRACLOUD_HARDWARE_TRANSPORT=usb \
ULTRACLOUD_HARDWARE_USB_DEVICE=/dev/sdX \
ULTRACLOUD_HARDWARE_ALLOW_DESTRUCTIVE=YES \
ULTRACLOUD_HARDWARE_DEPLOYER_URL=http://10.0.0.10:8088 \
ULTRACLOUD_HARDWARE_BOOTSTRAP_TOKEN=lab-bootstrap-token \
ULTRACLOUD_HARDWARE_SSH_HOST=10.0.0.21 \
nix run ./nix/test-cluster#hardware-smoke -- run
```
## BMC / Redfish virtual media path
Provide:
- `ULTRACLOUD_HARDWARE_TRANSPORT=redfish` or `bmc`
- `ULTRACLOUD_HARDWARE_REDFISH_ENDPOINT=https://bmc.example`
- `ULTRACLOUD_HARDWARE_REDFISH_USERNAME=...`
- `ULTRACLOUD_HARDWARE_REDFISH_PASSWORD=...`
- `ULTRACLOUD_HARDWARE_ISO_URL=https://http-server/ultracloud-bootstrap.iso`
- optional `ULTRACLOUD_HARDWARE_REDFISH_SYSTEM_ID=System.Embedded.1`
- optional `ULTRACLOUD_HARDWARE_REDFISH_MANAGER_ID=iDRAC.Embedded.1`
- optional `ULTRACLOUD_HARDWARE_REDFISH_VIRTUAL_MEDIA_ID=CD`
- `ULTRACLOUD_HARDWARE_DEPLOYER_URL=...`
- `ULTRACLOUD_HARDWARE_BOOTSTRAP_TOKEN=...` or `ULTRACLOUD_HARDWARE_ALLOW_UNAUTHENTICATED=1`
- `ULTRACLOUD_HARDWARE_SSH_HOST=...` or `ULTRACLOUD_HARDWARE_SERIAL_LOG=...`
Example:
```bash
ULTRACLOUD_HARDWARE_TRANSPORT=redfish \
ULTRACLOUD_HARDWARE_REDFISH_ENDPOINT=https://bmc.example \
ULTRACLOUD_HARDWARE_REDFISH_USERNAME=admin \
ULTRACLOUD_HARDWARE_REDFISH_PASSWORD=secret \
ULTRACLOUD_HARDWARE_ISO_URL=https://mirror.example/ultracloud-bootstrap.iso \
ULTRACLOUD_HARDWARE_DEPLOYER_URL=http://10.0.0.10:8088 \
ULTRACLOUD_HARDWARE_BOOTSTRAP_TOKEN=lab-bootstrap-token \
ULTRACLOUD_HARDWARE_SSH_HOST=10.0.0.21 \
nix run ./nix/test-cluster#hardware-smoke -- run
```
## Capture-only mode
If the transport action is manual, keep the same proof root and collect the success evidence later:
```bash
ULTRACLOUD_HARDWARE_PROOF_ROOT=./work/hardware-smoke/latest \
ULTRACLOUD_HARDWARE_SSH_HOST=10.0.0.21 \
nix run ./nix/test-cluster#hardware-smoke -- capture
```
## Failure and blocked behavior
`preflight` records `status=blocked` when any of these are missing:
- transport device or BMC/Redfish endpoint
- deployer URL
- bootstrap token or explicit unauthenticated acknowledgement
- USB destructive acknowledgement
- BMC/Redfish ISO URL
- capture channel for `desired-system active`
That blocked state is intentional. It means the repo is ready for a physical-node run, but the local session still lacks the external transport or credentials needed to execute it.

37
docs/provider-vm-reality.md Normal file
View file

@ -0,0 +1,37 @@
# Provider And VM-Hosting Reality Proof
The focused local-KVM proof for the provider and VM-hosting bundles is:
```bash
nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof
```
Artifacts are written under `./work/provider-vm-reality-proof/<timestamp>` and `./work/provider-vm-reality-proof/latest`.
## What This Lane Proves
- PrismNet tenant VPC, subnet, port, and security-group ACL lifecycle on the supported local-KVM surface.
- FlashDNS authoritative record exposure on the DNS listener, with captured answers for workload and service records.
- FiberLB listener publication plus backend drain and re-convergence for the shipped local-KVM listener surface.
- PlasmaVMC KVM shared-storage migration, CoronaFS handoff, and post-migration restart on the supported worker pair.
The proof is intentionally narrower than `fresh-matrix`. `fresh-matrix` remains the broad composition suite; `provider-vm-reality-proof` is the artifact-producing companion lane that keeps provider and VM-hosting evidence in one dated root.
## Recorded Artifacts
The proof root keeps two subtrees:
- `network-provider/`: PrismNet, FlashDNS, and FiberLB create or get responses, authoritative DNS answers, FiberLB backend disable or restore evidence, and service journals.
- `vm-hosting/`: VM create response, VM spec, volume state before and after migration, PrismNet port state after migration, VM watch output, and PlasmaVMC or CoronaFS service journals.
`result.json` records the overall proof status, start and finish timestamps, and the artifact subdirectories.
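To review a finished run, the quickest path is the `latest` symlink; `jq` is only used for pretty-printing here, and no `result.json` fields beyond the ones listed above are promised.

```bash
jq . ./work/provider-vm-reality-proof/latest/result.json
ls ./work/provider-vm-reality-proof/latest/network-provider \
   ./work/provider-vm-reality-proof/latest/vm-hosting
```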
## Supported Scope And Fixed Limits
The local-KVM proof intentionally does not claim the full hardware-network surface.
- PrismNet real OVS/OVN dataplane validation remains outside the supported local KVM surface. The current proof keeps tenant API lifecycle and attached-VM networking honest, but not a release-grade `ovn-nbctl` or hardware-switch path.
- FiberLB native BGP or BFD peer interop and hardware VIP ownership remain outside the supported local KVM surface. The current proof fixes the shipped contract to listener publication plus backend drain or re-convergence inside the lab.
- PlasmaVMC real-hardware migration or storage handoff remains a later hardware proof. The current proof fixes the release surface to KVM shared-storage migration on the local worker pair.
Use the hardware bring-up pack in [hardware-bringup.md](hardware-bringup.md) when transport becomes available and the ISO path can be exercised on a real machine.

103
docs/rollout-bundle.md Normal file
View file

@ -0,0 +1,103 @@
# Rollout Bundle Operations
This document fixes the supported operator contract for the native rollout bundle:
- `deployer`
- `fleet-scheduler`
- `nix-agent`
- `node-agent`
The supported layering is still `deployer -> nix-agent` for host OS rollout and `deployer -> fleet-scheduler -> node-agent` for host-native service placement.
## Supported Scope
- `deployer` is supported as a single logical rollout authority. The supported recovery model is restart-in-place or cold-standby replacement that reuses the same `chainfire` namespace, admin and bootstrap credentials, bootstrap flake bundle, and local state backup.
- `deployer` is scope-fixed to one active writer plus optional cold-standby restore; automatic ChainFire-backed multi-instance failover is outside the supported product contract for this release. Do not run multiple writers against the same `deployer` namespace and assume automatic leader failover is safe.
- `nix-agent` is supported for host-local desired-system apply, post-activation health-check execution, and rollback to the previous known system.
- `fleet-scheduler` is scope-fixed to the two native-runtime worker lab with one planned drain cycle, one fail-stop worker-loss cycle, and 30-second held degraded states in `rollout-soak`; multi-hour maintenance windows, pinned singleton policies, and large-cluster drain storms are outside the supported product contract for this release.
- `node-agent` is supported for host-local runtime reconcile, process and container execution, per-instance logs, and declared host-path volume mounts. It is not a secret manager, a storage provisioner, or an in-place binary patch system.
## Proof Commands
- `nix build .#checks.x86_64-linux.deployer-vm-smoke`
- `nix build .#checks.x86_64-linux.deployer-vm-rollback`
- `nix build .#checks.x86_64-linux.portable-control-plane-regressions`
- `nix build .#checks.x86_64-linux.fleet-scheduler-e2e`
- `nix run ./nix/test-cluster#cluster -- fresh-smoke`
- `nix run ./nix/test-cluster#cluster -- rollout-soak`
- `nix run ./nix/test-cluster#cluster -- durability-proof`
`deployer-vm-rollback` is the smallest reproducible proof for the `nix-agent` health-check and rollback path. `fresh-smoke` and `fleet-scheduler-e2e` keep the short regression semantics green. `rollout-soak` is the longer-running KVM operator lane for one planned drain cycle, one fail-stop worker-loss cycle, and service-restart behavior across `deployer`, `fleet-scheduler`, `node-agent`, and the fixed-membership control plane. It writes `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `fleet-scheduler-scope-fixed.txt` so the release boundary is captured in the proof root instead of being implied only by docs. The steady-state `nix/test-cluster` nodes record explicit `nix-agent` scope markers instead of pretending they run `nix-agent.service`. `durability-proof` remains the canonical persisted artifact lane for `deployer` backup, restart, replay, and storage-side failure injection.
## Deployer HA And DR
Supported deployer recovery is a single-writer restore runbook. `DEPLOYER-P1-01` is closed as a scope-fixed release boundary rather than an implied future HA promise:
1. Preserve the generated cluster state from `ultracloud.cluster`, the deployer bootstrap and admin credentials, and `services.deployer.localStatePath`.
2. Start exactly one `deployer` instance with the same `chainfireEndpoints`, `clusterNamespace`, `chainfireNamespace`, tokens, and optional TLS CA inputs.
3. Re-apply the canonical cluster state:
```bash
deployer-ctl \
--chainfire-endpoint http://127.0.0.1:2379 \
--cluster-id <cluster-id> \
--cluster-namespace ultracloud \
--deployer-namespace deployer \
apply --config cluster-state.json --prune
```
4. Replay any preserved admin pre-register requests in the same shape as `./work/durability-proof/latest/deployer-pre-register-request.json`.
5. Verify the recovered state with `curl -fsS -H 'x-deployer-token: <token>' http://<deployer>:8088/api/v1/admin/nodes | jq` and, for node rollout intent, `deployer-ctl node inspect --node-id <node> --include-desired-system --include-observed-system`.
The 2026-04-10 canonical backup-and-replay proof for this contract is `nix run ./nix/test-cluster#cluster -- durability-proof`, which recorded `deployer-pre-register-request.json`, `deployer-backup-list.json`, `deployer-post-restart-list.json`, and `deployer-replayed-list.json` under `./work/durability-proof/20260410T120618+0900`. The longer-run live-operations companion is `nix run ./nix/test-cluster#cluster -- rollout-soak`, which on 2026-04-10 recorded `deployer-post-restart-nodes.json`, `maintenance-held.json`, `power-loss-held.json`, `post-control-plane-restarts.json`, `scope-fixed-contract.json`, and `deployer-scope-fixed.txt` under `./work/rollout-soak/20260410T164549+0900` while holding degraded states and re-checking the admin inventory.
## Nix-Agent Operator Contract
- `services.nix-agent.healthCheckCommand` is an argv vector, not a shell fragment. Every entry is passed to the process directly.
- The command runs after `switch-to-configuration`.
- Exit status `0` means the desired system stays active.
- Non-zero exit with `rollbackOnFailure = true` causes rollback to the previous known system and reports observed status `rolled-back`.
- Non-zero exit with `rollbackOnFailure = false` leaves the failed generation in place and requires operator intervention.
The supported recovery flow is:
1. Inspect the desired and observed rollout state:
```bash
deployer-ctl \
--chainfire-endpoint http://127.0.0.1:2379 \
--cluster-id <cluster-id> \
--cluster-namespace ultracloud \
--deployer-namespace deployer \
node inspect \
--node-id <node-id> \
--include-desired-system \
--include-observed-system
```
2. If the node reports `rolled-back`, fix the failed target or health-check input, then re-publish the desired system.
3. Re-run the smallest proof lane with `nix build .#checks.x86_64-linux.deployer-vm-rollback` when the issue is in the `deployer -> nix-agent` boundary, or the installer-backed `baremetal-iso` and `baremetal-iso-e2e` lanes when the issue includes first boot.
`deployer-vm-rollback` is the canonical reproducible proof for this contract. It publishes a desired system whose `health_check_command = ["false"]`, expects observed status `rolled-back`, and proves that the current system does not remain on the rejected target generation. The longer-running 2026-04-10 `rollout-soak` lane does not pretend the steady-state `nix/test-cluster` nodes are deployer-managed `nix-agent` nodes; instead it records `node01-nix-agent-scope.txt` and `node04-nix-agent-scope.txt` under `./work/rollout-soak/20260410T154744+0900`, while the executable `nix-agent` proof surface remains `deployer-vm-rollback`, `baremetal-iso`, and `baremetal-iso-e2e`.
## Fleet-Scheduler Drain And Maintenance Contract
- Use `deployer-ctl node set-state --node-id <node> --state draining` for planned short-lived maintenance.
- `draining` removes the node from new placement and causes the scheduler to relocate replicated work when capacity exists.
- `active` re-admits the node and allows replica count to grow back, but healthy singleton work is not required to churn back automatically.
- Fail-stop worker loss is treated like implicit maintenance exhaustion: the scheduler restores healthy placement on the remaining eligible nodes when placement policy allows it.
- The supported release proof is limited to the two native-runtime worker lab with one planned drain cycle and one fail-stop worker-loss cycle, each held for 30 seconds in `rollout-soak`.
- Multi-hour maintenance windows, operator approval workflows, pinned singleton drain choreography, and large-cluster drain storms remain outside the supported contract for this release.
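A minimal drain-and-re-admit cycle, reusing the `deployer-ctl` flags from the recovery runbook above; the cluster ID and node name are placeholders for your lab.

```bash
# Planned short-lived maintenance: drain, do the work, then re-admit the node.
deployer-ctl --chainfire-endpoint http://127.0.0.1:2379 \
  --cluster-id <cluster-id> --cluster-namespace ultracloud --deployer-namespace deployer \
  node set-state --node-id node04 --state draining
# ...maintenance window...
deployer-ctl --chainfire-endpoint http://127.0.0.1:2379 \
  --cluster-id <cluster-id> --cluster-namespace ultracloud --deployer-namespace deployer \
  node set-state --node-id node04 --state active
```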
`fresh-smoke` is the canonical KVM proof for the baseline behavior. It drains `node04`, verifies that `native-web`, `native-container`, and `native-daemon` relocate to `node05`, restores `node04`, then simulates `node05` loss and verifies failover back to `node04` plus replica restoration when `node05` returns. `rollout-soak` reruns that choreography as exactly one planned drain cycle and one fail-stop worker-loss cycle, holds each degraded state for 30 seconds, restarts the rollout services, and then rechecks the live runtime state; the 2026-04-10 run under `./work/rollout-soak/20260410T164549+0900` is the current release-grade artifact for that scope-fixed boundary. `fleet-scheduler-e2e` remains the cheap regression lane for the same scheduling semantics.
## Node-Agent Logs, Secrets, Volumes, And Upgrade Contract
- Runtime state lives under `services.node-agent.stateDir`, with pid files, metadata, and per-instance logs under `${stateDir}/pids`.
- Each managed instance writes combined stdout and stderr to `${stateDir}/pids/<service>-<instance>.pid.log`.
- Metadata is persisted beside the pid file as `${stateDir}/pids/<service>-<instance>.pid.meta.json`, including argv and boot-id data used to reject stale pid reuse across reboot.
- Secrets are not fetched, rotated, or encrypted by `node-agent`. Supported secret delivery is limited to values already present in the rendered service spec, environment, or mounted host files.
- Volumes are declared host-path mounts from `ContainerVolumeSpec`. `node-agent` passes them through to the runtime and honors `read_only`, but it does not provision or garbage-collect those paths.
- Upgrades are replace-and-reconcile operations driven by `fleet-scheduler` state changes. `node-agent` does not patch binaries in place; it stops stale processes or containers and starts new ones from the updated spec.
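A minimal inspection sketch; the state directory value is a placeholder for whatever `services.node-agent.stateDir` is set to on the node, and the service and instance names mirror the lab examples above.

```bash
STATE_DIR=/var/lib/node-agent   # placeholder: use the configured services.node-agent.stateDir
tail -n 50 "$STATE_DIR/pids/native-web-<instance-id>.pid.log"
jq . "$STATE_DIR/pids/native-web-<instance-id>.pid.meta.json"   # argv and boot-id used for stale-pid rejection
```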
`fresh-smoke`, `fresh-matrix`, `fleet-scheduler-e2e`, and `rollout-soak` are the operator proofs for the live runtime path, while the persisted process metadata in `deployer/crates/node-agent/src/process.rs` is the source of truth for the log and stale-pid contract. `rollout-soak` restarts `node-agent.service` on live worker nodes and records the longer-running restart survival artifacts under `./work/rollout-soak/20260410T164549+0900`; `nix-agent` stays scope-fixed to its dedicated deployer and installer proofs because the steady-state KVM cluster nodes do not run `nix-agent.service`.

View file

@ -1,37 +1,247 @@
# Testing
UltraCloud treats VM-first validation as the canonical local proof path.
UltraCloud treats VM-first validation as the canonical local proof path and keeps the public support contract limited to three profiles.
## Canonical Validation
## Canonical Profiles
| Profile | Canonical entrypoints | Required components | Optional components |
| --- | --- | --- | --- |
| `single-node dev` | `nix run .#single-node-quickstart`, `nix run .#single-node-trial`, `nix build .#single-node-trial-vm`, `nixosConfigurations.single-node-quickstart`, companion install image `nixosConfigurations.netboot-all-in-one` | `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` | `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost` |
| `3-node HA control plane` | `nixosConfigurations.node01`, `nixosConfigurations.node02`, `nixosConfigurations.node03`, companion install image `nixosConfigurations.netboot-control-plane` | `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node | `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` |
| `bare-metal bootstrap` | `nix run ./nix/test-cluster#cluster -- baremetal-iso`, `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` | `deployer`, `first-boot-automation`, `install-target`, `nix-agent` | `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after bootstrap |
`nixosConfigurations.netboot-all-in-one` and `nixosConfigurations.netboot-control-plane` are canonical companion images for the single-node and HA profiles. `nixosConfigurations.netboot-worker` is an archived worker helper outside the canonical profiles and their guard set, and `baremetal/vm-cluster` remains a `legacy/manual` debugging path rather than a publishable entrypoint.
## Cluster Authoring Source
`ultracloud.cluster` backed by `nix/lib/cluster-schema.nix` is the only supported cluster authoring source. The supported rollout and scheduling tests consume cluster state generated from that module rather than treating `nix-nos` or ad hoc shell state as a primary source.
`nix-nos` is limited to legacy compatibility and low-level network primitives such as interfaces, VLANs, BGP, and static routing.
## Quickstart Smoke
```bash
nix run ./nix/test-cluster#cluster -- fresh-smoke
nix flake show . --all-systems | rg -n "quickstart|single-node|trial|container|oci"
nix build .#single-node-trial-vm
nix eval --no-eval-cache .#nixosConfigurations.single-node-quickstart.config.system.build.toplevel.drvPath --raw
nix run .#single-node-quickstart
```
This flow:
`single-node-trial-vm` is the buildable trial artifact for the minimal VM-platform core, and `single-node-quickstart` is the automated smoke launcher for that same surface. The launcher boots the minimal VM stack under QEMU, waits for `chainfire`, `flaredb`, `iam`, `prismnet`, and `plasmavmc`, verifies their health from inside the guest, and checks the machine-readable product-surface manifest shipped in the VM. The launcher uses the generated NixOS VM runner, so it can fall back to TCG when `/dev/kvm` is absent.
- builds all six VM images on the host
- boots the cluster in dependency order
- validates control-plane, worker, gateway, storage, and fault-injection behavior
- proves that `deployer` seeds scheduler-managed native services directly from declarative Nix cluster state
`single-node-trial` is a public alias for the same smoke launcher. An OCI/Docker artifact is intentionally not the public trial surface because the supported scope needs a guest kernel plus host KVM, `/dev/net/tun`, and OVS/libvirt semantics; a privileged container would not represent the same contract.
For debugging, keep the VM alive after the smoke passes:
```bash
ULTRACLOUD_QUICKSTART_KEEP_VM=1 nix run .#single-node-quickstart
```
## 3-Node HA Control Plane
```bash
nix eval --no-eval-cache .#nixosConfigurations.node01.config.system.build.toplevel.drvPath --raw
nix eval --no-eval-cache .#nixosConfigurations.node02.config.system.build.toplevel.drvPath --raw
nix eval --no-eval-cache .#nixosConfigurations.node03.config.system.build.toplevel.drvPath --raw
nix eval --no-eval-cache .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel.drvPath --raw
```
These are the canonical HA control-plane entrypoints. The publishable six-node VM-cluster suite under `./nix/test-cluster` extends this baseline with worker and optional service nodes, but it does not redefine the supported profile names.
## Canonical Bare-Metal Proof
```bash
nix eval --no-eval-cache .#nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel.drvPath --raw
nix eval --no-eval-cache .#nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel.drvPath --raw
nix run ./nix/test-cluster#cluster -- baremetal-iso
nix build .#checks.x86_64-linux.baremetal-iso-e2e
./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/latest
nix run ./nix/test-cluster#hardware-smoke -- preflight
```
`baremetal-iso` is the canonical install path for QEMU-as-bare-metal validation. It boots `nixosConfigurations.ultracloud-iso`, waits for `/api/v1/phone-home`, downloads the flake bundle from `deployer`, runs Disko, reboots, confirms the first post-install boot markers, and waits for `nix-agent` to report the desired system as `active` for both `baremetal-qemu-control-plane` and `baremetal-qemu-worker`.
`baremetal-iso-e2e` now keeps the exact flake attr but changes the execution model: `nix build .#checks.x86_64-linux.baremetal-iso-e2e` materializes `./result/bin/baremetal-iso-e2e`, and that built runner executes the same `nix/test-cluster/verify-baremetal-iso.sh` harness with host KVM and logs under `./work` by default. This avoids the old daemon-sandbox path where a `nixbld` build fell back to `TCG` instead of the host's `/dev/kvm`.
The local proof intentionally mirrors the real hardware route. Build `nixosConfigurations.ultracloud-iso`, then either boot that ISO in QEMU with KVM or put the same image on USB or BMC virtual media for the target machine. The live installer consumes the same bootstrap parameters in every environment:
- `ultracloud.deployer_url=<scheme://host:port>` for the reachable `deployer` endpoint
- `ultracloud.bootstrap_token=<token>` for authenticated phone-home, or a lab-only `deployer` with `allow_unauthenticated=true`
- `ultracloud.ca_cert_url=<https://.../ca.crt>` when `deployer` is TLS-enabled with a private CA
- `ultracloud.binary_cache_url=<http://cache:8090>` when you want the installer to fetch host-built closures instead of compiling locally
- `ultracloud.node_id=` and `ultracloud.hostname=` only when you need to override the DMI-serial or hostname-derived identity
The networking assumptions are also the same. The ISO needs DHCP or equivalent IP configuration that can reach `deployer` before Disko starts, and it must also reach the optional binary cache when that URL is set. The QEMU harness uses user-mode NAT and the built-in `10.0.2.2` fallback endpoints for the local host; physical installs should set the deployer and cache URLs explicitly to routable control-plane addresses.
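A quick way to confirm what the live installer actually received is to read its kernel command line from inside the booted ISO; the example values below are placeholders for a routable lab, not shipped defaults:
```bash
# Run inside the booted live installer: list every ultracloud.* bootstrap parameter.
tr ' ' '\n' < /proc/cmdline | grep '^ultracloud\.'

# Expected shape of the output (placeholder values):
#   ultracloud.deployer_url=https://deployer.lab.example:8443
#   ultracloud.bootstrap_token=REPLACE_WITH_TOKEN
#   ultracloud.binary_cache_url=http://cache.lab.example:8090
```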
The proven marker sequence from `nix/test-cluster/verify-baremetal-iso.sh` is the same sequence you should expect on hardware: `pre-install.boot`, `pre-install.phone-home.complete`, `install.bundle-downloaded`, `install.disko.complete`, `install.nixos-install.complete`, `reboot`, `post-install.boot`, and finally `nix-agent` reporting the desired system as `active`. USB and BMC virtual media change only how the ISO is presented to the machine; they do not change the bootstrap contract.
## Hardware Bring-Up Pack
```bash
nix run ./nix/test-cluster#hardware-smoke -- preflight
nix run ./nix/test-cluster#hardware-smoke -- run
nix run ./nix/test-cluster#hardware-smoke -- capture
```
`hardware-smoke` is the canonical USB/BMC/Redfish bridge for the physical-node proof. It always writes artifacts under `./work/hardware-smoke/<run-id>` and refreshes `./work/hardware-smoke/latest`.
- `preflight` emits `kernel-params.txt`, `expected-markers.txt`, `failure-markers.txt`, `operator-handoff.md`, and `status.env`.
- With no USB device or BMC/Redfish credentials, `preflight` records `status=blocked` and the exact missing transport inputs in `missing-requirements.txt`.
- With transport present, the same wrapper can write USB media or call Redfish virtual media and then capture the real `desired-system active` evidence through SSH or a supplied serial log.
- The expected hardware markers are the same `ULTRACLOUD_MARKER pre-install.boot.*`, `pre-install.phone-home.complete.*`, `install.disko.complete.*`, `reboot.*`, `post-install.boot.*`, and `desired-system-active.*` lines used by `verify-baremetal-iso.sh`.
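A minimal sketch for checking a captured console or serial log against that marker set; the log path is illustrative rather than a fixed harness output name:
```bash
# Point this at whatever console capture you hand to `hardware-smoke capture`.
SERIAL_LOG=./work/hardware-smoke/latest/console.log

# All six hardware markers should appear, in order, for a successful bring-up.
grep -E 'ULTRACLOUD_MARKER (pre-install\.boot|pre-install\.phone-home\.complete|install\.disko\.complete|reboot|post-install\.boot|desired-system-active)' \
  "$SERIAL_LOG"
```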
Hardware runbook for the same canonical path:
1. Build `nixosConfigurations.ultracloud-iso` and the target install profiles you want the installer to materialize.
2. Publish cluster state where each reusable node class owns `install_plan.nixos_configuration`, `install_plan.disko_config_path`, and a stable disk selector. Prefer `install_plan.target_disk_by_id` on hardware; the QEMU proof now uses `/dev/disk/by-id/virtio-uc-control-root` and `/dev/disk/by-id/virtio-uc-worker-root` to exercise the same contract. When the live ISO can reach a binary cache, also publish `desired_system.target_system` with the prebuilt closure for that class so `nix-agent` converges to the exact shipped system instead of rebuilding a dirty local copy.
3. Make `deployer` and the optional binary cache reachable from the live ISO, then boot the ISO through USB or BMC virtual media with `ultracloud.deployer_url=...`, `ultracloud.bootstrap_token=...`, and optional `ultracloud.binary_cache_url=...`.
4. Confirm the live installer resolves the install profile, downloads the flake bundle, runs Disko against the selected disk, reboots, and lands on the post-install marker.
5. Confirm `nix-agent` on the installed node converges the desired system to `active`.
QEMU-to-hardware mapping for the proof:
| QEMU harness proof | Hardware proof |
| --- | --- |
| `nix run ./nix/test-cluster#cluster -- baremetal-iso` | boot the same `nixosConfigurations.ultracloud-iso` through USB or BMC virtual media |
| user-mode NAT fallback to `10.0.2.2` | routable `ultracloud.deployer_url` and optional `ultracloud.binary_cache_url` |
| virtio disk by-id selectors seeded by explicit QEMU serials | server, NVMe, or RAID-controller `/dev/disk/by-id/...` selectors in the node class |
| host-local QEMU logs and SSH on `127.0.0.1:22231/22232` | serial-over-LAN, BMC console, or physical console plus SSH on the installed host |
| same marker sequence and `nix-agent` active gate | same marker sequence and `nix-agent` active gate |
Host prerequisites for the KVM-backed proof are a Linux host with readable and writable `/dev/kvm`, nested virtualization enabled, and enough free space under `./work` or `ULTRACLOUD_WORK_ROOT` for VM disks, logs, and temporary build state. The checked-in wrappers force local Nix builders and derive `max-jobs` and per-build cores from the host CPU count unless `ULTRACLOUD_LOCAL_NIX_MAX_JOBS`, `ULTRACLOUD_LOCAL_NIX_BUILD_CORES`, `PHOTON_CLUSTER_NIX_MAX_JOBS`, or `PHOTON_CLUSTER_NIX_BUILD_CORES` override them.
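A hedged pre-run sketch that checks those prerequisites and pins the builder knobs explicitly; the numeric values are examples, not recommended settings:
```bash
# The fresh-* suite needs a readable and writable /dev/kvm with nested virtualization.
[ -r /dev/kvm ] && [ -w /dev/kvm ] && echo "kvm: ok" || echo "kvm: missing or inaccessible"
cat /sys/module/kvm_amd/parameters/nested 2>/dev/null \
  || cat /sys/module/kvm_intel/parameters/nested 2>/dev/null

# Optionally override the derived local-builder limits before invoking a wrapper.
export ULTRACLOUD_LOCAL_NIX_MAX_JOBS=4
export ULTRACLOUD_LOCAL_NIX_BUILD_CORES=8
./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
```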
## Regression Guards
```bash
nix build .#checks.x86_64-linux.canonical-profile-eval-guards
nix build .#checks.x86_64-linux.canonical-profile-build-guards
```
These checks are the fast fail-first drift gates for the supported surface:
- `canonical-profile-eval-guards`: forces evaluation of every canonical profile entrypoint, so broken attrs fail before any long-running harness work starts.
- `canonical-profile-build-guards`: realizes the single-node VM, the HA control-plane configs and companion image, and the ISO or bare-metal outputs so build-time drift is caught even when a cluster harness is not running.
- `supported-surface-guard`: rejects unfinished public-surface wording across the published docs, add-on workspaces, and VM-cluster harness files. It fails on shipped public server code that still contains `Status::unimplemented`, `unimplemented!()`, `todo!()`, or other intentional stub responses, and it blocks high-signal completeness markers such as `TODO:`, `FIXME`, or `best-effort` in the supported FiberLB, PrismNet, PlasmaVMC, and K8sHost server code paths. It also fails if archived helpers such as `netboot-worker`, `plasmavmc-firecracker`, `k8shost-cni`, `k8shost-csi`, or `k8shost-controllers` re-enter the default product surface.
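The authoritative implementation is the Nix-built check itself; as a rough local approximation of its code-level scan, the following is a sketch, with the crate directory names being assumptions about the workspace layout rather than the guard's fixed list:
```bash
# Stub responses that must not ship in supported public server code.
rg -n 'Status::unimplemented|unimplemented!\(\)|todo!\(\)' fiberlb prismnet plasmavmc k8shost

# High-signal completeness markers blocked in the supported server code paths.
rg -n 'TODO:|FIXME|best-effort' fiberlb prismnet plasmavmc k8shost
```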
## Portable Local Proof
```bash
nix build .#checks.x86_64-linux.canonical-profile-eval-guards
nix build .#checks.x86_64-linux.portable-control-plane-regressions
```
Use this lane on Linux hosts that do not expose `/dev/kvm`:
- `portable-control-plane-regressions`: TCG-safe aggregate check that keeps the canonical profile eval guard, `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `deployer-vm-smoke`, and `fleet-scheduler-e2e` green together.
- It also links in `supported-surface-guard`, so unsupported product-surface wording, code-level public API stubs, or high-signal completeness markers in the supported provider/backend servers fail in the same low-cost lane before a publishable rerun.
- It intentionally does not boot the six-node nested-KVM VM suite, so it is a developer regression path, not the publishable multi-node proof.
- CI runs `canonical-profile-eval-guards` and `portable-control-plane-regressions` on every relevant change from `.github/workflows/nix.yml`.
## Publishable Checks
```bash
nix run .#single-node-quickstart
nix run ./nix/test-cluster#cluster -- baremetal-iso
nix run ./nix/test-cluster#cluster -- fresh-smoke
nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp
nix run ./nix/test-cluster#cluster -- fresh-matrix
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
nix run ./nix/test-cluster#cluster -- provider-vm-reality-proof
nix run ./nix/test-cluster#cluster -- rollout-soak
./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
./nix/test-cluster/run-supported-surface-final-proof.sh ./work/final-proofs/latest
nix build .#checks.x86_64-linux.baremetal-iso-e2e
nix build .#checks.x86_64-linux.baremetal-iso-e2e && ./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/latest
nix build .#checks.x86_64-linux.deployer-vm-smoke
```
Use these commands as the release-facing local proof set:
- `fresh-smoke`: whole-cluster readiness, core behavior, and fault injection
- `fresh-demo-vm-webapp`: focused VM demo showing a web app inside the guest with FlareDB-backed state and LightningStor object snapshots surviving restart and migration
- `fresh-matrix`: composed service scenarios such as `prismnet + flashdns + fiberlb` and PrismNet-backed VM hosting bundles with `plasmavmc + coronafs + lightningstor`
- `fresh-bench-storage`: CoronaFS local-vs-shared-volume throughput, cross-worker volume visibility, and LightningStor large/small-object throughput capture
- `deployer-vm-smoke`: prebuilt NixOS system closure handoff into `nix-agent`, proving host rollout can activate a host-built target without guest-side compilation
- `single-node-quickstart`: productized one-command quickstart gate for the minimal VM platform profile
- `single-node-trial-vm`: buildable VM appliance for the same minimal VM-platform profile
- `baremetal-iso`: canonical bare-metal bootstrap gate covering pre-install boot, phone-home, flake bundle fetch, Disko install, reboot, post-install boot, and desired-system activation on one control-plane node plus one worker-equivalent node
- `fresh-smoke`: base VM-cluster gate for the six-node harness that extends the canonical `3-node HA control plane`, including readiness, core behavior, and fault injection
- `fresh-smoke` also proves the supported PlasmaVMC backend contract by requiring both worker registrations to advertise `HYPERVISOR_TYPE_KVM` and nothing broader on the public surface
- `fresh-demo-vm-webapp`: optional VM-hosting bundle proof for `plasmavmc + prismnet` with state persisted through `lightningstor`
- `fresh-matrix`: optional composition proof for provider bundles such as `prismnet + flashdns + fiberlb` and `plasmavmc + coronafs + lightningstor`, including PrismNet security-group ACL add/remove, FiberLB TCP plus TLS-terminated `Https` / `TerminatedHttps` listeners, LightningStor bucket metadata plus object-version APIs, the published `k8shost` pod-watch surface, and the KVM-only PlasmaVMC worker contract
- `provider-vm-reality-proof`: focused local-KVM provider and VM-hosting lane that writes dated artifacts under `./work/provider-vm-reality-proof/latest`, captures authoritative FlashDNS answers, FiberLB backend drain and re-convergence, and PlasmaVMC KVM shared-storage migration plus post-migration restart state
- `rollout-soak`: focused longer-run control-plane and rollout lane that rebuilds from clean local runtime state, writes dated artifacts under `./work/rollout-soak/latest`, repeats `draining` maintenance and worker power-loss, then restarts `deployer`, `fleet-scheduler`, `node-agent`, `chainfire`, and `flaredb` while recording explicit `nix-agent` scope markers for the steady-state KVM nodes
- `durability-proof`: canonical `chainfire`, `flaredb`, and `deployer` backup/restore lane. It stores artifacts under `./work/durability-proof/latest`, proves logical backup and restore for ChainFire keys and FlareDB SQL rows, uses the canonical Deployer admin pre-register request itself as the backup artifact, verifies that the pre-registered node survives a `deployer.service` restart, replays the same request idempotently, and injects CoronaFS plus LightningStor failures on the live KVM cluster
- `run-publishable-kvm-suite.sh`: reproducible wrapper that captures the KVM environment, requires real `/dev/kvm` access, keeps runtime state under `./work` by default, and runs the full publishable nested-KVM trio in a single command
- `run-supported-surface-final-proof.sh`: one-shot local wrapper that keeps builders local, records environment metadata, builds `single-node-trial-vm`, runs `supported-surface-guard`, `single-node-quickstart`, and then the publishable nested-KVM suite into one dated log root
- `baremetal-iso-e2e`: materialized exact proof runner for the same canonical ISO harness; the build output keeps the attr stable, and `./result/bin/baremetal-iso-e2e` runs the real host-KVM proof with persisted logs and metadata
- `deployer-vm-smoke`: lightweight regression proving that `nix-agent` can activate a host-built target closure without guest-side compilation
- `deployer-vm-rollback`: smallest reproducible `nix-agent` rollback proof. It publishes a desired system with a failing `health_check_command`, expects observed status `rolled-back`, and confirms the node does not stay on the rejected target generation
`single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story. They keep the minimal KVM-backed surface separate from the rollout stack.
The checked-in entrypoint for the publishable KVM proof is the local wrapper `./nix/test-cluster/run-publishable-kvm-suite.sh`. Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally excluded from this baseline branch.
The 2026-04-10 local AMD/KVM proof snapshot is recorded in three roots: `./work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final` for `supported-surface-guard`, `single-node-trial-vm`, and `single-node-quickstart`; `./work/publishable-kvm-suite` for the passing `fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`, and wrapper environment capture; and `./work/rollout-soak/20260410T164549+0900` for the longer-running rollout/control-plane soak.
The 2026-04-10 exact bare-metal check-runner proof is recorded under `./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; its outer `environment.txt` records `execution_model=materialized-check-runner`, while `state/environment.txt` records `vm_accelerator_mode=kvm`.
## Responsibility Coverage
- `baremetal-iso` and `baremetal-iso-e2e` are the canonical proof for `deployer -> installer -> nix-agent`. They cover phone-home, install-plan materialization, Disko, reboot, and desired-system activation, and they now share the same `verify-baremetal-iso.sh` runtime harness.
- `deployer-vm-smoke` is the smallest regression for the same `deployer -> nix-agent` boundary. It proves that a node can receive a prebuilt target closure and activate it without guest-side compilation.
- `deployer-vm-rollback` is the canonical operator proof for `nix-agent` health-check, rollback, and partial failure recovery. Use it with [rollout-bundle.md](rollout-bundle.md) when documenting or changing the host-local rollback contract.
- `portable-control-plane-regressions` keeps the main non-KVM-safe boundaries under continuous coverage by composing `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `deployer-vm-smoke`, and `fleet-scheduler-e2e` behind the canonical profile eval guard.
- `fresh-smoke` and `fresh-matrix` are the canonical proof for `deployer -> fleet-scheduler -> node-agent`. They cover native service placement, heartbeats, failover, and runtime reconciliation.
- `fresh-smoke` proves the supported `fleet-scheduler` maintenance semantics: short-lived `active -> draining -> active` transitions, fail-stop worker loss, and replica restoration after the node returns.
- `rollout-soak` is the longer-running companion lane for the same bundle. It validates exactly one planned drain cycle and one fail-stop worker-loss cycle on the two native-runtime workers, holds each degraded state for 30 seconds, restarts `deployer`, `fleet-scheduler`, `node-agent`, `chainfire`, and `flaredb`, and then revalidates the live cluster. It also writes `scope-fixed-contract.json`, `deployer-scope-fixed.txt`, and `fleet-scheduler-scope-fixed.txt` so the supported release boundary is captured in the proof root. The steady-state KVM nodes do not ship `nix-agent.service`, so the lane records scope markers there and leaves executable `nix-agent` proof to `deployer-vm-rollback`, `baremetal-iso`, and `baremetal-iso-e2e`.
- Multi-hour maintenance windows, pinned singleton relocation rules, dynamic ChainFire membership changes, destructive FlareDB schema rewrites, fully automated online migration, and large-cluster drain storms remain outside the release-proven scope and are called out explicitly in [rollout-bundle.md](rollout-bundle.md) and [control-plane-ops.md](control-plane-ops.md).
- `fresh-smoke` also covers `k8shost` separately from `fleet-scheduler`: `k8shost` exposes tenant pod and service semantics, while `fleet-scheduler` handles bare-metal host services. `k8shost` is fixed as an API/control-plane product surface; runtime dataplane helpers stay archived non-product.
- `fresh-matrix` keeps the shipped add-on surface honest: it exercises the supported `creditservice` quota, wallet, reservation, and API-gateway flows, the published `k8shost-server` API contract, the supported LightningStor bucket metadata plus object-version APIs, and the network-provider bundle contract for PrismNet ACL lifecycle plus FiberLB TCP and TLS-terminated listeners.
- `provider-vm-reality-proof` is the artifact-producing companion lane for that same provider or VM-hosting bundle. It records PrismNet port and ACL state, authoritative FlashDNS answers, FiberLB listener drain or restore artifacts, and PlasmaVMC migration or storage-handoff state in one dated proof root.
- PrismNet real OVS/OVN dataplane validation remains outside the supported local KVM surface. The current provider proof keeps tenant API lifecycle and attached-VM networking honest, but not a release-grade `ovn-nbctl` or hardware-switch dataplane path.
- FiberLB native BGP or BFD peer interop and hardware VIP ownership remain outside the supported local KVM surface. The current provider proof fixes the shipped contract to listener publication plus backend drain and re-convergence inside the lab.
- PlasmaVMC real-hardware migration or storage handoff remains a later hardware proof. The current provider proof fixes the release surface to KVM shared-storage migration on the local worker pair.
- Within that edge bundle, APIGateway is supported as stateless replicated instances behind an external L4 or VIP layer, but the release-facing proof remains the shipped single gateway-node layout on `node06`; live in-process reload is not promised, and config rollout stays restart-based.
- NightLight is supported as a single-node WAL/snapshot service; replicated HA metrics storage and per-tenant retention enforcement are not part of the current product contract.
- CreditService export and backend migration are supported as offline export/import or backend-native snapshot workflows, not live mixed-writer migration.
- FiberLB HTTPS health checks currently do not verify backend TLS certificates. Supported scope is limited to TCP reachability plus HTTP status for the backend endpoint until CA-aware verification is wired through config, server code, and the canonical harness.
- `durability-proof` is the canonical backup, restore, and failure-injection companion lane for the publishable KVM suite. Use it after `fresh-matrix` when you need persisted artifacts for `chainfire`, `flaredb`, `deployer`, `coronafs`, and `lightningstor`.
- `rollout-soak` is the longer-running maintenance and DR companion lane for the same control-plane and rollout bundle. Use it when a change is supposed to survive the current release boundary of one planned drain cycle, one fail-stop worker-loss cycle, and service-restart churn on the live KVM lab instead of only the short `fresh-smoke` window.
- `run-core-control-plane-ops-proof.sh` is the focused operator lifecycle proof for the core control plane. It records the fixed-membership ChainFire boundary, the FlareDB additive-first migration and destructive-DDL boundary, and the standalone IAM bootstrap hardening plus signing-key, credential, and mTLS rotation proof under `./work/core-control-plane-ops-proof`.
- The supported `deployer` HA and DR boundary is scope-fixed to one active writer plus optional cold-standby restore, not automatic multi-instance failover. The canonical runbook is to recover one writer, re-apply `ultracloud.cluster` generated state with `deployer-ctl apply`, replay preserved admin pre-register requests, and then verify state through the admin API or `deployer-ctl node inspect`; the unsupported multi-instance boundary is fixed in [rollout-bundle.md](rollout-bundle.md).
- The supported `node-agent` product contract is also fixed in [rollout-bundle.md](rollout-bundle.md): per-instance logs and pid metadata live under `${stateDir}/pids`, secrets must already exist in the rendered spec or mounted host files, host-path volumes are passed through but not provisioned, and upgrades are replace-and-reconcile operations rather than in-place patching.
- The dated 2026-04-10 proof root for that lane is `./work/durability-proof/20260410T120618+0900`; `result.json` records `success=true`, and the artifact set includes `deployer-post-restart-list.json`, `coronafs-node04-local-state.json`, and `lightningstor-head-during-node05-outage.json`.
- `single-node-quickstart` intentionally excludes `deployer`, `nix-agent`, `node-agent`, and `fleet-scheduler`, so the smallest trial surface stays focused on the VM-platform core instead of mixing rollout and scheduling responsibilities.
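For the dated durability-proof root called out above, a minimal inspection sketch, assuming `result.json` exposes a top-level `success` field as the recorded run does:
```bash
PROOF_ROOT=./work/durability-proof/20260410T120618+0900

# The recorded 2026-04-10 run sets success=true.
jq '.success' "$PROOF_ROOT/result.json"

# Confirm the named restart and failure-injection artifacts are present.
ls "$PROOF_ROOT" | grep -E 'deployer-post-restart-list|coronafs-node04-local-state|lightningstor-head-during-node05-outage'
```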
The three `fresh-*` VM-cluster commands in that set (`fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix`) are the publishable nested-KVM suite. They require a Linux host with `/dev/kvm` and nested virtualization, and the harness stops at preflight by design when that device is absent. `single-node-quickstart` and `baremetal-iso` can still fall back to `TCG` for debugging, but the release-facing `baremetal-iso-e2e` runner now requires host KVM so the exact proof lane matches the shipped hardware proxy route. `deployer-vm-smoke` and `portable-control-plane-regressions` remain the supported non-KVM developer lanes.
Release-facing completion now requires both of these to be green on the same branch:
- the canonical bare-metal proof: `nix run ./nix/test-cluster#cluster -- baremetal-iso` plus `nix build .#checks.x86_64-linux.baremetal-iso-e2e` and `./result/bin/baremetal-iso-e2e`
- the publishable nested-KVM suite: `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix`, preferably through `./nix/test-cluster/run-publishable-kvm-suite.sh`
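Chained together on the candidate branch, that requirement is just the following sequence, identical to the entrypoints listed earlier:
```bash
# Canonical bare-metal proof.
nix run ./nix/test-cluster#cluster -- baremetal-iso
nix build .#checks.x86_64-linux.baremetal-iso-e2e && ./result/bin/baremetal-iso-e2e ./work/baremetal-iso-e2e/latest

# Publishable nested-KVM suite through the checked-in wrapper.
./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
```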
Focused operator lifecycle proof for the core control plane:
```bash
./nix/test-cluster/run-core-control-plane-ops-proof.sh ./work/core-control-plane-ops-proof/latest
```
This proof is lighter than the full KVM suite. It keeps `supported-surface-guard` honest for the control-plane contract, runs the standalone IAM signing-key rotation, credential rotation, and mTLS overlap rotation tests, and records the explicit ChainFire membership, FlareDB schema migration or destructive-DDL boundary, and IAM bootstrap hardening markers that the public docs now promise.
The dated 2026-04-10 artifact root for that lane is `./work/core-control-plane-ops-proof/20260410T172148+09:00`; it includes `iam-key-rotation-tests.log`, `iam-credential-rotation-tests.log`, `iam-mtls-rotation-tests.log`, `scope-fixed-contract.json`, and `result.json`.
## Work Root Budget
```bash
./nix/test-cluster/work-root-budget.sh status
./nix/test-cluster/work-root-budget.sh enforce
./nix/test-cluster/work-root-budget.sh cleanup-advice
./nix/test-cluster/work-root-budget.sh prune-proof-logs 2
```
Use `./nix/test-cluster/work-root-budget.sh status` for reporting, `./nix/test-cluster/work-root-budget.sh enforce` when a local proof run should fail on budget overrun, and `./nix/test-cluster/work-root-budget.sh prune-proof-logs 2` for a safer dated-proof cleanup dry-run.
The helper keeps the local proof path practical by reporting the current size of `./work`, `./work/test-cluster/state`, disposable runtime directories such as `./work/tmp` and `./work/publishable-kvm-runtime`, and the dated proof roots including `./work/provider-vm-reality-proof` and `./work/hardware-smoke`. The `enforce` mode turns those soft budgets into a non-zero local gate, and `prune-proof-logs` gives a safer dated-proof cleanup workflow before the final `nix store gc`.
## Extended Measurements
```bash
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
```
`fresh-bench-storage` remains useful for storage regression tracking, but it is a benchmark path, not part of the minimal canonical publish gate.
## Operational Commands
@ -53,8 +263,13 @@ nix run ./nix/test-cluster#cluster -- clean
- package unit tests are useful but not sufficient
- host-built VM clusters are the main integration signal
- bootstrap and rollout paths must stay evaluable independently of the larger VM-hosting feature set
- distributed storage and virtualization paths must be checked under failure, not only at steady state
## Legacy Note
## Legacy And Experimental Paths
Older manual launch scripts under `baremetal/vm-cluster` are archived only for historical reference. They are not the release-validation path.
- `baremetal/vm-cluster` manual launch scripts are `legacy/manual`, not canonical validation
- direct `nix develop ./nix/test-cluster -c ./nix/test-cluster/run-cluster.sh ...` usage is a debugging path, not the publishable entrypoint
- standalone use of `netboot-control-plane` or `netboot-all-in-one` outside the documented profiles is a debugging path, not a fourth supported profile
- `netboot-worker`, Firecracker, mvisor, `k8shost-cni`, `k8shost-controllers`, and `lightningstor-csi` are archived non-product helpers and should not be presented as canonical entrypoints
- `netboot-base`, `pxe-server`, `vm-smoke-target`, and other helper images are internal or legacy building blocks, not supported profiles by themselves

111
fiberlb/Cargo.lock generated
View file

@ -161,6 +161,45 @@ dependencies = [
"password-hash",
]
[[package]]
name = "asn1-rs"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56624a96882bb8c26d61312ae18cb45868e5a9992ea73c58e45c3101e56a1e60"
dependencies = [
"asn1-rs-derive",
"asn1-rs-impl",
"displaydoc",
"nom",
"num-traits",
"rusticata-macros",
"thiserror 2.0.18",
"time",
]
[[package]]
name = "asn1-rs-derive"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3109e49b1e4909e9db6515a30c633684d68cdeaa252f215214cb4fa1a5bfee2c"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "asn1-rs-impl"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "async-stream"
version = "0.3.6"
@ -638,6 +677,26 @@ dependencies = [
"parking_lot_core",
]
[[package]]
name = "data-encoding"
version = "2.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea"
[[package]]
name = "der-parser"
version = "10.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07da5016415d5a3c4dd39b11ed26f915f52fc4e0dc197d87908bc916e51bc1a6"
dependencies = [
"asn1-rs",
"displaydoc",
"nom",
"num-bigint",
"num-traits",
"rusticata-macros",
]
[[package]]
name = "deranged"
version = "0.5.8"
@ -783,6 +842,7 @@ dependencies = [
"tracing",
"tracing-subscriber",
"uuid",
"x509-parser",
]
[[package]]
@ -1783,6 +1843,12 @@ version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "minimal-lexical"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "mio"
version = "1.1.1"
@ -1800,6 +1866,16 @@ version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084"
[[package]]
name = "nom"
version = "7.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a"
dependencies = [
"memchr",
"minimal-lexical",
]
[[package]]
name = "nu-ansi-term"
version = "0.50.3"
@ -1853,6 +1929,15 @@ dependencies = [
"libc",
]
[[package]]
name = "oid-registry"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "12f40cff3dde1b6087cc5d5f5d4d65712f34016a03ed60e9c08dcc392736b5b7"
dependencies = [
"asn1-rs",
]
[[package]]
name = "once_cell"
version = "1.21.4"
@ -2422,6 +2507,15 @@ version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d"
[[package]]
name = "rusticata-macros"
version = "4.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632"
dependencies = [
"nom",
]
[[package]]
name = "rustix"
version = "1.1.4"
@ -3922,6 +4016,23 @@ version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "x509-parser"
version = "0.18.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d43b0f71ce057da06bc0851b23ee24f3f86190b07203dd8f567d0b706a185202"
dependencies = [
"asn1-rs",
"data-encoding",
"der-parser",
"lazy_static",
"nom",
"oid-registry",
"rusticata-macros",
"thiserror 2.0.18",
"time",
]
[[package]]
name = "yoke"
version = "0.8.1"

View file

@ -35,6 +35,7 @@ rustls = "0.23"
rustls-pemfile = "2.0"
tokio-rustls = "0.26"
axum-server = { version = "0.7", features = ["tls-rustls"] }
x509-parser = "0.18"
tracing = { workspace = true }
tracing-subscriber = { workspace = true }

View file

@ -574,8 +574,8 @@ message PrefixSID {
// tlv is one of:
message TLV {
oneof tlv {
// IndexLabelTLV Type 1 (not yet implemented)
// OriginatorSRGBTLV Type 3 (not yet implemented)
// Type 1 is reserved for IndexLabelTLV.
// Type 3 is reserved for OriginatorSRGBTLV.
SRv6L3ServiceTLV l3_service = 3;
SRv6L2ServiceTLV l2_service = 4;
}

View file

@ -1,11 +1,11 @@
//! L4 TCP Data Plane for FiberLB
//!
//! Handles TCP proxy functionality with round-robin backend selection.
//! Handles TCP proxy functionality with the published L4 balancing algorithms.
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::net::SocketAddr;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use tokio::net::{TcpListener, TcpStream};
@ -14,7 +14,10 @@ use tokio::task::JoinHandle;
use crate::maglev::MaglevTable;
use crate::metadata::LbMetadataStore;
use fiberlb_types::{Backend, BackendStatus, ListenerId, Listener, PoolId, PoolAlgorithm, BackendAdminState};
use fiberlb_types::{
Backend, BackendAdminState, BackendId, BackendStatus, Listener, ListenerId, PoolAlgorithm,
PoolId,
};
/// Result type for data plane operations
pub type Result<T> = std::result::Result<T, DataPlaneError>;
@ -56,6 +59,8 @@ pub struct DataPlane {
metadata: Arc<LbMetadataStore>,
listeners: Arc<RwLock<HashMap<ListenerId, ListenerHandle>>>,
pool_cache: Arc<RwLock<HashMap<PoolId, CachedPool>>>,
pool_counters: Arc<Mutex<HashMap<PoolId, usize>>>,
active_connections: Arc<Mutex<HashMap<BackendId, usize>>>,
}
impl DataPlane {
@ -67,6 +72,8 @@ impl DataPlane {
metadata,
listeners: Arc::new(RwLock::new(HashMap::new())),
pool_cache: Arc::new(RwLock::new(HashMap::new())),
pool_counters: Arc::new(Mutex::new(HashMap::new())),
active_connections: Arc::new(Mutex::new(HashMap::new())),
}
}
@ -105,6 +112,8 @@ impl DataPlane {
// Clone required state for the task
let metadata = self.metadata.clone();
let pool_cache = self.pool_cache.clone();
let pool_counters = self.pool_counters.clone();
let active_connections = self.active_connections.clone();
let listener_id_clone = listener_id;
// Spawn listener task
@ -117,6 +126,8 @@ impl DataPlane {
tracing::debug!("Accepted connection from {}", peer_addr);
let metadata = metadata.clone();
let pool_cache = pool_cache.clone();
let pool_counters = pool_counters.clone();
let active_connections = active_connections.clone();
let pool_id = pool_id;
// Spawn connection handler
@ -126,6 +137,8 @@ impl DataPlane {
peer_addr,
metadata,
pool_cache,
pool_counters,
active_connections,
pool_id,
).await {
tracing::debug!("Connection handler error: {}", e);
@ -205,18 +218,37 @@ impl DataPlane {
peer_addr: SocketAddr,
metadata: Arc<LbMetadataStore>,
pool_cache: Arc<RwLock<HashMap<PoolId, CachedPool>>>,
pool_counters: Arc<Mutex<HashMap<PoolId, usize>>>,
active_connections: Arc<Mutex<HashMap<BackendId, usize>>>,
pool_id: PoolId,
) -> Result<()> {
// Select a backend using client address for consistent hashing
let connection_key = peer_addr.to_string();
let backend = Self::select_backend(&metadata, &pool_cache, &pool_id, &connection_key, false).await?;
let connection_key = peer_addr.ip().to_string();
let backend = Self::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
&connection_key,
false,
)
.await?;
// Build backend address
let backend_stream = match Self::connect_backend(&backend).await {
Ok(stream) => stream,
let (backend, backend_stream) = match Self::connect_backend(&backend).await {
Ok(stream) => (backend, stream),
Err(error) => {
Self::invalidate_pool_cache(&pool_cache, &pool_id).await;
let fallback = Self::select_backend(&metadata, &pool_cache, &pool_id, &connection_key, true).await?;
let fallback = Self::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
&connection_key,
true,
)
.await?;
if fallback.id == backend.id {
return Err(error);
}
@ -225,10 +257,13 @@ impl DataPlane {
fallback_backend = %fallback.id,
"Retrying FiberLB backend connection after cache refresh"
);
Self::connect_backend(&fallback).await?
let fallback_stream = Self::connect_backend(&fallback).await?;
(fallback, fallback_stream)
}
};
let _active_guard = ActiveConnectionGuard::new(active_connections, backend.id);
// Proxy bidirectionally
Self::proxy_bidirectional(client, backend_stream).await
}
@ -249,6 +284,8 @@ impl DataPlane {
async fn select_backend(
metadata: &Arc<LbMetadataStore>,
pool_cache: &Arc<RwLock<HashMap<PoolId, CachedPool>>>,
pool_counters: &Arc<Mutex<HashMap<PoolId, usize>>>,
active_connections: &Arc<Mutex<HashMap<BackendId, usize>>>,
pool_id: &PoolId,
connection_key: &str,
force_refresh: bool,
@ -257,22 +294,29 @@ impl DataPlane {
let healthy = snapshot.healthy_backends;
// Select based on algorithm
match snapshot.algorithm {
let index = match snapshot.algorithm {
PoolAlgorithm::RoundRobin => Self::next_pool_counter(pool_counters, pool_id) % healthy.len(),
PoolAlgorithm::LeastConnections => {
Self::least_connections_index(active_connections, pool_counters, pool_id, &healthy)
}
PoolAlgorithm::IpHash => Self::stable_hash(connection_key) % healthy.len(),
PoolAlgorithm::WeightedRoundRobin => {
Self::weighted_round_robin_index(pool_counters, pool_id, &healthy)
}
PoolAlgorithm::Random => {
let offset = Self::next_pool_counter(pool_counters, pool_id);
Self::stable_hash(&(connection_key, offset)) % healthy.len()
}
PoolAlgorithm::Maglev => {
// Use Maglev consistent hashing
let table = MaglevTable::new(&healthy, None);
let idx = table.lookup(connection_key)
.ok_or(DataPlaneError::NoHealthyBackends)?;
Ok(healthy[idx].clone())
table
.lookup(connection_key)
.ok_or(DataPlaneError::NoHealthyBackends)?
}
_ => {
// Default: Round-robin for all other algorithms
// TODO: Implement LeastConnections, IpHash, WeightedRoundRobin, Random
static COUNTER: AtomicUsize = AtomicUsize::new(0);
let idx = COUNTER.fetch_add(1, Ordering::Relaxed) % healthy.len();
Ok(healthy.into_iter().nth(idx).unwrap())
}
}
};
Ok(healthy[index].clone())
}
async fn get_pool_snapshot(
@ -326,6 +370,80 @@ impl DataPlane {
Ok(snapshot)
}
fn stable_hash<T: Hash + ?Sized>(value: &T) -> usize {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
value.hash(&mut hasher);
hasher.finish() as usize
}
fn next_pool_counter(pool_counters: &Arc<Mutex<HashMap<PoolId, usize>>>, pool_id: &PoolId) -> usize {
let mut counters = pool_counters.lock().expect("pool counters poisoned");
let counter = counters.entry(*pool_id).or_insert(0);
let current = *counter;
*counter = counter.wrapping_add(1);
current
}
fn active_connection_count(
active_connections: &Arc<Mutex<HashMap<BackendId, usize>>>,
backend_id: &BackendId,
) -> usize {
let counts = active_connections
.lock()
.expect("active connection counters poisoned");
counts.get(backend_id).copied().unwrap_or(0)
}
fn least_connections_index(
active_connections: &Arc<Mutex<HashMap<BackendId, usize>>>,
pool_counters: &Arc<Mutex<HashMap<PoolId, usize>>>,
pool_id: &PoolId,
backends: &[Backend],
) -> usize {
let min_connections = backends
.iter()
.map(|backend| Self::active_connection_count(active_connections, &backend.id))
.min()
.unwrap_or(0);
let least_loaded = backends
.iter()
.enumerate()
.filter_map(|(index, backend)| {
let count = Self::active_connection_count(active_connections, &backend.id);
(count == min_connections).then_some(index)
})
.collect::<Vec<_>>();
let offset = Self::next_pool_counter(pool_counters, pool_id) % least_loaded.len();
least_loaded[offset]
}
fn weighted_round_robin_index(
pool_counters: &Arc<Mutex<HashMap<PoolId, usize>>>,
pool_id: &PoolId,
backends: &[Backend],
) -> usize {
let total_weight = backends
.iter()
.map(|backend| backend.weight.max(1) as usize)
.sum::<usize>();
if total_weight == 0 {
return 0;
}
let mut offset = Self::next_pool_counter(pool_counters, pool_id) % total_weight;
for (index, backend) in backends.iter().enumerate() {
let weight = backend.weight.max(1) as usize;
if offset < weight {
return index;
}
offset -= weight;
}
0
}
async fn invalidate_pool_cache(
pool_cache: &Arc<RwLock<HashMap<PoolId, CachedPool>>>,
pool_id: &PoolId,
@ -378,9 +496,67 @@ impl DataPlane {
}
}
struct ActiveConnectionGuard {
active_connections: Arc<Mutex<HashMap<BackendId, usize>>>,
backend_id: BackendId,
}
impl ActiveConnectionGuard {
fn new(
active_connections: Arc<Mutex<HashMap<BackendId, usize>>>,
backend_id: BackendId,
) -> Self {
let mut counts = active_connections
.lock()
.expect("active connection counters poisoned");
*counts.entry(backend_id).or_insert(0) += 1;
drop(counts);
Self {
active_connections,
backend_id,
}
}
}
impl Drop for ActiveConnectionGuard {
fn drop(&mut self) {
let mut counts = self
.active_connections
.lock()
.expect("active connection counters poisoned");
if let Some(count) = counts.get_mut(&self.backend_id) {
if *count > 1 {
*count -= 1;
} else {
counts.remove(&self.backend_id);
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use fiberlb_types::{LoadBalancerId, Pool, PoolProtocol};
async fn seed_pool(
metadata: &Arc<LbMetadataStore>,
algorithm: PoolAlgorithm,
backends: &[(String, u16, u32)],
) -> PoolId {
let pool = Pool::new("test-pool", LoadBalancerId::new(), algorithm, PoolProtocol::Tcp);
metadata.save_pool(&pool).await.unwrap();
for (index, (address, port, weight)) in backends.iter().enumerate() {
let mut backend = Backend::new(format!("backend-{index}"), pool.id, address.clone(), *port);
backend.weight = *weight;
backend.status = BackendStatus::Online;
metadata.save_backend(&backend).await.unwrap();
}
pool.id
}
#[tokio::test]
async fn test_dataplane_creation() {
@ -409,11 +585,15 @@ mod tests {
async fn test_backend_selection_empty() {
let metadata = Arc::new(LbMetadataStore::new_in_memory());
let pool_cache = Arc::new(RwLock::new(HashMap::new()));
let pool_counters = Arc::new(Mutex::new(HashMap::new()));
let active_connections = Arc::new(Mutex::new(HashMap::new()));
let pool_id = PoolId::new();
let result = DataPlane::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
"192.168.1.1:54321",
false,
@ -422,4 +602,113 @@ mod tests {
assert!(result.is_err());
// Expecting PoolNotFound since pool doesn't exist
}
#[tokio::test]
async fn test_weighted_round_robin_selection_respects_weights() {
let metadata = Arc::new(LbMetadataStore::new_in_memory());
let pool_cache = Arc::new(RwLock::new(HashMap::new()));
let pool_counters = Arc::new(Mutex::new(HashMap::new()));
let active_connections = Arc::new(Mutex::new(HashMap::new()));
let pool_id = seed_pool(
&metadata,
PoolAlgorithm::WeightedRoundRobin,
&[
("10.0.0.1".to_string(), 8080, 1),
("10.0.0.2".to_string(), 8080, 3),
],
)
.await;
let sequence = [
DataPlane::select_backend(&metadata, &pool_cache, &pool_counters, &active_connections, &pool_id, "client-a", false).await.unwrap().address,
DataPlane::select_backend(&metadata, &pool_cache, &pool_counters, &active_connections, &pool_id, "client-b", false).await.unwrap().address,
DataPlane::select_backend(&metadata, &pool_cache, &pool_counters, &active_connections, &pool_id, "client-c", false).await.unwrap().address,
DataPlane::select_backend(&metadata, &pool_cache, &pool_counters, &active_connections, &pool_id, "client-d", false).await.unwrap().address,
];
assert_eq!(sequence, ["10.0.0.1", "10.0.0.2", "10.0.0.2", "10.0.0.2"]);
}
#[tokio::test]
async fn test_least_connections_prefers_less_loaded_backend() {
let metadata = Arc::new(LbMetadataStore::new_in_memory());
let pool_cache = Arc::new(RwLock::new(HashMap::new()));
let pool_counters = Arc::new(Mutex::new(HashMap::new()));
let active_connections = Arc::new(Mutex::new(HashMap::new()));
let pool_id = seed_pool(
&metadata,
PoolAlgorithm::LeastConnections,
&[
("10.0.0.1".to_string(), 8080, 1),
("10.0.0.2".to_string(), 8080, 1),
],
)
.await;
let snapshot = DataPlane::get_pool_snapshot(&metadata, &pool_cache, &pool_id, false)
.await
.unwrap();
let loaded_backend = snapshot.healthy_backends[0].id;
active_connections
.lock()
.unwrap()
.insert(loaded_backend, 4);
let selected = DataPlane::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
"least-client",
false,
)
.await
.unwrap();
assert_eq!(selected.address, "10.0.0.2");
}
#[tokio::test]
async fn test_ip_hash_is_stable_for_same_source_ip() {
let metadata = Arc::new(LbMetadataStore::new_in_memory());
let pool_cache = Arc::new(RwLock::new(HashMap::new()));
let pool_counters = Arc::new(Mutex::new(HashMap::new()));
let active_connections = Arc::new(Mutex::new(HashMap::new()));
let pool_id = seed_pool(
&metadata,
PoolAlgorithm::IpHash,
&[
("10.0.0.1".to_string(), 8080, 1),
("10.0.0.2".to_string(), 8080, 1),
("10.0.0.3".to_string(), 8080, 1),
],
)
.await;
let first = DataPlane::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
"192.168.10.44",
false,
)
.await
.unwrap();
let second = DataPlane::select_backend(
&metadata,
&pool_cache,
&pool_counters,
&active_connections,
&pool_id,
"192.168.10.44",
false,
)
.await
.unwrap();
assert_eq!(first.id, second.id);
}
}

View file

@ -152,7 +152,10 @@ impl HealthChecker {
self.http_check(backend, path).await
}
HealthCheckType::Https => {
// For now, treat HTTPS same as HTTP (no TLS verification)
// HTTPS backends currently use the HTTP probe path without
// backend certificate verification. The supported surface is
// documented as TCP reachability plus HTTP status only until
// CA-aware verification is added to the config and harness.
let path = hc_config
.and_then(|hc| hc.http_config.as_ref())
.map(|cfg| cfg.path.as_str())

View file

@ -1,30 +1,33 @@
//! L7 (HTTP/HTTPS) Data Plane
//!
//! Provides HTTP-aware load balancing with content-based routing, TLS termination,
//! Provides HTTP-aware load balancing with content-based routing, TLS-terminated HTTPS,
//! and session persistence.
use axum::{
body::Body,
extract::{Request, State},
http::{header, HeaderValue, StatusCode, Uri},
http::{header, HeaderValue, StatusCode, Uri, Version},
response::{IntoResponse, Response},
routing::any,
Router,
};
use hyper_util::client::legacy::connect::HttpConnector;
use hyper_util::client::legacy::Client;
use hyper_util::rt::TokioExecutor;
use hyper_util::rt::{TokioExecutor, TokioIo};
use hyper_util::service::TowerToHyperService;
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::Arc;
use std::sync::{Arc, Mutex};
use tokio::sync::RwLock;
use tokio::task::JoinHandle;
use tokio_rustls::TlsAcceptor;
use crate::l7_router::{L7Router, RequestInfo, RoutingResult};
use crate::metadata::LbMetadataStore;
use crate::tls::build_tls_config;
use fiberlb_types::{
Backend, BackendAdminState, BackendStatus, Listener, ListenerId, ListenerProtocol, PoolAlgorithm,
PoolId,
Backend, BackendAdminState, BackendId, BackendStatus, CertificateId, Listener, ListenerId,
ListenerProtocol, PoolAlgorithm, PoolId,
};
type Result<T> = std::result::Result<T, L7Error>;
@ -37,8 +40,10 @@ pub enum L7Error {
InvalidProtocol,
#[error("TLS config missing for HTTPS listener")]
TlsConfigMissing,
#[error("TLS termination not implemented for HTTPS listeners")]
TlsNotImplemented,
#[error("TLS certificate not found: {0}")]
TlsCertificateNotFound(String),
#[error("TLS configuration error: {0}")]
TlsConfig(String),
#[error("Backend unavailable: {0}")]
BackendUnavailable(String),
#[error("Proxy error: {0}")]
@ -59,6 +64,7 @@ pub struct L7DataPlane {
http_client: Client<HttpConnector, Body>,
listeners: Arc<RwLock<HashMap<ListenerId, L7ListenerHandle>>>,
pool_counters: Arc<RwLock<HashMap<PoolId, usize>>>,
active_requests: Arc<Mutex<HashMap<BackendId, usize>>>,
}
impl L7DataPlane {
@ -74,6 +80,7 @@ impl L7DataPlane {
http_client,
listeners: Arc::new(RwLock::new(HashMap::new())),
pool_counters: Arc::new(RwLock::new(HashMap::new())),
active_requests: Arc::new(Mutex::new(HashMap::new())),
}
}
@ -91,14 +98,11 @@ impl L7DataPlane {
.parse()
.map_err(|e| L7Error::ProxyError(format!("Invalid bind address: {}", e)))?;
// For now, only implement HTTP (HTTPS/TLS in Phase 3)
match listener.protocol {
ListenerProtocol::Http => {
self.start_http_server(listener_id, bind_addr, app).await
}
ListenerProtocol::Http => self.start_http_server(listener_id, bind_addr, app).await,
ListenerProtocol::Https | ListenerProtocol::TerminatedHttps => {
// TODO: Phase 3 - TLS termination
Err(L7Error::TlsNotImplemented)
self.start_tls_server(listener_id, bind_addr, app, &listener)
.await
}
_ => Err(L7Error::InvalidProtocol),
}
@ -138,6 +142,7 @@ impl L7DataPlane {
listener_id: listener.id,
default_pool_id: listener.default_pool_id.clone(),
pool_counters: self.pool_counters.clone(),
active_requests: self.active_requests.clone(),
};
Ok(Router::new()
@ -174,6 +179,103 @@ impl L7DataPlane {
Ok(())
}
async fn start_tls_server(
&self,
listener_id: ListenerId,
bind_addr: SocketAddr,
app: Router,
listener: &Listener,
) -> Result<()> {
let tls = listener
.tls_config
.as_ref()
.ok_or(L7Error::TlsConfigMissing)?;
let certificate_id = parse_certificate_id(&tls.certificate_id)?;
let certificate = self
.metadata
.find_certificate_by_id(&certificate_id)
.await
.map_err(|error| L7Error::Metadata(error.to_string()))?
.ok_or_else(|| L7Error::TlsCertificateNotFound(tls.certificate_id.clone()))?;
let tls_config = build_tls_config(
&certificate.certificate,
&certificate.private_key,
tls.min_version,
)
.map_err(|error| L7Error::TlsConfig(error.to_string()))?;
let acceptor = TlsAcceptor::from(Arc::new(tls_config));
tracing::info!(
listener_id = %listener_id,
addr = %bind_addr,
"Starting L7 HTTPS listener"
);
let tcp_listener = tokio::net::TcpListener::bind(bind_addr)
.await
.map_err(|e| L7Error::ProxyError(format!("Failed to bind: {}", e)))?;
let task = tokio::spawn(async move {
loop {
match tcp_listener.accept().await {
Ok((stream, peer_addr)) => {
let acceptor = acceptor.clone();
let app = app.clone();
tokio::spawn(async move {
match acceptor.accept(stream).await {
Ok(tls_stream) => {
let io = TokioIo::new(tls_stream);
let builder = hyper_util::server::conn::auto::Builder::new(
TokioExecutor::new(),
);
let service = TowerToHyperService::new(app);
if let Err(error) = builder
.serve_connection_with_upgrades(io, service)
.await
{
tracing::warn!(
listener_id = %listener_id,
peer_addr = %peer_addr,
error = %error,
"HTTPS server connection ended with error"
);
}
}
Err(error) => {
tracing::warn!(
listener_id = %listener_id,
peer_addr = %peer_addr,
error = %error,
"TLS handshake failed"
);
}
}
});
}
Err(error) => {
tracing::error!(
listener_id = %listener_id,
error = %error,
"HTTPS accept error"
);
}
}
}
});
let mut listeners = self.listeners.write().await;
listeners.insert(listener_id, L7ListenerHandle { task });
Ok(())
}
}
fn parse_certificate_id(id: &str) -> Result<CertificateId> {
let uuid = id
.parse()
.map_err(|_| L7Error::TlsConfig(format!("invalid certificate ID: {id}")))?;
Ok(CertificateId::from_uuid(uuid))
}
/// Shared state for proxy handlers
@ -185,6 +287,7 @@ struct ProxyState {
listener_id: ListenerId,
default_pool_id: Option<PoolId>,
pool_counters: Arc<RwLock<HashMap<PoolId, usize>>>,
active_requests: Arc<Mutex<HashMap<BackendId, usize>>>,
}
/// Main proxy request handler
@ -246,6 +349,7 @@ async fn proxy_to_pool(
return text_response(StatusCode::SERVICE_UNAVAILABLE, error.to_string());
}
};
let _active_request = ActiveRequestGuard::new(state.active_requests.clone(), backend.id);
let path_and_query = request
.uri()
@ -267,8 +371,7 @@ async fn proxy_to_pool(
};
let (mut parts, body) = request.into_parts();
parts.uri = target_uri;
rewrite_proxy_headers(&mut parts.headers, &backend_host);
rewrite_backend_request_parts(&mut parts, target_uri, &backend_host);
match state.http_client.request(Request::from_parts(parts, body)).await {
Ok(response) => {
@ -318,10 +421,9 @@ async fn select_backend(
let index = match pool.algorithm {
PoolAlgorithm::IpHash | PoolAlgorithm::Maglev => request_hash % backends.len(),
PoolAlgorithm::WeightedRoundRobin => weighted_round_robin_index(state, pool_id, &backends).await,
PoolAlgorithm::Random => next_counter(state, pool_id).await % backends.len(),
PoolAlgorithm::LeastConnections | PoolAlgorithm::RoundRobin => {
next_counter(state, pool_id).await % backends.len()
}
PoolAlgorithm::Random => random_index(state, pool_id, request_hash, backends.len()).await,
PoolAlgorithm::LeastConnections => least_connections_index(state, pool_id, &backends).await,
PoolAlgorithm::RoundRobin => next_counter(state, pool_id).await % backends.len(),
};
Ok(backends[index].clone())
@ -365,17 +467,69 @@ async fn weighted_round_robin_index(
0
}
async fn random_index(
state: &ProxyState,
pool_id: PoolId,
request_hash: usize,
backend_count: usize,
) -> usize {
let offset = next_counter(state, pool_id).await;
stable_hash(&(request_hash, offset)) % backend_count
}
async fn least_connections_index(
state: &ProxyState,
pool_id: PoolId,
backends: &[Backend],
) -> usize {
let min_requests = {
let counts = state
.active_requests
.lock()
.expect("active request counters poisoned");
backends
.iter()
.map(|backend| counts.get(&backend.id).copied().unwrap_or(0))
.min()
.unwrap_or(0)
};
let candidates = {
let counts = state
.active_requests
.lock()
.expect("active request counters poisoned");
backends
.iter()
.enumerate()
.filter_map(|(index, backend)| {
let count = counts.get(&backend.id).copied().unwrap_or(0);
(count == min_requests).then_some(index)
})
.collect::<Vec<_>>()
};
let offset = next_counter(state, pool_id).await % candidates.len();
candidates[offset]
}
fn stable_request_hash(request: &Request) -> usize {
use std::hash::{Hash, Hasher};
stable_hash(&(
request.method().clone(),
request.uri().path_and_query().map(|value| value.as_str().to_string()),
request
.headers()
.get(header::HOST)
.and_then(|value| value.to_str().ok())
.map(str::to_string),
))
}
fn stable_hash<T: std::hash::Hash>(value: &T) -> usize {
use std::hash::Hasher;
let mut hasher = std::collections::hash_map::DefaultHasher::new();
request.method().hash(&mut hasher);
request.uri().path_and_query().map(|value| value.as_str()).hash(&mut hasher);
request
.headers()
.get(header::HOST)
.and_then(|value| value.to_str().ok())
.hash(&mut hasher);
std::hash::Hash::hash(value, &mut hasher);
hasher.finish() as usize
}
@@ -393,9 +547,82 @@ fn rewrite_proxy_headers(headers: &mut axum::http::HeaderMap, backend_host: &str
}
}
fn rewrite_backend_request_parts(
parts: &mut axum::http::request::Parts,
target_uri: Uri,
backend_host: &str,
) {
parts.uri = target_uri;
parts.version = Version::HTTP_11;
rewrite_proxy_headers(&mut parts.headers, backend_host);
}
fn text_response(status: StatusCode, body: impl Into<Body>) -> Response {
Response::builder()
.status(status)
.body(body.into())
.unwrap()
}
struct ActiveRequestGuard {
active_requests: Arc<Mutex<HashMap<BackendId, usize>>>,
backend_id: BackendId,
}
impl ActiveRequestGuard {
fn new(active_requests: Arc<Mutex<HashMap<BackendId, usize>>>, backend_id: BackendId) -> Self {
let mut counts = active_requests
.lock()
.expect("active request counters poisoned");
*counts.entry(backend_id).or_insert(0) += 1;
drop(counts);
Self {
active_requests,
backend_id,
}
}
}
impl Drop for ActiveRequestGuard {
fn drop(&mut self) {
let mut counts = self
.active_requests
.lock()
.expect("active request counters poisoned");
if let Some(count) = counts.get_mut(&self.backend_id) {
if *count > 1 {
*count -= 1;
} else {
counts.remove(&self.backend_id);
}
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn rewrite_backend_request_parts_sets_http11_and_host() {
let request = Request::builder()
.uri("https://frontend.example.test/health")
.version(Version::HTTP_2)
.header(header::HOST, "frontend.example.test")
.body(Body::empty())
.expect("request");
let target_uri: Uri = "http://10.0.0.10:8081/health".parse().expect("uri");
let backend_host = "10.0.0.10:8081";
let (mut parts, _body) = request.into_parts();
rewrite_backend_request_parts(&mut parts, target_uri.clone(), backend_host);
assert_eq!(parts.version, Version::HTTP_11);
assert_eq!(parts.uri, target_uri);
assert_eq!(
parts.headers.get(header::HOST).and_then(|value| value.to_str().ok()),
Some(backend_host)
);
}
}
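For reference, the new `LeastConnections` arm only balances correctly because `ActiveRequestGuard` decrements the per-backend counter when it goes out of scope in `proxy_to_pool`. A minimal, self-contained sketch of the same counting pattern, with a bare `u32` standing in for `BackendId` and all names illustrative rather than taken from the fiberlb crates:

```rust
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// Simplified stand-in for the shared per-backend in-flight table.
type Counts = Arc<Mutex<HashMap<u32, usize>>>;

/// RAII guard: bumps the backend's counter on creation and decrements
/// (or removes the entry) on drop, mirroring `ActiveRequestGuard`.
struct Guard {
    counts: Counts,
    backend: u32,
}

impl Guard {
    fn new(counts: Counts, backend: u32) -> Self {
        *counts.lock().unwrap().entry(backend).or_insert(0) += 1;
        Self { counts, backend }
    }
}

impl Drop for Guard {
    fn drop(&mut self) {
        let mut counts = self.counts.lock().unwrap();
        match counts.get_mut(&self.backend) {
            Some(n) if *n > 1 => *n -= 1,
            _ => {
                counts.remove(&self.backend);
            }
        }
    }
}

/// Pick the backend with the fewest in-flight requests (first one wins ties).
fn least_loaded(counts: &Counts, backends: &[u32]) -> u32 {
    let counts = counts.lock().unwrap();
    backends
        .iter()
        .copied()
        .min_by_key(|b| counts.get(b).copied().unwrap_or(0))
        .expect("at least one backend")
}

fn main() {
    let counts: Counts = Arc::new(Mutex::new(HashMap::new()));
    let in_flight = Guard::new(counts.clone(), 1); // backend 1 now has one live request
    assert_eq!(least_loaded(&counts, &[1, 2]), 2); // backend 2 is idle, so it wins
    drop(in_flight); // request finished; the counter entry is removed
    assert_eq!(least_loaded(&counts, &[1, 2]), 1); // all-zero tie falls back to the first backend
}
```

The crate's version additionally rotates among equally loaded backends using the pool's `next_counter` offset instead of always taking the first candidate.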

View file

@@ -18,6 +18,7 @@ use fiberlb_types::{
use iam_service_auth::{get_tenant_context, resource_for_tenant, AuthService};
use tonic::{Request, Response, Status};
use uuid::Uuid;
use x509_parser::parse_x509_certificate;
/// Certificate service implementation
pub struct CertificateServiceImpl {
@@ -82,6 +83,26 @@ fn proto_to_cert_type(cert_type: i32) -> CertificateType {
}
}
fn parse_certificate_expiry(certificate_pem: &str) -> Result<u64, Status> {
let cert_chain = rustls_pemfile::certs(&mut certificate_pem.as_bytes())
.collect::<Result<Vec<_>, _>>()
.map_err(|e| Status::invalid_argument(format!("failed to parse certificate PEM: {e}")))?;
let cert_der = cert_chain
.first()
.ok_or_else(|| Status::invalid_argument("certificate PEM did not contain any certificates"))?;
let (_, parsed) = parse_x509_certificate(cert_der.as_ref())
.map_err(|e| Status::invalid_argument(format!("failed to parse X.509 certificate: {e:?}")))?;
let expires_at = parsed.validity().not_after.timestamp();
if expires_at <= 0 {
return Err(Status::invalid_argument(
"certificate expiry must be after the Unix epoch",
));
}
Ok(expires_at as u64)
}
#[tonic::async_trait]
impl CertificateService for CertificateServiceImpl {
async fn create_certificate(
@@ -128,13 +149,7 @@ impl CertificateService for CertificateServiceImpl {
// Parse certificate type
let cert_type = proto_to_cert_type(req.cert_type);
// TODO: Parse certificate to extract expiry date
// For now, set expires_at to 1 year from now
let expires_at = std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.unwrap()
.as_secs() + (365 * 24 * 60 * 60);
let expires_at = parse_certificate_expiry(&req.certificate)?;
// Create new certificate
let certificate = Certificate::new(
@@ -335,3 +350,14 @@ impl CertificateService for CertificateServiceImpl {
Ok(Response::new(DeleteCertificateResponse {}))
}
}
#[cfg(test)]
mod tests {
use super::parse_certificate_expiry;
#[test]
fn parse_certificate_expiry_rejects_invalid_pem() {
let err = parse_certificate_expiry("not-a-certificate").unwrap_err();
assert_eq!(err.code(), tonic::Code::InvalidArgument);
}
}
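The same PEM-to-`notAfter` pipeline can be exercised outside the gRPC layer. A minimal sketch, assuming the `rustls-pemfile` 2.x iterator API and the `x509-parser` crate already used above; the helper name and the plain `String` error type are illustrative only:

```rust
use x509_parser::parse_x509_certificate;

/// Extract the certificate's `notAfter` as seconds since the Unix epoch.
fn expiry_unix_seconds(cert_pem: &str) -> Result<u64, String> {
    // Decode every CERTIFICATE block in the PEM input to DER.
    let certs = rustls_pemfile::certs(&mut cert_pem.as_bytes())
        .collect::<Result<Vec<_>, _>>()
        .map_err(|e| format!("bad PEM: {e}"))?;
    // The leaf certificate is expected first in the chain.
    let der = certs
        .first()
        .ok_or_else(|| "PEM contained no certificates".to_string())?;
    let (_, cert) = parse_x509_certificate(der.as_ref())
        .map_err(|e| format!("bad X.509: {e:?}"))?;
    // `not_after.timestamp()` is signed; anything at or before the epoch is
    // rejected, matching the guard in `parse_certificate_expiry`.
    u64::try_from(cert.validity().not_after.timestamp())
        .ok()
        .filter(|&secs| secs > 0)
        .ok_or_else(|| "expiry precedes the Unix epoch".to_string())
}

fn main() {
    // Garbage input exercises only the error path, like the unit test in the diff.
    assert!(expiry_unix_seconds("not-a-certificate").is_err());
}
```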

View file

@@ -1,6 +1,6 @@
//! TLS Configuration and Certificate Management
//!
//! Provides rustls-based TLS termination with SNI support for L7 HTTPS listeners.
//! Provides rustls-based terminated-HTTPS support with SNI for L7 listeners.
use rustls::crypto::ring::sign::any_supported_type;
use rustls::pki_types::CertificateDer;
@@ -9,7 +9,7 @@ use rustls::sign::CertifiedKey;
use rustls::ServerConfig;
use std::collections::HashMap;
use std::io::Cursor;
use std::sync::Arc;
use std::sync::{Arc, Once};
use fiberlb_types::{Certificate, CertificateId, LoadBalancerId, TlsVersion};
@@ -29,12 +29,21 @@
CertificateNotFound(String),
}
fn ensure_crypto_provider() {
static INIT: Once = Once::new();
INIT.call_once(|| {
let _ = rustls::crypto::ring::default_provider().install_default();
});
}
/// Build TLS server configuration from certificate and private key
pub fn build_tls_config(
cert_pem: &str,
key_pem: &str,
min_version: TlsVersion,
) -> Result<ServerConfig> {
ensure_crypto_provider();
// Parse certificate chain from PEM
let mut cert_reader = Cursor::new(cert_pem.as_bytes());
let certs: Vec<CertificateDer> = rustls_pemfile::certs(&mut cert_reader)
@@ -69,6 +78,8 @@ pub fn build_tls_config(
}
pub fn build_certified_key(cert_pem: &str, key_pem: &str) -> Result<Arc<CertifiedKey>> {
ensure_crypto_provider();
let mut cert_reader = Cursor::new(cert_pem.as_bytes());
let certs: Vec<CertificateDer> = rustls_pemfile::certs(&mut cert_reader)
.collect::<std::result::Result<Vec<_>, _>>()
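The `Once` guard matters because `install_default` returns `Err` once a process-wide provider is already registered, for example when another crate in the same binary installed one first. A minimal sketch of the idempotent-install pattern, assuming rustls 0.23 with the `ring` provider feature:

```rust
use std::sync::Once;

/// Install the ring crypto provider at most once per process.
/// Later calls are no-ops, and a provider installed elsewhere is tolerated.
fn ensure_crypto_provider() {
    static INIT: Once = Once::new();
    INIT.call_once(|| {
        // `install_default` fails if another provider is already registered;
        // either way a provider exists afterwards, which is all the TLS
        // config builders need.
        let _ = rustls::crypto::ring::default_provider().install_default();
    });
}

fn main() {
    ensure_crypto_provider();
    ensure_crypto_provider(); // second call returns immediately; the closure ran once
}
```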

View file

@@ -40,7 +40,7 @@ impl std::fmt::Display for CertificateId {
/// TLS Certificate
///
/// Stores X.509 certificates and private keys for TLS termination.
/// Stores X.509 certificates and private keys for terminated HTTPS listeners.
/// Certificates are stored in PEM format and should be encrypted at rest
/// in production deployments.
#[derive(Debug, Clone, Serialize, Deserialize)]
@@ -76,7 +76,7 @@ pub struct Certificate {
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CertificateType {
/// Standard server certificate for TLS termination
/// Standard server certificate for terminated HTTPS listeners
Server,
/// CA certificate for client authentication
ClientCa,

View file

@@ -50,7 +50,7 @@ pub enum ListenerProtocol {
Udp,
/// HTTP (L7)
Http,
/// HTTPS (L7 with TLS termination)
/// HTTPS (L7, terminated at the listener)
Https,
/// Terminated HTTPS (pass through to HTTP backend)
TerminatedHttps,

993
flake.nix

File diff suppressed because it is too large

View file

@@ -105,8 +105,8 @@ fn resolve_chainfire_route_from_snapshot(
}
}
let selected_store = selected_store
.ok_or_else(|| tonic::Status::not_found("region peer store not found"))?;
let selected_store =
selected_store.ok_or_else(|| tonic::Status::not_found("region peer store not found"))?;
if candidate_addrs.is_empty() {
return Err(tonic::Status::not_found(
"region has no candidate store addresses",
@@ -841,9 +841,7 @@ impl RdbClient {
force_refresh: bool,
) -> Result<ResolvedRoute, tonic::Status> {
if force_refresh {
let snapshot = self
.chainfire_route_snapshot(kv_client, true)
.await?;
let snapshot = self.chainfire_route_snapshot(kv_client, true).await?;
return resolve_chainfire_route_from_snapshot(key, &snapshot);
}
@@ -946,9 +944,8 @@ async fn list_chainfire_regions(
#[cfg(test)]
mod tests {
use super::{
normalize_transport_addr, parse_transport_endpoints,
resolve_chainfire_route_from_snapshot, ChainfireRegionInfo, ChainfireRouteSnapshot,
ChainfireStoreInfo, RdbClient,
normalize_transport_addr, parse_transport_endpoints, resolve_chainfire_route_from_snapshot,
ChainfireRegionInfo, ChainfireRouteSnapshot, ChainfireStoreInfo, RdbClient,
};
use std::collections::HashMap;
use std::time::Instant;

View file

@@ -23,15 +23,9 @@ service Watch {
rpc Watch(stream WatchRequest) returns (stream WatchResponse);
}
// Cluster management service
// Cluster management service for fixed-membership clusters.
service Cluster {
// MemberAdd adds a member into the cluster
rpc MemberAdd(MemberAddRequest) returns (MemberAddResponse);
// MemberRemove removes an existing member from the cluster
rpc MemberRemove(MemberRemoveRequest) returns (MemberRemoveResponse);
// MemberList lists all the members in the cluster
// MemberList lists the members configured at cluster bootstrap time
rpc MemberList(MemberListRequest) returns (MemberListResponse);
// Status gets the status of the cluster
@@ -266,32 +260,6 @@ message Member {
bool is_learner = 5;
}
message MemberAddRequest {
// peer_urls are the URLs to reach the new member
repeated string peer_urls = 1;
// is_learner indicates if the member is a learner
bool is_learner = 2;
}
message MemberAddResponse {
ResponseHeader header = 1;
// member is the member information for the added member
Member member = 2;
// members is the list of all members after adding
repeated Member members = 3;
}
message MemberRemoveRequest {
// ID is the member ID to remove
uint64 id = 1;
}
message MemberRemoveResponse {
ResponseHeader header = 1;
// members is the list of all members after removing
repeated Member members = 2;
}
message MemberListRequest {}
message MemberListResponse {

View file

@@ -4,14 +4,16 @@
//! for inter-node communication using gRPC.
use crate::types::{FlareNode, FlareNodeId, FlareTypeConfig};
use openraft::error::{Fatal, NetworkError, RPCError, RaftError, ReplicationClosed, StreamingError};
use flaredb_proto::raft_server::raft_service_client::RaftServiceClient;
use openraft::error::{
Fatal, NetworkError, RPCError, RaftError, ReplicationClosed, StreamingError,
};
use openraft::network::{RPCOption, RaftNetwork, RaftNetworkFactory};
use openraft::raft::{
AppendEntriesRequest, AppendEntriesResponse, InstallSnapshotRequest, InstallSnapshotResponse,
SnapshotResponse, VoteRequest, VoteResponse,
};
use openraft::{OptionalSend, Snapshot, Vote};
use flaredb_proto::raft_server::raft_service_client::RaftServiceClient;
use std::collections::HashMap;
use std::future::Future;
use std::sync::Arc;
@@ -348,8 +350,9 @@ impl RaftNetwork<FlareTypeConfig> for FlareNetwork {
RPCError::Network(Self::network_error(e.to_string()))
})?;
let resp: VoteResponse<FlareNodeId> = serde_json::from_slice(&response.into_inner().data)
.map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?;
let resp: VoteResponse<FlareNodeId> =
serde_json::from_slice(&response.into_inner().data)
.map_err(|e| RPCError::Network(Self::network_error(e.to_string())))?;
Ok(resp)
}

View file

@@ -7,7 +7,10 @@ use crate::types::{
FlareEntry, FlareLogId, FlareNode, FlareNodeId, FlareResponse, FlareTypeConfig,
};
use flaredb_storage::rocks_engine::RocksEngine;
use openraft::storage::{LogFlushed, LogState, RaftLogReader, RaftLogStorage, RaftSnapshotBuilder, RaftStateMachine, Snapshot};
use openraft::storage::{
LogFlushed, LogState, RaftLogReader, RaftLogStorage, RaftSnapshotBuilder, RaftStateMachine,
Snapshot,
};
use openraft::{EntryPayload, OptionalSend, RaftLogId, SnapshotMeta};
use openraft::{StorageError, StorageIOError, StoredMembership, Vote};
use std::fmt::Debug;
@@ -50,8 +53,7 @@ impl PersistentFlareStore {
debug!(
has_snapshot = snapshot.is_some(),
snapshot_idx,
"PersistentFlareStore initialized"
snapshot_idx, "PersistentFlareStore initialized"
);
Self {
@@ -63,9 +65,7 @@
}
/// Recover state from RocksDB on startup
fn recover_from_disk(
engine: &RocksEngine,
) -> (FlareStateMachine, Option<FlareSnapshot>, u64) {
fn recover_from_disk(engine: &RocksEngine) -> (FlareStateMachine, Option<FlareSnapshot>, u64) {
// Load snapshot index
let snapshot_idx = engine
.get_raft_state(KEY_SNAPSHOT_IDX)
@@ -90,10 +90,8 @@ impl PersistentFlareStore {
.and_then(|data| serde_json::from_slice(&data).ok());
// Load snapshot data
let snapshot_data: Option<Vec<u8>> = engine
.get_raft_state(KEY_SNAPSHOT_DATA)
.ok()
.flatten();
let snapshot_data: Option<Vec<u8>> =
engine.get_raft_state(KEY_SNAPSHOT_DATA).ok().flatten();
// If we have both meta and data, reconstruct the snapshot and state machine
match (snapshot_meta, snapshot_data) {
@@ -105,10 +103,7 @@ impl PersistentFlareStore {
snapshot_id = %meta.snapshot_id,
"Recovered state machine from snapshot"
);
let snapshot = FlareSnapshot {
meta,
data,
};
let snapshot = FlareSnapshot { meta, data };
(sm, Some(snapshot), snapshot_idx)
}
Err(e) => {
@@ -128,10 +123,14 @@ impl PersistentFlareStore {
}
/// Persist snapshot to RocksDB
fn persist_snapshot(&self, snapshot: &FlareSnapshot, idx: u64) -> Result<(), StorageError<FlareNodeId>> {
fn persist_snapshot(
&self,
snapshot: &FlareSnapshot,
idx: u64,
) -> Result<(), StorageError<FlareNodeId>> {
// Persist snapshot metadata
let meta_data = serde_json::to_vec(&snapshot.meta)
.map_err(|e| StorageIOError::write(&e))?;
let meta_data =
serde_json::to_vec(&snapshot.meta).map_err(|e| StorageIOError::write(&e))?;
self.engine
.put_raft_state(KEY_SNAPSHOT_META, &meta_data)
.map_err(|e| StorageIOError::write(&e))?;
@@ -308,7 +307,9 @@ impl RaftSnapshotBuilder<FlareTypeConfig> for Arc<PersistentFlareStore> {
impl RaftLogStorage<FlareTypeConfig> for Arc<PersistentFlareStore> {
type LogReader = Self;
async fn get_log_state(&mut self) -> Result<LogState<FlareTypeConfig>, StorageError<FlareNodeId>> {
async fn get_log_state(
&mut self,
) -> Result<LogState<FlareTypeConfig>, StorageError<FlareNodeId>> {
// Get last purged log ID from state
let last_purged = self
.engine
@@ -379,9 +380,7 @@ impl RaftLogStorage<FlareTypeConfig> for Arc<PersistentFlareStore> {
Ok(())
}
async fn read_committed(
&mut self,
) -> Result<Option<FlareLogId>, StorageError<FlareNodeId>> {
async fn read_committed(&mut self) -> Result<Option<FlareLogId>, StorageError<FlareNodeId>> {
let committed = self
.engine
.get_raft_state(KEY_COMMITTED)
@@ -465,7 +464,10 @@ impl RaftStateMachine<FlareTypeConfig> for Arc<PersistentFlareStore> {
async fn applied_state(
&mut self,
) -> Result<(Option<FlareLogId>, StoredMembership<FlareNodeId, FlareNode>), StorageError<FlareNodeId>> {
) -> Result<
(Option<FlareLogId>, StoredMembership<FlareNodeId, FlareNode>),
StorageError<FlareNodeId>,
> {
let sm = self.sm.read().await;
Ok((sm.last_applied_log, sm.last_membership.clone()))
}
@@ -520,8 +522,10 @@ impl RaftStateMachine<FlareTypeConfig> for Arc<PersistentFlareStore> {
// Update state machine (using bincode to match build_snapshot)
{
let new_sm: FlareStateMachine = bincode::deserialize(&new_snapshot.data)
.map_err(|e| StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e))?;
let new_sm: FlareStateMachine =
bincode::deserialize(&new_snapshot.data).map_err(|e| {
StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e)
})?;
let mut sm = self.sm.write().await;
*sm = new_sm;
}

View file

@@ -6,9 +6,7 @@
use crate::network::FlareNetworkFactory;
use crate::persistent_storage::PersistentFlareStore;
use crate::storage::FlareStore;
use crate::types::{
FlareNode, FlareNodeId, FlareRaft, FlareRequest, FlareResponse,
};
use crate::types::{FlareNode, FlareNodeId, FlareRaft, FlareRequest, FlareResponse};
use flaredb_storage::rocks_engine::RocksEngine;
use openraft::error::{ClientWriteError, InitializeError, RaftError};
use openraft::{Config, Raft};
@@ -117,7 +115,9 @@ pub struct FlareRaftNode {
impl FlareRaftNode {
/// Create OpenRaft config with sensible defaults
fn create_raft_config(region_id: u64) -> Result<Arc<Config>, Box<dyn std::error::Error + Send + Sync>> {
fn create_raft_config(
region_id: u64,
) -> Result<Arc<Config>, Box<dyn std::error::Error + Send + Sync>> {
let raft_config = Config {
cluster_name: format!("flare-region-{}", region_id),
// VM-backed cluster tests can stall for >1s while other services build images,
@@ -216,8 +216,7 @@ impl FlareRaftNode {
info!(
store_id,
region_id,
"Created persistent Raft node with RocksDB storage"
region_id, "Created persistent Raft node with RocksDB storage"
);
Ok(node)

View file

@@ -7,7 +7,10 @@ use crate::types::{
FlareEntry, FlareLogId, FlareNode, FlareNodeId, FlareRequest, FlareResponse,
FlareStoredMembership, FlareTypeConfig,
};
use openraft::storage::{LogFlushed, LogState, RaftLogReader, RaftLogStorage, RaftSnapshotBuilder, RaftStateMachine, Snapshot};
use openraft::storage::{
LogFlushed, LogState, RaftLogReader, RaftLogStorage, RaftSnapshotBuilder, RaftStateMachine,
Snapshot,
};
use openraft::{EntryPayload, OptionalSend, RaftLogId, SnapshotMeta};
use openraft::{StorageError, StorageIOError, StoredMembership, Vote};
use serde::{Deserialize, Serialize};
@@ -141,7 +144,9 @@ impl RaftLogReader<FlareTypeConfig> for Arc<FlareStore> {
}
impl RaftSnapshotBuilder<FlareTypeConfig> for Arc<FlareStore> {
async fn build_snapshot(&mut self) -> Result<Snapshot<FlareTypeConfig>, StorageError<FlareNodeId>> {
async fn build_snapshot(
&mut self,
) -> Result<Snapshot<FlareTypeConfig>, StorageError<FlareNodeId>> {
let data;
let last_applied_log;
let last_membership;
@@ -192,7 +197,9 @@ impl RaftSnapshotBuilder<FlareTypeConfig> for Arc<FlareStore> {
impl RaftLogStorage<FlareTypeConfig> for Arc<FlareStore> {
type LogReader = Self;
async fn get_log_state(&mut self) -> Result<LogState<FlareTypeConfig>, StorageError<FlareNodeId>> {
async fn get_log_state(
&mut self,
) -> Result<LogState<FlareTypeConfig>, StorageError<FlareNodeId>> {
let log = self.log.read().await;
let last_serialized = log.iter().next_back().map(|(_, ent)| ent);
@@ -218,7 +225,10 @@ impl RaftLogStorage<FlareTypeConfig> for Arc<FlareStore> {
})
}
async fn save_vote(&mut self, vote: &Vote<FlareNodeId>) -> Result<(), StorageError<FlareNodeId>> {
async fn save_vote(
&mut self,
vote: &Vote<FlareNodeId>,
) -> Result<(), StorageError<FlareNodeId>> {
let mut v = self.vote.write().await;
*v = Some(*vote);
Ok(())
@@ -300,8 +310,10 @@ impl RaftStateMachine<FlareTypeConfig> for Arc<FlareStore> {
async fn applied_state(
&mut self,
) -> Result<(Option<FlareLogId>, StoredMembership<FlareNodeId, FlareNode>), StorageError<FlareNodeId>>
{
) -> Result<
(Option<FlareLogId>, StoredMembership<FlareNodeId, FlareNode>),
StorageError<FlareNodeId>,
> {
let sm = self.sm.read().await;
Ok((sm.last_applied_log, sm.last_membership.clone()))
}
@@ -356,8 +368,10 @@ impl RaftStateMachine<FlareTypeConfig> for Arc<FlareStore> {
// Update state machine (using bincode to match build_snapshot)
{
let new_sm: FlareStateMachine = bincode::deserialize(&new_snapshot.data)
.map_err(|e| StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e))?;
let new_sm: FlareStateMachine =
bincode::deserialize(&new_snapshot.data).map_err(|e| {
StorageIOError::read_snapshot(Some(new_snapshot.meta.signature()), &e)
})?;
let mut sm = self.sm.write().await;
*sm = new_sm;
}
@@ -435,11 +449,7 @@ pub fn apply_request(sm: &mut FlareStateMachine, req: &FlareRequest, index: u64)
let key_tuple = (*namespace_id, key.clone());
// Get current version (0 if key doesn't exist)
let current_version = sm
.cas_data
.get(&key_tuple)
.map(|(_, v, _)| *v)
.unwrap_or(0);
let current_version = sm.cas_data.get(&key_tuple).map(|(_, v, _)| *v).unwrap_or(0);
if current_version != *expected_version {
// Version mismatch - CAS fails
@@ -469,11 +479,7 @@ pub fn apply_request(sm: &mut FlareStateMachine, req: &FlareRequest, index: u64)
let key_tuple = (*namespace_id, key.clone());
// Get current version (0 if key doesn't exist)
let current_version = sm
.cas_data
.get(&key_tuple)
.map(|(_, v, _)| *v)
.unwrap_or(0);
let current_version = sm.cas_data.get(&key_tuple).map(|(_, v, _)| *v).unwrap_or(0);
// If expected_version is 0, delete if exists (no version check)
// Otherwise, only delete if version matches
@@ -685,7 +691,10 @@ mod tests {
ts: 200,
};
let response = apply_request(&mut sm, &req_delete, 2);
assert!(matches!(response, FlareResponse::DeleteResult { existed: true }));
assert!(matches!(
response,
FlareResponse::DeleteResult { existed: true }
));
// Key should be deleted
let data = sm.kv_data.get(&(1, b"key".to_vec()));
@@ -703,7 +712,10 @@ mod tests {
ts: 100,
};
let response = apply_request(&mut sm, &req_delete, 1);
assert!(matches!(response, FlareResponse::DeleteResult { existed: false }));
assert!(matches!(
response,
FlareResponse::DeleteResult { existed: false }
));
}
#[tokio::test]
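The compare-and-swap arms above treat a missing key as version 0, which is what lets the first writer claim a key by passing `expected_version == 0`. A standalone sketch of that rule with the state machine reduced to a plain map; the types and the version-bump policy are simplified, not copied from `FlareStateMachine`:

```rust
use std::collections::HashMap;

/// key -> (value, version); a missing key is treated as version 0.
type CasMap = HashMap<Vec<u8>, (Vec<u8>, u64)>;

/// Write `value` only if `expected_version` matches the current version.
/// On success returns the new version; on a mismatch returns the current one.
fn compare_and_set(
    map: &mut CasMap,
    key: &[u8],
    expected_version: u64,
    value: Vec<u8>,
) -> Result<u64, u64> {
    let current = map.get(key).map(|(_, v)| *v).unwrap_or(0);
    if current != expected_version {
        return Err(current);
    }
    let next = current + 1;
    map.insert(key.to_vec(), (value, next));
    Ok(next)
}

fn main() {
    let mut map = CasMap::new();
    // First write claims the key by expecting version 0 (key absent).
    assert_eq!(compare_and_set(&mut map, b"k", 0, b"v1".to_vec()), Ok(1));
    // A stale writer still expecting version 0 is rejected and learns the real version.
    assert_eq!(compare_and_set(&mut map, b"k", 0, b"v2".to_vec()), Err(1));
    // The holder of version 1 may update.
    assert_eq!(compare_and_set(&mut map, b"k", 1, b"v2".to_vec()), Ok(2));
}
```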

Some files were not shown because too many files have changed in this diff