From 11cd8be2f749cffe3d97287e6f587d3855574505 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:47:21 +0900 Subject: [PATCH 01/12] Establish canonical validation lanes --- .github/workflows/kvm-publishable.yml | 37 + .github/workflows/nix.yml | 28 +- README.md | 91 +- docs/README.md | 14 +- docs/component-matrix.md | 74 +- docs/testing.md | 111 ++- flake.nix | 487 ++++++++++- nix/ci/flake.nix | 3 + nix/images/netboot-all-in-one.nix | 214 +---- nix/iso/ultracloud-iso.nix | 230 +++-- nix/modules/default.nix | 1 + nix/nodes/baremetal-qemu/common.nix | 87 ++ .../control-plane/configuration.nix | 45 + .../baremetal-qemu/control-plane/disko.nix | 5 + .../baremetal-qemu/worker/configuration.nix | 35 + nix/nodes/baremetal-qemu/worker/disko.nix | 5 + nix/nodes/vm-cluster/common-disko.nix | 17 +- nix/single-node/base.nix | 360 ++++++++ nix/single-node/qemu-vm.nix | 24 + nix/test-cluster/README.md | 14 +- nix/test-cluster/flake.nix | 4 + nix/test-cluster/run-cluster.sh | 7 + nix/test-cluster/run-publishable-kvm-suite.sh | 87 ++ nix/test-cluster/verify-baremetal-iso.sh | 824 ++++++++++++++++++ .../logs/nix-build-deployer-vm-smoke.meta | 4 + .../logs/nix-eval-netboot-all-in-one.meta | 4 + .../logs/nix-eval-netboot-control-plane.meta | 4 + .../logs/nix-eval-netboot-worker.meta | 4 + plans/baselines/logs/nix-eval-node01.meta | 4 + .../logs/nix-eval-ultracloud-iso.meta | 4 + .../logs/nix-run-fresh-demo-vm-webapp.meta | 4 + .../baselines/logs/nix-run-fresh-matrix.meta | 4 + plans/baselines/logs/nix-run-fresh-smoke.meta | 4 + plans/baselines/main-baseline-2026-04-04.md | 52 ++ 34 files changed, 2578 insertions(+), 314 deletions(-) create mode 100644 .github/workflows/kvm-publishable.yml create mode 100644 nix/nodes/baremetal-qemu/common.nix create mode 100644 nix/nodes/baremetal-qemu/control-plane/configuration.nix create mode 100644 nix/nodes/baremetal-qemu/control-plane/disko.nix create mode 100644 nix/nodes/baremetal-qemu/worker/configuration.nix create mode 100644 nix/nodes/baremetal-qemu/worker/disko.nix create mode 100644 nix/single-node/base.nix create mode 100644 nix/single-node/qemu-vm.nix create mode 100755 nix/test-cluster/run-publishable-kvm-suite.sh create mode 100644 nix/test-cluster/verify-baremetal-iso.sh create mode 100644 plans/baselines/logs/nix-build-deployer-vm-smoke.meta create mode 100644 plans/baselines/logs/nix-eval-netboot-all-in-one.meta create mode 100644 plans/baselines/logs/nix-eval-netboot-control-plane.meta create mode 100644 plans/baselines/logs/nix-eval-netboot-worker.meta create mode 100644 plans/baselines/logs/nix-eval-node01.meta create mode 100644 plans/baselines/logs/nix-eval-ultracloud-iso.meta create mode 100644 plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta create mode 100644 plans/baselines/logs/nix-run-fresh-matrix.meta create mode 100644 plans/baselines/logs/nix-run-fresh-smoke.meta create mode 100644 plans/baselines/main-baseline-2026-04-04.md diff --git a/.github/workflows/kvm-publishable.yml b/.github/workflows/kvm-publishable.yml new file mode 100644 index 0000000..b85f1b2 --- /dev/null +++ b/.github/workflows/kvm-publishable.yml @@ -0,0 +1,37 @@ +name: KVM Publishable Validation + +on: + workflow_dispatch: + +jobs: + publishable-kvm-suite: + runs-on: ubuntu-latest + timeout-minutes: 360 + + steps: + - uses: actions/checkout@v4 + + - uses: DeterminateSystems/nix-installer-action@v11 + + - uses: DeterminateSystems/magic-nix-cache-action@v8 + + - name: Probe KVM Environment + run: | + set -euo pipefail + echo "hostname=$(hostname)" + uname 
-a + id + test -e /dev/kvm + ls -l /dev/kvm + if [[ -f /sys/module/kvm_intel/parameters/nested ]]; then + echo "kvm_intel_nested=$(cat /sys/module/kvm_intel/parameters/nested)" + fi + if [[ -f /sys/module/kvm_amd/parameters/nested ]]; then + echo "kvm_amd_nested=$(cat /sys/module/kvm_amd/parameters/nested)" + fi + + - name: Run Publishable KVM Suite + run: | + set -euo pipefail + chmod +x ./nix/test-cluster/run-publishable-kvm-suite.sh + ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite" diff --git a/.github/workflows/nix.yml b/.github/workflows/nix.yml index 3b96a4d..7ef2f93 100644 --- a/.github/workflows/nix.yml +++ b/.github/workflows/nix.yml @@ -96,6 +96,23 @@ jobs: run: | nix run ./nix/ci#gate-ci -- --shared-crate ${{ matrix.crate }} --tier 0 --no-logs + portable-regressions: + needs: filter + if: ${{ needs.filter.outputs.any_changed == 'true' || needs.filter.outputs.global_changed == 'true' || needs.filter.outputs.shared_crates_changed == 'true' }} + runs-on: ubuntu-latest + name: portable regressions + steps: + - uses: actions/checkout@v4 + - uses: DeterminateSystems/nix-installer-action@v11 + - uses: DeterminateSystems/magic-nix-cache-action@v8 + + - name: Run portable canonical profile regressions + run: | + nix build \ + .#checks.x86_64-linux.canonical-profile-eval-guards \ + .#checks.x86_64-linux.portable-control-plane-regressions \ + --accept-flake-config + # Build server packages (tier 1+) build: needs: [filter, gate] @@ -116,7 +133,7 @@ jobs: # Summary job for PR status checks ci-status: - needs: [filter, gate, shared-crates-gate] + needs: [filter, gate, shared-crates-gate, portable-regressions] if: always() runs-on: ubuntu-latest steps: @@ -128,11 +145,18 @@ jobs: if [[ "${{ needs.shared-crates-gate.result }}" == "failure" ]]; then exit 1 fi - if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" ]]; then + if [[ "${{ needs.portable-regressions.result }}" == "failure" ]]; then + exit 1 + fi + if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" || "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then if [[ "${{ needs.gate.result }}" == "skipped" ]]; then echo "Gate was skipped despite changes. This is unexpected." exit 1 fi + if [[ "${{ needs.portable-regressions.result }}" == "skipped" ]]; then + echo "Portable regressions were skipped despite changes. This is unexpected." + exit 1 + fi fi if [[ "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then if [[ "${{ needs.shared-crates-gate.result }}" == "skipped" ]]; then diff --git a/README.md b/README.md index 1a2f63a..daef69e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ UltraCloud is a Nix-first cloud platform workspace that assembles a small control plane, network services, VM hosting, shared storage, object storage, and gateway services into one reproducible repository. -The canonical local proof path is the six-node VM cluster under [`nix/test-cluster`](/home/centra/cloud/nix/test-cluster/README.md). It builds all guest images on the host, boots them as hardware-like QEMU nodes, and validates real multi-node behavior. +The fastest public entrypoint is the one-command single-node quickstart. 
The canonical multi-node integration proof remains the six-node VM cluster under [`nix/test-cluster`](nix/test-cluster/README.md), which builds all guest images on the host, boots them as hardware-like QEMU nodes, and validates real multi-node behavior. +The canonical bare-metal bootstrap proof is the ISO-on-QEMU path under [`nix/test-cluster`](nix/test-cluster/README.md), which drives phone-home, Disko install, reboot, and desired-system convergence for one control-plane node and one worker-equivalent node. ## Components @@ -15,38 +16,102 @@ The canonical local proof path is the six-node VM cluster under [`nix/test-clust - `plasmavmc`: VM control plane and worker agents - `coronafs`: shared filesystem for mutable VM volumes - `lightningstor`: object storage and VM image backing -- `k8shost`: Kubernetes-style hosting control plane +- `k8shost`: Kubernetes-style hosting control plane for tenant pods and services - `apigateway`: external API and proxy surface - `nightlight`: metrics ingestion and query service - `creditservice`: minimal reference quota/credit service -- `deployer`: bootstrap and phone-home deployment service +- `deployer`: bootstrap and phone-home deployment service that owns install plans and desired-system intent - `fleet-scheduler`: non-Kubernetes service scheduler for bare-metal cluster services ## Quick Start +Single-node quickstart: + +```bash +nix run .#single-node-quickstart +``` + +This app builds the minimal VM stack, boots a QEMU VM, waits for `chainfire`, `flaredb`, `iam`, `prismnet`, and `plasmavmc`, checks their health endpoints, and verifies the in-guest VM runtime prerequisites. For an interactive session, keep the VM running: + +```bash +ULTRACLOUD_QUICKSTART_KEEP_VM=1 nix run .#single-node-quickstart +``` + +The legacy name `.#all-in-one-quickstart` is kept as an alias. + +Portable local proof on hosts without `/dev/kvm`: + +```bash +nix build .#checks.x86_64-linux.canonical-profile-eval-guards +nix build .#checks.x86_64-linux.portable-control-plane-regressions +``` + +This TCG-safe lane keeps canonical profile drift, the core `chainfire` / `deployer` control-plane path, the `deployer -> nix-agent` boundary, and the `fleet-scheduler -> node-agent` boundary under regression coverage without requiring nested virtualization. 
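+
+Which half to run can be decided mechanically. A minimal lane-selection sketch, assuming only that the presence of `/dev/kvm` is the signal the nested-KVM suite needs; the commands themselves are the ones documented in this section and just below:
+
+```bash
+# Run the publishable nested-KVM suite when KVM is exposed, otherwise fall back
+# to the TCG-safe portable regression lane.
+if [ -e /dev/kvm ]; then
+  ./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite
+else
+  nix build \
+    .#checks.x86_64-linux.canonical-profile-eval-guards \
+    .#checks.x86_64-linux.portable-control-plane-regressions
+fi
+```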
+ +Publishable nested-KVM suite: + ```bash nix develop nix run ./nix/test-cluster#cluster -- fresh-smoke +nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp +nix run ./nix/test-cluster#cluster -- fresh-matrix +./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite ``` +Project-done release proof now requires both halves of the public validation surface to be green: + +- `baremetal-iso` and `baremetal-iso-e2e` for the canonical `deployer -> installer -> nix-agent` bare-metal bootstrap path +- the KVM publishable suite (`fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`) for the nested-KVM multi-node VM-hosting path + +Canonical bare-metal bootstrap proof: + +```bash +nix run ./nix/test-cluster#cluster -- baremetal-iso +nix build .#checks.x86_64-linux.baremetal-iso-e2e +``` + +## Canonical Profiles + +UltraCloud now fixes the public support surface to three canonical profiles: + +| Profile | Primary Nix outputs | Required components | Optional components | +| --- | --- | --- | --- | +| `single-node dev` | `nix run .#single-node-quickstart`, `nixosConfigurations.single-node-quickstart`, companion install image `nixosConfigurations.netboot-all-in-one` | `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` | `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost`, `deployer` | +| `3-node HA control plane` | `nixosConfigurations.node01`, `node02`, `node03`, `netboot-control-plane` | `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node | `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` | +| `bare-metal bootstrap` | `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` | `deployer`, `first-boot-automation`, `install-target`, `nix-agent` | `netboot-control-plane`, `netboot-worker`, and `netboot-all-in-one` as experimental helper images, plus `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after bootstrap | + +`netboot-base` is an internal helper image, not a public profile. `netboot-control-plane`, `netboot-worker`, and `netboot-all-in-one` remain experimental helper images until they implement the same phone-home and install semantics as the ISO path. Older launch flows under `baremetal/vm-cluster` are `legacy/manual`, not canonical. + +## Responsibility Boundaries + +- `k8shost` owns Kubernetes-style pod and service APIs for tenant workloads, then translates them into `prismnet`, `flashdns`, and `fiberlb` objects. It does not place host-native cluster daemons. +- `fleet-scheduler` owns placement and failover of host-native service instances from declarative cluster state. It consumes `node-agent` heartbeats and writes instance placement, but it does not expose tenant-facing Kubernetes semantics. +- `deployer` owns machine enrollment, `/api/v1/phone-home`, install plans, cluster metadata, and desired-system references. It decides what a node should become, but it does not execute the host-local switch. +- `nix-agent` owns host-local NixOS convergence only. It reads desired-system state from `deployer` or `chainfire`, activates the target closure, and rolls back on failed health checks. +- `node-agent` owns host-local runtime execution only. 
It reports heartbeats and applies scheduled service-instance state, but it does not install the base OS or rewrite desired-system targets. + ## Main Entrypoints -- workspace flake: [flake.nix](/home/centra/cloud/flake.nix) -- VM validation harness: [nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md) -- shared volume notes: [coronafs/README.md](/home/centra/cloud/coronafs/README.md) -- minimal quota-service rationale: [creditservice/README.md](/home/centra/cloud/creditservice/README.md) -- archived manual VM launch scripts: [baremetal/vm-cluster/README.md](/home/centra/cloud/baremetal/vm-cluster/README.md) +- workspace flake: [flake.nix](flake.nix) +- single-node quickstart smoke: [`nix run .#single-node-quickstart`](docs/testing.md) +- portable local proof: [`nix build .#checks.x86_64-linux.portable-control-plane-regressions`](docs/testing.md) +- canonical bare-metal bootstrap smoke: [`nix run ./nix/test-cluster#cluster -- baremetal-iso`](docs/testing.md) +- canonical profile guards: [`nix build .#checks.x86_64-linux.canonical-profile-eval-guards`](docs/testing.md), [`nix build .#checks.x86_64-linux.canonical-profile-build-guards`](docs/testing.md) +- VM validation harness: [nix/test-cluster/README.md](nix/test-cluster/README.md) +- shared volume notes: [coronafs/README.md](coronafs/README.md) +- minimal quota-service rationale: [creditservice/README.md](creditservice/README.md) +- legacy/manual VM launch scripts: [baremetal/vm-cluster/README.md](baremetal/vm-cluster/README.md) ## Repository Guide -- [docs/README.md](/home/centra/cloud/docs/README.md): documentation entrypoint -- [docs/testing.md](/home/centra/cloud/docs/testing.md): validation path summary -- [docs/component-matrix.md](/home/centra/cloud/docs/component-matrix.md): supported multi-component compositions -- [docs/storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers +- [docs/README.md](docs/README.md): documentation entrypoint +- [docs/testing.md](docs/testing.md): validation path summary +- [docs/component-matrix.md](docs/component-matrix.md): canonical profiles and optional bundles +- [docs/storage-benchmarks.md](docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers - `plans/`: design notes and exploration documents ## Scope UltraCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products. -Host-level NixOS rollout validation is also expected to stay reproducible: the `deployer-vm-smoke` VM test now proves that `nix-agent` can activate a prebuilt target system closure directly, without recompiling the stack inside the guest. +Host-level NixOS rollout validation is also expected to stay reproducible: `baremetal-iso-e2e` is now the full install-path proof, `canonical-profile-eval-guards` and `canonical-profile-build-guards` fail fast when supported outputs drift, and `portable-control-plane-regressions` is the non-KVM developer lane that keeps the main control-plane and rollout boundaries green on TCG-only hosts before the publishable nested-KVM suite is rerun. diff --git a/docs/README.md b/docs/README.md index cca3840..ded7b31 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,16 +4,16 @@ This directory is the public documentation entrypoint for UltraCloud. 
## Read First -- [../README.md](/home/centra/cloud/README.md) -- [testing.md](/home/centra/cloud/docs/testing.md) -- [component-matrix.md](/home/centra/cloud/docs/component-matrix.md) -- [storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md) +- [../README.md](../README.md) +- [testing.md](testing.md) +- [component-matrix.md](component-matrix.md) +- [storage-benchmarks.md](storage-benchmarks.md) ## Key References -- VM validation harness: [../nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md) -- CoronaFS storage role: [../coronafs/README.md](/home/centra/cloud/coronafs/README.md) -- CreditService scope note: [../creditservice/README.md](/home/centra/cloud/creditservice/README.md) +- VM validation harness: [../nix/test-cluster/README.md](../nix/test-cluster/README.md) +- CoronaFS storage role: [../coronafs/README.md](../coronafs/README.md) +- CreditService scope note: [../creditservice/README.md](../creditservice/README.md) ## Design Notes diff --git a/docs/component-matrix.md b/docs/component-matrix.md index ef48b89..aa8be99 100644 --- a/docs/component-matrix.md +++ b/docs/component-matrix.md @@ -1,54 +1,56 @@ # Component Matrix -UltraCloud is intended to validate meaningful service combinations, not only a single all-on deployment. -This page summarizes the compositions that are exercised by the VM-cluster harness today. +UltraCloud now fixes the public support surface to three canonical profiles. This page defines the required and optional component bundles for each profile and keeps everything else explicitly outside the core contract. -## Validated Control Plane +## Canonical Profiles -- `chainfire + flaredb + iam` +### `single-node dev` -## Validated Network Provider Layer +- Required components: `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` +- Optional components: `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost`, `deployer` +- Primary Nix outputs: `nix run .#single-node-quickstart`, `nixosConfigurations.single-node-quickstart`, and companion install image `nixosConfigurations.netboot-all-in-one` +- Optional component toggles: `ultracloud.quickstart.enableLightningStor`, `enableCoronafs`, `enableFlashDNS`, `enableFiberLB`, `enableApiGateway`, `enableNightlight`, `enableCreditService`, `enableK8sHost` +- Primary use: one-command local bring-up, API development, and one-box VM experimentation without the HA control-plane overhead -- `prismnet` -- `prismnet + flashdns` -- `prismnet + fiberlb` -- `prismnet + flashdns + fiberlb` +### `3-node HA control plane` -These combinations justify the existence of the network services as composable providers rather than hidden internal subsystems. 
+- Required components: `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node +- Optional components: `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` +- Primary Nix outputs: `nixosConfigurations.node01`, `node02`, `node03`, `netboot-control-plane` +- Primary use: stable replicated control plane that can later accept worker, storage, and edge bundles without redefining the bootstrap path -## Validated VM Hosting Layer +### `bare-metal bootstrap` -- `plasmavmc + prismnet` -- `plasmavmc + lightningstor` -- `plasmavmc + coronafs` -- `plasmavmc + coronafs + lightningstor` -- `plasmavmc + prismnet + coronafs + lightningstor` +- Required components: `deployer`, `first-boot-automation`, `install-target`, `nix-agent` +- Optional components: `netboot-control-plane`, `netboot-worker`, and `netboot-all-in-one` as experimental helper images, plus `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after the first successful rollout +- Primary Nix outputs: `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` +- Primary use: boot the installer ISO, phone home to `deployer`, fetch the flake bundle, run Disko, reboot, and converge QEMU-emulated or real machines into either the single-node or HA profile -This split keeps mutable VM volumes on CoronaFS and immutable VM images on LightningStor object storage. +## Optional Composition Bundles -## Validated Kubernetes-Style Hosting Layer +The optional bundles below remain important, but they are layered on top of the canonical profiles rather than treated as separate top-level products: -- `k8shost + prismnet` -- `k8shost + flashdns` -- `k8shost + fiberlb` -- `k8shost + prismnet + flashdns + fiberlb` +- control-plane core: `chainfire + flaredb + iam` +- network provider bundle: `prismnet + flashdns + fiberlb` +- VM hosting bundle: `plasmavmc + prismnet + coronafs + lightningstor` +- Kubernetes-style hosting bundle: `k8shost + prismnet + flashdns + fiberlb` +- edge and tenant bundle: `apigateway + iam + nightlight + creditservice` +- native rollout bundle: `deployer + chainfire + nix-agent + fleet-scheduler + node-agent` -## Validated Edge And Tenant Services +`fresh-matrix` is the publishable composition proof because it rebuilds the host-side VM images before validating these bundles on the VM cluster. -- `apigateway + iam + prismnet` -- `nightlight + apigateway` -- `nightlight` -- `creditservice + iam + apigateway` -- `creditservice + iam` -- `deployer + iam + chainfire` +## Responsibility Boundaries -## Validation Direction +- `k8shost`: tenant workload API surface. It manages pod, deployment, and service semantics, then delegates network publication to `prismnet`, `flashdns`, and `fiberlb`. +- `fleet-scheduler`: bare-metal service placement surface. It schedules host-native service instances from declarative cluster state and `node-agent` heartbeats, without exposing Kubernetes APIs. +- `deployer`: enrollment and rollout authority. It serves `/api/v1/phone-home`, stores install plans and desired-system references, and seeds cluster metadata. +- `nix-agent`: host OS reconciler. It turns `deployer` desired-system references into `switch-to-configuration` actions plus rollback and health-check handling. +- `node-agent`: host runtime reconciler. 
It applies scheduled service-instance state, keeps runtime heartbeats fresh, and reports host-local execution status back to the scheduler. -The VM cluster harness now exposes: +The intended layering is `deployer -> nix-agent` for machine image or NixOS generation changes, and `deployer -> fleet-scheduler -> node-agent` for host-native service placement changes. `k8shost` stays separate because it is the tenant workload control plane, not the native service scheduler. -```bash -nix run ./nix/test-cluster#cluster -- matrix -nix run ./nix/test-cluster#cluster -- fresh-matrix -``` +## Non-Canonical Paths -`fresh-matrix` is the publishable path because it rebuilds the host-side VM images before validating the composed service scenarios, including PrismNet-backed PlasmaVMC guests. +- `baremetal/vm-cluster` remains `legacy/manual` +- `netboot-control-plane`, `netboot-worker`, `netboot-all-in-one`, `netboot-base`, and `pxe-server` are internal or experimental helpers, not supported profiles by themselves +- ad hoc shell-driven cluster bring-up is for debugging only and should not be presented as the canonical public path diff --git a/docs/testing.md b/docs/testing.md index fb73ad4..975d4cf 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -1,37 +1,113 @@ # Testing -UltraCloud treats VM-first validation as the canonical local proof path. +UltraCloud treats VM-first validation as the canonical local proof path and keeps the public support contract limited to three profiles. -## Canonical Validation +## Canonical Profiles + +| Profile | Primary outputs | Required components | Optional components | +| --- | --- | --- | --- | +| `single-node dev` | `nix run .#single-node-quickstart`, `nixosConfigurations.single-node-quickstart`, companion install image `nixosConfigurations.netboot-all-in-one` | `chainfire`, `flaredb`, `iam`, `plasmavmc`, `prismnet` | `lightningstor`, `coronafs`, `flashdns`, `fiberlb`, `apigateway`, `nightlight`, `creditservice`, `k8shost`, `deployer` | +| `3-node HA control plane` | `nixosConfigurations.node01`, `node02`, `node03`, `netboot-control-plane` | `chainfire`, `flaredb`, `iam`, `nix-agent` on every control-plane node, plus `deployer` on the bootstrap node | `fleet-scheduler`, `node-agent`, `prismnet`, `flashdns`, `fiberlb`, `plasmavmc`, `lightningstor`, `coronafs`, `k8shost`, `apigateway`, `nightlight`, `creditservice` | +| `bare-metal bootstrap` | `nixosConfigurations.ultracloud-iso`, `nixosConfigurations.baremetal-qemu-control-plane`, `nixosConfigurations.baremetal-qemu-worker`, `checks.x86_64-linux.baremetal-iso-e2e` | `deployer`, `first-boot-automation`, `install-target`, `nix-agent` | `netboot-control-plane`, `netboot-worker`, and `netboot-all-in-one` as experimental helper images, plus `node-agent`, `fleet-scheduler`, and higher-level storage or edge services after bootstrap | + +## Quickstart Smoke ```bash -nix run ./nix/test-cluster#cluster -- fresh-smoke +nix flake show . --all-systems | rg -n "single|all-in-one|quickstart" +nix eval --no-eval-cache .#nixosConfigurations.single-node-quickstart.config.system.build.toplevel.drvPath --raw +nix run .#single-node-quickstart ``` -This flow: +`single-node-quickstart` is the supported one-box entrypoint. It boots the minimal VM stack under QEMU, waits for `chainfire`, `flaredb`, `iam`, `prismnet`, and `plasmavmc`, and verifies their health from inside the guest. The launcher uses the generated NixOS VM runner, so it can fall back to TCG when `/dev/kvm` is absent. 
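+
+The launcher also reads a few override variables from the environment; the names below come from the quickstart script added to `flake.nix` in this change, while the concrete values are only illustrative:
+
+```bash
+# Relocate quickstart state, pick a different host SSH forward port,
+# and reuse the existing qcow2 disk between runs.
+ULTRACLOUD_QUICKSTART_STATE_DIR="$PWD/.quickstart-state" \
+ULTRACLOUD_QUICKSTART_SSH_PORT=22331 \
+ULTRACLOUD_QUICKSTART_REUSE_DISK=1 \
+nix run .#single-node-quickstart
+```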
-- builds all six VM images on the host -- boots the cluster in dependency order -- validates control-plane, worker, gateway, storage, and fault-injection behavior -- proves that `deployer` seeds scheduler-managed native services directly from declarative Nix cluster state +For debugging, keep the VM alive after the smoke passes: + +```bash +ULTRACLOUD_QUICKSTART_KEEP_VM=1 nix run .#single-node-quickstart +``` + +## Canonical Bare-Metal Proof + +```bash +nix eval --no-eval-cache .#nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel.drvPath --raw +nix eval --no-eval-cache .#nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel.drvPath --raw +nix run ./nix/test-cluster#cluster -- baremetal-iso +nix build .#checks.x86_64-linux.baremetal-iso-e2e +``` + +`baremetal-iso` is the canonical install path for QEMU-as-bare-metal validation. It boots `nixosConfigurations.ultracloud-iso`, waits for `/api/v1/phone-home`, downloads the flake bundle from `deployer`, runs Disko, reboots, confirms the first post-install boot markers, and waits for `nix-agent` to report the desired system as `active` for both `baremetal-qemu-control-plane` and `baremetal-qemu-worker`. `baremetal-iso-e2e` runs the same flow under `flake check`. + +## Regression Guards + +```bash +nix build .#checks.x86_64-linux.canonical-profile-eval-guards +nix build .#checks.x86_64-linux.canonical-profile-build-guards +``` + +These two checks are the fast fail-first drift gates for the supported surface: + +- `canonical-profile-eval-guards`: forces evaluation of every canonical profile output, including `netboot-worker` and `netboot-all-in-one`, so broken attrs fail before any long-running harness work starts. +- `canonical-profile-build-guards`: realizes the canonical VM, ISO, control-plane, and helper-image outputs so build-time drift is caught even when a cluster harness is not running. + +## Portable Local Proof + +```bash +nix build .#checks.x86_64-linux.canonical-profile-eval-guards +nix build .#checks.x86_64-linux.portable-control-plane-regressions +``` + +Use this lane on Linux hosts that do not expose `/dev/kvm`: + +- `portable-control-plane-regressions`: TCG-safe aggregate check that keeps the canonical profile eval guard, `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `deployer-vm-smoke`, and `fleet-scheduler-e2e` green together. +- It intentionally does not boot the six-node nested-KVM VM suite, so it is a developer regression path, not the publishable multi-node proof. +- CI runs `canonical-profile-eval-guards` and `portable-control-plane-regressions` on every relevant change from `.github/workflows/nix.yml`. 
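+
+When the aggregate check fails, its members can also be built one at a time to narrow the regression down to a single boundary; these are the same check names composed by `portable-control-plane-regressions` in `flake.nix`:
+
+```bash
+# Rebuild one portable-lane member at a time while debugging.
+nix build .#checks.x86_64-linux.deployer-bootstrap-e2e
+nix build .#checks.x86_64-linux.host-lifecycle-e2e
+nix build .#checks.x86_64-linux.deployer-vm-smoke
+nix build .#checks.x86_64-linux.fleet-scheduler-e2e
+```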
## Publishable Checks ```bash +nix run .#single-node-quickstart +nix run ./nix/test-cluster#cluster -- baremetal-iso nix run ./nix/test-cluster#cluster -- fresh-smoke nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp nix run ./nix/test-cluster#cluster -- fresh-matrix -nix run ./nix/test-cluster#cluster -- fresh-bench-storage +./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite +nix build .#checks.x86_64-linux.baremetal-iso-e2e nix build .#checks.x86_64-linux.deployer-vm-smoke ``` Use these commands as the release-facing local proof set: -- `fresh-smoke`: whole-cluster readiness, core behavior, and fault injection -- `fresh-demo-vm-webapp`: focused VM demo showing a web app inside the guest with FlareDB-backed state and LightningStor object snapshots surviving restart and migration -- `fresh-matrix`: composed service scenarios such as `prismnet + flashdns + fiberlb` and PrismNet-backed VM hosting bundles with `plasmavmc + coronafs + lightningstor` -- `fresh-bench-storage`: CoronaFS local-vs-shared-volume throughput, cross-worker volume visibility, and LightningStor large/small-object throughput capture -- `deployer-vm-smoke`: prebuilt NixOS system closure handoff into `nix-agent`, proving host rollout can activate a host-built target without guest-side compilation +- `single-node-quickstart`: productized one-command quickstart gate for the minimal VM platform profile +- `baremetal-iso`: canonical bare-metal bootstrap gate covering pre-install boot, phone-home, flake bundle fetch, Disko install, reboot, post-install boot, and desired-system activation on one control-plane node plus one worker-equivalent node +- `fresh-smoke`: base VM-cluster gate for the canonical multi-node topology, including readiness, core behavior, and fault injection +- `fresh-demo-vm-webapp`: optional VM-hosting bundle proof for `plasmavmc + prismnet` with state persisted through `lightningstor` +- `fresh-matrix`: optional composition proof for provider bundles such as `prismnet + flashdns + fiberlb` and `plasmavmc + coronafs + lightningstor` +- `run-publishable-kvm-suite.sh`: reproducible wrapper that captures the KVM environment and runs the full publishable nested-KVM trio in a single command +- `baremetal-iso-e2e`: flake-check wrapper around the same canonical ISO harness +- `deployer-vm-smoke`: lightweight regression proving that `nix-agent` can activate a host-built target closure without guest-side compilation + +## Responsibility Coverage + +- `baremetal-iso` and `baremetal-iso-e2e` are the canonical proof for `deployer -> installer -> nix-agent`. They cover phone-home, install-plan materialization, Disko, reboot, and desired-system activation. +- `deployer-vm-smoke` is the smallest regression for the same `deployer -> nix-agent` boundary. It proves that a node can receive a prebuilt target closure and activate it without guest-side compilation. +- `portable-control-plane-regressions` keeps the main non-KVM-safe boundaries under continuous coverage by composing `deployer-bootstrap-e2e`, `host-lifecycle-e2e`, `deployer-vm-smoke`, and `fleet-scheduler-e2e` behind the canonical profile eval guard. +- `fresh-smoke` and `fresh-matrix` are the canonical proof for `deployer -> fleet-scheduler -> node-agent`. They cover native service placement, heartbeats, failover, and runtime reconciliation. +- `fresh-smoke` also covers `k8shost` separately from `fleet-scheduler`: `k8shost` exposes tenant pod and service semantics, while `fleet-scheduler` handles bare-metal host services. 
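+
+Before starting the nested-KVM proofs described below, the host can be probed the same way the `kvm-publishable` workflow does; this is a local convenience sketch of that probe step, not an additional harness requirement:
+
+```bash
+# Confirm KVM exposure and nested-virtualization support before running the fresh-* suite.
+test -e /dev/kvm && ls -l /dev/kvm
+for p in /sys/module/kvm_intel/parameters/nested /sys/module/kvm_amd/parameters/nested; do
+  [ -f "$p" ] && echo "$p=$(cat "$p")"
+done
+```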
+ +The three `fresh-*` VM-cluster commands are the publishable nested-KVM suite. They require a Linux host with `/dev/kvm` and nested virtualization, and the harness stops at preflight by design when that device is absent. `single-node-quickstart`, `baremetal-iso`, `baremetal-iso-e2e`, `deployer-vm-smoke`, and `portable-control-plane-regressions` can run on TCG-only hosts, but they are slower without host KVM. + +Release-facing completion now requires both of these to be green on the same branch: + +- the canonical bare-metal proof: `nix run ./nix/test-cluster#cluster -- baremetal-iso` plus `nix build .#checks.x86_64-linux.baremetal-iso-e2e` +- the publishable nested-KVM suite: `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix`, preferably through `./nix/test-cluster/run-publishable-kvm-suite.sh` + +## Extended Measurements + +```bash +nix run ./nix/test-cluster#cluster -- fresh-bench-storage +``` + +`fresh-bench-storage` remains useful for storage regression tracking, but it is a benchmark path, not part of the minimal canonical publish gate. ## Operational Commands @@ -53,8 +129,11 @@ nix run ./nix/test-cluster#cluster -- clean - package unit tests are useful but not sufficient - host-built VM clusters are the main integration signal +- bootstrap and rollout paths must stay evaluable independently of the larger VM-hosting feature set - distributed storage and virtualization paths must be checked under failure, not only at steady state -## Legacy Note +## Legacy And Experimental Paths -Older manual launch scripts under `baremetal/vm-cluster` are archived only for historical reference. They are not the release-validation path. +- `baremetal/vm-cluster` manual launch scripts are `legacy/manual`, not canonical validation +- direct `nix develop ./nix/test-cluster -c ./nix/test-cluster/run-cluster.sh ...` usage is a debugging path, not the publishable entrypoint +- `netboot-control-plane`, `netboot-worker`, `netboot-all-in-one`, `netboot-base`, `pxe-server`, and other helper images are internal or experimental building blocks, not supported profiles by themselves diff --git a/flake.nix b/flake.nix index 9166f57..0292c63 100644 --- a/flake.nix +++ b/flake.nix @@ -963,6 +963,185 @@ self.packages.${system}.vmClusterDeployerState ]; }; + + single-node-quickstart-vm = + self.nixosConfigurations.single-node-quickstart.config.system.build.vm; + + single-node-quickstart = pkgs.writeShellApplication { + name = "single-node-quickstart"; + runtimeInputs = with pkgs; [ + coreutils + findutils + netcat + openssh + procps + sshpass + ]; + text = '' + set -euo pipefail + + STATE_DIR="''${ULTRACLOUD_QUICKSTART_STATE_DIR:-$HOME/.ultracloud-single-node-quickstart}" + RUN_DIR="$STATE_DIR/run" + DISK_IMAGE="$STATE_DIR/quickstart.qcow2" + PID_FILE="$STATE_DIR/qemu.pid" + SERIAL_LOG="$STATE_DIR/serial.log" + SSH_PORT="''${ULTRACLOUD_QUICKSTART_SSH_PORT:-22220}" + KEEP_VM="''${ULTRACLOUD_QUICKSTART_KEEP_VM:-0}" + REUSE_DISK="''${ULTRACLOUD_QUICKSTART_REUSE_DISK:-0}" + VM_PATH="${self.packages.${system}.single-node-quickstart-vm}" + RUN_VM="$(find "$VM_PATH/bin" -maxdepth 1 -name 'run-*-vm' | head -n1)" + + log() { + printf '[single-node-quickstart] %s\n' "$*" + } + + dump_serial() { + if [ -f "$SERIAL_LOG" ]; then + log "serial log tail:" + tail -n 120 "$SERIAL_LOG" >&2 || true + fi + } + + cleanup() { + if [ -f "$PID_FILE" ]; then + pid="$(cat "$PID_FILE")" + if kill -0 "$pid" >/dev/null 2>&1; then + kill "$pid" >/dev/null 2>&1 || true + for _ in $(seq 1 30); do + if ! 
kill -0 "$pid" >/dev/null 2>&1; then + break + fi + sleep 1 + done + fi + rm -f "$PID_FILE" + fi + } + + on_exit() { + status="$?" + if [ "$status" -ne 0 ]; then + dump_serial + fi + if [ "$KEEP_VM" != "1" ]; then + cleanup + fi + exit "$status" + } + + wait_for_ssh() { + local deadline=$((SECONDS + 240)) + while true; do + if sshpass -p ultracloud ssh \ + -F /dev/null \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o LogLevel=ERROR \ + -o ConnectTimeout=5 \ + -o ConnectionAttempts=1 \ + -p "$SSH_PORT" \ + root@127.0.0.1 true >/dev/null 2>&1; then + return 0 + fi + if [ "$SECONDS" -ge "$deadline" ]; then + log "timed out waiting for SSH on port $SSH_PORT" + return 1 + fi + sleep 1 + done + } + + wait_for_unit_active() { + local unit="$1" + local deadline=$((SECONDS + 240)) + while true; do + if ssh_cmd systemctl is-active "$unit" >/dev/null 2>&1; then + return 0 + fi + if [ "$SECONDS" -ge "$deadline" ]; then + log "timed out waiting for $unit" + ssh_cmd systemctl status "$unit" --no-pager || true + return 1 + fi + sleep 1 + done + } + + ssh_cmd() { + sshpass -p ultracloud ssh \ + -F /dev/null \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o LogLevel=ERROR \ + -o ConnectTimeout=5 \ + -o ConnectionAttempts=1 \ + -p "$SSH_PORT" \ + root@127.0.0.1 -- "$@" + } + + ssh_shell() { + local script="$1" + local quoted + printf -v quoted '%q' "$script" + sshpass -p ultracloud ssh \ + -F /dev/null \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o LogLevel=ERROR \ + -o ConnectTimeout=5 \ + -o ConnectionAttempts=1 \ + -p "$SSH_PORT" \ + root@127.0.0.1 "bash -lc $quoted" + } + + trap on_exit EXIT + + [ -n "$RUN_VM" ] || { + log "failed to locate run-*-vm under $VM_PATH/bin" + exit 1 + } + + mkdir -p "$STATE_DIR" + rm -rf "$RUN_DIR" + mkdir -p "$RUN_DIR" + rm -f "$SERIAL_LOG" + if [ "$REUSE_DISK" != "1" ]; then + rm -f "$DISK_IMAGE" + fi + + cleanup + + log "launching single-node quickstart VM" + nohup env \ + USE_TMPDIR=1 \ + TMPDIR="$RUN_DIR" \ + NIX_DISK_IMAGE="$DISK_IMAGE" \ + QEMU_NET_OPTS="hostfwd=tcp:127.0.0.1:$SSH_PORT-:22" \ + "$RUN_VM" >"$SERIAL_LOG" 2>&1 & + echo "$!" 
> "$PID_FILE" + + log "waiting for guest SSH" + wait_for_ssh + + log "waiting for in-guest readiness gate" + wait_for_unit_active ultracloud-single-node-quickstart-ready.service + + log "verifying required services" + ssh_cmd systemctl is-active chainfire flaredb iam prismnet plasmavmc >/dev/null + + log "verifying service health endpoints and VM runtime prerequisites" + ssh_shell 'curl -fsS http://127.0.0.1:8081/health >/dev/null && curl -fsS http://127.0.0.1:8082/health >/dev/null && curl -fsS http://127.0.0.1:8083/health >/dev/null && curl -fsS http://127.0.0.1:8087/health >/dev/null && curl -fsS http://127.0.0.1:8084/health >/dev/null && test -x /run/current-system/sw/bin/qemu-system-x86_64 && test -x /run/current-system/sw/bin/qemu-img && test -c /dev/net/tun' + + log "single-node quickstart smoke passed" + + if [ "$KEEP_VM" = "1" ]; then + trap - EXIT + log "VM left running" + log "ssh: sshpass -p ultracloud ssh -p $SSH_PORT root@127.0.0.1" + exit 0 + fi + ''; + }; }; # ====================================================================== @@ -1044,9 +1223,75 @@ fleet-scheduler = flake-utils.lib.mkApp { drv = self.packages.${system}.fleet-scheduler; }; + + single-node-quickstart = flake-utils.lib.mkApp { + drv = self.packages.${system}.single-node-quickstart; + }; + + all-in-one-quickstart = flake-utils.lib.mkApp { + drv = self.packages.${system}.single-node-quickstart; + }; }; - checks = { + checks = + let + stripKvmRequiredSystemFeature = drv: + drv.overrideTestDerivation (old: { + requiredSystemFeatures = + builtins.filter (feature: feature != "kvm") (old.requiredSystemFeatures or [ ]); + }); + + canonicalProfileEvalData = { + single-node-quickstart = { + hostName = self.nixosConfigurations.single-node-quickstart.config.networking.hostName; + stateVersion = + self.nixosConfigurations.single-node-quickstart.config.system.stateVersion; + }; + node01 = { + hostName = self.nixosConfigurations.node01.config.networking.hostName; + stateVersion = self.nixosConfigurations.node01.config.system.stateVersion; + }; + node02 = { + hostName = self.nixosConfigurations.node02.config.networking.hostName; + stateVersion = self.nixosConfigurations.node02.config.system.stateVersion; + }; + node03 = { + hostName = self.nixosConfigurations.node03.config.networking.hostName; + stateVersion = self.nixosConfigurations.node03.config.system.stateVersion; + }; + netboot-control-plane = { + hostName = self.nixosConfigurations.netboot-control-plane.config.networking.hostName; + stateVersion = + self.nixosConfigurations.netboot-control-plane.config.system.stateVersion; + }; + netboot-worker = { + hostName = self.nixosConfigurations.netboot-worker.config.networking.hostName; + stateVersion = + self.nixosConfigurations.netboot-worker.config.system.stateVersion; + }; + netboot-all-in-one = { + hostName = self.nixosConfigurations.netboot-all-in-one.config.networking.hostName; + stateVersion = + self.nixosConfigurations.netboot-all-in-one.config.system.stateVersion; + }; + ultracloud-iso = { + hostName = self.nixosConfigurations.ultracloud-iso.config.networking.hostName; + imageFileName = self.nixosConfigurations.ultracloud-iso.config.image.fileName; + }; + baremetal-qemu-control-plane = { + hostName = + self.nixosConfigurations.baremetal-qemu-control-plane.config.networking.hostName; + stateVersion = + self.nixosConfigurations.baremetal-qemu-control-plane.config.system.stateVersion; + }; + baremetal-qemu-worker = { + hostName = self.nixosConfigurations.baremetal-qemu-worker.config.networking.hostName; + 
stateVersion = + self.nixosConfigurations.baremetal-qemu-worker.config.system.stateVersion; + }; + }; + in + { workspace-source-roots-audit = pkgs.runCommand "workspace-source-roots-audit" { nativeBuildInputs = [ pkgs.python3 ]; @@ -1169,6 +1414,76 @@ touch "$out" ''; + canonical-profile-eval-guards = pkgs.writeText "canonical-profile-eval-guards.json" + (builtins.toJSON canonicalProfileEvalData); + + canonical-profile-build-guards = pkgs.linkFarm "canonical-profile-build-guards" [ + { + name = "single-node-quickstart-vm"; + path = self.packages.${system}.single-node-quickstart-vm; + } + { + name = "node01-toplevel"; + path = self.nixosConfigurations.node01.config.system.build.toplevel; + } + { + name = "node02-toplevel"; + path = self.nixosConfigurations.node02.config.system.build.toplevel; + } + { + name = "node03-toplevel"; + path = self.nixosConfigurations.node03.config.system.build.toplevel; + } + { + name = "netboot-control-plane-toplevel"; + path = self.nixosConfigurations.netboot-control-plane.config.system.build.toplevel; + } + { + name = "netboot-worker-toplevel"; + path = self.nixosConfigurations.netboot-worker.config.system.build.toplevel; + } + { + name = "netboot-all-in-one-toplevel"; + path = self.nixosConfigurations.netboot-all-in-one.config.system.build.toplevel; + } + { + name = "ultracloud-iso-image"; + path = self.nixosConfigurations.ultracloud-iso.config.system.build.isoImage; + } + { + name = "baremetal-qemu-control-plane-toplevel"; + path = self.nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel; + } + { + name = "baremetal-qemu-worker-toplevel"; + path = self.nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel; + } + ]; + + portable-control-plane-regressions = + pkgs.linkFarm "portable-control-plane-regressions" [ + { + name = "canonical-profile-eval-guards"; + path = self.checks.${system}.canonical-profile-eval-guards; + } + { + name = "deployer-bootstrap-e2e"; + path = self.checks.${system}.deployer-bootstrap-e2e; + } + { + name = "host-lifecycle-e2e"; + path = self.checks.${system}.host-lifecycle-e2e; + } + { + name = "deployer-vm-smoke"; + path = self.checks.${system}.deployer-vm-smoke; + } + { + name = "fleet-scheduler-e2e"; + path = self.checks.${system}.fleet-scheduler-e2e; + } + ]; + first-boot-topology-vm-smoke = pkgs.testers.runNixOSTest ( import ./nix/tests/first-boot-topology-vm-smoke.nix { inherit pkgs; @@ -1177,15 +1492,15 @@ } ); - deployer-vm-smoke = pkgs.testers.runNixOSTest ( + deployer-vm-smoke = stripKvmRequiredSystemFeature (pkgs.testers.runNixOSTest ( import ./nix/tests/deployer-vm-smoke.nix { inherit pkgs; ultracloudPackages = self.packages.${system}; smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel; } - ); + )); - deployer-vm-rollback = pkgs.testers.runNixOSTest ( + deployer-vm-rollback = stripKvmRequiredSystemFeature (pkgs.testers.runNixOSTest ( import ./nix/tests/deployer-vm-smoke.nix { inherit pkgs; ultracloudPackages = self.packages.${system}; @@ -1198,7 +1513,83 @@ expectCurrentSystemMatchesTarget = false; expectMarkerPresent = false; } - ); + )); + + baremetal-iso-e2e = pkgs.runCommand "baremetal-iso-e2e" + { + nativeBuildInputs = with pkgs; [ + bash + coreutils + curl + findutils + gawk + gnugrep + gnused + iproute2 + jq + nix + openssh + procps + python3 + qemu + ]; + preferLocalBuild = true; + allowSubstitutes = false; + ULTRACLOUD_BAREMETAL_ISO_IMAGE = + "${self.nixosConfigurations.ultracloud-iso.config.system.build.isoImage}"; + 
ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE = + "${self.packages.${system}.ultracloudFlakeBundle}"; + ULTRACLOUD_BAREMETAL_CONTROL_TARGET = + "${self.nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel}"; + ULTRACLOUD_BAREMETAL_WORKER_TARGET = + "${self.nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel}"; + ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT = + "${self.nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount}"; + ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT = + "${self.nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount}"; + ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION = "${pkgs.closureInfo { + rootPaths = [ + self.nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel + self.nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel + self.nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount + self.nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount + ]; + }}"; + ULTRACLOUD_CHAINFIRE_SERVER_BIN = + "${self.packages.${system}.chainfire-server}/bin/chainfire"; + ULTRACLOUD_DEPLOYER_SERVER_BIN = + "${self.packages.${system}.deployer-server}/bin/deployer-server"; + ULTRACLOUD_DEPLOYER_CTL_BIN = + "${self.packages.${system}.deployer-ctl}/bin/deployer-ctl"; + ULTRACLOUD_OVMF_CODE = "${pkgs.OVMF.fd}/FV/OVMF_CODE.fd"; + ULTRACLOUD_OVMF_VARS = "${pkgs.OVMF.fd}/FV/OVMF_VARS.fd"; + ULTRACLOUD_QEMU_BIN = "${pkgs.qemu}/bin/qemu-system-x86_64"; + ULTRACLOUD_QEMU_IMG_BIN = "${pkgs.qemu}/bin/qemu-img"; + ULTRACLOUD_REPO_ROOT = "${self}"; + NIX_CONFIG = "experimental-features = nix-command flakes"; + } '' + export HOME="$TMPDIR/home" + mkdir -p "$HOME" + export NIX_CONFIG="$NIX_CONFIG" + export PATH="${pkgs.lib.makeBinPath [ + pkgs.bash + pkgs.coreutils + pkgs.curl + pkgs.findutils + pkgs.gawk + pkgs.gnugrep + pkgs.gnused + pkgs.iproute2 + pkgs.jq + pkgs.nix + pkgs.openssh + pkgs.procps + pkgs.python3 + pkgs.qemu + ]}" + bash ${./nix/test-cluster/verify-baremetal-iso.sh} + touch "$out" + ''; fiberlb-native-bgp-vm-smoke = pkgs.testers.runNixOSTest ( import ./nix/tests/fiberlb-native-bgp-vm-smoke.nix { @@ -1363,6 +1754,9 @@ nixosConfigurations = let vmClusterLib = import ./nix/nodes/vm-cluster/lib.nix { lib = nixpkgs.lib; }; + overlayModule = { + nixpkgs.overlays = [ self.overlays.default ]; + }; mkVmClusterSystem = nodeName: nixpkgs.lib.nixosSystem { system = "x86_64-linux"; @@ -1382,25 +1776,74 @@ # Control Plane netboot image (all 8 services) netboot-control-plane = nixpkgs.lib.nixosSystem { system = "x86_64-linux"; - modules = [ ./nix/images/netboot-control-plane.nix ]; + modules = [ + ./nix/images/netboot-control-plane.nix + overlayModule + ]; }; # Worker netboot image (compute-focused services) netboot-worker = nixpkgs.lib.nixosSystem { system = "x86_64-linux"; - modules = [ ./nix/images/netboot-worker.nix ]; + modules = [ + ./nix/images/netboot-worker.nix + overlayModule + ]; }; # All-in-One netboot image (single-node deployment) netboot-all-in-one = nixpkgs.lib.nixosSystem { system = "x86_64-linux"; - modules = [ ./nix/images/netboot-all-in-one.nix ]; + modules = [ + ./nix/images/netboot-all-in-one.nix + overlayModule + ]; + }; + + # QEMU-first single-node quickstart for one-command local bring-up. 
+ single-node-quickstart = nixpkgs.lib.nixosSystem { + system = "x86_64-linux"; + modules = [ + ./nix/single-node/qemu-vm.nix + ./nix/single-node/base.nix + self.nixosModules.default + overlayModule + { + ultracloud.quickstart.enable = true; + } + ]; + }; + + # Canonical bare-metal ISO install targets used by the QEMU proof path. + baremetal-qemu-control-plane = nixpkgs.lib.nixosSystem { + system = "x86_64-linux"; + modules = [ + disko.nixosModules.disko + ./nix/nodes/baremetal-qemu/control-plane/configuration.nix + ./nix/nodes/baremetal-qemu/control-plane/disko.nix + self.nixosModules.default + overlayModule + ]; + }; + + baremetal-qemu-worker = nixpkgs.lib.nixosSystem { + system = "x86_64-linux"; + modules = [ + disko.nixosModules.disko + ./nix/nodes/baremetal-qemu/worker/configuration.nix + ./nix/nodes/baremetal-qemu/worker/disko.nix + self.nixosModules.default + overlayModule + ]; }; # Base netboot image (minimal, for VM testing and provisioning) netboot-base = nixpkgs.lib.nixosSystem { system = "x86_64-linux"; - modules = [ ./nix/images/netboot-base.nix ]; + modules = [ + ./nix/images/netboot-base.nix + overlayModule + ]; }; # Offline-friendly target used by deployer VM smoke tests. @@ -1412,6 +1855,20 @@ # UltraCloud ISO (T061.S5 - bootable ISO with cluster-config embedding) ultracloud-iso = nixpkgs.lib.nixosSystem { system = "x86_64-linux"; + specialArgs = { + ultracloudBaremetalFormatMountPaths = { + baremetal-qemu-control-plane = + self.nixosConfigurations."baremetal-qemu-control-plane".config.system.build.formatMount; + baremetal-qemu-worker = + self.nixosConfigurations."baremetal-qemu-worker".config.system.build.formatMount; + }; + ultracloudBaremetalSystemPaths = { + baremetal-qemu-control-plane = + self.nixosConfigurations."baremetal-qemu-control-plane".config.system.build.toplevel; + baremetal-qemu-worker = + self.nixosConfigurations."baremetal-qemu-worker".config.system.build.toplevel; + }; + }; modules = [ ./nix/iso/ultracloud-iso.nix self.nixosModules.default @@ -1455,13 +1912,13 @@ apigateway-server = self.packages.${final.system}.apigateway-server; k8shost-server = self.packages.${final.system}.k8shost-server; deployer-workspace = self.packages.${final.system}.deployer-workspace; - deployer-server = self.packages.${final.system}.deployer-workspace; - deployer-ctl = self.packages.${final.system}.deployer-workspace; - ultracloud-reconciler = self.packages.${final.system}.deployer-workspace; + deployer-server = self.packages.${final.system}.deployer-server; + deployer-ctl = self.packages.${final.system}.deployer-ctl; + ultracloud-reconciler = self.packages.${final.system}.ultracloud-reconciler; ultracloudFlakeBundle = self.packages.${final.system}.ultracloudFlakeBundle; - nix-agent = self.packages.${final.system}.deployer-workspace; - node-agent = self.packages.${final.system}.deployer-workspace; - fleet-scheduler = self.packages.${final.system}.deployer-workspace; + nix-agent = self.packages.${final.system}.nix-agent; + node-agent = self.packages.${final.system}.node-agent; + fleet-scheduler = self.packages.${final.system}.fleet-scheduler; }; }; } diff --git a/nix/ci/flake.nix b/nix/ci/flake.nix index d797f50..0ba7557 100644 --- a/nix/ci/flake.nix +++ b/nix/ci/flake.nix @@ -379,6 +379,9 @@ ${gate}/bin/ultracloud-gate --tier 0 --no-logs touch $out/ok ''; + checks.canonical-profile-eval-guards = ultracloud.checks.${system}.canonical-profile-eval-guards; + checks.portable-control-plane-regressions = + ultracloud.checks.${system}.portable-control-plane-regressions; 
checks.deployer-vm-smoke = ultracloud.checks.${system}.deployer-vm-smoke; checks.deployer-vm-rollback = ultracloud.checks.${system}.deployer-vm-rollback; checks.deployer-bootstrap-e2e = ultracloud.checks.${system}.deployer-bootstrap-e2e; diff --git a/nix/images/netboot-all-in-one.nix b/nix/images/netboot-all-in-one.nix index b829e3d..919af2a 100644 --- a/nix/images/netboot-all-in-one.nix +++ b/nix/images/netboot-all-in-one.nix @@ -3,220 +3,113 @@ { imports = [ ./netboot-base.nix - ../modules # Import UltraCloud service modules + ../modules ]; # ============================================================================ - # ALL-IN-ONE PROFILE + # SINGLE-NODE / ALL-IN-ONE INSTALL IMAGE # ============================================================================ - # This profile includes all 8 UltraCloud services for a single-node deployment: - # - Chainfire: Distributed configuration and coordination - # - FlareDB: Time-series metrics and events database - # - IAM: Identity and access management - # - PlasmaVMC: Virtual machine control plane - # - PrismNET: Software-defined networking controller - # - FlashDNS: High-performance DNS server - # - FiberLB: Layer 4/7 load balancer - # - LightningStor: Distributed block storage - # - K8sHost: Kubernetes hosting component + # This netboot image is the bare-metal companion to the QEMU-first + # `single-node-quickstart` profile. It keeps only the minimum VM stack in the + # image by default and leaves DNS, load-balancing, storage, API, metrics, and + # Kubernetes layers as explicit add-ons in the final installed system. # - # This profile is optimized for: - # - Development/testing environments - # - Small deployments (1-3 nodes) - # - Edge locations with limited infrastructure - # - Proof-of-concept installations + # Included by default: + # - Chainfire: local coordination and placement metadata + # - FlareDB: metadata/event storage + # - IAM: local identity plane for the dev profile + # - PrismNET: VM networking control plane + # - PlasmaVMC: VM control plane # - # Services are DISABLED by default in the netboot image. - # They will be enabled in the final installed system configuration. 
+ # Optional after install: + # - LightningStor, CoronaFS + # - FlashDNS, FiberLB + # - API Gateway, Nightlight, CreditService + # - K8sHost # ============================================================================ - # ============================================================================ - # SERVICE PACKAGE AVAILABILITY - # ============================================================================ - # Make all service packages available in the netboot image environment.systemPackages = with pkgs; [ - # Core services chainfire-server flaredb-server iam-server - - # Compute and networking - plasmavmc-server prismnet-server - - # Network services - flashdns-server - fiberlb-server - - # Storage - lightningstor-server - - # Container orchestration - k8shost-server - - # Additional tools for all-in-one deployment - qemu # For running VMs - libvirt # Virtualization management - bridge-utils # Network bridge configuration - openvswitch # Software-defined networking + plasmavmc-server + qemu + libvirt + bridge-utils + openvswitch + curl + jq ]; - # ============================================================================ - # CHAINFIRE CONFIGURATION (DISABLED) - # ============================================================================ services.chainfire = { enable = lib.mkDefault false; port = 2379; raftPort = 2380; gossipPort = 2381; + httpPort = 8081; }; - # ============================================================================ - # FLAREDB CONFIGURATION (DISABLED) - # ============================================================================ services.flaredb = { enable = lib.mkDefault false; port = 2479; raftPort = 2480; + httpPort = 8082; }; - # ============================================================================ - # IAM CONFIGURATION (DISABLED) - # ============================================================================ services.iam = { enable = lib.mkDefault false; - port = 8080; + port = 50080; + httpPort = 8083; }; - # ============================================================================ - # PLASMAVMC CONFIGURATION (DISABLED) - # ============================================================================ - services.plasmavmc = { - enable = lib.mkDefault false; - port = 8081; - }; - - # ============================================================================ - # PRISMNET CONFIGURATION (DISABLED) - # ============================================================================ services.prismnet = { enable = lib.mkDefault false; - port = 8082; + port = 50081; + httpPort = 8087; }; - # ============================================================================ - # FLASHDNS CONFIGURATION (DISABLED) - # ============================================================================ - services.flashdns = { + services.plasmavmc = { enable = lib.mkDefault false; - port = 53; + port = 50082; + httpPort = 8084; }; - # ============================================================================ - # FIBERLB CONFIGURATION (DISABLED) - # ============================================================================ - services.fiberlb = { - enable = lib.mkDefault false; - port = 8083; - }; - - # ============================================================================ - # LIGHTNINGSTOR CONFIGURATION (DISABLED) - # ============================================================================ - services.lightningstor = { - enable = lib.mkDefault false; - port = 8084; - }; - - # ============================================================================ - # 
K8SHOST CONFIGURATION (DISABLED) - # ============================================================================ - services.k8shost = { - enable = lib.mkDefault false; - port = 8085; - }; - - # ============================================================================ - # VIRTUALIZATION SUPPORT - # ============================================================================ - # Enable KVM virtualization - boot.kernelModules = [ "kvm-intel" "kvm-amd" ]; - - # Enable nested virtualization + boot.kernelModules = [ "kvm-intel" "kvm-amd" "tun" ]; boot.extraModprobeConfig = '' options kvm_intel nested=1 options kvm_amd nested=1 ''; - # ============================================================================ - # NETWORKING CONFIGURATION - # ============================================================================ - # Enable Open vSwitch for SDN networking.vswitches = lib.mkDefault {}; - # Open firewall ports for all services networking.firewall.allowedTCPPorts = [ - # Chainfire - 2379 # API - 2380 # Raft - 2381 # Gossip - - # FlareDB - 2479 # API - 2480 # Raft - - # IAM - 8080 - - # PlasmaVMC + 22 + 2379 + 2380 + 2381 + 2479 + 2480 + 50080 + 50081 + 50082 8081 - - # PrismNET 8082 - - # FlashDNS - 53 - - # FiberLB 8083 - - # LightningStor 8084 - - # K8sHost - 8085 - - # QEMU/LibVirt - 16509 # libvirtd - 5900 # VNC (for VM console access) + 8087 + 16509 + 5900 ]; networking.firewall.allowedUDPPorts = [ - # FlashDNS - 53 - - # Chainfire gossip 2381 - - # VXLAN for overlay networking 4789 ]; - # ============================================================================ - # STORAGE CONFIGURATION - # ============================================================================ - # Enable LVM for flexible storage management services.lvm.enable = true; - - # Enable ZFS if needed boot.supportedFilesystems = [ "ext4" "xfs" "btrfs" "zfs" ]; - # ============================================================================ - # RESOURCE LIMITS (BALANCED FOR ALL-IN-ONE) - # ============================================================================ - # Balance resources between services on a single node - # These are minimal limits for netboot; adjust in final config based on hardware - systemd.services.chainfire.serviceConfig = lib.mkIf config.services.chainfire.enable { MemoryMax = "1G"; CPUQuota = "100%"; @@ -242,26 +135,13 @@ CPUQuota = "50%"; }; - # ============================================================================ - # PERFORMANCE TUNING - # ============================================================================ - # Optimize for mixed workload (services + VMs) boot.kernel.sysctl = { - # Increase max number of open files "fs.file-max" = 1000000; - - # Increase network buffer sizes + "net.core.netdev_max_backlog" = 5000; "net.core.rmem_max" = 134217728; "net.core.wmem_max" = 134217728; - - # Enable IP forwarding for VM networking "net.ipv4.ip_forward" = 1; "net.ipv6.conf.all.forwarding" = 1; - - # Optimize for high-performance networking - "net.core.netdev_max_backlog" = 5000; - - # Swappiness for server workloads "vm.swappiness" = 10; }; } diff --git a/nix/iso/ultracloud-iso.nix b/nix/iso/ultracloud-iso.nix index 0c35c1f..33397a3 100644 --- a/nix/iso/ultracloud-iso.nix +++ b/nix/iso/ultracloud-iso.nix @@ -2,7 +2,15 @@ # Minimal ISO with DHCP + Phone Home to Deployer + Auto-Install # For VM cluster deployment: boots, phones home, partitions disk, installs NixOS -{ config, lib, pkgs, modulesPath, ... 
}: +{ + config, + lib, + pkgs, + modulesPath, + ultracloudBaremetalFormatMountPaths ? { }, + ultracloudBaremetalSystemPaths ? { }, + ... +}: { imports = [ @@ -58,16 +66,34 @@ return 1 } + dmi_value() { + local path="$1" + if [ -r "$path" ]; then + tr -d '\n' <"$path" 2>/dev/null || true + fi + } + + resolve_deployer_url() { + local explicit_url="''${DEPLOYER_URL:-}" + if [ -z "$explicit_url" ]; then + explicit_url="$(cmdline_value ultracloud.deployer_url || true)" + fi + if [ -n "$explicit_url" ]; then + echo "$explicit_url" + return 0 + fi + if ${pkgs.curl}/bin/curl -fsS --connect-timeout 2 --max-time 5 \ + http://10.0.2.2:8088/health >/dev/null 2>&1; then + echo "http://10.0.2.2:8088" + return 0 + fi + echo "http://192.168.100.1:8080" + } + mkdir -p /etc/ultracloud # Discover Deployer via environment, kernel cmdline, or fallback. - DEPLOYER_URL="''${DEPLOYER_URL:-}" - if [ -z "$DEPLOYER_URL" ]; then - DEPLOYER_URL="$(cmdline_value ultracloud.deployer_url || true)" - fi - if [ -z "$DEPLOYER_URL" ]; then - DEPLOYER_URL="http://192.168.100.1:8080" - fi + DEPLOYER_URL="$(resolve_deployer_url)" # Get machine identity MACHINE_ID=$(cat /etc/machine-id) @@ -113,7 +139,24 @@ if [ -z "$NODE_IP" ]; then NODE_IP=$(hostname -I 2>/dev/null | ${pkgs.gawk}/bin/awk '{print $1}') fi - NODE_HOSTNAME=$(hostname) + REQUESTED_NODE_ID="''${ULTRACLOUD_NODE_ID:-}" + if [ -z "$REQUESTED_NODE_ID" ]; then + REQUESTED_NODE_ID="$(cmdline_value ultracloud.node_id || true)" + fi + if [ -z "$REQUESTED_NODE_ID" ]; then + REQUESTED_NODE_ID="$(dmi_value /sys/class/dmi/id/product_serial)" + fi + if [ -z "$REQUESTED_NODE_ID" ]; then + REQUESTED_NODE_ID="$(hostname)" + fi + REQUESTED_HOSTNAME="''${ULTRACLOUD_HOSTNAME:-}" + if [ -z "$REQUESTED_HOSTNAME" ]; then + REQUESTED_HOSTNAME="$(cmdline_value ultracloud.hostname || true)" + fi + if [ -z "$REQUESTED_HOSTNAME" ]; then + REQUESTED_HOSTNAME="$REQUESTED_NODE_ID" + fi + echo "ULTRACLOUD_MARKER pre-install.boot.$REQUESTED_NODE_ID" CPU_MODEL=$(${pkgs.gawk}/bin/awk -F: '/model name/ {gsub(/^[ \t]+/, "", $2); print $2; exit}' /proc/cpuinfo 2>/dev/null || true) CPU_CORES=$(${pkgs.gawk}/bin/awk '/^cpu cores/ {print $4; exit}' /proc/cpuinfo 2>/dev/null || true) CPU_THREADS=$(${pkgs.coreutils}/bin/nproc --all 2>/dev/null || true) @@ -172,8 +215,8 @@ ') REQUEST_JSON=$(${pkgs.jq}/bin/jq -n \ --arg machine_id "$MACHINE_ID" \ - --arg node_id "$NODE_HOSTNAME" \ - --arg hostname "$NODE_HOSTNAME" \ + --arg node_id "$REQUESTED_NODE_ID" \ + --arg hostname "$REQUESTED_HOSTNAME" \ --arg ip "$NODE_IP" \ --argjson hardware_facts "$HARDWARE_FACTS" ' { @@ -253,6 +296,7 @@ # Signal success NODE_ID=$(echo "$RESPONSE" | ${pkgs.jq}/bin/jq -r '.node_config.assignment.node_id // "unknown"') + echo "ULTRACLOUD_MARKER pre-install.phone-home.complete.$NODE_ID" echo "✓ Bootstrap complete: $NODE_ID" exit 0 else @@ -282,6 +326,7 @@ script = '' set -euo pipefail + export PATH="${pkgs.nix}/bin:${config.system.build.nixos-install}/bin:$PATH" cmdline_value() { local key="$1" @@ -297,6 +342,40 @@ return 1 } + resolve_deployer_url() { + local explicit_url="''${DEPLOYER_URL:-}" + if [ -z "$explicit_url" ]; then + explicit_url="$(cmdline_value ultracloud.deployer_url || true)" + fi + if [ -n "$explicit_url" ]; then + echo "$explicit_url" + return 0 + fi + if ${pkgs.curl}/bin/curl -fsS --connect-timeout 2 --max-time 5 \ + http://10.0.2.2:8088/health >/dev/null 2>&1; then + echo "http://10.0.2.2:8088" + return 0 + fi + echo "http://192.168.100.1:8080" + } + + resolve_binary_cache_url() { + local 
explicit_url="''${ULTRACLOUD_BINARY_CACHE_URL:-}" + if [ -z "$explicit_url" ]; then + explicit_url="$(cmdline_value ultracloud.binary_cache_url || true)" + fi + if [ -n "$explicit_url" ]; then + echo "$explicit_url" + return 0 + fi + if ${pkgs.curl}/bin/curl -fsS --connect-timeout 2 --max-time 5 \ + http://10.0.2.2:8090/nix-cache-info >/dev/null 2>&1; then + echo "http://10.0.2.2:8090" + return 0 + fi + return 1 + } + if [ ! -s /etc/ultracloud/node-config.json ]; then echo "ERROR: node-config.json missing (bootstrap not complete?)" exit 1 @@ -305,16 +384,17 @@ NODE_ID=$(${pkgs.jq}/bin/jq -r '.assignment.hostname // .assignment.node_id // empty' /etc/ultracloud/node-config.json) NODE_IP=$(${pkgs.jq}/bin/jq -r '.assignment.ip // empty' /etc/ultracloud/node-config.json) NIXOS_CONFIGURATION=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.nixos_configuration // .assignment.hostname // empty' /etc/ultracloud/node-config.json) - DISKO_PATH=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.disko_config_path // empty' /etc/ultracloud/node-config.json) + DISKO_SCRIPT_PATH=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.disko_script_path // empty' /etc/ultracloud/node-config.json) + if [ -z "$DISKO_SCRIPT_PATH" ] && [ -r /etc/ultracloud/disko-script-paths.json ]; then + DISKO_SCRIPT_PATH=$(${pkgs.jq}/bin/jq -r --arg cfg "$NIXOS_CONFIGURATION" '.[$cfg] // empty' /etc/ultracloud/disko-script-paths.json) + fi + TARGET_SYSTEM_PATH=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.target_system_path // empty' /etc/ultracloud/node-config.json) + if [ -z "$TARGET_SYSTEM_PATH" ] && [ -r /etc/ultracloud/system-paths.json ]; then + TARGET_SYSTEM_PATH=$(${pkgs.jq}/bin/jq -r --arg cfg "$NIXOS_CONFIGURATION" '.[$cfg] // empty' /etc/ultracloud/system-paths.json) + fi TARGET_DISK=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.target_disk // empty' /etc/ultracloud/node-config.json) TARGET_DISK_BY_ID=$(${pkgs.jq}/bin/jq -r '.bootstrap_plan.install_plan.target_disk_by_id // empty' /etc/ultracloud/node-config.json) - DEPLOYER_URL="''${DEPLOYER_URL:-}" - if [ -z "$DEPLOYER_URL" ]; then - DEPLOYER_URL="$(cmdline_value ultracloud.deployer_url || true)" - fi - if [ -z "$DEPLOYER_URL" ]; then - DEPLOYER_URL="http://192.168.100.1:8080" - fi + DEPLOYER_URL="$(resolve_deployer_url)" SRC_ROOT="/opt/ultracloud-src" if [ -z "$NODE_ID" ] || [ -z "$NODE_IP" ]; then @@ -362,6 +442,7 @@ "$DEPLOYER_URL/api/v1/bootstrap/flake-bundle" \ -o "$BUNDLE_PATH"; then echo "Downloaded bootstrap flake bundle from deployer" + echo "ULTRACLOUD_MARKER install.bundle-downloaded.$NODE_ID" rm -rf "$SRC_ROOT" mkdir -p "$SRC_ROOT" ${pkgs.gzip}/bin/gzip -dc "$BUNDLE_PATH" | ${pkgs.gnutar}/bin/tar -xf - -C "$SRC_ROOT" @@ -369,24 +450,12 @@ echo "No deployer flake bundle available; using embedded source tree" fi - if [ -z "$DISKO_PATH" ]; then - CANDIDATE_DISKO="nix/nodes/vm-cluster/$NODE_ID/disko.nix" - if [ -f "$SRC_ROOT/$CANDIDATE_DISKO" ]; then - DISKO_PATH="$CANDIDATE_DISKO" - fi + echo "ULTRACLOUD_MARKER install.start.$NODE_ID" + DISPLAY_TARGET_DISK="$TARGET_DISK" + if [ -n "$TARGET_DISK_BY_ID" ]; then + DISPLAY_TARGET_DISK="$TARGET_DISK_BY_ID" fi - - if [ -z "$DISKO_PATH" ]; then - echo "ERROR: node-config.json missing install_plan.disko_config_path and no default Disko path exists for $NODE_ID" - exit 1 - fi - - if [ ! 
-f "$SRC_ROOT/$DISKO_PATH" ]; then - echo "ERROR: Disko config not found: $SRC_ROOT/$DISKO_PATH" - exit 1 - fi - - echo "UltraCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, disko_path=$DISKO_PATH)" + echo "UltraCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, target_disk=$DISPLAY_TARGET_DISK)" # Resolve installation target disk. if [ -n "$TARGET_DISK_BY_ID" ]; then @@ -423,50 +492,99 @@ umount /mnt || true fi - echo "Validating NixOS configuration output..." - nix eval --raw "$SRC_ROOT#nixosConfigurations.$NIXOS_CONFIGURATION.config.system.build.toplevel.drvPath" >/dev/null - - EFFECTIVE_DISKO_PATH="$SRC_ROOT/$DISKO_PATH" - if [ -n "$DISK" ]; then - cat > /run/ultracloud/disko-wrapper.nix </dev/null + "$DISKO_SCRIPT_PATH/bin/disko-format-mount" + else + ${pkgs.disko}/bin/disko \ + --mode destroy,format,mount \ + --yes-wipe-all-disks \ + --root-mountpoint /mnt \ + --flake "$SRC_ROOT#$NIXOS_CONFIGURATION" + fi + echo "ULTRACLOUD_MARKER install.disko.complete.$NODE_ID" echo "Running nixos-install..." - nixos-install --flake "$SRC_ROOT#$NIXOS_CONFIGURATION" --no-root-passwd + if [ -n "$TARGET_SYSTEM_PATH" ]; then + echo "Realising pre-built target system: $TARGET_SYSTEM_PATH" + ${pkgs.nix}/bin/nix-store --realise "$TARGET_SYSTEM_PATH" >/dev/null + ${config.system.build.nixos-install}/bin/nixos-install \ + --system "$TARGET_SYSTEM_PATH" \ + --no-root-passwd \ + --no-channel-copy + else + ${config.system.build.nixos-install}/bin/nixos-install \ + --flake "$SRC_ROOT#$NIXOS_CONFIGURATION" \ + --no-root-passwd \ + --no-channel-copy + fi + echo "ULTRACLOUD_MARKER install.nixos-install.complete.$NODE_ID" + + mkdir -p /mnt/etc/ssh /mnt/etc/ultracloud /mnt/root/.ssh /mnt/var/lib + cp -f /etc/ultracloud/node-config.json /mnt/etc/ultracloud/node-config.json + cp -f /root/.ssh/authorized_keys /mnt/root/.ssh/authorized_keys + shopt -s nullglob + for host_key in /etc/ssh/ssh_host_*; do + cp -f "$host_key" /mnt/etc/ssh/"$(basename "$host_key")" + done + shopt -u nullglob + chmod 700 /mnt/root/.ssh + chmod 600 /mnt/root/.ssh/authorized_keys + chmod 600 /mnt/etc/ssh/ssh_host_*_key 2>/dev/null || true + chmod 644 /mnt/etc/ssh/ssh_host_*_key.pub 2>/dev/null || true + + rm -rf /mnt/var/lib/photon-src + cp -a "$SRC_ROOT" /mnt/var/lib/photon-src sync + echo "ULTRACLOUD_MARKER reboot.$NODE_ID" + echo "Allowing the harness to observe the reboot marker before shutting down..." + sleep 15 echo "✓ Install complete; rebooting..." ${pkgs.systemd}/bin/systemctl reboot ''; }; # Packages for bootstrap + install + environment.etc."ultracloud/disko-script-paths.json".text = + builtins.toJSON ultracloudBaremetalFormatMountPaths; + environment.etc."ultracloud/system-paths.json".text = + builtins.toJSON ultracloudBaremetalSystemPaths; + environment.systemPackages = with pkgs; [ curl jq vim htop + nix gawk gnugrep util-linux parted dosfstools e2fsprogs + disko gnutar gzip ]; diff --git a/nix/modules/default.nix b/nix/modules/default.nix index 4ea5066..4d59488 100644 --- a/nix/modules/default.nix +++ b/nix/modules/default.nix @@ -18,6 +18,7 @@ ./lightningstor.nix ./k8shost.nix ./nightlight.nix + ./apigateway.nix ./deployer.nix ./nix-agent.nix ./node-agent.nix diff --git a/nix/nodes/baremetal-qemu/common.nix b/nix/nodes/baremetal-qemu/common.nix new file mode 100644 index 0000000..e973217 --- /dev/null +++ b/nix/nodes/baremetal-qemu/common.nix @@ -0,0 +1,87 @@ +{ lib, pkgs, ... 
}: + +{ + boot.kernelParams = [ "console=ttyS0,115200n8" ]; + boot.initrd.availableKernelModules = [ + "ahci" + "sr_mod" + "virtio_blk" + "virtio_net" + "virtio_pci" + "virtio_scsi" + "xhci_pci" + ]; + + networking.firewall.enable = false; + networking.useDHCP = lib.mkForce false; + networking.dhcpcd.enable = lib.mkForce false; + networking.usePredictableInterfaceNames = false; + + systemd.network = { + enable = true; + wait-online.enable = true; + networks."10-eth0" = { + matchConfig.Name = "eth0"; + networkConfig.DHCP = "yes"; + linkConfig.RequiredForOnline = "routable"; + }; + }; + + services.openssh = { + enable = true; + settings = { + PermitRootLogin = "prohibit-password"; + PasswordAuthentication = false; + KbdInteractiveAuthentication = false; + }; + }; + + users.users.root.openssh.authorizedKeys.keys = [ ]; + + nix.registry = lib.mkForce { }; + nix.nixPath = lib.mkForce [ ]; + nix.channel.enable = false; + nix.settings = { + experimental-features = [ + "nix-command" + "flakes" + ]; + flake-registry = ""; + }; + nixpkgs.flake = { + source = lib.mkForce null; + setFlakeRegistry = lib.mkForce false; + setNixPath = lib.mkForce false; + }; + + documentation.enable = false; + documentation.nixos.enable = false; + documentation.man.enable = false; + documentation.info.enable = false; + documentation.doc.enable = false; + + environment.systemPackages = with pkgs; [ + curl + jq + ]; + + systemd.services.ultracloud-baremetal-postinstall-marker = { + description = "Emit a canonical post-install marker for bare-metal QEMU smoke"; + wantedBy = [ "multi-user.target" ]; + after = [ "network-online.target" ]; + wants = [ "network-online.target" ]; + serviceConfig = { + Type = "oneshot"; + RemainAfterExit = true; + StandardOutput = "journal+console"; + StandardError = "journal+console"; + }; + script = '' + hostname="$(tr -d '\n' /dev/null 2>&1; then + return 0 + fi + if [ "$SECONDS" -ge "$deadline" ]; then + echo "timed out waiting for $name at $url" >&2 + return 1 + fi + sleep 1 + done + } + + systemctl is-active ${readyUnitArgs} + ${healthCheckScript} + test -x ${pkgs.qemu}/bin/qemu-system-x86_64 + test -x ${pkgs.qemu}/bin/qemu-img + test -c /dev/net/tun + if [ -e /dev/kvm ]; then + test -r /dev/kvm + fi + ''; + }; + + system.stateVersion = "24.11"; + }; +} diff --git a/nix/single-node/qemu-vm.nix b/nix/single-node/qemu-vm.nix new file mode 100644 index 0000000..6a56f81 --- /dev/null +++ b/nix/single-node/qemu-vm.nix @@ -0,0 +1,24 @@ +{ modulesPath, ... }: + +{ + imports = [ (modulesPath + "/virtualisation/qemu-vm.nix") ]; + + virtualisation = { + graphics = false; + cores = 2; + memorySize = 3072; + diskSize = 16384; + }; + + services.openssh = { + enable = true; + settings = { + KbdInteractiveAuthentication = false; + PasswordAuthentication = true; + PermitRootLogin = "yes"; + }; + }; + + users.mutableUsers = false; + users.users.root.hashedPassword = "$6$iu4O1PEqq77wLMfh$T4bP3V9v8RoPgwqgBr2taKEgVNcb42HaTUy.VMjjsFtWTvnai3rqvy8AQbELKWdB1Qzfb7wkUOSK1wnmSZph/."; +} diff --git a/nix/test-cluster/README.md b/nix/test-cluster/README.md index c7b1000..11934d5 100644 --- a/nix/test-cluster/README.md +++ b/nix/test-cluster/README.md @@ -3,6 +3,10 @@ `nix/test-cluster` is the canonical local validation path for UltraCloud. It boots six QEMU VMs, treats them as hardware-like nodes, and validates representative control-plane, worker, and gateway behavior over SSH and service endpoints. All VM images are built on the host in a single Nix invocation and then booted as prebuilt artifacts. 
The guests do not compile the stack locally. +The same harness also owns the canonical bare-metal bootstrap proof: a raw-QEMU ISO flow that phones home to `deployer`, runs Disko, reboots, and waits for `nix-agent` desired-system convergence on one control-plane node and one worker-equivalent node. + +When `/dev/kvm` is absent, the portable fallback is not another harness subcommand. Use the root-flake non-KVM lane instead: `nix build .#checks.x86_64-linux.portable-control-plane-regressions`. +When `/dev/kvm` and nested virtualization are available, the reproducible publishable lane is `./nix/test-cluster/run-publishable-kvm-suite.sh`, which records environment metadata and then runs `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix` in order. ## What it validates @@ -15,6 +19,7 @@ All VM images are built on the host in a single Nix invocation and then booted a - host-forwarded access to the API gateway and NightLight HTTP surfaces - cross-node data replication smoke tests for `chainfire` and `flaredb` - deployer-seeded native runtime scheduling from declarative Nix service definitions, including drain/failover recovery +- ISO-based bare-metal bootstrap from `nixosConfigurations.ultracloud-iso` through phone-home, flake bundle fetch, Disko install, reboot, and desired-system activation ## Validation layers @@ -45,6 +50,7 @@ nix run ./nix/test-cluster#cluster -- build nix run ./nix/test-cluster#cluster -- start nix run ./nix/test-cluster#cluster -- smoke nix run ./nix/test-cluster#cluster -- fresh-smoke +nix run ./nix/test-cluster#cluster -- baremetal-iso nix run ./nix/test-cluster#cluster -- demo-vm-webapp nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp nix run ./nix/test-cluster#cluster -- serve-vm-webapp @@ -63,6 +69,12 @@ make cluster-smoke Preferred entrypoint for publishable verification: `nix run ./nix/test-cluster#cluster -- fresh-smoke` +Preferred entrypoint for publishable bare-metal bootstrap verification: `nix run ./nix/test-cluster#cluster -- baremetal-iso` + +Preferred entrypoint for portable local verification on TCG-only hosts: `nix build .#checks.x86_64-linux.portable-control-plane-regressions` + +Preferred entrypoint for reproducible KVM-suite reruns: `./nix/test-cluster/run-publishable-kvm-suite.sh ` + `make cluster-smoke` is a convenience wrapper for the same clean host-build VM validation flow. `nix run ./nix/test-cluster#cluster -- demo-vm-webapp` creates a PrismNet-attached VM, boots a tiny web app inside the guest, stores its counter in FlareDB, writes JSON snapshots to LightningStor object storage, and then proves that the state survives guest restart plus cross-worker migration. The attached data volume is still used by the guest for its local bootstrap config. @@ -101,4 +113,4 @@ Logs for each VM are written to `//vm.log`. ## Scope note -This harness is intentionally VM-first. Older ad hoc launch scripts under `baremetal/vm-cluster` are legacy/manual paths and should not be treated as the primary local validation entrypoint. +This harness is intentionally VM-first, but the canonical bare-metal install proof also lives here so the docs, harness, and `flake check` all exercise the same ISO route. Older ad hoc launch scripts under `baremetal/vm-cluster` are legacy/manual paths, and the `netboot-*` images remain experimental helpers rather than the supported bootstrap entrypoint. 
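+
+## Rerunning the bare-metal ISO lane with prebuilt inputs
+
+`verify-baremetal-iso.sh` reads a handful of environment overrides (for example `ULTRACLOUD_BAREMETAL_STATE_DIR` and `ULTRACLOUD_BAREMETAL_ISO_IMAGE`), which makes iterative debugging cheaper because the installer ISO and the per-run state directory can be reused. The invocation below is a minimal sketch rather than the canonical entrypoint: the state-directory path is only an example, and the ISO attribute is the same one the script resolves on its own when the variable is unset.
+
+```bash
+# Keep VM disks, serial logs, and the seeded nix-cache between runs (example path).
+export ULTRACLOUD_BAREMETAL_STATE_DIR="$PWD/work/baremetal-iso-state"
+
+# Point the verifier at a previously built installer ISO instead of rebuilding it.
+export ULTRACLOUD_BAREMETAL_ISO_IMAGE="$(nix build \
+  .#nixosConfigurations.ultracloud-iso.config.system.build.isoImage \
+  --no-link --print-out-paths)"
+
+nix run ./nix/test-cluster#cluster -- baremetal-iso
+```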
diff --git a/nix/test-cluster/flake.nix b/nix/test-cluster/flake.nix index ed040f4..a1f2d53 100644 --- a/nix/test-cluster/flake.nix +++ b/nix/test-cluster/flake.nix @@ -52,6 +52,8 @@ bash coreutils curl + ultracloud.packages.${system}.chainfire-server + ultracloud.packages.${system}.deployer-server ultracloud.packages.${system}.deployer-ctl findutils gawk @@ -60,7 +62,9 @@ gnugrep iproute2 jq + nix openssh + python3 procps clusterPython qemu diff --git a/nix/test-cluster/run-cluster.sh b/nix/test-cluster/run-cluster.sh index 2babaac..6308d3a 100755 --- a/nix/test-cluster/run-cluster.sh +++ b/nix/test-cluster/run-cluster.sh @@ -8369,6 +8369,10 @@ fresh_smoke_requested() { smoke_requested "$@" } +baremetal_iso_requested() { + bash "${REPO_ROOT}/nix/test-cluster/verify-baremetal-iso.sh" "$@" +} + storage_smoke_requested() { BUILD_PROFILE="storage" start_requested "${STORAGE_NODES[@]}" @@ -8686,6 +8690,7 @@ Commands: validate Run the cluster smoke validation smoke start + validate fresh-smoke clean local runtime state, rebuild on the host, start, and validate + baremetal-iso verify the canonical ISO bootstrap path from phone-home through desired-system convergence storage-smoke start the storage lab (node01-05) and validate CoronaFS/LightningStor/PlasmaVMC fresh-storage-smoke clean local runtime state, rebuild node01-05 on the host, start, and validate the storage lab demo-vm-webapp start the cluster and run the VM web app demo backed by FlareDB and LightningStor @@ -8716,6 +8721,7 @@ Commands: Examples: $0 smoke $0 fresh-smoke + $0 baremetal-iso $0 storage-smoke $0 fresh-storage-smoke $0 demo-vm-webapp @@ -8756,6 +8762,7 @@ main() { validate) validate_cluster ;; smoke) smoke_requested "$@" ;; fresh-smoke) fresh_smoke_requested "$@" ;; + baremetal-iso) baremetal_iso_requested "$@" ;; storage-smoke) storage_smoke_requested ;; fresh-storage-smoke) fresh_storage_smoke_requested ;; demo-vm-webapp) demo_vm_webapp_requested "$@" ;; diff --git a/nix/test-cluster/run-publishable-kvm-suite.sh b/nix/test-cluster/run-publishable-kvm-suite.sh new file mode 100755 index 0000000..7425173 --- /dev/null +++ b/nix/test-cluster/run-publishable-kvm-suite.sh @@ -0,0 +1,87 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" +LOG_DIR="${1:-${ULTRACLOUD_KVM_PUBLISHABLE_LOG_DIR:-${REPO_ROOT}/work/publishable-kvm-suite}}" + +mkdir -p "${LOG_DIR}" + +log() { + printf '[publishable-kvm-suite] %s\n' "$*" +} + +capture_environment() { + { + printf 'started_at=%s\n' "$(date -Is)" + printf 'hostname=%s\n' "$(hostname)" + printf 'kernel=%s\n' "$(uname -a)" + printf 'pwd=%s\n' "$(pwd)" + printf 'user=%s\n' "$(id -un)" + printf 'uid=%s\n' "$(id -u)" + printf 'gid=%s\n' "$(id -g)" + printf 'branch=%s\n' "$(git -C "${REPO_ROOT}" branch --show-current)" + printf 'commit=%s\n' "$(git -C "${REPO_ROOT}" rev-parse HEAD)" + printf 'nix_version=%s\n' "$(nix --version)" + printf 'kvm_present=%s\n' "$([[ -e /dev/kvm ]] && echo yes || echo no)" + if [[ -e /dev/kvm ]]; then + printf 'kvm_stat=%s\n' "$(stat -c '%A %U %G %t:%T' /dev/kvm)" + fi + if [[ -f /sys/module/kvm_intel/parameters/nested ]]; then + printf 'kvm_intel_nested=%s\n' "$(cat /sys/module/kvm_intel/parameters/nested)" + fi + if [[ -f /sys/module/kvm_amd/parameters/nested ]]; then + printf 'kvm_amd_nested=%s\n' "$(cat /sys/module/kvm_amd/parameters/nested)" + fi + } >"${LOG_DIR}/environment.txt" +} + +run_case() { + local name="$1" + shift + local logfile="${LOG_DIR}/${name}.log" + local metafile="${LOG_DIR}/${name}.meta" + local started_at ended_at rc + + started_at="$(date -Is)" + printf 'command=%s\n' "$*" >"${metafile}" + printf 'started_at=%s\n' "${started_at}" >>"${metafile}" + + log "running ${name}: $*" + set +e + ( + cd "${REPO_ROOT}" + "$@" + ) 2>&1 | tee "${logfile}" + rc=${PIPESTATUS[0]} + set -e + + ended_at="$(date -Is)" + printf 'ended_at=%s\n' "${ended_at}" >>"${metafile}" + printf 'exit_code=%s\n' "${rc}" >>"${metafile}" + + if (( rc != 0 )); then + log "${name} failed; see ${logfile}" + return "${rc}" + fi + + log "${name} passed" +} + +main() { + capture_environment + + [[ -e /dev/kvm ]] || { + log "/dev/kvm is missing" + return 1 + } + + run_case fresh-smoke nix run ./nix/test-cluster#cluster -- fresh-smoke + run_case fresh-demo-vm-webapp nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp + run_case fresh-matrix nix run ./nix/test-cluster#cluster -- fresh-matrix + + printf 'finished_at=%s\n' "$(date -Is)" >>"${LOG_DIR}/environment.txt" + log "publishable KVM suite passed; logs in ${LOG_DIR}" +} + +main "$@" diff --git a/nix/test-cluster/verify-baremetal-iso.sh b/nix/test-cluster/verify-baremetal-iso.sh new file mode 100644 index 0000000..455ef43 --- /dev/null +++ b/nix/test-cluster/verify-baremetal-iso.sh @@ -0,0 +1,824 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT="${ULTRACLOUD_REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)}" + +CLUSTER_ID="baremetal-iso-canonical" +CHAINFIRE_ENDPOINT="http://127.0.0.1:2379" +DEPLOYER_ENDPOINT="http://127.0.0.1:8088" +BINARY_CACHE_ENDPOINT="http://127.0.0.1:8090" +BOOTSTRAP_TOKEN="baremetal-iso-bootstrap-token" +CONTROL_NODE_ID="iso-control-plane-01" +WORKER_NODE_ID="iso-worker-01" +CONTROL_SSH_PORT="22231" +WORKER_SSH_PORT="22232" +CONTROL_DHCP_START="10.0.2.15" +WORKER_DHCP_START="10.0.2.16" +CONTROL_DISK_GIB="18G" +WORKER_DISK_GIB="18G" + +log() { + printf '[baremetal-iso-e2e] %s\n' "$*" +} + +marker() { + printf 'ULTRACLOUD_MARKER %s\n' "$*" +} + +die() { + echo "[baremetal-iso-e2e] ERROR: $*" >&2 + exit 1 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || die "required command not found: $1" +} + +resolve_store_path() { + local env_name="$1" + local attr="$2" + if [[ -n "${!env_name:-}" ]]; then + printf '%s\n' "${!env_name}" + return 0 + fi + nix build "$ROOT#$attr" --no-link --print-out-paths +} + +resolve_binary() { + local env_name="$1" + local bin_name="$2" + local attr="$3" + if [[ -n "${!env_name:-}" ]]; then + printf '%s\n' "${!env_name}" + return 0 + fi + if command -v "$bin_name" >/dev/null 2>&1; then + command -v "$bin_name" + return 0 + fi + local out + out="$(nix build "$ROOT#$attr" --no-link --print-out-paths)" + printf '%s/bin/%s\n' "$out" "$bin_name" +} + +resolve_iso_image() { + local candidate="$1" + if [[ -f "$candidate" ]]; then + printf '%s\n' "$candidate" + return 0 + fi + + local iso_dir="$candidate/iso" + if [[ -d "$iso_dir" ]]; then + local iso_path + iso_path="$(find "$iso_dir" -maxdepth 1 -type f -name '*.iso' | head -n 1)" + if [[ -n "$iso_path" ]]; then + printf '%s\n' "$iso_path" + return 0 + fi + fi + + die "unable to resolve a bootable ISO file from $candidate" +} + +resolve_ovmf_firmware() { + local env_name="$1" + local relative_path="$2" + if [[ -n "${!env_name:-}" ]]; then + printf '%s\n' "${!env_name}" + return 0 + fi + + local ovmf_dir + ovmf_dir="$(nix build nixpkgs#OVMF.fd --no-link --print-out-paths)" + printf '%s/%s\n' "$ovmf_dir" "$relative_path" +} + +wait_for_http() { + local url="$1" + local timeout_secs="$2" + local deadline=$((SECONDS + timeout_secs)) + while (( SECONDS < deadline )); do + if curl -fsS "$url" >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + return 1 +} + +wait_for_log_marker() { + local label="$1" + local log_file="$2" + local needle="$3" + local timeout_secs="$4" + local deadline=$((SECONDS + timeout_secs)) + while (( SECONDS < deadline )); do + if [[ -f "$log_file" ]] && grep -Eq "$needle" "$log_file"; then + log "${label}: observed ${needle}" + return 0 + fi + sleep 2 + done + return 1 +} + +ssh_base() { + local port="$1" + shift + ssh \ + -F /dev/null \ + -i "$SSH_KEY" \ + -o BatchMode=yes \ + -o ConnectTimeout=5 \ + -o ConnectionAttempts=1 \ + -o StrictHostKeyChecking=no \ + -o UserKnownHostsFile=/dev/null \ + -o LogLevel=ERROR \ + -p "$port" \ + root@127.0.0.1 "$@" +} + +wait_for_ssh() { + local label="$1" + local port="$2" + local timeout_secs="$3" + local deadline=$((SECONDS + timeout_secs)) + while (( SECONDS < deadline )); do + if ssh_base "$port" true >/dev/null 2>&1; then + log "${label}: SSH is reachable on port ${port}" + return 0 + fi + sleep 2 + done + return 1 +} + +ssh_shell() { + local port="$1" + local script="$2" + local quoted + printf -v quoted '%q' "$script" + ssh_base "$port" "bash -lc $quoted" +} + +current_system_path() { + local port="$1" + ssh_shell "$port" 'readlink -f /run/current-system' +} + +remote_boot_id() { + local port="$1" + ssh_shell "$port" 
'cat /proc/sys/kernel/random/boot_id' +} + +remote_journal_has_marker() { + local port="$1" + local needle="$2" + shift 2 + + local remote_cmd="journalctl -b -o cat --no-pager" + local unit + for unit in "$@"; do + printf -v remote_cmd '%s -u %q' "$remote_cmd" "$unit" + done + printf -v remote_cmd '%s | grep -Fq %q' "$remote_cmd" "$needle" + + ssh_shell "$port" "$remote_cmd" +} + +wait_for_remote_journal_marker() { + local label="$1" + local port="$2" + local needle="$3" + local timeout_secs="$4" + shift 4 + + local deadline=$((SECONDS + timeout_secs)) + while (( SECONDS < deadline )); do + if remote_journal_has_marker "$port" "$needle" "$@" >/dev/null 2>&1; then + log "${label}: observed ${needle} via remote journal" + return 0 + fi + sleep 2 + done + return 1 +} + +wait_for_reboot_transition() { + local label="$1" + local port="$2" + local previous_boot_id="$3" + local timeout_secs="$4" + local deadline=$((SECONDS + timeout_secs)) + + while (( SECONDS < deadline )); do + local current_boot_id + if current_boot_id="$(remote_boot_id "$port" 2>/dev/null)"; then + if [[ -n "$current_boot_id" && "$current_boot_id" != "$previous_boot_id" ]]; then + log "${label}: reboot completed with boot_id=${current_boot_id}" + return 0 + fi + fi + sleep 2 + done + return 1 +} + +observed_status() { + local node_id="$1" + local payload + if ! payload="$( + "$DEPLOYER_CTL_BIN" \ + --chainfire-endpoint "$CHAINFIRE_ENDPOINT" \ + --cluster-id "$CLUSTER_ID" \ + --cluster-namespace ultracloud \ + --deployer-namespace deployer \ + node inspect \ + --node-id "$node_id" \ + --include-observed-system \ + --format json 2>/dev/null + )"; then + printf 'missing\n' + return 0 + fi + + jq -r '.observed_system.status // "missing"' <<<"$payload" +} + +wait_for_observed_active() { + local node_id="$1" + local timeout_secs="$2" + local deadline=$((SECONDS + timeout_secs)) + while (( SECONDS < deadline )); do + if [[ "$(observed_status "$node_id")" == "active" ]]; then + log "${node_id}: observed-system reached active" + return 0 + fi + sleep 5 + done + return 1 +} + +assert_port_free() { + local port="$1" + if ss -ltn "( sport = :$port )" | grep -Fq ":$port"; then + die "port $port is already in use" + fi +} + +start_host_services() { + cat >"$TMP_DIR/chainfire.toml" <"$TMP_DIR/deployer.toml" <"$CHAINFIRE_LOG" 2>&1 & + CHAINFIRE_PID="$!" + + wait_for_http "http://127.0.0.1:8081/health" 120 \ + || die "host Chainfire did not become healthy" + + log "Starting host-side Deployer" + NO_COLOR=1 CLICOLOR=0 RUST_LOG_STYLE=never \ + "$DEPLOYER_SERVER_BIN" --config "$TMP_DIR/deployer.toml" >"$DEPLOYER_LOG" 2>&1 & + DEPLOYER_PID="$!" + + wait_for_http "http://127.0.0.1:8088/health" 120 \ + || die "host Deployer did not become healthy" +} + +seed_binary_cache() { + local path + local nar_rel + local nar_path + local store_base + local store_hash + local nar_hash + local nar_size + local refs + local deriver + + mkdir -p "$NIX_CACHE_DIR/nar" + cat >"$NIX_CACHE_DIR/nix-cache-info" <<'EOF' +StoreDir: /nix/store +WantMassQuery: 1 +Priority: 30 +EOF + + log "Seeding host-local Nix binary cache" + if [[ -n "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION:-}" && -f "${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration" ]]; then + nix-store --load-db <"${ULTRACLOUD_BAREMETAL_CACHE_REGISTRATION}/registration" + fi + while IFS= read -r path; do + [[ -n "$path" ]] || continue + + store_base="$(basename "$path")" + store_hash="${store_base%%-*}" + nar_rel="nar/${store_base}.nar" + nar_path="$NIX_CACHE_DIR/$nar_rel" + + if [[ ! 
-f "$nar_path" ]]; then + nix-store --dump "$path" >"$nar_path" + fi + + nar_size="$(stat -c%s "$nar_path")" + nar_hash="$(nix hash file --type sha256 --base32 "$nar_path")" + refs="$(nix-store --query --references "$path" | xargs -r -n1 basename | tr '\n' ' ' | sed 's/ $//')" + deriver="$(nix-store --query --deriver "$path" 2>/dev/null || true)" + deriver="$(basename "$deriver" 2>/dev/null || true)" + + { + echo "StorePath: $path" + echo "URL: $nar_rel" + echo "Compression: none" + echo "FileHash: sha256:$nar_hash" + echo "FileSize: $nar_size" + echo "NarHash: sha256:$nar_hash" + echo "NarSize: $nar_size" + echo "References: $refs" + if [[ -n "$deriver" && "$deriver" != "unknown-deriver" ]]; then + echo "Deriver: $deriver" + fi + } >"$NIX_CACHE_DIR/${store_hash}.narinfo" + done < <( + nix-store --query --requisites \ + "$CONTROL_TARGET_SYSTEM" \ + "$WORKER_TARGET_SYSTEM" \ + "$CONTROL_DISKO_SCRIPT" \ + "$WORKER_DISKO_SCRIPT" \ + | sort -u + ) +} + +start_binary_cache() { + seed_binary_cache + + log "Starting host-local Nix binary cache" + python3 -m http.server 8090 --bind 0.0.0.0 --directory "$NIX_CACHE_DIR" \ + >"$NIX_CACHE_LOG" 2>&1 & + NIX_CACHE_PID="$!" + + wait_for_http "${BINARY_CACHE_ENDPOINT}/nix-cache-info" 120 \ + || die "host-local Nix binary cache did not become reachable" +} + +apply_cluster_state() { + cat >"$TMP_DIR/cluster-state.yaml" </dev/null + rm -f "$ovmf_vars_path" + cp "$OVMF_VARS_TEMPLATE" "$ovmf_vars_path" + chmod u+w "$ovmf_vars_path" + + nohup "$QEMU_BIN" \ + -name "$label" \ + -machine accel=tcg \ + -cpu max \ + -smp 2 \ + -m 2048 \ + -nographic \ + -no-reboot \ + -boot order=dc,once=d,menu=off \ + -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \ + -drive if=pflash,format=raw,file="$ovmf_vars_path" \ + -drive file="$disk_path",if=virtio,format=qcow2 \ + -cdrom "$ISO_IMAGE" \ + -netdev user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start} \ + -device virtio-net-pci,netdev=user0,mac="${mac}" \ + -smbios type=1,product=UltraCloudQEMUBaremetal,serial="${node_id}" \ + >"$log_path" 2>&1 & + echo "$!" >"${log_path}.pid" +} + +launch_installed_vm() { + local label="$1" + local ssh_port="$2" + local dhcp_start="$3" + local mac="$4" + local disk_path="$5" + local log_path="$6" + local ovmf_vars_path="${disk_path}.ovmf-vars.fd" + + [[ -f "$ovmf_vars_path" ]] || die "missing OVMF vars file for relaunch: $ovmf_vars_path" + + nohup "$QEMU_BIN" \ + -name "$label" \ + -machine accel=tcg \ + -cpu max \ + -smp 2 \ + -m 2048 \ + -nographic \ + -drive if=pflash,format=raw,readonly=on,file="$OVMF_CODE_FD" \ + -drive if=pflash,format=raw,file="$ovmf_vars_path" \ + -drive file="$disk_path",if=virtio,format=qcow2 \ + -netdev user,id=user0,hostfwd=tcp:127.0.0.1:${ssh_port}-:22,dhcpstart=${dhcp_start} \ + -device virtio-net-pci,netdev=user0,mac="${mac}" \ + >>"$log_path" 2>&1 & + echo "$!" >"${log_path}.pid" +} + +wait_for_pid_exit() { + local label="$1" + local pid_file="$2" + local timeout_secs="$3" + local deadline=$((SECONDS + timeout_secs)) + local pid + + [[ -f "$pid_file" ]] || die "${label} is missing pid file $pid_file" + pid="$(cat "$pid_file")" + while (( SECONDS < deadline )); do + if ! 
kill -0 "$pid" >/dev/null 2>&1; then + log "${label}: QEMU exited after installer-triggered reboot" + return 0 + fi + sleep 2 + done + return 1 +} + +verify_node() { + local node_id="$1" + local ssh_port="$2" + local disk_path="$3" + local log_path="$4" + local expected_role="$5" + local expected_system="$6" + local dhcp_start="$7" + local mac="$8" + + wait_for_log_marker "$node_id" "$TMP_DIR/deployer.log" "Node registered successfully.*node_id=${node_id}" 900 \ + || die "${node_id} never completed /api/v1/phone-home registration" + wait_for_ssh "$node_id" "$ssh_port" 900 \ + || die "${node_id} never exposed SSH during the installer boot" + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.boot.${node_id}" 120 \ + ultracloud-bootstrap.service ultracloud-install.service \ + || die "${node_id} never recorded the pre-install boot marker" + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER pre-install.phone-home.complete.${node_id}" 120 \ + ultracloud-bootstrap.service ultracloud-install.service \ + || die "${node_id} never recorded the phone-home completion marker" + marker "pre-install.${node_id}" + + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.bundle-downloaded.${node_id}" 1200 \ + ultracloud-install.service \ + || die "${node_id} never downloaded the flake bundle" + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.disko.complete.${node_id}" 2400 \ + ultracloud-install.service \ + || die "${node_id} never completed disko" + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER install.nixos-install.complete.${node_id}" 3600 \ + ultracloud-install.service \ + || die "${node_id} never finished nixos-install" + marker "install.${node_id}" + + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER reboot.${node_id}" 3600 \ + ultracloud-install.service \ + || die "${node_id} never emitted reboot marker" + marker "reboot.${node_id}" + + wait_for_pid_exit "$node_id" "${log_path}.pid" 300 \ + || die "${node_id} installer VM did not exit after the reboot marker" + launch_installed_vm \ + "ultracloud-baremetal-${node_id}-installed" \ + "$ssh_port" \ + "$dhcp_start" \ + "$mac" \ + "$disk_path" \ + "$log_path" + wait_for_ssh "$node_id" "$ssh_port" 1800 \ + || die "${node_id} did not come back over SSH after reboot" + wait_for_remote_journal_marker "$node_id" "$ssh_port" "ULTRACLOUD_MARKER post-install.boot.${node_id}.${expected_role}" 1800 \ + ultracloud-baremetal-postinstall-marker.service \ + || die "${node_id} never emitted post-install marker" + marker "post-install.${node_id}" + + ssh_shell "$ssh_port" 'test -f /etc/ultracloud/node-config.json' + ssh_shell "$ssh_port" 'test -d /var/lib/photon-src/.bundle-inputs/nixpkgs' + ssh_shell "$ssh_port" 'systemctl is-active nix-agent.service >/dev/null' + ssh_shell "$ssh_port" "grep -Fx '${expected_role}' /etc/ultracloud-role" + if [[ "$expected_role" == "control-plane" ]]; then + ssh_shell "$ssh_port" 'systemctl is-active chainfire.service >/dev/null' + fi + + wait_for_observed_active "$node_id" 1200 \ + || die "${node_id} never reached observed-system active" + [[ "$(current_system_path "$ssh_port")" == "$expected_system" ]] \ + || die "${node_id} current system does not match expected target" + marker "desired-system-active.${node_id}" +} + +cleanup() { + local status="$?" 
+ set +e + + for pid_file in "$CONTROL_LOG.pid" "$WORKER_LOG.pid"; do + if [[ -f "$pid_file" ]]; then + pid="$(cat "$pid_file")" + kill "$pid" 2>/dev/null || true + wait "$pid" 2>/dev/null || true + fi + done + + if [[ -n "${DEPLOYER_PID:-}" ]]; then + kill "$DEPLOYER_PID" 2>/dev/null || true + wait "$DEPLOYER_PID" 2>/dev/null || true + fi + if [[ -n "${CHAINFIRE_PID:-}" ]]; then + kill "$CHAINFIRE_PID" 2>/dev/null || true + wait "$CHAINFIRE_PID" 2>/dev/null || true + fi + if [[ -n "${NIX_CACHE_PID:-}" ]]; then + kill "$NIX_CACHE_PID" 2>/dev/null || true + wait "$NIX_CACHE_PID" 2>/dev/null || true + fi + + if (( status != 0 )); then + log "control-plane serial log tail:" + tail -n 120 "$CONTROL_LOG" 2>/dev/null || true + log "worker serial log tail:" + tail -n 120 "$WORKER_LOG" 2>/dev/null || true + log "deployer log tail:" + tail -n 120 "$DEPLOYER_LOG" 2>/dev/null || true + log "chainfire log tail:" + tail -n 120 "$CHAINFIRE_LOG" 2>/dev/null || true + log "binary cache log tail:" + tail -n 120 "$NIX_CACHE_LOG" 2>/dev/null || true + fi + + if [[ "${KEEP_STATE_DIR:-0}" != "1" ]]; then + rm -rf "$TMP_DIR" + fi + exit "$status" +} + +main() { + require_cmd curl + require_cmd jq + require_cmd nix + require_cmd python3 + require_cmd qemu-img + require_cmd qemu-system-x86_64 + require_cmd ssh + require_cmd ssh-keygen + require_cmd ss + + ISO_IMAGE="$(resolve_iso_image "$(resolve_store_path ULTRACLOUD_BAREMETAL_ISO_IMAGE 'nixosConfigurations.ultracloud-iso.config.system.build.isoImage')")" + FLAKE_BUNDLE="$(resolve_store_path ULTRACLOUD_BAREMETAL_FLAKE_BUNDLE 'packages.x86_64-linux.ultracloudFlakeBundle')" + CONTROL_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_TARGET 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.toplevel')" + WORKER_TARGET_SYSTEM="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_TARGET 'nixosConfigurations.baremetal-qemu-worker.config.system.build.toplevel')" + CONTROL_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_CONTROL_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-control-plane.config.system.build.formatMount')" + WORKER_DISKO_SCRIPT="$(resolve_store_path ULTRACLOUD_BAREMETAL_WORKER_DISKO_SCRIPT 'nixosConfigurations.baremetal-qemu-worker.config.system.build.formatMount')" + CHAINFIRE_BIN="$(resolve_binary ULTRACLOUD_CHAINFIRE_SERVER_BIN chainfire 'packages.x86_64-linux.chainfire-server')" + DEPLOYER_SERVER_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_SERVER_BIN deployer-server 'packages.x86_64-linux.deployer-server')" + DEPLOYER_CTL_BIN="$(resolve_binary ULTRACLOUD_DEPLOYER_CTL_BIN deployer-ctl 'packages.x86_64-linux.deployer-ctl')" + OVMF_CODE_FD="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_CODE 'FV/OVMF_CODE.fd')" + OVMF_VARS_TEMPLATE="$(resolve_ovmf_firmware ULTRACLOUD_OVMF_VARS 'FV/OVMF_VARS.fd')" + QEMU_BIN="${ULTRACLOUD_QEMU_BIN:-$(command -v qemu-system-x86_64)}" + QEMU_IMG_BIN="${ULTRACLOUD_QEMU_IMG_BIN:-$(command -v qemu-img)}" + + if [[ -n "${ULTRACLOUD_BAREMETAL_STATE_DIR:-}" ]]; then + TMP_DIR="$ULTRACLOUD_BAREMETAL_STATE_DIR" + KEEP_STATE_DIR=1 + mkdir -p "$TMP_DIR" + find "$TMP_DIR" -mindepth 1 -maxdepth 1 \ + ! 
-name nix-cache \ + -exec rm -rf {} + + else + TMP_DIR="$(mktemp -d -t ultracloud-baremetal-iso.XXXXXX)" + KEEP_STATE_DIR=0 + fi + NIX_CACHE_DIR="$TMP_DIR/nix-cache" + CONTROL_LOG="$TMP_DIR/control-plane.serial.log" + WORKER_LOG="$TMP_DIR/worker.serial.log" + DEPLOYER_LOG="$TMP_DIR/deployer.log" + CHAINFIRE_LOG="$TMP_DIR/chainfire.log" + NIX_CACHE_LOG="$TMP_DIR/nix-cache.log" + trap cleanup EXIT + + SSH_KEY="$TMP_DIR/id_ed25519" + ssh-keygen -q -t ed25519 -N "" -f "$SSH_KEY" >/dev/null + SSH_PUBKEY="$(tr -d '\n' <"$SSH_KEY.pub")" + + assert_port_free 2379 + assert_port_free 8081 + assert_port_free 8088 + assert_port_free 8090 + assert_port_free "$CONTROL_SSH_PORT" + assert_port_free "$WORKER_SSH_PORT" + + start_binary_cache + start_host_services + apply_cluster_state + + launch_iso_vm \ + "ultracloud-baremetal-control-plane" \ + "$CONTROL_NODE_ID" \ + "$CONTROL_SSH_PORT" \ + "$CONTROL_DHCP_START" \ + "52:54:00:11:22:31" \ + "$CONTROL_DISK_GIB" \ + "$TMP_DIR/control-plane.qcow2" \ + "$CONTROL_LOG" + + verify_node \ + "$CONTROL_NODE_ID" \ + "$CONTROL_SSH_PORT" \ + "$TMP_DIR/control-plane.qcow2" \ + "$CONTROL_LOG" \ + "control-plane" \ + "$CONTROL_TARGET_SYSTEM" \ + "$CONTROL_DHCP_START" \ + "52:54:00:11:22:31" + + launch_iso_vm \ + "ultracloud-baremetal-worker" \ + "$WORKER_NODE_ID" \ + "$WORKER_SSH_PORT" \ + "$WORKER_DHCP_START" \ + "52:54:00:11:22:32" \ + "$WORKER_DISK_GIB" \ + "$TMP_DIR/worker.qcow2" \ + "$WORKER_LOG" + + verify_node \ + "$WORKER_NODE_ID" \ + "$WORKER_SSH_PORT" \ + "$TMP_DIR/worker.qcow2" \ + "$WORKER_LOG" \ + "worker" \ + "$WORKER_TARGET_SYSTEM" \ + "$WORKER_DHCP_START" \ + "52:54:00:11:22:32" + + log "Canonical ISO bare-metal QEMU verification succeeded" +} + +main "$@" diff --git a/plans/baselines/logs/nix-build-deployer-vm-smoke.meta b/plans/baselines/logs/nix-build-deployer-vm-smoke.meta new file mode 100644 index 0000000..42dd56e --- /dev/null +++ b/plans/baselines/logs/nix-build-deployer-vm-smoke.meta @@ -0,0 +1,4 @@ +command=nix build .#checks.x86_64-linux.deployer-vm-smoke +start=2026-04-04T16:44:34+09:00 +end=2026-04-04T16:50:40+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-netboot-all-in-one.meta b/plans/baselines/logs/nix-eval-netboot-all-in-one.meta new file mode 100644 index 0000000..6319d17 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-all-in-one.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-all-in-one.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:43:56+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-netboot-control-plane.meta b/plans/baselines/logs/nix-eval-netboot-control-plane.meta new file mode 100644 index 0000000..3e4a995 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-control-plane.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:44:01+09:00 +status=0 diff --git a/plans/baselines/logs/nix-eval-netboot-worker.meta b/plans/baselines/logs/nix-eval-netboot-worker.meta new file mode 100644 index 0000000..2101023 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-worker.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-worker.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:43:56+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-node01.meta b/plans/baselines/logs/nix-eval-node01.meta new file mode 100644 index 0000000..389903e --- 
/dev/null +++ b/plans/baselines/logs/nix-eval-node01.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.node01.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:45+09:00 +end=2026-04-04T16:43:49+09:00 +status=0 diff --git a/plans/baselines/logs/nix-eval-ultracloud-iso.meta b/plans/baselines/logs/nix-eval-ultracloud-iso.meta new file mode 100644 index 0000000..bdf8aa6 --- /dev/null +++ b/plans/baselines/logs/nix-eval-ultracloud-iso.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.ultracloud-iso.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:34+09:00 +end=2026-04-04T16:43:41+09:00 +status=0 diff --git a/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta b/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta new file mode 100644 index 0000000..922ba21 --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp +start=2026-04-04T16:48:18+09:00 +end=2026-04-04T16:48:23+09:00 +status=1 diff --git a/plans/baselines/logs/nix-run-fresh-matrix.meta b/plans/baselines/logs/nix-run-fresh-matrix.meta new file mode 100644 index 0000000..f4f0684 --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-matrix.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-matrix +start=2026-04-04T16:48:26+09:00 +end=2026-04-04T16:48:29+09:00 +status=1 diff --git a/plans/baselines/logs/nix-run-fresh-smoke.meta b/plans/baselines/logs/nix-run-fresh-smoke.meta new file mode 100644 index 0000000..85e673d --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-smoke.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-smoke +start=2026-04-04T16:46:41+09:00 +end=2026-04-04T16:48:14+09:00 +status=1 diff --git a/plans/baselines/main-baseline-2026-04-04.md b/plans/baselines/main-baseline-2026-04-04.md new file mode 100644 index 0000000..310a87f --- /dev/null +++ b/plans/baselines/main-baseline-2026-04-04.md @@ -0,0 +1,52 @@ +# UltraCloud Baseline 2026-04-04 + +Branch: `task/f5c70db0-baseline-profiles` from `origin/main` + +This file records the required smoke/build/eval commands requested by task `f5c70db0-0106-4200-bf99-0c5105116367` before profile-definition changes. + +## Branch Setup + +```bash +git fetch origin && git switch -c task/f5c70db0-baseline-profiles origin/main +``` + +Result: success. The working branch now tracks `origin/main`. 
+ +## Environment Notes + +- Host kernel: `Linux cn-ubuntu-xgpu 6.17.0-14-generic` +- Nix: `2.33.3` +- `/dev/kvm`: absent in this environment +- Nix builder features observed during `deployer-vm-smoke`: `{benchmark, big-parallel, nixos-test, uid-range}` +- Raw command logs are stored under `plans/baselines/logs/` + +## Baseline Command Results + +| Command | Start | End | Status | Result summary | +| --- | --- | --- | --- | --- | +| `nix run ./nix/test-cluster#cluster -- fresh-smoke` | `2026-04-04T16:46:41+09:00` | `2026-04-04T16:48:14+09:00` | `1` | built the cluster runner closure, then failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp` | `2026-04-04T16:48:18+09:00` | `2026-04-04T16:48:23+09:00` | `1` | failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix run ./nix/test-cluster#cluster -- fresh-matrix` | `2026-04-04T16:48:26+09:00` | `2026-04-04T16:48:29+09:00` | `1` | failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix build .#checks.x86_64-linux.deployer-vm-smoke` | `2026-04-04T16:44:34+09:00` | `2026-04-04T16:50:40+09:00` | `1` | built most of the test closure, then failed because the current builder does not advertise the required `kvm` system feature | + +## Baseline `nix eval` Results + +| Output | Start | End | Status | Result | +| --- | --- | --- | --- | --- | +| `ultracloud-iso` | `2026-04-04T16:43:34+09:00` | `2026-04-04T16:43:41+09:00` | `0` | `/nix/store/j60isp8ai10vkgdncvi3wcjdgxqwjzpy-nixos-system-nixos-26.05.20251208.addf7cf.drv` | +| `node01` | `2026-04-04T16:43:45+09:00` | `2026-04-04T16:43:49+09:00` | `0` | `/nix/store/94g1xyv25s09hyyi924sp5bxb0y8kir9-nixos-system-node01-26.05.20251208.addf7cf.drv` | +| `netboot-control-plane` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:44:01+09:00` | `0` | `/nix/store/afknxzr1mhrlrzrkp8mj9q1fwwahdld3-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` | +| `netboot-worker` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:43:56+09:00` | `1` | `undefined variable 'plasmavmc-server'` at `nix/images/netboot-worker.nix:28:5` | +| `netboot-all-in-one` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:43:56+09:00` | `1` | `undefined variable 'chainfire-server'` at `nix/images/netboot-all-in-one.nix:39:5` | + +## Post-Baseline Repair + +After recording the baseline, `flake.nix` was adjusted so the netboot image configurations receive the UltraCloud overlay during evaluation. That keeps the baseline intact while making the named canonical-profile outputs evaluable. 
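+
+For context, the usual way to make those package names resolvable is to thread the project overlay into each `nixosSystem` call. The snippet below is only an illustrative sketch of that pattern, not a copy of the actual `flake.nix` change; in particular, `ultracloudOverlay` is a hypothetical name standing in for whatever overlay provides `chainfire-server`, `plasmavmc-server`, and the other service packages.
+
+```nix
+# Sketch: evaluate the netboot image with the UltraCloud package overlay applied.
+nixosConfigurations.netboot-all-in-one = nixpkgs.lib.nixosSystem {
+  system = "x86_64-linux";
+  modules = [
+    # Hypothetical overlay exposing the UltraCloud service packages via `pkgs`.
+    { nixpkgs.overlays = [ ultracloudOverlay ]; }
+    ./nix/images/netboot-all-in-one.nix
+  ];
+};
+```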
+ +Post-fix spot check: + +- `ultracloud-iso`: `/nix/store/j60isp8ai10vkgdncvi3wcjdgxqwjzpy-nixos-system-nixos-26.05.20251208.addf7cf.drv` +- `node01`: `/nix/store/di87n45m5v30n8gccbs8pic2j8wbwgvr-nixos-system-node01-26.05.20251208.addf7cf.drv` +- `netboot-control-plane`: `/nix/store/afknxzr1mhrlrzrkp8mj9q1fwwahdld3-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` +- `netboot-worker`: `/nix/store/6x51ss2ql1n4nhi8ad0avhvzk4n6arcr-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` +- `netboot-all-in-one`: `/nix/store/2l57rda3pnd1hivjicfmp53zpimxn00n-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` From bf208ca0ffd29dad18d6d853e6c5a62bb629f567 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:49:28 +0900 Subject: [PATCH 02/12] Run KVM validation on self-hosted runners --- .github/workflows/kvm-publishable.yml | 32 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/.github/workflows/kvm-publishable.yml b/.github/workflows/kvm-publishable.yml index b85f1b2..748b483 100644 --- a/.github/workflows/kvm-publishable.yml +++ b/.github/workflows/kvm-publishable.yml @@ -5,19 +5,38 @@ on: jobs: publishable-kvm-suite: - runs-on: ubuntu-latest + runs-on: + - self-hosted + - linux + - x64 timeout-minutes: 360 steps: - uses: actions/checkout@v4 - - uses: DeterminateSystems/nix-installer-action@v11 - - - uses: DeterminateSystems/magic-nix-cache-action@v8 + - name: Ensure Nix Is Available + run: | + set -euo pipefail + if ! command -v nix >/dev/null 2>&1; then + curl -L https://nixos.org/nix/install | sh -s -- --no-daemon + fi + if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then + . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh + elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then + . "$HOME/.nix-profile/etc/profile.d/nix.sh" + fi + mkdir -p "$HOME/.config/nix" + printf '%s\n' 'experimental-features = nix-command flakes' > "$HOME/.config/nix/nix.conf" + nix --version - name: Probe KVM Environment run: | set -euo pipefail + if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then + . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh + elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then + . "$HOME/.nix-profile/etc/profile.d/nix.sh" + fi echo "hostname=$(hostname)" uname -a id @@ -33,5 +52,10 @@ jobs: - name: Run Publishable KVM Suite run: | set -euo pipefail + if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then + . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh + elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then + . 
"$HOME/.nix-profile/etc/profile.d/nix.sh" + fi chmod +x ./nix/test-cluster/run-publishable-kvm-suite.sh ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite" From 45e77a70edbd3511848dc14907ffcc3df91657d9 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:51:29 +0900 Subject: [PATCH 03/12] Rename KVM workflow to force a fresh dispatch --- .../{kvm-publishable.yml => kvm-publishable-selfhosted.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{kvm-publishable.yml => kvm-publishable-selfhosted.yml} (100%) diff --git a/.github/workflows/kvm-publishable.yml b/.github/workflows/kvm-publishable-selfhosted.yml similarity index 100% rename from .github/workflows/kvm-publishable.yml rename to .github/workflows/kvm-publishable-selfhosted.yml From b8ef9b64ad3f5057c22890fa988045faf112cbb6 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:52:40 +0900 Subject: [PATCH 04/12] Trigger KVM workflow on push --- .github/workflows/kvm-publishable-selfhosted.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index 748b483..e171664 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -1,6 +1,7 @@ name: KVM Publishable Validation on: + push: workflow_dispatch: jobs: From a581c9f3b967dd798f501b1dfc58ef9f92510175 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:57:37 +0900 Subject: [PATCH 05/12] Match KVM workflow to Forgejo runner labels --- .github/workflows/kvm-publishable-selfhosted.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index e171664..5ad436e 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -6,10 +6,7 @@ on: jobs: publishable-kvm-suite: - runs-on: - - self-hosted - - linux - - x64 + runs-on: nix-host timeout-minutes: 360 steps: From 8bb926d66fda7e417dfabe19f25999f959076076 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:59:00 +0900 Subject: [PATCH 06/12] Source Nix before KVM workflow bootstrap --- .github/workflows/kvm-publishable-selfhosted.yml | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index 5ad436e..e2d537a 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -15,14 +15,24 @@ jobs: - name: Ensure Nix Is Available run: | set -euo pipefail - if ! command -v nix >/dev/null 2>&1; then - curl -L https://nixos.org/nix/install | sh -s -- --no-daemon - fi + export PATH="/run/current-system/sw/bin:/nix/var/nix/profiles/default/bin:$HOME/.nix-profile/bin:$PATH" if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then . "$HOME/.nix-profile/etc/profile.d/nix.sh" fi + if ! command -v nix >/dev/null 2>&1; then + if ! command -v xz >/dev/null 2>&1; then + echo "Nix is not on PATH and xz is unavailable for bootstrap" + exit 1 + fi + curl -L https://nixos.org/nix/install | sh -s -- --no-daemon + if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then + . 
/nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh + elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then + . "$HOME/.nix-profile/etc/profile.d/nix.sh" + fi + fi mkdir -p "$HOME/.config/nix" printf '%s\n' 'experimental-features = nix-command flakes' > "$HOME/.config/nix/nix.conf" nix --version From 26a306da1c2879449665b5df86ea7cdda30f113b Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 03:59:52 +0900 Subject: [PATCH 07/12] Harden KVM suite runner bootstrap --- .github/workflows/kvm-publishable-selfhosted.yml | 6 ++++-- nix/test-cluster/run-publishable-kvm-suite.sh | 12 +++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index e2d537a..b2a44c5 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -40,12 +40,13 @@ jobs: - name: Probe KVM Environment run: | set -euo pipefail + export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH" if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then . "$HOME/.nix-profile/etc/profile.d/nix.sh" fi - echo "hostname=$(hostname)" + echo "hostname=$(uname -n)" uname -a id test -e /dev/kvm @@ -60,10 +61,11 @@ jobs: - name: Run Publishable KVM Suite run: | set -euo pipefail + export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH" if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then . "$HOME/.nix-profile/etc/profile.d/nix.sh" fi chmod +x ./nix/test-cluster/run-publishable-kvm-suite.sh - ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite" + bash ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite" diff --git a/nix/test-cluster/run-publishable-kvm-suite.sh b/nix/test-cluster/run-publishable-kvm-suite.sh index 7425173..326d79d 100755 --- a/nix/test-cluster/run-publishable-kvm-suite.sh +++ b/nix/test-cluster/run-publishable-kvm-suite.sh @@ -1,6 +1,8 @@ #!/usr/bin/env bash set -euo pipefail +export PATH="/run/current-system/sw/bin:/usr/bin:/bin:${PATH}" + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." 
&& pwd)" LOG_DIR="${1:-${ULTRACLOUD_KVM_PUBLISHABLE_LOG_DIR:-${REPO_ROOT}/work/publishable-kvm-suite}}" @@ -11,10 +13,18 @@ log() { printf '[publishable-kvm-suite] %s\n' "$*" } +get_hostname() { + if command -v hostname >/dev/null 2>&1; then + hostname + else + uname -n + fi +} + capture_environment() { { printf 'started_at=%s\n' "$(date -Is)" - printf 'hostname=%s\n' "$(hostname)" + printf 'hostname=%s\n' "$(get_hostname)" printf 'kernel=%s\n' "$(uname -a)" printf 'pwd=%s\n' "$(pwd)" printf 'user=%s\n' "$(id -un)" From c1c610d2db7b8dd430b0c544d746ca4ab9a608f4 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 04:01:50 +0900 Subject: [PATCH 08/12] Route KVM suite temp files to larger volumes --- .../workflows/kvm-publishable-selfhosted.yml | 3 ++ nix/test-cluster/run-publishable-kvm-suite.sh | 35 +++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index b2a44c5..a802f22 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -57,6 +57,9 @@ jobs: if [[ -f /sys/module/kvm_amd/parameters/nested ]]; then echo "kvm_amd_nested=$(cat /sys/module/kvm_amd/parameters/nested)" fi + echo "runner_temp=${RUNNER_TEMP}" + df -h / /tmp /var/tmp "$RUNNER_TEMP" || true + df -h /nix || true - name: Run Publishable KVM Suite run: | diff --git a/nix/test-cluster/run-publishable-kvm-suite.sh b/nix/test-cluster/run-publishable-kvm-suite.sh index 326d79d..0e3b1de 100755 --- a/nix/test-cluster/run-publishable-kvm-suite.sh +++ b/nix/test-cluster/run-publishable-kvm-suite.sh @@ -13,6 +13,27 @@ log() { printf '[publishable-kvm-suite] %s\n' "$*" } +choose_runtime_root() { + local candidate avail best="" best_avail=-1 + + for candidate in /nix/var/tmp /var/tmp /tmp "${HOME}"; do + mkdir -p "${candidate}" 2>/dev/null || continue + avail="$(df -Pk "${candidate}" 2>/dev/null | awk 'NR==2 { print $4 }')" + [[ -n "${avail}" ]] || continue + if (( avail > best_avail )); then + best="${candidate}" + best_avail="${avail}" + fi + done + + [[ -n "${best}" ]] || { + log "no writable runtime root found" + return 1 + } + + printf '%s\n' "${best}/ultracloud-publishable-kvm-suite" +} + get_hostname() { if command -v hostname >/dev/null 2>&1; then hostname @@ -21,6 +42,16 @@ get_hostname() { fi } +prepare_runtime_dirs() { + local runtime_root + + runtime_root="$(choose_runtime_root)" + export ULTRACLOUD_KVM_RUNTIME_ROOT="${runtime_root}" + export TMPDIR="${runtime_root}/tmp" + export XDG_CACHE_HOME="${runtime_root}/xdg-cache" + mkdir -p "${TMPDIR}" "${XDG_CACHE_HOME}" +} + capture_environment() { { printf 'started_at=%s\n' "$(date -Is)" @@ -33,6 +64,9 @@ capture_environment() { printf 'branch=%s\n' "$(git -C "${REPO_ROOT}" branch --show-current)" printf 'commit=%s\n' "$(git -C "${REPO_ROOT}" rev-parse HEAD)" printf 'nix_version=%s\n' "$(nix --version)" + printf 'runtime_root=%s\n' "${ULTRACLOUD_KVM_RUNTIME_ROOT:-}" + printf 'tmpdir=%s\n' "${TMPDIR:-}" + printf 'xdg_cache_home=%s\n' "${XDG_CACHE_HOME:-}" printf 'kvm_present=%s\n' "$([[ -e /dev/kvm ]] && echo yes || echo no)" if [[ -e /dev/kvm ]]; then printf 'kvm_stat=%s\n' "$(stat -c '%A %U %G %t:%T' /dev/kvm)" @@ -79,6 +113,7 @@ run_case() { } main() { + prepare_runtime_dirs capture_environment [[ -e /dev/kvm ]] || { From f931f892e392685073075dcaf559350b229fb544 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 04:17:40 +0900 Subject: [PATCH 09/12] Document and harden remote KVM publishable 
lane --- README.md | 2 ++ docs/testing.md | 2 ++ nix/test-cluster/README.md | 1 + nix/test-cluster/flake.nix | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index daef69e..aefcf70 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,8 @@ nix run ./nix/test-cluster#cluster -- fresh-matrix ./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite ``` +The repository-owned remote entrypoint for the same suite is [`.github/workflows/kvm-publishable-selfhosted.yml`](.github/workflows/kvm-publishable-selfhosted.yml). It runs the wrapper on Forgejo runners labeled `nix-host`, and those runners must expose `/dev/kvm` with nested virtualization enabled. + Project-done release proof now requires both halves of the public validation surface to be green: - `baremetal-iso` and `baremetal-iso-e2e` for the canonical `deployer -> installer -> nix-agent` bare-metal bootstrap path diff --git a/docs/testing.md b/docs/testing.md index 975d4cf..e6e9d93 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -86,6 +86,8 @@ Use these commands as the release-facing local proof set: - `baremetal-iso-e2e`: flake-check wrapper around the same canonical ISO harness - `deployer-vm-smoke`: lightweight regression proving that `nix-agent` can activate a host-built target closure without guest-side compilation +The repository-owned remote entrypoint for the same publishable KVM proof is [`.github/workflows/kvm-publishable-selfhosted.yml`](../.github/workflows/kvm-publishable-selfhosted.yml). It targets Forgejo runners labeled `nix-host` and expects `/dev/kvm` plus nested virtualization on those hosts. + ## Responsibility Coverage - `baremetal-iso` and `baremetal-iso-e2e` are the canonical proof for `deployer -> installer -> nix-agent`. They cover phone-home, install-plan materialization, Disko, reboot, and desired-system activation. diff --git a/nix/test-cluster/README.md b/nix/test-cluster/README.md index 11934d5..00f0ae7 100644 --- a/nix/test-cluster/README.md +++ b/nix/test-cluster/README.md @@ -7,6 +7,7 @@ The same harness also owns the canonical bare-metal bootstrap proof: a raw-QEMU When `/dev/kvm` is absent, the portable fallback is not another harness subcommand. Use the root-flake non-KVM lane instead: `nix build .#checks.x86_64-linux.portable-control-plane-regressions`. When `/dev/kvm` and nested virtualization are available, the reproducible publishable lane is `./nix/test-cluster/run-publishable-kvm-suite.sh`, which records environment metadata and then runs `fresh-smoke`, `fresh-demo-vm-webapp`, and `fresh-matrix` in order. +The repository-owned remote entrypoint for the same suite is [`.github/workflows/kvm-publishable-selfhosted.yml`](../../.github/workflows/kvm-publishable-selfhosted.yml), which targets Forgejo `nix-host` runners with `/dev/kvm` and nested virtualization enabled. 
## What it validates diff --git a/nix/test-cluster/flake.nix b/nix/test-cluster/flake.nix index a1f2d53..4d858aa 100644 --- a/nix/test-cluster/flake.nix +++ b/nix/test-cluster/flake.nix @@ -74,7 +74,7 @@ text = '' repo_root="$(${pkgs.gitMinimal}/bin/git rev-parse --show-toplevel 2>/dev/null || ${pkgs.coreutils}/bin/pwd)" export PHOTON_CLUSTER_FLAKE="''${repo_root}/nix/test-cluster" - exec "''${repo_root}/nix/test-cluster/run-cluster.sh" "$@" + exec ${pkgs.bash}/bin/bash "''${repo_root}/nix/test-cluster/run-cluster.sh" "$@" ''; }; From c527d50a9e4ae0d84ad4ae5b5b666309cee81170 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 04:19:40 +0900 Subject: [PATCH 10/12] Use manual checkout for KVM publishable workflow --- .../workflows/kvm-publishable-selfhosted.yml | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index a802f22..6a88b23 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -10,8 +10,6 @@ jobs: timeout-minutes: 360 steps: - - uses: actions/checkout@v4 - - name: Ensure Nix Is Available run: | set -euo pipefail @@ -37,6 +35,43 @@ jobs: printf '%s\n' 'experimental-features = nix-command flakes' > "$HOME/.config/nix/nix.conf" nix --version + - name: Checkout Repository + env: + REPO_URL: https://git.centraworks.net/centra/photoncloud-monorepo + run: | + set -euo pipefail + export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH" + + choose_checkout_root() { + local candidate avail best="" best_avail=-1 + for candidate in /var/tmp /tmp "$HOME"; do + mkdir -p "$candidate" 2>/dev/null || continue + avail="$(df -Pk "$candidate" 2>/dev/null | awk 'NR==2 { print $4 }')" + [[ -n "$avail" ]] || continue + if (( avail > best_avail )); then + best="$candidate" + best_avail="$avail" + fi + done + printf '%s\n' "$best" + } + + checkout_root="$(choose_checkout_root)" + repo_root="$(mktemp -d "${checkout_root}/ultracloud-kvm-checkout.XXXXXX")" + auth="$(printf '%s' "${GITHUB_ACTOR}:${GITHUB_TOKEN}" | base64 | tr -d '\n')" + + git init "$repo_root" + cd "$repo_root" + git remote add origin "$REPO_URL" + git -c http.extraHeader="AUTHORIZATION: basic ${auth}" fetch --depth=1 origin "${GITHUB_SHA}" + git checkout --detach FETCH_HEAD + git config --global --add safe.directory "$repo_root" + + { + printf 'REPO_ROOT=%s\n' "$repo_root" + printf 'CHECKOUT_ROOT=%s\n' "$checkout_root" + } >> "$GITHUB_ENV" + - name: Probe KVM Environment run: | set -euo pipefail @@ -58,7 +93,10 @@ jobs: echo "kvm_amd_nested=$(cat /sys/module/kvm_amd/parameters/nested)" fi echo "runner_temp=${RUNNER_TEMP}" + echo "repo_root=${REPO_ROOT}" + echo "checkout_root=${CHECKOUT_ROOT}" df -h / /tmp /var/tmp "$RUNNER_TEMP" || true + df -h "$REPO_ROOT" || true df -h /nix || true - name: Run Publishable KVM Suite @@ -70,5 +108,5 @@ jobs: elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then . 
"$HOME/.nix-profile/etc/profile.d/nix.sh" fi - chmod +x ./nix/test-cluster/run-publishable-kvm-suite.sh + cd "$REPO_ROOT" bash ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite" From 955214f393c68d8215e51c6dad24053aa7100f75 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 04:20:17 +0900 Subject: [PATCH 11/12] Pass Forgejo token into manual checkout step --- .github/workflows/kvm-publishable-selfhosted.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index 6a88b23..6afbc57 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -38,6 +38,8 @@ jobs: - name: Checkout Repository env: REPO_URL: https://git.centraworks.net/centra/photoncloud-monorepo + REPO_ACTOR: ${{ github.actor }} + REPO_TOKEN: ${{ github.token }} run: | set -euo pipefail export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH" @@ -58,7 +60,7 @@ jobs: checkout_root="$(choose_checkout_root)" repo_root="$(mktemp -d "${checkout_root}/ultracloud-kvm-checkout.XXXXXX")" - auth="$(printf '%s' "${GITHUB_ACTOR}:${GITHUB_TOKEN}" | base64 | tr -d '\n')" + auth="$(printf '%s' "${REPO_ACTOR}:${REPO_TOKEN}" | base64 | tr -d '\n')" git init "$repo_root" cd "$repo_root" From 3cf0cd49b918bfe848e8d93460f84ce1acf4fa05 Mon Sep 17 00:00:00 2001 From: centra Date: Sun, 5 Apr 2026 04:24:15 +0900 Subject: [PATCH 12/12] Pin KVM publishable lane to mouse runner --- .github/workflows/kvm-publishable-selfhosted.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml index 6afbc57..efe24ed 100644 --- a/.github/workflows/kvm-publishable-selfhosted.yml +++ b/.github/workflows/kvm-publishable-selfhosted.yml @@ -6,7 +6,9 @@ on: jobs: publishable-kvm-suite: - runs-on: nix-host + runs-on: + - nix-host + - cn-nixos-mouse-runner timeout-minutes: 360 steps: