diff --git a/.github/workflows/kvm-publishable-selfhosted.yml b/.github/workflows/kvm-publishable-selfhosted.yml
new file mode 100644
index 0000000..efe24ed
--- /dev/null
+++ b/.github/workflows/kvm-publishable-selfhosted.yml
@@ -0,0 +1,133 @@
+name: KVM Publishable Validation
+
+on:
+  push:
+  workflow_dispatch:
+
+jobs:
+  publishable-kvm-suite:
+    runs-on:
+      - nix-host
+      - cn-nixos-mouse-runner
+    timeout-minutes: 360
+
+    steps:
+      - name: Ensure Nix Is Available
+        run: |
+          set -euo pipefail
+          export PATH="/run/current-system/sw/bin:/nix/var/nix/profiles/default/bin:$HOME/.nix-profile/bin:$PATH"
+          # Source the system-wide nix-daemon.sh profile script if it exists, otherwise the per-user nix.sh.
+          if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then
+            . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh
+          elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+          # Bootstrap a --no-daemon install only when nix is still not on PATH.
+          if ! command -v nix >/dev/null 2>&1; then
+            if ! command -v xz >/dev/null 2>&1; then
+              echo "Nix is not on PATH and xz is unavailable for bootstrap"
+              exit 1
+            fi
+            # -f makes curl fail on HTTP errors instead of piping an error page into sh;
+            # with pipefail set above, that failure aborts the step.
+            curl --proto '=https' --tlsv1.2 -fsSL https://nixos.org/nix/install | sh -s -- --no-daemon
+            if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then
+              . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh
+            elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then
+              . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+            fi
+          fi
+          mkdir -p "$HOME/.config/nix"
+          printf '%s\n' 'experimental-features = nix-command flakes' > "$HOME/.config/nix/nix.conf"
+          nix --version
+
+      - name: Checkout Repository
+        env:
+          REPO_URL: https://git.centraworks.net/centra/photoncloud-monorepo
+          REPO_ACTOR: ${{ github.actor }}
+          REPO_TOKEN: ${{ github.token }}
+        run: |
+          set -euo pipefail
+          export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
+
+          # Print whichever of /var/tmp, /tmp, $HOME has the most free space
+          # (available KiB per `df -Pk`). Returns 1 with a message on stderr
+          # when no candidate is usable, so mktemp below never runs against "/".
+          choose_checkout_root() {
+            local candidate avail best="" best_avail=-1
+            for candidate in /var/tmp /tmp "$HOME"; do
+              mkdir -p "$candidate" 2>/dev/null || continue
+              # mkdir -p also succeeds on an existing directory we cannot write to.
+              [[ -w "$candidate" ]] || continue
+              avail="$(df -Pk "$candidate" 2>/dev/null | awk 'NR==2 { print $4 }')"
+              [[ -n "$avail" ]] || continue
+              if (( avail > best_avail )); then
+                best="$candidate"
+                best_avail="$avail"
+              fi
+            done
+            if [[ -z "$best" ]]; then
+              echo "No writable checkout root found (tried /var/tmp, /tmp, $HOME)" >&2
+              return 1
+            fi
+            printf '%s\n' "$best"
+          }
+
+          checkout_root="$(choose_checkout_root)"
+          repo_root="$(mktemp -d "${checkout_root}/ultracloud-kvm-checkout.XXXXXX")"
+          # Base64 "actor:token" pair used as the basic-auth header for the fetch below.
+          auth="$(printf '%s' "${REPO_ACTOR}:${REPO_TOKEN}" | base64 | tr -d '\n')"
+
+          git init "$repo_root"
+          cd "$repo_root"
+          git remote add origin "$REPO_URL"
+          git -c http.extraHeader="AUTHORIZATION: basic ${auth}" fetch --depth=1 origin "${GITHUB_SHA}"
+          git checkout --detach FETCH_HEAD
+          git config --global --add safe.directory "$repo_root"
+
+          # Hand the checkout locations to the following steps.
+          {
+            printf 'REPO_ROOT=%s\n' "$repo_root"
+            printf 'CHECKOUT_ROOT=%s\n' "$checkout_root"
+          } >> "$GITHUB_ENV"
+
+      - name: Probe KVM Environment
+        run: |
+          set -euo pipefail
+          export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
+          if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then
+            . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh
+          elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+          echo "hostname=$(uname -n)"
+          uname -a
+          id
+          # Under `set -e`, a missing /dev/kvm fails the step here.
+          test -e /dev/kvm
+          ls -l /dev/kvm
+          if [[ -f /sys/module/kvm_intel/parameters/nested ]]; then
+            echo "kvm_intel_nested=$(cat /sys/module/kvm_intel/parameters/nested)"
+          fi
+          if [[ -f /sys/module/kvm_amd/parameters/nested ]]; then
+            echo "kvm_amd_nested=$(cat /sys/module/kvm_amd/parameters/nested)"
+          fi
+          echo "runner_temp=${RUNNER_TEMP}"
+          echo "repo_root=${REPO_ROOT}"
+          echo "checkout_root=${CHECKOUT_ROOT}"
+          # Disk-usage reporting is best-effort: a df failure does not fail the probe.
+          df -h / /tmp /var/tmp "$RUNNER_TEMP" || true
+          df -h "$REPO_ROOT" || true
+          df -h /nix || true
+
+      - name: Run Publishable KVM Suite
+        run: |
+          set -euo pipefail
+          export PATH="/run/current-system/sw/bin:/usr/bin:/bin:$PATH"
+          if [[ -f /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh ]]; then
+            . /nix/var/nix/profiles/default/etc/profile.d/nix-daemon.sh
+          elif [[ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+          cd "$REPO_ROOT"
+          bash ./nix/test-cluster/run-publishable-kvm-suite.sh "$RUNNER_TEMP/publishable-kvm-suite"
diff --git a/README.md b/README.md index 2f6de10..eb0af84 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ nix run ./nix/test-cluster#cluster -- chainfire-live-membership-proof ./nix/test-cluster/run-publishable-kvm-suite.sh ./work/publishable-kvm-suite ``` -The checked-in entrypoint for the publishable nested-KVM suite is the local wrapper `./nix/test-cluster/run-publishable-kvm-suite.sh`. Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally not part of this re-aggregated baseline. +The checked-in local entrypoint for the publishable nested-KVM suite is `./nix/test-cluster/run-publishable-kvm-suite.sh`. The repository-owned remote entrypoint is [`.github/workflows/kvm-publishable-selfhosted.yml`](.github/workflows/kvm-publishable-selfhosted.yml), which runs the same wrapper on Forgejo runners labeled `nix-host` and `cn-nixos-mouse-runner`.
For the full supported-surface proof on a local AMD/KVM host, use `./nix/test-cluster/run-supported-surface-final-proof.sh ./work/final-proofs/latest`; it keeps builders local, builds `single-node-trial-vm`, runs `single-node-quickstart`, and captures the publishable KVM suite logs in one place. `nix run ./nix/test-cluster#cluster -- durability-proof` is the canonical chainfire flaredb deployer backup/restore lane. It persists artifacts under `./work/durability-proof/latest`, proves logical backup/restore for ChainFire keys and FlareDB SQL rows, uses the canonical Deployer admin pre-register request itself as the backup artifact, verifies that the pre-registered node survives a `deployer.service` restart, replays the same request idempotently, and injects CoronaFS plus LightningStor failures against the same live KVM cluster. `nix run ./nix/test-cluster#cluster -- rollout-soak` is the longer-running control-plane and rollout companion lane. It rebuilds from clean local KVM runtime state, persists artifacts under `./work/rollout-soak/latest`, validates exactly one planned `draining` maintenance cycle and one fail-stop worker-loss cycle on the two native-runtime workers, holds each degraded state for the configured soak window, then restarts `deployer`, `fleet-scheduler`, `node-agent`, `chainfire`, and `flaredb` before revalidating the cluster. The soak root also carries explicit scope markers so the supported boundary is encoded in the proof artifacts rather than only in docs. The steady-state KVM nodes do not run `nix-agent.service`, so the soak lane records explicit `nix-agent` scope markers instead of pretending a live-cluster `nix-agent` restart happened. diff --git a/docs/testing.md b/docs/testing.md index da43c20..9f13578 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -176,7 +176,7 @@ Use these commands as the release-facing local proof set: `single-node-trial-vm` and `single-node-quickstart` are the standalone VM-platform story. 
They keep the minimal KVM-backed surface separate from the rollout stack. -The checked-in entrypoint for the publishable KVM proof is the local wrapper `./nix/test-cluster/run-publishable-kvm-suite.sh`. Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally excluded from this baseline branch. +The checked-in local entrypoint for the publishable KVM proof is `./nix/test-cluster/run-publishable-kvm-suite.sh`. The repository-owned remote entrypoint is [`.github/workflows/kvm-publishable-selfhosted.yml`](../.github/workflows/kvm-publishable-selfhosted.yml), which runs the same wrapper on Forgejo runners labeled `nix-host` and `cn-nixos-mouse-runner`. The 2026-04-10 local AMD/KVM proof snapshot is recorded under `./work/final-proofs/32f64c10-1b74-4d8a-8d7d-b2cc6bf6b4f0-final` for `supported-surface-guard`, `single-node-trial-vm`, and `single-node-quickstart`, under `./work/publishable-kvm-suite` for the passing `fresh-smoke`, `fresh-demo-vm-webapp`, `fresh-matrix`, and wrapper environment capture, and under `./work/rollout-soak/20260410T164549+0900` for the longer-running rollout/control-plane soak. The 2026-04-10 exact bare-metal check-runner proof is recorded under `./work/baremetal-iso-e2e/0de75570-dabd-471b-95fe-5898c54e2e8c`; its outer `environment.txt` records `execution_model=materialized-check-runner`, while `state/environment.txt` records `vm_accelerator_mode=kvm`. diff --git a/nix/test-cluster/README.md b/nix/test-cluster/README.md index 902bf28..ac6b011 100644 --- a/nix/test-cluster/README.md +++ b/nix/test-cluster/README.md @@ -20,7 +20,7 @@ When `/dev/kvm` and nested virtualization are available, the reproducible publis `./nix/test-cluster/run-core-control-plane-ops-proof.sh` is the focused operator lifecycle proof for `chainfire`, `flaredb`, and `iam`. 
It records the published ChainFire live-membership API boundary, the FlareDB additive-first migration and destructive-DDL boundary, and the standalone IAM bootstrap hardening plus signing-key, credential, and mTLS rotation proof under `./work/core-control-plane-ops-proof`. `./nix/test-cluster/work-root-budget.sh` is the checked helper for local disk budget reporting, stronger local enforcement, and safer cleanup guidance under `./work`. The dated 2026-04-10 artifact root for the focused control-plane proof is `./work/core-control-plane-ops-proof/20260410T172148+09:00`. -Runner-specific workflow wiring from `task/f5c70db0-baseline-profiles` is intentionally excluded from this re-aggregated baseline; the checked-in artifact here is the local wrapper. +The repository-owned remote entrypoint for the same publishable KVM proof is [`.github/workflows/kvm-publishable-selfhosted.yml`](../../.github/workflows/kvm-publishable-selfhosted.yml). It runs the local wrapper on Forgejo runners labeled `nix-host` and `cn-nixos-mouse-runner`. 
## What it validates diff --git a/plans/baselines/logs/nix-build-deployer-vm-smoke.meta b/plans/baselines/logs/nix-build-deployer-vm-smoke.meta new file mode 100644 index 0000000..42dd56e --- /dev/null +++ b/plans/baselines/logs/nix-build-deployer-vm-smoke.meta @@ -0,0 +1,4 @@ +command=nix build .#checks.x86_64-linux.deployer-vm-smoke +start=2026-04-04T16:44:34+09:00 +end=2026-04-04T16:50:40+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-netboot-all-in-one.meta b/plans/baselines/logs/nix-eval-netboot-all-in-one.meta new file mode 100644 index 0000000..6319d17 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-all-in-one.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-all-in-one.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:43:56+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-netboot-control-plane.meta b/plans/baselines/logs/nix-eval-netboot-control-plane.meta new file mode 100644 index 0000000..3e4a995 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-control-plane.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-control-plane.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:44:01+09:00 +status=0 diff --git a/plans/baselines/logs/nix-eval-netboot-worker.meta b/plans/baselines/logs/nix-eval-netboot-worker.meta new file mode 100644 index 0000000..2101023 --- /dev/null +++ b/plans/baselines/logs/nix-eval-netboot-worker.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.netboot-worker.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:54+09:00 +end=2026-04-04T16:43:56+09:00 +status=1 diff --git a/plans/baselines/logs/nix-eval-node01.meta b/plans/baselines/logs/nix-eval-node01.meta new file mode 100644 index 0000000..389903e --- /dev/null +++ b/plans/baselines/logs/nix-eval-node01.meta @@ -0,0 +1,4 @@ +command=nix eval --raw 
.#nixosConfigurations.node01.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:45+09:00 +end=2026-04-04T16:43:49+09:00 +status=0 diff --git a/plans/baselines/logs/nix-eval-ultracloud-iso.meta b/plans/baselines/logs/nix-eval-ultracloud-iso.meta new file mode 100644 index 0000000..bdf8aa6 --- /dev/null +++ b/plans/baselines/logs/nix-eval-ultracloud-iso.meta @@ -0,0 +1,4 @@ +command=nix eval --raw .#nixosConfigurations.ultracloud-iso.config.system.build.toplevel.drvPath +start=2026-04-04T16:43:34+09:00 +end=2026-04-04T16:43:41+09:00 +status=0 diff --git a/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta b/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta new file mode 100644 index 0000000..922ba21 --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-demo-vm-webapp.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp +start=2026-04-04T16:48:18+09:00 +end=2026-04-04T16:48:23+09:00 +status=1 diff --git a/plans/baselines/logs/nix-run-fresh-matrix.meta b/plans/baselines/logs/nix-run-fresh-matrix.meta new file mode 100644 index 0000000..f4f0684 --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-matrix.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-matrix +start=2026-04-04T16:48:26+09:00 +end=2026-04-04T16:48:29+09:00 +status=1 diff --git a/plans/baselines/logs/nix-run-fresh-smoke.meta b/plans/baselines/logs/nix-run-fresh-smoke.meta new file mode 100644 index 0000000..85e673d --- /dev/null +++ b/plans/baselines/logs/nix-run-fresh-smoke.meta @@ -0,0 +1,4 @@ +command=nix run ./nix/test-cluster#cluster -- fresh-smoke +start=2026-04-04T16:46:41+09:00 +end=2026-04-04T16:48:14+09:00 +status=1 diff --git a/plans/baselines/main-baseline-2026-04-04.md b/plans/baselines/main-baseline-2026-04-04.md new file mode 100644 index 0000000..310a87f --- /dev/null +++ b/plans/baselines/main-baseline-2026-04-04.md @@ -0,0 +1,52 @@ +# UltraCloud Baseline 2026-04-04 + +Branch: 
`task/f5c70db0-baseline-profiles` from `origin/main` + +This file records the required smoke/build/eval commands requested by task `f5c70db0-0106-4200-bf99-0c5105116367` before profile-definition changes. + +## Branch Setup + +```bash +git fetch origin && git switch -c task/f5c70db0-baseline-profiles origin/main +``` + +Result: success. The working branch now tracks `origin/main`. + +## Environment Notes + +- Host kernel: `Linux cn-ubuntu-xgpu 6.17.0-14-generic` +- Nix: `2.33.3` +- `/dev/kvm`: absent in this environment +- Nix builder features observed during `deployer-vm-smoke`: `{benchmark, big-parallel, nixos-test, uid-range}` +- Raw command logs are stored under `plans/baselines/logs/` + +## Baseline Command Results + +| Command | Start | End | Status | Result summary | +| --- | --- | --- | --- | --- | +| `nix run ./nix/test-cluster#cluster -- fresh-smoke` | `2026-04-04T16:46:41+09:00` | `2026-04-04T16:48:14+09:00` | `1` | built the cluster runner closure, then failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix run ./nix/test-cluster#cluster -- fresh-demo-vm-webapp` | `2026-04-04T16:48:18+09:00` | `2026-04-04T16:48:23+09:00` | `1` | failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix run ./nix/test-cluster#cluster -- fresh-matrix` | `2026-04-04T16:48:26+09:00` | `2026-04-04T16:48:29+09:00` | `1` | failed preflight with `/dev/kvm is not present; nested-KVM VM validation requires hardware virtualization` | +| `nix build .#checks.x86_64-linux.deployer-vm-smoke` | `2026-04-04T16:44:34+09:00` | `2026-04-04T16:50:40+09:00` | `1` | built most of the test closure, then failed because the current builder does not advertise the required `kvm` system feature | + +## Baseline `nix eval` Results + +| Output | Start | End | Status | Result | +| --- | --- | --- | --- | --- | +| `ultracloud-iso` | `2026-04-04T16:43:34+09:00` | 
`2026-04-04T16:43:41+09:00` | `0` | `/nix/store/j60isp8ai10vkgdncvi3wcjdgxqwjzpy-nixos-system-nixos-26.05.20251208.addf7cf.drv` | +| `node01` | `2026-04-04T16:43:45+09:00` | `2026-04-04T16:43:49+09:00` | `0` | `/nix/store/94g1xyv25s09hyyi924sp5bxb0y8kir9-nixos-system-node01-26.05.20251208.addf7cf.drv` | +| `netboot-control-plane` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:44:01+09:00` | `0` | `/nix/store/afknxzr1mhrlrzrkp8mj9q1fwwahdld3-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` | +| `netboot-worker` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:43:56+09:00` | `1` | `undefined variable 'plasmavmc-server'` at `nix/images/netboot-worker.nix:28:5` | +| `netboot-all-in-one` | `2026-04-04T16:43:54+09:00` | `2026-04-04T16:43:56+09:00` | `1` | `undefined variable 'chainfire-server'` at `nix/images/netboot-all-in-one.nix:39:5` | + +## Post-Baseline Repair + +After recording the baseline, `flake.nix` was adjusted so the netboot image configurations receive the UltraCloud overlay during evaluation. That keeps the baseline intact while making the named canonical-profile outputs evaluable. + +Post-fix spot check: + +- `ultracloud-iso`: `/nix/store/j60isp8ai10vkgdncvi3wcjdgxqwjzpy-nixos-system-nixos-26.05.20251208.addf7cf.drv` +- `node01`: `/nix/store/di87n45m5v30n8gccbs8pic2j8wbwgvr-nixos-system-node01-26.05.20251208.addf7cf.drv` +- `netboot-control-plane`: `/nix/store/afknxzr1mhrlrzrkp8mj9q1fwwahdld3-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` +- `netboot-worker`: `/nix/store/6x51ss2ql1n4nhi8ad0avhvzk4n6arcr-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv` +- `netboot-all-in-one`: `/nix/store/2l57rda3pnd1hivjicfmp53zpimxn00n-nixos-system-nixos-kexec-26.05.20251208.addf7cf.drv`