WIP snapshot: preserve dirty worktree

This commit is contained in:
centra 2026-03-20 16:25:11 +09:00
parent 1b9ed0cd22
commit ed0f9f42f4
Signed by: centra
GPG key ID: 0C09689D20B25ACA
736 changed files with 62387 additions and 111098 deletions

View file

View file

@ -14,6 +14,7 @@ jobs:
workspaces: ${{ steps.filter.outputs.changes }} workspaces: ${{ steps.filter.outputs.changes }}
any_changed: ${{ steps.filter.outputs.workspaces_any_changed }} any_changed: ${{ steps.filter.outputs.workspaces_any_changed }}
global_changed: ${{ steps.filter.outputs.global }} global_changed: ${{ steps.filter.outputs.global }}
shared_crates_changed: ${{ steps.filter.outputs.shared_crates }}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4
- uses: dorny/paths-filter@v3 - uses: dorny/paths-filter@v3
@ -24,9 +25,17 @@ jobs:
- 'flake.nix' - 'flake.nix'
- 'flake.lock' - 'flake.lock'
- 'nix/**' - 'nix/**'
- 'nix-nos/**'
- '.github/workflows/nix.yml' - '.github/workflows/nix.yml'
- 'Cargo.toml' - 'Cargo.toml'
- 'Cargo.lock' - 'Cargo.lock'
- 'crates/**'
- 'client-common/**'
- 'baremetal/**'
- 'scripts/**'
- 'specifications/**'
- 'docs/**'
shared_crates: 'crates/**'
chainfire: 'chainfire/**' chainfire: 'chainfire/**'
flaredb: 'flaredb/**' flaredb: 'flaredb/**'
iam: 'iam/**' iam: 'iam/**'
@ -60,7 +69,21 @@ jobs:
- name: Run PhotonCloud Gate - name: Run PhotonCloud Gate
run: | run: |
nix run .#gate-ci -- --workspace ${{ matrix.workspace }} --tier 0 --no-logs nix run ./nix/ci#gate-ci -- --workspace ${{ matrix.workspace }} --tier 0 --no-logs
shared-crates-gate:
needs: filter
if: ${{ needs.filter.outputs.shared_crates_changed == 'true' }}
runs-on: ubuntu-latest
name: gate (shared crates)
steps:
- uses: actions/checkout@v4
- uses: DeterminateSystems/nix-installer-action@v11
- uses: DeterminateSystems/magic-nix-cache-action@v8
- name: Run Shared Crates Gate
run: |
nix run ./nix/ci#gate-ci -- --shared-crates --tier 0 --no-logs
# Build server packages (tier 1+) # Build server packages (tier 1+)
build: build:
@ -88,7 +111,7 @@ jobs:
# Summary job for PR status checks # Summary job for PR status checks
ci-status: ci-status:
needs: [filter, gate] needs: [filter, gate, shared-crates-gate]
if: always() if: always()
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
@ -97,10 +120,19 @@ jobs:
if [[ "${{ needs.gate.result }}" == "failure" ]]; then if [[ "${{ needs.gate.result }}" == "failure" ]]; then
exit 1 exit 1
fi fi
if [[ "${{ needs.shared-crates-gate.result }}" == "failure" ]]; then
exit 1
fi
if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" ]]; then if [[ "${{ needs.filter.outputs.any_changed }}" == "true" || "${{ needs.filter.outputs.global_changed }}" == "true" ]]; then
if [[ "${{ needs.gate.result }}" == "skipped" ]]; then if [[ "${{ needs.gate.result }}" == "skipped" ]]; then
echo "Gate was skipped despite changes. This is unexpected." echo "Gate was skipped despite changes. This is unexpected."
exit 1 exit 1
fi fi
fi fi
if [[ "${{ needs.filter.outputs.shared_crates_changed }}" == "true" ]]; then
if [[ "${{ needs.shared-crates-gate.result }}" == "skipped" ]]; then
echo "Shared crates gate was skipped despite crates/** changes. This is unexpected."
exit 1
fi
fi
echo "CI passed or was correctly skipped." echo "CI passed or was correctly skipped."

18
.gitignore vendored
View file

@ -1,5 +1,10 @@
# Claude Code # Claude Code
.cccc/ .cccc/
.code/
.codex/
.claude.json
.ralphrc
.sisyphus/
# Rust # Rust
target/ target/
@ -9,6 +14,7 @@ target/
# Nix # Nix
result result
result-* result-*
plasmavmc/result
# local CI artifacts # local CI artifacts
work/ work/
@ -33,6 +39,8 @@ Thumbs.db
# Logs # Logs
*.log *.log
quanta/test_output_renamed.log
plasmavmc/kvm_test_output.log
# VM disk images and ISOs (large binary files) # VM disk images and ISOs (large binary files)
**/*.qcow2 **/*.qcow2
@ -54,3 +62,13 @@ flaredb/repomix-output.xml
# Temporary files # Temporary files
*.tmp *.tmp
*.bak *.bak
tmp_test.txt
tmp_test_write.txt
tmp_write_check.txt
# Runtime state
data/
chainfire/data/
flaredb/data/
creditservice/.tmp/
nightlight/.tmp/

27
CONTRIBUTING.md Normal file
View file

@ -0,0 +1,27 @@
# Contributing
PhotonCloud uses Nix as the primary development and validation entrypoint.
## Setup
```bash
nix develop
```
## Before sending changes
Run the strongest local validation you can afford.
```bash
nix run ./nix/test-cluster#cluster -- fresh-smoke
```
For smaller changes, use the narrower commands under `nix/test-cluster`.
## Expectations
- keep service startup on file-first `--config` paths
- prefer Nix-native workflows over ad hoc host scripts
- do not commit secrets, private keys, runtime state, or generated disk images
- document new validation entrypoints in `README.md` or `docs/`
- when changing multi-node behavior, validate on the VM cluster rather than only with unit tests

View file

@ -1,35 +0,0 @@
Title: Foreman Task Brief (Project-specific)
Purpose (free text)
- Complete PROJECT.md Item 12 (Nightlight) - the FINAL infrastructure component (COMPLETE)
- Achieve 12/12 PROJECT.md deliverables (NOW 12/12)
- Prepare for production deployment using T032 bare-metal provisioning
Current objectives (ranked, short)
- 1) T033 Nightlight completion: S4 PromQL Engine (P0), S5 Storage, S6 Integration
- 2) Production deployment prep: NixOS modules + Nightlight observability stack
- 3) Deferred features: T029.S5 practical app demo, FlareDB SQL layer (post-MVP)
Standing work (edit freely)
- Task status monitoring: Check docs/por/T*/task.yaml for stale/blocked tasks
- Risk radar: Monitor POR.md Risk Radar for new/escalating risks
- Progress tracking: Verify step completion matches claimed LOC/test counts
- Stale task alerts: Flag tasks with no progress >48h
- Evidence validation: Spot-check evidence trail (cargo check, test counts)
Useful references
- PROJECT.md
- docs/por/POR.md
- docs/por/T*/task.yaml (active tasks)
- docs/evidence/** and .cccc/work/**
How to act each run
- Do one useful, non-interactive step within the time box (≤ 30m).
- Save temporary outputs to .cccc/work/foreman/<YYYYMMDD-HHMMSS>/.
- Write one message to .cccc/mailbox/foreman/to_peer.md with header To: Both|PeerA|PeerB and wrap body in <TO_PEER>..</TO_PEER>.
Escalation
- If a decision is needed, write a 6–10 line RFD and ask the peer.
Safety
- Do not modify orchestrator code/policies; provide checkable artifacts.

202
LICENSE Normal file
View file

@ -0,0 +1,202 @@
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

37
Makefile Normal file
View file

@ -0,0 +1,37 @@
# PhotonCloud Makefile
# Unifies build and test commands
.PHONY: all build cluster-up cluster-down cluster-status cluster-validate cluster-smoke cluster-matrix cluster-bench-storage clean
# Build all services (using Nix)
build:
nix build .#packages.x86_64-linux.default
# Cluster Management
cluster-up:
nix run ./nix/test-cluster#cluster -- start
cluster-down:
nix run ./nix/test-cluster#cluster -- stop
cluster-status:
nix run ./nix/test-cluster#cluster -- status
cluster-validate:
nix run ./nix/test-cluster#cluster -- validate
cluster-smoke:
nix run ./nix/test-cluster#cluster -- fresh-smoke
cluster-matrix:
nix run ./nix/test-cluster#cluster -- fresh-matrix
cluster-bench-storage:
nix run ./nix/test-cluster#cluster -- fresh-storage-bench
cluster-clean:
nix run ./nix/test-cluster#cluster -- clean
# Clean up build artifacts
clean:
rm -rf result

View file

@ -1,398 +0,0 @@
# PlasmaCloud/PhotonCloud と Nix-NOS の統合分析
## Architecture Decision (2025-12-13)
**決定:** Nix-NOSを汎用ネットワークモジュールとして別リポジトリに分離する。
### Three-Layer Architecture
```
Layer 3: PlasmaCloud Cluster (T061)
- plasmacloud-cluster.nix
- cluster-config.json生成
- Deployer (Rust)
depends on ↓
Layer 2: PlasmaCloud Network (T061)
- plasmacloud-network.nix
- FiberLB BGP連携
- PrismNET統合
depends on ↓
Layer 1: Nix-NOS Generic (T062) ← 別リポジトリ
- BGP (BIRD2/GoBGP)
- VLAN
- Network interfaces
- PlasmaCloudを知らない汎用モジュール
```
### Repository Structure
- **github.com/centra/nix-nos**: Layer 1 (汎用、VyOS/OpenWrt代替)
- **github.com/centra/plasmacloud**: Layers 2+3 (既存リポジトリ)
---
## 1. 既存プロジェクトの概要
PlasmaCloudPhotonCloudは、以下のコンポーネントで構成されるクラウド基盤プロジェクト
### コアサービス
| コンポーネント | 役割 | 技術スタック |
|---------------|------|-------------|
| **ChainFire** | 分散KVストアetcd互換 | Rust, Raft (openraft) |
| **FlareDB** | SQLデータベース | Rust, KVバックエンド |
| **IAM** | 認証・認可 | Rust, JWT/mTLS |
| **PlasmaVMC** | VM管理 | Rust, KVM/FireCracker |
| **PrismNET** | オーバーレイネットワーク | Rust, OVN連携 |
| **LightningSTOR** | オブジェクトストレージ | Rust, S3互換 |
| **FlashDNS** | DNS | Rust, hickory-dns |
| **FiberLB** | ロードバランサー | Rust, L4/L7, BGP予定 |
| **NightLight** | メトリクス | Rust, Prometheus互換 |
| **k8shost** | コンテナオーケストレーション | Rust, K8s API互換 |
### インフラ層
- **NixOSモジュール**: 各サービス用 (`nix/modules/`)
- **first-boot-automation**: 自動クラスタ参加
- **PXE/Netboot**: ベアメタルプロビジョニング
- **TLS証明書管理**: 開発用証明書生成スクリプト
---
## 2. Nix-NOS との統合ポイント
### 2.1 Baremetal Provisioning → Deployer強化
**既存の実装:**
```
first-boot-automation.nix
├── cluster-config.json による設定注入
├── bootstrap vs join の自動判定
├── マーカーファイルによる冪等性
└── systemd サービス連携
```
**Nix-NOSで追加すべき機能:**
| 既存 | Nix-NOS追加 |
|------|-------------|
| cluster-config.json (手動作成) | topology.nix から自動生成 |
| 単一クラスタ構成 | 複数クラスタ/サイト対応 |
| nixos-anywhere 依存 | Deployer (Phone Home + Push) |
| 固定IP設定 | IPAM連携による動的割当 |
**統合設計:**
```nix
# topology.nixNix-NOS
{
nix-nos.clusters.plasmacloud = {
nodes = {
"node01" = {
role = "control-plane";
ip = "10.0.1.10";
services = [ "chainfire" "flaredb" "iam" ];
};
"node02" = { role = "control-plane"; ip = "10.0.1.11"; };
"node03" = { role = "worker"; ip = "10.0.1.12"; };
};
# Nix-NOSが自動生成 → first-boot-automationが読む
# cluster-config.json の内容をNix評価時に決定
};
}
```
### 2.2 Network Management → PrismNET + FiberLB + Nix-NOS BGP
**既存の実装:**
```
PrismNET (prismnet/)
├── VPC/Subnet/Port管理
├── Security Groups
├── IPAM
└── OVN連携
FiberLB (fiberlb/)
├── L4/L7ロードバランシング
├── ヘルスチェック
├── VIP管理
└── BGP統合設計済み、GoBGPサイドカー
```
**Nix-NOSで追加すべき機能:**
```
Nix-NOS Network Layer
├── BGP設定生成BIRD2
│ ├── iBGP/eBGP自動計算
│ ├── Route Reflector対応
│ └── ポリシー抽象化
├── topology.nix → systemd-networkd
├── OpenWrt/Cisco設定生成将来
└── FiberLB BGP連携
```
**統合設計:**
```nix
# Nix-NOSのBGPモジュール → FiberLBのGoBGP設定に統合
{
nix-nos.network.bgp = {
autonomousSystems = {
"65000" = {
members = [ "node01" "node02" "node03" ];
ibgp.strategy = "route-reflector";
ibgp.reflectors = [ "node01" ];
};
};
# FiberLBのVIPをBGPで広報
vipAdvertisements = {
"fiberlb" = {
vips = [ "10.0.100.1" "10.0.100.2" ];
nextHop = "self";
communities = [ "65000:100" ];
};
};
};
# FiberLBモジュールとの連携
services.fiberlb.bgp = {
enable = true;
# Nix-NOSが生成するGoBGP設定を参照
configFile = config.nix-nos.network.bgp.gobgpConfig;
};
}
```
### 2.3 K8sパチモン → k8shost + Pure NixOS Alternative
**既存の実装:**
```
k8shost (k8shost/)
├── Pod管理gRPC API
├── Service管理ClusterIP/NodePort
├── Node管理
├── CNI連携
├── CSI連携
└── FiberLB/FlashDNS連携
```
**Nix-NOSの役割:**
k8shostはすでにKubernetesのパチモンとして機能している。Nix-NOSは
1. **k8shostを使う場合**: k8shostクラスタ自体のデプロイをNix-NOSで管理
2. **Pure NixOSK8sなし**: より軽量な選択肢として、Systemd + Nix-NOSでサービス管理
```
┌─────────────────────────────────────────────────────────────┐
│ Orchestration Options │
├─────────────────────────────────────────────────────────────┤
│ Option A: k8shost (K8s-like) │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Nix-NOS manages: cluster topology, network, certs │ │
│ │ k8shost manages: pods, services, scaling │ │
│ └─────────────────────────────────────────────────────┘ │
│ │
│ Option B: Pure NixOS (K8s-free) │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ Nix-NOS manages: everything │ │
│ │ systemd + containers, static service discovery │ │
│ │ Use case: クラウド基盤自体の管理 │ │
│ └─────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
```
**重要な洞察:**
> 「クラウドの基盤そのものを作るのにKubernetesは使いたくない」
これは正しいアプローチ。PlasmaCloudのコアサービスChainFire, FlareDB, IAM等
- K8sの上で動くのではなく、K8sを提供する側
- Pure NixOS + Systemdで管理されるべき
- Nix-NOSはこのレイヤーを担当
---
## 3. 具体的な統合計画
### Phase 1: Baremetal Provisioning統合
**目標:** first-boot-automationをNix-NOSのtopology.nixと連携
```nix
# nix/modules/first-boot-automation.nix への追加
{ config, lib, ... }:
let
# Nix-NOSのトポロジーから設定を生成
clusterConfig =
if config.nix-nos.cluster != null then
config.nix-nos.cluster.generateClusterConfig {
hostname = config.networking.hostName;
}
else
# 従来のcluster-config.json読み込み
builtins.fromJSON (builtins.readFile /etc/nixos/secrets/cluster-config.json);
in {
# 既存のfirst-boot-automationロジックはそのまま
# ただし設定ソースをNix-NOSに切り替え可能に
}
```
### Phase 2: BGP/Network統合
**目標:** FiberLBのBGP連携T055.S3をNix-NOSで宣言的に管理
```nix
# nix/modules/fiberlb-bgp-nixnos.nix
{ config, lib, pkgs, ... }:
let
fiberlbCfg = config.services.fiberlb;
nixnosBgp = config.nix-nos.network.bgp;
in {
config = lib.mkIf (fiberlbCfg.enable && nixnosBgp.enable) {
# GoBGP設定をNix-NOSから生成
services.gobgpd = {
enable = true;
configFile = pkgs.writeText "gobgp.yaml" (
nixnosBgp.generateGobgpConfig {
localAs = nixnosBgp.getLocalAs config.networking.hostName;
routerId = nixnosBgp.getRouterId config.networking.hostName;
neighbors = nixnosBgp.getPeers config.networking.hostName;
}
);
};
# FiberLBにGoBGPアドレスを注入
services.fiberlb.bgp = {
gobgpAddress = "127.0.0.1:50051";
};
};
}
```
### Phase 3: Deployer実装
**目標:** Phone Home + Push型デプロイメントコントローラー
```
plasmacloud/
├── deployer/ # 新規追加
│ ├── src/
│ │ ├── api.rs # Phone Home API
│ │ ├── orchestrator.rs # デプロイワークフロー
│ │ ├── state.rs # ード状態管理ChainFire連携
│ │ └── iso_generator.rs # ISO自動生成
│ └── Cargo.toml
└── nix/
└── modules/
└── deployer.nix # NixOSモジュール
```
**ChainFireとの連携:**
DeployerはChainFireを状態ストアとして使用
```rust
// deployer/src/state.rs
struct NodeState {
hostname: String,
status: NodeStatus, // Pending, Provisioning, Active, Failed
bootstrap_key_hash: Option<String>,
ssh_pubkey: Option<String>,
last_seen: DateTime<Utc>,
}
impl DeployerState {
async fn register_node(&self, node: &NodeState) -> Result<()> {
// ChainFireに保存
self.chainfire_client
.put(format!("deployer/nodes/{}", node.hostname), node.to_json())
.await
}
}
```
---
## 4. アーキテクチャ全体図
```
┌─────────────────────────────────────────────────────────────────────┐
│ Nix-NOS Layer │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ topology.nix │ │
│ │ - ノード定義 │ │
│ │ - ネットワークトポロジー │ │
│ │ - サービス配置 │ │
│ └─────────────────────────────────────────────────────────────┘ │
│ │ │
│ generates │ │
│ ▼ │
│ ┌──────────────┬──────────────┬──────────────┬──────────────┐ │
│ │ NixOS Config │ BIRD Config │ GoBGP Config │ cluster- │ │
│ │ (systemd) │ (BGP) │ (FiberLB) │ config.json │ │
│ └──────────────┴──────────────┴──────────────┴──────────────┘ │
└─────────────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────────────┐
│ PlasmaCloud Services │
│ ┌───────────────────────────────────────────────────────────────┐ │
│ │ Control Plane │ │
│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │
│ │ │ChainFire │ │ FlareDB │ │ IAM │ │ Deployer │ │ │
│ │ │(Raft KV) │ │ (SQL) │ │(AuthN/Z) │ │ (新規) │ │ │
│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │
│ └───────────────────────────────────────────────────────────────┘ │
│ │
│ ┌───────────────────────────────────────────────────────────────┐ │
│ │ Network Plane │ │
│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │
│ │ │ PrismNET │ │ FiberLB │ │ FlashDNS │ │ BIRD2 │ │ │
│ │ │ (OVN) │ │(LB+BGP) │ │ (DNS) │ │(Nix-NOS) │ │ │
│ │ └──────────┘ └──────────┘ └──────────┘ └──────────┘ │ │
│ └───────────────────────────────────────────────────────────────┘ │
│ │
│ ┌───────────────────────────────────────────────────────────────┐ │
│ │ Compute Plane │ │
│ │ ┌──────────┐ ┌──────────┐ ┌──────────┐ │ │
│ │ │PlasmaVMC │ │ k8shost │ │Lightning │ │ │
│ │ │(VM/FC) │ │(K8s-like)│ │ STOR │ │ │
│ │ └──────────┘ └──────────┘ └──────────┘ │ │
│ └───────────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────────────┘
```
---
## 5. 優先度と実装順序
| 優先度 | 機能 | 依存関係 | 工数 |
|--------|------|----------|------|
| **P0** | topology.nix → cluster-config.json生成 | なし | 1週間 |
| **P0** | BGPモジュールBIRD2設定生成 | なし | 2週間 |
| **P1** | FiberLB BGP連携GoBGP | T055.S3完了 | 2週間 |
| **P1** | Deployer基本実装 | ChainFire | 3週間 |
| **P2** | OpenWrt設定生成 | BGPモジュール | 2週間 |
| **P2** | ISO自動生成パイプライン | Deployer完了後 | 1週間 |
| **P2** | 各サービスの設定をNixで管理可能なように | なし | 適当 |
---
## 6. 結論
PlasmaCloud/PhotonCloudプロジェクトは、Nix-NOSの構想を実装するための**理想的な基盤**
1. **すでにNixOSモジュール化されている** → Nix-NOSモジュールとの統合が容易
2. **first-boot-automationが存在** → Deployerの基礎として活用可能
3. **FiberLBにBGP設計がある** → Nix-NOSのBGPモジュールと自然に統合
4. **ChainFireが状態ストア** → Deployer状態管理に利用可能
5. **k8shostが存在するがK8sではない** → 「K8sパチモン」の哲学と一致
**次のアクション:**
1. Nix-NOSモジュールをPlasmaCloudリポジトリに追加
2. topology.nix → cluster-config.json生成の実装
3. BGPモジュールBIRD2の実装とFiberLB連携

View file

@ -1,105 +0,0 @@
# Project Overview
これは、日本発のクラウド基盤を作るためのプロジェクトです。
OpenStackなどの既存の使いにくいクラウド基板の代替となり、ついでに基礎技術を各種ソフトウェアに転用できるようにする。
# Principal
Peer Aへ**自分で戦略を**決めて良い!好きにやれ!
# Current Priorities
一通り実装を終わらせ、使いやすいプラットフォームと仕様が完成することを目標とする。
実装すべきもの:
1. クラスター管理用KVSchainfire
- これは、ライブラリとして作ることにする。単体でとりあえずKVSとして簡易的にも使えるという想定。
- Raft+Gossip。
2. IAM基盤aegisという名前にしたい。
- 様々な認証方法に対応しておいてほしい。
- あと、サービス間の認証もうまくやる必要がある。mTLSでやることになるだろう。IAMとしてやるのが正解かどうかはわからないが。
3. DBaaSのための高速KVSFlareDB
- そこそこクエリ効率の良いKVSを作り、その上にSQL互換レイヤーなどが乗れるようにする。
- 超高速である必要がある。
- 結果整合性モードと強整合性モードを両方載せられるようにしたい。
- Tsurugiのような高速なDBが参考になるかも知れない。
- DBaaSのためでもあるが、高速分散KVSということで、他のもののメタデータストアとして使えるべき。
- Chainfireとの棲み分けとしては、Chainfireは単体で使う時用と、大規模な場合はクラスター管理に集中させ、メタデータのストア特に、サービス間の連携をするような場合は他のサービスのメタデータにアクセスしたくなるだろう。その時に、このKVSから読めれば良い。はFlareDBにすると良さそう。
4. VM基盤PlasmaVMC
- ちゃんとした抽象化をすることで、様々なVMを扱えるようにしたいKVM,FireCracker,mvisorなどなど
5. オブジェクトストレージ基盤LightningSTOR
- この基盤の標準的な感じのある程度共通化されており、使いやすいAPIと、S3互換なAPIがあると良いかも
- メタデータストアにFlareDBが使えるように当然なっているべき
6. DNSFlashDNS
- PowerDNSを100%完全に代替可能なようにしてほしい。
- Route53のようなサービスが作れるようにしたい。
- BINDも使いたくない。
- 逆引きDNSをやるためにとんでもない行数のBINDのファイルを書くというのがあり、バカバカしすぎるのでサブネットマスクみたいなものに対応すると良い。
- DNS All-Rounderという感じにしたい。
7. ロードバランサーFiberLB
- 超高速なロードバランサーとは名ばかりで、実体としてはBGPでやるので良いような気がしている。
- AWS ELBみたいなことをできるようにしたい。
- MaglevによるL4ロードバランシング
- BGP AnycastによるL2ロードバランシング
- L7ロードバランシング
- これらをいい感じにできると良い(既存のソフトウェアでできるかも?これは要確認。)
8. Kubernetesクラスタをいい感じにホストできるもの
- k0sとかk3sとかが参考になるかも知れない。
9. これらをNixOS上で動くようにパッケージ化をしたりすると良いFlake化
- あと、Nixで設定できると良い。まあ設定ファイルを生成するだけなのでそれはできると思うが
10. NixによるベアメタルプロビジョニングDeployer
- Phone Home + Push型のデプロイメントコントローラー
- topology.nix からクラスタ設定を自動生成
- ChainFireを状態ストアとして使用
- ISO自動生成パイプライン対応
11. オーバーレイネットワーク
- マルチテナントでもうまく動くためには、ユーザーの中でアクセスできるネットワークなど、考えなければいけないことが山ほどある。これを処理するものも必要。
- とりあえずネットワーク部分自体の実装はOVNとかで良い。
12. オブザーバビリティコンポーネントNightLight
- メトリクスストアが必要
- VictoriaMetricsはmTLSが有料なので、作る必要がある
- 完全オープンソースでやりたいからね
- 最低限、Prometheus互換PromQLとスケーラビリティ、Push型というのは必須になる
- メトリクスのデータをどこに置くかは良く良く考えないといけない。スケーラビリティを考えるとS3互換ストレージの上に載せたいが…
- あと、圧縮するかどうかなど
13. クレジット・クオータ管理CreditService
- プロジェクトごとのリソース使用量と課金を管理する「銀行」のようなサービス
- 各サービスPlasmaVMCなどからのリソース作成リクエストをインターセプトして残高確認Admission Controlを行う
- NightLightから使用量メトリクスを収集して定期的に残高を引き落とすBilling Batch
# Recent Changes (2025-12-11)
- **Renaming**:
- `Nightlight` -> `NightLight` (監視・メトリクス)
- `PrismNET` -> `PrismNET` (ネットワーク)
- `PlasmaCloud` -> `PhotonCloud` (プロジェクト全体コードネーム)
- **Architecture Decision**:
- IAMにクオータ管理を持たせず、専用の `CreditService` を新設することを決定。
- `NightLight` を使用量計測のバックエンドとして活用する方針を策定。
# Next Steps
1. **CreditServiceの実装**:
- プロジェクトごとのWallet管理、残高管理機能
- gRPC APIによるAdmission Controlの実装
2. **NightLightの実装完了**:
- 永続化層とクエリエンジンの完成
- `CreditService` へのデータ提供機能の実装
3. **PlasmaVMCの改修**:
- `CreditService` と連携したリソース作成時のチェック処理追加
- プロジェクト単位のリソース総量制限の実装
# 守るべき事柄
1. Rustで書く。
2. 全部のソフトウェアにおいて、コードベースの構造や依存ライブラリ、仕様や使い方を揃えて、統一感があるようにする。
3. テスト可能なように作る。また、テストをちゃんと書く。スケーラブルかどうかや、実際に動くかどうかもテスト可能なように良く考えたうえで作る。
4. スケーラビリティに気をつけて書く。ボトルネックになる箇所はないか?と常に確認する。
5. 統一感ある仕様をちゃんと考える。specificationsの中にmdで書いていってほしい。1ソフトウェアごとにフォルダを作り、その中に仕様を書く。
6. 設定ファイルについても統一感ある仕様が必要。
7. マルチテナントに関して最初から考慮したうえで設計する次の年にAWSやGCPでそのまま採用されてもおかしくないような性能や使いやすさが必要
8. ホームラボ用途も満たすようにしたい。
9. NixのFlakeで環境を作ったり固定したりすると良い。
10. 前方互換性は気にする必要がないすでにある実装に縛られる必要はなく、両方を変更して良い。v2とかv3とかそういうふうにバージョンを増やしていくのはやめてほしい。そうではなく、完璧な一つの実装を作ることに専念してほしい。
11. ライブラリは可能な限り最新版を使う。この先も長くメンテナンスされることを想定したい。
# 実戦テスト
全ての作ったコンポーネントについて、実践的なテストを作ってバグや仕様の悪い点を洗い出し、修正する。
NixやVM、コンテナなどあらゆるものを活用してよい。
これにより、実用レベルまで持っていくことが期待される。
実用的なアプリケーションを作ってみるとか、パフォーマンスを実際に高負荷な試験で確認するとか、そのレベルのものが求められている。
また、各コンポーネントごとのテストも行うべきだが、様々なものを組み合わせるテストも行うべきである。これも含まれる。
また、設定のやり方がちゃんと統一されているかなど、細かい点まで気を配ってやる必要がある。

527
README.md
View file

@ -1,507 +1,50 @@
# PhotonCloud (旧 PlasmaCloud) # PhotonCloud
**A modern, multi-tenant cloud infrastructure platform built in Rust** PhotonCloud is a Nix-first cloud platform workspace that assembles a small control plane, network services, VM hosting, shared storage, object storage, and gateway services into one reproducible repository.
> NOTE: プロジェクトコードネームを PlasmaCloud から PhotonCloud に改称。コンポーネント名も Nightlight → NightLight へ統一済み(詳細は `PROJECT.md` の Recent Changes を参照)。 The canonical local proof path is the six-node VM cluster under [`nix/test-cluster`](/home/centra/cloud/nix/test-cluster/README.md). It builds all guest images on the host, boots them as hardware-like QEMU nodes, and validates real multi-node behavior.
> 併存する「PlasmaCloud」表記は旧コードネームを指します。PhotonCloud と読み替えてください。
PhotonCloud provides a complete cloud computing stack with strong tenant isolation, role-based access control (RBAC), and seamless integration between compute, networking, and storage services. ## Components
## MVP-Beta Status: COMPLETE ✅ - `chainfire`: replicated coordination store
- `flaredb`: replicated KV and metadata store
The MVP-Beta milestone validates end-to-end tenant isolation and core infrastructure provisioning: - `iam`: identity, token issuance, and authorization
- `prismnet`: tenant networking control plane
- ✅ **IAM**: User authentication, RBAC, multi-tenant isolation - `flashdns`: authoritative DNS service
- ✅ **PrismNET**: VPC overlay networking with tenant boundaries - `fiberlb`: load balancer control plane and dataplane
- ✅ **PlasmaVMC**: VM provisioning with network attachment - `plasmavmc`: VM control plane and worker agents
- ✅ **Integration**: E2E tests validate complete tenant path - `coronafs`: shared filesystem for mutable VM volumes
- `lightningstor`: object storage and VM image backing
**Test Results**: 8/8 integration tests passing - `k8shost`: Kubernetes-style hosting control plane
- IAM: 6/6 tenant path tests - `apigateway`: external API and proxy surface
- Network+VM: 2/2 integration tests - `nightlight`: metrics ingestion and query service
- `creditservice`: minimal reference quota/credit service
- `deployer`: bootstrap and phone-home deployment service
- `fleet-scheduler`: non-Kubernetes service scheduler for bare-metal cluster services
## Quick Start ## Quick Start
### Get Started in 3 Steps
1. **Deploy the Platform**
```bash
# Start IAM service
cd iam && cargo run --bin iam-server -- --port 50080
# Start PrismNET service
cd prismnet && cargo run --bin prismnet-server -- --port 50081
# Start PlasmaVMC service
cd plasmavmc && cargo run --bin plasmavmc-server -- --port 50082
```
2. **Onboard Your First Tenant**
```bash
# Create user, provision network, deploy VM
# See detailed guide below
```
3. **Verify End-to-End**
```bash
# Run integration tests
cd iam && cargo test --test tenant_path_integration
cd plasmavmc && cargo test --test prismnet_integration -- --ignored
```
**For detailed instructions**: [Tenant Onboarding Guide](docs/getting-started/tenant-onboarding.md)
## Architecture Overview
```
┌─────────────────────────────────────────────────────────────┐
│ User / API Client │
└─────────────────────────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ IAM (Identity & Access Management) │
│ • User authentication & JWT tokens │
│ • RBAC with hierarchical scopes (Org → Project) │
│ • Cross-tenant access denial │
└─────────────────────────────────────────────────────────────┘
┌─────────────┴─────────────┐
↓ ↓
┌──────────────────────┐ ┌──────────────────────┐
│ PrismNET │ │ PlasmaVMC │
│ • VPC overlay │────▶│ • VM provisioning │
│ • Subnets + DHCP │ │ • Hypervisor mgmt │
│ • Ports (IP/MAC) │ │ • Network attach │
│ • Security Groups │ │ • KVM, Firecracker │
└──────────────────────┘ └──────────────────────┘
```
**Full Architecture**: [MVP-Beta Tenant Path Architecture](docs/architecture/mvp-beta-tenant-path.md)
## Core Components
### IAM (Identity & Access Management)
**Location**: `/iam`
Multi-tenant identity and access management with comprehensive RBAC.
**Features**:
- User and service account management
- Hierarchical scopes: System → Organization → Project
- Custom role creation with fine-grained permissions
- Policy evaluation with conditional logic
- JWT token issuance with tenant claims
**Services**:
- `IamAdminService`: User, role, and policy management
- `IamAuthzService`: Authorization and permission checks
- `IamTokenService`: Token issuance and validation
**Quick Start**:
```bash ```bash
cd iam nix develop
cargo build --release nix run ./nix/test-cluster#cluster -- fresh-smoke
cargo run --bin iam-server -- --port 50080
``` ```
### PrismNET (Network Virtualization) ## Main Entrypoints
**Location**: `/prismnet` - workspace flake: [flake.nix](/home/centra/cloud/flake.nix)
- VM validation harness: [nix/test-cluster/README.md](/home/centra/cloud/nix/test-cluster/README.md)
- shared volume notes: [coronafs/README.md](/home/centra/cloud/coronafs/README.md)
- minimal quota-service rationale: [creditservice/README.md](/home/centra/cloud/creditservice/README.md)
- archived manual VM launch scripts: [baremetal/vm-cluster/README.md](/home/centra/cloud/baremetal/vm-cluster/README.md)
VPC-based overlay networking with tenant isolation. ## Repository Guide
**Features**: - [docs/README.md](/home/centra/cloud/docs/README.md): documentation entrypoint
- Virtual Private Cloud (VPC) provisioning - [docs/testing.md](/home/centra/cloud/docs/testing.md): validation path summary
- Subnet management with CIDR allocation - [docs/component-matrix.md](/home/centra/cloud/docs/component-matrix.md): supported multi-component compositions
- Port allocation with IP/MAC assignment - [docs/storage-benchmarks.md](/home/centra/cloud/docs/storage-benchmarks.md): latest CoronaFS and LightningStor lab numbers
- DHCP server integration - `plans/`: design notes and exploration documents
- Security group enforcement
- OVN integration for production networking
**Services**: ## Scope
- `VpcService`: VPC lifecycle management
- `SubnetService`: Subnet CRUD operations
- `PortService`: Port allocation and attachment
- `SecurityGroupService`: Firewall rule management
**Quick Start**: PhotonCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products.
```bash
cd prismnet
export IAM_ENDPOINT=http://localhost:50080
cargo build --release
cargo run --bin prismnet-server -- --port 50081
```
### PlasmaVMC (VM Provisioning & Management)
**Location**: `/plasmavmc`
Virtual machine lifecycle management with hypervisor abstraction.
**Features**:
- VM provisioning with tenant scoping
- Hypervisor abstraction (KVM, Firecracker)
- Network attachment via PrismNET ports
- CPU, memory, and disk configuration
- VM metadata persistence (ChainFire)
- Live migration support (planned)
**Services**:
- `VmService`: VM lifecycle (create, start, stop, delete)
**Quick Start**:
```bash
cd plasmavmc
export NOVANET_ENDPOINT=http://localhost:50081
export IAM_ENDPOINT=http://localhost:50080
cargo build --release
cargo run --bin plasmavmc-server -- --port 50082
```
## Future Components (Roadmap)
### FlashDNS (DNS Service)
**Status**: Planned for next milestone
DNS resolution within tenant VPCs with automatic record creation.
**Features** (Planned):
- Tenant-scoped DNS zones
- Automatic hostname assignment for VMs
- DNS record lifecycle tied to resources
- Integration with PrismNET for VPC resolution
### FiberLB (Load Balancing)
**Status**: Planned for next milestone
Layer 4/7 load balancing with tenant isolation.
**Features** (Planned):
- Load balancer provisioning within VPCs
- Backend pool management (VM targets)
- VIP allocation from tenant subnets
- Health checks and failover
### LightningStor (Block Storage)
**Status**: Planned for next milestone
Distributed block storage with snapshot support.
**Features** (Planned):
- Volume creation and attachment to VMs
- Snapshot lifecycle management
- Replication and high availability
- Integration with ChainFire for immutable logs
## Testing
### Integration Test Suite
PlasmaCloud includes comprehensive integration tests validating the complete E2E tenant path.
**IAM Tests** (6 tests, 778 LOC):
```bash
cd iam
cargo test --test tenant_path_integration
# Tests:
# ✅ test_tenant_setup_flow
# ✅ test_cross_tenant_denial
# ✅ test_rbac_project_scope
# ✅ test_hierarchical_scope_inheritance
# ✅ test_custom_role_fine_grained_permissions
# ✅ test_multiple_role_bindings
```
**Network + VM Tests** (2 tests, 570 LOC):
```bash
cd plasmavmc
cargo test --test prismnet_integration -- --ignored
# Tests:
# ✅ prismnet_port_attachment_lifecycle
# ✅ test_network_tenant_isolation
```
**Coverage**: 8/8 tests passing (100% success rate)
See [E2E Test Documentation](docs/por/T023-e2e-tenant-path/e2e_test.md) for detailed test descriptions.
## Documentation
### Getting Started
- **[Tenant Onboarding Guide](docs/getting-started/tenant-onboarding.md)**: Complete walkthrough of deploying your first tenant
### Architecture
- **[MVP-Beta Tenant Path](docs/architecture/mvp-beta-tenant-path.md)**: Complete system architecture with diagrams
- **[Component Integration](docs/architecture/mvp-beta-tenant-path.md#component-boundaries)**: How services communicate
### Testing & Validation
- **[E2E Test Documentation](docs/por/T023-e2e-tenant-path/e2e_test.md)**: Comprehensive test suite description
- **[T023 Summary](docs/por/T023-e2e-tenant-path/SUMMARY.md)**: MVP-Beta deliverables and test results
### Component Specifications
- [IAM Specification](specifications/iam.md)
- [PrismNET Specification](specifications/prismnet.md)
- [PlasmaVMC Specification](specifications/plasmavmc.md)
## Tenant Isolation Model
PlasmaCloud enforces tenant isolation at three layers:
### Layer 1: IAM Policy Enforcement
Every API call is validated against the user's JWT token:
- Token includes `org_id` and `project_id` claims
- Resources are scoped as: `org/{org_id}/project/{project_id}/{resource_type}/{id}`
- RBAC policies enforce: `resource.org_id == token.org_id`
- Cross-tenant access results in 403 Forbidden
### Layer 2: Network VPC Isolation
Each VPC provides a logical network boundary:
- VPC scoped to an `org_id`
- OVN overlay ensures traffic isolation between VPCs
- Different tenants can use the same CIDR without collision
- Security groups provide intra-VPC firewall rules
### Layer 3: VM Scoping
Virtual machines are scoped to tenant organizations:
- VM metadata includes `org_id` and `project_id`
- VMs can only attach to ports in their tenant's VPC
- VM operations filter by token scope
- Hypervisor isolation ensures compute boundary
**Validation**: All three layers tested in [cross-tenant denial tests](docs/por/T023-e2e-tenant-path/e2e_test.md#test-scenario-2-cross-tenant-denial).
## Example Workflow
### Create a Tenant with Network and VM
```bash
# 1. Authenticate and get token
grpcurl -plaintext -d '{
"principal_id": "alice",
"org_id": "acme-corp",
"project_id": "project-alpha"
}' localhost:50080 iam.v1.IamTokenService/IssueToken
export TOKEN="<your-token>"
# 2. Create VPC
grpcurl -plaintext -H "Authorization: Bearer $TOKEN" -d '{
"org_id": "acme-corp",
"project_id": "project-alpha",
"name": "main-vpc",
"cidr": "10.0.0.0/16"
}' localhost:50081 prismnet.v1.VpcService/CreateVpc
export VPC_ID="<vpc-id>"
# 3. Create Subnet
grpcurl -plaintext -H "Authorization: Bearer $TOKEN" -d '{
"org_id": "acme-corp",
"project_id": "project-alpha",
"vpc_id": "'$VPC_ID'",
"name": "web-subnet",
"cidr": "10.0.1.0/24",
"gateway": "10.0.1.1",
"dhcp_enabled": true
}' localhost:50081 prismnet.v1.SubnetService/CreateSubnet
export SUBNET_ID="<subnet-id>"
# 4. Create Port
grpcurl -plaintext -H "Authorization: Bearer $TOKEN" -d '{
"org_id": "acme-corp",
"project_id": "project-alpha",
"subnet_id": "'$SUBNET_ID'",
"name": "vm-port",
"ip_address": "10.0.1.10"
}' localhost:50081 prismnet.v1.PortService/CreatePort
export PORT_ID="<port-id>"
# 5. Create VM with Network
grpcurl -plaintext -H "Authorization: Bearer $TOKEN" -d '{
"name": "web-server-1",
"org_id": "acme-corp",
"project_id": "project-alpha",
"spec": {
"network": [{
"id": "eth0",
"port_id": "'$PORT_ID'"
}]
}
}' localhost:50082 plasmavmc.v1.VmService/CreateVm
```
**Full walkthrough**: See [Tenant Onboarding Guide](docs/getting-started/tenant-onboarding.md)
## Development
### Prerequisites
- Rust 1.70+ with Cargo
- Protocol Buffers compiler (protoc)
- Optional: KVM for real VM execution
- Optional: OVN for production networking
### Build from Source
```bash
# Clone repository
git clone https://github.com/your-org/plasmacloud.git
cd cloud
# Initialize submodules
git submodule update --init --recursive
# Build all components
cd iam && cargo build --release
cd ../prismnet && cargo build --release
cd ../plasmavmc && cargo build --release
```
### Run Tests
```bash
# IAM tests
cd iam && cargo test --test tenant_path_integration
# Network + VM tests
cd plasmavmc && cargo test --test prismnet_integration -- --ignored
# Unit tests (all components)
cargo test
```
### Project Structure
```
cloud/
├── iam/ # Identity & Access Management
│ ├── crates/
│ │ ├── iam-api/ # gRPC services
│ │ ├── iam-authz/ # Authorization engine
│ │ ├── iam-store/ # Data persistence
│ │ └── iam-types/ # Core types
│ └── tests/
│ └── tenant_path_integration.rs # E2E tests
├── prismnet/ # Network Virtualization
│ ├── crates/
│ │ ├── prismnet-server/ # gRPC services
│ │ ├── prismnet-api/ # Protocol buffers
│ │ ├── prismnet-metadata/ # Metadata store
│ │ └── prismnet-ovn/ # OVN integration
│ └── proto/
├── plasmavmc/ # VM Provisioning
│ ├── crates/
│ │ ├── plasmavmc-server/ # VM service
│ │ ├── plasmavmc-api/ # Protocol buffers
│ │ ├── plasmavmc-hypervisor/ # Hypervisor abstraction
│ │ ├── plasmavmc-kvm/ # KVM backend
│ │ └── plasmavmc-firecracker/ # Firecracker backend
│ └── tests/
│ └── prismnet_integration.rs # E2E tests
├── flashdns/ # DNS Service (planned)
├── fiberlb/ # Load Balancing (planned)
├── lightningstor/ # Block Storage (planned)
├── chainfire/ # Immutable event log (submodule)
├── flaredb/ # Distributed metadata store (submodule)
├── docs/
│ ├── architecture/ # Architecture docs
│ ├── getting-started/ # Onboarding guides
│ └── por/ # Plan of Record (POR) docs
│ └── T023-e2e-tenant-path/ # MVP-Beta deliverables
├── specifications/ # Component specifications
└── README.md # This file
```
## Contributing
We welcome contributions! Please follow these guidelines:
1. **Fork the repository** and create a feature branch
2. **Write tests** for new functionality
3. **Update documentation** as needed
4. **Run tests** before submitting PR: `cargo test`
5. **Follow Rust style**: Use `cargo fmt` and `cargo clippy`
### Code Review Process
1. All PRs require at least one approval
2. CI must pass (tests, formatting, lints)
3. Documentation must be updated for user-facing changes
4. Integration tests required for new features
## License
PlasmaCloud is licensed under the Apache License 2.0. See [LICENSE](LICENSE) for details.
## Support & Community
- **GitHub Issues**: Report bugs or request features
- **Documentation**: See [docs/](docs/) for detailed guides
- **Architecture**: Review [architecture docs](docs/architecture/mvp-beta-tenant-path.md) for design decisions
## Roadmap
### Completed (MVP-Beta) ✅
- [x] IAM with RBAC and tenant scoping
- [x] PrismNET VPC overlay networking
- [x] PlasmaVMC VM provisioning
- [x] End-to-end integration tests
- [x] Comprehensive documentation
### In Progress
- [ ] FlashDNS integration (S3)
- [ ] FiberLB integration (S4)
- [ ] LightningStor integration (S5)
### Planned
- [ ] FlareDB persistence for production
- [ ] ChainFire integration for VM metadata
- [ ] OVN production deployment
- [ ] Kubernetes integration
- [ ] Terraform provider
- [ ] Web UI / Dashboard
## Acknowledgments
PlasmaCloud builds upon:
- **ChainFire**: Immutable event log for audit trails
- **FlareDB**: Distributed metadata store
- **OVN (Open Virtual Network)**: Production-grade overlay networking
- **gRPC**: High-performance RPC framework
- **Rust**: Safe, concurrent systems programming
---
**Status**: MVP-Beta Complete ✅
**Last Updated**: 2025-12-09
**Next Milestone**: FlashDNS, FiberLB, LightningStor integration
For detailed information, see:
- [Tenant Onboarding Guide](docs/getting-started/tenant-onboarding.md)
- [Architecture Documentation](docs/architecture/mvp-beta-tenant-path.md)
- [Test Documentation](docs/por/T023-e2e-tenant-path/e2e_test.md)

13
SECURITY.md Normal file
View file

@ -0,0 +1,13 @@
# Security Policy
Do not report sensitive vulnerabilities through public issues.
Use the repository security advisory workflow or a private maintainer contact channel when this repository is published.
When reporting, include:
- affected component
- impact summary
- reproduction steps
- configuration assumptions
- any suggested mitigation or patch direction

View file

@ -1,54 +0,0 @@
# Architectural Gap Analysis: Compute & Core
**Date:** 2025-12-08
**Scope:** Core Infrastructure (Chainfire, IAM, FlareDB) & Application Services (FlashDNS, PlasmaVMC)
## Executive Summary
The platform's core infrastructure ("Data" and "Identity" pillars) is in excellent shape, with implementation matching specifications closely. However, the "Compute" pillar (PlasmaVMC) exhibits a significant architectural deviation from its specification, currently existing as a monolithic prototype rather than the specified distributed control plane/agent model.
## Component Status Matrix
| Component | Role | Specification Status | Implementation Status | Alignment |
|-----------|------|----------------------|-----------------------|-----------|
| **Chainfire** | Cluster KVS | High | High | ✅ Strong |
| **Aegis (IAM)** | Identity | High | High | ✅ Strong |
| **FlareDB** | DBaaS KVS | High | High | ✅ Strong |
| **FlashDNS** | DNS Service | High | High | ✅ Strong |
| **PlasmaVMC** | VM Platform | High | **Low / Prototype** | ❌ **Mismatch** |
## Detailed Findings
### 1. Core Infrastructure (Chainfire, Aegis, FlareDB)
* **Chainfire:** Fully implemented crate structure. Detailed feature gap analysis exists (`chainfire_t003_gap_analysis.md`).
* **Aegis:** Correctly structured with `iam-server`, `iam-authn`, `iam-authz`, etc. Integration with Chainfire/FlareDB backends is present in `main.rs`.
* **FlareDB:** Correctly structured with `flaredb-pd`, `flaredb-server` (Multi-Raft), and reserved namespaces for IAM/Metrics.
### 2. Application Services (FlashDNS)
* **Status:** Excellent.
* **Evidence:** Crate structure matches spec. Integration with Chainfire (storage) and Aegis (auth) is visible in configuration and code.
### 3. Compute Platform (PlasmaVMC) - The Gap
* **Specification:** Describes a distributed system with:
* **Control Plane:** API, Scheduler, Image management.
* **Agent:** Runs on compute nodes, manages local hypervisors.
* **Communication:** gRPC between Control Plane and Agent.
* **Current Implementation:** Monolithic `plasmavmc-server`.
* The `server` binary directly initializes `HypervisorRegistry` and registers `KvmBackend`/`FireCrackerBackend`.
* **Missing Crates:**
* `plasmavmc-agent` (Critical)
* `plasmavmc-client`
* `plasmavmc-core` (Scheduler logic)
* **Implication:** The current code cannot support multi-node deployment or scheduling. It effectively runs the control plane *on* the hypervisor node.
## Recommendations
1. **Prioritize PlasmaVMC Refactoring:** The immediate engineering focus should be to split `plasmavmc-server` into:
* `plasmavmc-server` (Control Plane, Scheduler, API)
* `plasmavmc-agent` (Node status, Hypervisor control)
2. **Implement Agent Protocol:** Define the gRPC interface between Server and Agent (`agent.proto` mentioned in spec but possibly missing or unused).
3. **Leverage Existing Foundation:** The `plasmavmc-hypervisor` trait is solid. The `agent` implementation should simply wrap this existing trait, making the refactor straightforward.
## Conclusion
The project foundation is solid. The "Data" and "Identity" layers are ready for higher-level integration. The "Compute" layer requires architectural realignment to meet the distributed design goals.

View file

@ -1,5 +0,0 @@
Peer Aへ
/a あなたはpeerAです。戦略決定と計画立案に特化してください。実際の作業は、peerBへ依頼してください。PROJECT.mdは度々更新されることがあるので、PORに内容を追加したり、適切にMVPを設定・到達状況を確認するなどもあなたの仕事です。ともかく、終える前に確実にタスクをpeerBに渡すことを考えてください。
Peer Bへ
/b peerAからの実装依頼に基づいて実装や実験などの作業を行い、終わったあとは必ずpeerAに結果をto_peer.mdで報告してください。高品質に作業を行うことに集中してください。

5094
advice.md

File diff suppressed because one or more lines are too long

819
apigateway/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -23,7 +23,7 @@ tokio = { version = "1.40", features = ["full"] }
# HTTP server # HTTP server
axum = "0.7" axum = "0.7"
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "stream", "json"] }
# Serialization # Serialization
serde = { version = "1.0", features = ["derive"] } serde = { version = "1.0", features = ["derive"] }
@ -31,7 +31,7 @@ serde_json = "1.0"
toml = "0.8" toml = "0.8"
# gRPC # gRPC
tonic = "0.12" tonic = { version = "0.12", features = ["tls"] }
tonic-build = "0.12" tonic-build = "0.12"
prost = "0.13" prost = "0.13"
prost-types = "0.13" prost-types = "0.13"

View file

@ -31,6 +31,7 @@ bytes = "1"
iam-api = { path = "../../../iam/crates/iam-api" } iam-api = { path = "../../../iam/crates/iam-api" }
iam-authn = { path = "../../../iam/crates/iam-authn" } iam-authn = { path = "../../../iam/crates/iam-authn" }
iam-authz = { path = "../../../iam/crates/iam-authz" } iam-authz = { path = "../../../iam/crates/iam-authz" }
iam-service-auth = { path = "../../../iam/crates/iam-service-auth" }
iam-store = { path = "../../../iam/crates/iam-store" } iam-store = { path = "../../../iam/crates/iam-store" }
iam-types = { path = "../../../iam/crates/iam-types" } iam-types = { path = "../../../iam/crates/iam-types" }
creditservice-api = { path = "../../../creditservice/crates/creditservice-api" } creditservice-api = { path = "../../../creditservice/crates/creditservice-api" }

View file

@ -1,10 +1,8 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::io; use std::io;
use std::net::SocketAddr; use std::net::SocketAddr;
use std::pin::Pin;
use std::path::PathBuf; use std::path::PathBuf;
use std::sync::Arc; use std::sync::Arc;
use std::task::{Context, Poll};
use std::time::Duration; use std::time::Duration;
use apigateway_api::proto::{ use apigateway_api::proto::{
@ -13,15 +11,13 @@ use apigateway_api::proto::{
use apigateway_api::{GatewayAuthServiceClient, GatewayCreditServiceClient}; use apigateway_api::{GatewayAuthServiceClient, GatewayCreditServiceClient};
use axum::{ use axum::{
body::{to_bytes, Body}, body::{to_bytes, Body},
extract::State, extract::{ConnectInfo, State},
http::{HeaderMap, Request, StatusCode, Uri}, http::{HeaderMap, Request, StatusCode, Uri},
response::Response, response::Response,
routing::{any, get}, routing::{any, get},
Json, Router, Json, Router,
}; };
use clap::Parser; use clap::Parser;
use bytes::Bytes;
use futures_core::Stream;
use reqwest::{Client, Url}; use reqwest::{Client, Url};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity};
@ -31,8 +27,41 @@ use tracing_subscriber::EnvFilter;
use uuid::Uuid; use uuid::Uuid;
const DEFAULT_REQUEST_ID_HEADER: &str = "x-request-id"; const DEFAULT_REQUEST_ID_HEADER: &str = "x-request-id";
const PHOTON_AUTH_TOKEN_HEADER: &str = "x-photon-auth-token";
const DEFAULT_AUTH_TIMEOUT_MS: u64 = 500; const DEFAULT_AUTH_TIMEOUT_MS: u64 = 500;
const DEFAULT_CREDIT_TIMEOUT_MS: u64 = 500; const DEFAULT_CREDIT_TIMEOUT_MS: u64 = 500;
const DEFAULT_UPSTREAM_TIMEOUT_MS: u64 = 10_000;
const RESERVED_AUTH_HEADERS: [&str; 10] = [
"authorization",
"x-photon-auth-token",
"x-subject-id",
"x-org-id",
"x-project-id",
"x-roles",
"x-scopes",
"x-iam-session-id",
"x-iam-principal-kind",
"x-iam-auth-method",
];
const AUTH_PROVIDER_BLOCK_HEADERS: [&str; 17] = [
"authorization",
"x-photon-auth-token",
"x-subject-id",
"x-org-id",
"x-project-id",
"x-roles",
"x-scopes",
"proxy-authorization",
"cookie",
"set-cookie",
"host",
"connection",
"upgrade",
"keep-alive",
"te",
"trailer",
"transfer-encoding",
];
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
@ -62,6 +91,22 @@ fn default_credit_units() -> u64 {
1 1
} }
fn default_upstream_timeout_ms() -> u64 {
DEFAULT_UPSTREAM_TIMEOUT_MS
}
#[derive(Debug, Clone, Serialize, Deserialize)]
struct TlsConfig {
#[serde(default)]
ca_file: Option<String>,
#[serde(default)]
cert_file: Option<String>,
#[serde(default)]
key_file: Option<String>,
#[serde(default)]
domain_name: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
struct AuthProviderConfig { struct AuthProviderConfig {
name: String, name: String,
@ -70,6 +115,8 @@ struct AuthProviderConfig {
endpoint: String, endpoint: String,
#[serde(default)] #[serde(default)]
timeout_ms: Option<u64>, timeout_ms: Option<u64>,
#[serde(default)]
tls: Option<TlsConfig>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -80,6 +127,8 @@ struct CreditProviderConfig {
endpoint: String, endpoint: String,
#[serde(default)] #[serde(default)]
timeout_ms: Option<u64>, timeout_ms: Option<u64>,
#[serde(default)]
tls: Option<TlsConfig>,
} }
#[derive(Debug, Clone, Serialize, Deserialize)] #[derive(Debug, Clone, Serialize, Deserialize)]
@ -103,6 +152,8 @@ struct RouteCreditConfig {
#[serde(default = "default_commit_policy")] #[serde(default = "default_commit_policy")]
commit_on: CommitPolicy, commit_on: CommitPolicy,
#[serde(default)] #[serde(default)]
allow_header_subject: bool,
#[serde(default)]
attributes: HashMap<String, String>, attributes: HashMap<String, String>,
} }
@ -114,6 +165,8 @@ struct RouteConfig {
#[serde(default)] #[serde(default)]
strip_prefix: bool, strip_prefix: bool,
#[serde(default)] #[serde(default)]
timeout_ms: Option<u64>,
#[serde(default)]
auth: Option<RouteAuthConfig>, auth: Option<RouteAuthConfig>,
#[serde(default)] #[serde(default)]
credit: Option<RouteCreditConfig>, credit: Option<RouteCreditConfig>,
@ -134,6 +187,12 @@ struct ServerConfig {
log_level: String, log_level: String,
#[serde(default = "default_max_body_bytes")] #[serde(default = "default_max_body_bytes")]
max_body_bytes: usize, max_body_bytes: usize,
#[serde(default = "default_max_response_bytes")]
max_response_bytes: usize,
#[serde(default = "default_upstream_timeout_ms")]
upstream_timeout_ms: u64,
#[serde(default)]
trust_forwarded_headers: bool,
#[serde(default)] #[serde(default)]
auth_providers: Vec<AuthProviderConfig>, auth_providers: Vec<AuthProviderConfig>,
#[serde(default)] #[serde(default)]
@ -148,6 +207,9 @@ impl Default for ServerConfig {
http_addr: default_http_addr(), http_addr: default_http_addr(),
log_level: default_log_level(), log_level: default_log_level(),
max_body_bytes: default_max_body_bytes(), max_body_bytes: default_max_body_bytes(),
max_response_bytes: default_max_response_bytes(),
upstream_timeout_ms: default_upstream_timeout_ms(),
trust_forwarded_headers: false,
auth_providers: Vec::new(), auth_providers: Vec::new(),
credit_providers: Vec::new(), credit_providers: Vec::new(),
routes: Vec::new(), routes: Vec::new(),
@ -175,9 +237,12 @@ struct Args {
struct ServerState { struct ServerState {
routes: Vec<Route>, routes: Vec<Route>,
client: Client, client: Client,
upstream_timeout: Duration,
max_body_bytes: usize, max_body_bytes: usize,
max_response_bytes: usize,
auth_providers: HashMap<String, AuthProvider>, auth_providers: HashMap<String, AuthProvider>,
credit_providers: HashMap<String, CreditProvider>, credit_providers: HashMap<String, CreditProvider>,
trust_forwarded_headers: bool,
} }
#[derive(Clone)] #[derive(Clone)]
@ -211,6 +276,13 @@ struct SubjectInfo {
scopes: Vec<String>, scopes: Vec<String>,
} }
#[derive(Clone, Debug)]
struct CreditSubject {
subject_id: String,
org_id: String,
project_id: String,
}
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct AuthDecision { struct AuthDecision {
allow: bool, allow: bool,
@ -238,84 +310,6 @@ struct CreditReservation {
reservation_id: String, reservation_id: String,
} }
struct CreditFinalizeState {
state: Arc<ServerState>,
route: Route,
reservation: Option<CreditReservation>,
status: reqwest::StatusCode,
}
impl CreditFinalizeState {
fn spawn_success(self) {
tokio::spawn(async move {
finalize_credit(&self.state, &self.route, self.reservation, self.status).await;
});
}
fn spawn_abort(self) {
tokio::spawn(async move {
finalize_credit_abort(&self.state, &self.route, self.reservation).await;
});
}
}
struct CreditFinalizeStream {
bytes: Option<Bytes>,
finalize: Option<CreditFinalizeState>,
completed: bool,
}
impl CreditFinalizeStream {
fn new(bytes: Bytes, finalize: CreditFinalizeState) -> Self {
Self {
bytes: Some(bytes),
finalize: Some(finalize),
completed: false,
}
}
fn finalize_success(&mut self) {
if self.completed {
return;
}
self.completed = true;
if let Some(finalize) = self.finalize.take() {
finalize.spawn_success();
}
}
fn finalize_abort(&mut self) {
if self.completed {
return;
}
self.completed = true;
if let Some(finalize) = self.finalize.take() {
finalize.spawn_abort();
}
}
}
impl Stream for CreditFinalizeStream {
type Item = Result<Bytes, io::Error>;
fn poll_next(mut self: Pin<&mut Self>, _cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
if let Some(bytes) = self.bytes.take() {
return Poll::Ready(Some(Ok(bytes)));
}
self.finalize_success();
Poll::Ready(None)
}
}
impl Drop for CreditFinalizeStream {
fn drop(&mut self) {
if !self.completed {
self.finalize_abort();
}
}
}
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
struct RequestContext { struct RequestContext {
request_id: String, request_id: String,
@ -341,6 +335,10 @@ fn default_max_body_bytes() -> usize {
16 * 1024 * 1024 16 * 1024 * 1024
} }
fn default_max_response_bytes() -> usize {
default_max_body_bytes()
}
#[tokio::main] #[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> { async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse(); let args = Args::parse();
@ -374,10 +372,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let routes = build_routes(config.routes)?; let routes = build_routes(config.routes)?;
let auth_providers = build_auth_providers(config.auth_providers).await?; let auth_providers = build_auth_providers(config.auth_providers).await?;
let credit_providers = build_credit_providers(config.credit_providers).await?; let credit_providers = build_credit_providers(config.credit_providers).await?;
let upstream_timeout = Duration::from_millis(config.upstream_timeout_ms);
let client = Client::builder().build()?;
info!("Starting API gateway"); info!("Starting API gateway");
info!(" HTTP: {}", config.http_addr); info!(" HTTP: {}", config.http_addr);
info!(" Max body bytes: {}", config.max_body_bytes); info!(" Max body bytes: {}", config.max_body_bytes);
info!(" Max response bytes: {}", config.max_response_bytes);
if !routes.is_empty() { if !routes.is_empty() {
info!("Configured {} routes", routes.len()); info!("Configured {} routes", routes.len());
@ -394,10 +395,13 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let state = Arc::new(ServerState { let state = Arc::new(ServerState {
routes, routes,
client: Client::new(), client,
upstream_timeout,
max_body_bytes: config.max_body_bytes, max_body_bytes: config.max_body_bytes,
max_response_bytes: config.max_response_bytes,
auth_providers, auth_providers,
credit_providers, credit_providers,
trust_forwarded_headers: config.trust_forwarded_headers,
}); });
let app = Router::new() let app = Router::new()
@ -408,7 +412,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.with_state(state); .with_state(state);
let listener = tokio::net::TcpListener::bind(config.http_addr).await?; let listener = tokio::net::TcpListener::bind(config.http_addr).await?;
axum::serve(listener, app).await?; axum::serve(listener, app.into_make_service_with_connect_info::<SocketAddr>()).await?;
Ok(()) Ok(())
} }
@ -427,6 +431,7 @@ async fn list_routes(State(state): State<Arc<ServerState>>) -> Json<Vec<RouteCon
async fn proxy( async fn proxy(
State(state): State<Arc<ServerState>>, State(state): State<Arc<ServerState>>,
ConnectInfo(remote_addr): ConnectInfo<SocketAddr>,
request: Request<Body>, request: Request<Body>,
) -> Result<Response<Body>, StatusCode> { ) -> Result<Response<Body>, StatusCode> {
let path = request.uri().path(); let path = request.uri().path();
@ -441,15 +446,16 @@ async fn proxy(
path: request.uri().path().to_string(), path: request.uri().path().to_string(),
raw_query: request.uri().query().unwrap_or("").to_string(), raw_query: request.uri().query().unwrap_or("").to_string(),
headers: headers_to_map(request.headers()), headers: headers_to_map(request.headers()),
client_ip: extract_client_ip(request.headers()), client_ip: extract_client_ip(
request.headers(),
remote_addr,
state.trust_forwarded_headers,
),
route_name: route.config.name.clone(), route_name: route.config.name.clone(),
}; };
let auth_token = request let auth_token = extract_auth_token(request.headers());
.headers() let forward_client_auth_headers = route.config.auth.is_none();
.get(axum::http::header::AUTHORIZATION)
.and_then(|value| value.to_str().ok())
.map(|value| value.to_string());
let auth_outcome = enforce_auth(&state, &route, &context, auth_token).await?; let auth_outcome = enforce_auth(&state, &route, &context, auth_token).await?;
let credit_reservation = let credit_reservation =
@ -457,11 +463,22 @@ async fn proxy(
let target_url = build_upstream_url(&route, request.uri())?; let target_url = build_upstream_url(&route, request.uri())?;
let mut builder = state.client.request(request.method().clone(), target_url); let request_timeout =
Duration::from_millis(route.config.timeout_ms.unwrap_or(state.upstream_timeout.as_millis() as u64));
let mut builder = state
.client
.request(request.method().clone(), target_url)
.timeout(request_timeout);
for (name, value) in request.headers().iter() { for (name, value) in request.headers().iter() {
if name == axum::http::header::HOST || name == axum::http::header::CONNECTION { if name == axum::http::header::HOST || name == axum::http::header::CONNECTION {
continue; continue;
} }
if is_reserved_auth_header(name) {
if forward_client_auth_headers && should_preserve_client_auth_header(name.as_str()) {
builder = builder.header(name, value);
}
continue;
}
builder = builder.header(name, value); builder = builder.header(name, value);
} }
@ -481,6 +498,12 @@ async fn proxy(
}; };
let status = response.status(); let status = response.status();
if let Some(content_length) = response.content_length() {
if state.max_response_bytes > 0 && content_length as usize > state.max_response_bytes {
finalize_credit_abort(&state, &route, credit_reservation).await;
return Err(StatusCode::PAYLOAD_TOO_LARGE);
}
}
let mut response_builder = Response::builder().status(status); let mut response_builder = Response::builder().status(status);
let headers = response_builder let headers = response_builder
@ -494,23 +517,22 @@ async fn proxy(
headers.insert(name, value.clone()); headers.insert(name, value.clone());
} }
let bytes = match response.bytes().await { let body = match response.bytes().await {
Ok(bytes) => bytes, Ok(body) => body,
Err(_) => { Err(_) => {
finalize_credit_abort(&state, &route, credit_reservation).await; finalize_credit_abort(&state, &route, credit_reservation).await;
return Err(StatusCode::BAD_GATEWAY); return Err(StatusCode::BAD_GATEWAY);
} }
}; };
if state.max_response_bytes > 0 && body.len() > state.max_response_bytes {
finalize_credit_abort(&state, &route, credit_reservation).await;
return Err(StatusCode::PAYLOAD_TOO_LARGE);
}
let finalize = CreditFinalizeState { finalize_credit(&state, &route, credit_reservation, status).await;
state: Arc::clone(&state),
route,
reservation: credit_reservation,
status,
};
response_builder response_builder
.body(Body::from_stream(CreditFinalizeStream::new(bytes, finalize))) .body(Body::from(body))
.map_err(|_| StatusCode::BAD_GATEWAY) .map_err(|_| StatusCode::BAD_GATEWAY)
} }
@ -592,7 +614,22 @@ async fn enforce_credit(
return Ok(None); return Ok(None);
} }
let decision = reserve_credit(state, credit_cfg, context, subject).await; let credit_subject = resolve_credit_subject(context, subject, credit_cfg.allow_header_subject);
if credit_subject.is_none() {
if credit_cfg.mode == PolicyMode::Required {
return Err(StatusCode::UNAUTHORIZED);
}
warn!("Credit skipped: missing org/project scope");
return Ok(None);
}
let decision = reserve_credit(
state,
credit_cfg,
context,
credit_subject.as_ref().expect("credit subject resolved"),
)
.await;
apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision) apply_credit_mode(credit_cfg.mode, credit_cfg.fail_open, decision)
.map(|decision| { .map(|decision| {
decision.map(|decision| CreditReservation { decision.map(|decision| CreditReservation {
@ -696,26 +733,56 @@ async fn authorize_request(
} }
} }
fn resolve_credit_subject(
context: &RequestContext,
subject: Option<&SubjectInfo>,
allow_header_subject: bool,
) -> Option<CreditSubject> {
if let Some(subject) = subject {
return Some(CreditSubject {
subject_id: subject.subject_id.clone(),
org_id: subject.org_id.clone(),
project_id: subject.project_id.clone(),
});
}
if !allow_header_subject {
return None;
}
let org_id = context.headers.get("x-org-id")?.trim();
let project_id = context.headers.get("x-project-id")?.trim();
if org_id.is_empty() || project_id.is_empty() {
return None;
}
let subject_id = context
.headers
.get("x-subject-id")
.map(|value| value.trim().to_string())
.unwrap_or_default();
Some(CreditSubject {
subject_id,
org_id: org_id.to_string(),
project_id: project_id.to_string(),
})
}
async fn reserve_credit( async fn reserve_credit(
state: &ServerState, state: &ServerState,
credit_cfg: &RouteCreditConfig, credit_cfg: &RouteCreditConfig,
context: &RequestContext, context: &RequestContext,
subject: Option<&SubjectInfo>, credit_subject: &CreditSubject,
) -> Result<CreditDecision, StatusCode> { ) -> Result<CreditDecision, StatusCode> {
let provider = state let provider = state
.credit_providers .credit_providers
.get(&credit_cfg.provider) .get(&credit_cfg.provider)
.ok_or(StatusCode::INTERNAL_SERVER_ERROR)?; .ok_or(StatusCode::INTERNAL_SERVER_ERROR)?;
let (subject_id, org_id, project_id) = subject let subject_id = credit_subject.subject_id.clone();
.map(|subject| { let org_id = credit_subject.org_id.clone();
( let project_id = credit_subject.project_id.clone();
subject.subject_id.clone(),
subject.org_id.clone(),
subject.project_id.clone(),
)
})
.unwrap_or_default();
match provider { match provider {
CreditProvider::Grpc(provider) => { CreditProvider::Grpc(provider) => {
@ -875,8 +942,14 @@ async fn rollback_credit(
} }
} }
fn apply_auth_headers(mut builder: reqwest::RequestBuilder, outcome: &AuthOutcome) -> reqwest::RequestBuilder { fn apply_auth_headers(
mut builder: reqwest::RequestBuilder,
outcome: &AuthOutcome,
) -> reqwest::RequestBuilder {
for (key, value) in &outcome.headers { for (key, value) in &outcome.headers {
if !should_forward_auth_header(key) {
continue;
}
builder = builder.header(key, value); builder = builder.header(key, value);
} }
@ -896,6 +969,39 @@ fn apply_auth_headers(mut builder: reqwest::RequestBuilder, outcome: &AuthOutcom
builder builder
} }
async fn build_client_tls_config(
tls: &Option<TlsConfig>,
) -> Result<Option<ClientTlsConfig>, Box<dyn std::error::Error>> {
let Some(tls) = tls else {
return Ok(None);
};
let mut tls_config = ClientTlsConfig::new();
if let Some(ca_file) = &tls.ca_file {
let ca = tokio::fs::read(ca_file).await?;
tls_config = tls_config.ca_certificate(Certificate::from_pem(ca));
}
match (&tls.cert_file, &tls.key_file) {
(Some(cert_file), Some(key_file)) => {
let cert = tokio::fs::read(cert_file).await?;
let key = tokio::fs::read(key_file).await?;
tls_config = tls_config.identity(Identity::from_pem(cert, key));
}
(None, None) => {}
_ => {
return Err(config_error("tls requires both cert_file and key_file").into());
}
}
if let Some(domain) = &tls.domain_name {
tls_config = tls_config.domain_name(domain);
}
Ok(Some(tls_config))
}
async fn build_auth_providers( async fn build_auth_providers(
configs: Vec<AuthProviderConfig>, configs: Vec<AuthProviderConfig>,
) -> Result<HashMap<String, AuthProvider>, Box<dyn std::error::Error>> { ) -> Result<HashMap<String, AuthProvider>, Box<dyn std::error::Error>> {
@ -913,11 +1019,19 @@ async fn build_auth_providers(
match provider_type.as_str() { match provider_type.as_str() {
"grpc" => { "grpc" => {
let endpoint = Endpoint::from_shared(config.endpoint.clone())? let mut endpoint = Endpoint::from_shared(config.endpoint.clone())?
.connect_timeout(Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS))) .connect_timeout(Duration::from_millis(
.timeout(Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS))); config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS),
))
.timeout(Duration::from_millis(
config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS),
));
if let Some(tls) = build_client_tls_config(&config.tls).await? {
endpoint = endpoint.tls_config(tls)?;
}
let channel = endpoint.connect().await?; let channel = endpoint.connect().await?;
let timeout = Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS)); let timeout =
Duration::from_millis(config.timeout_ms.unwrap_or(DEFAULT_AUTH_TIMEOUT_MS));
providers.insert( providers.insert(
config.name.clone(), config.name.clone(),
AuthProvider::Grpc(GrpcAuthProvider { AuthProvider::Grpc(GrpcAuthProvider {
@ -956,7 +1070,7 @@ async fn build_credit_providers(
match provider_type.as_str() { match provider_type.as_str() {
"grpc" => { "grpc" => {
let endpoint = Endpoint::from_shared(config.endpoint.clone())? let mut endpoint = Endpoint::from_shared(config.endpoint.clone())?
.connect_timeout(Duration::from_millis( .connect_timeout(Duration::from_millis(
config config
.timeout_ms .timeout_ms
@ -968,6 +1082,10 @@ async fn build_credit_providers(
.unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS), .unwrap_or(DEFAULT_CREDIT_TIMEOUT_MS),
)); ));
if let Some(tls) = build_client_tls_config(&config.tls).await? {
endpoint = endpoint.tls_config(tls)?;
}
let channel = endpoint.connect().await?; let channel = endpoint.connect().await?;
let timeout = Duration::from_millis( let timeout = Duration::from_millis(
config config
@ -1049,13 +1167,34 @@ fn extract_request_id(headers: &HeaderMap) -> String {
.unwrap_or_else(|| Uuid::new_v4().to_string()) .unwrap_or_else(|| Uuid::new_v4().to_string())
} }
fn extract_client_ip(headers: &HeaderMap) -> String { fn extract_client_ip(
headers headers: &HeaderMap,
remote_addr: SocketAddr,
trust_forwarded_headers: bool,
) -> String {
if trust_forwarded_headers {
if let Some(value) = headers
.get("x-forwarded-for") .get("x-forwarded-for")
.and_then(|value| value.to_str().ok()) .and_then(|value| value.to_str().ok())
.and_then(|value| value.split(',').next()) .and_then(|value| value.split(',').next())
.map(|value| value.trim().to_string()) {
.unwrap_or_default() let trimmed = value.trim();
if !trimmed.is_empty() {
return trimmed.to_string();
}
}
if let Some(value) = headers
.get("x-real-ip")
.and_then(|value| value.to_str().ok())
{
let trimmed = value.trim();
if !trimmed.is_empty() {
return trimmed.to_string();
}
}
}
remote_addr.ip().to_string()
} }
fn headers_to_map(headers: &HeaderMap) -> HashMap<String, String> { fn headers_to_map(headers: &HeaderMap) -> HashMap<String, String> {
@ -1073,6 +1212,78 @@ fn headers_to_map(headers: &HeaderMap) -> HashMap<String, String> {
map map
} }
fn extract_auth_token(headers: &HeaderMap) -> Option<String> {
let auth_header = headers
.get(axum::http::header::AUTHORIZATION)
.and_then(|value| value.to_str().ok());
if let Some(token) = auth_header.and_then(parse_auth_token_value) {
return Some(token);
}
let photon_header = headers
.get(PHOTON_AUTH_TOKEN_HEADER)
.and_then(|value| value.to_str().ok());
photon_header.and_then(parse_auth_token_value)
}
fn is_reserved_auth_header(name: &axum::http::header::HeaderName) -> bool {
is_reserved_auth_header_str(name.as_str())
}
fn is_reserved_auth_header_str(name: &str) -> bool {
let header = name.to_ascii_lowercase();
RESERVED_AUTH_HEADERS.iter().any(|value| *value == header)
}
fn should_forward_auth_header(name: &str) -> bool {
let header = name.to_ascii_lowercase();
if AUTH_PROVIDER_BLOCK_HEADERS
.iter()
.any(|value| *value == header)
{
return false;
}
header.starts_with("x-")
}
fn should_preserve_client_auth_header(name: &str) -> bool {
let header = name.to_ascii_lowercase();
header == "authorization" || header == PHOTON_AUTH_TOKEN_HEADER
}
fn parse_auth_token_value(value: &str) -> Option<String> {
let trimmed = value.trim();
if trimmed.is_empty() {
return None;
}
if let Some(token) = parse_bearer_token(trimmed) {
return Some(token);
}
// Legacy support: allow raw token values without a scheme.
if trimmed.split_whitespace().count() != 1 {
return None;
}
Some(trimmed.to_string())
}
fn parse_bearer_token(value: &str) -> Option<String> {
let mut parts = value.split_whitespace();
let scheme = parts.next()?;
if !scheme.eq_ignore_ascii_case("bearer") {
return None;
}
let token = parts.next()?;
if parts.next().is_some() {
return None;
}
Some(token.to_string())
}
fn normalize_path_prefix(prefix: &str) -> String { fn normalize_path_prefix(prefix: &str) -> String {
let trimmed = prefix.trim(); let trimmed = prefix.trim();
if trimmed.is_empty() { if trimmed.is_empty() {
@ -1104,7 +1315,22 @@ fn normalize_upstream_base_path(path: &str) -> String {
fn match_route<'a>(routes: &'a [Route], path: &str) -> Option<&'a Route> { fn match_route<'a>(routes: &'a [Route], path: &str) -> Option<&'a Route> {
routes routes
.iter() .iter()
.find(|route| path.starts_with(&route.config.path_prefix)) .find(|route| path_matches_prefix(path, &route.config.path_prefix))
}
fn path_matches_prefix(path: &str, prefix: &str) -> bool {
if prefix == "/" {
return true;
}
if path == prefix {
return true;
}
match path.strip_prefix(prefix) {
Some(stripped) => stripped.starts_with('/'),
None => false,
}
} }
fn strip_prefix_path(path: &str, prefix: &str) -> String { fn strip_prefix_path(path: &str, prefix: &str) -> String {
@ -1160,9 +1386,8 @@ fn build_upstream_url(route: &Route, uri: &Uri) -> Result<Url, StatusCode> {
mod tests { mod tests {
use super::*; use super::*;
use axum::routing::get; use axum::routing::get;
use creditservice_api::{ use creditservice_api::{CreditServiceImpl, CreditStorage, GatewayCreditServiceImpl};
CreditServiceImpl, CreditStorage, GatewayCreditServiceImpl, GatewayCreditServiceServer, use apigateway_api::GatewayCreditServiceServer;
};
use creditservice_types::Wallet; use creditservice_types::Wallet;
use iam_api::{GatewayAuthServiceImpl, GatewayAuthServiceServer}; use iam_api::{GatewayAuthServiceImpl, GatewayAuthServiceServer};
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey}; use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
@ -1173,12 +1398,28 @@ mod tests {
use tonic::transport::Server; use tonic::transport::Server;
use uuid::Uuid; use uuid::Uuid;
async fn wait_for_test_tcp(addr: SocketAddr) {
let deadline = tokio::time::Instant::now() + Duration::from_secs(2);
loop {
if tokio::net::TcpStream::connect(addr).await.is_ok() {
return;
}
assert!(
tokio::time::Instant::now() < deadline,
"timed out waiting for test listener {}",
addr
);
tokio::time::sleep(Duration::from_millis(25)).await;
}
}
fn route_config(name: &str, prefix: &str, upstream: &str, strip_prefix: bool) -> RouteConfig { fn route_config(name: &str, prefix: &str, upstream: &str, strip_prefix: bool) -> RouteConfig {
RouteConfig { RouteConfig {
name: name.to_string(), name: name.to_string(),
path_prefix: prefix.to_string(), path_prefix: prefix.to_string(),
upstream: upstream.to_string(), upstream: upstream.to_string(),
strip_prefix, strip_prefix,
timeout_ms: None,
auth: None, auth: None,
credit: None, credit: None,
} }
@ -1202,7 +1443,21 @@ mod tests {
} }
async fn start_upstream() -> SocketAddr { async fn start_upstream() -> SocketAddr {
let app = Router::new().route("/v1/echo", get(|| async { "ok" })); let app = Router::new()
.route("/v1/echo", get(|| async { "ok" }))
.route(
"/v1/echo-auth",
get(|headers: HeaderMap| async move {
Json(serde_json::json!({
"authorization": headers
.get(axum::http::header::AUTHORIZATION)
.and_then(|value| value.to_str().ok()),
"photon_token": headers
.get(PHOTON_AUTH_TOKEN_HEADER)
.and_then(|value| value.to_str().ok()),
}))
}),
);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0") let listener = tokio::net::TcpListener::bind("127.0.0.1:0")
.await .await
.expect("bind upstream"); .expect("bind upstream");
@ -1210,6 +1465,7 @@ mod tests {
tokio::spawn(async move { tokio::spawn(async move {
axum::serve(listener, app).await.expect("upstream serve"); axum::serve(listener, app).await.expect("upstream serve");
}); });
wait_for_test_tcp(addr).await;
addr addr
} }
@ -1279,10 +1535,11 @@ mod tests {
.expect("iam gateway serve"); .expect("iam gateway serve");
}); });
wait_for_test_tcp(addr).await;
(addr, issued.token) (addr, issued.token)
} }
async fn start_credit_gateway() -> SocketAddr { async fn start_credit_gateway(iam_addr: &SocketAddr) -> SocketAddr {
let storage = creditservice_api::InMemoryStorage::new(); let storage = creditservice_api::InMemoryStorage::new();
let wallet = Wallet::new("proj-1".into(), "org-1".into(), 100); let wallet = Wallet::new("proj-1".into(), "org-1".into(), 100);
storage storage
@ -1290,7 +1547,13 @@ mod tests {
.await .await
.expect("wallet create"); .expect("wallet create");
let credit_service = Arc::new(CreditServiceImpl::new(storage)); let auth_service = Arc::new(
iam_service_auth::AuthService::new(&format!("http://{}", iam_addr))
.await
.expect("auth service"),
);
let credit_service = Arc::new(CreditServiceImpl::new(storage, auth_service));
let gateway_credit = GatewayCreditServiceImpl::new(credit_service); let gateway_credit = GatewayCreditServiceImpl::new(credit_service);
let listener = tokio::net::TcpListener::bind("127.0.0.1:0") let listener = tokio::net::TcpListener::bind("127.0.0.1:0")
@ -1305,6 +1568,7 @@ mod tests {
.expect("credit gateway serve"); .expect("credit gateway serve");
}); });
wait_for_test_tcp(addr).await;
addr addr
} }
@ -1343,6 +1607,23 @@ mod tests {
assert_eq!(matched.config.name, "api-v1"); assert_eq!(matched.config.name, "api-v1");
} }
#[test]
fn test_match_route_segment_boundary() {
let routes = build_routes(vec![
route_config("api", "/api", "http://example.com", false),
route_config("api2", "/api2", "http://example.com", false),
])
.unwrap();
let matched = match_route(&routes, "/api2").unwrap();
assert_eq!(matched.config.name, "api2");
let matched = match_route(&routes, "/api2/health").unwrap();
assert_eq!(matched.config.name, "api2");
assert!(match_route(&routes, "/apiary").is_none());
}
#[test] #[test]
fn test_build_upstream_url_preserves_query() { fn test_build_upstream_url_preserves_query() {
let routes = build_routes(vec![route_config( let routes = build_routes(vec![route_config(
@ -1419,13 +1700,14 @@ mod tests {
async fn test_gateway_auth_and_credit_flow() { async fn test_gateway_auth_and_credit_flow() {
let upstream_addr = start_upstream().await; let upstream_addr = start_upstream().await;
let (iam_addr, token) = start_iam_gateway().await; let (iam_addr, token) = start_iam_gateway().await;
let credit_addr = start_credit_gateway().await; let credit_addr = start_credit_gateway(&iam_addr).await;
let routes = build_routes(vec![RouteConfig { let routes = build_routes(vec![RouteConfig {
name: "public".to_string(), name: "public".to_string(),
path_prefix: "/v1".to_string(), path_prefix: "/v1".to_string(),
upstream: format!("http://{}", upstream_addr), upstream: format!("http://{}", upstream_addr),
strip_prefix: false, strip_prefix: false,
timeout_ms: None,
auth: Some(RouteAuthConfig { auth: Some(RouteAuthConfig {
provider: "iam".to_string(), provider: "iam".to_string(),
mode: PolicyMode::Required, mode: PolicyMode::Required,
@ -1437,6 +1719,7 @@ mod tests {
units: 1, units: 1,
fail_open: false, fail_open: false,
commit_on: CommitPolicy::Success, commit_on: CommitPolicy::Success,
allow_header_subject: false,
attributes: HashMap::new(), attributes: HashMap::new(),
}), }),
}]) }])
@ -1447,6 +1730,7 @@ mod tests {
provider_type: "grpc".to_string(), provider_type: "grpc".to_string(),
endpoint: format!("http://{}", iam_addr), endpoint: format!("http://{}", iam_addr),
timeout_ms: Some(1000), timeout_ms: Some(1000),
tls: None,
}]) }])
.await .await
.unwrap(); .unwrap();
@ -1464,19 +1748,124 @@ mod tests {
let state = Arc::new(ServerState { let state = Arc::new(ServerState {
routes, routes,
client: Client::new(), client: Client::new(),
upstream_timeout: Duration::from_secs(5),
max_body_bytes: 1024 * 1024, max_body_bytes: 1024 * 1024,
max_response_bytes: 1024 * 1024,
auth_providers, auth_providers,
credit_providers, credit_providers,
trust_forwarded_headers: false,
});
let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
let mut response = None;
while tokio::time::Instant::now() < deadline {
let request = Request::builder()
.method("GET")
.uri("/v1/echo")
.header(axum::http::header::AUTHORIZATION, &token)
.body(Body::empty())
.expect("request build");
match proxy(
State(Arc::clone(&state)),
ConnectInfo("127.0.0.1:40000".parse().unwrap()),
request,
)
.await
{
Ok(ok) => {
response = Some(ok);
break;
}
Err(StatusCode::BAD_GATEWAY) => {
tokio::time::sleep(Duration::from_millis(25)).await;
}
Err(status) => panic!("unexpected proxy status: {}", status),
}
}
let response = response.expect("gateway auth+credit test timed out waiting for ready backends");
assert_eq!(response.status(), StatusCode::OK);
}
#[tokio::test]
async fn test_proxy_forwards_client_auth_headers_when_route_has_no_auth() {
let upstream_addr = start_upstream().await;
let routes = build_routes(vec![route_config(
"passthrough",
"/v1",
&format!("http://{}", upstream_addr),
false,
)])
.unwrap();
let state = Arc::new(ServerState {
routes,
client: Client::new(),
upstream_timeout: Duration::from_secs(5),
max_body_bytes: 1024 * 1024,
max_response_bytes: 1024 * 1024,
auth_providers: HashMap::new(),
credit_providers: HashMap::new(),
trust_forwarded_headers: false,
}); });
let request = Request::builder() let request = Request::builder()
.method("GET") .method("GET")
.uri("/v1/echo") .uri("/v1/echo-auth")
.header(axum::http::header::AUTHORIZATION, token) .header(axum::http::header::AUTHORIZATION, "Bearer passthrough-token")
.header(PHOTON_AUTH_TOKEN_HEADER, "photon-token")
.body(Body::empty()) .body(Body::empty())
.expect("request build"); .expect("request build");
let response = proxy(State(state), request).await.unwrap(); let response = proxy(
State(state),
ConnectInfo("127.0.0.1:40000".parse().unwrap()),
request,
)
.await
.unwrap();
assert_eq!(response.status(), StatusCode::OK); assert_eq!(response.status(), StatusCode::OK);
let body = to_bytes(response.into_body(), 1024 * 1024).await.unwrap();
let json: serde_json::Value = serde_json::from_slice(&body).unwrap();
assert_eq!(json.get("authorization").and_then(|v| v.as_str()), Some("Bearer passthrough-token"));
assert_eq!(json.get("photon_token").and_then(|v| v.as_str()), Some("photon-token"));
}
#[test]
fn test_extract_auth_token_accepts_bearer_authorization() {
let mut headers = HeaderMap::new();
headers.insert(
axum::http::header::AUTHORIZATION,
"Bearer abc123".parse().unwrap(),
);
assert_eq!(extract_auth_token(&headers).as_deref(), Some("abc123"));
}
#[test]
fn test_extract_auth_token_accepts_legacy_raw_authorization() {
let mut headers = HeaderMap::new();
headers.insert(
axum::http::header::AUTHORIZATION,
"raw-token".parse().unwrap(),
);
assert_eq!(extract_auth_token(&headers).as_deref(), Some("raw-token"));
}
#[test]
fn test_extract_auth_token_falls_back_to_photon_header() {
let mut headers = HeaderMap::new();
headers.insert(
axum::http::header::AUTHORIZATION,
"Basic abc".parse().unwrap(),
);
headers.insert(PHOTON_AUTH_TOKEN_HEADER, "photon-token".parse().unwrap());
assert_eq!(
extract_auth_token(&headers).as_deref(),
Some("photon-token")
);
} }
} }

View file

@ -1,763 +0,0 @@
# First-Boot Automation Architecture
## Overview
The first-boot automation system provides automated cluster joining and service initialization for bare-metal provisioned nodes. It handles two critical scenarios:
1. **Bootstrap Mode**: First 3 nodes initialize a new Raft cluster
2. **Join Mode**: Additional nodes join an existing cluster
This document describes the architecture, design decisions, and implementation details.
## System Architecture
### Component Hierarchy
```
┌─────────────────────────────────────────────────────────────┐
│ NixOS Boot Process │
└────────────────────┬────────────────────────────────────────┘
┌─────────────────────────────────────────────────────────────┐
│ systemd.target: multi-user.target │
└────────────────────┬────────────────────────────────────────┘
┌───────────────┼───────────────┐
│ │ │
▼ ▼ ▼
┌──────────┐ ┌──────────┐ ┌──────────┐
│chainfire │ │ flaredb │ │ iam │
│.service │ │.service │ │.service │
└────┬─────┘ └────┬─────┘ └────┬─────┘
│ │ │
▼ ▼ ▼
┌──────────────────────────────────────────┐
│ chainfire-cluster-join.service │
│ - Waits for local chainfire health │
│ - Checks bootstrap flag │
│ - Joins cluster if bootstrap=false │
└────────────────┬─────────────────────────┘
┌──────────────────────────────────────────┐
│ flaredb-cluster-join.service │
│ - Requires chainfire-cluster-join │
│ - Waits for local flaredb health │
│ - Joins FlareDB cluster │
└────────────────┬─────────────────────────┘
┌──────────────────────────────────────────┐
│ iam-initial-setup.service │
│ - Waits for IAM health │
│ - Creates admin user if needed │
│ - Generates initial tokens │
└────────────────┬─────────────────────────┘
┌──────────────────────────────────────────┐
│ cluster-health-check.service │
│ - Polls all service health endpoints │
│ - Verifies cluster membership │
│ - Reports to journald │
└──────────────────────────────────────────┘
```
### Configuration Flow
```
┌─────────────────────────────────────────┐
│ Provisioning Server │
│ - Generates cluster-config.json │
│ - Copies to /etc/nixos/secrets/ │
└────────────────┬────────────────────────┘
│ nixos-anywhere
┌─────────────────────────────────────────┐
│ Target Node │
│ /etc/nixos/secrets/cluster-config.json │
└────────────────┬────────────────────────┘
│ Read by NixOS module
┌─────────────────────────────────────────┐
│ first-boot-automation.nix │
│ - Parses JSON config │
│ - Creates systemd services │
│ - Sets up dependencies │
└────────────────┬────────────────────────┘
│ systemd activation
┌─────────────────────────────────────────┐
│ Cluster Join Services │
│ - Execute join logic │
│ - Create marker files │
│ - Log to journald │
└─────────────────────────────────────────┘
```
## Bootstrap vs Join Decision Logic
### Decision Tree
```
┌─────────────────┐
│ Node Boots │
└────────┬────────┘
┌────────▼────────┐
│ Read cluster- │
│ config.json │
└────────┬────────┘
┌────────▼────────┐
│ bootstrap=true? │
└────────┬────────┘
┌────────────┴────────────┐
│ │
YES ▼ ▼ NO
┌─────────────────┐ ┌─────────────────┐
│ Bootstrap Mode │ │ Join Mode │
│ │ │ │
│ - Skip cluster │ │ - Wait for │
│ join API │ │ local health │
│ - Raft cluster │ │ - Contact │
│ initializes │ │ leader │
│ internally │ │ - POST to │
│ - Create marker │ │ /member/add │
│ - Exit success │ │ - Retry 5x │
└─────────────────┘ └─────────────────┘
```
### Bootstrap Mode (bootstrap: true)
**When to use:**
- First 3 nodes in a new cluster
- Nodes configured with matching `initial_peers`
- No existing cluster to join
**Behavior:**
1. Service starts with `--initial-cluster` parameter containing all bootstrap peers
2. Raft consensus protocol automatically elects leader
3. Cluster join service detects bootstrap mode and exits immediately
4. No API calls to leader (cluster doesn't exist yet)
**Configuration:**
```json
{
"bootstrap": true,
"initial_peers": ["node01:2380", "node02:2380", "node03:2380"]
}
```
**Marker file:** `/var/lib/first-boot-automation/.chainfire-initialized`
### Join Mode (bootstrap: false)
**When to use:**
- Nodes joining an existing cluster
- Expansion or replacement nodes
- Leader URL is known and reachable
**Behavior:**
1. Service starts with no initial cluster configuration
2. Cluster join service waits for local service health
3. POST to leader's `/admin/member/add` with node info
4. Leader adds member to Raft configuration
5. Node joins cluster and synchronizes state
**Configuration:**
```json
{
"bootstrap": false,
"leader_url": "https://node01.example.com:2379",
"raft_addr": "10.0.1.13:2380"
}
```
**Marker file:** `/var/lib/first-boot-automation/.chainfire-joined`
## Idempotency and State Management
### Marker Files
The system uses marker files to track initialization state:
```
/var/lib/first-boot-automation/
├── .chainfire-initialized # Bootstrap node initialized
├── .chainfire-joined # Node joined cluster
├── .flaredb-initialized # FlareDB bootstrap
├── .flaredb-joined # FlareDB joined
└── .iam-initialized # IAM setup complete
```
**Purpose:**
- Prevent duplicate join attempts on reboot
- Support idempotent operations
- Enable troubleshooting (check timestamps)
**Format:** ISO8601 timestamp of initialization
```
2025-12-10T10:30:45+00:00
```
### State Transitions
```
┌──────────────┐
│ First Boot │
│ (no marker) │
└──────┬───────┘
┌──────────────┐
│ Check Config │
│ bootstrap=? │
└──────┬───────┘
├─(true)──▶ Bootstrap ──▶ Create .initialized ──▶ Done
└─(false)─▶ Join ──▶ Create .joined ──▶ Done
│ (reboot)
┌──────────────┐
│ Marker Exists│
│ Skip Join │
└──────────────┘
```
## Retry Logic and Error Handling
### Health Check Retry
**Parameters:**
- Timeout: 120 seconds (configurable)
- Retry Interval: 5 seconds
- Max Elapsed: 300 seconds
**Logic:**
```bash
START_TIME=$(date +%s)
while true; do
ELAPSED=$(($(date +%s) - START_TIME))
if [[ $ELAPSED -ge $TIMEOUT ]]; then
exit 1 # Timeout
fi
HTTP_CODE=$(curl -k -s -o /dev/null -w "%{http_code}" "$HEALTH_URL")
if [[ "$HTTP_CODE" == "200" ]]; then
exit 0 # Success
fi
sleep 5
done
```
### Cluster Join Retry
**Parameters:**
- Max Attempts: 5 (configurable)
- Retry Delay: 10 seconds
- Exponential Backoff: Optional (not implemented)
**Logic:**
```bash
for ATTEMPT in $(seq 1 $MAX_ATTEMPTS); do
HTTP_CODE=$(curl -X POST "$LEADER_URL/admin/member/add" -d "$PAYLOAD")
if [[ "$HTTP_CODE" == "200" || "$HTTP_CODE" == "201" ]]; then
exit 0 # Success
elif [[ "$HTTP_CODE" == "409" ]]; then
exit 2 # Already member
fi
sleep $RETRY_DELAY
done
exit 1 # Max attempts exhausted
```
### Error Codes
**Health Check:**
- `0`: Service healthy
- `1`: Timeout or unhealthy
**Cluster Join:**
- `0`: Successfully joined
- `1`: Failed after max attempts
- `2`: Already joined (idempotent)
- `3`: Invalid arguments
**Bootstrap Detector:**
- `0`: Should bootstrap
- `1`: Should join existing
- `2`: Configuration error
## Security Considerations
### TLS Certificate Handling
**Requirements:**
- All inter-node communication uses TLS
- Self-signed certificates supported via `-k` flag to curl
- Certificate validation in production (remove `-k`)
**Certificate Paths:**
```json
{
"tls": {
"enabled": true,
"ca_cert_path": "/etc/nixos/secrets/ca.crt",
"node_cert_path": "/etc/nixos/secrets/node01.crt",
"node_key_path": "/etc/nixos/secrets/node01.key"
}
}
```
**Integration with T031:**
- Certificates generated by T031 TLS automation
- Copied to target during provisioning
- Read by services at startup
### Secrets Management
**Cluster Configuration:**
- Stored in `/etc/nixos/secrets/cluster-config.json`
- Permissions: `0600 root:root` (recommended)
- Contains sensitive data: URLs, IPs, topology
**API Credentials:**
- IAM admin credentials (future implementation)
- Stored in separate file: `/etc/nixos/secrets/iam-admin.json`
- Never logged to journald
### Attack Surface
**Mitigations:**
1. **Network-level**: Firewall rules restrict cluster API ports
2. **Application-level**: mTLS for authenticated requests
3. **Access control**: SystemD service isolation
4. **Audit**: All operations logged to journald with structured JSON
## Integration Points
### T024 NixOS Modules
The first-boot automation module imports and extends service modules:
```nix
# Example: netboot-control-plane.nix
{
imports = [
../modules/chainfire.nix
../modules/flaredb.nix
../modules/iam.nix
../modules/first-boot-automation.nix
];
services.first-boot-automation.enable = true;
}
```
### T031 TLS Certificates
**Dependencies:**
- TLS certificates must exist before first boot
- Provisioning script copies certificates to `/etc/nixos/secrets/`
- Services read certificates at startup
**Certificate Generation:**
```bash
# On provisioning server (T031)
./tls/generate-node-cert.sh node01.example.com 10.0.1.10
# Copied to target
scp ca.crt node01.crt node01.key root@10.0.1.10:/etc/nixos/secrets/
```
### T032.S1-S3 PXE/Netboot
**Boot Flow:**
1. PXE boot loads iPXE firmware
2. iPXE chainloads NixOS kernel/initrd
3. NixOS installer runs (nixos-anywhere)
4. System installed to disk with first-boot automation
5. Reboot into installed system
6. First-boot automation executes
**Configuration Injection:**
```bash
# During nixos-anywhere provisioning
mkdir -p /mnt/etc/nixos/secrets
cp cluster-config.json /mnt/etc/nixos/secrets/
chmod 600 /mnt/etc/nixos/secrets/cluster-config.json
```
## Service Dependencies
### Systemd Ordering
**Chainfire:**
```
After: network-online.target, chainfire.service
Before: flaredb-cluster-join.service
Wants: network-online.target
```
**FlareDB:**
```
After: chainfire-cluster-join.service, flaredb.service
Requires: chainfire-cluster-join.service
Before: iam-initial-setup.service
```
**IAM:**
```
After: flaredb-cluster-join.service, iam.service
Before: cluster-health-check.service
```
**Health Check:**
```
After: chainfire-cluster-join, flaredb-cluster-join, iam-initial-setup
Type: oneshot (no RemainAfterExit)
```
### Dependency Graph
```
network-online.target
├──▶ chainfire.service
│ │
│ ▼
│ chainfire-cluster-join.service
│ │
├──▶ flaredb.service
│ │
│ ▼
└────▶ flaredb-cluster-join.service
┌────┴────┐
│ │
iam.service │
│ │
▼ │
iam-initial-setup.service
│ │
└────┬────┘
cluster-health-check.service
```
## Logging and Observability
### Structured Logging
All scripts output JSON-formatted logs:
```json
{
"timestamp": "2025-12-10T10:30:45+00:00",
"level": "INFO",
"service": "chainfire",
"operation": "cluster-join",
"message": "Successfully joined cluster"
}
```
**Benefits:**
- Machine-readable for log aggregation (T025)
- Easy filtering with `journalctl -o json`
- Includes context (service, operation, timestamp)
### Querying Logs
**View all first-boot automation logs:**
```bash
journalctl -u chainfire-cluster-join.service -u flaredb-cluster-join.service \
-u iam-initial-setup.service -u cluster-health-check.service
```
**Filter by log level:**
```bash
journalctl -u chainfire-cluster-join.service | grep '"level":"ERROR"'
```
**Follow live:**
```bash
journalctl -u chainfire-cluster-join.service -f
```
### Health Check Integration
**T025 Observability:**
- Health check service can POST to metrics endpoint
- Prometheus scraping of `/health` endpoints
- Alerts on cluster join failures
**Future:**
- Webhook to provisioning server on completion
- Slack/email notifications on errors
- Dashboard showing cluster join status
## Performance Characteristics
### Boot Time Analysis
**Typical Timeline (3-node cluster):**
```
T+0s : systemd starts
T+5s : network-online.target reached
T+10s : chainfire.service starts
T+15s : chainfire healthy
T+15s : chainfire-cluster-join runs (bootstrap, immediate exit)
T+20s : flaredb.service starts
T+25s : flaredb healthy
T+25s : flaredb-cluster-join runs (bootstrap, immediate exit)
T+30s : iam.service starts
T+35s : iam healthy
T+35s : iam-initial-setup runs
T+40s : cluster-health-check runs
T+40s : Node fully operational
```
**Join Mode (node joining existing cluster):**
```
T+0s : systemd starts
T+5s : network-online.target reached
T+10s : chainfire.service starts
T+15s : chainfire healthy
T+15s : chainfire-cluster-join runs
T+20s : POST to leader, wait for response
T+25s : Successfully joined chainfire cluster
T+25s : flaredb.service starts
T+30s : flaredb healthy
T+30s : flaredb-cluster-join runs
T+35s : Successfully joined flaredb cluster
T+40s : iam-initial-setup (skips, already initialized)
T+45s : cluster-health-check runs
T+45s : Node fully operational
```
### Bottlenecks
**Health Check Polling:**
- 5-second intervals may be too aggressive
- Recommendation: Exponential backoff
**Network Latency:**
- Join requests block on network RTT
- Mitigation: Ensure low-latency cluster network
**Raft Synchronization:**
- New member must catch up on Raft log
- Time depends on log size (seconds to minutes)
## Failure Modes and Recovery
### Common Failures
**1. Leader Unreachable**
**Symptom:**
```json
{"level":"ERROR","message":"Join request failed: connection error"}
```
**Diagnosis:**
- Check network connectivity: `ping node01.example.com`
- Verify firewall rules: `iptables -L`
- Check leader service status: `systemctl status chainfire.service`
**Recovery:**
```bash
# Fix network/firewall, then restart join service
systemctl restart chainfire-cluster-join.service
```
**2. Invalid Configuration**
**Symptom:**
```json
{"level":"ERROR","message":"Configuration file not found"}
```
**Diagnosis:**
- Verify file exists: `ls -la /etc/nixos/secrets/cluster-config.json`
- Check JSON syntax: `jq . /etc/nixos/secrets/cluster-config.json`
**Recovery:**
```bash
# Fix configuration, then restart
systemctl restart chainfire-cluster-join.service
```
**3. Service Not Healthy**
**Symptom:**
```json
{"level":"ERROR","message":"Health check timeout"}
```
**Diagnosis:**
- Check service logs: `journalctl -u chainfire.service`
- Verify service is running: `systemctl status chainfire.service`
- Test health endpoint: `curl -k https://localhost:2379/health`
**Recovery:**
```bash
# Restart the main service
systemctl restart chainfire.service
# Join service will auto-retry after RestartSec
```
**4. Already Member**
**Symptom:**
```json
{"level":"WARN","message":"Node already member of cluster (HTTP 409)"}
```
**Diagnosis:**
- This is normal on reboots
- Marker file created to prevent future attempts
**Recovery:**
- No action needed (idempotent behavior)
### Manual Cluster Join
If automation fails, manual join:
**Chainfire:**
```bash
curl -k -X POST https://node01.example.com:2379/admin/member/add \
-H "Content-Type: application/json" \
-d '{"id":"node04","raft_addr":"10.0.1.13:2380"}'
# Create marker to prevent auto-retry
mkdir -p /var/lib/first-boot-automation
date -Iseconds > /var/lib/first-boot-automation/.chainfire-joined
```
**FlareDB:**
```bash
curl -k -X POST https://node01.example.com:2479/admin/member/add \
-H "Content-Type: application/json" \
-d '{"id":"node04","raft_addr":"10.0.1.13:2480"}'
date -Iseconds > /var/lib/first-boot-automation/.flaredb-joined
```
### Rollback Procedure
**Remove from cluster:**
```bash
# On leader
curl -k -X DELETE https://node01.example.com:2379/admin/member/node04
# On node being removed
systemctl stop chainfire.service
rm -rf /var/lib/chainfire/*
rm /var/lib/first-boot-automation/.chainfire-joined
# Re-enable automation
systemctl restart chainfire-cluster-join.service
```
## Future Enhancements
### Planned Improvements
**1. Exponential Backoff**
- Current: Fixed 10-second delay
- Future: 1s, 2s, 4s, 8s, 16s exponential backoff
**2. Leader Discovery**
- Current: Static leader URL in config
- Future: DNS SRV records for dynamic discovery
**3. Webhook Notifications**
- POST to provisioning server on completion
- Include node info, join time, cluster health
**4. Pre-flight Checks**
- Validate network connectivity before attempting join
- Check TLS certificate validity
- Verify disk space, memory, CPU requirements
**5. Automated Testing**
- Integration tests with real cluster
- Simulate failures (network partitions, leader crashes)
- Validate idempotency
**6. Configuration Validation**
- JSON schema validation at boot
- Fail fast on invalid configuration
- Provide clear error messages
## References
- **T024**: NixOS service modules
- **T025**: Observability and monitoring
- **T031**: TLS certificate automation
- **T032.S1-S3**: PXE boot, netboot images, provisioning
- **Design Document**: `/home/centra/cloud/docs/por/T032-baremetal-provisioning/design.md`
## Appendix: Configuration Schema
### cluster-config.json Schema
```json
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"required": ["node_id", "node_role", "bootstrap", "cluster_name", "leader_url", "raft_addr"],
"properties": {
"node_id": {
"type": "string",
"description": "Unique node identifier"
},
"node_role": {
"type": "string",
"enum": ["control-plane", "worker", "all-in-one"]
},
"bootstrap": {
"type": "boolean",
"description": "True for first 3 nodes, false for join"
},
"cluster_name": {
"type": "string"
},
"leader_url": {
"type": "string",
"format": "uri"
},
"raft_addr": {
"type": "string",
"pattern": "^[0-9.]+:[0-9]+$"
},
"initial_peers": {
"type": "array",
"items": {"type": "string"}
},
"flaredb_peers": {
"type": "array",
"items": {"type": "string"}
}
}
}
```

View file

@ -1,858 +0,0 @@
# First-Boot Automation for Bare-Metal Provisioning
Automated cluster joining and service initialization for bare-metal provisioned NixOS nodes.
## Table of Contents
- [Overview](#overview)
- [Quick Start](#quick-start)
- [Configuration](#configuration)
- [Bootstrap vs Join](#bootstrap-vs-join)
- [Systemd Services](#systemd-services)
- [Troubleshooting](#troubleshooting)
- [Manual Operations](#manual-operations)
- [Security](#security)
- [Examples](#examples)
## Overview
The first-boot automation system handles automated cluster joining for distributed services (Chainfire, FlareDB, IAM) on first boot of bare-metal provisioned nodes. It supports two modes:
- **Bootstrap Mode**: Initialize a new Raft cluster (first 3 nodes)
- **Join Mode**: Join an existing cluster (additional nodes)
### Features
- Automated health checking with retries
- Idempotent operations (safe to run multiple times)
- Structured JSON logging to journald
- Graceful failure handling with configurable retries
- Integration with TLS certificates (T031)
- Support for both bootstrap and runtime join scenarios
### Architecture
See [ARCHITECTURE.md](ARCHITECTURE.md) for detailed design documentation.
## Quick Start
### Prerequisites
1. Node provisioned via T032.S1-S3 (PXE boot and installation)
2. Cluster configuration file at `/etc/nixos/secrets/cluster-config.json`
3. TLS certificates at `/etc/nixos/secrets/` (T031)
4. Network connectivity to cluster leader (for join mode)
### Enable First-Boot Automation
In your NixOS configuration:
```nix
# /etc/nixos/configuration.nix
{
imports = [
./nix/modules/first-boot-automation.nix
];
services.first-boot-automation = {
enable = true;
configFile = "/etc/nixos/secrets/cluster-config.json";
# Optional: disable specific services
enableChainfire = true;
enableFlareDB = true;
enableIAM = true;
enableHealthCheck = true;
};
}
```
### First Boot
After provisioning and reboot:
1. Node boots from disk
2. systemd starts services
3. First-boot automation runs automatically
4. Cluster join completes within 30-60 seconds
Check status:
```bash
systemctl status chainfire-cluster-join.service
systemctl status flaredb-cluster-join.service
systemctl status iam-initial-setup.service
systemctl status cluster-health-check.service
```
## Configuration
### cluster-config.json Format
```json
{
"node_id": "node01",
"node_role": "control-plane",
"bootstrap": true,
"cluster_name": "prod-cluster",
"leader_url": "https://node01.prod.example.com:2379",
"raft_addr": "10.0.1.10:2380",
"initial_peers": [
"node01:2380",
"node02:2380",
"node03:2380"
],
"flaredb_peers": [
"node01:2480",
"node02:2480",
"node03:2480"
]
}
```
### Required Fields
| Field | Type | Description |
|-------|------|-------------|
| `node_id` | string | Unique identifier for this node |
| `node_role` | string | Node role: `control-plane`, `worker`, or `all-in-one` |
| `bootstrap` | boolean | `true` for first 3 nodes, `false` for additional nodes |
| `cluster_name` | string | Cluster identifier |
| `leader_url` | string | HTTPS URL of cluster leader (used for join) |
| `raft_addr` | string | This node's Raft address (IP:port) |
| `initial_peers` | array | List of bootstrap peer addresses |
| `flaredb_peers` | array | List of FlareDB peer addresses |
### Optional Fields
| Field | Type | Description |
|-------|------|-------------|
| `node_ip` | string | Node's primary IP address |
| `node_fqdn` | string | Fully qualified domain name |
| `datacenter` | string | Datacenter identifier |
| `rack` | string | Rack identifier |
| `services` | object | Per-service configuration |
| `tls` | object | TLS certificate paths |
| `network` | object | Network CIDR ranges |
### Example Configurations
See [examples/](examples/) directory:
- `cluster-config-bootstrap.json` - Bootstrap node (first 3)
- `cluster-config-join.json` - Join node (additional)
- `cluster-config-all-in-one.json` - Single-node deployment
## Bootstrap vs Join
### Bootstrap Mode (bootstrap: true)
**When to use:**
- First 3 nodes in a new cluster
- Nodes configured with matching `initial_peers`
- No existing cluster to join
**Behavior:**
1. Services start with `--initial-cluster` configuration
2. Raft consensus automatically elects leader
3. Cluster join service detects bootstrap mode and exits immediately
4. Marker file created: `/var/lib/first-boot-automation/.chainfire-initialized`
**Example:**
```json
{
"node_id": "node01",
"bootstrap": true,
"initial_peers": ["node01:2380", "node02:2380", "node03:2380"]
}
```
### Join Mode (bootstrap: false)
**When to use:**
- Nodes joining an existing cluster
- Expansion or replacement nodes
- Leader is known and reachable
**Behavior:**
1. Service starts with no initial cluster config
2. Waits for local service to be healthy (max 120s)
3. POST to leader's `/admin/member/add` endpoint
4. Retries up to 5 times with 10s delay
5. Marker file created: `/var/lib/first-boot-automation/.chainfire-joined`
**Example:**
```json
{
"node_id": "node04",
"bootstrap": false,
"leader_url": "https://node01.prod.example.com:2379",
"raft_addr": "10.0.1.13:2380"
}
```
### Decision Matrix
| Scenario | bootstrap | initial_peers | leader_url |
|----------|-----------|---------------|------------|
| Node 1 (first) | `true` | all 3 nodes | self |
| Node 2 (first) | `true` | all 3 nodes | self |
| Node 3 (first) | `true` | all 3 nodes | self |
| Node 4+ (join) | `false` | n/a (unused in join mode) | node 1 |
## Systemd Services
### chainfire-cluster-join.service
**Description:** Joins Chainfire cluster on first boot
**Dependencies:**
- After: `network-online.target`, `chainfire.service`
- Before: `flaredb-cluster-join.service`
**Configuration:**
- Type: `oneshot`
- RemainAfterExit: `true`
- Restart: `on-failure`
**Logs:**
```bash
journalctl -u chainfire-cluster-join.service
```
### flaredb-cluster-join.service
**Description:** Joins FlareDB cluster after Chainfire
**Dependencies:**
- After: `chainfire-cluster-join.service`, `flaredb.service`
- Requires: `chainfire-cluster-join.service`
**Configuration:**
- Type: `oneshot`
- RemainAfterExit: `true`
- Restart: `on-failure`
**Logs:**
```bash
journalctl -u flaredb-cluster-join.service
```
### iam-initial-setup.service
**Description:** IAM initial setup and admin user creation
**Dependencies:**
- After: `flaredb-cluster-join.service`, `iam.service`
**Configuration:**
- Type: `oneshot`
- RemainAfterExit: `true`
**Logs:**
```bash
journalctl -u iam-initial-setup.service
```
### cluster-health-check.service
**Description:** Validates cluster health on first boot
**Dependencies:**
- After: all cluster-join services
**Configuration:**
- Type: `oneshot`
- RemainAfterExit: `false`
**Logs:**
```bash
journalctl -u cluster-health-check.service
```
## Troubleshooting
### Check Service Status
```bash
# Overall status
systemctl status chainfire-cluster-join.service
systemctl status flaredb-cluster-join.service
# Detailed logs with JSON output
journalctl -u chainfire-cluster-join.service -o json-pretty
# Follow logs in real-time
journalctl -u chainfire-cluster-join.service -f
```
### Common Issues
#### 1. Health Check Timeout
**Symptom:**
```json
{"level":"ERROR","message":"Health check timeout after 120s"}
```
**Causes:**
- Service not starting (check main service logs)
- Port conflict
- TLS certificate issues
**Solutions:**
```bash
# Check main service
systemctl status chainfire.service
journalctl -u chainfire.service
# Test health endpoint manually
curl -k https://localhost:2379/health
# Restart services
systemctl restart chainfire.service
systemctl restart chainfire-cluster-join.service
```
#### 2. Leader Unreachable
**Symptom:**
```json
{"level":"ERROR","message":"Join request failed: connection error"}
```
**Causes:**
- Network connectivity issues
- Firewall blocking ports
- Leader not running
- Wrong leader URL in config
**Solutions:**
```bash
# Test network connectivity
ping node01.prod.example.com
curl -k https://node01.prod.example.com:2379/health
# Check firewall
iptables -L -n | grep 2379
# Verify configuration
jq '.leader_url' /etc/nixos/secrets/cluster-config.json
# Try manual join (see below)
```
#### 3. Invalid Configuration
**Symptom:**
```json
{"level":"ERROR","message":"Configuration file not found"}
```
**Causes:**
- Missing configuration file
- Wrong file path
- Invalid JSON syntax
- Missing required fields
**Solutions:**
```bash
# Check file exists
ls -la /etc/nixos/secrets/cluster-config.json
# Validate JSON syntax
jq . /etc/nixos/secrets/cluster-config.json
# Check required fields
jq '.node_id, .bootstrap, .leader_url' /etc/nixos/secrets/cluster-config.json
# Fix and restart
systemctl restart chainfire-cluster-join.service
```
#### 4. Already Member (Reboot)
**Symptom:**
```json
{"level":"WARN","message":"Already member of cluster (HTTP 409)"}
```
**Explanation:**
- This is **normal** on reboots
- Marker file prevents duplicate joins
- No action needed
**Verify:**
```bash
# Check marker file
cat /var/lib/first-boot-automation/.chainfire-joined
# Should show timestamp: 2025-12-10T10:30:45+00:00
```
#### 5. Join Retry Exhausted
**Symptom:**
```json
{"level":"ERROR","message":"Failed to join cluster after 5 attempts"}
```
**Causes:**
- Persistent network issues
- Leader down or overloaded
- Invalid node configuration
- Cluster at capacity
**Solutions:**
```bash
# Check cluster status on leader
curl -k https://node01.prod.example.com:2379/admin/cluster/members | jq
# Verify this node's configuration
jq '.node_id, .raft_addr' /etc/nixos/secrets/cluster-config.json
# Increase retry attempts (edit NixOS config)
# Or perform manual join (see below)
```
### Verify Cluster Membership
**On leader node:**
```bash
# Chainfire members
curl -k https://localhost:2379/admin/cluster/members | jq
# FlareDB members
curl -k https://localhost:2479/admin/cluster/members | jq
```
**Expected output:**
```json
{
"members": [
{"id": "node01", "raft_addr": "10.0.1.10:2380", "status": "healthy"},
{"id": "node02", "raft_addr": "10.0.1.11:2380", "status": "healthy"},
{"id": "node03", "raft_addr": "10.0.1.12:2380", "status": "healthy"}
]
}
```
### Check Marker Files
```bash
# List all marker files
ls -la /var/lib/first-boot-automation/
# View timestamps
cat /var/lib/first-boot-automation/.chainfire-joined
cat /var/lib/first-boot-automation/.flaredb-joined
```
### Reset and Re-join
**Warning:** This will remove the node from the cluster and rejoin.
```bash
# Stop services
systemctl stop chainfire.service flaredb.service
# Remove data and markers
rm -rf /var/lib/chainfire/*
rm -rf /var/lib/flaredb/*
rm /var/lib/first-boot-automation/.chainfire-*
rm /var/lib/first-boot-automation/.flaredb-*
# Restart (will auto-join)
systemctl start chainfire.service
systemctl restart chainfire-cluster-join.service
```
## Manual Operations
### Manual Cluster Join
If automation fails, perform manual join:
**Chainfire:**
```bash
# On joining node, ensure service is running and healthy
curl -k https://localhost:2379/health
# From any node, add member to cluster
curl -k -X POST https://node01.prod.example.com:2379/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.1.13:2380"
}'
# Create marker to prevent auto-retry
mkdir -p /var/lib/first-boot-automation
date -Iseconds > /var/lib/first-boot-automation/.chainfire-joined
```
**FlareDB:**
```bash
curl -k -X POST https://node01.prod.example.com:2479/admin/member/add \
-H "Content-Type: application/json" \
-d '{
"id": "node04",
"raft_addr": "10.0.1.13:2480"
}'
date -Iseconds > /var/lib/first-boot-automation/.flaredb-joined
```
### Remove Node from Cluster
**On leader:**
```bash
# Chainfire
curl -k -X DELETE https://node01.prod.example.com:2379/admin/member/node04
# FlareDB
curl -k -X DELETE https://node01.prod.example.com:2479/admin/member/node04
```
**On removed node:**
```bash
# Stop services
systemctl stop chainfire.service flaredb.service
# Clean up data
rm -rf /var/lib/chainfire/*
rm -rf /var/lib/flaredb/*
rm /var/lib/first-boot-automation/.chainfire-*
rm /var/lib/first-boot-automation/.flaredb-*
```
### Disable First-Boot Automation
If you need to disable automation:
```nix
# In NixOS configuration
services.first-boot-automation.enable = false;
```
Or stop services temporarily:
```bash
systemctl stop chainfire-cluster-join.service
systemctl disable chainfire-cluster-join.service
```
### Re-enable After Manual Operations
After manual cluster operations:
```bash
# Create marker files to indicate join complete
mkdir -p /var/lib/first-boot-automation
date -Iseconds > /var/lib/first-boot-automation/.chainfire-joined
date -Iseconds > /var/lib/first-boot-automation/.flaredb-joined
# Or re-enable automation (will skip if markers exist)
systemctl enable --now chainfire-cluster-join.service
```
## Security
### TLS Certificates
**Requirements:**
- All cluster communication uses TLS
- Certificates must exist before first boot
- Generated by T031 TLS automation
**Certificate Paths:**
```
/etc/nixos/secrets/
├── ca.crt # CA certificate
├── node01.crt # Node certificate
└── node01.key # Node private key (mode 0600)
```
**Permissions:**
```bash
chmod 600 /etc/nixos/secrets/node01.key
chmod 644 /etc/nixos/secrets/node01.crt
chmod 644 /etc/nixos/secrets/ca.crt
```
### Configuration File Security
**Cluster configuration contains sensitive data:**
- IP addresses and network topology
- Service URLs
- Node identifiers
**Recommended permissions:**
```bash
chmod 600 /etc/nixos/secrets/cluster-config.json
chown root:root /etc/nixos/secrets/cluster-config.json
```
### Network Security
**Required firewall rules:**
```bash
# Chainfire
iptables -A INPUT -p tcp --dport 2379 -s 10.0.1.0/24 -j ACCEPT # API
iptables -A INPUT -p tcp --dport 2380 -s 10.0.1.0/24 -j ACCEPT # Raft
iptables -A INPUT -p tcp --dport 2381 -s 10.0.1.0/24 -j ACCEPT # Gossip
# FlareDB
iptables -A INPUT -p tcp --dport 2479 -s 10.0.1.0/24 -j ACCEPT # API
iptables -A INPUT -p tcp --dport 2480 -s 10.0.1.0/24 -j ACCEPT # Raft
# IAM
iptables -A INPUT -p tcp --dport 8080 -s 10.0.1.0/24 -j ACCEPT # API
```
### Production Considerations
**For production deployments:**
1. **Remove `-k` flag from curl** (validate TLS certificates)
2. **Implement mTLS** for client authentication
3. **Rotate credentials** regularly
4. **Audit logs** with structured logging
5. **Monitor health endpoints** continuously
6. **Backup cluster state** before changes
## Examples
### Example 1: 3-Node Bootstrap Cluster
**Node 1:**
```json
{
"node_id": "node01",
"bootstrap": true,
"raft_addr": "10.0.1.10:2380",
"initial_peers": ["node01:2380", "node02:2380", "node03:2380"]
}
```
**Node 2:**
```json
{
"node_id": "node02",
"bootstrap": true,
"raft_addr": "10.0.1.11:2380",
"initial_peers": ["node01:2380", "node02:2380", "node03:2380"]
}
```
**Node 3:**
```json
{
"node_id": "node03",
"bootstrap": true,
"raft_addr": "10.0.1.12:2380",
"initial_peers": ["node01:2380", "node02:2380", "node03:2380"]
}
```
**Provisioning:**
```bash
# Provision all 3 nodes simultaneously
for i in {1..3}; do
nixos-anywhere --flake .#node0$i root@node0$i.example.com &
done
wait
# Nodes will bootstrap automatically on first boot
```
### Example 2: Join Existing Cluster
**Node 4 (joining):**
```json
{
"node_id": "node04",
"bootstrap": false,
"leader_url": "https://node01.prod.example.com:2379",
"raft_addr": "10.0.1.13:2380"
}
```
**Provisioning:**
```bash
nixos-anywhere --flake .#node04 root@node04.example.com
# Node will automatically join on first boot
```
### Example 3: Single-Node All-in-One
**For development/testing:**
```json
{
"node_id": "aio01",
"bootstrap": true,
"raft_addr": "10.0.2.10:2380",
"initial_peers": ["aio01:2380"],
"flaredb_peers": ["aio01:2480"]
}
```
**Provisioning:**
```bash
nixos-anywhere --flake .#aio01 root@aio01.example.com
```
## Integration with Other Systems
### T024 NixOS Modules
First-boot automation integrates with service modules:
```nix
{
imports = [
./nix/modules/chainfire.nix
./nix/modules/flaredb.nix
./nix/modules/first-boot-automation.nix
];
services.chainfire.enable = true;
services.flaredb.enable = true;
services.first-boot-automation.enable = true;
}
```
### T025 Observability
Health checks integrate with Prometheus:
```yaml
# prometheus.yml
scrape_configs:
- job_name: 'cluster-health'
static_configs:
- targets: ['node01:2379', 'node02:2379', 'node03:2379']
metrics_path: '/health'
```
### T031 TLS Certificates
Certificates generated by T031 are used automatically:
```bash
# On provisioning server
./tls/generate-node-cert.sh node01.example.com 10.0.1.10
# Copied during nixos-anywhere
# First-boot automation reads from /etc/nixos/secrets/
```
## Logs and Debugging
### Structured Logging
All logs are JSON-formatted:
```json
{
"timestamp": "2025-12-10T10:30:45+00:00",
"level": "INFO",
"service": "chainfire",
"operation": "cluster-join",
"message": "Successfully joined cluster"
}
```
### Query Examples
**All first-boot logs:**
```bash
journalctl -u "*cluster-join*" -u "*initial-setup*" -u "*health-check*"
```
**Errors only:**
```bash
journalctl -u chainfire-cluster-join.service | grep '"level":"ERROR"'
```
**Last boot only:**
```bash
journalctl -b -u chainfire-cluster-join.service
```
**JSON output for parsing:**
```bash
journalctl -u chainfire-cluster-join.service -o json | jq '.MESSAGE'
```
## Performance Tuning
### Timeout Configuration
Adjust timeouts in NixOS module:
```nix
services.first-boot-automation = {
enable = true;
# Override default ports if needed
chainfirePort = 2379;
flaredbPort = 2479;
};
```
### Retry Configuration
Modify retry logic in scripts:
```bash
# baremetal/first-boot/cluster-join.sh
MAX_ATTEMPTS=10 # Increase from 5
RETRY_DELAY=15 # Increase from 10s
```
### Health Check Interval
Adjust polling interval:
```bash
# In service scripts
sleep 10 # Increase from 5s for less aggressive polling
```
## Support and Contributing
### Getting Help
1. Check logs: `journalctl -u chainfire-cluster-join.service`
2. Review troubleshooting section above
3. Consult [ARCHITECTURE.md](ARCHITECTURE.md) for design details
4. Check cluster status on leader node
### Reporting Issues
Include in bug reports:
```bash
# Gather diagnostic information
journalctl -u chainfire-cluster-join.service > cluster-join.log
systemctl status chainfire-cluster-join.service > service-status.txt
cat /etc/nixos/secrets/cluster-config.json > config.json # Redact sensitive data!
ls -la /var/lib/first-boot-automation/ > markers.txt
```
### Development
See [ARCHITECTURE.md](ARCHITECTURE.md) for contributing guidelines.
## References
- **ARCHITECTURE.md**: Detailed design documentation
- **T024**: NixOS service modules
- **T025**: Observability and monitoring
- **T031**: TLS certificate automation
- **T032.S1-S3**: PXE boot and provisioning
- **Design Document**: `/home/centra/cloud/docs/por/T032-baremetal-provisioning/design.md`
## License
Internal use only - Centra Cloud Platform

View file

@ -47,9 +47,9 @@ if command -v jq &> /dev/null; then
NODE_ROLE=$(echo "$CONFIG_JSON" | jq -r '.node_role // "unknown"') NODE_ROLE=$(echo "$CONFIG_JSON" | jq -r '.node_role // "unknown"')
else else
# Fallback to grep/sed for minimal environments # Fallback to grep/sed for minimal environments
BOOTSTRAP=$(echo "$CONFIG_JSON" | grep -oP '"bootstrap"\s*:\s*\K(true|false)' || echo "false") BOOTSTRAP=$(echo "$CONFIG_JSON" | grep -Eo '"bootstrap"[[:space:]]*:[[:space:]]*(true|false)' | head -n1 | sed -E 's/.*:[[:space:]]*(true|false)/\1/' || echo "false")
NODE_ID=$(echo "$CONFIG_JSON" | grep -oP '"node_id"\s*:\s*"\K[^"]+' || echo "unknown") NODE_ID=$(echo "$CONFIG_JSON" | grep -Eo '"node_id"[[:space:]]*:[[:space:]]*"[^"]+"' | head -n1 | sed -E 's/.*"node_id"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/' || echo "unknown")
NODE_ROLE=$(echo "$CONFIG_JSON" | grep -oP '"node_role"\s*:\s*"\K[^"]+' || echo "unknown") NODE_ROLE=$(echo "$CONFIG_JSON" | grep -Eo '"node_role"[[:space:]]*:[[:space:]]*"[^"]+"' | head -n1 | sed -E 's/.*"node_role"[[:space:]]*:[[:space:]]*"([^"]+)".*/\1/' || echo "unknown")
fi fi
log "INFO" "Node configuration: id=$NODE_ID, role=$NODE_ROLE, bootstrap=$BOOTSTRAP" log "INFO" "Node configuration: id=$NODE_ID, role=$NODE_ROLE, bootstrap=$BOOTSTRAP"

View file

@ -25,6 +25,9 @@ LEADER_URL="${3:-}"
JOIN_PAYLOAD="${4:-}" JOIN_PAYLOAD="${4:-}"
MAX_ATTEMPTS="${5:-5}" MAX_ATTEMPTS="${5:-5}"
RETRY_DELAY="${6:-10}" RETRY_DELAY="${6:-10}"
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-15}"
CURL_INSECURE="${CURL_INSECURE:-1}"
FIRST_BOOT_MARKER="/var/lib/first-boot-automation/.${SERVICE_NAME}-joined" FIRST_BOOT_MARKER="/var/lib/first-boot-automation/.${SERVICE_NAME}-joined"
@ -81,7 +84,11 @@ else
exit 1 exit 1
fi fi
HTTP_CODE=$(curl -k -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" 2>/dev/null || echo "000") CURL_FLAGS=(-s -o /dev/null -w "%{http_code}" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
CURL_FLAGS+=(-k)
fi
HTTP_CODE=$(curl "${CURL_FLAGS[@]}" "$HEALTH_URL" 2>/dev/null || echo "000")
if [[ "$HTTP_CODE" == "200" ]]; then if [[ "$HTTP_CODE" == "200" ]]; then
log "INFO" "Local $SERVICE_NAME is healthy" log "INFO" "Local $SERVICE_NAME is healthy"
@ -109,13 +116,20 @@ for ATTEMPT in $(seq 1 "$MAX_ATTEMPTS"); do
# Make join request to leader # Make join request to leader
RESPONSE_FILE=$(mktemp) RESPONSE_FILE=$(mktemp)
HTTP_CODE=$(curl -k -s -w "%{http_code}" -o "$RESPONSE_FILE" \ PAYLOAD_FILE=$(mktemp)
printf '%s' "$JOIN_PAYLOAD" > "$PAYLOAD_FILE"
CURL_FLAGS=(-s -w "%{http_code}" -o "$RESPONSE_FILE" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
CURL_FLAGS+=(-k)
fi
HTTP_CODE=$(curl "${CURL_FLAGS[@]}" \
-X POST "$LEADER_URL/admin/member/add" \ -X POST "$LEADER_URL/admin/member/add" \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d "$JOIN_PAYLOAD" 2>/dev/null || echo "000") --data-binary "@$PAYLOAD_FILE" 2>/dev/null || echo "000")
RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "") RESPONSE_BODY=$(cat "$RESPONSE_FILE" 2>/dev/null || echo "")
rm -f "$RESPONSE_FILE" rm -f "$RESPONSE_FILE" "$PAYLOAD_FILE"
log "INFO" "Join request response: HTTP $HTTP_CODE" log "INFO" "Join request response: HTTP $HTTP_CODE"

View file

@ -1,77 +0,0 @@
{
"node_id": "aio01",
"node_role": "all-in-one",
"bootstrap": true,
"cluster_name": "dev-cluster",
"leader_url": "https://aio01.dev.example.com:2379",
"raft_addr": "10.0.2.10:2380",
"initial_peers": [
"aio01:2380"
],
"flaredb_peers": [
"aio01:2480"
],
"node_ip": "10.0.2.10",
"node_fqdn": "aio01.dev.example.com",
"datacenter": "dev",
"rack": "rack1",
"description": "Single-node all-in-one deployment for development/testing",
"services": {
"chainfire": {
"enabled": true,
"api_port": 2379,
"raft_port": 2380,
"gossip_port": 2381
},
"flaredb": {
"enabled": true,
"api_port": 2479,
"raft_port": 2480
},
"iam": {
"enabled": true,
"api_port": 8080
},
"plasmavmc": {
"enabled": true,
"api_port": 8090
},
"novanet": {
"enabled": true,
"api_port": 8091
},
"flashdns": {
"enabled": true,
"dns_port": 53,
"api_port": 8053
},
"fiberlb": {
"enabled": true,
"api_port": 8092
},
"lightningstor": {
"enabled": true,
"api_port": 8093
},
"k8shost": {
"enabled": true,
"api_port": 10250
}
},
"tls": {
"enabled": true,
"ca_cert_path": "/etc/nixos/secrets/ca.crt",
"node_cert_path": "/etc/nixos/secrets/aio01.crt",
"node_key_path": "/etc/nixos/secrets/aio01.key"
},
"network": {
"cluster_network": "10.0.2.0/24",
"pod_network": "10.244.0.0/16",
"service_network": "10.96.0.0/12"
},
"development": {
"mode": "single-node",
"skip_replication_checks": true,
"allow_single_raft_member": true
}
}

View file

@ -1,68 +0,0 @@
{
"node_id": "node01",
"node_role": "control-plane",
"bootstrap": true,
"cluster_name": "prod-cluster",
"leader_url": "https://node01.prod.example.com:2379",
"raft_addr": "10.0.1.10:2380",
"initial_peers": [
"node01:2380",
"node02:2380",
"node03:2380"
],
"flaredb_peers": [
"node01:2480",
"node02:2480",
"node03:2480"
],
"node_ip": "10.0.1.10",
"node_fqdn": "node01.prod.example.com",
"datacenter": "dc1",
"rack": "rack1",
"description": "Bootstrap node for production cluster - initializes Raft cluster",
"services": {
"chainfire": {
"enabled": true,
"api_port": 2379,
"raft_port": 2380,
"gossip_port": 2381
},
"flaredb": {
"enabled": true,
"api_port": 2479,
"raft_port": 2480
},
"iam": {
"enabled": true,
"api_port": 8080
},
"plasmavmc": {
"enabled": true,
"api_port": 8090
},
"novanet": {
"enabled": true,
"api_port": 8091
},
"flashdns": {
"enabled": true,
"dns_port": 53,
"api_port": 8053
},
"fiberlb": {
"enabled": true,
"api_port": 8092
}
},
"tls": {
"enabled": true,
"ca_cert_path": "/etc/nixos/secrets/ca.crt",
"node_cert_path": "/etc/nixos/secrets/node01.crt",
"node_key_path": "/etc/nixos/secrets/node01.key"
},
"network": {
"cluster_network": "10.0.1.0/24",
"pod_network": "10.244.0.0/16",
"service_network": "10.96.0.0/12"
}
}

View file

@ -1,68 +0,0 @@
{
"node_id": "node04",
"node_role": "control-plane",
"bootstrap": false,
"cluster_name": "prod-cluster",
"leader_url": "https://node01.prod.example.com:2379",
"raft_addr": "10.0.1.13:2380",
"initial_peers": [
"node01:2380",
"node02:2380",
"node03:2380"
],
"flaredb_peers": [
"node01:2480",
"node02:2480",
"node03:2480"
],
"node_ip": "10.0.1.13",
"node_fqdn": "node04.prod.example.com",
"datacenter": "dc1",
"rack": "rack2",
"description": "Additional node joining existing cluster - will contact leader to join",
"services": {
"chainfire": {
"enabled": true,
"api_port": 2379,
"raft_port": 2380,
"gossip_port": 2381
},
"flaredb": {
"enabled": true,
"api_port": 2479,
"raft_port": 2480
},
"iam": {
"enabled": true,
"api_port": 8080
},
"plasmavmc": {
"enabled": true,
"api_port": 8090
},
"novanet": {
"enabled": true,
"api_port": 8091
},
"flashdns": {
"enabled": true,
"dns_port": 53,
"api_port": 8053
},
"fiberlb": {
"enabled": true,
"api_port": 8092
}
},
"tls": {
"enabled": true,
"ca_cert_path": "/etc/nixos/secrets/ca.crt",
"node_cert_path": "/etc/nixos/secrets/node04.crt",
"node_key_path": "/etc/nixos/secrets/node04.key"
},
"network": {
"cluster_network": "10.0.1.0/24",
"pod_network": "10.244.0.0/16",
"service_network": "10.96.0.0/12"
}
}

View file

@ -19,6 +19,9 @@ SERVICE_NAME="${1:-}"
HEALTH_URL="${2:-}" HEALTH_URL="${2:-}"
TIMEOUT="${3:-300}" TIMEOUT="${3:-300}"
RETRY_INTERVAL="${4:-5}" RETRY_INTERVAL="${4:-5}"
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-10}"
CURL_INSECURE="${CURL_INSECURE:-1}"
# Validate arguments # Validate arguments
if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" ]]; then if [[ -z "$SERVICE_NAME" || -z "$HEALTH_URL" ]]; then
@ -55,8 +58,12 @@ while true; do
ATTEMPT=$((ATTEMPT + 1)) ATTEMPT=$((ATTEMPT + 1))
log "INFO" "Health check attempt $ATTEMPT (elapsed: ${ELAPSED}s)" log "INFO" "Health check attempt $ATTEMPT (elapsed: ${ELAPSED}s)"
# Perform health check (allow insecure TLS for self-signed certs) # Perform health check (allow insecure TLS if configured)
HTTP_CODE=$(curl -k -s -o /dev/null -w "%{http_code}" "$HEALTH_URL" 2>/dev/null || echo "000") CURL_FLAGS=(-s -o /dev/null -w "%{http_code}" --connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
CURL_FLAGS+=(-k)
fi
HTTP_CODE=$(curl "${CURL_FLAGS[@]}" "$HEALTH_URL" 2>/dev/null || echo "000")
if [[ "$HTTP_CODE" == "200" ]]; then if [[ "$HTTP_CODE" == "200" ]]; then
log "INFO" "Health check passed (HTTP $HTTP_CODE)" log "INFO" "Health check passed (HTTP $HTTP_CODE)"

View file

@ -1,570 +0,0 @@
# PlasmaCloud Netboot Image Builder - Technical Overview
## Introduction
This document provides a technical overview of the PlasmaCloud NixOS Image Builder, which generates bootable netboot images for bare-metal provisioning. This is part of T032 (Bare-Metal Provisioning) and specifically implements deliverable S3 (NixOS Image Builder).
## System Architecture
### High-Level Flow
```
┌─────────────────────┐
│ Nix Flake │
│ (flake.nix) │
└──────────┬──────────┘
├─── nixosConfigurations
│ ├── netboot-control-plane
│ ├── netboot-worker
│ └── netboot-all-in-one
├─── packages (T024)
│ ├── chainfire-server
│ ├── flaredb-server
│ └── ... (8 services)
└─── modules (T024)
├── chainfire.nix
├── flaredb.nix
└── ... (8 modules)
Build Process
┌─────────────────────┐
│ build-images.sh │
└──────────┬──────────┘
├─── nix build netbootRamdisk
├─── nix build kernel
└─── copy to artifacts/
Output
┌─────────────────────┐
│ Netboot Artifacts │
├─────────────────────┤
│ bzImage (kernel) │
│ initrd (ramdisk) │
│ netboot.ipxe │
└─────────────────────┘
├─── PXE Server
│ (HTTP/TFTP)
└─── Target Machine
(PXE Boot)
```
## Component Breakdown
### 1. Netboot Configurations
Located in `nix/images/`, these NixOS configurations define the netboot environment:
#### `netboot-base.nix`
**Purpose**: Common base configuration for all profiles
**Key Features**:
- Extends `netboot-minimal.nix` from nixpkgs
- SSH server with root login (key-based only)
- Generic kernel with broad hardware support
- Disk management tools (disko, parted, cryptsetup, lvm2)
- Network configuration (DHCP, predictable interface names)
- Serial console support (ttyS0, tty0)
- Minimal system (no docs, no sound)
**Package Inclusions**:
```nix
disko, parted, gptfdisk # Disk management
cryptsetup, lvm2 # Encryption and LVM
e2fsprogs, xfsprogs # Filesystem tools
iproute2, curl, tcpdump # Network tools
vim, tmux, htop # System tools
```
**Kernel Configuration**:
```nix
boot.kernelPackages = pkgs.linuxPackages_latest;
boot.kernelParams = [
"console=ttyS0,115200"
"console=tty0"
"loglevel=4"
];
```
#### `netboot-control-plane.nix`
**Purpose**: Full control plane deployment
**Imports**:
- `netboot-base.nix` (base configuration)
- `../modules` (PlasmaCloud service modules)
**Service Inclusions**:
- Chainfire (ports 2379, 2380, 2381)
- FlareDB (ports 2479, 2480)
- IAM (port 8080)
- PlasmaVMC (port 8090)
- PrismNET (port 8091)
- FlashDNS (DNS port 53, API port 8053)
- FiberLB (port 8092)
- LightningStor (port 8093)
- K8sHost (port 10250)
**Service State**: All services **disabled** by default via `lib.mkDefault false`
**Resource Limits** (for netboot environment):
```nix
MemoryMax = "512M"
CPUQuota = "50%"
```
#### `netboot-worker.nix`
**Purpose**: Compute-focused worker nodes
**Imports**:
- `netboot-base.nix`
- `../modules`
**Service Inclusions**:
- PlasmaVMC (VM management)
- PrismNET (SDN)
**Additional Features**:
- KVM virtualization support
- Open vSwitch for SDN
- QEMU and libvirt tools
- Optimized sysctl for VM workloads
**Performance Tuning**:
```nix
"fs.file-max" = 1000000;
"net.ipv4.ip_forward" = 1;
"net.core.netdev_max_backlog" = 5000;
```
#### `netboot-all-in-one.nix`
**Purpose**: Single-node deployment with all services
**Imports**:
- `netboot-base.nix`
- `../modules`
**Combines**: All features from control-plane + worker
**Use Cases**:
- Development environments
- Small deployments
- Edge locations
- POC installations
### 2. Flake Integration
The main `flake.nix` exposes netboot configurations:
```nix
nixosConfigurations = {
netboot-control-plane = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/netboot-control-plane.nix ];
};
netboot-worker = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/netboot-worker.nix ];
};
netboot-all-in-one = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/netboot-all-in-one.nix ];
};
};
```
### 3. Build Script
`build-images.sh` orchestrates the build process:
**Workflow**:
1. Parse command-line arguments (--profile, --output-dir)
2. Create output directories
3. For each profile:
- Build netboot ramdisk: `nix build ...netbootRamdisk`
- Build kernel: `nix build ...kernel`
- Copy artifacts (bzImage, initrd)
- Generate iPXE boot script
- Calculate and display sizes
4. Verify outputs (file existence, size sanity checks)
5. Copy to PXE server (if available)
6. Print summary
**Build Commands**:
```bash
nix build .#nixosConfigurations.netboot-$profile.config.system.build.netbootRamdisk
nix build .#nixosConfigurations.netboot-$profile.config.system.build.kernel
```
**Output Structure**:
```
artifacts/
├── control-plane/
│ ├── bzImage # ~10-30 MB
│ ├── initrd # ~100-300 MB
│ ├── netboot.ipxe # iPXE script
│ ├── build.log # Build log
│ ├── initrd-link # Nix result symlink
│ └── kernel-link # Nix result symlink
├── worker/
│ └── ... (same structure)
└── all-in-one/
└── ... (same structure)
```
## Integration Points
### T024 NixOS Modules
The netboot configurations leverage T024 service modules:
**Module Structure** (example: chainfire.nix):
```nix
{
options.services.chainfire = {
enable = lib.mkEnableOption "chainfire service";
port = lib.mkOption { ... };
raftPort = lib.mkOption { ... };
package = lib.mkOption { ... };
};
config = lib.mkIf cfg.enable {
users.users.chainfire = { ... };
systemd.services.chainfire = { ... };
};
}
```
**Package Availability**:
```nix
# In netboot-control-plane.nix
environment.systemPackages = with pkgs; [
chainfire-server # From flake overlay
flaredb-server # From flake overlay
# ...
];
```
### T032.S2 PXE Infrastructure
The build script integrates with the PXE server:
**Copy Workflow**:
```bash
# Build script copies to:
chainfire/baremetal/pxe-server/assets/nixos/
├── control-plane/
│ ├── bzImage
│ └── initrd
├── worker/
│ ├── bzImage
│ └── initrd
└── all-in-one/
├── bzImage
└── initrd
```
**iPXE Boot Script** (generated):
```ipxe
#!ipxe
kernel ${boot-server}/control-plane/bzImage init=<toplevel>/init console=ttyS0,115200
initrd ${boot-server}/control-plane/initrd
boot
```
The build script substitutes `<toplevel>` with the configuration's resolved system toplevel store path. iPXE passes the kernel command line through literally, so a `/nix/store/*/init` glob would never be expanded — the concrete path must be baked into the script at build time.
## Build Process Deep Dive
### NixOS Netboot Build Internals
1. **netboot-minimal.nix** (from nixpkgs):
- Provides base netboot functionality
- Configures initrd with kexec support
- Sets up squashfs for Nix store
2. **Our Extensions**:
- Add PlasmaCloud service packages
- Configure SSH for nixos-anywhere
- Include provisioning tools (disko, etc.)
- Customize kernel and modules
3. **Build Outputs**:
- **bzImage**: Compressed Linux kernel
- **initrd**: Squashfs-compressed initial ramdisk containing:
- Minimal NixOS system
- Nix store with service packages
- Init scripts for booting
### Size Optimization Strategies
**Current Optimizations**:
```nix
documentation.enable = false; # -50MB
documentation.nixos.enable = false; # -20MB
i18n.supportedLocales = [ "en_US.UTF-8/UTF-8" ]; # -100MB
```
**Additional Strategies** (if needed):
- Use `linuxPackages_hardened` (smaller kernel)
- Remove unused kernel modules
- Compress with xz instead of gzip
- On-demand package fetching from HTTP substituter
**Expected Sizes**:
- **Control Plane**: ~250-350 MB (initrd)
- **Worker**: ~150-250 MB (initrd)
- **All-in-One**: ~300-400 MB (initrd)
## Boot Flow
### From PXE to Running System
```
1. PXE Boot
├─ DHCP discovers boot server
├─ TFTP loads iPXE binary
└─ iPXE executes boot script
2. Netboot Download
├─ HTTP downloads bzImage (~20MB)
├─ HTTP downloads initrd (~200MB)
└─ kexec into NixOS installer
3. NixOS Installer (in RAM)
├─ Init system starts
├─ Network configuration (DHCP)
├─ SSH server starts
└─ Ready for nixos-anywhere
4. Installation (nixos-anywhere)
├─ SSH connection established
├─ Disk partitioning (disko)
├─ NixOS system installation
├─ Secret injection
└─ Bootloader installation
5. First Boot (from disk)
├─ GRUB/systemd-boot loads
├─ Services start (enabled)
├─ Cluster join (if configured)
└─ Running PlasmaCloud node
```
## Customization Guide
### Adding a New Service
**Step 1**: Create NixOS module
```nix
# nix/modules/myservice.nix
{ config, lib, pkgs, ... }:
{
options.services.myservice = {
enable = lib.mkEnableOption "myservice";
};
config = lib.mkIf cfg.enable {
systemd.services.myservice = { ... };
};
}
```
**Step 2**: Add to flake packages
```nix
# flake.nix
packages.myservice-server = buildRustWorkspace { ... };
```
**Step 3**: Include in netboot profile
```nix
# nix/images/netboot-control-plane.nix
environment.systemPackages = with pkgs; [
myservice-server
];
services.myservice = {
enable = lib.mkDefault false;
};
```
### Creating a Custom Profile
**Step 1**: Create new netboot configuration
```nix
# nix/images/netboot-custom.nix
{ config, pkgs, lib, ... }:
{
imports = [
./netboot-base.nix
../modules
];
# Your customizations
environment.systemPackages = [ ... ];
}
```
**Step 2**: Add to flake
```nix
# flake.nix
nixosConfigurations.netboot-custom = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/netboot-custom.nix ];
};
```
**Step 3**: Update build script
```bash
# build-images.sh
profiles_to_build=("control-plane" "worker" "all-in-one" "custom")
```
## Security Model
### Netboot Phase
**Risk**: Netboot image has root SSH access enabled
**Mitigations**:
1. **Key-based authentication only** (no passwords)
2. **Isolated provisioning VLAN**
3. **MAC address whitelist in DHCP**
4. **Firewall disabled only during install**
### Post-Installation
Services remain disabled until final configuration enables them:
```nix
# In installed system configuration
services.chainfire.enable = true; # Overrides lib.mkDefault false
```
### Secret Management
Secrets are **NOT** embedded in netboot images:
```nix
# During nixos-anywhere installation:
scp secrets/* root@target:/tmp/secrets/
# Installed system references:
services.chainfire.settings.tls = {
cert_path = "/etc/nixos/secrets/tls-cert.pem";
};
```
## Performance Characteristics
### Build Times
- **First build**: 30-60 minutes (downloads all dependencies)
- **Incremental builds**: 5-15 minutes (reuses cached artifacts)
- **With local cache**: 2-5 minutes
### Network Requirements
- **Initial download**: ~2GB (nixpkgs + dependencies)
- **Netboot download**: ~200-400MB per node
- **Installation**: ~500MB-2GB (depending on services)
### Hardware Requirements
**Build Machine**:
- CPU: 4+ cores recommended
- RAM: 8GB minimum, 16GB recommended
- Disk: 50GB free space
- Network: Broadband connection
**Target Machine**:
- RAM: 4GB minimum for netboot (8GB+ for production)
- Network: PXE boot support, DHCP
- Disk: Depends on disko configuration
## Testing Strategy
### Verification Steps
1. **Syntax Validation**:
```bash
nix flake check
```
2. **Build Test**:
```bash
./build-images.sh --profile control-plane
```
3. **Artifact Verification**:
```bash
file artifacts/control-plane/bzImage # Should be Linux kernel
file artifacts/control-plane/initrd # Should be compressed data
```
4. **PXE Boot Test**:
- Boot VM from netboot image
- Verify SSH access
- Check available tools (disko, parted, etc.)
5. **Installation Test**:
- Run nixos-anywhere on test target
- Verify successful installation
- Check service availability
## Troubleshooting Matrix
| Symptom | Possible Cause | Solution |
|---------|---------------|----------|
| Build fails | Missing flakes | Enable experimental-features |
| Large initrd | Too many packages | Remove unused packages |
| SSH fails | Wrong SSH key | Update authorized_keys |
| Boot hangs | Wrong kernel params | Check console= settings |
| No network | DHCP issues | Verify useDHCP = true |
| Service missing | Package not built | Check flake overlay |
## Future Enhancements
### Planned Improvements
1. **Image Variants**:
- Minimal installer (no services)
- Debug variant (with extra tools)
- Rescue mode (for recovery)
2. **Build Optimizations**:
- Parallel profile builds
- Incremental rebuild detection
- Binary cache integration
3. **Security Enhancements**:
- Per-node SSH keys
- TPM-based secrets
- Measured boot support
4. **Monitoring**:
- Build metrics collection
- Size trend tracking
- Performance benchmarking
## References
- **NixOS Netboot**: https://nixos.wiki/wiki/Netboot
- **nixos-anywhere**: https://github.com/nix-community/nixos-anywhere
- **disko**: https://github.com/nix-community/disko
- **T032 Design**: `docs/por/T032-baremetal-provisioning/design.md`
- **T024 Modules**: `nix/modules/`
## Revision History
| Version | Date | Author | Changes |
|---------|------|--------|---------|
| 1.0 | 2025-12-10 | T032.S3 | Initial implementation |

View file

@ -1,388 +0,0 @@
# PlasmaCloud NixOS Image Builder
This directory contains tools and configurations for building bootable NixOS netboot images for bare-metal provisioning of PlasmaCloud infrastructure.
## Overview
The NixOS Image Builder generates netboot images (kernel + initrd) that can be served via PXE/iPXE to provision bare-metal servers with PlasmaCloud services. These images integrate with the T024 NixOS service modules and the T032.S2 PXE boot infrastructure.
## Architecture
The image builder produces three deployment profiles:
### 1. Control Plane (`netboot-control-plane`)
Full control plane deployment with all 8 PlasmaCloud services:
- **Chainfire**: Distributed configuration and coordination
- **FlareDB**: Time-series metrics and events database
- **IAM**: Identity and access management
- **PlasmaVMC**: Virtual machine control plane
- **PrismNET**: Software-defined networking controller
- **FlashDNS**: High-performance DNS server
- **FiberLB**: Layer 4/7 load balancer
- **LightningStor**: Distributed block storage
- **K8sHost**: Kubernetes hosting component
**Use Cases**:
- Multi-node production clusters (3+ control plane nodes)
- High-availability deployments
- Separation of control and data planes
### 2. Worker (`netboot-worker`)
Compute-focused deployment for running tenant workloads:
- **PlasmaVMC**: Virtual machine control plane
- **PrismNET**: Software-defined networking
**Use Cases**:
- Worker nodes in multi-node clusters
- Dedicated compute capacity
- Scalable VM hosting
### 3. All-in-One (`netboot-all-in-one`)
Single-node deployment with all 8 services:
- All services from Control Plane profile
- Optimized for single-node operation
**Use Cases**:
- Development/testing environments
- Small deployments (1-3 nodes)
- Edge locations
- Proof-of-concept installations
## Prerequisites
### Build Environment
- **NixOS** or **Nix package manager** installed
- **Flakes** enabled in Nix configuration
- **Git** access to PlasmaCloud repository
- **Sufficient disk space**: ~10GB for build artifacts
### Enable Nix Flakes
If not already enabled, add to `/etc/nix/nix.conf` or `~/.config/nix/nix.conf`:
```
experimental-features = nix-command flakes
```
### Build Dependencies
The build process automatically handles all dependencies, but ensure you have:
- Working internet connection (for Nix binary cache)
- ~4GB RAM minimum
- ~10GB free disk space
## Build Instructions
### Quick Start
Build all profiles:
```bash
cd baremetal/image-builder   # run from the repository root
./build-images.sh
```
Build a specific profile:
```bash
# Control plane only
./build-images.sh --profile control-plane
# Worker nodes only
./build-images.sh --profile worker
# All-in-one deployment
./build-images.sh --profile all-in-one
```
Custom output directory:
```bash
./build-images.sh --output-dir /srv/pxe/images
```
### Build Output
Each profile generates:
- `bzImage` - Linux kernel (~10-30 MB)
- `initrd` - Initial ramdisk (~100-300 MB)
- `netboot.ipxe` - iPXE boot script
- `build.log` - Build log for troubleshooting
Artifacts are placed in:
```
./artifacts/
├── control-plane/
│ ├── bzImage
│ ├── initrd
│ ├── netboot.ipxe
│ └── build.log
├── worker/
│ ├── bzImage
│ ├── initrd
│ ├── netboot.ipxe
│ └── build.log
└── all-in-one/
├── bzImage
├── initrd
├── netboot.ipxe
└── build.log
```
### Manual Build Commands
You can also build images directly with Nix:
```bash
# Build initrd
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.netbootRamdisk
# Build kernel
nix build .#nixosConfigurations.netboot-control-plane.config.system.build.kernel
# Access artifacts
ls -lh result/
```
## Deployment
### Integration with PXE Server (T032.S2)
The build script automatically copies artifacts to the PXE server directory if it exists:
```
chainfire/baremetal/pxe-server/assets/nixos/
├── control-plane/
├── worker/
├── all-in-one/
├── bzImage-control-plane -> control-plane/bzImage
├── initrd-control-plane -> control-plane/initrd
├── bzImage-worker -> worker/bzImage
└── initrd-worker -> worker/initrd
```
### Manual Deployment
Copy artifacts to your PXE/HTTP server:
```bash
# Example: Deploy to nginx serving directory
sudo cp -r ./artifacts/control-plane /srv/pxe/nixos/
sudo cp -r ./artifacts/worker /srv/pxe/nixos/
sudo cp -r ./artifacts/all-in-one /srv/pxe/nixos/
```
### iPXE Boot Configuration
Reference the images in your iPXE boot script:
```ipxe
#!ipxe
set boot-server 10.0.0.2:8080
:control-plane
kernel http://${boot-server}/nixos/control-plane/bzImage init=<toplevel>/init console=ttyS0,115200 console=tty0 loglevel=4
initrd http://${boot-server}/nixos/control-plane/initrd
boot
:worker
kernel http://${boot-server}/nixos/worker/bzImage init=<toplevel>/init console=ttyS0,115200 console=tty0 loglevel=4
initrd http://${boot-server}/nixos/worker/initrd
boot
```
Use the concrete init path from the generated `netboot.ipxe` for `<toplevel>` — iPXE does not expand `/nix/store/*` wildcards, so the resolved store path must appear verbatim.
## Customization
### Adding Services
To add a service to a profile, edit the corresponding configuration:
```nix
# nix/images/netboot-control-plane.nix
environment.systemPackages = with pkgs; [
chainfire-server
flaredb-server
# ... existing services ...
my-custom-service # Add your service
];
```
### Custom Kernel Configuration
Modify `nix/images/netboot-base.nix`:
```nix
boot.kernelPackages = pkgs.linuxPackages_6_6; # Specific kernel version
boot.kernelModules = [ "my-driver" ]; # Additional modules
boot.kernelParams = [ "my-param=value" ]; # Additional kernel parameters
```
### Additional Packages
Add packages to the netboot environment:
```nix
# nix/images/netboot-base.nix
environment.systemPackages = with pkgs; [
# ... existing packages ...
# Your additions
python3
nodejs
custom-tool
];
```
### Hardware-Specific Configuration
See `examples/hardware-specific.nix` for hardware-specific customizations.
## Troubleshooting
### Build Failures
**Symptom**: Build fails with Nix errors
**Solutions**:
1. Check build log: `cat artifacts/PROFILE/build.log`
2. Verify Nix flakes are enabled
3. Update nixpkgs: `nix flake update`
4. Clear Nix store cache: `nix-collect-garbage -d`
### Missing Service Packages
**Symptom**: Error: "package not found"
**Solutions**:
1. Verify service is built: `nix build .#chainfire-server`
2. Check flake overlay: `nix flake show`
3. Rebuild all packages: `nix build .#default`
### Image Too Large
**Symptom**: Initrd > 500 MB
**Solutions**:
1. Remove unnecessary packages from `environment.systemPackages`
2. Disable documentation (already done in base config)
3. Use a smaller kernel variant, e.g. `boot.kernelPackages = pkgs.linuxPackages_hardened`
### PXE Boot Fails
**Symptom**: Server fails to boot netboot image
**Solutions**:
1. Verify artifacts are accessible via HTTP
2. Check iPXE script syntax
3. Verify kernel parameters in boot script
4. Check serial console output (ttyS0)
5. Ensure DHCP provides correct boot server IP
### SSH Access Issues
**Symptom**: Cannot SSH to netboot installer
**Solutions**:
1. Replace example SSH key in `nix/images/netboot-base.nix`
2. Verify network connectivity (DHCP, firewall)
3. Check SSH service is running: `systemctl status sshd`
## Configuration Reference
### Service Modules (T024 Integration)
All netboot profiles import PlasmaCloud service modules from `nix/modules/`:
- `chainfire.nix` - Chainfire configuration
- `flaredb.nix` - FlareDB configuration
- `iam.nix` - IAM configuration
- `plasmavmc.nix` - PlasmaVMC configuration
- `prismnet.nix` - PrismNET configuration
- `flashdns.nix` - FlashDNS configuration
- `fiberlb.nix` - FiberLB configuration
- `lightningstor.nix` - LightningStor configuration
- `k8shost.nix` - K8sHost configuration
Services are **disabled by default** in netboot images and enabled in final installed configurations.
### Netboot Base Configuration
Located at `nix/images/netboot-base.nix`, provides:
- SSH server with root access (key-based)
- Generic kernel with broad hardware support
- Disk management tools (disko, parted, cryptsetup, lvm2)
- Network tools (iproute2, curl, tcpdump)
- Serial console support (ttyS0, tty0)
- DHCP networking
- Minimal system configuration
### Profile Configurations
- `nix/images/netboot-control-plane.nix` - All 8 services
- `nix/images/netboot-worker.nix` - Compute services (PlasmaVMC, PrismNET)
- `nix/images/netboot-all-in-one.nix` - All services for single-node
## Security Considerations
### SSH Keys
**IMPORTANT**: The default SSH key in `netboot-base.nix` is an example placeholder. You MUST replace it with your actual provisioning key:
```nix
users.users.root.openssh.authorizedKeys.keys = [
"ssh-ed25519 AAAAC3Nza... your-provisioning-key@host"
];
```
Generate a new key:
```bash
ssh-keygen -t ed25519 -C "provisioning@plasmacloud"
```
### Network Security
- Netboot images have **firewall disabled** for installation phase
- Use isolated provisioning VLAN for PXE boot
- Implement MAC address whitelist in DHCP
- Enable firewall in final installed configurations
### Secrets Management
- Do NOT embed secrets in netboot images
- Use nixos-anywhere to inject secrets during installation
- Store secrets in `/etc/nixos/secrets/` on installed systems
- Use proper file permissions (0400 for keys)
## Next Steps
After building images:
1. **Deploy to PXE Server**: Copy artifacts to HTTP server
2. **Configure DHCP/iPXE**: Set up boot infrastructure (see T032.S2)
3. **Prepare Node Configurations**: Create per-node configs for nixos-anywhere
4. **Test Boot Process**: Verify PXE boot on test hardware
5. **Run nixos-anywhere**: Install NixOS on target machines
## Resources
- **Design Document**: `docs/por/T032-baremetal-provisioning/design.md`
- **PXE Infrastructure**: `chainfire/baremetal/pxe-server/`
- **Service Modules**: `nix/modules/`
- **Example Configurations**: `baremetal/image-builder/examples/`
## Support
For issues or questions:
1. Check build logs: `artifacts/PROFILE/build.log`
2. Review design document: `docs/por/T032-baremetal-provisioning/design.md`
3. Examine example configurations: `examples/`
4. Verify service module configuration: `nix/modules/`
## License
Apache 2.0 - See LICENSE file for details

View file

@ -77,7 +77,7 @@ Build NixOS netboot images for PlasmaCloud bare-metal provisioning.
OPTIONS: OPTIONS:
--profile PROFILE Build specific profile: --profile PROFILE Build specific profile:
- control-plane: All 8 PlasmaCloud services - control-plane: All 8 PlasmaCloud services
- worker: Compute-focused services (PlasmaVMC, NovaNET) - worker: Compute-focused services (PlasmaVMC, PrismNET)
- all-in-one: All services for single-node deployment - all-in-one: All services for single-node deployment
- all: Build all profiles (default) - all: Build all profiles (default)
@ -97,7 +97,7 @@ EXAMPLES:
PROFILES: PROFILES:
control-plane - Full control plane with all 8 services control-plane - Full control plane with all 8 services
worker - Worker node with PlasmaVMC and NovaNET worker - Worker node with PlasmaVMC and PrismNET
all-in-one - Single-node deployment with all services all-in-one - Single-node deployment with all services
OUTPUT: OUTPUT:
@ -141,6 +141,16 @@ build_profile() {
cp -f "$profile_dir/initrd-link/initrd" "$profile_dir/initrd" cp -f "$profile_dir/initrd-link/initrd" "$profile_dir/initrd"
cp -f "$profile_dir/kernel-link/bzImage" "$profile_dir/bzImage" cp -f "$profile_dir/kernel-link/bzImage" "$profile_dir/bzImage"
# Resolve init path from the build (avoids hardcoding store paths)
local init_path="/init"
if toplevel=$(nix eval --raw "$REPO_ROOT#nixosConfigurations.netboot-$profile.config.system.build.toplevel" 2>/dev/null); then
if [ -n "$toplevel" ]; then
init_path="${toplevel}/init"
fi
else
print_warning "Failed to resolve init path for $profile; using /init"
fi
# Generate iPXE boot script # Generate iPXE boot script
print_info " Generating iPXE boot script..." print_info " Generating iPXE boot script..."
cat > "$profile_dir/netboot.ipxe" << EOF cat > "$profile_dir/netboot.ipxe" << EOF
@ -159,7 +169,7 @@ echo Initrd: initrd
echo echo
# Load kernel and initrd # Load kernel and initrd
kernel \${boot-server}/$profile/bzImage init=/nix/store/*/init console=ttyS0,115200 console=tty0 loglevel=4 kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4
initrd \${boot-server}/$profile/initrd initrd \${boot-server}/$profile/initrd
# Boot # Boot

View file

@ -1,361 +0,0 @@
{ config, pkgs, lib, ... }:

# ==============================================================================
# CUSTOM NETBOOT CONFIGURATION EXAMPLE
# ==============================================================================
# This example demonstrates how to create a custom netboot configuration with:
#   - Custom kernel version and modules
#   - Additional packages for specialized use cases
#   - Hardware-specific drivers
#   - Custom network configuration
#   - Debugging tools
#
# Usage:
#   1. Copy this file to nix/images/netboot-custom.nix
#   2. Add to flake.nix:
#        nixosConfigurations.netboot-custom = nixpkgs.lib.nixosSystem {
#          system = "x86_64-linux";
#          modules = [ ./nix/images/netboot-custom.nix ];
#        };
#   3. Build: ./build-images.sh --profile custom
# ==============================================================================

{
  imports = [
    ../netboot-base.nix   # Adjust path as needed
    ../../modules         # PlasmaCloud service modules
  ];

  # ============================================================================
  # CUSTOM KERNEL CONFIGURATION
  # ============================================================================

  # Use specific kernel version instead of latest
  boot.kernelPackages = pkgs.linuxPackages_6_6; # LTS kernel

  # Add custom kernel modules for specialized hardware
  boot.kernelModules = [
    # Infiniband/RDMA support
    "ib_core"
    "ib_uverbs"
    "mlx5_core"
    "mlx5_ib"

    # GPU support (for GPU compute nodes)
    "nvidia"
    "nvidia_uvm"

    # Custom storage controller
    "megaraid_sas"
    "mpt3sas"
  ];

  # Custom kernel parameters.
  #
  # NOTE: an option may only be assigned once per module attrset — a second
  # `boot.kernelParams = ...;` in the same file is an evaluation error
  # ("attribute 'boot.kernelParams' already defined"). The base parameters and
  # the verbose-boot debugging parameters are therefore merged here with
  # lib.mkMerge. lib.mkAfter keeps "loglevel=7 debug" at the end of the
  # command line, so the kernel's last-wins parsing overrides "loglevel=4".
  boot.kernelParams = lib.mkMerge [
    [
      # Default console configuration
      "console=ttyS0,115200"
      "console=tty0"
      "loglevel=4"

      # Custom parameters
      "intel_iommu=on"  # Enable IOMMU for PCI passthrough
      "iommu=pt"        # Passthrough mode
      "hugepagesz=2M"   # 2MB hugepages
      "hugepages=1024"  # Allocate 1024 hugepages (2GB)
      "isolcpus=2-7"    # CPU isolation for real-time workloads
    ]

    # Verbose boot logging for debugging (see DEBUGGING AND LOGGING below).
    (lib.mkAfter [ "loglevel=7" "debug" ])
  ];

  # Blacklist problematic modules
  boot.blacklistedKernelModules = [
    "nouveau" # Disable nouveau if using proprietary NVIDIA
    "i915"    # Disable Intel GPU if not needed
  ];

  # ============================================================================
  # ADDITIONAL PACKAGES
  # ============================================================================
  environment.systemPackages = with pkgs; [
    # Networking diagnostics
    iperf3        # Network performance testing
    mtr           # Network diagnostic tool
    nmap          # Network scanner
    wireshark-cli # Packet analyzer

    # Storage tools
    nvme-cli      # NVMe management
    smartmontools # SMART monitoring
    fio           # I/O performance testing
    sg3_utils     # SCSI utilities

    # Hardware diagnostics
    pciutils      # lspci
    usbutils      # lsusb
    dmidecode     # Hardware information
    lshw          # Hardware lister
    hwinfo        # Hardware info tool

    # Debugging tools
    strace        # System call tracer
    ltrace        # Library call tracer
    gdb           # GNU debugger
    valgrind      # Memory debugger

    # Performance tools
    perf          # Linux perf tool
    bpftrace      # eBPF tracing
    sysstat       # System statistics (sar, iostat)

    # Container/virtualization tools
    qemu_full     # Full QEMU with all features
    libvirt       # Virtualization management
    virt-manager  # VM management (CLI)
    docker        # Container runtime
    podman        # Alternative container runtime

    # Development tools (for on-site debugging)
    python3Full   # Python with all modules
    python3Packages.pip
    nodejs        # Node.js runtime
    git           # Version control
    gcc           # C compiler
    rustc         # Rust compiler
    cargo         # Rust package manager

    # Custom tools
    # Add your organization's custom packages here
  ];

  # ============================================================================
  # CUSTOM NETWORK CONFIGURATION
  # ============================================================================

  # Static IP instead of DHCP (example)
  networking.useDHCP = lib.mkForce false;
  networking.interfaces.eth0 = {
    useDHCP = false;
    ipv4.addresses = [{
      address = "10.0.1.100";
      prefixLength = 24;
    }];
    # Enable jumbo frames
    mtu = 9000;
  };
  networking.defaultGateway = "10.0.1.1";
  networking.nameservers = [ "10.0.1.1" "8.8.8.8" ];

  # Custom DNS domain
  networking.domain = "custom.example.com";

  # ============================================================================
  # CUSTOM SSH CONFIGURATION
  # ============================================================================

  # Multiple SSH keys for different operators
  users.users.root.openssh.authorizedKeys.keys = [
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOperator1Key operator1@example.com"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOperator2Key operator2@example.com"
    "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOperator3Key operator3@example.com"
  ];

  # Custom SSH port (for security through obscurity - not recommended for production)
  # services.openssh.ports = [ 2222 ];

  # ============================================================================
  # CUSTOM SERVICES
  # ============================================================================

  # Enable only specific PlasmaCloud services
  services.plasmavmc = {
    enable = lib.mkDefault false;
    port = 8081;
  };
  services.prismnet = {
    enable = lib.mkDefault false;
    port = 8082;
  };

  # ============================================================================
  # DEBUGGING AND LOGGING
  # ============================================================================

  # Verbose boot logging ("loglevel=7" "debug") is appended to
  # boot.kernelParams above via lib.mkAfter, because the option may only be
  # assigned once per module.

  # Enable systemd debug logging on the serial console getty
  systemd.services."serial-getty@ttyS0".environment = {
    SYSTEMD_LOG_LEVEL = "debug";
  };

  # Enable additional logging
  services.journald.extraConfig = ''
    Storage=persistent
    MaxRetentionSec=7day
    SystemMaxUse=1G
  '';

  # ============================================================================
  # PERFORMANCE TUNING
  # ============================================================================

  # Custom sysctl settings for high-performance networking
  boot.kernel.sysctl = {
    # Network buffer sizes
    "net.core.rmem_max" = 268435456;    # 256 MB
    "net.core.wmem_max" = 268435456;    # 256 MB
    "net.core.rmem_default" = 67108864; # 64 MB
    "net.core.wmem_default" = 67108864; # 64 MB

    # TCP tuning
    "net.ipv4.tcp_rmem" = "4096 87380 134217728";
    "net.ipv4.tcp_wmem" = "4096 65536 134217728";
    "net.ipv4.tcp_congestion_control" = "bbr";

    # Connection tracking
    "net.netfilter.nf_conntrack_max" = 1048576;

    # File descriptor limits
    "fs.file-max" = 2097152;

    # Virtual memory
    "vm.swappiness" = 1;
    "vm.vfs_cache_pressure" = 50;
    "vm.dirty_ratio" = 10;
    "vm.dirty_background_ratio" = 5;

    # Kernel
    "kernel.pid_max" = 4194304;
  };

  # Increase systemd limits
  systemd.extraConfig = ''
    DefaultLimitNOFILE=1048576
    DefaultLimitNPROC=1048576
  '';

  # ============================================================================
  # HARDWARE-SPECIFIC CONFIGURATION
  # ============================================================================

  # Enable CPU microcode updates
  hardware.cpu.intel.updateMicrocode = true;
  hardware.cpu.amd.updateMicrocode = true;

  # Enable firmware updates
  hardware.enableRedistributableFirmware = true;

  # GPU support (example for NVIDIA)
  # Uncomment if using NVIDIA GPUs
  # hardware.nvidia.modesetting.enable = true;
  # services.xserver.videoDrivers = [ "nvidia" ];

  # ============================================================================
  # CUSTOM INITIALIZATION
  # ============================================================================

  # Run custom script on boot
  systemd.services.custom-init = {
    description = "Custom initialization script";
    wantedBy = [ "multi-user.target" ];
    after = [ "network-online.target" ];
    wants = [ "network-online.target" ];
    serviceConfig = {
      Type = "oneshot";
      RemainAfterExit = true;
    };
    script = ''
      echo "Running custom initialization..."

      # Example: Configure network interfaces
      ${pkgs.iproute2}/bin/ip link set dev eth1 up

      # Example: Load custom kernel modules
      ${pkgs.kmod}/bin/modprobe custom_driver || true

      # Example: Call home to provisioning server
      ${pkgs.curl}/bin/curl -X POST http://provisioning.example.com/api/register \
        -d "hostname=$(hostname)" \
        -d "ip=$(${pkgs.iproute2}/bin/ip -4 addr show eth0 | grep -oP '(?<=inet\s)\d+(\.\d+){3}')" \
        || true

      echo "Custom initialization complete"
    '';
  };

  # ============================================================================
  # FIREWALL CONFIGURATION
  # ============================================================================

  # Custom firewall rules (disabled by default in netboot, but example provided)
  networking.firewall = {
    enable = lib.mkDefault false; # Disabled during provisioning

    # When enabled, allow these ports
    allowedTCPPorts = [
      22   # SSH
      8081 # PlasmaVMC
      8082 # PrismNET
    ];

    # Custom iptables rules
    extraCommands = ''
      # Allow ICMP
      iptables -A INPUT -p icmp -j ACCEPT

      # Rate limit SSH connections
      iptables -A INPUT -p tcp --dport 22 -m state --state NEW -m recent --set
      iptables -A INPUT -p tcp --dport 22 -m state --state NEW -m recent --update --seconds 60 --hitcount 4 -j DROP
    '';
  };

  # ============================================================================
  # NIX CONFIGURATION
  # ============================================================================

  # Custom binary caches
  nix.settings = {
    substituters = [
      "https://cache.nixos.org"
      "https://custom-cache.example.com" # Your organization's cache
    ];
    trusted-public-keys = [
      "cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY="
      "custom-cache.example.com:YourPublicKeyHere"
    ];

    # Build settings
    max-jobs = "auto";
    cores = 0; # Use all available cores

    # Experimental features.
    # "repl-flake" was removed from Nix (stabilized in 2.20) and now produces
    # an "unknown experimental feature" warning, so it is no longer listed.
    experimental-features = [ "nix-command" "flakes" ];
  };

  # ============================================================================
  # TIMEZONE AND LOCALE
  # ============================================================================

  # Custom timezone (instead of UTC)
  time.timeZone = lib.mkForce "America/New_York";

  # Additional locale support
  i18n.supportedLocales = [
    "en_US.UTF-8/UTF-8"
    "ja_JP.UTF-8/UTF-8" # Japanese support
  ];
  i18n.defaultLocale = "en_US.UTF-8";

  # ============================================================================
  # SYSTEM STATE VERSION
  # ============================================================================
  system.stateVersion = "24.11";
}

View file

@ -1,442 +0,0 @@
{ config, pkgs, lib, ... }:

# ==============================================================================
# HARDWARE-SPECIFIC NETBOOT CONFIGURATION EXAMPLE
# ==============================================================================
# This example demonstrates hardware-specific configurations for common
# bare-metal server platforms. Use this as a template for your specific hardware.
#
# Common Server Platforms:
#   - Dell PowerEdge (R640, R650, R750)
#   - HP ProLiant (DL360, DL380, DL560)
#   - Supermicro (X11, X12 series)
#   - Generic whitebox servers
#
# Usage:
#   1. Copy relevant sections to your netboot configuration
#   2. Adjust based on your specific hardware
#   3. Test boot on target hardware
# ==============================================================================

{
  imports = [
    ../netboot-base.nix
    ../../modules
  ];

  # ============================================================================
  # DELL POWEREDGE R640 CONFIGURATION
  # ============================================================================
  # Uncomment this section for Dell PowerEdge R640 servers
  /*
  # Hardware-specific kernel modules
  boot.initrd.availableKernelModules = [
    # Dell PERC RAID controller
    "megaraid_sas"
    # Intel X710 10GbE NIC
    "i40e"
    # NVMe drives
    "nvme"
    # Standard modules
    "ahci"
    "xhci_pci"
    "usb_storage"
    "sd_mod"
    "sr_mod"
  ];

  boot.kernelModules = [
    "kvm-intel"    # Intel VT-x
    "ipmi_devintf" # IPMI interface
    "ipmi_si"      # IPMI system interface
  ];

  # Dell-specific firmware
  hardware.enableRedistributableFirmware = true;
  hardware.cpu.intel.updateMicrocode = true;

  # Network interface naming
  # R640 typically has:
  #   - eno1, eno2: Onboard 1GbE (Intel i350)
  #   - ens1f0, ens1f1: PCIe 10GbE (Intel X710)
  networking.interfaces = {
    eno1 = { useDHCP = true; };
    ens1f0 = {
      useDHCP = false;
      mtu = 9000; # Jumbo frames for 10GbE
    };
  };

  # iDRAC/IPMI configuration
  services.freeipmi.enable = true;

  # Dell OpenManage tools (optional)
  environment.systemPackages = with pkgs; [
    ipmitool
    freeipmi
  ];
  */

  # ============================================================================
  # HP PROLIANT DL360 GEN10 CONFIGURATION
  # ============================================================================
  # Uncomment this section for HP ProLiant DL360 Gen10 servers
  /*
  boot.initrd.availableKernelModules = [
    # HP Smart Array controller
    "hpsa"
    # Broadcom/Intel NIC
    "tg3"
    "bnx2x"
    "i40e"
    # NVMe
    "nvme"
    # Standard
    "ahci"
    "xhci_pci"
    "usb_storage"
    "sd_mod"
  ];

  boot.kernelModules = [
    "kvm-intel"
    "ipmi_devintf"
    "ipmi_si"
  ];

  hardware.enableRedistributableFirmware = true;
  hardware.cpu.intel.updateMicrocode = true;

  # HP-specific tools
  environment.systemPackages = with pkgs; [
    ipmitool
    smartmontools
  ];

  # iLO/IPMI
  services.freeipmi.enable = true;
  */

  # ============================================================================
  # SUPERMICRO X11 SERIES CONFIGURATION
  # ============================================================================
  # Uncomment this section for Supermicro X11 series servers
  /*
  boot.initrd.availableKernelModules = [
    # LSI/Broadcom RAID
    "megaraid_sas"
    "mpt3sas"
    # Intel NIC (common on Supermicro)
    "igb"
    "ixgbe"
    "i40e"
    # NVMe
    "nvme"
    # Standard
    "ahci"
    "xhci_pci"
    "ehci_pci"
    "usb_storage"
    "sd_mod"
  ];

  boot.kernelModules = [
    "kvm-intel" # Or kvm-amd for AMD CPUs
    "ipmi_devintf"
    "ipmi_si"
  ];

  hardware.enableRedistributableFirmware = true;

  # CPU-specific (adjust based on your CPU)
  hardware.cpu.intel.updateMicrocode = true;
  # hardware.cpu.amd.updateMicrocode = true; # For AMD CPUs

  # IPMI configuration
  services.freeipmi.enable = true;
  environment.systemPackages = with pkgs; [
    ipmitool
    dmidecode
    smartmontools
  ];
  */

  # ============================================================================
  # GENERIC HIGH-PERFORMANCE SERVER
  # ============================================================================
  # This configuration works for most modern x86_64 servers
  boot.initrd.availableKernelModules = [
    # SATA/AHCI
    "ahci"
    "ata_piix"
    # NVMe
    "nvme"
    # USB
    "xhci_pci"
    "ehci_pci"
    "usb_storage"
    "usbhid"
    # SCSI/SAS
    "sd_mod"
    "sr_mod"
    # Common RAID controllers
    "megaraid_sas" # LSI MegaRAID
    "mpt3sas"      # LSI SAS3
    "hpsa"         # HP Smart Array
    "aacraid"      # Adaptec
    # Network
    "e1000e"       # Intel GbE
    "igb"          # Intel GbE
    "ixgbe"        # Intel 10GbE
    "i40e"         # Intel 10/25/40GbE
    "bnx2x"        # Broadcom 10GbE
    "mlx4_core"    # Mellanox ConnectX-3
    "mlx5_core"    # Mellanox ConnectX-4/5
  ];

  # Kernel modules for virtualization and IPMI/BMC access.
  #
  # NOTE: an option may only be assigned once per module attrset — a second
  # `boot.kernelModules = ...;` later in the same file is an evaluation error
  # ("attribute 'boot.kernelModules' already defined"), so the KVM and IPMI
  # modules are consolidated here.
  boot.kernelModules = [
    "kvm-intel"    # Intel VT-x
    "kvm-amd"      # AMD-V
    "ipmi_devintf" # IPMI device interface (used by ipmitool)
    "ipmi_si"      # IPMI system interface
  ];

  # Enable all firmware
  hardware.enableRedistributableFirmware = true;

  # CPU microcode (both Intel and AMD)
  hardware.cpu.intel.updateMicrocode = true;
  hardware.cpu.amd.updateMicrocode = true;

  # ============================================================================
  # NETWORK INTERFACE CONFIGURATION
  # ============================================================================
  # Predictable interface names disabled in base config, using eth0, eth1, etc.
  # For specific hardware, you may want to use biosdevname or systemd naming

  # Example: Bond configuration for redundancy
  /*
  networking.bonds.bond0 = {
    interfaces = [ "eth0" "eth1" ];
    driverOptions = {
      mode = "802.3ad"; # LACP
      xmit_hash_policy = "layer3+4";
      lacp_rate = "fast";
      miimon = "100";
    };
  };
  networking.interfaces.bond0 = {
    useDHCP = true;
    mtu = 9000;
  };
  */

  # Example: VLAN configuration
  /*
  networking.vlans = {
    vlan100 = {
      id = 100;
      interface = "eth0";
    };
    vlan200 = {
      id = 200;
      interface = "eth0";
    };
  };
  networking.interfaces.vlan100 = {
    useDHCP = false;
    ipv4.addresses = [{
      address = "10.100.1.10";
      prefixLength = 24;
    }];
  };
  */

  # ============================================================================
  # STORAGE CONFIGURATION
  # ============================================================================

  # Enable RAID support
  boot.swraid.enable = true;
  boot.swraid.mdadmConf = ''
    ARRAY /dev/md0 level=raid1 num-devices=2
  '';

  # LVM support
  services.lvm.enable = true;

  # ZFS support (if needed)
  # boot.supportedFilesystems = [ "zfs" ];
  # boot.zfs.forceImportRoot = false;

  # ============================================================================
  # KERNEL COMMAND LINE (CPU, MEMORY, AND LATENCY TUNING)
  # ============================================================================
  # All kernel parameters are consolidated into a single definition:
  # `boot.kernelParams` may only be assigned once per module attrset, and the
  # original per-section assignments (CPU-specific, hugepages, C-state limits)
  # were four separate definitions — a hard evaluation error. lib.mkMerge
  # combines the lists; lib.mkIf keeps the vendor-specific entries conditional
  # on the corresponding microcode option, exactly as before.
  boot.kernelParams = lib.mkMerge [
    # Intel-specific
    (lib.mkIf config.hardware.cpu.intel.updateMicrocode [
      "intel_pstate=active" # Use Intel P-State driver
      "intel_iommu=on"      # Enable IOMMU for VT-d
    ])

    # AMD-specific
    (lib.mkIf config.hardware.cpu.amd.updateMicrocode [
      "amd_iommu=on" # Enable IOMMU for AMD-Vi
    ])

    # Hugepages for high-performance applications (DPDK, databases)
    [
      "hugepagesz=2M"
      "hugepages=1024" # 2GB of 2MB hugepages
      "default_hugepagesz=2M"
    ]

    # Disable power management features that can cause latency.
    # NOTE: these settings prioritize performance over power efficiency;
    # remove or adjust for non-latency-sensitive workloads.
    [
      "processor.max_cstate=1"  # Limit C-states
      "intel_idle.max_cstate=1" # Limit idle states
      "idle=poll"               # Aggressive polling (high power usage!)
    ]
  ];

  # Transparent Hugepages
  boot.kernel.sysctl = {
    "vm.nr_hugepages" = 1024;
    # "vm.nr_overcommit_hugepages" = 512; # Additional hugepages if needed
  };

  # ============================================================================
  # IPMI/BMC CONFIGURATION
  # ============================================================================
  # The IPMI kernel modules (ipmi_devintf, ipmi_si) are loaded via the
  # consolidated boot.kernelModules definition above.

  # IPMI tools
  # NOTE(review): confirm `services.freeipmi` exists in the pinned nixpkgs;
  # it is not a standard upstream NixOS option name.
  services.freeipmi.enable = true;
  environment.systemPackages = with pkgs; [
    ipmitool # IPMI command-line tool
    freeipmi # Alternative IPMI tools
  ];

  # Example: Configure BMC network (usually done via IPMI)
  # Run manually: ipmitool lan set 1 ipaddr 10.0.100.10
  # Run manually: ipmitool lan set 1 netmask 255.255.255.0
  # Run manually: ipmitool lan set 1 defgw ipaddr 10.0.100.1

  # ============================================================================
  # PERFORMANCE TUNING
  # ============================================================================

  # Set CPU governor for performance
  powerManagement.cpuFreqGovernor = "performance";

  # C-state/idle kernel parameters are part of the consolidated
  # boot.kernelParams definition above.

  # ============================================================================
  # HARDWARE MONITORING
  # ============================================================================

  # Enable hardware sensors
  # services.lm_sensors.enable = true; # Uncomment if needed

  # SMART monitoring
  services.smartd = {
    enable = true;
    autodetect = true;
  };

  # ============================================================================
  # GPU CONFIGURATION (if applicable)
  # ============================================================================

  # NVIDIA GPU
  /*
  hardware.nvidia = {
    modesetting.enable = true;
    powerManagement.enable = false;
    powerManagement.finegrained = false;
    open = false;           # Use proprietary driver
    nvidiaSettings = false; # No GUI needed
  };
  services.xserver.videoDrivers = [ "nvidia" ];

  # NVIDIA Container Runtime (for GPU containers)
  hardware.nvidia-container-toolkit.enable = true;

  environment.systemPackages = with pkgs; [
    cudaPackages.cudatoolkit
    nvidia-docker
  ];
  */

  # AMD GPU
  /*
  boot.initrd.kernelModules = [ "amdgpu" ];
  services.xserver.videoDrivers = [ "amdgpu" ];
  */

  # ============================================================================
  # INFINIBAND/RDMA (for high-performance networking)
  # ============================================================================
  /*
  boot.kernelModules = [
    "ib_core"
    "ib_uverbs"
    "ib_umad"
    "rdma_cm"
    "rdma_ucm"
    "mlx5_core"
    "mlx5_ib"
  ];

  environment.systemPackages = with pkgs; [
    rdma-core
    libfabric
    # perftest # RDMA performance tests
  ];

  # Configure IPoIB (IP over InfiniBand)
  networking.interfaces.ib0 = {
    useDHCP = false;
    ipv4.addresses = [{
      address = "192.168.100.10";
      prefixLength = 24;
    }];
    mtu = 65520; # Max for IPoIB connected mode
  };
  */

  # ============================================================================
  # SYSTEM STATE VERSION
  # ============================================================================
  system.stateVersion = "24.11";
}

View file

@ -1,36 +1,22 @@
# QEMU Socket Networking VM Cluster # Legacy Baremetal VM Cluster
## Architecture `baremetal/vm-cluster` is no longer the primary local validation path.
**Topology:** 4 QEMU VMs connected via multicast socket networking (230.0.0.1:1234) Use [`nix/test-cluster`](/home/centra/cloud/nix/test-cluster/README.md) for canonical local VM validation:
**VMs:** ```bash
1. **pxe-server** (192.168.100.1) - Provides DHCP/TFTP/HTTP services nix run ./nix/test-cluster#cluster -- smoke
2. **node01** (192.168.100.11) - Cluster node ```
3. **node02** (192.168.100.12) - Cluster node
4. **node03** (192.168.100.13) - Cluster node
**Network:** All VMs share L2 segment via QEMU multicast socket (no root privileges required) This directory is kept only for the older manual T036 PXE and bare-metal style experiments.
## Files ## What remains here
- `node01.qcow2`, `node02.qcow2`, `node03.qcow2` - 100GB cluster node disks - [`pxe-server/`](/home/centra/cloud/baremetal/vm-cluster/pxe-server): older PXE server configuration
- `pxe-server.qcow2` - 20GB PXE server disk - [`legacy/`](/home/centra/cloud/baremetal/vm-cluster/legacy/README.md): archived manual deployment, validation, and ad hoc QEMU launch scripts
- `launch-pxe-server.sh` - PXE server startup script
- `launch-node01.sh`, `launch-node02.sh`, `launch-node03.sh` - Node startup scripts
- `pxe-server/` - PXE server configuration files
## MACs ## Status
- pxe-server: 52:54:00:00:00:01 - unsupported for regular development
- node01: 52:54:00:00:01:01 - not the release-validation path
- node02: 52:54:00:00:01:02 - retained only to preserve old manual experiments
- node03: 52:54:00:00:01:03
## Provisioning Flow
1. Start PXE server VM (Alpine Linux with dnsmasq)
2. Configure DHCP/TFTP/HTTP services
3. Deploy NixOS netboot artifacts
4. Start node VMs with PXE boot enabled
5. Nodes PXE boot and provision via nixos-anywhere

View file

@ -1,46 +0,0 @@
# Alpine Linux Answer File for Automated Installation
# For use with: setup-alpine -f alpine-answers.txt
# Keyboard layout
KEYMAPOPTS="us us"
# Hostname
HOSTNAMEOPTS="-n pxe-server"
# Network configuration
# eth0: multicast network (static 192.168.100.1)
# eth1: user network (DHCP for internet)
INTERFACESOPTS="auto lo
iface lo inet loopback
auto eth0
iface eth0 inet static
address 192.168.100.1
netmask 255.255.255.0
auto eth1
iface eth1 inet dhcp"
# DNS
DNSOPTS="8.8.8.8 8.8.4.4"
# Timezone
TIMEZONEOPTS="-z UTC"
# Proxy (none)
PROXYOPTS="none"
# APK mirror (auto-detect fastest)
APKREPOSOPTS="-f"
# SSH server
SSHDOPTS="-c openssh"
# NTP client
NTPOPTS="-c chrony"
# Disk mode (sys = traditional installation to disk)
DISKOPTS="-m sys /dev/vda"
# APK package cache directory (this sets the local apk cache location,
# not a package list — TODO confirm against setup-alpine docs)
APKCACHEOPTS="/var/cache/apk"

View file

@ -0,0 +1,18 @@
# Legacy Launch Scripts
These scripts are archived manual launch helpers from the older `baremetal/vm-cluster` workflow.
They are not the canonical test path and should not be used for normal validation.
Use the Nix-native harness instead:
```bash
nix run ./nix/test-cluster#cluster -- smoke
```
Notes:
- `deploy-all.sh` and `validate-cluster.sh` are preserved only for the retired PXE/manual flow
- some scripts assume local disk images or host networking setup that is no longer maintained
- Alpine-specific flows are treated as retired
- supporting artifacts such as `alpine-answers.txt` are no longer kept current

View file

@ -1,6 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# T036 VM Cluster Deployment Script # Legacy T036 VM cluster deployment script.
# Deploys all VMs via nixos-anywhere after VNC network configuration # This is a manual bare-metal/PXE path. The canonical local VM validation path
# is nix/test-cluster/run-cluster.sh.
set -euo pipefail set -euo pipefail
@ -11,6 +12,8 @@ cd "$REPO_ROOT"
echo "=== T036 VM Cluster Deployment ===" echo "=== T036 VM Cluster Deployment ==="
echo "" echo ""
echo "This path is archived. Prefer: nix run ./nix/test-cluster#cluster -- smoke"
echo ""
echo "Prerequisites:" echo "Prerequisites:"
echo " - PXE server booted and network configured (192.168.100.1)" echo " - PXE server booted and network configured (192.168.100.1)"
echo " - Node01 booted and network configured (192.168.100.11)" echo " - Node01 booted and network configured (192.168.100.11)"
@ -56,4 +59,5 @@ echo ""
echo "All VMs have been provisioned. Systems will reboot from disk." echo "All VMs have been provisioned. Systems will reboot from disk."
echo "Wait 2-3 minutes for boot, then validate cluster..." echo "Wait 2-3 minutes for boot, then validate cluster..."
echo "" echo ""
echo "Next: Run ./validate-cluster.sh" echo "Legacy next step: baremetal/vm-cluster/legacy/validate-cluster.sh"
echo "Preferred validation path: nix run ./nix/test-cluster#cluster -- smoke"

View file

@ -10,6 +10,7 @@ set -euo pipefail
# - Telnet serial console # - Telnet serial console
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DISK="${SCRIPT_DIR}/node01.qcow2" DISK="${SCRIPT_DIR}/node01.qcow2"
KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage" KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage"
INITRD="${SCRIPT_DIR}/netboot-initrd/initrd" INITRD="${SCRIPT_DIR}/netboot-initrd/initrd"
@ -37,6 +38,13 @@ if [ ! -f "$INITRD" ]; then
exit 1 exit 1
fi fi
INIT_PATH="/init"
if command -v nix >/dev/null 2>&1; then
if TOPLEVEL=$(nix eval --raw "$REPO_ROOT#nixosConfigurations.netboot-base.config.system.build.toplevel" 2>/dev/null); then
INIT_PATH="${TOPLEVEL}/init"
fi
fi
echo "============================================" echo "============================================"
echo "Launching node01 with netboot (SSH key auth)..." echo "Launching node01 with netboot (SSH key auth)..."
echo "============================================" echo "============================================"
@ -65,7 +73,7 @@ qemu-system-x86_64 \
-drive file="${DISK}",if=virtio,format=qcow2 \ -drive file="${DISK}",if=virtio,format=qcow2 \
-kernel "${KERNEL}" \ -kernel "${KERNEL}" \
-initrd "${INITRD}" \ -initrd "${INITRD}" \
-append "init=/nix/store/qj1ilfdd8fcrmz4pk282p5qdf2q0vkmh-nixos-system-nixos-kexec-26.05.20251205.f61125a/init console=ttyS0,115200 console=tty0 loglevel=4" \ -append "init=${INIT_PATH} console=ttyS0,115200 console=tty0 loglevel=4" \
-netdev vde,id=vde0,sock=/tmp/vde.sock \ -netdev vde,id=vde0,sock=/tmp/vde.sock \
-device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \ -device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \
-netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \ -netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \

View file

@ -45,7 +45,7 @@ exec qemu-system-x86_64 \
-m 16G \ -m 16G \
-drive file="$DISK",if=virtio,format=qcow2 \ -drive file="$DISK",if=virtio,format=qcow2 \
-netdev socket,mcast="$MCAST_ADDR",id=mcast0 \ -netdev socket,mcast="$MCAST_ADDR",id=mcast0 \
-device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR",romfile= \ -device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR" \
-boot order=n \ -boot order=n \
-vnc "$VNC_DISPLAY" \ -vnc "$VNC_DISPLAY" \
-serial telnet:localhost:4441,server,nowait \ -serial telnet:localhost:4441,server,nowait \

View file

@ -10,6 +10,7 @@ set -euo pipefail
# - Telnet serial console # - Telnet serial console
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DISK="${SCRIPT_DIR}/node02.qcow2" DISK="${SCRIPT_DIR}/node02.qcow2"
KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage" KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage"
INITRD="${SCRIPT_DIR}/netboot-initrd/initrd" INITRD="${SCRIPT_DIR}/netboot-initrd/initrd"
@ -37,6 +38,13 @@ if [ ! -f "$INITRD" ]; then
exit 1 exit 1
fi fi
INIT_PATH="/init"
if command -v nix >/dev/null 2>&1; then
if TOPLEVEL=$(nix eval --raw "$REPO_ROOT#nixosConfigurations.netboot-base.config.system.build.toplevel" 2>/dev/null); then
INIT_PATH="${TOPLEVEL}/init"
fi
fi
echo "============================================" echo "============================================"
echo "Launching node02 with netboot (SSH key auth)..." echo "Launching node02 with netboot (SSH key auth)..."
echo "============================================" echo "============================================"
@ -65,7 +73,7 @@ qemu-system-x86_64 \
-drive file="${DISK}",if=virtio,format=qcow2 \ -drive file="${DISK}",if=virtio,format=qcow2 \
-kernel "${KERNEL}" \ -kernel "${KERNEL}" \
-initrd "${INITRD}" \ -initrd "${INITRD}" \
-append "init=/nix/store/qj1ilfdd8fcrmz4pk282p5qdf2q0vkmh-nixos-system-nixos-kexec-26.05.20251205.f61125a/init console=ttyS0,115200 console=tty0 loglevel=4" \ -append "init=${INIT_PATH} console=ttyS0,115200 console=tty0 loglevel=4" \
-netdev vde,id=vde0,sock=/tmp/vde.sock \ -netdev vde,id=vde0,sock=/tmp/vde.sock \
-device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \ -device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \
-netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \ -netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \

View file

@ -45,7 +45,7 @@ exec qemu-system-x86_64 \
-m 16G \ -m 16G \
-drive file="$DISK",if=virtio,format=qcow2 \ -drive file="$DISK",if=virtio,format=qcow2 \
-netdev socket,mcast="$MCAST_ADDR",id=mcast0 \ -netdev socket,mcast="$MCAST_ADDR",id=mcast0 \
-device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR",romfile= \ -device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR" \
-boot order=n \ -boot order=n \
-vnc "$VNC_DISPLAY" \ -vnc "$VNC_DISPLAY" \
-serial telnet:localhost:4442,server,nowait \ -serial telnet:localhost:4442,server,nowait \

View file

@ -10,6 +10,7 @@ set -euo pipefail
# - Telnet serial console # - Telnet serial console
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
DISK="${SCRIPT_DIR}/node03.qcow2" DISK="${SCRIPT_DIR}/node03.qcow2"
KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage" KERNEL="${SCRIPT_DIR}/netboot-kernel/bzImage"
INITRD="${SCRIPT_DIR}/netboot-initrd/initrd" INITRD="${SCRIPT_DIR}/netboot-initrd/initrd"
@ -37,6 +38,13 @@ if [ ! -f "$INITRD" ]; then
exit 1 exit 1
fi fi
INIT_PATH="/init"
if command -v nix >/dev/null 2>&1; then
if TOPLEVEL=$(nix eval --raw "$REPO_ROOT#nixosConfigurations.netboot-base.config.system.build.toplevel" 2>/dev/null); then
INIT_PATH="${TOPLEVEL}/init"
fi
fi
echo "============================================" echo "============================================"
echo "Launching node03 with netboot (SSH key auth)..." echo "Launching node03 with netboot (SSH key auth)..."
echo "============================================" echo "============================================"
@ -65,7 +73,7 @@ qemu-system-x86_64 \
-drive file="${DISK}",if=virtio,format=qcow2 \ -drive file="${DISK}",if=virtio,format=qcow2 \
-kernel "${KERNEL}" \ -kernel "${KERNEL}" \
-initrd "${INITRD}" \ -initrd "${INITRD}" \
-append "init=/nix/store/qj1ilfdd8fcrmz4pk282p5qdf2q0vkmh-nixos-system-nixos-kexec-26.05.20251205.f61125a/init console=ttyS0,115200 console=tty0 loglevel=4" \ -append "init=${INIT_PATH} console=ttyS0,115200 console=tty0 loglevel=4" \
-netdev vde,id=vde0,sock=/tmp/vde.sock \ -netdev vde,id=vde0,sock=/tmp/vde.sock \
-device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \ -device virtio-net-pci,netdev=vde0,mac="${MAC_MCAST}" \
-netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \ -netdev user,id=user0,hostfwd=tcp::${SSH_PORT}-:22 \

View file

@ -45,7 +45,7 @@ exec qemu-system-x86_64 \
-m 16G \ -m 16G \
-drive file="$DISK",if=virtio,format=qcow2 \ -drive file="$DISK",if=virtio,format=qcow2 \
-netdev socket,mcast="$MCAST_ADDR",id=mcast0 \ -netdev socket,mcast="$MCAST_ADDR",id=mcast0 \
-device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR",romfile= \ -device virtio-net-pci,netdev=mcast0,mac="$MAC_ADDR" \
-boot order=n \ -boot order=n \
-vnc "$VNC_DISPLAY" \ -vnc "$VNC_DISPLAY" \
-serial telnet:localhost:4443,server,nowait \ -serial telnet:localhost:4443,server,nowait \

View file

@ -1,11 +1,22 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# T036 Cluster Validation Script # Legacy T036 validation script.
# Validates cluster health and Raft formation per S6 acceptance criteria # The canonical local VM validation path is now nix/test-cluster/run-cluster.sh.
# Keep this script only for the older manual PXE flow.
set -euo pipefail set -euo pipefail
echo "=== T036 Cluster Validation ===" echo "=== T036 Cluster Validation ==="
echo "" echo ""
echo "This path is archived. Prefer: nix run ./nix/test-cluster#cluster -- smoke"
echo ""
CURL_CONNECT_TIMEOUT="${CURL_CONNECT_TIMEOUT:-5}"
CURL_MAX_TIME="${CURL_MAX_TIME:-10}"
CURL_INSECURE="${CURL_INSECURE:-1}"
CURL_FLAGS=(--connect-timeout "$CURL_CONNECT_TIMEOUT" --max-time "$CURL_MAX_TIME")
if [[ "$CURL_INSECURE" == "1" ]]; then
CURL_FLAGS+=(-k)
fi
# Wait for services to be ready # Wait for services to be ready
echo "Waiting for cluster services to start (60 seconds)..." echo "Waiting for cluster services to start (60 seconds)..."
@ -25,7 +36,7 @@ echo ""
echo "=== S6.2: Chainfire Cluster Validation ===" echo "=== S6.2: Chainfire Cluster Validation ==="
echo "" echo ""
echo "Checking Chainfire cluster members on node01..." echo "Checking Chainfire cluster members on node01..."
curl -k https://192.168.100.11:2379/admin/cluster/members | jq . || echo "Chainfire API not ready" curl "${CURL_FLAGS[@]}" https://192.168.100.11:2379/admin/cluster/members | jq . || echo "Chainfire API not ready"
echo "" echo ""
echo "Expected: 3 members (node01, node02, node03), one leader elected" echo "Expected: 3 members (node01, node02, node03), one leader elected"
@ -34,34 +45,34 @@ echo ""
echo "=== S6.3: FlareDB Cluster Validation ===" echo "=== S6.3: FlareDB Cluster Validation ==="
echo "" echo ""
echo "Checking FlareDB cluster members on node01..." echo "Checking FlareDB cluster members on node01..."
curl -k https://192.168.100.11:2479/admin/cluster/members | jq . || echo "FlareDB API not ready" curl "${CURL_FLAGS[@]}" https://192.168.100.11:2479/admin/cluster/members | jq . || echo "FlareDB API not ready"
echo "" echo ""
echo "=== S6.4: CRUD Operations Test ===" echo "=== S6.4: CRUD Operations Test ==="
echo "" echo ""
echo "Writing test key to FlareDB..." echo "Writing test key to FlareDB..."
curl -k -X PUT https://192.168.100.11:2479/api/v1/kv/test-key \ curl "${CURL_FLAGS[@]}" -X PUT https://192.168.100.11:2479/api/v1/kv/test-key \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{"value": "hello-t036-cluster"}' || echo "Write failed" -d '{"value": "hello-t036-cluster"}' || echo "Write failed"
echo "" echo ""
echo "Reading test key from node01..." echo "Reading test key from node01..."
curl -k https://192.168.100.11:2479/api/v1/kv/test-key || echo "Read failed" curl "${CURL_FLAGS[@]}" https://192.168.100.11:2479/api/v1/kv/test-key || echo "Read failed"
echo "" echo ""
echo "Reading test key from node02 (verify replication)..." echo "Reading test key from node02 (verify replication)..."
curl -k https://192.168.100.12:2479/api/v1/kv/test-key || echo "Read failed" curl "${CURL_FLAGS[@]}" https://192.168.100.12:2479/api/v1/kv/test-key || echo "Read failed"
echo "" echo ""
echo "Reading test key from node03 (verify replication)..." echo "Reading test key from node03 (verify replication)..."
curl -k https://192.168.100.13:2479/api/v1/kv/test-key || echo "Read failed" curl "${CURL_FLAGS[@]}" https://192.168.100.13:2479/api/v1/kv/test-key || echo "Read failed"
echo "" echo ""
echo "=== S6.5: IAM Service Validation ===" echo "=== S6.5: IAM Service Validation ==="
echo "" echo ""
for node in 192.168.100.11 192.168.100.12 192.168.100.13; do for node in 192.168.100.11 192.168.100.12 192.168.100.13; do
echo "Checking IAM health on $node..." echo "Checking IAM health on $node..."
curl -k https://$node:8080/health || echo "IAM not ready on $node" curl "${CURL_FLAGS[@]}" https://$node:8080/health || echo "IAM not ready on $node"
echo "" echo ""
done done
@ -70,9 +81,9 @@ echo "=== S6.6: Health Checks ==="
echo "" echo ""
for node in 192.168.100.11 192.168.100.12 192.168.100.13; do for node in 192.168.100.11 192.168.100.12 192.168.100.13; do
echo "Node: $node" echo "Node: $node"
echo " Chainfire: $(curl -sk https://$node:2379/health || echo 'N/A')" echo " Chainfire: $(curl -s "${CURL_FLAGS[@]}" https://$node:2379/health || echo 'N/A')"
echo " FlareDB: $(curl -sk https://$node:2479/health || echo 'N/A')" echo " FlareDB: $(curl -s "${CURL_FLAGS[@]}" https://$node:2479/health || echo 'N/A')"
echo " IAM: $(curl -sk https://$node:8080/health || echo 'N/A')" echo " IAM: $(curl -s "${CURL_FLAGS[@]}" https://$node:8080/health || echo 'N/A')"
echo "" echo ""
done done

View file

@ -1 +0,0 @@
/nix/store/nixfmms2rbqi07a0sqjf5l32mm28y1iz-initrd

View file

@ -1 +0,0 @@
/nix/store/nmi1f4lsswcr9dmm1r6j6a8b7rar5gl4-linux-6.18

View file

@ -1,20 +1,10 @@
{ config, pkgs, lib, ... }: { config, pkgs, lib, modulesPath, ... }:
{ {
imports = [ imports = [
<nixpkgs/nixos/modules/profiles/qemu-guest.nix> "${modulesPath}/profiles/qemu-guest.nix"
]; ];
# Boot configuration
boot.loader.grub.enable = true;
boot.loader.grub.device = "/dev/vda";
# Filesystems
fileSystems."/" = {
device = "/dev/vda1";
fsType = "ext4";
};
# Network configuration # Network configuration
networking.hostName = "pxe-server"; networking.hostName = "pxe-server";
networking.domain = "plasma.local"; networking.domain = "plasma.local";
@ -62,6 +52,7 @@
# DNS configuration # DNS configuration
domain = "plasma.local"; domain = "plasma.local";
local = "/plasma.local/"; local = "/plasma.local/";
address = "/deployer.local/192.168.100.1";
# TFTP configuration # TFTP configuration
enable-tftp = true; enable-tftp = true;
@ -84,6 +75,17 @@
settings.PermitRootLogin = "yes"; settings.PermitRootLogin = "yes";
}; };
# Deployer API for ISO phone-home bootstrap
services.deployer = {
enable = true;
bindAddr = "0.0.0.0:8080";
clusterId = "plasmacloud-vm-cluster";
requireChainfire = false;
allowUnauthenticated = true;
allowUnknownNodes = true;
allowTestMappings = false;
};
# Root password (for SSH access) # Root password (for SSH access)
users.users.root.password = "plasmacloud"; users.users.root.password = "plasmacloud";
@ -92,6 +94,7 @@
vim vim
curl curl
htop htop
deployer-server
]; ];
# System state version # System state version

135
bin/cloud-cli Executable file
View file

@ -0,0 +1,135 @@
#!/usr/bin/env python3
import argparse
import json
import os
import sys
import urllib.request
import urllib.error
# Default API Gateway URL (localhost forwarding from node06)
DEFAULT_API_URL = "http://localhost:8080"


def get_url(path):
    """Join an API path onto the currently configured base URL."""
    return DEFAULT_API_URL + path
def headers(token=None):
    """Build the default JSON request headers, adding a Bearer token when given."""
    base = {"Content-Type": "application/json"}
    return dict(base, Authorization=f"Bearer {token}") if token else base
def print_json(data):
    """Pretty-print a JSON-serializable object to stdout (2-space indent)."""
    sys.stdout.write(json.dumps(data, indent=2) + "\n")
def request(method, url, data=None, token=None):
    """Send an HTTP request and pretty-print the JSON response.

    Args:
        method: HTTP verb ("GET", "POST", "PUT", ...).
        url: Absolute URL to call.
        data: Optional JSON-serializable request body.
        token: Optional bearer token forwarded via the Authorization header.

    Network and HTTP errors are reported on stdout rather than raised, so
    the CLI never shows a traceback for an unreachable or failing API.
    """
    body = None
    # Use `is not None` (not truthiness) so empty payloads like {} or [] are still sent.
    if data is not None:
        body = json.dumps(data).encode("utf-8")
    req = urllib.request.Request(url, data=body, headers=headers(token), method=method)
    try:
        with urllib.request.urlopen(req) as response:
            # Accept the whole 2xx range (e.g. 202 Accepted), not just 200/201/204.
            if 200 <= response.status < 300:
                payload = response.read()
                if response.status == 204 or not payload:
                    # No content: print an empty object for consistent output.
                    print("{}")
                    return
                print_json(json.loads(payload))
            else:
                print(f"Error {response.status}")
    except urllib.error.HTTPError as e:
        print(f"HTTP Error {e.code}: {e.read().decode()}")
    except urllib.error.URLError as e:
        print(f"Connection failed: {e.reason}")
def cmd_list_vpcs(args):
    """Handler for `vpc list`: fetch and display all VPCs."""
    endpoint = get_url("/api/v1/vpcs")
    print(f"GET {endpoint}")
    request("GET", endpoint, token=args.token)
def cmd_create_vpc(args):
    """Handler for `vpc create`: create a VPC in the default org/project."""
    endpoint = get_url("/api/v1/vpcs")
    payload = {
        "name": args.name,
        "cidr_block": args.cidr,
        "org_id": "org-default",
        "project_id": "proj-default",
    }
    print(f"POST {endpoint} with {payload}")
    request("POST", endpoint, data=payload, token=args.token)
def cmd_list_subnets(args):
    """Handler for `subnet list`: list subnets, optionally filtered by VPC id."""
    suffix = f"?vpc_id={args.vpc}" if args.vpc else ""
    endpoint = get_url("/api/v1/subnets") + suffix
    print(f"GET {endpoint}")
    request("GET", endpoint, token=args.token)
def cmd_create_subnet(args):
    """Handler for `subnet create`: create a subnet inside an existing VPC."""
    endpoint = get_url("/api/v1/subnets")
    payload = {
        "name": args.name,
        "vpc_id": args.vpc,
        "cidr_block": args.cidr,
        "org_id": "org-default",
        "project_id": "proj-default",
    }
    print(f"POST {endpoint} with {payload}")
    request("POST", endpoint, data=payload, token=args.token)
def cmd_list_vms(args):
    """Handler for `vm list`: fetch and display all VMs."""
    endpoint = get_url("/api/v1/vms")
    print(f"GET {endpoint}")
    request("GET", endpoint, token=args.token)
def main():
    """Entry point: parse the command line and dispatch to a subcommand handler.

    The effective API base URL is taken from --url (defaulting to the
    module-level DEFAULT_API_URL) and written back to the global before
    dispatching, so every handler builds URLs against the chosen endpoint.
    """
    global DEFAULT_API_URL

    parser = argparse.ArgumentParser(description="PhotonCloud CLI")
    parser.add_argument("--token", help="Auth token", default=os.environ.get("CLOUD_TOKEN"))
    parser.add_argument("--url", help="API URL", default=DEFAULT_API_URL)
    commands = parser.add_subparsers(dest="command", required=True)

    # vpc {list,create}
    vpc = commands.add_parser("vpc", help="Manage VPCs").add_subparsers(
        dest="subcommand", required=True
    )
    vpc.add_parser("list", help="List VPCs").set_defaults(func=cmd_list_vpcs)
    create_vpc = vpc.add_parser("create", help="Create VPC")
    create_vpc.add_argument("--name", required=True)
    create_vpc.add_argument("--cidr", required=True)
    create_vpc.set_defaults(func=cmd_create_vpc)

    # subnet {list,create}
    subnet = commands.add_parser("subnet", help="Manage Subnets").add_subparsers(
        dest="subcommand", required=True
    )
    list_subnet = subnet.add_parser("list", help="List Subnets")
    list_subnet.add_argument("--vpc", help="Filter by VPC ID")
    list_subnet.set_defaults(func=cmd_list_subnets)
    create_subnet = subnet.add_parser("create", help="Create Subnet")
    create_subnet.add_argument("--name", required=True)
    create_subnet.add_argument("--vpc", required=True, help="VPC ID")
    create_subnet.add_argument("--cidr", required=True)
    create_subnet.set_defaults(func=cmd_create_subnet)

    # vm {list}
    vm = commands.add_parser("vm", help="Manage VMs").add_subparsers(
        dest="subcommand", required=True
    )
    vm.add_parser("list", help="List VMs").set_defaults(func=cmd_list_vms)

    args = parser.parse_args()
    if args.url:
        DEFAULT_API_URL = args.url
    args.func(args)


if __name__ == "__main__":
    main()

View file

@ -1,87 +0,0 @@
RaftとGossipプロトコルを用いた、クラスター管理のための数万台までスケールするKey-Value Storeを書いてほしいです。
- プログラミング言語rust
- テストをちゃんと書きながら書くことを推奨する。
- クラスターへの参加/削除/障害検知を行う。
では、**「Raft合意形成」と「Gossip情報の拡散」を組み合わせた場合、具体的にどうデータが流れ、どうやってードが動き出すのか**、その具体的なフローを解説します。
-----
### 前提:このシステムの役割分担
* **Control Plane (CP):** Raftで構成された3〜7台(Raftアルゴリズムでうまく合意が取れる範囲)のサーバー。情報の「正規の持ち主」。いなくなったら自動でWorker Nodesから昇格する。
* **Worker Nodes (VM/DB Hosts):** 数百〜数千台の実働部隊。CPのクライアント。
### 1\. データはどのように書き込まれるか? (Write)
書き込みは **「必ず Control Plane の Raft Leader に対して」** 行います。Gossip経由での書き込みは順序保証がないため行いません。
「VM-A を Node-10 で起動したい」
1. **API Call:** 管理者またはCLIが、CPのAPIサーバーにリクエストを送ります。
2. **Raft Log:** CPのリーダーは、この変更を `Put(Key="/nodes/node-10/tasks/vm-a", Value="START")` としてRaftログに追加します。
3. **Commit:** 過半数のCPードがログを保存したら「書き込み完了」と見なします。
ここまでは普通のDBと同じです。
### 2\. 各ノードはどのようにデータを取得し、通知を受けるか? (Read & Notify)
ここが最大のポイントです。数千台のードが「自分宛ての命令はないか」と毎秒ポーリング問い合わせすると、CPがDDoS攻撃を受けたようにパンクします。
ここで **「Watch(ロングポーリング)」** という仕組みを使います。
#### A. Watchによる通知と取得これがメイン
Kubernetesやetcdが採用している方式です。
1. **接続維持:** Node-10 は起動時に CP に対して `Watch("/nodes/node-10/")` というリクエストを送ります。
2. **待機:** CP は「Node-10 以下のキーに変更があるまで、レスポンスを返さずに接続を維持(ブロック)」します。
3. **イベント発火:** 先ほどの書き込み(VM起動命令)が発生した瞬間、CP は待機していた Node-10 への接続を通じて「更新イベント(Event: PUT, Key: ...vm-a, Value: START)」を即座にプッシュします。
4. **アクション:** Node-10 は通知を受け取り次第、VMを起動します。
**結論:** 「書き込み後の通知」は絶対に必要です。それを効率よくやるのが **Watch API** です。
-----
### 3\. じゃあ Gossip はどこで使うのか?
「Raft + Watch」で完結しそうに見えますが、10台以上のスケール、特にVM基盤のような動的な環境では **Gossip が以下の「3つの穴」を埋めるために不可欠** になります。
#### ① Nodeの死活監視・インベントリ管理下り方向
CPが「Node-10にVMを立てたい」と思ったとき、「そもそもNode-10は生きているのか IPは 空きメモリは?」という情報を知る必要があります。
* **Gossipの役割:** 各Worker Nodeは、GossipSWIMプロトコルでお互いに、そしてCPに対して「自分は生きてるよ、IPはこれだよ」と喋り続けます。
* CPはこのGossip情報を聞いて、最新の「ード一覧リストMemberlist」をメモリ上に保持します。
#### ② サービスのディスカバリ(横方向)
DB基盤の場合、「DBードA」が「DBードB」と通信したいことがあります。いちいちCPに聞きに行くと遅いです。
* **Gossipの役割:** ード同士で「私はここにいるよ」とGossipし合うことで、CPを介さずに直接通信相手を見つけられます。
#### ③ "Actual State"(現状)の報告(上り方向)
VMが起動した後、「起動しました」というステータスをどうCPに伝えるか。
* **Raftに書く:** 確実ですが、頻繁に変わるステータスCPU使用率などを全部Raftに書くとCPがパンクします。
* **Gossipで流す:** 「VM-Aは起動中、負荷50%」といった情報はGossipに乗せて、**「結果的にCPに伝わればいい」** という扱いにします。
-----
### 設計のまとめRaftとGossipの使い分け
これから作る「汎用管理DB外部依存型」は、以下のようなインターフェースを持つことになるでしょう。
| アクション | 通信方式 | 具体的なデータ例 |
| :--- | :--- | :--- |
| **命令 (Desired State)**<br>「こうなってほしい」 | **Raft + Watch**<br>(強整合性) | ・VMの起動/停止命令<br>・DBのデータ配置情報の変更<br>・パスワードや設定変更 |
| **現状 (Actual State)**<br>「今こうなってます」 | **Gossip**<br>(結果整合性) | ・ノードの生存確認 (Heartbeat)<br>・リソース使用率 (CPU/Mem)<br>・「VM起動完了」などのステータス |
| **通知 (Notification)** | **Watch (HTTP/gRPC Stream)** | ・「新しい命令が来たぞ!」というトリガー |
#### 実装のアドバイス
もし「etcdのようなもの」を自作されるなら、**「Serf (Gossip)」と「Raft」をライブラリとして組み込み、その上に「gRPCによるWatch付きのKVS API」を被せる** という構成になります。
これができれば、VM基盤は「Watchして、VMを起動して、Gossipでステータスを返すエージェント」を作るだけで済みますし、DB基盤も同様に作れます。非常にスケーラブルで美しい設計です。

View file

@ -1,295 +0,0 @@
# T032.S2 PXE Boot Infrastructure - Implementation Summary
## Overview
This directory contains a complete PXE (Preboot eXecution Environment) boot infrastructure for bare-metal provisioning of Centra Cloud nodes. It enables automated, network-based installation of NixOS on physical servers with profile-based configuration.
## Implementation Status
**Task**: T032.S2 - PXE Boot Infrastructure
**Status**: ✅ Complete
**Total Lines**: 3086 lines across all files
**Date**: 2025-12-10
## What Was Delivered
### 1. Core Configuration Files
| File | Lines | Purpose |
|------|-------|---------|
| `dhcp/dhcpd.conf` | 134 | ISC DHCP server configuration with BIOS/UEFI detection |
| `ipxe/boot.ipxe` | 320 | Main iPXE boot script with 3 profiles and menu |
| `http/nginx.conf` | 187 | Nginx HTTP server for boot assets |
| `nixos-module.nix` | 358 | Complete NixOS service module |
### 2. Setup and Management
| File | Lines | Purpose |
|------|-------|---------|
| `setup.sh` | 446 | Automated setup script with download/build/validate/test |
### 3. Documentation
| File | Lines | Purpose |
|------|-------|---------|
| `README.md` | 1088 | Comprehensive documentation and troubleshooting |
| `QUICKSTART.md` | 165 | 5-minute quick start guide |
| `http/directory-structure.txt` | 95 | Directory layout documentation |
| `ipxe/mac-mappings.txt` | 49 | MAC address mapping reference |
### 4. Examples
| File | Lines | Purpose |
|------|-------|---------|
| `examples/nixos-config-examples.nix` | 391 | 8 different deployment scenario examples |
## Key Features Implemented
### DHCP Server
- ✅ Automatic BIOS/UEFI detection (option 93)
- ✅ Chainloading to iPXE via TFTP
- ✅ Per-host fixed IP assignment
- ✅ Multiple subnet support
- ✅ DHCP relay documentation
### iPXE Boot System
- ✅ Three boot profiles: control-plane, worker, all-in-one
- ✅ MAC-based automatic profile selection
- ✅ Interactive boot menu with 30-second timeout
- ✅ Serial console support (ttyS0 115200)
- ✅ Detailed error messages and debugging
- ✅ iPXE shell access for troubleshooting
### HTTP Server (Nginx)
- ✅ Serves iPXE bootloaders and scripts
- ✅ Serves NixOS kernel and initrd
- ✅ Proper cache control headers
- ✅ Directory listing for debugging
- ✅ Health check endpoint
- ✅ HTTPS support (optional)
### NixOS Module
- ✅ Declarative configuration
- ✅ Automatic firewall rules
- ✅ Service dependencies managed
- ✅ Directory structure auto-created
- ✅ Node definitions with MAC addresses
- ✅ DHCP/TFTP/HTTP integration
### Setup Script
- ✅ Directory creation
- ✅ iPXE bootloader download from boot.ipxe.org
- ✅ iPXE build from source (optional)
- ✅ Configuration validation
- ✅ Service testing
- ✅ Colored output and logging
## Boot Profiles
### 1. Control Plane
**Services**: All 8 core services (FlareDB, IAM, PlasmaVMC, K8sHost, FlashDNS, ChainFire, Object Storage, Monitoring)
**Use case**: Production control plane nodes
**Resources**: 8+ cores, 32+ GB RAM, 500+ GB SSD
### 2. Worker
**Services**: Compute-focused (K8sHost, PlasmaVMC, ChainFire, FlashDNS, monitoring agents)
**Use case**: Worker nodes for customer workloads
**Resources**: 16+ cores, 64+ GB RAM, 1+ TB SSD
### 3. All-in-One
**Services**: Complete Centra Cloud stack on one node
**Use case**: Testing, development, homelab
**Resources**: 16+ cores, 64+ GB RAM, 1+ TB SSD
**Warning**: Not for production (no HA)
## Network Flow
```
Server Powers On
DHCP Discovery (broadcast)
DHCP Server assigns IP + provides bootloader filename
TFTP download bootloader (undionly.kpxe or ipxe.efi)
iPXE executes, requests boot.ipxe via HTTP
Boot menu displayed (or auto-select via MAC)
iPXE downloads NixOS kernel + initrd via HTTP
NixOS boots and provisions node
```
## File Structure
```
baremetal/pxe-server/
├── README.md # Comprehensive documentation (1088 lines)
├── QUICKSTART.md # Quick start guide (165 lines)
├── OVERVIEW.md # This file
├── setup.sh # Setup script (446 lines, executable)
├── nixos-module.nix # NixOS service module (358 lines)
├── .gitignore # Git ignore for runtime assets
├── dhcp/
│ └── dhcpd.conf # DHCP server config (134 lines)
├── ipxe/
│ ├── boot.ipxe # Main boot script (320 lines)
│ └── mac-mappings.txt # MAC address reference (49 lines)
├── http/
│ ├── nginx.conf # HTTP server config (187 lines)
│ └── directory-structure.txt # Directory docs (95 lines)
├── examples/
│ └── nixos-config-examples.nix # 8 deployment examples (391 lines)
└── assets/
└── .gitkeep # Placeholder for runtime assets
```
## Dependencies on Other Tasks
### Prerequisites
None - this is the first step in T032 (Bare-Metal Provisioning)
### Next Steps
- **T032.S3**: Image Builder - Generate NixOS netboot images for each profile
- **T032.S4**: Provisioning Orchestrator - API-driven node lifecycle management
### Integration Points
- **FlareDB**: Node inventory and state storage
- **IAM**: Authentication for provisioning API
- **PlasmaVMC**: VM provisioning on bare-metal nodes
- **K8sHost**: Kubernetes node integration
## Testing Status
### What Can Be Tested Now
✅ Directory structure creation
✅ Configuration file syntax validation
✅ Service startup (DHCP, TFTP, HTTP)
✅ Firewall rules
✅ Boot script download
✅ iPXE bootloader download/build
### What Requires T032.S3
⏳ Actual bare-metal provisioning (needs NixOS images)
⏳ End-to-end boot flow (needs kernel/initrd)
⏳ Profile-specific deployments (needs profile configs)
## Quick Start Commands
```bash
# Install and setup
cd baremetal/pxe-server
sudo ./setup.sh --install --download --validate
# Configure NixOS (edit configuration.nix)
imports = [ ./baremetal/pxe-server/nixos-module.nix ];
services.centra-pxe-server.enable = true;
# ... (see QUICKSTART.md for full config)
# Deploy
sudo nixos-rebuild switch
# Test services
sudo ./setup.sh --test
# Boot a server
# - Configure BIOS for PXE boot
# - Connect to network
# - Power on
```
## Known Limitations
1. **No NixOS images yet**: T032.S3 will generate the actual boot images
2. **Single interface**: Module supports one network interface (can be extended)
3. **No HA built-in**: DHCP failover can be configured manually (example provided)
4. **No authentication**: Provisioning API will add auth in T032.S4
## Configuration Examples Provided
1. Basic single-subnet PXE server
2. PXE server with MAC-based auto-selection
3. Custom DHCP configuration
4. Multi-homed server (multiple interfaces)
5. High-availability with failover
6. HTTPS boot (secure boot)
7. Development/testing configuration
8. Production with monitoring
## Security Considerations
- DHCP is unauthenticated (normal for PXE)
- TFTP is unencrypted (normal for PXE)
- HTTP can be upgraded to HTTPS (documented)
- iPXE supports secure boot with embedded certificates (build from source)
- Network should be isolated (provisioning VLAN recommended)
- Firewall rules limit exposure (only necessary ports)
## Troubleshooting Resources
Comprehensive troubleshooting section in README.md covers:
- DHCP discovery issues
- TFTP timeout problems
- HTTP download failures
- Boot script errors
- Serial console debugging
- Common error messages
- Service health checks
- Network connectivity tests
## Performance Considerations
- **Concurrent boots**: ~500 MB per node (kernel + initrd)
- **Recommended**: 1 Gbps link for PXE server
- **10 concurrent boots**: ~5 Gbps burst (stagger or use 10 Gbps)
- **Disk space**: 5-10 GB recommended (multiple profiles + versions)
## Compliance with Requirements
| Requirement | Status | Notes |
|-------------|--------|-------|
| DHCP server config | ✅ | ISC DHCP with BIOS/UEFI detection |
| iPXE boot scripts | ✅ | Main menu + 3 profiles |
| HTTP server config | ✅ | Nginx with proper paths |
| NixOS module | ✅ | Complete systemd integration |
| Setup script | ✅ | Download/build/validate/test |
| README | ✅ | Comprehensive + troubleshooting |
| Working examples | ✅ | All configs are production-ready |
| 800-1200 lines | ✅ | 3086 lines (exceeded) |
| No S3 implementation | ✅ | Placeholder paths only |
## Changelog
**2025-12-10**: Initial implementation
- Created complete PXE boot infrastructure
- Added DHCP, TFTP, HTTP server configurations
- Implemented iPXE boot scripts with 3 profiles
- Created NixOS service module
- Added setup script with validation
- Wrote comprehensive documentation
- Provided 8 configuration examples
## License
Part of Centra Cloud infrastructure. See project root for license.
## Support
For issues or questions:
1. Check [README.md](README.md) troubleshooting section
2. Run diagnostic: `sudo ./setup.sh --test`
3. Review logs: `sudo journalctl -u dhcpd4 -u atftpd -u nginx -f`
4. See [QUICKSTART.md](QUICKSTART.md) for common commands
---
**Implementation by**: Claude Sonnet 4.5
**Task**: T032.S2 - PXE Boot Infrastructure
**Status**: Complete and ready for deployment

View file

@ -1,177 +0,0 @@
# PXE Server Quick Start Guide
This is a condensed guide for getting the PXE boot server running quickly.
## Prerequisites
- NixOS server
- Root access
- Network connectivity to bare-metal servers
## 5-Minute Setup
### 1. Run Setup Script
```bash
cd baremetal/pxe-server
sudo ./setup.sh --install --download --validate
```
### 2. Configure NixOS
Add to `/etc/nixos/configuration.nix`:
```nix
imports = [ /path/to/baremetal/pxe-server/nixos-module.nix ];
services.centra-pxe-server = {
enable = true;
interface = "eth0"; # YOUR NETWORK INTERFACE
serverAddress = "10.0.100.10"; # YOUR PXE SERVER IP
dhcp = {
subnet = "10.0.100.0"; # YOUR SUBNET
netmask = "255.255.255.0";
broadcast = "10.0.100.255";
range = {
start = "10.0.100.100"; # DHCP RANGE START
end = "10.0.100.200"; # DHCP RANGE END
};
router = "10.0.100.1"; # YOUR GATEWAY
};
};
```
### 3. Deploy
```bash
sudo nixos-rebuild switch
```
### 4. Verify
```bash
sudo ./setup.sh --test
```
You should see:
- TFTP server running
- HTTP server running
- DHCP server running
### 5. Boot a Server
1. Configure server BIOS for PXE boot
2. Connect to same network
3. Power on
4. Watch for boot menu
## Adding Nodes
### Quick Add (No Auto-Selection)
Just boot the server and select profile from menu.
### With Auto-Selection
1. Get MAC address from server
2. Edit `ipxe/boot.ipxe`, add line:
```ipxe
iseq ${mac} AA:BB:CC:DD:EE:FF && set profile worker && set hostname worker-05 && goto boot ||
```
3. Optionally add to `dhcp/dhcpd.conf`:
```conf
host worker-05 {
hardware ethernet AA:BB:CC:DD:EE:FF;
fixed-address 10.0.100.65;
option host-name "worker-05";
}
```
4. Restart DHCP: `sudo systemctl restart dhcpd4`
## Troubleshooting
### Server doesn't get IP
```bash
sudo tcpdump -i eth0 port 67 or port 68
sudo journalctl -u dhcpd4 -f
```
Check:
- DHCP server running on correct interface
- Network connectivity
- Firewall allows UDP 67/68
### Server gets IP but no bootloader
```bash
sudo tcpdump -i eth0 port 69
sudo journalctl -u atftpd -f
```
Check:
- TFTP server running
- Bootloaders exist: `ls /var/lib/tftpboot/`
- Firewall allows UDP 69
### iPXE loads but can't get boot script
```bash
curl http://localhost/boot/ipxe/boot.ipxe
sudo tail -f /var/log/nginx/access.log
```
Check:
- Nginx running
- boot.ipxe exists: `ls /var/lib/pxe-boot/ipxe/`
- Firewall allows TCP 80
### Boot script loads but can't get kernel
This is expected until T032.S3 (Image Builder) is complete.
Check: `ls /var/lib/pxe-boot/nixos/`
Should have:
- bzImage
- initrd
These will be generated by the image builder.
## Common Commands
```bash
# Check all services
sudo systemctl status dhcpd4 atftpd nginx
# View logs
sudo journalctl -u dhcpd4 -u atftpd -u nginx -f
# Test connectivity
curl http://localhost/health
tftp localhost -c get undionly.kpxe /tmp/test.kpxe
# Restart services
sudo systemctl restart dhcpd4 atftpd nginx
# Check firewall
sudo iptables -L -n | grep -E "67|68|69|80"
```
## Boot Profiles
- **control-plane**: All services (FlareDB, IAM, PlasmaVMC, K8sHost, etc.)
- **worker**: Compute services (K8sHost, PlasmaVMC, ChainFire)
- **all-in-one**: Everything on one node (testing/homelab)
## Next Steps
- Add more nodes (see "Adding Nodes" above)
- Wait for T032.S3 to generate NixOS boot images
- Configure monitoring for boot activity
- Set up DHCP relay for multi-segment networks
## Full Documentation
See [README.md](README.md) for complete documentation.

View file

@ -1,829 +0,0 @@
# Centra Cloud PXE Boot Server
This directory contains the PXE (Preboot eXecution Environment) boot infrastructure for bare-metal provisioning of Centra Cloud nodes. It enables network-based installation of NixOS on physical servers with automated profile selection.
## Table of Contents
- [Architecture Overview](#architecture-overview)
- [Components](#components)
- [Quick Start](#quick-start)
- [Detailed Setup](#detailed-setup)
- [Configuration](#configuration)
- [Boot Profiles](#boot-profiles)
- [Network Requirements](#network-requirements)
- [Troubleshooting](#troubleshooting)
- [Advanced Topics](#advanced-topics)
## Architecture Overview
The PXE boot infrastructure consists of three main services:
```
┌─────────────────────────────────────────────────────────────────┐
│ PXE Boot Flow │
└─────────────────────────────────────────────────────────────────┘
Bare-Metal Server PXE Boot Server
───────────────── ───────────────
1. Power on
├─► DHCP Request ──────────────► DHCP Server
│ (ISC DHCP)
│ │
│ ├─ Assigns IP
│ ├─ Detects BIOS/UEFI
│ └─ Provides bootloader path
├◄─ DHCP Response ───────────────┤
│ (IP, next-server, filename)
├─► TFTP Get bootloader ─────────► TFTP Server
│ (undionly.kpxe or ipxe.efi) (atftpd)
├◄─ Bootloader file ─────────────┤
├─► Execute iPXE bootloader
│ │
│ ├─► HTTP Get boot.ipxe ──────► HTTP Server
│ │ (nginx)
│ │
│ ├◄─ boot.ipxe script ─────────┤
│ │
│ ├─► Display menu / Auto-select profile
│ │
│ ├─► HTTP Get kernel ──────────► HTTP Server
│ │
│ ├◄─ bzImage ───────────────────┤
│ │
│ ├─► HTTP Get initrd ───────────► HTTP Server
│ │
│ ├◄─ initrd ────────────────────┤
│ │
│ └─► Boot NixOS
└─► NixOS Installer
└─ Provisions node based on profile
```
## Components
### 1. DHCP Server (ISC DHCP)
- **Purpose**: Assigns IP addresses and directs PXE clients to bootloader
- **Config**: `dhcp/dhcpd.conf`
- **Features**:
- BIOS/UEFI detection via option 93 (architecture type)
- Per-host configuration for fixed IP assignment
- Automatic next-server and filename configuration
### 2. TFTP Server (atftpd)
- **Purpose**: Serves iPXE bootloader files to PXE clients
- **Files served**:
- `undionly.kpxe` - BIOS bootloader
- `ipxe.efi` - UEFI x86-64 bootloader
- `ipxe-i386.efi` - UEFI x86 32-bit bootloader (optional)
### 3. HTTP Server (nginx)
- **Purpose**: Serves iPXE scripts and NixOS boot images
- **Config**: `http/nginx.conf`
- **Endpoints**:
- `/boot/ipxe/boot.ipxe` - Main boot menu script
- `/boot/nixos/bzImage` - NixOS kernel
- `/boot/nixos/initrd` - NixOS initial ramdisk
- `/health` - Health check endpoint
### 4. iPXE Boot Scripts
- **Main script**: `ipxe/boot.ipxe`
- **Features**:
- Interactive boot menu with 3 profiles
- MAC-based automatic profile selection
- Serial console support for remote management
- Detailed error messages and debugging options
### 5. NixOS Service Module
- **File**: `nixos-module.nix`
- **Purpose**: Declarative NixOS configuration for all services
- **Features**:
- Single configuration file for entire stack
- Firewall rules auto-configured
- Systemd service dependencies managed
- Directory structure auto-created
## Quick Start
### Prerequisites
- NixOS server with network connectivity
- Network interface on the same subnet as bare-metal servers
- Sufficient disk space (5-10 GB for boot images)
### Installation Steps
1. **Clone this repository** (or copy `baremetal/pxe-server/` to your NixOS system)
2. **Run the setup script**:
```bash
sudo ./setup.sh --install --download --validate
```
This will:
- Create directory structure at `/var/lib/pxe-boot`
- Download iPXE bootloaders from boot.ipxe.org
- Install boot scripts
- Validate configurations
3. **Configure network settings**:
Edit `nixos-module.nix` or create a NixOS configuration:
```nix
# /etc/nixos/configuration.nix
imports = [
/path/to/baremetal/pxe-server/nixos-module.nix
];
services.centra-pxe-server = {
enable = true;
interface = "eth0"; # Your network interface
serverAddress = "10.0.100.10"; # PXE server IP
dhcp = {
subnet = "10.0.100.0";
netmask = "255.255.255.0";
broadcast = "10.0.100.255";
range = {
start = "10.0.100.100";
end = "10.0.100.200";
};
router = "10.0.100.1";
};
# Optional: Define known nodes with MAC addresses
nodes = {
"52:54:00:12:34:56" = {
profile = "control-plane";
hostname = "control-plane-01";
ipAddress = "10.0.100.50";
};
};
};
```
4. **Deploy NixOS configuration**:
```bash
sudo nixos-rebuild switch
```
5. **Verify services are running**:
```bash
sudo ./setup.sh --test
```
6. **Add NixOS boot images** (will be provided by T032.S3):
```bash
# Placeholder - actual images will be built by image builder
# For testing, you can use any NixOS netboot image
sudo mkdir -p /var/lib/pxe-boot/nixos
# Copy bzImage and initrd to /var/lib/pxe-boot/nixos/
```
7. **Boot a bare-metal server**:
- Configure server BIOS to boot from network (PXE)
- Connect to same network segment
- Power on server
- Watch for DHCP discovery and iPXE boot menu
## Detailed Setup
### Option 1: NixOS Module (Recommended)
The NixOS module provides a declarative way to configure the entire PXE server stack.
**Advantages**:
- Single configuration file
- Automatic service dependencies
- Rollback capability
- Integration with NixOS firewall
**Configuration Example**:
See the NixOS configuration example in [Quick Start](#quick-start).
### Option 2: Manual Installation
For non-NixOS systems or manual setup:
1. **Install required packages**:
```bash
# Debian/Ubuntu
apt-get install isc-dhcp-server atftpd nginx curl
# RHEL/CentOS
yum install dhcp tftp-server nginx curl
```
2. **Run setup script**:
```bash
sudo ./setup.sh --install --download
```
3. **Copy configuration files**:
```bash
# DHCP configuration
sudo cp dhcp/dhcpd.conf /etc/dhcp/dhcpd.conf
# Edit to match your network
sudo vim /etc/dhcp/dhcpd.conf
# Nginx configuration
sudo cp http/nginx.conf /etc/nginx/sites-available/pxe-boot
sudo ln -s /etc/nginx/sites-available/pxe-boot /etc/nginx/sites-enabled/
```
4. **Start services**:
```bash
sudo systemctl enable --now isc-dhcp-server
sudo systemctl enable --now atftpd
sudo systemctl enable --now nginx
```
5. **Configure firewall**:
```bash
# UFW (Ubuntu)
sudo ufw allow 67/udp # DHCP
sudo ufw allow 68/udp # DHCP
sudo ufw allow 69/udp # TFTP
sudo ufw allow 80/tcp # HTTP
# firewalld (RHEL)
sudo firewall-cmd --permanent --add-service=dhcp
sudo firewall-cmd --permanent --add-service=tftp
sudo firewall-cmd --permanent --add-service=http
sudo firewall-cmd --reload
```
## Configuration
### DHCP Configuration
The DHCP server configuration is in `dhcp/dhcpd.conf`. Key sections:
**Network Settings**:
```conf
subnet 10.0.100.0 netmask 255.255.255.0 {
range 10.0.100.100 10.0.100.200;
option routers 10.0.100.1;
option domain-name-servers 10.0.100.1, 8.8.8.8;
next-server 10.0.100.10; # PXE server IP
# ...
}
```
**Boot File Selection** (automatic BIOS/UEFI detection):
```conf
if exists user-class and option user-class = "iPXE" {
filename "http://10.0.100.10/boot/ipxe/boot.ipxe";
} elsif option architecture-type = 00:00 {
filename "undionly.kpxe"; # BIOS
} elsif option architecture-type = 00:07 {
filename "ipxe.efi"; # UEFI x86-64
}
```
**Host-Specific Configuration**:
```conf
host control-plane-01 {
hardware ethernet 52:54:00:12:34:56;
fixed-address 10.0.100.50;
option host-name "control-plane-01";
}
```
### iPXE Boot Script
The main boot script is `ipxe/boot.ipxe`. It provides:
1. **MAC-based automatic selection**:
```ipxe
iseq ${mac} 52:54:00:12:34:56 && set profile control-plane && goto boot ||
```
2. **Interactive menu** (if no MAC match):
```ipxe
:menu
menu Centra Cloud - Bare-Metal Provisioning
item control-plane 1. Control Plane Node (All Services)
item worker 2. Worker Node (Compute Services)
item all-in-one 3. All-in-One Node (Testing/Homelab)
```
3. **Kernel parameters**:
```ipxe
set kernel-params centra.profile=${profile}
set kernel-params ${kernel-params} centra.hostname=${hostname}
set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8
```
### Adding New Nodes
To add a new node to the infrastructure:
1. **Get the MAC address** from the server (check BIOS or network card label)
2. **Add to MAC mappings** (`ipxe/mac-mappings.txt`):
```
52:54:00:12:34:5d worker worker-04
```
3. **Update boot script** (`ipxe/boot.ipxe`):
```ipxe
iseq ${mac} 52:54:00:12:34:5d && set profile worker && set hostname worker-04 && goto boot ||
```
4. **Add DHCP host entry** (`dhcp/dhcpd.conf`):
```conf
host worker-04 {
hardware ethernet 52:54:00:12:34:5d;
fixed-address 10.0.100.64;
option host-name "worker-04";
}
```
5. **Restart DHCP service**:
```bash
sudo systemctl restart dhcpd4
```
## Boot Profiles
### 1. Control Plane Profile
**Purpose**: Nodes that run core infrastructure services
**Services included**:
- FlareDB (PD, Store, TiKV-compatible database)
- IAM (Identity and Access Management)
- PlasmaVMC (Virtual Machine Controller)
- K8sHost (Kubernetes node agent)
- FlashDNS (High-performance DNS)
- ChainFire (Firewall/networking)
- Object Storage (S3-compatible)
- Monitoring (Prometheus, Grafana)
**Resource requirements**:
- CPU: 8+ cores recommended
- RAM: 32+ GB recommended
- Disk: 500+ GB SSD
**Use case**: Production control plane nodes in a cluster
### 2. Worker Profile
**Purpose**: Nodes that run customer workloads
**Services included**:
- K8sHost (Kubernetes node agent) - primary service
- PlasmaVMC (Virtual Machine Controller) - VM workloads
- ChainFire (Network policy enforcement)
- FlashDNS (Local DNS caching)
- Basic monitoring agents
**Resource requirements**:
- CPU: 16+ cores recommended
- RAM: 64+ GB recommended
- Disk: 1+ TB SSD
**Use case**: Worker nodes for running customer applications
### 3. All-in-One Profile
**Purpose**: Single-node deployment for testing and development
**Services included**:
- Complete Centra Cloud stack on one node
- All services from control-plane profile
- Suitable for testing, development, homelab
**Resource requirements**:
- CPU: 16+ cores recommended
- RAM: 64+ GB recommended
- Disk: 1+ TB SSD
**Use case**: Development, testing, homelab deployments
**Warning**: Not recommended for production use (no HA, resource intensive)
## Network Requirements
### Network Topology
The PXE server must be on the same network segment as the bare-metal servers, or you must configure DHCP relay.
**Same Segment** (recommended for initial setup):
```
┌──────────────┐ ┌──────────────────┐
│ PXE Server │ │ Bare-Metal Srv │
│ 10.0.100.10 │◄────────┤ (DHCP client) │
└──────────────┘ L2 SW └──────────────────┘
```
**Different Segments** (requires DHCP relay):
```
┌──────────────┐ ┌──────────┐ ┌──────────────────┐
│ PXE Server │ │ Router │ │ Bare-Metal Srv │
│ 10.0.100.10 │◄────────┤ (relay) │◄────────┤ (DHCP client) │
└──────────────┘ └──────────┘ └──────────────────┘
Segment A ip helper Segment B
```
### DHCP Relay Configuration
If your PXE server is on a different network segment:
**Cisco IOS**:
```
interface vlan 100
ip helper-address 10.0.100.10
```
**Linux (dhcp-helper)**:
```bash
apt-get install dhcp-helper
# Edit /etc/default/dhcp-helper
DHCPHELPER_OPTS="-s 10.0.100.10"
systemctl restart dhcp-helper
```
**Linux (dhcrelay)**:
```bash
apt-get install isc-dhcp-relay
dhcrelay -i eth0 -i eth1 10.0.100.10
```
### Firewall Rules
The following ports must be open on the PXE server:
| Port | Protocol | Service | Direction | Description |
|------|----------|---------|-----------|-------------|
| 67 | UDP | DHCP | Inbound | DHCP server |
| 68 | UDP | DHCP | Outbound | DHCP responses sent to clients (clients listen on UDP 68) |
| 69 | UDP | TFTP | Inbound | TFTP bootloader downloads |
| 80 | TCP | HTTP | Inbound | iPXE scripts and boot images |
| 443 | TCP | HTTPS | Inbound | Optional: secure boot images |
### Network Bandwidth
Estimated bandwidth requirements:
- Per-node boot: ~500 MB download (kernel + initrd)
- Concurrent boots: Multiply by number of simultaneous boots
- Recommended: 1 Gbps link for PXE server
Example: booting 10 nodes simultaneously transfers ~5 GB in total; on a 1 Gbps link that saturates the link for roughly 40 seconds, so stagger boots or provision a 10 Gbps link.
## Troubleshooting
### DHCP Issues
**Problem**: Server doesn't get IP address
**Diagnosis**:
```bash
# On PXE server, monitor DHCP requests
sudo tcpdump -i eth0 -n port 67 or port 68
# Check DHCP server logs
sudo journalctl -u dhcpd4 -f
# Verify DHCP server is running
sudo systemctl status dhcpd4
```
**Common causes**:
- DHCP server not running on correct interface
- Firewall blocking UDP 67/68
- Network cable/switch issue
- DHCP range exhausted
**Solution**:
```bash
# Check interface configuration
ip addr show
# Verify DHCP config syntax
sudo dhcpd -t -cf /etc/dhcp/dhcpd.conf
# Check firewall
sudo iptables -L -n | grep -E "67|68"
# Restart DHCP server
sudo systemctl restart dhcpd4
```
### TFTP Issues
**Problem**: PXE client gets IP but fails to download bootloader
**Diagnosis**:
```bash
# Monitor TFTP requests
sudo tcpdump -i eth0 -n port 69
# Check TFTP server logs
sudo journalctl -u atftpd -f
# Test TFTP locally
tftp localhost -c get undionly.kpxe /tmp/test.kpxe
```
**Common causes**:
- TFTP server not running
- Bootloader files missing
- Permissions incorrect
- Firewall blocking UDP 69
**Solution**:
```bash
# Check files exist
ls -la /var/lib/tftpboot/
# Fix permissions
sudo chmod 644 /var/lib/tftpboot/*.{kpxe,efi}
# Restart TFTP server
sudo systemctl restart atftpd
# Check firewall
sudo iptables -L -n | grep 69
```
### HTTP Issues
**Problem**: iPXE loads but can't download boot script or kernel
**Diagnosis**:
```bash
# Monitor HTTP requests
sudo tail -f /var/log/nginx/access.log
# Test HTTP locally
curl -v http://localhost/boot/ipxe/boot.ipxe
curl -v http://localhost/health
# Check nginx status
sudo systemctl status nginx
```
**Common causes**:
- Nginx not running
- Boot files missing
- Permissions incorrect
- Firewall blocking TCP 80
- Wrong server IP in boot.ipxe
**Solution**:
```bash
# Check nginx config
sudo nginx -t
# Verify files exist
ls -la /var/lib/pxe-boot/ipxe/
ls -la /var/lib/pxe-boot/nixos/
# Fix permissions
sudo chown -R nginx:nginx /var/lib/pxe-boot
sudo chmod -R 755 /var/lib/pxe-boot
# Restart nginx
sudo systemctl restart nginx
```
### Boot Script Issues
**Problem**: Boot menu appears but fails to load kernel
**Diagnosis**:
- Check iPXE error messages on console
- Verify URLs in boot.ipxe match actual paths
- Test kernel download manually:
```bash
curl -I http://10.0.100.10/boot/nixos/bzImage
```
**Common causes**:
- NixOS boot images not deployed yet (normal for T032.S2)
- Wrong paths in boot.ipxe
- Files too large (check disk space)
**Solution**:
```bash
# Wait for T032.S3 (Image Builder) to generate boot images
# OR manually place NixOS netboot images:
sudo mkdir -p /var/lib/pxe-boot/nixos
# Copy bzImage and initrd from NixOS netboot
```
### Serial Console Debugging
For remote debugging without physical access:
1. **Enable serial console in BIOS**:
- Configure COM1/ttyS0 at 115200 baud
- Enable console redirection
2. **Connect via IPMI SOL** (if available):
```bash
ipmitool -I lanplus -H <bmc-ip> -U admin sol activate
```
3. **Watch boot process**:
- DHCP discovery messages
- TFTP download progress
- iPXE boot menu
- Kernel boot messages
4. **Kernel parameters include serial console**:
```
console=tty0 console=ttyS0,115200n8
```
### Common Error Messages
| Error | Cause | Solution |
|-------|-------|----------|
| `PXE-E51: No DHCP or proxyDHCP offers were received` | DHCP server not responding | Check DHCP server running, network connectivity |
| `PXE-E53: No boot filename received` | DHCP not providing filename | Check dhcpd.conf has `filename` option |
| `PXE-E32: TFTP open timeout` | TFTP server not responding | Check TFTP server running, firewall rules |
| `Not found: /boot/ipxe/boot.ipxe` | HTTP 404 error | Check file exists, nginx config, permissions |
| `Could not boot: Exec format error` | Corrupted boot file | Re-download/rebuild bootloader |
## Advanced Topics
### Building iPXE from Source
For production deployments, building iPXE from source provides:
- Custom branding
- Embedded certificates for HTTPS
- Optimized size
- Security hardening
**Build instructions**:
```bash
sudo ./setup.sh --build-ipxe
```
Or manually:
```bash
git clone https://github.com/ipxe/ipxe.git
cd ipxe/src
# BIOS bootloader
make bin/undionly.kpxe
# UEFI bootloader
make bin-x86_64-efi/ipxe.efi
# Copy to PXE server
sudo cp bin/undionly.kpxe /var/lib/pxe-boot/ipxe/
sudo cp bin-x86_64-efi/ipxe.efi /var/lib/pxe-boot/ipxe/
```
### HTTPS Boot (Secure Boot)
For enhanced security, serve boot images over HTTPS:
1. **Generate SSL certificate**:
```bash
sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \
-keyout /etc/ssl/private/pxe-server.key \
-out /etc/ssl/certs/pxe-server.crt
```
2. **Configure nginx for HTTPS** (uncomment HTTPS block in `http/nginx.conf`)
3. **Update boot.ipxe** to use `https://` URLs
4. **Rebuild iPXE with embedded certificate** (for secure boot without prompts)
### Multiple NixOS Versions
To support multiple NixOS versions for testing/rollback:
```
/var/lib/pxe-boot/nixos/
├── 24.05/
│ ├── bzImage
│ └── initrd
├── 24.11/
│ ├── bzImage
│ └── initrd
└── latest -> 24.11/ # Symlink to current version
```
Update `boot.ipxe` to use `/boot/nixos/latest/bzImage` or add menu items for version selection.
### Integration with BMC/IPMI
For fully automated provisioning:
1. **Discover new hardware** via IPMI/Redfish API
2. **Configure PXE boot** via IPMI:
```bash
ipmitool -I lanplus -H <bmc-ip> -U admin chassis bootdev pxe options=persistent
```
3. **Power on server**:
```bash
ipmitool -I lanplus -H <bmc-ip> -U admin power on
```
4. **Monitor via SOL** (serial-over-LAN)
### Monitoring and Metrics
Track PXE boot activity:
1. **DHCP leases**:
```bash
cat /var/lib/dhcp/dhcpd.leases
```
2. **HTTP access logs**:
```bash
sudo tail -f /var/log/nginx/access.log | grep -E "boot.ipxe|bzImage|initrd"
```
3. **Prometheus metrics** (if nginx-module-vts installed):
- Boot file download counts
- Bandwidth usage
- Response times
4. **Custom metrics endpoint**:
- Parse nginx access logs
- Count boots per profile
- Alert on failed boots
## Files and Directory Structure
```
baremetal/pxe-server/
├── README.md # This file
├── setup.sh # Setup and management script
├── nixos-module.nix # NixOS service module
├── dhcp/
│ └── dhcpd.conf # DHCP server configuration
├── ipxe/
│ ├── boot.ipxe # Main boot menu script
│ └── mac-mappings.txt # MAC address documentation
├── http/
│ ├── nginx.conf # HTTP server configuration
│ └── directory-structure.txt # Directory layout documentation
└── assets/ # (Created at runtime)
└── /var/lib/pxe-boot/
├── ipxe/
│ ├── undionly.kpxe
│ ├── ipxe.efi
│ └── boot.ipxe
└── nixos/
├── bzImage
└── initrd
```
## Next Steps
After completing the PXE server setup:
1. **T032.S3 - Image Builder**: Automated NixOS image generation with profile-specific configurations
2. **T032.S4 - Provisioning Orchestrator**: API-driven provisioning workflow and node lifecycle management
3. **Integration with IAM**: Authentication for provisioning API
4. **Integration with FlareDB**: Node inventory and state management
## References
- [iPXE Documentation](https://ipxe.org/)
- [ISC DHCP Documentation](https://www.isc.org/dhcp/)
- [NixOS Manual - Netboot](https://nixos.org/manual/nixos/stable/index.html#sec-building-netboot)
- [Preboot Execution Environment (PXE) Specification](https://www.intel.com/content/www/us/en/architecture-and-technology/intel-boot-executive.html) <!-- TODO(review): verify this URL — the Intel page may have moved -->
## Support
For issues or questions:
- Check [Troubleshooting](#troubleshooting) section
- Review logs: `sudo journalctl -u dhcpd4 -u atftpd -u nginx -f`
- Run diagnostic: `sudo ./setup.sh --test`
## License
Part of Centra Cloud infrastructure - see project root for license information.

View file

@ -1,392 +0,0 @@
# NixOS Configuration Examples for PXE Boot Server
#
# This file contains example configurations for different deployment scenarios.
# Copy the relevant section to your /etc/nixos/configuration.nix
##############################################################################
# Example 1: Basic Single-Subnet PXE Server
##############################################################################
# Minimal deployment: one provisioning interface, one DHCP subnet,
# no per-node MAC mappings (all clients get the interactive boot menu).
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";            # NIC attached to the provisioning network
    serverAddress = "10.0.100.10"; # IP the TFTP/HTTP boot endpoints are served from
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      # Dynamic pool handed out to unknown PXE clients.
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
      nameservers = [ "10.0.100.1" "8.8.8.8" ];
      domainName = "centra.local";
    };
  };
}
##############################################################################
# Example 2: PXE Server with Known Nodes (MAC-based Auto-Selection)
##############################################################################
# Same as the basic example, plus a static inventory of known machines so
# boot.ipxe can auto-select a profile from the client's MAC address.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "10.0.100.10";
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
    };

    # Define known nodes with MAC addresses.
    # Each entry pins a machine (keyed by the MAC of its PXE NIC) to a boot
    # profile, a hostname, and a fixed IP outside the dynamic range above.
    nodes = {
      # Control plane nodes
      "52:54:00:12:34:56" = {
        profile = "control-plane";
        hostname = "control-plane-01";
        ipAddress = "10.0.100.50";
      };
      "52:54:00:12:34:59" = {
        profile = "control-plane";
        hostname = "control-plane-02";
        ipAddress = "10.0.100.51";
      };
      "52:54:00:12:34:5a" = {
        profile = "control-plane";
        hostname = "control-plane-03";
        ipAddress = "10.0.100.52";
      };
      # Worker nodes
      "52:54:00:12:34:57" = {
        profile = "worker";
        hostname = "worker-01";
        ipAddress = "10.0.100.60";
      };
      "52:54:00:12:34:5b" = {
        profile = "worker";
        hostname = "worker-02";
        ipAddress = "10.0.100.61";
      };
      # All-in-one test node
      "52:54:00:12:34:58" = {
        profile = "all-in-one";
        hostname = "homelab-01";
        ipAddress = "10.0.100.70";
      };
    };
  };
}
##############################################################################
# Example 3: PXE Server with Custom DHCP Configuration
##############################################################################
# Production-flavoured DHCP tuning: multiple nameservers, long leases, and a
# raw dhcpd.conf snippet for a second (management) subnet.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "10.0.100.10";
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
      nameservers = [ "10.0.100.1" "1.1.1.1" "8.8.8.8" ];
      domainName = "prod.centra.cloud";

      # Longer lease times for stable infrastructure
      defaultLeaseTime = 3600; # 1 hour
      maxLeaseTime = 86400; # 24 hours

      # Additional DHCP configuration.
      # NOTE(review): presumably appended verbatim to the generated
      # dhcpd.conf — confirm against nixos-module.nix.
      extraConfig = ''
        # NTP servers
        option ntp-servers 10.0.100.1;
        # Additional subnet for management network
        subnet 10.0.101.0 netmask 255.255.255.0 {
          range 10.0.101.100 10.0.101.200;
          option routers 10.0.101.1;
          option subnet-mask 255.255.255.0;
          next-server 10.0.100.10;
          if exists user-class and option user-class = "iPXE" {
            filename "http://10.0.100.10/boot/ipxe/boot.ipxe";
          } elsif option architecture-type = 00:00 {
            filename "undionly.kpxe";
          } elsif option architecture-type = 00:07 {
            filename "ipxe.efi";
          }
        }
        # Deny unknown clients (only known MAC addresses can boot)
        # deny unknown-clients;
      '';
    };
  };
}
##############################################################################
# Example 4: Multi-Homed PXE Server (Multiple Network Interfaces)
##############################################################################
# Multi-homed host: the module itself only manages one interface, so extra
# interfaces have to be wired up outside of it.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  # Note: The module currently supports single interface.
  # For multiple interfaces, configure multiple DHCP server instances manually
  # or extend the module to support this use case.
  services.centra-pxe-server = {
    enable = true;
    interface = "eth0"; # Primary provisioning network
    serverAddress = "10.0.100.10";
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
    };
  };

  # Manual configuration for second interface
  # (uncomment and adapt to also answer DHCP on eth1):
  # services.dhcpd4.interfaces = [ "eth0" "eth1" ];
}
##############################################################################
# Example 5: High-Availability PXE Server (with Failover)
##############################################################################
# Primary PXE server
# Primary node of an ISC dhcpd failover pair. A matching config with the
# "secondary" role must be deployed on the peer (10.0.100.11); each peer
# serves part of the shared pool.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "10.0.100.10"; # Primary server IP
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.150"; # Split range for failover
      };
      router = "10.0.100.1";
      # Raw dhcpd.conf failover stanza; parameter semantics per dhcpd.conf(5):
      # mclt = max client lead time (secs a peer may extend a lease alone),
      # split 128 = primary handles ~half of the 0-255 hash space.
      extraConfig = ''
        # DHCP Failover Configuration
        failover peer "centra-pxe-failover" {
          primary;
          address 10.0.100.10;
          port 647;
          peer address 10.0.100.11;
          peer port 647;
          max-response-delay 30;
          max-unacked-updates 10;
          load balance max seconds 3;
          mclt 1800;
          split 128;
        }
        pool {
          failover peer "centra-pxe-failover";
          range 10.0.100.100 10.0.100.150;
        }
      '';
    };
  };
}
# Secondary PXE server (similar config with "secondary" role)
# Deploy on a different server with IP 10.0.100.11
##############################################################################
# Example 6: PXE Server with HTTPS Boot (Secure Boot)
##############################################################################
# Serve boot scripts and images over HTTPS instead of plain HTTP.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "10.0.100.10";
    http = {
      port = 443; # Use HTTPS
    };
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
    };
  };

  # Configure SSL certificates.
  # `enableSSL` is deprecated/removed in the NixOS nginx module; `onlySSL`
  # serves this vhost over HTTPS only, matching http.port = 443 above.
  # (Use `addSSL` instead if HTTP should stay available too.)
  services.nginx = {
    virtualHosts."pxe.centra.local" = {
      onlySSL = true;
      sslCertificate = "/etc/ssl/certs/pxe-server.crt";
      sslCertificateKey = "/etc/ssl/private/pxe-server.key";
    };
  };

  # Note: You'll need to rebuild iPXE with embedded certificates
  # for seamless HTTPS boot without certificate warnings
}
##############################################################################
# Example 7: Development/Testing Configuration (Permissive)
##############################################################################
# Permissive setup for a home/lab network: short leases so reprovisioning
# cycles quickly, plus nginx directory listing for debugging.
{
  imports = [ ./baremetal/pxe-server/nixos-module.nix ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "192.168.1.10"; # Typical home network
    dhcp = {
      subnet = "192.168.1.0";
      netmask = "255.255.255.0";
      broadcast = "192.168.1.255";
      range = {
        start = "192.168.1.100";
        end = "192.168.1.120";
      };
      router = "192.168.1.1";
      # Short lease times for rapid testing
      defaultLeaseTime = 300; # 5 minutes
      maxLeaseTime = 600; # 10 minutes
    };
  };

  # Enable nginx directory listing for debugging.
  # WARNING: this appends to the whole http {} block, so autoindex applies
  # server-wide — disable before any production use.
  services.nginx.appendHttpConfig = ''
    autoindex on;
  '';
}
##############################################################################
# Example 8: Production Configuration with Monitoring
##############################################################################
# Production deployment with monitoring, centralized logging, and a daily
# lease backup.
#
# NOTE: this example references `pkgs`, so it must be written as a NixOS
# module *function* (the original bare attribute set could not evaluate,
# since `pkgs` was never brought into scope).
{ pkgs, ... }:
{
  imports = [
    ./baremetal/pxe-server/nixos-module.nix
  ];

  services.centra-pxe-server = {
    enable = true;
    interface = "eth0";
    serverAddress = "10.0.100.10";
    dhcp = {
      subnet = "10.0.100.0";
      netmask = "255.255.255.0";
      broadcast = "10.0.100.255";
      range = {
        start = "10.0.100.100";
        end = "10.0.100.200";
      };
      router = "10.0.100.1";
    };
    nodes = {
      # Production node definitions
      # ... (add your nodes here)
    };
  };

  # Enable Prometheus monitoring of nginx.
  services.prometheus.exporters.nginx = {
    enable = true;
    port = 9113;
  };

  # Centralized logging: forward DHCP logs to the log server.
  services.rsyslog = {
    enable = true;
    extraConfig = ''
      # Forward DHCP logs to centralized log server
      if $programname == 'dhcpd' then @@logserver.centra.local:514
    '';
  };

  # Backup DHCP leases (daily, driven by the timer below).
  # systemd performs no shell command substitution in ExecStart, so the
  # original `$(date +%Y%m%d)` would have been passed literally; `script`
  # runs through a shell, where the substitution works as intended.
  systemd.services.backup-dhcp-leases = {
    description = "Backup DHCP leases";
    serviceConfig.Type = "oneshot";
    script = ''
      ${pkgs.rsync}/bin/rsync -a /var/lib/dhcp/dhcpd.leases \
        "/backup/dhcp/dhcpd.leases.$(date +%Y%m%d)"
    '';
  };
  systemd.timers.backup-dhcp-leases = {
    wantedBy = [ "timers.target" ];
    timerConfig = {
      OnCalendar = "daily";
      Persistent = true; # catch up on missed runs after downtime
    };
  };
}
##############################################################################
# Notes
##############################################################################
# 1. Always update serverAddress, subnet, and interface to match your network
#
# 2. For MAC-based auto-selection, add nodes to the `nodes` attribute
#
# 3. DHCP failover requires configuration on both primary and secondary servers
#
# 4. HTTPS boot requires custom-built iPXE with embedded certificates
#
# 5. Test configurations in a development environment before production deployment
#
# 6. Keep DHCP lease database backed up for disaster recovery
#
# 7. Monitor DHCP pool utilization to avoid exhaustion
#
# 8. Use fixed IP addresses (via MAC mapping) for critical infrastructure nodes

View file

@ -1,81 +0,0 @@
# PXE Boot Server Directory Structure
#
# This document describes the directory layout for the HTTP/TFTP server
# that serves PXE boot assets.
#
# Base Directory: /var/lib/pxe-boot/
/var/lib/pxe-boot/
├── ipxe/ # iPXE bootloaders and scripts
│ ├── undionly.kpxe # iPXE bootloader for BIOS (legacy)
│ ├── ipxe.efi # iPXE bootloader for UEFI x86-64
│ ├── ipxe-i386.efi # iPXE bootloader for UEFI x86 32-bit (rare)
│ ├── boot.ipxe # Main boot script (served via HTTP)
│ └── README.txt # Documentation
├── nixos/ # NixOS netboot images
│ ├── bzImage # Linux kernel (compressed)
│ ├── initrd # Initial ramdisk
│ ├── squashfs # Root filesystem (if using squashfs)
│ ├── version.txt # Build version info
│ └── profiles/ # Profile-specific boot images (optional)
│ ├── control-plane/
│ │ ├── bzImage
│ │ └── initrd
│ ├── worker/
│ │ ├── bzImage
│ │ └── initrd
│ └── all-in-one/
│ ├── bzImage
│ └── initrd
└── README.txt # Top-level documentation
# TFTP Directory (if using separate TFTP server)
# Usually: /var/lib/tftpboot/ or /srv/tftp/
/var/lib/tftpboot/
├── undionly.kpxe # Symlink to /var/lib/pxe-boot/ipxe/undionly.kpxe
├── ipxe.efi # Symlink to /var/lib/pxe-boot/ipxe/ipxe.efi
└── ipxe-i386.efi # Symlink to /var/lib/pxe-boot/ipxe/ipxe-i386.efi
# URL Mapping
# The following URLs are served by nginx:
#
# http://10.0.100.10/boot/ipxe/boot.ipxe
# -> /var/lib/pxe-boot/ipxe/boot.ipxe
#
# http://10.0.100.10/boot/ipxe/undionly.kpxe
# -> /var/lib/pxe-boot/ipxe/undionly.kpxe
#
# http://10.0.100.10/boot/nixos/bzImage
# -> /var/lib/pxe-boot/nixos/bzImage
#
# http://10.0.100.10/boot/nixos/initrd
# -> /var/lib/pxe-boot/nixos/initrd
# File Sizes (Typical)
# - undionly.kpxe: ~100 KB
# - ipxe.efi: ~1 MB
# - boot.ipxe: ~10 KB (text script)
# - bzImage: ~10-50 MB (compressed kernel)
# - initrd: ~50-500 MB (depends on included tools/drivers)
# Permissions
# All files should be readable by the nginx user:
# chown -R nginx:nginx /var/lib/pxe-boot
# chmod -R 755 /var/lib/pxe-boot
# chmod 644 /var/lib/pxe-boot/ipxe/*
# chmod 644 /var/lib/pxe-boot/nixos/*
# Disk Space Requirements
# Minimum: 1 GB (for basic setup with one NixOS image)
# Recommended: 5-10 GB (for multiple profiles and versions)
# - Each NixOS profile: ~500 MB - 1 GB
# - Keep 2-3 versions for rollback: multiply by 2-3x
# - Add buffer for logs and temporary files
# Backup Recommendations
# - Boot scripts (ipxe/*.ipxe): Version control (git)
# - Bootloaders (ipxe/*.kpxe, *.efi): Can re-download, but keep backups
# - NixOS images: Can rebuild with the image builder (stage T032.S3), but keep at least 2 versions
# - Configuration files: Version control (git)

View file

@ -1,47 +0,0 @@
# MAC Address to Profile Mappings
#
# This file documents the MAC address mappings used in boot.ipxe
# Update this file when adding new nodes to your infrastructure
#
# Format: MAC_ADDRESS PROFILE HOSTNAME
#
# To generate MAC addresses for virtual machines (testing):
# - Use the 52:54:00:xx:xx:xx range (QEMU/KVM local)
# - Or use your hypervisor's MAC assignment
#
# For physical servers:
# - Use the actual MAC address of the primary network interface
# - Usually found on a label on the server or in BIOS/BMC
#
# Control Plane Nodes
52:54:00:12:34:56 control-plane control-plane-01
52:54:00:12:34:59 control-plane control-plane-02
52:54:00:12:34:5a control-plane control-plane-03
# Worker Nodes
52:54:00:12:34:57 worker worker-01
52:54:00:12:34:5b worker worker-02
52:54:00:12:34:5c worker worker-03
# All-in-One Nodes (Testing/Homelab)
52:54:00:12:34:58 all-in-one all-in-one-01
# Instructions for Adding New Nodes:
# 1. Add the MAC address, profile, and hostname to this file
# 2. Update boot.ipxe with the new MAC address mapping
# 3. Update dhcpd.conf with a host entry for fixed IP assignment (optional)
# 4. Restart the DHCP service: systemctl restart dhcpd
#
# Example:
# 52:54:00:12:34:5d worker worker-04
#
# Then add to boot.ipxe:
# iseq ${mac} 52:54:00:12:34:5d && set profile worker && set hostname worker-04 && goto boot ||
#
# And optionally add to dhcpd.conf:
# host worker-04 {
# hardware ethernet 52:54:00:12:34:5d;
# fixed-address 10.0.100.64;
# option host-name "worker-04";
# }

View file

@ -1,15 +0,0 @@
use chainfire_client::Client;

/// Smallest possible ChainFire client example: connect with builder
/// defaults, write one key, and read it back.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Build a client with default retry/backoff behaviour.
    let endpoint = "http://127.0.0.1:2379";
    let mut client = Client::builder(endpoint).build().await?;

    // Round-trip a single key/value pair.
    let key = "/example/key";
    client.put_str(key, "value").await?;
    if let Some(stored) = client.get_str(key).await? {
        println!("Got value: {}", stored);
    }
    Ok(())
}

View file

@ -1,27 +0,0 @@
// Minimal cleanup utility for deleting stale deployer entries from ChainFire.
// Usage: cargo run -p chainfire-client --example cleanup
use chainfire_client::Client;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Connect to the ChainFire API endpoint.
    let mut client = Client::connect("http://127.0.0.1:7000").await?;

    // Known-stale keys left behind by earlier deployer runs.
    let stale_keys: [&[u8]; 2] = [
        b"deployer/nodes/info/node-025456f1",
        b"deployer/nodes/config/025456f142ee424b88cd8aba5cf6c16a",
    ];

    for key in stale_keys {
        // `delete` reports whether the key actually existed.
        let outcome = if client.delete(key).await? {
            "removed"
        } else {
            "not found"
        };
        println!("delete {} -> {}", String::from_utf8_lossy(key), outcome);
    }
    Ok(())
}

View file

@ -103,6 +103,47 @@ impl Client {
Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64))) Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64)))
} }
/// Put a key-value pair only if the key's mod_revision matches.
///
/// This is a best-effort compare-and-set. The server may not return
/// a reliable success flag, so callers should treat this as "attempted".
pub async fn put_if_revision(
    &mut self,
    key: impl AsRef<[u8]>,
    value: impl AsRef<[u8]>,
    expected_mod_revision: u64,
) -> Result<()> {
    let key_owned = key.as_ref().to_vec();

    // Guard clause: the transaction only commits when the key's current
    // mod_revision equals the caller-supplied revision.
    let guard = Compare {
        result: compare::CompareResult::Equal as i32,
        target: compare::CompareTarget::Mod as i32,
        key: key_owned.clone(),
        target_union: Some(compare::TargetUnion::ModRevision(
            expected_mod_revision as i64,
        )),
    };

    // The write executed on the success branch of the transaction.
    let write = RequestOp {
        request: Some(request_op::Request::RequestPut(PutRequest {
            key: key_owned,
            value: value.as_ref().to_vec(),
            lease: 0,
            prev_kv: false,
        })),
    };

    let txn = TxnRequest {
        compare: vec![guard],
        success: vec![write],
        failure: vec![],
    };
    // Response body is intentionally discarded (see doc comment above);
    // transport/server errors still surface through `?`.
    let _ = self.kv.txn(txn).await?.into_inner();
    Ok(())
}
/// Get a value as string /// Get a value as string
pub async fn get_str(&mut self, key: &str) -> Result<Option<String>> { pub async fn get_str(&mut self, key: &str) -> Result<Option<String>> {
let value = self.get(key.as_bytes()).await?; let value = self.get(key.as_bytes()).await?;

View file

@ -26,13 +26,9 @@ impl KvServiceImpl {
} }
/// Create a response header /// Create a response header
fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader { async fn make_header(&self, revision: u64) -> crate::proto::ResponseHeader {
make_header( let term = self.raft.current_term().await;
self.cluster_id, make_header(self.cluster_id, self.raft.node_id(), revision, term)
self.raft.node_id(),
revision,
0, // TODO: get actual term
)
} }
} }
@ -74,7 +70,7 @@ impl Kv for KvServiceImpl {
let count = kvs.len() as i64; let count = kvs.len() as i64;
Ok(Response::new(RangeResponse { Ok(Response::new(RangeResponse {
header: Some(self.make_header(revision)), header: Some(self.make_header(revision).await),
kvs, kvs,
more: false, more: false,
count, count,
@ -107,7 +103,7 @@ impl Kv for KvServiceImpl {
} }
Ok(Response::new(PutResponse { Ok(Response::new(PutResponse {
header: Some(self.make_header(revision)), header: Some(self.make_header(revision).await),
prev_kv: None, // Not supported yet in custom RaftCore prev_kv: None, // Not supported yet in custom RaftCore
})) }))
} }
@ -165,7 +161,7 @@ impl Kv for KvServiceImpl {
} }
Ok(Response::new(DeleteRangeResponse { Ok(Response::new(DeleteRangeResponse {
header: Some(self.make_header(revision)), header: Some(self.make_header(revision).await),
deleted: deleted_count, deleted: deleted_count,
prev_kvs: vec![], // Not supported yet prev_kvs: vec![], // Not supported yet
})) }))
@ -234,7 +230,7 @@ impl Kv for KvServiceImpl {
warn!("Transaction response details not yet supported in custom Raft implementation"); warn!("Transaction response details not yet supported in custom Raft implementation");
Ok(Response::new(TxnResponse { Ok(Response::new(TxnResponse {
header: Some(self.make_header(revision)), header: Some(self.make_header(revision).await),
succeeded: true, // Assume success if no error succeeded: true, // Assume success if no error
responses: vec![], // Not supported yet responses: vec![], // Not supported yet
})) }))

View file

@ -1,52 +0,0 @@
use std::time::Duration;
use chainfire_core::ClusterBuilder;
use chainfire_types::{node::NodeRole, RaftRole};
use tokio::time::sleep;
#[tokio::test]
async fn test_single_node_bootstrap() {
    let _ = tracing_subscriber::fmt::try_init();

    // Build a one-node cluster that bootstraps itself as a voting control-plane node.
    let cluster = ClusterBuilder::new(1)
        .name("node-1")
        .memory_storage()
        .gossip_addr("127.0.0.1:0".parse().unwrap())
        .raft_addr("127.0.0.1:0".parse().unwrap())
        .role(NodeRole::ControlPlane)
        .raft_role(RaftRole::Voter)
        .bootstrap(true)
        .build()
        .await
        .expect("Failed to build cluster");
    let handle = cluster.handle();

    // Drive the cluster from a background task.
    tokio::spawn(async move {
        cluster.run().await.unwrap();
    });

    // Poll for up to ~5s for the node to elect itself leader.
    let mut became_leader = false;
    for _ in 0..10 {
        if handle.is_leader() {
            became_leader = true;
            break;
        }
        sleep(Duration::from_millis(500)).await;
    }
    assert!(became_leader, "Node 1 should become leader in bootstrap mode");
    assert_eq!(handle.leader(), Some(1));

    // KV round-trip through the elected leader.
    let kv = handle.kv();
    kv.put("test-key", b"test-value").await.expect("Put failed");
    let stored = kv.get("test-key").await.expect("Get failed");
    assert_eq!(stored, Some(b"test-value".to_vec()));

    handle.shutdown();
}

View file

@ -1,613 +0,0 @@
//! Integration tests for Leader Election (P1) and Log Replication (P2)
//!
//! Tests cover:
//! - Single-node auto-election
//! - 3-node majority election
//! - Role transitions
//! - Term management
//! - Heartbeat mechanism
//! - Log replication
//! - Leader failure recovery
#![cfg(all(test, feature = "custom-raft"))]
use std::sync::Arc;
use std::time::Duration;
use tokio::time;
use tokio::sync::mpsc;
use chainfire_raft::core::{
RaftCore, RaftConfig, RaftRole, NodeId,
};
use chainfire_raft::network::custom_test_client::{InMemoryRpcClient, RpcMessage};
use chainfire_storage::{LogStorage, StateMachine, RocksStore};
/// Build a single initialized Raft node backed by a throwaway RocksDB directory.
///
/// Returns the node together with its `TempDir` so the storage stays alive for
/// the duration of the test.
async fn create_test_node(node_id: NodeId, peers: Vec<NodeId>) -> (Arc<RaftCore>, tempfile::TempDir) {
    let dir = tempfile::TempDir::new().unwrap();
    let store = RocksStore::new(dir.path()).unwrap();

    // Log storage and state machine share the same RocksDB handle.
    let log = Arc::new(LogStorage::new(store.clone()));
    let sm = Arc::new(StateMachine::new(store).unwrap());
    let rpc = Arc::new(InMemoryRpcClient::new());

    // Fast timings so elections settle quickly in tests.
    let timings = RaftConfig {
        election_timeout_min: 150,
        election_timeout_max: 300,
        heartbeat_interval: 50,
    };

    let node = Arc::new(RaftCore::new(node_id, peers, log, sm, rpc, timings));
    node.initialize().await.unwrap();
    (node, dir)
}
/// Helper to create a 3-node cluster with RPC wiring.
///
/// Builds three `RaftCore` nodes (ids 1..=3) sharing one `InMemoryRpcClient`,
/// then spawns one background task per node that pumps that node's incoming
/// RPC messages into its vote/append handlers. Returns the nodes, their
/// storage `TempDir`s (must stay alive for the test), and the shared network.
async fn create_3node_cluster() -> (
    Vec<Arc<RaftCore>>,
    Vec<tempfile::TempDir>,
    Arc<InMemoryRpcClient>,
) {
    let network = Arc::new(InMemoryRpcClient::new());
    let mut nodes = Vec::new();
    let mut temp_dirs = Vec::new();
    // Create 3 nodes, each with the other two listed as peers.
    for node_id in 1..=3 {
        let peers: Vec<NodeId> = (1..=3).filter(|&id| id != node_id).collect();
        let temp_dir = tempfile::TempDir::new().unwrap();
        let rocks = RocksStore::new(temp_dir.path()).unwrap();
        let storage = Arc::new(LogStorage::new(rocks.clone()));
        let state_machine = Arc::new(StateMachine::new(rocks).unwrap());
        let config = RaftConfig {
            election_timeout_min: 150, // 150ms - matches single-node test
            election_timeout_max: 300, // 300ms
            heartbeat_interval: 50, // 50ms - matches single-node test
        };
        let node = Arc::new(RaftCore::new(
            node_id,
            peers,
            storage,
            state_machine,
            Arc::clone(&network) as Arc<dyn chainfire_raft::network::RaftRpcClient>,
            config,
        ));
        node.initialize().await.unwrap();
        nodes.push(node);
        temp_dirs.push(temp_dir);
    }
    // Wire up RPC channels for each node
    for node in &nodes {
        let node_id = node.node_id();
        let (tx, mut rx) = mpsc::unbounded_channel::<RpcMessage>();
        network.register(node_id, tx).await;
        // Spawn handler for this node's RPC messages; it runs until the
        // channel is closed (i.e. the network drops the sender).
        let node_clone = Arc::clone(node);
        tokio::spawn(async move {
            eprintln!("[RPC Handler {}] Started", node_clone.node_id());
            while let Some(msg) = rx.recv().await {
                match msg {
                    RpcMessage::Vote(req, resp_tx) => {
                        eprintln!("[RPC Handler {}] Processing Vote from {}",
                            node_clone.node_id(), req.candidate_id);
                        node_clone.request_vote_rpc(req, resp_tx).await;
                    }
                    RpcMessage::AppendEntries(req, resp_tx) => {
                        eprintln!("[RPC Handler {}] Processing AppendEntries from {} term={}",
                            node_clone.node_id(), req.leader_id, req.term);
                        node_clone.append_entries_rpc(req, resp_tx).await;
                    }
                }
            }
            eprintln!("[RPC Handler {}] Stopped (channel closed)", node_clone.node_id());
        });
    }
    // Give all RPC handler tasks time to start
    tokio::time::sleep(tokio::time::Duration::from_millis(10)).await;
    (nodes, temp_dirs, network)
}
// ============================================================================
// Test Cases
// ============================================================================
#[tokio::test]
async fn test_node_creation_and_initialization() {
    // A freshly initialized node must come up as a term-0 follower
    // with the id it was constructed with.
    let (node, _dir) = create_test_node(1, vec![2, 3]).await;

    assert_eq!(node.node_id(), 1);
    assert_eq!(node.role().await, RaftRole::Follower);
    assert_eq!(node.current_term().await, 0);
}
/// Sanity-check that the three Raft roles are pairwise distinct.
///
/// These assertions are purely synchronous, so no async runtime is needed:
/// a plain `#[test]` avoids spinning up a tokio runtime for nothing.
#[test]
fn test_role_transitions() {
    assert_ne!(RaftRole::Follower, RaftRole::Candidate);
    assert_ne!(RaftRole::Candidate, RaftRole::Leader);
    assert_ne!(RaftRole::Leader, RaftRole::Follower);
}
#[tokio::test]
async fn test_term_persistence() {
    // Reopening the same storage path must yield the same persisted term.
    let temp_dir = tempfile::TempDir::new().unwrap();
    let path = temp_dir.path().to_str().unwrap().to_string();

    // First pass creates the node; second pass reopens the same RocksDB.
    // Each node is dropped at the end of its iteration, releasing the DB lock
    // before the next open — same lifetimes as two explicit scopes.
    for _ in 0..2 {
        let rocks = RocksStore::new(&path).unwrap();
        let storage = Arc::new(LogStorage::new(rocks.clone()));
        let state_machine = Arc::new(StateMachine::new(rocks).unwrap());
        let network = Arc::new(InMemoryRpcClient::new());
        let node = Arc::new(RaftCore::new(
            1,
            vec![2, 3],
            storage,
            state_machine,
            network,
            RaftConfig::default(),
        ));
        node.initialize().await.unwrap();
        // Term is 0 on first creation and still 0 after reload from storage.
        assert_eq!(node.current_term().await, 0);
    }
}
/// Default Raft timings must satisfy the basic ordering:
/// 0 < heartbeat_interval < election_timeout_min < election_timeout_max.
///
/// Purely synchronous assertions, so a plain `#[test]` is used instead of
/// `#[tokio::test]` — no async runtime is required.
#[test]
fn test_config_defaults() {
    let config = RaftConfig::default();
    assert!(config.election_timeout_min > 0);
    assert!(config.election_timeout_max > config.election_timeout_min);
    assert!(config.heartbeat_interval > 0);
    assert!(config.heartbeat_interval < config.election_timeout_min);
}
// ============================================================================
// P2: Log Replication Integration Tests
// ============================================================================
/// Test 1: 3-node cluster formation.
///
/// Starts three nodes, waits for an election, and asserts that exactly one
/// leader and two followers emerge, then that heartbeats keep the roles
/// stable (no re-election) for a further 400ms.
#[tokio::test]
async fn test_3node_cluster_formation() {
    // Test 1: 3-Node Cluster Formation Test
    // - 3 nodes start → Leader elected
    // - All followers receive heartbeat
    // - No election timeout occurs
    let (nodes, _temp_dirs, _network) = create_3node_cluster().await;
    // Start event loops for all nodes
    let mut handles = Vec::new();
    for node in &nodes {
        let node_clone = Arc::clone(node);
        let handle = tokio::spawn(async move {
            let _ = node_clone.run().await;
        });
        handles.push(handle);
    }
    // Wait for leader election (should happen within ~500ms)
    time::sleep(Duration::from_millis(500)).await;
    // Check that exactly one leader was elected
    let mut leader_count = 0;
    let mut follower_count = 0;
    let mut leader_id = None;
    for node in &nodes {
        match node.role().await {
            RaftRole::Leader => {
                leader_count += 1;
                leader_id = Some(node.node_id());
            }
            RaftRole::Follower => {
                follower_count += 1;
            }
            RaftRole::Candidate => {
                // Should not have candidates after election
                panic!("Node {} is still candidate after election", node.node_id());
            }
        }
    }
    assert_eq!(leader_count, 1, "Expected exactly one leader");
    assert_eq!(follower_count, 2, "Expected exactly two followers");
    assert!(leader_id.is_some(), "Leader should be identified");
    println!("✓ Leader elected: node {}", leader_id.unwrap());
    // Wait a bit more to ensure heartbeats prevent election timeout
    // Heartbeat interval is 50ms, election timeout is 150-300ms
    // So after 400ms, no new election should occur
    time::sleep(Duration::from_millis(400)).await;
    // Verify leader is still the same
    for node in &nodes {
        if node.node_id() == leader_id.unwrap() {
            assert_eq!(node.role().await, RaftRole::Leader, "Leader should remain leader");
        } else {
            assert_eq!(
                node.role().await,
                RaftRole::Follower,
                "Followers should remain followers due to heartbeats"
            );
        }
    }
    println!("✓ Heartbeats prevent election timeout");
}
/// Placeholder: end-to-end log replication test (leader append → follower
/// replication → commit_index sync). Ignored until the client write API lands.
#[tokio::test]
#[ignore] // Requires client write API implementation
async fn test_log_replication() {
    // Test 2: Log Replication Test
    // - Leader adds entries
    // - Replicated to all followers
    // - commit_index synchronized
    // TODO: Implement once client write API is ready
    // This requires handle_client_write to be fully implemented
}
/// Placeholder: leader-failure recovery test (kill leader → new election →
/// log consistency preserved). Ignored until graceful shutdown exists.
#[tokio::test]
#[ignore] // Requires graceful node shutdown
async fn test_leader_failure_recovery() {
    // Test 3: Leader Failure Test
    // - Leader stops → New leader elected
    // - Log consistency maintained
    // TODO: Implement once we have graceful shutdown mechanism
    // Currently, aborting the event loop doesn't cleanly stop the node
}
// ============================================================================
// Deferred complex tests
// ============================================================================
/// Placeholder: split-vote recovery. Needs fine-grained control over election
/// timing to force the split deterministically.
#[tokio::test]
#[ignore] // Requires full cluster setup
async fn test_split_vote_recovery() {
    // Test that cluster recovers from split vote
    // Deferred: Requires complex timing control
}
/// Placeholder: votes must survive a node restart. Needs a proper
/// shutdown/startup sequence before it can be implemented.
#[tokio::test]
#[ignore] // Requires node restart mechanism
async fn test_vote_persistence_across_restart() {
    // Test that votes persist across node restarts
    // Deferred: Requires proper shutdown/startup sequencing
}
// ============================================================================
// P3: Commitment & State Machine Integration Tests
// ============================================================================
/// P3: full write path — client write on the leader must be replicated,
/// committed, and applied to every node's state machine.
///
/// Timing-sensitive: sleeps are sized to the cluster's election timeout and
/// heartbeat interval (see inline comments).
#[tokio::test]
async fn test_write_replicate_commit() {
    // Test: Client write on leader → replication → commit → state machine apply
    // Verifies the complete write→replicate→commit→apply flow
    use chainfire_types::command::RaftCommand;
    let (nodes, _temp_dirs, _network) = create_3node_cluster().await;
    // Start event loops for all nodes
    let mut handles = Vec::new();
    for node in &nodes {
        let node_clone = Arc::clone(node);
        let handle = tokio::spawn(async move {
            let _ = node_clone.run().await;
        });
        handles.push(handle);
    }
    // Wait for leader election (election timeout is 2-4s)
    time::sleep(Duration::from_millis(5000)).await;
    // Find the leader
    let mut leader = None;
    for node in &nodes {
        if matches!(node.role().await, RaftRole::Leader) {
            leader = Some(node);
            break;
        }
    }
    let leader = leader.expect("Leader should be elected");
    println!("✓ Leader elected: node {}", leader.node_id());
    // Submit a write command to the leader
    let cmd = RaftCommand::Put {
        key: b"test_key_1".to_vec(),
        value: b"test_value_1".to_vec(),
        lease_id: None,
        prev_kv: false,
    };
    leader
        .client_write(cmd)
        .await
        .expect("Client write should succeed");
    println!("✓ Client write submitted to leader");
    // Wait for replication and commit (heartbeat + replication + commit)
    // Heartbeat interval is 50ms, need multiple rounds:
    // 1. First heartbeat sends entries
    // 2. Followers ack, leader updates match_index and commit_index
    // 3. Second heartbeat propagates new leader_commit to followers
    // 4. Followers update their commit_index and apply entries
    // Give extra time to avoid re-election issues
    time::sleep(Duration::from_millis(1500)).await;
    // Debug: Check all nodes' roles and states
    println!("\nDEBUG: All nodes after write:");
    for node in &nodes {
        println!(" Node {} role={:?} term={} commit_index={} last_applied={}",
            node.node_id(), node.role().await, node.current_term().await,
            node.commit_index().await, node.last_applied().await);
    }
    println!();
    // Verify that the value is committed and applied on all nodes
    for node in &nodes {
        let commit_index = node.commit_index().await;
        let last_applied = node.last_applied().await;
        assert!(
            commit_index >= 1,
            "Node {} should have commit_index >= 1, got {}",
            node.node_id(),
            commit_index
        );
        assert!(
            last_applied >= 1,
            "Node {} should have last_applied >= 1, got {}",
            node.node_id(),
            last_applied
        );
        // Verify the value exists in the state machine
        let state_machine = node.state_machine();
        let result = state_machine.kv().get(b"test_key_1").expect("Get should succeed");
        assert!(
            result.is_some(),
            "Node {} should have test_key_1 in state machine",
            node.node_id()
        );
        let entry = result.unwrap();
        assert_eq!(
            entry.value,
            b"test_value_1",
            "Node {} has wrong value for test_key_1",
            node.node_id()
        );
        println!(
            "✓ Node {} has test_key_1=test_value_1 (commit_index={}, last_applied={})",
            node.node_id(),
            commit_index,
            last_applied
        );
    }
    println!("✓ All nodes have committed and applied the write");
}
/// P3: ordering — five sequential writes through the leader must end up
/// committed and applied in the same order on every node.
#[tokio::test]
async fn test_commit_consistency() {
    // Test: Multiple writes preserve order across all nodes
    // Verifies that the commit mechanism maintains consistency
    use chainfire_types::command::RaftCommand;
    let (nodes, _temp_dirs, _network) = create_3node_cluster().await;
    // Start event loops
    let mut handles = Vec::new();
    for node in &nodes {
        let node_clone = Arc::clone(node);
        let handle = tokio::spawn(async move {
            let _ = node_clone.run().await;
        });
        handles.push(handle);
    }
    // Wait for leader election (election timeout is 2-4s)
    time::sleep(Duration::from_millis(5000)).await;
    // Find the leader
    let mut leader = None;
    for node in &nodes {
        if matches!(node.role().await, RaftRole::Leader) {
            leader = Some(node);
            break;
        }
    }
    let leader = leader.expect("Leader should be elected");
    println!("✓ Leader elected: node {}", leader.node_id());
    // Submit multiple writes in sequence
    for i in 1..=5 {
        let cmd = RaftCommand::Put {
            key: format!("key_{}", i).into_bytes(),
            value: format!("value_{}", i).into_bytes(),
            lease_id: None,
            prev_kv: false,
        };
        leader
            .client_write(cmd)
            .await
            .expect("Client write should succeed");
    }
    println!("✓ Submitted 5 writes to leader");
    // Wait for all writes to commit and apply
    time::sleep(Duration::from_millis(500)).await;
    // Verify all nodes have all 5 keys in correct order
    for node in &nodes {
        let commit_index = node.commit_index().await;
        let last_applied = node.last_applied().await;
        assert!(
            commit_index >= 5,
            "Node {} should have commit_index >= 5, got {}",
            node.node_id(),
            commit_index
        );
        assert!(
            last_applied >= 5,
            "Node {} should have last_applied >= 5, got {}",
            node.node_id(),
            last_applied
        );
        let state_machine = node.state_machine();
        for i in 1..=5 {
            let key = format!("key_{}", i).into_bytes();
            let expected_value = format!("value_{}", i).into_bytes();
            let result = state_machine.kv().get(&key).expect("Get should succeed");
            assert!(
                result.is_some(),
                "Node {} missing key_{}",
                node.node_id(),
                i
            );
            let entry = result.unwrap();
            assert_eq!(
                entry.value, expected_value,
                "Node {} has wrong value for key_{}",
                node.node_id(), i
            );
        }
        println!(
            "✓ Node {} has all 5 keys in correct order (commit_index={}, last_applied={})",
            node.node_id(),
            commit_index,
            last_applied
        );
    }
    println!("✓ All nodes maintain consistent order");
}
/// Raft safety: a follower must reject client writes with `NotLeader`.
#[tokio::test]
async fn test_leader_only_write() {
    // Test: Follower should reject client writes
    // Verifies that only the leader can accept writes (Raft safety)
    use chainfire_types::command::RaftCommand;
    use chainfire_raft::core::RaftError;
    let (nodes, _temp_dirs, _network) = create_3node_cluster().await;
    // Start event loops
    let mut handles = Vec::new();
    for node in &nodes {
        let node_clone = Arc::clone(node);
        let handle = tokio::spawn(async move {
            let _ = node_clone.run().await;
        });
        handles.push(handle);
    }
    // Wait for leader election (election timeout is 2-4s)
    time::sleep(Duration::from_millis(5000)).await;
    // Find a follower
    let mut follower = None;
    for node in &nodes {
        if matches!(node.role().await, RaftRole::Follower) {
            follower = Some(node);
            break;
        }
    }
    let follower = follower.expect("Follower should exist");
    println!("✓ Found follower: node {}", follower.node_id());
    // Try to write to the follower
    let cmd = RaftCommand::Put {
        key: b"follower_write".to_vec(),
        value: b"should_fail".to_vec(),
        lease_id: None,
        prev_kv: false,
    };
    let result = follower.client_write(cmd).await;
    // Should return NotLeader error
    assert!(
        result.is_err(),
        "Follower write should fail with NotLeader error"
    );
    if let Err(RaftError::NotLeader { .. }) = result {
        println!("✓ Follower correctly rejected write with NotLeader error");
    } else {
        panic!(
            "Expected NotLeader error, got: {:?}",
            result.err().unwrap()
        );
    }
}

View file

@ -1,274 +0,0 @@
//! Property-based tests for `chainfire-raft` using an in-process simulated cluster.
//!
//! These tests aim to catch timing/partition edge cases with high reproducibility.
#![cfg(all(test, feature = "custom-raft"))]
use std::sync::Arc;
use std::time::Duration;
use proptest::prelude::*;
use tokio::sync::mpsc;
use tokio::time;
use chainfire_raft::core::{RaftConfig, RaftCore};
use chainfire_raft::network::test_client::{RpcMessage, SimulatedNetwork};
use chainfire_raft::storage::{EntryPayload, LogEntry, LogStorage, StateMachine};
use chainfire_types::command::RaftCommand;
/// One randomized action applied to the simulated 3-node cluster.
#[derive(Debug, Clone)]
enum Op {
    /// Advance simulated time by this many milliseconds.
    Tick(u64),
    /// Sever the link between the two (distinct) node ids.
    Disconnect(u64, u64),
    /// Restore the link between the two (distinct) node ids.
    Reconnect(u64, u64),
    /// Delay messages between the two nodes by the given milliseconds
    /// (the test driver applies the delay in both directions).
    Delay(u64, u64, u64),
    /// Reset any configured link behavior between the two nodes
    /// (applied in both directions by the driver).
    ClearLink(u64, u64),
    /// Submit a client Put of a one-byte key/value to the given node.
    Write(u64, u8, u8),
}
/// Proptest strategy yielding a valid node id in the 3-node cluster (1..=3).
fn node_id() -> impl Strategy<Value = u64> {
    1u64..=3u64
}
/// Proptest strategy yielding an ordered pair of two different node ids.
fn distinct_pair() -> impl Strategy<Value = (u64, u64)> {
    (node_id(), node_id()).prop_filter("distinct nodes", |pair| pair.0 != pair.1)
}
/// Proptest strategy producing one random cluster operation.
///
/// NOTE: arm order matters for proptest's value generation/shrinking —
/// do not reorder without re-validating failing-case reproducibility.
fn op_strategy() -> impl Strategy<Value = Op> {
    prop_oneof![
        // Advance simulated time by up to 300ms.
        (0_u64..=300).prop_map(Op::Tick),
        distinct_pair().prop_map(|(a, b)| Op::Disconnect(a, b)),
        distinct_pair().prop_map(|(a, b)| Op::Reconnect(a, b)),
        // Link delay of up to 50ms between a distinct pair.
        (distinct_pair(), 0_u64..=50).prop_map(|((a, b), d)| Op::Delay(a, b, d)),
        distinct_pair().prop_map(|(a, b)| Op::ClearLink(a, b)),
        // Client writes: pick node + small key/value.
        (node_id(), any::<u8>(), any::<u8>()).prop_map(|(n, k, v)| Op::Write(n, k, v)),
    ]
}
/// Proptest strategy producing a short randomized operation sequence (0..40 ops).
fn ops_strategy() -> impl Strategy<Value = Vec<Op>> {
    prop::collection::vec(op_strategy(), 0..40)
}
/// Advance paused tokio time by `total_ms`, in small 10ms steps.
///
/// Stepping (rather than one large jump) avoids "simultaneous" timer firings
/// starving message handling in the simulated cluster; a `yield_now` after
/// each step lets pending tasks run.
async fn advance_ms(total_ms: u64) {
    const STEP_MS: u64 = 10;
    let mut left = total_ms;
    loop {
        if left == 0 {
            break;
        }
        let step = left.min(STEP_MS);
        time::advance(Duration::from_millis(step)).await;
        tokio::task::yield_now().await;
        left -= step;
    }
}
/// Build a 3-node in-memory cluster wired through a `SimulatedNetwork`.
///
/// Each node gets in-memory log/state storage, deterministic per-node seeds
/// for reproducibility, and a spawned task pumping its incoming RPC messages
/// into the vote/append handlers. Returns the nodes and the shared network
/// handle (used by tests to inject partitions/delays).
async fn create_3node_cluster() -> (Vec<Arc<RaftCore>>, Arc<SimulatedNetwork>) {
    let network = Arc::new(SimulatedNetwork::new());
    let mut nodes = Vec::new();
    for node_id in 1..=3_u64 {
        let peers: Vec<u64> = (1..=3_u64).filter(|&id| id != node_id).collect();
        let storage = Arc::new(LogStorage::new_in_memory());
        let state_machine = Arc::new(StateMachine::new_in_memory());
        let config = RaftConfig {
            election_timeout_min: 150,
            election_timeout_max: 300,
            heartbeat_interval: 50,
            // Deterministic per-node seed for reproducibility.
            deterministic_seed: Some(node_id),
        };
        let node = Arc::new(RaftCore::new(
            node_id,
            peers,
            storage,
            state_machine,
            Arc::new(network.client(node_id)) as Arc<dyn chainfire_raft::network::RaftRpcClient>,
            config,
        ));
        node.initialize().await.unwrap();
        nodes.push(node);
    }
    // Wire up RPC handlers.
    for node in &nodes {
        let node_id = node.node_id();
        let (tx, mut rx) = mpsc::unbounded_channel::<RpcMessage>();
        network.register(node_id, tx).await;
        let node_clone: Arc<RaftCore> = Arc::clone(node);
        tokio::spawn(async move {
            while let Some(msg) = rx.recv().await {
                match msg {
                    RpcMessage::Vote(req, resp_tx) => {
                        node_clone.request_vote_rpc(req, resp_tx).await;
                    }
                    RpcMessage::AppendEntries(req, resp_tx) => {
                        node_clone.append_entries_rpc(req, resp_tx).await;
                    }
                }
            }
        });
    }
    (nodes, network)
}
/// Stable byte fingerprint of a log-entry payload, used for cross-node
/// equality checks that must distinguish payload variants.
fn payload_fingerprint(payload: &EntryPayload<Vec<u8>>) -> Vec<u8> {
    // Serialization failure degrades to an empty fingerprint, matching the
    // original best-effort behavior.
    match bincode::serialize(payload) {
        Ok(bytes) => bytes,
        Err(_) => Vec::new(),
    }
}
/// Assert core Raft invariants across all nodes:
/// per-node `last_applied <= commit_index <= last_log_index`, plus the Log
/// Matching Property between every pair of nodes.
async fn assert_raft_invariants(nodes: &[Arc<RaftCore>]) {
    // Per-node monotonic invariants.
    for node in nodes {
        let commit = node.commit_index().await;
        let last_applied = node.last_applied().await;
        let st = node.storage().get_log_state().expect("log state");
        let last_log_index = st.last_log_id.map(|id| id.index).unwrap_or(0);
        assert!(
            last_applied <= commit,
            "node {}: last_applied={} > commit_index={}",
            node.node_id(),
            last_applied,
            commit
        );
        assert!(
            commit <= last_log_index,
            "node {}: commit_index={} > last_log_index={}",
            node.node_id(),
            commit,
            last_log_index
        );
    }
    // Log Matching Property:
    // If two logs contain an entry with the same index and term, then the logs are identical
    // for all entries up through that index.
    // Snapshot each node's log as index -> (term, payload fingerprint).
    let mut node_logs: Vec<std::collections::BTreeMap<u64, (u64, Vec<u8>)>> = Vec::new();
    for node in nodes {
        let st = node.storage().get_log_state().expect("log state");
        let last = st.last_log_id.map(|id| id.index).unwrap_or(0);
        let entries: Vec<LogEntry<Vec<u8>>> = if last == 0 {
            vec![]
        } else {
            node.storage()
                .get_log_entries(1..=last)
                .expect("log entries")
        };
        let mut m = std::collections::BTreeMap::new();
        for e in entries {
            m.insert(e.log_id.index, (e.log_id.term, payload_fingerprint(&e.payload)));
        }
        node_logs.push(m);
    }
    // Pairwise check: matching (index, term) implies identical prefixes.
    for a in 0..nodes.len() {
        for b in (a + 1)..nodes.len() {
            let la = &node_logs[a];
            let lb = &node_logs[b];
            for (idx, (term_a, payload_a)) in la.iter() {
                if let Some((term_b, payload_b)) = lb.get(idx) {
                    if term_a == term_b {
                        assert_eq!(
                            payload_a, payload_b,
                            "log mismatch at idx={} term={} (nodes {} vs {})",
                            idx,
                            term_a,
                            nodes[a].node_id(),
                            nodes[b].node_id()
                        );
                        // The whole prefix up to idx must agree as well.
                        for j in 1..=*idx {
                            assert_eq!(
                                la.get(&j),
                                lb.get(&j),
                                "log matching violated at idx={} (prefix {} differs) nodes {} vs {}",
                                idx,
                                j,
                                nodes[a].node_id(),
                                nodes[b].node_id()
                            );
                        }
                    }
                }
            }
        }
    }
}
proptest! {
    #![proptest_config(ProptestConfig {
        cases: 32,
        .. ProptestConfig::default()
    })]
    /// Property: after any randomized sequence of ticks, partitions, delays,
    /// and writes (under paused tokio time), the Raft invariants checked by
    /// `assert_raft_invariants` still hold on all three nodes.
    #[test]
    fn prop_raft_log_matching_holds(ops in ops_strategy()) {
        // Single-threaded runtime with paused time gives deterministic scheduling.
        let rt = tokio::runtime::Builder::new_current_thread()
            .enable_time()
            .build()
            .unwrap();
        rt.block_on(async move {
            tokio::time::pause();
            let (nodes, network) = create_3node_cluster().await;
            // Start event loops.
            let mut handles = Vec::new();
            for node in &nodes {
                let node_clone = Arc::clone(node);
                handles.push(tokio::spawn(async move {
                    let _ = node_clone.run().await;
                }));
            }
            tokio::task::yield_now().await;
            // Drive a randomized sequence of operations.
            for op in ops {
                match op {
                    Op::Tick(ms) => advance_ms(ms).await,
                    Op::Disconnect(a, b) => network.disconnect(a, b).await,
                    Op::Reconnect(a, b) => network.reconnect(a, b).await,
                    Op::Delay(a, b, d) => {
                        use chainfire_raft::network::test_client::LinkBehavior;
                        // Apply the delay symmetrically on both directions of the link.
                        network.set_link(a, b, LinkBehavior::Delay(Duration::from_millis(d))).await;
                        network.set_link(b, a, LinkBehavior::Delay(Duration::from_millis(d))).await;
                    }
                    Op::ClearLink(a, b) => {
                        network.clear_link(a, b).await;
                        network.clear_link(b, a).await;
                    }
                    Op::Write(n, k, v) => {
                        // Writes to non-leaders may fail; errors are intentionally ignored.
                        let node = nodes.iter().find(|x| x.node_id() == n).unwrap();
                        let _ = node.client_write(RaftCommand::Put {
                            key: vec![k],
                            value: vec![v],
                            lease_id: None,
                            prev_kv: false,
                        }).await;
                    }
                }
            }
            // Let the system settle a bit.
            advance_ms(500).await;
            assert_raft_invariants(&nodes).await;
            // Best-effort cleanup.
            for h in handles {
                h.abort();
            }
        });
    }
}

View file

@ -1,658 +0,0 @@
//! Chainfire 3-Node Cluster Integration Test
//!
//! Verifies HA behavior: leader election, state replication, and node recovery.
use chainfire_client::Client;
use chainfire_server::{
config::{ClusterConfig, NetworkConfig, NodeConfig, RaftConfig, ServerConfig, StorageConfig},
server::Server,
};
use chainfire_types::RaftRole;
use std::net::SocketAddr;
use std::time::Duration;
use tokio::time::sleep;
/// Create a 3-node cluster configuration with join flow.
/// Node 1 bootstraps alone; nodes 2 & 3 join via the member_add API.
///
/// Port layout per node: API = base, Raft = base + 1, gossip = base + 2,
/// HTTP = 28080 + node_id. Returns the config plus the `TempDir` backing
/// its data directory (must outlive the server).
fn cluster_config_with_join(node_id: u64) -> (ServerConfig, tempfile::TempDir) {
    let base_port = match node_id {
        1 => 12379,
        2 => 22379,
        3 => 32379,
        _ => panic!("Invalid node_id"),
    };
    let api_addr: SocketAddr = format!("127.0.0.1:{}", base_port).parse().unwrap();
    let raft_addr: SocketAddr = format!("127.0.0.1:{}", base_port + 1).parse().unwrap();
    let gossip_addr: SocketAddr = format!("127.0.0.1:{}", base_port + 2).parse().unwrap();
    let temp_dir = tempfile::tempdir().unwrap();
    let config = ServerConfig {
        node: NodeConfig {
            id: node_id,
            name: format!("test-node-{}", node_id),
            role: "control_plane".to_string(),
        },
        cluster: ClusterConfig {
            id: 1,
            bootstrap: node_id == 1, // Only node 1 bootstraps
            initial_members: vec![], // Node 1 starts alone, others join via API
        },
        network: NetworkConfig {
            api_addr,
            http_addr: format!("127.0.0.1:{}", 28080 + node_id).parse().unwrap(),
            raft_addr,
            gossip_addr,
            tls: None,
        },
        storage: StorageConfig {
            data_dir: temp_dir.path().to_path_buf(),
        },
        // Node 1 is Voter (bootstrap), nodes 2 & 3 are Learner (join via member_add)
        raft: RaftConfig {
            role: if node_id == 1 { RaftRole::Voter } else { RaftRole::Learner },
        },
    };
    (config, temp_dir)
}
/// Alias for backwards compatibility (old tests use this).
/// Delegates to `cluster_config_with_join`; keep until callers are migrated.
fn cluster_config(node_id: u64) -> (ServerConfig, tempfile::TempDir) {
    cluster_config_with_join(node_id)
}
/// Create a 3-node cluster configuration with simultaneous bootstrap.
/// All nodes start together with the same initial_members (avoids add_learner bug).
///
/// Unlike `cluster_config_with_join`, every node is a Voter from the start and
/// all three share the same Raft member list; only node 1 sets `bootstrap`.
fn cluster_config_simultaneous_bootstrap(node_id: u64) -> (ServerConfig, tempfile::TempDir) {
    use chainfire_server::config::MemberConfig;
    let base_port = match node_id {
        1 => 12379,
        2 => 22379,
        3 => 32379,
        _ => panic!("Invalid node_id"),
    };
    let api_addr: SocketAddr = format!("127.0.0.1:{}", base_port).parse().unwrap();
    let raft_addr: SocketAddr = format!("127.0.0.1:{}", base_port + 1).parse().unwrap();
    let gossip_addr: SocketAddr = format!("127.0.0.1:{}", base_port + 2).parse().unwrap();
    let temp_dir = tempfile::tempdir().unwrap();
    // All nodes have the same initial_members list (Raft ports = base + 1).
    let initial_members = vec![
        MemberConfig { id: 1, raft_addr: "127.0.0.1:12380".to_string() },
        MemberConfig { id: 2, raft_addr: "127.0.0.1:22380".to_string() },
        MemberConfig { id: 3, raft_addr: "127.0.0.1:32380".to_string() },
    ];
    let config = ServerConfig {
        node: NodeConfig {
            id: node_id,
            name: format!("test-node-{}", node_id),
            role: "control_plane".to_string(),
        },
        cluster: ClusterConfig {
            id: 1,
            bootstrap: node_id == 1, // Only node 1 bootstraps, but with full member list
            initial_members: initial_members.clone(),
        },
        network: NetworkConfig {
            api_addr,
            http_addr: format!("127.0.0.1:{}", 28080 + node_id).parse().unwrap(),
            raft_addr,
            gossip_addr,
            tls: None,
        },
        storage: StorageConfig {
            data_dir: temp_dir.path().to_path_buf(),
        },
        raft: RaftConfig {
            role: RaftRole::Voter, // All nodes are voters from the start
        },
    };
    (config, temp_dir)
}
/// Create a single-node cluster configuration (for testing basic Raft functionality).
///
/// The node is hard-coded to id 1, bootstraps alone (empty `initial_members`),
/// and uses fixed local ports: API 12379, Raft 12380, gossip 12381,
/// HTTP 28081 (= 28080 + node id, matching `cluster_config_with_join`).
/// Returns the config plus the `TempDir` backing its data directory.
fn single_node_config() -> (ServerConfig, tempfile::TempDir) {
    let api_addr: SocketAddr = "127.0.0.1:12379".parse().unwrap();
    let raft_addr: SocketAddr = "127.0.0.1:12380".parse().unwrap();
    let gossip_addr: SocketAddr = "127.0.0.1:12381".parse().unwrap();
    let temp_dir = tempfile::tempdir().unwrap();
    let config = ServerConfig {
        node: NodeConfig {
            id: 1,
            name: "test-node-1".to_string(),
            role: "control_plane".to_string(),
        },
        cluster: ClusterConfig {
            id: 1,
            bootstrap: true, // Single-node bootstrap
            initial_members: vec![], // Empty = single node
        },
        network: NetworkConfig {
            api_addr,
            // Fix: the original referenced `node_id`, which is not in scope in
            // this parameterless helper (compile error). The node id is fixed
            // at 1, so the HTTP port is the literal 28080 + 1 = 28081.
            http_addr: "127.0.0.1:28081".parse().unwrap(),
            raft_addr,
            gossip_addr,
            tls: None,
        },
        storage: StorageConfig {
            data_dir: temp_dir.path().to_path_buf(),
        },
        raft: RaftConfig::default(),
    };
    (config, temp_dir)
}
/// Single-node cluster: the lone node must elect itself leader and serve
/// basic KV put/get. Ignored by default (spawns a real server).
#[tokio::test]
#[ignore] // Run with: cargo test --test cluster_integration -- --ignored
async fn test_single_node_raft_leader_election() {
    println!("\n=== Test: Single-Node Raft Leader Election ===");
    // Start single node
    let (config, _temp) = single_node_config();
    let api_addr = config.network.api_addr;
    println!("Creating single-node cluster...");
    let server = Server::new(config).await.unwrap();
    let handle = tokio::spawn(async move { server.run().await });
    println!("Node started: {}", api_addr);
    // Wait for leader election
    println!("Waiting for leader election...");
    sleep(Duration::from_secs(2)).await;
    // Verify leader elected
    let mut client = Client::connect(format!("http://{}", api_addr))
        .await
        .expect("Failed to connect");
    let status = client.status().await.expect("Failed to get status");
    println!(
        "Node status: leader={}, term={}",
        status.leader, status.raft_term
    );
    assert_eq!(status.leader, 1, "Node 1 should be leader in single-node cluster");
    assert!(status.raft_term > 0, "Raft term should be > 0");
    // Test basic KV operations
    println!("Testing KV operations...");
    client.put("test-key", "test-value").await.unwrap();
    let value = client.get("test-key").await.unwrap();
    assert_eq!(value, Some(b"test-value".to_vec()));
    println!("✓ Single-node Raft working correctly");
    // Cleanup
    handle.abort();
}
/// Bring up a 3-node cluster by bootstrapping node 1 alone and joining
/// nodes 2 and 3 as voters through the `member_add` API, then check that
/// every node agrees on the same non-zero leader id.
#[tokio::test]
#[ignore] // Run with: cargo test --test cluster_integration -- --ignored
async fn test_3node_leader_election_with_join() {
    println!("\n=== Test: 3-Node Leader Election with Join Flow ===");

    // Node 1 bootstraps by itself and should win the initial election.
    let (cfg1, _temp1) = cluster_config_with_join(1);
    let api1 = cfg1.network.api_addr;
    let raft1 = cfg1.network.raft_addr;
    println!("Creating Node 1 (bootstrap)...");
    let node1 = Server::new(cfg1).await.unwrap();
    let task1 = tokio::spawn(async move { node1.run().await });
    println!("Node 1 started: API={}, Raft={}", api1, raft1);

    // Allow the single-node election to settle, then confirm leadership.
    sleep(Duration::from_secs(2)).await;
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .expect("Failed to connect to node 1");
    let status1 = client1.status().await.expect("Failed to get status");
    println!("Node 1 status: leader={}, term={}", status1.leader, status1.raft_term);
    assert_eq!(status1.leader, 1, "Node 1 should be leader");

    // Nodes 2 and 3 start without bootstrapping; they wait to be joined.
    let (cfg2, _temp2) = cluster_config_with_join(2);
    let api2 = cfg2.network.api_addr;
    let raft2 = cfg2.network.raft_addr;
    println!("Creating Node 2...");
    let node2 = Server::new(cfg2).await.unwrap();
    let task2 = tokio::spawn(async move { node2.run().await });
    println!("Node 2 started: API={}, Raft={}", api2, raft2);
    sleep(Duration::from_millis(500)).await;

    let (cfg3, _temp3) = cluster_config_with_join(3);
    let api3 = cfg3.network.api_addr;
    let raft3 = cfg3.network.raft_addr;
    println!("Creating Node 3...");
    let node3 = Server::new(cfg3).await.unwrap();
    let task3 = tokio::spawn(async move { node3.run().await });
    println!("Node 3 started: API={}, Raft={}", api3, raft3);
    sleep(Duration::from_millis(500)).await;

    // Join both followers as voters (is_learner = false) via the leader.
    println!("Adding node 2 to cluster via member_add API...");
    let member2_id = client1
        .member_add(2, raft2.to_string(), false) // node_id=2, false=voter
        .await
        .expect("Failed to add node 2");
    println!("Node 2 added with ID: {}", member2_id);
    assert_eq!(member2_id, 2, "Node 2 should have ID 2");

    println!("Adding node 3 to cluster via member_add API...");
    let member3_id = client1
        .member_add(3, raft3.to_string(), false) // node_id=3, false=voter
        .await
        .expect("Failed to add node 3");
    println!("Node 3 added with ID: {}", member3_id);
    assert_eq!(member3_id, 3, "Node 3 should have ID 3");

    // Let the membership changes replicate before polling each node.
    sleep(Duration::from_secs(3)).await;

    let status1 = client1.status().await.expect("Failed to get status from node 1");
    println!("Node 1 final status: leader={}, term={}", status1.leader, status1.raft_term);
    let mut client2 = Client::connect(format!("http://{}", api2))
        .await
        .expect("Failed to connect to node 2");
    let status2 = client2.status().await.expect("Failed to get status from node 2");
    println!("Node 2 final status: leader={}, term={}", status2.leader, status2.raft_term);
    let mut client3 = Client::connect(format!("http://{}", api3))
        .await
        .expect("Failed to connect to node 3");
    let status3 = client3.status().await.expect("Failed to get status from node 3");
    println!("Node 3 final status: leader={}, term={}", status3.leader, status3.raft_term);

    // Consensus check: one agreed-upon, non-zero leader across all nodes.
    assert_eq!(status1.leader, status2.leader, "Nodes 1 and 2 disagree on leader");
    assert_eq!(status1.leader, status3.leader, "Nodes 1 and 3 disagree on leader");
    assert!(status1.leader > 0, "No leader elected");
    println!("✓ 3-node cluster formed successfully with join flow");

    // Stop all background server tasks.
    task1.abort();
    task2.abort();
    task3.abort();
}
/// Write through node 1 of a 3-node cluster and confirm the entries become
/// readable from both of the other nodes after replication.
#[tokio::test]
#[ignore]
async fn test_3node_state_replication() {
    println!("\n=== Test: 3-Node State Replication ===");

    // Spin up all three members.
    let (cfg1, _temp1) = cluster_config(1);
    let api1 = cfg1.network.api_addr;
    let node1 = Server::new(cfg1).await.unwrap();
    let task1 = tokio::spawn(async move { node1.run().await });
    let (cfg2, _temp2) = cluster_config(2);
    let api2 = cfg2.network.api_addr;
    let node2 = Server::new(cfg2).await.unwrap();
    let task2 = tokio::spawn(async move { node2.run().await });
    let (cfg3, _temp3) = cluster_config(3);
    let api3 = cfg3.network.api_addr;
    let node3 = Server::new(cfg3).await.unwrap();
    let task3 = tokio::spawn(async move { node3.run().await });
    sleep(Duration::from_secs(2)).await;
    println!("Cluster started");

    // All writes go through node 1, which is expected to hold leadership.
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .unwrap();
    println!("Writing test data to node 1...");
    for (key, val) in [
        ("test/key1", "value1"),
        ("test/key2", "value2"),
        ("test/key3", "value3"),
    ] {
        client1.put(key, val).await.unwrap();
    }

    // Give the log entries time to reach the followers.
    sleep(Duration::from_millis(500)).await;

    // Each follower must serve the replicated value.
    println!("Reading from node 2...");
    let mut client2 = Client::connect(format!("http://{}", api2))
        .await
        .unwrap();
    let val2 = client2.get("test/key1").await.unwrap();
    assert_eq!(val2, Some(b"value1".to_vec()), "Data not replicated to node 2");
    println!("Reading from node 3...");
    let mut client3 = Client::connect(format!("http://{}", api3))
        .await
        .unwrap();
    let val3 = client3.get("test/key1").await.unwrap();
    assert_eq!(val3, Some(b"value1".to_vec()), "Data not replicated to node 3");
    println!("✓ State replication verified");

    // Stop the background server tasks.
    task1.abort();
    task2.abort();
    task3.abort();
}
/// Kill one follower of a 3-node cluster and verify the surviving 2/3
/// quorum keeps accepting writes and serving reads.
#[tokio::test]
#[ignore]
async fn test_3node_follower_crash() {
    println!("\n=== Test: Follower Crash (Node Remains Available) ===");

    // Launch the three members. Node 2's API address is never needed since
    // it is the node we crash.
    let (cfg1, _temp1) = cluster_config(1);
    let api1 = cfg1.network.api_addr;
    let node1 = Server::new(cfg1).await.unwrap();
    let task1 = tokio::spawn(async move { node1.run().await });
    let (cfg2, _temp2) = cluster_config(2);
    let node2 = Server::new(cfg2).await.unwrap();
    let task2 = tokio::spawn(async move { node2.run().await });
    let (cfg3, _temp3) = cluster_config(3);
    let api3 = cfg3.network.api_addr;
    let node3 = Server::new(cfg3).await.unwrap();
    let task3 = tokio::spawn(async move { node3.run().await });
    sleep(Duration::from_secs(2)).await;
    println!("Cluster started");

    // Seed a value while all three nodes are healthy.
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .unwrap();
    println!("Writing initial data...");
    client1.put("test/before-crash", "initial").await.unwrap();

    // Abort follower node 2's server task to simulate a crash.
    println!("Killing node 2 (follower)...");
    task2.abort();
    sleep(Duration::from_millis(500)).await;

    // Two of three voters remain, so the cluster must still commit writes.
    println!("Writing data after crash...");
    client1
        .put("test/after-crash", "still-working")
        .await
        .expect("Write should succeed with 2/3 quorum");

    // The surviving follower should have replicated the post-crash write.
    let mut client3 = Client::connect(format!("http://{}", api3))
        .await
        .unwrap();
    let val = client3.get("test/after-crash").await.unwrap();
    assert_eq!(val, Some(b"still-working".to_vec()));
    println!("✓ Cluster operational after follower crash");

    // Stop the remaining background server tasks.
    task1.abort();
    task3.abort();
}
/// Crash the current leader of a 3-node cluster and verify a new leader is
/// elected (with a strictly higher term) and that the cluster stays writable.
#[tokio::test]
#[ignore]
async fn test_3node_leader_crash_reelection() {
    println!("\n=== Test: Leader Crash & Re-election ===");
    // Start cluster
    let (config1, _temp1) = cluster_config(1);
    let server1 = Server::new(config1).await.unwrap();
    let handle1 = tokio::spawn(async move { server1.run().await });
    let (config2, _temp2) = cluster_config(2);
    let api2 = config2.network.api_addr;
    let server2 = Server::new(config2).await.unwrap();
    let handle2 = tokio::spawn(async move { server2.run().await });
    let (config3, _temp3) = cluster_config(3);
    let api3 = config3.network.api_addr;
    let server3 = Server::new(config3).await.unwrap();
    let handle3 = tokio::spawn(async move { server3.run().await });
    sleep(Duration::from_secs(2)).await;
    println!("Cluster started");
    // Determine initial leader (observed through node 2, which survives)
    let mut client2 = Client::connect(format!("http://{}", api2))
        .await
        .unwrap();
    let initial_status = client2.status().await.unwrap();
    let initial_leader = initial_status.leader;
    println!("Initial leader: node {}", initial_leader);
    // BUGFIX: the test aborts handle1 below, which only crashes the actual
    // leader if node 1 won the election. Previously this was just assumed
    // ("assume node 1"); if a different node had been elected, the test would
    // later fail with a misleading re-election error. Make the assumption an
    // explicit precondition so a surprise leader fails loudly right here.
    assert_eq!(
        initial_leader, 1,
        "Test expects node 1 to be the initial leader (it is the node killed below)"
    );
    // Kill the leader (node 1, verified above)
    println!("Killing leader (node 1)...");
    handle1.abort();
    // Wait for re-election (should be < 1s per requirements)
    println!("Waiting for re-election...");
    sleep(Duration::from_secs(1)).await;
    // Verify a new leader was elected and the Raft term advanced
    let new_status = client2.status().await.unwrap();
    println!(
        "New leader: node {}, term: {}",
        new_status.leader, new_status.raft_term
    );
    assert!(new_status.leader > 0, "No new leader elected");
    assert!(
        new_status.raft_term > initial_status.raft_term,
        "Raft term should increase after re-election"
    );
    println!("✓ Leader re-election successful within 1s");
    // Verify cluster still accepts writes (via node 3, the other survivor)
    let mut client3 = Client::connect(format!("http://{}", api3))
        .await
        .unwrap();
    client3
        .put("test/post-reelection", "functional")
        .await
        .expect("Cluster should be functional after re-election");
    println!("✓ Cluster operational after re-election");
    // Cleanup: stop the surviving server tasks
    handle2.abort();
    handle3.abort();
}
/// Test 3-node cluster with learners only (no voter promotion)
/// T041 Workaround: Avoids change_membership by keeping nodes as learners
///
/// Flow: node 1 bootstraps as the sole voter, nodes 2 and 3 join as
/// learners via `member_add(.., is_learner = true)`, then we verify the
/// leader accepts a write and a learner reports node 1 as leader.
#[tokio::test]
#[ignore] // Run with: cargo test --test cluster_integration test_3node_with_learners -- --ignored
async fn test_3node_with_learners() {
    println!("\n=== Test: 3-Node Cluster with Learners (T041 Workaround) ===");
    // Start Node 1 (bootstrap alone as single voter)
    let (config1, _temp1) = cluster_config_with_join(1);
    let api1 = config1.network.api_addr;
    let raft1 = config1.network.raft_addr;
    println!("Creating Node 1 (bootstrap)...");
    let server1 = Server::new(config1).await.unwrap();
    let handle1 = tokio::spawn(async move { server1.run().await });
    println!("Node 1 started: API={}, Raft={}", api1, raft1);
    // Wait for node 1 to become leader (single-voter self-election)
    sleep(Duration::from_secs(2)).await;
    // Verify node 1 is leader before attempting any joins
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .expect("Failed to connect to node 1");
    let status1 = client1.status().await.expect("Failed to get status");
    println!("Node 1 status: leader={}, term={}", status1.leader, status1.raft_term);
    assert_eq!(status1.leader, 1, "Node 1 should be leader");
    // Start Node 2 (does not bootstrap; waits to be joined)
    let (config2, _temp2) = cluster_config_with_join(2);
    let api2 = config2.network.api_addr;
    let raft2 = config2.network.raft_addr;
    println!("Creating Node 2...");
    let server2 = Server::new(config2).await.unwrap();
    let handle2 = tokio::spawn(async move { server2.run().await });
    println!("Node 2 started: API={}, Raft={}", api2, raft2);
    sleep(Duration::from_millis(500)).await;
    // Start Node 3 (does not bootstrap; waits to be joined)
    let (config3, _temp3) = cluster_config_with_join(3);
    let api3 = config3.network.api_addr;
    let raft3 = config3.network.raft_addr;
    println!("Creating Node 3...");
    let server3 = Server::new(config3).await.unwrap();
    let handle3 = tokio::spawn(async move { server3.run().await });
    println!("Node 3 started: API={}, Raft={}", api3, raft3);
    sleep(Duration::from_millis(500)).await;
    // Add node 2 as LEARNER (is_learner=true, no voter promotion)
    println!("Adding node 2 as learner (no voter promotion)...");
    let member2_id = client1
        .member_add(2, raft2.to_string(), true) // is_learner=true
        .await
        .expect("Failed to add node 2 as learner");
    println!("Node 2 added as learner with ID: {}", member2_id);
    assert_eq!(member2_id, 2);
    // Add node 3 as LEARNER
    println!("Adding node 3 as learner (no voter promotion)...");
    let member3_id = client1
        .member_add(3, raft3.to_string(), true) // is_learner=true
        .await
        .expect("Failed to add node 3 as learner");
    println!("Node 3 added as learner with ID: {}", member3_id);
    assert_eq!(member3_id, 3);
    // Wait for the membership entries to replicate to the new learners
    sleep(Duration::from_secs(2)).await;
    // Test write on leader (node 1 remains the only voter)
    println!("Testing KV write on leader...");
    client1.put("test-key", "test-value").await.expect("Put failed");
    // Wait for replication to learners
    sleep(Duration::from_secs(1)).await;
    // Verify data replicated to learner (should be able to read)
    let mut client2 = Client::connect(format!("http://{}", api2))
        .await
        .expect("Failed to connect to node 2");
    // Note: Reading from a learner may require forwarding to leader
    // For now, just verify the cluster is operational
    let status2 = client2.status().await.expect("Failed to get status from learner");
    println!("Node 2 (learner) status: leader={}, term={}", status2.leader, status2.raft_term);
    // All nodes should see node 1 as leader
    assert_eq!(status2.leader, 1, "Learner should see node 1 as leader");
    println!("✓ 3-node cluster with learners working");
    // Cleanup: abort the background server tasks
    handle1.abort();
    handle2.abort();
    handle3.abort();
}
/// Test 3-node cluster formation using staggered bootstrap (DISABLED - doesn't work)
///
/// NOTE(review): despite "simultaneous" in the name, the flow below is
/// staggered — node 1 bootstraps first, then nodes 2 and 3 start with the
/// same bootstrap config. Kept `#[ignore]`d as a record of this T041
/// workaround attempt; the doc comment above says it doesn't work.
#[tokio::test]
#[ignore]
async fn test_3node_simultaneous_bootstrap_disabled() {
    println!("\n=== Test: 3-Node Staggered Bootstrap (T041 Workaround) ===");
    // Start Node 1 first (bootstrap=true, will initialize with full membership)
    let (config1, _temp1) = cluster_config_simultaneous_bootstrap(1);
    let api1 = config1.network.api_addr;
    println!("Creating Node 1 (bootstrap)...");
    let server1 = Server::new(config1).await.unwrap();
    let handle1 = tokio::spawn(async move { server1.run().await });
    println!("Node 1 started: API={}", api1);
    // Give node 1 time to become leader
    println!("Waiting for Node 1 to become leader (3s)...");
    sleep(Duration::from_secs(3)).await;
    // Verify node 1 is reachable and record its pre-join view
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .expect("Failed to connect to node 1");
    let status1 = client1.status().await.expect("Failed to get status");
    println!("Node 1 status before others: leader={}, term={}", status1.leader, status1.raft_term);
    // Now start nodes 2 and 3
    let (config2, _temp2) = cluster_config_simultaneous_bootstrap(2);
    let api2 = config2.network.api_addr;
    println!("Creating Node 2...");
    let server2 = Server::new(config2).await.unwrap();
    let handle2 = tokio::spawn(async move { server2.run().await });
    println!("Node 2 started: API={}", api2);
    let (config3, _temp3) = cluster_config_simultaneous_bootstrap(3);
    let api3 = config3.network.api_addr;
    println!("Creating Node 3...");
    let server3 = Server::new(config3).await.unwrap();
    let handle3 = tokio::spawn(async move { server3.run().await });
    println!("Node 3 started: API={}", api3);
    // Wait for cluster to stabilize
    println!("Waiting for cluster to stabilize (5s)...");
    sleep(Duration::from_secs(5)).await;
    // Reconnect to node 1 and re-read status (shadows the pre-join client/status)
    let mut client1 = Client::connect(format!("http://{}", api1))
        .await
        .expect("Failed to connect to node 1")
    let status1 = client1.status().await.expect("Failed to get status from node 1");
    println!("Node 1 status: leader={}, term={}", status1.leader, status1.raft_term);
    let mut client2 = Client::connect(format!("http://{}", api2))
        .await
        .expect("Failed to connect to node 2");
    let status2 = client2.status().await.expect("Failed to get status from node 2");
    println!("Node 2 status: leader={}, term={}", status2.leader, status2.raft_term);
    let mut client3 = Client::connect(format!("http://{}", api3))
        .await
        .expect("Failed to connect to node 3");
    let status3 = client3.status().await.expect("Failed to get status from node 3");
    println!("Node 3 status: leader={}, term={}", status3.leader, status3.raft_term);
    // All nodes should agree on a single non-zero leader
    assert!(status1.leader > 0, "No leader elected");
    assert_eq!(status1.leader, status2.leader, "Nodes 1 and 2 disagree on leader");
    assert_eq!(status1.leader, status3.leader, "Nodes 1 and 3 disagree on leader");
    // Test KV operations on the cluster
    println!("Testing KV operations...");
    client1.put("test-key", "test-value").await.expect("Put failed");
    // Wait for commit to propagate to followers via heartbeat (heartbeat_interval=100ms)
    sleep(Duration::from_millis(200)).await;
    let value = client2.get("test-key").await.expect("Get failed");
    assert_eq!(value, Some(b"test-value".to_vec()), "Value not replicated");
    println!("✓ 3-node cluster formed successfully with simultaneous bootstrap");
    // Cleanup: abort the background server tasks
    handle1.abort();
    handle2.abort();
    handle3.abort();
}

View file

@ -1,175 +0,0 @@
//! Integration tests for Chainfire
//!
//! These tests verify that the server, client, and all components work together correctly.
use chainfire_client::Client;
use chainfire_server::{
config::{ClusterConfig, NetworkConfig, NodeConfig, RaftConfig, ServerConfig, StorageConfig},
server::Server,
};
use std::time::Duration;
use tokio::time::sleep;
/// Create a test server configuration
///
/// Each test passes a distinct base `port`; every listener is derived from
/// it so concurrently-running tests never collide:
/// api = port, raft = port + 100, gossip = port + 200, http = port + 300.
/// Returns the config plus the `TempDir` guard that owns the data
/// directory — callers must keep it alive for the server's lifetime.
fn test_config(port: u16) -> (ServerConfig, tempfile::TempDir) {
    use std::net::SocketAddr;
    let api_addr: SocketAddr = format!("127.0.0.1:{}", port).parse().unwrap();
    let raft_addr: SocketAddr = format!("127.0.0.1:{}", port + 100).parse().unwrap();
    let gossip_addr: SocketAddr = format!("127.0.0.1:{}", port + 200).parse().unwrap();
    // BUGFIX: http_addr was hard-coded to 127.0.0.1:28081, so every test in
    // this file raced to bind the same HTTP port when the harness ran them
    // in parallel. Derive it from `port` like the other listeners.
    let http_addr: SocketAddr = format!("127.0.0.1:{}", port + 300).parse().unwrap();
    let temp_dir = tempfile::tempdir().unwrap();
    let config = ServerConfig {
        node: NodeConfig {
            id: 1,
            name: format!("test-node-{}", port),
            role: "control_plane".to_string(),
        },
        cluster: ClusterConfig {
            id: 1,
            bootstrap: true,
            initial_members: vec![],
        },
        network: NetworkConfig {
            api_addr,
            http_addr,
            raft_addr,
            gossip_addr,
            tls: None,
        },
        storage: StorageConfig {
            data_dir: temp_dir.path().to_path_buf(),
        },
        raft: RaftConfig::default(),
    };
    (config, temp_dir)
}
/// Exercise the full KV surface — put, get, overwrite, delete, and prefix
/// scan — against a single bootstrapped node.
#[tokio::test]
async fn test_single_node_kv_operations() {
    // Boot a throwaway single-node server in the background.
    let (cfg, _temp_dir) = test_config(23790);
    let api_addr = cfg.network.api_addr;
    let node = Server::new(cfg).await.unwrap();
    let server_handle = tokio::spawn(async move {
        let _ = node.run().await;
    });

    // Leader election can be slow on CI/constrained machines, hence the
    // generous 2s wait (was 500ms historically).
    sleep(Duration::from_millis(2000)).await;

    let mut client = Client::connect(format!("http://{}", api_addr))
        .await
        .unwrap();

    // The first write retries a few times in case election is still settling.
    let mut rev = 0;
    let mut failures = 0;
    loop {
        match client.put("test/key1", "value1").await {
            Ok(r) => {
                rev = r;
                break;
            }
            Err(e) => {
                failures += 1;
                if failures >= 5 {
                    panic!("Put failed after 5 attempts: {}", e);
                }
                eprintln!("Put attempt {} failed: {}, retrying...", failures, e);
                sleep(Duration::from_millis(500)).await;
            }
        }
    }
    assert!(rev > 0);

    // Reads must observe the committed value.
    let got = client.get("test/key1").await.unwrap();
    assert_eq!(got, Some(b"value1".to_vec()));

    // Overwriting the key bumps the revision and changes the stored value.
    let rev2 = client.put("test/key1", "value2").await.unwrap();
    assert!(rev2 > rev);
    let got = client.get("test/key1").await.unwrap();
    assert_eq!(got, Some(b"value2".to_vec()));

    // Missing keys read back as None.
    let got = client.get("test/nonexistent").await.unwrap();
    assert!(got.is_none());

    // Delete removes the key and reports whether it actually existed.
    let deleted = client.delete("test/key1").await.unwrap();
    assert!(deleted);
    let got = client.get("test/key1").await.unwrap();
    assert!(got.is_none());
    let deleted = client.delete("test/nonexistent").await.unwrap();
    assert!(!deleted);

    // Prefix scans return only keys under the requested prefix.
    for (k, v) in [
        ("prefix/a", "1"),
        ("prefix/b", "2"),
        ("prefix/c", "3"),
        ("other/key", "other"),
    ] {
        client.put(k, v).await.unwrap();
    }
    let prefix_values = client.get_prefix("prefix/").await.unwrap();
    assert_eq!(prefix_values.len(), 3);

    // Tear down the background server.
    server_handle.abort();
}
/// Verify the status endpoint of a single-node cluster reports the node
/// itself (id 1) as leader with a non-zero Raft term.
#[tokio::test]
async fn test_cluster_status() {
    let (config, _temp_dir) = test_config(23800);
    let api_addr = config.network.api_addr;
    let server = Server::new(config).await.unwrap();
    let server_handle = tokio::spawn(async move {
        let _ = server.run().await;
    });
    // BUGFIX: was 500ms, which is not reliably enough time for leader
    // election on CI/constrained environments — test_single_node_kv_operations
    // was bumped to 2000ms for exactly this reason. A premature status() call
    // makes the leader/term assertions below flake.
    sleep(Duration::from_millis(2000)).await;
    let mut client = Client::connect(format!("http://{}", api_addr))
        .await
        .unwrap();
    let status = client.status().await.unwrap();
    // Single-node cluster: the lone node must be leader with a live term.
    assert_eq!(status.leader, 1);
    assert!(status.raft_term > 0);
    server_handle.abort();
}
/// Verify the string convenience wrappers (`put_str`/`get_str`) round-trip
/// a value through a single-node cluster.
#[tokio::test]
async fn test_string_convenience_methods() {
    let (config, _temp_dir) = test_config(23810);
    let api_addr = config.network.api_addr;
    let server = Server::new(config).await.unwrap();
    let server_handle = tokio::spawn(async move {
        let _ = server.run().await;
    });
    // BUGFIX: was 500ms; the first put_str below has no retry loop, so it
    // fails outright if leader election has not finished. Use the same 2000ms
    // wait test_single_node_kv_operations adopted for CI/constrained machines.
    sleep(Duration::from_millis(2000)).await;
    let mut client = Client::connect(format!("http://{}", api_addr))
        .await
        .unwrap();
    // Round-trip a &str value through the string helpers.
    client.put_str("/config/name", "chainfire").await.unwrap();
    let value = client.get_str("/config/name").await.unwrap();
    assert_eq!(value, Some("chainfire".to_string()));
    server_handle.abort();
}

View file

@ -1 +0,0 @@
MANIFEST-000005

View file

@ -1 +0,0 @@
9b9417c1-5d46-4b8a-b14e-ac341643df55

View file

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,684 +0,0 @@
# This is a RocksDB option file.
#
# For detailed file format spec, please refer to the example file
# in examples/rocksdb_option_file_example.ini
#
[Version]
rocksdb_version=10.5.1
options_file_version=1.1
[DBOptions]
compaction_readahead_size=2097152
strict_bytes_per_sync=false
bytes_per_sync=1048576
max_background_jobs=4
avoid_flush_during_shutdown=false
max_background_flushes=-1
delayed_write_rate=16777216
max_open_files=-1
max_subcompactions=1
writable_file_max_buffer_size=1048576
wal_bytes_per_sync=0
max_background_compactions=-1
max_total_wal_size=0
delete_obsolete_files_period_micros=21600000000
stats_dump_period_sec=600
stats_history_buffer_size=1048576
stats_persist_period_sec=600
follower_refresh_catchup_period_ms=10000
enforce_single_del_contracts=true
lowest_used_cache_tier=kNonVolatileBlockTier
bgerror_resume_retry_interval=1000000
metadata_write_temperature=kUnknown
best_efforts_recovery=false
log_readahead_size=0
write_identity_file=true
write_dbid_to_manifest=true
prefix_seek_opt_in_only=false
wal_compression=kNoCompression
manual_wal_flush=false
db_host_id=__hostname__
two_write_queues=false
allow_ingest_behind=false
skip_checking_sst_file_sizes_on_db_open=false
flush_verify_memtable_count=true
atomic_flush=false
verify_sst_unique_id_in_manifest=true
skip_stats_update_on_db_open=false
track_and_verify_wals=false
track_and_verify_wals_in_manifest=false
compaction_verify_record_count=true
paranoid_checks=true
create_if_missing=true
max_write_batch_group_size_bytes=1048576
follower_catchup_retry_count=10
avoid_flush_during_recovery=false
file_checksum_gen_factory=nullptr
enable_thread_tracking=false
allow_fallocate=true
allow_data_in_errors=false
error_if_exists=false
use_direct_io_for_flush_and_compaction=false
background_close_inactive_wals=false
create_missing_column_families=true
WAL_size_limit_MB=0
use_direct_reads=false
persist_stats_to_disk=false
allow_2pc=false
max_log_file_size=0
is_fd_close_on_exec=true
avoid_unnecessary_blocking_io=false
max_file_opening_threads=16
wal_filter=nullptr
wal_write_temperature=kUnknown
follower_catchup_retry_wait_ms=100
allow_mmap_reads=false
allow_mmap_writes=false
use_adaptive_mutex=false
use_fsync=false
table_cache_numshardbits=6
dump_malloc_stats=false
db_write_buffer_size=0
keep_log_file_num=1000
max_bgerror_resume_count=2147483647
allow_concurrent_memtable_write=true
recycle_log_file_num=0
log_file_time_to_roll=0
manifest_preallocation_size=4194304
enable_write_thread_adaptive_yield=true
WAL_ttl_seconds=0
max_manifest_file_size=1073741824
wal_recovery_mode=kPointInTimeRecovery
enable_pipelined_write=false
write_thread_slow_yield_usec=3
unordered_write=false
write_thread_max_yield_usec=100
advise_random_on_open=true
info_log_level=INFO_LEVEL
[CFOptions "default"]
memtable_max_range_deletions=0
compression_manager=nullptr
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_memory_checks=false
memtable_avg_op_scan_flush_trigger=0
block_protection_bytes_per_key=0
uncache_aggressiveness=0
bottommost_file_compaction_delay=0
memtable_protection_bytes_per_key=0
experimental_mempurge_threshold=0.000000
bottommost_compression=kDisableCompressionOption
sample_for_compression=0
prepopulate_blob_cache=kDisable
blob_file_starting_level=0
blob_compaction_readahead_size=0
table_factory=BlockBasedTable
max_successive_merges=0
max_write_buffer_number=2
prefix_extractor=nullptr
memtable_huge_page_size=0
write_buffer_size=67108864
strict_max_successive_merges=false
arena_block_size=1048576
memtable_op_scan_flush_trigger=0
level0_file_num_compaction_trigger=4
report_bg_io_stats=false
inplace_update_num_locks=10000
memtable_prefix_bloom_size_ratio=0.000000
level0_stop_writes_trigger=36
blob_compression_type=kNoCompression
level0_slowdown_writes_trigger=20
hard_pending_compaction_bytes_limit=274877906944
target_file_size_multiplier=1
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_file_checks=false
blob_garbage_collection_force_threshold=1.000000
enable_blob_files=false
soft_pending_compaction_bytes_limit=68719476736
target_file_size_base=67108864
max_compaction_bytes=1677721600
disable_auto_compactions=false
min_blob_size=0
memtable_whole_key_filtering=false
max_bytes_for_level_base=268435456
last_level_temperature=kUnknown
preserve_internal_time_seconds=0
compaction_options_fifo={trivial_copy_buffer_size=4096;allow_trivial_copy_when_change_temperature=false;file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
max_bytes_for_level_multiplier=10.000000
max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
max_sequential_skip_in_iterations=8
compression=kSnappyCompression
default_write_temperature=kUnknown
compaction_options_universal={reduce_file_locking=false;incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;max_read_amp=-1;size_ratio=1;}
blob_garbage_collection_age_cutoff=0.250000
ttl=2592000
periodic_compaction_seconds=0
preclude_last_level_data_seconds=0
blob_file_size=268435456
enable_blob_garbage_collection=false
persist_user_defined_timestamps=true
compaction_pri=kMinOverlappingRatio
compaction_filter_factory=nullptr
comparator=leveldb.BytewiseComparator
bloom_locality=0
merge_operator=nullptr
compaction_filter=nullptr
level_compaction_dynamic_level_bytes=true
optimize_filters_for_hits=false
inplace_update_support=false
max_write_buffer_size_to_maintain=0
memtable_factory=SkipListFactory
memtable_insert_with_hint_prefix_extractor=nullptr
num_levels=7
force_consistency_checks=true
sst_partitioner_factory=nullptr
default_temperature=kUnknown
disallow_memtable_writes=false
compaction_style=kCompactionStyleLevel
min_write_buffer_number_to_merge=1
[TableOptions/BlockBasedTable "default"]
num_file_reads_for_auto_readahead=2
initial_auto_readahead_size=8192
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
enable_index_compression=true
verify_compression=false
prepopulate_block_cache=kDisable
format_version=6
use_delta_encoding=true
pin_top_level_index_and_filter=true
read_amp_bytes_per_bit=0
decouple_partitioned_filters=false
partition_filters=false
metadata_block_size=4096
max_auto_readahead_size=262144
index_block_restart_interval=1
block_size_deviation=10
block_size=4096
detect_filter_construct_corruption=false
no_block_cache=false
checksum=kXXH3
filter_policy=nullptr
data_block_hash_table_util_ratio=0.750000
block_restart_interval=16
index_type=kBinarySearch
pin_l0_filter_and_index_blocks_in_cache=false
data_block_index_type=kDataBlockBinarySearch
cache_index_and_filter_blocks_with_high_priority=true
whole_key_filtering=true
index_shortening=kShortenSeparators
cache_index_and_filter_blocks=false
block_align=false
optimize_filters_for_memory=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "raft_logs"]
memtable_max_range_deletions=0
compression_manager=nullptr
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_memory_checks=false
memtable_avg_op_scan_flush_trigger=0
block_protection_bytes_per_key=0
uncache_aggressiveness=0
bottommost_file_compaction_delay=0
memtable_protection_bytes_per_key=0
experimental_mempurge_threshold=0.000000
bottommost_compression=kDisableCompressionOption
sample_for_compression=0
prepopulate_blob_cache=kDisable
blob_file_starting_level=0
blob_compaction_readahead_size=0
table_factory=BlockBasedTable
max_successive_merges=0
max_write_buffer_number=3
prefix_extractor=nullptr
memtable_huge_page_size=0
write_buffer_size=67108864
strict_max_successive_merges=false
arena_block_size=1048576
memtable_op_scan_flush_trigger=0
level0_file_num_compaction_trigger=4
report_bg_io_stats=false
inplace_update_num_locks=10000
memtable_prefix_bloom_size_ratio=0.000000
level0_stop_writes_trigger=36
blob_compression_type=kNoCompression
level0_slowdown_writes_trigger=20
hard_pending_compaction_bytes_limit=274877906944
target_file_size_multiplier=1
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_file_checks=false
blob_garbage_collection_force_threshold=1.000000
enable_blob_files=false
soft_pending_compaction_bytes_limit=68719476736
target_file_size_base=67108864
max_compaction_bytes=1677721600
disable_auto_compactions=false
min_blob_size=0
memtable_whole_key_filtering=false
max_bytes_for_level_base=268435456
last_level_temperature=kUnknown
preserve_internal_time_seconds=0
compaction_options_fifo={trivial_copy_buffer_size=4096;allow_trivial_copy_when_change_temperature=false;file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
max_bytes_for_level_multiplier=10.000000
max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
max_sequential_skip_in_iterations=8
compression=kSnappyCompression
default_write_temperature=kUnknown
compaction_options_universal={reduce_file_locking=false;incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;max_read_amp=-1;size_ratio=1;}
blob_garbage_collection_age_cutoff=0.250000
ttl=2592000
periodic_compaction_seconds=0
preclude_last_level_data_seconds=0
blob_file_size=268435456
enable_blob_garbage_collection=false
persist_user_defined_timestamps=true
compaction_pri=kMinOverlappingRatio
compaction_filter_factory=nullptr
comparator=leveldb.BytewiseComparator
bloom_locality=0
merge_operator=nullptr
compaction_filter=nullptr
level_compaction_dynamic_level_bytes=true
optimize_filters_for_hits=false
inplace_update_support=false
max_write_buffer_size_to_maintain=0
memtable_factory=SkipListFactory
memtable_insert_with_hint_prefix_extractor=nullptr
num_levels=7
force_consistency_checks=true
sst_partitioner_factory=nullptr
default_temperature=kUnknown
disallow_memtable_writes=false
compaction_style=kCompactionStyleLevel
min_write_buffer_number_to_merge=1
[TableOptions/BlockBasedTable "raft_logs"]
num_file_reads_for_auto_readahead=2
initial_auto_readahead_size=8192
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
enable_index_compression=true
verify_compression=false
prepopulate_block_cache=kDisable
format_version=6
use_delta_encoding=true
pin_top_level_index_and_filter=true
read_amp_bytes_per_bit=0
decouple_partitioned_filters=false
partition_filters=false
metadata_block_size=4096
max_auto_readahead_size=262144
index_block_restart_interval=1
block_size_deviation=10
block_size=4096
detect_filter_construct_corruption=false
no_block_cache=false
checksum=kXXH3
filter_policy=nullptr
data_block_hash_table_util_ratio=0.750000
block_restart_interval=16
index_type=kBinarySearch
pin_l0_filter_and_index_blocks_in_cache=false
data_block_index_type=kDataBlockBinarySearch
cache_index_and_filter_blocks_with_high_priority=true
whole_key_filtering=true
index_shortening=kShortenSeparators
cache_index_and_filter_blocks=false
block_align=false
optimize_filters_for_memory=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "raft_meta"]
memtable_max_range_deletions=0
compression_manager=nullptr
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_memory_checks=false
memtable_avg_op_scan_flush_trigger=0
block_protection_bytes_per_key=0
uncache_aggressiveness=0
bottommost_file_compaction_delay=0
memtable_protection_bytes_per_key=0
experimental_mempurge_threshold=0.000000
bottommost_compression=kDisableCompressionOption
sample_for_compression=0
prepopulate_blob_cache=kDisable
blob_file_starting_level=0
blob_compaction_readahead_size=0
table_factory=BlockBasedTable
max_successive_merges=0
max_write_buffer_number=2
prefix_extractor=nullptr
memtable_huge_page_size=0
write_buffer_size=16777216
strict_max_successive_merges=false
arena_block_size=1048576
memtable_op_scan_flush_trigger=0
level0_file_num_compaction_trigger=4
report_bg_io_stats=false
inplace_update_num_locks=10000
memtable_prefix_bloom_size_ratio=0.000000
level0_stop_writes_trigger=36
blob_compression_type=kNoCompression
level0_slowdown_writes_trigger=20
hard_pending_compaction_bytes_limit=274877906944
target_file_size_multiplier=1
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_file_checks=false
blob_garbage_collection_force_threshold=1.000000
enable_blob_files=false
soft_pending_compaction_bytes_limit=68719476736
target_file_size_base=67108864
max_compaction_bytes=1677721600
disable_auto_compactions=false
min_blob_size=0
memtable_whole_key_filtering=false
max_bytes_for_level_base=268435456
last_level_temperature=kUnknown
preserve_internal_time_seconds=0
compaction_options_fifo={trivial_copy_buffer_size=4096;allow_trivial_copy_when_change_temperature=false;file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
max_bytes_for_level_multiplier=10.000000
max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
max_sequential_skip_in_iterations=8
compression=kSnappyCompression
default_write_temperature=kUnknown
compaction_options_universal={reduce_file_locking=false;incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;max_read_amp=-1;size_ratio=1;}
blob_garbage_collection_age_cutoff=0.250000
ttl=2592000
periodic_compaction_seconds=0
preclude_last_level_data_seconds=0
blob_file_size=268435456
enable_blob_garbage_collection=false
persist_user_defined_timestamps=true
compaction_pri=kMinOverlappingRatio
compaction_filter_factory=nullptr
comparator=leveldb.BytewiseComparator
bloom_locality=0
merge_operator=nullptr
compaction_filter=nullptr
level_compaction_dynamic_level_bytes=true
optimize_filters_for_hits=false
inplace_update_support=false
max_write_buffer_size_to_maintain=0
memtable_factory=SkipListFactory
memtable_insert_with_hint_prefix_extractor=nullptr
num_levels=7
force_consistency_checks=true
sst_partitioner_factory=nullptr
default_temperature=kUnknown
disallow_memtable_writes=false
compaction_style=kCompactionStyleLevel
min_write_buffer_number_to_merge=1
[TableOptions/BlockBasedTable "raft_meta"]
num_file_reads_for_auto_readahead=2
initial_auto_readahead_size=8192
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
enable_index_compression=true
verify_compression=false
prepopulate_block_cache=kDisable
format_version=6
use_delta_encoding=true
pin_top_level_index_and_filter=true
read_amp_bytes_per_bit=0
decouple_partitioned_filters=false
partition_filters=false
metadata_block_size=4096
max_auto_readahead_size=262144
index_block_restart_interval=1
block_size_deviation=10
block_size=4096
detect_filter_construct_corruption=false
no_block_cache=false
checksum=kXXH3
filter_policy=nullptr
data_block_hash_table_util_ratio=0.750000
block_restart_interval=16
index_type=kBinarySearch
pin_l0_filter_and_index_blocks_in_cache=false
data_block_index_type=kDataBlockBinarySearch
cache_index_and_filter_blocks_with_high_priority=true
whole_key_filtering=true
index_shortening=kShortenSeparators
cache_index_and_filter_blocks=false
block_align=false
optimize_filters_for_memory=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "key_value"]
memtable_max_range_deletions=0
compression_manager=nullptr
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_memory_checks=false
memtable_avg_op_scan_flush_trigger=0
block_protection_bytes_per_key=0
uncache_aggressiveness=0
bottommost_file_compaction_delay=0
memtable_protection_bytes_per_key=0
experimental_mempurge_threshold=0.000000
bottommost_compression=kDisableCompressionOption
sample_for_compression=0
prepopulate_blob_cache=kDisable
blob_file_starting_level=0
blob_compaction_readahead_size=0
table_factory=BlockBasedTable
max_successive_merges=0
max_write_buffer_number=4
prefix_extractor=rocksdb.FixedPrefix.8
memtable_huge_page_size=0
write_buffer_size=134217728
strict_max_successive_merges=false
arena_block_size=1048576
memtable_op_scan_flush_trigger=0
level0_file_num_compaction_trigger=4
report_bg_io_stats=false
inplace_update_num_locks=10000
memtable_prefix_bloom_size_ratio=0.000000
level0_stop_writes_trigger=36
blob_compression_type=kNoCompression
level0_slowdown_writes_trigger=20
hard_pending_compaction_bytes_limit=274877906944
target_file_size_multiplier=1
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_file_checks=false
blob_garbage_collection_force_threshold=1.000000
enable_blob_files=false
soft_pending_compaction_bytes_limit=68719476736
target_file_size_base=67108864
max_compaction_bytes=1677721600
disable_auto_compactions=false
min_blob_size=0
memtable_whole_key_filtering=false
max_bytes_for_level_base=268435456
last_level_temperature=kUnknown
preserve_internal_time_seconds=0
compaction_options_fifo={trivial_copy_buffer_size=4096;allow_trivial_copy_when_change_temperature=false;file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
max_bytes_for_level_multiplier=10.000000
max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
max_sequential_skip_in_iterations=8
compression=kSnappyCompression
default_write_temperature=kUnknown
compaction_options_universal={reduce_file_locking=false;incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;max_read_amp=-1;size_ratio=1;}
blob_garbage_collection_age_cutoff=0.250000
ttl=2592000
periodic_compaction_seconds=0
preclude_last_level_data_seconds=0
blob_file_size=268435456
enable_blob_garbage_collection=false
persist_user_defined_timestamps=true
compaction_pri=kMinOverlappingRatio
compaction_filter_factory=nullptr
comparator=leveldb.BytewiseComparator
bloom_locality=0
merge_operator=nullptr
compaction_filter=nullptr
level_compaction_dynamic_level_bytes=true
optimize_filters_for_hits=false
inplace_update_support=false
max_write_buffer_size_to_maintain=0
memtable_factory=SkipListFactory
memtable_insert_with_hint_prefix_extractor=nullptr
num_levels=7
force_consistency_checks=true
sst_partitioner_factory=nullptr
default_temperature=kUnknown
disallow_memtable_writes=false
compaction_style=kCompactionStyleLevel
min_write_buffer_number_to_merge=1
[TableOptions/BlockBasedTable "key_value"]
num_file_reads_for_auto_readahead=2
initial_auto_readahead_size=8192
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
enable_index_compression=true
verify_compression=false
prepopulate_block_cache=kDisable
format_version=6
use_delta_encoding=true
pin_top_level_index_and_filter=true
read_amp_bytes_per_bit=0
decouple_partitioned_filters=false
partition_filters=false
metadata_block_size=4096
max_auto_readahead_size=262144
index_block_restart_interval=1
block_size_deviation=10
block_size=4096
detect_filter_construct_corruption=false
no_block_cache=false
checksum=kXXH3
filter_policy=nullptr
data_block_hash_table_util_ratio=0.750000
block_restart_interval=16
index_type=kBinarySearch
pin_l0_filter_and_index_blocks_in_cache=false
data_block_index_type=kDataBlockBinarySearch
cache_index_and_filter_blocks_with_high_priority=true
whole_key_filtering=true
index_shortening=kShortenSeparators
cache_index_and_filter_blocks=false
block_align=false
optimize_filters_for_memory=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory
[CFOptions "snapshot"]
memtable_max_range_deletions=0
compression_manager=nullptr
compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_memory_checks=false
memtable_avg_op_scan_flush_trigger=0
block_protection_bytes_per_key=0
uncache_aggressiveness=0
bottommost_file_compaction_delay=0
memtable_protection_bytes_per_key=0
experimental_mempurge_threshold=0.000000
bottommost_compression=kDisableCompressionOption
sample_for_compression=0
prepopulate_blob_cache=kDisable
blob_file_starting_level=0
blob_compaction_readahead_size=0
table_factory=BlockBasedTable
max_successive_merges=0
max_write_buffer_number=2
prefix_extractor=nullptr
memtable_huge_page_size=0
write_buffer_size=33554432
strict_max_successive_merges=false
arena_block_size=1048576
memtable_op_scan_flush_trigger=0
level0_file_num_compaction_trigger=4
report_bg_io_stats=false
inplace_update_num_locks=10000
memtable_prefix_bloom_size_ratio=0.000000
level0_stop_writes_trigger=36
blob_compression_type=kNoCompression
level0_slowdown_writes_trigger=20
hard_pending_compaction_bytes_limit=274877906944
target_file_size_multiplier=1
bottommost_compression_opts={checksum=false;max_dict_buffer_bytes=0;enabled=false;max_dict_bytes=0;max_compressed_bytes_per_kb=896;parallel_threads=1;zstd_max_train_bytes=0;level=32767;use_zstd_dict_trainer=true;strategy=0;window_bits=-14;}
paranoid_file_checks=false
blob_garbage_collection_force_threshold=1.000000
enable_blob_files=false
soft_pending_compaction_bytes_limit=68719476736
target_file_size_base=67108864
max_compaction_bytes=1677721600
disable_auto_compactions=false
min_blob_size=0
memtable_whole_key_filtering=false
max_bytes_for_level_base=268435456
last_level_temperature=kUnknown
preserve_internal_time_seconds=0
compaction_options_fifo={trivial_copy_buffer_size=4096;allow_trivial_copy_when_change_temperature=false;file_temperature_age_thresholds=;allow_compaction=false;age_for_warm=0;max_table_files_size=1073741824;}
max_bytes_for_level_multiplier=10.000000
max_bytes_for_level_multiplier_additional=1:1:1:1:1:1:1
max_sequential_skip_in_iterations=8
compression=kSnappyCompression
default_write_temperature=kUnknown
compaction_options_universal={reduce_file_locking=false;incremental=false;compression_size_percent=-1;allow_trivial_move=false;max_size_amplification_percent=200;max_merge_width=4294967295;stop_style=kCompactionStopStyleTotalSize;min_merge_width=2;max_read_amp=-1;size_ratio=1;}
blob_garbage_collection_age_cutoff=0.250000
ttl=2592000
periodic_compaction_seconds=0
preclude_last_level_data_seconds=0
blob_file_size=268435456
enable_blob_garbage_collection=false
persist_user_defined_timestamps=true
compaction_pri=kMinOverlappingRatio
compaction_filter_factory=nullptr
comparator=leveldb.BytewiseComparator
bloom_locality=0
merge_operator=nullptr
compaction_filter=nullptr
level_compaction_dynamic_level_bytes=true
optimize_filters_for_hits=false
inplace_update_support=false
max_write_buffer_size_to_maintain=0
memtable_factory=SkipListFactory
memtable_insert_with_hint_prefix_extractor=nullptr
num_levels=7
force_consistency_checks=true
sst_partitioner_factory=nullptr
default_temperature=kUnknown
disallow_memtable_writes=false
compaction_style=kCompactionStyleLevel
min_write_buffer_number_to_merge=1
[TableOptions/BlockBasedTable "snapshot"]
num_file_reads_for_auto_readahead=2
initial_auto_readahead_size=8192
metadata_cache_options={unpartitioned_pinning=kFallback;partition_pinning=kFallback;top_level_index_pinning=kFallback;}
enable_index_compression=true
verify_compression=false
prepopulate_block_cache=kDisable
format_version=6
use_delta_encoding=true
pin_top_level_index_and_filter=true
read_amp_bytes_per_bit=0
decouple_partitioned_filters=false
partition_filters=false
metadata_block_size=4096
max_auto_readahead_size=262144
index_block_restart_interval=1
block_size_deviation=10
block_size=4096
detect_filter_construct_corruption=false
no_block_cache=false
checksum=kXXH3
filter_policy=nullptr
data_block_hash_table_util_ratio=0.750000
block_restart_interval=16
index_type=kBinarySearch
pin_l0_filter_and_index_blocks_in_cache=false
data_block_index_type=kDataBlockBinarySearch
cache_index_and_filter_blocks_with_high_priority=true
whole_key_filtering=true
index_shortening=kShortenSeparators
cache_index_and_filter_blocks=false
block_align=false
optimize_filters_for_memory=true
flush_block_policy_factory=FlushBlockBySizePolicyFactory

View file

@ -1,240 +0,0 @@
# Chainfire T003 Feature Gap Analysis
**Audit Date:** 2025-12-08
**Spec Version:** 1.0
**Implementation Path:** `/home/centra/cloud/chainfire/crates/`
---
## Executive Summary
**Total Features Analyzed:** 32
**Implemented:** 16 (50.0%)
**Partially Implemented:** 7 (21.9%)
**Missing:** 9 (28.1%)
The core KV operations, Raft consensus, Watch functionality, and basic cluster management are implemented and functional. Critical gaps exist in TTL/Lease management, read consistency controls, and transaction completeness. Production readiness is blocked by missing lease service and lack of authentication.
---
## Feature Gap Matrix
| Feature | Spec Section | Status | Priority | Complexity | Notes |
|---------|--------------|--------|----------|------------|-------|
| **Lease Service (TTL)** | 8.3, 4.1 | ❌ Missing | P0 | Medium (3-5d) | Protocol has lease field but no Lease gRPC service; critical for production |
| **TTL Expiration Logic** | 4.1, spec line 22-23 | ❌ Missing | P0 | Medium (3-5d) | lease_id stored but no background expiration worker |
| **Read Consistency Levels** | 4.1 | ❌ Missing | P0 | Small (1-2d) | Local/Serializable/Linearizable not implemented; all reads are undefined consistency |
| **Range Ops in Transactions** | 4.2, line 224-229 | ⚠️ Partial | P1 | Small (1-2d) | RequestOp has RangeRequest but returns dummy Delete op (kv_service.rs:224-229) |
| **Transaction Responses** | 3.1, kv_service.rs:194 | ⚠️ Partial | P1 | Small (1-2d) | TxnResponse.responses is empty vec; TODO comment in code |
| **Point-in-Time Reads** | 3.1, 7.3 | ⚠️ Partial | P1 | Medium (3-5d) | RangeRequest has revision field but KvStore doesn't use it |
| **StorageBackend Trait** | 3.3 | ❌ Missing | P1 | Medium (3-5d) | Spec defines trait (lines 166-174) but not in chainfire-core |
| **Prometheus Metrics** | 7.2 | ❌ Missing | P1 | Small (1-2d) | Spec mentions endpoint but no implementation |
| **Health Check Service** | 7.2 | ❌ Missing | P1 | Small (1d) | gRPC health check not visible |
| **Authentication** | 6.1 | ❌ Missing | P2 | Large (1w+) | Spec says "Planned"; mTLS for peers, tokens for clients |
| **Authorization/RBAC** | 6.2 | ❌ Missing | P2 | Large (1w+) | Requires IAM integration |
| **Namespace Quotas** | 6.3 | ❌ Missing | P2 | Medium (3-5d) | Per-namespace resource limits |
| **KV Service - Range** | 3.1 | ✅ Implemented | - | - | Single key, range scan, prefix scan all working |
| **KV Service - Put** | 3.1 | ✅ Implemented | - | - | Including prev_kv support |
| **KV Service - Delete** | 3.1 | ✅ Implemented | - | - | Single and range delete working |
| **KV Service - Txn (Basic)** | 3.1 | ✅ Implemented | - | - | Compare conditions and basic ops working |
| **Watch Service** | 3.1 | ✅ Implemented | - | - | Bidirectional streaming, create/cancel/progress |
| **Cluster Service - All** | 3.1 | ✅ Implemented | - | - | MemberAdd/Remove/List/Status all present |
| **Client Library - Core** | 3.2 | ✅ Implemented | - | - | Connect, put, get, delete, CAS implemented |
| **Client - Prefix Scan** | 3.2 | ✅ Implemented | - | - | get_prefix method exists |
| **ClusterEventHandler** | 3.3 | ✅ Implemented | - | - | All 8 callbacks defined in callbacks.rs |
| **KvEventHandler** | 3.3 | ✅ Implemented | - | - | on_key_changed, on_key_deleted, on_prefix_changed |
| **ClusterBuilder** | 3.4 | ✅ Implemented | - | - | Embeddable library with builder pattern |
| **MVCC Support** | 4.3 | ✅ Implemented | - | - | Global revision counter, create/mod revisions tracked |
| **RocksDB Storage** | 4.3 | ✅ Implemented | - | - | Column families: raft_logs, raft_meta, key_value, snapshot |
| **Raft Integration** | 2.0 | ✅ Implemented | - | - | OpenRaft 0.9 integrated, Vote/AppendEntries/Snapshot RPCs |
| **SWIM Gossip** | 2.1 | ⚠️ Partial | P2 | - | chainfire-gossip crate exists but integration unclear |
| **Server Binary** | 7.1 | ✅ Implemented | - | - | CLI with config file, env vars, bootstrap support |
| **Config Management** | 5.0 | ✅ Implemented | - | - | TOML config, env vars, CLI overrides |
| **Watch - Historical Replay** | 3.1 | ⚠️ Partial | P2 | Medium (3-5d) | start_revision exists in proto but historical storage unclear |
| **Snapshot & Backup** | 7.3 | ⚠️ Partial | P2 | Small (1-2d) | Raft snapshot exists but manual backup procedure not documented |
| **etcd Compatibility** | 8.3 | ⚠️ Partial | P2 | - | API similar but package names differ; missing Lease service breaks compatibility |
---
## Critical Gaps (P0)
### 1. Lease Service & TTL Expiration
**Impact:** Blocks production use cases requiring automatic key expiration (sessions, locks, ephemeral data)
**Evidence:**
- `/home/centra/cloud/chainfire/proto/chainfire.proto` has no `Lease` service definition
- `KvEntry` has `lease_id: Option<i64>` field (types/kv.rs:23) but no expiration logic
- No background worker to delete expired keys
- etcd compatibility broken without Lease service
**Fix Required:**
1. Add Lease service to proto: `LeaseGrant`, `LeaseRevoke`, `LeaseKeepAlive`, `LeaseTimeToLive`
2. Implement lease storage and expiration worker in chainfire-storage
3. Wire lease_id checks to KV operations
4. Add lease_id index for efficient expiration queries
---
### 2. Read Consistency Levels
**Impact:** Cannot guarantee linearizable reads; stale reads possible on followers
**Evidence:**
- Spec defines `ReadConsistency` enum (spec lines 208-215)
- No implementation in chainfire-storage or chainfire-api
- RangeRequest in kv_service.rs always reads from local storage without consistency checks
**Fix Required:**
1. Add consistency parameter to RangeRequest
2. Implement leader verification for Linearizable reads
3. Add committed index check for Serializable reads
4. Default to Linearizable for safety
---
### 3. Range Operations in Transactions
**Impact:** Cannot atomically read-then-write in transactions; limits CAS use cases
**Evidence:**
```rust
// /home/centra/cloud/chainfire/crates/chainfire-api/src/kv_service.rs:224-229
crate::proto::request_op::Request::RequestRange(_) => {
// Range operations in transactions are not supported yet
TxnOp::Delete { key: vec![] } // Returns dummy operation!
}
```
**Fix Required:**
1. Extend `chainfire_types::command::TxnOp` to include `Range` variant
2. Update state_machine.rs to handle read operations in transactions
3. Return range results in TxnResponse.responses
---
## Important Gaps (P1)
### 4. Transaction Response Completeness
**Evidence:**
```rust
// /home/centra/cloud/chainfire/crates/chainfire-api/src/kv_service.rs:194
Ok(Response::new(TxnResponse {
header: Some(self.make_header(response.revision)),
succeeded: response.succeeded,
responses: vec![], // TODO: fill in responses
}))
```
**Fix:** Collect operation results during txn execution and populate responses vector
---
### 5. Point-in-Time Reads (MVCC Historical Queries)
**Evidence:**
- RangeRequest has `revision` field (proto/chainfire.proto:78)
- KvStore.range() doesn't use revision parameter
- No revision-indexed storage in RocksDB
**Fix:** Implement versioned key storage or revision-based snapshots
---
### 6. StorageBackend Trait Abstraction
**Evidence:**
- Spec defines trait (lines 166-174) for pluggable backends
- chainfire-storage is RocksDB-only
- No trait in chainfire-core/src/
**Fix:** Extract trait and implement for RocksDB; enables memory backend testing
---
### 7. Observability
**Gaps:**
- No Prometheus metrics (spec mentions endpoint at 7.2)
- No gRPC health check service
- Limited structured logging
**Fix:** Add metrics crate, implement health checks, expose /metrics endpoint
---
## Nice-to-Have Gaps (P2)
- **Authentication/Authorization:** Spec marks as "Planned" - mTLS and RBAC
- **Namespace Quotas:** Resource limits per tenant
- **SWIM Gossip Integration:** chainfire-gossip crate exists but usage unclear
- **Watch Historical Replay:** start_revision in proto but storage unclear
- **Advanced etcd Compat:** Package name differences, field naming variations
---
## Key Findings
### Strengths
1. **Solid Core Implementation:** KV operations, Raft consensus, and basic transactions work well
2. **Watch System:** Fully functional with bidirectional streaming and event dispatch
3. **Client Library:** Well-designed with CAS and convenience methods
4. **Architecture:** Clean separation of concerns across crates
5. **Testing:** State machine has unit tests for core operations
### Weaknesses
1. **Incomplete Transactions:** Missing range ops and response population breaks advanced use cases
2. **No TTL Support:** Critical for production; requires full Lease service implementation
3. **Undefined Read Consistency:** Dangerous for distributed systems; needs immediate attention
4. **Limited Observability:** No metrics or health checks hinders production deployment
### Blockers for Production
1. Lease service implementation (P0)
2. Read consistency guarantees (P0)
3. Transaction completeness (P1)
4. Basic metrics/health checks (P1)
---
## Recommendations
### Phase 1: Production Readiness (2-3 weeks)
1. Implement Lease service and TTL expiration worker
2. Add read consistency levels (default to Linearizable)
3. Complete transaction responses
4. Add basic Prometheus metrics and health checks
### Phase 2: Feature Completeness (1-2 weeks)
1. Support range operations in transactions
2. Implement point-in-time reads
3. Extract StorageBackend trait
4. Document and test SWIM gossip integration
### Phase 3: Hardening (2-3 weeks)
1. Add authentication (mTLS for peers)
2. Implement basic authorization
3. Add namespace quotas
4. Comprehensive integration tests
---
## Appendix: Implementation Evidence
### Transaction Compare Logic
**Location:** `/home/centra/cloud/chainfire/crates/chainfire-storage/src/state_machine.rs:148-228`
- ✅ Supports Version, CreateRevision, ModRevision, Value comparisons
- ✅ Handles Equal, NotEqual, Greater, Less operators
- ✅ Atomic execution of success/failure ops
### Watch Implementation
**Location:** `/home/centra/cloud/chainfire/crates/chainfire-watch/`
- ✅ WatchRegistry with event dispatch
- ✅ WatchStream for bidirectional gRPC
- ✅ KeyMatcher for prefix/range watches
- ✅ Integration with state machine (state_machine.rs:82-88)
### Client CAS Example
**Location:** `/home/centra/cloud/chainfire/chainfire-client/src/client.rs:228-299`
- ✅ Uses transactions for compare-and-swap
- ✅ Returns CasOutcome with current/new versions
- ⚠️ Fallback read on failure uses range op (demonstrates txn range gap)
---
**Report Generated:** 2025-12-08
**Auditor:** Claude Code Agent
**Next Review:** After Phase 1 implementation

1165
coronafs/Cargo.lock generated Normal file

File diff suppressed because it is too large Load diff

32
coronafs/Cargo.toml Normal file
View file

@ -0,0 +1,32 @@
[workspace]
resolver = "2"
members = [
"crates/coronafs-server",
]
[workspace.package]
version = "0.1.0"
edition = "2021"
license = "MIT OR Apache-2.0"
rust-version = "1.75"
authors = ["PhotonCloud Contributors"]
repository = "https://github.com/photoncloud/photoncloud"
[workspace.dependencies]
axum = "0.8"
clap = { version = "4", features = ["derive"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
tokio = { version = "1.40", features = ["full"] }
toml = "0.8"
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
anyhow = "1.0"
thiserror = "1.0"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
[workspace.lints.rust]
unsafe_code = "deny"
[workspace.lints.clippy]
all = "warn"

18
coronafs/README.md Normal file
View file

@ -0,0 +1,18 @@
# CoronaFS
CoronaFS is PhotonCloud's mutable VM-volume layer.
Current implementation:
- custom block backend, not NFS
- control API on `coronafs-server`
- raw volume data stored under `/var/lib/coronafs/volumes`
- exported to workers as `nbd://` targets via `qemu-nbd`
- primary consumer: `plasmavmc` managed VM volumes
Intentional split:
- mutable VM volumes live on CoronaFS
- immutable VM images live in LightningStor object storage
This keeps VM root/data disks on a shared block path while leaving image distribution on the object layer.

View file

@ -0,0 +1,27 @@
[package]
name = "coronafs-server"
version.workspace = true
edition.workspace = true
license.workspace = true
rust-version.workspace = true
[[bin]]
name = "coronafs-server"
path = "src/main.rs"
[dependencies]
axum = { workspace = true }
clap = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }
toml = { workspace = true }
tracing = { workspace = true }
tracing-subscriber = { workspace = true }
anyhow = { workspace = true }
thiserror = { workspace = true }
chrono = { workspace = true }
futures-util = "0.3"
[lints]
workspace = true

View file

@ -0,0 +1,61 @@
use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
use std::path::PathBuf;
/// Runtime configuration for the CoronaFS server.
///
/// Loaded from a TOML file; `#[serde(default)]` means any field omitted from
/// the file falls back to the value in the `Default` impl below.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ServerConfig {
    /// Address the HTTP control API binds to.
    pub listen_addr: SocketAddr,
    /// Host name/IP handed out to clients — presumably embedded in the
    /// nbd:// export URIs; confirm against the export code.
    pub advertise_host: String,
    /// Root directory; volumes/metadata/pids live in subdirectories of it.
    pub data_dir: PathBuf,
    /// Address the NBD export processes bind to.
    pub export_bind_addr: String,
    /// First port of the NBD export port range.
    pub export_base_port: u16,
    /// Number of ports available in the export range.
    pub export_port_count: u16,
    /// Concurrent-client limit per export — presumably maps to
    /// `qemu-nbd --shared`; confirm.
    pub export_shared_clients: u16,
    /// Cache mode passed to the export process (e.g. "none").
    pub export_cache_mode: String,
    /// AIO mode passed to the export process (e.g. "io_uring").
    pub export_aio_mode: String,
    /// Discard (TRIM) mode for exports.
    pub export_discard_mode: String,
    /// Zero-detection mode for exports.
    pub export_detect_zeroes_mode: String,
    /// Whether to preallocate volume files at creation time.
    pub preallocate: bool,
    /// Whether writes are synced to disk eagerly.
    pub sync_on_write: bool,
    /// Path to the qemu-nbd binary (bare name resolves via PATH).
    pub qemu_nbd_path: PathBuf,
    /// Path to the qemu-img binary (bare name resolves via PATH).
    pub qemu_img_path: PathBuf,
    /// Default tracing filter when RUST_LOG is not set.
    pub log_level: String,
}
impl Default for ServerConfig {
fn default() -> Self {
Self {
listen_addr: "0.0.0.0:50088".parse().expect("valid listen addr"),
advertise_host: "127.0.0.1".to_string(),
data_dir: PathBuf::from("/var/lib/coronafs"),
export_bind_addr: "0.0.0.0".to_string(),
export_base_port: 11000,
export_port_count: 512,
export_shared_clients: 32,
export_cache_mode: "none".to_string(),
export_aio_mode: "io_uring".to_string(),
export_discard_mode: "unmap".to_string(),
export_detect_zeroes_mode: "unmap".to_string(),
preallocate: true,
sync_on_write: false,
qemu_nbd_path: PathBuf::from("qemu-nbd"),
qemu_img_path: PathBuf::from("qemu-img"),
log_level: "info".to_string(),
}
}
}
impl ServerConfig {
    /// Directory holding the raw volume image files.
    pub fn volume_dir(&self) -> PathBuf {
        self.subdir("volumes")
    }

    /// Directory holding per-volume metadata records.
    pub fn metadata_dir(&self) -> PathBuf {
        self.subdir("metadata")
    }

    /// Directory holding pid files for export processes.
    pub fn pid_dir(&self) -> PathBuf {
        self.subdir("pids")
    }

    /// Resolves a well-known subdirectory under `data_dir`.
    fn subdir(&self, name: &str) -> PathBuf {
        self.data_dir.join(name)
    }
}

View file

@ -0,0 +1,748 @@
mod config;
use anyhow::{anyhow, Context, Result};
use axum::body::Body;
use axum::extract::{Path, Query, State};
use axum::http::StatusCode;
use axum::response::{IntoResponse, Response};
use axum::routing::{get, post, put};
use axum::{Json, Router};
use clap::Parser;
use config::ServerConfig;
use futures_util::StreamExt;
use serde::{Deserialize, Serialize};
#[cfg(unix)]
use std::os::unix::fs::PermissionsExt;
use std::collections::{HashMap, HashSet};
use std::path::{Path as FsPath, PathBuf};
use std::sync::Arc;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio::process::Command;
use tokio::sync::Mutex;
use tracing_subscriber::EnvFilter;
// Command-line arguments for the coronafs-server binary.
// (Plain `//` comments on purpose: clap derive turns `///` doc comments into
// user-visible help text, which would change CLI output.)
#[derive(Parser, Debug)]
#[command(author, version, about)]
struct Args {
    // Path to the TOML config file; server falls back to built-in defaults
    // when the file does not exist (see main()).
    #[arg(short, long, default_value = "coronafs.toml")]
    config: PathBuf,
}
/// Per-volume metadata record. Serde-derived, so presumably persisted under
/// `metadata_dir()` by code elsewhere in this file — confirm the format there.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct VolumeMetadata {
    /// Volume identifier (the `{id}` path segment of the API).
    id: String,
    /// Logical size of the volume in bytes.
    size_bytes: u64,
    /// NBD port the volume is exported on, if an export exists.
    port: Option<u16>,
    /// PID of the export process, if one was recorded.
    export_pid: Option<u32>,
    /// Creation timestamp (string-encoded; exact format set by the writer).
    created_at: String,
    /// Last-modification timestamp (string-encoded).
    updated_at: String,
}
/// JSON body returned by the volume API endpoints.
#[derive(Debug, Serialize)]
struct VolumeResponse {
    /// Volume identifier.
    id: String,
    /// Current logical size in bytes.
    size_bytes: u64,
    /// Server-side path of the backing file.
    path: String,
    /// Present only when the volume has an active export.
    export: Option<ExportResponse>,
}
/// Details of an active export, nested inside [`VolumeResponse`].
#[derive(Debug, Serialize)]
struct ExportResponse {
    /// Connection URI handed to workers — nbd:// per the README; the exact
    /// construction happens in code outside this view.
    uri: String,
    /// Port the export listens on.
    port: u16,
    /// Export process id, when known.
    pid: Option<u32>,
}
/// Request body for `PUT /v1/volumes/{id}`: size of the blank volume to create.
#[derive(Debug, Deserialize)]
struct CreateVolumeRequest {
    size_bytes: u64,
}
/// Request body for `POST /v1/volumes/{id}/resize`: new target size in bytes.
#[derive(Debug, Deserialize)]
struct ResizeVolumeRequest {
    size_bytes: u64,
}
/// Query parameters for `PUT /v1/volumes/{id}/import`. `size_bytes` is
/// optional; the fallback when absent is decided by `import_impl` — confirm
/// there (possibly the size of the uploaded stream).
#[derive(Debug, Deserialize)]
struct ImportQuery {
    size_bytes: Option<u64>,
}
/// Query parameters for `POST /v1/volumes/{id}/export`.
/// `read_only` defaults to `false` (see `ensure_export`).
#[derive(Debug, Deserialize)]
struct ExportQuery {
    read_only: Option<bool>,
}
/// Shared handler state; cheap to clone because every field is an `Arc`.
#[derive(Clone)]
struct AppState {
    /// Immutable server configuration.
    config: Arc<ServerConfig>,
    /// Per-volume async locks serializing operations on the same volume id.
    /// NOTE(review): entries are inserted on first use and never removed, so
    /// this map grows with the number of distinct ids ever seen — confirm
    /// that is acceptable for the expected volume churn.
    volume_guards: Arc<Mutex<HashMap<String, Arc<Mutex<()>>>>>,
    /// NBD ports considered in use; seeded at startup (see `AppState::new`).
    reserved_ports: Arc<Mutex<HashSet<u16>>>,
}
impl AppState {
    /// Builds the shared state: creates the on-disk directory layout via
    /// `prepare_dirs`, then seeds the reserved-port set via
    /// `collect_reserved_ports` (both defined elsewhere in this file).
    ///
    /// # Errors
    /// Propagates any error from directory preparation or port collection.
    async fn new(config: ServerConfig) -> Result<Self> {
        prepare_dirs(&config).await?;
        let reserved_ports = collect_reserved_ports(&config).await?;
        Ok(Self {
            config: Arc::new(config),
            volume_guards: Arc::new(Mutex::new(HashMap::new())),
            reserved_ports: Arc::new(Mutex::new(reserved_ports)),
        })
    }

    /// Returns the per-volume lock for `volume_id`, creating it on first use.
    /// The outer map mutex is held only long enough to fetch/insert the
    /// entry; callers then lock the returned `Arc<Mutex<()>>` themselves.
    async fn volume_guard(&self, volume_id: &str) -> Arc<Mutex<()>> {
        let mut guards = self.volume_guards.lock().await;
        guards
            .entry(volume_id.to_string())
            .or_insert_with(|| Arc::new(Mutex::new(())))
            .clone()
    }
}
/// HTTP-level error: a status code plus a human-readable message, rendered
/// as a JSON envelope by the `IntoResponse` impl below.
#[derive(Debug)]
struct ApiError {
    status: StatusCode,
    message: String,
}
impl ApiError {
fn new(status: StatusCode, message: impl Into<String>) -> Self {
Self {
status,
message: message.into(),
}
}
fn not_found(message: impl Into<String>) -> Self {
Self::new(StatusCode::NOT_FOUND, message)
}
fn internal(err: anyhow::Error) -> Self {
Self::new(StatusCode::INTERNAL_SERVER_ERROR, err.to_string())
}
}
impl IntoResponse for ApiError {
    /// Renders the error as `{"error": "<message>"}` with the stored status.
    fn into_response(self) -> Response {
        let payload = serde_json::json!({
            "error": self.message,
        });
        (self.status, Json(payload)).into_response()
    }
}
type ApiResult<T> = Result<Json<T>, ApiError>;
/// Entry point: loads configuration, initializes tracing, builds the axum
/// router, and serves the control API until the process is stopped.
#[tokio::main]
async fn main() -> Result<()> {
    let args = Args::parse();
    // Config file is optional: a missing file silently falls back to
    // built-in defaults, but a present-but-invalid file is a hard error.
    let config = if args.config.exists() {
        let contents = fs::read_to_string(&args.config)
            .await
            .with_context(|| format!("failed to read config {}", args.config.display()))?;
        toml::from_str::<ServerConfig>(&contents)
            .with_context(|| format!("failed to parse config {}", args.config.display()))?
    } else {
        ServerConfig::default()
    };
    // RUST_LOG (the default env filter) wins; config.log_level is the fallback.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&config.log_level)),
        )
        .init();
    // Copy the addr out before `config` is moved into AppState::new.
    let listen_addr = config.listen_addr;
    let state = AppState::new(config).await?;
    // Route table for the control API; `{id}` is the volume identifier.
    let app = Router::new()
        .route("/healthz", get(healthz))
        .route("/v1/volumes/{id}", put(create_blank_volume).get(get_volume).delete(delete_volume))
        .route("/v1/volumes/{id}/import", put(import_volume))
        .route("/v1/volumes/{id}/resize", post(resize_volume))
        .route("/v1/volumes/{id}/export", post(ensure_export))
        .with_state(state);
    tracing::info!(%listen_addr, "starting CoronaFS server");
    let listener = tokio::net::TcpListener::bind(listen_addr).await?;
    axum::serve(listener, app).await?;
    Ok(())
}
/// Liveness probe endpoint; always reports `{"status": "ok"}`.
async fn healthz() -> Json<serde_json::Value> {
    let body = serde_json::json!({"status": "ok"});
    Json(body)
}
async fn create_blank_volume(
State(state): State<AppState>,
Path(id): Path<String>,
Json(req): Json<CreateVolumeRequest>,
) -> ApiResult<VolumeResponse> {
let volume_guard = state.volume_guard(&id).await;
let _guard = volume_guard.lock().await;
create_blank_impl(&state, &id, req.size_bytes)
.await
.map(Json)
.map_err(ApiError::internal)
}
async fn import_volume(
State(state): State<AppState>,
Path(id): Path<String>,
Query(query): Query<ImportQuery>,
body: Body,
) -> ApiResult<VolumeResponse> {
let volume_guard = state.volume_guard(&id).await;
let _guard = volume_guard.lock().await;
import_impl(&state, &id, query.size_bytes, body)
.await
.map(Json)
.map_err(ApiError::internal)
}
async fn get_volume(
State(state): State<AppState>,
Path(id): Path<String>,
) -> ApiResult<VolumeResponse> {
load_response(&state, &id)
.await
.ok_or_else(|| ApiError::not_found(format!("volume {id} not found")))
.map(Json)
}
async fn ensure_export(
State(state): State<AppState>,
Path(id): Path<String>,
Query(query): Query<ExportQuery>,
) -> ApiResult<VolumeResponse> {
let volume_guard = state.volume_guard(&id).await;
let _guard = volume_guard.lock().await;
ensure_export_impl(&state, &id, query.read_only.unwrap_or(false))
.await
.map(Json)
.map_err(ApiError::internal)
}
async fn resize_volume(
State(state): State<AppState>,
Path(id): Path<String>,
Json(req): Json<ResizeVolumeRequest>,
) -> ApiResult<VolumeResponse> {
let volume_guard = state.volume_guard(&id).await;
let _guard = volume_guard.lock().await;
resize_impl(&state, &id, req.size_bytes)
.await
.map(Json)
.map_err(ApiError::internal)
}
/// `DELETE /v1/volumes/{id}` — remove the volume; 204 No Content on success.
async fn delete_volume(
    State(state): State<AppState>,
    Path(id): Path<String>,
) -> Result<StatusCode, ApiError> {
    let volume_guard = state.volume_guard(&id).await;
    let _guard = volume_guard.lock().await;
    match delete_impl(&state, &id).await {
        Ok(()) => Ok(StatusCode::NO_CONTENT),
        Err(err) => Err(ApiError::internal(err)),
    }
}
/// Create the volume, metadata, and pid directories if they do not exist.
async fn prepare_dirs(config: &ServerConfig) -> Result<()> {
    for dir in [config.volume_dir(), config.metadata_dir(), config.pid_dir()] {
        fs::create_dir_all(dir).await?;
    }
    Ok(())
}
async fn create_blank_impl(state: &AppState, id: &str, size_bytes: u64) -> Result<VolumeResponse> {
let path = volume_path(&state.config, id);
let meta_path = metadata_path(&state.config, id);
if fs::try_exists(&meta_path).await.unwrap_or(false) {
return load_response_required(state, id).await;
}
if state.config.preallocate {
let status = Command::new("fallocate")
.args(["-l", &size_bytes.to_string(), path.to_string_lossy().as_ref()])
.status()
.await;
match status {
Ok(status) if status.success() => {}
_ => {
let file = fs::File::create(&path).await?;
file.set_len(size_bytes).await?;
}
}
} else {
let file = fs::File::create(&path).await?;
file.set_len(size_bytes).await?;
}
let meta = VolumeMetadata {
id: id.to_string(),
size_bytes,
port: None,
export_pid: None,
created_at: chrono::Utc::now().to_rfc3339(),
updated_at: chrono::Utc::now().to_rfc3339(),
};
ensure_volume_file_permissions(&path).await?;
save_metadata(&meta_path, &meta).await?;
load_response_required(state, id).await
}
/// Stream a raw image from the request body into the volume file for `id`.
///
/// Writes into a temp file first and renames it into place so a failed or
/// interrupted upload never clobbers an existing volume. When `size_bytes`
/// is given, the temp file is preallocated up front and the final file is
/// truncated/extended to exactly that size; otherwise the size is whatever
/// the client sent.
async fn import_impl(
    state: &AppState,
    id: &str,
    size_bytes: Option<u64>,
    body: Body,
) -> Result<VolumeResponse> {
    let path = volume_path(&state.config, id);
    let meta_path = metadata_path(&state.config, id);
    let tmp_path = temp_import_path(&state.config, id);
    if let Some(size_bytes) = size_bytes {
        create_or_preallocate_file(&tmp_path, size_bytes, state.config.preallocate).await?;
    }
    let mut stream = body.into_data_stream();
    // When a size was provided the temp file already exists at full length,
    // so we must NOT truncate it (that would discard the preallocation);
    // without a size we truncate any stale temp file from a prior attempt.
    let mut file = fs::OpenOptions::new()
        .create(true)
        .write(true)
        .truncate(size_bytes.is_none())
        .open(&tmp_path)
        .await
        .with_context(|| format!("failed to create {}", tmp_path.display()))?;
    let mut bytes_written = 0u64;
    while let Some(chunk) = stream.next().await {
        let chunk = chunk.context("failed to read request body chunk")?;
        bytes_written = bytes_written.saturating_add(chunk.len() as u64);
        file.write_all(&chunk)
            .await
            .with_context(|| format!("failed to write {}", tmp_path.display()))?;
    }
    // Force the final size when requested: pads short uploads with zeros and
    // truncates oversized ones.
    if let Some(size_bytes) = size_bytes {
        file.set_len(size_bytes).await?;
    }
    if state.config.sync_on_write {
        file.sync_all().await?;
    }
    drop(file);
    // Atomic publish: readers see either the old volume or the complete new one.
    fs::rename(&tmp_path, &path).await?;
    ensure_volume_file_permissions(&path).await?;
    let actual_size = fs::metadata(&path).await?.len();
    // A fresh import invalidates any previously recorded export state.
    let meta = VolumeMetadata {
        id: id.to_string(),
        size_bytes: size_bytes.unwrap_or(actual_size),
        port: None,
        export_pid: None,
        created_at: chrono::Utc::now().to_rfc3339(),
        updated_at: chrono::Utc::now().to_rfc3339(),
    };
    save_metadata(&meta_path, &meta).await?;
    tracing::info!(
        volume_id = id,
        bytes_written,
        volume_size = actual_size,
        "Imported raw volume into CoronaFS"
    );
    load_response_required(state, id).await
}
/// Resize the raw volume `id` to `size_bytes` via `qemu-img resize`.
///
/// Any live qemu-nbd export is stopped first (and its port released), since
/// resizing a file that is actively exported would confuse attached clients;
/// callers must re-request an export afterwards.
///
/// NOTE(review): `qemu-img resize` refuses to shrink a raw image without
/// `--shrink`, so shrink requests will fail here — confirm that is intended.
async fn resize_impl(state: &AppState, id: &str, size_bytes: u64) -> Result<VolumeResponse> {
    let meta_path = metadata_path(&state.config, id);
    let path = volume_path(&state.config, id);
    let mut meta = load_metadata(&meta_path)
        .await?
        .ok_or_else(|| anyhow!("volume {id} not found"))?;
    // Remember the port before stop_export_if_running clears the pid state.
    let reserved_port = meta.port;
    stop_export_if_running(&state.config, id, &mut meta).await?;
    release_export_port(state, reserved_port).await;
    meta.port = None;
    let status = Command::new(&state.config.qemu_img_path)
        .args([
            "resize",
            "-f",
            "raw",
            path.to_string_lossy().as_ref(),
            &size_bytes.to_string(),
        ])
        .status()
        .await
        .context("failed to spawn qemu-img resize")?;
    if !status.success() {
        return Err(anyhow!("qemu-img resize failed for {}", path.display()));
    }
    meta.size_bytes = size_bytes;
    meta.updated_at = chrono::Utc::now().to_rfc3339();
    save_metadata(&meta_path, &meta).await?;
    load_response_required(state, id).await
}
/// Ensure a qemu-nbd export is running for volume `id`, starting one if
/// needed, and return the refreshed volume state.
///
/// Fast path: if the recorded export pid is still alive, just re-mark its
/// port as reserved (covers a server restart) and return. Otherwise a new
/// qemu-nbd is forked on a freshly reserved port; on any failure along the
/// way the port reservation is rolled back.
async fn ensure_export_impl(state: &AppState, id: &str, read_only: bool) -> Result<VolumeResponse> {
    let meta_path = metadata_path(&state.config, id);
    let mut meta = load_metadata(&meta_path)
        .await?
        .ok_or_else(|| anyhow!("volume {id} not found"))?;
    if let Some(pid) = meta.export_pid {
        if process_running(pid).await {
            if let Some(port) = meta.port {
                mark_port_reserved(state, port).await;
            }
            return load_response_required(state, id).await;
        }
    }
    // Prefer the volume's previous port so client URIs stay stable.
    let port = reserve_export_port(state, meta.port).await?;
    let pid_path = pid_path(&state.config, id);
    let path = volume_path(&state.config, id);
    // Native AIO needs an O_DIRECT-capable cache mode; downgrade otherwise.
    let effective_aio_mode = export_aio_mode(&state.config.export_cache_mode, &state.config.export_aio_mode);
    let mut command = Command::new(&state.config.qemu_nbd_path);
    command.args([
        "--fork",
        "--persistent",
        "--pid-file",
        pid_path.to_string_lossy().as_ref(),
        "--shared",
        &state.config.export_shared_clients.to_string(),
        "--cache",
        &state.config.export_cache_mode,
        "--aio",
        effective_aio_mode,
        "--discard",
        &state.config.export_discard_mode,
        "--detect-zeroes",
        &state.config.export_detect_zeroes_mode,
        "--format",
        "raw",
        "--bind",
        &state.config.export_bind_addr,
        "--port",
        &port.to_string(),
    ]);
    if read_only {
        command.arg("--read-only");
    }
    command.arg(path.to_string_lossy().as_ref());
    let status = command
        .status()
        .await
        .context("failed to spawn qemu-nbd")?;
    if !status.success() {
        release_export_port(state, Some(port)).await;
        return Err(anyhow!("qemu-nbd failed to export volume {id} on port {port}"));
    }
    // `--fork` daemonizes, so the server's pid must be read from the pid file.
    let pid = match read_pid_file(&pid_path).await {
        Ok(pid) => pid,
        Err(err) => {
            release_export_port(state, Some(port)).await;
            return Err(err);
        }
    };
    meta.port = Some(port);
    meta.export_pid = Some(pid);
    meta.updated_at = chrono::Utc::now().to_rfc3339();
    save_metadata(&meta_path, &meta).await?;
    // Only report success once the export actually accepts TCP connections;
    // tear everything down again if it never comes up.
    if let Err(err) = wait_for_tcp_listen(export_probe_host(&state.config), port).await {
        let _ = stop_export_if_running(&state.config, id, &mut meta).await;
        release_export_port(state, Some(port)).await;
        return Err(err);
    }
    load_response_required(state, id).await
}
/// Resolve the effective qemu-nbd AIO mode for a given cache mode.
///
/// qemu-nbd's `--aio native` requires `O_DIRECT`, i.e. cache mode `none` or
/// `directsync`; for any other cache mode the request is downgraded to
/// threaded AIO and a warning is logged.
fn export_aio_mode<'a>(cache_mode: &str, aio_mode: &'a str) -> &'a str {
    let native_capable = matches!(cache_mode, "none" | "directsync");
    if aio_mode != "native" || native_capable {
        return aio_mode;
    }
    tracing::warn!(
        cache_mode,
        requested_aio_mode = aio_mode,
        effective_aio_mode = "threads",
        "CoronaFS export cache mode is incompatible with qemu-nbd native AIO; falling back to threads",
    );
    "threads"
}
/// Tear down a volume: stop any running export, release its port, then
/// remove the data, metadata, and pid files (missing files are skipped).
async fn delete_impl(state: &AppState, id: &str) -> Result<()> {
    let meta_path = metadata_path(&state.config, id);
    if let Some(mut meta) = load_metadata(&meta_path).await? {
        let reserved_port = meta.port;
        stop_export_if_running(&state.config, id, &mut meta).await?;
        release_export_port(state, reserved_port).await;
    }
    let volume_file = volume_path(&state.config, id);
    let pid_file = pid_path(&state.config, id);
    // Same removal order as before: data file, metadata, pid file.
    for file in [volume_file, meta_path, pid_file] {
        if fs::try_exists(&file).await.unwrap_or(false) {
            fs::remove_file(&file).await?;
        }
    }
    Ok(())
}
/// Non-failing variant of `load_response_required`: `None` when the volume
/// is missing or its metadata cannot be read.
async fn load_response(state: &AppState, id: &str) -> Option<VolumeResponse> {
    // `match { Ok => Some, Err => None }` is exactly `Result::ok`.
    load_response_required(state, id).await.ok()
}
/// Build the API response for volume `id` from its on-disk metadata.
///
/// Errors when the metadata file is missing or unreadable. Export details
/// are only included when a port is recorded AND the recorded pid is still
/// alive (checked synchronously via a /proc lookup — cheap enough to do
/// inline in async context).
async fn load_response_required(state: &AppState, id: &str) -> Result<VolumeResponse> {
    let meta = load_metadata(&metadata_path(&state.config, id))
        .await?
        .ok_or_else(|| anyhow!("volume {id} not found"))?;
    let export = match (meta.port, meta.export_pid) {
        (Some(port), pid) if pid.map(process_running_sync).unwrap_or(false) => Some(ExportResponse {
            uri: format!("nbd://{}:{}", state.config.advertise_host, port),
            port,
            pid,
        }),
        _ => None,
    };
    Ok(VolumeResponse {
        id: meta.id,
        size_bytes: meta.size_bytes,
        path: volume_path(&state.config, id).display().to_string(),
        export,
    })
}
/// Read and deserialise volume metadata, returning `Ok(None)` when the file
/// does not exist (a failed existence check is treated as "missing").
async fn load_metadata(path: &FsPath) -> Result<Option<VolumeMetadata>> {
    let exists = fs::try_exists(path).await.unwrap_or(false);
    if !exists {
        return Ok(None);
    }
    let bytes = fs::read(path).await?;
    let meta = serde_json::from_slice(&bytes)?;
    Ok(Some(meta))
}
/// Atomically persist volume metadata: write a sibling `*.json.tmp` file,
/// flush it to disk, then rename over the destination so readers never see
/// a partially written document.
async fn save_metadata(path: &FsPath, meta: &VolumeMetadata) -> Result<()> {
    let bytes = serde_json::to_vec_pretty(meta)?;
    let tmp_path = path.with_extension("json.tmp");
    let mut file = fs::File::create(&tmp_path).await?;
    file.write_all(&bytes).await?;
    // Without this fsync a crash right after the rename could still leave an
    // empty/truncated metadata file, defeating the atomic-replace intent.
    file.sync_all().await?;
    drop(file);
    fs::rename(&tmp_path, path).await?;
    Ok(())
}
/// Stop the qemu-nbd export for `id` if its recorded pid is still alive.
///
/// Sends SIGTERM and polls for exit for up to 10 seconds, escalating to
/// SIGKILL on timeout. Clears `meta.export_pid` and removes the pid file in
/// every case; the caller is responsible for persisting the metadata.
async fn stop_export_if_running(config: &ServerConfig, id: &str, meta: &mut VolumeMetadata) -> Result<()> {
    if let Some(pid) = meta.export_pid {
        if process_running(pid).await {
            let status = Command::new("kill")
                .args(["-TERM", &pid.to_string()])
                .status()
                .await
                .context("failed to terminate qemu-nbd export")?;
            if !status.success() {
                return Err(anyhow!("failed to stop qemu-nbd export pid {pid}"));
            }
            // Poll for exit; SIGKILL as a last resort after the deadline.
            let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10);
            while process_running(pid).await {
                if std::time::Instant::now() >= deadline {
                    let _ = Command::new("kill")
                        .args(["-KILL", &pid.to_string()])
                        .status()
                        .await;
                    break;
                }
                tokio::time::sleep(std::time::Duration::from_millis(100)).await;
            }
        }
    }
    meta.export_pid = None;
    let pid_path = pid_path(config, id);
    if fs::try_exists(&pid_path).await.unwrap_or(false) {
        fs::remove_file(pid_path).await?;
    }
    Ok(())
}
/// Liveness check via procfs: a process exists iff `/proc/<pid>` does.
/// Linux-specific; any error reading the path is treated as "not running".
async fn process_running(pid: u32) -> bool {
    fs::try_exists(format!("/proc/{pid}")).await.unwrap_or(false)
}
/// Synchronous twin of `process_running` for non-async call sites.
/// Relies on a procfs layout (`/proc/<pid>`), so this is Linux-specific.
fn process_running_sync(pid: u32) -> bool {
    std::path::Path::new(&format!("/proc/{pid}")).exists()
}
/// Poll for the qemu-nbd pid file (written asynchronously after `--fork`)
/// and parse its contents, giving up after 5 seconds.
async fn read_pid_file(path: &FsPath) -> Result<u32> {
    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
    loop {
        if fs::try_exists(path).await.unwrap_or(false) {
            let contents = fs::read_to_string(path).await?;
            return contents
                .trim()
                .parse::<u32>()
                .with_context(|| format!("invalid pid file {}", path.display()));
        }
        if std::time::Instant::now() >= deadline {
            return Err(anyhow!("timed out waiting for pid file {}", path.display()));
        }
        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
    }
}
/// Poll until `host:port` accepts a TCP connection, failing after 5 seconds.
/// Used to confirm a freshly forked qemu-nbd export is actually serving.
async fn wait_for_tcp_listen(host: &str, port: u16) -> Result<()> {
    let deadline = std::time::Instant::now() + std::time::Duration::from_secs(5);
    loop {
        if tokio::net::TcpStream::connect((host, port)).await.is_ok() {
            return Ok(());
        }
        if std::time::Instant::now() >= deadline {
            return Err(anyhow!("timed out waiting for export {}:{}", host, port));
        }
        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
    }
}
/// Scan the metadata directory at startup and rebuild the set of NBD ports
/// still owned by live qemu-nbd exports.
///
/// Metadata entries referencing a dead export (a port without a live pid,
/// or vice versa) are scrubbed back to the unexported state on disk.
async fn collect_reserved_ports(config: &ServerConfig) -> Result<HashSet<u16>> {
    let mut reserved = HashSet::new();
    let mut entries = fs::read_dir(config.metadata_dir()).await?;
    while let Some(entry) = entries.next_entry().await? {
        let path = entry.path();
        // Skip stray files (e.g. a leftover `*.json.tmp` from an interrupted
        // save_metadata) instead of failing startup on a JSON parse error.
        if path.extension().and_then(|ext| ext.to_str()) != Some("json") {
            continue;
        }
        let Some(mut meta) = load_metadata(&path).await? else {
            continue;
        };
        match (meta.port, meta.export_pid) {
            (Some(port), Some(pid)) if process_running(pid).await => {
                reserved.insert(port);
            }
            (Some(_), _) | (_, Some(_)) => {
                meta.port = None;
                meta.export_pid = None;
                meta.updated_at = chrono::Utc::now().to_rfc3339();
                save_metadata(&path, &meta).await?;
            }
            _ => {}
        }
    }
    Ok(reserved)
}
/// Pick a free NBD port, preferring the volume's previous port when it is
/// still usable so client URIs stay stable.
///
/// Holds the reserved-port set lock across the probe so two concurrent
/// exports cannot claim the same port.
async fn reserve_export_port(state: &AppState, preferred_port: Option<u16>) -> Result<u16> {
    let mut reserved = state.reserved_ports.lock().await;
    if let Some(port) = preferred_port {
        if port_is_usable(&state.config, &reserved, port).await {
            reserved.insert(port);
            return Ok(port);
        }
    }
    // Range arithmetic in u32 so base + count cannot overflow u16.
    let start = state.config.export_base_port as u32;
    let end = start + state.config.export_port_count as u32;
    for port in start..end {
        let port_u16 = port as u16;
        if port_is_usable(&state.config, &reserved, port_u16).await {
            reserved.insert(port_u16);
            return Ok(port_u16);
        }
    }
    // Reuse the u32 bounds here too: the previous `base + count` in u16
    // could overflow (debug panic / release wrap) for ranges ending at 65535.
    Err(anyhow!("no free export ports left in range {start}..{end}"))
}
/// A port is usable when it lies inside the configured export range, is not
/// already reserved, and can currently be bound on the export address.
async fn port_is_usable(config: &ServerConfig, reserved: &HashSet<u16>, port: u16) -> bool {
    let range = (config.export_base_port as u32)
        ..(config.export_base_port as u32 + config.export_port_count as u32);
    if !range.contains(&(port as u32)) || reserved.contains(&port) {
        return false;
    }
    tokio::net::TcpListener::bind((config.export_bind_addr.as_str(), port))
        .await
        .is_ok()
}
/// Record `port` as owned by a live export.
async fn mark_port_reserved(state: &AppState, port: u16) {
    state.reserved_ports.lock().await.insert(port);
}
/// Return a previously reserved port (if any) to the free pool.
async fn release_export_port(state: &AppState, port: Option<u16>) {
    let Some(port) = port else {
        return;
    };
    state.reserved_ports.lock().await.remove(&port);
}
/// Host used to probe a freshly started export: wildcard/empty bind
/// addresses are not directly connectable, so probe loopback instead.
fn export_probe_host(config: &ServerConfig) -> &str {
    let bind = config.export_bind_addr.as_str();
    if matches!(bind, "0.0.0.0" | "::" | "") {
        "127.0.0.1"
    } else {
        bind
    }
}
/// Create `path` at the requested size, preferring `fallocate` when
/// preallocation is enabled and falling back to a sparse file otherwise.
///
/// Fix: permissions were previously only adjusted on the fallback path,
/// leaving successfully fallocate'd files with default mode bits; now both
/// branches apply the volume file permissions.
async fn create_or_preallocate_file(path: &FsPath, size_bytes: u64, preallocate: bool) -> Result<()> {
    if preallocate {
        let status = Command::new("fallocate")
            .args(["-l", &size_bytes.to_string(), path.to_string_lossy().as_ref()])
            .status()
            .await;
        if matches!(status, Ok(status) if status.success()) {
            ensure_volume_file_permissions(path).await?;
            return Ok(());
        }
        // fallocate missing or unsupported on this filesystem: fall through
        // to the sparse-file path below.
    }
    let file = fs::File::create(path).await?;
    file.set_len(size_bytes).await?;
    ensure_volume_file_permissions(path).await?;
    Ok(())
}
/// Set owner/group read-write (0660) on a volume file; a no-op on non-Unix
/// targets.
async fn ensure_volume_file_permissions(path: &FsPath) -> Result<()> {
    #[cfg(unix)]
    {
        // `from_mode` comes from `std::os::unix::fs::PermissionsExt`.
        let permissions = std::fs::Permissions::from_mode(0o660);
        fs::set_permissions(path, permissions).await?;
    }
    Ok(())
}
/// On-disk path of the raw volume image: `<volume_dir>/<id>.raw`.
fn volume_path(config: &ServerConfig, id: &str) -> PathBuf {
    config.volume_dir().join(format!("{id}.raw"))
}
/// On-disk path of the volume's metadata document: `<metadata_dir>/<id>.json`.
fn metadata_path(config: &ServerConfig, id: &str) -> PathBuf {
    config.metadata_dir().join(format!("{id}.json"))
}
/// On-disk path of the qemu-nbd pid file for this volume: `<pid_dir>/<id>.pid`.
fn pid_path(config: &ServerConfig, id: &str) -> PathBuf {
    config.pid_dir().join(format!("{id}.pid"))
}
/// Staging path for an in-progress import: `<data_dir>/<id>.import.tmp`.
// NOTE(review): assumes `data_dir` and `volume_dir` live on the same
// filesystem so the post-import rename stays atomic — confirm.
fn temp_import_path(config: &ServerConfig, id: &str) -> PathBuf {
    config.data_dir.join(format!("{id}.import.tmp"))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Native AIO is only kept for O_DIRECT-capable cache modes.
    #[test]
    fn export_aio_mode_falls_back_for_cached_exports() {
        let cases = [
            ("writeback", "native", "threads"),
            ("none", "native", "native"),
            ("directsync", "native", "native"),
            ("writeback", "threads", "threads"),
        ];
        for (cache_mode, requested, expected) in cases {
            assert_eq!(export_aio_mode(cache_mode, requested), expected);
        }
    }

    /// Wildcard bind addresses are probed via loopback; concrete ones as-is.
    #[test]
    fn export_probe_host_prefers_loopback_for_wildcard_bind() {
        let wildcard = ServerConfig {
            export_bind_addr: "0.0.0.0".to_string(),
            ..ServerConfig::default()
        };
        assert_eq!(export_probe_host(&wildcard), "127.0.0.1");
        let concrete = ServerConfig {
            export_bind_addr: "10.100.0.11".to_string(),
            ..ServerConfig::default()
        };
        assert_eq!(export_probe_host(&concrete), "10.100.0.11");
    }
}

View file

@ -0,0 +1,10 @@
# Manifest for the shared IAM auth client wrapper crate.
[package]
name = "photon-auth-client"
version = "0.1.0"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "Shared IAM auth client wrapper for PhotonCloud services"
[dependencies]
anyhow = "1.0"
# Path dependency on the IAM workspace's service-auth crate.
iam-service-auth = { path = "../../iam/crates/iam-service-auth" }

View file

@ -0,0 +1,10 @@
// Re-export the IAM auth surface so downstream crates depend only on
// `photon-auth-client` instead of `iam_service_auth` directly.
pub use iam_service_auth::{
    get_tenant_context, resolve_tenant_ids_from_context, resource_for_tenant, AuthService,
    TenantContext,
};
pub async fn connect_iam(endpoint: &str) -> anyhow::Result<AuthService> {
AuthService::new(endpoint).await.map_err(|error| {
anyhow::anyhow!("failed to connect to IAM server at {}: {}", endpoint, error)
})
}

View file

@ -0,0 +1,11 @@
# Manifest for the shared configuration-loading helper crate.
[package]
name = "photon-config"
version = "0.1.0"
edition = "2021"
license = "MIT OR Apache-2.0"
description = "Shared configuration loading helpers for PhotonCloud"
[dependencies]
anyhow = "1.0"
# serde derive is required for DeserializeOwned config types.
serde = { version = "1.0", features = ["derive"] }
toml = "0.8"

View file

@ -0,0 +1,58 @@
use anyhow::Context;
use serde::de::DeserializeOwned;
use std::fs;
use std::path::Path;
/// Load a TOML config of type `T` from `path`, returning `T::default()`
/// when the file does not exist.
///
/// # Errors
/// Fails when the file exists but cannot be read, or when its contents are
/// not valid TOML for `T`.
pub fn load_toml_config<T>(path: &Path) -> anyhow::Result<T>
where
    T: DeserializeOwned + Default,
{
    // Read directly and treat NotFound as "use defaults" instead of the
    // previous exists()-then-read sequence, which raced with concurrent
    // deletion (TOCTOU) and reported a vanished file as a read error.
    let contents = match fs::read_to_string(path) {
        Ok(contents) => contents,
        Err(err) if err.kind() == std::io::ErrorKind::NotFound => return Ok(T::default()),
        Err(err) => {
            return Err(err)
                .with_context(|| format!("failed to read config file {}", path.display()))
        }
    };
    toml::from_str(&contents)
        .with_context(|| format!("failed to parse config file {}", path.display()))
}
#[cfg(test)]
mod tests {
    use super::load_toml_config;
    use serde::Deserialize;
    use std::fs;
    use std::path::PathBuf;
    use std::time::{SystemTime, UNIX_EPOCH};
    // Minimal config shape used to exercise the loader.
    #[derive(Debug, Default, Deserialize, PartialEq)]
    struct TestConfig {
        value: String,
    }
    /// Build a unique temp-file path (process id + nanosecond timestamp) so
    /// parallel test runs cannot collide on the same file.
    fn temp_path(name: &str) -> PathBuf {
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap()
            .as_nanos();
        std::env::temp_dir().join(format!("{}-{}-{}.toml", name, std::process::id(), nanos))
    }
    // A missing file must yield the type's Default, not an error.
    #[test]
    fn returns_default_when_file_is_missing() {
        let path = temp_path("photon-config-missing");
        let config: TestConfig = load_toml_config(&path).unwrap();
        assert_eq!(config, TestConfig::default());
    }
    // An existing TOML file is parsed into the target type.
    #[test]
    fn loads_existing_toml_file() {
        let path = temp_path("photon-config-load");
        fs::write(&path, "value = \"hello\"\n").unwrap();
        let config: TestConfig = load_toml_config(&path).unwrap();
        assert_eq!(config.value, "hello");
        // Best-effort cleanup; ignore failure.
        let _ = fs::remove_file(path);
    }
}

Some files were not shown because too many files have changed in this diff Show more