Implement host lifecycle orchestration and distributed storage restructuring

This commit is contained in:
centra 2026-03-27 12:14:12 +09:00
parent a7d5cfa738
commit 6fa172eab1
Signed by: centra
GPG key ID: 0C09689D20B25ACA
124 changed files with 21742 additions and 4016 deletions

552
apigateway/Cargo.lock generated

File diff suppressed because it is too large Load diff

434
chainfire/Cargo.lock generated
View file

@ -342,6 +342,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chainfire-api"
version = "0.1.0"
@ -471,6 +477,7 @@ dependencies = [
"http-body-util",
"metrics",
"metrics-exporter-prometheus",
"reqwest",
"serde",
"serde_json",
"tempfile",
@ -786,6 +793,17 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "dlv-list"
version = "0.3.0"
@ -978,8 +996,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi",
"wasm-bindgen",
]
[[package]]
@ -989,9 +1009,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi",
"wasip2",
"wasm-bindgen",
]
[[package]]
@ -1150,6 +1172,7 @@ dependencies = [
"tokio",
"tokio-rustls",
"tower-service",
"webpki-roots",
]
[[package]]
@ -1171,6 +1194,7 @@ version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-channel",
"futures-core",
@ -1178,7 +1202,9 @@ dependencies = [
"http",
"http-body",
"hyper",
"ipnet",
"libc",
"percent-encoding",
"pin-project-lite",
"socket2 0.6.1",
"tokio",
@ -1210,6 +1236,108 @@ dependencies = [
"cc",
]
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"displaydoc",
"potential_utf",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_normalizer"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
dependencies = [
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
[[package]]
name = "icu_properties"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
dependencies = [
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"displaydoc",
"icu_locale_core",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "idna"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
dependencies = [
"idna_adapter",
"smallvec",
"utf8_iter",
]
[[package]]
name = "idna_adapter"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
dependencies = [
"icu_normalizer",
"icu_properties",
]
[[package]]
name = "indexmap"
version = "1.9.3"
@ -1236,6 +1364,16 @@ version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
[[package]]
name = "iri-string"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "is-terminal"
version = "0.4.17"
@ -1367,6 +1505,12 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
[[package]]
name = "lock_api"
version = "0.4.14"
@ -1382,6 +1526,12 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "lru-slab"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lz4-sys"
version = "1.11.1+lz4-1.10.0"
@ -1730,6 +1880,15 @@ dependencies = [
"serde",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"zerovec",
]
[[package]]
name = "ppv-lite86"
version = "0.2.21"
@ -1889,6 +2048,61 @@ dependencies = [
"winapi",
]
[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustls",
"socket2 0.6.1",
"thiserror 2.0.17",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "quinn-proto"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
dependencies = [
"bytes",
"getrandom 0.3.4",
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
"thiserror 2.0.17",
"tinyvec",
"tracing",
"web-time",
]
[[package]]
name = "quinn-udp"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2 0.6.1",
"tracing",
"windows-sys 0.60.2",
]
[[package]]
name = "quote"
version = "1.0.42"
@ -2030,6 +2244,44 @@ version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "reqwest"
version = "0.12.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-util",
"js-sys",
"log",
"percent-encoding",
"pin-project-lite",
"quinn",
"rustls",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tokio-rustls",
"tower 0.5.2",
"tower-http",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
]
[[package]]
name = "ring"
version = "0.17.14"
@ -2137,6 +2389,7 @@ version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "708c0f9d5f54ba0272468c1d306a52c495b31fa155e91bc25371e6df7996908c"
dependencies = [
"web-time",
"zeroize",
]
@ -2359,6 +2612,12 @@ dependencies = [
"windows-sys 0.60.2",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "strsim"
version = "0.11.1"
@ -2387,6 +2646,20 @@ name = "sync_wrapper"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
dependencies = [
"futures-core",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tempfile"
@ -2450,6 +2723,16 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
@ -2460,6 +2743,21 @@ dependencies = [
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.48.0"
@ -2676,9 +2974,12 @@ checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
dependencies = [
"bitflags 2.10.0",
"bytes",
"futures-util",
"http",
"http-body",
"iri-string",
"pin-project-lite",
"tower 0.5.2",
"tower-layer",
"tower-service",
"tracing",
@ -2788,6 +3089,24 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
@ -2871,6 +3190,19 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c"
dependencies = [
"cfg-if",
"js-sys",
"once_cell",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.106"
@ -2913,6 +3245,25 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "winapi"
version = "0.3.9"
@ -3174,6 +3525,12 @@ version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "yaml-rust"
version = "0.4.5"
@ -3183,6 +3540,29 @@ dependencies = [
"linked-hash-map",
]
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.8.31"
@ -3203,12 +3583,66 @@ dependencies = [
"syn",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zstd-sys"
version = "2.0.16+zstd.1.5.7"

View file

@ -18,11 +18,17 @@ use chainfire_proto::proto::{
StatusRequest,
TxnRequest,
};
use std::time::Duration;
use tonic::Code;
use tonic::transport::Channel;
use tracing::debug;
use tracing::{debug, warn};
/// Chainfire client
pub struct Client {
/// Configured client endpoints
endpoints: Vec<String>,
/// Preferred endpoint index
current_endpoint: usize,
/// gRPC channel
channel: Channel,
/// KV client
@ -34,36 +40,187 @@ pub struct Client {
impl Client {
/// Connect to a Chainfire server
pub async fn connect(addr: impl AsRef<str>) -> Result<Self> {
let addr = addr.as_ref().to_string();
debug!(addr = %addr, "Connecting to Chainfire");
let endpoints = parse_endpoints(addr.as_ref())?;
let mut last_error = None;
let channel = Channel::from_shared(addr)
.map_err(|e| ClientError::Connection(e.to_string()))?
.connect()
.await?;
let kv = KvClient::new(channel.clone());
let cluster = ClusterClient::new(channel.clone());
Ok(Self {
for (index, endpoint) in endpoints.iter().enumerate() {
match connect_endpoint(endpoint).await {
Ok((channel, kv, cluster)) => {
debug!(endpoint = %endpoint, "Connected to Chainfire");
let mut client = Self {
endpoints: endpoints.clone(),
current_endpoint: index,
channel,
kv,
cluster,
})
};
client.promote_leader_endpoint().await?;
return Ok(client);
}
Err(error) => {
warn!(endpoint = %endpoint, error = %error, "Chainfire endpoint connect failed");
last_error = Some(error);
}
}
}
Err(last_error.unwrap_or_else(|| ClientError::Connection("no Chainfire endpoints configured".to_string())))
}
/// Run a KV RPC with retry and endpoint failover.
///
/// `op` receives a freshly cloned `KvClient` on every attempt; because the
/// clone is taken from `self.kv` each time, a reconnect performed between
/// attempts is picked up by the next attempt. Only statuses classified as
/// transient by `is_retryable_status` are retried; anything else (or an
/// exhausted budget) is surfaced to the caller immediately.
async fn with_kv_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
where
    F: FnMut(KvClient<Channel>) -> Fut,
    Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
{
    // Budget: three passes over the endpoint list (at least 3 attempts).
    let max_attempts = self.endpoints.len().max(1) * 3;
    let mut last_status = None;
    for attempt in 0..max_attempts {
        // Clone so `op` owns the client across its await points.
        let client = self.kv.clone();
        match op(client).await {
            Ok(value) => return Ok(value),
            // Transient failure with retry budget remaining.
            Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
                warn!(
                    endpoint = %self.endpoints[self.current_endpoint],
                    code = ?status.code(),
                    message = %status.message(),
                    attempt = attempt + 1,
                    max_attempts,
                    "retrying Chainfire KV RPC on alternate endpoint"
                );
                last_status = Some(status);
                // Fail over (prefer the leader) before the next attempt; a
                // recovery failure aborts the whole retry loop via `?`.
                self.recover_after_status(last_status.as_ref().unwrap()).await?;
                // Exponential backoff between attempts.
                tokio::time::sleep(retry_delay(attempt)).await;
            }
            // Non-retryable status: propagate as-is.
            Err(status) => return Err(status.into()),
        }
    }
    // Budget exhausted: report the last transient status we observed.
    Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire KV retry exhausted")).into())
}
/// Run a cluster-service RPC with retry and endpoint failover.
///
/// Mirrors `with_kv_retry`, but hands `op` a cloned `ClusterClient` instead.
/// The clone is re-read from `self.cluster` per attempt, so a reconnect done
/// during recovery is visible to the following attempt.
async fn with_cluster_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
where
    F: FnMut(ClusterClient<Channel>) -> Fut,
    Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
{
    // Budget: three passes over the endpoint list (at least 3 attempts).
    let max_attempts = self.endpoints.len().max(1) * 3;
    let mut last_status = None;
    for attempt in 0..max_attempts {
        // Clone so `op` owns the client across its await points.
        let client = self.cluster.clone();
        match op(client).await {
            Ok(value) => return Ok(value),
            // Transient failure with retry budget remaining.
            Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
                warn!(
                    endpoint = %self.endpoints[self.current_endpoint],
                    code = ?status.code(),
                    message = %status.message(),
                    attempt = attempt + 1,
                    max_attempts,
                    "retrying Chainfire cluster RPC on alternate endpoint"
                );
                last_status = Some(status);
                // Fail over (prefer the leader) before the next attempt.
                self.recover_after_status(last_status.as_ref().unwrap()).await?;
                // Exponential backoff between attempts.
                tokio::time::sleep(retry_delay(attempt)).await;
            }
            // Non-retryable status: propagate as-is.
            Err(status) => return Err(status.into()),
        }
    }
    // Budget exhausted: report the last transient status we observed.
    Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire cluster retry exhausted")).into())
}
/// After a retryable RPC failure, move this client to a better endpoint.
///
/// Preference order: (1) reconnect to the discovered cluster leader if it
/// differs from the current endpoint; (2) otherwise rotate to the next
/// configured endpoint. Note that when the discovered leader IS the current
/// endpoint, the code still falls through and rotates to the next endpoint.
/// With only a single endpoint configured there is nothing to fail over to,
/// so the triggering status is returned as `ClientError::Rpc`.
async fn recover_after_status(&mut self, status: &tonic::Status) -> Result<()> {
    if let Some(leader_idx) = self.discover_leader_endpoint().await? {
        if leader_idx != self.current_endpoint {
            return self.reconnect_to_index(leader_idx).await;
        }
    }
    if self.endpoints.len() > 1 {
        let next = (self.current_endpoint + 1) % self.endpoints.len();
        if next != self.current_endpoint {
            return self.reconnect_to_index(next).await;
        }
    }
    Err(ClientError::Rpc(status.clone()))
}
/// Re-establish the gRPC connection against `self.endpoints[index]`.
///
/// On success the cached channel and both service clients are replaced and
/// `current_endpoint` is updated; on failure the existing client state is
/// left untouched.
async fn reconnect_to_index(&mut self, index: usize) -> Result<()> {
    let target = match self.endpoints.get(index) {
        Some(endpoint) => endpoint.clone(),
        None => {
            return Err(ClientError::Connection(format!(
                "invalid Chainfire endpoint index {index}"
            )));
        }
    };
    let (channel, kv, cluster) = connect_endpoint(&target).await?;
    self.current_endpoint = index;
    self.channel = channel;
    self.kv = kv;
    self.cluster = cluster;
    Ok(())
}
/// If a cluster leader is discoverable and differs from the endpoint we are
/// currently connected to, reconnect to the leader; otherwise do nothing.
async fn promote_leader_endpoint(&mut self) -> Result<()> {
    match self.discover_leader_endpoint().await? {
        Some(index) if index != self.current_endpoint => self.reconnect_to_index(index).await,
        _ => Ok(()),
    }
}
/// Probe every configured endpoint and return the index of the one that
/// reports itself as the cluster leader (its status `leader` id equals its
/// own header `member_id`). Returns `Ok(None)` when no endpoint claims
/// leadership; individual probe failures are logged and skipped rather than
/// propagated.
async fn discover_leader_endpoint(&self) -> Result<Option<usize>> {
    for (index, endpoint) in self.endpoints.iter().enumerate() {
        // A fresh connection is dialed per probe; the cached channel is
        // not reused here.
        let mut cluster = match ClusterClient::connect(endpoint.clone()).await {
            Ok(client) => client,
            Err(error) => {
                warn!(endpoint = %endpoint, error = %error, "failed to connect while probing Chainfire leader");
                continue;
            }
        };
        match cluster.status(StatusRequest {}).await {
            Ok(response) => {
                let status = response.into_inner();
                // Id of the node we queried; 0 when the header is absent.
                let member_id = status.header.as_ref().map(|header| header.member_id).unwrap_or(0);
                // This endpoint is the leader iff it reports itself as such.
                if status.leader != 0 && status.leader == member_id {
                    return Ok(Some(index));
                }
            }
            Err(status) => {
                warn!(
                    endpoint = %endpoint,
                    code = ?status.code(),
                    message = %status.message(),
                    "failed to query Chainfire leader status"
                );
            }
        }
    }
    Ok(None)
}
/// Put a key-value pair
pub async fn put(&mut self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result<u64> {
let key = key.as_ref().to_vec();
let value = value.as_ref().to_vec();
let resp = self
.kv
.put(PutRequest {
key: key.as_ref().to_vec(),
value: value.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
let value = value.clone();
async move {
kv.put(PutRequest {
key,
value,
lease: 0,
prev_kv: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.header.map(|h| h.revision as u64).unwrap_or(0))
}
@ -86,19 +243,25 @@ impl Client {
&mut self,
key: impl AsRef<[u8]>,
) -> Result<Option<(Vec<u8>, u64)>> {
let key = key.as_ref().to_vec();
let resp = self
.kv
.range(RangeRequest {
key: key.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
async move {
kv.range(RangeRequest {
key,
range_end: vec![],
limit: 1,
revision: 0,
keys_only: false,
count_only: false,
serializable: false, // default: linearizable read
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64)))
}
@ -132,14 +295,20 @@ impl Client {
})),
};
self.kv
.txn(TxnRequest {
self.with_kv_retry(|mut kv| {
let compare = compare.clone();
let put_op = put_op.clone();
async move {
kv.txn(TxnRequest {
compare: vec![compare],
success: vec![put_op],
failure: vec![],
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(())
}
@ -152,15 +321,21 @@ impl Client {
/// Delete a key
pub async fn delete(&mut self, key: impl AsRef<[u8]>) -> Result<bool> {
let key = key.as_ref().to_vec();
let resp = self
.kv
.delete(DeleteRangeRequest {
key: key.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
async move {
kv.delete(DeleteRangeRequest {
key,
range_end: vec![],
prev_kv: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.deleted > 0)
}
@ -171,9 +346,12 @@ impl Client {
let range_end = prefix_end(prefix);
let resp = self
.kv
.range(RangeRequest {
key: prefix.to_vec(),
.with_kv_retry(|mut kv| {
let key = prefix.to_vec();
let range_end = range_end.clone();
async move {
kv.range(RangeRequest {
key,
range_end,
limit: 0,
revision: 0,
@ -181,8 +359,11 @@ impl Client {
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.kvs.into_iter().map(|kv| (kv.key, kv.value)).collect())
}
@ -197,9 +378,12 @@ impl Client {
let range_end = prefix_end(prefix);
let resp = self
.kv
.range(RangeRequest {
key: prefix.to_vec(),
.with_kv_retry(|mut kv| {
let key = prefix.to_vec();
let range_end = range_end.clone();
async move {
kv.range(RangeRequest {
key,
range_end,
limit,
revision: 0,
@ -207,8 +391,11 @@ impl Client {
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
let more = resp.more;
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
@ -238,18 +425,24 @@ impl Client {
limit: i64,
) -> Result<(Vec<(Vec<u8>, Vec<u8>, u64)>, Option<Vec<u8>>)> {
let resp = self
.kv
.range(RangeRequest {
key: start.as_ref().to_vec(),
range_end: end.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = start.as_ref().to_vec();
let range_end = end.as_ref().to_vec();
async move {
kv.range(RangeRequest {
key,
range_end,
limit,
revision: 0,
keys_only: false,
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
let more = resp.more;
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
@ -309,14 +502,21 @@ impl Client {
};
let resp = self
.kv
.txn(TxnRequest {
.with_kv_retry(|mut kv| {
let compare = compare.clone();
let put_op = put_op.clone();
let read_on_fail = read_on_fail.clone();
async move {
kv.txn(TxnRequest {
compare: vec![compare],
success: vec![put_op],
failure: vec![read_on_fail],
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
if resp.succeeded {
let new_version = resp
@ -371,10 +571,13 @@ impl Client {
/// Get cluster status
pub async fn status(&mut self) -> Result<ClusterStatus> {
let resp = self
.cluster
.with_cluster_retry(|mut cluster| async move {
cluster
.status(StatusRequest {})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
})
.await?;
Ok(ClusterStatus {
version: resp.version,
@ -392,15 +595,22 @@ impl Client {
/// # Returns
/// The node ID of the added member
pub async fn member_add(&mut self, node_id: u64, peer_url: impl AsRef<str>, is_learner: bool) -> Result<u64> {
let peer_url = peer_url.as_ref().to_string();
let resp = self
.cluster
.with_cluster_retry(|mut cluster| {
let peer_url = peer_url.clone();
async move {
cluster
.member_add(MemberAddRequest {
node_id,
peer_urls: vec![peer_url.as_ref().to_string()],
peer_urls: vec![peer_url],
is_learner,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
// Extract the member ID from the response
let member_id = resp
@ -410,7 +620,7 @@ impl Client {
debug!(
member_id = member_id,
peer_url = peer_url.as_ref(),
peer_url = peer_url.as_str(),
is_learner = is_learner,
"Added member to cluster"
);
@ -441,6 +651,64 @@ pub struct CasOutcome {
pub new_version: u64,
}
fn parse_endpoints(input: &str) -> Result<Vec<String>> {
let endpoints: Vec<String> = input
.split(',')
.map(str::trim)
.filter(|value| !value.is_empty())
.map(normalize_endpoint)
.collect();
if endpoints.is_empty() {
return Err(ClientError::Connection("no Chainfire endpoints configured".to_string()));
}
Ok(endpoints)
}
/// Ensure an endpoint string carries a URL scheme: values that already
/// contain `://` pass through untouched, bare `host:port` values are
/// prefixed with `http://`.
fn normalize_endpoint(endpoint: &str) -> String {
    if !endpoint.contains("://") {
        return format!("http://{endpoint}");
    }
    endpoint.to_string()
}
/// Dial a single endpoint URL and build both gRPC service clients over the
/// resulting shared channel.
///
/// Returns the raw `Channel` (kept for reconnection bookkeeping) together
/// with the KV and Cluster clients; an unparseable URL is reported as
/// `ClientError::Connection`.
async fn connect_endpoint(endpoint: &str) -> Result<(Channel, KvClient<Channel>, ClusterClient<Channel>)> {
    let channel = Channel::from_shared(endpoint.to_string())
        .map_err(|e| ClientError::Connection(e.to_string()))?
        .connect()
        .await?;
    // Both clients share the same underlying channel.
    let kv = KvClient::new(channel.clone());
    let cluster = ClusterClient::new(channel.clone());
    Ok((channel, kv, cluster))
}
/// Exponential backoff for retry attempts: 200 ms, 400 ms, 800 ms, then
/// capped at 1 s for every later attempt.
fn retry_delay(attempt: usize) -> Duration {
    // Doubling stops after the third attempt; the cap keeps the worst
    // case at one second.
    let shift = if attempt < 3 { attempt as u32 } else { 3 };
    let millis = (200u64 << shift).min(1_000);
    Duration::from_millis(millis)
}
/// Whether a gRPC status should trigger failover/retry: either its code is
/// one of the commonly-transient codes below, or its message matches a known
/// transient-failure pattern (see `retryable_message`).
fn is_retryable_status(status: &tonic::Status) -> bool {
    matches!(
        status.code(),
        Code::Unavailable | Code::DeadlineExceeded | Code::Internal | Code::Aborted | Code::FailedPrecondition
    ) || retryable_message(status.message())
}
/// Case-insensitive check for error-message fragments that indicate a
/// transient, retryable failure: Raft leader churn ("not leader",
/// "leader_id") or transport-level connection drops.
fn retryable_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 8] = [
        "not leader",
        "leader_id",
        "transport error",
        "connection was not ready",
        "deadline has elapsed",
        "broken pipe",
        "connection reset",
        "connection refused",
    ];
    let normalized = message.to_ascii_lowercase();
    TRANSIENT_MARKERS.iter().any(|marker| normalized.contains(marker))
}
/// Calculate prefix end for range queries
fn prefix_end(prefix: &[u8]) -> Vec<u8> {
let mut end = prefix.to_vec();
@ -463,4 +731,30 @@ mod tests {
assert_eq!(prefix_end(b"abc"), b"abd");
assert_eq!(prefix_end(b"/nodes/"), b"/nodes0");
}
// Bare host:port entries gain an http:// scheme; already-schemed URLs pass
// through unchanged.
#[test]
fn normalize_endpoint_adds_http_scheme() {
    assert_eq!(normalize_endpoint("127.0.0.1:2379"), "http://127.0.0.1:2379");
    assert_eq!(normalize_endpoint("http://127.0.0.1:2379"), "http://127.0.0.1:2379");
}

// Comma-separated lists are split, trimmed, and normalized per entry.
#[test]
fn parse_endpoints_accepts_comma_separated_values() {
    let endpoints = parse_endpoints("127.0.0.1:2379, http://127.0.0.2:2379").unwrap();
    assert_eq!(
        endpoints,
        vec![
            "http://127.0.0.1:2379".to_string(),
            "http://127.0.0.2:2379".to_string()
        ]
    );
}

// Message-based retry classification: leader churn and transport drops are
// retryable; authorization-style errors are not.
#[test]
fn retryable_message_covers_not_leader_and_transport() {
    assert!(retryable_message("NotLeader { leader_id: Some(1) }"));
    assert!(retryable_message("transport error"));
    assert!(retryable_message("connection was not ready"));
    assert!(!retryable_message("permission denied"));
}
}

View file

@ -27,17 +27,25 @@ pub struct ClusterServiceImpl {
rpc_client: Arc<crate::GrpcRaftClient>,
/// Cluster ID
cluster_id: u64,
/// Configured members with client and peer URLs
members: Vec<Member>,
/// Server version
version: String,
}
impl ClusterServiceImpl {
/// Create a new cluster service
pub fn new(raft: Arc<RaftCore>, rpc_client: Arc<crate::GrpcRaftClient>, cluster_id: u64) -> Self {
pub fn new(
raft: Arc<RaftCore>,
rpc_client: Arc<crate::GrpcRaftClient>,
cluster_id: u64,
members: Vec<Member>,
) -> Self {
Self {
raft,
rpc_client,
cluster_id,
members,
version: env!("CARGO_PKG_VERSION").to_string(),
}
}
@ -47,16 +55,19 @@ impl ClusterServiceImpl {
}
/// Get current members as proto Member list
/// NOTE: Custom RaftCore doesn't track membership dynamically yet
/// NOTE: Custom RaftCore doesn't track membership dynamically yet, so this returns
/// the configured static membership that the server was booted with.
async fn get_member_list(&self) -> Vec<Member> {
// For now, return only the current node
vec![Member {
if self.members.is_empty() {
return vec![Member {
id: self.raft.node_id(),
name: format!("node-{}", self.raft.node_id()),
peer_urls: vec![],
client_urls: vec![],
is_learner: false,
}]
}];
}
self.members.clone()
}
}

View file

@ -42,6 +42,7 @@ http-body-util = { workspace = true }
uuid = { version = "1.11", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
serde_json = "1.0"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
# Configuration
clap.workspace = true

View file

@ -11,13 +11,14 @@
use axum::{
extract::{Path, Query, State},
http::StatusCode,
routing::{delete, get, post, put},
routing::{get, post},
Json, Router,
};
use chainfire_api::GrpcRaftClient;
use chainfire_raft::RaftCore;
use chainfire_raft::{core::RaftError, RaftCore};
use chainfire_types::command::RaftCommand;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// REST API state
@ -26,16 +27,18 @@ pub struct RestApiState {
pub raft: Arc<RaftCore>,
pub cluster_id: u64,
pub rpc_client: Option<Arc<GrpcRaftClient>>,
pub http_client: reqwest::Client,
pub peer_http_addrs: Arc<HashMap<u64, String>>,
}
/// Standard REST error response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ErrorResponse {
pub error: ErrorDetail,
pub meta: ResponseMeta,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ErrorDetail {
pub code: String,
pub message: String,
@ -43,7 +46,7 @@ pub struct ErrorDetail {
pub details: Option<serde_json::Value>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ResponseMeta {
pub request_id: String,
pub timestamp: String,
@ -59,7 +62,7 @@ impl ResponseMeta {
}
/// Standard REST success response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct SuccessResponse<T> {
pub data: T,
pub meta: ResponseMeta,
@ -75,25 +78,25 @@ impl<T> SuccessResponse<T> {
}
/// KV Put request body
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct PutRequest {
pub value: String,
}
/// KV Get response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct GetResponse {
pub key: String,
pub value: String,
}
/// KV List response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ListResponse {
pub items: Vec<KvItem>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct KvItem {
pub key: String,
pub value: String,
@ -129,6 +132,13 @@ pub struct AddMemberRequestLegacy {
#[derive(Debug, Deserialize)]
pub struct PrefixQuery {
pub prefix: Option<String>,
pub consistency: Option<String>,
}
/// Query parameters for key reads
#[derive(Debug, Default, Deserialize)]
pub struct ReadQuery {
pub consistency: Option<String>,
}
/// Build the REST API router
@ -153,80 +163,11 @@ async fn health_check() -> (StatusCode, Json<SuccessResponse<serde_json::Value>>
)
}
/// GET /api/v1/kv/{key} - Get value
///
/// Reads directly from the local state machine (no leader round-trip).
/// Returns 404 when the key is absent; non-UTF-8 values are re-encoded
/// lossily into the JSON response.
async fn get_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
    let sm = state.raft.state_machine();
    let key_bytes = key.as_bytes().to_vec();
    let results = sm.kv()
        .get(&key_bytes)
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    // `get` can yield multiple matches; a point lookup takes the first.
    let value = results
        .into_iter()
        .next()
        .ok_or_else(|| error_response(StatusCode::NOT_FOUND, "NOT_FOUND", "Key not found"))?;
    Ok(Json(SuccessResponse::new(GetResponse {
        key,
        value: String::from_utf8_lossy(&value.value).to_string(),
    })))
}
/// PUT /api/v1/kv/{key} - Put value
///
/// Replicates the write through Raft; every Raft failure (including not
/// being the leader) is mapped uniformly to 500 INTERNAL_ERROR here.
async fn put_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
    Json(req): Json<PutRequest>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
    let command = RaftCommand::Put {
        key: key.as_bytes().to_vec(),
        value: req.value.as_bytes().to_vec(),
        // No lease attached; previous value is not requested back.
        lease_id: None,
        prev_kv: false,
    };
    state
        .raft
        .client_write(command)
        .await
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    Ok((
        StatusCode::OK,
        Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
    ))
}
/// DELETE /api/v1/kv/{key} - Delete key
///
/// Replicates the delete through Raft; succeeds with 200 regardless of
/// whether the key previously existed (prev_kv is not requested).
async fn delete_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
    let command = RaftCommand::Delete {
        key: key.as_bytes().to_vec(),
        prev_kv: false,
    };
    state
        .raft
        .client_write(command)
        .await
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    Ok((
        StatusCode::OK,
        Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
    ))
}
/// GET /api/v1/kv/*key - Get value (wildcard for all keys)
async fn get_kv_wildcard(
State(state): State<RestApiState>,
Path(key): Path<String>,
Query(query): Query<ReadQuery>,
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
// Use key as-is for simple keys, prepend / for namespaced keys
// Keys like "testkey" stay as "testkey", keys like "flaredb/stores/1" become "/flaredb/stores/1"
@ -235,6 +176,14 @@ async fn get_kv_wildcard(
} else {
key.clone()
};
if should_proxy_read(query.consistency.as_deref(), &state).await {
return proxy_read_to_leader(
&state,
&format!("/api/v1/kv/{}", full_key.trim_start_matches('/')),
None,
)
.await;
}
let sm = state.raft.state_machine();
let key_bytes = full_key.as_bytes().to_vec();
@ -272,11 +221,7 @@ async fn put_kv_wildcard(
prev_kv: false,
};
state
.raft
.client_write(command)
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
submit_rest_write(&state, command, Some(&req), &full_key, reqwest::Method::PUT).await?;
Ok((
StatusCode::OK,
@ -300,11 +245,7 @@ async fn delete_kv_wildcard(
prev_kv: false,
};
state
.raft
.client_write(command)
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
submit_rest_write(&state, command, None, &full_key, reqwest::Method::DELETE).await?;
Ok((
StatusCode::OK,
@ -317,6 +258,13 @@ async fn list_kv(
State(state): State<RestApiState>,
Query(params): Query<PrefixQuery>,
) -> Result<Json<SuccessResponse<ListResponse>>, (StatusCode, Json<ErrorResponse>)> {
if should_proxy_read(params.consistency.as_deref(), &state).await {
let query = params
.prefix
.as_ref()
.map(|prefix| vec![("prefix", prefix.as_str())]);
return proxy_read_to_leader(&state, "/api/v1/kv", query.as_deref()).await;
}
let prefix = params.prefix.unwrap_or_default();
let sm = state.raft.state_machine();
@ -446,3 +394,169 @@ fn error_response(
}),
)
}
/// Apply a Raft write locally, transparently forwarding it to the current
/// leader over HTTP when this node cannot accept writes.
async fn submit_rest_write(
    state: &RestApiState,
    command: RaftCommand,
    body: Option<&PutRequest>,
    key: &str,
    method: reqwest::Method,
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    let write_result = state.raft.client_write(command).await;
    match write_result {
        Ok(()) => Ok(()),
        Err(RaftError::NotLeader { leader_id }) => {
            // Prefer the leader hint carried by the error; otherwise ask the
            // local Raft core for its current view of the leader.
            let leader = if leader_id.is_some() {
                leader_id
            } else {
                state.raft.leader().await
            };
            proxy_write_to_leader(state, leader, key, method, body).await
        }
        Err(other) => Err(error_response(
            StatusCode::INTERNAL_SERVER_ERROR,
            "INTERNAL_ERROR",
            &other.to_string(),
        )),
    }
}
/// Forward a write request to the Raft leader's REST endpoint.
///
/// Requires a known leader with a configured HTTP address; the leader's
/// non-success responses are relayed with their status and error body.
async fn proxy_write_to_leader(
    state: &RestApiState,
    leader_id: Option<u64>,
    key: &str,
    method: reqwest::Method,
    body: Option<&PutRequest>,
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    let leader_id = match leader_id {
        Some(id) => id,
        None => {
            return Err(error_response(
                StatusCode::SERVICE_UNAVAILABLE,
                "NOT_LEADER",
                "current node is not the leader and no leader is known yet",
            ))
        }
    };
    let base = match state.peer_http_addrs.get(&leader_id) {
        Some(addr) => addr,
        None => {
            return Err(error_response(
                StatusCode::SERVICE_UNAVAILABLE,
                "NOT_LEADER",
                &format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
            ))
        }
    };
    let url = format!(
        "{}/api/v1/kv/{}",
        base.trim_end_matches('/'),
        key.trim_start_matches('/')
    );
    let mut builder = state.http_client.request(method, &url);
    if let Some(payload) = body {
        builder = builder.json(payload);
    }
    let response = match builder.send().await {
        Ok(resp) => resp,
        Err(err) => {
            return Err(error_response(
                StatusCode::BAD_GATEWAY,
                "LEADER_PROXY_FAILED",
                &format!("failed to forward write to leader {leader_id}: {err}"),
            ))
        }
    };
    if response.status().is_success() {
        return Ok(());
    }
    // Mirror the leader's status code, falling back to 502 if it cannot be
    // represented, and relay its error body when it parses.
    let status =
        StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
    let payload = match response.json::<ErrorResponse>().await {
        Ok(parsed) => parsed,
        Err(err) => ErrorResponse {
            error: ErrorDetail {
                code: "LEADER_PROXY_FAILED".to_string(),
                message: format!("leader {leader_id} returned {status}: {err}"),
                details: None,
            },
            meta: ResponseMeta::new(),
        },
    };
    Err((status, Json(payload)))
}
/// Decide whether a read arriving at this node must be served by the leader.
async fn should_proxy_read(consistency: Option<&str>, state: &RestApiState) -> bool {
    read_requires_leader_proxy(
        consistency,
        state.raft.node_id(),
        state.raft.leader().await,
    )
}
/// Pure decision helper: a read must be proxied unless the caller explicitly
/// asked for `local` consistency (case-insensitive), this node is itself the
/// leader, or no leader is known (in which case we serve locally).
fn read_requires_leader_proxy(
    consistency: Option<&str>,
    node_id: u64,
    leader_id: Option<u64>,
) -> bool {
    let wants_local = consistency
        .map(|mode| mode.eq_ignore_ascii_case("local"))
        .unwrap_or(false);
    if wants_local {
        return false;
    }
    match leader_id {
        Some(leader) => leader != node_id,
        None => false,
    }
}
/// Forward a read-only request to the leader's REST API and decode its
/// success envelope into `T`.
///
/// Leader error responses are relayed with their status code and body;
/// transport or decoding failures map to 502 `LEADER_PROXY_FAILED`.
async fn proxy_read_to_leader<T>(
    state: &RestApiState,
    path: &str,
    query: Option<&[(&str, &str)]>,
) -> Result<Json<SuccessResponse<T>>, (StatusCode, Json<ErrorResponse>)>
where
    T: for<'de> Deserialize<'de>,
{
    // A leader must be known before anything can be forwarded.
    let leader_id = state.raft.leader().await.ok_or_else(|| {
        error_response(
            StatusCode::SERVICE_UNAVAILABLE,
            "NOT_LEADER",
            "current node is not the leader and no leader is known yet",
        )
    })?;
    let leader_http_addr = state.peer_http_addrs.get(&leader_id).ok_or_else(|| {
        error_response(
            StatusCode::SERVICE_UNAVAILABLE,
            "NOT_LEADER",
            &format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
        )
    })?;
    // `path` is expected to start with '/', so only the base is trimmed.
    let url = format!(
        "{}{}",
        leader_http_addr.trim_end_matches('/'),
        path
    );
    let mut request = state.http_client.get(&url);
    if let Some(query) = query {
        request = request.query(query);
    }
    let response = request.send().await.map_err(|err| {
        error_response(
            StatusCode::BAD_GATEWAY,
            "LEADER_PROXY_FAILED",
            &format!("failed to forward read to leader {leader_id}: {err}"),
        )
    })?;
    if response.status().is_success() {
        let payload = response.json::<SuccessResponse<T>>().await.map_err(|err| {
            error_response(
                StatusCode::BAD_GATEWAY,
                "LEADER_PROXY_FAILED",
                &format!("failed to decode leader {leader_id} response: {err}"),
            )
        })?;
        return Ok(Json(payload));
    }
    // Non-success: mirror the leader's status (falling back to 502 when it
    // cannot be converted) and pass through its error body when it parses.
    let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
    let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
        error: ErrorDetail {
            code: "LEADER_PROXY_FAILED".to_string(),
            message: format!("leader {leader_id} returned {status}: {err}"),
            details: None,
        },
        meta: ResponseMeta::new(),
    });
    Err((status, Json(payload)))
}
#[cfg(test)]
mod tests {
    use super::*;

    // Default consistency routes reads through the leader; "local" opts out,
    // and a node that is the leader (or sees none) serves locally.
    #[test]
    fn read_requires_leader_proxy_defaults_to_leader_consistency() {
        assert!(read_requires_leader_proxy(None, 2, Some(1)));
        assert!(!read_requires_leader_proxy(Some("local"), 2, Some(1)));
        assert!(!read_requires_leader_proxy(None, 2, Some(2)));
        assert!(!read_requires_leader_proxy(None, 2, None));
    }
}

View file

@ -11,10 +11,11 @@ use crate::rest::{build_router, RestApiState};
use anyhow::Result;
use chainfire_api::internal_proto::raft_service_server::RaftServiceServer;
use chainfire_api::proto::{
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer,
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer, Member,
};
use chainfire_api::{ClusterServiceImpl, KvServiceImpl, RaftServiceImpl, WatchServiceImpl};
use chainfire_types::RaftRole;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::signal;
use tonic::transport::{Certificate, Identity, Server as TonicServer, ServerTlsConfig};
@ -109,6 +110,7 @@ impl Server {
Arc::clone(&raft),
rpc_client,
self.node.cluster_id(),
configured_members(&self.config),
);
// Internal Raft service for inter-node communication
@ -166,10 +168,24 @@ impl Server {
// HTTP REST API server
let http_addr = self.config.network.http_addr;
let http_port = self.config.network.http_addr.port();
let peer_http_addrs = Arc::new(
self.config
.cluster
.initial_members
.iter()
.filter_map(|member| {
http_endpoint_from_raft_addr(&member.raft_addr, http_port)
.map(|http_addr| (member.id, http_addr))
})
.collect::<HashMap<_, _>>(),
);
let rest_state = RestApiState {
raft: Arc::clone(&raft),
cluster_id: self.node.cluster_id(),
rpc_client: self.node.rpc_client().cloned(),
http_client: reqwest::Client::new(),
peer_http_addrs,
};
let rest_app = build_router(rest_state);
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
@ -286,3 +302,45 @@ impl Server {
Ok(())
}
}
/// Derive a peer's HTTP base URL from its Raft address, swapping in the
/// HTTP port.
///
/// Accepts either a literal socket address ("10.0.0.1:7000", "[::1]:7000")
/// or a "host:port" pair with a DNS name. Returns `None` when no port
/// separator is present.
fn http_endpoint_from_raft_addr(raft_addr: &str, http_port: u16) -> Option<String> {
    if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
        // Format via SocketAddr so IPv6 addresses keep their brackets;
        // `format!("{}:{}", addr.ip(), port)` would emit the invalid
        // "http://::1:8080" form.
        return Some(format!(
            "http://{}",
            std::net::SocketAddr::new(addr.ip(), http_port)
        ));
    }
    // Hostname fallback: split off the port from the right.
    let (host, _) = raft_addr.rsplit_once(':')?;
    Some(format!("http://{}:{}", host, http_port))
}
/// Derive a peer's gRPC client URL from its Raft address, swapping in the
/// API port. Same shape as `http_endpoint_from_raft_addr`.
fn grpc_endpoint_from_raft_addr(raft_addr: &str, api_port: u16) -> Option<String> {
    if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
        // Format via SocketAddr so IPv6 addresses stay bracketed
        // ("http://[::1]:9000"), which plain `addr.ip()` formatting loses.
        return Some(format!(
            "http://{}",
            std::net::SocketAddr::new(addr.ip(), api_port)
        ));
    }
    let (host, _) = raft_addr.rsplit_once(':')?;
    Some(format!("http://{}:{}", host, api_port))
}
/// Ensure a peer address carries a URL scheme, defaulting to plain HTTP.
fn normalize_peer_url(raft_addr: &str) -> String {
    if raft_addr.contains("://") {
        return raft_addr.to_string();
    }
    format!("http://{raft_addr}")
}
/// Build the static cluster membership list from the configured initial
/// members, deriving per-node peer and client URLs.
fn configured_members(config: &ServerConfig) -> Vec<Member> {
    let api_port = config.network.api_addr.port();
    let mut members = Vec::with_capacity(config.cluster.initial_members.len());
    for member in &config.cluster.initial_members {
        // `grpc_endpoint_from_raft_addr` may yield nothing for malformed
        // addresses, in which case the client URL list stays empty.
        let client_urls: Vec<_> = grpc_endpoint_from_raft_addr(&member.raft_addr, api_port)
            .into_iter()
            .collect();
        members.push(Member {
            id: member.id,
            name: format!("node-{}", member.id),
            peer_urls: vec![normalize_peer_url(&member.raft_addr)],
            client_urls,
            is_learner: false,
        });
    }
    members
}

1114
coronafs/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -24,6 +24,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] }
anyhow = "1.0"
thiserror = "1.0"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
[workspace.lints.rust]
unsafe_code = "deny"

View file

@ -21,7 +21,11 @@ tracing-subscriber = { workspace = true }
anyhow = { workspace = true }
thiserror = { workspace = true }
chrono = { workspace = true }
reqwest = { workspace = true }
futures-util = "0.3"
[dev-dependencies]
tempfile = "3"
[lints]
workspace = true

View file

@ -2,9 +2,40 @@ use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
use std::path::PathBuf;
/// Which API surface(s) this server process runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServerMode {
    /// Both controller and node APIs in one process (the default).
    Combined,
    /// Controller API only (see `ServerConfig::supports_controller_api`).
    Controller,
    /// Node API only (see `ServerConfig::supports_node_api`).
    Node,
}

impl Default for ServerMode {
    fn default() -> Self {
        Self::Combined
    }
}
/// Where metadata is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum MetadataBackend {
    /// Keep metadata on the local filesystem (the default).
    Filesystem,
    /// Keep metadata in a chainfire cluster — presumably reached via
    /// `chainfire_api_url` / `chainfire_key_prefix`; confirm in server wiring.
    Chainfire,
}

impl Default for MetadataBackend {
    fn default() -> Self {
        Self::Filesystem
    }
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ServerConfig {
pub mode: ServerMode,
pub metadata_backend: MetadataBackend,
pub chainfire_api_url: Option<String>,
pub chainfire_key_prefix: String,
pub listen_addr: SocketAddr,
pub advertise_host: String,
pub data_dir: PathBuf,
@ -26,6 +57,10 @@ pub struct ServerConfig {
impl Default for ServerConfig {
fn default() -> Self {
Self {
mode: ServerMode::Combined,
metadata_backend: MetadataBackend::Filesystem,
chainfire_api_url: None,
chainfire_key_prefix: "/coronafs/volumes".to_string(),
listen_addr: "0.0.0.0:50088".parse().expect("valid listen addr"),
advertise_host: "127.0.0.1".to_string(),
data_dir: PathBuf::from("/var/lib/coronafs"),
@ -34,7 +69,7 @@ impl Default for ServerConfig {
export_port_count: 512,
export_shared_clients: 32,
export_cache_mode: "none".to_string(),
export_aio_mode: "io_uring".to_string(),
export_aio_mode: "threads".to_string(),
export_discard_mode: "unmap".to_string(),
export_detect_zeroes_mode: "unmap".to_string(),
preallocate: true,
@ -47,6 +82,14 @@ impl Default for ServerConfig {
}
impl ServerConfig {
pub fn supports_controller_api(&self) -> bool {
matches!(self.mode, ServerMode::Combined | ServerMode::Controller)
}
pub fn supports_node_api(&self) -> bool {
matches!(self.mode, ServerMode::Combined | ServerMode::Node)
}
pub fn volume_dir(&self) -> PathBuf {
self.data_dir.join("volumes")
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,231 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
require_cmd() {
  # Abort early with a clear message when a required binary is not on PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "missing required command: $1" >&2
    exit 1
  fi
}
for cmd in curl qemu-io; do
require_cmd "${cmd}"
done
if ! command -v jq >/dev/null 2>&1 && ! command -v python3 >/dev/null 2>&1; then
echo "missing required command: jq or python3" >&2
exit 1
fi
json_get() {
  # Read JSON on stdin and print the value at a dotted path (e.g. ".export.uri").
  # Uses jq when available, otherwise a small python3 fallback that walks
  # dict keys only (non-dict intermediates resolve to empty output).
  local query="$1"
  if command -v jq >/dev/null 2>&1; then
    jq -r "${query}"
  else
    python3 -c 'import json,sys
data=json.load(sys.stdin)
value=data
for part in sys.argv[1].split("."):
    if not part:
        continue
    value=value.get(part) if isinstance(value, dict) else None
    if value is None:
        break
print("" if value is None else value)
' "${query}"
  fi
}
RUN_ID="${CORONAFS_BENCH_RUN_ID:-$$}"
LISTEN_PORT="${CORONAFS_BENCH_PORT:-$((25088 + (RUN_ID % 1000)))}"
EXPORT_BASE_PORT="${CORONAFS_BENCH_EXPORT_BASE_PORT:-$((26100 + (RUN_ID % 1000)))}"
VOLUME_ID="${CORONAFS_BENCH_VOLUME_ID:-local-bench-${RUN_ID}}"
SIZE_MIB="${CORONAFS_BENCH_SIZE_MIB:-${CORONAFS_BENCH_SIZE_MB:-512}}"
SIZE_BYTES="${CORONAFS_BENCH_SIZE_BYTES:-$((SIZE_MIB * 1024 * 1024))}"
WORKLOAD_MIB="${CORONAFS_BENCH_WORKLOAD_MIB:-${CORONAFS_BENCH_WORKLOAD_MB:-256}}"
EXPORT_CACHE_MODE="${CORONAFS_BENCH_EXPORT_CACHE_MODE:-none}"
EXPORT_AIO_MODE="${CORONAFS_BENCH_EXPORT_AIO_MODE:-threads}"
EXPORT_DISCARD_MODE="${CORONAFS_BENCH_EXPORT_DISCARD_MODE:-ignore}"
EXPORT_DETECT_ZEROES_MODE="${CORONAFS_BENCH_EXPORT_DETECT_ZEROES_MODE:-off}"
SERVER_BIN="${CORONAFS_SERVER_BIN:-}"
if (( WORKLOAD_MIB > SIZE_MIB )); then
echo "workload ${WORKLOAD_MIB} MiB exceeds volume size ${SIZE_MIB} MiB" >&2
exit 1
fi
if [[ -z "${SERVER_BIN}" ]]; then
SERVER_CMD=(
cargo run
--manifest-path "${REPO_ROOT}/coronafs/Cargo.toml"
-p coronafs-server
--
)
else
SERVER_CMD=("${SERVER_BIN}")
fi
TMP_DIR="$(mktemp -d)"
CONFIG_PATH="${TMP_DIR}/coronafs.toml"
SERVER_LOG="${TMP_DIR}/coronafs.log"
SERVER_PID=""
show_server_log() {
  # Dump the tail of the server log to stderr for post-mortem debugging;
  # silently a no-op when the log file was never created.
  [[ -f "${SERVER_LOG}" ]] || return 0
  echo "--- coronafs server log ---" >&2
  tail -n 200 "${SERVER_LOG}" >&2 || true
  echo "--- end coronafs server log ---" >&2
}
delete_volume_if_present() {
  # Best-effort volume removal; failures are ignored (the server may already
  # be gone by the time cleanup runs).
  curl -fsS -X DELETE "http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}" >/dev/null 2>&1 || true
}
cleanup() {
  # Teardown order matters: delete the volume via the API while the server
  # is still up, then kill any leftover export process recorded in the pid
  # file, then the server itself, and finally remove the scratch directory.
  delete_volume_if_present
  local pid_file="${TMP_DIR}/data/pids/${VOLUME_ID}.pid"
  if [[ -f "${pid_file}" ]]; then
    local export_pid=""
    export_pid="$(tr -d '\n' <"${pid_file}" 2>/dev/null || true)"
    # Only signal a pid that is non-empty and still alive.
    if [[ -n "${export_pid}" ]] && kill -0 "${export_pid}" 2>/dev/null; then
      kill "${export_pid}" >/dev/null 2>&1 || true
      wait "${export_pid}" >/dev/null 2>&1 || true
    fi
    rm -f "${pid_file}"
  fi
  if [[ -n "${SERVER_PID}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" >/dev/null 2>&1 || true
    wait "${SERVER_PID}" >/dev/null 2>&1 || true
  fi
  rm -rf "${TMP_DIR}"
}
trap cleanup EXIT
cat >"${CONFIG_PATH}" <<EOF
listen_addr = "127.0.0.1:${LISTEN_PORT}"
advertise_host = "127.0.0.1"
data_dir = "${TMP_DIR}/data"
export_bind_addr = "127.0.0.1"
export_base_port = ${EXPORT_BASE_PORT}
export_port_count = 8
export_shared_clients = 32
export_cache_mode = "${EXPORT_CACHE_MODE}"
export_aio_mode = "${EXPORT_AIO_MODE}"
export_discard_mode = "${EXPORT_DISCARD_MODE}"
export_detect_zeroes_mode = "${EXPORT_DETECT_ZEROES_MODE}"
preallocate = false
sync_on_write = false
log_level = "info"
EOF
"${SERVER_CMD[@]}" --config "${CONFIG_PATH}" >"${SERVER_LOG}" 2>&1 &
SERVER_PID="$!"
deadline=$((SECONDS + 60))
until curl -fsS "http://127.0.0.1:${LISTEN_PORT}/healthz" >/dev/null 2>&1; do
if (( SECONDS >= deadline )); then
echo "timed out waiting for coronafs local bench server" >&2
tail -n 200 "${SERVER_LOG}" >&2 || true
exit 1
fi
sleep 1
done
create_response_file="${TMP_DIR}/create-response.txt"
create_status="$(
curl -sS \
-o "${create_response_file}" \
-w '%{http_code}' \
-X PUT \
-H 'content-type: application/json' \
-d "{\"size_bytes\":${SIZE_BYTES}}" \
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}"
)"
if [[ "${create_status}" -lt 200 || "${create_status}" -ge 300 ]]; then
echo "failed to create CoronaFS benchmark volume: HTTP ${create_status}" >&2
cat "${create_response_file}" >&2 || true
show_server_log
exit 1
fi
export_response_file="${TMP_DIR}/export-response.txt"
export_status="$(
curl -sS \
-o "${export_response_file}" \
-w '%{http_code}' \
-X POST \
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}/export"
)"
if [[ "${export_status}" -lt 200 || "${export_status}" -ge 300 ]]; then
echo "failed to export CoronaFS benchmark volume: HTTP ${export_status}" >&2
cat "${export_response_file}" >&2 || true
show_server_log
exit 1
fi
EXPORT_JSON="$(cat "${export_response_file}")"
EXPORT_URI="$(printf '%s' "${EXPORT_JSON}" | json_get '.export.uri')"
[[ -n "${EXPORT_URI}" && "${EXPORT_URI}" != "null" ]] || {
echo "failed to obtain CoronaFS export URI" >&2
printf '%s\n' "${EXPORT_JSON}" >&2
show_server_log
exit 1
}
run_qemu_io() {
  # Run qemu-io against the exported volume and print the elapsed wall time
  # in nanoseconds. Leading "--" option arguments are passed through to
  # qemu-io as-is; each remaining argument becomes one -c command.
  local extra=()
  local start_ns end_ns elapsed_ns
  local args=("$@")
  local cmd=()
  local qemu_cmd=""
  if [[ "${#args[@]}" -eq 0 ]]; then
    echo "run_qemu_io requires at least one qemu-io command" >&2
    exit 1
  fi
  # Peel option-style arguments off the front before the command list.
  while [[ "${#args[@]}" -gt 0 && "${args[0]}" == --* ]]; do
    extra+=("${args[0]}")
    args=("${args[@]:1}")
  done
  cmd=(qemu-io -f raw "${extra[@]}")
  for qemu_cmd in "${args[@]}"; do
    cmd+=(-c "${qemu_cmd}")
  done
  cmd+=("${EXPORT_URI}")
  # %s%N gives nanosecond resolution (GNU date).
  start_ns="$(date +%s%N)"
  "${cmd[@]}" >/dev/null
  end_ns="$(date +%s%N)"
  elapsed_ns="$((end_ns - start_ns))"
  printf '%s\n' "${elapsed_ns}"
}
calc_mib_per_s() {
  # Convert a byte count and an elapsed time in nanoseconds into MiB/s with
  # two decimals; degenerate (zero/negative) timings report 0.00 instead of
  # dividing by zero.
  local bytes="$1"
  local elapsed_ns="$2"
  awk -v bytes="${bytes}" -v elapsed_ns="${elapsed_ns}" '
    BEGIN {
        if (elapsed_ns <= 0) {
            print "0.00"
        } else {
            printf "%.2f", (bytes / 1048576.0) / (elapsed_ns / 1000000000.0)
        }
    }
'
}
BYTES="$((WORKLOAD_MIB * 1024 * 1024))"
WRITE_NS="$(run_qemu_io "write -P 0x5a 0 ${WORKLOAD_MIB}M" "flush")"
READ_NS="$(run_qemu_io "read -P 0x5a 0 ${WORKLOAD_MIB}M")"
WRITE_MIBPS="$(calc_mib_per_s "${BYTES}" "${WRITE_NS}")"
READ_MIBPS="$(calc_mib_per_s "${BYTES}" "${READ_NS}")"
printf 'CoronaFS local export bench: uri=%s cache=%s aio=%s write=%s MiB/s read=%s MiB/s size=%s MiB\n' \
"${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}" "${WORKLOAD_MIB}"
printf '%s\t%s\t%s\t%s\t%s\n' "${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}"

561
creditservice/Cargo.lock generated

File diff suppressed because it is too large Load diff

525
deployer/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -12,8 +12,11 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
chainfire-client = { path = "../../../chainfire/chainfire-client" }
deployer-types = { path = "../deployer-types" }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
[dev-dependencies]
axum = { version = "0.7", features = ["macros"] }

View file

@ -4,7 +4,12 @@ use std::path::Path;
use anyhow::{Context, Result};
use chainfire_client::{Client, ClientError};
use deployer_types::{ClusterStateSpec, DesiredSystemSpec, InstallPlan, NodeConfig, NodeSpec};
use chrono::Utc;
use deployer_types::{
ClusterNodeRecord, ClusterStateSpec, CommissionState, DesiredSystemSpec, HostDeploymentSpec,
HostDeploymentStatus, InstallPlan, InstallState, NodeConfig, NodeSpec, ObservedSystemState,
PowerState,
};
use serde::de::DeserializeOwned;
use serde_json::{json, Value};
use tokio::fs;
@ -49,6 +54,56 @@ fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str)
.into_bytes()
}
/// Storage key for a node's observed-system state document.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}nodes/{node_id}/observed-system").into_bytes()
}
/// Storage key for a host deployment's desired spec document.
fn key_host_deployment_spec(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}deployments/hosts/{deployment_name}/spec").into_bytes()
}
/// Storage key for a host deployment's observed status document.
fn key_host_deployment_status(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}deployments/hosts/{deployment_name}/status").into_bytes()
}
/// Parse a CLI-supplied commission state into `CommissionState` via its
/// serde representation, so accepted spellings always match what the
/// serializer emits.
fn parse_commission_state(value: &str) -> Result<CommissionState> {
    // Build a JSON string value directly instead of formatting "\"{value}\""
    // by hand; inputs containing quotes or backslashes would otherwise
    // produce malformed JSON and a misleading parse error.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid commission state {value}"))
}
/// Parse a CLI-supplied install state into `InstallState` via its serde
/// representation.
fn parse_install_state(value: &str) -> Result<InstallState> {
    // JSON string built as a value, not via format!("\"{value}\""), so
    // quote/backslash input cannot yield malformed JSON.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid install state {value}"))
}
/// Parse a CLI-supplied power state into `PowerState` via its serde
/// representation.
fn parse_power_state(value: &str) -> Result<PowerState> {
    // JSON string built as a value, not via format!("\"{value}\""), so
    // quote/backslash input cannot yield malformed JSON.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid power state {value}"))
}
fn key_node_class(cluster_namespace: &str, cluster_id: &str, node_class: &str) -> Vec<u8> {
format!(
"{}node-classes/{}",
@ -178,6 +233,9 @@ fn desired_system_from_spec(node: &NodeSpec) -> Option<DesiredSystemSpec> {
if desired.rollback_on_failure.is_none() {
desired.rollback_on_failure = Some(true);
}
if desired.drain_before_apply.is_none() {
desired.drain_before_apply = Some(false);
}
if desired.nixos_configuration.is_some() {
Some(desired)
} else {
@ -322,6 +380,30 @@ async fn merge_existing_node_observed_fields(
if merged.state.is_none() {
merged.state = existing_node.state;
}
if merged.machine_id.is_none() {
merged.machine_id = existing_node.machine_id;
}
if merged.hardware_facts.is_none() {
merged.hardware_facts = existing_node.hardware_facts;
}
if merged.commission_state.is_none() {
merged.commission_state = existing_node.commission_state;
}
if merged.install_state.is_none() {
merged.install_state = existing_node.install_state;
}
if merged.commissioned_at.is_none() {
merged.commissioned_at = existing_node.commissioned_at;
}
if merged.last_inventory_hash.is_none() {
merged.last_inventory_hash = existing_node.last_inventory_hash;
}
if merged.power_state.is_none() {
merged.power_state = existing_node.power_state;
}
if merged.bmc_ref.is_none() {
merged.bmc_ref = existing_node.bmc_ref;
}
if merged.last_heartbeat.is_none() {
merged.last_heartbeat = existing_node.last_heartbeat;
}
@ -521,6 +603,13 @@ pub async fn bootstrap_cluster(
info!(enrollment_rule = %rule.name, "upserted enrollment rule");
}
for deployment in &spec.host_deployments {
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
let value = serde_json::to_vec(deployment)?;
client.put(&key, &value).await?;
info!(deployment = %deployment.name, "upserted host deployment");
}
// 3. Service / Instance (必要であれば)
for svc in &spec.services {
let key = key_service(cluster_namespace, cluster_id, &svc.name);
@ -627,6 +716,11 @@ pub async fn apply_cluster_state(
let value = serde_json::to_vec(rule)?;
client.put(&key, &value).await?;
}
for deployment in &spec.host_deployments {
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
let value = serde_json::to_vec(deployment)?;
client.put(&key, &value).await?;
}
for svc in &spec.services {
let key = key_service(cluster_namespace, cluster_id, &svc.name);
let value = serde_json::to_vec(svc)?;
@ -706,6 +800,421 @@ pub async fn dump_prefix(endpoint: &str, prefix: &str, json_output: bool) -> Res
.await
}
/// Fetch `key` from the store and JSON-decode it into `T`, returning
/// `None` when the key is absent and a contextualized error when the
/// stored bytes do not decode.
async fn get_json_key<T: DeserializeOwned>(client: &mut Client, key: &[u8]) -> Result<Option<T>> {
    match client.get(key).await? {
        None => Ok(None),
        Some(bytes) => {
            let decoded = serde_json::from_slice::<T>(&bytes).with_context(|| {
                format!("failed to decode key {}", String::from_utf8_lossy(key))
            })?;
            Ok(Some(decoded))
        }
    }
}
/// Fetch and print a single node record, optionally joined with its
/// desired-system spec and observed-system state.
///
/// Prints pretty JSON when `json_output` is set, otherwise flat
/// `key=value` lines. Fails when the node record itself is missing; the
/// desired/observed documents are optional.
pub async fn inspect_node(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    include_desired_system: bool,
    include_observed_system: bool,
    json_output: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "inspect node", |endpoint| {
        // The failover closure may run once per endpoint, so each attempt
        // gets owned copies of the borrowed arguments.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let node = get_json_key::<ClusterNodeRecord>(
                &mut client,
                &key_node(&cluster_namespace, &cluster_id, &node_id),
            )
            .await?
            .with_context(|| format!("node {} not found", node_id))?;
            let desired_system = if include_desired_system {
                get_json_key::<DesiredSystemSpec>(
                    &mut client,
                    &key_desired_system(&cluster_namespace, &cluster_id, &node_id),
                )
                .await?
            } else {
                None
            };
            let observed_system = if include_observed_system {
                get_json_key::<ObservedSystemState>(
                    &mut client,
                    &key_observed_system(&cluster_namespace, &cluster_id, &node_id),
                )
                .await?
            } else {
                None
            };
            if json_output {
                println!(
                    "{}",
                    serde_json::to_string_pretty(&json!({
                        "node": node,
                        "desired_system": desired_system,
                        "observed_system": observed_system,
                    }))?
                );
            } else {
                println!("node_id={}", node.node_id);
                println!("hostname={}", node.hostname);
                println!("ip={}", node.ip);
                println!("state={}", node.state.as_deref().unwrap_or("unknown"));
                // Enum states are printed in their serde (JSON) spelling so
                // the text form matches the JSON form.
                println!(
                    "commission_state={}",
                    node.commission_state
                        .map(|value| serde_json::to_string(&value).unwrap_or_default())
                        .unwrap_or_else(|| "\"unknown\"".to_string())
                );
                println!(
                    "install_state={}",
                    node.install_state
                        .map(|value| serde_json::to_string(&value).unwrap_or_default())
                        .unwrap_or_else(|| "\"unknown\"".to_string())
                );
                if let Some(observed_system) = observed_system {
                    println!(
                        "observed_status={}",
                        observed_system.status.unwrap_or_else(|| "unknown".to_string())
                    );
                }
            }
            Ok(())
        }
    })
    .await
}
/// Patch selected lifecycle fields on a stored node record.
///
/// Only arguments passed as `Some(..)` are applied; all other fields of the
/// existing record are preserved. State strings are validated against the
/// serde spelling of their enums before anything is written, and the
/// updated record is echoed as pretty JSON.
pub async fn set_node_states(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    state: Option<String>,
    commission_state: Option<String>,
    install_state: Option<String>,
    power_state: Option<String>,
    bmc_ref: Option<String>,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set node state", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        let state = state.clone();
        let commission_state = commission_state.clone();
        let install_state = install_state.clone();
        let power_state = power_state.clone();
        let bmc_ref = bmc_ref.clone();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let key = key_node(&cluster_namespace, &cluster_id, &node_id);
            let mut node = get_json_key::<ClusterNodeRecord>(&mut client, &key)
                .await?
                .with_context(|| format!("node {} not found", node_id))?;
            if let Some(state) = state {
                node.state = Some(state);
            }
            if let Some(commission_state) = commission_state {
                let parsed = parse_commission_state(&commission_state)?;
                // Stamp only the first transition into Commissioned; later
                // writes keep the original timestamp.
                if matches!(parsed, CommissionState::Commissioned) && node.commissioned_at.is_none()
                {
                    node.commissioned_at = Some(Utc::now());
                }
                node.commission_state = Some(parsed);
            }
            if let Some(install_state) = install_state {
                node.install_state = Some(parse_install_state(&install_state)?);
            }
            if let Some(power_state) = power_state {
                node.power_state = Some(parse_power_state(&power_state)?);
            }
            if let Some(bmc_ref) = bmc_ref {
                node.bmc_ref = Some(bmc_ref);
            }
            client.put(&key, &serde_json::to_vec(&node)?).await?;
            // Echo the final record so callers can confirm the write.
            println!("{}", serde_json::to_string_pretty(&node)?);
            Ok(())
        }
    })
    .await
}
/// Upsert a node's observed-system document, creating it with defaults when
/// absent and patching only the fields supplied as `Some(..)`.
///
/// The record's `node_id` is always overwritten to match the target node,
/// and the final document is echoed as pretty JSON.
#[allow(clippy::too_many_arguments)]
pub async fn set_observed_system(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    status: Option<String>,
    nixos_configuration: Option<String>,
    target_system: Option<String>,
    current_system: Option<String>,
    configured_system: Option<String>,
    booted_system: Option<String>,
    rollback_system: Option<String>,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set observed system", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        let status = status.clone();
        let nixos_configuration = nixos_configuration.clone();
        let target_system = target_system.clone();
        let current_system = current_system.clone();
        let configured_system = configured_system.clone();
        let booted_system = booted_system.clone();
        let rollback_system = rollback_system.clone();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let key = key_observed_system(&cluster_namespace, &cluster_id, &node_id);
            // Missing document => start from defaults for this node.
            let mut observed = get_json_key::<ObservedSystemState>(&mut client, &key)
                .await?
                .unwrap_or_else(|| ObservedSystemState {
                    node_id: node_id.clone(),
                    ..ObservedSystemState::default()
                });
            observed.node_id = node_id.clone();
            if let Some(status) = status {
                observed.status = Some(status);
            }
            if let Some(nixos_configuration) = nixos_configuration {
                observed.nixos_configuration = Some(nixos_configuration);
            }
            if let Some(target_system) = target_system {
                observed.target_system = Some(target_system);
            }
            if let Some(current_system) = current_system {
                observed.current_system = Some(current_system);
            }
            if let Some(configured_system) = configured_system {
                observed.configured_system = Some(configured_system);
            }
            if let Some(booted_system) = booted_system {
                observed.booted_system = Some(booted_system);
            }
            if let Some(rollback_system) = rollback_system {
                observed.rollback_system = Some(rollback_system);
            }
            client.put(&key, &serde_json::to_vec(&observed)?).await?;
            println!("{}", serde_json::to_string_pretty(&observed)?);
            Ok(())
        }
    })
    .await
}
/// Print a host deployment's spec and (optional) status, as JSON or flat
/// text. Fails when the spec is missing; a missing status is simply
/// omitted from the text output.
pub async fn inspect_host_deployment(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
    json_output: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "inspect host deployment", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let deployment_name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let spec = get_json_key::<HostDeploymentSpec>(
                &mut client,
                &key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name),
            )
            .await?
            .with_context(|| format!("host deployment {} not found", deployment_name))?;
            let status = get_json_key::<HostDeploymentStatus>(
                &mut client,
                &key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
            )
            .await?;
            if json_output {
                println!(
                    "{}",
                    serde_json::to_string_pretty(&json!({
                        "spec": spec,
                        "status": status,
                    }))?
                );
            } else {
                println!("name={}", spec.name);
                println!(
                    "nixos_configuration={}",
                    spec.nixos_configuration.as_deref().unwrap_or("unknown")
                );
                if let Some(status) = status {
                    println!("phase={}", status.phase.as_deref().unwrap_or("unknown"));
                    println!("paused={}", status.paused);
                    println!("selected_nodes={}", status.selected_nodes.join(","));
                    println!("completed_nodes={}", status.completed_nodes.join(","));
                    println!("failed_nodes={}", status.failed_nodes.join(","));
                }
            }
            Ok(())
        }
    })
    .await
}
/// Pause or resume a HostDeployment by rewriting its status record with
/// operator-owned pause flags; refuses to act on deployments without a spec.
pub async fn set_host_deployment_paused(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
    paused: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set host deployment pause state", |endpoint| {
        // Own the borrowed arguments so the retried future is self-contained.
        let endpoint = endpoint.to_string();
        let namespace = cluster_namespace.to_string();
        let cluster = cluster_id.to_string();
        let name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            // Never fabricate a status for a deployment that has no spec.
            let spec_key = key_host_deployment_spec(&namespace, &cluster, &name);
            if client.get(&spec_key).await?.is_none() {
                return Err(anyhow::anyhow!("host deployment {} not found", name));
            }
            let status_key = key_host_deployment_status(&namespace, &cluster, &name);
            let existing = get_json_key::<HostDeploymentStatus>(&mut client, &status_key).await?;
            let mut status = match existing {
                Some(status) => status,
                None => HostDeploymentStatus {
                    name: name.clone(),
                    ..HostDeploymentStatus::default()
                },
            };
            status.name = name.clone();
            status.paused_by_operator = paused;
            status.paused = paused;
            let (phase, message) = if paused {
                ("paused", "paused by operator")
            } else {
                ("ready", "resumed by operator")
            };
            status.phase = Some(phase.to_string());
            status.message = Some(message.to_string());
            status.updated_at = Some(Utc::now());
            client.put(&status_key, &serde_json::to_vec(&status)?).await?;
            println!("{}", serde_json::to_string_pretty(&status)?);
            Ok(())
        }
    })
    .await
}
/// Abort a HostDeployment: delete every per-node desired-system record it
/// distributed, flip nodes it left "draining" back to "active", and write an
/// `aborted` + paused status so the controller will not resume the rollout.
pub async fn abort_host_deployment(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "abort host deployment", |endpoint| {
        // Own the borrowed arguments so the retried future is self-contained.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let deployment_name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            // The spec must exist; aborting an unknown deployment is an error.
            let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
            if client.get(&spec_key).await?.is_none() {
                return Err(anyhow::anyhow!(
                    "host deployment {} not found",
                    deployment_name
                ));
            }
            // One snapshot of the whole nodes/ keyspace; both passes below walk
            // this snapshot rather than re-reading (deletes in pass 1 are not
            // observed by pass 2, which only touches non-desired-system keys).
            let node_prefix = format!("{}nodes/", cluster_prefix(&cluster_namespace, &cluster_id));
            let existing = client.get_prefix(node_prefix.as_bytes()).await?;
            // Pass 1: delete desired-system records stamped with this deployment
            // and remember which nodes they belonged to.
            let mut cleared_nodes = Vec::new();
            for (key, value) in &existing {
                let key_str = String::from_utf8_lossy(&key);
                if key_str.ends_with("/desired-system") {
                    // Undecodable records are skipped rather than failing the abort.
                    let Ok(desired) = serde_json::from_slice::<DesiredSystemSpec>(value) else {
                        continue;
                    };
                    if desired.deployment_id.as_deref() == Some(deployment_name.as_str()) {
                        client.delete(&key).await?;
                        // NOTE(review): relies on desired.node_id being populated;
                        // a record with an empty node_id will match no node in pass 2.
                        cleared_nodes.push(desired.node_id.clone());
                    }
                }
            }
            // Pass 2: for each top-level node record (key has no '/' after the
            // prefix), un-drain nodes whose desired-system we just cleared.
            for (key, value) in existing {
                let key_str = String::from_utf8_lossy(&key);
                if key_str.ends_with("/desired-system") {
                    continue;
                }
                let node_suffix = key_str
                    .strip_prefix(&node_prefix)
                    .filter(|suffix| !suffix.contains('/'));
                let Some(node_id) = node_suffix else {
                    continue;
                };
                // Skip records that do not decode as node records.
                let mut node = match serde_json::from_slice::<ClusterNodeRecord>(&value) {
                    Ok(node) => node,
                    Err(_) => continue,
                };
                // Only touch nodes this rollout drained; other states are left alone.
                if cleared_nodes.iter().any(|cleared| cleared == node_id)
                    && node.state.as_deref() == Some("draining")
                {
                    node.state = Some("active".to_string());
                    client.put(&key, &serde_json::to_vec(&node)?).await?;
                }
            }
            // Write a terminal, operator-paused status recording what was undone.
            let status = HostDeploymentStatus {
                name: deployment_name.clone(),
                phase: Some("aborted".to_string()),
                paused: true,
                paused_by_operator: true,
                selected_nodes: Vec::new(),
                completed_nodes: Vec::new(),
                in_progress_nodes: Vec::new(),
                failed_nodes: Vec::new(),
                message: Some(format!(
                    "aborted by operator; cleared desired-system from {} node(s)",
                    cleared_nodes.len()
                )),
                updated_at: Some(Utc::now()),
            };
            client
                .put(
                    &key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
                    &serde_json::to_vec(&status)?,
                )
                .await?;
            println!("{}", serde_json::to_string_pretty(&status)?);
            Ok(())
        }
    })
    .await
}
async fn prune_cluster_state(
client: &mut Client,
cluster_namespace: &str,
@ -762,6 +1271,16 @@ async fn prune_cluster_state(
.to_string(),
);
}
for deployment in &spec.host_deployments {
desired_keys.insert(
String::from_utf8_lossy(&key_host_deployment_spec(
cluster_namespace,
cluster_id,
&deployment.name,
))
.to_string(),
);
}
for svc in &spec.services {
desired_keys.insert(
String::from_utf8_lossy(&key_service(cluster_namespace, cluster_id, &svc.name))
@ -893,11 +1412,18 @@ mod tests {
failure_domain: Some("rack-a".to_string()),
nix_profile: None,
install_plan: None,
hardware_facts: None,
desired_system: None,
state: Some(match NodeState::Pending {
NodeState::Pending => "pending".to_string(),
_ => unreachable!(),
}),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: None,
}],
node_classes: vec![deployer_types::NodeClassSpec {
@ -922,6 +1448,7 @@ mod tests {
labels: HashMap::from([("env".to_string(), "dev".to_string())]),
}],
enrollment_rules: vec![],
host_deployments: vec![],
services: vec![],
instances: vec![],
mtls_policies: vec![],
@ -983,11 +1510,13 @@ mod tests {
let mut spec = test_spec();
spec.nodes[0].desired_system = Some(DesiredSystemSpec {
node_id: String::new(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()],
rollback_on_failure: Some(false),
drain_before_apply: Some(false),
});
let resolved = resolve_nodes(&spec).unwrap();
@ -1012,6 +1541,14 @@ mod tests {
&format!("{}nodes/node01/observed-system", prefix),
&prefix
));
assert!(is_prunable_key(
&format!("{}deployments/hosts/worker-rollout/spec", prefix),
&prefix
));
assert!(!is_prunable_key(
&format!("{}deployments/hosts/worker-rollout/status", prefix),
&prefix
));
}
}
@ -1028,6 +1565,7 @@ fn is_prunable_key(key: &str, prefix: &str) -> bool {
key.starts_with(&format!("{}node-classes/", prefix))
|| key.starts_with(&format!("{}pools/", prefix))
|| key.starts_with(&format!("{}enrollment-rules/", prefix))
|| key.starts_with(&format!("{}deployments/hosts/", prefix)) && key.ends_with("/spec")
|| key.starts_with(&format!("{}services/", prefix))
|| key.starts_with(&format!("{}instances/", prefix))
|| key.starts_with(&format!("{}mtls/policies/", prefix))

View file

@ -5,6 +5,7 @@ use clap::{Parser, Subcommand, ValueEnum};
use tracing_subscriber::EnvFilter;
mod chainfire;
mod power;
mod remote;
/// Deployer control CLI for PhotonCloud.
@ -82,6 +83,132 @@ enum Command {
#[arg(long, default_value = "status")]
action: String,
},
/// ノード単位の inventory / lifecycle 状態を確認・更新する
Node {
#[command(subcommand)]
command: NodeCommand,
},
/// HostDeployment rollout object を確認・操作する
Deployment {
#[command(subcommand)]
command: DeploymentCommand,
},
}
// NOTE(review): the `///` doc comments below surface verbatim as clap help
// text at runtime, so they are deliberately left untranslated; English glosses
// are provided as regular comments.
#[derive(Subcommand, Debug)]
enum NodeCommand {
    // Show a node's record, optionally including its desired/observed system keys.
    /// 指定ノードの記録と関連 state を表示する
    Inspect {
        #[arg(long)]
        node_id: String,
        #[arg(long, default_value_t = false)]
        include_desired_system: bool,
        #[arg(long, default_value_t = false)]
        include_observed_system: bool,
        #[arg(long, value_enum, default_value_t = DumpFormat::Json)]
        format: DumpFormat,
    },
    // Update a node's lifecycle / commissioning / install / power state fields;
    // each flag is optional so only the given fields are changed.
    /// 指定ノードの lifecycle / commissioning 状態を更新する
    SetState {
        #[arg(long)]
        node_id: String,
        #[arg(long, value_enum)]
        state: Option<NodeLifecycleStateArg>,
        #[arg(long, value_enum)]
        commission_state: Option<CommissionStateArg>,
        #[arg(long, value_enum)]
        install_state: Option<InstallStateArg>,
        #[arg(long, value_enum)]
        power_state: Option<PowerStateArg>,
        #[arg(long)]
        bmc_ref: Option<String>,
    },
    // Update fields of a node's observed-system record; unspecified flags keep
    // the stored values.
    /// 指定ノードの observed-system を更新する
    SetObserved {
        #[arg(long)]
        node_id: String,
        #[arg(long)]
        status: Option<String>,
        #[arg(long)]
        nixos_configuration: Option<String>,
        #[arg(long)]
        target_system: Option<String>,
        #[arg(long)]
        current_system: Option<String>,
        #[arg(long)]
        configured_system: Option<String>,
        #[arg(long)]
        booted_system: Option<String>,
        #[arg(long)]
        rollback_system: Option<String>,
    },
    // Execute a BMC power action (on/off/cycle/refresh) against the node.
    /// 指定ノードの電源操作を行う
    Power {
        #[arg(long)]
        node_id: String,
        #[arg(long, value_enum)]
        action: PowerActionArg,
    },
    // Request a reinstall of the node, optionally power-cycling it via the BMC.
    /// 指定ノードに再インストールを要求する
    Reinstall {
        #[arg(long)]
        node_id: String,
        #[arg(long, default_value_t = false)]
        power_cycle: bool,
    },
}
// NOTE(review): the `///` doc comments below are clap help text shown to users
// at runtime, so they are left untranslated; English glosses are in regular
// comments.
#[derive(Subcommand, Debug)]
enum DeploymentCommand {
    // Show a HostDeployment's spec and status.
    /// HostDeployment の spec/status を表示する
    Inspect {
        #[arg(long)]
        name: String,
        #[arg(long, value_enum, default_value_t = DumpFormat::Json)]
        format: DumpFormat,
    },
    // Pause a HostDeployment rollout.
    /// HostDeployment を一時停止する
    Pause {
        #[arg(long)]
        name: String,
    },
    // Resume a previously paused HostDeployment.
    /// HostDeployment を再開する
    Resume {
        #[arg(long)]
        name: String,
    },
    // Abort a HostDeployment and revoke distributed desired-system records.
    /// HostDeployment を中止し、配布済み desired-system を取り消す
    Abort {
        #[arg(long)]
        name: String,
    },
}
#[derive(Clone, Copy, Debug, ValueEnum)]
@ -90,6 +217,103 @@ enum DumpFormat {
Json,
}
/// CLI value for a node's lifecycle state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum NodeLifecycleStateArg {
    Pending,
    Provisioning,
    Active,
    Failed,
    Draining,
}

impl NodeLifecycleStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            NodeLifecycleStateArg::Draining => "draining",
            NodeLifecycleStateArg::Failed => "failed",
            NodeLifecycleStateArg::Active => "active",
            NodeLifecycleStateArg::Provisioning => "provisioning",
            NodeLifecycleStateArg::Pending => "pending",
        }
    }
}
/// CLI value for a node's commissioning state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum CommissionStateArg {
    Discovered,
    Commissioning,
    Commissioned,
}

impl CommissionStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            CommissionStateArg::Commissioned => "commissioned",
            CommissionStateArg::Commissioning => "commissioning",
            CommissionStateArg::Discovered => "discovered",
        }
    }
}
/// CLI value for a node's install state; mirrors the snake_case strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum InstallStateArg {
    Pending,
    Installing,
    Installed,
    Failed,
    ReinstallRequested,
}

impl InstallStateArg {
    /// Canonical snake_case form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            InstallStateArg::ReinstallRequested => "reinstall_requested",
            InstallStateArg::Failed => "failed",
            InstallStateArg::Installed => "installed",
            InstallStateArg::Installing => "installing",
            InstallStateArg::Pending => "pending",
        }
    }
}
/// CLI value for a node's power state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum PowerStateArg {
    On,
    Off,
    Cycling,
    Unknown,
}

impl PowerStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            PowerStateArg::Unknown => "unknown",
            PowerStateArg::Cycling => "cycling",
            PowerStateArg::Off => "off",
            PowerStateArg::On => "on",
        }
    }
}
/// CLI value for a BMC power action; converted to the lowercase action string
/// consumed by the power module.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum PowerActionArg {
    On,
    Off,
    Cycle,
    Refresh,
}

impl PowerActionArg {
    /// Lowercase action string passed through to the power adapter.
    fn as_str(self) -> &'static str {
        match self {
            PowerActionArg::Refresh => "refresh",
            PowerActionArg::Cycle => "cycle",
            PowerActionArg::Off => "off",
            PowerActionArg::On => "on",
        }
    }
}
#[tokio::main]
async fn main() -> Result<()> {
let env_filter =
@ -139,6 +363,149 @@ async fn main() -> Result<()> {
Command::Deployer { endpoint, action } => {
remote::run_deployer_command(&endpoint, &action).await?;
}
Command::Node { command } => {
let cluster_id = cli
.cluster_id
.as_deref()
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for node commands"))?;
match command {
NodeCommand::Inspect {
node_id,
include_desired_system,
include_observed_system,
format,
} => {
chainfire::inspect_node(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
include_desired_system,
include_observed_system,
matches!(format, DumpFormat::Json),
)
.await?;
}
NodeCommand::SetState {
node_id,
state,
commission_state,
install_state,
power_state,
bmc_ref,
} => {
chainfire::set_node_states(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
state.map(|value| value.as_str().to_string()),
commission_state.map(|value| value.as_str().to_string()),
install_state.map(|value| value.as_str().to_string()),
power_state.map(|value| value.as_str().to_string()),
bmc_ref,
)
.await?;
}
NodeCommand::SetObserved {
node_id,
status,
nixos_configuration,
target_system,
current_system,
configured_system,
booted_system,
rollback_system,
} => {
chainfire::set_observed_system(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
status,
nixos_configuration,
target_system,
current_system,
configured_system,
booted_system,
rollback_system,
)
.await?;
}
NodeCommand::Power { node_id, action } => {
power::power_node(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
action.as_str(),
)
.await?;
}
NodeCommand::Reinstall {
node_id,
power_cycle,
} => {
power::request_reinstall(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
power_cycle,
)
.await?;
}
}
}
Command::Deployment { command } => {
let cluster_id = cli
.cluster_id
.as_deref()
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for deployment commands"))?;
match command {
DeploymentCommand::Inspect { name, format } => {
chainfire::inspect_host_deployment(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
matches!(format, DumpFormat::Json),
)
.await?;
}
DeploymentCommand::Pause { name } => {
chainfire::set_host_deployment_paused(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
true,
)
.await?;
}
DeploymentCommand::Resume { name } => {
chainfire::set_host_deployment_paused(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
false,
)
.await?;
}
DeploymentCommand::Abort { name } => {
chainfire::abort_host_deployment(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
)
.await?;
}
}
}
}
Ok(())

View file

@ -0,0 +1,372 @@
use anyhow::{Context, Result};
use chainfire_client::Client;
use deployer_types::{ClusterNodeRecord, InstallState, PowerState};
use reqwest::{Client as HttpClient, Url};
use serde::Deserialize;
use serde_json::json;
/// Key-space prefix for all state of one cluster: `<namespace>/clusters/<id>/`.
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    [cluster_namespace, "/clusters/", cluster_id, "/"].concat()
}
/// ChainFire key of the top-level node record: `<cluster prefix>nodes/<node_id>`.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}").into_bytes()
}
/// ChainFire key of a node's desired-system record.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/desired-system").into_bytes()
}
/// ChainFire key of a node's observed-system record.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/observed-system").into_bytes()
}
/// Split a comma-separated endpoint list, trimming whitespace and dropping
/// empty entries.
fn chainfire_endpoints(raw: &str) -> Vec<String> {
    let mut endpoints = Vec::new();
    for piece in raw.split(',') {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            endpoints.push(trimmed.to_string());
        }
    }
    endpoints
}
/// Power operations the CLI can request against a node's BMC.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PowerAction {
    On,
    Off,
    Cycle,
    Refresh,
}

impl PowerAction {
    /// Parse the CLI-facing action string; anything unrecognized is an error.
    fn parse(value: &str) -> Result<Self> {
        match value {
            "refresh" => Ok(PowerAction::Refresh),
            "cycle" => Ok(PowerAction::Cycle),
            "off" => Ok(PowerAction::Off),
            "on" => Ok(PowerAction::On),
            other => Err(anyhow::anyhow!("unsupported power action {}", other)),
        }
    }

    /// Redfish `ResetType` payload for this action. `None` means the action is
    /// a read-only refresh and no reset request is sent.
    fn reset_type(self) -> Option<&'static str> {
        match self {
            PowerAction::Refresh => None,
            PowerAction::Cycle => Some("PowerCycle"),
            PowerAction::Off => Some("ForceOff"),
            PowerAction::On => Some("On"),
        }
    }
}
/// A single Redfish ComputerSystem endpoint derived from a node's `bmc_ref`.
#[derive(Debug)]
struct RedfishTarget {
    /// Normalized system-resource URL: credentials and query string stripped,
    /// path expanded to a full `/redfish/v1/Systems/...` path.
    resource_url: Url,
    /// HTTP basic-auth user extracted from the reference URL, if any.
    username: Option<String>,
    /// HTTP basic-auth password extracted from the reference URL, if any.
    password: Option<String>,
    /// Accept invalid TLS certificates (set via `?insecure=1|true` on the reference).
    insecure: bool,
}
/// Minimal projection of a Redfish ComputerSystem resource; only the
/// `PowerState` property is consumed.
#[derive(Debug, Deserialize)]
struct RedfishSystemView {
    #[serde(rename = "PowerState")]
    power_state: Option<String>,
}
impl RedfishTarget {
    /// Parse a `redfish://`, `redfish+http://`, or `redfish+https://` BMC
    /// reference into a sanitized target. The bare `redfish://` scheme is
    /// treated as HTTPS. Credentials and the `insecure` query flag are
    /// extracted, then removed from the stored URL so they never leak via
    /// logs or the resource path.
    fn parse(reference: &str) -> Result<Self> {
        let rewritten = if let Some(rest) = reference.strip_prefix("redfish+http://") {
            format!("http://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish+https://") {
            format!("https://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish://") {
            // Default to TLS when the scheme does not say otherwise.
            format!("https://{rest}")
        } else {
            return Err(anyhow::anyhow!(
                "unsupported BMC reference {}; expected redfish:// or redfish+http(s)://",
                reference
            ));
        };
        let mut resource_url = Url::parse(&rewritten)
            .with_context(|| format!("failed to parse BMC reference {}", reference))?;
        // Read insecure flag, credentials, and path BEFORE stripping them below.
        let insecure = resource_url
            .query_pairs()
            .any(|(key, value)| key == "insecure" && (value == "1" || value == "true"));
        let username = if resource_url.username().is_empty() {
            None
        } else {
            Some(resource_url.username().to_string())
        };
        let password = resource_url.password().map(ToOwned::to_owned);
        let system_path = normalize_redfish_system_path(resource_url.path());
        resource_url
            .set_username("")
            .map_err(|_| anyhow::anyhow!("failed to clear username from BMC reference"))?;
        resource_url
            .set_password(None)
            .map_err(|_| anyhow::anyhow!("failed to clear password from BMC reference"))?;
        resource_url.set_query(None);
        resource_url.set_path(&system_path);
        Ok(Self {
            resource_url,
            username,
            password,
            insecure,
        })
    }

    /// URL of the ComputerSystem.Reset action relative to the system resource.
    fn action_url(&self) -> Result<Url> {
        let mut action_url = self.resource_url.clone();
        let path = format!(
            "{}/Actions/ComputerSystem.Reset",
            self.resource_url.path().trim_end_matches('/')
        );
        action_url.set_path(&path);
        Ok(action_url)
    }

    /// Execute a power action. For reset-style actions a ComputerSystem.Reset
    /// POST is sent first; the returned state is then re-read from the BMC,
    /// except for `Cycle`, which is reported as `Cycling` without probing
    /// because the final state is not yet knowable.
    async fn perform(&self, action: PowerAction) -> Result<PowerState> {
        let client = HttpClient::builder()
            .danger_accept_invalid_certs(self.insecure)
            .build()
            .context("failed to create Redfish client")?;
        if let Some(reset_type) = action.reset_type() {
            let request = self
                .with_auth(client.post(self.action_url()?))
                .json(&json!({ "ResetType": reset_type }));
            request
                .send()
                .await
                .context("failed to send Redfish reset request")?
                .error_for_status()
                .context("Redfish reset request failed")?;
        }
        match action {
            PowerAction::Cycle => Ok(PowerState::Cycling),
            PowerAction::On | PowerAction::Off | PowerAction::Refresh => self.refresh(&client).await,
        }
    }

    /// GET the system resource and map its reported `PowerState`.
    /// NOTE(review): after On/Off the BMC may still report the previous state
    /// if the transition has not completed — confirm callers tolerate this.
    async fn refresh(&self, client: &HttpClient) -> Result<PowerState> {
        let response = self
            .with_auth(client.get(self.resource_url.clone()))
            .send()
            .await
            .context("failed to query Redfish system resource")?
            .error_for_status()
            .context("Redfish system query failed")?;
        let system: RedfishSystemView = response
            .json()
            .await
            .context("failed to decode Redfish system response")?;
        map_redfish_power_state(system.power_state.as_deref())
    }

    /// Attach HTTP basic auth when the reference URL carried a username;
    /// requests are sent unauthenticated otherwise.
    fn with_auth(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match self.username.as_deref() {
            Some(username) => request.basic_auth(username, self.password.clone()),
            None => request,
        }
    }
}
/// Expand a user-supplied BMC URL path into a full Redfish ComputerSystem
/// resource path. An empty path falls back to Dell's conventional
/// `System.Embedded.1`; a path already under `/redfish/` is kept verbatim;
/// anything else is treated as a system id under `/redfish/v1/Systems/`.
fn normalize_redfish_system_path(path: &str) -> String {
    let trimmed = path.trim();
    match trimmed {
        "" | "/" => "/redfish/v1/Systems/System.Embedded.1".to_string(),
        full if full.starts_with("/redfish/") => full.to_string(),
        short => format!("/redfish/v1/Systems/{}", short.trim_start_matches('/')),
    }
}
/// Translate a Redfish `PowerState` string into the internal `PowerState`.
///
/// Matching is ASCII-case-insensitive; a missing value is treated as
/// "Unknown". Transitional states collapse to `Cycling`; any other string is
/// an error so callers never persist a state we cannot represent.
fn map_redfish_power_state(value: Option<&str>) -> Result<PowerState> {
    let normalized = value.unwrap_or("Unknown").to_ascii_lowercase();
    if normalized == "on" {
        return Ok(PowerState::On);
    }
    if normalized == "off" {
        return Ok(PowerState::Off);
    }
    if matches!(normalized.as_str(), "poweringon" | "poweringoff" | "cycling") {
        return Ok(PowerState::Cycling);
    }
    if normalized == "unknown" {
        return Ok(PowerState::Unknown);
    }
    Err(anyhow::anyhow!(
        "unsupported Redfish power state {}",
        normalized
    ))
}
async fn load_node_record(
endpoint: &str,
cluster_namespace: &str,
cluster_id: &str,
node_id: &str,
) -> Result<(Client, ClusterNodeRecord, Vec<u8>)> {
let endpoints = chainfire_endpoints(endpoint);
let mut last_error = None;
for endpoint in endpoints {
match Client::connect(endpoint.clone()).await {
Ok(mut client) => {
let key = key_node(cluster_namespace, cluster_id, node_id);
let Some(bytes) = client.get(&key).await? else {
return Err(anyhow::anyhow!("node {} not found", node_id));
};
let node = serde_json::from_slice::<ClusterNodeRecord>(&bytes)
.context("failed to decode node record")?;
return Ok((client, node, key));
}
Err(error) => last_error = Some(anyhow::Error::new(error)),
}
}
Err(last_error.unwrap_or_else(|| anyhow::anyhow!("no Chainfire endpoints configured")))
}
/// Execute a power action against a node's BMC and persist the resulting
/// power state onto the node record.
///
/// Fails if the action string is unknown or the node record has no `bmc_ref`.
/// The updated record is printed as pretty JSON for operator feedback.
pub async fn power_node(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    action: &str,
) -> Result<()> {
    // Validate the action before touching the store or the BMC.
    let action = PowerAction::parse(action)?;
    let (mut client, mut node, key) =
        load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;
    let bmc_ref = node
        .bmc_ref
        .clone()
        .with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
    let target = RedfishTarget::parse(&bmc_ref)?;
    // Perform the Redfish action first; the record is only written on success,
    // so the stored power_state never reflects an undelivered action.
    let power_state = target.perform(action).await?;
    node.power_state = Some(power_state);
    client.put(&key, &serde_json::to_vec(&node)?).await?;
    println!("{}", serde_json::to_string_pretty(&node)?);
    Ok(())
}
/// Request a reinstall of a node: move its lifecycle back to `provisioning`,
/// flag `ReinstallRequested`, optionally power-cycle it via the BMC, and clear
/// its desired-/observed-system records so the installer starts from scratch.
///
/// With `power_cycle` set, a missing `bmc_ref` is an error and the node record
/// is left unwritten.
pub async fn request_reinstall(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    power_cycle: bool,
) -> Result<()> {
    let (mut client, mut node, key) =
        load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;
    node.state = Some("provisioning".to_string());
    node.install_state = Some(InstallState::ReinstallRequested);
    if power_cycle {
        let bmc_ref = node
            .bmc_ref
            .clone()
            .with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
        let target = RedfishTarget::parse(&bmc_ref)?;
        node.power_state = Some(target.perform(PowerAction::Cycle).await?);
    }
    // Persist the updated record first, then drop the per-node system keys so
    // no stale desired/observed state survives the reinstall request.
    client.put(&key, &serde_json::to_vec(&node)?).await?;
    client
        .delete(&key_desired_system(cluster_namespace, cluster_id, node_id))
        .await?;
    client
        .delete(&key_observed_system(cluster_namespace, cluster_id, node_id))
        .await?;
    println!("{}", serde_json::to_string_pretty(&node)?);
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use axum::{extract::State, http::StatusCode, routing::{get, post}, Json, Router};
    use serde_json::Value;
    use std::sync::{Arc, Mutex};
    use tokio::net::TcpListener;

    // Bare `redfish://` references must default to HTTPS and have their path
    // expanded under /redfish/v1/Systems/.
    #[test]
    fn parse_redfish_short_reference_defaults_to_https() {
        let parsed = RedfishTarget::parse("redfish://lab-bmc/node01").unwrap();
        assert_eq!(parsed.resource_url.as_str(), "https://lab-bmc/redfish/v1/Systems/node01");
    }

    // Credentials and the insecure flag are extracted locally and stripped
    // from the stored URL.
    #[test]
    fn parse_redfish_explicit_http_reference_keeps_query_flags_local() {
        let parsed =
            RedfishTarget::parse("redfish+http://user:pass@127.0.0.1/system-1?insecure=1").unwrap();
        assert_eq!(
            parsed.resource_url.as_str(),
            "http://127.0.0.1/redfish/v1/Systems/system-1"
        );
        assert_eq!(parsed.username.as_deref(), Some("user"));
        assert_eq!(parsed.password.as_deref(), Some("pass"));
        assert!(parsed.insecure);
    }

    // End-to-end check against an in-process fake Redfish server: Refresh
    // sends no reset, Off posts a ForceOff payload, and both read back the
    // state the server reports.
    #[tokio::test]
    async fn redfish_adapter_refreshes_and_resets_power() {
        // Records every reset payload the fake BMC receives.
        #[derive(Clone, Default)]
        struct TestState {
            seen_payloads: Arc<Mutex<Vec<String>>>,
        }
        async fn system_handler() -> Json<Value> {
            Json(json!({ "PowerState": "On" }))
        }
        async fn reset_handler(
            State(state): State<TestState>,
            Json(payload): Json<Value>,
        ) -> StatusCode {
            state
                .seen_payloads
                .lock()
                .unwrap()
                .push(payload.to_string());
            StatusCode::NO_CONTENT
        }
        let state = TestState::default();
        let app = Router::new()
            .route("/redfish/v1/Systems/node01", get(system_handler))
            .route(
                "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset",
                post(reset_handler),
            )
            .with_state(state.clone());
        // Bind to an ephemeral port so tests can run in parallel.
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let server = tokio::spawn(async move {
            axum::serve(listener, app).await.unwrap();
        });
        let target = RedfishTarget::parse(&format!(
            "redfish+http://{}/redfish/v1/Systems/node01",
            addr
        ))
        .unwrap();
        assert_eq!(target.perform(PowerAction::Refresh).await.unwrap(), PowerState::On);
        // Off re-reads the state after the reset; the fake always reports On.
        assert_eq!(target.perform(PowerAction::Off).await.unwrap(), PowerState::On);
        let payloads = state.seen_payloads.lock().unwrap().clone();
        assert_eq!(payloads, vec![r#"{"ResetType":"ForceOff"}"#.to_string()]);
        server.abort();
    }
}

View file

@ -29,6 +29,7 @@ tracing-subscriber = { workspace = true }
chrono = { workspace = true }
rcgen = { workspace = true }
clap = { workspace = true }
sha2 = "0.10"
# ChainFire for state management
chainfire-client = { workspace = true }

View file

@ -1,9 +1,11 @@
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
use chrono::Utc;
use deployer_types::{
EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo,
NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse,
CommissionState, EnrollmentRuleSpec, HardwareFacts, InstallPlan, InstallState,
NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState, PhoneHomeRequest,
PhoneHomeResponse, PowerState,
};
use sha2::{Digest, Sha256};
use std::sync::Arc;
use tracing::{debug, error, info, warn};
@ -49,6 +51,14 @@ fn merge_hardware_summary_metadata(
}
}
/// Hex-encoded SHA-256 of the serialized hardware facts, stored as
/// `last_inventory_hash` to detect inventory changes between reports.
///
/// Returns `None` when no facts were supplied or serialization fails (best
/// effort: a missing hash only disables drift detection).
/// NOTE(review): assumes `serde_json::to_vec` is deterministic for
/// `HardwareFacts` (no unordered maps) — confirm, otherwise the hash can flap
/// without a real inventory change.
fn inventory_hash(hardware_facts: Option<&HardwareFacts>) -> Option<String> {
    let hardware_facts = hardware_facts?;
    let payload = serde_json::to_vec(hardware_facts).ok()?;
    let mut hasher = Sha256::new();
    hasher.update(payload);
    Some(format!("{:x}", hasher.finalize()))
}
/// POST /api/v1/phone-home
///
/// Handles node registration during first boot.
@ -794,6 +804,21 @@ async fn store_cluster_node_if_configured(
install_plan: node_config.install_plan.clone(),
hardware_facts: hardware_facts.cloned(),
state: Some(format!("{:?}", node_info.state).to_lowercase()),
commission_state: hardware_facts.map(|_| CommissionState::Discovered),
install_state: node_config.install_plan.as_ref().map(|_| InstallState::Pending),
commissioned_at: None,
last_inventory_hash: inventory_hash(hardware_facts),
power_state: node_info
.metadata
.get("power_state")
.and_then(|value| match value.as_str() {
"on" => Some(PowerState::On),
"off" => Some(PowerState::Off),
"cycling" => Some(PowerState::Cycling),
"unknown" => Some(PowerState::Unknown),
_ => None,
}),
bmc_ref: node_info.metadata.get("bmc_ref").cloned(),
last_heartbeat: Some(node_info.last_heartbeat),
};

View file

@ -24,6 +24,62 @@ impl Default for NodeState {
}
}
/// Commissioning lifecycle for inventory-driven bare-metal onboarding.
///
/// Serialized as lowercase `snake_case` strings (`discovered`, …) wherever it
/// is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommissionState {
    /// Node has been discovered and reported inventory but not yet approved.
    Discovered,
    /// Manual or automated commissioning is actively validating the node.
    Commissioning,
    /// Inventory has been accepted and the node can be installed or rolled out.
    Commissioned,
}

impl Default for CommissionState {
    /// Newly seen nodes start as `Discovered` until explicitly approved.
    fn default() -> Self {
        CommissionState::Discovered
    }
}
/// Installation lifecycle for host provisioning and reprovisioning.
///
/// Serialized as `snake_case` strings (e.g. `reinstall_requested`) wherever it
/// is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InstallState {
    /// No install is currently running, but an install may be planned.
    Pending,
    /// Bootstrap or reinstall is actively writing the target system.
    Installing,
    /// The desired system has been installed successfully.
    Installed,
    /// Installation failed and needs operator or controller intervention.
    Failed,
    /// A reinstall has been requested but not started yet.
    ReinstallRequested,
}

impl Default for InstallState {
    /// Nodes with no recorded install activity default to `Pending`.
    fn default() -> Self {
        InstallState::Pending
    }
}
/// Best-effort power state tracked by external management adapters.
///
/// Serialized as lowercase `snake_case` strings wherever it is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PowerState {
    /// Adapter last reported the node powered on.
    On,
    /// Adapter last reported the node powered off.
    Off,
    /// A power transition (e.g. cycle) is in flight; final state unknown.
    Cycling,
    /// No adapter has reported a state, or the last probe was inconclusive.
    Unknown,
}

impl Default for PowerState {
    /// Absent any adapter report, the state is `Unknown`.
    fn default() -> Self {
        PowerState::Unknown
    }
}
/// Node information tracked by Deployer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeInfo {
@ -492,6 +548,18 @@ pub struct ClusterNodeRecord {
pub hardware_facts: Option<HardwareFacts>,
#[serde(default)]
pub state: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commission_state: Option<CommissionState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub install_state: Option<InstallState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commissioned_at: Option<DateTime<Utc>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_inventory_hash: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub power_state: Option<PowerState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bmc_ref: Option<String>,
#[serde(default)]
pub last_heartbeat: Option<DateTime<Utc>>,
}
@ -534,6 +602,8 @@ pub struct DesiredSystemSpec {
#[serde(default)]
pub node_id: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub deployment_id: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub nixos_configuration: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub flake_ref: Option<String>,
@ -543,6 +613,8 @@ pub struct DesiredSystemSpec {
pub health_check_command: Vec<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rollback_on_failure: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub drain_before_apply: Option<bool>,
}
/// Cluster metadata (PhotonCloud scope).
@ -576,9 +648,23 @@ pub struct NodeSpec {
#[serde(default)]
pub install_plan: Option<InstallPlan>,
#[serde(default)]
pub hardware_facts: Option<HardwareFacts>,
#[serde(default)]
pub desired_system: Option<DesiredSystemSpec>,
#[serde(default)]
pub state: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commission_state: Option<CommissionState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub install_state: Option<InstallState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commissioned_at: Option<DateTime<Utc>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_inventory_hash: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub power_state: Option<PowerState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bmc_ref: Option<String>,
#[serde(default)]
pub last_heartbeat: Option<DateTime<Utc>>,
}
@ -647,6 +733,74 @@ pub struct EnrollmentRuleSpec {
pub node_id_prefix: Option<String>,
}
/// Selector used by host deployments to target bare-metal nodes declaratively.
///
/// All criteria default to empty. NOTE(review): how the criteria combine
/// (AND/OR, empty-means-all) is decided by the rollout controller, which is
/// not visible here — confirm before relying on specific semantics.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentSelector {
    /// Explicit node ids to target.
    #[serde(default)]
    pub node_ids: Vec<String>,
    /// Node roles used for matching.
    #[serde(default)]
    pub roles: Vec<String>,
    /// Node pools used for matching.
    #[serde(default)]
    pub pools: Vec<String>,
    /// Node classes used for matching.
    #[serde(default)]
    pub node_classes: Vec<String>,
    /// Label key/value pairs used for matching.
    #[serde(default)]
    pub match_labels: HashMap<String, String>,
}
/// Declarative rollout intent for host-level NixOS updates.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct HostDeploymentSpec {
    /// Deployment name; also used as the `deployment_id` stamped onto the
    /// per-node `DesiredSystemSpec` records distributed by this rollout.
    pub name: String,
    /// Which nodes this rollout targets.
    #[serde(default)]
    pub selector: HostDeploymentSelector,
    /// NixOS configuration attribute to apply on the selected nodes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub nixos_configuration: Option<String>,
    /// Flake reference the configuration is resolved from.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub flake_ref: Option<String>,
    /// Batching knob — NOTE(review): exact semantics are the controller's; confirm there.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub batch_size: Option<u32>,
    /// Availability knob — NOTE(review): exact semantics are the controller's; confirm there.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_unavailable: Option<u32>,
    /// Command run on each node to judge rollout health; empty means no check.
    #[serde(default)]
    pub health_check_command: Vec<String>,
    /// How the new system is activated (mirrors `DesiredSystemSpec::switch_action`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub switch_action: Option<String>,
    /// Whether failed nodes roll back automatically.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rollback_on_failure: Option<bool>,
    /// Whether nodes are drained before the new system is applied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub drain_before_apply: Option<bool>,
    /// Reboot policy string — NOTE(review): accepted values live in the controller.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reboot_policy: Option<String>,
    /// Spec-level pause intent; distinct from operator pauses recorded on status.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub paused: Option<bool>,
}
/// Controller-observed rollout state for a host deployment.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentStatus {
    /// Deployment name this status belongs to (mirrors the spec's name).
    #[serde(default)]
    pub name: String,
    /// Coarse rollout phase: "idle", "invalid", "paused", "completed",
    /// "running" or "ready".
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase: Option<String>,
    /// Effective pause: operator pause, spec pause, or any failed node.
    #[serde(default)]
    pub paused: bool,
    /// Sticky operator-initiated pause, preserved across reconcile passes.
    #[serde(default)]
    pub paused_by_operator: bool,
    /// All node IDs matched by the deployment selector.
    #[serde(default)]
    pub selected_nodes: Vec<String>,
    /// Nodes the controller considers done for the current target.
    #[serde(default)]
    pub completed_nodes: Vec<String>,
    /// Nodes currently draining or applying the target configuration.
    #[serde(default)]
    pub in_progress_nodes: Vec<String>,
    /// Nodes whose rollout attempt failed; any entry pauses the rollout.
    #[serde(default)]
    pub failed_nodes: Vec<String>,
    /// Human-readable progress summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
    /// Timestamp of the most recent status computation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub updated_at: Option<DateTime<Utc>>,
}
/// Service ports for logical service definitions.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ServicePorts {
@ -807,6 +961,8 @@ pub struct ClusterStateSpec {
#[serde(default)]
pub enrollment_rules: Vec<EnrollmentRuleSpec>,
#[serde(default)]
pub host_deployments: Vec<HostDeploymentSpec>,
#[serde(default)]
pub services: Vec<ServiceSpec>,
#[serde(default)]
pub instances: Vec<ServiceInstanceSpec>,
@ -1080,19 +1236,92 @@ mod tests {
fn test_desired_system_spec_roundtrip() {
    // JSON round-trip: every field of DesiredSystemSpec — including the
    // newer deployment_id / drain_before_apply fields — must survive
    // serialize -> deserialize unchanged.
    let desired = DesiredSystemSpec {
        node_id: "node01".to_string(),
        deployment_id: Some("worker-rollout".to_string()),
        nixos_configuration: Some("node01".to_string()),
        flake_ref: Some("/opt/plasmacloud-src".to_string()),
        switch_action: Some("switch".to_string()),
        health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()],
        rollback_on_failure: Some(true),
        drain_before_apply: Some(true),
    };
    let json = serde_json::to_string(&desired).unwrap();
    let decoded: DesiredSystemSpec = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.node_id, "node01");
    assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout"));
    assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01"));
    assert_eq!(decoded.health_check_command.len(), 2);
    assert_eq!(decoded.rollback_on_failure, Some(true));
    assert_eq!(decoded.drain_before_apply, Some(true));
}
#[test]
fn test_host_deployment_roundtrip() {
    // JSON round-trip for a fully-populated HostDeploymentSpec, including a
    // selector exercising every criterion kind (roles, pools, classes, labels).
    let spec = HostDeploymentSpec {
        name: "worker-rollout".to_string(),
        selector: HostDeploymentSelector {
            node_ids: vec![],
            roles: vec!["worker".to_string()],
            pools: vec!["general".to_string()],
            node_classes: vec!["worker-linux".to_string()],
            match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
        },
        nixos_configuration: Some("worker-golden".to_string()),
        flake_ref: Some("/opt/plasmacloud-src".to_string()),
        batch_size: Some(1),
        max_unavailable: Some(1),
        health_check_command: vec!["true".to_string()],
        switch_action: Some("boot".to_string()),
        rollback_on_failure: Some(true),
        drain_before_apply: Some(true),
        reboot_policy: Some("always".to_string()),
        paused: Some(false),
    };
    let json = serde_json::to_string(&spec).unwrap();
    let decoded: HostDeploymentSpec = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.name, "worker-rollout");
    assert_eq!(decoded.batch_size, Some(1));
    assert_eq!(decoded.max_unavailable, Some(1));
    assert_eq!(decoded.selector.roles, vec!["worker".to_string()]);
    assert_eq!(
        decoded.selector.match_labels.get("tier").map(String::as_str),
        Some("general")
    );
    assert_eq!(decoded.drain_before_apply, Some(true));
}
#[test]
fn test_cluster_node_record_commissioning_roundtrip() {
    // Round-trip the commissioning/lifecycle fields added to
    // ClusterNodeRecord (commission_state, install_state, power_state,
    // bmc_ref, commissioned_at, last_inventory_hash) through JSON.
    let node = ClusterNodeRecord {
        node_id: "node01".to_string(),
        machine_id: Some("machine-01".to_string()),
        ip: "10.0.0.11".to_string(),
        hostname: "node01".to_string(),
        roles: vec!["worker".to_string()],
        labels: HashMap::new(),
        pool: Some("general".to_string()),
        node_class: Some("worker-linux".to_string()),
        failure_domain: Some("rack-a".to_string()),
        nix_profile: Some("profiles/worker-linux".to_string()),
        install_plan: None,
        hardware_facts: None,
        state: Some("provisioning".to_string()),
        commission_state: Some(CommissionState::Commissioned),
        install_state: Some(InstallState::Installed),
        commissioned_at: Some(Utc::now()),
        last_inventory_hash: Some("abc123".to_string()),
        power_state: Some(PowerState::On),
        bmc_ref: Some("redfish://lab-rack-a/node01".to_string()),
        last_heartbeat: Some(Utc::now()),
    };
    let json = serde_json::to_string(&node).unwrap();
    let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.commission_state, Some(CommissionState::Commissioned));
    assert_eq!(decoded.install_state, Some(InstallState::Installed));
    assert_eq!(decoded.power_state, Some(PowerState::On));
    assert_eq!(decoded.bmc_ref.as_deref(), Some("redfish://lab-rack-a/node01"));
}
#[test]

View file

@ -899,6 +899,12 @@ mod tests {
install_plan: None,
hardware_facts: None,
state: Some("active".to_string()),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
}
}

View file

@ -2,6 +2,7 @@ use std::fs;
use std::path::Path;
use std::process::Stdio;
use std::time::Duration;
use std::time::Instant;
use anyhow::{anyhow, Context, Result};
use chainfire_client::Client;
@ -135,7 +136,15 @@ impl Agent {
}
async fn tick(&self) -> Result<()> {
info!(
endpoint = %self.endpoint,
cluster_namespace = %self.cluster_namespace,
cluster_id = %self.cluster_id,
node_id = %self.node_id,
"starting reconciliation tick"
);
let mut client = Client::connect(self.endpoint.clone()).await?;
info!("connected to ChainFire");
let node_key = key_node(&self.cluster_namespace, &self.cluster_id, &self.node_id);
let node_raw = client.get_with_revision(&node_key).await?;
let Some((node_bytes, _revision)) = node_raw else {
@ -149,6 +158,11 @@ impl Agent {
let node: ClusterNodeRecord =
serde_json::from_slice(&node_bytes).context("failed to parse node record")?;
info!(
hostname = %node.hostname,
state = node.state.as_deref().unwrap_or("unknown"),
"loaded node record"
);
let desired = client
.get(key_desired_system(
@ -160,6 +174,11 @@ impl Agent {
.map(|bytes| serde_json::from_slice::<DesiredSystemSpec>(&bytes))
.transpose()
.context("failed to parse desired-system spec")?;
info!(
has_desired_system = desired.is_some(),
has_install_plan = node.install_plan.is_some(),
"resolved desired-state inputs"
);
let previous_observed = client
.get(key_observed_system(
@ -173,24 +192,87 @@ impl Agent {
.context("failed to parse observed-system state")?;
let mut observed = self.base_observed_state(&node);
observed.status = Some("planning".to_string());
info!(
current_system = observed.current_system.as_deref().unwrap_or(""),
configured_system = observed.configured_system.as_deref().unwrap_or(""),
booted_system = observed.booted_system.as_deref().unwrap_or(""),
"publishing planning status"
);
self.publish_observed_state(&mut client, &observed).await?;
let reconcile_result = self
.reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed)
.reconcile_node(
&node,
desired.as_ref(),
previous_observed.as_ref(),
&mut observed,
)
.await;
if let Err(error) = reconcile_result {
observed.status = Some("failed".to_string());
observed.last_error = Some(error.to_string());
observed.last_error = Some(format!("{error:#}"));
}
info!(
status = observed.status.as_deref().unwrap_or("unknown"),
"publishing final observed status"
);
self.publish_observed_state_with_retry(&observed).await?;
Ok(())
}
/// Serialize `observed` and write it to this node's observed-system key
/// using the already-connected `client`.
///
/// Fix: the span contained both sides of an unresolved diff pair for the
/// serialization argument (`&serde_json::to_vec(&observed)?` and
/// `&serde_json::to_vec(observed)?`); only the corrected single-borrow
/// form is kept — `observed` is already a reference.
async fn publish_observed_state(
    &self,
    client: &mut Client,
    observed: &ObservedSystemState,
) -> Result<()> {
    info!(
        status = observed.status.as_deref().unwrap_or("unknown"),
        "writing observed-system state"
    );
    client
        .put(
            &key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id),
            &serde_json::to_vec(observed)?,
        )
        .await?;
    Ok(())
}
/// Publish the observed-system state, retrying with a fresh ChainFire
/// connection on every attempt.
///
/// Serializes once up front, then loops: connect, put, and on failure sleep
/// 2s and retry until a 30-second deadline has passed, after which the last
/// error is returned. Reconnecting each attempt means a connection that went
/// stale earlier in the tick is never reused for this final write.
async fn publish_observed_state_with_retry(
    &self,
    observed: &ObservedSystemState,
) -> Result<()> {
    let payload = serde_json::to_vec(observed)?;
    let key = key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id);
    // Hard deadline for the whole retry loop, not per attempt.
    let deadline = Instant::now() + Duration::from_secs(30);
    let mut attempt = 1u32;
    loop {
        // Fresh connection per attempt; both connect and put errors funnel
        // into the same retry decision below.
        let result = async {
            let mut client = Client::connect(self.endpoint.clone()).await?;
            client.put(&key, &payload).await?;
            Result::<()>::Ok(())
        }
        .await;
        match result {
            Ok(()) => return Ok(()),
            Err(error) if Instant::now() < deadline => {
                warn!(
                    attempt,
                    error = %error,
                    "failed to publish observed-system state; retrying with a fresh connection"
                );
                attempt += 1;
                sleep(Duration::from_secs(2)).await;
            }
            // Deadline exhausted: surface the most recent error.
            Err(error) => return Err(error),
        }
    }
}
fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
ObservedSystemState {
node_id: node.node_id.clone(),
@ -209,7 +291,18 @@ impl Agent {
observed: &mut ObservedSystemState,
) -> Result<()> {
match node.state.as_deref() {
Some("failed") | Some("draining") => {
Some("failed") => {
observed.status = Some("paused".to_string());
return Ok(());
}
Some("draining")
if !desired
.map(|spec| {
spec.deployment_id.is_some()
&& spec.drain_before_apply.unwrap_or(false)
})
.unwrap_or(false) =>
{
observed.status = Some("paused".to_string());
return Ok(());
}
@ -227,6 +320,14 @@ impl Agent {
observed.status = Some("idle".to_string());
return Ok(());
};
info!(
nixos_configuration = %desired.nixos_configuration,
flake_ref = %desired.flake_ref,
switch_action = %desired.switch_action,
rollback_on_failure = desired.rollback_on_failure,
health_check_command = ?desired.health_check_command,
"resolved desired system"
);
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
observed.flake_root = Some(desired.flake_ref.clone());
@ -236,6 +337,10 @@ impl Agent {
.and_then(|state| state.rollback_system.clone())
.or_else(|| observed.current_system.clone());
observed.rollback_system = previous_system.clone();
info!(
previous_system = previous_system.as_deref().unwrap_or(""),
"selected rollback baseline"
);
let target_system = self
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
.await
@ -246,8 +351,10 @@ impl Agent {
)
})?;
observed.target_system = Some(target_system.clone());
info!(target_system = %target_system, "built target system");
if observed.current_system.as_deref() == Some(target_system.as_str()) {
info!("target system already active");
if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
observed.status = Some("verifying".to_string());
observed.last_attempt = Some(Utc::now());
@ -279,8 +386,14 @@ impl Agent {
observed.status = Some("reconciling".to_string());
observed.last_attempt = Some(Utc::now());
info!(
target_system = %target_system,
switch_action = %desired.switch_action,
"switching to target system"
);
self.switch_to_target(&target_system, &desired.switch_action)
.await?;
info!("switch-to-configuration completed");
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
@ -327,15 +440,20 @@ impl Agent {
/// Build the target NixOS system closure for `configuration` from
/// `flake_ref` and return its /nix/store output path.
///
/// Fix: the span carried leftover pre-refactor diff lines (a second
/// `run_command` invocation without the new flags, and the old
/// `.find(|line| !line.trim().is_empty())` predicate alongside the new
/// store-path filter). Only the final form is kept; the mutable
/// `vec![...]` + `push` is also collapsed into one fixed array.
async fn build_target_system(&self, flake_ref: &str, configuration: &str) -> Result<String> {
    let flake_attr = target_flake_attr(flake_ref, configuration);
    info!(flake_attr = %flake_attr, "building target system");
    // -L streams full build logs; --no-write-lock-file keeps the agent
    // from mutating the source checkout.
    let build_args = [
        "build",
        "-L",
        "--no-link",
        "--no-write-lock-file",
        "--print-out-paths",
        flake_attr.as_str(),
    ];
    let output = run_command("nix", &build_args).await?;
    // Only trust lines that are actually store paths; other stdout lines
    // are ignored.
    let path = output
        .lines()
        .map(str::trim)
        .find(|line| line.starts_with("/nix/store/"))
        .ok_or_else(|| anyhow!("nix build returned no output path"))?;
    Ok(path.to_string())
}
@ -349,7 +467,12 @@ impl Agent {
));
}
run_command(
info!(
switch_bin = %switch_bin.display(),
switch_action = %switch_action,
"executing switch-to-configuration"
);
run_command_inherit_output(
switch_bin
.to_str()
.ok_or_else(|| anyhow!("invalid switch path"))?,
@ -369,9 +492,15 @@ impl Agent {
return Ok(HealthCheckOutcome::Passed);
}
info!(
command = ?desired.health_check_command,
rollback_on_failure = desired.rollback_on_failure,
"running post-activation health check"
);
if let Err(error) = run_vec_command(&desired.health_check_command).await {
let error_message = format!("health check failed after activation: {error}");
if desired.rollback_on_failure {
info!("health check failed; rolling back to previous system");
self.rollback_to_previous(previous_system).await?;
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
@ -385,6 +514,7 @@ impl Agent {
return Err(anyhow!(error_message));
}
info!("post-activation health check passed");
Ok(HealthCheckOutcome::Passed)
}
@ -392,7 +522,42 @@ impl Agent {
let previous_system = previous_system
.filter(|value| !value.is_empty())
.ok_or_else(|| anyhow!("rollback requested but no previous system is known"))?;
self.switch_to_target(previous_system, "switch").await
info!(previous_system = %previous_system, "rolling back to previous system");
let switch_bin = Path::new(previous_system).join("bin/switch-to-configuration");
if switch_bin.exists() {
return self.switch_to_target(previous_system, "switch").await;
}
let activate = Path::new(previous_system).join("activate");
if !activate.exists() {
return Err(anyhow!(
"previous system {} does not contain switch-to-configuration or activate",
previous_system
));
}
info!(
previous_system = %previous_system,
activate = %activate.display(),
"previous system lacks switch-to-configuration; falling back to profile set + activate"
);
run_command(
"nix-env",
&[
"--profile",
"/nix/var/nix/profiles/system",
"--set",
previous_system,
],
)
.await?;
run_command_inherit_output(
activate
.to_str()
.ok_or_else(|| anyhow!("invalid activate path"))?,
&[],
)
.await
}
}
@ -458,6 +623,8 @@ fn read_symlink_target(path: &str) -> Option<String> {
}
async fn run_command(program: &str, args: &[&str]) -> Result<String> {
let started_at = Instant::now();
info!(program = %program, args = ?args, "running command");
let output = Command::new(program)
.args(args)
.stdin(Stdio::null())
@ -468,10 +635,25 @@ async fn run_command(program: &str, args: &[&str]) -> Result<String> {
.with_context(|| format!("failed to execute {}", program))?;
if output.status.success() {
info!(
program = %program,
args = ?args,
elapsed_ms = started_at.elapsed().as_millis(),
"command completed successfully"
);
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
} else {
let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
warn!(
program = %program,
args = ?args,
elapsed_ms = started_at.elapsed().as_millis(),
status = %output.status,
stdout = %stdout,
stderr = %stderr,
"command failed"
);
Err(anyhow!(
"{} {:?} failed with status {}: stdout='{}' stderr='{}'",
program,
@ -491,6 +673,47 @@ async fn run_vec_command(command: &[String]) -> Result<String> {
run_command(program, &arg_refs).await
}
/// Run `program` with `args`, letting stdout/stderr stream straight to the
/// agent's own output (used for long-running tools like
/// switch-to-configuration, whose logs should be visible live).
///
/// Returns Ok(()) on a zero exit status; otherwise an error carrying the
/// program, its arguments, and the exit status.
async fn run_command_inherit_output(program: &str, args: &[&str]) -> Result<()> {
    let started_at = Instant::now();
    info!(
        program = %program,
        args = ?args,
        "running command with inherited output"
    );
    let status = Command::new(program)
        .args(args)
        .stdin(Stdio::null())
        .stdout(Stdio::inherit())
        .stderr(Stdio::inherit())
        .status()
        .await
        .with_context(|| format!("failed to execute {}", program))?;
    // Guard clause: report and bail on a non-zero exit status.
    if !status.success() {
        warn!(
            program = %program,
            args = ?args,
            elapsed_ms = started_at.elapsed().as_millis(),
            status = %status,
            "command failed"
        );
        return Err(anyhow!(
            "{} {:?} failed with status {}",
            program,
            args,
            status
        ));
    }
    info!(
        program = %program,
        args = ?args,
        elapsed_ms = started_at.elapsed().as_millis(),
        "command completed successfully"
    );
    Ok(())
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
@ -543,6 +766,12 @@ mod tests {
}),
hardware_facts: None,
state: Some("active".to_string()),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: None,
}
}
@ -568,11 +797,13 @@ mod tests {
fn resolve_desired_system_prefers_chainfire_spec() {
let desired = DesiredSystemSpec {
node_id: "node01".to_string(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()],
rollback_on_failure: Some(true),
drain_before_apply: Some(false),
};
let resolved = resolve_desired_system(
@ -595,11 +826,13 @@ mod tests {
fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
let desired = DesiredSystemSpec {
node_id: "node01".to_string(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: None,
switch_action: None,
health_check_command: Vec::new(),
rollback_on_failure: None,
drain_before_apply: None,
};
let resolved = resolve_desired_system(
@ -631,7 +864,10 @@ mod tests {
#[test]
fn read_symlink_target_returns_none_for_missing_path() {
assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None);
assert_eq!(
read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"),
None
);
}
#[test]

View file

@ -9,6 +9,8 @@ repository.workspace = true
[dependencies]
anyhow.workspace = true
chainfire-client.workspace = true
chrono.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
@ -16,5 +18,6 @@ tracing.workspace = true
tracing-subscriber.workspace = true
fiberlb-api.workspace = true
flashdns-api.workspace = true
deployer-types.workspace = true
clap = { version = "4.5", features = ["derive"] }
tonic = "0.12"

View file

@ -0,0 +1,823 @@
use anyhow::Result;
use chainfire_client::Client;
use chrono::Utc;
use clap::Args;
use deployer_types::{
ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector,
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState, ServiceInstanceSpec,
};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::time::Duration;
use tokio::time::sleep;
use tracing::{info, warn};
/// Root key prefix for all state owned by one cluster:
/// `<namespace>/clusters/<cluster-id>/` (note the trailing slash).
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    let mut prefix = String::new();
    prefix.push_str(cluster_namespace);
    prefix.push_str("/clusters/");
    prefix.push_str(cluster_id);
    prefix.push('/');
    prefix
}
/// Storage key for a node record:
/// `<namespace>/clusters/<cluster-id>/nodes/<node-id>`.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}").into_bytes()
}
/// Storage key for a node's desired-system spec:
/// `<namespace>/clusters/<cluster-id>/nodes/<node-id>/desired-system`.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/desired-system")
        .into_bytes()
}
/// Storage key for a host deployment's controller status:
/// `<namespace>/clusters/<cluster-id>/deployments/hosts/<name>/status`.
fn key_host_deployment_status(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    format!(
        "{cluster_namespace}/clusters/{cluster_id}/deployments/hosts/{deployment_name}/status"
    )
    .into_bytes()
}
// CLI arguments for the host-deployment controller. Plain `//` comments are
// used deliberately: `///` doc comments on clap Args fields would be turned
// into --help text and change the CLI's output.
#[derive(Debug, Clone, Args)]
pub struct HostsCommand {
    // ChainFire endpoint the controller connects to on every reconcile pass.
    #[arg(long)]
    pub endpoint: String,
    // Key namespace shared by all clusters.
    #[arg(long, default_value = "photoncloud")]
    pub cluster_namespace: String,
    // Cluster whose nodes and host deployments this controller manages.
    #[arg(long)]
    pub cluster_id: String,
    // Seconds to sleep between reconcile passes (unused with --once).
    #[arg(long, default_value_t = 15)]
    pub interval_secs: u64,
    // Nodes whose last heartbeat is older than this are excluded as rollout
    // candidates; 0 disables the heartbeat check entirely.
    #[arg(long, default_value_t = 300)]
    pub heartbeat_timeout_secs: u64,
    // Log the computed plan without writing anything back to the store.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,
    // Run a single reconcile pass and exit instead of looping forever.
    #[arg(long, default_value_t = false)]
    pub once: bool,
}
/// Entry point for the `hosts` subcommand: drive the host-deployment
/// controller either once (`--once`) or as a periodic reconcile loop.
///
/// In loop mode a failed pass is logged and retried after the configured
/// interval; the loop never returns.
pub async fn run(command: HostsCommand) -> Result<()> {
    let controller = HostDeploymentController::new(command);
    if controller.once {
        controller.reconcile_once().await
    } else {
        loop {
            if let Err(error) = controller.reconcile_once().await {
                warn!(error = %error, "host deployment reconciliation failed");
            }
            sleep(controller.interval).await;
        }
    }
}
/// Runtime configuration for the host-deployment reconcile loop,
/// materialized from [`HostsCommand`] by `new`.
struct HostDeploymentController {
    // ChainFire endpoint used for every (re)connection.
    endpoint: String,
    // Key namespace and cluster that scope all reads and writes.
    cluster_namespace: String,
    cluster_id: String,
    // Delay between reconcile passes in loop mode.
    interval: Duration,
    // Heartbeat staleness cutoff for rollout candidates (0 = disabled).
    heartbeat_timeout_secs: u64,
    // When set, plans are logged but never written back.
    dry_run: bool,
    // When set, `run` performs a single pass and exits.
    once: bool,
}
impl HostDeploymentController {
    /// Build a controller from parsed CLI arguments.
    fn new(command: HostsCommand) -> Self {
        Self {
            endpoint: command.endpoint,
            cluster_namespace: command.cluster_namespace,
            cluster_id: command.cluster_id,
            interval: Duration::from_secs(command.interval_secs),
            heartbeat_timeout_secs: command.heartbeat_timeout_secs,
            dry_run: command.dry_run,
            once: command.once,
        }
    }

    /// One full reconcile pass: snapshot all inputs from ChainFire, compute
    /// a plan per host deployment, then (unless dry-run) write the plan's
    /// desired-system upserts/deletes, node updates, and status back.
    async fn reconcile_once(&self) -> Result<()> {
        let mut client = Client::connect(self.endpoint.clone()).await?;
        // Snapshot every input before planning so all deployments in this
        // pass see the same view of the cluster.
        let nodes = self.load_nodes(&mut client).await?;
        let desired_systems = self.load_desired_systems(&mut client).await?;
        let observed_systems = self.load_observed_systems(&mut client).await?;
        let instances = self.load_instances(&mut client).await?;
        let deployments = self.load_host_deployments(&mut client).await?;
        let statuses = self.load_host_deployment_statuses(&mut client).await?;
        info!(
            nodes = nodes.len(),
            deployments = deployments.len(),
            instances = instances.len(),
            "loaded host deployment inputs"
        );
        for deployment in deployments {
            let existing_status = statuses.get(&deployment.name).cloned();
            // Pure planning step; all store writes happen below.
            let plan = plan_host_deployment(
                &deployment,
                existing_status.as_ref(),
                &nodes,
                &desired_systems,
                &observed_systems,
                &instances,
                self.heartbeat_timeout_secs,
            );
            if self.dry_run {
                info!(
                    deployment = %deployment.name,
                    phase = plan.status.phase.as_deref().unwrap_or("unknown"),
                    desired_upserts = plan.desired_upserts.len(),
                    desired_deletes = plan.desired_deletes.len(),
                    node_updates = plan.node_updates.len(),
                    "would reconcile host deployment"
                );
                continue;
            }
            // Write per-node desired-system specs for newly started nodes.
            for desired in &plan.desired_upserts {
                client
                    .put(
                        &key_desired_system(
                            &self.cluster_namespace,
                            &self.cluster_id,
                            &desired.node_id,
                        ),
                        &serde_json::to_vec(desired)?,
                    )
                    .await?;
            }
            // Remove desired-system specs for nodes no longer selected.
            for node_id in &plan.desired_deletes {
                client
                    .delete(&key_desired_system(
                        &self.cluster_namespace,
                        &self.cluster_id,
                        node_id,
                    ))
                    .await?;
            }
            // Persist node state transitions (e.g. active <-> draining).
            for node in plan.node_updates.values() {
                client
                    .put(
                        &key_node(&self.cluster_namespace, &self.cluster_id, &node.node_id),
                        &serde_json::to_vec(node)?,
                    )
                    .await?;
            }
            // Status is written last, after the plan's writes have landed.
            client
                .put(
                    &key_host_deployment_status(
                        &self.cluster_namespace,
                        &self.cluster_id,
                        &deployment.name,
                    ),
                    &serde_json::to_vec(&plan.status)?,
                )
                .await?;
        }
        Ok(())
    }

    /// Load every node record under `nodes/`, skipping sub-keys (anything
    /// with a '/' after the node id, e.g. desired-system/observed-system)
    /// and logging, not failing on, undecodable entries. Sorted by node_id.
    async fn load_nodes(&self, client: &mut Client) -> Result<Vec<ClusterNodeRecord>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut nodes = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            let Some(suffix) = key.strip_prefix(&prefix) else {
                continue;
            };
            // Only bare `nodes/<id>` keys are node records.
            if suffix.contains('/') {
                continue;
            }
            match serde_json::from_slice::<ClusterNodeRecord>(&value) {
                Ok(node) => nodes.push(node),
                Err(error) => warn!(error = %error, key = %key, "failed to decode cluster node"),
            }
        }
        nodes.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
        Ok(nodes)
    }

    /// Load all `nodes/<id>/desired-system` specs, keyed by the node_id
    /// embedded in each spec. Undecodable entries are logged and skipped.
    async fn load_desired_systems(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, DesiredSystemSpec>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut desired = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/desired-system") {
                continue;
            }
            match serde_json::from_slice::<DesiredSystemSpec>(&value) {
                Ok(spec) => {
                    desired.insert(spec.node_id.clone(), spec);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode desired-system"),
            }
        }
        Ok(desired)
    }

    /// Load all `nodes/<id>/observed-system` states, keyed by node_id.
    /// Undecodable entries are logged and skipped.
    async fn load_observed_systems(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, ObservedSystemState>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut observed = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/observed-system") {
                continue;
            }
            match serde_json::from_slice::<ObservedSystemState>(&value) {
                Ok(state) => {
                    observed.insert(state.node_id.clone(), state);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode observed-system"),
            }
        }
        Ok(observed)
    }

    /// Load every service instance under `instances/` (used to decide when
    /// a draining node is empty). Undecodable entries are logged and skipped.
    async fn load_instances(&self, client: &mut Client) -> Result<Vec<ServiceInstanceSpec>> {
        let prefix = format!(
            "{}instances/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut instances = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
                Ok(instance) => instances.push(instance),
                Err(error) => warn!(error = %error, key = %key, "failed to decode service instance"),
            }
        }
        Ok(instances)
    }

    /// Load all host deployment specs (`deployments/hosts/<name>/spec`),
    /// sorted by name for deterministic reconcile order.
    async fn load_host_deployments(&self, client: &mut Client) -> Result<Vec<HostDeploymentSpec>> {
        let prefix = format!(
            "{}deployments/hosts/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut deployments = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/spec") {
                continue;
            }
            match serde_json::from_slice::<HostDeploymentSpec>(&value) {
                Ok(spec) => deployments.push(spec),
                Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment"),
            }
        }
        deployments.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
        Ok(deployments)
    }

    /// Load previously written deployment statuses
    /// (`deployments/hosts/<name>/status`), keyed by deployment name.
    async fn load_host_deployment_statuses(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, HostDeploymentStatus>> {
        let prefix = format!(
            "{}deployments/hosts/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut statuses = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/status") {
                continue;
            }
            match serde_json::from_slice::<HostDeploymentStatus>(&value) {
                Ok(status) => {
                    statuses.insert(status.name.clone(), status);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment status"),
            }
        }
        Ok(statuses)
    }
}
/// Output of `plan_host_deployment`: everything `reconcile_once` must write
/// back to the store for one deployment.
#[derive(Debug, Default)]
struct HostDeploymentPlan {
    // Freshly computed status to persist under the deployment's status key.
    status: HostDeploymentStatus,
    // Per-node desired-system specs to create/overwrite (newly started nodes).
    desired_upserts: Vec<DesiredSystemSpec>,
    // Node IDs whose desired-system spec (owned by this deployment) should
    // be deleted because the node is no longer selected.
    desired_deletes: Vec<String>,
    // Node records to rewrite (state transitions); BTreeMap keyed by
    // node_id gives a deterministic write order.
    node_updates: BTreeMap<String, ClusterNodeRecord>,
}
/// Compute one deployment's rollout plan from a consistent snapshot of the
/// cluster. Pure: no I/O, all effects are returned in the plan.
///
/// Nodes matched by the selector are classified as completed / failed /
/// in-progress / eligible; new rollouts start only while within both the
/// batch budget (`batch_size` - in_progress) and the availability budget
/// (`max_unavailable` - in_progress - failed), and only while the
/// deployment is not paused (operator pause, spec pause, or any failure).
fn plan_host_deployment(
    deployment: &HostDeploymentSpec,
    existing_status: Option<&HostDeploymentStatus>,
    nodes: &[ClusterNodeRecord],
    desired_systems: &HashMap<String, DesiredSystemSpec>,
    observed_systems: &HashMap<String, ObservedSystemState>,
    instances: &[ServiceInstanceSpec],
    heartbeat_timeout_secs: u64,
) -> HostDeploymentPlan {
    let now = Utc::now();
    let target_configuration = deployment.nixos_configuration.clone();
    let selector_matches = select_nodes(nodes, &deployment.selector);
    let selected_node_ids = selector_matches
        .iter()
        .map(|node| node.node_id.clone())
        .collect::<HashSet<_>>();
    let instance_counts = active_instances_per_node(instances);
    let mut completed = Vec::new();
    let mut in_progress = Vec::new();
    let mut failed = Vec::new();
    let mut eligible_candidates = Vec::new();
    let mut desired_upserts = Vec::new();
    let mut node_updates = BTreeMap::new();
    // Budgets are clamped to at least 1 so a zero in the spec cannot wedge
    // the rollout forever.
    let batch_size = deployment.batch_size.unwrap_or(1).max(1) as usize;
    let max_unavailable = deployment.max_unavailable.unwrap_or(1).max(1) as usize;
    // Operator pause is sticky: carried over from the previous status.
    let operator_paused = existing_status
        .map(|status| status.paused_by_operator)
        .unwrap_or(false);
    let spec_paused = deployment.paused.unwrap_or(false);
    // Garbage-collect desired-system specs this deployment owns for nodes
    // that the selector no longer matches.
    let mut desired_deletes = desired_systems
        .iter()
        .filter(|(node_id, desired)| {
            desired.deployment_id.as_deref() == Some(deployment.name.as_str())
                && !selected_node_ids.contains(node_id.as_str())
        })
        .map(|(node_id, _)| node_id.clone())
        .collect::<Vec<_>>();
    // Classify every selected node exactly once, in priority order:
    // completed > failed > in-progress > eligible candidate.
    for node in &selector_matches {
        let desired = desired_systems.get(&node.node_id);
        let observed = observed_systems.get(&node.node_id);
        let is_completed =
            is_node_completed(deployment, node, desired, observed, target_configuration.as_deref());
        let is_failed = is_node_failed(deployment, desired, observed);
        // A node still draining with live instances also counts as
        // in-progress, even before its agent reports anything.
        let is_in_progress = is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
            || (deployment.drain_before_apply == Some(true)
                && node.state.as_deref() == Some("draining")
                && instance_counts.get(&node.node_id).copied().unwrap_or_default() > 0);
        if is_completed {
            completed.push(node.node_id.clone());
            // Un-drain nodes that finished while we had them draining.
            if deployment.drain_before_apply == Some(true) && node.state.as_deref() == Some("draining")
            {
                let mut updated = (*node).clone();
                updated.state = Some("active".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
            }
            continue;
        }
        if is_failed {
            failed.push(node.node_id.clone());
            continue;
        }
        if is_in_progress {
            in_progress.push(node.node_id.clone());
            continue;
        }
        if node_is_rollout_candidate(node, heartbeat_timeout_secs) {
            eligible_candidates.push((*node).clone());
        }
    }
    let unavailable = in_progress.len() + failed.len();
    // Any failed node pauses further starts until resolved.
    let paused = operator_paused || spec_paused || !failed.is_empty();
    let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable);
    let remaining_batch_budget = batch_size.saturating_sub(in_progress.len());
    // A deployment without a target configuration can never start nodes.
    let max_starts = if deployment.nixos_configuration.is_some() {
        remaining_unavailable_budget.min(remaining_batch_budget)
    } else {
        0
    };
    let mut planned = 0usize;
    let mut newly_started = Vec::new();
    if !paused && max_starts > 0 {
        for node in eligible_candidates {
            if planned >= max_starts {
                break;
            }
            let remaining_instances = instance_counts.get(&node.node_id).copied().unwrap_or_default();
            // Drain-first: mark the node draining and wait; its
            // desired-system spec is written on a later pass, once the node
            // is empty.
            if deployment.drain_before_apply == Some(true) && remaining_instances > 0 {
                let mut updated = node.clone();
                updated.state = Some("draining".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
                in_progress.push(node.node_id.clone());
                newly_started.push(node.node_id.clone());
                planned += 1;
                continue;
            }
            // Start the rollout on this node: materialize a desired-system
            // spec from the deployment, filling in the documented defaults.
            let desired = DesiredSystemSpec {
                node_id: node.node_id.clone(),
                deployment_id: Some(deployment.name.clone()),
                nixos_configuration: deployment.nixos_configuration.clone(),
                flake_ref: deployment.flake_ref.clone(),
                switch_action: deployment.switch_action.clone().or_else(|| Some("switch".to_string())),
                health_check_command: deployment.health_check_command.clone(),
                rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)),
                drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)),
            };
            newly_started.push(node.node_id.clone());
            in_progress.push(node.node_id.clone());
            planned += 1;
            // Drain-enabled deployments keep the node marked draining for
            // the duration of the apply, even when it was already empty.
            if deployment.drain_before_apply == Some(true) && node.state.as_deref() != Some("draining")
            {
                let mut updated = node.clone();
                updated.state = Some("draining".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
            }
            desired_upserts.push(desired);
        }
    }
    // Rebuild the status on top of the previous one so unknown/sticky
    // fields survive.
    let mut status = existing_status.cloned().unwrap_or_default();
    status.name = deployment.name.clone();
    status.selected_nodes = selector_matches.iter().map(|node| node.node_id.clone()).collect();
    status.completed_nodes = dedup_sorted(completed);
    status.in_progress_nodes = dedup_sorted(in_progress);
    status.failed_nodes = dedup_sorted(failed);
    status.paused_by_operator = operator_paused;
    status.paused = paused;
    status.phase = Some(if status.selected_nodes.is_empty() {
        "idle"
    } else if deployment.nixos_configuration.is_none() {
        "invalid"
    } else if status.paused {
        "paused"
    } else if status.completed_nodes.len() == status.selected_nodes.len() {
        "completed"
    } else if !newly_started.is_empty() || !status.in_progress_nodes.is_empty() {
        "running"
    } else {
        "ready"
    }
    .to_string());
    status.message = Some(format!(
        "selected={} completed={} in_progress={} failed={} newly_started={}",
        status.selected_nodes.len(),
        status.completed_nodes.len(),
        status.in_progress_nodes.len(),
        status.failed_nodes.len(),
        newly_started.len()
    ));
    status.updated_at = Some(now);
    HostDeploymentPlan {
        status,
        desired_upserts,
        desired_deletes: {
            desired_deletes.sort();
            desired_deletes.dedup();
            desired_deletes
        },
        node_updates,
    }
}
/// Resolve a deployment selector against the node inventory.
///
/// Non-empty `node_ids` pin the candidate set; every other non-empty
/// criterion (roles, pools, node_classes, match_labels) must also match.
/// Empty criteria match all nodes. The result is sorted by node_id.
fn select_nodes<'a>(
    nodes: &'a [ClusterNodeRecord],
    selector: &HostDeploymentSelector,
) -> Vec<&'a ClusterNodeRecord> {
    let pinned: HashSet<&String> = selector.node_ids.iter().collect();

    let mut matched: Vec<&'a ClusterNodeRecord> = Vec::new();
    for node in nodes {
        // Explicit node list, when present, is a hard pre-filter.
        if !pinned.is_empty() && !pinned.contains(&node.node_id) {
            continue;
        }
        let role_ok = selector.roles.is_empty()
            || node.roles.iter().any(|role| selector.roles.contains(role));
        // Nodes without a pool/class never satisfy a non-empty criterion.
        let pool_ok = selector.pools.is_empty()
            || node
                .pool
                .as_ref()
                .map_or(false, |pool| selector.pools.contains(pool));
        let class_ok = selector.node_classes.is_empty()
            || node
                .node_class
                .as_ref()
                .map_or(false, |class| selector.node_classes.contains(class));
        let labels_ok = selector
            .match_labels
            .iter()
            .all(|(key, value)| node.labels.get(key) == Some(value));
        if role_ok && pool_ok && class_ok && labels_ok {
            matched.push(node);
        }
    }
    matched.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
    matched
}
/// Count service instances per node, skipping instances whose state is
/// "failed" or "deleted" (those no longer occupy capacity on the node).
fn active_instances_per_node(instances: &[ServiceInstanceSpec]) -> HashMap<String, usize> {
    let mut per_node: HashMap<String, usize> = HashMap::new();
    for instance in instances {
        let state = instance.state.as_deref();
        if state == Some("failed") || state == Some("deleted") {
            continue;
        }
        let slot = per_node.entry(instance.node_id.clone()).or_default();
        *slot += 1;
    }
    per_node
}
/// Decide whether a node may participate in a host rollout wave.
///
/// Excluded: nodes still being commissioned, nodes mid-install (or with a
/// failed / requested reinstall), and nodes that are neither "active" nor
/// "draining". A `heartbeat_timeout_secs` of 0 disables the freshness check;
/// otherwise a heartbeat older than the timeout disqualifies the node, while
/// a node that never heartbeated is still accepted.
fn node_is_rollout_candidate(node: &ClusterNodeRecord, heartbeat_timeout_secs: u64) -> bool {
    let still_commissioning = matches!(
        node.commission_state,
        Some(CommissionState::Discovered | CommissionState::Commissioning)
    );
    if still_commissioning {
        return false;
    }
    let install_in_flight = matches!(
        node.install_state,
        Some(
            InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested
        )
    );
    if install_in_flight {
        return false;
    }
    match node.state.as_deref() {
        Some("active") | Some("draining") => {}
        _ => return false,
    }
    if heartbeat_timeout_secs == 0 {
        return true;
    }
    match node.last_heartbeat {
        // No heartbeat recorded yet: treat as a candidate rather than stale.
        None => true,
        Some(last) => {
            let age_secs = Utc::now().signed_duration_since(last).num_seconds();
            age_secs <= heartbeat_timeout_secs as i64
        }
    }
}
/// A node counts as completed when BOTH hold:
/// 1. its observed system is "active" and reports the target configuration;
/// 2. its desired-system record is owned by this deployment (matching
///    `deployment_id`).
fn is_node_completed(
    deployment: &HostDeploymentSpec,
    _node: &ClusterNodeRecord,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
    target_configuration: Option<&str>,
) -> bool {
    let observed_matches_target = match (observed, target_configuration) {
        (Some(observed), Some(target)) => {
            observed.status.as_deref() == Some("active")
                && observed.nixos_configuration.as_deref() == Some(target)
        }
        _ => false,
    };
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    observed_matches_target && owned_by_deployment
}
/// A node counts as failed when this deployment owns its desired-system
/// record and the observed status is "failed" or "rolled-back".
fn is_node_failed(
    deployment: &HostDeploymentSpec,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
) -> bool {
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    if !owned_by_deployment {
        return false;
    }
    matches!(
        observed.and_then(|observed| observed.status.as_deref()),
        Some("failed") | Some("rolled-back")
    )
}
/// A node counts as in-progress when it is neither completed nor failed
/// (those terminal states win) AND either this deployment owns its
/// desired-system record, or the observed status is a transitional one.
fn is_node_in_progress(
    deployment: &HostDeploymentSpec,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
    is_completed: bool,
    is_failed: bool,
) -> bool {
    if is_completed || is_failed {
        return false;
    }
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    if owned_by_deployment {
        return true;
    }
    matches!(
        observed.and_then(|observed| observed.status.as_deref()),
        Some("planning" | "pending" | "reconciling" | "verifying" | "staged")
    )
}
/// Sort a list of node ids and drop duplicates, returning the cleaned list.
/// Unstable sort is fine here: equal strings are indistinguishable, and
/// `dedup` only needs equal elements to be adjacent.
fn dedup_sorted(mut values: Vec<String>) -> Vec<String> {
    values.sort_unstable();
    values.dedup();
    values
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a healthy, commissioned worker-node fixture in the given failure
    /// domain. Its roles/pool/node_class/labels are chosen to match the
    /// selector produced by `test_deployment`, and its fresh heartbeat keeps
    /// it eligible as a rollout candidate.
    fn test_node(node_id: &str, failure_domain: &str) -> ClusterNodeRecord {
        ClusterNodeRecord {
            node_id: node_id.to_string(),
            machine_id: None,
            ip: "10.0.0.1".to_string(),
            hostname: node_id.to_string(),
            roles: vec!["worker".to_string()],
            labels: HashMap::from([
                ("tier".to_string(), "general".to_string()),
                ("failure_domain".to_string(), failure_domain.to_string()),
            ]),
            pool: Some("general".to_string()),
            node_class: Some("worker-linux".to_string()),
            failure_domain: Some(failure_domain.to_string()),
            nix_profile: None,
            install_plan: None,
            hardware_facts: None,
            state: Some("active".to_string()),
            commission_state: Some(CommissionState::Commissioned),
            install_state: Some(InstallState::Installed),
            commissioned_at: None,
            last_inventory_hash: None,
            power_state: None,
            bmc_ref: None,
            last_heartbeat: Some(Utc::now()),
        }
    }

    /// Baseline rollout spec: batch size 1, max unavailable 1, no drain
    /// before apply, rollback on failure, not paused.
    fn test_deployment() -> HostDeploymentSpec {
        HostDeploymentSpec {
            name: "worker-rollout".to_string(),
            selector: HostDeploymentSelector {
                node_ids: vec![],
                roles: vec!["worker".to_string()],
                pools: vec!["general".to_string()],
                node_classes: vec!["worker-linux".to_string()],
                match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
            },
            nixos_configuration: Some("worker-golden".to_string()),
            flake_ref: Some("/opt/plasmacloud-src".to_string()),
            batch_size: Some(1),
            max_unavailable: Some(1),
            health_check_command: vec!["true".to_string()],
            switch_action: Some("switch".to_string()),
            rollback_on_failure: Some(true),
            drain_before_apply: Some(false),
            reboot_policy: None,
            paused: Some(false),
        }
    }

    // With batch_size = 1 and two eligible nodes, the plan should target
    // exactly one node per wave and report the deployment as "running".
    #[test]
    fn plan_rollout_starts_one_node_per_batch() {
        let deployment = test_deployment();
        let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &HashMap::new(),
            &HashMap::new(),
            &[],
            300,
        );
        assert_eq!(plan.desired_upserts.len(), 1);
        assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
        assert_eq!(plan.status.phase.as_deref(), Some("running"));
    }

    // A node observed as "rolled-back" under this deployment's desired-system
    // record must be reported failed, and the rollout must auto-pause without
    // starting further nodes.
    #[test]
    fn plan_rollout_pauses_on_failed_node() {
        let deployment = test_deployment();
        let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
        let desired = HashMap::from([(
            "node01".to_string(),
            DesiredSystemSpec {
                node_id: "node01".to_string(),
                deployment_id: Some("worker-rollout".to_string()),
                nixos_configuration: Some("worker-golden".to_string()),
                flake_ref: None,
                switch_action: Some("switch".to_string()),
                health_check_command: Vec::new(),
                rollback_on_failure: Some(true),
                drain_before_apply: Some(false),
            },
        )]);
        let observed = HashMap::from([(
            "node01".to_string(),
            ObservedSystemState {
                node_id: "node01".to_string(),
                nixos_configuration: Some("worker-golden".to_string()),
                status: Some("rolled-back".to_string()),
                ..ObservedSystemState::default()
            },
        )]);
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &desired,
            &observed,
            &[],
            300,
        );
        assert!(plan.desired_upserts.is_empty());
        assert!(plan.status.paused);
        assert_eq!(plan.status.failed_nodes, vec!["node01".to_string()]);
    }

    // With drain_before_apply enabled and an active instance still on the
    // node, the planner should mark the node "draining" (via node_updates)
    // and defer the desired-system upsert rather than applying immediately.
    #[test]
    fn plan_rollout_drains_before_apply_when_instances_exist() {
        let mut deployment = test_deployment();
        deployment.drain_before_apply = Some(true);
        let nodes = vec![test_node("node01", "rack-a")];
        let instances = vec![ServiceInstanceSpec {
            instance_id: "api-node01".to_string(),
            service: "api".to_string(),
            node_id: "node01".to_string(),
            ip: "10.0.0.1".to_string(),
            port: 8080,
            mesh_port: None,
            version: None,
            health_check: None,
            process: None,
            container: None,
            managed_by: Some("fleet-scheduler".to_string()),
            state: Some("active".to_string()),
            last_heartbeat: None,
            observed_at: None,
        }];
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &HashMap::new(),
            &HashMap::new(),
            &instances,
            300,
        );
        assert!(plan.desired_upserts.is_empty());
        assert_eq!(
            plan.node_updates
                .get("node01")
                .and_then(|node| node.state.as_deref()),
            Some("draining")
        );
        assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
    }
}

View file

@ -29,9 +29,9 @@ use fiberlb_api::{
};
use flashdns_api::RecordServiceClient;
use flashdns_api::ReverseZoneServiceClient;
use flashdns_api::ZoneServiceClient;
use flashdns_api::proto::{
reverse_zone_service_client::ReverseZoneServiceClient,
record_data, ARecord, AaaaRecord, CaaRecord, CnameRecord, CreateRecordRequest,
CreateReverseZoneRequest, CreateZoneRequest, DeleteRecordRequest, DeleteReverseZoneRequest,
DeleteZoneRequest, ListReverseZonesRequest, MxRecord, NsRecord, PtrRecord, RecordData,
@ -39,6 +39,8 @@ use flashdns_api::proto::{
ZoneInfo,
};
mod hosts;
#[derive(Parser)]
#[command(author, version, about)]
struct Cli {
@ -71,6 +73,9 @@ enum Command {
#[arg(long, default_value_t = false)]
prune: bool,
},
/// Reconcile host deployments into per-node desired-system state
Hosts(hosts::HostsCommand),
}
#[derive(Debug, Deserialize)]
@ -294,6 +299,9 @@ async fn main() -> Result<()> {
let spec: DnsConfig = read_json(&config).await?;
reconcile_dns(spec, endpoint, prune).await?;
}
Command::Hosts(command) => {
hosts::run(command).await?;
}
}
Ok(())

View file

@ -7,6 +7,30 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
run_deployer_server_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_SERVER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- "$@"
fi
}
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
deployer_pid=""
@ -128,7 +152,7 @@ role = "voter"
EOF
echo "Starting ChainFire on 127.0.0.1:${api_port}"
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
run_chainfire_server_bin \
--config "$tmp_dir/chainfire.toml" \
>"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
@ -155,7 +179,7 @@ namespace = "deployer"
EOF
echo "Starting Deployer on 127.0.0.1:${deployer_port}"
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- \
run_deployer_server_bin \
--config "$tmp_dir/deployer.toml" \
>"$tmp_dir/deployer.log" 2>&1 &
deployer_pid="$!"
@ -240,7 +264,7 @@ chainfire_endpoint="http://127.0.0.1:${api_port}"
deployer_endpoint="http://127.0.0.1:${deployer_port}"
run_deployer_ctl() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
run_deployer_ctl_bin \
--chainfire-endpoint "$chainfire_endpoint" \
--cluster-id test-cluster \
--cluster-namespace photoncloud \

View file

@ -7,6 +7,38 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
run_node_agent_bin() {
if [[ -n "${PHOTONCLOUD_NODE_AGENT_BIN:-}" ]]; then
"$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
fi
}
run_fleet_scheduler_bin() {
if [[ -n "${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}" ]]; then
"$PHOTONCLOUD_FLEET_SCHEDULER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
@ -104,7 +136,7 @@ EOF
mkdir -p "$tmp_dir/pids"
echo "Starting ChainFire on 127.0.0.1:${api_port}"
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
run_chainfire_server_bin \
--config "$tmp_dir/chainfire.toml" \
>"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
@ -256,7 +288,7 @@ EOF
endpoint="http://127.0.0.1:${api_port}"
run_deployer_ctl() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
run_deployer_ctl_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
"$@"
@ -266,7 +298,7 @@ run_node_agent_once() {
local node_id="$1"
local pid_dir="$tmp_dir/pids/$node_id"
mkdir -p "$pid_dir"
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- \
run_node_agent_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
--node-id "$node_id" \
@ -277,7 +309,7 @@ run_node_agent_once() {
}
run_scheduler_once() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- \
run_fleet_scheduler_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
--interval-secs 1 \

View file

@ -0,0 +1,431 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
# Run the chainfire-server binary: use a prebuilt binary when
# PHOTONCLOUD_CHAINFIRE_SERVER_BIN is set, otherwise build-and-run via cargo.
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
# Run the deployer-ctl binary: use a prebuilt binary when
# PHOTONCLOUD_DEPLOYER_CTL_BIN is set, otherwise build-and-run via cargo.
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
# Run the plasmacloud-reconciler binary: use a prebuilt binary when
# PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN is set, otherwise build-and-run
# via cargo.
run_plasmacloud_reconciler_bin() {
if [[ -n "${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}" ]]; then
"$PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
redfish_pid=""
# Tear down background daemons (mock Redfish first, then ChainFire) and
# remove the temp dir. Installed as an EXIT trap; tolerates processes that
# have already exited, hence set +e and the || true guards.
cleanup() {
set +e
if [[ -n "$redfish_pid" ]]; then
kill "$redfish_pid" 2>/dev/null || true
wait "$redfish_pid" 2>/dev/null || true
fi
if [[ -n "$cf_pid" ]]; then
kill "$cf_pid" 2>/dev/null || true
wait "$cf_pid" 2>/dev/null || true
fi
rm -rf "$tmp_dir"
}
trap cleanup EXIT
# Ask the OS for a free ephemeral TCP port on 127.0.0.1 and print it.
# NOTE(review): the port is released before the caller binds it, so a rare
# race with another process grabbing it is possible.
free_port() {
python3 - <<'PY'
import socket
s = socket.socket()
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
s.close()
PY
}
# Block until HOST:PORT accepts a TCP connection, polling once per second.
# Usage: wait_for_port HOST PORT [TIMEOUT_SECS]  (timeout defaults to 60).
# Returns 0 on success; prints a message and returns 1 on timeout.
wait_for_port() {
local host="$1"
local port="$2"
local timeout_secs="${3:-60}"
local deadline=$((SECONDS + timeout_secs))
while (( SECONDS < deadline )); do
if python3 - "$host" "$port" <<'PY'
import socket
import sys
host = sys.argv[1]
port = int(sys.argv[2])
with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
then
return 0
fi
sleep 1
done
echo "timed out waiting for ${host}:${port}" >&2
return 1
}
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"
redfish_port="$(free_port)"
cat >"$tmp_dir/chainfire.toml" <<EOF
[node]
id = 1
name = "chainfire-1"
role = "control_plane"
[storage]
data_dir = "$tmp_dir/chainfire-data"
[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
[cluster]
id = 1
initial_members = []
bootstrap = true
[raft]
role = "voter"
EOF
cat >"$tmp_dir/mock-redfish.py" <<'PY'
import http.server
import json
import sys
port = int(sys.argv[1])
log_path = sys.argv[2]
class Handler(http.server.BaseHTTPRequestHandler):
def log_message(self, format, *args):
pass
def do_GET(self):
if self.path == "/redfish/v1/Systems/node01":
body = json.dumps({"PowerState": "On"}).encode("utf-8")
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(body)))
self.end_headers()
self.wfile.write(body)
return
self.send_error(404)
def do_POST(self):
if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
self.send_error(404)
return
length = int(self.headers.get("Content-Length", "0"))
payload = self.rfile.read(length).decode("utf-8")
with open(log_path, "a", encoding="utf-8") as handle:
handle.write(payload + "\n")
self.send_response(204)
self.end_headers()
server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
server.serve_forever()
PY
echo "Starting ChainFire on 127.0.0.1:${api_port}"
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
wait_for_port "127.0.0.1" "$api_port" 120
wait_for_port "127.0.0.1" "$http_port" 120
echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
redfish_pid="$!"
wait_for_port "127.0.0.1" "$redfish_port" 30
cat >"$tmp_dir/cluster.yaml" <<EOF
cluster:
cluster_id: test-cluster
environment: dev
node_classes:
- name: worker-linux
roles:
- worker
labels:
tier: general
pools:
- name: general
node_class: worker-linux
labels:
env: dev
nodes:
- node_id: node01
hostname: node01
ip: 10.0.0.11
roles:
- worker
labels:
tier: general
pool: general
node_class: worker-linux
state: active
commission_state: commissioned
install_state: installed
bmc_ref: "redfish+http://127.0.0.1:${redfish_port}/redfish/v1/Systems/node01"
- node_id: node02
hostname: node02
ip: 10.0.0.12
roles:
- worker
labels:
tier: general
pool: general
node_class: worker-linux
state: active
commission_state: commissioned
install_state: installed
host_deployments:
- name: worker-rollout
selector:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
tier: general
nixos_configuration: worker-next
flake_ref: "github:centra/cloud"
batch_size: 1
max_unavailable: 1
health_check_command:
- "true"
switch_action: switch
rollback_on_failure: true
EOF
chainfire_endpoint="http://127.0.0.1:${api_port}"
# Invoke deployer-ctl with the shared endpoint/cluster/namespace flags for
# this test cluster pre-applied; extra arguments are passed through.
run_deployer_ctl() {
run_deployer_ctl_bin \
--chainfire-endpoint "$chainfire_endpoint" \
--cluster-id test-cluster \
--cluster-namespace photoncloud \
--deployer-namespace deployer \
"$@"
}
# Run a single (--once) host-deployment reconciliation pass against the
# local ChainFire endpoint for the test cluster.
run_hosts_once() {
run_plasmacloud_reconciler_bin \
hosts \
--endpoint "$chainfire_endpoint" \
--cluster-namespace photoncloud \
--cluster-id test-cluster \
--heartbeat-timeout-secs 300 \
--once
}
echo "Applying host lifecycle cluster config"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune
echo "Running host rollout controller"
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-1.json"
python3 - "$tmp_dir/deployment-1.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["phase"] == "running", payload
assert status["in_progress_nodes"] == ["node01"], payload
assert status["failed_nodes"] == [], payload
print("initial rollout wave validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
import json
import sys
desired = {}
with open(sys.argv[1], "r", encoding="utf-8") as handle:
for line in handle:
if " key=" not in line or " value=" not in line:
continue
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
if not key.endswith("/desired-system"):
continue
payload = json.loads(line.split(" value=", 1)[1])
desired[payload["node_id"]] = payload
assert sorted(desired) == ["node01"], desired
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
print("desired-system first wave validated")
PY
echo "Pausing and resuming deployment via CLI"
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
python3 - "$tmp_dir/pause.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is True, payload
assert payload["paused_by_operator"] is True, payload
print("pause command validated")
PY
run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
python3 - "$tmp_dir/resume.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is False, payload
assert payload["paused_by_operator"] is False, payload
print("resume command validated")
PY
echo "Marking node01 rollout complete and reconciling next wave"
run_deployer_ctl node set-observed \
--node-id node01 \
--status active \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
python3 - "$tmp_dir/deployment-2.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["completed_nodes"] == ["node01"], payload
assert status["in_progress_nodes"] == ["node02"], payload
print("second rollout wave validated")
PY
echo "Marking node02 rollout failed and validating auto-pause"
run_deployer_ctl node set-observed \
--node-id node02 \
--status rolled-back \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
python3 - "$tmp_dir/deployment-3.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["paused"] is True, payload
assert status["failed_nodes"] == ["node02"], payload
print("auto-pause on failure validated")
PY
echo "Refreshing power state through Redfish"
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
python3 - "$tmp_dir/node-power.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["power_state"] == "on", payload
print("power refresh validated")
PY
echo "Requesting reinstall with power cycle"
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["state"] == "provisioning", payload
assert payload["install_state"] == "reinstall_requested", payload
assert payload["power_state"] == "cycling", payload
lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
print("reinstall orchestration validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
import sys
lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
assert not any("/desired-system" in line for line in lines), lines
assert not any("/observed-system" in line for line in lines), lines
print("reinstall state cleanup validated")
PY
echo "Aborting deployment and clearing desired-system"
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
python3 - "$tmp_dir/abort.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["phase"] == "aborted", payload
assert payload["paused"] is True, payload
print("abort command validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
import json
import sys
desired_nodes = []
with open(sys.argv[1], "r", encoding="utf-8") as handle:
for line in handle:
if " key=" not in line or " value=" not in line:
continue
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
if not key.endswith("/desired-system"):
continue
payload = json.loads(line.split(" value=", 1)[1])
if payload.get("deployment_id") == "worker-rollout":
desired_nodes.append(payload["node_id"])
assert desired_nodes == [], desired_nodes
print("desired-system cleanup validated")
PY
echo "Host lifecycle E2E verification passed"

View file

@ -1,9 +1,9 @@
# Storage Benchmarks
Generated on 2026-03-10T20:02:00+09:00 with:
Generated on 2026-03-27T12:08:47+09:00 with:
```bash
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
nix run ./nix/test-cluster#cluster -- bench-storage
```
## CoronaFS
@ -12,30 +12,35 @@ Cluster network baseline, measured with `iperf3` from `node04` to `node01` befor
| Metric | Result |
|---|---:|
| TCP throughput | 22.83 MiB/s |
| TCP retransmits | 78 |
| TCP throughput | 45.92 MiB/s |
| TCP retransmits | 193 |
Measured from `node04`.
Local worker disk is the baseline. CoronaFS is the shared block volume path used for mutable VM disks, exported from `node01` over NBD.
Local worker disk is the baseline. CoronaFS now has two relevant data paths in the lab: the controller export sourced from `node01`, and the node-local export materialized onto the worker that actually attaches the mutable VM disk.
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Sequential write | 26.36 MiB/s | 5.24 MiB/s |
| Sequential read | 348.77 MiB/s | 10.08 MiB/s |
| 4k random read | 1243 IOPS | 145 IOPS |
| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Sequential write | 679.05 MiB/s | 30.35 MiB/s | 395.06 MiB/s |
| Sequential read | 2723.40 MiB/s | 42.70 MiB/s | 709.14 MiB/s |
| 4k random read | 16958 IOPS | 2034 IOPS | 5087 IOPS |
| 4k queued random read (`iodepth=32`) | 106026 IOPS | 14261 IOPS | 28898 IOPS |
Queue-depth profile (`libaio`, `iodepth=32`) from the same worker:
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Depth-32 write | 27.12 MiB/s | 11.42 MiB/s |
| Depth-32 read | 4797.47 MiB/s | 10.06 MiB/s |
| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Depth-32 write | 3417.45 MiB/s | 39.26 MiB/s | 178.04 MiB/s |
| Depth-32 read | 12996.47 MiB/s | 55.71 MiB/s | 112.88 MiB/s |
Cross-worker shared-volume visibility, measured by writing on `node04` and reading from `node05` over the same CoronaFS NBD export:
Node-local materialization timing and target-node steady-state read path:
| Metric | Result |
|---|---:|
| Cross-worker sequential read | 17.72 MiB/s |
| Node04 materialize latency | 9.23 s |
| Node05 materialize latency | 5.82 s |
| Node05 node-local sequential read | 709.14 MiB/s |
PlasmaVMC now prefers the worker-local CoronaFS export for mutable node-local volumes, even when the underlying materialization is a qcow2 overlay. The VM runtime section below is therefore the closest end-to-end proxy for real local-attach VM I/O, while the node-local export numbers remain useful for CoronaFS service consumers and for diagnosing exporter overhead.
## LightningStor
@ -46,16 +51,16 @@ Cluster network baseline for this client, measured with `iperf3` from `node03` t
| Metric | Result |
|---|---:|
| TCP throughput | 18.35 MiB/s |
| TCP retransmits | 78 |
| TCP throughput | 45.99 MiB/s |
| TCP retransmits | 207 |
### Large-object path
| Metric | Result |
|---|---:|
| Object size | 256 MiB |
| Upload throughput | 8.11 MiB/s |
| Download throughput | 7.54 MiB/s |
| Upload throughput | 18.20 MiB/s |
| Download throughput | 39.21 MiB/s |
### Small-object batch
@ -63,10 +68,10 @@ Measured as 32 objects of 4 MiB each (128 MiB total).
| Metric | Result |
|---|---:|
| Batch upload throughput | 0.81 MiB/s |
| Batch download throughput | 0.83 MiB/s |
| PUT rate | 0.20 objects/s |
| GET rate | 0.21 objects/s |
| Batch upload throughput | 18.96 MiB/s |
| Batch download throughput | 39.88 MiB/s |
| PUT rate | 4.74 objects/s |
| GET rate | 9.97 objects/s |
### Parallel small-object batch
@ -74,34 +79,57 @@ Measured as the same 32 objects of 4 MiB each, but with 8 concurrent client jobs
| Metric | Result |
|---|---:|
| Parallel batch upload throughput | 3.03 MiB/s |
| Parallel batch download throughput | 2.89 MiB/s |
| Parallel PUT rate | 0.76 objects/s |
| Parallel GET rate | 0.72 objects/s |
| Parallel batch upload throughput | 16.23 MiB/s |
| Parallel batch download throughput | 26.07 MiB/s |
| Parallel PUT rate | 4.06 objects/s |
| Parallel GET rate | 6.52 objects/s |
## VM Image Path
Measured against the real `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` path on `node01`.
Measured against the `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` clone path on `node01`.
| Metric | Result |
|---|---:|
| Guest image artifact size | 2017 MiB |
| Guest image virtual size | 4096 MiB |
| `CreateImage` latency | 176.03 s |
| First image-backed `CreateVolume` latency | 76.51 s |
| Second image-backed `CreateVolume` latency | 170.49 s |
| `CreateImage` latency | 66.49 s |
| First image-backed `CreateVolume` latency | 16.86 s |
| Second image-backed `CreateVolume` latency | 0.12 s |
## VM Runtime Path
Measured against the real `StartVm -> qemu attach -> guest boot -> guest fio` path on a worker node, using a CoronaFS-backed root disk and data disk.
| Metric | Result |
|---|---:|
| `StartVm` to qemu attach | 0.60 s |
| `StartVm` to guest benchmark result | 35.69 s |
| Guest sequential write | 123.49252223968506 MiB/s |
| Guest sequential read | 1492.7113695144653 MiB/s |
| Guest 4k random read | 25550 IOPS |
## Assessment
- CoronaFS shared-volume reads are currently 2.9% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS 4k random reads are currently 11.7% of the measured local-disk baseline.
- CoronaFS cross-worker reads are currently 5.1% of the measured local-disk sequential-read baseline, which is the more relevant signal for VM restart and migration paths.
- CoronaFS sequential reads are currently 44.2% of the measured node04->node01 TCP baseline, which helps separate NBD/export overhead from raw cluster-network limits.
- CoronaFS depth-32 reads are currently 0.2% of the local depth-32 baseline, which is a better proxy for queued guest I/O than the single-depth path.
- The shared-volume path is functionally correct for mutable VM disks and migration tests, but its read-side throughput is still too low to call production-ready for heavier VM workloads.
- LightningStor's replicated S3 path is working correctly, but 8.11 MiB/s upload and 7.54 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently 41.1% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- LightningStor's small-object batch path is also functional, but 0.20 PUT/s and 0.21 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 0.76 PUT/s and 0.72 GET/s.
- The VM image path is now measured directly rather than inferred. The cold `CreateVolume` path includes artifact fetch plus CoronaFS population; the warm `CreateVolume` path isolates repeated CoronaFS population from an already cached image.
- CoronaFS controller-export reads are currently 1.6% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS controller-export 4k random reads are currently 12.0% of the measured local-disk baseline.
- CoronaFS controller-export queued 4k random reads are currently 13.5% of the measured local queued-random-read baseline.
- CoronaFS controller-export sequential reads are currently 93.0% of the measured node04->node01 TCP baseline, which isolates the centralized source path from raw cluster-network limits.
- CoronaFS controller-export depth-32 reads are currently 0.4% of the local depth-32 baseline.
- CoronaFS node-local reads are currently 26.0% of the measured local-disk baseline, which is the more relevant steady-state signal for mutable VM disks after attachment.
- CoronaFS node-local 4k random reads are currently 30.0% of the measured local-disk baseline.
- CoronaFS node-local queued 4k random reads are currently 27.3% of the measured local queued-random-read baseline.
- CoronaFS node-local depth-32 reads are currently 0.9% of the local depth-32 baseline.
- The target worker's node-local read path is 26.0% of the measured local sequential-read baseline after materialization, which is the better proxy for restart and migration steady state than the old shared-export read.
- PlasmaVMC now attaches writable node-local volumes through the worker-local CoronaFS export, so the guest-runtime section should be treated as the real local VM steady-state path rather than the node-local export numbers alone.
- CoronaFS single-depth writes remain sensitive to the nested-QEMU/VDE lab transport, so the queued-depth and guest-runtime numbers are still the more reliable proxy for real VM workload behavior than the single-stream write figure alone.
- The central export path is now best understood as a source/materialization path; the worker-local export is the path that should determine VM-disk readiness going forward.
- LightningStor's replicated S3 path is working correctly, but 18.20 MiB/s upload and 39.21 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently 85.3% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- The current S3 frontend tuning baseline is the built-in 16 MiB streaming threshold with multipart PUT/FETCH concurrency of 8; that combination is the best default observed on this lab cluster so far.
- LightningStor uploads should be read against the replication write quorum and the same ~45.99 MiB/s lab network ceiling; this environment still limits end-to-end throughput well before modern bare-metal NICs would.
- LightningStor's small-object batch path is also functional, but 4.74 PUT/s and 9.97 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 4.06 PUT/s and 6.52 GET/s.
- The VM image section measures clone/materialization cost, not guest runtime I/O.
- The PlasmaVMC local image-backed clone fast path is now active again; a 0.12 s second clone indicates the CoronaFS qcow2 backing-file path is being hit on node01 rather than falling back to eager raw materialization.
- The VM runtime section is the real `PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel` path; use it to judge whether QEMU/NBD tuning is helping.
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.

574
fiberlb/Cargo.lock generated

File diff suppressed because it is too large Load diff

17
flake.lock generated
View file

@ -76,7 +76,8 @@
"flake-utils": "flake-utils",
"nix-nos": "nix-nos",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
"rust-overlay": "rust-overlay",
"systems": "systems_2"
}
},
"rust-overlay": {
@ -113,6 +114,20 @@
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
}
},
"root": "root",

467
flake.nix
View file

@ -33,7 +33,7 @@
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos }:
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
flake-utils.lib.eachDefaultSystem (system:
let
# Apply rust-overlay to get rust-bin attribute
@ -139,6 +139,301 @@
);
};
flakeInputsBlock = ''
inputs = {
# Use unstable nixpkgs for latest packages
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
# Rust overlay for managing Rust toolchains
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
# Flake utilities for multi-system support
flake-utils.url = "github:numtide/flake-utils";
# Disko for declarative disk partitioning
disko = {
url = "github:nix-community/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
# Nix-NOS generic network operating system modules
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
'';
bundledInputsBlock = ''
inputs = {
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
rust-overlay = {
url = "path:./.bundle-inputs/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "path:./.bundle-inputs/flake-utils";
inputs.systems.follows = "systems";
};
systems.url = "path:./.bundle-inputs/systems";
disko = {
url = "path:./.bundle-inputs/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
'';
flakeHeaderBlock = ''
# ============================================================================
# INPUTS: External dependencies
# ============================================================================
inputs = {
# Use unstable nixpkgs for latest packages
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
# Rust overlay for managing Rust toolchains
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
# Flake utilities for multi-system support
flake-utils.url = "github:numtide/flake-utils";
# Disko for declarative disk partitioning
disko = {
url = "github:nix-community/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
# Nix-NOS generic network operating system modules
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
'';
bundledHeaderBlock = ''
# ============================================================================
# INPUTS: External dependencies
# ============================================================================
inputs = {
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
rust-overlay = {
url = "path:./.bundle-inputs/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "path:./.bundle-inputs/flake-utils";
inputs.systems.follows = "systems";
};
systems.url = "path:./.bundle-inputs/systems";
disko = {
url = "path:./.bundle-inputs/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
'';
bundledFlakeNix =
pkgs.writeText
"plasmacloud-bundled-flake.nix"
(
builtins.replaceStrings
[ flakeHeaderBlock ]
[ bundledHeaderBlock ]
(builtins.readFile ./flake.nix)
);
bundledFlakeHeaderFile =
pkgs.writeText "plasmacloud-bundled-flake-header" bundledHeaderBlock;
baseFlakeLock = builtins.fromJSON (builtins.readFile ./flake.lock);
bundleInputRelPaths = {
nixpkgs = "./.bundle-inputs/nixpkgs";
"rust-overlay" = "./.bundle-inputs/rust-overlay";
"flake-utils" = "./.bundle-inputs/flake-utils";
disko = "./.bundle-inputs/disko";
systems = "./.bundle-inputs/systems";
};
fetchLockedInput =
nodeName:
let
tree = builtins.fetchTree baseFlakeLock.nodes.${nodeName}.locked;
in
if builtins.isAttrs tree && tree ? outPath then tree.outPath else tree;
vendoredFlakeInputs = {
nixpkgs = fetchLockedInput "nixpkgs";
"rust-overlay" = fetchLockedInput "rust-overlay";
"flake-utils" = fetchLockedInput "flake-utils";
disko = fetchLockedInput "disko";
systems = fetchLockedInput "systems";
};
makeBundledLockNode =
nodeName: relPath:
let
node = baseFlakeLock.nodes.${nodeName};
in
node
// {
locked = {
type = "path";
path = relPath;
};
original = {
type = "path";
path = relPath;
};
};
bundledFlakeLock = baseFlakeLock // {
nodes =
baseFlakeLock.nodes
// {
root =
baseFlakeLock.nodes.root
// {
inputs =
baseFlakeLock.nodes.root.inputs
// {
systems = "systems";
};
};
nixpkgs = makeBundledLockNode "nixpkgs" bundleInputRelPaths.nixpkgs;
"rust-overlay" = makeBundledLockNode "rust-overlay" bundleInputRelPaths."rust-overlay";
"flake-utils" = makeBundledLockNode "flake-utils" bundleInputRelPaths."flake-utils";
disko = makeBundledLockNode "disko" bundleInputRelPaths.disko;
systems = makeBundledLockNode "systems" bundleInputRelPaths.systems;
};
};
bundledFlakeLockFile =
pkgs.writeText "plasmacloud-bundled-flake.lock" (builtins.toJSON bundledFlakeLock);
inBundledEval = builtins.pathExists ./.bundle-eval-marker;
bundledFlakeRootDrv = pkgs.runCommand "plasmacloud-bundled-flake-root" {
nativeBuildInputs = [
pkgs.coreutils
pkgs.python3
];
} ''
mkdir -p "$out"
cp -a ${flakeBundleSrc}/. "$out"/
chmod -R u+w "$out"
touch "$out/.bundle-eval-marker"
mkdir -p "$out/.bundle-inputs"
cp -a ${vendoredFlakeInputs.nixpkgs} "$out/.bundle-inputs/nixpkgs"
cp -a ${vendoredFlakeInputs."rust-overlay"} "$out/.bundle-inputs/rust-overlay"
cp -a ${vendoredFlakeInputs."flake-utils"} "$out/.bundle-inputs/flake-utils"
cp -a ${vendoredFlakeInputs.disko} "$out/.bundle-inputs/disko"
cp -a ${vendoredFlakeInputs.systems} "$out/.bundle-inputs/systems"
cp ${bundledFlakeLockFile} "$out/flake.lock"
python3 - <<'PY' "$out/flake.nix" ${bundledFlakeHeaderFile}
from pathlib import Path
import re
import sys
flake_path = Path(sys.argv[1])
header = Path(sys.argv[2]).read_text()
source = flake_path.read_text()
pattern = re.compile(
r" # ============================================================================\n"
r" # INPUTS: External dependencies\n"
r" # ============================================================================\n"
r" inputs = \{.*?\n"
r" # ============================================================================\n"
r" # OUTPUTS: What this flake provides\n"
r" # ============================================================================\n"
r" outputs = \{ self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems \? null \}:",
re.S,
)
rewritten, count = pattern.subn(header.rstrip("\n"), source, count=1)
if count != 1:
raise SystemExit(f"expected to rewrite 1 flake header, rewrote {count}")
flake_path.write_text(rewritten)
PY
'';
bundledFlakeRoot =
if inBundledEval then
null
else
builtins.path {
path = bundledFlakeRootDrv;
name = "plasmacloud-bundled-flake-root-src";
};
bundledFlakeRootNarHashFile =
if inBundledEval then
null
else
pkgs.runCommand "plasmacloud-bundled-flake-root-narhash" {
nativeBuildInputs = [ pkgs.nix ];
} ''
${pkgs.nix}/bin/nix \
--extra-experimental-features nix-command \
hash path --sri ${bundledFlakeRoot} \
| tr -d '\n' > "$out"
'';
bundledFlakeRootNarHash =
if inBundledEval then
null
else
builtins.readFile bundledFlakeRootNarHashFile;
bundledFlake =
if inBundledEval then
null
else
builtins.getFlake (
builtins.unsafeDiscardStringContext
"path:${toString bundledFlakeRoot}?narHash=${bundledFlakeRootNarHash}"
);
bundledVmSmokeTargetToplevel =
if inBundledEval then
null
else
bundledFlake.nixosConfigurations.vm-smoke-target.config.system.build.toplevel;
# Helper function to build a Rust workspace package
# Parameters:
# name: package name (e.g., "chainfire-server")
@ -434,16 +729,31 @@
description = "Node-local NixOS reconciliation agent for PhotonCloud hosts";
};
plasmacloud-reconciler = buildRustWorkspace {
name = "plasmacloud-reconciler";
workspaceSubdir = "deployer";
mainCrate = "plasmacloud-reconciler";
description = "Declarative reconciler for host rollouts and published resources";
};
plasmacloudFlakeBundle = pkgs.runCommand "plasmacloud-flake-bundle.tar.gz" {
nativeBuildInputs = [ pkgs.gnutar pkgs.gzip ];
nativeBuildInputs = [
pkgs.coreutils
pkgs.gnutar
pkgs.gzip
];
} ''
bundle_root="$(mktemp -d)"
cp -a ${bundledFlakeRootDrv}/. "$bundle_root"/
chmod -R u+w "$bundle_root"
tar \
--sort=name \
--mtime='@1' \
--owner=0 \
--group=0 \
--numeric-owner \
-C ${flakeBundleSrc} \
-C "$bundle_root" \
-cf - . \
| gzip -n > "$out"
'';
@ -462,6 +772,7 @@
self.nixosConfigurations.node01.config.system.build.plasmacloudDeployerClusterState;
vmClusterFlakeBundle = self.packages.${system}.plasmacloudFlakeBundle;
vmSmokeBundledTargetToplevel = bundledVmSmokeTargetToplevel;
# --------------------------------------------------------------------
# Default package: Build all servers
@ -484,6 +795,7 @@
self.packages.${system}.k8shost-server
self.packages.${system}.deployer-server
self.packages.${system}.deployer-ctl
self.packages.${system}.plasmacloud-reconciler
self.packages.${system}.nix-agent
self.packages.${system}.node-agent
self.packages.${system}.fleet-scheduler
@ -556,6 +868,10 @@
drv = self.packages.${system}.deployer-ctl;
};
plasmacloud-reconciler = flake-utils.lib.mkApp {
drv = self.packages.${system}.plasmacloud-reconciler;
};
nix-agent = flake-utils.lib.mkApp {
drv = self.packages.${system}.nix-agent;
};
@ -568,6 +884,144 @@
drv = self.packages.${system}.fleet-scheduler;
};
};
checks = {
deployer-vm-smoke = pkgs.testers.runNixOSTest (
import ./nix/tests/deployer-vm-smoke.nix {
inherit pkgs;
photoncloudPackages = self.packages.${system};
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
}
);
deployer-vm-rollback = pkgs.testers.runNixOSTest (
import ./nix/tests/deployer-vm-smoke.nix {
inherit pkgs;
photoncloudPackages = self.packages.${system};
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
desiredSystemOverrides = {
health_check_command = [ "false" ];
rollback_on_failure = true;
};
expectedStatus = "rolled-back";
expectCurrentSystemMatchesTarget = false;
expectMarkerPresent = false;
}
);
deployer-bootstrap-e2e = pkgs.runCommand "deployer-bootstrap-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_SERVER_BIN =
"${self.packages.${system}.deployer-server}/bin/deployer-server";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-deployer-bootstrap-e2e.sh}
touch "$out"
'';
host-lifecycle-e2e = pkgs.runCommand "host-lifecycle-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN =
"${self.packages.${system}.plasmacloud-reconciler}/bin/plasmacloud-reconciler";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-host-lifecycle-e2e.sh}
touch "$out"
'';
fleet-scheduler-e2e = pkgs.runCommand "fleet-scheduler-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
PHOTONCLOUD_NODE_AGENT_BIN =
"${self.packages.${system}.node-agent}/bin/node-agent";
PHOTONCLOUD_FLEET_SCHEDULER_BIN =
"${self.packages.${system}.fleet-scheduler}/bin/fleet-scheduler";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-fleet-scheduler-e2e.sh}
touch "$out"
'';
};
}
) // {
# ========================================================================
@ -606,6 +1060,12 @@
modules = [ ./nix/images/netboot-base.nix ];
};
# Offline-friendly target used by deployer VM smoke tests.
vm-smoke-target = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/deployer-vm-smoke-target.nix ];
};
# PlasmaCloud ISO (T061.S5 - bootable ISO with cluster-config embedding)
plasmacloud-iso = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
@ -732,6 +1192,7 @@
k8shost-server = self.packages.${final.system}.k8shost-server;
deployer-server = self.packages.${final.system}.deployer-server;
deployer-ctl = self.packages.${final.system}.deployer-ctl;
plasmacloud-reconciler = self.packages.${final.system}.plasmacloud-reconciler;
plasmacloudFlakeBundle = self.packages.${final.system}.plasmacloudFlakeBundle;
nix-agent = self.packages.${final.system}.nix-agent;
node-agent = self.packages.${final.system}.node-agent;

View file

@ -9,7 +9,7 @@ use flaredb_proto::kvrpc::{
use flaredb_proto::pdpb::Store;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use std::time::{Instant, SystemTime, UNIX_EPOCH};
use serde::Deserialize;
use tokio::sync::Mutex;
use tonic::transport::Channel;
@ -35,6 +35,7 @@ pub struct RdbClient {
chainfire_kv_client: Option<ChainfireKvClient<Channel>>,
region_cache: RegionCache,
chainfire_route_cache: Arc<Mutex<Option<ChainfireRouteSnapshot>>>,
namespace: String,
}
@ -53,10 +54,18 @@ struct ChainfireRegionInfo {
leader_id: u64,
}
#[derive(Debug, Clone)]
/// Point-in-time view of ChainFire routing metadata, cached by
/// `RdbClient::chainfire_route_snapshot` and aged out via
/// `CHAINFIRE_ROUTE_CACHE_TTL`.
struct ChainfireRouteSnapshot {
    /// Store id -> store info; looked up by `region.leader_id` when
    /// resolving a key to a leader address.
    stores: HashMap<u64, ChainfireStoreInfo>,
    /// Region descriptors scanned linearly for key -> region routing.
    regions: Vec<ChainfireRegionInfo>,
    /// When this snapshot was fetched; compared against the cache TTL.
    fetched_at: Instant,
}
impl RdbClient {
const ROUTE_RETRY_LIMIT: usize = 12;
const ROUTE_RETRY_BASE_DELAY_MS: u64 = 100;
const ROUTED_RPC_TIMEOUT: Duration = Duration::from_secs(1);
const CHAINFIRE_ROUTE_CACHE_TTL: Duration = Duration::from_secs(2);
pub async fn connect_with_pd(
_server_addr: String,
@ -70,26 +79,43 @@ impl RdbClient {
pd_addr: String,
namespace: impl Into<String>,
) -> Result<Self, tonic::transport::Error> {
let pd_endpoints = parse_transport_endpoints(&pd_addr);
let normalized_server_addr = normalize_transport_addr(&server_addr);
// A number of in-repo callers still pass the same address for both server and PD.
// In that case, prefer direct routing and skip the PD lookup path entirely.
let direct_addr = if !server_addr.is_empty() && server_addr == pd_addr {
Some(server_addr)
let direct_addr = if !normalized_server_addr.is_empty()
&& pd_endpoints
.iter()
.any(|endpoint| normalize_transport_addr(endpoint) == normalized_server_addr)
{
Some(normalized_server_addr.clone())
} else {
None
};
let (tso_client, pd_client, chainfire_kv_client) = if direct_addr.is_some() {
(None, None, None)
} else {
let pd_channel = Channel::from_shared(transport_endpoint(&pd_addr))
.unwrap()
.connect()
.await?;
let mut last_error = None;
let mut clients = None;
for endpoint in &pd_endpoints {
let pd_channel = match Channel::from_shared(transport_endpoint(endpoint)) {
Ok(endpoint) => match endpoint.connect().await {
Ok(channel) => channel,
Err(error) => {
last_error = Some(error);
continue;
}
},
Err(_) => {
continue;
}
};
let mut probe_client = PdClient::new(pd_channel.clone());
let probe = probe_client
.get_region(GetRegionRequest { key: Vec::new() })
.await;
match probe {
clients = Some(match probe {
Err(status) if status.code() == tonic::Code::Unimplemented => (
None,
None,
@ -100,6 +126,21 @@ impl RdbClient {
Some(PdClient::new(pd_channel)),
None,
),
});
break;
}
if let Some(clients) = clients {
clients
} else if let Some(error) = last_error {
return Err(error);
} else {
return Err(
Channel::from_shared("http://127.0.0.1:1".to_string())
.unwrap()
.connect()
.await
.expect_err("unreachable fallback endpoint should fail to connect"),
);
}
};
@ -111,6 +152,7 @@ impl RdbClient {
chainfire_kv_client,
region_cache: RegionCache::new(),
namespace: namespace.into(),
chainfire_route_cache: Arc::new(Mutex::new(None)),
})
}
@ -119,17 +161,51 @@ impl RdbClient {
server_addr: String,
namespace: impl Into<String>,
) -> Result<Self, tonic::transport::Error> {
let ep = transport_endpoint(&server_addr);
let channel = Channel::from_shared(ep).unwrap().connect().await?;
let direct_endpoints = parse_transport_endpoints(&server_addr);
let mut last_error = None;
let mut selected_addr = None;
let mut channel = None;
for endpoint in &direct_endpoints {
match Channel::from_shared(transport_endpoint(endpoint)) {
Ok(endpoint_builder) => match endpoint_builder.connect().await {
Ok(connected) => {
selected_addr = Some(endpoint.clone());
channel = Some(connected);
break;
}
Err(error) => {
last_error = Some(error);
}
},
Err(_) => {}
}
}
let selected_addr = if let Some(addr) = selected_addr {
addr
} else if let Some(error) = last_error {
return Err(error);
} else {
return Err(
Channel::from_shared("http://127.0.0.1:1".to_string())
.unwrap()
.connect()
.await
.expect_err("unreachable fallback endpoint should fail to connect"),
);
};
let channel = channel.expect("direct connect should produce a channel when selected");
Ok(Self {
channels: Arc::new(Mutex::new(HashMap::new())),
direct_addr: Some(server_addr),
direct_addr: Some(selected_addr),
tso_client: Some(TsoClient::new(channel.clone())),
pd_client: Some(PdClient::new(channel)),
chainfire_kv_client: None,
region_cache: RegionCache::new(),
namespace: namespace.into(),
chainfire_route_cache: Arc::new(Mutex::new(None)),
})
}
@ -165,6 +241,7 @@ impl RdbClient {
}
self.region_cache.clear().await;
self.invalidate_chainfire_route_cache().await;
if let Some(chainfire_kv_client) = &self.chainfire_kv_client {
return self.resolve_addr_via_chainfire(key, chainfire_kv_client.clone()).await;
@ -183,10 +260,6 @@ impl RdbClient {
Err(tonic::Status::not_found("region not found"))
}
async fn get_channel(&self, addr: &str) -> Result<Channel, tonic::transport::Error> {
Self::get_channel_from_map(&self.channels, addr).await
}
async fn get_channel_from_map(
channels: &Arc<Mutex<HashMap<String, Channel>>>,
addr: &str,
@ -207,6 +280,73 @@ impl RdbClient {
map.remove(addr);
}
/// Drop the cached ChainFire routing snapshot so the next routed
/// lookup re-fetches region/store metadata instead of reusing a
/// possibly-stale view.
async fn invalidate_chainfire_route_cache(&self) {
    let mut cache = self.chainfire_route_cache.lock().await;
    *cache = None;
}
/// Return a ChainFire routing snapshot, serving from the shared cache
/// when it is younger than `CHAINFIRE_ROUTE_CACHE_TTL`.
///
/// When `force_refresh` is set, or the cached snapshot is missing or
/// expired, regions and stores are re-fetched from ChainFire and the
/// cache is repopulated. Note the mutex is NOT held across the two
/// RPCs, so concurrent callers may refresh in parallel; the last
/// writer wins, which is acceptable for a freshness cache.
async fn chainfire_route_snapshot(
    &self,
    mut kv_client: ChainfireKvClient<Channel>,
    force_refresh: bool,
) -> Result<ChainfireRouteSnapshot, tonic::Status> {
    if !force_refresh {
        // Fast path: reuse the cached snapshot while it is within TTL.
        if let Some(snapshot) = self.chainfire_route_cache.lock().await.clone() {
            if snapshot.fetched_at.elapsed() <= Self::CHAINFIRE_ROUTE_CACHE_TTL {
                return Ok(snapshot);
            }
        }
    }

    // Slow path: fetch fresh region and store listings from ChainFire.
    let regions = list_chainfire_regions(&mut kv_client).await?;
    let stores = list_chainfire_stores(&mut kv_client).await?;
    let snapshot = ChainfireRouteSnapshot {
        stores,
        regions,
        fetched_at: Instant::now(),
    };

    let mut cache = self.chainfire_route_cache.lock().await;
    *cache = Some(snapshot.clone());
    Ok(snapshot)
}
/// Resolve `key` to its owning region and that region's leader store
/// using a previously fetched routing snapshot.
///
/// A region matches when `key` falls in `[start_key, end_key)`, where
/// an empty `start_key`/`end_key` means unbounded on that side. The
/// first matching region (snapshot order) wins. Returns
/// `Status::not_found` when no region covers the key, or when the
/// matched region's `leader_id` has no entry in `snapshot.stores`.
fn resolve_addr_from_chainfire_snapshot(
    &self,
    key: &[u8],
    snapshot: &ChainfireRouteSnapshot,
) -> Result<(Region, Store), tonic::Status> {
    let region = snapshot
        .regions
        .iter()
        .find(|region| {
            // Empty boundary keys act as -inf / +inf respectively.
            let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
            let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
            start_ok && end_ok
        })
        .cloned()
        .ok_or_else(|| tonic::Status::not_found("region not found"))?;
    let leader = snapshot
        .stores
        .get(&region.leader_id)
        .cloned()
        .ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
    Ok((
        Region {
            id: region.id,
            start_key: region.start_key,
            end_key: region.end_key,
            peers: region.peers,
            leader_id: region.leader_id,
        },
        Store {
            id: leader.id,
            addr: leader.addr,
        },
    ))
}
async fn with_routed_addr<T, F, Fut>(&self, key: &[u8], mut op: F) -> Result<T, tonic::Status>
where
F: FnMut(String) -> Fut,
@ -590,41 +730,21 @@ impl RdbClient {
async fn resolve_addr_via_chainfire(
&self,
key: &[u8],
mut kv_client: ChainfireKvClient<Channel>,
kv_client: ChainfireKvClient<Channel>,
) -> Result<String, tonic::Status> {
let regions = list_chainfire_regions(&mut kv_client).await?;
let stores = list_chainfire_stores(&mut kv_client).await?;
for force_refresh in [false, true] {
let snapshot = self
.chainfire_route_snapshot(kv_client.clone(), force_refresh)
.await?;
if let Ok((region, leader)) =
self.resolve_addr_from_chainfire_snapshot(key, &snapshot)
{
self.region_cache.update(region, leader.clone()).await;
return Ok(leader.addr);
}
}
let region = regions
.into_iter()
.find(|region| {
let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
start_ok && end_ok
})
.ok_or_else(|| tonic::Status::not_found("region not found"))?;
let leader = stores
.get(&region.leader_id)
.ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
self.region_cache
.update(
Region {
id: region.id,
start_key: region.start_key,
end_key: region.end_key,
peers: region.peers,
leader_id: region.leader_id,
},
Store {
id: leader.id,
addr: leader.addr.clone(),
},
)
.await;
Ok(leader.addr.clone())
Err(tonic::Status::not_found("region not found"))
}
}
@ -636,6 +756,23 @@ fn transport_endpoint(addr: &str) -> String {
}
}
/// Canonicalize a transport address for comparison and dialing:
/// surrounding whitespace, any leading `http://`/`https://` scheme
/// prefixes, and all trailing slashes are removed.
fn normalize_transport_addr(addr: &str) -> String {
    let mut rest = addr.trim();
    // Repeatedly peel scheme prefixes, mirroring trim_start_matches
    // semantics (e.g. "http://http://x" -> "x").
    while let Some(stripped) = rest.strip_prefix("http://") {
        rest = stripped;
    }
    while let Some(stripped) = rest.strip_prefix("https://") {
        rest = stripped;
    }
    rest.trim_end_matches('/').to_string()
}
fn parse_transport_endpoints(addrs: &str) -> Vec<String> {
addrs
.split(',')
.map(str::trim)
.filter(|item| !item.is_empty())
.map(normalize_transport_addr)
.collect()
}
fn prefix_range_end(prefix: &str) -> Vec<u8> {
let mut end = prefix.as_bytes().to_vec();
if let Some(last) = end.last_mut() {
@ -696,7 +833,7 @@ async fn list_chainfire_regions(
#[cfg(test)]
mod tests {
use super::RdbClient;
use super::{RdbClient, normalize_transport_addr, parse_transport_endpoints};
#[test]
fn unknown_transport_errors_are_treated_as_retryable_routes() {
@ -711,4 +848,20 @@ mod tests {
assert!(RdbClient::is_retryable_route_error(&status));
assert!(!RdbClient::is_transport_error(&status));
}
#[test]
fn parse_transport_endpoints_accepts_comma_separated_values() {
    // Scheme prefixes, surrounding whitespace, and trailing slashes
    // are stripped from each comma-separated endpoint.
    assert_eq!(
        parse_transport_endpoints("http://10.0.0.1:2379, 10.0.0.2:2379/"),
        vec!["10.0.0.1:2379".to_string(), "10.0.0.2:2379".to_string()]
    );
}
#[test]
fn normalize_transport_addr_strips_scheme_and_slashes() {
    // https:// prefix and the trailing slash are both removed.
    assert_eq!(
        normalize_transport_addr("https://10.0.0.1:2479/"),
        "10.0.0.1:2479".to_string()
    );
}
}

View file

@ -10,6 +10,9 @@ struct Args {
#[arg(long, default_value = "127.0.0.1:2479")]
pd_addr: String,
#[arg(long, default_value = "")]
namespace: String,
#[command(subcommand)]
command: Commands,
}
@ -44,7 +47,8 @@ enum Commands {
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
let mut client = RdbClient::connect_with_pd(args.addr, args.pd_addr).await?;
let mut client =
RdbClient::connect_with_pd_namespace(args.addr, args.pd_addr, args.namespace).await?;
match args.command {
Commands::RawPut { key, value } => {

View file

@ -28,7 +28,7 @@ impl Cluster {
}
}
pub fn register_store(&self, addr: String) -> u64 {
pub fn register_store(&self, addr: String, requested_id: Option<u64>) -> u64 {
let mut state = self.inner.lock().unwrap();
// Dedup check? For now, always new ID.
@ -39,8 +39,15 @@ impl Cluster {
}
}
let id = state.next_store_id;
let id = requested_id
.filter(|id| *id != 0 && !state.stores.contains_key(id))
.unwrap_or_else(|| {
while state.stores.contains_key(&state.next_store_id) {
state.next_store_id += 1;
}
state.next_store_id
});
state.next_store_id = state.next_store_id.max(id.saturating_add(1));
state.stores.insert(id, Store { id, addr });

View file

@ -46,7 +46,8 @@ impl Pd for PdServiceImpl {
request: Request<RegisterStoreRequest>,
) -> Result<Response<RegisterStoreResponse>, Status> {
let req = request.into_inner();
let store_id = self.cluster.register_store(req.addr);
let requested_store_id = (req.store_id != 0).then_some(req.store_id);
let store_id = self.cluster.register_store(req.addr, requested_store_id);
Ok(Response::new(RegisterStoreResponse {
store_id,
cluster_id: 1, // fixed for now

View file

@ -29,6 +29,7 @@ service Pd {
message RegisterStoreRequest {
string addr = 1; // e.g., "127.0.0.1:50051"
uint64 store_id = 2; // Optional requested store ID (0 = auto-assign)
}
message RegisterStoreResponse {

View file

@ -1,23 +1,38 @@
use crate::store::Store;
use flaredb_proto::pdpb::pd_client::PdClient;
use flaredb_proto::pdpb::ListRegionsRequest;
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
use flaredb_types::RegionMeta;
use std::sync::Arc;
use tokio::time::{sleep, Duration};
/// Periodically send region/store heartbeat to PD.
pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
pub async fn start_heartbeat(
pd_addr: String,
store: Arc<Store>,
server_addr: String,
requested_store_id: u64,
) {
tokio::spawn(async move {
let endpoint = format!("http://{}", pd_addr);
loop {
if let Ok(mut client) = PdClient::connect(endpoint.clone()).await {
if let Err(err) = client
.register_store(RegisterStoreRequest {
addr: server_addr.clone(),
store_id: requested_store_id,
})
.await
{
tracing::warn!("failed to register store with legacy PD: {}", err);
}
// list regions to keep routing fresh
if let Ok(resp) = client.list_regions(ListRegionsRequest {}).await {
let resp = resp.into_inner();
let mut metas = Vec::new();
for r in resp.regions {
let voters = if r.peers.is_empty() {
Vec::new()
vec![store.store_id()]
} else {
r.peers.clone()
};
@ -27,11 +42,7 @@ pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
start_key: r.start_key,
end_key: r.end_key,
},
if voters.is_empty() {
vec![store.store_id()]
} else {
voters
},
voters,
));
}
if !metas.is_empty() {

View file

@ -1,6 +1,8 @@
use clap::Parser;
use flaredb_proto::kvrpc::kv_cas_server::KvCasServer;
use flaredb_proto::kvrpc::kv_raw_server::KvRawServer;
use flaredb_proto::pdpb::pd_client::PdClient as LegacyPdClient;
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
use flaredb_proto::raft_server::raft_service_server::RaftServiceServer;
use flaredb_proto::sqlrpc::sql_service_server::SqlServiceServer;
use flaredb_server::config::{self, Config, NamespaceManager};
@ -12,7 +14,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
use tokio::time::{sleep, Duration};
use tonic::transport::{Certificate, Identity, Server, ServerTlsConfig};
use tonic::transport::{Certificate, Channel, Identity, Server, ServerTlsConfig};
use tonic_health::server::health_reporter;
use tracing::{info, warn}; // Import warn
use tracing_subscriber::EnvFilter;
@ -27,7 +29,7 @@ mod service;
mod sql_service;
mod store;
use pd_client::{PdClient, PdEvent};
use pd_client::{PdClient as ChainfirePdClient, PdEvent};
const RAFT_GRPC_MESSAGE_SIZE: usize = 64 * 1024 * 1024;
@ -35,14 +37,18 @@ async fn connect_pd_with_retry(
pd_endpoints: &[String],
attempts: u32,
delay: Duration,
) -> Option<PdClient> {
) -> Option<ChainfirePdClient> {
let mut last_error = None;
for attempt in 1..=attempts {
match PdClient::connect_any(pd_endpoints).await {
match ChainfirePdClient::connect_any(pd_endpoints).await {
Ok(client) => return Some(client),
Err(err) => {
last_error = Some(err.to_string());
let protocol_mismatch = last_error
.as_deref()
.map(|msg| msg.contains("Unimplemented"))
.unwrap_or(false);
warn!(
attempt,
attempts,
@ -50,6 +56,13 @@ async fn connect_pd_with_retry(
error = last_error.as_deref().unwrap_or("unknown"),
"Failed to connect to FlareDB PD"
);
if protocol_mismatch {
warn!(
?pd_endpoints,
"PD endpoint does not speak ChainFire; falling back to legacy PD"
);
return None;
}
if attempt < attempts {
sleep(delay).await;
}
@ -65,6 +78,49 @@ async fn connect_pd_with_retry(
None
}
/// Try to connect to a legacy FlareDB PD, cycling through every
/// endpoint on each attempt.
///
/// Endpoints without a scheme get `http://` prepended for dialing,
/// but the ORIGINAL endpoint string is what is returned alongside the
/// client on success. Between attempts (not between endpoints within
/// an attempt) a warning is logged and `delay` is slept. Returns
/// `None` after `attempts` full passes, logging the last error seen.
async fn connect_legacy_pd_with_retry(
    pd_endpoints: &[String],
    attempts: u32,
    delay: Duration,
) -> Option<(String, LegacyPdClient<Channel>)> {
    let mut last_error = None;
    for attempt in 1..=attempts {
        for endpoint in pd_endpoints {
            // Dial with an explicit scheme; tonic requires a URI.
            let transport = if endpoint.starts_with("http") {
                endpoint.clone()
            } else {
                format!("http://{}", endpoint)
            };
            match LegacyPdClient::connect(transport.clone()).await {
                Ok(client) => return Some((endpoint.clone(), client)),
                Err(err) => {
                    last_error = Some(format!("{}: {}", endpoint, err));
                }
            }
        }
        warn!(
            attempt,
            attempts,
            ?pd_endpoints,
            error = last_error.as_deref().unwrap_or("unknown"),
            "Failed to connect to legacy FlareDB PD"
        );
        if attempt < attempts {
            sleep(delay).await;
        }
    }
    warn!(
        ?pd_endpoints,
        error = last_error.as_deref().unwrap_or("unknown"),
        "Exhausted legacy FlareDB PD connection retries"
    );
    None
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
@ -334,7 +390,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let server_addr_string = server_config.addr.to_string();
tokio::spawn(async move {
let client = Arc::new(Mutex::new(
PdClient::connect_any(&pd_endpoints_for_task).await.ok(),
ChainfirePdClient::connect_any(&pd_endpoints_for_task)
.await
.ok(),
));
loop {
@ -396,7 +454,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
} else {
// Try to reconnect
if let Ok(new_client) = PdClient::connect_any(&pd_endpoints_for_task).await
if let Ok(new_client) =
ChainfirePdClient::connect_any(&pd_endpoints_for_task).await
{
info!("Reconnected to PD");
*guard = Some(new_client);
@ -406,6 +465,75 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
sleep(Duration::from_secs(10)).await;
}
});
} else if let Some((legacy_pd_addr, mut legacy_pd_client)) =
connect_legacy_pd_with_retry(&pd_endpoints, 3, Duration::from_secs(1)).await
{
info!(pd_addr = %legacy_pd_addr, "Connected to legacy FlareDB PD");
match legacy_pd_client
.register_store(RegisterStoreRequest {
addr: server_config.addr.to_string(),
store_id: server_config.store_id,
})
.await
{
Ok(resp) => {
let resp = resp.into_inner();
if resp.store_id != 0 && resp.store_id != server_config.store_id {
warn!(
expected_store_id = server_config.store_id,
assigned_store_id = resp.store_id,
"legacy PD assigned a different store id than local config"
);
}
}
Err(err) => warn!("failed to register with legacy PD: {}", err),
}
let mut region_metas = Vec::new();
match legacy_pd_client.list_regions(ListRegionsRequest {}).await {
Ok(resp) => {
for region in resp.into_inner().regions {
let voters = if region.peers.is_empty() || region.peers.len() < voters.len() {
voters.clone()
} else {
region.peers.clone()
};
region_metas.push((
RegionMeta {
id: region.id,
start_key: region.start_key,
end_key: region.end_key,
},
voters,
));
}
}
Err(err) => warn!("failed to list regions from legacy PD: {}", err),
}
if region_metas.is_empty() {
region_metas.push((
RegionMeta {
id: 1,
start_key: Vec::new(),
end_key: Vec::new(),
},
voters.clone(),
));
}
if let Err(e) = store.bootstrap_regions(region_metas).await {
warn!("failed to bootstrap regions from legacy PD: {}", e);
}
heartbeat::start_heartbeat(
legacy_pd_addr,
store.clone(),
server_config.addr.to_string(),
server_config.store_id,
)
.await;
} else {
info!("Starting in standalone mode with default region...");
let _ = store
@ -494,6 +622,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
server_addr: server_config.addr.to_string(),
pd_endpoints: pd_endpoints.clone(),
store_id: server_config.store_id,
configured_peers: (*peer_addrs).clone(),
};
let rest_app = rest::build_router(rest_state);
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;

View file

@ -16,8 +16,8 @@ use axum::{
};
use crate::pd_client::PdClient;
use flaredb_client::RdbClient;
use flaredb_sql::executor::{ExecutionResult, SqlExecutor};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// REST API state
@ -26,6 +26,7 @@ pub struct RestApiState {
pub server_addr: String,
pub pd_endpoints: Vec<String>,
pub store_id: u64,
pub configured_peers: HashMap<u64, String>,
}
/// Standard REST error response
@ -136,6 +137,15 @@ pub struct AddPeerRequest {
pub peer_id: u64,
}
/// Legacy/admin add member request for first-boot compatibility.
///
/// Accepted on `POST /admin/member/add`; shaped to match the join payload
/// emitted by the legacy bootstrap tooling.
#[derive(Debug, Deserialize)]
pub struct AddMemberRequestLegacy {
// Joining node's identifier. May be a numeric store id (e.g. "2") or a
// symbolic name (e.g. "node02"); `resolve_join_peer` handles both forms.
pub id: String,
// Raft transport address the node advertises; used for host matching
// when `id` is not numeric and `addr` is absent.
pub raft_addr: String,
// Optional explicit serving address; when omitted, the address is looked
// up in the statically configured peer map.
#[serde(default)]
pub addr: Option<String>,
}
/// Region info response
#[derive(Debug, Serialize)]
pub struct RegionResponse {
@ -153,6 +163,7 @@ pub fn build_router(state: RestApiState) -> Router {
.route("/api/v1/scan", get(scan_kv))
.route("/api/v1/regions/{id}", get(get_region))
.route("/api/v1/regions/{id}/add_peer", post(add_peer_to_region))
.route("/admin/member/add", post(add_member_legacy))
.route("/health", get(health_check))
.with_state(state)
}
@ -320,6 +331,121 @@ async fn add_peer_to_region(
})))
}
/// POST /admin/member/add - first-boot compatible cluster join hook.
///
/// Flow: resolve the joining peer's (store id, address), register the store
/// with PD, ensure at least one region exists, then add the peer to every
/// region that does not already contain it.
///
/// Response codes:
/// - 201 Created: peer was newly registered.
/// - 200 OK: peer was already registered but at least one region was updated.
/// - 409 Conflict: peer was already registered and no region changed (fully
///   idempotent re-join).
/// - 400/503/500 on resolution, PD-connectivity, or PD-write failures.
async fn add_member_legacy(
State(state): State<RestApiState>,
Json(req): Json<AddMemberRequestLegacy>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
// Map the loosely-typed legacy payload onto a concrete store id + address.
let (peer_id, peer_addr) = resolve_join_peer(&state, &req).ok_or_else(|| {
error_response(
StatusCode::BAD_REQUEST,
"INVALID_MEMBER",
"Unable to resolve FlareDB peer id/address from join request",
)
})?;
let mut pd_client = PdClient::connect_any(&state.pd_endpoints)
.await
.map_err(|e| error_response(StatusCode::SERVICE_UNAVAILABLE, "PD_UNAVAILABLE", &format!("Failed to connect to PD: {}", e)))?;
// Snapshot registration state *before* registering so the status code can
// distinguish a fresh join from a repeat.
let stores = pd_client.list_stores().await;
let already_registered = stores.iter().any(|store| store.id == peer_id);
// Registration is re-issued unconditionally; PD treats it as an upsert.
pd_client
.register_store(peer_id, peer_addr.clone())
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
let mut regions = pd_client.list_regions().await;
if regions.is_empty() {
// First boot: seed a single full-keyspace region covering this store
// and the joining peer.
pd_client
.init_default_region(vec![state.store_id, peer_id])
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
// Mirror what PD just created so the loop below sees a consistent view
// without a second round-trip.
regions = vec![crate::pd_client::RegionInfo {
id: 1,
start_key: Vec::new(),
end_key: Vec::new(),
peers: vec![state.store_id, peer_id],
leader_id: 0,
}];
}
// Add the peer to every region that lacks it; record which regions changed.
let mut updated_regions = Vec::new();
for mut region in regions {
if !region.peers.contains(&peer_id) {
region.peers.push(peer_id);
// Keep peer lists in a canonical order for stable comparisons.
region.peers.sort_unstable();
pd_client
.put_region(region.clone())
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
updated_regions.push(region.id);
}
}
let status = if already_registered && updated_regions.is_empty() {
StatusCode::CONFLICT
} else if already_registered {
StatusCode::OK
} else {
StatusCode::CREATED
};
Ok((
status,
Json(SuccessResponse::new(serde_json::json!({
"peer_id": peer_id,
"addr": peer_addr,
"updated_regions": updated_regions,
"already_registered": already_registered,
}))),
))
}
/// Resolve a legacy join request to a concrete `(store_id, serving_addr)`.
///
/// Resolution order:
/// 1. If `req.id` parses as a numeric store id, prefer the request's explicit
///    `addr`, falling back to the statically configured address for that id.
/// 2. Otherwise, match the host portion of `req.addr` (or `req.raft_addr`)
///    against the hosts of the configured peers.
///
/// Returns `None` when neither strategy produces a usable pair.
fn resolve_join_peer(
    state: &RestApiState,
    req: &AddMemberRequestLegacy,
) -> Option<(u64, String)> {
    // Fast path: the legacy tooling sometimes sends the numeric store id.
    if let Ok(numeric_id) = req.id.parse::<u64>() {
        let explicit = req.addr.clone();
        let configured = state.configured_peers.get(&numeric_id).cloned();
        if let Some(resolved_addr) = explicit.or(configured) {
            return Some((numeric_id, resolved_addr));
        }
    }

    // Fallback: match by host against the configured peer map. The raft
    // address is used only when no serving address was supplied.
    let wanted_host = socket_host(req.addr.as_deref().unwrap_or(&req.raft_addr));
    for (id, addr) in &state.configured_peers {
        if socket_host(addr) == wanted_host {
            return Some((*id, addr.clone()));
        }
    }
    None
}
/// Extract the host portion of an address string.
///
/// Accepts bare `host:port` pairs as well as `http(s)://host:port/...` URLs.
/// Parseable socket addresses (including bracketed IPv6) yield the canonical
/// IP text; otherwise the text before the final `:` is returned with IPv6
/// brackets stripped, or the whole string when no port separator exists.
fn socket_host(addr: &str) -> String {
    // Drop surrounding whitespace, any scheme prefix, and any URL path.
    let stripped = addr
        .trim()
        .trim_start_matches("http://")
        .trim_start_matches("https://");
    let authority = stripped.split('/').next().unwrap_or(addr).to_string();

    // A well-formed socket address gives us a canonical IP rendering.
    if let Ok(sock) = authority.parse::<std::net::SocketAddr>() {
        return sock.ip().to_string();
    }

    // Heuristic fallback: everything before the last ':' is the host.
    match authority.rsplit_once(':') {
        Some((host, _port)) => host.trim_matches(['[', ']']).to_string(),
        None => authority,
    }
}
/// Helper to create error response
fn error_response(
status: StatusCode,
@ -338,3 +464,51 @@ fn error_response(
}),
)
}
#[cfg(test)]
mod tests {
use super::*;
// Fixture: a node with store id 1 and three statically configured peers,
// mirroring a typical three-node first-boot topology.
fn test_state() -> RestApiState {
RestApiState {
server_addr: "127.0.0.1:50052".to_string(),
pd_endpoints: vec!["127.0.0.1:2479".to_string()],
store_id: 1,
configured_peers: HashMap::from([
(1, "10.100.0.11:50052".to_string()),
(2, "10.100.0.12:50052".to_string()),
(3, "10.100.0.13:50052".to_string()),
]),
}
}
// Numeric id path: "2" resolves directly through `configured_peers`
// without consulting the raft address.
#[test]
fn resolve_join_peer_uses_numeric_id_when_available() {
let state = test_state();
let req = AddMemberRequestLegacy {
id: "2".to_string(),
raft_addr: "10.100.0.12:2380".to_string(),
addr: None,
};
assert_eq!(
resolve_join_peer(&state, &req),
Some((2, "10.100.0.12:50052".to_string()))
);
}
// Symbolic id path: "node02" is not numeric, so the peer is found by
// matching the raft address host against configured peer hosts.
#[test]
fn resolve_join_peer_matches_host_from_raft_addr() {
let state = test_state();
let req = AddMemberRequestLegacy {
id: "node02".to_string(),
raft_addr: "10.100.0.12:2380".to_string(),
addr: None,
};
assert_eq!(
resolve_join_peer(&state, &req),
Some((2, "10.100.0.12:50052".to_string()))
);
}
}

View file

@ -16,7 +16,7 @@
};
rustToolchain = pkgs.rust-bin.stable.latest.default.override {
extensions = [ "rust-src" "rust-analyzer" ];
extensions = [ "rust-src" "rust-analyzer" "rustfmt" ];
};
in

View file

@ -6,13 +6,43 @@ if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
WORKDIR=$(mktemp -d)
PD_LOG="${WORKDIR}/flaredb-pd.log"
SERVER_LOG="${WORKDIR}/flaredb-server.log"
DATA_DIR="${WORKDIR}/data"
# Invoke the FlareDB client against the local single-node deployment,
# retrying up to 20 times (1s apart) so the script tolerates slow server
# startup. On success, echoes only the last non-empty output line; after
# exhausting retries, dumps the final output to stderr and returns the
# client's last exit status.
run_client() {
  local output=""
  local status=0
  local attempt
  for attempt in $(seq 1 20); do
    if output=$(cargo run --quiet --bin flaredb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 "$@" 2>&1); then
      printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
      return 0
    fi
    status=$?
    sleep 1
  done
  printf '%s\n' "${output}" >&2
  return "${status}"
}
# EXIT-trap handler: stop background processes, then either preserve the
# scratch directory (with log tails) on failure or remove it on success.
cleanup() {
# Capture the script's exit status before any command below overwrites $?.
local exit_code=$?
if [[ -n "${SERVER_PID:-}" ]]; then
kill "$SERVER_PID" >/dev/null 2>&1 || true
fi
if [[ -n "${PD_PID:-}" ]]; then
kill "$PD_PID" >/dev/null 2>&1 || true
fi
if (( exit_code != 0 )); then
# Keep ${WORKDIR} and surface recent log lines so CI output alone is
# enough to start diagnosing the failure.
echo "verify-core failed; logs preserved at ${WORKDIR}" >&2
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
[[ -f "${SERVER_LOG}" ]] && { echo "--- ${SERVER_LOG} ---" >&2; tail -n 200 "${SERVER_LOG}" >&2; }
return "${exit_code}"
fi
rm -rf "${WORKDIR}"
}
trap cleanup EXIT
@ -23,30 +53,38 @@ echo "Running tests..."
cargo test
echo "Starting PD..."
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 >/tmp/rdb-pd.log 2>&1 &
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
PD_PID=$!
sleep 2
echo "Starting Server..."
cargo run --bin rdb-server -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 --data-dir /tmp/rdb-server >/tmp/rdb-server.log 2>&1 &
cargo run --bin flaredb-server -- \
--pd-addr 127.0.0.1:2479 \
--addr 127.0.0.1:50052 \
--data-dir "${DATA_DIR}" \
--namespace-mode raw=eventual \
--namespace-mode cas=strong \
>"${SERVER_LOG}" 2>&1 &
SERVER_PID=$!
sleep 2
echo "Running Client Verification..."
echo "Testing TSO..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 tso
TSO_OUTPUT=$(run_client tso)
[[ "${TSO_OUTPUT}" == Timestamp:* ]]
echo "Testing Raw Put/Get..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-put --key foo --value bar
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-get --key foo
run_client --namespace raw raw-put --key foo --value bar >/dev/null
RAW_VALUE=$(run_client --namespace raw raw-get --key foo)
[[ "${RAW_VALUE}" == "bar" ]]
echo "Testing CAS success..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v1 --expected 0
CAS_SUCCESS=$(run_client --namespace cas cas --key cas1 --value v1 --expected 0)
[[ "${CAS_SUCCESS}" == Success,* ]]
echo "Testing CAS conflict..."
set +e
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v2 --expected 0
set -e
CAS_CONFLICT=$(run_client --namespace cas cas --key cas1 --value v2 --expected 0)
[[ "${CAS_CONFLICT}" == Conflict!* ]]
echo "Verification Complete!"

View file

@ -1,14 +1,17 @@
#!/usr/bin/env bash
set -euo pipefail
# Run key Multi-Raft test suites.
echo "[verify] Running multi-region routing tests..."
nix develop -c cargo test -q rdb-server::tests::test_multi_region
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
echo "[verify] Running split tests..."
nix develop -c cargo test -q rdb-server::tests::test_split
echo "[verify] Running persistent snapshot recovery tests..."
cargo test -p flaredb-raft persistent_storage::tests::test_snapshot_persistence_and_recovery
echo "[verify] Running confchange/move tests..."
nix develop -c cargo test -q rdb-server::tests::test_confchange_move
echo "[verify] Running leader election tests..."
cargo test -p flaredb-raft raft_node::tests::test_leader_election
echo "[verify] Running server read-path tests..."
cargo test -p flaredb-server service::tests::scan_returns_decoded_cas_keys
echo "[verify] Done."

View file

@ -1,12 +1,23 @@
#!/usr/bin/env bash
set -euo pipefail
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
export LIBCLANG_PATH=${LIBCLANG_PATH:-/nix/store/0zn99g048j67syaq97rczq5z0j8dsvc8-clang-21.1.2-lib/lib}
echo "[verify] formatting..."
cargo fmt --all
if ! find . \
-path ./target -prune -o \
-name '*.rs' -print0 | xargs -0 rustfmt --check; then
echo "[verify] rustfmt drift detected; continuing with runtime tests" >&2
fi
echo "[verify] running rdb-server tests..."
nix-shell -p protobuf --run "LIBCLANG_PATH=${LIBCLANG_PATH} cargo test -p rdb-server --tests"
echo "[verify] running FlareDB server tests..."
cargo test -p flaredb-server --tests
echo "[verify] running FlareDB raft tests..."
cargo test -p flaredb-raft
echo "[verify] done."

View file

@ -1,40 +1,103 @@
#!/usr/bin/env bash
set -e
set -euo pipefail
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
WORKDIR=$(mktemp -d)
PD_LOG="${WORKDIR}/flaredb-pd.log"
S1_LOG="${WORKDIR}/flaredb-server-1.log"
S2_LOG="${WORKDIR}/flaredb-server-2.log"
# Usage: run_client <server-addr> [client args...]
# Runs the FlareDB client against the given server, retrying up to 20
# times (1s apart) to ride out server startup and leader settling. On
# success, echoes only the last non-empty output line; after exhausting
# retries, dumps the final output to stderr and returns the client's
# last exit status.
run_client() {
  local addr="$1"
  shift
  local output=""
  local status=0
  local attempt=0
  until (( attempt >= 20 )); do
    if output=$(cargo run --quiet --bin flaredb-client -- --addr "${addr}" --pd-addr 127.0.0.1:2479 "$@" 2>&1); then
      printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
      return 0
    fi
    status=$?
    attempt=$((attempt + 1))
    sleep 1
  done
  printf '%s\n' "${output}" >&2
  return "${status}"
}
# EXIT-trap handler: stop PD and both servers, then either preserve the
# scratch directory (with log tails) on failure or remove it on success.
cleanup() {
# Capture the script's exit status before any command below overwrites $?.
local exit_code=$?
if [[ -n "${PD_PID:-}" ]]; then
kill "${PD_PID}" >/dev/null 2>&1 || true
fi
if [[ -n "${S1_PID:-}" ]]; then
kill "${S1_PID}" >/dev/null 2>&1 || true
fi
if [[ -n "${S2_PID:-}" ]]; then
kill "${S2_PID}" >/dev/null 2>&1 || true
fi
if (( exit_code != 0 )); then
# Keep ${WORKDIR} and surface recent log lines from all three
# processes so the failure can be diagnosed from CI output alone.
echo "verify-sharding failed; logs preserved at ${WORKDIR}" >&2
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
[[ -f "${S1_LOG}" ]] && { echo "--- ${S1_LOG} ---" >&2; tail -n 200 "${S1_LOG}" >&2; }
[[ -f "${S2_LOG}" ]] && { echo "--- ${S2_LOG} ---" >&2; tail -n 200 "${S2_LOG}" >&2; }
return "${exit_code}"
fi
rm -rf "${WORKDIR}"
}
trap cleanup EXIT
echo "Building workspace..."
cargo build
echo "Starting PD..."
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 &
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
PD_PID=$!
sleep 2
echo "Starting Server 1 (127.0.0.1:50001, data1)..."
# Port 50001
cargo run --bin rdb-server -- --addr 127.0.0.1:50001 --data-dir data1 --pd-addr 127.0.0.1:2479 &
cargo run --bin flaredb-server -- \
--store-id 1 \
--addr 127.0.0.1:50001 \
--http-addr 127.0.0.1:8083 \
--data-dir "${WORKDIR}/data1" \
--pd-addr 127.0.0.1:2479 \
--metrics-port 9093 \
--namespace-mode raw=eventual \
>"${S1_LOG}" 2>&1 &
S1_PID=$!
sleep 4
echo "Starting Server 2 (127.0.0.1:50002, data2)..."
# Port 50002
cargo run --bin rdb-server -- --addr 127.0.0.1:50002 --data-dir data2 --pd-addr 127.0.0.1:2479 &
cargo run --bin flaredb-server -- \
--store-id 2 \
--addr 127.0.0.1:50002 \
--http-addr 127.0.0.1:8084 \
--data-dir "${WORKDIR}/data2" \
--pd-addr 127.0.0.1:2479 \
--metrics-port 9094 \
--namespace-mode raw=eventual \
>"${S2_LOG}" 2>&1 &
S2_PID=$!
sleep 5 # Wait for registration
sleep 5 # Wait for registration and leader routing to settle
echo "Running Client Verification (Sharding)..."
echo "Running Client Verification (multi-node routing smoke)..."
# Put 'a' (Should go to S1)
echo "Testing Put 'a'..."
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key a --value val_a
run_client 127.0.0.1:50001 --namespace raw raw-put --key a --value val_a >/dev/null
# Put 'z' (Should go to S2)
echo "Testing Put 'z'..."
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key z --value val_z
run_client 127.0.0.1:50002 --namespace raw raw-put --key z --value val_z >/dev/null
# Cleanup
kill $PD_PID
kill $S1_PID
kill $S2_PID
rm -rf data1 data2
echo "Testing reads from both nodes..."
VALUE_A=$(run_client 127.0.0.1:50002 --namespace raw raw-get --key a)
VALUE_Z=$(run_client 127.0.0.1:50001 --namespace raw raw-get --key z)
[[ "${VALUE_A}" == "val_a" ]]
[[ "${VALUE_Z}" == "val_z" ]]
echo "Sharding Verification Complete!"

607
flashdns/Cargo.lock generated

File diff suppressed because it is too large Load diff

621
iam/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -23,6 +23,9 @@ prost = { workspace = true }
base64 = { workspace = true }
sha2 = { workspace = true }
uuid = { workspace = true }
aes-gcm = "0.10"
argon2 = "0.5"
rand_core = "0.6"
[dev-dependencies]
tokio = { workspace = true, features = ["full", "test-util"] }

View file

@ -8,12 +8,12 @@ use rand_core::{OsRng, RngCore};
use tonic::{Request, Response, Status};
use iam_store::CredentialStore;
use iam_types::{Argon2Params, CredentialRecord};
use iam_types::{Argon2Params, CredentialRecord, PrincipalKind as TypesPrincipalKind};
use crate::proto::{
iam_credential_server::IamCredential, CreateS3CredentialRequest,
CreateS3CredentialResponse, Credential, GetSecretKeyRequest, GetSecretKeyResponse,
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
ListCredentialsRequest, ListCredentialsResponse, PrincipalKind, RevokeCredentialRequest,
RevokeCredentialResponse,
};
@ -95,6 +95,15 @@ impl IamCredentialService {
}
}
/// Convert a wire-format principal kind discriminant into the internal enum.
///
/// Unknown discriminants are folded into `Unspecified` and rejected with
/// `InvalidArgument`, so callers always receive a concrete kind.
fn map_principal_kind(kind: i32) -> Result<TypesPrincipalKind, Status> {
    let decoded = PrincipalKind::try_from(kind).unwrap_or(PrincipalKind::Unspecified);
    if matches!(decoded, PrincipalKind::Unspecified) {
        return Err(Status::invalid_argument("principal_kind is required"));
    }
    Ok(match decoded {
        PrincipalKind::User => TypesPrincipalKind::User,
        PrincipalKind::ServiceAccount => TypesPrincipalKind::ServiceAccount,
        PrincipalKind::Group => TypesPrincipalKind::Group,
        // Rejected by the guard above.
        PrincipalKind::Unspecified => unreachable!("unspecified kind rejected above"),
    })
}
#[tonic::async_trait]
impl IamCredential for IamCredentialService {
async fn create_s3_credential(
@ -103,6 +112,7 @@ impl IamCredential for IamCredentialService {
) -> Result<Response<CreateS3CredentialResponse>, Status> {
let req = request.into_inner();
let now = now_ts();
let principal_kind = map_principal_kind(req.principal_kind)?;
let (secret_b64, raw_secret) = Self::generate_secret();
let (hash, kdf) = Self::hash_secret(&raw_secret);
let secret_enc = self.encrypt_secret(&raw_secret)?;
@ -111,6 +121,9 @@ impl IamCredential for IamCredentialService {
let record = CredentialRecord {
access_key_id: access_key_id.clone(),
principal_id: req.principal_id.clone(),
principal_kind,
org_id: req.org_id.clone(),
project_id: req.project_id.clone(),
created_at: now,
expires_at: req.expires_at,
revoked: false,
@ -168,6 +181,13 @@ impl IamCredential for IamCredentialService {
secret_key: STANDARD.encode(secret),
principal_id: record.principal_id,
expires_at: record.expires_at,
org_id: record.org_id,
project_id: record.project_id,
principal_kind: match record.principal_kind {
TypesPrincipalKind::User => PrincipalKind::User as i32,
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
},
}))
}
@ -190,6 +210,13 @@ impl IamCredential for IamCredentialService {
expires_at: c.expires_at,
revoked: c.revoked,
description: c.description.unwrap_or_default(),
org_id: c.org_id,
project_id: c.project_id,
principal_kind: match c.principal_kind {
TypesPrincipalKind::User => PrincipalKind::User as i32,
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
},
})
.collect();
Ok(Response::new(ListCredentialsResponse { credentials: creds }))
@ -230,6 +257,9 @@ mod tests {
principal_id: "p1".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -247,6 +277,9 @@ mod tests {
let fetched = STANDARD.decode(get.secret_key).unwrap();
assert_eq!(orig, fetched);
assert_eq!(get.principal_id, "p1");
assert_eq!(get.org_id.as_deref(), Some("org-a"));
assert_eq!(get.project_id.as_deref(), Some("project-a"));
assert_eq!(get.principal_kind, PrincipalKind::ServiceAccount as i32);
}
#[tokio::test]
@ -257,6 +290,9 @@ mod tests {
principal_id: "pA".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -266,6 +302,9 @@ mod tests {
principal_id: "pB".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-b".into()),
project_id: Some("project-b".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap();
@ -289,6 +328,9 @@ mod tests {
principal_id: "p1".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -297,7 +339,6 @@ mod tests {
let revoke1 = svc
.revoke_credential(Request::new(RevokeCredentialRequest {
access_key_id: created.access_key_id.clone(),
reason: "test".into(),
}))
.await
.unwrap()
@ -307,7 +348,6 @@ mod tests {
let revoke2 = svc
.revoke_credential(Request::new(RevokeCredentialRequest {
access_key_id: created.access_key_id.clone(),
reason: "again".into(),
}))
.await
.unwrap()
@ -330,6 +370,9 @@ mod tests {
let expired = CredentialRecord {
access_key_id: "expired-ak".into(),
principal_id: "p1".into(),
principal_kind: TypesPrincipalKind::ServiceAccount,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
created_at: now_ts(),
expires_at: Some(now_ts() - 10),
revoked: false,

View file

@ -1,4 +1,5 @@
mod conversions;
mod credential_service;
mod gateway_auth_service;
mod generated;
pub mod iam_service;
@ -8,7 +9,10 @@ pub mod proto {
pub use crate::generated::iam::v1::*;
}
pub use generated::iam::v1::{iam_admin_server, iam_authz_server, iam_token_server};
pub use generated::iam::v1::{
iam_admin_server, iam_authz_server, iam_credential_server, iam_token_server,
};
pub use credential_service::IamCredentialService;
pub use gateway_auth_service::GatewayAuthServiceImpl;
pub use iam_service::{IamAdminService, IamAuthzService};
pub use token_service::IamTokenService;

View file

@ -2,6 +2,7 @@
//!
//! Provides a thin gRPC client for interacting with the IAM service.
use std::future::Future;
use std::time::Duration;
use iam_api::proto::{
@ -19,6 +20,10 @@ use iam_types::{
};
use tonic::transport::{Channel, ClientTlsConfig, Endpoint};
const TRANSIENT_RPC_RETRY_ATTEMPTS: usize = 3;
const TRANSIENT_RPC_INITIAL_BACKOFF: Duration = Duration::from_millis(200);
const TRANSIENT_RPC_MAX_BACKOFF: Duration = Duration::from_millis(1_000);
/// Configuration for the IAM client
#[derive(Debug, Clone)]
pub struct IamClientConfig {
@ -100,6 +105,40 @@ impl IamClient {
IamTokenClient::new(self.channel.clone())
}
/// Run a gRPC operation, retrying transient failures with exponential
/// backoff (see `retry_delay` / `is_retryable_status`).
///
/// `op` is re-invoked from scratch on each attempt, so callers rebuild the
/// client and clone the request inside the closure. At most
/// `TRANSIENT_RPC_RETRY_ATTEMPTS` attempts are made.
///
/// Note on the final attempt: a retryable error on the last attempt fails
/// the `attempt + 1 < TRANSIENT_RPC_RETRY_ATTEMPTS` guard and is returned
/// via the non-retry arm, so the trailing `last_status` fallback only fires
/// if the loop body never runs (i.e. the attempt budget is zero).
async fn call_with_retry<T, F, Fut>(operation: &'static str, mut op: F) -> Result<T>
where
F: FnMut() -> Fut,
Fut: Future<Output = std::result::Result<T, tonic::Status>>,
{
let mut last_status = None;
for attempt in 0..TRANSIENT_RPC_RETRY_ATTEMPTS {
match op().await {
Ok(value) => return Ok(value),
// Transient error with attempts remaining: log, back off, retry.
Err(status)
if attempt + 1 < TRANSIENT_RPC_RETRY_ATTEMPTS
&& is_retryable_status(&status) =>
{
let delay = retry_delay(attempt);
tracing::warn!(
operation,
attempt = attempt + 1,
retry_after_ms = delay.as_millis() as u64,
code = ?status.code(),
message = status.message(),
"retrying transient IAM RPC"
);
last_status = Some(status);
tokio::time::sleep(delay).await;
}
// Non-retryable error, or retryable error on the final attempt.
Err(status) => return Err(map_status(status)),
}
}
// Only reachable when the loop never executed; synthesize a status.
Err(map_status(last_status.unwrap_or_else(|| {
tonic::Status::internal(format!("IAM RPC {operation} failed without a status"))
})))
}
// ========================================================================
// Authorization APIs
// ========================================================================
@ -128,7 +167,6 @@ impl IamClient {
resource: &Resource,
context: std::collections::HashMap<String, String>,
) -> Result<bool> {
let mut client = self.authz_client();
let request = AuthorizeRequest {
principal: Some(to_proto_principal_ref(&principal.to_ref())),
action: action.to_string(),
@ -151,10 +189,12 @@ impl IamClient {
}),
};
let resp = client
.authorize(request)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("authorize", || {
let mut client = self.authz_client();
let request = request.clone();
async move { client.authorize(request).await }
})
.await?
.into_inner();
Ok(resp.allowed)
@ -166,7 +206,6 @@ impl IamClient {
/// Create a new user
pub async fn create_user(&self, id: &str, name: &str) -> Result<Principal> {
let mut client = self.admin_client();
let req = CreatePrincipalRequest {
id: id.into(),
kind: ProtoPrincipalKind::User as i32,
@ -177,25 +216,31 @@ impl IamClient {
metadata: Default::default(),
};
let resp = client
.create_principal(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_principal(req).await }
})
.await?
.into_inner();
Ok(ProtoPrincipal::into(resp))
}
/// Get a principal
pub async fn get_principal(&self, principal_ref: &PrincipalRef) -> Result<Option<Principal>> {
let mut client = self.admin_client();
let req = GetPrincipalRequest {
principal: Some(to_proto_principal_ref(principal_ref)),
};
let resp = client.get_principal(req).await;
let resp = Self::call_with_retry("get_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.get_principal(req).await }
})
.await;
match resp {
Ok(r) => Ok(Some(ProtoPrincipal::into(r.into_inner()))),
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
Err(status) => Err(map_status(status)),
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
Err(err) => Err(err),
}
}
@ -206,7 +251,6 @@ impl IamClient {
name: &str,
project_id: &str,
) -> Result<Principal> {
let mut client = self.admin_client();
let req = CreatePrincipalRequest {
id: id.into(),
kind: ProtoPrincipalKind::ServiceAccount as i32,
@ -216,17 +260,18 @@ impl IamClient {
email: None,
metadata: Default::default(),
};
let resp = client
.create_principal(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_service_account", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_principal(req).await }
})
.await?
.into_inner();
Ok(ProtoPrincipal::into(resp))
}
/// List users
pub async fn list_users(&self) -> Result<Vec<Principal>> {
let mut client = self.admin_client();
let req = ListPrincipalsRequest {
kind: Some(ProtoPrincipalKind::User as i32),
org_id: None,
@ -235,10 +280,12 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_principals(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_principals", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_principals(req).await }
})
.await?
.into_inner();
Ok(resp
@ -254,36 +301,40 @@ impl IamClient {
/// Get a role by name
pub async fn get_role(&self, name: &str) -> Result<Option<Role>> {
let mut client = self.admin_client();
let req = GetRoleRequest { name: name.into() };
let resp = client.get_role(req).await;
let resp = Self::call_with_retry("get_role", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.get_role(req).await }
})
.await;
match resp {
Ok(r) => Ok(Some(r.into_inner().into())),
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
Err(status) => Err(map_status(status)),
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
Err(err) => Err(err),
}
}
/// List all roles
pub async fn list_roles(&self) -> Result<Vec<Role>> {
let mut client = self.admin_client();
let req = ListRolesRequest {
scope: None,
include_builtin: true,
page_size: 0,
page_token: String::new(),
};
let resp = client
.list_roles(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_roles", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_roles(req).await }
})
.await?
.into_inner();
Ok(resp.roles.into_iter().map(Into::into).collect())
}
/// Create a custom role
pub async fn create_role(&self, role: &Role) -> Result<Role> {
let mut client = self.admin_client();
let req = CreateRoleRequest {
name: role.name.clone(),
display_name: role.display_name.clone(),
@ -297,10 +348,12 @@ impl IamClient {
.collect(),
};
let resp = client
.create_role(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_role", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_role(req).await }
})
.await?
.into_inner();
Ok(resp.into())
}
@ -311,7 +364,6 @@ impl IamClient {
/// Create a policy binding
pub async fn create_binding(&self, binding: &PolicyBinding) -> Result<PolicyBinding> {
let mut client = self.admin_client();
let req = CreateBindingRequest {
principal: Some(to_proto_principal_ref(&binding.principal_ref)),
role: binding.role_ref.clone(),
@ -320,24 +372,27 @@ impl IamClient {
expires_at: binding.expires_at,
};
let resp = client
.create_binding(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_binding", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_binding(req).await }
})
.await?
.into_inner();
Ok(resp.into())
}
/// Delete a policy binding
pub async fn delete_binding(&self, binding_id: &str) -> Result<bool> {
let mut client = self.admin_client();
let req = DeleteBindingRequest {
id: binding_id.into(),
};
let resp = client
.delete_binding(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("delete_binding", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.delete_binding(req).await }
})
.await?
.into_inner();
Ok(resp.deleted)
}
@ -347,7 +402,6 @@ impl IamClient {
&self,
principal: &PrincipalRef,
) -> Result<Vec<PolicyBinding>> {
let mut client = self.admin_client();
let req = ListBindingsRequest {
principal: Some(to_proto_principal_ref(principal)),
role: None,
@ -357,17 +411,18 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_bindings(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_bindings_for_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_bindings(req).await }
})
.await?
.into_inner();
Ok(resp.bindings.into_iter().map(Into::into).collect())
}
/// List bindings for a scope
pub async fn list_bindings_for_scope(&self, scope: &Scope) -> Result<Vec<PolicyBinding>> {
let mut client = self.admin_client();
let req = ListBindingsRequest {
principal: None,
role: None,
@ -377,10 +432,12 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_bindings(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_bindings_for_scope", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_bindings(req).await }
})
.await?
.into_inner();
Ok(resp.bindings.into_iter().map(Into::into).collect())
}
@ -397,7 +454,6 @@ impl IamClient {
scope: Scope,
ttl_seconds: u64,
) -> Result<String> {
let mut client = self.token_client();
let req = IssueTokenRequest {
principal_id: principal.id.clone(),
principal_kind: match principal.kind {
@ -410,24 +466,27 @@ impl IamClient {
ttl_seconds,
};
let resp = client
.issue_token(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("issue_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.issue_token(req).await }
})
.await?
.into_inner();
Ok(resp.token)
}
/// Validate a token
pub async fn validate_token(&self, token: &str) -> Result<InternalTokenClaims> {
let mut client = self.token_client();
let req = ValidateTokenRequest {
token: token.to_string(),
};
let resp = client
.validate_token(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("validate_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.validate_token(req).await }
})
.await?
.into_inner();
if !resp.valid {
@ -479,20 +538,55 @@ impl IamClient {
/// Revoke a token
pub async fn revoke_token(&self, token: &str) -> Result<()> {
let mut client = self.token_client();
let req = RevokeTokenRequest {
token: token.to_string(),
reason: "client revoke".into(),
};
client
.revoke_token(req)
.await
.map_err(map_status)?
Self::call_with_retry("revoke_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.revoke_token(req).await }
})
.await?
.into_inner();
Ok(())
}
}
/// Exponential backoff schedule for transient RPC retries.
///
/// Doubles the initial backoff per attempt (200ms, 400ms, 800ms, ...) with
/// the shift clamped at 3 and the result capped at the configured maximum,
/// so every later attempt waits exactly `TRANSIENT_RPC_MAX_BACKOFF`.
fn retry_delay(attempt: usize) -> Duration {
    let exponent = attempt.min(3) as u32;
    let multiplier = 1u32 << exponent;
    let scaled = TRANSIENT_RPC_INITIAL_BACKOFF.saturating_mul(multiplier);
    scaled.min(TRANSIENT_RPC_MAX_BACKOFF)
}
/// Decide whether a tonic status is worth retrying.
///
/// Retries on codes typically produced by connection churn, plus any status
/// whose message text matches a known transport-failure pattern (see
/// `retryable_message`).
fn is_retryable_status(status: &tonic::Status) -> bool {
    let transient_code = match status.code() {
        tonic::Code::Unavailable
        | tonic::Code::Cancelled
        | tonic::Code::DeadlineExceeded
        | tonic::Code::Unknown => true,
        _ => false,
    };
    transient_code || retryable_message(status.message())
}
/// Case-insensitive check for transport-level failure text in an RPC
/// status message; catches transient errors surfaced with non-transient
/// status codes.
fn retryable_message(message: &str) -> bool {
    const NEEDLES: [&str; 6] = [
        "transport error",
        "connection was not ready",
        "h2 protocol error",
        "broken pipe",
        "connection refused",
        "connection reset",
    ];
    let haystack = message.to_ascii_lowercase();
    NEEDLES.iter().any(|needle| haystack.contains(needle))
}
/// Heuristic: does a stringified tonic error describe a NotFound status?
///
/// Needed because `map_status` flattens `tonic::Status` into
/// `Error::Internal(String)`, discarding the structured code.
fn tonic_not_found(message: &str) -> bool {
    ["status: NotFound", "code: NotFound"]
        .iter()
        .any(|pattern| message.contains(pattern))
}
fn map_status(status: tonic::Status) -> Error {
Error::Internal(status.to_string())
}
@ -507,3 +601,75 @@ fn to_proto_principal_ref(principal_ref: &PrincipalRef) -> ProtoPrincipalRef {
id: principal_ref.id.clone(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
#[test]
fn retryable_message_covers_connection_readiness() {
assert!(retryable_message("transport error"));
assert!(retryable_message("connection was not ready"));
assert!(retryable_message("h2 protocol error"));
assert!(!retryable_message("permission denied"));
}
#[test]
fn retry_delay_is_capped() {
assert_eq!(retry_delay(0), Duration::from_millis(200));
assert_eq!(retry_delay(1), Duration::from_millis(400));
assert_eq!(retry_delay(2), Duration::from_millis(800));
assert_eq!(retry_delay(3), Duration::from_millis(1000));
assert_eq!(retry_delay(7), Duration::from_millis(1000));
}
#[tokio::test(start_paused = true)]
async fn call_with_retry_retries_transient_statuses() {
let attempts = Arc::new(AtomicUsize::new(0));
let attempts_for_task = attempts.clone();
let task = tokio::spawn(async move {
IamClient::call_with_retry("test", || {
let attempts = attempts_for_task.clone();
async move {
let attempt = attempts.fetch_add(1, Ordering::SeqCst);
if attempt < 2 {
Err(tonic::Status::unavailable("connection was not ready"))
} else {
Ok("ok")
}
}
})
.await
});
tokio::time::advance(Duration::from_secs(3)).await;
assert_eq!(task.await.unwrap().unwrap(), "ok");
assert_eq!(attempts.load(Ordering::SeqCst), 3);
}
#[tokio::test(start_paused = true)]
async fn call_with_retry_stops_on_non_retryable_status() {
let attempts = Arc::new(AtomicUsize::new(0));
let attempts_for_task = attempts.clone();
let err = IamClient::call_with_retry("test", || {
let attempts = attempts_for_task.clone();
async move {
attempts.fetch_add(1, Ordering::SeqCst);
Err::<(), _>(tonic::Status::permission_denied("nope"))
}
})
.await
.unwrap_err();
assert_eq!(attempts.load(Ordering::SeqCst), 1);
match err {
Error::Internal(message) => assert!(message.contains("PermissionDenied")),
other => panic!("unexpected error: {other:?}"),
}
}
}

View file

@ -20,12 +20,15 @@ use tracing::{info, warn};
use iam_api::{
iam_admin_server::IamAdminServer, iam_authz_server::IamAuthzServer,
iam_token_server::IamTokenServer, GatewayAuthServiceImpl, GatewayAuthServiceServer,
IamAdminService, IamAuthzService, IamTokenService,
iam_credential_server::IamCredentialServer, iam_token_server::IamTokenServer,
GatewayAuthServiceImpl, GatewayAuthServiceServer, IamAdminService, IamAuthzService,
IamCredentialService, IamTokenService,
};
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
use iam_authz::{PolicyCache, PolicyCacheConfig, PolicyEvaluator};
use iam_store::{Backend, BackendConfig, BindingStore, PrincipalStore, RoleStore, TokenStore};
use iam_store::{
Backend, BackendConfig, BindingStore, CredentialStore, PrincipalStore, RoleStore, TokenStore,
};
use config::{BackendKind, ServerConfig};
@ -190,6 +193,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let principal_store = Arc::new(PrincipalStore::new(backend.clone()));
let role_store = Arc::new(RoleStore::new(backend.clone()));
let binding_store = Arc::new(BindingStore::new(backend.clone()));
let credential_store = Arc::new(CredentialStore::new(backend.clone()));
let token_store = Arc::new(TokenStore::new(backend.clone()));
// Initialize builtin roles
@ -238,7 +242,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
)
};
let token_config = InternalTokenConfig::new(signing_key, &config.authn.internal_token.issuer)
let token_config =
InternalTokenConfig::new(signing_key.clone(), &config.authn.internal_token.issuer)
.with_default_ttl(Duration::from_secs(
config.authn.internal_token.default_ttl_seconds,
))
@ -248,6 +253,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let token_service = Arc::new(InternalTokenService::new(token_config));
let admin_token = load_admin_token();
let credential_master_key = std::env::var("IAM_CRED_MASTER_KEY")
.ok()
.map(|value| value.into_bytes())
.filter(|value| value.len() == 32)
.unwrap_or_else(|| {
warn!(
"IAM_CRED_MASTER_KEY missing or not 32 bytes, deriving credential key from signing key",
);
signing_key.sign(b"iam-credential-master-key")
});
// Create gRPC services
let authz_service = IamAuthzService::new(evaluator.clone(), principal_store.clone());
@ -262,6 +277,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
token_store.clone(),
evaluator.clone(),
);
let credential_service =
IamCredentialService::new(credential_store, &credential_master_key, "iam-cred-master")
.map_err(|e| format!("Failed to initialize credential service: {}", e))?;
let admin_service = IamAdminService::new(
principal_store.clone(),
role_store.clone(),
@ -291,6 +309,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
health_reporter
.set_serving::<IamTokenServer<IamTokenService>>()
.await;
health_reporter
.set_serving::<IamCredentialServer<IamCredentialService>>()
.await;
health_reporter
.set_serving::<IamAdminServer<IamAdminService>>()
.await;
@ -357,6 +378,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.add_service(health_service)
.add_service(IamAuthzServer::new(authz_service))
.add_service(IamTokenServer::new(token_grpc_service))
.add_service(IamCredentialServer::new(credential_service))
.add_service(GatewayAuthServiceServer::new(gateway_auth_service))
.add_service(admin_server)
.serve(config.server.addr);

View file

@ -9,5 +9,6 @@ iam-client = { path = "../iam-client" }
iam-types = { path = "../iam-types" }
tonic = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
http = "1"
serde_json = "1"

View file

@ -16,6 +16,9 @@ use tracing::{debug, warn};
const PHOTON_AUTH_TOKEN_HEADER: &str = "x-photon-auth-token";
const DEFAULT_TOKEN_CACHE_TTL_MS: u64 = 5_000;
const DEFAULT_AUTHZ_CACHE_TTL_MS: u64 = 3_000;
const AUTH_CONNECT_RETRY_ATTEMPTS: usize = 6;
const AUTH_CONNECT_INITIAL_BACKOFF: Duration = Duration::from_millis(500);
const AUTH_CONNECT_MAX_BACKOFF: Duration = Duration::from_secs(5);
#[derive(Debug, Clone)]
struct CacheEntry<T> {
@ -64,9 +67,7 @@ impl AuthService {
config = config.without_tls();
}
let iam_client = IamClient::connect(config)
.await
.map_err(|e| format!("Failed to connect to IAM server: {}", e))?;
let iam_client = connect_iam_with_retry(config).await?;
Ok(Self {
iam_client: Arc::new(iam_client),
@ -273,6 +274,59 @@ impl AuthService {
}
}
async fn connect_iam_with_retry(config: IamClientConfig) -> Result<IamClient, String> {
let mut last_error = None;
for attempt in 0..AUTH_CONNECT_RETRY_ATTEMPTS {
match IamClient::connect(config.clone()).await {
Ok(client) => return Ok(client),
Err(err)
if attempt + 1 < AUTH_CONNECT_RETRY_ATTEMPTS
&& retryable_connect_error(&err.to_string()) =>
{
let delay = auth_connect_retry_delay(attempt);
warn!(
attempt = attempt + 1,
retry_after_ms = delay.as_millis() as u64,
error = %err,
"retrying IAM auth service bootstrap connection"
);
last_error = Some(err.to_string());
tokio::time::sleep(delay).await;
}
Err(err) => {
return Err(format!("Failed to connect to IAM server: {}", err));
}
}
}
Err(format!(
"Failed to connect to IAM server: {}",
last_error.unwrap_or_else(|| "unknown connection error".to_string())
))
}
fn auth_connect_retry_delay(attempt: usize) -> Duration {
AUTH_CONNECT_INITIAL_BACKOFF
.saturating_mul(1u32 << attempt.min(4))
.min(AUTH_CONNECT_MAX_BACKOFF)
}
fn retryable_connect_error(message: &str) -> bool {
let lower = message.to_ascii_lowercase();
[
"transport error",
"connection refused",
"connection was not ready",
"operation timed out",
"deadline has elapsed",
"dns error",
"broken pipe",
"connection reset",
]
.iter()
.any(|needle| lower.contains(needle))
}
fn prune_expired<T>(cache: &mut HashMap<String, CacheEntry<T>>) {
let now = Instant::now();
cache.retain(|_, entry| entry.expires_at > now);
@ -400,6 +454,29 @@ fn extract_token_from_metadata(metadata: &MetadataMap) -> Result<String, Status>
))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn retryable_connect_error_matches_transport_failures() {
assert!(retryable_connect_error("Internal error: transport error"));
assert!(retryable_connect_error("connection was not ready"));
assert!(retryable_connect_error("deadline has elapsed"));
assert!(!retryable_connect_error("permission denied"));
}
#[test]
fn auth_connect_retry_delay_is_capped() {
assert_eq!(auth_connect_retry_delay(0), Duration::from_millis(500));
assert_eq!(auth_connect_retry_delay(1), Duration::from_millis(1000));
assert_eq!(auth_connect_retry_delay(2), Duration::from_millis(2000));
assert_eq!(auth_connect_retry_delay(3), Duration::from_millis(4000));
assert_eq!(auth_connect_retry_delay(4), Duration::from_secs(5));
assert_eq!(auth_connect_retry_delay(8), Duration::from_secs(5));
}
}
fn extract_token_from_headers(headers: &HeaderMap) -> Result<String, Status> {
if let Some(auth_header) = headers.get(AUTHORIZATION) {
let auth_str = auth_header

View file

@ -1,24 +1,25 @@
//! Credential storage (access/secret key metadata)
use std::sync::Arc;
use iam_types::{CredentialRecord, Result};
use crate::backend::JsonStore;
use crate::{DynMetadataClient, MetadataClient};
use crate::backend::{Backend, CasResult, JsonStore, StorageBackend};
/// Store for credentials (S3/API keys)
pub struct CredentialStore {
client: DynMetadataClient,
backend: Arc<Backend>,
}
impl JsonStore for CredentialStore {
fn client(&self) -> &dyn MetadataClient {
self.client.as_ref()
fn backend(&self) -> &Backend {
&self.backend
}
}
impl CredentialStore {
pub fn new(client: DynMetadataClient) -> Self {
Self { client }
pub fn new(backend: Arc<Backend>) -> Self {
Self { backend }
}
pub async fn put(&self, record: &CredentialRecord) -> Result<u64> {
@ -36,13 +37,17 @@ impl CredentialStore {
principal_id: &str,
limit: u32,
) -> Result<Vec<CredentialRecord>> {
// scan prefix and filter by principal_id; small cardinality expected
let prefix = b"iam/credentials/";
let items = self.scan_prefix_json::<CredentialRecord>(prefix, limit).await?;
Ok(items
.into_iter()
.filter(|rec| rec.principal_id == principal_id)
.collect())
let items = self.backend.scan_prefix(prefix, limit).await?;
let mut credentials = Vec::new();
for pair in items {
let record: CredentialRecord = serde_json::from_slice(&pair.value)
.map_err(|e| iam_types::Error::Serialization(e.to_string()))?;
if record.principal_id == principal_id {
credentials.push(record);
}
}
Ok(credentials)
}
pub async fn revoke(&self, access_key_id: &str) -> Result<bool> {
@ -56,13 +61,10 @@ impl CredentialStore {
return Ok(false);
}
record.revoked = true;
match self
.cas_json(key.as_bytes(), version, &record)
.await?
{
crate::CasResult::Success(_) => Ok(true),
crate::CasResult::Conflict { .. } => Ok(false),
crate::CasResult::NotFound => Ok(false),
match self.cas_json(key.as_bytes(), version, &record).await? {
CasResult::Success(_) => Ok(true),
CasResult::Conflict { .. } => Ok(false),
CasResult::NotFound => Ok(false),
}
}
}

View file

@ -7,6 +7,7 @@
pub mod backend;
pub mod binding_store;
pub mod credential_store;
pub mod group_store;
pub mod principal_store;
pub mod role_store;
@ -14,6 +15,7 @@ pub mod token_store;
pub use backend::{Backend, BackendConfig, CasResult, KvPair, StorageBackend};
pub use binding_store::BindingStore;
pub use credential_store::CredentialStore;
pub use group_store::GroupStore;
pub use principal_store::PrincipalStore;
pub use role_store::RoleStore;

View file

@ -2,6 +2,8 @@
use serde::{Deserialize, Serialize};
use crate::PrincipalKind;
/// Argon2 parameters used to hash the secret key
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Argon2Params {
@ -17,6 +19,9 @@ pub struct Argon2Params {
pub struct CredentialRecord {
pub access_key_id: String,
pub principal_id: String,
pub principal_kind: PrincipalKind,
pub org_id: Option<String>,
pub project_id: Option<String>,
pub created_at: u64,
pub expires_at: Option<u64>,
pub revoked: bool,

View file

@ -10,6 +10,7 @@
//! - Error types
pub mod condition;
pub mod credential;
pub mod error;
pub mod policy;
pub mod principal;
@ -19,6 +20,7 @@ pub mod scope;
pub mod token;
pub use condition::{Condition, ConditionExpr};
pub use credential::{Argon2Params, CredentialRecord};
pub use error::{Error, IamError, Result, StorageError};
pub use policy::{CreateBindingRequest, EffectivePolicy, PolicyBinding};
pub use principal::{Principal, PrincipalKind, PrincipalRef};

View file

@ -89,6 +89,14 @@ service IamToken {
rpc RefreshToken(RefreshTokenRequest) returns (RefreshTokenResponse);
}
// IamCredential manages S3-style access/secret key credentials.
service IamCredential {
rpc CreateS3Credential(CreateS3CredentialRequest) returns (CreateS3CredentialResponse);
rpc GetSecretKey(GetSecretKeyRequest) returns (GetSecretKeyResponse);
rpc ListCredentials(ListCredentialsRequest) returns (ListCredentialsResponse);
rpc RevokeCredential(RevokeCredentialRequest) returns (RevokeCredentialResponse);
}
message IssueTokenRequest {
// Principal to issue token for
string principal_id = 1;
@ -162,6 +170,63 @@ message RefreshTokenResponse {
uint64 expires_at = 2;
}
message CreateS3CredentialRequest {
string principal_id = 1;
string description = 2;
optional uint64 expires_at = 3;
optional string org_id = 4;
optional string project_id = 5;
PrincipalKind principal_kind = 6;
}
message CreateS3CredentialResponse {
string access_key_id = 1;
string secret_key = 2;
uint64 created_at = 3;
optional uint64 expires_at = 4;
}
message GetSecretKeyRequest {
string access_key_id = 1;
}
message GetSecretKeyResponse {
string secret_key = 1;
string principal_id = 2;
optional uint64 expires_at = 3;
optional string org_id = 4;
optional string project_id = 5;
PrincipalKind principal_kind = 6;
}
message ListCredentialsRequest {
string principal_id = 1;
}
message Credential {
string access_key_id = 1;
string principal_id = 2;
uint64 created_at = 3;
optional uint64 expires_at = 4;
bool revoked = 5;
string description = 6;
optional string org_id = 7;
optional string project_id = 8;
PrincipalKind principal_kind = 9;
}
message ListCredentialsResponse {
repeated Credential credentials = 1;
}
message RevokeCredentialRequest {
string access_key_id = 1;
}
message RevokeCredentialResponse {
bool success = 1;
}
message InternalTokenClaims {
string principal_id = 1;
PrincipalKind principal_kind = 2;

796
k8shost/Cargo.lock generated

File diff suppressed because it is too large Load diff

588
lightningstor/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,8 @@ use crate::node::{NodeClientTrait, NodeRegistry};
use crate::placement::{ConsistentHashSelector, NodeSelector};
use async_trait::async_trait;
use bytes::Bytes;
use futures::future::BoxFuture;
use futures::stream::{FuturesUnordered, StreamExt};
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
use lightningstor_types::ObjectId;
use serde::{Deserialize, Serialize};
@ -336,7 +338,7 @@ impl ErasureCodedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
// Try to read all shards in parallel
let mut shard_futures = Vec::with_capacity(self.total_shards());
let mut shard_futures = FuturesUnordered::new();
for shard_idx in 0..self.total_shards() {
let is_parity = shard_idx >= self.data_shards;
let chunk_id = ChunkId::new(object_id, chunk_index, shard_idx, is_parity);
@ -345,35 +347,73 @@ impl ErasureCodedBackend {
let chunk_key = chunk_id.to_key();
shard_futures.push(async move {
// Try to read from the preferred node first
if let Ok(node) = node_selector.select_for_read(&nodes, &chunk_key).await {
if let Ok(data) = node
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
let preferred_id = node_selector
.select_for_read(&nodes, &chunk_key)
.await
.ok()
.map(|node| node.node_id().to_string());
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
FuturesUnordered::new();
if let Some(preferred_id) = preferred_id.as_ref() {
if let Some(preferred) = nodes
.iter()
.find(|node| node.node_id() == preferred_id.as_str())
.cloned()
{
return Some(data);
let key = chunk_key.clone();
readers.push(Box::pin(async move {
preferred
.get_chunk(&key, shard_idx as u32, is_parity)
.await
.ok()
}));
}
}
// Try other nodes if preferred fails
for node in &nodes {
if let Ok(data) = node
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
.await
if preferred_id
.as_ref()
.is_some_and(|preferred| preferred == node.node_id())
{
return Some(data);
continue;
}
let node = node.clone();
let key = chunk_key.clone();
readers.push(Box::pin(async move {
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
}));
}
while let Some(result) = readers.next().await {
if let Some(data) = result {
return (shard_idx, Some(data));
}
}
None
(shard_idx, None)
});
}
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
let mut shard_results = vec![None; self.total_shards()];
let mut available_count = 0usize;
while let Some((shard_idx, shard)) = shard_futures.next().await {
if shard.is_some() {
available_count += 1;
}
shard_results[shard_idx] = shard;
if available_count >= self.data_shards {
break;
}
if available_count + shard_futures.len() < self.data_shards {
break;
}
}
// Count available shards
let available_count = shard_results.iter().filter(|s| s.is_some()).count();
debug!(
object_id = %object_id,
chunk_index,
@ -419,9 +459,9 @@ impl StorageBackend for ErasureCodedBackend {
debug!(object_id = %object_id, size = original_size, "Putting object with erasure coding");
// Split data into chunks
let chunks = self.chunk_manager.split(&data);
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
let chunks = self.chunk_manager.split_with_chunk_size(&data, chunk_size);
let chunk_count = chunks.len();
let chunk_size = self.chunk_manager.chunk_size();
// Write each chunk
for (chunk_idx, chunk_data) in chunks.into_iter().enumerate() {
@ -591,24 +631,78 @@ impl StorageBackend for ErasureCodedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
// Try to read shards
let mut shard_futures = Vec::with_capacity(self.total_shards());
let mut shard_futures = FuturesUnordered::new();
for shard_idx in 0..self.total_shards() {
let is_parity = shard_idx >= self.data_shards;
let key = format!("{}_{}_{}", part_key, shard_idx, if is_parity { "p" } else { "d" });
let nodes = nodes.clone();
let node_selector = self.node_selector.clone();
shard_futures.push(async move {
let preferred_id = node_selector
.select_for_read(&nodes, &key)
.await
.ok()
.map(|node| node.node_id().to_string());
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
FuturesUnordered::new();
if let Some(preferred_id) = preferred_id.as_ref() {
if let Some(preferred) = nodes
.iter()
.find(|node| node.node_id() == preferred_id.as_str())
.cloned()
{
let key = key.clone();
readers.push(Box::pin(async move {
preferred
.get_chunk(&key, shard_idx as u32, is_parity)
.await
.ok()
}));
}
}
for node in &nodes {
if let Ok(data) = node.get_chunk(&key, shard_idx as u32, is_parity).await {
return Some(data);
if preferred_id
.as_ref()
.is_some_and(|preferred| preferred == node.node_id())
{
continue;
}
let node = node.clone();
let key = key.clone();
readers.push(Box::pin(async move {
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
}));
}
while let Some(result) = readers.next().await {
if let Some(data) = result {
return (shard_idx, Some(data));
}
}
None
(shard_idx, None)
});
}
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
let available = shard_results.iter().filter(|s| s.is_some()).count();
let mut shard_results = vec![None; self.total_shards()];
let mut available = 0usize;
while let Some((shard_idx, shard)) = shard_futures.next().await {
if shard.is_some() {
available += 1;
}
shard_results[shard_idx] = shard;
if available >= self.data_shards {
break;
}
if available + shard_futures.len() < self.data_shards {
break;
}
}
if available < self.data_shards {
return Err(StorageError::Backend(format!(
@ -674,7 +768,135 @@ impl StorageBackend for ErasureCodedBackend {
mod tests {
use super::*;
use crate::config::{ChunkConfig, RedundancyMode};
use crate::node::MockNodeRegistry;
use crate::node::{MockNodeClient, MockNodeRegistry, NodeError, NodeResult};
use async_trait::async_trait;
use dashmap::DashMap;
use std::time::{Duration, Instant};
use tokio::time::sleep;
struct SlowReadNodeClient {
node_id: String,
endpoint: String,
delay: Duration,
chunks: DashMap<String, Vec<u8>>,
}
impl SlowReadNodeClient {
fn new(node_id: impl Into<String>, endpoint: impl Into<String>, delay: Duration) -> Self {
Self {
node_id: node_id.into(),
endpoint: endpoint.into(),
delay,
chunks: DashMap::new(),
}
}
fn insert_chunk(&self, chunk_id: impl Into<String>, data: Vec<u8>) {
self.chunks.insert(chunk_id.into(), data);
}
}
#[async_trait]
impl NodeClientTrait for SlowReadNodeClient {
fn node_id(&self) -> &str {
&self.node_id
}
fn endpoint(&self) -> &str {
&self.endpoint
}
async fn is_healthy(&self) -> bool {
true
}
async fn put_chunk(
&self,
chunk_id: &str,
_shard_index: u32,
_is_parity: bool,
data: Bytes,
) -> NodeResult<()> {
self.chunks.insert(chunk_id.to_string(), data.to_vec());
Ok(())
}
async fn get_chunk(
&self,
chunk_id: &str,
_shard_index: u32,
_is_parity: bool,
) -> NodeResult<Bytes> {
sleep(self.delay).await;
self.chunks
.get(chunk_id)
.map(|value| Bytes::from(value.value().clone()))
.ok_or_else(|| NodeError::NotFound(chunk_id.to_string()))
}
async fn delete_chunk(&self, chunk_id: &str) -> NodeResult<()> {
self.chunks.remove(chunk_id);
Ok(())
}
async fn chunk_exists(&self, chunk_id: &str) -> NodeResult<bool> {
Ok(self.chunks.contains_key(chunk_id))
}
async fn chunk_size(&self, chunk_id: &str) -> NodeResult<Option<u64>> {
Ok(self
.chunks
.get(chunk_id)
.map(|value| value.value().len() as u64))
}
async fn ping(&self) -> NodeResult<Duration> {
Ok(Duration::from_millis(1))
}
}
struct FixedNodeRegistry {
nodes: Vec<Arc<dyn NodeClientTrait>>,
}
#[async_trait]
impl NodeRegistry for FixedNodeRegistry {
async fn get_all_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
Ok(self.nodes.clone())
}
async fn get_healthy_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
Ok(self.nodes.clone())
}
async fn register_node(&self, _info: crate::node::NodeInfo) -> NodeResult<()> {
Ok(())
}
async fn deregister_node(&self, _node_id: &str) -> NodeResult<()> {
Ok(())
}
async fn update_health(&self, _node_id: &str, _healthy: bool) -> NodeResult<()> {
Ok(())
}
async fn get_node(&self, node_id: &str) -> NodeResult<Option<Arc<dyn NodeClientTrait>>> {
Ok(self
.nodes
.iter()
.find(|node| node.node_id() == node_id)
.cloned())
}
async fn node_count(&self) -> usize {
self.nodes.len()
}
async fn healthy_node_count(&self) -> usize {
self.nodes.len()
}
}
fn create_ec_config(data_shards: usize, parity_shards: usize) -> DistributedConfig {
DistributedConfig {
@ -858,4 +1080,162 @@ mod tests {
assert_eq!(retrieved.len(), data.len());
assert_eq!(retrieved, data);
}
#[tokio::test]
async fn test_ec_backend_read_returns_after_minimum_shards() {
let config = create_ec_config(4, 2);
let mut fast_nodes = Vec::new();
for index in 0..4 {
fast_nodes.push(Arc::new(MockNodeClient::new(
format!("fast-{index}"),
format!("http://fast-{index}:9002"),
)));
}
let slow_a = Arc::new(SlowReadNodeClient::new(
"slow-a",
"http://slow-a:9002",
Duration::from_millis(250),
));
let slow_b = Arc::new(SlowReadNodeClient::new(
"slow-b",
"http://slow-b:9002",
Duration::from_millis(250),
));
let backend = ErasureCodedBackend::new(
config,
Arc::new(FixedNodeRegistry {
nodes: vec![
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
slow_a.clone() as Arc<dyn NodeClientTrait>,
slow_b.clone() as Arc<dyn NodeClientTrait>,
],
}),
)
.await
.unwrap();
let object_id = ObjectId::new();
let data = Bytes::from(vec![5u8; 512]);
let metadata = ObjectMetadata::new(data.len() as u64, 1, data.len());
let meta_key = ObjectMetadata::metadata_key(&object_id);
let shards = backend.codec.encode(&data).unwrap();
for fast_node in &fast_nodes {
fast_node
.put_chunk(&meta_key, 0, false, Bytes::from(metadata.to_bytes()))
.await
.unwrap();
}
for slow_node in [&slow_a, &slow_b] {
slow_node.insert_chunk(meta_key.clone(), metadata.to_bytes());
}
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
let is_parity = shard_idx >= backend.data_shards;
let key = ChunkId::new(&object_id, 0, shard_idx, is_parity).to_key();
if shard_idx < 4 {
fast_nodes[shard_idx]
.put_chunk(
&key,
shard_idx as u32,
is_parity,
Bytes::from(shard_data),
)
.await
.unwrap();
} else if shard_idx == 4 {
slow_a.insert_chunk(key, shard_data);
} else {
slow_b.insert_chunk(key, shard_data);
}
}
let started = Instant::now();
let retrieved = backend.get_object(&object_id).await.unwrap();
let elapsed = started.elapsed();
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
assert_eq!(retrieved, data);
}
#[tokio::test]
async fn test_ec_backend_get_part_returns_after_minimum_shards() {
let config = create_ec_config(4, 2);
let mut fast_nodes = Vec::new();
for index in 0..4 {
fast_nodes.push(Arc::new(MockNodeClient::new(
format!("fast-{index}"),
format!("http://fast-{index}:9002"),
)));
}
let slow_a = Arc::new(SlowReadNodeClient::new(
"slow-a",
"http://slow-a:9002",
Duration::from_millis(250),
));
let slow_b = Arc::new(SlowReadNodeClient::new(
"slow-b",
"http://slow-b:9002",
Duration::from_millis(250),
));
let backend = ErasureCodedBackend::new(
config,
Arc::new(FixedNodeRegistry {
nodes: vec![
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
slow_a.clone() as Arc<dyn NodeClientTrait>,
slow_b.clone() as Arc<dyn NodeClientTrait>,
],
}),
)
.await
.unwrap();
let upload_id = "upload-latency";
let part_number = 7;
let data = Bytes::from(vec![9u8; 512]);
let shards = backend.codec.encode(&data).unwrap();
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
let is_parity = shard_idx >= backend.data_shards;
let key = format!(
"part_{}_{}_{}_{}",
upload_id,
part_number,
shard_idx,
if is_parity { "p" } else { "d" }
);
if shard_idx < 4 {
fast_nodes[shard_idx]
.put_chunk(
&key,
shard_idx as u32,
is_parity,
Bytes::from(shard_data),
)
.await
.unwrap();
} else if shard_idx == 4 {
slow_a.insert_chunk(key, shard_data);
} else {
slow_b.insert_chunk(key, shard_data);
}
}
let started = Instant::now();
let retrieved = backend.get_part(upload_id, part_number).await.unwrap();
let elapsed = started.elapsed();
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
assert_eq!(retrieved, data);
}
}

View file

@ -5,13 +5,15 @@
use crate::chunk::ChunkManager;
use crate::config::DistributedConfig;
use crate::node::{NodeClientTrait, NodeError, NodeRegistry};
use crate::node::{NodeClientTrait, NodeError, NodeRegistry, NodeResult};
use crate::placement::{ConsistentHashSelector, NodeSelector};
use crate::repair::{RepairQueue, ReplicatedRepairTask};
use async_trait::async_trait;
use bytes::{Bytes, BytesMut};
use futures::stream::{FuturesUnordered, StreamExt};
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
use lightningstor_types::ObjectId;
use std::net::IpAddr;
use std::sync::Arc;
use std::time::Duration;
use tracing::{debug, error, warn};
@ -81,6 +83,8 @@ pub struct ReplicatedBackend {
read_quorum: usize,
/// Write quorum (minimum replicas for successful write)
write_quorum: usize,
/// Durable queue for repairing under-replicated chunks.
repair_queue: Option<Arc<dyn RepairQueue>>,
}
impl ReplicatedBackend {
@ -92,6 +96,15 @@ impl ReplicatedBackend {
pub async fn new(
config: DistributedConfig,
node_registry: Arc<dyn NodeRegistry>,
) -> StorageResult<Self> {
Self::new_with_repair_queue(config, node_registry, None).await
}
/// Create a replicated backend with an optional durable repair queue.
pub async fn new_with_repair_queue(
config: DistributedConfig,
node_registry: Arc<dyn NodeRegistry>,
repair_queue: Option<Arc<dyn RepairQueue>>,
) -> StorageResult<Self> {
let (replica_count, read_quorum, write_quorum) = match &config.redundancy {
crate::config::RedundancyMode::Replicated {
@ -116,6 +129,7 @@ impl ReplicatedBackend {
replica_count,
read_quorum,
write_quorum,
repair_queue,
})
}
@ -134,6 +148,89 @@ impl ReplicatedBackend {
self.write_quorum
}
async fn finalize_pending_replica_writes(
repair_queue: Option<Arc<dyn RepairQueue>>,
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
key: String,
shard_index: u32,
mut success_count: usize,
total_replicas: usize,
reason: String,
) {
let mut errors = Vec::new();
while let Some(result) = pending_writes.next().await {
match result {
Ok((_, Ok(()))) => success_count += 1,
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
Err(join_err) => errors.push(format!("join error: {join_err}")),
}
}
if success_count >= total_replicas {
return;
}
if let Some(queue) = repair_queue {
queue
.enqueue_repair(ReplicatedRepairTask::new(key.clone(), shard_index, reason))
.await;
}
warn!(
chunk_key = %key,
shard_index,
success_count,
total_replicas,
errors = ?errors,
"Replica write completed below desired replication; repair task queued"
);
}
async fn finalize_pending_chunked_write_repairs(
repair_queue: Option<Arc<dyn RepairQueue>>,
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
repair_targets: Vec<(String, u32)>,
object_id: String,
mut success_count: usize,
total_replicas: usize,
reason: String,
) {
let mut errors = Vec::new();
while let Some(result) = pending_writes.next().await {
match result {
Ok((_, Ok(()))) => success_count += 1,
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
Err(join_err) => errors.push(format!("join error: {join_err}")),
}
}
if success_count >= total_replicas {
return;
}
if let Some(queue) = repair_queue {
for (chunk_key, shard_index) in repair_targets {
queue
.enqueue_repair(ReplicatedRepairTask::new(
chunk_key,
shard_index,
reason.clone(),
))
.await;
}
}
warn!(
object_id = %object_id,
success_count,
total_replicas,
errors = ?errors,
"Chunked replica write completed below desired replication; repair tasks queued"
);
}
fn chunk_write_parallelism(&self, chunk_count: usize) -> usize {
chunk_count
.min(
@ -220,7 +317,13 @@ impl ReplicatedBackend {
));
}
if let Ok(preferred) = self.node_selector.select_for_read(nodes, key).await {
let mut ordered_nodes = Self::ordered_read_nodes(nodes, self
.node_selector
.select_for_read(nodes, key)
.await
.ok());
if let Some(preferred) = ordered_nodes.first() {
match preferred.get_chunk(key, shard_index, false).await {
Ok(data) => return Ok(Some(data)),
Err(NodeError::NotFound(_)) => {}
@ -235,7 +338,7 @@ impl ReplicatedBackend {
}
}
for node in nodes {
for node in ordered_nodes.drain(1..) {
match node.get_chunk(key, shard_index, false).await {
Ok(data) => return Ok(Some(data)),
Err(NodeError::NotFound(_)) => continue,
@ -383,6 +486,21 @@ impl ReplicatedBackend {
Ok((_, Ok(()))) => {
success_count += 1;
if success_count >= self.write_quorum {
if success_count < total_replicas {
let pending_writes =
std::mem::replace(&mut write_futures, FuturesUnordered::new());
tokio::spawn(Self::finalize_pending_replica_writes(
self.repair_queue.clone(),
pending_writes,
key.clone(),
shard_index,
success_count,
total_replicas,
format!(
"replica write completed below desired replication after quorum ({success_count}/{total_replicas})"
),
));
}
debug!(
chunk_key = %key,
success_count,
@ -427,13 +545,13 @@ impl ReplicatedBackend {
}
async fn write_chunked_object(&self, object_id: &ObjectId, data: Bytes) -> StorageResult<()> {
let chunk_size = self.chunk_manager.chunk_size();
let chunk_count = self.chunk_manager.chunk_count(data.len());
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
let chunk_count = ChunkManager::chunk_count_for_size(data.len(), chunk_size);
let metadata = ReplicatedObjectMetadata::new(data.len(), chunk_count, chunk_size);
let mut requests = Vec::with_capacity(chunk_count + 1);
for chunk_index in 0..chunk_count {
let chunk_key = Self::object_chunk_key(object_id, chunk_index);
let (start, len) = self.chunk_manager.chunk_range(data.len(), chunk_index);
let (start, len) = ChunkManager::chunk_range_for_size(data.len(), chunk_index, chunk_size);
let chunk_bytes = data.slice(start..start + len);
requests.push((chunk_key, chunk_index as u32, false, chunk_bytes));
}
@ -464,6 +582,27 @@ impl ReplicatedBackend {
Ok((_, Ok(()))) => {
success_count += 1;
if success_count >= self.write_quorum {
if success_count < total_replicas {
let repair_targets = requests
.iter()
.map(|(chunk_key, shard_index, _, _)| {
(chunk_key.clone(), *shard_index)
})
.collect::<Vec<_>>();
let pending_writes =
std::mem::replace(&mut write_futures, FuturesUnordered::new());
tokio::spawn(Self::finalize_pending_chunked_write_repairs(
self.repair_queue.clone(),
pending_writes,
repair_targets,
object_id.to_string(),
success_count,
total_replicas,
format!(
"chunked object write completed below desired replication after quorum ({success_count}/{total_replicas})"
),
));
}
debug!(
object_id = %object_id,
chunk_count,
@ -509,6 +648,150 @@ impl ReplicatedBackend {
)))
}
/// Restore missing replicas for the chunk described by `task`.
///
/// Placement-aware repair: recomputes the desired replica set for the
/// chunk key, determines which of those nodes actually hold the chunk,
/// and copies the data from a surviving replica onto every node that is
/// missing it. If none of the desired nodes has the chunk, falls back to
/// scanning the other healthy nodes for an off-placement copy (e.g. left
/// behind by a placement change).
///
/// Returns `Ok(())` when no replica is missing or when every rewrite
/// succeeded; returns `StorageError::Backend` when no source replica can
/// be found or when any rewrite fails (partial success is still an error
/// so the task can be retried).
pub async fn repair_chunk(&self, task: &ReplicatedRepairTask) -> StorageResult<()> {
    let healthy_nodes = self
        .node_registry
        .get_healthy_nodes()
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    if healthy_nodes.is_empty() {
        return Err(StorageError::Backend(
            "No healthy storage nodes available for repair".to_string(),
        ));
    }
    // Recompute where the chunk *should* live under current placement.
    let desired_nodes = self
        .node_selector
        .select_nodes_for_key(&healthy_nodes, self.replica_count, &task.key)
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    // Partition the desired nodes by whether they already hold the chunk.
    let mut present_nodes = Vec::new();
    let mut missing_nodes = Vec::new();
    for node in desired_nodes {
        match node.chunk_exists(&task.key).await {
            Ok(true) => present_nodes.push(node),
            Ok(false) => missing_nodes.push(node),
            Err(err) => {
                // A replica we cannot inspect is treated as missing so the
                // repair still re-establishes the desired replica count.
                warn!(
                    chunk_key = task.key,
                    node_id = node.node_id(),
                    error = ?err,
                    "Failed to inspect chunk during repair; treating replica as missing"
                );
                missing_nodes.push(node);
            }
        }
    }
    if missing_nodes.is_empty() {
        // All desired replicas already exist; nothing to repair.
        return Ok(());
    }
    if present_nodes.is_empty() {
        // No desired node has the data: probe the remaining healthy nodes
        // for an off-placement copy to use as the source.
        let desired_node_ids = missing_nodes
            .iter()
            .map(|node| node.node_id().to_string())
            .collect::<std::collections::HashSet<_>>();
        for node in healthy_nodes {
            if desired_node_ids.contains(node.node_id()) {
                continue;
            }
            match node.chunk_exists(&task.key).await {
                Ok(true) => {
                    present_nodes.push(node);
                    break;
                }
                Ok(false) => {}
                Err(err) => {
                    warn!(
                        chunk_key = task.key,
                        node_id = node.node_id(),
                        error = ?err,
                        "Failed to inspect off-placement chunk during repair"
                    );
                }
            }
        }
    }
    // Any surviving replica works as the copy source.
    let source = present_nodes.first().ok_or_else(|| {
        StorageError::Backend(format!(
            "Cannot repair {} because no healthy source replica is available",
            task.key
        ))
    })?;
    let data = source
        .get_chunk(&task.key, task.shard_index, false)
        .await
        .map_err(|err| {
            StorageError::Backend(format!(
                "Failed to load repair source for {} from {}: {}",
                task.key,
                source.node_id(),
                err
            ))
        })?;
    // Fan the data out to every missing node concurrently.
    let mut repair_futures = FuturesUnordered::new();
    for node in missing_nodes {
        let node_id = node.node_id().to_string();
        let key = task.key.clone();
        let chunk = data.clone();
        let shard_index = task.shard_index;
        repair_futures.push(tokio::spawn(async move {
            let result = node.put_chunk(&key, shard_index, false, chunk).await;
            (node_id, result)
        }));
    }
    let mut repaired = 0usize;
    let mut errors = Vec::new();
    while let Some(result) = repair_futures.next().await {
        match result {
            Ok((_, Ok(()))) => repaired += 1,
            Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
            Err(join_err) => errors.push(format!("join error: {join_err}")),
        }
    }
    if errors.is_empty() {
        return Ok(());
    }
    // Partial success still reports an error so the task can be retried.
    Err(StorageError::Backend(format!(
        "Repair for {} only restored {} replicas: {}",
        task.key,
        repaired,
        errors.join(", ")
    )))
}
/// Probe every registered node for `key`, regardless of placement.
///
/// Returns `Ok(true)` as soon as any node reports the chunk, `Ok(false)`
/// when none does. A failed probe on a node is logged and treated as
/// "not present there" rather than aborting the whole scan.
pub async fn chunk_exists_anywhere(&self, key: &str) -> StorageResult<bool> {
    let nodes = self
        .node_registry
        .get_all_nodes()
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    for candidate in nodes {
        match candidate.chunk_exists(key).await {
            Err(err) => {
                warn!(
                    chunk_key = key,
                    node_id = candidate.node_id(),
                    error = ?err,
                    "Failed to inspect chunk while probing global existence"
                );
            }
            Ok(found) => {
                if found {
                    return Ok(true);
                }
            }
        }
    }
    Ok(false)
}
async fn read_chunked_object(
&self,
object_id: &ObjectId,
@ -521,24 +804,47 @@ impl ReplicatedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
if !nodes.is_empty() {
let mut ordered_nodes = Vec::with_capacity(nodes.len());
if let Ok(preferred) = self
let preferred = self
.node_selector
.select_for_read(&nodes, &Self::object_chunk_key(object_id, 0))
.await
.ok();
let ordered_nodes = Self::ordered_read_nodes(&nodes, preferred);
if metadata.chunk_count > 1 {
if let Some(local_node) = ordered_nodes.iter().find(|node| Self::is_local_node(node))
{
ordered_nodes.push(preferred.clone());
let batch_requests: Vec<(String, u32, bool)> = (0..metadata.chunk_count)
.map(|chunk_index| {
(
Self::object_chunk_key(object_id, chunk_index),
chunk_index as u32,
false,
)
})
.collect();
match local_node.batch_get_chunks(batch_requests).await {
Ok(chunks) => {
return Self::assemble_chunked_bytes(
object_id,
metadata.original_size,
chunks,
);
}
Err(err) => {
warn!(
object_id = %object_id,
node_id = local_node.node_id(),
error = ?err,
"Local replica batch read failed, falling back to distributed reads"
);
}
}
for node in nodes {
if ordered_nodes
.iter()
.all(|existing| existing.node_id() != node.node_id())
{
ordered_nodes.push(node);
}
}
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 {
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 && !Self::has_local_node(&ordered_nodes)
{
match self
.read_chunked_object_from_distributed_batches(
object_id,
@ -783,6 +1089,74 @@ impl ReplicatedBackend {
combined.truncate(original_size);
Ok(combined.freeze())
}
/// Build the read preference order for a replica set.
///
/// Priority: a local (loopback-endpoint) replica first, then the
/// selector's preferred node, then every remaining node in the given
/// order. Duplicates (compared by node id) are emitted only once, so the
/// result has at most `nodes.len()` entries.
fn ordered_read_nodes(
    nodes: &[Arc<dyn NodeClientTrait>],
    preferred: Option<Arc<dyn NodeClientTrait>>,
) -> Vec<Arc<dyn NodeClientTrait>> {
    let local = nodes
        .iter()
        .find(|candidate| Self::is_local_node(candidate))
        .cloned();
    // Candidates in priority order; duplicates are filtered below.
    let candidates = local
        .into_iter()
        .chain(preferred)
        .chain(nodes.iter().cloned());
    let mut ordering: Vec<Arc<dyn NodeClientTrait>> = Vec::with_capacity(nodes.len());
    for candidate in candidates {
        let already_present = ordering
            .iter()
            .any(|existing| existing.node_id() == candidate.node_id());
        if !already_present {
            ordering.push(candidate);
        }
    }
    ordering
}
/// True when at least one node in the slice resolves to a local endpoint.
fn has_local_node(nodes: &[Arc<dyn NodeClientTrait>]) -> bool {
    nodes.iter().any(|candidate| Self::is_local_node(candidate))
}
/// True when the node's advertised endpoint points at this machine.
fn is_local_node(node: &Arc<dyn NodeClientTrait>) -> bool {
    let endpoint = node.endpoint();
    Self::endpoint_is_local(endpoint)
}
/// Decide whether `endpoint` (e.g. "http://127.0.0.1:9002") refers to
/// this machine: the host is "localhost" or parses to a loopback IP.
///
/// Handles an optional scheme prefix, a path suffix, an optional port,
/// and bracketed IPv6 literals such as "[::1]:443". Hostnames other than
/// "localhost" that fail to parse as an IP are treated as remote.
fn endpoint_is_local(endpoint: &str) -> bool {
    // Strip the scheme ("http://") and any path to isolate the authority.
    let without_scheme = match endpoint.split_once("://") {
        Some((_, rest)) => rest,
        None => endpoint,
    };
    let authority = without_scheme.split('/').next().unwrap_or(without_scheme);
    // Extract the host, treating "[...]" as an IPv6 literal with an
    // optional trailing ":port"; otherwise split a trailing ":port" off.
    let host = if let Some(stripped) = authority.strip_prefix('[') {
        match stripped.split_once(']') {
            Some((inner, _)) => inner,
            None => authority.trim_matches(['[', ']']),
        }
    } else if let Some((name, _)) = authority.rsplit_once(':') {
        name
    } else {
        authority
    };
    if host.eq_ignore_ascii_case("localhost") {
        return true;
    }
    matches!(host.parse::<IpAddr>(), Ok(ip) if ip.is_loopback())
}
}
#[async_trait]
@ -908,12 +1282,25 @@ mod tests {
use super::*;
use crate::config::RedundancyMode;
use crate::node::{MockNodeRegistry, NodeError, NodeResult};
use crate::repair::RepairQueue;
use async_trait::async_trait;
use dashmap::DashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::time::sleep;
/// Test double that records every enqueued repair task for inspection.
#[derive(Default)]
struct CapturingRepairQueue {
    // Keyed by task id, so re-enqueueing the same chunk overwrites the
    // earlier entry (mirrors the dedup behavior of the id scheme).
    tasks: DashMap<String, ReplicatedRepairTask>,
}
#[async_trait]
impl RepairQueue for CapturingRepairQueue {
    /// Record the task; a later enqueue with the same id replaces it.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
        self.tasks.insert(task.id.clone(), task);
    }
}
struct SlowNodeClient {
node_id: String,
endpoint: String,
@ -1196,6 +1583,115 @@ mod tests {
assert!(result.is_err());
}
#[tokio::test]
async fn test_under_replicated_write_enqueues_repair_task() {
    let config = create_replicated_config(3);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    // One of the three replicas rejects writes, so the put succeeds on
    // quorum but completes under-replicated (2/3).
    nodes[2].set_fail_puts(true);
    let repair_queue = Arc::new(CapturingRepairQueue::default());
    let backend = ReplicatedBackend::new_with_repair_queue(
        config,
        registry,
        Some(repair_queue.clone()),
    )
    .await
    .unwrap();
    let object_id = ObjectId::new();
    backend
        .put_object(&object_id, Bytes::from_static(b"repair-me"))
        .await
        .unwrap();
    // The repair task is enqueued by a background task after quorum
    // returns, so poll briefly instead of asserting immediately.
    let mut task = None;
    for _ in 0..20 {
        task = repair_queue
            .tasks
            .iter()
            .next()
            .map(|entry| entry.value().clone());
        if task.is_some() {
            break;
        }
        sleep(Duration::from_millis(10)).await;
    }
    let task = task.expect("repair task should be queued");
    assert_eq!(task.key, ReplicatedBackend::object_key(&object_id));
    assert_eq!(task.shard_index, 0);
}
#[tokio::test]
async fn test_repair_chunk_restores_missing_replica() {
    let config = create_replicated_config(3);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    let backend = ReplicatedBackend::new(config, registry.clone())
        .await
        .unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![11u8; 128]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let key = ReplicatedBackend::object_key(&object_id);
    // Pick any node that currently holds a replica...
    let mut missing = None;
    for node in &nodes {
        if node.chunk_exists(&key).await.unwrap() {
            missing = Some(node.clone());
            break;
        }
    }
    let missing = missing.expect("at least one replica should exist");
    // ...and delete its copy to simulate a lost replica.
    missing.delete_chunk(&key).await.unwrap();
    assert!(!missing.chunk_exists(&key).await.unwrap());
    let task = ReplicatedRepairTask::new(key.clone(), 0, "test");
    backend.repair_chunk(&task).await.unwrap();
    // Repair must re-create the deleted replica on that node.
    assert!(missing.chunk_exists(&key).await.unwrap());
}
#[tokio::test]
async fn test_repair_chunk_can_source_from_off_placement_replica() {
    // replica_count = 2 but 3 nodes exist, so one node is off-placement
    // for any given key.
    let config = create_replicated_config(2);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    let backend = ReplicatedBackend::new(config, registry.clone())
        .await
        .unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![23u8; 128]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let key = ReplicatedBackend::object_key(&object_id);
    let desired_nodes = backend.select_replica_nodes_for_key(&key).await.unwrap();
    assert_eq!(desired_nodes.len(), 2);
    // Find the node that placement did NOT select for this key.
    let off_placement = nodes
        .iter()
        .find(|node| {
            desired_nodes
                .iter()
                .all(|desired| desired.node_id() != node.node_id())
        })
        .cloned()
        .expect("off-placement node should exist");
    // Copy the chunk onto the off-placement node, then wipe every desired
    // replica so repair has no in-placement source to copy from.
    let source_bytes = desired_nodes[0].get_chunk(&key, 0, false).await.unwrap();
    off_placement.put_chunk(&key, 0, false, source_bytes).await.unwrap();
    for node in &desired_nodes {
        node.delete_chunk(&key).await.unwrap();
        assert!(!node.chunk_exists(&key).await.unwrap());
    }
    let task = ReplicatedRepairTask::new(key.clone(), 0, "off-placement-source");
    backend.repair_chunk(&task).await.unwrap();
    // Both desired replicas must be restored from the stray copy.
    for node in &desired_nodes {
        assert!(node.chunk_exists(&key).await.unwrap());
    }
}
#[tokio::test]
async fn test_replicated_backend_returns_after_quorum_without_waiting_for_slow_replica() {
let config = create_replicated_config(3);
@ -1333,6 +1829,43 @@ mod tests {
.is_none());
}
#[tokio::test]
async fn test_replicated_backend_prefers_local_replica_for_chunked_reads() {
    let mut config = create_replicated_config(3);
    // Small chunk size forces the 256-byte object into multiple chunks.
    config.chunk.chunk_size = 64;
    // One replica advertises a loopback endpoint; the other two inject
    // 250 ms of latency per request.
    let local = Arc::new(crate::node::MockNodeClient::new(
        "local",
        "http://127.0.0.1:9002",
    ));
    let slow_a = Arc::new(SlowNodeClient::new(
        "slow-a",
        "http://slow-a:9002",
        Duration::from_millis(250),
    ));
    let slow_b = Arc::new(SlowNodeClient::new(
        "slow-b",
        "http://slow-b:9002",
        Duration::from_millis(250),
    ));
    let registry = Arc::new(FixedNodeRegistry {
        nodes: vec![slow_a.clone(), slow_b.clone(), local.clone()],
    });
    let backend = ReplicatedBackend::new(config, registry).await.unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![5u8; 256]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let started = Instant::now();
    let retrieved = backend.get_object(&object_id).await.unwrap();
    let elapsed = started.elapsed();
    assert_eq!(retrieved, data);
    // Reading via a slow node would cost at least 250 ms, so finishing
    // well under that proves the local replica served the read.
    assert!(elapsed < Duration::from_millis(150), "elapsed={elapsed:?}");
    // All four chunks (256 / 64) should have come from the local node.
    assert!(local.get_count() >= 4);
}
#[tokio::test]
async fn test_replicated_backend_object_size() {
let config = create_replicated_config(3);

View file

@ -5,6 +5,8 @@
use crate::config::ChunkConfig;
const TARGET_CHUNK_COUNT_PER_OBJECT: usize = 8;
/// Manages chunk operations for large objects
#[derive(Debug, Clone)]
pub struct ChunkManager {
@ -27,18 +29,42 @@ impl ChunkManager {
self.config.chunk_size
}
/// Choose the effective chunk size for an object of the given size.
///
/// Small objects keep the configured default chunk size. Larger objects
/// scale up to keep per-object chunk counts bounded without exceeding the
/// configured maximum.
pub fn effective_chunk_size(&self, total_size: usize) -> usize {
    if total_size == 0 {
        return self.config.chunk_size;
    }
    let default_size = self.config.chunk_size;
    // Derive bounds so that lower <= default <= upper always holds, even
    // when the configured min/max are inconsistent with the default.
    let lower_bound = self.config.min_chunk_size.min(default_size).max(1);
    let upper_bound = self.config.max_chunk_size.max(default_size);
    // Aim for TARGET_CHUNK_COUNT_PER_OBJECT chunks, rounded up to a
    // multiple of the lower bound so chunk sizes stay aligned.
    let target = total_size.div_ceil(TARGET_CHUNK_COUNT_PER_OBJECT);
    let aligned = target.div_ceil(lower_bound) * lower_bound;
    aligned.max(default_size).clamp(lower_bound, upper_bound)
}
/// Split data into chunks
///
/// Returns a vector of chunks. Each chunk is at most `chunk_size` bytes,
/// except the last chunk which may be smaller.
pub fn split(&self, data: &[u8]) -> Vec<Vec<u8>> {
    // Delegates to the explicit-size variant with the configured default.
    self.split_with_chunk_size(data, self.config.chunk_size)
}
/// Split data into chunks using an explicit chunk size.
pub fn split_with_chunk_size(&self, data: &[u8], chunk_size: usize) -> Vec<Vec<u8>> {
if data.is_empty() {
return vec![vec![]];
}
data.chunks(self.config.chunk_size)
.map(|c| c.to_vec())
.collect()
data.chunks(chunk_size).map(|c| c.to_vec()).collect()
}
/// Reassemble chunks into original data
@ -50,21 +76,33 @@ impl ChunkManager {
/// Calculate the number of chunks for a given data size
pub fn chunk_count(&self, size: usize) -> usize {
    // Delegates to the static variant with the configured default size.
    Self::chunk_count_for_size(size, self.config.chunk_size)
}
/// Number of chunks needed to hold `size` bytes at `chunk_size` bytes per
/// chunk (ceiling division).
///
/// A zero-size payload still occupies one (empty) chunk, matching
/// `split_with_chunk_size`. `chunk_size` must be non-zero.
pub fn chunk_count_for_size(size: usize, chunk_size: usize) -> usize {
    if size == 0 {
        return 1;
    }
    size.div_ceil(chunk_size)
}
/// Calculate the size of a specific chunk
///
/// Returns the size of the chunk at the given index for data of the given total size.
pub fn chunk_size_at(&self, total_size: usize, chunk_index: usize) -> usize {
    // Delegates to the static variant with the configured default size.
    Self::chunk_size_at_for_size(total_size, chunk_index, self.config.chunk_size)
}
pub fn chunk_size_at_for_size(
total_size: usize,
chunk_index: usize,
chunk_size: usize,
) -> usize {
let full_chunks = total_size / chunk_size;
let remainder = total_size % chunk_size;
if chunk_index < full_chunks {
self.config.chunk_size
chunk_size
} else if chunk_index == full_chunks && remainder > 0 {
remainder
} else {
@ -76,8 +114,16 @@ impl ChunkManager {
/// Calculate the byte range of a specific chunk
///
/// Returns (start_offset, length) for the chunk at the given index.
pub fn chunk_range(&self, total_size: usize, chunk_index: usize) -> (usize, usize) {
    // Delegates to the static variant with the configured default size.
    Self::chunk_range_for_size(total_size, chunk_index, self.config.chunk_size)
}
/// Byte range of the chunk at `chunk_index` for a payload of
/// `total_size` bytes split at `chunk_size`: returns (start, length).
pub fn chunk_range_for_size(
    total_size: usize,
    chunk_index: usize,
    chunk_size: usize,
) -> (usize, usize) {
    let offset = chunk_index * chunk_size;
    (
        offset,
        Self::chunk_size_at_for_size(total_size, chunk_index, chunk_size),
    )
}
}
@ -257,6 +303,15 @@ mod tests {
assert_eq!(manager.chunk_range(2500, 2), (2048, 452));
}
#[test]
fn test_effective_chunk_size_scales_large_objects_up_to_target_chunk_count() {
    let manager = ChunkManager::default();
    // Small objects stay at the default chunk size (presumably 8 MiB in
    // the default ChunkConfig — confirm against config defaults).
    assert_eq!(manager.effective_chunk_size(4 * 1024 * 1024), 8 * 1024 * 1024);
    // 256 MiB / 8 target chunks -> 32 MiB per chunk.
    assert_eq!(manager.effective_chunk_size(256 * 1024 * 1024), 32 * 1024 * 1024);
    // 1 GiB would want 128 MiB chunks but is capped (presumably by the
    // configured max_chunk_size of 64 MiB — confirm against config).
    assert_eq!(manager.effective_chunk_size(1024 * 1024 * 1024), 64 * 1024 * 1024);
}
#[test]
fn test_chunk_id_to_key() {
let id = ChunkId::data_shard("obj123", 0, 2);

View file

@ -65,12 +65,14 @@ pub mod config;
pub mod erasure;
pub mod node;
pub mod placement;
pub mod repair;
// Re-export commonly used types
pub use backends::{ErasureCodedBackend, ReplicatedBackend};
pub use config::{BucketStorageConfig, ChunkConfig, DistributedConfig, RedundancyMode};
pub use node::{MockNodeClient, MockNodeRegistry, NodeRegistry, StaticNodeRegistry};
pub use placement::{ConsistentHashSelector, NodeSelector, RandomSelector, RoundRobinSelector};
pub use repair::{RepairQueue, ReplicatedRepairTask};
#[cfg(test)]
mod tests {

View file

@ -0,0 +1,58 @@
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::time::{SystemTime, UNIX_EPOCH};
/// A serializable request to restore replicas of a single chunk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplicatedRepairTask {
    /// Stable identifier derived from key + shard index; doubles as a
    /// dedup key when tasks are stored in a map.
    pub id: String,
    /// Chunk key whose replicas need repair.
    pub key: String,
    /// Shard index passed through to chunk get/put operations.
    pub shard_index: u32,
    /// Human-readable explanation of why the repair was queued.
    pub reason: String,
    /// Enqueue time, in milliseconds since the Unix epoch.
    pub enqueued_at_millis: u64,
    /// Failed attempts so far; `default` keeps older serialized records
    /// (without this field) deserializable.
    #[serde(default)]
    pub attempt_count: u32,
    /// Message from the most recent failed attempt, if any.
    #[serde(default)]
    pub last_error: Option<String>,
    /// Earliest time (epoch millis) at which the next attempt may run.
    #[serde(default)]
    pub next_attempt_after_millis: u64,
}
impl ReplicatedRepairTask {
    /// Milliseconds since the Unix epoch; saturates to 0 if the clock is
    /// before the epoch.
    fn unix_millis_now() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64
    }

    /// Create a fresh task for `key`/`shard_index`, due immediately.
    ///
    /// The id is derived from key and shard index, so re-enqueueing the
    /// same chunk produces the same id (enabling dedup by id).
    pub fn new(key: impl Into<String>, shard_index: u32, reason: impl Into<String>) -> Self {
        let key = key.into();
        let timestamp = Self::unix_millis_now();
        Self {
            id: format!("replicated::{key}::{shard_index}"),
            key,
            shard_index,
            reason: reason.into(),
            enqueued_at_millis: timestamp,
            attempt_count: 0,
            last_error: None,
            next_attempt_after_millis: timestamp,
        }
    }

    /// Record a failed attempt and push the next retry out by
    /// `backoff_millis` from now (saturating on overflow).
    pub fn schedule_retry(&mut self, error: impl Into<String>, backoff_millis: u64) {
        self.attempt_count = self.attempt_count.saturating_add(1);
        self.last_error = Some(error.into());
        self.next_attempt_after_millis = Self::unix_millis_now().saturating_add(backoff_millis);
    }

    /// Whether the task is eligible to run at `now_millis`.
    pub fn is_due(&self, now_millis: u64) -> bool {
        self.next_attempt_after_millis <= now_millis
    }
}
/// Sink for repair tasks produced by under-replicated writes.
#[async_trait]
pub trait RepairQueue: Send + Sync {
    /// Persist or buffer `task` for later processing by a repair worker.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask);
}

View file

@ -1,13 +1,18 @@
//! Local chunk storage
use dashmap::DashMap;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;
use std::sync::atomic::{AtomicU64, Ordering};
use thiserror::Error;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio::sync::Mutex;
use tracing::debug;
const WRITE_LOCK_STRIPES: usize = 256;
/// Errors from chunk storage operations
#[derive(Debug, Error)]
pub enum StorageError {
@ -45,6 +50,12 @@ pub struct LocalChunkStore {
/// Whether writes should be flushed before they are acknowledged.
sync_on_write: bool,
/// Monotonic nonce for per-write temporary paths.
temp_file_nonce: AtomicU64,
/// Striped per-chunk write/delete locks to keep same-key updates coherent.
write_locks: Vec<Mutex<()>>,
}
impl LocalChunkStore {
@ -65,6 +76,8 @@ impl LocalChunkStore {
max_capacity,
chunk_count: AtomicU64::new(0),
sync_on_write,
temp_file_nonce: AtomicU64::new(0),
write_locks: (0..WRITE_LOCK_STRIPES).map(|_| Mutex::new(())).collect(),
};
// Scan existing chunks
@ -91,7 +104,7 @@ impl LocalChunkStore {
if metadata.is_file() {
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.ends_with(".tmp") {
if name.ends_with(".tmp") || name.starts_with(".tmp.") {
continue;
}
@ -131,6 +144,25 @@ impl LocalChunkStore {
self.data_dir.join(safe_id)
}
/// Build a unique temporary path next to the final chunk path.
///
/// The name combines the target file name, the process id, and a
/// process-wide monotonically increasing nonce, so concurrent writers to
/// the same chunk never collide on a temp file.
fn temporary_chunk_path(&self, path: &std::path::Path) -> PathBuf {
    let nonce = self.temp_file_nonce.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let file_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "chunk",
    };
    let directory = path.parent().unwrap_or(&self.data_dir);
    directory.join(format!(".tmp.{file_name}.{pid}.{nonce}"))
}
/// Pick the lock stripe for `chunk_id` via a stable hash of the id, so
/// all writers/deleters of the same chunk serialize on one mutex.
fn write_lock(&self, chunk_id: &str) -> &Mutex<()> {
    let mut hasher = DefaultHasher::new();
    chunk_id.hash(&mut hasher);
    let stripes = self.write_locks.len().max(1);
    let stripe = (hasher.finish() as usize) % stripes;
    &self.write_locks[stripe]
}
async fn resolve_existing_chunk_path(&self, chunk_id: &str) -> StorageResult<PathBuf> {
if let Some(path) = self.chunk_paths.get(chunk_id) {
return Ok(path.clone());
@ -154,6 +186,7 @@ impl LocalChunkStore {
/// Store a chunk
pub async fn put(&self, chunk_id: &str, data: &[u8]) -> StorageResult<u64> {
let _guard = self.write_lock(chunk_id).lock().await;
let size = data.len() as u64;
// Check if replacing existing chunk
@ -169,7 +202,7 @@ impl LocalChunkStore {
}
let path = self.chunk_path(chunk_id);
let temp_path = path.with_extension(".tmp");
let temp_path = self.temporary_chunk_path(&path);
if let Some(parent) = path.parent() {
// Multipart uploads fan out concurrent writes into the same shard
// directory. Create the parent path unconditionally so no writer can
@ -217,6 +250,7 @@ impl LocalChunkStore {
/// Delete a chunk
pub async fn delete(&self, chunk_id: &str) -> StorageResult<()> {
let _guard = self.write_lock(chunk_id).lock().await;
if let Some((_, size)) = self.chunk_sizes.remove(chunk_id) {
let path = match self.chunk_paths.remove(chunk_id) {
Some((_, path)) => path,
@ -421,4 +455,34 @@ mod tests {
assert_eq!(store.chunk_count(), 16);
}
#[tokio::test]
async fn test_concurrent_rewrites_same_chunk_use_unique_temp_paths() {
    let (store, _temp) = create_test_store().await;
    let store = Arc::new(store);
    // 8 writer tasks plus this task all rendezvous before writing.
    let barrier = Arc::new(Barrier::new(9));
    let mut tasks = Vec::new();
    for idx in 0..8u8 {
        let store = Arc::clone(&store);
        let barrier = Arc::clone(&barrier);
        tasks.push(tokio::spawn(async move {
            let payload = vec![idx; 2048];
            barrier.wait().await;
            store.put("shared-chunk", &payload).await.unwrap();
            payload
        }));
    }
    barrier.wait().await;
    let mut expected_payloads = Vec::new();
    for task in tasks {
        expected_payloads.push(task.await.unwrap());
    }
    // Which writer wins is unspecified, but the stored bytes must match
    // one writer's payload exactly (no torn/interleaved writes) and the
    // store must count exactly one chunk.
    let stored = store.get("shared-chunk").await.unwrap();
    assert!(expected_payloads.iter().any(|payload| payload == &stored));
    assert_eq!(store.chunk_count(), 1);
}
}

View file

@ -17,6 +17,7 @@ lightningstor-distributed = { workspace = true }
lightningstor-storage = { workspace = true }
chainfire-client = { path = "../../../chainfire/chainfire-client" }
flaredb-client = { path = "../../../flaredb/crates/flaredb-client" }
iam-api = { path = "../../../iam/crates/iam-api" }
iam-service-auth = { path = "../../../iam/crates/iam-service-auth" }
tonic = { workspace = true }
tonic-health = { workspace = true }

View file

@ -9,8 +9,11 @@ mod bucket_service;
pub mod config;
pub mod metadata;
mod object_service;
pub mod repair;
pub mod s3;
pub mod tenant;
pub use bucket_service::BucketServiceImpl;
pub use config::ServerConfig;
pub use object_service::ObjectServiceImpl;
pub use repair::{MetadataRepairQueue, spawn_replicated_repair_worker};

View file

@ -5,11 +5,13 @@ use clap::Parser;
use iam_service_auth::AuthService;
use lightningstor_api::{BucketServiceServer, ObjectServiceServer};
use lightningstor_distributed::{
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, StaticNodeRegistry,
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, RepairQueue,
StaticNodeRegistry,
};
use lightningstor_server::{
config::{MetadataBackend, ObjectStorageBackend},
metadata::MetadataStore,
repair::{spawn_replicated_repair_worker, MetadataRepairQueue},
s3, BucketServiceImpl, ObjectServiceImpl, ServerConfig,
};
use lightningstor_storage::{LocalFsBackend, StorageBackend};
@ -28,6 +30,12 @@ const OBJECT_GRPC_INITIAL_STREAM_WINDOW: u32 = 64 * 1024 * 1024;
const OBJECT_GRPC_INITIAL_CONNECTION_WINDOW: u32 = 512 * 1024 * 1024;
const OBJECT_GRPC_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
const OBJECT_GRPC_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(10);
const REPLICATED_REPAIR_SCAN_INTERVAL: Duration = Duration::from_secs(5);
/// Bundles the object storage backend with its optional background
/// repair worker so `main` can keep the worker alive for the server's
/// lifetime.
struct StorageRuntime {
    /// The active object storage backend implementation.
    backend: Arc<dyn StorageBackend>,
    /// Background task draining the replicated repair queue; `None` for
    /// backends that run no repairs (local fs, erasure-coded).
    repair_worker: Option<tokio::task::JoinHandle<()>>,
}
/// LightningStor object storage server
#[derive(Parser, Debug)]
@ -148,8 +156,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
metrics_addr
);
let storage = create_storage_backend(&config).await?;
if let Some(endpoint) = &config.chainfire_endpoint {
tracing::info!(" Cluster coordination: ChainFire @ {}", endpoint);
let endpoint = endpoint.clone();
@ -204,6 +210,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
};
let storage_runtime = create_storage_backend(&config, metadata.clone()).await?;
let storage = storage_runtime.backend.clone();
let _repair_worker = storage_runtime.repair_worker;
// Initialize IAM authentication service
tracing::info!(
"Connecting to IAM server at {}",
@ -253,7 +263,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let s3_addr: SocketAddr = config.s3_addr;
// Start S3 HTTP server with shared state
let s3_router = s3::create_router_with_state(storage.clone(), metadata.clone());
let s3_router = s3::create_router_with_auth(
storage.clone(),
metadata.clone(),
Some(config.auth.iam_server_addr.clone()),
);
let s3_server = tokio::spawn(async move {
tracing::info!("S3 HTTP server listening on {}", s3_addr);
let listener = tokio::net::TcpListener::bind(s3_addr).await.unwrap();
@ -422,24 +436,27 @@ async fn register_chainfire_membership(
async fn create_storage_backend(
config: &ServerConfig,
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
metadata: Arc<MetadataStore>,
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
match config.object_storage_backend {
ObjectStorageBackend::LocalFs => {
tracing::info!("Object storage backend: local_fs");
Ok(Arc::new(
LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?,
))
Ok(StorageRuntime {
backend: Arc::new(LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?),
repair_worker: None,
})
}
ObjectStorageBackend::Distributed => {
tracing::info!("Object storage backend: distributed");
create_distributed_storage_backend(&config.distributed).await
create_distributed_storage_backend(&config.distributed, metadata).await
}
}
}
async fn create_distributed_storage_backend(
config: &DistributedConfig,
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
metadata: Arc<MetadataStore>,
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
let endpoints: Vec<String> = config
.node_endpoints
.iter()
@ -501,9 +518,25 @@ async fn create_distributed_storage_backend(
write_quorum,
"Using replicated LightningStor storage backend"
);
Ok(Arc::new(
ReplicatedBackend::new(config.clone(), registry).await?,
))
let repair_queue: Arc<dyn RepairQueue> =
Arc::new(MetadataRepairQueue::new(metadata.clone()));
let backend = Arc::new(
ReplicatedBackend::new_with_repair_queue(
config.clone(),
registry,
Some(repair_queue),
)
.await?,
);
let repair_worker = Some(spawn_replicated_repair_worker(
metadata,
backend.clone(),
REPLICATED_REPAIR_SCAN_INTERVAL,
));
Ok(StorageRuntime {
backend,
repair_worker,
})
}
RedundancyMode::ErasureCoded {
data_shards,
@ -514,9 +547,10 @@ async fn create_distributed_storage_backend(
parity_shards,
"Using erasure-coded LightningStor storage backend"
);
Ok(Arc::new(
ErasureCodedBackend::new(config.clone(), registry).await?,
))
Ok(StorageRuntime {
backend: Arc::new(ErasureCodedBackend::new(config.clone(), registry).await?),
repair_worker: None,
})
}
RedundancyMode::None => Err(std::io::Error::other(
"distributed object storage does not support redundancy.type=none; use object_storage_backend=local_fs instead",

View file

@ -2,6 +2,7 @@
use dashmap::DashMap;
use flaredb_client::RdbClient;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{Bucket, BucketId, MultipartUpload, Object, ObjectId, Result};
use serde_json;
use sqlx::pool::PoolOptions;
@ -215,6 +216,12 @@ impl MetadataStore {
end_key
}
/// Smallest key strictly greater than `key` in lexicographic byte order
/// (key + 0x00), used as an exclusive lower bound for range scans.
fn exclusive_scan_start(key: &[u8]) -> Vec<u8> {
    let mut successor = Vec::with_capacity(key.len() + 1);
    successor.extend_from_slice(key);
    successor.push(0);
    successor
}
fn flaredb_client_for_key<'a>(
clients: &'a [Arc<Mutex<RdbClient>>],
key: &[u8],
@ -422,6 +429,56 @@ impl MetadataStore {
Ok(results)
}
/// Fetch one page of `(key, value)` pairs under `prefix` from FlareDB.
///
/// `start_after` is an exclusive cursor: scanning resumes just past it.
/// Returns the page plus a flag saying whether more rows remain; the scan
/// over-fetches one row so "has more" is decided without a second round
/// trip. Falls back to a strongly-consistent scan when the server
/// requires it.
async fn flaredb_scan_page(
    clients: &[Arc<Mutex<RdbClient>>],
    prefix: &[u8],
    start_after: Option<&[u8]>,
    limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
    let end_key = Self::prefix_end(prefix);
    // Start just past the cursor when present, else at the prefix itself.
    let start_key = start_after
        .map(Self::exclusive_scan_start)
        .unwrap_or_else(|| prefix.to_vec());
    // Over-fetch by one row to detect whether another page exists.
    let fetch_limit = limit.saturating_add(1).max(1);
    let client = Self::flaredb_scan_client(clients);
    let (mut items, next) = match {
        // Hold the client lock only for the duration of the RPC.
        let mut c = client.lock().await;
        c.raw_scan(start_key.clone(), end_key.clone(), fetch_limit).await
    } {
        Ok((keys, values, next)) => {
            let items = keys
                .into_iter()
                .zip(values.into_iter())
                .map(|(key, value)| {
                    (
                        String::from_utf8_lossy(&key).to_string(),
                        String::from_utf8_lossy(&value).to_string(),
                    )
                })
                .collect::<Vec<_>>();
            (items, next)
        }
        Err(status) if Self::flaredb_requires_strong(&status) => {
            Self::flaredb_scan_strong(client, &start_key, &end_key, fetch_limit).await?
        }
        Err(error) => {
            return Err(lightningstor_types::Error::StorageError(format!(
                "FlareDB scan failed: {}",
                error
            )));
        }
    };
    // Trim the sentinel row; if it was returned, more data surely exists.
    let has_more = if items.len() > limit as usize {
        items.truncate(limit as usize);
        true
    } else {
        next.is_some()
    };
    Ok((items, has_more))
}
async fn flaredb_has_prefix(clients: &[Arc<Mutex<RdbClient>>], prefix: &[u8]) -> Result<bool> {
let end_key = Self::prefix_end(prefix);
let client = Self::flaredb_scan_client(clients);
@ -613,11 +670,146 @@ impl MetadataStore {
results.push((entry.key().clone(), entry.value().clone()));
}
}
results.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
Ok(results)
}
}
}
/// Fetch one page of `(key, value)` pairs whose keys start with `prefix`,
/// ordered by key, from whichever backend is configured.
///
/// `start_after` is an exclusive cursor (only strictly greater keys are
/// returned); `limit` caps the page size. The returned flag is true when
/// more rows remain past this page. Every backend over-fetches one row to
/// decide "has more" without a second query.
async fn get_prefix_page(
    &self,
    prefix: &str,
    start_after: Option<&str>,
    limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
    if limit == 0 {
        return Ok((Vec::new(), false));
    }
    match &self.backend {
        StorageBackend::FlareDB(client) => {
            Self::flaredb_scan_page(
                client,
                prefix.as_bytes(),
                start_after.map(str::as_bytes),
                limit,
            )
            .await
        }
        StorageBackend::Sql(sql) => {
            // Exclusive upper bound for the range: smallest string greater
            // than every key sharing this prefix.
            let prefix_end = String::from_utf8(Self::prefix_end(prefix.as_bytes())).map_err(|e| {
                lightningstor_types::Error::StorageError(format!(
                    "Failed to encode prefix end: {}",
                    e
                ))
            })?;
            // +1 sentinel row to detect a following page.
            let fetch_limit = (limit.saturating_add(1)) as i64;
            match sql {
                SqlStorageBackend::Postgres(pool) => {
                    let rows: Vec<(String, String)> = if let Some(after) = start_after {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= $1 AND key < $2 AND key > $3
                             ORDER BY key
                             LIMIT $4",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(after)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "Postgres paged scan failed: {}",
                                e
                            ))
                        })?
                    } else {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= $1 AND key < $2
                             ORDER BY key
                             LIMIT $3",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "Postgres paged scan failed: {}",
                                e
                            ))
                        })?
                    };
                    let has_more = rows.len() > limit as usize;
                    let items = rows.into_iter().take(limit as usize).collect();
                    Ok((items, has_more))
                }
                SqlStorageBackend::Sqlite(pool) => {
                    let rows: Vec<(String, String)> = if let Some(after) = start_after {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= ?1 AND key < ?2 AND key > ?3
                             ORDER BY key
                             LIMIT ?4",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(after)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "SQLite paged scan failed: {}",
                                e
                            ))
                        })?
                    } else {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= ?1 AND key < ?2
                             ORDER BY key
                             LIMIT ?3",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "SQLite paged scan failed: {}",
                                e
                            ))
                        })?
                    };
                    let has_more = rows.len() > limit as usize;
                    let items = rows.into_iter().take(limit as usize).collect();
                    Ok((items, has_more))
                }
            }
        }
        StorageBackend::InMemory(map) => {
            // DashMap iteration order is arbitrary: collect, then sort to
            // get stable, key-ordered pagination.
            let mut rows: Vec<(String, String)> = map
                .iter()
                .filter(|entry| entry.key().starts_with(prefix))
                .map(|entry| (entry.key().clone(), entry.value().clone()))
                .collect();
            rows.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
            if let Some(after) = start_after {
                rows.retain(|(key, _)| key.as_str() > after);
            }
            let has_more = rows.len() > limit as usize;
            let items = rows.into_iter().take(limit as usize).collect();
            Ok((items, has_more))
        }
    }
}
/// Internal: check if any key exists with a prefix
async fn has_prefix(&self, prefix: &str) -> Result<bool> {
match &self.backend {
@ -708,10 +900,64 @@ impl MetadataStore {
"/lightningstor/multipart/uploads/"
}
/// Storage key indexing a multipart upload by bucket, object key, and upload id.
fn multipart_bucket_key(bucket_id: &str, object_key: &str, upload_id: &str) -> String {
    [
        "/lightningstor/multipart/by-bucket/",
        bucket_id,
        "/",
        object_key,
        "/",
        upload_id,
    ]
    .concat()
}
/// Scan prefix covering every multipart upload of `bucket_id` whose object key
/// starts with `prefix`.
fn multipart_bucket_prefix(bucket_id: &BucketId, prefix: &str) -> String {
    let mut scan_key = format!("/lightningstor/multipart/by-bucket/{}/", bucket_id);
    scan_key.push_str(prefix);
    scan_key
}
/// Storage key mapping a completed object id back to its multipart upload record.
fn multipart_object_key(object_id: &ObjectId) -> String {
    let mut key = String::from("/lightningstor/multipart/objects/");
    key.push_str(&object_id.to_string());
    key
}
/// Storage key for one persisted replicated-repair task.
fn replicated_repair_task_key(task_id: &str) -> String {
    ["/lightningstor/repair/replicated/", task_id].concat()
}

/// Common prefix under which all replicated-repair tasks are stored.
fn replicated_repair_task_prefix() -> &'static str {
    "/lightningstor/repair/replicated/"
}
/// Persist (insert or overwrite) a replicated-repair task keyed by its id.
pub async fn save_replicated_repair_task(&self, task: &ReplicatedRepairTask) -> Result<()> {
    let serialized = serde_json::to_string(task).map_err(|e| {
        lightningstor_types::Error::StorageError(format!(
            "Failed to serialize replicated repair task: {}",
            e
        ))
    })?;
    let storage_key = Self::replicated_repair_task_key(&task.id);
    self.put(&storage_key, &serialized).await
}
/// Load up to `limit` persisted replicated-repair tasks.
///
/// Fails on the first entry that does not deserialize, mirroring a strict
/// read of the queue.
pub async fn list_replicated_repair_tasks(
    &self,
    limit: u32,
) -> Result<Vec<ReplicatedRepairTask>> {
    let (entries, _has_more) = self
        .get_prefix_page(Self::replicated_repair_task_prefix(), None, limit)
        .await?;
    entries
        .into_iter()
        .map(|(_, raw)| {
            serde_json::from_str::<ReplicatedRepairTask>(&raw).map_err(|e| {
                lightningstor_types::Error::StorageError(format!(
                    "Failed to deserialize replicated repair task: {}",
                    e
                ))
            })
        })
        .collect()
}
/// Remove a replicated-repair task from the persisted queue by id.
pub async fn delete_replicated_repair_task(&self, task_id: &str) -> Result<()> {
    let storage_key = Self::replicated_repair_task_key(task_id);
    self.delete_key(&storage_key).await
}
/// Save bucket metadata
pub async fn save_bucket(&self, bucket: &Bucket) -> Result<()> {
let key = Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
@ -900,6 +1146,13 @@ impl MetadataStore {
prefix: &str,
max_keys: u32,
) -> Result<Vec<Object>> {
if max_keys > 0 {
return self
.list_objects_page(bucket_id, prefix, None, max_keys)
.await
.map(|(objects, _)| objects);
}
let prefix_key = Self::object_prefix(bucket_id, prefix);
let items = self.get_prefix(&prefix_key).await?;
@ -921,6 +1174,34 @@ impl MetadataStore {
Ok(objects)
}
/// List up to `max_keys` objects of `bucket_id` under `prefix`, resuming after
/// `start_after_key` (a caller-facing object key, not a raw storage key).
/// Returns the page plus a flag indicating whether more results exist.
pub async fn list_objects_page(
    &self,
    bucket_id: &BucketId,
    prefix: &str,
    start_after_key: Option<&str>,
    max_keys: u32,
) -> Result<(Vec<Object>, bool)> {
    // A zero-sized page is answered without touching the backend.
    if max_keys == 0 {
        return Ok((Vec::new(), false));
    }
    let prefix_key = Self::object_prefix(bucket_id, prefix);
    // Translate the object key into its storage-key form so the backend's
    // ordered scan can resume after it. Uses the versionless form —
    // NOTE(review): confirm this is the intended resume point once versioned
    // object keys are in play.
    let start_after_storage_key =
        start_after_key.map(|key| Self::object_key(bucket_id, key, None));
    let (items, has_more) = self
        .get_prefix_page(&prefix_key, start_after_storage_key.as_deref(), max_keys)
        .await?;
    let mut objects = Vec::new();
    for (_, value) in items {
        // Records that fail to deserialize are silently skipped: a corrupt
        // entry shrinks the page instead of failing the whole listing.
        if let Ok(object) = serde_json::from_str::<Object>(&value) {
            objects.push(object);
        }
    }
    Ok((objects, has_more))
}
pub async fn save_multipart_upload(&self, upload: &MultipartUpload) -> Result<()> {
let key = Self::multipart_upload_key(upload.upload_id.as_str());
let value = serde_json::to_string(upload).map_err(|e| {
@ -929,7 +1210,16 @@ impl MetadataStore {
e
))
})?;
self.put(&key, &value).await
self.put(&key, &value).await?;
self.put(
&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
),
&value,
)
.await
}
pub async fn load_multipart_upload(&self, upload_id: &str) -> Result<Option<MultipartUpload>> {
@ -948,6 +1238,14 @@ impl MetadataStore {
}
pub async fn delete_multipart_upload(&self, upload_id: &str) -> Result<()> {
if let Some(upload) = self.load_multipart_upload(upload_id).await? {
self.delete_key(&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
))
.await?;
}
self.delete_key(&Self::multipart_upload_key(upload_id)).await
}
@ -957,9 +1255,24 @@ impl MetadataStore {
prefix: &str,
max_uploads: u32,
) -> Result<Vec<MultipartUpload>> {
let items = self.get_prefix(Self::multipart_upload_prefix()).await?;
let index_prefix = Self::multipart_bucket_prefix(bucket_id, prefix);
let items = if max_uploads > 0 {
self.get_prefix_page(&index_prefix, None, max_uploads)
.await?
.0
} else {
self.get_prefix(&index_prefix).await?
};
let mut uploads = Vec::new();
for (_, value) in items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
uploads.push(upload);
}
}
if uploads.is_empty() {
let fallback_items = self.get_prefix(Self::multipart_upload_prefix()).await?;
for (_, value) in fallback_items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
if upload.bucket_id == bucket_id.to_string()
&& upload.key.as_str().starts_with(prefix)
@ -968,6 +1281,7 @@ impl MetadataStore {
}
}
}
}
uploads.sort_by(|a, b| {
a.key
@ -1033,6 +1347,7 @@ fn normalize_transport_addr(endpoint: &str) -> String {
#[cfg(test)]
mod tests {
use super::*;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{BucketName, ETag, ObjectKey};
#[tokio::test]
@ -1119,4 +1434,123 @@ mod tests {
.is_none()
);
}
/// With three objects and a page size of two, the first page must return the
/// first two keys with `has_more == true`, and resuming after "b.txt" must
/// return only the tail with `has_more == false`.
#[tokio::test]
async fn list_objects_page_honors_start_after_and_has_more() {
    let store = MetadataStore::new_in_memory();
    let bucket = Bucket::new(
        BucketName::new("paged-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&bucket).await.unwrap();
    for key in ["a.txt", "b.txt", "c.txt"] {
        let mut object = Object::new(
            bucket.id.to_string(),
            ObjectKey::new(key).unwrap(),
            ETag::from_md5(&[7u8; 16]),
            128,
            Some("text/plain".to_string()),
        );
        object.version = lightningstor_types::ObjectVersion::null();
        store.save_object(&object).await.unwrap();
    }
    let (first_page, first_has_more) = store
        .list_objects_page(&bucket.id, "", None, 2)
        .await
        .unwrap();
    assert_eq!(
        first_page
            .iter()
            .map(|object| object.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["a.txt".to_string(), "b.txt".to_string()]
    );
    assert!(first_has_more);
    let (second_page, second_has_more) = store
        .list_objects_page(&bucket.id, "", Some("b.txt"), 2)
        .await
        .unwrap();
    assert_eq!(
        second_page
            .iter()
            .map(|object| object.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["c.txt".to_string()]
    );
    assert!(!second_has_more);
}
/// Listing must be scoped by the per-bucket index: only uploads of the
/// requested bucket whose key matches the prefix are returned, and an upload
/// in a different bucket with a matching key prefix is excluded.
#[tokio::test]
async fn list_multipart_uploads_uses_bucket_prefix_index() {
    let store = MetadataStore::new_in_memory();
    let bucket = Bucket::new(
        BucketName::new("multipart-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&bucket).await.unwrap();
    let upload_a = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/one.bin").unwrap());
    let upload_b = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/two.bin").unwrap());
    let other_bucket = Bucket::new(
        BucketName::new("other-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&other_bucket).await.unwrap();
    let upload_other =
        MultipartUpload::new(other_bucket.id.to_string(), ObjectKey::new("a/three.bin").unwrap());
    store.save_multipart_upload(&upload_a).await.unwrap();
    store.save_multipart_upload(&upload_b).await.unwrap();
    store.save_multipart_upload(&upload_other).await.unwrap();
    let uploads = store
        .list_multipart_uploads(&bucket.id, "a/", 10)
        .await
        .unwrap();
    assert_eq!(uploads.len(), 2);
    assert_eq!(
        uploads
            .iter()
            .map(|upload| upload.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["a/one.bin".to_string(), "a/two.bin".to_string()]
    );
}
/// Save → list → retry-update → delete round trip for replicated repair tasks.
#[tokio::test]
async fn replicated_repair_tasks_round_trip() {
    let store = MetadataStore::new_in_memory();
    let mut task = ReplicatedRepairTask::new("obj_abc", 0, "quorum write");
    store.save_replicated_repair_task(&task).await.unwrap();
    let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
    assert_eq!(tasks.len(), 1);
    assert_eq!(tasks[0].key, "obj_abc");
    // Re-saving after schedule_retry must overwrite the persisted record
    // under the same task id, not create a second one.
    task.schedule_retry("transient failure", 5_000);
    store.save_replicated_repair_task(&task).await.unwrap();
    let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
    assert_eq!(tasks[0].attempt_count, 1);
    assert_eq!(tasks[0].last_error.as_deref(), Some("transient failure"));
    store
        .delete_replicated_repair_task(&task.id)
        .await
        .unwrap();
    assert!(store
        .list_replicated_repair_tasks(10)
        .await
        .unwrap()
        .is_empty());
}
}

View file

@ -155,6 +155,10 @@ impl ObjectServiceImpl {
.await
.map_err(|e| Status::internal(format!("Failed to delete multipart part: {}", e)))?;
}
self.storage
.delete_upload_parts(upload.upload_id.as_str())
.await
.map_err(|e| Status::internal(format!("Failed to clean multipart upload: {}", e)))?;
Ok(())
}
@ -465,7 +469,6 @@ impl ObjectService for ObjectServiceImpl {
let (start, end) =
Self::resolve_range(object.size as usize, req.range_start, req.range_end);
if object.etag.is_multipart() {
if let Some(upload) = self
.metadata
.load_object_multipart_upload(&object.id)
@ -476,7 +479,6 @@ impl ObjectService for ObjectServiceImpl {
self.multipart_object_stream(&object, upload, start, end),
));
}
}
let data = self
.storage
@ -524,7 +526,6 @@ impl ObjectService for ObjectServiceImpl {
.map_err(Self::to_status)?
.ok_or_else(|| Status::not_found(format!("Object {} not found", req.key)))?;
if object.etag.is_multipart() {
if let Some(upload) = self
.metadata
.load_object_multipart_upload(&object.id)
@ -540,12 +541,6 @@ impl ObjectService for ObjectServiceImpl {
.delete_multipart_upload(upload.upload_id.as_str())
.await
.map_err(Self::to_status)?;
} else {
self.storage
.delete_object(&object.id)
.await
.map_err(|e| Status::internal(format!("Failed to delete object: {}", e)))?;
}
} else {
self.storage
.delete_object(&object.id)

View file

@ -0,0 +1,182 @@
use crate::metadata::MetadataStore;
use async_trait::async_trait;
use lightningstor_distributed::{RepairQueue, ReplicatedBackend, ReplicatedRepairTask};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tracing::{debug, warn};
/// Maximum number of repair tasks fetched per worker pass.
const REPAIR_SCAN_LIMIT: u32 = 256;
/// Base retry backoff; doubles per attempt (see `repair_backoff_millis`).
const REPAIR_BACKOFF_BASE_MILLIS: u64 = 1_000;
/// Upper bound on retry backoff.
const REPAIR_BACKOFF_MAX_MILLIS: u64 = 60_000;
/// After this many failed attempts, a task whose chunk no longer exists on
/// any replica is dropped as an orphan.
const ORPHAN_REPAIR_DROP_ATTEMPTS: u32 = 8;
/// Repair queue backed by the metadata store, so enqueued tasks are persisted
/// rather than held only in memory.
pub struct MetadataRepairQueue {
    metadata: Arc<MetadataStore>,
}

impl MetadataRepairQueue {
    /// Wrap a metadata store as the persistence layer for repair tasks.
    pub fn new(metadata: Arc<MetadataStore>) -> Self {
        Self { metadata }
    }
}

#[async_trait]
impl RepairQueue for MetadataRepairQueue {
    /// Persist a repair task. A save failure is logged and swallowed — the
    /// trait method returns nothing, so enqueueing is best-effort by design.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
        if let Err(error) = self.metadata.save_replicated_repair_task(&task).await {
            warn!(
                task_id = task.id,
                chunk_key = task.key,
                error = %error,
                "failed to persist replicated repair task"
            );
        }
    }
}
/// Spawn a background task that drains the persisted repair queue once per
/// `interval`, forever. The returned handle can be used to abort the loop.
pub fn spawn_replicated_repair_worker(
    metadata: Arc<MetadataStore>,
    backend: Arc<ReplicatedBackend>,
    interval: Duration,
) -> JoinHandle<()> {
    tokio::spawn(async move {
        loop {
            if let Err(error) = process_replicated_repair_queue(&metadata, &backend).await {
                // Errors that look like startup-phase unreadiness (metadata
                // region not yet available) are expected and logged at debug;
                // everything else is a warning.
                if replicated_repair_queue_transiently_unready(&error) {
                    debug!(error = %error, "replicated repair queue pass deferred until metadata becomes ready");
                } else {
                    warn!(error = %error, "replicated repair queue pass failed");
                }
            }
            sleep(interval).await;
        }
    })
}
/// One pass over the persisted repair queue: attempt every due task, delete
/// tasks that repair successfully, and reschedule failures with exponential
/// backoff. Metadata-store errors abort the pass and bubble up to the caller.
async fn process_replicated_repair_queue(
    metadata: &MetadataStore,
    backend: &ReplicatedBackend,
) -> Result<(), lightningstor_types::Error> {
    let now = unix_time_millis();
    let tasks = metadata
        .list_replicated_repair_tasks(REPAIR_SCAN_LIMIT)
        .await?;
    for mut task in tasks {
        // Skip tasks whose backoff window has not elapsed yet.
        if !task.is_due(now) {
            continue;
        }
        match backend.repair_chunk(&task).await {
            Ok(()) => {
                metadata.delete_replicated_repair_task(&task.id).await?;
                debug!(
                    task_id = task.id,
                    chunk_key = task.key,
                    "repaired replicated chunk"
                );
            }
            Err(error) => {
                // After enough failed attempts, probe whether the chunk still
                // exists on any replica; a task with no remaining source can
                // never succeed and is dropped as an orphan.
                if task.attempt_count >= ORPHAN_REPAIR_DROP_ATTEMPTS {
                    match backend.chunk_exists_anywhere(&task.key).await {
                        Ok(false) => {
                            warn!(
                                task_id = task.id,
                                chunk_key = task.key,
                                attempts = task.attempt_count,
                                "dropping orphan replicated repair task with no remaining source replica"
                            );
                            metadata.delete_replicated_repair_task(&task.id).await?;
                            continue;
                        }
                        Ok(true) => {}
                        // A probe failure does not drop the task — it falls
                        // through to the normal retry path below.
                        Err(probe_error) => {
                            warn!(
                                task_id = task.id,
                                chunk_key = task.key,
                                error = %probe_error,
                                "failed to probe global chunk existence while evaluating orphan repair task"
                            );
                        }
                    }
                }
                let backoff = repair_backoff_millis(task.attempt_count);
                task.schedule_retry(error.to_string(), backoff);
                metadata.save_replicated_repair_task(&task).await?;
                warn!(
                    task_id = task.id,
                    chunk_key = task.key,
                    attempts = task.attempt_count,
                    backoff_millis = backoff,
                    error = %error,
                    "replicated chunk repair failed"
                );
            }
        }
    }
    Ok(())
}
/// Current wall-clock time as milliseconds since the Unix epoch.
/// A clock set before the epoch yields 0 instead of panicking.
fn unix_time_millis() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as u64,
        Err(_) => 0,
    }
}
/// Exponential backoff for repair retries: base * 2^attempt, with the
/// exponent clamped at 6 and the result capped at the configured maximum.
fn repair_backoff_millis(attempt_count: u32) -> u64 {
    let doubling_steps = u32::min(attempt_count, 6);
    let scaled = REPAIR_BACKOFF_BASE_MILLIS.saturating_mul(1u64 << doubling_steps);
    scaled.min(REPAIR_BACKOFF_MAX_MILLIS)
}
fn replicated_repair_queue_transiently_unready(error: &lightningstor_types::Error) -> bool {
let rendered = error.to_string().to_ascii_lowercase();
let transient = rendered.contains("region not found")
|| rendered.contains("status: notfound")
|| rendered.contains("metadata backend not ready")
|| rendered.contains("notleader");
if transient {
return true;
}
match error {
lightningstor_types::Error::StorageError(message)
| lightningstor_types::Error::Internal(message) => {
let message = message.to_ascii_lowercase();
message.contains("region not found")
|| message.contains("status: notfound")
|| message.contains("metadata backend not ready")
|| message.contains("notleader")
}
_ => false,
}
}
#[cfg(test)]
mod tests {
    use super::replicated_repair_queue_transiently_unready;

    /// A FlareDB "region not found" during startup must be classified transient.
    #[test]
    fn treats_region_not_found_as_transient_startup_state() {
        let error = lightningstor_types::Error::StorageError(
            "FlareDB scan failed: status: NotFound, message: \"region not found\"".to_string(),
        );
        assert!(replicated_repair_queue_transiently_unready(&error));
    }

    /// The full tonic status rendering (with details/metadata noise) must
    /// still match the transient classifier.
    #[test]
    fn treats_wrapped_storage_error_rendering_as_transient_startup_state() {
        let error = lightningstor_types::Error::StorageError(
            "FlareDB scan failed: status: NotFound, message: \"region not found\", details: [], metadata: MetadataMap { headers: {} }".to_string(),
        );
        assert!(replicated_repair_queue_transiently_unready(&error));
    }

    /// Genuine repair failures must not be downgraded to the transient path.
    #[test]
    fn keeps_real_repair_failures_as_warnings() {
        let error =
            lightningstor_types::Error::StorageError("replication checksum mismatch".to_string());
        assert!(!replicated_repair_queue_transiently_unready(&error));
    }
}

View file

@ -10,13 +10,17 @@ use axum::{
middleware::Next,
response::{IntoResponse, Response},
};
use crate::tenant::TenantContext;
use hmac::{Hmac, Mac};
use iam_api::proto::{iam_credential_client::IamCredentialClient, GetSecretKeyRequest};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio::sync::{Mutex, RwLock};
use tonic::transport::Channel;
use tracing::{debug, warn};
use url::form_urlencoded;
use std::time::{Duration as StdDuration, Instant};
type HmacSha256 = Hmac<Sha256>;
const DEFAULT_MAX_AUTH_BODY_BYTES: usize = 1024 * 1024 * 1024;
@ -27,6 +31,13 @@ pub(crate) struct VerifiedBodyBytes(pub Bytes);
#[derive(Clone, Debug)]
pub(crate) struct VerifiedPayloadHash(pub String);
/// Tenant scope (org/project) resolved during SigV4 auth; the middleware
/// inserts it into request extensions for downstream handlers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct VerifiedTenantContext(pub TenantContext);
/// Buffer the request body for signing only when the client did not declare
/// a payload hash via `x-amz-content-sha256`.
fn should_buffer_auth_body(payload_hash_header: Option<&str>) -> bool {
    matches!(payload_hash_header, None)
}
/// SigV4 authentication state
#[derive(Clone)]
pub struct AuthState {
@ -40,21 +51,73 @@ pub struct AuthState {
aws_service: String,
}
/// IAM client: resolves S3 access keys either from environment variables or
/// via the IAM gRPC credential service, with a short-lived in-process cache.
pub struct IamClient {
    // Active resolution backend (env map or gRPC endpoint).
    mode: IamClientMode,
    // Per-access-key cache of resolved credentials.
    credential_cache: Arc<RwLock<HashMap<String, CachedCredential>>>,
    // How long a cached credential entry stays valid.
    cache_ttl: StdDuration,
}
/// Backend used by `IamClient` to resolve access keys.
enum IamClientMode {
    /// Static credentials loaded from environment variables (dev/testing).
    Env {
        credentials: std::collections::HashMap<String, String>,
    },
    /// Live lookups against the IAM gRPC credential service; the channel is
    /// dialed lazily and cached.
    Grpc {
        endpoint: String,
        channel: Arc<Mutex<Option<Channel>>>,
    },
}

/// Outcome of a successful credential lookup: signing secret plus the tenant
/// scope the key is bound to.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ResolvedCredential {
    pub secret_key: String,
    pub principal_id: String,
    pub org_id: Option<String>,
    pub project_id: Option<String>,
}

/// Cache entry pairing a resolved credential with its insertion time, used
/// for TTL-based expiry.
struct CachedCredential {
    credential: ResolvedCredential,
    cached_at: Instant,
}
impl IamClient {
/// Create a new IamClient loading credentials from environment variables for MVP.
/// Create a new IAM client. If a non-empty endpoint is supplied, lookups go
/// through the IAM gRPC API; otherwise credentials come from the environment.
/// The credential cache TTL defaults to 30s and can be overridden via
/// `LIGHTNINGSTOR_S3_IAM_CACHE_TTL_SECS`.
pub fn new(iam_endpoint: Option<String>) -> Self {
    let default_ttl = StdDuration::from_secs(30);
    let cache_ttl = match std::env::var("LIGHTNINGSTOR_S3_IAM_CACHE_TTL_SECS") {
        Ok(raw) => raw
            .parse::<u64>()
            .map(StdDuration::from_secs)
            .unwrap_or(default_ttl),
        Err(_) => default_ttl,
    };
    let grpc_endpoint = iam_endpoint
        .map(|value| normalize_iam_endpoint(&value))
        .filter(|value| !value.is_empty());
    let mode = match grpc_endpoint {
        Some(endpoint) => IamClientMode::Grpc {
            endpoint,
            channel: Arc::new(Mutex::new(None)),
        },
        None => IamClientMode::Env {
            credentials: Self::load_env_credentials(),
        },
    };
    Self {
        mode,
        credential_cache: Arc::new(RwLock::new(HashMap::new())),
        cache_ttl,
    }
}
/// Load credentials from environment variables for fallback/testing.
///
/// Supports two formats:
/// 1. Single credential: S3_ACCESS_KEY_ID + S3_SECRET_KEY
/// 2. Multiple credentials: S3_CREDENTIALS="key1:secret1,key2:secret2,..."
///
/// TODO: Replace with proper IAM gRPC integration (see T060)
pub fn new() -> Self {
fn load_env_credentials() -> std::collections::HashMap<String, String> {
let mut credentials = std::collections::HashMap::new();
// Option 1: Multiple credentials via S3_CREDENTIALS
@ -87,28 +150,160 @@ impl IamClient {
warn!("Set S3_CREDENTIALS or S3_ACCESS_KEY_ID/S3_SECRET_KEY to enable access.");
}
Self { credentials }
credentials
}
/// Validate access key and return secret key
pub async fn get_secret_key(&self, access_key_id: &str) -> Result<String, String> {
self.credentials
// Test-only accessor: the env-backed credential map, if that mode is active.
#[cfg(test)]
fn env_credentials(&self) -> Option<&std::collections::HashMap<String, String>> {
    if let IamClientMode::Env { credentials } = &self.mode {
        Some(credentials)
    } else {
        None
    }
}
/// Tenant scope for env-backed credentials: first matching env var wins,
/// falling back to "default" so the result is always `Some`.
fn env_default_tenant() -> (Option<String>, Option<String>) {
    fn scoped(primary: &str, fallback: &str) -> Option<String> {
        std::env::var(primary)
            .or_else(|_| std::env::var(fallback))
            .ok()
            .or_else(|| Some("default".to_string()))
    }
    (
        scoped("S3_TENANT_ORG_ID", "S3_ORG_ID"),
        scoped("S3_TENANT_PROJECT_ID", "S3_PROJECT_ID"),
    )
}
/// Validate an access key and resolve its credential context (secret key,
/// principal, tenant scope). Env mode answers from the static map with a
/// default tenant; gRPC mode checks the TTL cache first and falls back to a
/// remote lookup, caching the result. Errors are human-readable strings.
pub async fn get_credential(&self, access_key_id: &str) -> Result<ResolvedCredential, String> {
    match &self.mode {
        IamClientMode::Env { credentials } => {
            let secret_key = credentials
                .get(access_key_id)
                .cloned()
                .ok_or_else(|| "Access key ID not found".to_string())?;
            let (org_id, project_id) = Self::env_default_tenant();
            Ok(ResolvedCredential {
                secret_key,
                principal_id: access_key_id.to_string(),
                org_id,
                project_id,
            })
        }
        IamClientMode::Grpc { endpoint, channel } => {
            // Serve from the in-process cache when the entry is still fresh.
            if let Some(credential) = self.cached_credential(access_key_id).await {
                return Ok(credential);
            }
            let response = self
                .grpc_get_secret_key(endpoint, channel, access_key_id)
                .await?;
            let response = response.into_inner();
            let credential = ResolvedCredential {
                secret_key: response.secret_key,
                principal_id: response.principal_id,
                org_id: response.org_id,
                project_id: response.project_id,
            };
            self.cache_credential(access_key_id, &credential).await;
            Ok(credential)
        }
    }
}
/// Return the cached credential for `access_key_id` if present and not
/// older than the configured TTL.
async fn cached_credential(&self, access_key_id: &str) -> Option<ResolvedCredential> {
    let cache = self.credential_cache.read().await;
    let entry = cache.get(access_key_id)?;
    if entry.cached_at.elapsed() > self.cache_ttl {
        return None;
    }
    Some(entry.credential.clone())
}
/// Insert (or refresh) the cache entry for `access_key_id`, stamped now.
async fn cache_credential(&self, access_key_id: &str, credential: &ResolvedCredential) {
    let entry = CachedCredential {
        credential: credential.clone(),
        cached_at: Instant::now(),
    };
    self.credential_cache
        .write()
        .await
        .insert(access_key_id.to_string(), entry);
}
/// Return the cached gRPC channel, dialing the IAM endpoint on first use.
// NOTE(review): the mutex is held across `connect().await`, so concurrent
// first-time lookups serialize behind a single dial attempt — confirm this
// is intended (it avoids a thundering herd, at the cost of head-of-line
// blocking while the endpoint is unreachable).
async fn grpc_channel(
    endpoint: &str,
    channel: &Arc<Mutex<Option<Channel>>>,
) -> Result<Channel, String> {
    let mut cached = channel.lock().await;
    if let Some(existing) = cached.as_ref() {
        return Ok(existing.clone());
    }
    let created = Channel::from_shared(endpoint.to_string())
        .map_err(|e| format!("failed to parse IAM credential endpoint: {}", e))?
        .connect()
        .await
        .map_err(|e| format!("failed to connect to IAM credential service: {}", e))?;
    *cached = Some(created.clone());
    Ok(created)
}
/// Drop the cached channel so the next lookup reconnects from scratch.
async fn invalidate_grpc_channel(channel: &Arc<Mutex<Option<Channel>>>) {
    channel.lock().await.take();
}
/// Look up a secret key over gRPC, retrying once on transport-level failures.
/// On the first retryable status the cached channel is invalidated and the
/// call is retried with a fresh connection; any other failure (or a second
/// one) is returned as the status message.
async fn grpc_get_secret_key(
    &self,
    endpoint: &str,
    channel: &Arc<Mutex<Option<Channel>>>,
    access_key_id: &str,
) -> Result<tonic::Response<iam_api::proto::GetSecretKeyResponse>, String> {
    for attempt in 0..2 {
        let grpc_channel = Self::grpc_channel(endpoint, channel).await?;
        let mut client = IamCredentialClient::new(grpc_channel);
        match client
            .get_secret_key(GetSecretKeyRequest {
                access_key_id: access_key_id.to_string(),
            })
            .await
        {
            Ok(response) => return Ok(response),
            // Transport-flavored failures on the first attempt: drop the
            // cached channel and loop for one retry.
            Err(status)
                if attempt == 0
                    && matches!(
                        status.code(),
                        tonic::Code::Unavailable
                            | tonic::Code::Cancelled
                            | tonic::Code::Unknown
                            | tonic::Code::DeadlineExceeded
                            | tonic::Code::Internal
                    ) =>
            {
                Self::invalidate_grpc_channel(channel).await;
            }
            Err(status) => return Err(status.message().to_string()),
        }
    }
    // Defensive terminal error: only reachable if the second attempt also
    // hit the retryable arm, which the `attempt == 0` guard prevents.
    Err("IAM credential lookup exhausted retries".to_string())
}
}
/// Ensure the IAM endpoint carries a scheme; bare host:port defaults to http.
fn normalize_iam_endpoint(endpoint: &str) -> String {
    let has_scheme = ["http://", "https://"]
        .iter()
        .any(|scheme| endpoint.starts_with(scheme));
    if has_scheme {
        endpoint.to_string()
    } else {
        format!("http://{}", endpoint)
    }
}
impl AuthState {
/// Create new auth state with IAM integration
pub fn new(iam_endpoint: Option<String>) -> Self {
let iam_client = if let Some(_endpoint) = iam_endpoint {
// TODO: Connect to real IAM gRPC service
// For now, if an endpoint is provided, we still use our env var based client
Some(Arc::new(RwLock::new(IamClient::new())))
} else {
Some(Arc::new(RwLock::new(IamClient::new())))
};
let iam_client = Some(Arc::new(RwLock::new(IamClient::new(iam_endpoint))));
Self {
iam_client,
@ -198,9 +393,9 @@ pub async fn sigv4_auth_middleware(
};
// Get secret key from IAM (or use dummy for MVP)
let secret_key = if let Some(ref iam) = auth_state.iam_client {
match iam.read().await.get_secret_key(&access_key_id).await {
Ok(key) => key,
let credential = if let Some(ref iam) = auth_state.iam_client {
match iam.read().await.get_credential(&access_key_id).await {
Ok(credential) => credential,
Err(e) => {
warn!("IAM credential validation failed: {}", e);
return error_response(
@ -211,18 +406,22 @@ pub async fn sigv4_auth_middleware(
}
}
} else {
// This case should ideally not be hit with the current IamClient::new() logic
// but kept for safety.
debug!("No IAM integration, using dummy secret key if IamClient wasn't initialized.");
"dummy_secret_key_for_mvp".to_string()
ResolvedCredential {
secret_key: "dummy_secret_key_for_mvp".to_string(),
principal_id: access_key_id.clone(),
org_id: Some("default".to_string()),
project_id: Some("default".to_string()),
}
};
let secret_key = credential.secret_key.as_str();
let payload_hash_header = headers
.get("x-amz-content-sha256")
.and_then(|value| value.to_str().ok())
.filter(|value| !value.is_empty())
.map(str::to_string);
let should_buffer_body = !matches!(payload_hash_header.as_deref(), Some(hash) if hash != "UNSIGNED-PAYLOAD");
let should_buffer_body = should_buffer_auth_body(payload_hash_header.as_deref());
let body_bytes = if should_buffer_body {
let max_body_bytes = std::env::var("S3_MAX_AUTH_BODY_BYTES")
@ -282,7 +481,7 @@ pub async fn sigv4_auth_middleware(
);
let expected_signature = match compute_sigv4_signature(
&secret_key,
secret_key,
&method,
&uri,
&headers,
@ -310,6 +509,21 @@ pub async fn sigv4_auth_middleware(
);
}
match (credential.org_id, credential.project_id) {
(Some(org_id), Some(project_id)) => {
request
.extensions_mut()
.insert(VerifiedTenantContext(TenantContext { org_id, project_id }));
}
_ => {
return error_response(
StatusCode::FORBIDDEN,
"AccessDenied",
"S3 credential is missing tenant scope",
);
}
}
// Auth successful
debug!("SigV4 auth successful for access_key={}", access_key_id);
next.run(request).await
@ -558,6 +772,97 @@ fn error_response(status: StatusCode, code: &str, message: &str) -> Response {
mod tests {
use super::*;
use axum::http::HeaderValue;
use iam_api::proto::{
iam_credential_server::{IamCredential, IamCredentialServer},
CreateS3CredentialRequest, CreateS3CredentialResponse, Credential, GetSecretKeyResponse,
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
RevokeCredentialResponse,
};
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::{atomic::{AtomicUsize, Ordering}, Mutex};
use tokio::net::TcpListener;
use tokio::time::{sleep, Duration};
use tonic::{Request as TonicRequest, Response as TonicResponse, Status};
use tonic::transport::Server;
// Serializes tests that mutate process-wide environment variables.
static ENV_LOCK: Mutex<()> = Mutex::new(());

/// In-process stand-in for the IAM credential service: serves fixture
/// secrets and counts `get_secret_key` calls so tests can assert cache hits.
#[derive(Clone, Default)]
struct MockIamCredentialService {
    // access_key_id -> secret_key fixtures.
    secrets: Arc<HashMap<String, String>>,
    // Incremented on every get_secret_key RPC.
    get_secret_calls: Arc<AtomicUsize>,
}

#[tonic::async_trait]
impl IamCredential for MockIamCredentialService {
    async fn create_s3_credential(
        &self,
        _request: TonicRequest<CreateS3CredentialRequest>,
    ) -> Result<TonicResponse<CreateS3CredentialResponse>, Status> {
        Err(Status::unimplemented("not needed in test"))
    }
    async fn get_secret_key(
        &self,
        request: TonicRequest<GetSecretKeyRequest>,
    ) -> Result<TonicResponse<GetSecretKeyResponse>, Status> {
        let access_key_id = request.into_inner().access_key_id;
        self.get_secret_calls.fetch_add(1, Ordering::SeqCst);
        let Some(secret_key) = self.secrets.get(&access_key_id) else {
            return Err(Status::not_found("access key not found"));
        };
        // Fixed tenant scope so tests can assert it end-to-end.
        Ok(TonicResponse::new(GetSecretKeyResponse {
            secret_key: secret_key.clone(),
            principal_id: "test-principal".to_string(),
            expires_at: None,
            org_id: Some("test-org".to_string()),
            project_id: Some("test-project".to_string()),
            principal_kind: iam_api::proto::PrincipalKind::ServiceAccount as i32,
        }))
    }
    async fn list_credentials(
        &self,
        _request: TonicRequest<ListCredentialsRequest>,
    ) -> Result<TonicResponse<ListCredentialsResponse>, Status> {
        Ok(TonicResponse::new(ListCredentialsResponse {
            credentials: Vec::<Credential>::new(),
        }))
    }
    async fn revoke_credential(
        &self,
        _request: TonicRequest<RevokeCredentialRequest>,
    ) -> Result<TonicResponse<RevokeCredentialResponse>, Status> {
        Ok(TonicResponse::new(RevokeCredentialResponse { success: true }))
    }
}
/// Start the mock IAM gRPC server on an ephemeral port and wait (up to
/// ~500ms) until it accepts connections. Returns the bound address and the
/// shared get_secret_key call counter.
async fn start_mock_iam(secrets: HashMap<String, String>) -> (SocketAddr, Arc<AtomicUsize>) {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    let get_secret_calls = Arc::new(AtomicUsize::new(0));
    let service = MockIamCredentialService {
        secrets: Arc::new(secrets),
        get_secret_calls: get_secret_calls.clone(),
    };
    // NOTE(review): the listener is dropped and `Server::serve` re-binds the
    // same port, leaving a window where another process can grab it. Serving
    // from the existing listener would remove the race but needs an
    // incoming-stream adapter (e.g. tokio_stream) not currently depended on.
    drop(listener);
    tokio::spawn(async move {
        Server::builder()
            .add_service(IamCredentialServer::new(service))
            .serve(addr)
            .await
            .unwrap();
    });
    // Poll until the server is reachable before handing the address out.
    for _ in 0..20 {
        if tokio::net::TcpStream::connect(addr).await.is_ok() {
            return (addr, get_secret_calls);
        }
        sleep(Duration::from_millis(25)).await;
    }
    panic!("mock IAM server did not start on {}", addr);
}
#[tokio::test]
async fn test_parse_auth_header() {
@ -657,6 +962,13 @@ mod tests {
assert_eq!(hashed_payload, "signed-payload-hash");
}
#[test]
fn test_should_buffer_auth_body_only_when_hash_header_missing() {
assert!(should_buffer_auth_body(None));
assert!(!should_buffer_auth_body(Some("signed-payload-hash")));
assert!(!should_buffer_auth_body(Some("UNSIGNED-PAYLOAD")));
}
#[test]
fn test_build_string_to_sign() {
let amz_date = "20231201T000000Z";
@ -677,34 +989,77 @@ mod tests {
#[test]
fn test_iam_client_multi_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test parsing S3_CREDENTIALS format
std::env::set_var("S3_CREDENTIALS", "key1:secret1,key2:secret2,key3:secret3");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
assert_eq!(client.credentials.len(), 3);
assert_eq!(client.credentials.get("key1"), Some(&"secret1".to_string()));
assert_eq!(client.credentials.get("key2"), Some(&"secret2".to_string()));
assert_eq!(client.credentials.get("key3"), Some(&"secret3".to_string()));
assert_eq!(credentials.len(), 3);
assert_eq!(credentials.get("key1"), Some(&"secret1".to_string()));
assert_eq!(credentials.get("key2"), Some(&"secret2".to_string()));
assert_eq!(credentials.get("key3"), Some(&"secret3".to_string()));
std::env::remove_var("S3_CREDENTIALS");
}
#[test]
fn test_iam_client_single_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test legacy S3_ACCESS_KEY_ID/S3_SECRET_KEY format
std::env::remove_var("S3_CREDENTIALS");
std::env::set_var("S3_ACCESS_KEY_ID", "test_key");
std::env::set_var("S3_SECRET_KEY", "test_secret");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
assert_eq!(client.credentials.len(), 1);
assert_eq!(client.credentials.get("test_key"), Some(&"test_secret".to_string()));
assert_eq!(credentials.len(), 1);
assert_eq!(credentials.get("test_key"), Some(&"test_secret".to_string()));
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
}
/// gRPC mode resolves the secret and tenant scope served by the mock, and an
/// unknown access key surfaces the IAM service's error message.
#[tokio::test]
async fn test_iam_client_grpc_lookup() {
    let (addr, _calls) = start_mock_iam(HashMap::from([(
        "grpc_key".to_string(),
        "grpc_secret".to_string(),
    )]))
    .await;
    let client = IamClient::new(Some(addr.to_string()));
    let credential = client.get_credential("grpc_key").await.unwrap();
    assert_eq!(credential.secret_key, "grpc_secret");
    assert_eq!(credential.org_id.as_deref(), Some("test-org"));
    assert_eq!(credential.project_id.as_deref(), Some("test-project"));
    assert_eq!(
        client.get_credential("missing").await.unwrap_err(),
        "access key not found"
    );
}

/// A second lookup of the same key within the cache TTL is served from the
/// in-process cache: the mock's RPC counter stays at one.
#[tokio::test]
async fn test_iam_client_grpc_cache_reuses_secret() {
    let (addr, calls) = start_mock_iam(HashMap::from([(
        "grpc_key".to_string(),
        "grpc_secret".to_string(),
    )]))
    .await;
    let client = IamClient::new(Some(addr.to_string()));
    assert_eq!(
        client.get_credential("grpc_key").await.unwrap().secret_key,
        "grpc_secret"
    );
    assert_eq!(
        client.get_credential("grpc_key").await.unwrap().secret_key,
        "grpc_secret"
    );
    assert_eq!(calls.load(Ordering::SeqCst), 1);
}
#[test]
fn test_complete_sigv4_signature() {
// Test with AWS example credentials (from AWS docs)
@ -1039,18 +1394,20 @@ mod tests {
#[test]
fn test_security_credential_lookup_unknown_key() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that unknown access keys return the correct result
std::env::remove_var("S3_CREDENTIALS");
std::env::set_var("S3_ACCESS_KEY_ID", "known_key");
std::env::set_var("S3_SECRET_KEY", "known_secret");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
// Known key should be found in credentials map
assert_eq!(client.credentials.get("known_key"), Some(&"known_secret".to_string()));
assert_eq!(credentials.get("known_key"), Some(&"known_secret".to_string()));
// Unknown key should not be found
assert_eq!(client.credentials.get("unknown_key"), None);
assert_eq!(credentials.get("unknown_key"), None);
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
@ -1058,33 +1415,36 @@ mod tests {
#[test]
fn test_security_empty_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that IamClient keeps credentials empty when none provided
std::env::remove_var("S3_CREDENTIALS");
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
let client = IamClient::new();
let client = IamClient::new(None);
// No credentials configured
assert!(client.credentials.is_empty());
assert!(client.env_credentials().unwrap().is_empty());
}
#[test]
fn test_security_malformed_s3_credentials_env() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that malformed S3_CREDENTIALS are handled gracefully
// Missing colon separator
std::env::set_var("S3_CREDENTIALS", "key1_secret1,key2:secret2");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
// Should only parse the valid pair (key2:secret2)
assert_eq!(client.credentials.len(), 1);
assert!(client.credentials.contains_key("key2"));
assert_eq!(credentials.len(), 1);
assert!(credentials.contains_key("key2"));
// Empty pairs
std::env::set_var("S3_CREDENTIALS", "key1:secret1,,key2:secret2");
let client2 = IamClient::new();
let client2 = IamClient::new(None);
// Should parse both valid pairs, skip empty
assert_eq!(client2.credentials.len(), 2);
assert_eq!(client2.env_credentials().unwrap().len(), 2);
std::env::remove_var("S3_CREDENTIALS");
}

View file

@ -7,4 +7,4 @@ mod router;
mod xml;
pub use auth::{AuthState, sigv4_auth_middleware};
pub use router::{create_router, create_router_with_state};
pub use router::{create_router, create_router_with_auth, create_router_with_state};

File diff suppressed because it is too large Load diff

View file

@ -66,6 +66,9 @@ pub struct ListBucketResult {
pub name: String,
#[serde(rename = "Prefix")]
pub prefix: String,
#[serde(rename = "Marker")]
#[serde(skip_serializing_if = "Option::is_none")]
pub marker: Option<String>,
#[serde(rename = "Delimiter")]
#[serde(skip_serializing_if = "Option::is_none")]
pub delimiter: Option<String>,
@ -73,6 +76,9 @@ pub struct ListBucketResult {
pub max_keys: u32,
#[serde(rename = "IsTruncated")]
pub is_truncated: bool,
#[serde(rename = "NextMarker")]
#[serde(skip_serializing_if = "Option::is_none")]
pub next_marker: Option<String>,
#[serde(rename = "Contents", default)]
pub contents: Vec<ObjectEntry>,
#[serde(rename = "CommonPrefixes", default)]

View file

@ -1,6 +1,6 @@
use tonic::{metadata::MetadataMap, Status};
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TenantContext {
pub org_id: String,
pub project_id: String,

146
nix/ci/flake.lock generated
View file

@ -1,5 +1,26 @@
{
"nodes": {
"disko": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"lastModified": 1765326679,
"narHash": "sha256-fTLX9kDwLr9Y0rH/nG+h1XG5UU+jBcy0PFYn5eneRX8=",
"owner": "nix-community",
"repo": "disko",
"rev": "d64e5cdca35b5fad7c504f615357a7afe6d9c49e",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "disko",
"type": "github"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
@ -18,6 +39,43 @@
"type": "github"
}
},
"flake-utils_2": {
"inputs": {
"systems": "systems_2"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nix-nos": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"path": "./nix-nos",
"type": "path"
},
"original": {
"path": "./nix-nos",
"type": "path"
},
"parent": [
"photoncloud"
]
},
"nixpkgs": {
"locked": {
"lastModified": 1765186076,
@ -34,14 +92,71 @@
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1765186076,
"narHash": "sha256-hM20uyap1a0M9d344I692r+ik4gTMyj60cQWO+hAYP8=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "addf7cf5f383a3101ecfba091b98d0a1263dc9b8",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"photoncloud": {
"inputs": {
"disko": "disko",
"flake-utils": "flake-utils_2",
"nix-nos": "nix-nos",
"nixpkgs": "nixpkgs_2",
"rust-overlay": "rust-overlay",
"systems": "systems_3"
},
"locked": {
"path": "../..",
"type": "path"
},
"original": {
"path": "../..",
"type": "path"
},
"parent": []
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
"photoncloud": "photoncloud",
"rust-overlay": "rust-overlay_2"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"lastModified": 1765465581,
"narHash": "sha256-fCXT0aZXmTalM3NPCTedVs9xb0egBG5BOZkcrYo5PGE=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "99cc5667eece98bb35dcf35f7e511031a8b7a125",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
},
"rust-overlay_2": {
"inputs": {
"nixpkgs": [
"nixpkgs"
@ -75,6 +190,35 @@
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_3": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
}
},
"root": "root",

View file

@ -5,6 +5,7 @@
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
photoncloud.url = "path:../..";
rust-overlay = {
url = "github:oxalica/rust-overlay";
@ -12,7 +13,7 @@
};
};
outputs = { self, nixpkgs, flake-utils, rust-overlay }:
outputs = { self, nixpkgs, flake-utils, photoncloud, rust-overlay }:
flake-utils.lib.eachDefaultSystem (system:
let
overlays = [ (import rust-overlay) ];
@ -201,7 +202,7 @@
if [[ "$no_logs" == "0" ]]; then
local out
out="$logdir/shared_${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
out="$logdir/shared_''${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
(cd "$repo_root" && bash -c "$cmd") 2>&1 | tee "$out"
else
(cd "$repo_root" && bash -c "$cmd")
@ -291,6 +292,11 @@
${gate}/bin/photoncloud-gate --tier 0 --no-logs
touch $out/ok
'';
checks.deployer-vm-smoke = photoncloud.checks.${system}.deployer-vm-smoke;
checks.deployer-vm-rollback = photoncloud.checks.${system}.deployer-vm-rollback;
checks.deployer-bootstrap-e2e = photoncloud.checks.${system}.deployer-bootstrap-e2e;
checks.host-lifecycle-e2e = photoncloud.checks.${system}.host-lifecycle-e2e;
checks.fleet-scheduler-e2e = photoncloud.checks.${system}.fleet-scheduler-e2e;
devShells.default = pkgs.mkShell {
name = "photoncloud-ci-dev";

View file

@ -0,0 +1,67 @@
{ lib, modulesPath, ... }:
{
imports = [
"${modulesPath}/virtualisation/qemu-vm.nix"
"${modulesPath}/testing/test-instrumentation.nix"
];
boot.loader.grub = {
enable = true;
device = "/dev/vda";
forceInstall = true;
};
fileSystems."/" = {
device = "/dev/disk/by-label/nixos";
fsType = "ext4";
};
networking.hostName = "worker";
networking.firewall.enable = false;
networking.useDHCP = lib.mkForce false;
networking.dhcpcd.enable = lib.mkForce false;
systemd.network = {
enable = true;
networks."10-eth0" = {
matchConfig.Name = "eth0";
networkConfig.DHCP = "yes";
linkConfig.RequiredForOnline = "routable";
};
networks."20-eth1" = {
matchConfig.Name = "eth1";
address = [ "192.168.1.2/24" ];
linkConfig.RequiredForOnline = "routable";
};
};
nix.registry = lib.mkForce { };
nix.nixPath = lib.mkForce [ ];
nix.channel.enable = false;
nix.settings = {
experimental-features = [
"nix-command"
"flakes"
];
flake-registry = "";
};
nixpkgs.flake = {
source = lib.mkForce null;
setFlakeRegistry = lib.mkForce false;
setNixPath = lib.mkForce false;
};
system.switch.enable = lib.mkForce true;
system.nixos.label = lib.mkForce "vm-smoke-target";
system.nixos.version = lib.mkForce "vm-smoke-target";
system.nixos.versionSuffix = lib.mkForce "-vm-smoke-target";
environment.etc."photon-vm-smoke-target".text = "vm-smoke-target\n";
documentation.enable = false;
documentation.nixos.enable = false;
documentation.man.enable = false;
documentation.info.enable = false;
documentation.doc.enable = false;
system.stateVersion = "24.11";
}

View file

@ -33,6 +33,12 @@ let
mkDesiredSystemType = types: types.submodule {
options = {
deploymentId = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional host deployment identifier owning this desired system";
};
nixosConfiguration = mkOption {
type = types.nullOr types.str;
default = null;
@ -62,6 +68,119 @@ let
default = null;
description = "Whether nix-agent should roll back when the health check fails";
};
drainBeforeApply = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the controller should drain the node before issuing this desired system";
};
};
};
mkHostDeploymentSelectorType = types: types.submodule {
options = {
nodeIds = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Explicit node IDs targeted by the deployment";
};
roles = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node roles targeted by the deployment";
};
pools = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node pools targeted by the deployment";
};
nodeClasses = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node classes targeted by the deployment";
};
matchLabels = mkOption {
type = types.attrsOf types.str;
default = { };
description = "Label selectors applied to target nodes";
};
};
};
mkHostDeploymentType = types:
let
selectorType = mkHostDeploymentSelectorType types;
in types.submodule {
options = {
selector = mkOption {
type = selectorType;
default = { };
description = "Node selector used by the host deployment";
};
nixosConfiguration = mkOption {
type = types.nullOr types.str;
default = null;
description = "Name of the nixosConfigurations output to roll out";
};
flakeRef = mkOption {
type = types.nullOr types.str;
default = null;
description = "Explicit flake reference used during rollout";
};
batchSize = mkOption {
type = types.nullOr types.int;
default = null;
description = "Maximum number of nodes started per reconciliation wave";
};
maxUnavailable = mkOption {
type = types.nullOr types.int;
default = null;
description = "Maximum number of unavailable nodes allowed during rollout";
};
healthCheckCommand = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Health check command executed by nix-agent after activation";
};
switchAction = mkOption {
type = types.nullOr types.str;
default = null;
description = "switch-to-configuration action used by nix-agent";
};
rollbackOnFailure = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether nodes should roll back when rollout health checks fail";
};
drainBeforeApply = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the controller should drain a node before applying the rollout";
};
rebootPolicy = mkOption {
type = types.nullOr types.str;
default = null;
description = "Operator-facing reboot policy associated with the rollout";
};
paused = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the rollout should start in a paused state";
};
};
};
@ -159,6 +278,30 @@ let
default = null;
description = "Desired deployer node lifecycle state";
};
commissionState = mkOption {
type = types.nullOr (types.enum [ "discovered" "commissioning" "commissioned" ]);
default = null;
description = "Optional commissioning state exported into deployer cluster state";
};
installState = mkOption {
type = types.nullOr (types.enum [ "pending" "installing" "installed" "failed" "reinstall_requested" ]);
default = null;
description = "Optional install lifecycle state exported into deployer cluster state";
};
powerState = mkOption {
type = types.nullOr (types.enum [ "on" "off" "cycling" "unknown" ]);
default = null;
description = "Optional external power-management state associated with the node";
};
bmcRef = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional BMC / Redfish reference associated with the node";
};
};
};
@ -339,7 +482,10 @@ let
mkDesiredSystem = nodeName: desiredSystem:
let
rendered =
optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
optionalAttrs (desiredSystem != null && desiredSystem.deploymentId != null) {
deployment_id = desiredSystem.deploymentId;
}
// optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
nixos_configuration = desiredSystem.nixosConfiguration;
}
// optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) {
@ -353,12 +499,60 @@ let
}
// optionalAttrs (desiredSystem != null && desiredSystem.rollbackOnFailure != null) {
rollback_on_failure = desiredSystem.rollbackOnFailure;
}
// optionalAttrs (desiredSystem != null && desiredSystem.drainBeforeApply != null) {
drain_before_apply = desiredSystem.drainBeforeApply;
};
in
if desiredSystem == null || rendered == { } then null else {
node_id = nodeName;
} // rendered;
mkHostDeploymentSelector = selector:
{
node_ids = selector.nodeIds or [ ];
roles = selector.roles or [ ];
pools = selector.pools or [ ];
node_classes = selector.nodeClasses or [ ];
match_labels = selector.matchLabels or { };
};
mkDeployerHostDeploymentSpec = name: deployment:
{
inherit name;
selector = mkHostDeploymentSelector deployment.selector;
}
// optionalAttrs (deployment.nixosConfiguration != null) {
nixos_configuration = deployment.nixosConfiguration;
}
// optionalAttrs (deployment.flakeRef != null) {
flake_ref = deployment.flakeRef;
}
// optionalAttrs (deployment.batchSize != null) {
batch_size = deployment.batchSize;
}
// optionalAttrs (deployment.maxUnavailable != null) {
max_unavailable = deployment.maxUnavailable;
}
// optionalAttrs (deployment.healthCheckCommand != [ ]) {
health_check_command = deployment.healthCheckCommand;
}
// optionalAttrs (deployment.switchAction != null) {
switch_action = deployment.switchAction;
}
// optionalAttrs (deployment.rollbackOnFailure != null) {
rollback_on_failure = deployment.rollbackOnFailure;
}
// optionalAttrs (deployment.drainBeforeApply != null) {
drain_before_apply = deployment.drainBeforeApply;
}
// optionalAttrs (deployment.rebootPolicy != null) {
reboot_policy = deployment.rebootPolicy;
}
// optionalAttrs (deployment.paused != null) {
paused = deployment.paused;
};
mkDeployerNodeSpec = nodeName: node:
{
node_id = nodeName;
@ -390,6 +584,18 @@ let
}
// optionalAttrs (node.state != null) {
state = node.state;
}
// optionalAttrs (node.commissionState != null) {
commission_state = node.commissionState;
}
// optionalAttrs (node.installState != null) {
install_state = node.installState;
}
// optionalAttrs (node.powerState != null) {
power_state = node.powerState;
}
// optionalAttrs (node.bmcRef != null) {
bmc_ref = node.bmcRef;
};
mkDeployerNodeClassSpec = name: nodeClass:
@ -522,6 +728,7 @@ let
nodeClasses = deployer.nodeClasses or { };
pools = deployer.pools or { };
enrollmentRules = deployer.enrollmentRules or { };
hostDeployments = deployer.hostDeployments or { };
in {
cluster = {
cluster_id = clusterId;
@ -532,6 +739,7 @@ let
node_classes = map (name: mkDeployerNodeClassSpec name nodeClasses.${name}) (attrNames nodeClasses);
pools = map (name: mkDeployerPoolSpec name pools.${name}) (attrNames pools);
enrollment_rules = map (name: mkDeployerEnrollmentRuleSpec name enrollmentRules.${name}) (attrNames enrollmentRules);
host_deployments = map (name: mkDeployerHostDeploymentSpec name hostDeployments.${name}) (attrNames hostDeployments);
services = [ ];
instances = [ ];
mtls_policies = [ ];
@ -541,6 +749,8 @@ in
inherit
mkInstallPlanType
mkDesiredSystemType
mkHostDeploymentSelectorType
mkHostDeploymentType
mkNodeType
mkNodeClassType
mkNodePoolType

View file

@ -2,8 +2,61 @@
let
cfg = config.services.coronafs;
chainfireEnabled = lib.hasAttrByPath [ "services" "chainfire" "enable" ] config && config.services.chainfire.enable;
chainfireApiUrls =
if cfg.chainfireApiUrl != null then
lib.filter (item: item != "") (map lib.strings.trim (lib.splitString "," cfg.chainfireApiUrl))
else
[ ];
effectiveChainfireApiUrl =
if cfg.chainfireApiUrl != null then cfg.chainfireApiUrl
else if chainfireEnabled then "http://127.0.0.1:${toString config.services.chainfire.httpPort}"
else null;
localChainfireApiUrl =
lib.any
(url:
lib.hasPrefix "http://127.0.0.1:" url
|| lib.hasPrefix "http://localhost:" url
)
(
if effectiveChainfireApiUrl == null then
[ ]
else if cfg.chainfireApiUrl != null then
chainfireApiUrls
else
[ effectiveChainfireApiUrl ]
);
waitForChainfire =
pkgs.writeShellScript "coronafs-wait-for-chainfire" ''
set -eu
deadline=$((SECONDS + 60))
urls='${lib.concatStringsSep " " (
if effectiveChainfireApiUrl == null then
[ ]
else if cfg.chainfireApiUrl != null then
chainfireApiUrls
else
[ effectiveChainfireApiUrl ]
)}'
while true; do
for url in $urls; do
if curl -fsS "$url/health" >/dev/null 2>&1; then
exit 0
fi
done
if [ "$SECONDS" -ge "$deadline" ]; then
echo "timed out waiting for ChainFire at ${if effectiveChainfireApiUrl == null then "(none)" else effectiveChainfireApiUrl}" >&2
exit 1
fi
sleep 1
done
'';
tomlFormat = pkgs.formats.toml { };
coronafsConfigFile = tomlFormat.generate "coronafs.toml" {
coronafsConfigFile = tomlFormat.generate "coronafs.toml" (
{
mode = cfg.mode;
metadata_backend = cfg.metadataBackend;
chainfire_key_prefix = cfg.chainfireKeyPrefix;
listen_addr = "0.0.0.0:${toString cfg.port}";
advertise_host = cfg.advertiseHost;
data_dir = toString cfg.dataDir;
@ -20,12 +73,41 @@ let
qemu_nbd_path = "${pkgs.qemu}/bin/qemu-nbd";
qemu_img_path = "${pkgs.qemu}/bin/qemu-img";
log_level = "info";
};
}
// lib.optionalAttrs (effectiveChainfireApiUrl != null) {
chainfire_api_url = effectiveChainfireApiUrl;
}
);
in
{
options.services.coronafs = {
enable = lib.mkEnableOption "CoronaFS block volume service";
mode = lib.mkOption {
type = lib.types.enum [ "combined" "controller" "node" ];
default = "combined";
description = "CoronaFS operating mode: combined compatibility mode, controller-only API, or node-local export mode.";
};
metadataBackend = lib.mkOption {
type = lib.types.enum [ "filesystem" "chainfire" ];
default = "filesystem";
description = "Metadata backend for CoronaFS volume metadata. Use chainfire on controller nodes to replicate volume metadata.";
};
chainfireApiUrl = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Optional ChainFire HTTP API URL used when metadataBackend = chainfire. Comma-separated endpoints are allowed for failover.";
example = "http://127.0.0.1:8081";
};
chainfireKeyPrefix = lib.mkOption {
type = lib.types.str;
default = "/coronafs/volumes";
description = "ChainFire key prefix used to store CoronaFS metadata when metadataBackend = chainfire.";
};
port = lib.mkOption {
type = lib.types.port;
default = 50088;
@ -71,7 +153,7 @@ in
exportAioMode = lib.mkOption {
type = lib.types.enum [ "native" "io_uring" "threads" ];
default = "io_uring";
default = "threads";
description = "qemu-nbd AIO mode for CoronaFS exports.";
};
@ -113,11 +195,22 @@ in
};
config = lib.mkIf cfg.enable {
assertions = [
{
assertion = cfg.metadataBackend != "chainfire" || effectiveChainfireApiUrl != null;
message = "services.coronafs.metadataBackend = \"chainfire\" requires services.coronafs.chainfireApiUrl or a local services.chainfire instance.";
}
];
users.users.coronafs = {
isSystemUser = true;
group = "coronafs";
description = "CoronaFS service user";
home = cfg.dataDir;
extraGroups =
lib.optional
(lib.hasAttrByPath [ "services" "plasmavmc" "enable" ] config && config.services.plasmavmc.enable)
"plasmavmc";
};
users.groups.coronafs = { };
@ -125,8 +218,9 @@ in
systemd.services.coronafs = {
description = "CoronaFS Block Volume Service";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils ];
after = [ "network.target" ] ++ lib.optionals chainfireEnabled [ "chainfire.service" ];
wants = lib.optionals chainfireEnabled [ "chainfire.service" ];
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils pkgs.curl ];
serviceConfig = {
Type = "simple";
@ -138,13 +232,14 @@ in
StateDirectory = "coronafs";
StateDirectoryMode = "0750";
ReadWritePaths = [ cfg.dataDir ];
ExecStartPre = lib.optionals (cfg.metadataBackend == "chainfire" && localChainfireApiUrl) [ waitForChainfire ];
ExecStart = "${cfg.package}/bin/coronafs-server --config ${coronafsConfigFile}";
};
};
systemd.tmpfiles.rules = [
"d ${toString cfg.dataDir} 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/volumes 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/volumes 2770 coronafs coronafs -"
"d ${toString cfg.dataDir}/metadata 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/pids 0750 coronafs coronafs -"
];

View file

@ -3,6 +3,23 @@
let
cfg = config.services.deployer;
tomlFormat = pkgs.formats.toml { };
usesLocalChainfire =
builtins.any
(
endpoint:
lib.hasPrefix "http://127.0.0.1:" endpoint
|| lib.hasPrefix "http://localhost:" endpoint
|| lib.hasPrefix "http://[::1]:" endpoint
)
cfg.chainfireEndpoints;
localChainfireDeps =
lib.optionals
(
usesLocalChainfire
&& lib.hasAttrByPath [ "services" "chainfire" "enable" ] config
&& config.services.chainfire.enable
)
[ "chainfire.service" ];
generatedConfig = {
bind_addr = cfg.bindAddr;
chainfire = {
@ -226,7 +243,9 @@ in
systemd.services.deployer = {
description = "PlasmaCloud Deployer Server";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
wants = [ "network-online.target" ] ++ localChainfireDeps;
after = [ "network-online.target" ] ++ localChainfireDeps;
requires = localChainfireDeps;
environment = {}
// lib.optionalAttrs (cfg.bootstrapToken != null) {

View file

@ -285,7 +285,7 @@ in
healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
leaderUrlKey = "flaredb_leader_url";
defaultLeaderUrl = "http://localhost:8082";
joinPath = null;
joinPath = "/admin/member/add";
port = cfg.flaredbPort;
description = "FlareDB";
} // {

View file

@ -297,6 +297,30 @@ in
description = "Prometheus metrics port for lightningstor-node.";
};
s3StreamingPutThresholdBytes = lib.mkOption {
type = lib.types.int;
default = 64 * 1024 * 1024;
description = "Streaming PUT multipart threshold for the S3 frontend.";
};
s3InlinePutMaxBytes = lib.mkOption {
type = lib.types.int;
default = 128 * 1024 * 1024;
description = "Maximum inline single-PUT size for the S3 frontend.";
};
s3MultipartPutConcurrency = lib.mkOption {
type = lib.types.int;
default = 4;
description = "Maximum in-flight multipart PUT part uploads.";
};
s3MultipartFetchConcurrency = lib.mkOption {
type = lib.types.int;
default = 4;
description = "Maximum concurrent multipart GET part fetches.";
};
databaseUrl = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
@ -369,6 +393,14 @@ in
environment = {
RUST_LOG = "info";
LIGHTNINGSTOR_S3_STREAMING_PUT_THRESHOLD_BYTES =
toString cfg.s3StreamingPutThresholdBytes;
LIGHTNINGSTOR_S3_INLINE_PUT_MAX_BYTES =
toString cfg.s3InlinePutMaxBytes;
LIGHTNINGSTOR_S3_MULTIPART_PUT_CONCURRENCY =
toString cfg.s3MultipartPutConcurrency;
LIGHTNINGSTOR_S3_MULTIPART_FETCH_CONCURRENCY =
toString cfg.s3MultipartFetchConcurrency;
};
};
};

View file

@ -9,6 +9,7 @@ let
nodeClassType = clusterConfigLib.mkNodeClassType types;
nodePoolType = clusterConfigLib.mkNodePoolType types;
enrollmentRuleType = clusterConfigLib.mkEnrollmentRuleType types;
hostDeploymentType = clusterConfigLib.mkHostDeploymentType types;
jsonFormat = pkgs.formats.json { };
# Generate cluster-config.json for the current node
@ -98,6 +99,12 @@ in {
default = { };
description = "Deployer auto-enrollment rules derived from Nix";
};
hostDeployments = mkOption {
type = types.attrsOf hostDeploymentType;
default = { };
description = "Declarative host rollout objects derived from Nix";
};
};
generated = {
@ -173,6 +180,16 @@ in {
) (attrNames cfg.deployer.enrollmentRules);
message = "All deployer enrollment rules must reference existing pools and node classes";
}
{
assertion = all (deploymentName:
let
deployment = cfg.deployer.hostDeployments.${deploymentName};
in
all (pool: cfg.deployer.pools ? "${pool}") deployment.selector.pools
&& all (nodeClass: cfg.deployer.nodeClasses ? "${nodeClass}") deployment.selector.nodeClasses
) (attrNames cfg.deployer.hostDeployments);
message = "All deployer host deployments must reference existing pools and node classes";
}
];
# Generate cluster-config.json for first-boot-automation

View file

@ -2,11 +2,30 @@
let
cfg = config.services.plasmavmc;
localIamDeps = lib.optional (config.services.iam.enable or false) "iam.service";
localIamHealthUrl =
if config.services.iam.enable or false
then "http://127.0.0.1:${toString config.services.iam.httpPort}/health"
else null;
remoteIamEndpoint =
if !(config.services.iam.enable or false) && cfg.iamAddr != null
then cfg.iamAddr
else null;
coronafsEnabled = lib.hasAttrByPath [ "services" "coronafs" "enable" ] config && config.services.coronafs.enable;
coronafsDataDir =
if coronafsEnabled && lib.hasAttrByPath [ "services" "coronafs" "dataDir" ] config
then toString config.services.coronafs.dataDir
else null;
effectiveCoronafsControllerEndpoint =
if cfg.coronafsControllerEndpoint != null then cfg.coronafsControllerEndpoint
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
else null;
effectiveCoronafsNodeEndpoint =
if cfg.coronafsNodeEndpoint != null then cfg.coronafsNodeEndpoint
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
else null;
tomlFormat = pkgs.formats.toml { };
plasmavmcConfigFile = tomlFormat.generate "plasmavmc.toml" {
addr = "0.0.0.0:${toString cfg.port}";
@ -94,10 +113,41 @@ in
coronafsEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
description = "Deprecated combined CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
example = "http://10.0.0.11:50088";
};
coronafsControllerEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS controller HTTP endpoint used to provision and resize managed VM volumes. Comma-separated endpoints are allowed for client-side failover.";
example = "http://10.0.0.11:50088";
};
coronafsNodeEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS node-local HTTP endpoint used to resolve local paths and exports for attached VM volumes. Comma-separated endpoints are allowed for client-side failover.";
example = "http://127.0.0.1:50088";
};
coronafsNodeLocalAttach = lib.mkOption {
type = lib.types.bool;
default = false;
description = ''
Enable writable VM attachment through node-local CoronaFS materialization.
This requires services.plasmavmc.sharedLiveMigration = false because migrations use cold relocate plus flush-back.
'';
};
experimentalCoronafsNodeLocalAttach = lib.mkOption {
type = lib.types.bool;
default = false;
description = ''
Deprecated alias for services.plasmavmc.coronafsNodeLocalAttach.
'';
};
managedVolumeRoot = lib.mkOption {
type = lib.types.path;
default = "/var/lib/plasmavmc/managed-volumes";
@ -173,6 +223,24 @@ in
};
config = lib.mkIf cfg.enable {
assertions = [
{
assertion = !((cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) && cfg.sharedLiveMigration);
message = ''
services.plasmavmc.coronafsNodeLocalAttach requires services.plasmavmc.sharedLiveMigration = false
because writable node-local CoronaFS attachment uses cold relocate plus flush-back instead of shared-storage live migration.
'';
}
];
warnings =
lib.optional (cfg.coronafsEndpoint != null) ''
services.plasmavmc.coronafsEndpoint is deprecated; use services.plasmavmc.coronafsControllerEndpoint and services.plasmavmc.coronafsNodeEndpoint.
''
++ lib.optional (cfg.experimentalCoronafsNodeLocalAttach) ''
services.plasmavmc.experimentalCoronafsNodeLocalAttach is deprecated; use services.plasmavmc.coronafsNodeLocalAttach.
'';
# Create system user
users.users.plasmavmc = {
isSystemUser = true;
@ -188,9 +256,35 @@ in
systemd.services.plasmavmc = {
description = "PlasmaVMC Virtual Machine Compute Service";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" "prismnet.service" "flaredb.service" "chainfire.service" ];
wants = [ "prismnet.service" "flaredb.service" "chainfire.service" ];
path = [ pkgs.qemu pkgs.coreutils ];
after = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
wants = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
path = [ pkgs.qemu pkgs.coreutils pkgs.curl ];
preStart =
lib.optionalString (localIamHealthUrl != null) ''
for _ in $(seq 1 90); do
if curl -fsS ${lib.escapeShellArg localIamHealthUrl} >/dev/null 2>&1; then
exit 0
fi
sleep 1
done
echo "plasmavmc: timed out waiting for local IAM health at ${localIamHealthUrl}" >&2
exit 1
''
+ lib.optionalString (remoteIamEndpoint != null) ''
endpoint=${lib.escapeShellArg remoteIamEndpoint}
endpoint="''${endpoint#http://}"
endpoint="''${endpoint#https://}"
host="''${endpoint%:*}"
port="''${endpoint##*:}"
for _ in $(${pkgs.coreutils}/bin/seq 1 90); do
if ${pkgs.coreutils}/bin/timeout 1 ${pkgs.bash}/bin/bash -lc "</dev/tcp/''${host}/''${port}" >/dev/null 2>&1; then
exit 0
fi
sleep 1
done
echo "plasmavmc: timed out waiting for IAM gRPC at ''${host}:''${port}" >&2
exit 1
'';
environment = lib.mkMerge [
{
@ -213,6 +307,16 @@ in
(lib.mkIf (cfg.lightningstorAddr != null) {
PLASMAVMC_LIGHTNINGSTOR_ENDPOINT = cfg.lightningstorAddr;
})
(lib.mkIf (effectiveCoronafsControllerEndpoint != null) {
PLASMAVMC_CORONAFS_CONTROLLER_ENDPOINT = effectiveCoronafsControllerEndpoint;
})
(lib.mkIf (effectiveCoronafsNodeEndpoint != null) {
PLASMAVMC_CORONAFS_NODE_ENDPOINT = effectiveCoronafsNodeEndpoint;
})
(lib.mkIf (cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) {
PLASMAVMC_CORONAFS_NODE_LOCAL_ATTACH = "1";
PLASMAVMC_CORONAFS_ENABLE_EXPERIMENTAL_NODE_LOCAL_ATTACH = "1";
})
(lib.mkIf (cfg.coronafsEndpoint != null) {
PLASMAVMC_CORONAFS_ENDPOINT = cfg.coronafsEndpoint;
})
@ -273,6 +377,8 @@ in
systemd.tmpfiles.rules = [
"d ${builtins.dirOf (toString cfg.managedVolumeRoot)} 0755 plasmavmc plasmavmc -"
"d ${toString cfg.managedVolumeRoot} 0750 plasmavmc plasmavmc -"
] ++ lib.optionals coronafsEnabled [
"d ${toString cfg.dataDir}/images 2770 plasmavmc coronafs -"
];
};
}

View file

@ -108,6 +108,19 @@
};
};
};
hostDeployments = {
control-plane-canary = {
selector.nodeIds = [ "node01" ];
nixosConfiguration = "node01";
flakeRef = "github:centra/cloud";
batchSize = 1;
maxUnavailable = 1;
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
switchAction = "switch";
rollbackOnFailure = true;
};
};
};
bootstrap.initialPeers = [ "node01" "node02" "node03" ];

View file

@ -32,8 +32,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -42,8 +42,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -42,8 +42,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -63,10 +63,13 @@ Preferred entrypoint for publishable verification: `nix run ./nix/test-cluster#c
Preferred entrypoint for publishable matrix verification: `nix run ./nix/test-cluster#cluster -- fresh-matrix`
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS local-vs-shared-volume I/O, queued random-read behavior, cross-worker direct-I/O shared-volume reads, and LightningStor large/small-object S3 throughput and writes a report to `docs/storage-benchmarks.md`.
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS controller-export vs node-local-export I/O, worker-side materialization latency, and LightningStor large/small-object S3 throughput, then writes a report to `docs/storage-benchmarks.md`.
Preferred entrypoint for publishable storage numbers: `nix run ./nix/test-cluster#cluster -- fresh-storage-bench`
`nix run ./nix/test-cluster#cluster -- bench-coronafs-local-matrix` runs the local single-process CoronaFS export benchmark across the supported `cache`/`aio` combinations so software-path regressions can be separated from VM-lab network limits.
On the current lab hosts, `cache=none` with `aio=io_uring` is the strongest local-export profile and should be treated as the reference point when CoronaFS remote numbers are being distorted by the nested-QEMU/VDE network path.
## Advanced usage
Use the script entrypoint only for local debugging inside a prepared Nix shell:

View file

@ -27,6 +27,18 @@ in
default = "/tmp/photoncloud-test-cluster-vde.sock";
description = "VDE control socket path used for the east-west cluster NIC.";
};
# Shared ChainFire endpoint list consumed by the per-node service configs
# (iam, fiberlb, plasmavmc, …) so every client fails over across all three
# control-plane nodes instead of pinning to node01.
chainfireControlPlaneAddrs = lib.mkOption {
type = lib.types.str;
default = "10.100.0.11:2379,10.100.0.12:2379,10.100.0.13:2379";
description = "Comma-separated ChainFire client endpoints for multi-endpoint failover.";
};
# FlareDB counterpart to the ChainFire endpoint list: same three nodes,
# FlareDB API port 2479 instead of 2379.
flaredbControlPlaneAddrs = lib.mkOption {
type = lib.types.str;
default = "10.100.0.11:2479,10.100.0.12:2479,10.100.0.13:2479";
description = "Comma-separated FlareDB client endpoints for multi-endpoint failover.";
};
};
config = {
@ -84,10 +96,43 @@ in
system.stateVersion = "24.05";
# One-shot unit that disables NIC offloads on the east-west cluster
# interface so nested-QEMU/VDE storage benchmarks are not skewed by
# guest-side segmentation/checksum offloading.
systemd.services.photon-test-cluster-net-tuning = {
description = "Tune cluster NIC offloads for nested-QEMU storage tests";
wantedBy = [ "multi-user.target" ];
# Best-effort ordering after the network is up; the script additionally
# polls for the interface itself below.
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
serviceConfig = {
# oneshot + RemainAfterExit: run once at boot and stay "active" so the
# tuning is not re-applied and dependents can order after it.
Type = "oneshot";
RemainAfterExit = true;
};
path = [ pkgs.ethtool pkgs.iproute2 pkgs.coreutils ];
script = ''
set -eu
# eth1 is the cluster (east-west) NIC — assumed from the surrounding
# test-cluster topology; confirm if NIC ordering changes.
iface="eth1"
# Wait up to 30s for the interface to appear (udev/QEMU hotplug races).
for _ in $(seq 1 30); do
if ip link show "$iface" >/dev/null 2>&1; then
break
fi
sleep 1
done
# Interface never showed up: skip (exit 0) rather than fail the boot —
# tuning is an optimization, not a requirement.
if ! ip link show "$iface" >/dev/null 2>&1; then
echo "photon-test-cluster-net-tuning: $iface not present, skipping" >&2
exit 0
fi
# Nested QEMU over VDE is sensitive to guest-side offloads; disabling
# them reduces retransmits and keeps the storage benchmarks closer to
# raw TCP throughput.
# "|| true": ethtool may reject flags the virtual NIC does not expose.
ethtool -K "$iface" tso off gso off gro off tx off rx off sg off || true
# Deeper tx queue absorbs bursts once offloads are off.
ip link set dev "$iface" txqueuelen 10000 || true
'';
};
environment.systemPackages = with pkgs; [
awscli2
curl
dnsutils
ethtool
fio
jq
grpcurl

View file

@ -115,12 +115,17 @@
curl
grpcurl
jq
llvmPackages.clang
llvmPackages.libclang
openssh
protobuf
clusterPython
qemu
sshpass
vde2
];
LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib";
PROTOC = "${pkgs.protobuf}/bin/protoc";
};
};
}

View file

@ -69,29 +69,29 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.prismnet = {
enable = true;
port = 50081;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.flashdns = {
enable = true;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.fiberlb = {
enable = true;
port = 50085;
iamAddr = "10.100.0.11:50080";
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.plasmavmc = {
@ -101,14 +101,17 @@
httpPort = 8084;
prismnetAddr = "10.100.0.11:50081";
iamAddr = "10.100.0.11:50080";
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
lightningstorAddr = "10.100.0.11:50086";
coronafsEndpoint = "http://10.100.0.11:50088";
coronafsControllerEndpoint = "http://127.0.0.1:50088";
coronafsNodeEndpoint = "http://127.0.0.1:50088";
};
services.coronafs = {
enable = true;
metadataBackend = "chainfire";
chainfireKeyPrefix = "/coronafs/test-cluster/control/volumes";
port = 50088;
advertiseHost = "10.100.0.11";
exportBasePort = 11000;
@ -138,9 +141,9 @@
readQuorum = 1;
writeQuorum = 2;
nodeMetricsPort = 9198;
chainfireAddr = "10.100.0.11:2379";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
zone = "zone-a";
region = "test";
};
@ -149,10 +152,10 @@
enable = true;
port = 50087;
iamAddr = "http://10.100.0.11:50080";
chainfireAddr = "http://10.100.0.11:2379";
chainfireAddr = "http://${config.photonTestCluster.chainfireControlPlaneAddrs}";
prismnetAddr = "http://10.100.0.11:50081";
flaredbPdAddr = "10.100.0.11:2379";
flaredbDirectAddr = "10.100.0.11:2479";
flaredbPdAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbDirectAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
fiberlbAddr = "http://10.100.0.11:50085";
flashdnsAddr = "http://10.100.0.11:50084";
};

View file

@ -41,7 +41,6 @@
nodeId = "node02";
raftAddr = "10.100.0.12:2480";
apiAddr = "10.100.0.12:2479";
pdAddr = "10.100.0.11:2379";
initialPeers = [
"node01=10.100.0.11:2479"
"node02=10.100.0.12:2479"
@ -63,8 +62,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.12:2379";
flaredbAddr = "10.100.0.12:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
systemd.services.iam.environment = {

View file

@ -41,7 +41,6 @@
nodeId = "node03";
raftAddr = "10.100.0.13:2480";
apiAddr = "10.100.0.13:2479";
pdAddr = "10.100.0.11:2379";
initialPeers = [
"node01=10.100.0.11:2479"
"node02=10.100.0.12:2479"
@ -63,8 +62,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.13:2379";
flaredbAddr = "10.100.0.13:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
systemd.services.iam.environment = {

Some files were not shown because too many files have changed in this diff Show more