Implement host lifecycle orchestration and distributed storage restructuring

This commit is contained in:
centra 2026-03-27 12:14:12 +09:00
parent a7d5cfa738
commit 6fa172eab1
Signed by: centra
GPG key ID: 0C09689D20B25ACA
124 changed files with 21742 additions and 4016 deletions

552
apigateway/Cargo.lock generated

File diff suppressed because it is too large Load diff

434
chainfire/Cargo.lock generated
View file

@ -342,6 +342,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
[[package]]
name = "cfg_aliases"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
[[package]]
name = "chainfire-api"
version = "0.1.0"
@ -471,6 +477,7 @@ dependencies = [
"http-body-util",
"metrics",
"metrics-exporter-prometheus",
"reqwest",
"serde",
"serde_json",
"tempfile",
@ -786,6 +793,17 @@ dependencies = [
"crypto-common",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "dlv-list"
version = "0.3.0"
@ -978,8 +996,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"wasi",
"wasm-bindgen",
]
[[package]]
@ -989,9 +1009,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
dependencies = [
"cfg-if",
"js-sys",
"libc",
"r-efi",
"wasip2",
"wasm-bindgen",
]
[[package]]
@ -1150,6 +1172,7 @@ dependencies = [
"tokio",
"tokio-rustls",
"tower-service",
"webpki-roots",
]
[[package]]
@ -1171,6 +1194,7 @@ version = "0.1.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-channel",
"futures-core",
@ -1178,7 +1202,9 @@ dependencies = [
"http",
"http-body",
"hyper",
"ipnet",
"libc",
"percent-encoding",
"pin-project-lite",
"socket2 0.6.1",
"tokio",
@ -1210,6 +1236,108 @@ dependencies = [
"cc",
]
[[package]]
name = "icu_collections"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
dependencies = [
"displaydoc",
"potential_utf",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locale_core"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_normalizer"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
dependencies = [
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
[[package]]
name = "icu_properties"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
dependencies = [
"icu_collections",
"icu_locale_core",
"icu_properties_data",
"icu_provider",
"zerotrie",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "2.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
[[package]]
name = "icu_provider"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
dependencies = [
"displaydoc",
"icu_locale_core",
"writeable",
"yoke",
"zerofrom",
"zerotrie",
"zerovec",
]
[[package]]
name = "idna"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
dependencies = [
"idna_adapter",
"smallvec",
"utf8_iter",
]
[[package]]
name = "idna_adapter"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
dependencies = [
"icu_normalizer",
"icu_properties",
]
[[package]]
name = "indexmap"
version = "1.9.3"
@ -1236,6 +1364,16 @@ version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
[[package]]
name = "iri-string"
version = "0.7.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "is-terminal"
version = "0.4.17"
@ -1367,6 +1505,12 @@ version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
[[package]]
name = "litemap"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
[[package]]
name = "lock_api"
version = "0.4.14"
@ -1382,6 +1526,12 @@ version = "0.4.29"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
[[package]]
name = "lru-slab"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
[[package]]
name = "lz4-sys"
version = "1.11.1+lz4-1.10.0"
@ -1730,6 +1880,15 @@ dependencies = [
"serde",
]
[[package]]
name = "potential_utf"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
dependencies = [
"zerovec",
]
[[package]]
name = "ppv-lite86"
version = "0.2.21"
@ -1889,6 +2048,61 @@ dependencies = [
"winapi",
]
[[package]]
name = "quinn"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
dependencies = [
"bytes",
"cfg_aliases",
"pin-project-lite",
"quinn-proto",
"quinn-udp",
"rustc-hash",
"rustls",
"socket2 0.6.1",
"thiserror 2.0.17",
"tokio",
"tracing",
"web-time",
]
[[package]]
name = "quinn-proto"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
dependencies = [
"bytes",
"getrandom 0.3.4",
"lru-slab",
"rand 0.9.2",
"ring",
"rustc-hash",
"rustls",
"rustls-pki-types",
"slab",
"thiserror 2.0.17",
"tinyvec",
"tracing",
"web-time",
]
[[package]]
name = "quinn-udp"
version = "0.5.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2 0.6.1",
"tracing",
"windows-sys 0.60.2",
]
[[package]]
name = "quote"
version = "1.0.42"
@ -2030,6 +2244,44 @@ version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
[[package]]
name = "reqwest"
version = "0.12.28"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
dependencies = [
"base64 0.22.1",
"bytes",
"futures-core",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-util",
"js-sys",
"log",
"percent-encoding",
"pin-project-lite",
"quinn",
"rustls",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tokio-rustls",
"tower 0.5.2",
"tower-http",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
"webpki-roots",
]
[[package]]
name = "ring"
version = "0.17.14"
@ -2137,6 +2389,7 @@ version = "1.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "708c0f9d5f54ba0272468c1d306a52c495b31fa155e91bc25371e6df7996908c"
dependencies = [
"web-time",
"zeroize",
]
@ -2359,6 +2612,12 @@ dependencies = [
"windows-sys 0.60.2",
]
[[package]]
name = "stable_deref_trait"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "strsim"
version = "0.11.1"
@ -2387,6 +2646,20 @@ name = "sync_wrapper"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
dependencies = [
"futures-core",
]
[[package]]
name = "synstructure"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "tempfile"
@ -2450,6 +2723,16 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "tinystr"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "tinytemplate"
version = "1.2.1"
@ -2460,6 +2743,21 @@ dependencies = [
"serde_json",
]
[[package]]
name = "tinyvec"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "tokio"
version = "1.48.0"
@ -2676,9 +2974,12 @@ checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
dependencies = [
"bitflags 2.10.0",
"bytes",
"futures-util",
"http",
"http-body",
"iri-string",
"pin-project-lite",
"tower 0.5.2",
"tower-layer",
"tower-service",
"tracing",
@ -2788,6 +3089,24 @@ version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
"serde",
]
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "utf8parse"
version = "0.2.2"
@ -2871,6 +3190,19 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c"
dependencies = [
"cfg-if",
"js-sys",
"once_cell",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.106"
@ -2913,6 +3245,25 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "web-time"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "webpki-roots"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
dependencies = [
"rustls-pki-types",
]
[[package]]
name = "winapi"
version = "0.3.9"
@ -3174,6 +3525,12 @@ version = "0.46.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
[[package]]
name = "writeable"
version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
[[package]]
name = "yaml-rust"
version = "0.4.5"
@ -3183,6 +3540,29 @@ dependencies = [
"linked-hash-map",
]
[[package]]
name = "yoke"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
dependencies = [
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.8.31"
@ -3203,12 +3583,66 @@ dependencies = [
"syn",
]
[[package]]
name = "zerofrom"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
dependencies = [
"proc-macro2",
"quote",
"syn",
"synstructure",
]
[[package]]
name = "zeroize"
version = "1.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
[[package]]
name = "zerotrie"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
]
[[package]]
name = "zerovec"
version = "0.11.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "zstd-sys"
version = "2.0.16+zstd.1.5.7"

View file

@ -18,11 +18,17 @@ use chainfire_proto::proto::{
StatusRequest,
TxnRequest,
};
use std::time::Duration;
use tonic::Code;
use tonic::transport::Channel;
use tracing::debug;
use tracing::{debug, warn};
/// Chainfire client
pub struct Client {
/// Configured client endpoints
endpoints: Vec<String>,
/// Preferred endpoint index
current_endpoint: usize,
/// gRPC channel
channel: Channel,
/// KV client
@ -34,36 +40,187 @@ pub struct Client {
impl Client {
/// Connect to a Chainfire server
pub async fn connect(addr: impl AsRef<str>) -> Result<Self> {
let addr = addr.as_ref().to_string();
debug!(addr = %addr, "Connecting to Chainfire");
let endpoints = parse_endpoints(addr.as_ref())?;
let mut last_error = None;
let channel = Channel::from_shared(addr)
.map_err(|e| ClientError::Connection(e.to_string()))?
.connect()
.await?;
let kv = KvClient::new(channel.clone());
let cluster = ClusterClient::new(channel.clone());
Ok(Self {
for (index, endpoint) in endpoints.iter().enumerate() {
match connect_endpoint(endpoint).await {
Ok((channel, kv, cluster)) => {
debug!(endpoint = %endpoint, "Connected to Chainfire");
let mut client = Self {
endpoints: endpoints.clone(),
current_endpoint: index,
channel,
kv,
cluster,
})
};
client.promote_leader_endpoint().await?;
return Ok(client);
}
Err(error) => {
warn!(endpoint = %endpoint, error = %error, "Chainfire endpoint connect failed");
last_error = Some(error);
}
}
}
Err(last_error.unwrap_or_else(|| ClientError::Connection("no Chainfire endpoints configured".to_string())))
}
/// Run a KV RPC with retry and endpoint failover.
///
/// `op` receives a freshly cloned `KvClient` on every attempt; because the
/// clone is taken from `self.kv` each time, a reconnect performed between
/// attempts is picked up by the next attempt. Only statuses classified as
/// transient by `is_retryable_status` are retried; anything else (or an
/// exhausted budget) is surfaced to the caller immediately.
async fn with_kv_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
where
    F: FnMut(KvClient<Channel>) -> Fut,
    Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
{
    // Budget: three passes over the endpoint list (at least 3 attempts).
    let max_attempts = self.endpoints.len().max(1) * 3;
    let mut last_status = None;
    for attempt in 0..max_attempts {
        // Clone so `op` owns the client across its await points.
        let client = self.kv.clone();
        match op(client).await {
            Ok(value) => return Ok(value),
            // Transient failure with retry budget remaining.
            Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
                warn!(
                    endpoint = %self.endpoints[self.current_endpoint],
                    code = ?status.code(),
                    message = %status.message(),
                    attempt = attempt + 1,
                    max_attempts,
                    "retrying Chainfire KV RPC on alternate endpoint"
                );
                last_status = Some(status);
                // Fail over (prefer the leader) before the next attempt; a
                // recovery failure aborts the whole retry loop via `?`.
                self.recover_after_status(last_status.as_ref().unwrap()).await?;
                // Exponential backoff between attempts.
                tokio::time::sleep(retry_delay(attempt)).await;
            }
            // Non-retryable status: propagate as-is.
            Err(status) => return Err(status.into()),
        }
    }
    // Budget exhausted: report the last transient status we observed.
    Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire KV retry exhausted")).into())
}
/// Run a cluster-service RPC with retry and endpoint failover.
///
/// Mirrors `with_kv_retry`, but hands `op` a cloned `ClusterClient` instead.
/// The clone is re-read from `self.cluster` per attempt, so a reconnect done
/// during recovery is visible to the following attempt.
async fn with_cluster_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
where
    F: FnMut(ClusterClient<Channel>) -> Fut,
    Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
{
    // Budget: three passes over the endpoint list (at least 3 attempts).
    let max_attempts = self.endpoints.len().max(1) * 3;
    let mut last_status = None;
    for attempt in 0..max_attempts {
        // Clone so `op` owns the client across its await points.
        let client = self.cluster.clone();
        match op(client).await {
            Ok(value) => return Ok(value),
            // Transient failure with retry budget remaining.
            Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
                warn!(
                    endpoint = %self.endpoints[self.current_endpoint],
                    code = ?status.code(),
                    message = %status.message(),
                    attempt = attempt + 1,
                    max_attempts,
                    "retrying Chainfire cluster RPC on alternate endpoint"
                );
                last_status = Some(status);
                // Fail over (prefer the leader) before the next attempt.
                self.recover_after_status(last_status.as_ref().unwrap()).await?;
                // Exponential backoff between attempts.
                tokio::time::sleep(retry_delay(attempt)).await;
            }
            // Non-retryable status: propagate as-is.
            Err(status) => return Err(status.into()),
        }
    }
    // Budget exhausted: report the last transient status we observed.
    Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire cluster retry exhausted")).into())
}
/// After a retryable RPC failure, move this client to a better endpoint.
///
/// Preference order: (1) reconnect to the discovered cluster leader if it
/// differs from the current endpoint; (2) otherwise rotate to the next
/// configured endpoint. Note that when the discovered leader IS the current
/// endpoint, the code still falls through and rotates to the next endpoint.
/// With only a single endpoint configured there is nothing to fail over to,
/// so the triggering status is returned as `ClientError::Rpc`.
async fn recover_after_status(&mut self, status: &tonic::Status) -> Result<()> {
    if let Some(leader_idx) = self.discover_leader_endpoint().await? {
        if leader_idx != self.current_endpoint {
            return self.reconnect_to_index(leader_idx).await;
        }
    }
    if self.endpoints.len() > 1 {
        let next = (self.current_endpoint + 1) % self.endpoints.len();
        if next != self.current_endpoint {
            return self.reconnect_to_index(next).await;
        }
    }
    Err(ClientError::Rpc(status.clone()))
}
/// Re-establish the gRPC connection against `self.endpoints[index]`.
///
/// On success the cached channel and both service clients are replaced and
/// `current_endpoint` is updated; on failure the existing client state is
/// left untouched.
async fn reconnect_to_index(&mut self, index: usize) -> Result<()> {
    let target = match self.endpoints.get(index) {
        Some(endpoint) => endpoint.clone(),
        None => {
            return Err(ClientError::Connection(format!(
                "invalid Chainfire endpoint index {index}"
            )));
        }
    };
    let (channel, kv, cluster) = connect_endpoint(&target).await?;
    self.current_endpoint = index;
    self.channel = channel;
    self.kv = kv;
    self.cluster = cluster;
    Ok(())
}
/// If a cluster leader is discoverable and differs from the endpoint we are
/// currently connected to, reconnect to the leader; otherwise do nothing.
async fn promote_leader_endpoint(&mut self) -> Result<()> {
    match self.discover_leader_endpoint().await? {
        Some(index) if index != self.current_endpoint => self.reconnect_to_index(index).await,
        _ => Ok(()),
    }
}
/// Probe every configured endpoint and return the index of the one that
/// reports itself as the cluster leader (its status `leader` id equals its
/// own header `member_id`). Returns `Ok(None)` when no endpoint claims
/// leadership; individual probe failures are logged and skipped rather than
/// propagated.
async fn discover_leader_endpoint(&self) -> Result<Option<usize>> {
    for (index, endpoint) in self.endpoints.iter().enumerate() {
        // A fresh connection is dialed per probe; the cached channel is
        // not reused here.
        let mut cluster = match ClusterClient::connect(endpoint.clone()).await {
            Ok(client) => client,
            Err(error) => {
                warn!(endpoint = %endpoint, error = %error, "failed to connect while probing Chainfire leader");
                continue;
            }
        };
        match cluster.status(StatusRequest {}).await {
            Ok(response) => {
                let status = response.into_inner();
                // Id of the node we queried; 0 when the header is absent.
                let member_id = status.header.as_ref().map(|header| header.member_id).unwrap_or(0);
                // This endpoint is the leader iff it reports itself as such.
                if status.leader != 0 && status.leader == member_id {
                    return Ok(Some(index));
                }
            }
            Err(status) => {
                warn!(
                    endpoint = %endpoint,
                    code = ?status.code(),
                    message = %status.message(),
                    "failed to query Chainfire leader status"
                );
            }
        }
    }
    Ok(None)
}
/// Put a key-value pair
pub async fn put(&mut self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result<u64> {
let key = key.as_ref().to_vec();
let value = value.as_ref().to_vec();
let resp = self
.kv
.put(PutRequest {
key: key.as_ref().to_vec(),
value: value.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
let value = value.clone();
async move {
kv.put(PutRequest {
key,
value,
lease: 0,
prev_kv: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.header.map(|h| h.revision as u64).unwrap_or(0))
}
@ -86,19 +243,25 @@ impl Client {
&mut self,
key: impl AsRef<[u8]>,
) -> Result<Option<(Vec<u8>, u64)>> {
let key = key.as_ref().to_vec();
let resp = self
.kv
.range(RangeRequest {
key: key.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
async move {
kv.range(RangeRequest {
key,
range_end: vec![],
limit: 1,
revision: 0,
keys_only: false,
count_only: false,
serializable: false, // default: linearizable read
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64)))
}
@ -132,14 +295,20 @@ impl Client {
})),
};
self.kv
.txn(TxnRequest {
self.with_kv_retry(|mut kv| {
let compare = compare.clone();
let put_op = put_op.clone();
async move {
kv.txn(TxnRequest {
compare: vec![compare],
success: vec![put_op],
failure: vec![],
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(())
}
@ -152,15 +321,21 @@ impl Client {
/// Delete a key
pub async fn delete(&mut self, key: impl AsRef<[u8]>) -> Result<bool> {
let key = key.as_ref().to_vec();
let resp = self
.kv
.delete(DeleteRangeRequest {
key: key.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = key.clone();
async move {
kv.delete(DeleteRangeRequest {
key,
range_end: vec![],
prev_kv: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.deleted > 0)
}
@ -171,9 +346,12 @@ impl Client {
let range_end = prefix_end(prefix);
let resp = self
.kv
.range(RangeRequest {
key: prefix.to_vec(),
.with_kv_retry(|mut kv| {
let key = prefix.to_vec();
let range_end = range_end.clone();
async move {
kv.range(RangeRequest {
key,
range_end,
limit: 0,
revision: 0,
@ -181,8 +359,11 @@ impl Client {
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
Ok(resp.kvs.into_iter().map(|kv| (kv.key, kv.value)).collect())
}
@ -197,9 +378,12 @@ impl Client {
let range_end = prefix_end(prefix);
let resp = self
.kv
.range(RangeRequest {
key: prefix.to_vec(),
.with_kv_retry(|mut kv| {
let key = prefix.to_vec();
let range_end = range_end.clone();
async move {
kv.range(RangeRequest {
key,
range_end,
limit,
revision: 0,
@ -207,8 +391,11 @@ impl Client {
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
let more = resp.more;
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
@ -238,18 +425,24 @@ impl Client {
limit: i64,
) -> Result<(Vec<(Vec<u8>, Vec<u8>, u64)>, Option<Vec<u8>>)> {
let resp = self
.kv
.range(RangeRequest {
key: start.as_ref().to_vec(),
range_end: end.as_ref().to_vec(),
.with_kv_retry(|mut kv| {
let key = start.as_ref().to_vec();
let range_end = end.as_ref().to_vec();
async move {
kv.range(RangeRequest {
key,
range_end,
limit,
revision: 0,
keys_only: false,
count_only: false,
serializable: false,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
let more = resp.more;
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
@ -309,14 +502,21 @@ impl Client {
};
let resp = self
.kv
.txn(TxnRequest {
.with_kv_retry(|mut kv| {
let compare = compare.clone();
let put_op = put_op.clone();
let read_on_fail = read_on_fail.clone();
async move {
kv.txn(TxnRequest {
compare: vec![compare],
success: vec![put_op],
failure: vec![read_on_fail],
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
if resp.succeeded {
let new_version = resp
@ -371,10 +571,13 @@ impl Client {
/// Get cluster status
pub async fn status(&mut self) -> Result<ClusterStatus> {
let resp = self
.cluster
.with_cluster_retry(|mut cluster| async move {
cluster
.status(StatusRequest {})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
})
.await?;
Ok(ClusterStatus {
version: resp.version,
@ -392,15 +595,22 @@ impl Client {
/// # Returns
/// The node ID of the added member
pub async fn member_add(&mut self, node_id: u64, peer_url: impl AsRef<str>, is_learner: bool) -> Result<u64> {
let peer_url = peer_url.as_ref().to_string();
let resp = self
.cluster
.with_cluster_retry(|mut cluster| {
let peer_url = peer_url.clone();
async move {
cluster
.member_add(MemberAddRequest {
node_id,
peer_urls: vec![peer_url.as_ref().to_string()],
peer_urls: vec![peer_url],
is_learner,
})
.await?
.into_inner();
.await
.map(|resp| resp.into_inner())
}
})
.await?;
// Extract the member ID from the response
let member_id = resp
@ -410,7 +620,7 @@ impl Client {
debug!(
member_id = member_id,
peer_url = peer_url.as_ref(),
peer_url = peer_url.as_str(),
is_learner = is_learner,
"Added member to cluster"
);
@ -441,6 +651,64 @@ pub struct CasOutcome {
pub new_version: u64,
}
fn parse_endpoints(input: &str) -> Result<Vec<String>> {
let endpoints: Vec<String> = input
.split(',')
.map(str::trim)
.filter(|value| !value.is_empty())
.map(normalize_endpoint)
.collect();
if endpoints.is_empty() {
return Err(ClientError::Connection("no Chainfire endpoints configured".to_string()));
}
Ok(endpoints)
}
/// Ensure an endpoint string carries a URL scheme: values that already
/// contain `://` pass through untouched, bare `host:port` values are
/// prefixed with `http://`.
fn normalize_endpoint(endpoint: &str) -> String {
    if !endpoint.contains("://") {
        return format!("http://{endpoint}");
    }
    endpoint.to_string()
}
/// Dial a single endpoint URL and build both gRPC service clients over the
/// resulting shared channel.
///
/// Returns the raw `Channel` (kept for reconnection bookkeeping) together
/// with the KV and Cluster clients; an unparseable URL is reported as
/// `ClientError::Connection`.
async fn connect_endpoint(endpoint: &str) -> Result<(Channel, KvClient<Channel>, ClusterClient<Channel>)> {
    let channel = Channel::from_shared(endpoint.to_string())
        .map_err(|e| ClientError::Connection(e.to_string()))?
        .connect()
        .await?;
    // Both clients share the same underlying channel.
    let kv = KvClient::new(channel.clone());
    let cluster = ClusterClient::new(channel.clone());
    Ok((channel, kv, cluster))
}
/// Exponential backoff for retry attempts: 200 ms, 400 ms, 800 ms, then
/// capped at 1 s for every later attempt.
fn retry_delay(attempt: usize) -> Duration {
    // Doubling stops after the third attempt; the cap keeps the worst
    // case at one second.
    let shift = if attempt < 3 { attempt as u32 } else { 3 };
    let millis = (200u64 << shift).min(1_000);
    Duration::from_millis(millis)
}
/// Whether a gRPC status should trigger failover/retry: either its code is
/// one of the commonly-transient codes below, or its message matches a known
/// transient-failure pattern (see `retryable_message`).
fn is_retryable_status(status: &tonic::Status) -> bool {
    matches!(
        status.code(),
        Code::Unavailable | Code::DeadlineExceeded | Code::Internal | Code::Aborted | Code::FailedPrecondition
    ) || retryable_message(status.message())
}
/// Case-insensitive check for error-message fragments that indicate a
/// transient, retryable failure: Raft leader churn ("not leader",
/// "leader_id") or transport-level connection drops.
fn retryable_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 8] = [
        "not leader",
        "leader_id",
        "transport error",
        "connection was not ready",
        "deadline has elapsed",
        "broken pipe",
        "connection reset",
        "connection refused",
    ];
    let normalized = message.to_ascii_lowercase();
    TRANSIENT_MARKERS.iter().any(|marker| normalized.contains(marker))
}
/// Calculate prefix end for range queries
fn prefix_end(prefix: &[u8]) -> Vec<u8> {
let mut end = prefix.to_vec();
@ -463,4 +731,30 @@ mod tests {
assert_eq!(prefix_end(b"abc"), b"abd");
assert_eq!(prefix_end(b"/nodes/"), b"/nodes0");
}
// Bare host:port entries gain an http:// scheme; already-schemed URLs pass
// through unchanged.
#[test]
fn normalize_endpoint_adds_http_scheme() {
    assert_eq!(normalize_endpoint("127.0.0.1:2379"), "http://127.0.0.1:2379");
    assert_eq!(normalize_endpoint("http://127.0.0.1:2379"), "http://127.0.0.1:2379");
}

// Comma-separated lists are split, trimmed, and normalized per entry.
#[test]
fn parse_endpoints_accepts_comma_separated_values() {
    let endpoints = parse_endpoints("127.0.0.1:2379, http://127.0.0.2:2379").unwrap();
    assert_eq!(
        endpoints,
        vec![
            "http://127.0.0.1:2379".to_string(),
            "http://127.0.0.2:2379".to_string()
        ]
    );
}

// Message-based retry classification: leader churn and transport drops are
// retryable; authorization-style errors are not.
#[test]
fn retryable_message_covers_not_leader_and_transport() {
    assert!(retryable_message("NotLeader { leader_id: Some(1) }"));
    assert!(retryable_message("transport error"));
    assert!(retryable_message("connection was not ready"));
    assert!(!retryable_message("permission denied"));
}
}

View file

@ -27,17 +27,25 @@ pub struct ClusterServiceImpl {
rpc_client: Arc<crate::GrpcRaftClient>,
/// Cluster ID
cluster_id: u64,
/// Configured members with client and peer URLs
members: Vec<Member>,
/// Server version
version: String,
}
impl ClusterServiceImpl {
/// Create a new cluster service
pub fn new(raft: Arc<RaftCore>, rpc_client: Arc<crate::GrpcRaftClient>, cluster_id: u64) -> Self {
pub fn new(
raft: Arc<RaftCore>,
rpc_client: Arc<crate::GrpcRaftClient>,
cluster_id: u64,
members: Vec<Member>,
) -> Self {
Self {
raft,
rpc_client,
cluster_id,
members,
version: env!("CARGO_PKG_VERSION").to_string(),
}
}
@ -47,16 +55,19 @@ impl ClusterServiceImpl {
}
/// Get current members as proto Member list
/// NOTE: Custom RaftCore doesn't track membership dynamically yet
/// NOTE: Custom RaftCore doesn't track membership dynamically yet, so this returns
/// the configured static membership that the server was booted with.
async fn get_member_list(&self) -> Vec<Member> {
// For now, return only the current node
vec![Member {
if self.members.is_empty() {
return vec![Member {
id: self.raft.node_id(),
name: format!("node-{}", self.raft.node_id()),
peer_urls: vec![],
client_urls: vec![],
is_learner: false,
}]
}];
}
self.members.clone()
}
}

View file

@ -42,6 +42,7 @@ http-body-util = { workspace = true }
uuid = { version = "1.11", features = ["v4", "serde"] }
chrono = { version = "0.4", features = ["serde"] }
serde_json = "1.0"
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
# Configuration
clap.workspace = true

View file

@ -11,13 +11,14 @@
use axum::{
extract::{Path, Query, State},
http::StatusCode,
routing::{delete, get, post, put},
routing::{get, post},
Json, Router,
};
use chainfire_api::GrpcRaftClient;
use chainfire_raft::RaftCore;
use chainfire_raft::{core::RaftError, RaftCore};
use chainfire_types::command::RaftCommand;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// REST API state
@ -26,16 +27,18 @@ pub struct RestApiState {
pub raft: Arc<RaftCore>,
pub cluster_id: u64,
pub rpc_client: Option<Arc<GrpcRaftClient>>,
pub http_client: reqwest::Client,
pub peer_http_addrs: Arc<HashMap<u64, String>>,
}
/// Standard REST error response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ErrorResponse {
pub error: ErrorDetail,
pub meta: ResponseMeta,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ErrorDetail {
pub code: String,
pub message: String,
@ -43,7 +46,7 @@ pub struct ErrorDetail {
pub details: Option<serde_json::Value>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ResponseMeta {
pub request_id: String,
pub timestamp: String,
@ -59,7 +62,7 @@ impl ResponseMeta {
}
/// Standard REST success response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct SuccessResponse<T> {
pub data: T,
pub meta: ResponseMeta,
@ -75,25 +78,25 @@ impl<T> SuccessResponse<T> {
}
/// KV Put request body
#[derive(Debug, Deserialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct PutRequest {
pub value: String,
}
/// KV Get response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct GetResponse {
pub key: String,
pub value: String,
}
/// KV List response
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct ListResponse {
pub items: Vec<KvItem>,
}
#[derive(Debug, Serialize)]
#[derive(Debug, Serialize, Deserialize)]
pub struct KvItem {
pub key: String,
pub value: String,
@ -129,6 +132,13 @@ pub struct AddMemberRequestLegacy {
#[derive(Debug, Deserialize)]
pub struct PrefixQuery {
pub prefix: Option<String>,
pub consistency: Option<String>,
}
/// Query parameters for key reads
#[derive(Debug, Default, Deserialize)]
pub struct ReadQuery {
pub consistency: Option<String>,
}
/// Build the REST API router
@ -153,80 +163,11 @@ async fn health_check() -> (StatusCode, Json<SuccessResponse<serde_json::Value>>
)
}
/// GET /api/v1/kv/{key} - Get value
///
/// Reads directly from the local state machine (no leader round-trip).
/// Returns 404 when the key is absent; non-UTF-8 values are re-encoded
/// lossily into the JSON response.
async fn get_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
    let sm = state.raft.state_machine();
    let key_bytes = key.as_bytes().to_vec();
    let results = sm.kv()
        .get(&key_bytes)
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    // `get` can yield multiple matches; a point lookup takes the first.
    let value = results
        .into_iter()
        .next()
        .ok_or_else(|| error_response(StatusCode::NOT_FOUND, "NOT_FOUND", "Key not found"))?;
    Ok(Json(SuccessResponse::new(GetResponse {
        key,
        value: String::from_utf8_lossy(&value.value).to_string(),
    })))
}
/// PUT /api/v1/kv/{key} - Put value
///
/// Replicates the write through Raft; every Raft failure (including not
/// being the leader) is mapped uniformly to 500 INTERNAL_ERROR here.
async fn put_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
    Json(req): Json<PutRequest>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
    let command = RaftCommand::Put {
        key: key.as_bytes().to_vec(),
        value: req.value.as_bytes().to_vec(),
        // No lease attached; previous value is not requested back.
        lease_id: None,
        prev_kv: false,
    };
    state
        .raft
        .client_write(command)
        .await
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    Ok((
        StatusCode::OK,
        Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
    ))
}
/// DELETE /api/v1/kv/{key} - Delete key
///
/// Replicates the delete through Raft; succeeds with 200 regardless of
/// whether the key previously existed (prev_kv is not requested).
async fn delete_kv(
    State(state): State<RestApiState>,
    Path(key): Path<String>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
    let command = RaftCommand::Delete {
        key: key.as_bytes().to_vec(),
        prev_kv: false,
    };
    state
        .raft
        .client_write(command)
        .await
        .map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
    Ok((
        StatusCode::OK,
        Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
    ))
}
/// GET /api/v1/kv/*key - Get value (wildcard for all keys)
async fn get_kv_wildcard(
State(state): State<RestApiState>,
Path(key): Path<String>,
Query(query): Query<ReadQuery>,
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
// Use key as-is for simple keys, prepend / for namespaced keys
// Keys like "testkey" stay as "testkey", keys like "flaredb/stores/1" become "/flaredb/stores/1"
@ -235,6 +176,14 @@ async fn get_kv_wildcard(
} else {
key.clone()
};
if should_proxy_read(query.consistency.as_deref(), &state).await {
return proxy_read_to_leader(
&state,
&format!("/api/v1/kv/{}", full_key.trim_start_matches('/')),
None,
)
.await;
}
let sm = state.raft.state_machine();
let key_bytes = full_key.as_bytes().to_vec();
@ -272,11 +221,7 @@ async fn put_kv_wildcard(
prev_kv: false,
};
state
.raft
.client_write(command)
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
submit_rest_write(&state, command, Some(&req), &full_key, reqwest::Method::PUT).await?;
Ok((
StatusCode::OK,
@ -300,11 +245,7 @@ async fn delete_kv_wildcard(
prev_kv: false,
};
state
.raft
.client_write(command)
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
submit_rest_write(&state, command, None, &full_key, reqwest::Method::DELETE).await?;
Ok((
StatusCode::OK,
@ -317,6 +258,13 @@ async fn list_kv(
State(state): State<RestApiState>,
Query(params): Query<PrefixQuery>,
) -> Result<Json<SuccessResponse<ListResponse>>, (StatusCode, Json<ErrorResponse>)> {
if should_proxy_read(params.consistency.as_deref(), &state).await {
let query = params
.prefix
.as_ref()
.map(|prefix| vec![("prefix", prefix.as_str())]);
return proxy_read_to_leader(&state, "/api/v1/kv", query.as_deref()).await;
}
let prefix = params.prefix.unwrap_or_default();
let sm = state.raft.state_machine();
@ -446,3 +394,169 @@ fn error_response(
}),
)
}
/// Apply a Raft write locally, transparently forwarding it to the current
/// leader over HTTP when this node cannot accept writes.
async fn submit_rest_write(
    state: &RestApiState,
    command: RaftCommand,
    body: Option<&PutRequest>,
    key: &str,
    method: reqwest::Method,
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    let write_result = state.raft.client_write(command).await;
    match write_result {
        Ok(()) => Ok(()),
        Err(RaftError::NotLeader { leader_id }) => {
            // Prefer the leader hint carried by the error; otherwise ask the
            // local Raft core for its current view of the leader.
            let leader = if leader_id.is_some() {
                leader_id
            } else {
                state.raft.leader().await
            };
            proxy_write_to_leader(state, leader, key, method, body).await
        }
        Err(other) => Err(error_response(
            StatusCode::INTERNAL_SERVER_ERROR,
            "INTERNAL_ERROR",
            &other.to_string(),
        )),
    }
}
/// Forward a write request to the Raft leader's REST endpoint.
///
/// Requires a known leader with a configured HTTP address; the leader's
/// non-success responses are relayed with their status and error body.
async fn proxy_write_to_leader(
    state: &RestApiState,
    leader_id: Option<u64>,
    key: &str,
    method: reqwest::Method,
    body: Option<&PutRequest>,
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
    let leader_id = match leader_id {
        Some(id) => id,
        None => {
            return Err(error_response(
                StatusCode::SERVICE_UNAVAILABLE,
                "NOT_LEADER",
                "current node is not the leader and no leader is known yet",
            ))
        }
    };
    let base = match state.peer_http_addrs.get(&leader_id) {
        Some(addr) => addr,
        None => {
            return Err(error_response(
                StatusCode::SERVICE_UNAVAILABLE,
                "NOT_LEADER",
                &format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
            ))
        }
    };
    let url = format!(
        "{}/api/v1/kv/{}",
        base.trim_end_matches('/'),
        key.trim_start_matches('/')
    );
    let mut builder = state.http_client.request(method, &url);
    if let Some(payload) = body {
        builder = builder.json(payload);
    }
    let response = match builder.send().await {
        Ok(resp) => resp,
        Err(err) => {
            return Err(error_response(
                StatusCode::BAD_GATEWAY,
                "LEADER_PROXY_FAILED",
                &format!("failed to forward write to leader {leader_id}: {err}"),
            ))
        }
    };
    if response.status().is_success() {
        return Ok(());
    }
    // Mirror the leader's status code, falling back to 502 if it cannot be
    // represented, and relay its error body when it parses.
    let status =
        StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
    let payload = match response.json::<ErrorResponse>().await {
        Ok(parsed) => parsed,
        Err(err) => ErrorResponse {
            error: ErrorDetail {
                code: "LEADER_PROXY_FAILED".to_string(),
                message: format!("leader {leader_id} returned {status}: {err}"),
                details: None,
            },
            meta: ResponseMeta::new(),
        },
    };
    Err((status, Json(payload)))
}
/// Decide whether a read arriving at this node must be served by the leader.
async fn should_proxy_read(consistency: Option<&str>, state: &RestApiState) -> bool {
    read_requires_leader_proxy(
        consistency,
        state.raft.node_id(),
        state.raft.leader().await,
    )
}
/// Pure decision helper: a read must be proxied unless the caller explicitly
/// asked for `local` consistency (case-insensitive), this node is itself the
/// leader, or no leader is known (in which case we serve locally).
fn read_requires_leader_proxy(
    consistency: Option<&str>,
    node_id: u64,
    leader_id: Option<u64>,
) -> bool {
    let wants_local = consistency
        .map(|mode| mode.eq_ignore_ascii_case("local"))
        .unwrap_or(false);
    if wants_local {
        return false;
    }
    match leader_id {
        Some(leader) => leader != node_id,
        None => false,
    }
}
/// Forward a read-only request to the leader's REST API and decode its
/// success envelope into `T`.
///
/// Leader error responses are relayed with their status code and body;
/// transport or decoding failures map to 502 `LEADER_PROXY_FAILED`.
async fn proxy_read_to_leader<T>(
    state: &RestApiState,
    path: &str,
    query: Option<&[(&str, &str)]>,
) -> Result<Json<SuccessResponse<T>>, (StatusCode, Json<ErrorResponse>)>
where
    T: for<'de> Deserialize<'de>,
{
    // A leader must be known before anything can be forwarded.
    let leader_id = state.raft.leader().await.ok_or_else(|| {
        error_response(
            StatusCode::SERVICE_UNAVAILABLE,
            "NOT_LEADER",
            "current node is not the leader and no leader is known yet",
        )
    })?;
    let leader_http_addr = state.peer_http_addrs.get(&leader_id).ok_or_else(|| {
        error_response(
            StatusCode::SERVICE_UNAVAILABLE,
            "NOT_LEADER",
            &format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
        )
    })?;
    // `path` is expected to start with '/', so only the base is trimmed.
    let url = format!(
        "{}{}",
        leader_http_addr.trim_end_matches('/'),
        path
    );
    let mut request = state.http_client.get(&url);
    if let Some(query) = query {
        request = request.query(query);
    }
    let response = request.send().await.map_err(|err| {
        error_response(
            StatusCode::BAD_GATEWAY,
            "LEADER_PROXY_FAILED",
            &format!("failed to forward read to leader {leader_id}: {err}"),
        )
    })?;
    if response.status().is_success() {
        let payload = response.json::<SuccessResponse<T>>().await.map_err(|err| {
            error_response(
                StatusCode::BAD_GATEWAY,
                "LEADER_PROXY_FAILED",
                &format!("failed to decode leader {leader_id} response: {err}"),
            )
        })?;
        return Ok(Json(payload));
    }
    // Non-success: mirror the leader's status (falling back to 502 when it
    // cannot be converted) and pass through its error body when it parses.
    let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
    let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
        error: ErrorDetail {
            code: "LEADER_PROXY_FAILED".to_string(),
            message: format!("leader {leader_id} returned {status}: {err}"),
            details: None,
        },
        meta: ResponseMeta::new(),
    });
    Err((status, Json(payload)))
}
#[cfg(test)]
mod tests {
    use super::*;

    // Default consistency routes reads through the leader; "local" opts out,
    // and a node that is the leader (or sees none) serves locally.
    #[test]
    fn read_requires_leader_proxy_defaults_to_leader_consistency() {
        assert!(read_requires_leader_proxy(None, 2, Some(1)));
        assert!(!read_requires_leader_proxy(Some("local"), 2, Some(1)));
        assert!(!read_requires_leader_proxy(None, 2, Some(2)));
        assert!(!read_requires_leader_proxy(None, 2, None));
    }
}

View file

@ -11,10 +11,11 @@ use crate::rest::{build_router, RestApiState};
use anyhow::Result;
use chainfire_api::internal_proto::raft_service_server::RaftServiceServer;
use chainfire_api::proto::{
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer,
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer, Member,
};
use chainfire_api::{ClusterServiceImpl, KvServiceImpl, RaftServiceImpl, WatchServiceImpl};
use chainfire_types::RaftRole;
use std::collections::HashMap;
use std::sync::Arc;
use tokio::signal;
use tonic::transport::{Certificate, Identity, Server as TonicServer, ServerTlsConfig};
@ -109,6 +110,7 @@ impl Server {
Arc::clone(&raft),
rpc_client,
self.node.cluster_id(),
configured_members(&self.config),
);
// Internal Raft service for inter-node communication
@ -166,10 +168,24 @@ impl Server {
// HTTP REST API server
let http_addr = self.config.network.http_addr;
let http_port = self.config.network.http_addr.port();
let peer_http_addrs = Arc::new(
self.config
.cluster
.initial_members
.iter()
.filter_map(|member| {
http_endpoint_from_raft_addr(&member.raft_addr, http_port)
.map(|http_addr| (member.id, http_addr))
})
.collect::<HashMap<_, _>>(),
);
let rest_state = RestApiState {
raft: Arc::clone(&raft),
cluster_id: self.node.cluster_id(),
rpc_client: self.node.rpc_client().cloned(),
http_client: reqwest::Client::new(),
peer_http_addrs,
};
let rest_app = build_router(rest_state);
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
@ -286,3 +302,45 @@ impl Server {
Ok(())
}
}
/// Derive a peer's HTTP base URL from its Raft address, swapping in the
/// HTTP port.
///
/// Accepts either a literal socket address ("10.0.0.1:7000", "[::1]:7000")
/// or a "host:port" pair with a DNS name. Returns `None` when no port
/// separator is present.
fn http_endpoint_from_raft_addr(raft_addr: &str, http_port: u16) -> Option<String> {
    if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
        // Format via SocketAddr so IPv6 addresses keep their brackets;
        // `format!("{}:{}", addr.ip(), port)` would emit the invalid
        // "http://::1:8080" form.
        return Some(format!(
            "http://{}",
            std::net::SocketAddr::new(addr.ip(), http_port)
        ));
    }
    // Hostname fallback: split off the port from the right.
    let (host, _) = raft_addr.rsplit_once(':')?;
    Some(format!("http://{}:{}", host, http_port))
}
/// Derive a peer's gRPC client URL from its Raft address, swapping in the
/// API port. Same shape as `http_endpoint_from_raft_addr`.
fn grpc_endpoint_from_raft_addr(raft_addr: &str, api_port: u16) -> Option<String> {
    if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
        // Format via SocketAddr so IPv6 addresses stay bracketed
        // ("http://[::1]:9000"), which plain `addr.ip()` formatting loses.
        return Some(format!(
            "http://{}",
            std::net::SocketAddr::new(addr.ip(), api_port)
        ));
    }
    let (host, _) = raft_addr.rsplit_once(':')?;
    Some(format!("http://{}:{}", host, api_port))
}
/// Ensure a peer address carries a URL scheme, defaulting to plain HTTP.
fn normalize_peer_url(raft_addr: &str) -> String {
    if raft_addr.contains("://") {
        return raft_addr.to_string();
    }
    format!("http://{raft_addr}")
}
/// Build the static cluster membership list from the configured initial
/// members, deriving per-node peer and client URLs.
fn configured_members(config: &ServerConfig) -> Vec<Member> {
    let api_port = config.network.api_addr.port();
    let mut members = Vec::with_capacity(config.cluster.initial_members.len());
    for member in &config.cluster.initial_members {
        // `grpc_endpoint_from_raft_addr` may yield nothing for malformed
        // addresses, in which case the client URL list stays empty.
        let client_urls: Vec<_> = grpc_endpoint_from_raft_addr(&member.raft_addr, api_port)
            .into_iter()
            .collect();
        members.push(Member {
            id: member.id,
            name: format!("node-{}", member.id),
            peer_urls: vec![normalize_peer_url(&member.raft_addr)],
            client_urls,
            is_learner: false,
        });
    }
    members
}

1114
coronafs/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -24,6 +24,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] }
anyhow = "1.0"
thiserror = "1.0"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
[workspace.lints.rust]
unsafe_code = "deny"

View file

@ -21,7 +21,11 @@ tracing-subscriber = { workspace = true }
anyhow = { workspace = true }
thiserror = { workspace = true }
chrono = { workspace = true }
reqwest = { workspace = true }
futures-util = "0.3"
[dev-dependencies]
tempfile = "3"
[lints]
workspace = true

View file

@ -2,9 +2,40 @@ use serde::{Deserialize, Serialize};
use std::net::SocketAddr;
use std::path::PathBuf;
/// Which API surface(s) this server process runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ServerMode {
    /// Both controller and node APIs in one process (the default).
    Combined,
    /// Controller API only (see `ServerConfig::supports_controller_api`).
    Controller,
    /// Node API only (see `ServerConfig::supports_node_api`).
    Node,
}

impl Default for ServerMode {
    fn default() -> Self {
        Self::Combined
    }
}
/// Where metadata is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum MetadataBackend {
    /// Keep metadata on the local filesystem (the default).
    Filesystem,
    /// Keep metadata in a chainfire cluster — presumably reached via
    /// `chainfire_api_url` / `chainfire_key_prefix`; confirm in server wiring.
    Chainfire,
}

impl Default for MetadataBackend {
    fn default() -> Self {
        Self::Filesystem
    }
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct ServerConfig {
pub mode: ServerMode,
pub metadata_backend: MetadataBackend,
pub chainfire_api_url: Option<String>,
pub chainfire_key_prefix: String,
pub listen_addr: SocketAddr,
pub advertise_host: String,
pub data_dir: PathBuf,
@ -26,6 +57,10 @@ pub struct ServerConfig {
impl Default for ServerConfig {
fn default() -> Self {
Self {
mode: ServerMode::Combined,
metadata_backend: MetadataBackend::Filesystem,
chainfire_api_url: None,
chainfire_key_prefix: "/coronafs/volumes".to_string(),
listen_addr: "0.0.0.0:50088".parse().expect("valid listen addr"),
advertise_host: "127.0.0.1".to_string(),
data_dir: PathBuf::from("/var/lib/coronafs"),
@ -34,7 +69,7 @@ impl Default for ServerConfig {
export_port_count: 512,
export_shared_clients: 32,
export_cache_mode: "none".to_string(),
export_aio_mode: "io_uring".to_string(),
export_aio_mode: "threads".to_string(),
export_discard_mode: "unmap".to_string(),
export_detect_zeroes_mode: "unmap".to_string(),
preallocate: true,
@ -47,6 +82,14 @@ impl Default for ServerConfig {
}
impl ServerConfig {
pub fn supports_controller_api(&self) -> bool {
matches!(self.mode, ServerMode::Combined | ServerMode::Controller)
}
pub fn supports_node_api(&self) -> bool {
matches!(self.mode, ServerMode::Combined | ServerMode::Node)
}
pub fn volume_dir(&self) -> PathBuf {
self.data_dir.join("volumes")
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,231 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
require_cmd() {
  # Abort early with a clear message when a required binary is not on PATH.
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "missing required command: $1" >&2
    exit 1
  fi
}
for cmd in curl qemu-io; do
require_cmd "${cmd}"
done
if ! command -v jq >/dev/null 2>&1 && ! command -v python3 >/dev/null 2>&1; then
echo "missing required command: jq or python3" >&2
exit 1
fi
json_get() {
  # Read JSON on stdin and print the value at a dotted path (e.g. ".export.uri").
  # Uses jq when available, otherwise a small python3 fallback that walks
  # dict keys only (non-dict intermediates resolve to empty output).
  local query="$1"
  if command -v jq >/dev/null 2>&1; then
    jq -r "${query}"
  else
    python3 -c 'import json,sys
data=json.load(sys.stdin)
value=data
for part in sys.argv[1].split("."):
    if not part:
        continue
    value=value.get(part) if isinstance(value, dict) else None
    if value is None:
        break
print("" if value is None else value)
' "${query}"
  fi
}
RUN_ID="${CORONAFS_BENCH_RUN_ID:-$$}"
LISTEN_PORT="${CORONAFS_BENCH_PORT:-$((25088 + (RUN_ID % 1000)))}"
EXPORT_BASE_PORT="${CORONAFS_BENCH_EXPORT_BASE_PORT:-$((26100 + (RUN_ID % 1000)))}"
VOLUME_ID="${CORONAFS_BENCH_VOLUME_ID:-local-bench-${RUN_ID}}"
SIZE_MIB="${CORONAFS_BENCH_SIZE_MIB:-${CORONAFS_BENCH_SIZE_MB:-512}}"
SIZE_BYTES="${CORONAFS_BENCH_SIZE_BYTES:-$((SIZE_MIB * 1024 * 1024))}"
WORKLOAD_MIB="${CORONAFS_BENCH_WORKLOAD_MIB:-${CORONAFS_BENCH_WORKLOAD_MB:-256}}"
EXPORT_CACHE_MODE="${CORONAFS_BENCH_EXPORT_CACHE_MODE:-none}"
EXPORT_AIO_MODE="${CORONAFS_BENCH_EXPORT_AIO_MODE:-threads}"
EXPORT_DISCARD_MODE="${CORONAFS_BENCH_EXPORT_DISCARD_MODE:-ignore}"
EXPORT_DETECT_ZEROES_MODE="${CORONAFS_BENCH_EXPORT_DETECT_ZEROES_MODE:-off}"
SERVER_BIN="${CORONAFS_SERVER_BIN:-}"
if (( WORKLOAD_MIB > SIZE_MIB )); then
echo "workload ${WORKLOAD_MIB} MiB exceeds volume size ${SIZE_MIB} MiB" >&2
exit 1
fi
if [[ -z "${SERVER_BIN}" ]]; then
SERVER_CMD=(
cargo run
--manifest-path "${REPO_ROOT}/coronafs/Cargo.toml"
-p coronafs-server
--
)
else
SERVER_CMD=("${SERVER_BIN}")
fi
TMP_DIR="$(mktemp -d)"
CONFIG_PATH="${TMP_DIR}/coronafs.toml"
SERVER_LOG="${TMP_DIR}/coronafs.log"
SERVER_PID=""
show_server_log() {
  # Dump the tail of the server log to stderr for post-mortem debugging;
  # silently a no-op when the log file was never created.
  [[ -f "${SERVER_LOG}" ]] || return 0
  echo "--- coronafs server log ---" >&2
  tail -n 200 "${SERVER_LOG}" >&2 || true
  echo "--- end coronafs server log ---" >&2
}
delete_volume_if_present() {
  # Best-effort volume removal; failures are ignored (the server may already
  # be gone by the time cleanup runs).
  curl -fsS -X DELETE "http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}" >/dev/null 2>&1 || true
}
cleanup() {
  # Teardown order matters: delete the volume via the API while the server
  # is still up, then kill any leftover export process recorded in the pid
  # file, then the server itself, and finally remove the scratch directory.
  delete_volume_if_present
  local pid_file="${TMP_DIR}/data/pids/${VOLUME_ID}.pid"
  if [[ -f "${pid_file}" ]]; then
    local export_pid=""
    export_pid="$(tr -d '\n' <"${pid_file}" 2>/dev/null || true)"
    # Only signal a pid that is non-empty and still alive.
    if [[ -n "${export_pid}" ]] && kill -0 "${export_pid}" 2>/dev/null; then
      kill "${export_pid}" >/dev/null 2>&1 || true
      wait "${export_pid}" >/dev/null 2>&1 || true
    fi
    rm -f "${pid_file}"
  fi
  if [[ -n "${SERVER_PID}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
    kill "${SERVER_PID}" >/dev/null 2>&1 || true
    wait "${SERVER_PID}" >/dev/null 2>&1 || true
  fi
  rm -rf "${TMP_DIR}"
}
trap cleanup EXIT
cat >"${CONFIG_PATH}" <<EOF
listen_addr = "127.0.0.1:${LISTEN_PORT}"
advertise_host = "127.0.0.1"
data_dir = "${TMP_DIR}/data"
export_bind_addr = "127.0.0.1"
export_base_port = ${EXPORT_BASE_PORT}
export_port_count = 8
export_shared_clients = 32
export_cache_mode = "${EXPORT_CACHE_MODE}"
export_aio_mode = "${EXPORT_AIO_MODE}"
export_discard_mode = "${EXPORT_DISCARD_MODE}"
export_detect_zeroes_mode = "${EXPORT_DETECT_ZEROES_MODE}"
preallocate = false
sync_on_write = false
log_level = "info"
EOF
"${SERVER_CMD[@]}" --config "${CONFIG_PATH}" >"${SERVER_LOG}" 2>&1 &
SERVER_PID="$!"
deadline=$((SECONDS + 60))
until curl -fsS "http://127.0.0.1:${LISTEN_PORT}/healthz" >/dev/null 2>&1; do
if (( SECONDS >= deadline )); then
echo "timed out waiting for coronafs local bench server" >&2
tail -n 200 "${SERVER_LOG}" >&2 || true
exit 1
fi
sleep 1
done
create_response_file="${TMP_DIR}/create-response.txt"
create_status="$(
curl -sS \
-o "${create_response_file}" \
-w '%{http_code}' \
-X PUT \
-H 'content-type: application/json' \
-d "{\"size_bytes\":${SIZE_BYTES}}" \
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}"
)"
if [[ "${create_status}" -lt 200 || "${create_status}" -ge 300 ]]; then
echo "failed to create CoronaFS benchmark volume: HTTP ${create_status}" >&2
cat "${create_response_file}" >&2 || true
show_server_log
exit 1
fi
export_response_file="${TMP_DIR}/export-response.txt"
export_status="$(
curl -sS \
-o "${export_response_file}" \
-w '%{http_code}' \
-X POST \
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}/export"
)"
if [[ "${export_status}" -lt 200 || "${export_status}" -ge 300 ]]; then
echo "failed to export CoronaFS benchmark volume: HTTP ${export_status}" >&2
cat "${export_response_file}" >&2 || true
show_server_log
exit 1
fi
EXPORT_JSON="$(cat "${export_response_file}")"
EXPORT_URI="$(printf '%s' "${EXPORT_JSON}" | json_get '.export.uri')"
[[ -n "${EXPORT_URI}" && "${EXPORT_URI}" != "null" ]] || {
echo "failed to obtain CoronaFS export URI" >&2
printf '%s\n' "${EXPORT_JSON}" >&2
show_server_log
exit 1
}
run_qemu_io() {
  # Run qemu-io against the exported volume and print the elapsed wall time
  # in nanoseconds. Leading "--" option arguments are passed through to
  # qemu-io as-is; each remaining argument becomes one -c command.
  local extra=()
  local start_ns end_ns elapsed_ns
  local args=("$@")
  local cmd=()
  local qemu_cmd=""
  if [[ "${#args[@]}" -eq 0 ]]; then
    echo "run_qemu_io requires at least one qemu-io command" >&2
    exit 1
  fi
  # Peel option-style arguments off the front before the command list.
  while [[ "${#args[@]}" -gt 0 && "${args[0]}" == --* ]]; do
    extra+=("${args[0]}")
    args=("${args[@]:1}")
  done
  cmd=(qemu-io -f raw "${extra[@]}")
  for qemu_cmd in "${args[@]}"; do
    cmd+=(-c "${qemu_cmd}")
  done
  cmd+=("${EXPORT_URI}")
  # %s%N gives nanosecond resolution (GNU date).
  start_ns="$(date +%s%N)"
  "${cmd[@]}" >/dev/null
  end_ns="$(date +%s%N)"
  elapsed_ns="$((end_ns - start_ns))"
  printf '%s\n' "${elapsed_ns}"
}
calc_mib_per_s() {
  # Convert a byte count and an elapsed time in nanoseconds into MiB/s with
  # two decimals; degenerate (zero/negative) timings report 0.00 instead of
  # dividing by zero.
  local bytes="$1"
  local elapsed_ns="$2"
  awk -v bytes="${bytes}" -v elapsed_ns="${elapsed_ns}" '
    BEGIN {
        if (elapsed_ns <= 0) {
            print "0.00"
        } else {
            printf "%.2f", (bytes / 1048576.0) / (elapsed_ns / 1000000000.0)
        }
    }
'
}
BYTES="$((WORKLOAD_MIB * 1024 * 1024))"
WRITE_NS="$(run_qemu_io "write -P 0x5a 0 ${WORKLOAD_MIB}M" "flush")"
READ_NS="$(run_qemu_io "read -P 0x5a 0 ${WORKLOAD_MIB}M")"
WRITE_MIBPS="$(calc_mib_per_s "${BYTES}" "${WRITE_NS}")"
READ_MIBPS="$(calc_mib_per_s "${BYTES}" "${READ_NS}")"
printf 'CoronaFS local export bench: uri=%s cache=%s aio=%s write=%s MiB/s read=%s MiB/s size=%s MiB\n' \
"${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}" "${WORKLOAD_MIB}"
printf '%s\t%s\t%s\t%s\t%s\n' "${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}"

561
creditservice/Cargo.lock generated

File diff suppressed because it is too large Load diff

525
deployer/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -12,8 +12,11 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
serde_yaml = "0.9"
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
chainfire-client = { path = "../../../chainfire/chainfire-client" }
deployer-types = { path = "../deployer-types" }
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
[dev-dependencies]
axum = { version = "0.7", features = ["macros"] }

View file

@ -4,7 +4,12 @@ use std::path::Path;
use anyhow::{Context, Result};
use chainfire_client::{Client, ClientError};
use deployer_types::{ClusterStateSpec, DesiredSystemSpec, InstallPlan, NodeConfig, NodeSpec};
use chrono::Utc;
use deployer_types::{
ClusterNodeRecord, ClusterStateSpec, CommissionState, DesiredSystemSpec, HostDeploymentSpec,
HostDeploymentStatus, InstallPlan, InstallState, NodeConfig, NodeSpec, ObservedSystemState,
PowerState,
};
use serde::de::DeserializeOwned;
use serde_json::{json, Value};
use tokio::fs;
@ -49,6 +54,56 @@ fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str)
.into_bytes()
}
/// Storage key for a node's observed-system state document.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}nodes/{node_id}/observed-system").into_bytes()
}
/// Storage key for a host deployment's desired spec document.
fn key_host_deployment_spec(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}deployments/hosts/{deployment_name}/spec").into_bytes()
}
/// Storage key for a host deployment's observed status document.
fn key_host_deployment_status(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    let prefix = cluster_prefix(cluster_namespace, cluster_id);
    format!("{prefix}deployments/hosts/{deployment_name}/status").into_bytes()
}
/// Parse a CLI-supplied commission state into `CommissionState` via its
/// serde representation, so accepted spellings always match what the
/// serializer emits.
fn parse_commission_state(value: &str) -> Result<CommissionState> {
    // Build a JSON string value directly instead of formatting "\"{value}\""
    // by hand; inputs containing quotes or backslashes would otherwise
    // produce malformed JSON and a misleading parse error.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid commission state {value}"))
}
/// Parse a CLI-supplied install state into `InstallState` via its serde
/// representation.
fn parse_install_state(value: &str) -> Result<InstallState> {
    // JSON string built as a value, not via format!("\"{value}\""), so
    // quote/backslash input cannot yield malformed JSON.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid install state {value}"))
}
/// Parse a CLI-supplied power state into `PowerState` via its serde
/// representation.
fn parse_power_state(value: &str) -> Result<PowerState> {
    // JSON string built as a value, not via format!("\"{value}\""), so
    // quote/backslash input cannot yield malformed JSON.
    serde_json::from_value(serde_json::Value::String(value.to_string()))
        .with_context(|| format!("invalid power state {value}"))
}
fn key_node_class(cluster_namespace: &str, cluster_id: &str, node_class: &str) -> Vec<u8> {
format!(
"{}node-classes/{}",
@ -178,6 +233,9 @@ fn desired_system_from_spec(node: &NodeSpec) -> Option<DesiredSystemSpec> {
if desired.rollback_on_failure.is_none() {
desired.rollback_on_failure = Some(true);
}
if desired.drain_before_apply.is_none() {
desired.drain_before_apply = Some(false);
}
if desired.nixos_configuration.is_some() {
Some(desired)
} else {
@ -322,6 +380,30 @@ async fn merge_existing_node_observed_fields(
if merged.state.is_none() {
merged.state = existing_node.state;
}
if merged.machine_id.is_none() {
merged.machine_id = existing_node.machine_id;
}
if merged.hardware_facts.is_none() {
merged.hardware_facts = existing_node.hardware_facts;
}
if merged.commission_state.is_none() {
merged.commission_state = existing_node.commission_state;
}
if merged.install_state.is_none() {
merged.install_state = existing_node.install_state;
}
if merged.commissioned_at.is_none() {
merged.commissioned_at = existing_node.commissioned_at;
}
if merged.last_inventory_hash.is_none() {
merged.last_inventory_hash = existing_node.last_inventory_hash;
}
if merged.power_state.is_none() {
merged.power_state = existing_node.power_state;
}
if merged.bmc_ref.is_none() {
merged.bmc_ref = existing_node.bmc_ref;
}
if merged.last_heartbeat.is_none() {
merged.last_heartbeat = existing_node.last_heartbeat;
}
@ -521,6 +603,13 @@ pub async fn bootstrap_cluster(
info!(enrollment_rule = %rule.name, "upserted enrollment rule");
}
for deployment in &spec.host_deployments {
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
let value = serde_json::to_vec(deployment)?;
client.put(&key, &value).await?;
info!(deployment = %deployment.name, "upserted host deployment");
}
// 3. Service / Instance (必要であれば)
for svc in &spec.services {
let key = key_service(cluster_namespace, cluster_id, &svc.name);
@ -627,6 +716,11 @@ pub async fn apply_cluster_state(
let value = serde_json::to_vec(rule)?;
client.put(&key, &value).await?;
}
for deployment in &spec.host_deployments {
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
let value = serde_json::to_vec(deployment)?;
client.put(&key, &value).await?;
}
for svc in &spec.services {
let key = key_service(cluster_namespace, cluster_id, &svc.name);
let value = serde_json::to_vec(svc)?;
@ -706,6 +800,421 @@ pub async fn dump_prefix(endpoint: &str, prefix: &str, json_output: bool) -> Res
.await
}
/// Fetch `key` from the store and JSON-decode it into `T`, returning
/// `None` when the key is absent and a contextualized error when the
/// stored bytes do not decode.
async fn get_json_key<T: DeserializeOwned>(client: &mut Client, key: &[u8]) -> Result<Option<T>> {
    match client.get(key).await? {
        None => Ok(None),
        Some(bytes) => {
            let decoded = serde_json::from_slice::<T>(&bytes).with_context(|| {
                format!("failed to decode key {}", String::from_utf8_lossy(key))
            })?;
            Ok(Some(decoded))
        }
    }
}
/// Fetch and print a single node record, optionally joined with its
/// desired-system spec and observed-system state.
///
/// Prints pretty JSON when `json_output` is set, otherwise flat
/// `key=value` lines. Fails when the node record itself is missing; the
/// desired/observed documents are optional.
pub async fn inspect_node(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    include_desired_system: bool,
    include_observed_system: bool,
    json_output: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "inspect node", |endpoint| {
        // The failover closure may run once per endpoint, so each attempt
        // gets owned copies of the borrowed arguments.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let node = get_json_key::<ClusterNodeRecord>(
                &mut client,
                &key_node(&cluster_namespace, &cluster_id, &node_id),
            )
            .await?
            .with_context(|| format!("node {} not found", node_id))?;
            let desired_system = if include_desired_system {
                get_json_key::<DesiredSystemSpec>(
                    &mut client,
                    &key_desired_system(&cluster_namespace, &cluster_id, &node_id),
                )
                .await?
            } else {
                None
            };
            let observed_system = if include_observed_system {
                get_json_key::<ObservedSystemState>(
                    &mut client,
                    &key_observed_system(&cluster_namespace, &cluster_id, &node_id),
                )
                .await?
            } else {
                None
            };
            if json_output {
                println!(
                    "{}",
                    serde_json::to_string_pretty(&json!({
                        "node": node,
                        "desired_system": desired_system,
                        "observed_system": observed_system,
                    }))?
                );
            } else {
                println!("node_id={}", node.node_id);
                println!("hostname={}", node.hostname);
                println!("ip={}", node.ip);
                println!("state={}", node.state.as_deref().unwrap_or("unknown"));
                // Enum states are printed in their serde (JSON) spelling so
                // the text form matches the JSON form.
                println!(
                    "commission_state={}",
                    node.commission_state
                        .map(|value| serde_json::to_string(&value).unwrap_or_default())
                        .unwrap_or_else(|| "\"unknown\"".to_string())
                );
                println!(
                    "install_state={}",
                    node.install_state
                        .map(|value| serde_json::to_string(&value).unwrap_or_default())
                        .unwrap_or_else(|| "\"unknown\"".to_string())
                );
                if let Some(observed_system) = observed_system {
                    println!(
                        "observed_status={}",
                        observed_system.status.unwrap_or_else(|| "unknown".to_string())
                    );
                }
            }
            Ok(())
        }
    })
    .await
}
/// Patch selected lifecycle fields on a stored node record.
///
/// Only arguments passed as `Some(..)` are applied; all other fields of the
/// existing record are preserved. State strings are validated against the
/// serde spelling of their enums before anything is written, and the
/// updated record is echoed as pretty JSON.
pub async fn set_node_states(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    state: Option<String>,
    commission_state: Option<String>,
    install_state: Option<String>,
    power_state: Option<String>,
    bmc_ref: Option<String>,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set node state", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        let state = state.clone();
        let commission_state = commission_state.clone();
        let install_state = install_state.clone();
        let power_state = power_state.clone();
        let bmc_ref = bmc_ref.clone();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let key = key_node(&cluster_namespace, &cluster_id, &node_id);
            let mut node = get_json_key::<ClusterNodeRecord>(&mut client, &key)
                .await?
                .with_context(|| format!("node {} not found", node_id))?;
            if let Some(state) = state {
                node.state = Some(state);
            }
            if let Some(commission_state) = commission_state {
                let parsed = parse_commission_state(&commission_state)?;
                // Stamp only the first transition into Commissioned; later
                // writes keep the original timestamp.
                if matches!(parsed, CommissionState::Commissioned) && node.commissioned_at.is_none()
                {
                    node.commissioned_at = Some(Utc::now());
                }
                node.commission_state = Some(parsed);
            }
            if let Some(install_state) = install_state {
                node.install_state = Some(parse_install_state(&install_state)?);
            }
            if let Some(power_state) = power_state {
                node.power_state = Some(parse_power_state(&power_state)?);
            }
            if let Some(bmc_ref) = bmc_ref {
                node.bmc_ref = Some(bmc_ref);
            }
            client.put(&key, &serde_json::to_vec(&node)?).await?;
            // Echo the final record so callers can confirm the write.
            println!("{}", serde_json::to_string_pretty(&node)?);
            Ok(())
        }
    })
    .await
}
/// Upsert a node's observed-system document, creating it with defaults when
/// absent and patching only the fields supplied as `Some(..)`.
///
/// The record's `node_id` is always overwritten to match the target node,
/// and the final document is echoed as pretty JSON.
#[allow(clippy::too_many_arguments)]
pub async fn set_observed_system(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    status: Option<String>,
    nixos_configuration: Option<String>,
    target_system: Option<String>,
    current_system: Option<String>,
    configured_system: Option<String>,
    booted_system: Option<String>,
    rollback_system: Option<String>,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set observed system", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let node_id = node_id.to_string();
        let status = status.clone();
        let nixos_configuration = nixos_configuration.clone();
        let target_system = target_system.clone();
        let current_system = current_system.clone();
        let configured_system = configured_system.clone();
        let booted_system = booted_system.clone();
        let rollback_system = rollback_system.clone();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let key = key_observed_system(&cluster_namespace, &cluster_id, &node_id);
            // Missing document => start from defaults for this node.
            let mut observed = get_json_key::<ObservedSystemState>(&mut client, &key)
                .await?
                .unwrap_or_else(|| ObservedSystemState {
                    node_id: node_id.clone(),
                    ..ObservedSystemState::default()
                });
            observed.node_id = node_id.clone();
            if let Some(status) = status {
                observed.status = Some(status);
            }
            if let Some(nixos_configuration) = nixos_configuration {
                observed.nixos_configuration = Some(nixos_configuration);
            }
            if let Some(target_system) = target_system {
                observed.target_system = Some(target_system);
            }
            if let Some(current_system) = current_system {
                observed.current_system = Some(current_system);
            }
            if let Some(configured_system) = configured_system {
                observed.configured_system = Some(configured_system);
            }
            if let Some(booted_system) = booted_system {
                observed.booted_system = Some(booted_system);
            }
            if let Some(rollback_system) = rollback_system {
                observed.rollback_system = Some(rollback_system);
            }
            client.put(&key, &serde_json::to_vec(&observed)?).await?;
            println!("{}", serde_json::to_string_pretty(&observed)?);
            Ok(())
        }
    })
    .await
}
/// Print a host deployment's spec and (optional) status, as JSON or flat
/// text. Fails when the spec is missing; a missing status is simply
/// omitted from the text output.
pub async fn inspect_host_deployment(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
    json_output: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "inspect host deployment", |endpoint| {
        // Clone per attempt: the closure can be retried on another endpoint.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let deployment_name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            let spec = get_json_key::<HostDeploymentSpec>(
                &mut client,
                &key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name),
            )
            .await?
            .with_context(|| format!("host deployment {} not found", deployment_name))?;
            let status = get_json_key::<HostDeploymentStatus>(
                &mut client,
                &key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
            )
            .await?;
            if json_output {
                println!(
                    "{}",
                    serde_json::to_string_pretty(&json!({
                        "spec": spec,
                        "status": status,
                    }))?
                );
            } else {
                println!("name={}", spec.name);
                println!(
                    "nixos_configuration={}",
                    spec.nixos_configuration.as_deref().unwrap_or("unknown")
                );
                if let Some(status) = status {
                    println!("phase={}", status.phase.as_deref().unwrap_or("unknown"));
                    println!("paused={}", status.paused);
                    println!("selected_nodes={}", status.selected_nodes.join(","));
                    println!("completed_nodes={}", status.completed_nodes.join(","));
                    println!("failed_nodes={}", status.failed_nodes.join(","));
                }
            }
            Ok(())
        }
    })
    .await
}
/// Pause or resume a HostDeployment by rewriting its status record with
/// operator-owned pause flags; refuses to act on deployments without a spec.
pub async fn set_host_deployment_paused(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
    paused: bool,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "set host deployment pause state", |endpoint| {
        // Own the borrowed arguments so the retried future is self-contained.
        let endpoint = endpoint.to_string();
        let namespace = cluster_namespace.to_string();
        let cluster = cluster_id.to_string();
        let name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            // Never fabricate a status for a deployment that has no spec.
            let spec_key = key_host_deployment_spec(&namespace, &cluster, &name);
            if client.get(&spec_key).await?.is_none() {
                return Err(anyhow::anyhow!("host deployment {} not found", name));
            }
            let status_key = key_host_deployment_status(&namespace, &cluster, &name);
            let existing = get_json_key::<HostDeploymentStatus>(&mut client, &status_key).await?;
            let mut status = match existing {
                Some(status) => status,
                None => HostDeploymentStatus {
                    name: name.clone(),
                    ..HostDeploymentStatus::default()
                },
            };
            status.name = name.clone();
            status.paused_by_operator = paused;
            status.paused = paused;
            let (phase, message) = if paused {
                ("paused", "paused by operator")
            } else {
                ("ready", "resumed by operator")
            };
            status.phase = Some(phase.to_string());
            status.message = Some(message.to_string());
            status.updated_at = Some(Utc::now());
            client.put(&status_key, &serde_json::to_vec(&status)?).await?;
            println!("{}", serde_json::to_string_pretty(&status)?);
            Ok(())
        }
    })
    .await
}
/// Abort a HostDeployment: delete every per-node desired-system record it
/// distributed, flip nodes it left "draining" back to "active", and write an
/// `aborted` + paused status so the controller will not resume the rollout.
pub async fn abort_host_deployment(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Result<()> {
    let endpoints = chainfire_endpoints(endpoint);
    with_chainfire_endpoint_failover(&endpoints, "abort host deployment", |endpoint| {
        // Own the borrowed arguments so the retried future is self-contained.
        let endpoint = endpoint.to_string();
        let cluster_namespace = cluster_namespace.to_string();
        let cluster_id = cluster_id.to_string();
        let deployment_name = deployment_name.to_string();
        async move {
            let mut client = Client::connect(endpoint).await?;
            // The spec must exist; aborting an unknown deployment is an error.
            let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
            if client.get(&spec_key).await?.is_none() {
                return Err(anyhow::anyhow!(
                    "host deployment {} not found",
                    deployment_name
                ));
            }
            // One snapshot of the whole nodes/ keyspace; both passes below walk
            // this snapshot rather than re-reading (deletes in pass 1 are not
            // observed by pass 2, which only touches non-desired-system keys).
            let node_prefix = format!("{}nodes/", cluster_prefix(&cluster_namespace, &cluster_id));
            let existing = client.get_prefix(node_prefix.as_bytes()).await?;
            // Pass 1: delete desired-system records stamped with this deployment
            // and remember which nodes they belonged to.
            let mut cleared_nodes = Vec::new();
            for (key, value) in &existing {
                let key_str = String::from_utf8_lossy(&key);
                if key_str.ends_with("/desired-system") {
                    // Undecodable records are skipped rather than failing the abort.
                    let Ok(desired) = serde_json::from_slice::<DesiredSystemSpec>(value) else {
                        continue;
                    };
                    if desired.deployment_id.as_deref() == Some(deployment_name.as_str()) {
                        client.delete(&key).await?;
                        // NOTE(review): relies on desired.node_id being populated;
                        // a record with an empty node_id will match no node in pass 2.
                        cleared_nodes.push(desired.node_id.clone());
                    }
                }
            }
            // Pass 2: for each top-level node record (key has no '/' after the
            // prefix), un-drain nodes whose desired-system we just cleared.
            for (key, value) in existing {
                let key_str = String::from_utf8_lossy(&key);
                if key_str.ends_with("/desired-system") {
                    continue;
                }
                let node_suffix = key_str
                    .strip_prefix(&node_prefix)
                    .filter(|suffix| !suffix.contains('/'));
                let Some(node_id) = node_suffix else {
                    continue;
                };
                // Skip records that do not decode as node records.
                let mut node = match serde_json::from_slice::<ClusterNodeRecord>(&value) {
                    Ok(node) => node,
                    Err(_) => continue,
                };
                // Only touch nodes this rollout drained; other states are left alone.
                if cleared_nodes.iter().any(|cleared| cleared == node_id)
                    && node.state.as_deref() == Some("draining")
                {
                    node.state = Some("active".to_string());
                    client.put(&key, &serde_json::to_vec(&node)?).await?;
                }
            }
            // Write a terminal, operator-paused status recording what was undone.
            let status = HostDeploymentStatus {
                name: deployment_name.clone(),
                phase: Some("aborted".to_string()),
                paused: true,
                paused_by_operator: true,
                selected_nodes: Vec::new(),
                completed_nodes: Vec::new(),
                in_progress_nodes: Vec::new(),
                failed_nodes: Vec::new(),
                message: Some(format!(
                    "aborted by operator; cleared desired-system from {} node(s)",
                    cleared_nodes.len()
                )),
                updated_at: Some(Utc::now()),
            };
            client
                .put(
                    &key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
                    &serde_json::to_vec(&status)?,
                )
                .await?;
            println!("{}", serde_json::to_string_pretty(&status)?);
            Ok(())
        }
    })
    .await
}
async fn prune_cluster_state(
client: &mut Client,
cluster_namespace: &str,
@ -762,6 +1271,16 @@ async fn prune_cluster_state(
.to_string(),
);
}
for deployment in &spec.host_deployments {
desired_keys.insert(
String::from_utf8_lossy(&key_host_deployment_spec(
cluster_namespace,
cluster_id,
&deployment.name,
))
.to_string(),
);
}
for svc in &spec.services {
desired_keys.insert(
String::from_utf8_lossy(&key_service(cluster_namespace, cluster_id, &svc.name))
@ -893,11 +1412,18 @@ mod tests {
failure_domain: Some("rack-a".to_string()),
nix_profile: None,
install_plan: None,
hardware_facts: None,
desired_system: None,
state: Some(match NodeState::Pending {
NodeState::Pending => "pending".to_string(),
_ => unreachable!(),
}),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: None,
}],
node_classes: vec![deployer_types::NodeClassSpec {
@ -922,6 +1448,7 @@ mod tests {
labels: HashMap::from([("env".to_string(), "dev".to_string())]),
}],
enrollment_rules: vec![],
host_deployments: vec![],
services: vec![],
instances: vec![],
mtls_policies: vec![],
@ -983,11 +1510,13 @@ mod tests {
let mut spec = test_spec();
spec.nodes[0].desired_system = Some(DesiredSystemSpec {
node_id: String::new(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()],
rollback_on_failure: Some(false),
drain_before_apply: Some(false),
});
let resolved = resolve_nodes(&spec).unwrap();
@ -1012,6 +1541,14 @@ mod tests {
&format!("{}nodes/node01/observed-system", prefix),
&prefix
));
assert!(is_prunable_key(
&format!("{}deployments/hosts/worker-rollout/spec", prefix),
&prefix
));
assert!(!is_prunable_key(
&format!("{}deployments/hosts/worker-rollout/status", prefix),
&prefix
));
}
}
@ -1028,6 +1565,7 @@ fn is_prunable_key(key: &str, prefix: &str) -> bool {
key.starts_with(&format!("{}node-classes/", prefix))
|| key.starts_with(&format!("{}pools/", prefix))
|| key.starts_with(&format!("{}enrollment-rules/", prefix))
|| key.starts_with(&format!("{}deployments/hosts/", prefix)) && key.ends_with("/spec")
|| key.starts_with(&format!("{}services/", prefix))
|| key.starts_with(&format!("{}instances/", prefix))
|| key.starts_with(&format!("{}mtls/policies/", prefix))

View file

@ -5,6 +5,7 @@ use clap::{Parser, Subcommand, ValueEnum};
use tracing_subscriber::EnvFilter;
mod chainfire;
mod power;
mod remote;
/// Deployer control CLI for PhotonCloud.
@ -82,6 +83,132 @@ enum Command {
#[arg(long, default_value = "status")]
action: String,
},
/// ノード単位の inventory / lifecycle 状態を確認・更新する
Node {
#[command(subcommand)]
command: NodeCommand,
},
/// HostDeployment rollout object を確認・操作する
Deployment {
#[command(subcommand)]
command: DeploymentCommand,
},
}
// NOTE(review): the `///` doc comments below surface verbatim as clap help
// text at runtime, so they are deliberately left untranslated; English glosses
// are provided as regular comments.
#[derive(Subcommand, Debug)]
enum NodeCommand {
    // Show a node's record, optionally including its desired/observed system keys.
    /// 指定ノードの記録と関連 state を表示する
    Inspect {
        #[arg(long)]
        node_id: String,
        #[arg(long, default_value_t = false)]
        include_desired_system: bool,
        #[arg(long, default_value_t = false)]
        include_observed_system: bool,
        #[arg(long, value_enum, default_value_t = DumpFormat::Json)]
        format: DumpFormat,
    },
    // Update a node's lifecycle / commissioning / install / power state fields;
    // each flag is optional so only the given fields are changed.
    /// 指定ノードの lifecycle / commissioning 状態を更新する
    SetState {
        #[arg(long)]
        node_id: String,
        #[arg(long, value_enum)]
        state: Option<NodeLifecycleStateArg>,
        #[arg(long, value_enum)]
        commission_state: Option<CommissionStateArg>,
        #[arg(long, value_enum)]
        install_state: Option<InstallStateArg>,
        #[arg(long, value_enum)]
        power_state: Option<PowerStateArg>,
        #[arg(long)]
        bmc_ref: Option<String>,
    },
    // Update fields of a node's observed-system record; unspecified flags keep
    // the stored values.
    /// 指定ノードの observed-system を更新する
    SetObserved {
        #[arg(long)]
        node_id: String,
        #[arg(long)]
        status: Option<String>,
        #[arg(long)]
        nixos_configuration: Option<String>,
        #[arg(long)]
        target_system: Option<String>,
        #[arg(long)]
        current_system: Option<String>,
        #[arg(long)]
        configured_system: Option<String>,
        #[arg(long)]
        booted_system: Option<String>,
        #[arg(long)]
        rollback_system: Option<String>,
    },
    // Execute a BMC power action (on/off/cycle/refresh) against the node.
    /// 指定ノードの電源操作を行う
    Power {
        #[arg(long)]
        node_id: String,
        #[arg(long, value_enum)]
        action: PowerActionArg,
    },
    // Request a reinstall of the node, optionally power-cycling it via the BMC.
    /// 指定ノードに再インストールを要求する
    Reinstall {
        #[arg(long)]
        node_id: String,
        #[arg(long, default_value_t = false)]
        power_cycle: bool,
    },
}
// NOTE(review): the `///` doc comments below are clap help text shown to users
// at runtime, so they are left untranslated; English glosses are in regular
// comments.
#[derive(Subcommand, Debug)]
enum DeploymentCommand {
    // Show a HostDeployment's spec and status.
    /// HostDeployment の spec/status を表示する
    Inspect {
        #[arg(long)]
        name: String,
        #[arg(long, value_enum, default_value_t = DumpFormat::Json)]
        format: DumpFormat,
    },
    // Pause a HostDeployment rollout.
    /// HostDeployment を一時停止する
    Pause {
        #[arg(long)]
        name: String,
    },
    // Resume a previously paused HostDeployment.
    /// HostDeployment を再開する
    Resume {
        #[arg(long)]
        name: String,
    },
    // Abort a HostDeployment and revoke distributed desired-system records.
    /// HostDeployment を中止し、配布済み desired-system を取り消す
    Abort {
        #[arg(long)]
        name: String,
    },
}
#[derive(Clone, Copy, Debug, ValueEnum)]
@ -90,6 +217,103 @@ enum DumpFormat {
Json,
}
/// CLI value for a node's lifecycle state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum NodeLifecycleStateArg {
    Pending,
    Provisioning,
    Active,
    Failed,
    Draining,
}

impl NodeLifecycleStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            NodeLifecycleStateArg::Draining => "draining",
            NodeLifecycleStateArg::Failed => "failed",
            NodeLifecycleStateArg::Active => "active",
            NodeLifecycleStateArg::Provisioning => "provisioning",
            NodeLifecycleStateArg::Pending => "pending",
        }
    }
}
/// CLI value for a node's commissioning state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum CommissionStateArg {
    Discovered,
    Commissioning,
    Commissioned,
}

impl CommissionStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            CommissionStateArg::Commissioned => "commissioned",
            CommissionStateArg::Commissioning => "commissioning",
            CommissionStateArg::Discovered => "discovered",
        }
    }
}
/// CLI value for a node's install state; mirrors the snake_case strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum InstallStateArg {
    Pending,
    Installing,
    Installed,
    Failed,
    ReinstallRequested,
}

impl InstallStateArg {
    /// Canonical snake_case form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            InstallStateArg::ReinstallRequested => "reinstall_requested",
            InstallStateArg::Failed => "failed",
            InstallStateArg::Installed => "installed",
            InstallStateArg::Installing => "installing",
            InstallStateArg::Pending => "pending",
        }
    }
}
/// CLI value for a node's power state; mirrors the lowercase strings
/// persisted in node records.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum PowerStateArg {
    On,
    Off,
    Cycling,
    Unknown,
}

impl PowerStateArg {
    /// Canonical lowercase form written to the state store.
    fn as_str(self) -> &'static str {
        match self {
            PowerStateArg::Unknown => "unknown",
            PowerStateArg::Cycling => "cycling",
            PowerStateArg::Off => "off",
            PowerStateArg::On => "on",
        }
    }
}
/// CLI value for a BMC power action; converted to the lowercase action string
/// consumed by the power module.
#[derive(Clone, Copy, Debug, ValueEnum)]
enum PowerActionArg {
    On,
    Off,
    Cycle,
    Refresh,
}

impl PowerActionArg {
    /// Lowercase action string passed through to the power adapter.
    fn as_str(self) -> &'static str {
        match self {
            PowerActionArg::Refresh => "refresh",
            PowerActionArg::Cycle => "cycle",
            PowerActionArg::Off => "off",
            PowerActionArg::On => "on",
        }
    }
}
#[tokio::main]
async fn main() -> Result<()> {
let env_filter =
@ -139,6 +363,149 @@ async fn main() -> Result<()> {
Command::Deployer { endpoint, action } => {
remote::run_deployer_command(&endpoint, &action).await?;
}
Command::Node { command } => {
let cluster_id = cli
.cluster_id
.as_deref()
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for node commands"))?;
match command {
NodeCommand::Inspect {
node_id,
include_desired_system,
include_observed_system,
format,
} => {
chainfire::inspect_node(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
include_desired_system,
include_observed_system,
matches!(format, DumpFormat::Json),
)
.await?;
}
NodeCommand::SetState {
node_id,
state,
commission_state,
install_state,
power_state,
bmc_ref,
} => {
chainfire::set_node_states(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
state.map(|value| value.as_str().to_string()),
commission_state.map(|value| value.as_str().to_string()),
install_state.map(|value| value.as_str().to_string()),
power_state.map(|value| value.as_str().to_string()),
bmc_ref,
)
.await?;
}
NodeCommand::SetObserved {
node_id,
status,
nixos_configuration,
target_system,
current_system,
configured_system,
booted_system,
rollback_system,
} => {
chainfire::set_observed_system(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
status,
nixos_configuration,
target_system,
current_system,
configured_system,
booted_system,
rollback_system,
)
.await?;
}
NodeCommand::Power { node_id, action } => {
power::power_node(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
action.as_str(),
)
.await?;
}
NodeCommand::Reinstall {
node_id,
power_cycle,
} => {
power::request_reinstall(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&node_id,
power_cycle,
)
.await?;
}
}
}
Command::Deployment { command } => {
let cluster_id = cli
.cluster_id
.as_deref()
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for deployment commands"))?;
match command {
DeploymentCommand::Inspect { name, format } => {
chainfire::inspect_host_deployment(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
matches!(format, DumpFormat::Json),
)
.await?;
}
DeploymentCommand::Pause { name } => {
chainfire::set_host_deployment_paused(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
true,
)
.await?;
}
DeploymentCommand::Resume { name } => {
chainfire::set_host_deployment_paused(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
false,
)
.await?;
}
DeploymentCommand::Abort { name } => {
chainfire::abort_host_deployment(
&cli.chainfire_endpoint,
&cli.cluster_namespace,
cluster_id,
&name,
)
.await?;
}
}
}
}
Ok(())

View file

@ -0,0 +1,372 @@
use anyhow::{Context, Result};
use chainfire_client::Client;
use deployer_types::{ClusterNodeRecord, InstallState, PowerState};
use reqwest::{Client as HttpClient, Url};
use serde::Deserialize;
use serde_json::json;
/// Key-space prefix for all state of one cluster: `<namespace>/clusters/<id>/`.
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    [cluster_namespace, "/clusters/", cluster_id, "/"].concat()
}
/// ChainFire key of the top-level node record: `<cluster prefix>nodes/<node_id>`.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}").into_bytes()
}
/// ChainFire key of a node's desired-system record.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/desired-system").into_bytes()
}
/// ChainFire key of a node's observed-system record.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/observed-system").into_bytes()
}
/// Split a comma-separated endpoint list, trimming whitespace and dropping
/// empty entries.
fn chainfire_endpoints(raw: &str) -> Vec<String> {
    let mut endpoints = Vec::new();
    for piece in raw.split(',') {
        let trimmed = piece.trim();
        if !trimmed.is_empty() {
            endpoints.push(trimmed.to_string());
        }
    }
    endpoints
}
/// Power operations the CLI can request against a node's BMC.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PowerAction {
    On,
    Off,
    Cycle,
    Refresh,
}

impl PowerAction {
    /// Parse the CLI-facing action string; anything unrecognized is an error.
    fn parse(value: &str) -> Result<Self> {
        match value {
            "refresh" => Ok(PowerAction::Refresh),
            "cycle" => Ok(PowerAction::Cycle),
            "off" => Ok(PowerAction::Off),
            "on" => Ok(PowerAction::On),
            other => Err(anyhow::anyhow!("unsupported power action {}", other)),
        }
    }

    /// Redfish `ResetType` payload for this action. `None` means the action is
    /// a read-only refresh and no reset request is sent.
    fn reset_type(self) -> Option<&'static str> {
        match self {
            PowerAction::Refresh => None,
            PowerAction::Cycle => Some("PowerCycle"),
            PowerAction::Off => Some("ForceOff"),
            PowerAction::On => Some("On"),
        }
    }
}
/// A single Redfish ComputerSystem endpoint derived from a node's `bmc_ref`.
#[derive(Debug)]
struct RedfishTarget {
    /// Normalized system-resource URL: credentials and query string stripped,
    /// path expanded to a full `/redfish/v1/Systems/...` path.
    resource_url: Url,
    /// HTTP basic-auth user extracted from the reference URL, if any.
    username: Option<String>,
    /// HTTP basic-auth password extracted from the reference URL, if any.
    password: Option<String>,
    /// Accept invalid TLS certificates (set via `?insecure=1|true` on the reference).
    insecure: bool,
}
/// Minimal projection of a Redfish ComputerSystem resource; only the
/// `PowerState` property is consumed.
#[derive(Debug, Deserialize)]
struct RedfishSystemView {
    #[serde(rename = "PowerState")]
    power_state: Option<String>,
}
impl RedfishTarget {
    /// Parse a `redfish://`, `redfish+http://`, or `redfish+https://` BMC
    /// reference into a sanitized target. The bare `redfish://` scheme is
    /// treated as HTTPS. Credentials and the `insecure` query flag are
    /// extracted, then removed from the stored URL so they never leak via
    /// logs or the resource path.
    fn parse(reference: &str) -> Result<Self> {
        let rewritten = if let Some(rest) = reference.strip_prefix("redfish+http://") {
            format!("http://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish+https://") {
            format!("https://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish://") {
            // Default to TLS when the scheme does not say otherwise.
            format!("https://{rest}")
        } else {
            return Err(anyhow::anyhow!(
                "unsupported BMC reference {}; expected redfish:// or redfish+http(s)://",
                reference
            ));
        };
        let mut resource_url = Url::parse(&rewritten)
            .with_context(|| format!("failed to parse BMC reference {}", reference))?;
        // Read insecure flag, credentials, and path BEFORE stripping them below.
        let insecure = resource_url
            .query_pairs()
            .any(|(key, value)| key == "insecure" && (value == "1" || value == "true"));
        let username = if resource_url.username().is_empty() {
            None
        } else {
            Some(resource_url.username().to_string())
        };
        let password = resource_url.password().map(ToOwned::to_owned);
        let system_path = normalize_redfish_system_path(resource_url.path());
        resource_url
            .set_username("")
            .map_err(|_| anyhow::anyhow!("failed to clear username from BMC reference"))?;
        resource_url
            .set_password(None)
            .map_err(|_| anyhow::anyhow!("failed to clear password from BMC reference"))?;
        resource_url.set_query(None);
        resource_url.set_path(&system_path);
        Ok(Self {
            resource_url,
            username,
            password,
            insecure,
        })
    }

    /// URL of the ComputerSystem.Reset action relative to the system resource.
    fn action_url(&self) -> Result<Url> {
        let mut action_url = self.resource_url.clone();
        let path = format!(
            "{}/Actions/ComputerSystem.Reset",
            self.resource_url.path().trim_end_matches('/')
        );
        action_url.set_path(&path);
        Ok(action_url)
    }

    /// Execute a power action. For reset-style actions a ComputerSystem.Reset
    /// POST is sent first; the returned state is then re-read from the BMC,
    /// except for `Cycle`, which is reported as `Cycling` without probing
    /// because the final state is not yet knowable.
    async fn perform(&self, action: PowerAction) -> Result<PowerState> {
        let client = HttpClient::builder()
            .danger_accept_invalid_certs(self.insecure)
            .build()
            .context("failed to create Redfish client")?;
        if let Some(reset_type) = action.reset_type() {
            let request = self
                .with_auth(client.post(self.action_url()?))
                .json(&json!({ "ResetType": reset_type }));
            request
                .send()
                .await
                .context("failed to send Redfish reset request")?
                .error_for_status()
                .context("Redfish reset request failed")?;
        }
        match action {
            PowerAction::Cycle => Ok(PowerState::Cycling),
            PowerAction::On | PowerAction::Off | PowerAction::Refresh => self.refresh(&client).await,
        }
    }

    /// GET the system resource and map its reported `PowerState`.
    /// NOTE(review): after On/Off the BMC may still report the previous state
    /// if the transition has not completed — confirm callers tolerate this.
    async fn refresh(&self, client: &HttpClient) -> Result<PowerState> {
        let response = self
            .with_auth(client.get(self.resource_url.clone()))
            .send()
            .await
            .context("failed to query Redfish system resource")?
            .error_for_status()
            .context("Redfish system query failed")?;
        let system: RedfishSystemView = response
            .json()
            .await
            .context("failed to decode Redfish system response")?;
        map_redfish_power_state(system.power_state.as_deref())
    }

    /// Attach HTTP basic auth when the reference URL carried a username;
    /// requests are sent unauthenticated otherwise.
    fn with_auth(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match self.username.as_deref() {
            Some(username) => request.basic_auth(username, self.password.clone()),
            None => request,
        }
    }
}
/// Expand a user-supplied BMC URL path into a full Redfish ComputerSystem
/// resource path. An empty path falls back to Dell's conventional
/// `System.Embedded.1`; a path already under `/redfish/` is kept verbatim;
/// anything else is treated as a system id under `/redfish/v1/Systems/`.
fn normalize_redfish_system_path(path: &str) -> String {
    let trimmed = path.trim();
    match trimmed {
        "" | "/" => "/redfish/v1/Systems/System.Embedded.1".to_string(),
        full if full.starts_with("/redfish/") => full.to_string(),
        short => format!("/redfish/v1/Systems/{}", short.trim_start_matches('/')),
    }
}
/// Translate a Redfish `PowerState` string into the internal `PowerState`.
///
/// Matching is ASCII-case-insensitive; a missing value is treated as
/// "Unknown". Transitional states collapse to `Cycling`; any other string is
/// an error so callers never persist a state we cannot represent.
fn map_redfish_power_state(value: Option<&str>) -> Result<PowerState> {
    let normalized = value.unwrap_or("Unknown").to_ascii_lowercase();
    if normalized == "on" {
        return Ok(PowerState::On);
    }
    if normalized == "off" {
        return Ok(PowerState::Off);
    }
    if matches!(normalized.as_str(), "poweringon" | "poweringoff" | "cycling") {
        return Ok(PowerState::Cycling);
    }
    if normalized == "unknown" {
        return Ok(PowerState::Unknown);
    }
    Err(anyhow::anyhow!(
        "unsupported Redfish power state {}",
        normalized
    ))
}
async fn load_node_record(
endpoint: &str,
cluster_namespace: &str,
cluster_id: &str,
node_id: &str,
) -> Result<(Client, ClusterNodeRecord, Vec<u8>)> {
let endpoints = chainfire_endpoints(endpoint);
let mut last_error = None;
for endpoint in endpoints {
match Client::connect(endpoint.clone()).await {
Ok(mut client) => {
let key = key_node(cluster_namespace, cluster_id, node_id);
let Some(bytes) = client.get(&key).await? else {
return Err(anyhow::anyhow!("node {} not found", node_id));
};
let node = serde_json::from_slice::<ClusterNodeRecord>(&bytes)
.context("failed to decode node record")?;
return Ok((client, node, key));
}
Err(error) => last_error = Some(anyhow::Error::new(error)),
}
}
Err(last_error.unwrap_or_else(|| anyhow::anyhow!("no Chainfire endpoints configured")))
}
/// Execute a power action against a node's BMC and persist the resulting
/// power state onto the node record.
///
/// Fails if the action string is unknown or the node record has no `bmc_ref`.
/// The updated record is printed as pretty JSON for operator feedback.
pub async fn power_node(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    action: &str,
) -> Result<()> {
    // Validate the action before touching the store or the BMC.
    let action = PowerAction::parse(action)?;
    let (mut client, mut node, key) =
        load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;
    let bmc_ref = node
        .bmc_ref
        .clone()
        .with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
    let target = RedfishTarget::parse(&bmc_ref)?;
    // Perform the Redfish action first; the record is only written on success,
    // so the stored power_state never reflects an undelivered action.
    let power_state = target.perform(action).await?;
    node.power_state = Some(power_state);
    client.put(&key, &serde_json::to_vec(&node)?).await?;
    println!("{}", serde_json::to_string_pretty(&node)?);
    Ok(())
}
/// Request a reinstall of a node: move its lifecycle back to `provisioning`,
/// flag `ReinstallRequested`, optionally power-cycle it via the BMC, and clear
/// its desired-/observed-system records so the installer starts from scratch.
///
/// With `power_cycle` set, a missing `bmc_ref` is an error and the node record
/// is left unwritten.
pub async fn request_reinstall(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    power_cycle: bool,
) -> Result<()> {
    let (mut client, mut node, key) =
        load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;
    node.state = Some("provisioning".to_string());
    node.install_state = Some(InstallState::ReinstallRequested);
    if power_cycle {
        let bmc_ref = node
            .bmc_ref
            .clone()
            .with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
        let target = RedfishTarget::parse(&bmc_ref)?;
        node.power_state = Some(target.perform(PowerAction::Cycle).await?);
    }
    // Persist the updated record first, then drop the per-node system keys so
    // no stale desired/observed state survives the reinstall request.
    client.put(&key, &serde_json::to_vec(&node)?).await?;
    client
        .delete(&key_desired_system(cluster_namespace, cluster_id, node_id))
        .await?;
    client
        .delete(&key_observed_system(cluster_namespace, cluster_id, node_id))
        .await?;
    println!("{}", serde_json::to_string_pretty(&node)?);
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use axum::{extract::State, http::StatusCode, routing::{get, post}, Json, Router};
    use serde_json::Value;
    use std::sync::{Arc, Mutex};
    use tokio::net::TcpListener;

    // Bare `redfish://` references must default to HTTPS and have their path
    // expanded under /redfish/v1/Systems/.
    #[test]
    fn parse_redfish_short_reference_defaults_to_https() {
        let parsed = RedfishTarget::parse("redfish://lab-bmc/node01").unwrap();
        assert_eq!(parsed.resource_url.as_str(), "https://lab-bmc/redfish/v1/Systems/node01");
    }

    // Credentials and the insecure flag are extracted locally and stripped
    // from the stored URL.
    #[test]
    fn parse_redfish_explicit_http_reference_keeps_query_flags_local() {
        let parsed =
            RedfishTarget::parse("redfish+http://user:pass@127.0.0.1/system-1?insecure=1").unwrap();
        assert_eq!(
            parsed.resource_url.as_str(),
            "http://127.0.0.1/redfish/v1/Systems/system-1"
        );
        assert_eq!(parsed.username.as_deref(), Some("user"));
        assert_eq!(parsed.password.as_deref(), Some("pass"));
        assert!(parsed.insecure);
    }

    // End-to-end check against an in-process fake Redfish server: Refresh
    // sends no reset, Off posts a ForceOff payload, and both read back the
    // state the server reports.
    #[tokio::test]
    async fn redfish_adapter_refreshes_and_resets_power() {
        // Records every reset payload the fake BMC receives.
        #[derive(Clone, Default)]
        struct TestState {
            seen_payloads: Arc<Mutex<Vec<String>>>,
        }
        async fn system_handler() -> Json<Value> {
            Json(json!({ "PowerState": "On" }))
        }
        async fn reset_handler(
            State(state): State<TestState>,
            Json(payload): Json<Value>,
        ) -> StatusCode {
            state
                .seen_payloads
                .lock()
                .unwrap()
                .push(payload.to_string());
            StatusCode::NO_CONTENT
        }
        let state = TestState::default();
        let app = Router::new()
            .route("/redfish/v1/Systems/node01", get(system_handler))
            .route(
                "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset",
                post(reset_handler),
            )
            .with_state(state.clone());
        // Bind to an ephemeral port so tests can run in parallel.
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let server = tokio::spawn(async move {
            axum::serve(listener, app).await.unwrap();
        });
        let target = RedfishTarget::parse(&format!(
            "redfish+http://{}/redfish/v1/Systems/node01",
            addr
        ))
        .unwrap();
        assert_eq!(target.perform(PowerAction::Refresh).await.unwrap(), PowerState::On);
        // Off re-reads the state after the reset; the fake always reports On.
        assert_eq!(target.perform(PowerAction::Off).await.unwrap(), PowerState::On);
        let payloads = state.seen_payloads.lock().unwrap().clone();
        assert_eq!(payloads, vec![r#"{"ResetType":"ForceOff"}"#.to_string()]);
        server.abort();
    }
}

View file

@ -29,6 +29,7 @@ tracing-subscriber = { workspace = true }
chrono = { workspace = true }
rcgen = { workspace = true }
clap = { workspace = true }
sha2 = "0.10"
# ChainFire for state management
chainfire-client = { workspace = true }

View file

@ -1,9 +1,11 @@
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
use chrono::Utc;
use deployer_types::{
EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo,
NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse,
CommissionState, EnrollmentRuleSpec, HardwareFacts, InstallPlan, InstallState,
NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState, PhoneHomeRequest,
PhoneHomeResponse, PowerState,
};
use sha2::{Digest, Sha256};
use std::sync::Arc;
use tracing::{debug, error, info, warn};
@ -49,6 +51,14 @@ fn merge_hardware_summary_metadata(
}
}
/// Hex-encoded SHA-256 of the serialized hardware facts, stored as
/// `last_inventory_hash` to detect inventory changes between reports.
///
/// Returns `None` when no facts were supplied or serialization fails (best
/// effort: a missing hash only disables drift detection).
/// NOTE(review): assumes `serde_json::to_vec` is deterministic for
/// `HardwareFacts` (no unordered maps) — confirm, otherwise the hash can flap
/// without a real inventory change.
fn inventory_hash(hardware_facts: Option<&HardwareFacts>) -> Option<String> {
    let hardware_facts = hardware_facts?;
    let payload = serde_json::to_vec(hardware_facts).ok()?;
    let mut hasher = Sha256::new();
    hasher.update(payload);
    Some(format!("{:x}", hasher.finalize()))
}
/// POST /api/v1/phone-home
///
/// Handles node registration during first boot.
@ -794,6 +804,21 @@ async fn store_cluster_node_if_configured(
install_plan: node_config.install_plan.clone(),
hardware_facts: hardware_facts.cloned(),
state: Some(format!("{:?}", node_info.state).to_lowercase()),
commission_state: hardware_facts.map(|_| CommissionState::Discovered),
install_state: node_config.install_plan.as_ref().map(|_| InstallState::Pending),
commissioned_at: None,
last_inventory_hash: inventory_hash(hardware_facts),
power_state: node_info
.metadata
.get("power_state")
.and_then(|value| match value.as_str() {
"on" => Some(PowerState::On),
"off" => Some(PowerState::Off),
"cycling" => Some(PowerState::Cycling),
"unknown" => Some(PowerState::Unknown),
_ => None,
}),
bmc_ref: node_info.metadata.get("bmc_ref").cloned(),
last_heartbeat: Some(node_info.last_heartbeat),
};

View file

@ -24,6 +24,62 @@ impl Default for NodeState {
}
}
/// Commissioning lifecycle for inventory-driven bare-metal onboarding.
///
/// Serialized as lowercase `snake_case` strings (`discovered`, …) wherever it
/// is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommissionState {
    /// Node has been discovered and reported inventory but not yet approved.
    Discovered,
    /// Manual or automated commissioning is actively validating the node.
    Commissioning,
    /// Inventory has been accepted and the node can be installed or rolled out.
    Commissioned,
}

impl Default for CommissionState {
    /// Newly seen nodes start as `Discovered` until explicitly approved.
    fn default() -> Self {
        CommissionState::Discovered
    }
}
/// Installation lifecycle for host provisioning and reprovisioning.
///
/// Serialized as `snake_case` strings (e.g. `reinstall_requested`) wherever it
/// is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InstallState {
    /// No install is currently running, but an install may be planned.
    Pending,
    /// Bootstrap or reinstall is actively writing the target system.
    Installing,
    /// The desired system has been installed successfully.
    Installed,
    /// Installation failed and needs operator or controller intervention.
    Failed,
    /// A reinstall has been requested but not started yet.
    ReinstallRequested,
}

impl Default for InstallState {
    /// Nodes with no recorded install activity default to `Pending`.
    fn default() -> Self {
        InstallState::Pending
    }
}
/// Best-effort power state tracked by external management adapters.
///
/// Serialized as lowercase `snake_case` strings wherever it is persisted.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PowerState {
    /// Adapter last reported the node powered on.
    On,
    /// Adapter last reported the node powered off.
    Off,
    /// A power transition (e.g. cycle) is in flight; final state unknown.
    Cycling,
    /// No adapter has reported a state, or the last probe was inconclusive.
    Unknown,
}

impl Default for PowerState {
    /// Absent any adapter report, the state is `Unknown`.
    fn default() -> Self {
        PowerState::Unknown
    }
}
/// Node information tracked by Deployer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeInfo {
@ -492,6 +548,18 @@ pub struct ClusterNodeRecord {
pub hardware_facts: Option<HardwareFacts>,
#[serde(default)]
pub state: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commission_state: Option<CommissionState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub install_state: Option<InstallState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commissioned_at: Option<DateTime<Utc>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_inventory_hash: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub power_state: Option<PowerState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bmc_ref: Option<String>,
#[serde(default)]
pub last_heartbeat: Option<DateTime<Utc>>,
}
@ -534,6 +602,8 @@ pub struct DesiredSystemSpec {
#[serde(default)]
pub node_id: String,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub deployment_id: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub nixos_configuration: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub flake_ref: Option<String>,
@ -543,6 +613,8 @@ pub struct DesiredSystemSpec {
pub health_check_command: Vec<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub rollback_on_failure: Option<bool>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub drain_before_apply: Option<bool>,
}
/// Cluster metadata (PhotonCloud scope).
@ -576,9 +648,23 @@ pub struct NodeSpec {
#[serde(default)]
pub install_plan: Option<InstallPlan>,
#[serde(default)]
pub hardware_facts: Option<HardwareFacts>,
#[serde(default)]
pub desired_system: Option<DesiredSystemSpec>,
#[serde(default)]
pub state: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commission_state: Option<CommissionState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub install_state: Option<InstallState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub commissioned_at: Option<DateTime<Utc>>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_inventory_hash: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub power_state: Option<PowerState>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub bmc_ref: Option<String>,
#[serde(default)]
pub last_heartbeat: Option<DateTime<Utc>>,
}
@ -647,6 +733,74 @@ pub struct EnrollmentRuleSpec {
pub node_id_prefix: Option<String>,
}
/// Selector used by host deployments to target bare-metal nodes declaratively.
///
/// All criteria default to empty. NOTE(review): how the criteria combine
/// (AND/OR, empty-means-all) is decided by the rollout controller, which is
/// not visible here — confirm before relying on specific semantics.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentSelector {
    /// Explicit node ids to target.
    #[serde(default)]
    pub node_ids: Vec<String>,
    /// Node roles used for matching.
    #[serde(default)]
    pub roles: Vec<String>,
    /// Node pools used for matching.
    #[serde(default)]
    pub pools: Vec<String>,
    /// Node classes used for matching.
    #[serde(default)]
    pub node_classes: Vec<String>,
    /// Label key/value pairs used for matching.
    #[serde(default)]
    pub match_labels: HashMap<String, String>,
}
/// Declarative rollout intent for host-level NixOS updates.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct HostDeploymentSpec {
    /// Deployment name; also used as the `deployment_id` stamped onto the
    /// per-node `DesiredSystemSpec` records distributed by this rollout.
    pub name: String,
    /// Which nodes this rollout targets.
    #[serde(default)]
    pub selector: HostDeploymentSelector,
    /// NixOS configuration attribute to apply on the selected nodes.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub nixos_configuration: Option<String>,
    /// Flake reference the configuration is resolved from.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub flake_ref: Option<String>,
    /// Batching knob — NOTE(review): exact semantics are the controller's; confirm there.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub batch_size: Option<u32>,
    /// Availability knob — NOTE(review): exact semantics are the controller's; confirm there.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_unavailable: Option<u32>,
    /// Command run on each node to judge rollout health; empty means no check.
    #[serde(default)]
    pub health_check_command: Vec<String>,
    /// How the new system is activated (mirrors `DesiredSystemSpec::switch_action`).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub switch_action: Option<String>,
    /// Whether failed nodes roll back automatically.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rollback_on_failure: Option<bool>,
    /// Whether nodes are drained before the new system is applied.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub drain_before_apply: Option<bool>,
    /// Reboot policy string — NOTE(review): accepted values live in the controller.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reboot_policy: Option<String>,
    /// Spec-level pause intent; distinct from operator pauses recorded on status.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub paused: Option<bool>,
}
/// Controller-observed rollout state for a host deployment.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentStatus {
    /// Deployment name this status belongs to (mirrors the spec's name).
    #[serde(default)]
    pub name: String,
    /// Coarse rollout phase: "idle", "invalid", "paused", "completed",
    /// "running" or "ready".
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase: Option<String>,
    /// Effective pause: operator pause, spec pause, or any failed node.
    #[serde(default)]
    pub paused: bool,
    /// Sticky operator-initiated pause, preserved across reconcile passes.
    #[serde(default)]
    pub paused_by_operator: bool,
    /// All node IDs matched by the deployment selector.
    #[serde(default)]
    pub selected_nodes: Vec<String>,
    /// Nodes the controller considers done for the current target.
    #[serde(default)]
    pub completed_nodes: Vec<String>,
    /// Nodes currently draining or applying the target configuration.
    #[serde(default)]
    pub in_progress_nodes: Vec<String>,
    /// Nodes whose rollout attempt failed; any entry pauses the rollout.
    #[serde(default)]
    pub failed_nodes: Vec<String>,
    /// Human-readable progress summary.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
    /// Timestamp of the most recent status computation.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub updated_at: Option<DateTime<Utc>>,
}
/// Service ports for logical service definitions.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ServicePorts {
@ -807,6 +961,8 @@ pub struct ClusterStateSpec {
#[serde(default)]
pub enrollment_rules: Vec<EnrollmentRuleSpec>,
#[serde(default)]
pub host_deployments: Vec<HostDeploymentSpec>,
#[serde(default)]
pub services: Vec<ServiceSpec>,
#[serde(default)]
pub instances: Vec<ServiceInstanceSpec>,
@ -1080,19 +1236,92 @@ mod tests {
fn test_desired_system_spec_roundtrip() {
    // JSON round-trip: every field of DesiredSystemSpec — including the
    // newer deployment_id / drain_before_apply fields — must survive
    // serialize -> deserialize unchanged.
    let desired = DesiredSystemSpec {
        node_id: "node01".to_string(),
        deployment_id: Some("worker-rollout".to_string()),
        nixos_configuration: Some("node01".to_string()),
        flake_ref: Some("/opt/plasmacloud-src".to_string()),
        switch_action: Some("switch".to_string()),
        health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()],
        rollback_on_failure: Some(true),
        drain_before_apply: Some(true),
    };
    let json = serde_json::to_string(&desired).unwrap();
    let decoded: DesiredSystemSpec = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.node_id, "node01");
    assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout"));
    assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01"));
    assert_eq!(decoded.health_check_command.len(), 2);
    assert_eq!(decoded.rollback_on_failure, Some(true));
    assert_eq!(decoded.drain_before_apply, Some(true));
}
#[test]
fn test_host_deployment_roundtrip() {
    // JSON round-trip for a fully-populated HostDeploymentSpec, including a
    // selector exercising every criterion kind (roles, pools, classes, labels).
    let spec = HostDeploymentSpec {
        name: "worker-rollout".to_string(),
        selector: HostDeploymentSelector {
            node_ids: vec![],
            roles: vec!["worker".to_string()],
            pools: vec!["general".to_string()],
            node_classes: vec!["worker-linux".to_string()],
            match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
        },
        nixos_configuration: Some("worker-golden".to_string()),
        flake_ref: Some("/opt/plasmacloud-src".to_string()),
        batch_size: Some(1),
        max_unavailable: Some(1),
        health_check_command: vec!["true".to_string()],
        switch_action: Some("boot".to_string()),
        rollback_on_failure: Some(true),
        drain_before_apply: Some(true),
        reboot_policy: Some("always".to_string()),
        paused: Some(false),
    };
    let json = serde_json::to_string(&spec).unwrap();
    let decoded: HostDeploymentSpec = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.name, "worker-rollout");
    assert_eq!(decoded.batch_size, Some(1));
    assert_eq!(decoded.max_unavailable, Some(1));
    assert_eq!(decoded.selector.roles, vec!["worker".to_string()]);
    assert_eq!(
        decoded.selector.match_labels.get("tier").map(String::as_str),
        Some("general")
    );
    assert_eq!(decoded.drain_before_apply, Some(true));
}
#[test]
fn test_cluster_node_record_commissioning_roundtrip() {
    // Round-trip the commissioning/lifecycle fields added to
    // ClusterNodeRecord (commission_state, install_state, power_state,
    // bmc_ref, commissioned_at, last_inventory_hash) through JSON.
    let node = ClusterNodeRecord {
        node_id: "node01".to_string(),
        machine_id: Some("machine-01".to_string()),
        ip: "10.0.0.11".to_string(),
        hostname: "node01".to_string(),
        roles: vec!["worker".to_string()],
        labels: HashMap::new(),
        pool: Some("general".to_string()),
        node_class: Some("worker-linux".to_string()),
        failure_domain: Some("rack-a".to_string()),
        nix_profile: Some("profiles/worker-linux".to_string()),
        install_plan: None,
        hardware_facts: None,
        state: Some("provisioning".to_string()),
        commission_state: Some(CommissionState::Commissioned),
        install_state: Some(InstallState::Installed),
        commissioned_at: Some(Utc::now()),
        last_inventory_hash: Some("abc123".to_string()),
        power_state: Some(PowerState::On),
        bmc_ref: Some("redfish://lab-rack-a/node01".to_string()),
        last_heartbeat: Some(Utc::now()),
    };
    let json = serde_json::to_string(&node).unwrap();
    let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap();
    assert_eq!(decoded.commission_state, Some(CommissionState::Commissioned));
    assert_eq!(decoded.install_state, Some(InstallState::Installed));
    assert_eq!(decoded.power_state, Some(PowerState::On));
    assert_eq!(decoded.bmc_ref.as_deref(), Some("redfish://lab-rack-a/node01"));
}
#[test]

View file

@ -899,6 +899,12 @@ mod tests {
install_plan: None,
hardware_facts: None,
state: Some("active".to_string()),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
}
}

View file

@ -2,6 +2,7 @@ use std::fs;
use std::path::Path;
use std::process::Stdio;
use std::time::Duration;
use std::time::Instant;
use anyhow::{anyhow, Context, Result};
use chainfire_client::Client;
@ -135,7 +136,15 @@ impl Agent {
}
async fn tick(&self) -> Result<()> {
info!(
endpoint = %self.endpoint,
cluster_namespace = %self.cluster_namespace,
cluster_id = %self.cluster_id,
node_id = %self.node_id,
"starting reconciliation tick"
);
let mut client = Client::connect(self.endpoint.clone()).await?;
info!("connected to ChainFire");
let node_key = key_node(&self.cluster_namespace, &self.cluster_id, &self.node_id);
let node_raw = client.get_with_revision(&node_key).await?;
let Some((node_bytes, _revision)) = node_raw else {
@ -149,6 +158,11 @@ impl Agent {
let node: ClusterNodeRecord =
serde_json::from_slice(&node_bytes).context("failed to parse node record")?;
info!(
hostname = %node.hostname,
state = node.state.as_deref().unwrap_or("unknown"),
"loaded node record"
);
let desired = client
.get(key_desired_system(
@ -160,6 +174,11 @@ impl Agent {
.map(|bytes| serde_json::from_slice::<DesiredSystemSpec>(&bytes))
.transpose()
.context("failed to parse desired-system spec")?;
info!(
has_desired_system = desired.is_some(),
has_install_plan = node.install_plan.is_some(),
"resolved desired-state inputs"
);
let previous_observed = client
.get(key_observed_system(
@ -173,24 +192,87 @@ impl Agent {
.context("failed to parse observed-system state")?;
let mut observed = self.base_observed_state(&node);
observed.status = Some("planning".to_string());
info!(
current_system = observed.current_system.as_deref().unwrap_or(""),
configured_system = observed.configured_system.as_deref().unwrap_or(""),
booted_system = observed.booted_system.as_deref().unwrap_or(""),
"publishing planning status"
);
self.publish_observed_state(&mut client, &observed).await?;
let reconcile_result = self
.reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed)
.reconcile_node(
&node,
desired.as_ref(),
previous_observed.as_ref(),
&mut observed,
)
.await;
if let Err(error) = reconcile_result {
observed.status = Some("failed".to_string());
observed.last_error = Some(error.to_string());
observed.last_error = Some(format!("{error:#}"));
}
info!(
status = observed.status.as_deref().unwrap_or("unknown"),
"publishing final observed status"
);
self.publish_observed_state_with_retry(&observed).await?;
Ok(())
}
/// Serialize `observed` and write it to this node's observed-system key
/// using the already-connected `client`.
///
/// Fix: the span contained both sides of an unresolved diff pair for the
/// serialization argument (`&serde_json::to_vec(&observed)?` and
/// `&serde_json::to_vec(observed)?`); only the corrected single-borrow
/// form is kept — `observed` is already a reference.
async fn publish_observed_state(
    &self,
    client: &mut Client,
    observed: &ObservedSystemState,
) -> Result<()> {
    info!(
        status = observed.status.as_deref().unwrap_or("unknown"),
        "writing observed-system state"
    );
    client
        .put(
            &key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id),
            &serde_json::to_vec(observed)?,
        )
        .await?;
    Ok(())
}
/// Publish the observed-system state, retrying with a fresh ChainFire
/// connection on every attempt.
///
/// Serializes once up front, then loops: connect, put, and on failure sleep
/// 2s and retry until a 30-second deadline has passed, after which the last
/// error is returned. Reconnecting each attempt means a connection that went
/// stale earlier in the tick is never reused for this final write.
async fn publish_observed_state_with_retry(
    &self,
    observed: &ObservedSystemState,
) -> Result<()> {
    let payload = serde_json::to_vec(observed)?;
    let key = key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id);
    // Hard deadline for the whole retry loop, not per attempt.
    let deadline = Instant::now() + Duration::from_secs(30);
    let mut attempt = 1u32;
    loop {
        // Fresh connection per attempt; both connect and put errors funnel
        // into the same retry decision below.
        let result = async {
            let mut client = Client::connect(self.endpoint.clone()).await?;
            client.put(&key, &payload).await?;
            Result::<()>::Ok(())
        }
        .await;
        match result {
            Ok(()) => return Ok(()),
            Err(error) if Instant::now() < deadline => {
                warn!(
                    attempt,
                    error = %error,
                    "failed to publish observed-system state; retrying with a fresh connection"
                );
                attempt += 1;
                sleep(Duration::from_secs(2)).await;
            }
            // Deadline exhausted: surface the most recent error.
            Err(error) => return Err(error),
        }
    }
}
fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
ObservedSystemState {
node_id: node.node_id.clone(),
@ -209,7 +291,18 @@ impl Agent {
observed: &mut ObservedSystemState,
) -> Result<()> {
match node.state.as_deref() {
Some("failed") | Some("draining") => {
Some("failed") => {
observed.status = Some("paused".to_string());
return Ok(());
}
Some("draining")
if !desired
.map(|spec| {
spec.deployment_id.is_some()
&& spec.drain_before_apply.unwrap_or(false)
})
.unwrap_or(false) =>
{
observed.status = Some("paused".to_string());
return Ok(());
}
@ -227,6 +320,14 @@ impl Agent {
observed.status = Some("idle".to_string());
return Ok(());
};
info!(
nixos_configuration = %desired.nixos_configuration,
flake_ref = %desired.flake_ref,
switch_action = %desired.switch_action,
rollback_on_failure = desired.rollback_on_failure,
health_check_command = ?desired.health_check_command,
"resolved desired system"
);
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
observed.flake_root = Some(desired.flake_ref.clone());
@ -236,6 +337,10 @@ impl Agent {
.and_then(|state| state.rollback_system.clone())
.or_else(|| observed.current_system.clone());
observed.rollback_system = previous_system.clone();
info!(
previous_system = previous_system.as_deref().unwrap_or(""),
"selected rollback baseline"
);
let target_system = self
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
.await
@ -246,8 +351,10 @@ impl Agent {
)
})?;
observed.target_system = Some(target_system.clone());
info!(target_system = %target_system, "built target system");
if observed.current_system.as_deref() == Some(target_system.as_str()) {
info!("target system already active");
if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
observed.status = Some("verifying".to_string());
observed.last_attempt = Some(Utc::now());
@ -279,8 +386,14 @@ impl Agent {
observed.status = Some("reconciling".to_string());
observed.last_attempt = Some(Utc::now());
info!(
target_system = %target_system,
switch_action = %desired.switch_action,
"switching to target system"
);
self.switch_to_target(&target_system, &desired.switch_action)
.await?;
info!("switch-to-configuration completed");
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
@ -327,15 +440,20 @@ impl Agent {
/// Build the target NixOS system closure for `configuration` from
/// `flake_ref` and return its /nix/store output path.
///
/// Fix: the span carried leftover pre-refactor diff lines (a second
/// `run_command` invocation without the new flags, and the old
/// `.find(|line| !line.trim().is_empty())` predicate alongside the new
/// store-path filter). Only the final form is kept; the mutable
/// `vec![...]` + `push` is also collapsed into one fixed array.
async fn build_target_system(&self, flake_ref: &str, configuration: &str) -> Result<String> {
    let flake_attr = target_flake_attr(flake_ref, configuration);
    info!(flake_attr = %flake_attr, "building target system");
    // -L streams full build logs; --no-write-lock-file keeps the agent
    // from mutating the source checkout.
    let build_args = [
        "build",
        "-L",
        "--no-link",
        "--no-write-lock-file",
        "--print-out-paths",
        flake_attr.as_str(),
    ];
    let output = run_command("nix", &build_args).await?;
    // Only trust lines that are actually store paths; other stdout lines
    // are ignored.
    let path = output
        .lines()
        .map(str::trim)
        .find(|line| line.starts_with("/nix/store/"))
        .ok_or_else(|| anyhow!("nix build returned no output path"))?;
    Ok(path.to_string())
}
@ -349,7 +467,12 @@ impl Agent {
));
}
run_command(
info!(
switch_bin = %switch_bin.display(),
switch_action = %switch_action,
"executing switch-to-configuration"
);
run_command_inherit_output(
switch_bin
.to_str()
.ok_or_else(|| anyhow!("invalid switch path"))?,
@ -369,9 +492,15 @@ impl Agent {
return Ok(HealthCheckOutcome::Passed);
}
info!(
command = ?desired.health_check_command,
rollback_on_failure = desired.rollback_on_failure,
"running post-activation health check"
);
if let Err(error) = run_vec_command(&desired.health_check_command).await {
let error_message = format!("health check failed after activation: {error}");
if desired.rollback_on_failure {
info!("health check failed; rolling back to previous system");
self.rollback_to_previous(previous_system).await?;
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
@ -385,6 +514,7 @@ impl Agent {
return Err(anyhow!(error_message));
}
info!("post-activation health check passed");
Ok(HealthCheckOutcome::Passed)
}
@ -392,7 +522,42 @@ impl Agent {
let previous_system = previous_system
.filter(|value| !value.is_empty())
.ok_or_else(|| anyhow!("rollback requested but no previous system is known"))?;
self.switch_to_target(previous_system, "switch").await
info!(previous_system = %previous_system, "rolling back to previous system");
let switch_bin = Path::new(previous_system).join("bin/switch-to-configuration");
if switch_bin.exists() {
return self.switch_to_target(previous_system, "switch").await;
}
let activate = Path::new(previous_system).join("activate");
if !activate.exists() {
return Err(anyhow!(
"previous system {} does not contain switch-to-configuration or activate",
previous_system
));
}
info!(
previous_system = %previous_system,
activate = %activate.display(),
"previous system lacks switch-to-configuration; falling back to profile set + activate"
);
run_command(
"nix-env",
&[
"--profile",
"/nix/var/nix/profiles/system",
"--set",
previous_system,
],
)
.await?;
run_command_inherit_output(
activate
.to_str()
.ok_or_else(|| anyhow!("invalid activate path"))?,
&[],
)
.await
}
}
@ -458,6 +623,8 @@ fn read_symlink_target(path: &str) -> Option<String> {
}
async fn run_command(program: &str, args: &[&str]) -> Result<String> {
let started_at = Instant::now();
info!(program = %program, args = ?args, "running command");
let output = Command::new(program)
.args(args)
.stdin(Stdio::null())
@ -468,10 +635,25 @@ async fn run_command(program: &str, args: &[&str]) -> Result<String> {
.with_context(|| format!("failed to execute {}", program))?;
if output.status.success() {
info!(
program = %program,
args = ?args,
elapsed_ms = started_at.elapsed().as_millis(),
"command completed successfully"
);
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
} else {
let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
warn!(
program = %program,
args = ?args,
elapsed_ms = started_at.elapsed().as_millis(),
status = %output.status,
stdout = %stdout,
stderr = %stderr,
"command failed"
);
Err(anyhow!(
"{} {:?} failed with status {}: stdout='{}' stderr='{}'",
program,
@ -491,6 +673,47 @@ async fn run_vec_command(command: &[String]) -> Result<String> {
run_command(program, &arg_refs).await
}
/// Run `program` with `args`, letting stdout/stderr stream straight to the
/// agent's own output (used for long-running tools like
/// switch-to-configuration, whose logs should be visible live).
///
/// Returns Ok(()) on a zero exit status; otherwise an error carrying the
/// program, its arguments, and the exit status.
async fn run_command_inherit_output(program: &str, args: &[&str]) -> Result<()> {
    let started_at = Instant::now();
    info!(
        program = %program,
        args = ?args,
        "running command with inherited output"
    );
    let status = Command::new(program)
        .args(args)
        .stdin(Stdio::null())
        .stdout(Stdio::inherit())
        .stderr(Stdio::inherit())
        .status()
        .await
        .with_context(|| format!("failed to execute {}", program))?;
    // Guard clause: report and bail on a non-zero exit status.
    if !status.success() {
        warn!(
            program = %program,
            args = ?args,
            elapsed_ms = started_at.elapsed().as_millis(),
            status = %status,
            "command failed"
        );
        return Err(anyhow!(
            "{} {:?} failed with status {}",
            program,
            args,
            status
        ));
    }
    info!(
        program = %program,
        args = ?args,
        elapsed_ms = started_at.elapsed().as_millis(),
        "command completed successfully"
    );
    Ok(())
}
#[tokio::main]
async fn main() -> Result<()> {
tracing_subscriber::fmt()
@ -543,6 +766,12 @@ mod tests {
}),
hardware_facts: None,
state: Some("active".to_string()),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: None,
}
}
@ -568,11 +797,13 @@ mod tests {
fn resolve_desired_system_prefers_chainfire_spec() {
let desired = DesiredSystemSpec {
node_id: "node01".to_string(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()],
rollback_on_failure: Some(true),
drain_before_apply: Some(false),
};
let resolved = resolve_desired_system(
@ -595,11 +826,13 @@ mod tests {
fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
let desired = DesiredSystemSpec {
node_id: "node01".to_string(),
deployment_id: None,
nixos_configuration: Some("node01-next".to_string()),
flake_ref: None,
switch_action: None,
health_check_command: Vec::new(),
rollback_on_failure: None,
drain_before_apply: None,
};
let resolved = resolve_desired_system(
@ -631,7 +864,10 @@ mod tests {
#[test]
fn read_symlink_target_returns_none_for_missing_path() {
assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None);
assert_eq!(
read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"),
None
);
}
#[test]

View file

@ -9,6 +9,8 @@ repository.workspace = true
[dependencies]
anyhow.workspace = true
chainfire-client.workspace = true
chrono.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio.workspace = true
@ -16,5 +18,6 @@ tracing.workspace = true
tracing-subscriber.workspace = true
fiberlb-api.workspace = true
flashdns-api.workspace = true
deployer-types.workspace = true
clap = { version = "4.5", features = ["derive"] }
tonic = "0.12"

View file

@ -0,0 +1,823 @@
use anyhow::Result;
use chainfire_client::Client;
use chrono::Utc;
use clap::Args;
use deployer_types::{
ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector,
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState, ServiceInstanceSpec,
};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::time::Duration;
use tokio::time::sleep;
use tracing::{info, warn};
/// Root key prefix for all state owned by one cluster:
/// `<namespace>/clusters/<cluster-id>/` (note the trailing slash).
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    let mut prefix = String::new();
    prefix.push_str(cluster_namespace);
    prefix.push_str("/clusters/");
    prefix.push_str(cluster_id);
    prefix.push('/');
    prefix
}
/// Storage key for a node record:
/// `<namespace>/clusters/<cluster-id>/nodes/<node-id>`.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}").into_bytes()
}
/// Storage key for a node's desired-system spec:
/// `<namespace>/clusters/<cluster-id>/nodes/<node-id>/desired-system`.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/desired-system")
        .into_bytes()
}
/// Storage key for a host deployment's controller status:
/// `<namespace>/clusters/<cluster-id>/deployments/hosts/<name>/status`.
fn key_host_deployment_status(
    cluster_namespace: &str,
    cluster_id: &str,
    deployment_name: &str,
) -> Vec<u8> {
    format!(
        "{cluster_namespace}/clusters/{cluster_id}/deployments/hosts/{deployment_name}/status"
    )
    .into_bytes()
}
// CLI arguments for the host-deployment controller. Plain `//` comments are
// used deliberately: `///` doc comments on clap Args fields would be turned
// into --help text and change the CLI's output.
#[derive(Debug, Clone, Args)]
pub struct HostsCommand {
    // ChainFire endpoint the controller connects to on every reconcile pass.
    #[arg(long)]
    pub endpoint: String,
    // Key namespace shared by all clusters.
    #[arg(long, default_value = "photoncloud")]
    pub cluster_namespace: String,
    // Cluster whose nodes and host deployments this controller manages.
    #[arg(long)]
    pub cluster_id: String,
    // Seconds to sleep between reconcile passes (unused with --once).
    #[arg(long, default_value_t = 15)]
    pub interval_secs: u64,
    // Nodes whose last heartbeat is older than this are excluded as rollout
    // candidates; 0 disables the heartbeat check entirely.
    #[arg(long, default_value_t = 300)]
    pub heartbeat_timeout_secs: u64,
    // Log the computed plan without writing anything back to the store.
    #[arg(long, default_value_t = false)]
    pub dry_run: bool,
    // Run a single reconcile pass and exit instead of looping forever.
    #[arg(long, default_value_t = false)]
    pub once: bool,
}
/// Entry point for the `hosts` subcommand: drive the host-deployment
/// controller either once (`--once`) or as a periodic reconcile loop.
///
/// In loop mode a failed pass is logged and retried after the configured
/// interval; the loop never returns.
pub async fn run(command: HostsCommand) -> Result<()> {
    let controller = HostDeploymentController::new(command);
    if controller.once {
        controller.reconcile_once().await
    } else {
        loop {
            if let Err(error) = controller.reconcile_once().await {
                warn!(error = %error, "host deployment reconciliation failed");
            }
            sleep(controller.interval).await;
        }
    }
}
/// Runtime configuration for the host-deployment reconcile loop,
/// materialized from [`HostsCommand`] by `new`.
struct HostDeploymentController {
    // ChainFire endpoint used for every (re)connection.
    endpoint: String,
    // Key namespace and cluster that scope all reads and writes.
    cluster_namespace: String,
    cluster_id: String,
    // Delay between reconcile passes in loop mode.
    interval: Duration,
    // Heartbeat staleness cutoff for rollout candidates (0 = disabled).
    heartbeat_timeout_secs: u64,
    // When set, plans are logged but never written back.
    dry_run: bool,
    // When set, `run` performs a single pass and exits.
    once: bool,
}
impl HostDeploymentController {
    /// Build a controller from parsed CLI arguments.
    fn new(command: HostsCommand) -> Self {
        Self {
            endpoint: command.endpoint,
            cluster_namespace: command.cluster_namespace,
            cluster_id: command.cluster_id,
            interval: Duration::from_secs(command.interval_secs),
            heartbeat_timeout_secs: command.heartbeat_timeout_secs,
            dry_run: command.dry_run,
            once: command.once,
        }
    }

    /// One full reconcile pass: snapshot all inputs from ChainFire, compute
    /// a plan per host deployment, then (unless dry-run) write the plan's
    /// desired-system upserts/deletes, node updates, and status back.
    async fn reconcile_once(&self) -> Result<()> {
        let mut client = Client::connect(self.endpoint.clone()).await?;
        // Snapshot every input before planning so all deployments in this
        // pass see the same view of the cluster.
        let nodes = self.load_nodes(&mut client).await?;
        let desired_systems = self.load_desired_systems(&mut client).await?;
        let observed_systems = self.load_observed_systems(&mut client).await?;
        let instances = self.load_instances(&mut client).await?;
        let deployments = self.load_host_deployments(&mut client).await?;
        let statuses = self.load_host_deployment_statuses(&mut client).await?;
        info!(
            nodes = nodes.len(),
            deployments = deployments.len(),
            instances = instances.len(),
            "loaded host deployment inputs"
        );
        for deployment in deployments {
            let existing_status = statuses.get(&deployment.name).cloned();
            // Pure planning step; all store writes happen below.
            let plan = plan_host_deployment(
                &deployment,
                existing_status.as_ref(),
                &nodes,
                &desired_systems,
                &observed_systems,
                &instances,
                self.heartbeat_timeout_secs,
            );
            if self.dry_run {
                info!(
                    deployment = %deployment.name,
                    phase = plan.status.phase.as_deref().unwrap_or("unknown"),
                    desired_upserts = plan.desired_upserts.len(),
                    desired_deletes = plan.desired_deletes.len(),
                    node_updates = plan.node_updates.len(),
                    "would reconcile host deployment"
                );
                continue;
            }
            // Write per-node desired-system specs for newly started nodes.
            for desired in &plan.desired_upserts {
                client
                    .put(
                        &key_desired_system(
                            &self.cluster_namespace,
                            &self.cluster_id,
                            &desired.node_id,
                        ),
                        &serde_json::to_vec(desired)?,
                    )
                    .await?;
            }
            // Remove desired-system specs for nodes no longer selected.
            for node_id in &plan.desired_deletes {
                client
                    .delete(&key_desired_system(
                        &self.cluster_namespace,
                        &self.cluster_id,
                        node_id,
                    ))
                    .await?;
            }
            // Persist node state transitions (e.g. active <-> draining).
            for node in plan.node_updates.values() {
                client
                    .put(
                        &key_node(&self.cluster_namespace, &self.cluster_id, &node.node_id),
                        &serde_json::to_vec(node)?,
                    )
                    .await?;
            }
            // Status is written last, after the plan's writes have landed.
            client
                .put(
                    &key_host_deployment_status(
                        &self.cluster_namespace,
                        &self.cluster_id,
                        &deployment.name,
                    ),
                    &serde_json::to_vec(&plan.status)?,
                )
                .await?;
        }
        Ok(())
    }

    /// Load every node record under `nodes/`, skipping sub-keys (anything
    /// with a '/' after the node id, e.g. desired-system/observed-system)
    /// and logging, not failing on, undecodable entries. Sorted by node_id.
    async fn load_nodes(&self, client: &mut Client) -> Result<Vec<ClusterNodeRecord>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut nodes = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            let Some(suffix) = key.strip_prefix(&prefix) else {
                continue;
            };
            // Only bare `nodes/<id>` keys are node records.
            if suffix.contains('/') {
                continue;
            }
            match serde_json::from_slice::<ClusterNodeRecord>(&value) {
                Ok(node) => nodes.push(node),
                Err(error) => warn!(error = %error, key = %key, "failed to decode cluster node"),
            }
        }
        nodes.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
        Ok(nodes)
    }

    /// Load all `nodes/<id>/desired-system` specs, keyed by the node_id
    /// embedded in each spec. Undecodable entries are logged and skipped.
    async fn load_desired_systems(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, DesiredSystemSpec>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut desired = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/desired-system") {
                continue;
            }
            match serde_json::from_slice::<DesiredSystemSpec>(&value) {
                Ok(spec) => {
                    desired.insert(spec.node_id.clone(), spec);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode desired-system"),
            }
        }
        Ok(desired)
    }

    /// Load all `nodes/<id>/observed-system` states, keyed by node_id.
    /// Undecodable entries are logged and skipped.
    async fn load_observed_systems(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, ObservedSystemState>> {
        let prefix = format!(
            "{}nodes/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut observed = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/observed-system") {
                continue;
            }
            match serde_json::from_slice::<ObservedSystemState>(&value) {
                Ok(state) => {
                    observed.insert(state.node_id.clone(), state);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode observed-system"),
            }
        }
        Ok(observed)
    }

    /// Load every service instance under `instances/` (used to decide when
    /// a draining node is empty). Undecodable entries are logged and skipped.
    async fn load_instances(&self, client: &mut Client) -> Result<Vec<ServiceInstanceSpec>> {
        let prefix = format!(
            "{}instances/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut instances = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
                Ok(instance) => instances.push(instance),
                Err(error) => warn!(error = %error, key = %key, "failed to decode service instance"),
            }
        }
        Ok(instances)
    }

    /// Load all host deployment specs (`deployments/hosts/<name>/spec`),
    /// sorted by name for deterministic reconcile order.
    async fn load_host_deployments(&self, client: &mut Client) -> Result<Vec<HostDeploymentSpec>> {
        let prefix = format!(
            "{}deployments/hosts/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut deployments = Vec::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/spec") {
                continue;
            }
            match serde_json::from_slice::<HostDeploymentSpec>(&value) {
                Ok(spec) => deployments.push(spec),
                Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment"),
            }
        }
        deployments.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
        Ok(deployments)
    }

    /// Load previously written deployment statuses
    /// (`deployments/hosts/<name>/status`), keyed by deployment name.
    async fn load_host_deployment_statuses(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, HostDeploymentStatus>> {
        let prefix = format!(
            "{}deployments/hosts/",
            cluster_prefix(&self.cluster_namespace, &self.cluster_id)
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut statuses = HashMap::new();
        for (key, value) in kvs {
            let key = String::from_utf8_lossy(&key);
            if !key.ends_with("/status") {
                continue;
            }
            match serde_json::from_slice::<HostDeploymentStatus>(&value) {
                Ok(status) => {
                    statuses.insert(status.name.clone(), status);
                }
                Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment status"),
            }
        }
        Ok(statuses)
    }
}
/// Output of `plan_host_deployment`: everything `reconcile_once` must write
/// back to the store for one deployment.
#[derive(Debug, Default)]
struct HostDeploymentPlan {
    // Freshly computed status to persist under the deployment's status key.
    status: HostDeploymentStatus,
    // Per-node desired-system specs to create/overwrite (newly started nodes).
    desired_upserts: Vec<DesiredSystemSpec>,
    // Node IDs whose desired-system spec (owned by this deployment) should
    // be deleted because the node is no longer selected.
    desired_deletes: Vec<String>,
    // Node records to rewrite (state transitions); BTreeMap keyed by
    // node_id gives a deterministic write order.
    node_updates: BTreeMap<String, ClusterNodeRecord>,
}
/// Compute one deployment's rollout plan from a consistent snapshot of the
/// cluster. Pure: no I/O, all effects are returned in the plan.
///
/// Nodes matched by the selector are classified as completed / failed /
/// in-progress / eligible; new rollouts start only while within both the
/// batch budget (`batch_size` - in_progress) and the availability budget
/// (`max_unavailable` - in_progress - failed), and only while the
/// deployment is not paused (operator pause, spec pause, or any failure).
fn plan_host_deployment(
    deployment: &HostDeploymentSpec,
    existing_status: Option<&HostDeploymentStatus>,
    nodes: &[ClusterNodeRecord],
    desired_systems: &HashMap<String, DesiredSystemSpec>,
    observed_systems: &HashMap<String, ObservedSystemState>,
    instances: &[ServiceInstanceSpec],
    heartbeat_timeout_secs: u64,
) -> HostDeploymentPlan {
    let now = Utc::now();
    let target_configuration = deployment.nixos_configuration.clone();
    let selector_matches = select_nodes(nodes, &deployment.selector);
    let selected_node_ids = selector_matches
        .iter()
        .map(|node| node.node_id.clone())
        .collect::<HashSet<_>>();
    let instance_counts = active_instances_per_node(instances);
    let mut completed = Vec::new();
    let mut in_progress = Vec::new();
    let mut failed = Vec::new();
    let mut eligible_candidates = Vec::new();
    let mut desired_upserts = Vec::new();
    let mut node_updates = BTreeMap::new();
    // Budgets are clamped to at least 1 so a zero in the spec cannot wedge
    // the rollout forever.
    let batch_size = deployment.batch_size.unwrap_or(1).max(1) as usize;
    let max_unavailable = deployment.max_unavailable.unwrap_or(1).max(1) as usize;
    // Operator pause is sticky: carried over from the previous status.
    let operator_paused = existing_status
        .map(|status| status.paused_by_operator)
        .unwrap_or(false);
    let spec_paused = deployment.paused.unwrap_or(false);
    // Garbage-collect desired-system specs this deployment owns for nodes
    // that the selector no longer matches.
    let mut desired_deletes = desired_systems
        .iter()
        .filter(|(node_id, desired)| {
            desired.deployment_id.as_deref() == Some(deployment.name.as_str())
                && !selected_node_ids.contains(node_id.as_str())
        })
        .map(|(node_id, _)| node_id.clone())
        .collect::<Vec<_>>();
    // Classify every selected node exactly once, in priority order:
    // completed > failed > in-progress > eligible candidate.
    for node in &selector_matches {
        let desired = desired_systems.get(&node.node_id);
        let observed = observed_systems.get(&node.node_id);
        let is_completed =
            is_node_completed(deployment, node, desired, observed, target_configuration.as_deref());
        let is_failed = is_node_failed(deployment, desired, observed);
        // A node still draining with live instances also counts as
        // in-progress, even before its agent reports anything.
        let is_in_progress = is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
            || (deployment.drain_before_apply == Some(true)
                && node.state.as_deref() == Some("draining")
                && instance_counts.get(&node.node_id).copied().unwrap_or_default() > 0);
        if is_completed {
            completed.push(node.node_id.clone());
            // Un-drain nodes that finished while we had them draining.
            if deployment.drain_before_apply == Some(true) && node.state.as_deref() == Some("draining")
            {
                let mut updated = (*node).clone();
                updated.state = Some("active".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
            }
            continue;
        }
        if is_failed {
            failed.push(node.node_id.clone());
            continue;
        }
        if is_in_progress {
            in_progress.push(node.node_id.clone());
            continue;
        }
        if node_is_rollout_candidate(node, heartbeat_timeout_secs) {
            eligible_candidates.push((*node).clone());
        }
    }
    let unavailable = in_progress.len() + failed.len();
    // Any failed node pauses further starts until resolved.
    let paused = operator_paused || spec_paused || !failed.is_empty();
    let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable);
    let remaining_batch_budget = batch_size.saturating_sub(in_progress.len());
    // A deployment without a target configuration can never start nodes.
    let max_starts = if deployment.nixos_configuration.is_some() {
        remaining_unavailable_budget.min(remaining_batch_budget)
    } else {
        0
    };
    let mut planned = 0usize;
    let mut newly_started = Vec::new();
    if !paused && max_starts > 0 {
        for node in eligible_candidates {
            if planned >= max_starts {
                break;
            }
            let remaining_instances = instance_counts.get(&node.node_id).copied().unwrap_or_default();
            // Drain-first: mark the node draining and wait; its
            // desired-system spec is written on a later pass, once the node
            // is empty.
            if deployment.drain_before_apply == Some(true) && remaining_instances > 0 {
                let mut updated = node.clone();
                updated.state = Some("draining".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
                in_progress.push(node.node_id.clone());
                newly_started.push(node.node_id.clone());
                planned += 1;
                continue;
            }
            // Start the rollout on this node: materialize a desired-system
            // spec from the deployment, filling in the documented defaults.
            let desired = DesiredSystemSpec {
                node_id: node.node_id.clone(),
                deployment_id: Some(deployment.name.clone()),
                nixos_configuration: deployment.nixos_configuration.clone(),
                flake_ref: deployment.flake_ref.clone(),
                switch_action: deployment.switch_action.clone().or_else(|| Some("switch".to_string())),
                health_check_command: deployment.health_check_command.clone(),
                rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)),
                drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)),
            };
            newly_started.push(node.node_id.clone());
            in_progress.push(node.node_id.clone());
            planned += 1;
            // Drain-enabled deployments keep the node marked draining for
            // the duration of the apply, even when it was already empty.
            if deployment.drain_before_apply == Some(true) && node.state.as_deref() != Some("draining")
            {
                let mut updated = node.clone();
                updated.state = Some("draining".to_string());
                node_updates.insert(updated.node_id.clone(), updated);
            }
            desired_upserts.push(desired);
        }
    }
    // Rebuild the status on top of the previous one so unknown/sticky
    // fields survive.
    let mut status = existing_status.cloned().unwrap_or_default();
    status.name = deployment.name.clone();
    status.selected_nodes = selector_matches.iter().map(|node| node.node_id.clone()).collect();
    status.completed_nodes = dedup_sorted(completed);
    status.in_progress_nodes = dedup_sorted(in_progress);
    status.failed_nodes = dedup_sorted(failed);
    status.paused_by_operator = operator_paused;
    status.paused = paused;
    status.phase = Some(if status.selected_nodes.is_empty() {
        "idle"
    } else if deployment.nixos_configuration.is_none() {
        "invalid"
    } else if status.paused {
        "paused"
    } else if status.completed_nodes.len() == status.selected_nodes.len() {
        "completed"
    } else if !newly_started.is_empty() || !status.in_progress_nodes.is_empty() {
        "running"
    } else {
        "ready"
    }
    .to_string());
    status.message = Some(format!(
        "selected={} completed={} in_progress={} failed={} newly_started={}",
        status.selected_nodes.len(),
        status.completed_nodes.len(),
        status.in_progress_nodes.len(),
        status.failed_nodes.len(),
        newly_started.len()
    ));
    status.updated_at = Some(now);
    HostDeploymentPlan {
        status,
        desired_upserts,
        desired_deletes: {
            desired_deletes.sort();
            desired_deletes.dedup();
            desired_deletes
        },
        node_updates,
    }
}
/// Resolve a deployment selector against the node inventory.
///
/// Non-empty `node_ids` pin the candidate set; every other non-empty
/// criterion (roles, pools, node_classes, match_labels) must also match.
/// Empty criteria match all nodes. The result is sorted by node_id.
fn select_nodes<'a>(
    nodes: &'a [ClusterNodeRecord],
    selector: &HostDeploymentSelector,
) -> Vec<&'a ClusterNodeRecord> {
    let pinned: HashSet<&String> = selector.node_ids.iter().collect();

    let mut matched: Vec<&'a ClusterNodeRecord> = Vec::new();
    for node in nodes {
        // Explicit node list, when present, is a hard pre-filter.
        if !pinned.is_empty() && !pinned.contains(&node.node_id) {
            continue;
        }
        let role_ok = selector.roles.is_empty()
            || node.roles.iter().any(|role| selector.roles.contains(role));
        // Nodes without a pool/class never satisfy a non-empty criterion.
        let pool_ok = selector.pools.is_empty()
            || node
                .pool
                .as_ref()
                .map_or(false, |pool| selector.pools.contains(pool));
        let class_ok = selector.node_classes.is_empty()
            || node
                .node_class
                .as_ref()
                .map_or(false, |class| selector.node_classes.contains(class));
        let labels_ok = selector
            .match_labels
            .iter()
            .all(|(key, value)| node.labels.get(key) == Some(value));
        if role_ok && pool_ok && class_ok && labels_ok {
            matched.push(node);
        }
    }
    matched.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
    matched
}
/// Count service instances per node, skipping instances whose state is
/// "failed" or "deleted" (those no longer occupy capacity on the node).
fn active_instances_per_node(instances: &[ServiceInstanceSpec]) -> HashMap<String, usize> {
    let mut per_node: HashMap<String, usize> = HashMap::new();
    for instance in instances {
        let state = instance.state.as_deref();
        if state == Some("failed") || state == Some("deleted") {
            continue;
        }
        let slot = per_node.entry(instance.node_id.clone()).or_default();
        *slot += 1;
    }
    per_node
}
/// Decide whether a node may participate in a host rollout wave.
///
/// Excluded: nodes still being commissioned, nodes mid-install (or with a
/// failed / requested reinstall), and nodes that are neither "active" nor
/// "draining". A `heartbeat_timeout_secs` of 0 disables the freshness check;
/// otherwise a heartbeat older than the timeout disqualifies the node, while
/// a node that never heartbeated is still accepted.
fn node_is_rollout_candidate(node: &ClusterNodeRecord, heartbeat_timeout_secs: u64) -> bool {
    let still_commissioning = matches!(
        node.commission_state,
        Some(CommissionState::Discovered | CommissionState::Commissioning)
    );
    if still_commissioning {
        return false;
    }
    let install_in_flight = matches!(
        node.install_state,
        Some(
            InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested
        )
    );
    if install_in_flight {
        return false;
    }
    match node.state.as_deref() {
        Some("active") | Some("draining") => {}
        _ => return false,
    }
    if heartbeat_timeout_secs == 0 {
        return true;
    }
    match node.last_heartbeat {
        // No heartbeat recorded yet: treat as a candidate rather than stale.
        None => true,
        Some(last) => {
            let age_secs = Utc::now().signed_duration_since(last).num_seconds();
            age_secs <= heartbeat_timeout_secs as i64
        }
    }
}
/// A node counts as completed when BOTH hold:
/// 1. its observed system is "active" and reports the target configuration;
/// 2. its desired-system record is owned by this deployment (matching
///    `deployment_id`).
fn is_node_completed(
    deployment: &HostDeploymentSpec,
    _node: &ClusterNodeRecord,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
    target_configuration: Option<&str>,
) -> bool {
    let observed_matches_target = match (observed, target_configuration) {
        (Some(observed), Some(target)) => {
            observed.status.as_deref() == Some("active")
                && observed.nixos_configuration.as_deref() == Some(target)
        }
        _ => false,
    };
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    observed_matches_target && owned_by_deployment
}
/// A node counts as failed when this deployment owns its desired-system
/// record and the observed status is "failed" or "rolled-back".
fn is_node_failed(
    deployment: &HostDeploymentSpec,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
) -> bool {
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    if !owned_by_deployment {
        return false;
    }
    matches!(
        observed.and_then(|observed| observed.status.as_deref()),
        Some("failed") | Some("rolled-back")
    )
}
/// A node counts as in-progress when it is neither completed nor failed
/// (those terminal states win) AND either this deployment owns its
/// desired-system record, or the observed status is a transitional one.
fn is_node_in_progress(
    deployment: &HostDeploymentSpec,
    desired: Option<&DesiredSystemSpec>,
    observed: Option<&ObservedSystemState>,
    is_completed: bool,
    is_failed: bool,
) -> bool {
    if is_completed || is_failed {
        return false;
    }
    let owned_by_deployment = desired
        .and_then(|desired| desired.deployment_id.as_deref())
        == Some(deployment.name.as_str());
    if owned_by_deployment {
        return true;
    }
    matches!(
        observed.and_then(|observed| observed.status.as_deref()),
        Some("planning" | "pending" | "reconciling" | "verifying" | "staged")
    )
}
/// Sort a list of node ids and drop duplicates, returning the cleaned list.
/// Unstable sort is fine here: equal strings are indistinguishable, and
/// `dedup` only needs equal elements to be adjacent.
fn dedup_sorted(mut values: Vec<String>) -> Vec<String> {
    values.sort_unstable();
    values.dedup();
    values
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a healthy, commissioned worker-node fixture in the given failure
    /// domain. Its roles/pool/node_class/labels are chosen to match the
    /// selector produced by `test_deployment`, and its fresh heartbeat keeps
    /// it eligible as a rollout candidate.
    fn test_node(node_id: &str, failure_domain: &str) -> ClusterNodeRecord {
        ClusterNodeRecord {
            node_id: node_id.to_string(),
            machine_id: None,
            ip: "10.0.0.1".to_string(),
            hostname: node_id.to_string(),
            roles: vec!["worker".to_string()],
            labels: HashMap::from([
                ("tier".to_string(), "general".to_string()),
                ("failure_domain".to_string(), failure_domain.to_string()),
            ]),
            pool: Some("general".to_string()),
            node_class: Some("worker-linux".to_string()),
            failure_domain: Some(failure_domain.to_string()),
            nix_profile: None,
            install_plan: None,
            hardware_facts: None,
            state: Some("active".to_string()),
            commission_state: Some(CommissionState::Commissioned),
            install_state: Some(InstallState::Installed),
            commissioned_at: None,
            last_inventory_hash: None,
            power_state: None,
            bmc_ref: None,
            last_heartbeat: Some(Utc::now()),
        }
    }

    /// Baseline rollout spec: batch size 1, max unavailable 1, no drain
    /// before apply, rollback on failure, not paused.
    fn test_deployment() -> HostDeploymentSpec {
        HostDeploymentSpec {
            name: "worker-rollout".to_string(),
            selector: HostDeploymentSelector {
                node_ids: vec![],
                roles: vec!["worker".to_string()],
                pools: vec!["general".to_string()],
                node_classes: vec!["worker-linux".to_string()],
                match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
            },
            nixos_configuration: Some("worker-golden".to_string()),
            flake_ref: Some("/opt/plasmacloud-src".to_string()),
            batch_size: Some(1),
            max_unavailable: Some(1),
            health_check_command: vec!["true".to_string()],
            switch_action: Some("switch".to_string()),
            rollback_on_failure: Some(true),
            drain_before_apply: Some(false),
            reboot_policy: None,
            paused: Some(false),
        }
    }

    // With batch_size = 1 and two eligible nodes, the plan should target
    // exactly one node per wave and report the deployment as "running".
    #[test]
    fn plan_rollout_starts_one_node_per_batch() {
        let deployment = test_deployment();
        let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &HashMap::new(),
            &HashMap::new(),
            &[],
            300,
        );
        assert_eq!(plan.desired_upserts.len(), 1);
        assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
        assert_eq!(plan.status.phase.as_deref(), Some("running"));
    }

    // A node observed as "rolled-back" under this deployment's desired-system
    // record must be reported failed, and the rollout must auto-pause without
    // starting further nodes.
    #[test]
    fn plan_rollout_pauses_on_failed_node() {
        let deployment = test_deployment();
        let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
        let desired = HashMap::from([(
            "node01".to_string(),
            DesiredSystemSpec {
                node_id: "node01".to_string(),
                deployment_id: Some("worker-rollout".to_string()),
                nixos_configuration: Some("worker-golden".to_string()),
                flake_ref: None,
                switch_action: Some("switch".to_string()),
                health_check_command: Vec::new(),
                rollback_on_failure: Some(true),
                drain_before_apply: Some(false),
            },
        )]);
        let observed = HashMap::from([(
            "node01".to_string(),
            ObservedSystemState {
                node_id: "node01".to_string(),
                nixos_configuration: Some("worker-golden".to_string()),
                status: Some("rolled-back".to_string()),
                ..ObservedSystemState::default()
            },
        )]);
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &desired,
            &observed,
            &[],
            300,
        );
        assert!(plan.desired_upserts.is_empty());
        assert!(plan.status.paused);
        assert_eq!(plan.status.failed_nodes, vec!["node01".to_string()]);
    }

    // With drain_before_apply enabled and an active instance still on the
    // node, the planner should mark the node "draining" (via node_updates)
    // and defer the desired-system upsert rather than applying immediately.
    #[test]
    fn plan_rollout_drains_before_apply_when_instances_exist() {
        let mut deployment = test_deployment();
        deployment.drain_before_apply = Some(true);
        let nodes = vec![test_node("node01", "rack-a")];
        let instances = vec![ServiceInstanceSpec {
            instance_id: "api-node01".to_string(),
            service: "api".to_string(),
            node_id: "node01".to_string(),
            ip: "10.0.0.1".to_string(),
            port: 8080,
            mesh_port: None,
            version: None,
            health_check: None,
            process: None,
            container: None,
            managed_by: Some("fleet-scheduler".to_string()),
            state: Some("active".to_string()),
            last_heartbeat: None,
            observed_at: None,
        }];
        let plan = plan_host_deployment(
            &deployment,
            None,
            &nodes,
            &HashMap::new(),
            &HashMap::new(),
            &instances,
            300,
        );
        assert!(plan.desired_upserts.is_empty());
        assert_eq!(
            plan.node_updates
                .get("node01")
                .and_then(|node| node.state.as_deref()),
            Some("draining")
        );
        assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
    }
}

View file

@ -29,9 +29,9 @@ use fiberlb_api::{
};
use flashdns_api::RecordServiceClient;
use flashdns_api::ReverseZoneServiceClient;
use flashdns_api::ZoneServiceClient;
use flashdns_api::proto::{
reverse_zone_service_client::ReverseZoneServiceClient,
record_data, ARecord, AaaaRecord, CaaRecord, CnameRecord, CreateRecordRequest,
CreateReverseZoneRequest, CreateZoneRequest, DeleteRecordRequest, DeleteReverseZoneRequest,
DeleteZoneRequest, ListReverseZonesRequest, MxRecord, NsRecord, PtrRecord, RecordData,
@ -39,6 +39,8 @@ use flashdns_api::proto::{
ZoneInfo,
};
mod hosts;
#[derive(Parser)]
#[command(author, version, about)]
struct Cli {
@ -71,6 +73,9 @@ enum Command {
#[arg(long, default_value_t = false)]
prune: bool,
},
/// Reconcile host deployments into per-node desired-system state
Hosts(hosts::HostsCommand),
}
#[derive(Debug, Deserialize)]
@ -294,6 +299,9 @@ async fn main() -> Result<()> {
let spec: DnsConfig = read_json(&config).await?;
reconcile_dns(spec, endpoint, prune).await?;
}
Command::Hosts(command) => {
hosts::run(command).await?;
}
}
Ok(())

View file

@ -7,6 +7,30 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
run_deployer_server_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_SERVER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- "$@"
fi
}
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
deployer_pid=""
@ -128,7 +152,7 @@ role = "voter"
EOF
echo "Starting ChainFire on 127.0.0.1:${api_port}"
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
run_chainfire_server_bin \
--config "$tmp_dir/chainfire.toml" \
>"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
@ -155,7 +179,7 @@ namespace = "deployer"
EOF
echo "Starting Deployer on 127.0.0.1:${deployer_port}"
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- \
run_deployer_server_bin \
--config "$tmp_dir/deployer.toml" \
>"$tmp_dir/deployer.log" 2>&1 &
deployer_pid="$!"
@ -240,7 +264,7 @@ chainfire_endpoint="http://127.0.0.1:${api_port}"
deployer_endpoint="http://127.0.0.1:${deployer_port}"
run_deployer_ctl() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
run_deployer_ctl_bin \
--chainfire-endpoint "$chainfire_endpoint" \
--cluster-id test-cluster \
--cluster-namespace photoncloud \

View file

@ -7,6 +7,38 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
run_node_agent_bin() {
if [[ -n "${PHOTONCLOUD_NODE_AGENT_BIN:-}" ]]; then
"$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
fi
}
run_fleet_scheduler_bin() {
if [[ -n "${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}" ]]; then
"$PHOTONCLOUD_FLEET_SCHEDULER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
@ -104,7 +136,7 @@ EOF
mkdir -p "$tmp_dir/pids"
echo "Starting ChainFire on 127.0.0.1:${api_port}"
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
run_chainfire_server_bin \
--config "$tmp_dir/chainfire.toml" \
>"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
@ -256,7 +288,7 @@ EOF
endpoint="http://127.0.0.1:${api_port}"
run_deployer_ctl() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
run_deployer_ctl_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
"$@"
@ -266,7 +298,7 @@ run_node_agent_once() {
local node_id="$1"
local pid_dir="$tmp_dir/pids/$node_id"
mkdir -p "$pid_dir"
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- \
run_node_agent_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
--node-id "$node_id" \
@ -277,7 +309,7 @@ run_node_agent_once() {
}
run_scheduler_once() {
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- \
run_fleet_scheduler_bin \
--chainfire-endpoint "$endpoint" \
--cluster-id test-cluster \
--interval-secs 1 \

View file

@ -0,0 +1,431 @@
#!/usr/bin/env bash
set -euo pipefail
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
fi
# Run the chainfire-server binary: use a prebuilt binary when
# PHOTONCLOUD_CHAINFIRE_SERVER_BIN is set, otherwise build-and-run via cargo.
run_chainfire_server_bin() {
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
else
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
fi
}
# Run the deployer-ctl binary: use a prebuilt binary when
# PHOTONCLOUD_DEPLOYER_CTL_BIN is set, otherwise build-and-run via cargo.
run_deployer_ctl_bin() {
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
fi
}
# Run the plasmacloud-reconciler binary: use a prebuilt binary when
# PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN is set, otherwise build-and-run
# via cargo.
run_plasmacloud_reconciler_bin() {
if [[ -n "${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}" ]]; then
"$PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN" "$@"
else
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
fi
}
tmp_dir="$(mktemp -d)"
cf_pid=""
redfish_pid=""
# Tear down background daemons (mock Redfish first, then ChainFire) and
# remove the temp dir. Installed as an EXIT trap; tolerates processes that
# have already exited, hence set +e and the || true guards.
cleanup() {
set +e
if [[ -n "$redfish_pid" ]]; then
kill "$redfish_pid" 2>/dev/null || true
wait "$redfish_pid" 2>/dev/null || true
fi
if [[ -n "$cf_pid" ]]; then
kill "$cf_pid" 2>/dev/null || true
wait "$cf_pid" 2>/dev/null || true
fi
rm -rf "$tmp_dir"
}
trap cleanup EXIT
# Ask the OS for a free ephemeral TCP port on 127.0.0.1 and print it.
# NOTE(review): the port is released before the caller binds it, so a rare
# race with another process grabbing it is possible.
free_port() {
python3 - <<'PY'
import socket
s = socket.socket()
s.bind(("127.0.0.1", 0))
print(s.getsockname()[1])
s.close()
PY
}
# Block until HOST:PORT accepts a TCP connection, polling once per second.
# Usage: wait_for_port HOST PORT [TIMEOUT_SECS]  (timeout defaults to 60).
# Returns 0 on success; prints a message and returns 1 on timeout.
wait_for_port() {
local host="$1"
local port="$2"
local timeout_secs="${3:-60}"
local deadline=$((SECONDS + timeout_secs))
while (( SECONDS < deadline )); do
if python3 - "$host" "$port" <<'PY'
import socket
import sys
host = sys.argv[1]
port = int(sys.argv[2])
with socket.socket() as sock:
    sock.settimeout(0.5)
    try:
        sock.connect((host, port))
    except OSError:
        raise SystemExit(1)
raise SystemExit(0)
PY
then
return 0
fi
sleep 1
done
echo "timed out waiting for ${host}:${port}" >&2
return 1
}
api_port="$(free_port)"
http_port="$(free_port)"
raft_port="$(free_port)"
gossip_port="$(free_port)"
redfish_port="$(free_port)"
cat >"$tmp_dir/chainfire.toml" <<EOF
[node]
id = 1
name = "chainfire-1"
role = "control_plane"
[storage]
data_dir = "$tmp_dir/chainfire-data"
[network]
api_addr = "127.0.0.1:${api_port}"
http_addr = "127.0.0.1:${http_port}"
raft_addr = "127.0.0.1:${raft_port}"
gossip_addr = "127.0.0.1:${gossip_port}"
[cluster]
id = 1
initial_members = []
bootstrap = true
[raft]
role = "voter"
EOF
cat >"$tmp_dir/mock-redfish.py" <<'PY'
import http.server
import json
import sys
port = int(sys.argv[1])
log_path = sys.argv[2]
class Handler(http.server.BaseHTTPRequestHandler):
def log_message(self, format, *args):
pass
def do_GET(self):
if self.path == "/redfish/v1/Systems/node01":
body = json.dumps({"PowerState": "On"}).encode("utf-8")
self.send_response(200)
self.send_header("Content-Type", "application/json")
self.send_header("Content-Length", str(len(body)))
self.end_headers()
self.wfile.write(body)
return
self.send_error(404)
def do_POST(self):
if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
self.send_error(404)
return
length = int(self.headers.get("Content-Length", "0"))
payload = self.rfile.read(length).decode("utf-8")
with open(log_path, "a", encoding="utf-8") as handle:
handle.write(payload + "\n")
self.send_response(204)
self.end_headers()
server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
server.serve_forever()
PY
echo "Starting ChainFire on 127.0.0.1:${api_port}"
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
cf_pid="$!"
wait_for_port "127.0.0.1" "$api_port" 120
wait_for_port "127.0.0.1" "$http_port" 120
echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
redfish_pid="$!"
wait_for_port "127.0.0.1" "$redfish_port" 30
cat >"$tmp_dir/cluster.yaml" <<EOF
cluster:
cluster_id: test-cluster
environment: dev
node_classes:
- name: worker-linux
roles:
- worker
labels:
tier: general
pools:
- name: general
node_class: worker-linux
labels:
env: dev
nodes:
- node_id: node01
hostname: node01
ip: 10.0.0.11
roles:
- worker
labels:
tier: general
pool: general
node_class: worker-linux
state: active
commission_state: commissioned
install_state: installed
bmc_ref: "redfish+http://127.0.0.1:${redfish_port}/redfish/v1/Systems/node01"
- node_id: node02
hostname: node02
ip: 10.0.0.12
roles:
- worker
labels:
tier: general
pool: general
node_class: worker-linux
state: active
commission_state: commissioned
install_state: installed
host_deployments:
- name: worker-rollout
selector:
roles:
- worker
pools:
- general
node_classes:
- worker-linux
match_labels:
tier: general
nixos_configuration: worker-next
flake_ref: "github:centra/cloud"
batch_size: 1
max_unavailable: 1
health_check_command:
- "true"
switch_action: switch
rollback_on_failure: true
EOF
chainfire_endpoint="http://127.0.0.1:${api_port}"
# Invoke deployer-ctl with the shared endpoint/cluster/namespace flags for
# this test cluster pre-applied; extra arguments are passed through.
run_deployer_ctl() {
run_deployer_ctl_bin \
--chainfire-endpoint "$chainfire_endpoint" \
--cluster-id test-cluster \
--cluster-namespace photoncloud \
--deployer-namespace deployer \
"$@"
}
# Run a single (--once) host-deployment reconciliation pass against the
# local ChainFire endpoint for the test cluster.
run_hosts_once() {
run_plasmacloud_reconciler_bin \
hosts \
--endpoint "$chainfire_endpoint" \
--cluster-namespace photoncloud \
--cluster-id test-cluster \
--heartbeat-timeout-secs 300 \
--once
}
echo "Applying host lifecycle cluster config"
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune
echo "Running host rollout controller"
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-1.json"
python3 - "$tmp_dir/deployment-1.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["phase"] == "running", payload
assert status["in_progress_nodes"] == ["node01"], payload
assert status["failed_nodes"] == [], payload
print("initial rollout wave validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
import json
import sys
desired = {}
with open(sys.argv[1], "r", encoding="utf-8") as handle:
for line in handle:
if " key=" not in line or " value=" not in line:
continue
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
if not key.endswith("/desired-system"):
continue
payload = json.loads(line.split(" value=", 1)[1])
desired[payload["node_id"]] = payload
assert sorted(desired) == ["node01"], desired
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
print("desired-system first wave validated")
PY
echo "Pausing and resuming deployment via CLI"
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
python3 - "$tmp_dir/pause.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is True, payload
assert payload["paused_by_operator"] is True, payload
print("pause command validated")
PY
run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
python3 - "$tmp_dir/resume.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["paused"] is False, payload
assert payload["paused_by_operator"] is False, payload
print("resume command validated")
PY
echo "Marking node01 rollout complete and reconciling next wave"
run_deployer_ctl node set-observed \
--node-id node01 \
--status active \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
python3 - "$tmp_dir/deployment-2.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["completed_nodes"] == ["node01"], payload
assert status["in_progress_nodes"] == ["node02"], payload
print("second rollout wave validated")
PY
echo "Marking node02 rollout failed and validating auto-pause"
run_deployer_ctl node set-observed \
--node-id node02 \
--status rolled-back \
--nixos-configuration worker-next >/dev/null
run_hosts_once
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
python3 - "$tmp_dir/deployment-3.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload["status"]
assert status["paused"] is True, payload
assert status["failed_nodes"] == ["node02"], payload
print("auto-pause on failure validated")
PY
echo "Refreshing power state through Redfish"
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
python3 - "$tmp_dir/node-power.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["power_state"] == "on", payload
print("power refresh validated")
PY
echo "Requesting reinstall with power cycle"
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["state"] == "provisioning", payload
assert payload["install_state"] == "reinstall_requested", payload
assert payload["power_state"] == "cycling", payload
lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
print("reinstall orchestration validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
import sys
lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
assert not any("/desired-system" in line for line in lines), lines
assert not any("/observed-system" in line for line in lines), lines
print("reinstall state cleanup validated")
PY
echo "Aborting deployment and clearing desired-system"
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
python3 - "$tmp_dir/abort.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
assert payload["phase"] == "aborted", payload
assert payload["paused"] is True, payload
print("abort command validated")
PY
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
import json
import sys
desired_nodes = []
with open(sys.argv[1], "r", encoding="utf-8") as handle:
for line in handle:
if " key=" not in line or " value=" not in line:
continue
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
if not key.endswith("/desired-system"):
continue
payload = json.loads(line.split(" value=", 1)[1])
if payload.get("deployment_id") == "worker-rollout":
desired_nodes.append(payload["node_id"])
assert desired_nodes == [], desired_nodes
print("desired-system cleanup validated")
PY
echo "Host lifecycle E2E verification passed"

View file

@ -1,9 +1,9 @@
# Storage Benchmarks
Generated on 2026-03-10T20:02:00+09:00 with:
Generated on 2026-03-27T12:08:47+09:00 with:
```bash
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
nix run ./nix/test-cluster#cluster -- bench-storage
```
## CoronaFS
@ -12,30 +12,35 @@ Cluster network baseline, measured with `iperf3` from `node04` to `node01` befor
| Metric | Result |
|---|---:|
| TCP throughput | 22.83 MiB/s |
| TCP retransmits | 78 |
| TCP throughput | 45.92 MiB/s |
| TCP retransmits | 193 |
Measured from `node04`.
Local worker disk is the baseline. CoronaFS is the shared block volume path used for mutable VM disks, exported from `node01` over NBD.
Local worker disk is the baseline. CoronaFS now has two relevant data paths in the lab: the controller export sourced from `node01`, and the node-local export materialized onto the worker that actually attaches the mutable VM disk.
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Sequential write | 26.36 MiB/s | 5.24 MiB/s |
| Sequential read | 348.77 MiB/s | 10.08 MiB/s |
| 4k random read | 1243 IOPS | 145 IOPS |
| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Sequential write | 679.05 MiB/s | 30.35 MiB/s | 395.06 MiB/s |
| Sequential read | 2723.40 MiB/s | 42.70 MiB/s | 709.14 MiB/s |
| 4k random read | 16958 IOPS | 2034 IOPS | 5087 IOPS |
| 4k queued random read (`iodepth=32`) | 106026 IOPS | 14261 IOPS | 28898 IOPS |
Queue-depth profile (`libaio`, `iodepth=32`) from the same worker:
| Metric | Local Disk | CoronaFS |
|---|---:|---:|
| Depth-32 write | 27.12 MiB/s | 11.42 MiB/s |
| Depth-32 read | 4797.47 MiB/s | 10.06 MiB/s |
| Metric | Local Disk | Controller Export | Node-local Export |
|---|---:|---:|---:|
| Depth-32 write | 3417.45 MiB/s | 39.26 MiB/s | 178.04 MiB/s |
| Depth-32 read | 12996.47 MiB/s | 55.71 MiB/s | 112.88 MiB/s |
Cross-worker shared-volume visibility, measured by writing on `node04` and reading from `node05` over the same CoronaFS NBD export:
Node-local materialization timing and target-node steady-state read path:
| Metric | Result |
|---|---:|
| Cross-worker sequential read | 17.72 MiB/s |
| Node04 materialize latency | 9.23 s |
| Node05 materialize latency | 5.82 s |
| Node05 node-local sequential read | 709.14 MiB/s |
PlasmaVMC now prefers the worker-local CoronaFS export for mutable node-local volumes, even when the underlying materialization is a qcow2 overlay. The VM runtime section below is therefore the closest end-to-end proxy for real local-attach VM I/O, while the node-local export numbers remain useful for CoronaFS service consumers and for diagnosing exporter overhead.
## LightningStor
@ -46,16 +51,16 @@ Cluster network baseline for this client, measured with `iperf3` from `node03` t
| Metric | Result |
|---|---:|
| TCP throughput | 18.35 MiB/s |
| TCP retransmits | 78 |
| TCP throughput | 45.99 MiB/s |
| TCP retransmits | 207 |
### Large-object path
| Metric | Result |
|---|---:|
| Object size | 256 MiB |
| Upload throughput | 8.11 MiB/s |
| Download throughput | 7.54 MiB/s |
| Upload throughput | 18.20 MiB/s |
| Download throughput | 39.21 MiB/s |
### Small-object batch
@ -63,10 +68,10 @@ Measured as 32 objects of 4 MiB each (128 MiB total).
| Metric | Result |
|---|---:|
| Batch upload throughput | 0.81 MiB/s |
| Batch download throughput | 0.83 MiB/s |
| PUT rate | 0.20 objects/s |
| GET rate | 0.21 objects/s |
| Batch upload throughput | 18.96 MiB/s |
| Batch download throughput | 39.88 MiB/s |
| PUT rate | 4.74 objects/s |
| GET rate | 9.97 objects/s |
### Parallel small-object batch
@ -74,34 +79,57 @@ Measured as the same 32 objects of 4 MiB each, but with 8 concurrent client jobs
| Metric | Result |
|---|---:|
| Parallel batch upload throughput | 3.03 MiB/s |
| Parallel batch download throughput | 2.89 MiB/s |
| Parallel PUT rate | 0.76 objects/s |
| Parallel GET rate | 0.72 objects/s |
| Parallel batch upload throughput | 16.23 MiB/s |
| Parallel batch download throughput | 26.07 MiB/s |
| Parallel PUT rate | 4.06 objects/s |
| Parallel GET rate | 6.52 objects/s |
## VM Image Path
Measured against the real `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` path on `node01`.
Measured against the `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` clone path on `node01`.
| Metric | Result |
|---|---:|
| Guest image artifact size | 2017 MiB |
| Guest image virtual size | 4096 MiB |
| `CreateImage` latency | 176.03 s |
| First image-backed `CreateVolume` latency | 76.51 s |
| Second image-backed `CreateVolume` latency | 170.49 s |
| `CreateImage` latency | 66.49 s |
| First image-backed `CreateVolume` latency | 16.86 s |
| Second image-backed `CreateVolume` latency | 0.12 s |
## VM Runtime Path
Measured against the real `StartVm -> qemu attach -> guest boot -> guest fio` path on a worker node, using a CoronaFS-backed root disk and data disk.
| Metric | Result |
|---|---:|
| `StartVm` to qemu attach | 0.60 s |
| `StartVm` to guest benchmark result | 35.69 s |
| Guest sequential write | 123.49252223968506 MiB/s |
| Guest sequential read | 1492.7113695144653 MiB/s |
| Guest 4k random read | 25550 IOPS |
## Assessment
- CoronaFS shared-volume reads are currently 2.9% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS 4k random reads are currently 11.7% of the measured local-disk baseline.
- CoronaFS cross-worker reads are currently 5.1% of the measured local-disk sequential-read baseline, which is the more relevant signal for VM restart and migration paths.
- CoronaFS sequential reads are currently 44.2% of the measured node04->node01 TCP baseline, which helps separate NBD/export overhead from raw cluster-network limits.
- CoronaFS depth-32 reads are currently 0.2% of the local depth-32 baseline, which is a better proxy for queued guest I/O than the single-depth path.
- The shared-volume path is functionally correct for mutable VM disks and migration tests, but its read-side throughput is still too low to call production-ready for heavier VM workloads.
- LightningStor's replicated S3 path is working correctly, but 8.11 MiB/s upload and 7.54 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently 41.1% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- LightningStor's small-object batch path is also functional, but 0.20 PUT/s and 0.21 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 0.76 PUT/s and 0.72 GET/s.
- The VM image path is now measured directly rather than inferred. The cold `CreateVolume` path includes artifact fetch plus CoronaFS population; the warm `CreateVolume` path isolates repeated CoronaFS population from an already cached image.
- CoronaFS controller-export reads are currently 1.6% of the measured local-disk baseline on this nested-QEMU lab cluster.
- CoronaFS controller-export 4k random reads are currently 12.0% of the measured local-disk baseline.
- CoronaFS controller-export queued 4k random reads are currently 13.5% of the measured local queued-random-read baseline.
- CoronaFS controller-export sequential reads are currently 93.0% of the measured node04->node01 TCP baseline, which isolates the centralized source path from raw cluster-network limits.
- CoronaFS controller-export depth-32 reads are currently 0.4% of the local depth-32 baseline.
- CoronaFS node-local reads are currently 26.0% of the measured local-disk baseline, which is the more relevant steady-state signal for mutable VM disks after attachment.
- CoronaFS node-local 4k random reads are currently 30.0% of the measured local-disk baseline.
- CoronaFS node-local queued 4k random reads are currently 27.3% of the measured local queued-random-read baseline.
- CoronaFS node-local depth-32 reads are currently 0.9% of the local depth-32 baseline.
- The target worker's node-local read path is 26.0% of the measured local sequential-read baseline after materialization, which is the better proxy for restart and migration steady state than the old shared-export read.
- PlasmaVMC now attaches writable node-local volumes through the worker-local CoronaFS export, so the guest-runtime section should be treated as the real local VM steady-state path rather than the node-local export numbers alone.
- CoronaFS single-depth writes remain sensitive to the nested-QEMU/VDE lab transport, so the queued-depth and guest-runtime numbers are still the more reliable proxy for real VM workload behavior than the single-stream write figure alone.
- The central export path is now best understood as a source/materialization path; the worker-local export is the path that should determine VM-disk readiness going forward.
- LightningStor's replicated S3 path is working correctly, but 18.20 MiB/s upload and 39.21 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
- LightningStor large-object downloads are currently 85.3% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
- The current S3 frontend tuning baseline is the built-in 16 MiB streaming threshold with multipart PUT/FETCH concurrency of 8; that combination is the best default observed on this lab cluster so far.
- LightningStor uploads should be read against the replication write quorum and the same ~45.99 MiB/s lab network ceiling; this environment still limits end-to-end throughput well before modern bare-metal NICs would.
- LightningStor's small-object batch path is also functional, but 4.74 PUT/s and 9.97 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 4.06 PUT/s and 6.52 GET/s.
- The VM image section measures clone/materialization cost, not guest runtime I/O.
- The PlasmaVMC local image-backed clone fast path is now active again; a 0.12 s second clone indicates the CoronaFS qcow2 backing-file path is being hit on node01 rather than falling back to eager raw materialization.
- The VM runtime section is the real `PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel` path; use it to judge whether QEMU/NBD tuning is helping.
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.

574
fiberlb/Cargo.lock generated

File diff suppressed because it is too large Load diff

17
flake.lock generated
View file

@ -76,7 +76,8 @@
"flake-utils": "flake-utils",
"nix-nos": "nix-nos",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
"rust-overlay": "rust-overlay",
"systems": "systems_2"
}
},
"rust-overlay": {
@ -113,6 +114,20 @@
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
}
},
"root": "root",

467
flake.nix
View file

@ -33,7 +33,7 @@
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos }:
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
flake-utils.lib.eachDefaultSystem (system:
let
# Apply rust-overlay to get rust-bin attribute
@ -139,6 +139,301 @@
);
};
flakeInputsBlock = ''
inputs = {
# Use unstable nixpkgs for latest packages
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
# Rust overlay for managing Rust toolchains
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
# Flake utilities for multi-system support
flake-utils.url = "github:numtide/flake-utils";
# Disko for declarative disk partitioning
disko = {
url = "github:nix-community/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
# Nix-NOS generic network operating system modules
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
'';
bundledInputsBlock = ''
inputs = {
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
rust-overlay = {
url = "path:./.bundle-inputs/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "path:./.bundle-inputs/flake-utils";
inputs.systems.follows = "systems";
};
systems.url = "path:./.bundle-inputs/systems";
disko = {
url = "path:./.bundle-inputs/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
'';
flakeHeaderBlock = ''
# ============================================================================
# INPUTS: External dependencies
# ============================================================================
inputs = {
# Use unstable nixpkgs for latest packages
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
# Rust overlay for managing Rust toolchains
rust-overlay = {
url = "github:oxalica/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
# Flake utilities for multi-system support
flake-utils.url = "github:numtide/flake-utils";
# Disko for declarative disk partitioning
disko = {
url = "github:nix-community/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
# Nix-NOS generic network operating system modules
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
'';
bundledHeaderBlock = ''
# ============================================================================
# INPUTS: External dependencies
# ============================================================================
inputs = {
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
rust-overlay = {
url = "path:./.bundle-inputs/rust-overlay";
inputs.nixpkgs.follows = "nixpkgs";
};
flake-utils = {
url = "path:./.bundle-inputs/flake-utils";
inputs.systems.follows = "systems";
};
systems.url = "path:./.bundle-inputs/systems";
disko = {
url = "path:./.bundle-inputs/disko";
inputs.nixpkgs.follows = "nixpkgs";
};
nix-nos = {
url = "path:./nix-nos";
inputs.nixpkgs.follows = "nixpkgs";
};
};
# ============================================================================
# OUTPUTS: What this flake provides
# ============================================================================
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
'';
bundledFlakeNix =
pkgs.writeText
"plasmacloud-bundled-flake.nix"
(
builtins.replaceStrings
[ flakeHeaderBlock ]
[ bundledHeaderBlock ]
(builtins.readFile ./flake.nix)
);
bundledFlakeHeaderFile =
pkgs.writeText "plasmacloud-bundled-flake-header" bundledHeaderBlock;
baseFlakeLock = builtins.fromJSON (builtins.readFile ./flake.lock);
bundleInputRelPaths = {
nixpkgs = "./.bundle-inputs/nixpkgs";
"rust-overlay" = "./.bundle-inputs/rust-overlay";
"flake-utils" = "./.bundle-inputs/flake-utils";
disko = "./.bundle-inputs/disko";
systems = "./.bundle-inputs/systems";
};
fetchLockedInput =
nodeName:
let
tree = builtins.fetchTree baseFlakeLock.nodes.${nodeName}.locked;
in
if builtins.isAttrs tree && tree ? outPath then tree.outPath else tree;
vendoredFlakeInputs = {
nixpkgs = fetchLockedInput "nixpkgs";
"rust-overlay" = fetchLockedInput "rust-overlay";
"flake-utils" = fetchLockedInput "flake-utils";
disko = fetchLockedInput "disko";
systems = fetchLockedInput "systems";
};
makeBundledLockNode =
nodeName: relPath:
let
node = baseFlakeLock.nodes.${nodeName};
in
node
// {
locked = {
type = "path";
path = relPath;
};
original = {
type = "path";
path = relPath;
};
};
bundledFlakeLock = baseFlakeLock // {
nodes =
baseFlakeLock.nodes
// {
root =
baseFlakeLock.nodes.root
// {
inputs =
baseFlakeLock.nodes.root.inputs
// {
systems = "systems";
};
};
nixpkgs = makeBundledLockNode "nixpkgs" bundleInputRelPaths.nixpkgs;
"rust-overlay" = makeBundledLockNode "rust-overlay" bundleInputRelPaths."rust-overlay";
"flake-utils" = makeBundledLockNode "flake-utils" bundleInputRelPaths."flake-utils";
disko = makeBundledLockNode "disko" bundleInputRelPaths.disko;
systems = makeBundledLockNode "systems" bundleInputRelPaths.systems;
};
};
bundledFlakeLockFile =
pkgs.writeText "plasmacloud-bundled-flake.lock" (builtins.toJSON bundledFlakeLock);
inBundledEval = builtins.pathExists ./.bundle-eval-marker;
bundledFlakeRootDrv = pkgs.runCommand "plasmacloud-bundled-flake-root" {
nativeBuildInputs = [
pkgs.coreutils
pkgs.python3
];
} ''
mkdir -p "$out"
cp -a ${flakeBundleSrc}/. "$out"/
chmod -R u+w "$out"
touch "$out/.bundle-eval-marker"
mkdir -p "$out/.bundle-inputs"
cp -a ${vendoredFlakeInputs.nixpkgs} "$out/.bundle-inputs/nixpkgs"
cp -a ${vendoredFlakeInputs."rust-overlay"} "$out/.bundle-inputs/rust-overlay"
cp -a ${vendoredFlakeInputs."flake-utils"} "$out/.bundle-inputs/flake-utils"
cp -a ${vendoredFlakeInputs.disko} "$out/.bundle-inputs/disko"
cp -a ${vendoredFlakeInputs.systems} "$out/.bundle-inputs/systems"
cp ${bundledFlakeLockFile} "$out/flake.lock"
python3 - <<'PY' "$out/flake.nix" ${bundledFlakeHeaderFile}
from pathlib import Path
import re
import sys
flake_path = Path(sys.argv[1])
header = Path(sys.argv[2]).read_text()
source = flake_path.read_text()
pattern = re.compile(
r" # ============================================================================\n"
r" # INPUTS: External dependencies\n"
r" # ============================================================================\n"
r" inputs = \{.*?\n"
r" # ============================================================================\n"
r" # OUTPUTS: What this flake provides\n"
r" # ============================================================================\n"
r" outputs = \{ self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems \? null \}:",
re.S,
)
rewritten, count = pattern.subn(header.rstrip("\n"), source, count=1)
if count != 1:
raise SystemExit(f"expected to rewrite 1 flake header, rewrote {count}")
flake_path.write_text(rewritten)
PY
'';
bundledFlakeRoot =
if inBundledEval then
null
else
builtins.path {
path = bundledFlakeRootDrv;
name = "plasmacloud-bundled-flake-root-src";
};
bundledFlakeRootNarHashFile =
if inBundledEval then
null
else
pkgs.runCommand "plasmacloud-bundled-flake-root-narhash" {
nativeBuildInputs = [ pkgs.nix ];
} ''
${pkgs.nix}/bin/nix \
--extra-experimental-features nix-command \
hash path --sri ${bundledFlakeRoot} \
| tr -d '\n' > "$out"
'';
bundledFlakeRootNarHash =
if inBundledEval then
null
else
builtins.readFile bundledFlakeRootNarHashFile;
bundledFlake =
if inBundledEval then
null
else
builtins.getFlake (
builtins.unsafeDiscardStringContext
"path:${toString bundledFlakeRoot}?narHash=${bundledFlakeRootNarHash}"
);
bundledVmSmokeTargetToplevel =
if inBundledEval then
null
else
bundledFlake.nixosConfigurations.vm-smoke-target.config.system.build.toplevel;
# Helper function to build a Rust workspace package
# Parameters:
# name: package name (e.g., "chainfire-server")
@ -434,16 +729,31 @@
description = "Node-local NixOS reconciliation agent for PhotonCloud hosts";
};
plasmacloud-reconciler = buildRustWorkspace {
name = "plasmacloud-reconciler";
workspaceSubdir = "deployer";
mainCrate = "plasmacloud-reconciler";
description = "Declarative reconciler for host rollouts and published resources";
};
plasmacloudFlakeBundle = pkgs.runCommand "plasmacloud-flake-bundle.tar.gz" {
nativeBuildInputs = [ pkgs.gnutar pkgs.gzip ];
nativeBuildInputs = [
pkgs.coreutils
pkgs.gnutar
pkgs.gzip
];
} ''
bundle_root="$(mktemp -d)"
cp -a ${bundledFlakeRootDrv}/. "$bundle_root"/
chmod -R u+w "$bundle_root"
tar \
--sort=name \
--mtime='@1' \
--owner=0 \
--group=0 \
--numeric-owner \
-C ${flakeBundleSrc} \
-C "$bundle_root" \
-cf - . \
| gzip -n > "$out"
'';
@ -462,6 +772,7 @@
self.nixosConfigurations.node01.config.system.build.plasmacloudDeployerClusterState;
vmClusterFlakeBundle = self.packages.${system}.plasmacloudFlakeBundle;
vmSmokeBundledTargetToplevel = bundledVmSmokeTargetToplevel;
# --------------------------------------------------------------------
# Default package: Build all servers
@ -484,6 +795,7 @@
self.packages.${system}.k8shost-server
self.packages.${system}.deployer-server
self.packages.${system}.deployer-ctl
self.packages.${system}.plasmacloud-reconciler
self.packages.${system}.nix-agent
self.packages.${system}.node-agent
self.packages.${system}.fleet-scheduler
@ -556,6 +868,10 @@
drv = self.packages.${system}.deployer-ctl;
};
plasmacloud-reconciler = flake-utils.lib.mkApp {
drv = self.packages.${system}.plasmacloud-reconciler;
};
nix-agent = flake-utils.lib.mkApp {
drv = self.packages.${system}.nix-agent;
};
@ -568,6 +884,144 @@
drv = self.packages.${system}.fleet-scheduler;
};
};
checks = {
deployer-vm-smoke = pkgs.testers.runNixOSTest (
import ./nix/tests/deployer-vm-smoke.nix {
inherit pkgs;
photoncloudPackages = self.packages.${system};
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
}
);
deployer-vm-rollback = pkgs.testers.runNixOSTest (
import ./nix/tests/deployer-vm-smoke.nix {
inherit pkgs;
photoncloudPackages = self.packages.${system};
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
desiredSystemOverrides = {
health_check_command = [ "false" ];
rollback_on_failure = true;
};
expectedStatus = "rolled-back";
expectCurrentSystemMatchesTarget = false;
expectMarkerPresent = false;
}
);
deployer-bootstrap-e2e = pkgs.runCommand "deployer-bootstrap-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_SERVER_BIN =
"${self.packages.${system}.deployer-server}/bin/deployer-server";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-deployer-bootstrap-e2e.sh}
touch "$out"
'';
host-lifecycle-e2e = pkgs.runCommand "host-lifecycle-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN =
"${self.packages.${system}.plasmacloud-reconciler}/bin/plasmacloud-reconciler";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-host-lifecycle-e2e.sh}
touch "$out"
'';
fleet-scheduler-e2e = pkgs.runCommand "fleet-scheduler-e2e" {
nativeBuildInputs = with pkgs; [
bash
coreutils
curl
findutils
gawk
gnugrep
gnused
procps
python3
];
PHOTONCLOUD_E2E_IN_NIX = "1";
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
"${self.packages.${system}.chainfire-server}/bin/chainfire";
PHOTONCLOUD_DEPLOYER_CTL_BIN =
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
PHOTONCLOUD_NODE_AGENT_BIN =
"${self.packages.${system}.node-agent}/bin/node-agent";
PHOTONCLOUD_FLEET_SCHEDULER_BIN =
"${self.packages.${system}.fleet-scheduler}/bin/fleet-scheduler";
} ''
export HOME="$TMPDIR/home"
mkdir -p "$HOME"
export PATH="${pkgs.lib.makeBinPath [
pkgs.bash
pkgs.coreutils
pkgs.curl
pkgs.findutils
pkgs.gawk
pkgs.gnugrep
pkgs.gnused
pkgs.procps
pkgs.python3
]}"
bash ${./deployer/scripts/verify-fleet-scheduler-e2e.sh}
touch "$out"
'';
};
}
) // {
# ========================================================================
@ -606,6 +1060,12 @@
modules = [ ./nix/images/netboot-base.nix ];
};
# Offline-friendly target used by deployer VM smoke tests.
vm-smoke-target = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
modules = [ ./nix/images/deployer-vm-smoke-target.nix ];
};
# PlasmaCloud ISO (T061.S5 - bootable ISO with cluster-config embedding)
plasmacloud-iso = nixpkgs.lib.nixosSystem {
system = "x86_64-linux";
@ -732,6 +1192,7 @@
k8shost-server = self.packages.${final.system}.k8shost-server;
deployer-server = self.packages.${final.system}.deployer-server;
deployer-ctl = self.packages.${final.system}.deployer-ctl;
plasmacloud-reconciler = self.packages.${final.system}.plasmacloud-reconciler;
plasmacloudFlakeBundle = self.packages.${final.system}.plasmacloudFlakeBundle;
nix-agent = self.packages.${final.system}.nix-agent;
node-agent = self.packages.${final.system}.node-agent;

View file

@ -9,7 +9,7 @@ use flaredb_proto::kvrpc::{
use flaredb_proto::pdpb::Store;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{SystemTime, UNIX_EPOCH};
use std::time::{Instant, SystemTime, UNIX_EPOCH};
use serde::Deserialize;
use tokio::sync::Mutex;
use tonic::transport::Channel;
@ -35,6 +35,7 @@ pub struct RdbClient {
chainfire_kv_client: Option<ChainfireKvClient<Channel>>,
region_cache: RegionCache,
chainfire_route_cache: Arc<Mutex<Option<ChainfireRouteSnapshot>>>,
namespace: String,
}
@ -53,10 +54,18 @@ struct ChainfireRegionInfo {
leader_id: u64,
}
#[derive(Debug, Clone)]
/// Point-in-time view of ChainFire routing metadata, cached by
/// `RdbClient::chainfire_route_snapshot` and aged out via
/// `CHAINFIRE_ROUTE_CACHE_TTL`.
struct ChainfireRouteSnapshot {
    /// Store id -> store info; looked up by `region.leader_id` when
    /// resolving a key to a leader address.
    stores: HashMap<u64, ChainfireStoreInfo>,
    /// Region descriptors scanned linearly for key -> region routing.
    regions: Vec<ChainfireRegionInfo>,
    /// When this snapshot was fetched; compared against the cache TTL.
    fetched_at: Instant,
}
impl RdbClient {
const ROUTE_RETRY_LIMIT: usize = 12;
const ROUTE_RETRY_BASE_DELAY_MS: u64 = 100;
const ROUTED_RPC_TIMEOUT: Duration = Duration::from_secs(1);
const CHAINFIRE_ROUTE_CACHE_TTL: Duration = Duration::from_secs(2);
pub async fn connect_with_pd(
_server_addr: String,
@ -70,26 +79,43 @@ impl RdbClient {
pd_addr: String,
namespace: impl Into<String>,
) -> Result<Self, tonic::transport::Error> {
let pd_endpoints = parse_transport_endpoints(&pd_addr);
let normalized_server_addr = normalize_transport_addr(&server_addr);
// A number of in-repo callers still pass the same address for both server and PD.
// In that case, prefer direct routing and skip the PD lookup path entirely.
let direct_addr = if !server_addr.is_empty() && server_addr == pd_addr {
Some(server_addr)
let direct_addr = if !normalized_server_addr.is_empty()
&& pd_endpoints
.iter()
.any(|endpoint| normalize_transport_addr(endpoint) == normalized_server_addr)
{
Some(normalized_server_addr.clone())
} else {
None
};
let (tso_client, pd_client, chainfire_kv_client) = if direct_addr.is_some() {
(None, None, None)
} else {
let pd_channel = Channel::from_shared(transport_endpoint(&pd_addr))
.unwrap()
.connect()
.await?;
let mut last_error = None;
let mut clients = None;
for endpoint in &pd_endpoints {
let pd_channel = match Channel::from_shared(transport_endpoint(endpoint)) {
Ok(endpoint) => match endpoint.connect().await {
Ok(channel) => channel,
Err(error) => {
last_error = Some(error);
continue;
}
},
Err(_) => {
continue;
}
};
let mut probe_client = PdClient::new(pd_channel.clone());
let probe = probe_client
.get_region(GetRegionRequest { key: Vec::new() })
.await;
match probe {
clients = Some(match probe {
Err(status) if status.code() == tonic::Code::Unimplemented => (
None,
None,
@ -100,6 +126,21 @@ impl RdbClient {
Some(PdClient::new(pd_channel)),
None,
),
});
break;
}
if let Some(clients) = clients {
clients
} else if let Some(error) = last_error {
return Err(error);
} else {
return Err(
Channel::from_shared("http://127.0.0.1:1".to_string())
.unwrap()
.connect()
.await
.expect_err("unreachable fallback endpoint should fail to connect"),
);
}
};
@ -111,6 +152,7 @@ impl RdbClient {
chainfire_kv_client,
region_cache: RegionCache::new(),
namespace: namespace.into(),
chainfire_route_cache: Arc::new(Mutex::new(None)),
})
}
@ -119,17 +161,51 @@ impl RdbClient {
server_addr: String,
namespace: impl Into<String>,
) -> Result<Self, tonic::transport::Error> {
let ep = transport_endpoint(&server_addr);
let channel = Channel::from_shared(ep).unwrap().connect().await?;
let direct_endpoints = parse_transport_endpoints(&server_addr);
let mut last_error = None;
let mut selected_addr = None;
let mut channel = None;
for endpoint in &direct_endpoints {
match Channel::from_shared(transport_endpoint(endpoint)) {
Ok(endpoint_builder) => match endpoint_builder.connect().await {
Ok(connected) => {
selected_addr = Some(endpoint.clone());
channel = Some(connected);
break;
}
Err(error) => {
last_error = Some(error);
}
},
Err(_) => {}
}
}
let selected_addr = if let Some(addr) = selected_addr {
addr
} else if let Some(error) = last_error {
return Err(error);
} else {
return Err(
Channel::from_shared("http://127.0.0.1:1".to_string())
.unwrap()
.connect()
.await
.expect_err("unreachable fallback endpoint should fail to connect"),
);
};
let channel = channel.expect("direct connect should produce a channel when selected");
Ok(Self {
channels: Arc::new(Mutex::new(HashMap::new())),
direct_addr: Some(server_addr),
direct_addr: Some(selected_addr),
tso_client: Some(TsoClient::new(channel.clone())),
pd_client: Some(PdClient::new(channel)),
chainfire_kv_client: None,
region_cache: RegionCache::new(),
namespace: namespace.into(),
chainfire_route_cache: Arc::new(Mutex::new(None)),
})
}
@ -165,6 +241,7 @@ impl RdbClient {
}
self.region_cache.clear().await;
self.invalidate_chainfire_route_cache().await;
if let Some(chainfire_kv_client) = &self.chainfire_kv_client {
return self.resolve_addr_via_chainfire(key, chainfire_kv_client.clone()).await;
@ -183,10 +260,6 @@ impl RdbClient {
Err(tonic::Status::not_found("region not found"))
}
async fn get_channel(&self, addr: &str) -> Result<Channel, tonic::transport::Error> {
Self::get_channel_from_map(&self.channels, addr).await
}
async fn get_channel_from_map(
channels: &Arc<Mutex<HashMap<String, Channel>>>,
addr: &str,
@ -207,6 +280,73 @@ impl RdbClient {
map.remove(addr);
}
/// Drop the cached ChainFire routing snapshot so the next routed
/// lookup re-fetches region/store metadata instead of reusing a
/// possibly-stale view.
async fn invalidate_chainfire_route_cache(&self) {
    let mut cache = self.chainfire_route_cache.lock().await;
    *cache = None;
}
/// Return a ChainFire routing snapshot, serving from the shared cache
/// when it is younger than `CHAINFIRE_ROUTE_CACHE_TTL`.
///
/// When `force_refresh` is set, or the cached snapshot is missing or
/// expired, regions and stores are re-fetched from ChainFire and the
/// cache is repopulated. Note the mutex is NOT held across the two
/// RPCs, so concurrent callers may refresh in parallel; the last
/// writer wins, which is acceptable for a freshness cache.
async fn chainfire_route_snapshot(
    &self,
    mut kv_client: ChainfireKvClient<Channel>,
    force_refresh: bool,
) -> Result<ChainfireRouteSnapshot, tonic::Status> {
    if !force_refresh {
        // Fast path: reuse the cached snapshot while it is within TTL.
        if let Some(snapshot) = self.chainfire_route_cache.lock().await.clone() {
            if snapshot.fetched_at.elapsed() <= Self::CHAINFIRE_ROUTE_CACHE_TTL {
                return Ok(snapshot);
            }
        }
    }

    // Slow path: fetch fresh region and store listings from ChainFire.
    let regions = list_chainfire_regions(&mut kv_client).await?;
    let stores = list_chainfire_stores(&mut kv_client).await?;
    let snapshot = ChainfireRouteSnapshot {
        stores,
        regions,
        fetched_at: Instant::now(),
    };

    let mut cache = self.chainfire_route_cache.lock().await;
    *cache = Some(snapshot.clone());
    Ok(snapshot)
}
/// Resolve `key` to its owning region and that region's leader store
/// using a previously fetched routing snapshot.
///
/// A region matches when `key` falls in `[start_key, end_key)`, where
/// an empty `start_key`/`end_key` means unbounded on that side. The
/// first matching region (snapshot order) wins. Returns
/// `Status::not_found` when no region covers the key, or when the
/// matched region's `leader_id` has no entry in `snapshot.stores`.
fn resolve_addr_from_chainfire_snapshot(
    &self,
    key: &[u8],
    snapshot: &ChainfireRouteSnapshot,
) -> Result<(Region, Store), tonic::Status> {
    let region = snapshot
        .regions
        .iter()
        .find(|region| {
            // Empty boundary keys act as -inf / +inf respectively.
            let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
            let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
            start_ok && end_ok
        })
        .cloned()
        .ok_or_else(|| tonic::Status::not_found("region not found"))?;
    let leader = snapshot
        .stores
        .get(&region.leader_id)
        .cloned()
        .ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
    Ok((
        Region {
            id: region.id,
            start_key: region.start_key,
            end_key: region.end_key,
            peers: region.peers,
            leader_id: region.leader_id,
        },
        Store {
            id: leader.id,
            addr: leader.addr,
        },
    ))
}
async fn with_routed_addr<T, F, Fut>(&self, key: &[u8], mut op: F) -> Result<T, tonic::Status>
where
F: FnMut(String) -> Fut,
@ -590,41 +730,21 @@ impl RdbClient {
async fn resolve_addr_via_chainfire(
&self,
key: &[u8],
mut kv_client: ChainfireKvClient<Channel>,
kv_client: ChainfireKvClient<Channel>,
) -> Result<String, tonic::Status> {
let regions = list_chainfire_regions(&mut kv_client).await?;
let stores = list_chainfire_stores(&mut kv_client).await?;
for force_refresh in [false, true] {
let snapshot = self
.chainfire_route_snapshot(kv_client.clone(), force_refresh)
.await?;
if let Ok((region, leader)) =
self.resolve_addr_from_chainfire_snapshot(key, &snapshot)
{
self.region_cache.update(region, leader.clone()).await;
return Ok(leader.addr);
}
}
let region = regions
.into_iter()
.find(|region| {
let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
start_ok && end_ok
})
.ok_or_else(|| tonic::Status::not_found("region not found"))?;
let leader = stores
.get(&region.leader_id)
.ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
self.region_cache
.update(
Region {
id: region.id,
start_key: region.start_key,
end_key: region.end_key,
peers: region.peers,
leader_id: region.leader_id,
},
Store {
id: leader.id,
addr: leader.addr.clone(),
},
)
.await;
Ok(leader.addr.clone())
Err(tonic::Status::not_found("region not found"))
}
}
@ -636,6 +756,23 @@ fn transport_endpoint(addr: &str) -> String {
}
}
/// Canonicalize a transport address for comparison and dialing:
/// surrounding whitespace, any leading `http://`/`https://` scheme
/// prefixes, and all trailing slashes are removed.
fn normalize_transport_addr(addr: &str) -> String {
    let mut rest = addr.trim();
    // Repeatedly peel scheme prefixes, mirroring trim_start_matches
    // semantics (e.g. "http://http://x" -> "x").
    while let Some(stripped) = rest.strip_prefix("http://") {
        rest = stripped;
    }
    while let Some(stripped) = rest.strip_prefix("https://") {
        rest = stripped;
    }
    rest.trim_end_matches('/').to_string()
}
fn parse_transport_endpoints(addrs: &str) -> Vec<String> {
addrs
.split(',')
.map(str::trim)
.filter(|item| !item.is_empty())
.map(normalize_transport_addr)
.collect()
}
fn prefix_range_end(prefix: &str) -> Vec<u8> {
let mut end = prefix.as_bytes().to_vec();
if let Some(last) = end.last_mut() {
@ -696,7 +833,7 @@ async fn list_chainfire_regions(
#[cfg(test)]
mod tests {
use super::RdbClient;
use super::{RdbClient, normalize_transport_addr, parse_transport_endpoints};
#[test]
fn unknown_transport_errors_are_treated_as_retryable_routes() {
@ -711,4 +848,20 @@ mod tests {
assert!(RdbClient::is_retryable_route_error(&status));
assert!(!RdbClient::is_transport_error(&status));
}
#[test]
fn parse_transport_endpoints_accepts_comma_separated_values() {
    // Scheme prefixes, surrounding whitespace, and trailing slashes
    // are stripped from each comma-separated endpoint.
    assert_eq!(
        parse_transport_endpoints("http://10.0.0.1:2379, 10.0.0.2:2379/"),
        vec!["10.0.0.1:2379".to_string(), "10.0.0.2:2379".to_string()]
    );
}
#[test]
fn normalize_transport_addr_strips_scheme_and_slashes() {
    // https:// prefix and the trailing slash are both removed.
    assert_eq!(
        normalize_transport_addr("https://10.0.0.1:2479/"),
        "10.0.0.1:2479".to_string()
    );
}
}

View file

@ -10,6 +10,9 @@ struct Args {
#[arg(long, default_value = "127.0.0.1:2479")]
pd_addr: String,
#[arg(long, default_value = "")]
namespace: String,
#[command(subcommand)]
command: Commands,
}
@ -44,7 +47,8 @@ enum Commands {
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
let mut client = RdbClient::connect_with_pd(args.addr, args.pd_addr).await?;
let mut client =
RdbClient::connect_with_pd_namespace(args.addr, args.pd_addr, args.namespace).await?;
match args.command {
Commands::RawPut { key, value } => {

View file

@ -28,7 +28,7 @@ impl Cluster {
}
}
pub fn register_store(&self, addr: String) -> u64 {
pub fn register_store(&self, addr: String, requested_id: Option<u64>) -> u64 {
let mut state = self.inner.lock().unwrap();
// Dedup check? For now, always new ID.
@ -39,8 +39,15 @@ impl Cluster {
}
}
let id = state.next_store_id;
let id = requested_id
.filter(|id| *id != 0 && !state.stores.contains_key(id))
.unwrap_or_else(|| {
while state.stores.contains_key(&state.next_store_id) {
state.next_store_id += 1;
}
state.next_store_id
});
state.next_store_id = state.next_store_id.max(id.saturating_add(1));
state.stores.insert(id, Store { id, addr });

View file

@ -46,7 +46,8 @@ impl Pd for PdServiceImpl {
request: Request<RegisterStoreRequest>,
) -> Result<Response<RegisterStoreResponse>, Status> {
let req = request.into_inner();
let store_id = self.cluster.register_store(req.addr);
let requested_store_id = (req.store_id != 0).then_some(req.store_id);
let store_id = self.cluster.register_store(req.addr, requested_store_id);
Ok(Response::new(RegisterStoreResponse {
store_id,
cluster_id: 1, // fixed for now

View file

@ -29,6 +29,7 @@ service Pd {
message RegisterStoreRequest {
string addr = 1; // e.g., "127.0.0.1:50051"
uint64 store_id = 2; // Optional requested store ID (0 = auto-assign)
}
message RegisterStoreResponse {

View file

@ -1,23 +1,38 @@
use crate::store::Store;
use flaredb_proto::pdpb::pd_client::PdClient;
use flaredb_proto::pdpb::ListRegionsRequest;
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
use flaredb_types::RegionMeta;
use std::sync::Arc;
use tokio::time::{sleep, Duration};
/// Periodically send region/store heartbeat to PD.
pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
pub async fn start_heartbeat(
pd_addr: String,
store: Arc<Store>,
server_addr: String,
requested_store_id: u64,
) {
tokio::spawn(async move {
let endpoint = format!("http://{}", pd_addr);
loop {
if let Ok(mut client) = PdClient::connect(endpoint.clone()).await {
if let Err(err) = client
.register_store(RegisterStoreRequest {
addr: server_addr.clone(),
store_id: requested_store_id,
})
.await
{
tracing::warn!("failed to register store with legacy PD: {}", err);
}
// list regions to keep routing fresh
if let Ok(resp) = client.list_regions(ListRegionsRequest {}).await {
let resp = resp.into_inner();
let mut metas = Vec::new();
for r in resp.regions {
let voters = if r.peers.is_empty() {
Vec::new()
vec![store.store_id()]
} else {
r.peers.clone()
};
@ -27,11 +42,7 @@ pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
start_key: r.start_key,
end_key: r.end_key,
},
if voters.is_empty() {
vec![store.store_id()]
} else {
voters
},
voters,
));
}
if !metas.is_empty() {

View file

@ -1,6 +1,8 @@
use clap::Parser;
use flaredb_proto::kvrpc::kv_cas_server::KvCasServer;
use flaredb_proto::kvrpc::kv_raw_server::KvRawServer;
use flaredb_proto::pdpb::pd_client::PdClient as LegacyPdClient;
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
use flaredb_proto::raft_server::raft_service_server::RaftServiceServer;
use flaredb_proto::sqlrpc::sql_service_server::SqlServiceServer;
use flaredb_server::config::{self, Config, NamespaceManager};
@ -12,7 +14,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
use tokio::time::{sleep, Duration};
use tonic::transport::{Certificate, Identity, Server, ServerTlsConfig};
use tonic::transport::{Certificate, Channel, Identity, Server, ServerTlsConfig};
use tonic_health::server::health_reporter;
use tracing::{info, warn}; // Import warn
use tracing_subscriber::EnvFilter;
@ -27,7 +29,7 @@ mod service;
mod sql_service;
mod store;
use pd_client::{PdClient, PdEvent};
use pd_client::{PdClient as ChainfirePdClient, PdEvent};
const RAFT_GRPC_MESSAGE_SIZE: usize = 64 * 1024 * 1024;
@ -35,14 +37,18 @@ async fn connect_pd_with_retry(
pd_endpoints: &[String],
attempts: u32,
delay: Duration,
) -> Option<PdClient> {
) -> Option<ChainfirePdClient> {
let mut last_error = None;
for attempt in 1..=attempts {
match PdClient::connect_any(pd_endpoints).await {
match ChainfirePdClient::connect_any(pd_endpoints).await {
Ok(client) => return Some(client),
Err(err) => {
last_error = Some(err.to_string());
let protocol_mismatch = last_error
.as_deref()
.map(|msg| msg.contains("Unimplemented"))
.unwrap_or(false);
warn!(
attempt,
attempts,
@ -50,6 +56,13 @@ async fn connect_pd_with_retry(
error = last_error.as_deref().unwrap_or("unknown"),
"Failed to connect to FlareDB PD"
);
if protocol_mismatch {
warn!(
?pd_endpoints,
"PD endpoint does not speak ChainFire; falling back to legacy PD"
);
return None;
}
if attempt < attempts {
sleep(delay).await;
}
@ -65,6 +78,49 @@ async fn connect_pd_with_retry(
None
}
/// Try to connect to a legacy FlareDB PD, cycling through every
/// endpoint on each attempt.
///
/// Endpoints without a scheme get `http://` prepended for dialing,
/// but the ORIGINAL endpoint string is what is returned alongside the
/// client on success. Between attempts (not between endpoints within
/// an attempt) a warning is logged and `delay` is slept. Returns
/// `None` after `attempts` full passes, logging the last error seen.
async fn connect_legacy_pd_with_retry(
    pd_endpoints: &[String],
    attempts: u32,
    delay: Duration,
) -> Option<(String, LegacyPdClient<Channel>)> {
    let mut last_error = None;
    for attempt in 1..=attempts {
        for endpoint in pd_endpoints {
            // Dial with an explicit scheme; tonic requires a URI.
            let transport = if endpoint.starts_with("http") {
                endpoint.clone()
            } else {
                format!("http://{}", endpoint)
            };
            match LegacyPdClient::connect(transport.clone()).await {
                Ok(client) => return Some((endpoint.clone(), client)),
                Err(err) => {
                    last_error = Some(format!("{}: {}", endpoint, err));
                }
            }
        }
        warn!(
            attempt,
            attempts,
            ?pd_endpoints,
            error = last_error.as_deref().unwrap_or("unknown"),
            "Failed to connect to legacy FlareDB PD"
        );
        if attempt < attempts {
            sleep(delay).await;
        }
    }
    warn!(
        ?pd_endpoints,
        error = last_error.as_deref().unwrap_or("unknown"),
        "Exhausted legacy FlareDB PD connection retries"
    );
    None
}
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
@ -334,7 +390,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let server_addr_string = server_config.addr.to_string();
tokio::spawn(async move {
let client = Arc::new(Mutex::new(
PdClient::connect_any(&pd_endpoints_for_task).await.ok(),
ChainfirePdClient::connect_any(&pd_endpoints_for_task)
.await
.ok(),
));
loop {
@ -396,7 +454,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
} else {
// Try to reconnect
if let Ok(new_client) = PdClient::connect_any(&pd_endpoints_for_task).await
if let Ok(new_client) =
ChainfirePdClient::connect_any(&pd_endpoints_for_task).await
{
info!("Reconnected to PD");
*guard = Some(new_client);
@ -406,6 +465,75 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
sleep(Duration::from_secs(10)).await;
}
});
} else if let Some((legacy_pd_addr, mut legacy_pd_client)) =
connect_legacy_pd_with_retry(&pd_endpoints, 3, Duration::from_secs(1)).await
{
info!(pd_addr = %legacy_pd_addr, "Connected to legacy FlareDB PD");
match legacy_pd_client
.register_store(RegisterStoreRequest {
addr: server_config.addr.to_string(),
store_id: server_config.store_id,
})
.await
{
Ok(resp) => {
let resp = resp.into_inner();
if resp.store_id != 0 && resp.store_id != server_config.store_id {
warn!(
expected_store_id = server_config.store_id,
assigned_store_id = resp.store_id,
"legacy PD assigned a different store id than local config"
);
}
}
Err(err) => warn!("failed to register with legacy PD: {}", err),
}
let mut region_metas = Vec::new();
match legacy_pd_client.list_regions(ListRegionsRequest {}).await {
Ok(resp) => {
for region in resp.into_inner().regions {
let voters = if region.peers.is_empty() || region.peers.len() < voters.len() {
voters.clone()
} else {
region.peers.clone()
};
region_metas.push((
RegionMeta {
id: region.id,
start_key: region.start_key,
end_key: region.end_key,
},
voters,
));
}
}
Err(err) => warn!("failed to list regions from legacy PD: {}", err),
}
if region_metas.is_empty() {
region_metas.push((
RegionMeta {
id: 1,
start_key: Vec::new(),
end_key: Vec::new(),
},
voters.clone(),
));
}
if let Err(e) = store.bootstrap_regions(region_metas).await {
warn!("failed to bootstrap regions from legacy PD: {}", e);
}
heartbeat::start_heartbeat(
legacy_pd_addr,
store.clone(),
server_config.addr.to_string(),
server_config.store_id,
)
.await;
} else {
info!("Starting in standalone mode with default region...");
let _ = store
@ -494,6 +622,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
server_addr: server_config.addr.to_string(),
pd_endpoints: pd_endpoints.clone(),
store_id: server_config.store_id,
configured_peers: (*peer_addrs).clone(),
};
let rest_app = rest::build_router(rest_state);
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;

View file

@ -16,8 +16,8 @@ use axum::{
};
use crate::pd_client::PdClient;
use flaredb_client::RdbClient;
use flaredb_sql::executor::{ExecutionResult, SqlExecutor};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::sync::Arc;
/// REST API state
@ -26,6 +26,7 @@ pub struct RestApiState {
pub server_addr: String,
pub pd_endpoints: Vec<String>,
pub store_id: u64,
pub configured_peers: HashMap<u64, String>,
}
/// Standard REST error response
@ -136,6 +137,15 @@ pub struct AddPeerRequest {
pub peer_id: u64,
}
/// Legacy/admin add member request for first-boot compatibility.
///
/// Accepted on `POST /admin/member/add`; shaped to match the join payload
/// emitted by the legacy bootstrap tooling.
#[derive(Debug, Deserialize)]
pub struct AddMemberRequestLegacy {
// Joining node's identifier. May be a numeric store id (e.g. "2") or a
// symbolic name (e.g. "node02"); `resolve_join_peer` handles both forms.
pub id: String,
// Raft transport address the node advertises; used for host matching
// when `id` is not numeric and `addr` is absent.
pub raft_addr: String,
// Optional explicit serving address; when omitted, the address is looked
// up in the statically configured peer map.
#[serde(default)]
pub addr: Option<String>,
}
/// Region info response
#[derive(Debug, Serialize)]
pub struct RegionResponse {
@ -153,6 +163,7 @@ pub fn build_router(state: RestApiState) -> Router {
.route("/api/v1/scan", get(scan_kv))
.route("/api/v1/regions/{id}", get(get_region))
.route("/api/v1/regions/{id}/add_peer", post(add_peer_to_region))
.route("/admin/member/add", post(add_member_legacy))
.route("/health", get(health_check))
.with_state(state)
}
@ -320,6 +331,121 @@ async fn add_peer_to_region(
})))
}
/// POST /admin/member/add - first-boot compatible cluster join hook.
///
/// Flow: resolve the joining peer's (store id, address), register the store
/// with PD, ensure at least one region exists, then add the peer to every
/// region that does not already contain it.
///
/// Response codes:
/// - 201 Created: peer was newly registered.
/// - 200 OK: peer was already registered but at least one region was updated.
/// - 409 Conflict: peer was already registered and no region changed (fully
///   idempotent re-join).
/// - 400/503/500 on resolution, PD-connectivity, or PD-write failures.
async fn add_member_legacy(
State(state): State<RestApiState>,
Json(req): Json<AddMemberRequestLegacy>,
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
// Map the loosely-typed legacy payload onto a concrete store id + address.
let (peer_id, peer_addr) = resolve_join_peer(&state, &req).ok_or_else(|| {
error_response(
StatusCode::BAD_REQUEST,
"INVALID_MEMBER",
"Unable to resolve FlareDB peer id/address from join request",
)
})?;
let mut pd_client = PdClient::connect_any(&state.pd_endpoints)
.await
.map_err(|e| error_response(StatusCode::SERVICE_UNAVAILABLE, "PD_UNAVAILABLE", &format!("Failed to connect to PD: {}", e)))?;
// Snapshot registration state *before* registering so the status code can
// distinguish a fresh join from a repeat.
let stores = pd_client.list_stores().await;
let already_registered = stores.iter().any(|store| store.id == peer_id);
// Registration is re-issued unconditionally; PD treats it as an upsert.
pd_client
.register_store(peer_id, peer_addr.clone())
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
let mut regions = pd_client.list_regions().await;
if regions.is_empty() {
// First boot: seed a single full-keyspace region covering this store
// and the joining peer.
pd_client
.init_default_region(vec![state.store_id, peer_id])
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
// Mirror what PD just created so the loop below sees a consistent view
// without a second round-trip.
regions = vec![crate::pd_client::RegionInfo {
id: 1,
start_key: Vec::new(),
end_key: Vec::new(),
peers: vec![state.store_id, peer_id],
leader_id: 0,
}];
}
// Add the peer to every region that lacks it; record which regions changed.
let mut updated_regions = Vec::new();
for mut region in regions {
if !region.peers.contains(&peer_id) {
region.peers.push(peer_id);
// Keep peer lists in a canonical order for stable comparisons.
region.peers.sort_unstable();
pd_client
.put_region(region.clone())
.await
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
updated_regions.push(region.id);
}
}
let status = if already_registered && updated_regions.is_empty() {
StatusCode::CONFLICT
} else if already_registered {
StatusCode::OK
} else {
StatusCode::CREATED
};
Ok((
status,
Json(SuccessResponse::new(serde_json::json!({
"peer_id": peer_id,
"addr": peer_addr,
"updated_regions": updated_regions,
"already_registered": already_registered,
}))),
))
}
/// Resolve a legacy join request to a concrete `(store_id, serving_addr)`.
///
/// Resolution order:
/// 1. If `req.id` parses as a numeric store id, prefer the request's explicit
///    `addr`, falling back to the statically configured address for that id.
/// 2. Otherwise, match the host portion of `req.addr` (or `req.raft_addr`)
///    against the hosts of the configured peers.
///
/// Returns `None` when neither strategy produces a usable pair.
fn resolve_join_peer(
    state: &RestApiState,
    req: &AddMemberRequestLegacy,
) -> Option<(u64, String)> {
    // Fast path: the legacy tooling sometimes sends the numeric store id.
    if let Ok(numeric_id) = req.id.parse::<u64>() {
        let explicit = req.addr.clone();
        let configured = state.configured_peers.get(&numeric_id).cloned();
        if let Some(resolved_addr) = explicit.or(configured) {
            return Some((numeric_id, resolved_addr));
        }
    }

    // Fallback: match by host against the configured peer map. The raft
    // address is used only when no serving address was supplied.
    let wanted_host = socket_host(req.addr.as_deref().unwrap_or(&req.raft_addr));
    for (id, addr) in &state.configured_peers {
        if socket_host(addr) == wanted_host {
            return Some((*id, addr.clone()));
        }
    }
    None
}
/// Extract the host portion of an address string.
///
/// Accepts bare `host:port` pairs as well as `http(s)://host:port/...` URLs.
/// Parseable socket addresses (including bracketed IPv6) yield the canonical
/// IP text; otherwise the text before the final `:` is returned with IPv6
/// brackets stripped, or the whole string when no port separator exists.
fn socket_host(addr: &str) -> String {
    // Drop surrounding whitespace, any scheme prefix, and any URL path.
    let stripped = addr
        .trim()
        .trim_start_matches("http://")
        .trim_start_matches("https://");
    let authority = stripped.split('/').next().unwrap_or(addr).to_string();

    // A well-formed socket address gives us a canonical IP rendering.
    if let Ok(sock) = authority.parse::<std::net::SocketAddr>() {
        return sock.ip().to_string();
    }

    // Heuristic fallback: everything before the last ':' is the host.
    match authority.rsplit_once(':') {
        Some((host, _port)) => host.trim_matches(['[', ']']).to_string(),
        None => authority,
    }
}
/// Helper to create error response
fn error_response(
status: StatusCode,
@ -338,3 +464,51 @@ fn error_response(
}),
)
}
#[cfg(test)]
mod tests {
use super::*;
// Fixture: a node with store id 1 and three statically configured peers,
// mirroring a typical three-node first-boot topology.
fn test_state() -> RestApiState {
RestApiState {
server_addr: "127.0.0.1:50052".to_string(),
pd_endpoints: vec!["127.0.0.1:2479".to_string()],
store_id: 1,
configured_peers: HashMap::from([
(1, "10.100.0.11:50052".to_string()),
(2, "10.100.0.12:50052".to_string()),
(3, "10.100.0.13:50052".to_string()),
]),
}
}
// Numeric id path: "2" resolves directly through `configured_peers`
// without consulting the raft address.
#[test]
fn resolve_join_peer_uses_numeric_id_when_available() {
let state = test_state();
let req = AddMemberRequestLegacy {
id: "2".to_string(),
raft_addr: "10.100.0.12:2380".to_string(),
addr: None,
};
assert_eq!(
resolve_join_peer(&state, &req),
Some((2, "10.100.0.12:50052".to_string()))
);
}
// Symbolic id path: "node02" is not numeric, so the peer is found by
// matching the raft address host against configured peer hosts.
#[test]
fn resolve_join_peer_matches_host_from_raft_addr() {
let state = test_state();
let req = AddMemberRequestLegacy {
id: "node02".to_string(),
raft_addr: "10.100.0.12:2380".to_string(),
addr: None,
};
assert_eq!(
resolve_join_peer(&state, &req),
Some((2, "10.100.0.12:50052".to_string()))
);
}
}

View file

@ -16,7 +16,7 @@
};
rustToolchain = pkgs.rust-bin.stable.latest.default.override {
extensions = [ "rust-src" "rust-analyzer" ];
extensions = [ "rust-src" "rust-analyzer" "rustfmt" ];
};
in

View file

@ -6,13 +6,43 @@ if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
WORKDIR=$(mktemp -d)
PD_LOG="${WORKDIR}/flaredb-pd.log"
SERVER_LOG="${WORKDIR}/flaredb-server.log"
DATA_DIR="${WORKDIR}/data"
# Invoke the FlareDB client against the local single-node deployment,
# retrying up to 20 times (1s apart) so the script tolerates slow server
# startup. On success, echoes only the last non-empty output line; after
# exhausting retries, dumps the final output to stderr and returns the
# client's last exit status.
run_client() {
  local output=""
  local status=0
  local attempt
  for attempt in $(seq 1 20); do
    if output=$(cargo run --quiet --bin flaredb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 "$@" 2>&1); then
      printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
      return 0
    fi
    status=$?
    sleep 1
  done
  printf '%s\n' "${output}" >&2
  return "${status}"
}
# EXIT-trap handler: stop background processes, then either preserve the
# scratch directory (with log tails) on failure or remove it on success.
cleanup() {
# Capture the script's exit status before any command below overwrites $?.
local exit_code=$?
if [[ -n "${SERVER_PID:-}" ]]; then
kill "$SERVER_PID" >/dev/null 2>&1 || true
fi
if [[ -n "${PD_PID:-}" ]]; then
kill "$PD_PID" >/dev/null 2>&1 || true
fi
if (( exit_code != 0 )); then
# Keep ${WORKDIR} and surface recent log lines so CI output alone is
# enough to start diagnosing the failure.
echo "verify-core failed; logs preserved at ${WORKDIR}" >&2
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
[[ -f "${SERVER_LOG}" ]] && { echo "--- ${SERVER_LOG} ---" >&2; tail -n 200 "${SERVER_LOG}" >&2; }
return "${exit_code}"
fi
rm -rf "${WORKDIR}"
}
trap cleanup EXIT
@ -23,30 +53,38 @@ echo "Running tests..."
cargo test
echo "Starting PD..."
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 >/tmp/rdb-pd.log 2>&1 &
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
PD_PID=$!
sleep 2
echo "Starting Server..."
cargo run --bin rdb-server -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 --data-dir /tmp/rdb-server >/tmp/rdb-server.log 2>&1 &
cargo run --bin flaredb-server -- \
--pd-addr 127.0.0.1:2479 \
--addr 127.0.0.1:50052 \
--data-dir "${DATA_DIR}" \
--namespace-mode raw=eventual \
--namespace-mode cas=strong \
>"${SERVER_LOG}" 2>&1 &
SERVER_PID=$!
sleep 2
echo "Running Client Verification..."
echo "Testing TSO..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 tso
TSO_OUTPUT=$(run_client tso)
[[ "${TSO_OUTPUT}" == Timestamp:* ]]
echo "Testing Raw Put/Get..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-put --key foo --value bar
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-get --key foo
run_client --namespace raw raw-put --key foo --value bar >/dev/null
RAW_VALUE=$(run_client --namespace raw raw-get --key foo)
[[ "${RAW_VALUE}" == "bar" ]]
echo "Testing CAS success..."
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v1 --expected 0
CAS_SUCCESS=$(run_client --namespace cas cas --key cas1 --value v1 --expected 0)
[[ "${CAS_SUCCESS}" == Success,* ]]
echo "Testing CAS conflict..."
set +e
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v2 --expected 0
set -e
CAS_CONFLICT=$(run_client --namespace cas cas --key cas1 --value v2 --expected 0)
[[ "${CAS_CONFLICT}" == Conflict!* ]]
echo "Verification Complete!"

View file

@ -1,14 +1,17 @@
#!/usr/bin/env bash
set -euo pipefail
# Run key Multi-Raft test suites.
echo "[verify] Running multi-region routing tests..."
nix develop -c cargo test -q rdb-server::tests::test_multi_region
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
echo "[verify] Running split tests..."
nix develop -c cargo test -q rdb-server::tests::test_split
echo "[verify] Running persistent snapshot recovery tests..."
cargo test -p flaredb-raft persistent_storage::tests::test_snapshot_persistence_and_recovery
echo "[verify] Running confchange/move tests..."
nix develop -c cargo test -q rdb-server::tests::test_confchange_move
echo "[verify] Running leader election tests..."
cargo test -p flaredb-raft raft_node::tests::test_leader_election
echo "[verify] Running server read-path tests..."
cargo test -p flaredb-server service::tests::scan_returns_decoded_cas_keys
echo "[verify] Done."

View file

@ -1,12 +1,23 @@
#!/usr/bin/env bash
set -euo pipefail
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
export LIBCLANG_PATH=${LIBCLANG_PATH:-/nix/store/0zn99g048j67syaq97rczq5z0j8dsvc8-clang-21.1.2-lib/lib}
echo "[verify] formatting..."
cargo fmt --all
if ! find . \
-path ./target -prune -o \
-name '*.rs' -print0 | xargs -0 rustfmt --check; then
echo "[verify] rustfmt drift detected; continuing with runtime tests" >&2
fi
echo "[verify] running rdb-server tests..."
nix-shell -p protobuf --run "LIBCLANG_PATH=${LIBCLANG_PATH} cargo test -p rdb-server --tests"
echo "[verify] running FlareDB server tests..."
cargo test -p flaredb-server --tests
echo "[verify] running FlareDB raft tests..."
cargo test -p flaredb-raft
echo "[verify] done."

View file

@ -1,40 +1,103 @@
#!/usr/bin/env bash
set -e
set -euo pipefail
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
exec nix develop -c "$0" "$@"
fi
WORKDIR=$(mktemp -d)
PD_LOG="${WORKDIR}/flaredb-pd.log"
S1_LOG="${WORKDIR}/flaredb-server-1.log"
S2_LOG="${WORKDIR}/flaredb-server-2.log"
# Usage: run_client <server-addr> [client args...]
# Runs the FlareDB client against the given server, retrying up to 20
# times (1s apart) to ride out server startup and leader settling. On
# success, echoes only the last non-empty output line; after exhausting
# retries, dumps the final output to stderr and returns the client's
# last exit status.
run_client() {
  local addr="$1"
  shift
  local output=""
  local status=0
  local attempt=0
  until (( attempt >= 20 )); do
    if output=$(cargo run --quiet --bin flaredb-client -- --addr "${addr}" --pd-addr 127.0.0.1:2479 "$@" 2>&1); then
      printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
      return 0
    fi
    status=$?
    attempt=$((attempt + 1))
    sleep 1
  done
  printf '%s\n' "${output}" >&2
  return "${status}"
}
# EXIT-trap handler: stop PD and both servers, then either preserve the
# scratch directory (with log tails) on failure or remove it on success.
cleanup() {
# Capture the script's exit status before any command below overwrites $?.
local exit_code=$?
if [[ -n "${PD_PID:-}" ]]; then
kill "${PD_PID}" >/dev/null 2>&1 || true
fi
if [[ -n "${S1_PID:-}" ]]; then
kill "${S1_PID}" >/dev/null 2>&1 || true
fi
if [[ -n "${S2_PID:-}" ]]; then
kill "${S2_PID}" >/dev/null 2>&1 || true
fi
if (( exit_code != 0 )); then
# Keep ${WORKDIR} and surface recent log lines from all three
# processes so the failure can be diagnosed from CI output alone.
echo "verify-sharding failed; logs preserved at ${WORKDIR}" >&2
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
[[ -f "${S1_LOG}" ]] && { echo "--- ${S1_LOG} ---" >&2; tail -n 200 "${S1_LOG}" >&2; }
[[ -f "${S2_LOG}" ]] && { echo "--- ${S2_LOG} ---" >&2; tail -n 200 "${S2_LOG}" >&2; }
return "${exit_code}"
fi
rm -rf "${WORKDIR}"
}
trap cleanup EXIT
echo "Building workspace..."
cargo build
echo "Starting PD..."
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 &
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
PD_PID=$!
sleep 2
echo "Starting Server 1 (127.0.0.1:50001, data1)..."
# Port 50001
cargo run --bin rdb-server -- --addr 127.0.0.1:50001 --data-dir data1 --pd-addr 127.0.0.1:2479 &
cargo run --bin flaredb-server -- \
--store-id 1 \
--addr 127.0.0.1:50001 \
--http-addr 127.0.0.1:8083 \
--data-dir "${WORKDIR}/data1" \
--pd-addr 127.0.0.1:2479 \
--metrics-port 9093 \
--namespace-mode raw=eventual \
>"${S1_LOG}" 2>&1 &
S1_PID=$!
sleep 4
echo "Starting Server 2 (127.0.0.1:50002, data2)..."
# Port 50002
cargo run --bin rdb-server -- --addr 127.0.0.1:50002 --data-dir data2 --pd-addr 127.0.0.1:2479 &
cargo run --bin flaredb-server -- \
--store-id 2 \
--addr 127.0.0.1:50002 \
--http-addr 127.0.0.1:8084 \
--data-dir "${WORKDIR}/data2" \
--pd-addr 127.0.0.1:2479 \
--metrics-port 9094 \
--namespace-mode raw=eventual \
>"${S2_LOG}" 2>&1 &
S2_PID=$!
sleep 5 # Wait for registration
sleep 5 # Wait for registration and leader routing to settle
echo "Running Client Verification (Sharding)..."
echo "Running Client Verification (multi-node routing smoke)..."
# Put 'a' (Should go to S1)
echo "Testing Put 'a'..."
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key a --value val_a
run_client 127.0.0.1:50001 --namespace raw raw-put --key a --value val_a >/dev/null
# Put 'z' (Should go to S2)
echo "Testing Put 'z'..."
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key z --value val_z
run_client 127.0.0.1:50002 --namespace raw raw-put --key z --value val_z >/dev/null
# Cleanup
kill $PD_PID
kill $S1_PID
kill $S2_PID
rm -rf data1 data2
echo "Testing reads from both nodes..."
VALUE_A=$(run_client 127.0.0.1:50002 --namespace raw raw-get --key a)
VALUE_Z=$(run_client 127.0.0.1:50001 --namespace raw raw-get --key z)
[[ "${VALUE_A}" == "val_a" ]]
[[ "${VALUE_Z}" == "val_z" ]]
echo "Sharding Verification Complete!"

607
flashdns/Cargo.lock generated

File diff suppressed because it is too large Load diff

621
iam/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -23,6 +23,9 @@ prost = { workspace = true }
base64 = { workspace = true }
sha2 = { workspace = true }
uuid = { workspace = true }
aes-gcm = "0.10"
argon2 = "0.5"
rand_core = "0.6"
[dev-dependencies]
tokio = { workspace = true, features = ["full", "test-util"] }

View file

@ -8,12 +8,12 @@ use rand_core::{OsRng, RngCore};
use tonic::{Request, Response, Status};
use iam_store::CredentialStore;
use iam_types::{Argon2Params, CredentialRecord};
use iam_types::{Argon2Params, CredentialRecord, PrincipalKind as TypesPrincipalKind};
use crate::proto::{
iam_credential_server::IamCredential, CreateS3CredentialRequest,
CreateS3CredentialResponse, Credential, GetSecretKeyRequest, GetSecretKeyResponse,
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
ListCredentialsRequest, ListCredentialsResponse, PrincipalKind, RevokeCredentialRequest,
RevokeCredentialResponse,
};
@ -95,6 +95,15 @@ impl IamCredentialService {
}
}
/// Convert a wire-format principal kind discriminant into the internal enum.
///
/// Unknown discriminants are folded into `Unspecified` and rejected with
/// `InvalidArgument`, so callers always receive a concrete kind.
fn map_principal_kind(kind: i32) -> Result<TypesPrincipalKind, Status> {
    let decoded = PrincipalKind::try_from(kind).unwrap_or(PrincipalKind::Unspecified);
    if matches!(decoded, PrincipalKind::Unspecified) {
        return Err(Status::invalid_argument("principal_kind is required"));
    }
    Ok(match decoded {
        PrincipalKind::User => TypesPrincipalKind::User,
        PrincipalKind::ServiceAccount => TypesPrincipalKind::ServiceAccount,
        PrincipalKind::Group => TypesPrincipalKind::Group,
        // Rejected by the guard above.
        PrincipalKind::Unspecified => unreachable!("unspecified kind rejected above"),
    })
}
#[tonic::async_trait]
impl IamCredential for IamCredentialService {
async fn create_s3_credential(
@ -103,6 +112,7 @@ impl IamCredential for IamCredentialService {
) -> Result<Response<CreateS3CredentialResponse>, Status> {
let req = request.into_inner();
let now = now_ts();
let principal_kind = map_principal_kind(req.principal_kind)?;
let (secret_b64, raw_secret) = Self::generate_secret();
let (hash, kdf) = Self::hash_secret(&raw_secret);
let secret_enc = self.encrypt_secret(&raw_secret)?;
@ -111,6 +121,9 @@ impl IamCredential for IamCredentialService {
let record = CredentialRecord {
access_key_id: access_key_id.clone(),
principal_id: req.principal_id.clone(),
principal_kind,
org_id: req.org_id.clone(),
project_id: req.project_id.clone(),
created_at: now,
expires_at: req.expires_at,
revoked: false,
@ -168,6 +181,13 @@ impl IamCredential for IamCredentialService {
secret_key: STANDARD.encode(secret),
principal_id: record.principal_id,
expires_at: record.expires_at,
org_id: record.org_id,
project_id: record.project_id,
principal_kind: match record.principal_kind {
TypesPrincipalKind::User => PrincipalKind::User as i32,
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
},
}))
}
@ -190,6 +210,13 @@ impl IamCredential for IamCredentialService {
expires_at: c.expires_at,
revoked: c.revoked,
description: c.description.unwrap_or_default(),
org_id: c.org_id,
project_id: c.project_id,
principal_kind: match c.principal_kind {
TypesPrincipalKind::User => PrincipalKind::User as i32,
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
},
})
.collect();
Ok(Response::new(ListCredentialsResponse { credentials: creds }))
@ -230,6 +257,9 @@ mod tests {
principal_id: "p1".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -247,6 +277,9 @@ mod tests {
let fetched = STANDARD.decode(get.secret_key).unwrap();
assert_eq!(orig, fetched);
assert_eq!(get.principal_id, "p1");
assert_eq!(get.org_id.as_deref(), Some("org-a"));
assert_eq!(get.project_id.as_deref(), Some("project-a"));
assert_eq!(get.principal_kind, PrincipalKind::ServiceAccount as i32);
}
#[tokio::test]
@ -257,6 +290,9 @@ mod tests {
principal_id: "pA".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -266,6 +302,9 @@ mod tests {
principal_id: "pB".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-b".into()),
project_id: Some("project-b".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap();
@ -289,6 +328,9 @@ mod tests {
principal_id: "p1".into(),
description: "".into(),
expires_at: None,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
principal_kind: PrincipalKind::ServiceAccount as i32,
}))
.await
.unwrap()
@ -297,7 +339,6 @@ mod tests {
let revoke1 = svc
.revoke_credential(Request::new(RevokeCredentialRequest {
access_key_id: created.access_key_id.clone(),
reason: "test".into(),
}))
.await
.unwrap()
@ -307,7 +348,6 @@ mod tests {
let revoke2 = svc
.revoke_credential(Request::new(RevokeCredentialRequest {
access_key_id: created.access_key_id.clone(),
reason: "again".into(),
}))
.await
.unwrap()
@ -330,6 +370,9 @@ mod tests {
let expired = CredentialRecord {
access_key_id: "expired-ak".into(),
principal_id: "p1".into(),
principal_kind: TypesPrincipalKind::ServiceAccount,
org_id: Some("org-a".into()),
project_id: Some("project-a".into()),
created_at: now_ts(),
expires_at: Some(now_ts() - 10),
revoked: false,

View file

@ -1,4 +1,5 @@
mod conversions;
mod credential_service;
mod gateway_auth_service;
mod generated;
pub mod iam_service;
@ -8,7 +9,10 @@ pub mod proto {
pub use crate::generated::iam::v1::*;
}
pub use generated::iam::v1::{iam_admin_server, iam_authz_server, iam_token_server};
pub use generated::iam::v1::{
iam_admin_server, iam_authz_server, iam_credential_server, iam_token_server,
};
pub use credential_service::IamCredentialService;
pub use gateway_auth_service::GatewayAuthServiceImpl;
pub use iam_service::{IamAdminService, IamAuthzService};
pub use token_service::IamTokenService;

View file

@ -2,6 +2,7 @@
//!
//! Provides a thin gRPC client for interacting with the IAM service.
use std::future::Future;
use std::time::Duration;
use iam_api::proto::{
@ -19,6 +20,10 @@ use iam_types::{
};
use tonic::transport::{Channel, ClientTlsConfig, Endpoint};
const TRANSIENT_RPC_RETRY_ATTEMPTS: usize = 3;
const TRANSIENT_RPC_INITIAL_BACKOFF: Duration = Duration::from_millis(200);
const TRANSIENT_RPC_MAX_BACKOFF: Duration = Duration::from_millis(1_000);
/// Configuration for the IAM client
#[derive(Debug, Clone)]
pub struct IamClientConfig {
@ -100,6 +105,40 @@ impl IamClient {
IamTokenClient::new(self.channel.clone())
}
/// Run a gRPC operation, retrying transient failures with exponential
/// backoff (see `retry_delay` / `is_retryable_status`).
///
/// `op` is re-invoked from scratch on each attempt, so callers rebuild the
/// client and clone the request inside the closure. At most
/// `TRANSIENT_RPC_RETRY_ATTEMPTS` attempts are made.
///
/// Note on the final attempt: a retryable error on the last attempt fails
/// the `attempt + 1 < TRANSIENT_RPC_RETRY_ATTEMPTS` guard and is returned
/// via the non-retry arm, so the trailing `last_status` fallback only fires
/// if the loop body never runs (i.e. the attempt budget is zero).
async fn call_with_retry<T, F, Fut>(operation: &'static str, mut op: F) -> Result<T>
where
F: FnMut() -> Fut,
Fut: Future<Output = std::result::Result<T, tonic::Status>>,
{
let mut last_status = None;
for attempt in 0..TRANSIENT_RPC_RETRY_ATTEMPTS {
match op().await {
Ok(value) => return Ok(value),
// Transient error with attempts remaining: log, back off, retry.
Err(status)
if attempt + 1 < TRANSIENT_RPC_RETRY_ATTEMPTS
&& is_retryable_status(&status) =>
{
let delay = retry_delay(attempt);
tracing::warn!(
operation,
attempt = attempt + 1,
retry_after_ms = delay.as_millis() as u64,
code = ?status.code(),
message = status.message(),
"retrying transient IAM RPC"
);
last_status = Some(status);
tokio::time::sleep(delay).await;
}
// Non-retryable error, or retryable error on the final attempt.
Err(status) => return Err(map_status(status)),
}
}
// Only reachable when the loop never executed; synthesize a status.
Err(map_status(last_status.unwrap_or_else(|| {
tonic::Status::internal(format!("IAM RPC {operation} failed without a status"))
})))
}
// ========================================================================
// Authorization APIs
// ========================================================================
@ -128,7 +167,6 @@ impl IamClient {
resource: &Resource,
context: std::collections::HashMap<String, String>,
) -> Result<bool> {
let mut client = self.authz_client();
let request = AuthorizeRequest {
principal: Some(to_proto_principal_ref(&principal.to_ref())),
action: action.to_string(),
@ -151,10 +189,12 @@ impl IamClient {
}),
};
let resp = client
.authorize(request)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("authorize", || {
let mut client = self.authz_client();
let request = request.clone();
async move { client.authorize(request).await }
})
.await?
.into_inner();
Ok(resp.allowed)
@ -166,7 +206,6 @@ impl IamClient {
/// Create a new user
pub async fn create_user(&self, id: &str, name: &str) -> Result<Principal> {
let mut client = self.admin_client();
let req = CreatePrincipalRequest {
id: id.into(),
kind: ProtoPrincipalKind::User as i32,
@ -177,25 +216,31 @@ impl IamClient {
metadata: Default::default(),
};
let resp = client
.create_principal(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_principal(req).await }
})
.await?
.into_inner();
Ok(ProtoPrincipal::into(resp))
}
/// Get a principal
pub async fn get_principal(&self, principal_ref: &PrincipalRef) -> Result<Option<Principal>> {
let mut client = self.admin_client();
let req = GetPrincipalRequest {
principal: Some(to_proto_principal_ref(principal_ref)),
};
let resp = client.get_principal(req).await;
let resp = Self::call_with_retry("get_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.get_principal(req).await }
})
.await;
match resp {
Ok(r) => Ok(Some(ProtoPrincipal::into(r.into_inner()))),
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
Err(status) => Err(map_status(status)),
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
Err(err) => Err(err),
}
}
@ -206,7 +251,6 @@ impl IamClient {
name: &str,
project_id: &str,
) -> Result<Principal> {
let mut client = self.admin_client();
let req = CreatePrincipalRequest {
id: id.into(),
kind: ProtoPrincipalKind::ServiceAccount as i32,
@ -216,17 +260,18 @@ impl IamClient {
email: None,
metadata: Default::default(),
};
let resp = client
.create_principal(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_service_account", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_principal(req).await }
})
.await?
.into_inner();
Ok(ProtoPrincipal::into(resp))
}
/// List users
pub async fn list_users(&self) -> Result<Vec<Principal>> {
let mut client = self.admin_client();
let req = ListPrincipalsRequest {
kind: Some(ProtoPrincipalKind::User as i32),
org_id: None,
@ -235,10 +280,12 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_principals(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_principals", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_principals(req).await }
})
.await?
.into_inner();
Ok(resp
@ -254,36 +301,40 @@ impl IamClient {
/// Get a role by name
pub async fn get_role(&self, name: &str) -> Result<Option<Role>> {
let mut client = self.admin_client();
let req = GetRoleRequest { name: name.into() };
let resp = client.get_role(req).await;
let resp = Self::call_with_retry("get_role", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.get_role(req).await }
})
.await;
match resp {
Ok(r) => Ok(Some(r.into_inner().into())),
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
Err(status) => Err(map_status(status)),
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
Err(err) => Err(err),
}
}
/// List all roles
pub async fn list_roles(&self) -> Result<Vec<Role>> {
let mut client = self.admin_client();
let req = ListRolesRequest {
scope: None,
include_builtin: true,
page_size: 0,
page_token: String::new(),
};
let resp = client
.list_roles(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_roles", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_roles(req).await }
})
.await?
.into_inner();
Ok(resp.roles.into_iter().map(Into::into).collect())
}
/// Create a custom role
pub async fn create_role(&self, role: &Role) -> Result<Role> {
let mut client = self.admin_client();
let req = CreateRoleRequest {
name: role.name.clone(),
display_name: role.display_name.clone(),
@ -297,10 +348,12 @@ impl IamClient {
.collect(),
};
let resp = client
.create_role(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_role", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_role(req).await }
})
.await?
.into_inner();
Ok(resp.into())
}
@ -311,7 +364,6 @@ impl IamClient {
/// Create a policy binding
pub async fn create_binding(&self, binding: &PolicyBinding) -> Result<PolicyBinding> {
let mut client = self.admin_client();
let req = CreateBindingRequest {
principal: Some(to_proto_principal_ref(&binding.principal_ref)),
role: binding.role_ref.clone(),
@ -320,24 +372,27 @@ impl IamClient {
expires_at: binding.expires_at,
};
let resp = client
.create_binding(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("create_binding", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.create_binding(req).await }
})
.await?
.into_inner();
Ok(resp.into())
}
/// Delete a policy binding
pub async fn delete_binding(&self, binding_id: &str) -> Result<bool> {
let mut client = self.admin_client();
let req = DeleteBindingRequest {
id: binding_id.into(),
};
let resp = client
.delete_binding(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("delete_binding", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.delete_binding(req).await }
})
.await?
.into_inner();
Ok(resp.deleted)
}
@ -347,7 +402,6 @@ impl IamClient {
&self,
principal: &PrincipalRef,
) -> Result<Vec<PolicyBinding>> {
let mut client = self.admin_client();
let req = ListBindingsRequest {
principal: Some(to_proto_principal_ref(principal)),
role: None,
@ -357,17 +411,18 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_bindings(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_bindings_for_principal", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_bindings(req).await }
})
.await?
.into_inner();
Ok(resp.bindings.into_iter().map(Into::into).collect())
}
/// List bindings for a scope
pub async fn list_bindings_for_scope(&self, scope: &Scope) -> Result<Vec<PolicyBinding>> {
let mut client = self.admin_client();
let req = ListBindingsRequest {
principal: None,
role: None,
@ -377,10 +432,12 @@ impl IamClient {
page_token: String::new(),
};
let resp = client
.list_bindings(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("list_bindings_for_scope", || {
let mut client = self.admin_client();
let req = req.clone();
async move { client.list_bindings(req).await }
})
.await?
.into_inner();
Ok(resp.bindings.into_iter().map(Into::into).collect())
}
@ -397,7 +454,6 @@ impl IamClient {
scope: Scope,
ttl_seconds: u64,
) -> Result<String> {
let mut client = self.token_client();
let req = IssueTokenRequest {
principal_id: principal.id.clone(),
principal_kind: match principal.kind {
@ -410,24 +466,27 @@ impl IamClient {
ttl_seconds,
};
let resp = client
.issue_token(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("issue_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.issue_token(req).await }
})
.await?
.into_inner();
Ok(resp.token)
}
/// Validate a token
pub async fn validate_token(&self, token: &str) -> Result<InternalTokenClaims> {
let mut client = self.token_client();
let req = ValidateTokenRequest {
token: token.to_string(),
};
let resp = client
.validate_token(req)
.await
.map_err(map_status)?
let resp = Self::call_with_retry("validate_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.validate_token(req).await }
})
.await?
.into_inner();
if !resp.valid {
@ -479,20 +538,55 @@ impl IamClient {
/// Revoke a token
pub async fn revoke_token(&self, token: &str) -> Result<()> {
let mut client = self.token_client();
let req = RevokeTokenRequest {
token: token.to_string(),
reason: "client revoke".into(),
};
client
.revoke_token(req)
.await
.map_err(map_status)?
Self::call_with_retry("revoke_token", || {
let mut client = self.token_client();
let req = req.clone();
async move { client.revoke_token(req).await }
})
.await?
.into_inner();
Ok(())
}
}
/// Exponential backoff schedule for transient RPC retries.
///
/// Doubles the initial backoff per attempt (200ms, 400ms, 800ms, ...) with
/// the shift clamped at 3 and the result capped at the configured maximum,
/// so every later attempt waits exactly `TRANSIENT_RPC_MAX_BACKOFF`.
fn retry_delay(attempt: usize) -> Duration {
    let exponent = attempt.min(3) as u32;
    let multiplier = 1u32 << exponent;
    let scaled = TRANSIENT_RPC_INITIAL_BACKOFF.saturating_mul(multiplier);
    scaled.min(TRANSIENT_RPC_MAX_BACKOFF)
}
/// Decide whether a tonic status is worth retrying.
///
/// Retries on codes typically produced by connection churn, plus any status
/// whose message text matches a known transport-failure pattern (see
/// `retryable_message`).
fn is_retryable_status(status: &tonic::Status) -> bool {
    let transient_code = match status.code() {
        tonic::Code::Unavailable
        | tonic::Code::Cancelled
        | tonic::Code::DeadlineExceeded
        | tonic::Code::Unknown => true,
        _ => false,
    };
    transient_code || retryable_message(status.message())
}
/// Case-insensitive check for transport-level failure text in an RPC
/// status message; catches transient errors surfaced with non-transient
/// status codes.
fn retryable_message(message: &str) -> bool {
    const NEEDLES: [&str; 6] = [
        "transport error",
        "connection was not ready",
        "h2 protocol error",
        "broken pipe",
        "connection refused",
        "connection reset",
    ];
    let haystack = message.to_ascii_lowercase();
    NEEDLES.iter().any(|needle| haystack.contains(needle))
}
/// Heuristic: does a stringified tonic error describe a NotFound status?
///
/// Needed because `map_status` flattens `tonic::Status` into
/// `Error::Internal(String)`, discarding the structured code.
fn tonic_not_found(message: &str) -> bool {
    ["status: NotFound", "code: NotFound"]
        .iter()
        .any(|pattern| message.contains(pattern))
}
fn map_status(status: tonic::Status) -> Error {
Error::Internal(status.to_string())
}
@ -507,3 +601,75 @@ fn to_proto_principal_ref(principal_ref: &PrincipalRef) -> ProtoPrincipalRef {
id: principal_ref.id.clone(),
}
}
#[cfg(test)]
mod tests {
use super::*;
use std::sync::{
atomic::{AtomicUsize, Ordering},
Arc,
};
#[test]
fn retryable_message_covers_connection_readiness() {
assert!(retryable_message("transport error"));
assert!(retryable_message("connection was not ready"));
assert!(retryable_message("h2 protocol error"));
assert!(!retryable_message("permission denied"));
}
#[test]
fn retry_delay_is_capped() {
assert_eq!(retry_delay(0), Duration::from_millis(200));
assert_eq!(retry_delay(1), Duration::from_millis(400));
assert_eq!(retry_delay(2), Duration::from_millis(800));
assert_eq!(retry_delay(3), Duration::from_millis(1000));
assert_eq!(retry_delay(7), Duration::from_millis(1000));
}
#[tokio::test(start_paused = true)]
async fn call_with_retry_retries_transient_statuses() {
let attempts = Arc::new(AtomicUsize::new(0));
let attempts_for_task = attempts.clone();
let task = tokio::spawn(async move {
IamClient::call_with_retry("test", || {
let attempts = attempts_for_task.clone();
async move {
let attempt = attempts.fetch_add(1, Ordering::SeqCst);
if attempt < 2 {
Err(tonic::Status::unavailable("connection was not ready"))
} else {
Ok("ok")
}
}
})
.await
});
tokio::time::advance(Duration::from_secs(3)).await;
assert_eq!(task.await.unwrap().unwrap(), "ok");
assert_eq!(attempts.load(Ordering::SeqCst), 3);
}
#[tokio::test(start_paused = true)]
async fn call_with_retry_stops_on_non_retryable_status() {
let attempts = Arc::new(AtomicUsize::new(0));
let attempts_for_task = attempts.clone();
let err = IamClient::call_with_retry("test", || {
let attempts = attempts_for_task.clone();
async move {
attempts.fetch_add(1, Ordering::SeqCst);
Err::<(), _>(tonic::Status::permission_denied("nope"))
}
})
.await
.unwrap_err();
assert_eq!(attempts.load(Ordering::SeqCst), 1);
match err {
Error::Internal(message) => assert!(message.contains("PermissionDenied")),
other => panic!("unexpected error: {other:?}"),
}
}
}

View file

@ -20,12 +20,15 @@ use tracing::{info, warn};
use iam_api::{
iam_admin_server::IamAdminServer, iam_authz_server::IamAuthzServer,
iam_token_server::IamTokenServer, GatewayAuthServiceImpl, GatewayAuthServiceServer,
IamAdminService, IamAuthzService, IamTokenService,
iam_credential_server::IamCredentialServer, iam_token_server::IamTokenServer,
GatewayAuthServiceImpl, GatewayAuthServiceServer, IamAdminService, IamAuthzService,
IamCredentialService, IamTokenService,
};
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
use iam_authz::{PolicyCache, PolicyCacheConfig, PolicyEvaluator};
use iam_store::{Backend, BackendConfig, BindingStore, PrincipalStore, RoleStore, TokenStore};
use iam_store::{
Backend, BackendConfig, BindingStore, CredentialStore, PrincipalStore, RoleStore, TokenStore,
};
use config::{BackendKind, ServerConfig};
@ -190,6 +193,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let principal_store = Arc::new(PrincipalStore::new(backend.clone()));
let role_store = Arc::new(RoleStore::new(backend.clone()));
let binding_store = Arc::new(BindingStore::new(backend.clone()));
let credential_store = Arc::new(CredentialStore::new(backend.clone()));
let token_store = Arc::new(TokenStore::new(backend.clone()));
// Initialize builtin roles
@ -238,7 +242,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
)
};
let token_config = InternalTokenConfig::new(signing_key, &config.authn.internal_token.issuer)
let token_config =
InternalTokenConfig::new(signing_key.clone(), &config.authn.internal_token.issuer)
.with_default_ttl(Duration::from_secs(
config.authn.internal_token.default_ttl_seconds,
))
@ -248,6 +253,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let token_service = Arc::new(InternalTokenService::new(token_config));
let admin_token = load_admin_token();
let credential_master_key = std::env::var("IAM_CRED_MASTER_KEY")
.ok()
.map(|value| value.into_bytes())
.filter(|value| value.len() == 32)
.unwrap_or_else(|| {
warn!(
"IAM_CRED_MASTER_KEY missing or not 32 bytes, deriving credential key from signing key",
);
signing_key.sign(b"iam-credential-master-key")
});
// Create gRPC services
let authz_service = IamAuthzService::new(evaluator.clone(), principal_store.clone());
@ -262,6 +277,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
token_store.clone(),
evaluator.clone(),
);
let credential_service =
IamCredentialService::new(credential_store, &credential_master_key, "iam-cred-master")
.map_err(|e| format!("Failed to initialize credential service: {}", e))?;
let admin_service = IamAdminService::new(
principal_store.clone(),
role_store.clone(),
@ -291,6 +309,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
health_reporter
.set_serving::<IamTokenServer<IamTokenService>>()
.await;
health_reporter
.set_serving::<IamCredentialServer<IamCredentialService>>()
.await;
health_reporter
.set_serving::<IamAdminServer<IamAdminService>>()
.await;
@ -357,6 +378,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
.add_service(health_service)
.add_service(IamAuthzServer::new(authz_service))
.add_service(IamTokenServer::new(token_grpc_service))
.add_service(IamCredentialServer::new(credential_service))
.add_service(GatewayAuthServiceServer::new(gateway_auth_service))
.add_service(admin_server)
.serve(config.server.addr);

View file

@ -9,5 +9,6 @@ iam-client = { path = "../iam-client" }
iam-types = { path = "../iam-types" }
tonic = { workspace = true }
tracing = { workspace = true }
tokio = { workspace = true }
http = "1"
serde_json = "1"

View file

@ -16,6 +16,9 @@ use tracing::{debug, warn};
const PHOTON_AUTH_TOKEN_HEADER: &str = "x-photon-auth-token";
const DEFAULT_TOKEN_CACHE_TTL_MS: u64 = 5_000;
const DEFAULT_AUTHZ_CACHE_TTL_MS: u64 = 3_000;
const AUTH_CONNECT_RETRY_ATTEMPTS: usize = 6;
const AUTH_CONNECT_INITIAL_BACKOFF: Duration = Duration::from_millis(500);
const AUTH_CONNECT_MAX_BACKOFF: Duration = Duration::from_secs(5);
#[derive(Debug, Clone)]
struct CacheEntry<T> {
@ -64,9 +67,7 @@ impl AuthService {
config = config.without_tls();
}
let iam_client = IamClient::connect(config)
.await
.map_err(|e| format!("Failed to connect to IAM server: {}", e))?;
let iam_client = connect_iam_with_retry(config).await?;
Ok(Self {
iam_client: Arc::new(iam_client),
@ -273,6 +274,59 @@ impl AuthService {
}
}
async fn connect_iam_with_retry(config: IamClientConfig) -> Result<IamClient, String> {
let mut last_error = None;
for attempt in 0..AUTH_CONNECT_RETRY_ATTEMPTS {
match IamClient::connect(config.clone()).await {
Ok(client) => return Ok(client),
Err(err)
if attempt + 1 < AUTH_CONNECT_RETRY_ATTEMPTS
&& retryable_connect_error(&err.to_string()) =>
{
let delay = auth_connect_retry_delay(attempt);
warn!(
attempt = attempt + 1,
retry_after_ms = delay.as_millis() as u64,
error = %err,
"retrying IAM auth service bootstrap connection"
);
last_error = Some(err.to_string());
tokio::time::sleep(delay).await;
}
Err(err) => {
return Err(format!("Failed to connect to IAM server: {}", err));
}
}
}
Err(format!(
"Failed to connect to IAM server: {}",
last_error.unwrap_or_else(|| "unknown connection error".to_string())
))
}
fn auth_connect_retry_delay(attempt: usize) -> Duration {
AUTH_CONNECT_INITIAL_BACKOFF
.saturating_mul(1u32 << attempt.min(4))
.min(AUTH_CONNECT_MAX_BACKOFF)
}
fn retryable_connect_error(message: &str) -> bool {
let lower = message.to_ascii_lowercase();
[
"transport error",
"connection refused",
"connection was not ready",
"operation timed out",
"deadline has elapsed",
"dns error",
"broken pipe",
"connection reset",
]
.iter()
.any(|needle| lower.contains(needle))
}
fn prune_expired<T>(cache: &mut HashMap<String, CacheEntry<T>>) {
let now = Instant::now();
cache.retain(|_, entry| entry.expires_at > now);
@ -400,6 +454,29 @@ fn extract_token_from_metadata(metadata: &MetadataMap) -> Result<String, Status>
))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn retryable_connect_error_matches_transport_failures() {
assert!(retryable_connect_error("Internal error: transport error"));
assert!(retryable_connect_error("connection was not ready"));
assert!(retryable_connect_error("deadline has elapsed"));
assert!(!retryable_connect_error("permission denied"));
}
#[test]
fn auth_connect_retry_delay_is_capped() {
assert_eq!(auth_connect_retry_delay(0), Duration::from_millis(500));
assert_eq!(auth_connect_retry_delay(1), Duration::from_millis(1000));
assert_eq!(auth_connect_retry_delay(2), Duration::from_millis(2000));
assert_eq!(auth_connect_retry_delay(3), Duration::from_millis(4000));
assert_eq!(auth_connect_retry_delay(4), Duration::from_secs(5));
assert_eq!(auth_connect_retry_delay(8), Duration::from_secs(5));
}
}
fn extract_token_from_headers(headers: &HeaderMap) -> Result<String, Status> {
if let Some(auth_header) = headers.get(AUTHORIZATION) {
let auth_str = auth_header

View file

@ -1,24 +1,25 @@
//! Credential storage (access/secret key metadata)
use std::sync::Arc;
use iam_types::{CredentialRecord, Result};
use crate::backend::JsonStore;
use crate::{DynMetadataClient, MetadataClient};
use crate::backend::{Backend, CasResult, JsonStore, StorageBackend};
/// Store for credentials (S3/API keys)
pub struct CredentialStore {
client: DynMetadataClient,
backend: Arc<Backend>,
}
impl JsonStore for CredentialStore {
fn client(&self) -> &dyn MetadataClient {
self.client.as_ref()
fn backend(&self) -> &Backend {
&self.backend
}
}
impl CredentialStore {
pub fn new(client: DynMetadataClient) -> Self {
Self { client }
pub fn new(backend: Arc<Backend>) -> Self {
Self { backend }
}
pub async fn put(&self, record: &CredentialRecord) -> Result<u64> {
@ -36,13 +37,17 @@ impl CredentialStore {
principal_id: &str,
limit: u32,
) -> Result<Vec<CredentialRecord>> {
// scan prefix and filter by principal_id; small cardinality expected
let prefix = b"iam/credentials/";
let items = self.scan_prefix_json::<CredentialRecord>(prefix, limit).await?;
Ok(items
.into_iter()
.filter(|rec| rec.principal_id == principal_id)
.collect())
let items = self.backend.scan_prefix(prefix, limit).await?;
let mut credentials = Vec::new();
for pair in items {
let record: CredentialRecord = serde_json::from_slice(&pair.value)
.map_err(|e| iam_types::Error::Serialization(e.to_string()))?;
if record.principal_id == principal_id {
credentials.push(record);
}
}
Ok(credentials)
}
pub async fn revoke(&self, access_key_id: &str) -> Result<bool> {
@ -56,13 +61,10 @@ impl CredentialStore {
return Ok(false);
}
record.revoked = true;
match self
.cas_json(key.as_bytes(), version, &record)
.await?
{
crate::CasResult::Success(_) => Ok(true),
crate::CasResult::Conflict { .. } => Ok(false),
crate::CasResult::NotFound => Ok(false),
match self.cas_json(key.as_bytes(), version, &record).await? {
CasResult::Success(_) => Ok(true),
CasResult::Conflict { .. } => Ok(false),
CasResult::NotFound => Ok(false),
}
}
}

View file

@ -7,6 +7,7 @@
pub mod backend;
pub mod binding_store;
pub mod credential_store;
pub mod group_store;
pub mod principal_store;
pub mod role_store;
@ -14,6 +15,7 @@ pub mod token_store;
pub use backend::{Backend, BackendConfig, CasResult, KvPair, StorageBackend};
pub use binding_store::BindingStore;
pub use credential_store::CredentialStore;
pub use group_store::GroupStore;
pub use principal_store::PrincipalStore;
pub use role_store::RoleStore;

View file

@ -2,6 +2,8 @@
use serde::{Deserialize, Serialize};
use crate::PrincipalKind;
/// Argon2 parameters used to hash the secret key
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct Argon2Params {
@ -17,6 +19,9 @@ pub struct Argon2Params {
pub struct CredentialRecord {
pub access_key_id: String,
pub principal_id: String,
pub principal_kind: PrincipalKind,
pub org_id: Option<String>,
pub project_id: Option<String>,
pub created_at: u64,
pub expires_at: Option<u64>,
pub revoked: bool,

View file

@ -10,6 +10,7 @@
//! - Error types
pub mod condition;
pub mod credential;
pub mod error;
pub mod policy;
pub mod principal;
@ -19,6 +20,7 @@ pub mod scope;
pub mod token;
pub use condition::{Condition, ConditionExpr};
pub use credential::{Argon2Params, CredentialRecord};
pub use error::{Error, IamError, Result, StorageError};
pub use policy::{CreateBindingRequest, EffectivePolicy, PolicyBinding};
pub use principal::{Principal, PrincipalKind, PrincipalRef};

View file

@ -89,6 +89,14 @@ service IamToken {
rpc RefreshToken(RefreshTokenRequest) returns (RefreshTokenResponse);
}
// IamCredential manages S3-style access/secret key credentials.
service IamCredential {
rpc CreateS3Credential(CreateS3CredentialRequest) returns (CreateS3CredentialResponse);
rpc GetSecretKey(GetSecretKeyRequest) returns (GetSecretKeyResponse);
rpc ListCredentials(ListCredentialsRequest) returns (ListCredentialsResponse);
rpc RevokeCredential(RevokeCredentialRequest) returns (RevokeCredentialResponse);
}
message IssueTokenRequest {
// Principal to issue token for
string principal_id = 1;
@ -162,6 +170,63 @@ message RefreshTokenResponse {
uint64 expires_at = 2;
}
message CreateS3CredentialRequest {
string principal_id = 1;
string description = 2;
optional uint64 expires_at = 3;
optional string org_id = 4;
optional string project_id = 5;
PrincipalKind principal_kind = 6;
}
message CreateS3CredentialResponse {
string access_key_id = 1;
string secret_key = 2;
uint64 created_at = 3;
optional uint64 expires_at = 4;
}
message GetSecretKeyRequest {
string access_key_id = 1;
}
message GetSecretKeyResponse {
string secret_key = 1;
string principal_id = 2;
optional uint64 expires_at = 3;
optional string org_id = 4;
optional string project_id = 5;
PrincipalKind principal_kind = 6;
}
message ListCredentialsRequest {
string principal_id = 1;
}
message Credential {
string access_key_id = 1;
string principal_id = 2;
uint64 created_at = 3;
optional uint64 expires_at = 4;
bool revoked = 5;
string description = 6;
optional string org_id = 7;
optional string project_id = 8;
PrincipalKind principal_kind = 9;
}
message ListCredentialsResponse {
repeated Credential credentials = 1;
}
message RevokeCredentialRequest {
string access_key_id = 1;
}
message RevokeCredentialResponse {
bool success = 1;
}
message InternalTokenClaims {
string principal_id = 1;
PrincipalKind principal_kind = 2;

796
k8shost/Cargo.lock generated

File diff suppressed because it is too large Load diff

588
lightningstor/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -10,6 +10,8 @@ use crate::node::{NodeClientTrait, NodeRegistry};
use crate::placement::{ConsistentHashSelector, NodeSelector};
use async_trait::async_trait;
use bytes::Bytes;
use futures::future::BoxFuture;
use futures::stream::{FuturesUnordered, StreamExt};
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
use lightningstor_types::ObjectId;
use serde::{Deserialize, Serialize};
@ -336,7 +338,7 @@ impl ErasureCodedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
// Try to read all shards in parallel
let mut shard_futures = Vec::with_capacity(self.total_shards());
let mut shard_futures = FuturesUnordered::new();
for shard_idx in 0..self.total_shards() {
let is_parity = shard_idx >= self.data_shards;
let chunk_id = ChunkId::new(object_id, chunk_index, shard_idx, is_parity);
@ -345,35 +347,73 @@ impl ErasureCodedBackend {
let chunk_key = chunk_id.to_key();
shard_futures.push(async move {
// Try to read from the preferred node first
if let Ok(node) = node_selector.select_for_read(&nodes, &chunk_key).await {
if let Ok(data) = node
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
let preferred_id = node_selector
.select_for_read(&nodes, &chunk_key)
.await
.ok()
.map(|node| node.node_id().to_string());
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
FuturesUnordered::new();
if let Some(preferred_id) = preferred_id.as_ref() {
if let Some(preferred) = nodes
.iter()
.find(|node| node.node_id() == preferred_id.as_str())
.cloned()
{
return Some(data);
let key = chunk_key.clone();
readers.push(Box::pin(async move {
preferred
.get_chunk(&key, shard_idx as u32, is_parity)
.await
.ok()
}));
}
}
// Try other nodes if preferred fails
for node in &nodes {
if let Ok(data) = node
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
.await
if preferred_id
.as_ref()
.is_some_and(|preferred| preferred == node.node_id())
{
return Some(data);
continue;
}
let node = node.clone();
let key = chunk_key.clone();
readers.push(Box::pin(async move {
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
}));
}
while let Some(result) = readers.next().await {
if let Some(data) = result {
return (shard_idx, Some(data));
}
}
None
(shard_idx, None)
});
}
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
let mut shard_results = vec![None; self.total_shards()];
let mut available_count = 0usize;
while let Some((shard_idx, shard)) = shard_futures.next().await {
if shard.is_some() {
available_count += 1;
}
shard_results[shard_idx] = shard;
if available_count >= self.data_shards {
break;
}
if available_count + shard_futures.len() < self.data_shards {
break;
}
}
// Count available shards
let available_count = shard_results.iter().filter(|s| s.is_some()).count();
debug!(
object_id = %object_id,
chunk_index,
@ -419,9 +459,9 @@ impl StorageBackend for ErasureCodedBackend {
debug!(object_id = %object_id, size = original_size, "Putting object with erasure coding");
// Split data into chunks
let chunks = self.chunk_manager.split(&data);
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
let chunks = self.chunk_manager.split_with_chunk_size(&data, chunk_size);
let chunk_count = chunks.len();
let chunk_size = self.chunk_manager.chunk_size();
// Write each chunk
for (chunk_idx, chunk_data) in chunks.into_iter().enumerate() {
@ -591,24 +631,78 @@ impl StorageBackend for ErasureCodedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
// Try to read shards
let mut shard_futures = Vec::with_capacity(self.total_shards());
let mut shard_futures = FuturesUnordered::new();
for shard_idx in 0..self.total_shards() {
let is_parity = shard_idx >= self.data_shards;
let key = format!("{}_{}_{}", part_key, shard_idx, if is_parity { "p" } else { "d" });
let nodes = nodes.clone();
let node_selector = self.node_selector.clone();
shard_futures.push(async move {
let preferred_id = node_selector
.select_for_read(&nodes, &key)
.await
.ok()
.map(|node| node.node_id().to_string());
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
FuturesUnordered::new();
if let Some(preferred_id) = preferred_id.as_ref() {
if let Some(preferred) = nodes
.iter()
.find(|node| node.node_id() == preferred_id.as_str())
.cloned()
{
let key = key.clone();
readers.push(Box::pin(async move {
preferred
.get_chunk(&key, shard_idx as u32, is_parity)
.await
.ok()
}));
}
}
for node in &nodes {
if let Ok(data) = node.get_chunk(&key, shard_idx as u32, is_parity).await {
return Some(data);
if preferred_id
.as_ref()
.is_some_and(|preferred| preferred == node.node_id())
{
continue;
}
let node = node.clone();
let key = key.clone();
readers.push(Box::pin(async move {
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
}));
}
while let Some(result) = readers.next().await {
if let Some(data) = result {
return (shard_idx, Some(data));
}
}
None
(shard_idx, None)
});
}
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
let available = shard_results.iter().filter(|s| s.is_some()).count();
let mut shard_results = vec![None; self.total_shards()];
let mut available = 0usize;
while let Some((shard_idx, shard)) = shard_futures.next().await {
if shard.is_some() {
available += 1;
}
shard_results[shard_idx] = shard;
if available >= self.data_shards {
break;
}
if available + shard_futures.len() < self.data_shards {
break;
}
}
if available < self.data_shards {
return Err(StorageError::Backend(format!(
@ -674,7 +768,135 @@ impl StorageBackend for ErasureCodedBackend {
mod tests {
use super::*;
use crate::config::{ChunkConfig, RedundancyMode};
use crate::node::MockNodeRegistry;
use crate::node::{MockNodeClient, MockNodeRegistry, NodeError, NodeResult};
use async_trait::async_trait;
use dashmap::DashMap;
use std::time::{Duration, Instant};
use tokio::time::sleep;
struct SlowReadNodeClient {
node_id: String,
endpoint: String,
delay: Duration,
chunks: DashMap<String, Vec<u8>>,
}
impl SlowReadNodeClient {
fn new(node_id: impl Into<String>, endpoint: impl Into<String>, delay: Duration) -> Self {
Self {
node_id: node_id.into(),
endpoint: endpoint.into(),
delay,
chunks: DashMap::new(),
}
}
fn insert_chunk(&self, chunk_id: impl Into<String>, data: Vec<u8>) {
self.chunks.insert(chunk_id.into(), data);
}
}
#[async_trait]
impl NodeClientTrait for SlowReadNodeClient {
fn node_id(&self) -> &str {
&self.node_id
}
fn endpoint(&self) -> &str {
&self.endpoint
}
async fn is_healthy(&self) -> bool {
true
}
async fn put_chunk(
&self,
chunk_id: &str,
_shard_index: u32,
_is_parity: bool,
data: Bytes,
) -> NodeResult<()> {
self.chunks.insert(chunk_id.to_string(), data.to_vec());
Ok(())
}
async fn get_chunk(
&self,
chunk_id: &str,
_shard_index: u32,
_is_parity: bool,
) -> NodeResult<Bytes> {
sleep(self.delay).await;
self.chunks
.get(chunk_id)
.map(|value| Bytes::from(value.value().clone()))
.ok_or_else(|| NodeError::NotFound(chunk_id.to_string()))
}
async fn delete_chunk(&self, chunk_id: &str) -> NodeResult<()> {
self.chunks.remove(chunk_id);
Ok(())
}
async fn chunk_exists(&self, chunk_id: &str) -> NodeResult<bool> {
Ok(self.chunks.contains_key(chunk_id))
}
async fn chunk_size(&self, chunk_id: &str) -> NodeResult<Option<u64>> {
Ok(self
.chunks
.get(chunk_id)
.map(|value| value.value().len() as u64))
}
async fn ping(&self) -> NodeResult<Duration> {
Ok(Duration::from_millis(1))
}
}
struct FixedNodeRegistry {
nodes: Vec<Arc<dyn NodeClientTrait>>,
}
#[async_trait]
impl NodeRegistry for FixedNodeRegistry {
async fn get_all_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
Ok(self.nodes.clone())
}
async fn get_healthy_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
Ok(self.nodes.clone())
}
async fn register_node(&self, _info: crate::node::NodeInfo) -> NodeResult<()> {
Ok(())
}
async fn deregister_node(&self, _node_id: &str) -> NodeResult<()> {
Ok(())
}
async fn update_health(&self, _node_id: &str, _healthy: bool) -> NodeResult<()> {
Ok(())
}
async fn get_node(&self, node_id: &str) -> NodeResult<Option<Arc<dyn NodeClientTrait>>> {
Ok(self
.nodes
.iter()
.find(|node| node.node_id() == node_id)
.cloned())
}
async fn node_count(&self) -> usize {
self.nodes.len()
}
async fn healthy_node_count(&self) -> usize {
self.nodes.len()
}
}
fn create_ec_config(data_shards: usize, parity_shards: usize) -> DistributedConfig {
DistributedConfig {
@ -858,4 +1080,162 @@ mod tests {
assert_eq!(retrieved.len(), data.len());
assert_eq!(retrieved, data);
}
#[tokio::test]
async fn test_ec_backend_read_returns_after_minimum_shards() {
let config = create_ec_config(4, 2);
let mut fast_nodes = Vec::new();
for index in 0..4 {
fast_nodes.push(Arc::new(MockNodeClient::new(
format!("fast-{index}"),
format!("http://fast-{index}:9002"),
)));
}
let slow_a = Arc::new(SlowReadNodeClient::new(
"slow-a",
"http://slow-a:9002",
Duration::from_millis(250),
));
let slow_b = Arc::new(SlowReadNodeClient::new(
"slow-b",
"http://slow-b:9002",
Duration::from_millis(250),
));
let backend = ErasureCodedBackend::new(
config,
Arc::new(FixedNodeRegistry {
nodes: vec![
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
slow_a.clone() as Arc<dyn NodeClientTrait>,
slow_b.clone() as Arc<dyn NodeClientTrait>,
],
}),
)
.await
.unwrap();
let object_id = ObjectId::new();
let data = Bytes::from(vec![5u8; 512]);
let metadata = ObjectMetadata::new(data.len() as u64, 1, data.len());
let meta_key = ObjectMetadata::metadata_key(&object_id);
let shards = backend.codec.encode(&data).unwrap();
for fast_node in &fast_nodes {
fast_node
.put_chunk(&meta_key, 0, false, Bytes::from(metadata.to_bytes()))
.await
.unwrap();
}
for slow_node in [&slow_a, &slow_b] {
slow_node.insert_chunk(meta_key.clone(), metadata.to_bytes());
}
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
let is_parity = shard_idx >= backend.data_shards;
let key = ChunkId::new(&object_id, 0, shard_idx, is_parity).to_key();
if shard_idx < 4 {
fast_nodes[shard_idx]
.put_chunk(
&key,
shard_idx as u32,
is_parity,
Bytes::from(shard_data),
)
.await
.unwrap();
} else if shard_idx == 4 {
slow_a.insert_chunk(key, shard_data);
} else {
slow_b.insert_chunk(key, shard_data);
}
}
let started = Instant::now();
let retrieved = backend.get_object(&object_id).await.unwrap();
let elapsed = started.elapsed();
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
assert_eq!(retrieved, data);
}
#[tokio::test]
async fn test_ec_backend_get_part_returns_after_minimum_shards() {
let config = create_ec_config(4, 2);
let mut fast_nodes = Vec::new();
for index in 0..4 {
fast_nodes.push(Arc::new(MockNodeClient::new(
format!("fast-{index}"),
format!("http://fast-{index}:9002"),
)));
}
let slow_a = Arc::new(SlowReadNodeClient::new(
"slow-a",
"http://slow-a:9002",
Duration::from_millis(250),
));
let slow_b = Arc::new(SlowReadNodeClient::new(
"slow-b",
"http://slow-b:9002",
Duration::from_millis(250),
));
let backend = ErasureCodedBackend::new(
config,
Arc::new(FixedNodeRegistry {
nodes: vec![
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
slow_a.clone() as Arc<dyn NodeClientTrait>,
slow_b.clone() as Arc<dyn NodeClientTrait>,
],
}),
)
.await
.unwrap();
let upload_id = "upload-latency";
let part_number = 7;
let data = Bytes::from(vec![9u8; 512]);
let shards = backend.codec.encode(&data).unwrap();
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
let is_parity = shard_idx >= backend.data_shards;
let key = format!(
"part_{}_{}_{}_{}",
upload_id,
part_number,
shard_idx,
if is_parity { "p" } else { "d" }
);
if shard_idx < 4 {
fast_nodes[shard_idx]
.put_chunk(
&key,
shard_idx as u32,
is_parity,
Bytes::from(shard_data),
)
.await
.unwrap();
} else if shard_idx == 4 {
slow_a.insert_chunk(key, shard_data);
} else {
slow_b.insert_chunk(key, shard_data);
}
}
let started = Instant::now();
let retrieved = backend.get_part(upload_id, part_number).await.unwrap();
let elapsed = started.elapsed();
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
assert_eq!(retrieved, data);
}
}

View file

@ -5,13 +5,15 @@
use crate::chunk::ChunkManager;
use crate::config::DistributedConfig;
use crate::node::{NodeClientTrait, NodeError, NodeRegistry};
use crate::node::{NodeClientTrait, NodeError, NodeRegistry, NodeResult};
use crate::placement::{ConsistentHashSelector, NodeSelector};
use crate::repair::{RepairQueue, ReplicatedRepairTask};
use async_trait::async_trait;
use bytes::{Bytes, BytesMut};
use futures::stream::{FuturesUnordered, StreamExt};
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
use lightningstor_types::ObjectId;
use std::net::IpAddr;
use std::sync::Arc;
use std::time::Duration;
use tracing::{debug, error, warn};
@ -81,6 +83,8 @@ pub struct ReplicatedBackend {
read_quorum: usize,
/// Write quorum (minimum replicas for successful write)
write_quorum: usize,
/// Durable queue for repairing under-replicated chunks.
repair_queue: Option<Arc<dyn RepairQueue>>,
}
impl ReplicatedBackend {
@ -92,6 +96,15 @@ impl ReplicatedBackend {
pub async fn new(
config: DistributedConfig,
node_registry: Arc<dyn NodeRegistry>,
) -> StorageResult<Self> {
Self::new_with_repair_queue(config, node_registry, None).await
}
/// Create a replicated backend with an optional durable repair queue.
pub async fn new_with_repair_queue(
config: DistributedConfig,
node_registry: Arc<dyn NodeRegistry>,
repair_queue: Option<Arc<dyn RepairQueue>>,
) -> StorageResult<Self> {
let (replica_count, read_quorum, write_quorum) = match &config.redundancy {
crate::config::RedundancyMode::Replicated {
@ -116,6 +129,7 @@ impl ReplicatedBackend {
replica_count,
read_quorum,
write_quorum,
repair_queue,
})
}
@ -134,6 +148,89 @@ impl ReplicatedBackend {
self.write_quorum
}
async fn finalize_pending_replica_writes(
repair_queue: Option<Arc<dyn RepairQueue>>,
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
key: String,
shard_index: u32,
mut success_count: usize,
total_replicas: usize,
reason: String,
) {
let mut errors = Vec::new();
while let Some(result) = pending_writes.next().await {
match result {
Ok((_, Ok(()))) => success_count += 1,
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
Err(join_err) => errors.push(format!("join error: {join_err}")),
}
}
if success_count >= total_replicas {
return;
}
if let Some(queue) = repair_queue {
queue
.enqueue_repair(ReplicatedRepairTask::new(key.clone(), shard_index, reason))
.await;
}
warn!(
chunk_key = %key,
shard_index,
success_count,
total_replicas,
errors = ?errors,
"Replica write completed below desired replication; repair task queued"
);
}
async fn finalize_pending_chunked_write_repairs(
repair_queue: Option<Arc<dyn RepairQueue>>,
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
repair_targets: Vec<(String, u32)>,
object_id: String,
mut success_count: usize,
total_replicas: usize,
reason: String,
) {
let mut errors = Vec::new();
while let Some(result) = pending_writes.next().await {
match result {
Ok((_, Ok(()))) => success_count += 1,
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
Err(join_err) => errors.push(format!("join error: {join_err}")),
}
}
if success_count >= total_replicas {
return;
}
if let Some(queue) = repair_queue {
for (chunk_key, shard_index) in repair_targets {
queue
.enqueue_repair(ReplicatedRepairTask::new(
chunk_key,
shard_index,
reason.clone(),
))
.await;
}
}
warn!(
object_id = %object_id,
success_count,
total_replicas,
errors = ?errors,
"Chunked replica write completed below desired replication; repair tasks queued"
);
}
fn chunk_write_parallelism(&self, chunk_count: usize) -> usize {
chunk_count
.min(
@ -220,7 +317,13 @@ impl ReplicatedBackend {
));
}
if let Ok(preferred) = self.node_selector.select_for_read(nodes, key).await {
let mut ordered_nodes = Self::ordered_read_nodes(nodes, self
.node_selector
.select_for_read(nodes, key)
.await
.ok());
if let Some(preferred) = ordered_nodes.first() {
match preferred.get_chunk(key, shard_index, false).await {
Ok(data) => return Ok(Some(data)),
Err(NodeError::NotFound(_)) => {}
@ -235,7 +338,7 @@ impl ReplicatedBackend {
}
}
for node in nodes {
for node in ordered_nodes.drain(1..) {
match node.get_chunk(key, shard_index, false).await {
Ok(data) => return Ok(Some(data)),
Err(NodeError::NotFound(_)) => continue,
@ -383,6 +486,21 @@ impl ReplicatedBackend {
Ok((_, Ok(()))) => {
success_count += 1;
if success_count >= self.write_quorum {
if success_count < total_replicas {
let pending_writes =
std::mem::replace(&mut write_futures, FuturesUnordered::new());
tokio::spawn(Self::finalize_pending_replica_writes(
self.repair_queue.clone(),
pending_writes,
key.clone(),
shard_index,
success_count,
total_replicas,
format!(
"replica write completed below desired replication after quorum ({success_count}/{total_replicas})"
),
));
}
debug!(
chunk_key = %key,
success_count,
@ -427,13 +545,13 @@ impl ReplicatedBackend {
}
async fn write_chunked_object(&self, object_id: &ObjectId, data: Bytes) -> StorageResult<()> {
let chunk_size = self.chunk_manager.chunk_size();
let chunk_count = self.chunk_manager.chunk_count(data.len());
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
let chunk_count = ChunkManager::chunk_count_for_size(data.len(), chunk_size);
let metadata = ReplicatedObjectMetadata::new(data.len(), chunk_count, chunk_size);
let mut requests = Vec::with_capacity(chunk_count + 1);
for chunk_index in 0..chunk_count {
let chunk_key = Self::object_chunk_key(object_id, chunk_index);
let (start, len) = self.chunk_manager.chunk_range(data.len(), chunk_index);
let (start, len) = ChunkManager::chunk_range_for_size(data.len(), chunk_index, chunk_size);
let chunk_bytes = data.slice(start..start + len);
requests.push((chunk_key, chunk_index as u32, false, chunk_bytes));
}
@ -464,6 +582,27 @@ impl ReplicatedBackend {
Ok((_, Ok(()))) => {
success_count += 1;
if success_count >= self.write_quorum {
if success_count < total_replicas {
let repair_targets = requests
.iter()
.map(|(chunk_key, shard_index, _, _)| {
(chunk_key.clone(), *shard_index)
})
.collect::<Vec<_>>();
let pending_writes =
std::mem::replace(&mut write_futures, FuturesUnordered::new());
tokio::spawn(Self::finalize_pending_chunked_write_repairs(
self.repair_queue.clone(),
pending_writes,
repair_targets,
object_id.to_string(),
success_count,
total_replicas,
format!(
"chunked object write completed below desired replication after quorum ({success_count}/{total_replicas})"
),
));
}
debug!(
object_id = %object_id,
chunk_count,
@ -509,6 +648,150 @@ impl ReplicatedBackend {
)))
}
/// Restore missing replicas for the chunk described by `task`.
///
/// Placement-aware repair: recomputes the desired replica set for the
/// chunk key, determines which of those nodes actually hold the chunk,
/// and copies the data from a surviving replica onto every node that is
/// missing it. If none of the desired nodes has the chunk, falls back to
/// scanning the other healthy nodes for an off-placement copy (e.g. left
/// behind by a placement change).
///
/// Returns `Ok(())` when no replica is missing or when every rewrite
/// succeeded; returns `StorageError::Backend` when no source replica can
/// be found or when any rewrite fails (partial success is still an error
/// so the task can be retried).
pub async fn repair_chunk(&self, task: &ReplicatedRepairTask) -> StorageResult<()> {
    let healthy_nodes = self
        .node_registry
        .get_healthy_nodes()
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    if healthy_nodes.is_empty() {
        return Err(StorageError::Backend(
            "No healthy storage nodes available for repair".to_string(),
        ));
    }
    // Recompute where the chunk *should* live under current placement.
    let desired_nodes = self
        .node_selector
        .select_nodes_for_key(&healthy_nodes, self.replica_count, &task.key)
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    // Partition the desired nodes by whether they already hold the chunk.
    let mut present_nodes = Vec::new();
    let mut missing_nodes = Vec::new();
    for node in desired_nodes {
        match node.chunk_exists(&task.key).await {
            Ok(true) => present_nodes.push(node),
            Ok(false) => missing_nodes.push(node),
            Err(err) => {
                // A replica we cannot inspect is treated as missing so the
                // repair still re-establishes the desired replica count.
                warn!(
                    chunk_key = task.key,
                    node_id = node.node_id(),
                    error = ?err,
                    "Failed to inspect chunk during repair; treating replica as missing"
                );
                missing_nodes.push(node);
            }
        }
    }
    if missing_nodes.is_empty() {
        // All desired replicas already exist; nothing to repair.
        return Ok(());
    }
    if present_nodes.is_empty() {
        // No desired node has the data: probe the remaining healthy nodes
        // for an off-placement copy to use as the source.
        let desired_node_ids = missing_nodes
            .iter()
            .map(|node| node.node_id().to_string())
            .collect::<std::collections::HashSet<_>>();
        for node in healthy_nodes {
            if desired_node_ids.contains(node.node_id()) {
                continue;
            }
            match node.chunk_exists(&task.key).await {
                Ok(true) => {
                    present_nodes.push(node);
                    break;
                }
                Ok(false) => {}
                Err(err) => {
                    warn!(
                        chunk_key = task.key,
                        node_id = node.node_id(),
                        error = ?err,
                        "Failed to inspect off-placement chunk during repair"
                    );
                }
            }
        }
    }
    // Any surviving replica works as the copy source.
    let source = present_nodes.first().ok_or_else(|| {
        StorageError::Backend(format!(
            "Cannot repair {} because no healthy source replica is available",
            task.key
        ))
    })?;
    let data = source
        .get_chunk(&task.key, task.shard_index, false)
        .await
        .map_err(|err| {
            StorageError::Backend(format!(
                "Failed to load repair source for {} from {}: {}",
                task.key,
                source.node_id(),
                err
            ))
        })?;
    // Fan the data out to every missing node concurrently.
    let mut repair_futures = FuturesUnordered::new();
    for node in missing_nodes {
        let node_id = node.node_id().to_string();
        let key = task.key.clone();
        let chunk = data.clone();
        let shard_index = task.shard_index;
        repair_futures.push(tokio::spawn(async move {
            let result = node.put_chunk(&key, shard_index, false, chunk).await;
            (node_id, result)
        }));
    }
    let mut repaired = 0usize;
    let mut errors = Vec::new();
    while let Some(result) = repair_futures.next().await {
        match result {
            Ok((_, Ok(()))) => repaired += 1,
            Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
            Err(join_err) => errors.push(format!("join error: {join_err}")),
        }
    }
    if errors.is_empty() {
        return Ok(());
    }
    // Partial success still reports an error so the task can be retried.
    Err(StorageError::Backend(format!(
        "Repair for {} only restored {} replicas: {}",
        task.key,
        repaired,
        errors.join(", ")
    )))
}
/// Probe every registered node for `key`, regardless of placement.
///
/// Returns `Ok(true)` as soon as any node reports the chunk, `Ok(false)`
/// when none does. A failed probe on a node is logged and treated as
/// "not present there" rather than aborting the whole scan.
pub async fn chunk_exists_anywhere(&self, key: &str) -> StorageResult<bool> {
    let nodes = self
        .node_registry
        .get_all_nodes()
        .await
        .map_err(|e| StorageError::Backend(e.to_string()))?;
    for candidate in nodes {
        match candidate.chunk_exists(key).await {
            Err(err) => {
                warn!(
                    chunk_key = key,
                    node_id = candidate.node_id(),
                    error = ?err,
                    "Failed to inspect chunk while probing global existence"
                );
            }
            Ok(found) => {
                if found {
                    return Ok(true);
                }
            }
        }
    }
    Ok(false)
}
async fn read_chunked_object(
&self,
object_id: &ObjectId,
@ -521,24 +804,47 @@ impl ReplicatedBackend {
.map_err(|e| StorageError::Backend(e.to_string()))?;
if !nodes.is_empty() {
let mut ordered_nodes = Vec::with_capacity(nodes.len());
if let Ok(preferred) = self
let preferred = self
.node_selector
.select_for_read(&nodes, &Self::object_chunk_key(object_id, 0))
.await
.ok();
let ordered_nodes = Self::ordered_read_nodes(&nodes, preferred);
if metadata.chunk_count > 1 {
if let Some(local_node) = ordered_nodes.iter().find(|node| Self::is_local_node(node))
{
ordered_nodes.push(preferred.clone());
let batch_requests: Vec<(String, u32, bool)> = (0..metadata.chunk_count)
.map(|chunk_index| {
(
Self::object_chunk_key(object_id, chunk_index),
chunk_index as u32,
false,
)
})
.collect();
match local_node.batch_get_chunks(batch_requests).await {
Ok(chunks) => {
return Self::assemble_chunked_bytes(
object_id,
metadata.original_size,
chunks,
);
}
Err(err) => {
warn!(
object_id = %object_id,
node_id = local_node.node_id(),
error = ?err,
"Local replica batch read failed, falling back to distributed reads"
);
}
}
for node in nodes {
if ordered_nodes
.iter()
.all(|existing| existing.node_id() != node.node_id())
{
ordered_nodes.push(node);
}
}
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 {
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 && !Self::has_local_node(&ordered_nodes)
{
match self
.read_chunked_object_from_distributed_batches(
object_id,
@ -783,6 +1089,74 @@ impl ReplicatedBackend {
combined.truncate(original_size);
Ok(combined.freeze())
}
/// Build the read preference order for a replica set.
///
/// Priority: a local (loopback-endpoint) replica first, then the
/// selector's preferred node, then every remaining node in the given
/// order. Duplicates (compared by node id) are emitted only once, so the
/// result has at most `nodes.len()` entries.
fn ordered_read_nodes(
    nodes: &[Arc<dyn NodeClientTrait>],
    preferred: Option<Arc<dyn NodeClientTrait>>,
) -> Vec<Arc<dyn NodeClientTrait>> {
    let local = nodes
        .iter()
        .find(|candidate| Self::is_local_node(candidate))
        .cloned();
    // Candidates in priority order; duplicates are filtered below.
    let candidates = local
        .into_iter()
        .chain(preferred)
        .chain(nodes.iter().cloned());
    let mut ordering: Vec<Arc<dyn NodeClientTrait>> = Vec::with_capacity(nodes.len());
    for candidate in candidates {
        let already_present = ordering
            .iter()
            .any(|existing| existing.node_id() == candidate.node_id());
        if !already_present {
            ordering.push(candidate);
        }
    }
    ordering
}
/// True when at least one node in the slice resolves to a local endpoint.
fn has_local_node(nodes: &[Arc<dyn NodeClientTrait>]) -> bool {
    nodes.iter().any(|candidate| Self::is_local_node(candidate))
}
/// True when the node's advertised endpoint points at this machine.
fn is_local_node(node: &Arc<dyn NodeClientTrait>) -> bool {
    let endpoint = node.endpoint();
    Self::endpoint_is_local(endpoint)
}
/// Decide whether `endpoint` (e.g. "http://127.0.0.1:9002") refers to
/// this machine: the host is "localhost" or parses to a loopback IP.
///
/// Handles an optional scheme prefix, a path suffix, an optional port,
/// and bracketed IPv6 literals such as "[::1]:443". Hostnames other than
/// "localhost" that fail to parse as an IP are treated as remote.
fn endpoint_is_local(endpoint: &str) -> bool {
    // Strip the scheme ("http://") and any path to isolate the authority.
    let without_scheme = match endpoint.split_once("://") {
        Some((_, rest)) => rest,
        None => endpoint,
    };
    let authority = without_scheme.split('/').next().unwrap_or(without_scheme);
    // Extract the host, treating "[...]" as an IPv6 literal with an
    // optional trailing ":port"; otherwise split a trailing ":port" off.
    let host = if let Some(stripped) = authority.strip_prefix('[') {
        match stripped.split_once(']') {
            Some((inner, _)) => inner,
            None => authority.trim_matches(['[', ']']),
        }
    } else if let Some((name, _)) = authority.rsplit_once(':') {
        name
    } else {
        authority
    };
    if host.eq_ignore_ascii_case("localhost") {
        return true;
    }
    matches!(host.parse::<IpAddr>(), Ok(ip) if ip.is_loopback())
}
}
#[async_trait]
@ -908,12 +1282,25 @@ mod tests {
use super::*;
use crate::config::RedundancyMode;
use crate::node::{MockNodeRegistry, NodeError, NodeResult};
use crate::repair::RepairQueue;
use async_trait::async_trait;
use dashmap::DashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio::time::sleep;
/// Test double that records every enqueued repair task for inspection.
#[derive(Default)]
struct CapturingRepairQueue {
    // Keyed by task id, so re-enqueueing the same chunk overwrites the
    // earlier entry (mirrors the dedup behavior of the id scheme).
    tasks: DashMap<String, ReplicatedRepairTask>,
}
#[async_trait]
impl RepairQueue for CapturingRepairQueue {
    /// Record the task; a later enqueue with the same id replaces it.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
        self.tasks.insert(task.id.clone(), task);
    }
}
struct SlowNodeClient {
node_id: String,
endpoint: String,
@ -1196,6 +1583,115 @@ mod tests {
assert!(result.is_err());
}
#[tokio::test]
async fn test_under_replicated_write_enqueues_repair_task() {
    let config = create_replicated_config(3);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    // One of the three replicas rejects writes, so the put succeeds on
    // quorum but completes under-replicated (2/3).
    nodes[2].set_fail_puts(true);
    let repair_queue = Arc::new(CapturingRepairQueue::default());
    let backend = ReplicatedBackend::new_with_repair_queue(
        config,
        registry,
        Some(repair_queue.clone()),
    )
    .await
    .unwrap();
    let object_id = ObjectId::new();
    backend
        .put_object(&object_id, Bytes::from_static(b"repair-me"))
        .await
        .unwrap();
    // The repair task is enqueued by a background task after quorum
    // returns, so poll briefly instead of asserting immediately.
    let mut task = None;
    for _ in 0..20 {
        task = repair_queue
            .tasks
            .iter()
            .next()
            .map(|entry| entry.value().clone());
        if task.is_some() {
            break;
        }
        sleep(Duration::from_millis(10)).await;
    }
    let task = task.expect("repair task should be queued");
    assert_eq!(task.key, ReplicatedBackend::object_key(&object_id));
    assert_eq!(task.shard_index, 0);
}
#[tokio::test]
async fn test_repair_chunk_restores_missing_replica() {
    let config = create_replicated_config(3);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    let backend = ReplicatedBackend::new(config, registry.clone())
        .await
        .unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![11u8; 128]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let key = ReplicatedBackend::object_key(&object_id);
    // Pick any node that currently holds a replica...
    let mut missing = None;
    for node in &nodes {
        if node.chunk_exists(&key).await.unwrap() {
            missing = Some(node.clone());
            break;
        }
    }
    let missing = missing.expect("at least one replica should exist");
    // ...and delete its copy to simulate a lost replica.
    missing.delete_chunk(&key).await.unwrap();
    assert!(!missing.chunk_exists(&key).await.unwrap());
    let task = ReplicatedRepairTask::new(key.clone(), 0, "test");
    backend.repair_chunk(&task).await.unwrap();
    // Repair must re-create the deleted replica on that node.
    assert!(missing.chunk_exists(&key).await.unwrap());
}
#[tokio::test]
async fn test_repair_chunk_can_source_from_off_placement_replica() {
    // replica_count = 2 but 3 nodes exist, so one node is off-placement
    // for any given key.
    let config = create_replicated_config(2);
    let registry = Arc::new(MockNodeRegistry::with_nodes(3));
    let nodes = registry.all_mock_nodes();
    let backend = ReplicatedBackend::new(config, registry.clone())
        .await
        .unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![23u8; 128]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let key = ReplicatedBackend::object_key(&object_id);
    let desired_nodes = backend.select_replica_nodes_for_key(&key).await.unwrap();
    assert_eq!(desired_nodes.len(), 2);
    // Find the node that placement did NOT select for this key.
    let off_placement = nodes
        .iter()
        .find(|node| {
            desired_nodes
                .iter()
                .all(|desired| desired.node_id() != node.node_id())
        })
        .cloned()
        .expect("off-placement node should exist");
    // Copy the chunk onto the off-placement node, then wipe every desired
    // replica so repair has no in-placement source to copy from.
    let source_bytes = desired_nodes[0].get_chunk(&key, 0, false).await.unwrap();
    off_placement.put_chunk(&key, 0, false, source_bytes).await.unwrap();
    for node in &desired_nodes {
        node.delete_chunk(&key).await.unwrap();
        assert!(!node.chunk_exists(&key).await.unwrap());
    }
    let task = ReplicatedRepairTask::new(key.clone(), 0, "off-placement-source");
    backend.repair_chunk(&task).await.unwrap();
    // Both desired replicas must be restored from the stray copy.
    for node in &desired_nodes {
        assert!(node.chunk_exists(&key).await.unwrap());
    }
}
#[tokio::test]
async fn test_replicated_backend_returns_after_quorum_without_waiting_for_slow_replica() {
let config = create_replicated_config(3);
@ -1333,6 +1829,43 @@ mod tests {
.is_none());
}
#[tokio::test]
async fn test_replicated_backend_prefers_local_replica_for_chunked_reads() {
    let mut config = create_replicated_config(3);
    // Small chunk size forces the 256-byte object into multiple chunks.
    config.chunk.chunk_size = 64;
    // One replica advertises a loopback endpoint; the other two inject
    // 250 ms of latency per request.
    let local = Arc::new(crate::node::MockNodeClient::new(
        "local",
        "http://127.0.0.1:9002",
    ));
    let slow_a = Arc::new(SlowNodeClient::new(
        "slow-a",
        "http://slow-a:9002",
        Duration::from_millis(250),
    ));
    let slow_b = Arc::new(SlowNodeClient::new(
        "slow-b",
        "http://slow-b:9002",
        Duration::from_millis(250),
    ));
    let registry = Arc::new(FixedNodeRegistry {
        nodes: vec![slow_a.clone(), slow_b.clone(), local.clone()],
    });
    let backend = ReplicatedBackend::new(config, registry).await.unwrap();
    let object_id = ObjectId::new();
    let data = Bytes::from(vec![5u8; 256]);
    backend.put_object(&object_id, data.clone()).await.unwrap();
    let started = Instant::now();
    let retrieved = backend.get_object(&object_id).await.unwrap();
    let elapsed = started.elapsed();
    assert_eq!(retrieved, data);
    // Reading via a slow node would cost at least 250 ms, so finishing
    // well under that proves the local replica served the read.
    assert!(elapsed < Duration::from_millis(150), "elapsed={elapsed:?}");
    // All four chunks (256 / 64) should have come from the local node.
    assert!(local.get_count() >= 4);
}
#[tokio::test]
async fn test_replicated_backend_object_size() {
let config = create_replicated_config(3);

View file

@ -5,6 +5,8 @@
use crate::config::ChunkConfig;
const TARGET_CHUNK_COUNT_PER_OBJECT: usize = 8;
/// Manages chunk operations for large objects
#[derive(Debug, Clone)]
pub struct ChunkManager {
@ -27,18 +29,42 @@ impl ChunkManager {
self.config.chunk_size
}
/// Choose the effective chunk size for an object of the given size.
///
/// Small objects keep the configured default chunk size. Larger objects
/// scale up to keep per-object chunk counts bounded without exceeding the
/// configured maximum.
pub fn effective_chunk_size(&self, total_size: usize) -> usize {
    if total_size == 0 {
        return self.config.chunk_size;
    }
    let default_size = self.config.chunk_size;
    // Derive bounds so that lower <= default <= upper always holds, even
    // when the configured min/max are inconsistent with the default.
    let lower_bound = self.config.min_chunk_size.min(default_size).max(1);
    let upper_bound = self.config.max_chunk_size.max(default_size);
    // Aim for TARGET_CHUNK_COUNT_PER_OBJECT chunks, rounded up to a
    // multiple of the lower bound so chunk sizes stay aligned.
    let target = total_size.div_ceil(TARGET_CHUNK_COUNT_PER_OBJECT);
    let aligned = target.div_ceil(lower_bound) * lower_bound;
    aligned.max(default_size).clamp(lower_bound, upper_bound)
}
/// Split data into chunks
///
/// Returns a vector of chunks. Each chunk is at most `chunk_size` bytes,
/// except the last chunk which may be smaller.
pub fn split(&self, data: &[u8]) -> Vec<Vec<u8>> {
    // Delegates to the explicit-size variant with the configured default.
    self.split_with_chunk_size(data, self.config.chunk_size)
}
/// Split data into chunks using an explicit chunk size.
pub fn split_with_chunk_size(&self, data: &[u8], chunk_size: usize) -> Vec<Vec<u8>> {
if data.is_empty() {
return vec![vec![]];
}
data.chunks(self.config.chunk_size)
.map(|c| c.to_vec())
.collect()
data.chunks(chunk_size).map(|c| c.to_vec()).collect()
}
/// Reassemble chunks into original data
@ -50,21 +76,33 @@ impl ChunkManager {
/// Calculate the number of chunks for a given data size
pub fn chunk_count(&self, size: usize) -> usize {
    // Delegates to the static variant with the configured default size.
    Self::chunk_count_for_size(size, self.config.chunk_size)
}
/// Number of chunks needed to hold `size` bytes at `chunk_size` bytes per
/// chunk (ceiling division).
///
/// A zero-size payload still occupies one (empty) chunk, matching
/// `split_with_chunk_size`. `chunk_size` must be non-zero.
pub fn chunk_count_for_size(size: usize, chunk_size: usize) -> usize {
    if size == 0 {
        return 1;
    }
    size.div_ceil(chunk_size)
}
/// Calculate the size of a specific chunk
///
/// Returns the size of the chunk at the given index for data of the given total size.
pub fn chunk_size_at(&self, total_size: usize, chunk_index: usize) -> usize {
    // Delegates to the static variant with the configured default size.
    Self::chunk_size_at_for_size(total_size, chunk_index, self.config.chunk_size)
}
pub fn chunk_size_at_for_size(
total_size: usize,
chunk_index: usize,
chunk_size: usize,
) -> usize {
let full_chunks = total_size / chunk_size;
let remainder = total_size % chunk_size;
if chunk_index < full_chunks {
self.config.chunk_size
chunk_size
} else if chunk_index == full_chunks && remainder > 0 {
remainder
} else {
@ -76,8 +114,16 @@ impl ChunkManager {
/// Calculate the byte range of a specific chunk
///
/// Returns (start_offset, length) for the chunk at the given index.
pub fn chunk_range(&self, total_size: usize, chunk_index: usize) -> (usize, usize) {
    // Delegates to the static variant with the configured default size.
    Self::chunk_range_for_size(total_size, chunk_index, self.config.chunk_size)
}
/// Byte range of the chunk at `chunk_index` for a payload of
/// `total_size` bytes split at `chunk_size`: returns (start, length).
pub fn chunk_range_for_size(
    total_size: usize,
    chunk_index: usize,
    chunk_size: usize,
) -> (usize, usize) {
    let offset = chunk_index * chunk_size;
    (
        offset,
        Self::chunk_size_at_for_size(total_size, chunk_index, chunk_size),
    )
}
}
@ -257,6 +303,15 @@ mod tests {
assert_eq!(manager.chunk_range(2500, 2), (2048, 452));
}
#[test]
fn test_effective_chunk_size_scales_large_objects_up_to_target_chunk_count() {
    let manager = ChunkManager::default();
    // Small objects stay at the default chunk size (presumably 8 MiB in
    // the default ChunkConfig — confirm against config defaults).
    assert_eq!(manager.effective_chunk_size(4 * 1024 * 1024), 8 * 1024 * 1024);
    // 256 MiB / 8 target chunks -> 32 MiB per chunk.
    assert_eq!(manager.effective_chunk_size(256 * 1024 * 1024), 32 * 1024 * 1024);
    // 1 GiB would want 128 MiB chunks but is capped (presumably by the
    // configured max_chunk_size of 64 MiB — confirm against config).
    assert_eq!(manager.effective_chunk_size(1024 * 1024 * 1024), 64 * 1024 * 1024);
}
#[test]
fn test_chunk_id_to_key() {
let id = ChunkId::data_shard("obj123", 0, 2);

View file

@ -65,12 +65,14 @@ pub mod config;
pub mod erasure;
pub mod node;
pub mod placement;
pub mod repair;
// Re-export commonly used types
pub use backends::{ErasureCodedBackend, ReplicatedBackend};
pub use config::{BucketStorageConfig, ChunkConfig, DistributedConfig, RedundancyMode};
pub use node::{MockNodeClient, MockNodeRegistry, NodeRegistry, StaticNodeRegistry};
pub use placement::{ConsistentHashSelector, NodeSelector, RandomSelector, RoundRobinSelector};
pub use repair::{RepairQueue, ReplicatedRepairTask};
#[cfg(test)]
mod tests {

View file

@ -0,0 +1,58 @@
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::time::{SystemTime, UNIX_EPOCH};
/// A serializable request to restore replicas of a single chunk.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ReplicatedRepairTask {
    /// Stable identifier derived from key + shard index; doubles as a
    /// dedup key when tasks are stored in a map.
    pub id: String,
    /// Chunk key whose replicas need repair.
    pub key: String,
    /// Shard index passed through to chunk get/put operations.
    pub shard_index: u32,
    /// Human-readable explanation of why the repair was queued.
    pub reason: String,
    /// Enqueue time, in milliseconds since the Unix epoch.
    pub enqueued_at_millis: u64,
    /// Failed attempts so far; `default` keeps older serialized records
    /// (without this field) deserializable.
    #[serde(default)]
    pub attempt_count: u32,
    /// Message from the most recent failed attempt, if any.
    #[serde(default)]
    pub last_error: Option<String>,
    /// Earliest time (epoch millis) at which the next attempt may run.
    #[serde(default)]
    pub next_attempt_after_millis: u64,
}
impl ReplicatedRepairTask {
    /// Milliseconds since the Unix epoch; saturates to 0 if the clock is
    /// before the epoch.
    fn unix_millis_now() -> u64 {
        SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .unwrap_or_default()
            .as_millis() as u64
    }

    /// Create a fresh task for `key`/`shard_index`, due immediately.
    ///
    /// The id is derived from key and shard index, so re-enqueueing the
    /// same chunk produces the same id (enabling dedup by id).
    pub fn new(key: impl Into<String>, shard_index: u32, reason: impl Into<String>) -> Self {
        let key = key.into();
        let timestamp = Self::unix_millis_now();
        Self {
            id: format!("replicated::{key}::{shard_index}"),
            key,
            shard_index,
            reason: reason.into(),
            enqueued_at_millis: timestamp,
            attempt_count: 0,
            last_error: None,
            next_attempt_after_millis: timestamp,
        }
    }

    /// Record a failed attempt and push the next retry out by
    /// `backoff_millis` from now (saturating on overflow).
    pub fn schedule_retry(&mut self, error: impl Into<String>, backoff_millis: u64) {
        self.attempt_count = self.attempt_count.saturating_add(1);
        self.last_error = Some(error.into());
        self.next_attempt_after_millis = Self::unix_millis_now().saturating_add(backoff_millis);
    }

    /// Whether the task is eligible to run at `now_millis`.
    pub fn is_due(&self, now_millis: u64) -> bool {
        self.next_attempt_after_millis <= now_millis
    }
}
/// Sink for repair tasks produced by under-replicated writes.
#[async_trait]
pub trait RepairQueue: Send + Sync {
    /// Persist or buffer `task` for later processing by a repair worker.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask);
}

View file

@ -1,13 +1,18 @@
//! Local chunk storage
use dashmap::DashMap;
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;
use std::sync::atomic::{AtomicU64, Ordering};
use thiserror::Error;
use tokio::fs;
use tokio::io::AsyncWriteExt;
use tokio::sync::Mutex;
use tracing::debug;
const WRITE_LOCK_STRIPES: usize = 256;
/// Errors from chunk storage operations
#[derive(Debug, Error)]
pub enum StorageError {
@ -45,6 +50,12 @@ pub struct LocalChunkStore {
/// Whether writes should be flushed before they are acknowledged.
sync_on_write: bool,
/// Monotonic nonce for per-write temporary paths.
temp_file_nonce: AtomicU64,
/// Striped per-chunk write/delete locks to keep same-key updates coherent.
write_locks: Vec<Mutex<()>>,
}
impl LocalChunkStore {
@ -65,6 +76,8 @@ impl LocalChunkStore {
max_capacity,
chunk_count: AtomicU64::new(0),
sync_on_write,
temp_file_nonce: AtomicU64::new(0),
write_locks: (0..WRITE_LOCK_STRIPES).map(|_| Mutex::new(())).collect(),
};
// Scan existing chunks
@ -91,7 +104,7 @@ impl LocalChunkStore {
if metadata.is_file() {
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
if name.ends_with(".tmp") {
if name.ends_with(".tmp") || name.starts_with(".tmp.") {
continue;
}
@ -131,6 +144,25 @@ impl LocalChunkStore {
self.data_dir.join(safe_id)
}
/// Build a unique temporary path next to the final chunk path.
///
/// The name combines the target file name, the process id, and a
/// process-wide monotonically increasing nonce, so concurrent writers to
/// the same chunk never collide on a temp file.
fn temporary_chunk_path(&self, path: &std::path::Path) -> PathBuf {
    let nonce = self.temp_file_nonce.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let file_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "chunk",
    };
    let directory = path.parent().unwrap_or(&self.data_dir);
    directory.join(format!(".tmp.{file_name}.{pid}.{nonce}"))
}
/// Pick the lock stripe for `chunk_id` via a stable hash of the id, so
/// all writers/deleters of the same chunk serialize on one mutex.
fn write_lock(&self, chunk_id: &str) -> &Mutex<()> {
    let mut hasher = DefaultHasher::new();
    chunk_id.hash(&mut hasher);
    let stripes = self.write_locks.len().max(1);
    let stripe = (hasher.finish() as usize) % stripes;
    &self.write_locks[stripe]
}
async fn resolve_existing_chunk_path(&self, chunk_id: &str) -> StorageResult<PathBuf> {
if let Some(path) = self.chunk_paths.get(chunk_id) {
return Ok(path.clone());
@ -154,6 +186,7 @@ impl LocalChunkStore {
/// Store a chunk
pub async fn put(&self, chunk_id: &str, data: &[u8]) -> StorageResult<u64> {
let _guard = self.write_lock(chunk_id).lock().await;
let size = data.len() as u64;
// Check if replacing existing chunk
@ -169,7 +202,7 @@ impl LocalChunkStore {
}
let path = self.chunk_path(chunk_id);
let temp_path = path.with_extension(".tmp");
let temp_path = self.temporary_chunk_path(&path);
if let Some(parent) = path.parent() {
// Multipart uploads fan out concurrent writes into the same shard
// directory. Create the parent path unconditionally so no writer can
@ -217,6 +250,7 @@ impl LocalChunkStore {
/// Delete a chunk
pub async fn delete(&self, chunk_id: &str) -> StorageResult<()> {
let _guard = self.write_lock(chunk_id).lock().await;
if let Some((_, size)) = self.chunk_sizes.remove(chunk_id) {
let path = match self.chunk_paths.remove(chunk_id) {
Some((_, path)) => path,
@ -421,4 +455,34 @@ mod tests {
assert_eq!(store.chunk_count(), 16);
}
#[tokio::test]
async fn test_concurrent_rewrites_same_chunk_use_unique_temp_paths() {
    let (store, _temp) = create_test_store().await;
    let store = Arc::new(store);
    // 8 writer tasks plus this task all rendezvous before writing.
    let barrier = Arc::new(Barrier::new(9));
    let mut tasks = Vec::new();
    for idx in 0..8u8 {
        let store = Arc::clone(&store);
        let barrier = Arc::clone(&barrier);
        tasks.push(tokio::spawn(async move {
            let payload = vec![idx; 2048];
            barrier.wait().await;
            store.put("shared-chunk", &payload).await.unwrap();
            payload
        }));
    }
    barrier.wait().await;
    let mut expected_payloads = Vec::new();
    for task in tasks {
        expected_payloads.push(task.await.unwrap());
    }
    // Which writer wins is unspecified, but the stored bytes must match
    // one writer's payload exactly (no torn/interleaved writes) and the
    // store must count exactly one chunk.
    let stored = store.get("shared-chunk").await.unwrap();
    assert!(expected_payloads.iter().any(|payload| payload == &stored));
    assert_eq!(store.chunk_count(), 1);
}
}

View file

@ -17,6 +17,7 @@ lightningstor-distributed = { workspace = true }
lightningstor-storage = { workspace = true }
chainfire-client = { path = "../../../chainfire/chainfire-client" }
flaredb-client = { path = "../../../flaredb/crates/flaredb-client" }
iam-api = { path = "../../../iam/crates/iam-api" }
iam-service-auth = { path = "../../../iam/crates/iam-service-auth" }
tonic = { workspace = true }
tonic-health = { workspace = true }

View file

@ -9,8 +9,11 @@ mod bucket_service;
pub mod config;
pub mod metadata;
mod object_service;
pub mod repair;
pub mod s3;
pub mod tenant;
pub use bucket_service::BucketServiceImpl;
pub use config::ServerConfig;
pub use object_service::ObjectServiceImpl;
pub use repair::{MetadataRepairQueue, spawn_replicated_repair_worker};

View file

@ -5,11 +5,13 @@ use clap::Parser;
use iam_service_auth::AuthService;
use lightningstor_api::{BucketServiceServer, ObjectServiceServer};
use lightningstor_distributed::{
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, StaticNodeRegistry,
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, RepairQueue,
StaticNodeRegistry,
};
use lightningstor_server::{
config::{MetadataBackend, ObjectStorageBackend},
metadata::MetadataStore,
repair::{spawn_replicated_repair_worker, MetadataRepairQueue},
s3, BucketServiceImpl, ObjectServiceImpl, ServerConfig,
};
use lightningstor_storage::{LocalFsBackend, StorageBackend};
@ -28,6 +30,12 @@ const OBJECT_GRPC_INITIAL_STREAM_WINDOW: u32 = 64 * 1024 * 1024;
const OBJECT_GRPC_INITIAL_CONNECTION_WINDOW: u32 = 512 * 1024 * 1024;
const OBJECT_GRPC_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
const OBJECT_GRPC_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(10);
const REPLICATED_REPAIR_SCAN_INTERVAL: Duration = Duration::from_secs(5);
/// Bundles the object storage backend with its optional background
/// repair worker so `main` can keep the worker alive for the server's
/// lifetime.
struct StorageRuntime {
    /// The active object storage backend implementation.
    backend: Arc<dyn StorageBackend>,
    /// Background task draining the replicated repair queue; `None` for
    /// backends that run no repairs (local fs, erasure-coded).
    repair_worker: Option<tokio::task::JoinHandle<()>>,
}
/// LightningStor object storage server
#[derive(Parser, Debug)]
@ -148,8 +156,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
metrics_addr
);
let storage = create_storage_backend(&config).await?;
if let Some(endpoint) = &config.chainfire_endpoint {
tracing::info!(" Cluster coordination: ChainFire @ {}", endpoint);
let endpoint = endpoint.clone();
@ -204,6 +210,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
}
};
let storage_runtime = create_storage_backend(&config, metadata.clone()).await?;
let storage = storage_runtime.backend.clone();
let _repair_worker = storage_runtime.repair_worker;
// Initialize IAM authentication service
tracing::info!(
"Connecting to IAM server at {}",
@ -253,7 +263,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
let s3_addr: SocketAddr = config.s3_addr;
// Start S3 HTTP server with shared state
let s3_router = s3::create_router_with_state(storage.clone(), metadata.clone());
let s3_router = s3::create_router_with_auth(
storage.clone(),
metadata.clone(),
Some(config.auth.iam_server_addr.clone()),
);
let s3_server = tokio::spawn(async move {
tracing::info!("S3 HTTP server listening on {}", s3_addr);
let listener = tokio::net::TcpListener::bind(s3_addr).await.unwrap();
@ -422,24 +436,27 @@ async fn register_chainfire_membership(
async fn create_storage_backend(
config: &ServerConfig,
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
metadata: Arc<MetadataStore>,
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
match config.object_storage_backend {
ObjectStorageBackend::LocalFs => {
tracing::info!("Object storage backend: local_fs");
Ok(Arc::new(
LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?,
))
Ok(StorageRuntime {
backend: Arc::new(LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?),
repair_worker: None,
})
}
ObjectStorageBackend::Distributed => {
tracing::info!("Object storage backend: distributed");
create_distributed_storage_backend(&config.distributed).await
create_distributed_storage_backend(&config.distributed, metadata).await
}
}
}
async fn create_distributed_storage_backend(
config: &DistributedConfig,
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
metadata: Arc<MetadataStore>,
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
let endpoints: Vec<String> = config
.node_endpoints
.iter()
@ -501,9 +518,25 @@ async fn create_distributed_storage_backend(
write_quorum,
"Using replicated LightningStor storage backend"
);
Ok(Arc::new(
ReplicatedBackend::new(config.clone(), registry).await?,
))
let repair_queue: Arc<dyn RepairQueue> =
Arc::new(MetadataRepairQueue::new(metadata.clone()));
let backend = Arc::new(
ReplicatedBackend::new_with_repair_queue(
config.clone(),
registry,
Some(repair_queue),
)
.await?,
);
let repair_worker = Some(spawn_replicated_repair_worker(
metadata,
backend.clone(),
REPLICATED_REPAIR_SCAN_INTERVAL,
));
Ok(StorageRuntime {
backend,
repair_worker,
})
}
RedundancyMode::ErasureCoded {
data_shards,
@ -514,9 +547,10 @@ async fn create_distributed_storage_backend(
parity_shards,
"Using erasure-coded LightningStor storage backend"
);
Ok(Arc::new(
ErasureCodedBackend::new(config.clone(), registry).await?,
))
Ok(StorageRuntime {
backend: Arc::new(ErasureCodedBackend::new(config.clone(), registry).await?),
repair_worker: None,
})
}
RedundancyMode::None => Err(std::io::Error::other(
"distributed object storage does not support redundancy.type=none; use object_storage_backend=local_fs instead",

View file

@ -2,6 +2,7 @@
use dashmap::DashMap;
use flaredb_client::RdbClient;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{Bucket, BucketId, MultipartUpload, Object, ObjectId, Result};
use serde_json;
use sqlx::pool::PoolOptions;
@ -215,6 +216,12 @@ impl MetadataStore {
end_key
}
/// Smallest key strictly greater than `key` in lexicographic byte order
/// (key + 0x00), used as an exclusive lower bound for range scans.
fn exclusive_scan_start(key: &[u8]) -> Vec<u8> {
    let mut successor = Vec::with_capacity(key.len() + 1);
    successor.extend_from_slice(key);
    successor.push(0);
    successor
}
fn flaredb_client_for_key<'a>(
clients: &'a [Arc<Mutex<RdbClient>>],
key: &[u8],
@ -422,6 +429,56 @@ impl MetadataStore {
Ok(results)
}
/// Fetch one page of `(key, value)` pairs under `prefix` from FlareDB.
///
/// `start_after` is an exclusive cursor: scanning resumes just past it.
/// Returns the page plus a flag saying whether more rows remain; the scan
/// over-fetches one row so "has more" is decided without a second round
/// trip. Falls back to a strongly-consistent scan when the server
/// requires it.
async fn flaredb_scan_page(
    clients: &[Arc<Mutex<RdbClient>>],
    prefix: &[u8],
    start_after: Option<&[u8]>,
    limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
    let end_key = Self::prefix_end(prefix);
    // Start just past the cursor when present, else at the prefix itself.
    let start_key = start_after
        .map(Self::exclusive_scan_start)
        .unwrap_or_else(|| prefix.to_vec());
    // Over-fetch by one row to detect whether another page exists.
    let fetch_limit = limit.saturating_add(1).max(1);
    let client = Self::flaredb_scan_client(clients);
    let (mut items, next) = match {
        // Hold the client lock only for the duration of the RPC.
        let mut c = client.lock().await;
        c.raw_scan(start_key.clone(), end_key.clone(), fetch_limit).await
    } {
        Ok((keys, values, next)) => {
            let items = keys
                .into_iter()
                .zip(values.into_iter())
                .map(|(key, value)| {
                    (
                        String::from_utf8_lossy(&key).to_string(),
                        String::from_utf8_lossy(&value).to_string(),
                    )
                })
                .collect::<Vec<_>>();
            (items, next)
        }
        Err(status) if Self::flaredb_requires_strong(&status) => {
            Self::flaredb_scan_strong(client, &start_key, &end_key, fetch_limit).await?
        }
        Err(error) => {
            return Err(lightningstor_types::Error::StorageError(format!(
                "FlareDB scan failed: {}",
                error
            )));
        }
    };
    // Trim the sentinel row; if it was returned, more data surely exists.
    let has_more = if items.len() > limit as usize {
        items.truncate(limit as usize);
        true
    } else {
        next.is_some()
    };
    Ok((items, has_more))
}
async fn flaredb_has_prefix(clients: &[Arc<Mutex<RdbClient>>], prefix: &[u8]) -> Result<bool> {
let end_key = Self::prefix_end(prefix);
let client = Self::flaredb_scan_client(clients);
@ -613,11 +670,146 @@ impl MetadataStore {
results.push((entry.key().clone(), entry.value().clone()));
}
}
results.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
Ok(results)
}
}
}
/// Fetch one page of `(key, value)` pairs whose keys start with `prefix`,
/// ordered by key, from whichever backend is configured.
///
/// `start_after` is an exclusive cursor (only strictly greater keys are
/// returned); `limit` caps the page size. The returned flag is true when
/// more rows remain past this page. Every backend over-fetches one row to
/// decide "has more" without a second query.
async fn get_prefix_page(
    &self,
    prefix: &str,
    start_after: Option<&str>,
    limit: u32,
) -> Result<(Vec<(String, String)>, bool)> {
    if limit == 0 {
        return Ok((Vec::new(), false));
    }
    match &self.backend {
        StorageBackend::FlareDB(client) => {
            Self::flaredb_scan_page(
                client,
                prefix.as_bytes(),
                start_after.map(str::as_bytes),
                limit,
            )
            .await
        }
        StorageBackend::Sql(sql) => {
            // Exclusive upper bound for the range: smallest string greater
            // than every key sharing this prefix.
            let prefix_end = String::from_utf8(Self::prefix_end(prefix.as_bytes())).map_err(|e| {
                lightningstor_types::Error::StorageError(format!(
                    "Failed to encode prefix end: {}",
                    e
                ))
            })?;
            // +1 sentinel row to detect a following page.
            let fetch_limit = (limit.saturating_add(1)) as i64;
            match sql {
                SqlStorageBackend::Postgres(pool) => {
                    let rows: Vec<(String, String)> = if let Some(after) = start_after {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= $1 AND key < $2 AND key > $3
                             ORDER BY key
                             LIMIT $4",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(after)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "Postgres paged scan failed: {}",
                                e
                            ))
                        })?
                    } else {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= $1 AND key < $2
                             ORDER BY key
                             LIMIT $3",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "Postgres paged scan failed: {}",
                                e
                            ))
                        })?
                    };
                    let has_more = rows.len() > limit as usize;
                    let items = rows.into_iter().take(limit as usize).collect();
                    Ok((items, has_more))
                }
                SqlStorageBackend::Sqlite(pool) => {
                    let rows: Vec<(String, String)> = if let Some(after) = start_after {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= ?1 AND key < ?2 AND key > ?3
                             ORDER BY key
                             LIMIT ?4",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(after)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "SQLite paged scan failed: {}",
                                e
                            ))
                        })?
                    } else {
                        sqlx::query_as(
                            "SELECT key, value FROM metadata_kv
                             WHERE key >= ?1 AND key < ?2
                             ORDER BY key
                             LIMIT ?3",
                        )
                        .bind(prefix)
                        .bind(&prefix_end)
                        .bind(fetch_limit)
                        .fetch_all(pool.as_ref())
                        .await
                        .map_err(|e| {
                            lightningstor_types::Error::StorageError(format!(
                                "SQLite paged scan failed: {}",
                                e
                            ))
                        })?
                    };
                    let has_more = rows.len() > limit as usize;
                    let items = rows.into_iter().take(limit as usize).collect();
                    Ok((items, has_more))
                }
            }
        }
        StorageBackend::InMemory(map) => {
            // DashMap iteration order is arbitrary: collect, then sort to
            // get stable, key-ordered pagination.
            let mut rows: Vec<(String, String)> = map
                .iter()
                .filter(|entry| entry.key().starts_with(prefix))
                .map(|entry| (entry.key().clone(), entry.value().clone()))
                .collect();
            rows.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
            if let Some(after) = start_after {
                rows.retain(|(key, _)| key.as_str() > after);
            }
            let has_more = rows.len() > limit as usize;
            let items = rows.into_iter().take(limit as usize).collect();
            Ok((items, has_more))
        }
    }
}
/// Internal: check if any key exists with a prefix
async fn has_prefix(&self, prefix: &str) -> Result<bool> {
match &self.backend {
@ -708,10 +900,64 @@ impl MetadataStore {
"/lightningstor/multipart/uploads/"
}
/// Storage key indexing a multipart upload by bucket, object key, and upload id.
fn multipart_bucket_key(bucket_id: &str, object_key: &str, upload_id: &str) -> String {
    [
        "/lightningstor/multipart/by-bucket/",
        bucket_id,
        "/",
        object_key,
        "/",
        upload_id,
    ]
    .concat()
}
/// Scan prefix covering every multipart upload of `bucket_id` whose object key
/// starts with `prefix`.
fn multipart_bucket_prefix(bucket_id: &BucketId, prefix: &str) -> String {
    let mut scan_key = format!("/lightningstor/multipart/by-bucket/{}/", bucket_id);
    scan_key.push_str(prefix);
    scan_key
}
/// Storage key mapping a completed object id back to its multipart upload record.
fn multipart_object_key(object_id: &ObjectId) -> String {
    let mut key = String::from("/lightningstor/multipart/objects/");
    key.push_str(&object_id.to_string());
    key
}
/// Storage key for one persisted replicated-repair task.
fn replicated_repair_task_key(task_id: &str) -> String {
    ["/lightningstor/repair/replicated/", task_id].concat()
}

/// Common prefix under which all replicated-repair tasks are stored.
fn replicated_repair_task_prefix() -> &'static str {
    "/lightningstor/repair/replicated/"
}
/// Persist (insert or overwrite) a replicated-repair task keyed by its id.
pub async fn save_replicated_repair_task(&self, task: &ReplicatedRepairTask) -> Result<()> {
    let serialized = serde_json::to_string(task).map_err(|e| {
        lightningstor_types::Error::StorageError(format!(
            "Failed to serialize replicated repair task: {}",
            e
        ))
    })?;
    let storage_key = Self::replicated_repair_task_key(&task.id);
    self.put(&storage_key, &serialized).await
}
/// Load up to `limit` persisted replicated-repair tasks.
///
/// Fails on the first entry that does not deserialize, mirroring a strict
/// read of the queue.
pub async fn list_replicated_repair_tasks(
    &self,
    limit: u32,
) -> Result<Vec<ReplicatedRepairTask>> {
    let (entries, _has_more) = self
        .get_prefix_page(Self::replicated_repair_task_prefix(), None, limit)
        .await?;
    entries
        .into_iter()
        .map(|(_, raw)| {
            serde_json::from_str::<ReplicatedRepairTask>(&raw).map_err(|e| {
                lightningstor_types::Error::StorageError(format!(
                    "Failed to deserialize replicated repair task: {}",
                    e
                ))
            })
        })
        .collect()
}
/// Remove a replicated-repair task from the persisted queue by id.
pub async fn delete_replicated_repair_task(&self, task_id: &str) -> Result<()> {
    let storage_key = Self::replicated_repair_task_key(task_id);
    self.delete_key(&storage_key).await
}
/// Save bucket metadata
pub async fn save_bucket(&self, bucket: &Bucket) -> Result<()> {
let key = Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
@ -900,6 +1146,13 @@ impl MetadataStore {
prefix: &str,
max_keys: u32,
) -> Result<Vec<Object>> {
if max_keys > 0 {
return self
.list_objects_page(bucket_id, prefix, None, max_keys)
.await
.map(|(objects, _)| objects);
}
let prefix_key = Self::object_prefix(bucket_id, prefix);
let items = self.get_prefix(&prefix_key).await?;
@ -921,6 +1174,34 @@ impl MetadataStore {
Ok(objects)
}
/// List up to `max_keys` objects of `bucket_id` under `prefix`, resuming after
/// `start_after_key` (a caller-facing object key, not a raw storage key).
/// Returns the page plus a flag indicating whether more results exist.
pub async fn list_objects_page(
    &self,
    bucket_id: &BucketId,
    prefix: &str,
    start_after_key: Option<&str>,
    max_keys: u32,
) -> Result<(Vec<Object>, bool)> {
    // A zero-sized page is answered without touching the backend.
    if max_keys == 0 {
        return Ok((Vec::new(), false));
    }
    let prefix_key = Self::object_prefix(bucket_id, prefix);
    // Translate the object key into its storage-key form so the backend's
    // ordered scan can resume after it. Uses the versionless form —
    // NOTE(review): confirm this is the intended resume point once versioned
    // object keys are in play.
    let start_after_storage_key =
        start_after_key.map(|key| Self::object_key(bucket_id, key, None));
    let (items, has_more) = self
        .get_prefix_page(&prefix_key, start_after_storage_key.as_deref(), max_keys)
        .await?;
    let mut objects = Vec::new();
    for (_, value) in items {
        // Records that fail to deserialize are silently skipped: a corrupt
        // entry shrinks the page instead of failing the whole listing.
        if let Ok(object) = serde_json::from_str::<Object>(&value) {
            objects.push(object);
        }
    }
    Ok((objects, has_more))
}
pub async fn save_multipart_upload(&self, upload: &MultipartUpload) -> Result<()> {
let key = Self::multipart_upload_key(upload.upload_id.as_str());
let value = serde_json::to_string(upload).map_err(|e| {
@ -929,7 +1210,16 @@ impl MetadataStore {
e
))
})?;
self.put(&key, &value).await
self.put(&key, &value).await?;
self.put(
&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
),
&value,
)
.await
}
pub async fn load_multipart_upload(&self, upload_id: &str) -> Result<Option<MultipartUpload>> {
@ -948,6 +1238,14 @@ impl MetadataStore {
}
pub async fn delete_multipart_upload(&self, upload_id: &str) -> Result<()> {
if let Some(upload) = self.load_multipart_upload(upload_id).await? {
self.delete_key(&Self::multipart_bucket_key(
&upload.bucket_id,
upload.key.as_str(),
upload.upload_id.as_str(),
))
.await?;
}
self.delete_key(&Self::multipart_upload_key(upload_id)).await
}
@ -957,9 +1255,24 @@ impl MetadataStore {
prefix: &str,
max_uploads: u32,
) -> Result<Vec<MultipartUpload>> {
let items = self.get_prefix(Self::multipart_upload_prefix()).await?;
let index_prefix = Self::multipart_bucket_prefix(bucket_id, prefix);
let items = if max_uploads > 0 {
self.get_prefix_page(&index_prefix, None, max_uploads)
.await?
.0
} else {
self.get_prefix(&index_prefix).await?
};
let mut uploads = Vec::new();
for (_, value) in items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
uploads.push(upload);
}
}
if uploads.is_empty() {
let fallback_items = self.get_prefix(Self::multipart_upload_prefix()).await?;
for (_, value) in fallback_items {
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
if upload.bucket_id == bucket_id.to_string()
&& upload.key.as_str().starts_with(prefix)
@ -968,6 +1281,7 @@ impl MetadataStore {
}
}
}
}
uploads.sort_by(|a, b| {
a.key
@ -1033,6 +1347,7 @@ fn normalize_transport_addr(endpoint: &str) -> String {
#[cfg(test)]
mod tests {
use super::*;
use lightningstor_distributed::ReplicatedRepairTask;
use lightningstor_types::{BucketName, ETag, ObjectKey};
#[tokio::test]
@ -1119,4 +1434,123 @@ mod tests {
.is_none()
);
}
/// With three objects and a page size of two, the first page must return the
/// first two keys with `has_more == true`, and resuming after "b.txt" must
/// return only the tail with `has_more == false`.
#[tokio::test]
async fn list_objects_page_honors_start_after_and_has_more() {
    let store = MetadataStore::new_in_memory();
    let bucket = Bucket::new(
        BucketName::new("paged-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&bucket).await.unwrap();
    for key in ["a.txt", "b.txt", "c.txt"] {
        let mut object = Object::new(
            bucket.id.to_string(),
            ObjectKey::new(key).unwrap(),
            ETag::from_md5(&[7u8; 16]),
            128,
            Some("text/plain".to_string()),
        );
        object.version = lightningstor_types::ObjectVersion::null();
        store.save_object(&object).await.unwrap();
    }
    let (first_page, first_has_more) = store
        .list_objects_page(&bucket.id, "", None, 2)
        .await
        .unwrap();
    assert_eq!(
        first_page
            .iter()
            .map(|object| object.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["a.txt".to_string(), "b.txt".to_string()]
    );
    assert!(first_has_more);
    let (second_page, second_has_more) = store
        .list_objects_page(&bucket.id, "", Some("b.txt"), 2)
        .await
        .unwrap();
    assert_eq!(
        second_page
            .iter()
            .map(|object| object.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["c.txt".to_string()]
    );
    assert!(!second_has_more);
}
/// Listing must be scoped by the per-bucket index: only uploads of the
/// requested bucket whose key matches the prefix are returned, and an upload
/// in a different bucket with a matching key prefix is excluded.
#[tokio::test]
async fn list_multipart_uploads_uses_bucket_prefix_index() {
    let store = MetadataStore::new_in_memory();
    let bucket = Bucket::new(
        BucketName::new("multipart-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&bucket).await.unwrap();
    let upload_a = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/one.bin").unwrap());
    let upload_b = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/two.bin").unwrap());
    let other_bucket = Bucket::new(
        BucketName::new("other-bucket").unwrap(),
        "org-a",
        "project-a",
        "default",
    );
    store.save_bucket(&other_bucket).await.unwrap();
    let upload_other =
        MultipartUpload::new(other_bucket.id.to_string(), ObjectKey::new("a/three.bin").unwrap());
    store.save_multipart_upload(&upload_a).await.unwrap();
    store.save_multipart_upload(&upload_b).await.unwrap();
    store.save_multipart_upload(&upload_other).await.unwrap();
    let uploads = store
        .list_multipart_uploads(&bucket.id, "a/", 10)
        .await
        .unwrap();
    assert_eq!(uploads.len(), 2);
    assert_eq!(
        uploads
            .iter()
            .map(|upload| upload.key.as_str().to_string())
            .collect::<Vec<_>>(),
        vec!["a/one.bin".to_string(), "a/two.bin".to_string()]
    );
}
/// Save → list → retry-update → delete round trip for replicated repair tasks.
#[tokio::test]
async fn replicated_repair_tasks_round_trip() {
    let store = MetadataStore::new_in_memory();
    let mut task = ReplicatedRepairTask::new("obj_abc", 0, "quorum write");
    store.save_replicated_repair_task(&task).await.unwrap();
    let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
    assert_eq!(tasks.len(), 1);
    assert_eq!(tasks[0].key, "obj_abc");
    // Re-saving after schedule_retry must overwrite the persisted record
    // under the same task id, not create a second one.
    task.schedule_retry("transient failure", 5_000);
    store.save_replicated_repair_task(&task).await.unwrap();
    let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
    assert_eq!(tasks[0].attempt_count, 1);
    assert_eq!(tasks[0].last_error.as_deref(), Some("transient failure"));
    store
        .delete_replicated_repair_task(&task.id)
        .await
        .unwrap();
    assert!(store
        .list_replicated_repair_tasks(10)
        .await
        .unwrap()
        .is_empty());
}
}

View file

@ -155,6 +155,10 @@ impl ObjectServiceImpl {
.await
.map_err(|e| Status::internal(format!("Failed to delete multipart part: {}", e)))?;
}
self.storage
.delete_upload_parts(upload.upload_id.as_str())
.await
.map_err(|e| Status::internal(format!("Failed to clean multipart upload: {}", e)))?;
Ok(())
}
@ -465,7 +469,6 @@ impl ObjectService for ObjectServiceImpl {
let (start, end) =
Self::resolve_range(object.size as usize, req.range_start, req.range_end);
if object.etag.is_multipart() {
if let Some(upload) = self
.metadata
.load_object_multipart_upload(&object.id)
@ -476,7 +479,6 @@ impl ObjectService for ObjectServiceImpl {
self.multipart_object_stream(&object, upload, start, end),
));
}
}
let data = self
.storage
@ -524,7 +526,6 @@ impl ObjectService for ObjectServiceImpl {
.map_err(Self::to_status)?
.ok_or_else(|| Status::not_found(format!("Object {} not found", req.key)))?;
if object.etag.is_multipart() {
if let Some(upload) = self
.metadata
.load_object_multipart_upload(&object.id)
@ -540,12 +541,6 @@ impl ObjectService for ObjectServiceImpl {
.delete_multipart_upload(upload.upload_id.as_str())
.await
.map_err(Self::to_status)?;
} else {
self.storage
.delete_object(&object.id)
.await
.map_err(|e| Status::internal(format!("Failed to delete object: {}", e)))?;
}
} else {
self.storage
.delete_object(&object.id)

View file

@ -0,0 +1,182 @@
use crate::metadata::MetadataStore;
use async_trait::async_trait;
use lightningstor_distributed::{RepairQueue, ReplicatedBackend, ReplicatedRepairTask};
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use tokio::task::JoinHandle;
use tokio::time::sleep;
use tracing::{debug, warn};
/// Maximum number of repair tasks fetched per worker pass.
const REPAIR_SCAN_LIMIT: u32 = 256;
/// Base retry backoff; doubles per attempt (see `repair_backoff_millis`).
const REPAIR_BACKOFF_BASE_MILLIS: u64 = 1_000;
/// Upper bound on retry backoff.
const REPAIR_BACKOFF_MAX_MILLIS: u64 = 60_000;
/// After this many failed attempts, a task whose chunk no longer exists on
/// any replica is dropped as an orphan.
const ORPHAN_REPAIR_DROP_ATTEMPTS: u32 = 8;
/// Repair queue backed by the metadata store, so enqueued tasks are persisted
/// rather than held only in memory.
pub struct MetadataRepairQueue {
    metadata: Arc<MetadataStore>,
}

impl MetadataRepairQueue {
    /// Wrap a metadata store as the persistence layer for repair tasks.
    pub fn new(metadata: Arc<MetadataStore>) -> Self {
        Self { metadata }
    }
}

#[async_trait]
impl RepairQueue for MetadataRepairQueue {
    /// Persist a repair task. A save failure is logged and swallowed — the
    /// trait method returns nothing, so enqueueing is best-effort by design.
    async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
        if let Err(error) = self.metadata.save_replicated_repair_task(&task).await {
            warn!(
                task_id = task.id,
                chunk_key = task.key,
                error = %error,
                "failed to persist replicated repair task"
            );
        }
    }
}
/// Spawn a background task that drains the persisted repair queue once per
/// `interval`, forever. The returned handle can be used to abort the loop.
pub fn spawn_replicated_repair_worker(
    metadata: Arc<MetadataStore>,
    backend: Arc<ReplicatedBackend>,
    interval: Duration,
) -> JoinHandle<()> {
    tokio::spawn(async move {
        loop {
            if let Err(error) = process_replicated_repair_queue(&metadata, &backend).await {
                // Errors that look like startup-phase unreadiness (metadata
                // region not yet available) are expected and logged at debug;
                // everything else is a warning.
                if replicated_repair_queue_transiently_unready(&error) {
                    debug!(error = %error, "replicated repair queue pass deferred until metadata becomes ready");
                } else {
                    warn!(error = %error, "replicated repair queue pass failed");
                }
            }
            sleep(interval).await;
        }
    })
}
/// One pass over the persisted repair queue: attempt every due task, delete
/// tasks that repair successfully, and reschedule failures with exponential
/// backoff. Metadata-store errors abort the pass and bubble up to the caller.
async fn process_replicated_repair_queue(
    metadata: &MetadataStore,
    backend: &ReplicatedBackend,
) -> Result<(), lightningstor_types::Error> {
    let now = unix_time_millis();
    let tasks = metadata
        .list_replicated_repair_tasks(REPAIR_SCAN_LIMIT)
        .await?;
    for mut task in tasks {
        // Skip tasks whose backoff window has not elapsed yet.
        if !task.is_due(now) {
            continue;
        }
        match backend.repair_chunk(&task).await {
            Ok(()) => {
                metadata.delete_replicated_repair_task(&task.id).await?;
                debug!(
                    task_id = task.id,
                    chunk_key = task.key,
                    "repaired replicated chunk"
                );
            }
            Err(error) => {
                // After enough failed attempts, probe whether the chunk still
                // exists on any replica; a task with no remaining source can
                // never succeed and is dropped as an orphan.
                if task.attempt_count >= ORPHAN_REPAIR_DROP_ATTEMPTS {
                    match backend.chunk_exists_anywhere(&task.key).await {
                        Ok(false) => {
                            warn!(
                                task_id = task.id,
                                chunk_key = task.key,
                                attempts = task.attempt_count,
                                "dropping orphan replicated repair task with no remaining source replica"
                            );
                            metadata.delete_replicated_repair_task(&task.id).await?;
                            continue;
                        }
                        Ok(true) => {}
                        // A probe failure does not drop the task — it falls
                        // through to the normal retry path below.
                        Err(probe_error) => {
                            warn!(
                                task_id = task.id,
                                chunk_key = task.key,
                                error = %probe_error,
                                "failed to probe global chunk existence while evaluating orphan repair task"
                            );
                        }
                    }
                }
                let backoff = repair_backoff_millis(task.attempt_count);
                task.schedule_retry(error.to_string(), backoff);
                metadata.save_replicated_repair_task(&task).await?;
                warn!(
                    task_id = task.id,
                    chunk_key = task.key,
                    attempts = task.attempt_count,
                    backoff_millis = backoff,
                    error = %error,
                    "replicated chunk repair failed"
                );
            }
        }
    }
    Ok(())
}
/// Current wall-clock time as milliseconds since the Unix epoch.
/// A clock set before the epoch yields 0 instead of panicking.
fn unix_time_millis() -> u64 {
    match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_millis() as u64,
        Err(_) => 0,
    }
}
/// Exponential backoff for repair retries: base * 2^attempt, with the
/// exponent clamped at 6 and the result capped at the configured maximum.
fn repair_backoff_millis(attempt_count: u32) -> u64 {
    let doubling_steps = u32::min(attempt_count, 6);
    let scaled = REPAIR_BACKOFF_BASE_MILLIS.saturating_mul(1u64 << doubling_steps);
    scaled.min(REPAIR_BACKOFF_MAX_MILLIS)
}
fn replicated_repair_queue_transiently_unready(error: &lightningstor_types::Error) -> bool {
let rendered = error.to_string().to_ascii_lowercase();
let transient = rendered.contains("region not found")
|| rendered.contains("status: notfound")
|| rendered.contains("metadata backend not ready")
|| rendered.contains("notleader");
if transient {
return true;
}
match error {
lightningstor_types::Error::StorageError(message)
| lightningstor_types::Error::Internal(message) => {
let message = message.to_ascii_lowercase();
message.contains("region not found")
|| message.contains("status: notfound")
|| message.contains("metadata backend not ready")
|| message.contains("notleader")
}
_ => false,
}
}
#[cfg(test)]
mod tests {
    use super::replicated_repair_queue_transiently_unready;

    /// A FlareDB "region not found" during startup must be classified transient.
    #[test]
    fn treats_region_not_found_as_transient_startup_state() {
        let error = lightningstor_types::Error::StorageError(
            "FlareDB scan failed: status: NotFound, message: \"region not found\"".to_string(),
        );
        assert!(replicated_repair_queue_transiently_unready(&error));
    }

    /// The full tonic status rendering (with details/metadata noise) must
    /// still match the transient classifier.
    #[test]
    fn treats_wrapped_storage_error_rendering_as_transient_startup_state() {
        let error = lightningstor_types::Error::StorageError(
            "FlareDB scan failed: status: NotFound, message: \"region not found\", details: [], metadata: MetadataMap { headers: {} }".to_string(),
        );
        assert!(replicated_repair_queue_transiently_unready(&error));
    }

    /// Genuine repair failures must not be downgraded to the transient path.
    #[test]
    fn keeps_real_repair_failures_as_warnings() {
        let error =
            lightningstor_types::Error::StorageError("replication checksum mismatch".to_string());
        assert!(!replicated_repair_queue_transiently_unready(&error));
    }
}

View file

@ -10,13 +10,17 @@ use axum::{
middleware::Next,
response::{IntoResponse, Response},
};
use crate::tenant::TenantContext;
use hmac::{Hmac, Mac};
use iam_api::proto::{iam_credential_client::IamCredentialClient, GetSecretKeyRequest};
use sha2::{Digest, Sha256};
use std::collections::HashMap;
use std::sync::Arc;
use tokio::sync::RwLock;
use tokio::sync::{Mutex, RwLock};
use tonic::transport::Channel;
use tracing::{debug, warn};
use url::form_urlencoded;
use std::time::{Duration as StdDuration, Instant};
type HmacSha256 = Hmac<Sha256>;
const DEFAULT_MAX_AUTH_BODY_BYTES: usize = 1024 * 1024 * 1024;
@ -27,6 +31,13 @@ pub(crate) struct VerifiedBodyBytes(pub Bytes);
#[derive(Clone, Debug)]
pub(crate) struct VerifiedPayloadHash(pub String);
/// Tenant scope (org/project) resolved during SigV4 auth; the middleware
/// inserts it into request extensions for downstream handlers.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct VerifiedTenantContext(pub TenantContext);
/// Buffer the request body for signing only when the client did not declare
/// a payload hash via `x-amz-content-sha256`.
fn should_buffer_auth_body(payload_hash_header: Option<&str>) -> bool {
    matches!(payload_hash_header, None)
}
/// SigV4 authentication state
#[derive(Clone)]
pub struct AuthState {
@ -40,21 +51,73 @@ pub struct AuthState {
aws_service: String,
}
/// IAM client: resolves S3 access keys either from environment variables or
/// via the IAM gRPC credential service, with a short-lived in-process cache.
pub struct IamClient {
    // Active resolution backend (env map or gRPC endpoint).
    mode: IamClientMode,
    // Per-access-key cache of resolved credentials.
    credential_cache: Arc<RwLock<HashMap<String, CachedCredential>>>,
    // How long a cached credential entry stays valid.
    cache_ttl: StdDuration,
}
/// Backend used by `IamClient` to resolve access keys.
enum IamClientMode {
    /// Static credentials loaded from environment variables (dev/testing).
    Env {
        credentials: std::collections::HashMap<String, String>,
    },
    /// Live lookups against the IAM gRPC credential service; the channel is
    /// dialed lazily and cached.
    Grpc {
        endpoint: String,
        channel: Arc<Mutex<Option<Channel>>>,
    },
}

/// Outcome of a successful credential lookup: signing secret plus the tenant
/// scope the key is bound to.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) struct ResolvedCredential {
    pub secret_key: String,
    pub principal_id: String,
    pub org_id: Option<String>,
    pub project_id: Option<String>,
}

/// Cache entry pairing a resolved credential with its insertion time, used
/// for TTL-based expiry.
struct CachedCredential {
    credential: ResolvedCredential,
    cached_at: Instant,
}
impl IamClient {
/// Create a new IamClient loading credentials from environment variables for MVP.
/// Create a new IAM client. If a non-empty endpoint is supplied, lookups go
/// through the IAM gRPC API; otherwise credentials come from the environment.
/// The credential cache TTL defaults to 30s and can be overridden via
/// `LIGHTNINGSTOR_S3_IAM_CACHE_TTL_SECS`.
pub fn new(iam_endpoint: Option<String>) -> Self {
    let default_ttl = StdDuration::from_secs(30);
    let cache_ttl = match std::env::var("LIGHTNINGSTOR_S3_IAM_CACHE_TTL_SECS") {
        Ok(raw) => raw
            .parse::<u64>()
            .map(StdDuration::from_secs)
            .unwrap_or(default_ttl),
        Err(_) => default_ttl,
    };
    let grpc_endpoint = iam_endpoint
        .map(|value| normalize_iam_endpoint(&value))
        .filter(|value| !value.is_empty());
    let mode = match grpc_endpoint {
        Some(endpoint) => IamClientMode::Grpc {
            endpoint,
            channel: Arc::new(Mutex::new(None)),
        },
        None => IamClientMode::Env {
            credentials: Self::load_env_credentials(),
        },
    };
    Self {
        mode,
        credential_cache: Arc::new(RwLock::new(HashMap::new())),
        cache_ttl,
    }
}
/// Load credentials from environment variables for fallback/testing.
///
/// Supports two formats:
/// 1. Single credential: S3_ACCESS_KEY_ID + S3_SECRET_KEY
/// 2. Multiple credentials: S3_CREDENTIALS="key1:secret1,key2:secret2,..."
///
/// TODO: Replace with proper IAM gRPC integration (see T060)
pub fn new() -> Self {
fn load_env_credentials() -> std::collections::HashMap<String, String> {
let mut credentials = std::collections::HashMap::new();
// Option 1: Multiple credentials via S3_CREDENTIALS
@ -87,28 +150,160 @@ impl IamClient {
warn!("Set S3_CREDENTIALS or S3_ACCESS_KEY_ID/S3_SECRET_KEY to enable access.");
}
Self { credentials }
credentials
}
/// Validate access key and return secret key
pub async fn get_secret_key(&self, access_key_id: &str) -> Result<String, String> {
self.credentials
// Test-only accessor: the env-backed credential map, if that mode is active.
#[cfg(test)]
fn env_credentials(&self) -> Option<&std::collections::HashMap<String, String>> {
    if let IamClientMode::Env { credentials } = &self.mode {
        Some(credentials)
    } else {
        None
    }
}
/// Tenant scope for env-backed credentials: first matching env var wins,
/// falling back to "default" so the result is always `Some`.
fn env_default_tenant() -> (Option<String>, Option<String>) {
    fn scoped(primary: &str, fallback: &str) -> Option<String> {
        std::env::var(primary)
            .or_else(|_| std::env::var(fallback))
            .ok()
            .or_else(|| Some("default".to_string()))
    }
    (
        scoped("S3_TENANT_ORG_ID", "S3_ORG_ID"),
        scoped("S3_TENANT_PROJECT_ID", "S3_PROJECT_ID"),
    )
}
/// Validate an access key and resolve its credential context (secret key,
/// principal, tenant scope). Env mode answers from the static map with a
/// default tenant; gRPC mode checks the TTL cache first and falls back to a
/// remote lookup, caching the result. Errors are human-readable strings.
pub async fn get_credential(&self, access_key_id: &str) -> Result<ResolvedCredential, String> {
    match &self.mode {
        IamClientMode::Env { credentials } => {
            let secret_key = credentials
                .get(access_key_id)
                .cloned()
                .ok_or_else(|| "Access key ID not found".to_string())?;
            let (org_id, project_id) = Self::env_default_tenant();
            Ok(ResolvedCredential {
                secret_key,
                principal_id: access_key_id.to_string(),
                org_id,
                project_id,
            })
        }
        IamClientMode::Grpc { endpoint, channel } => {
            // Serve from the in-process cache when the entry is still fresh.
            if let Some(credential) = self.cached_credential(access_key_id).await {
                return Ok(credential);
            }
            let response = self
                .grpc_get_secret_key(endpoint, channel, access_key_id)
                .await?;
            let response = response.into_inner();
            let credential = ResolvedCredential {
                secret_key: response.secret_key,
                principal_id: response.principal_id,
                org_id: response.org_id,
                project_id: response.project_id,
            };
            self.cache_credential(access_key_id, &credential).await;
            Ok(credential)
        }
    }
}
/// Return the cached credential for `access_key_id` if present and not
/// older than the configured TTL.
async fn cached_credential(&self, access_key_id: &str) -> Option<ResolvedCredential> {
    let cache = self.credential_cache.read().await;
    let entry = cache.get(access_key_id)?;
    if entry.cached_at.elapsed() > self.cache_ttl {
        return None;
    }
    Some(entry.credential.clone())
}
/// Insert (or refresh) the cache entry for `access_key_id`, stamped now.
async fn cache_credential(&self, access_key_id: &str, credential: &ResolvedCredential) {
    let entry = CachedCredential {
        credential: credential.clone(),
        cached_at: Instant::now(),
    };
    self.credential_cache
        .write()
        .await
        .insert(access_key_id.to_string(), entry);
}
/// Return the cached gRPC channel, dialing the IAM endpoint on first use.
// NOTE(review): the mutex is held across `connect().await`, so concurrent
// first-time lookups serialize behind a single dial attempt — confirm this
// is intended (it avoids a thundering herd, at the cost of head-of-line
// blocking while the endpoint is unreachable).
async fn grpc_channel(
    endpoint: &str,
    channel: &Arc<Mutex<Option<Channel>>>,
) -> Result<Channel, String> {
    let mut cached = channel.lock().await;
    if let Some(existing) = cached.as_ref() {
        return Ok(existing.clone());
    }
    let created = Channel::from_shared(endpoint.to_string())
        .map_err(|e| format!("failed to parse IAM credential endpoint: {}", e))?
        .connect()
        .await
        .map_err(|e| format!("failed to connect to IAM credential service: {}", e))?;
    *cached = Some(created.clone());
    Ok(created)
}
/// Drop the cached channel so the next lookup reconnects from scratch.
async fn invalidate_grpc_channel(channel: &Arc<Mutex<Option<Channel>>>) {
    channel.lock().await.take();
}
/// Look up a secret key over gRPC, retrying once on transport-level failures.
/// On the first retryable status the cached channel is invalidated and the
/// call is retried with a fresh connection; any other failure (or a second
/// one) is returned as the status message.
async fn grpc_get_secret_key(
    &self,
    endpoint: &str,
    channel: &Arc<Mutex<Option<Channel>>>,
    access_key_id: &str,
) -> Result<tonic::Response<iam_api::proto::GetSecretKeyResponse>, String> {
    for attempt in 0..2 {
        let grpc_channel = Self::grpc_channel(endpoint, channel).await?;
        let mut client = IamCredentialClient::new(grpc_channel);
        match client
            .get_secret_key(GetSecretKeyRequest {
                access_key_id: access_key_id.to_string(),
            })
            .await
        {
            Ok(response) => return Ok(response),
            // Transport-flavored failures on the first attempt: drop the
            // cached channel and loop for one retry.
            Err(status)
                if attempt == 0
                    && matches!(
                        status.code(),
                        tonic::Code::Unavailable
                            | tonic::Code::Cancelled
                            | tonic::Code::Unknown
                            | tonic::Code::DeadlineExceeded
                            | tonic::Code::Internal
                    ) =>
            {
                Self::invalidate_grpc_channel(channel).await;
            }
            Err(status) => return Err(status.message().to_string()),
        }
    }
    // Defensive terminal error: only reachable if the second attempt also
    // hit the retryable arm, which the `attempt == 0` guard prevents.
    Err("IAM credential lookup exhausted retries".to_string())
}
}
/// Ensure the IAM endpoint carries a scheme; bare host:port defaults to http.
fn normalize_iam_endpoint(endpoint: &str) -> String {
    let has_scheme = ["http://", "https://"]
        .iter()
        .any(|scheme| endpoint.starts_with(scheme));
    if has_scheme {
        endpoint.to_string()
    } else {
        format!("http://{}", endpoint)
    }
}
impl AuthState {
/// Create new auth state with IAM integration
pub fn new(iam_endpoint: Option<String>) -> Self {
let iam_client = if let Some(_endpoint) = iam_endpoint {
// TODO: Connect to real IAM gRPC service
// For now, if an endpoint is provided, we still use our env var based client
Some(Arc::new(RwLock::new(IamClient::new())))
} else {
Some(Arc::new(RwLock::new(IamClient::new())))
};
let iam_client = Some(Arc::new(RwLock::new(IamClient::new(iam_endpoint))));
Self {
iam_client,
@ -198,9 +393,9 @@ pub async fn sigv4_auth_middleware(
};
// Get secret key from IAM (or use dummy for MVP)
let secret_key = if let Some(ref iam) = auth_state.iam_client {
match iam.read().await.get_secret_key(&access_key_id).await {
Ok(key) => key,
let credential = if let Some(ref iam) = auth_state.iam_client {
match iam.read().await.get_credential(&access_key_id).await {
Ok(credential) => credential,
Err(e) => {
warn!("IAM credential validation failed: {}", e);
return error_response(
@ -211,18 +406,22 @@ pub async fn sigv4_auth_middleware(
}
}
} else {
// This case should ideally not be hit with the current IamClient::new() logic
// but kept for safety.
debug!("No IAM integration, using dummy secret key if IamClient wasn't initialized.");
"dummy_secret_key_for_mvp".to_string()
ResolvedCredential {
secret_key: "dummy_secret_key_for_mvp".to_string(),
principal_id: access_key_id.clone(),
org_id: Some("default".to_string()),
project_id: Some("default".to_string()),
}
};
let secret_key = credential.secret_key.as_str();
let payload_hash_header = headers
.get("x-amz-content-sha256")
.and_then(|value| value.to_str().ok())
.filter(|value| !value.is_empty())
.map(str::to_string);
let should_buffer_body = !matches!(payload_hash_header.as_deref(), Some(hash) if hash != "UNSIGNED-PAYLOAD");
let should_buffer_body = should_buffer_auth_body(payload_hash_header.as_deref());
let body_bytes = if should_buffer_body {
let max_body_bytes = std::env::var("S3_MAX_AUTH_BODY_BYTES")
@ -282,7 +481,7 @@ pub async fn sigv4_auth_middleware(
);
let expected_signature = match compute_sigv4_signature(
&secret_key,
secret_key,
&method,
&uri,
&headers,
@ -310,6 +509,21 @@ pub async fn sigv4_auth_middleware(
);
}
match (credential.org_id, credential.project_id) {
(Some(org_id), Some(project_id)) => {
request
.extensions_mut()
.insert(VerifiedTenantContext(TenantContext { org_id, project_id }));
}
_ => {
return error_response(
StatusCode::FORBIDDEN,
"AccessDenied",
"S3 credential is missing tenant scope",
);
}
}
// Auth successful
debug!("SigV4 auth successful for access_key={}", access_key_id);
next.run(request).await
@ -558,6 +772,97 @@ fn error_response(status: StatusCode, code: &str, message: &str) -> Response {
mod tests {
use super::*;
use axum::http::HeaderValue;
use iam_api::proto::{
iam_credential_server::{IamCredential, IamCredentialServer},
CreateS3CredentialRequest, CreateS3CredentialResponse, Credential, GetSecretKeyResponse,
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
RevokeCredentialResponse,
};
use std::collections::HashMap;
use std::net::SocketAddr;
use std::sync::{atomic::{AtomicUsize, Ordering}, Mutex};
use tokio::net::TcpListener;
use tokio::time::{sleep, Duration};
use tonic::{Request as TonicRequest, Response as TonicResponse, Status};
use tonic::transport::Server;
// Serializes tests that mutate process-wide environment variables.
static ENV_LOCK: Mutex<()> = Mutex::new(());

/// In-process stand-in for the IAM credential service: serves fixture
/// secrets and counts `get_secret_key` calls so tests can assert cache hits.
#[derive(Clone, Default)]
struct MockIamCredentialService {
    // access_key_id -> secret_key fixtures.
    secrets: Arc<HashMap<String, String>>,
    // Incremented on every get_secret_key RPC.
    get_secret_calls: Arc<AtomicUsize>,
}

#[tonic::async_trait]
impl IamCredential for MockIamCredentialService {
    async fn create_s3_credential(
        &self,
        _request: TonicRequest<CreateS3CredentialRequest>,
    ) -> Result<TonicResponse<CreateS3CredentialResponse>, Status> {
        Err(Status::unimplemented("not needed in test"))
    }
    async fn get_secret_key(
        &self,
        request: TonicRequest<GetSecretKeyRequest>,
    ) -> Result<TonicResponse<GetSecretKeyResponse>, Status> {
        let access_key_id = request.into_inner().access_key_id;
        self.get_secret_calls.fetch_add(1, Ordering::SeqCst);
        let Some(secret_key) = self.secrets.get(&access_key_id) else {
            return Err(Status::not_found("access key not found"));
        };
        // Fixed tenant scope so tests can assert it end-to-end.
        Ok(TonicResponse::new(GetSecretKeyResponse {
            secret_key: secret_key.clone(),
            principal_id: "test-principal".to_string(),
            expires_at: None,
            org_id: Some("test-org".to_string()),
            project_id: Some("test-project".to_string()),
            principal_kind: iam_api::proto::PrincipalKind::ServiceAccount as i32,
        }))
    }
    async fn list_credentials(
        &self,
        _request: TonicRequest<ListCredentialsRequest>,
    ) -> Result<TonicResponse<ListCredentialsResponse>, Status> {
        Ok(TonicResponse::new(ListCredentialsResponse {
            credentials: Vec::<Credential>::new(),
        }))
    }
    async fn revoke_credential(
        &self,
        _request: TonicRequest<RevokeCredentialRequest>,
    ) -> Result<TonicResponse<RevokeCredentialResponse>, Status> {
        Ok(TonicResponse::new(RevokeCredentialResponse { success: true }))
    }
}
/// Start the mock IAM gRPC server on an ephemeral port and wait (up to
/// ~500ms) until it accepts connections. Returns the bound address and the
/// shared get_secret_key call counter.
async fn start_mock_iam(secrets: HashMap<String, String>) -> (SocketAddr, Arc<AtomicUsize>) {
    let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
    let addr = listener.local_addr().unwrap();
    let get_secret_calls = Arc::new(AtomicUsize::new(0));
    let service = MockIamCredentialService {
        secrets: Arc::new(secrets),
        get_secret_calls: get_secret_calls.clone(),
    };
    // NOTE(review): the listener is dropped and `Server::serve` re-binds the
    // same port, leaving a window where another process can grab it. Serving
    // from the existing listener would remove the race but needs an
    // incoming-stream adapter (e.g. tokio_stream) not currently depended on.
    drop(listener);
    tokio::spawn(async move {
        Server::builder()
            .add_service(IamCredentialServer::new(service))
            .serve(addr)
            .await
            .unwrap();
    });
    // Poll until the server is reachable before handing the address out.
    for _ in 0..20 {
        if tokio::net::TcpStream::connect(addr).await.is_ok() {
            return (addr, get_secret_calls);
        }
        sleep(Duration::from_millis(25)).await;
    }
    panic!("mock IAM server did not start on {}", addr);
}
#[tokio::test]
async fn test_parse_auth_header() {
@ -657,6 +962,13 @@ mod tests {
assert_eq!(hashed_payload, "signed-payload-hash");
}
#[test]
fn test_should_buffer_auth_body_only_when_hash_header_missing() {
assert!(should_buffer_auth_body(None));
assert!(!should_buffer_auth_body(Some("signed-payload-hash")));
assert!(!should_buffer_auth_body(Some("UNSIGNED-PAYLOAD")));
}
#[test]
fn test_build_string_to_sign() {
let amz_date = "20231201T000000Z";
@ -677,34 +989,77 @@ mod tests {
#[test]
fn test_iam_client_multi_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test parsing S3_CREDENTIALS format
std::env::set_var("S3_CREDENTIALS", "key1:secret1,key2:secret2,key3:secret3");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
assert_eq!(client.credentials.len(), 3);
assert_eq!(client.credentials.get("key1"), Some(&"secret1".to_string()));
assert_eq!(client.credentials.get("key2"), Some(&"secret2".to_string()));
assert_eq!(client.credentials.get("key3"), Some(&"secret3".to_string()));
assert_eq!(credentials.len(), 3);
assert_eq!(credentials.get("key1"), Some(&"secret1".to_string()));
assert_eq!(credentials.get("key2"), Some(&"secret2".to_string()));
assert_eq!(credentials.get("key3"), Some(&"secret3".to_string()));
std::env::remove_var("S3_CREDENTIALS");
}
#[test]
fn test_iam_client_single_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test legacy S3_ACCESS_KEY_ID/S3_SECRET_KEY format
std::env::remove_var("S3_CREDENTIALS");
std::env::set_var("S3_ACCESS_KEY_ID", "test_key");
std::env::set_var("S3_SECRET_KEY", "test_secret");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
assert_eq!(client.credentials.len(), 1);
assert_eq!(client.credentials.get("test_key"), Some(&"test_secret".to_string()));
assert_eq!(credentials.len(), 1);
assert_eq!(credentials.get("test_key"), Some(&"test_secret".to_string()));
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
}
/// gRPC mode resolves the secret and tenant scope served by the mock, and an
/// unknown access key surfaces the IAM service's error message.
#[tokio::test]
async fn test_iam_client_grpc_lookup() {
    let (addr, _calls) = start_mock_iam(HashMap::from([(
        "grpc_key".to_string(),
        "grpc_secret".to_string(),
    )]))
    .await;
    let client = IamClient::new(Some(addr.to_string()));
    let credential = client.get_credential("grpc_key").await.unwrap();
    assert_eq!(credential.secret_key, "grpc_secret");
    assert_eq!(credential.org_id.as_deref(), Some("test-org"));
    assert_eq!(credential.project_id.as_deref(), Some("test-project"));
    assert_eq!(
        client.get_credential("missing").await.unwrap_err(),
        "access key not found"
    );
}

/// A second lookup of the same key within the cache TTL is served from the
/// in-process cache: the mock's RPC counter stays at one.
#[tokio::test]
async fn test_iam_client_grpc_cache_reuses_secret() {
    let (addr, calls) = start_mock_iam(HashMap::from([(
        "grpc_key".to_string(),
        "grpc_secret".to_string(),
    )]))
    .await;
    let client = IamClient::new(Some(addr.to_string()));
    assert_eq!(
        client.get_credential("grpc_key").await.unwrap().secret_key,
        "grpc_secret"
    );
    assert_eq!(
        client.get_credential("grpc_key").await.unwrap().secret_key,
        "grpc_secret"
    );
    assert_eq!(calls.load(Ordering::SeqCst), 1);
}
#[test]
fn test_complete_sigv4_signature() {
// Test with AWS example credentials (from AWS docs)
@ -1039,18 +1394,20 @@ mod tests {
#[test]
fn test_security_credential_lookup_unknown_key() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that unknown access keys return the correct result
std::env::remove_var("S3_CREDENTIALS");
std::env::set_var("S3_ACCESS_KEY_ID", "known_key");
std::env::set_var("S3_SECRET_KEY", "known_secret");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
// Known key should be found in credentials map
assert_eq!(client.credentials.get("known_key"), Some(&"known_secret".to_string()));
assert_eq!(credentials.get("known_key"), Some(&"known_secret".to_string()));
// Unknown key should not be found
assert_eq!(client.credentials.get("unknown_key"), None);
assert_eq!(credentials.get("unknown_key"), None);
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
@ -1058,33 +1415,36 @@ mod tests {
#[test]
fn test_security_empty_credentials() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that IamClient keeps credentials empty when none provided
std::env::remove_var("S3_CREDENTIALS");
std::env::remove_var("S3_ACCESS_KEY_ID");
std::env::remove_var("S3_SECRET_KEY");
let client = IamClient::new();
let client = IamClient::new(None);
// No credentials configured
assert!(client.credentials.is_empty());
assert!(client.env_credentials().unwrap().is_empty());
}
#[test]
fn test_security_malformed_s3_credentials_env() {
let _guard = ENV_LOCK.lock().unwrap();
// Test that malformed S3_CREDENTIALS are handled gracefully
// Missing colon separator
std::env::set_var("S3_CREDENTIALS", "key1_secret1,key2:secret2");
let client = IamClient::new();
let client = IamClient::new(None);
let credentials = client.env_credentials().unwrap();
// Should only parse the valid pair (key2:secret2)
assert_eq!(client.credentials.len(), 1);
assert!(client.credentials.contains_key("key2"));
assert_eq!(credentials.len(), 1);
assert!(credentials.contains_key("key2"));
// Empty pairs
std::env::set_var("S3_CREDENTIALS", "key1:secret1,,key2:secret2");
let client2 = IamClient::new();
let client2 = IamClient::new(None);
// Should parse both valid pairs, skip empty
assert_eq!(client2.credentials.len(), 2);
assert_eq!(client2.env_credentials().unwrap().len(), 2);
std::env::remove_var("S3_CREDENTIALS");
}

View file

@ -7,4 +7,4 @@ mod router;
mod xml;
pub use auth::{AuthState, sigv4_auth_middleware};
pub use router::{create_router, create_router_with_state};
pub use router::{create_router, create_router_with_auth, create_router_with_state};

File diff suppressed because it is too large Load diff

View file

@ -66,6 +66,9 @@ pub struct ListBucketResult {
pub name: String,
#[serde(rename = "Prefix")]
pub prefix: String,
#[serde(rename = "Marker")]
#[serde(skip_serializing_if = "Option::is_none")]
pub marker: Option<String>,
#[serde(rename = "Delimiter")]
#[serde(skip_serializing_if = "Option::is_none")]
pub delimiter: Option<String>,
@ -73,6 +76,9 @@ pub struct ListBucketResult {
pub max_keys: u32,
#[serde(rename = "IsTruncated")]
pub is_truncated: bool,
#[serde(rename = "NextMarker")]
#[serde(skip_serializing_if = "Option::is_none")]
pub next_marker: Option<String>,
#[serde(rename = "Contents", default)]
pub contents: Vec<ObjectEntry>,
#[serde(rename = "CommonPrefixes", default)]

View file

@ -1,6 +1,6 @@
use tonic::{metadata::MetadataMap, Status};
#[derive(Debug, Clone)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TenantContext {
pub org_id: String,
pub project_id: String,

146
nix/ci/flake.lock generated
View file

@ -1,5 +1,26 @@
{
"nodes": {
"disko": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"lastModified": 1765326679,
"narHash": "sha256-fTLX9kDwLr9Y0rH/nG+h1XG5UU+jBcy0PFYn5eneRX8=",
"owner": "nix-community",
"repo": "disko",
"rev": "d64e5cdca35b5fad7c504f615357a7afe6d9c49e",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "disko",
"type": "github"
}
},
"flake-utils": {
"inputs": {
"systems": "systems"
@ -18,6 +39,43 @@
"type": "github"
}
},
"flake-utils_2": {
"inputs": {
"systems": "systems_2"
},
"locked": {
"lastModified": 1731533236,
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
"owner": "numtide",
"repo": "flake-utils",
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
"type": "github"
},
"original": {
"owner": "numtide",
"repo": "flake-utils",
"type": "github"
}
},
"nix-nos": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"path": "./nix-nos",
"type": "path"
},
"original": {
"path": "./nix-nos",
"type": "path"
},
"parent": [
"photoncloud"
]
},
"nixpkgs": {
"locked": {
"lastModified": 1765186076,
@ -34,14 +92,71 @@
"type": "github"
}
},
"nixpkgs_2": {
"locked": {
"lastModified": 1765186076,
"narHash": "sha256-hM20uyap1a0M9d344I692r+ik4gTMyj60cQWO+hAYP8=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "addf7cf5f383a3101ecfba091b98d0a1263dc9b8",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"photoncloud": {
"inputs": {
"disko": "disko",
"flake-utils": "flake-utils_2",
"nix-nos": "nix-nos",
"nixpkgs": "nixpkgs_2",
"rust-overlay": "rust-overlay",
"systems": "systems_3"
},
"locked": {
"path": "../..",
"type": "path"
},
"original": {
"path": "../..",
"type": "path"
},
"parent": []
},
"root": {
"inputs": {
"flake-utils": "flake-utils",
"nixpkgs": "nixpkgs",
"rust-overlay": "rust-overlay"
"photoncloud": "photoncloud",
"rust-overlay": "rust-overlay_2"
}
},
"rust-overlay": {
"inputs": {
"nixpkgs": [
"photoncloud",
"nixpkgs"
]
},
"locked": {
"lastModified": 1765465581,
"narHash": "sha256-fCXT0aZXmTalM3NPCTedVs9xb0egBG5BOZkcrYo5PGE=",
"owner": "oxalica",
"repo": "rust-overlay",
"rev": "99cc5667eece98bb35dcf35f7e511031a8b7a125",
"type": "github"
},
"original": {
"owner": "oxalica",
"repo": "rust-overlay",
"type": "github"
}
},
"rust-overlay_2": {
"inputs": {
"nixpkgs": [
"nixpkgs"
@ -75,6 +190,35 @@
"repo": "default",
"type": "github"
}
},
"systems_2": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"owner": "nix-systems",
"repo": "default",
"type": "github"
}
},
"systems_3": {
"locked": {
"lastModified": 1681028828,
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
"owner": "nix-systems",
"repo": "default",
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
"type": "github"
},
"original": {
"id": "systems",
"type": "indirect"
}
}
},
"root": "root",

View file

@ -5,6 +5,7 @@
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
flake-utils.url = "github:numtide/flake-utils";
photoncloud.url = "path:../..";
rust-overlay = {
url = "github:oxalica/rust-overlay";
@ -12,7 +13,7 @@
};
};
outputs = { self, nixpkgs, flake-utils, rust-overlay }:
outputs = { self, nixpkgs, flake-utils, photoncloud, rust-overlay }:
flake-utils.lib.eachDefaultSystem (system:
let
overlays = [ (import rust-overlay) ];
@ -201,7 +202,7 @@
if [[ "$no_logs" == "0" ]]; then
local out
out="$logdir/shared_${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
out="$logdir/shared_''${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
(cd "$repo_root" && bash -c "$cmd") 2>&1 | tee "$out"
else
(cd "$repo_root" && bash -c "$cmd")
@ -291,6 +292,11 @@
${gate}/bin/photoncloud-gate --tier 0 --no-logs
touch $out/ok
'';
checks.deployer-vm-smoke = photoncloud.checks.${system}.deployer-vm-smoke;
checks.deployer-vm-rollback = photoncloud.checks.${system}.deployer-vm-rollback;
checks.deployer-bootstrap-e2e = photoncloud.checks.${system}.deployer-bootstrap-e2e;
checks.host-lifecycle-e2e = photoncloud.checks.${system}.host-lifecycle-e2e;
checks.fleet-scheduler-e2e = photoncloud.checks.${system}.fleet-scheduler-e2e;
devShells.default = pkgs.mkShell {
name = "photoncloud-ci-dev";

View file

@ -0,0 +1,67 @@
{ lib, modulesPath, ... }:
{
imports = [
"${modulesPath}/virtualisation/qemu-vm.nix"
"${modulesPath}/testing/test-instrumentation.nix"
];
boot.loader.grub = {
enable = true;
device = "/dev/vda";
forceInstall = true;
};
fileSystems."/" = {
device = "/dev/disk/by-label/nixos";
fsType = "ext4";
};
networking.hostName = "worker";
networking.firewall.enable = false;
networking.useDHCP = lib.mkForce false;
networking.dhcpcd.enable = lib.mkForce false;
systemd.network = {
enable = true;
networks."10-eth0" = {
matchConfig.Name = "eth0";
networkConfig.DHCP = "yes";
linkConfig.RequiredForOnline = "routable";
};
networks."20-eth1" = {
matchConfig.Name = "eth1";
address = [ "192.168.1.2/24" ];
linkConfig.RequiredForOnline = "routable";
};
};
nix.registry = lib.mkForce { };
nix.nixPath = lib.mkForce [ ];
nix.channel.enable = false;
nix.settings = {
experimental-features = [
"nix-command"
"flakes"
];
flake-registry = "";
};
nixpkgs.flake = {
source = lib.mkForce null;
setFlakeRegistry = lib.mkForce false;
setNixPath = lib.mkForce false;
};
system.switch.enable = lib.mkForce true;
system.nixos.label = lib.mkForce "vm-smoke-target";
system.nixos.version = lib.mkForce "vm-smoke-target";
system.nixos.versionSuffix = lib.mkForce "-vm-smoke-target";
environment.etc."photon-vm-smoke-target".text = "vm-smoke-target\n";
documentation.enable = false;
documentation.nixos.enable = false;
documentation.man.enable = false;
documentation.info.enable = false;
documentation.doc.enable = false;
system.stateVersion = "24.11";
}

View file

@ -33,6 +33,12 @@ let
mkDesiredSystemType = types: types.submodule {
options = {
deploymentId = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional host deployment identifier owning this desired system";
};
nixosConfiguration = mkOption {
type = types.nullOr types.str;
default = null;
@ -62,6 +68,119 @@ let
default = null;
description = "Whether nix-agent should roll back when the health check fails";
};
drainBeforeApply = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the controller should drain the node before issuing this desired system";
};
};
};
mkHostDeploymentSelectorType = types: types.submodule {
options = {
nodeIds = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Explicit node IDs targeted by the deployment";
};
roles = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node roles targeted by the deployment";
};
pools = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node pools targeted by the deployment";
};
nodeClasses = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Node classes targeted by the deployment";
};
matchLabels = mkOption {
type = types.attrsOf types.str;
default = { };
description = "Label selectors applied to target nodes";
};
};
};
mkHostDeploymentType = types:
let
selectorType = mkHostDeploymentSelectorType types;
in types.submodule {
options = {
selector = mkOption {
type = selectorType;
default = { };
description = "Node selector used by the host deployment";
};
nixosConfiguration = mkOption {
type = types.nullOr types.str;
default = null;
description = "Name of the nixosConfigurations output to roll out";
};
flakeRef = mkOption {
type = types.nullOr types.str;
default = null;
description = "Explicit flake reference used during rollout";
};
batchSize = mkOption {
type = types.nullOr types.int;
default = null;
description = "Maximum number of nodes started per reconciliation wave";
};
maxUnavailable = mkOption {
type = types.nullOr types.int;
default = null;
description = "Maximum number of unavailable nodes allowed during rollout";
};
healthCheckCommand = mkOption {
type = types.listOf types.str;
default = [ ];
description = "Health check command executed by nix-agent after activation";
};
switchAction = mkOption {
type = types.nullOr types.str;
default = null;
description = "switch-to-configuration action used by nix-agent";
};
rollbackOnFailure = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether nodes should roll back when rollout health checks fail";
};
drainBeforeApply = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the controller should drain a node before applying the rollout";
};
rebootPolicy = mkOption {
type = types.nullOr types.str;
default = null;
description = "Operator-facing reboot policy associated with the rollout";
};
paused = mkOption {
type = types.nullOr types.bool;
default = null;
description = "Whether the rollout should start in a paused state";
};
};
};
@ -159,6 +278,30 @@ let
default = null;
description = "Desired deployer node lifecycle state";
};
commissionState = mkOption {
type = types.nullOr (types.enum [ "discovered" "commissioning" "commissioned" ]);
default = null;
description = "Optional commissioning state exported into deployer cluster state";
};
installState = mkOption {
type = types.nullOr (types.enum [ "pending" "installing" "installed" "failed" "reinstall_requested" ]);
default = null;
description = "Optional install lifecycle state exported into deployer cluster state";
};
powerState = mkOption {
type = types.nullOr (types.enum [ "on" "off" "cycling" "unknown" ]);
default = null;
description = "Optional external power-management state associated with the node";
};
bmcRef = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional BMC / Redfish reference associated with the node";
};
};
};
@ -339,7 +482,10 @@ let
mkDesiredSystem = nodeName: desiredSystem:
let
rendered =
optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
optionalAttrs (desiredSystem != null && desiredSystem.deploymentId != null) {
deployment_id = desiredSystem.deploymentId;
}
// optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
nixos_configuration = desiredSystem.nixosConfiguration;
}
// optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) {
@ -353,12 +499,60 @@ let
}
// optionalAttrs (desiredSystem != null && desiredSystem.rollbackOnFailure != null) {
rollback_on_failure = desiredSystem.rollbackOnFailure;
}
// optionalAttrs (desiredSystem != null && desiredSystem.drainBeforeApply != null) {
drain_before_apply = desiredSystem.drainBeforeApply;
};
in
if desiredSystem == null || rendered == { } then null else {
node_id = nodeName;
} // rendered;
mkHostDeploymentSelector = selector:
{
node_ids = selector.nodeIds or [ ];
roles = selector.roles or [ ];
pools = selector.pools or [ ];
node_classes = selector.nodeClasses or [ ];
match_labels = selector.matchLabels or { };
};
mkDeployerHostDeploymentSpec = name: deployment:
{
inherit name;
selector = mkHostDeploymentSelector deployment.selector;
}
// optionalAttrs (deployment.nixosConfiguration != null) {
nixos_configuration = deployment.nixosConfiguration;
}
// optionalAttrs (deployment.flakeRef != null) {
flake_ref = deployment.flakeRef;
}
// optionalAttrs (deployment.batchSize != null) {
batch_size = deployment.batchSize;
}
// optionalAttrs (deployment.maxUnavailable != null) {
max_unavailable = deployment.maxUnavailable;
}
// optionalAttrs (deployment.healthCheckCommand != [ ]) {
health_check_command = deployment.healthCheckCommand;
}
// optionalAttrs (deployment.switchAction != null) {
switch_action = deployment.switchAction;
}
// optionalAttrs (deployment.rollbackOnFailure != null) {
rollback_on_failure = deployment.rollbackOnFailure;
}
// optionalAttrs (deployment.drainBeforeApply != null) {
drain_before_apply = deployment.drainBeforeApply;
}
// optionalAttrs (deployment.rebootPolicy != null) {
reboot_policy = deployment.rebootPolicy;
}
// optionalAttrs (deployment.paused != null) {
paused = deployment.paused;
};
mkDeployerNodeSpec = nodeName: node:
{
node_id = nodeName;
@ -390,6 +584,18 @@ let
}
// optionalAttrs (node.state != null) {
state = node.state;
}
// optionalAttrs (node.commissionState != null) {
commission_state = node.commissionState;
}
// optionalAttrs (node.installState != null) {
install_state = node.installState;
}
// optionalAttrs (node.powerState != null) {
power_state = node.powerState;
}
// optionalAttrs (node.bmcRef != null) {
bmc_ref = node.bmcRef;
};
mkDeployerNodeClassSpec = name: nodeClass:
@ -522,6 +728,7 @@ let
nodeClasses = deployer.nodeClasses or { };
pools = deployer.pools or { };
enrollmentRules = deployer.enrollmentRules or { };
hostDeployments = deployer.hostDeployments or { };
in {
cluster = {
cluster_id = clusterId;
@ -532,6 +739,7 @@ let
node_classes = map (name: mkDeployerNodeClassSpec name nodeClasses.${name}) (attrNames nodeClasses);
pools = map (name: mkDeployerPoolSpec name pools.${name}) (attrNames pools);
enrollment_rules = map (name: mkDeployerEnrollmentRuleSpec name enrollmentRules.${name}) (attrNames enrollmentRules);
host_deployments = map (name: mkDeployerHostDeploymentSpec name hostDeployments.${name}) (attrNames hostDeployments);
services = [ ];
instances = [ ];
mtls_policies = [ ];
@ -541,6 +749,8 @@ in
inherit
mkInstallPlanType
mkDesiredSystemType
mkHostDeploymentSelectorType
mkHostDeploymentType
mkNodeType
mkNodeClassType
mkNodePoolType

View file

@ -2,8 +2,61 @@
let
cfg = config.services.coronafs;
chainfireEnabled = lib.hasAttrByPath [ "services" "chainfire" "enable" ] config && config.services.chainfire.enable;
chainfireApiUrls =
if cfg.chainfireApiUrl != null then
lib.filter (item: item != "") (map lib.strings.trim (lib.splitString "," cfg.chainfireApiUrl))
else
[ ];
effectiveChainfireApiUrl =
if cfg.chainfireApiUrl != null then cfg.chainfireApiUrl
else if chainfireEnabled then "http://127.0.0.1:${toString config.services.chainfire.httpPort}"
else null;
localChainfireApiUrl =
lib.any
(url:
lib.hasPrefix "http://127.0.0.1:" url
|| lib.hasPrefix "http://localhost:" url
)
(
if effectiveChainfireApiUrl == null then
[ ]
else if cfg.chainfireApiUrl != null then
chainfireApiUrls
else
[ effectiveChainfireApiUrl ]
);
waitForChainfire =
pkgs.writeShellScript "coronafs-wait-for-chainfire" ''
set -eu
deadline=$((SECONDS + 60))
urls='${lib.concatStringsSep " " (
if effectiveChainfireApiUrl == null then
[ ]
else if cfg.chainfireApiUrl != null then
chainfireApiUrls
else
[ effectiveChainfireApiUrl ]
)}'
while true; do
for url in $urls; do
if curl -fsS "$url/health" >/dev/null 2>&1; then
exit 0
fi
done
if [ "$SECONDS" -ge "$deadline" ]; then
echo "timed out waiting for ChainFire at ${if effectiveChainfireApiUrl == null then "(none)" else effectiveChainfireApiUrl}" >&2
exit 1
fi
sleep 1
done
'';
tomlFormat = pkgs.formats.toml { };
coronafsConfigFile = tomlFormat.generate "coronafs.toml" {
coronafsConfigFile = tomlFormat.generate "coronafs.toml" (
{
mode = cfg.mode;
metadata_backend = cfg.metadataBackend;
chainfire_key_prefix = cfg.chainfireKeyPrefix;
listen_addr = "0.0.0.0:${toString cfg.port}";
advertise_host = cfg.advertiseHost;
data_dir = toString cfg.dataDir;
@ -20,12 +73,41 @@ let
qemu_nbd_path = "${pkgs.qemu}/bin/qemu-nbd";
qemu_img_path = "${pkgs.qemu}/bin/qemu-img";
log_level = "info";
};
}
// lib.optionalAttrs (effectiveChainfireApiUrl != null) {
chainfire_api_url = effectiveChainfireApiUrl;
}
);
in
{
options.services.coronafs = {
enable = lib.mkEnableOption "CoronaFS block volume service";
mode = lib.mkOption {
type = lib.types.enum [ "combined" "controller" "node" ];
default = "combined";
description = "CoronaFS operating mode: combined compatibility mode, controller-only API, or node-local export mode.";
};
metadataBackend = lib.mkOption {
type = lib.types.enum [ "filesystem" "chainfire" ];
default = "filesystem";
description = "Metadata backend for CoronaFS volume metadata. Use chainfire on controller nodes to replicate volume metadata.";
};
chainfireApiUrl = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "Optional ChainFire HTTP API URL used when metadataBackend = chainfire. Comma-separated endpoints are allowed for failover.";
example = "http://127.0.0.1:8081";
};
chainfireKeyPrefix = lib.mkOption {
type = lib.types.str;
default = "/coronafs/volumes";
description = "ChainFire key prefix used to store CoronaFS metadata when metadataBackend = chainfire.";
};
port = lib.mkOption {
type = lib.types.port;
default = 50088;
@ -71,7 +153,7 @@ in
exportAioMode = lib.mkOption {
type = lib.types.enum [ "native" "io_uring" "threads" ];
default = "io_uring";
default = "threads";
description = "qemu-nbd AIO mode for CoronaFS exports.";
};
@ -113,11 +195,22 @@ in
};
config = lib.mkIf cfg.enable {
assertions = [
{
assertion = cfg.metadataBackend != "chainfire" || effectiveChainfireApiUrl != null;
message = "services.coronafs.metadataBackend = \"chainfire\" requires services.coronafs.chainfireApiUrl or a local services.chainfire instance.";
}
];
users.users.coronafs = {
isSystemUser = true;
group = "coronafs";
description = "CoronaFS service user";
home = cfg.dataDir;
extraGroups =
lib.optional
(lib.hasAttrByPath [ "services" "plasmavmc" "enable" ] config && config.services.plasmavmc.enable)
"plasmavmc";
};
users.groups.coronafs = { };
@ -125,8 +218,9 @@ in
systemd.services.coronafs = {
description = "CoronaFS Block Volume Service";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils ];
after = [ "network.target" ] ++ lib.optionals chainfireEnabled [ "chainfire.service" ];
wants = lib.optionals chainfireEnabled [ "chainfire.service" ];
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils pkgs.curl ];
serviceConfig = {
Type = "simple";
@ -138,13 +232,14 @@ in
StateDirectory = "coronafs";
StateDirectoryMode = "0750";
ReadWritePaths = [ cfg.dataDir ];
ExecStartPre = lib.optionals (cfg.metadataBackend == "chainfire" && localChainfireApiUrl) [ waitForChainfire ];
ExecStart = "${cfg.package}/bin/coronafs-server --config ${coronafsConfigFile}";
};
};
systemd.tmpfiles.rules = [
"d ${toString cfg.dataDir} 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/volumes 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/volumes 2770 coronafs coronafs -"
"d ${toString cfg.dataDir}/metadata 0750 coronafs coronafs -"
"d ${toString cfg.dataDir}/pids 0750 coronafs coronafs -"
];

View file

@ -3,6 +3,23 @@
let
cfg = config.services.deployer;
tomlFormat = pkgs.formats.toml { };
usesLocalChainfire =
builtins.any
(
endpoint:
lib.hasPrefix "http://127.0.0.1:" endpoint
|| lib.hasPrefix "http://localhost:" endpoint
|| lib.hasPrefix "http://[::1]:" endpoint
)
cfg.chainfireEndpoints;
localChainfireDeps =
lib.optionals
(
usesLocalChainfire
&& lib.hasAttrByPath [ "services" "chainfire" "enable" ] config
&& config.services.chainfire.enable
)
[ "chainfire.service" ];
generatedConfig = {
bind_addr = cfg.bindAddr;
chainfire = {
@ -226,7 +243,9 @@ in
systemd.services.deployer = {
description = "PlasmaCloud Deployer Server";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
wants = [ "network-online.target" ] ++ localChainfireDeps;
after = [ "network-online.target" ] ++ localChainfireDeps;
requires = localChainfireDeps;
environment = {}
// lib.optionalAttrs (cfg.bootstrapToken != null) {

View file

@ -285,7 +285,7 @@ in
healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
leaderUrlKey = "flaredb_leader_url";
defaultLeaderUrl = "http://localhost:8082";
joinPath = null;
joinPath = "/admin/member/add";
port = cfg.flaredbPort;
description = "FlareDB";
} // {

View file

@ -297,6 +297,30 @@ in
description = "Prometheus metrics port for lightningstor-node.";
};
s3StreamingPutThresholdBytes = lib.mkOption {
type = lib.types.int;
default = 64 * 1024 * 1024;
description = "Streaming PUT multipart threshold for the S3 frontend.";
};
s3InlinePutMaxBytes = lib.mkOption {
type = lib.types.int;
default = 128 * 1024 * 1024;
description = "Maximum inline single-PUT size for the S3 frontend.";
};
s3MultipartPutConcurrency = lib.mkOption {
type = lib.types.int;
default = 4;
description = "Maximum in-flight multipart PUT part uploads.";
};
s3MultipartFetchConcurrency = lib.mkOption {
type = lib.types.int;
default = 4;
description = "Maximum concurrent multipart GET part fetches.";
};
databaseUrl = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
@ -369,6 +393,14 @@ in
environment = {
RUST_LOG = "info";
LIGHTNINGSTOR_S3_STREAMING_PUT_THRESHOLD_BYTES =
toString cfg.s3StreamingPutThresholdBytes;
LIGHTNINGSTOR_S3_INLINE_PUT_MAX_BYTES =
toString cfg.s3InlinePutMaxBytes;
LIGHTNINGSTOR_S3_MULTIPART_PUT_CONCURRENCY =
toString cfg.s3MultipartPutConcurrency;
LIGHTNINGSTOR_S3_MULTIPART_FETCH_CONCURRENCY =
toString cfg.s3MultipartFetchConcurrency;
};
};
};

View file

@ -9,6 +9,7 @@ let
nodeClassType = clusterConfigLib.mkNodeClassType types;
nodePoolType = clusterConfigLib.mkNodePoolType types;
enrollmentRuleType = clusterConfigLib.mkEnrollmentRuleType types;
hostDeploymentType = clusterConfigLib.mkHostDeploymentType types;
jsonFormat = pkgs.formats.json { };
# Generate cluster-config.json for the current node
@ -98,6 +99,12 @@ in {
default = { };
description = "Deployer auto-enrollment rules derived from Nix";
};
hostDeployments = mkOption {
type = types.attrsOf hostDeploymentType;
default = { };
description = "Declarative host rollout objects derived from Nix";
};
};
generated = {
@ -173,6 +180,16 @@ in {
) (attrNames cfg.deployer.enrollmentRules);
message = "All deployer enrollment rules must reference existing pools and node classes";
}
{
assertion = all (deploymentName:
let
deployment = cfg.deployer.hostDeployments.${deploymentName};
in
all (pool: cfg.deployer.pools ? "${pool}") deployment.selector.pools
&& all (nodeClass: cfg.deployer.nodeClasses ? "${nodeClass}") deployment.selector.nodeClasses
) (attrNames cfg.deployer.hostDeployments);
message = "All deployer host deployments must reference existing pools and node classes";
}
];
# Generate cluster-config.json for first-boot-automation

View file

@ -2,11 +2,30 @@
let
cfg = config.services.plasmavmc;
localIamDeps = lib.optional (config.services.iam.enable or false) "iam.service";
localIamHealthUrl =
if config.services.iam.enable or false
then "http://127.0.0.1:${toString config.services.iam.httpPort}/health"
else null;
remoteIamEndpoint =
if !(config.services.iam.enable or false) && cfg.iamAddr != null
then cfg.iamAddr
else null;
coronafsEnabled = lib.hasAttrByPath [ "services" "coronafs" "enable" ] config && config.services.coronafs.enable;
coronafsDataDir =
if coronafsEnabled && lib.hasAttrByPath [ "services" "coronafs" "dataDir" ] config
then toString config.services.coronafs.dataDir
else null;
effectiveCoronafsControllerEndpoint =
if cfg.coronafsControllerEndpoint != null then cfg.coronafsControllerEndpoint
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
else null;
effectiveCoronafsNodeEndpoint =
if cfg.coronafsNodeEndpoint != null then cfg.coronafsNodeEndpoint
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
else null;
tomlFormat = pkgs.formats.toml { };
plasmavmcConfigFile = tomlFormat.generate "plasmavmc.toml" {
addr = "0.0.0.0:${toString cfg.port}";
@ -94,10 +113,41 @@ in
coronafsEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
description = "Deprecated combined CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
example = "http://10.0.0.11:50088";
};
coronafsControllerEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS controller HTTP endpoint used to provision and resize managed VM volumes. Comma-separated endpoints are allowed for client-side failover.";
example = "http://10.0.0.11:50088";
};
coronafsNodeEndpoint = lib.mkOption {
type = lib.types.nullOr lib.types.str;
default = null;
description = "CoronaFS node-local HTTP endpoint used to resolve local paths and exports for attached VM volumes. Comma-separated endpoints are allowed for client-side failover.";
example = "http://127.0.0.1:50088";
};
coronafsNodeLocalAttach = lib.mkOption {
type = lib.types.bool;
default = false;
description = ''
Enable writable VM attachment through node-local CoronaFS materialization.
This requires services.plasmavmc.sharedLiveMigration = false because migrations use cold relocate plus flush-back.
'';
};
experimentalCoronafsNodeLocalAttach = lib.mkOption {
type = lib.types.bool;
default = false;
description = ''
Deprecated alias for services.plasmavmc.coronafsNodeLocalAttach.
'';
};
managedVolumeRoot = lib.mkOption {
type = lib.types.path;
default = "/var/lib/plasmavmc/managed-volumes";
@ -173,6 +223,24 @@ in
};
config = lib.mkIf cfg.enable {
assertions = [
{
assertion = !((cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) && cfg.sharedLiveMigration);
message = ''
services.plasmavmc.coronafsNodeLocalAttach requires services.plasmavmc.sharedLiveMigration = false
because writable node-local CoronaFS attachment uses cold relocate plus flush-back instead of shared-storage live migration.
'';
}
];
warnings =
lib.optional (cfg.coronafsEndpoint != null) ''
services.plasmavmc.coronafsEndpoint is deprecated; use services.plasmavmc.coronafsControllerEndpoint and services.plasmavmc.coronafsNodeEndpoint.
''
++ lib.optional (cfg.experimentalCoronafsNodeLocalAttach) ''
services.plasmavmc.experimentalCoronafsNodeLocalAttach is deprecated; use services.plasmavmc.coronafsNodeLocalAttach.
'';
# Create system user
users.users.plasmavmc = {
isSystemUser = true;
@ -188,9 +256,35 @@ in
systemd.services.plasmavmc = {
description = "PlasmaVMC Virtual Machine Compute Service";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" "prismnet.service" "flaredb.service" "chainfire.service" ];
wants = [ "prismnet.service" "flaredb.service" "chainfire.service" ];
path = [ pkgs.qemu pkgs.coreutils ];
after = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
wants = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
path = [ pkgs.qemu pkgs.coreutils pkgs.curl ];
preStart =
lib.optionalString (localIamHealthUrl != null) ''
for _ in $(seq 1 90); do
if curl -fsS ${lib.escapeShellArg localIamHealthUrl} >/dev/null 2>&1; then
exit 0
fi
sleep 1
done
echo "plasmavmc: timed out waiting for local IAM health at ${localIamHealthUrl}" >&2
exit 1
''
+ lib.optionalString (remoteIamEndpoint != null) ''
endpoint=${lib.escapeShellArg remoteIamEndpoint}
endpoint="''${endpoint#http://}"
endpoint="''${endpoint#https://}"
host="''${endpoint%:*}"
port="''${endpoint##*:}"
for _ in $(${pkgs.coreutils}/bin/seq 1 90); do
if ${pkgs.coreutils}/bin/timeout 1 ${pkgs.bash}/bin/bash -lc "</dev/tcp/''${host}/''${port}" >/dev/null 2>&1; then
exit 0
fi
sleep 1
done
echo "plasmavmc: timed out waiting for IAM gRPC at ''${host}:''${port}" >&2
exit 1
'';
environment = lib.mkMerge [
{
@ -213,6 +307,16 @@ in
(lib.mkIf (cfg.lightningstorAddr != null) {
PLASMAVMC_LIGHTNINGSTOR_ENDPOINT = cfg.lightningstorAddr;
})
(lib.mkIf (effectiveCoronafsControllerEndpoint != null) {
PLASMAVMC_CORONAFS_CONTROLLER_ENDPOINT = effectiveCoronafsControllerEndpoint;
})
(lib.mkIf (effectiveCoronafsNodeEndpoint != null) {
PLASMAVMC_CORONAFS_NODE_ENDPOINT = effectiveCoronafsNodeEndpoint;
})
(lib.mkIf (cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) {
PLASMAVMC_CORONAFS_NODE_LOCAL_ATTACH = "1";
PLASMAVMC_CORONAFS_ENABLE_EXPERIMENTAL_NODE_LOCAL_ATTACH = "1";
})
(lib.mkIf (cfg.coronafsEndpoint != null) {
PLASMAVMC_CORONAFS_ENDPOINT = cfg.coronafsEndpoint;
})
@ -273,6 +377,8 @@ in
systemd.tmpfiles.rules = [
"d ${builtins.dirOf (toString cfg.managedVolumeRoot)} 0755 plasmavmc plasmavmc -"
"d ${toString cfg.managedVolumeRoot} 0750 plasmavmc plasmavmc -"
] ++ lib.optionals coronafsEnabled [
"d ${toString cfg.dataDir}/images 2770 plasmavmc coronafs -"
];
};
}

View file

@ -108,6 +108,19 @@
};
};
};
hostDeployments = {
control-plane-canary = {
selector.nodeIds = [ "node01" ];
nixosConfiguration = "node01";
flakeRef = "github:centra/cloud";
batchSize = 1;
maxUnavailable = 1;
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
switchAction = "switch";
rollbackOnFailure = true;
};
};
};
bootstrap.initialPeers = [ "node01" "node02" "node03" ];

View file

@ -32,8 +32,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -42,8 +42,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -42,8 +42,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "192.168.100.11:2379";
flaredbAddr = "192.168.100.11:2479";
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
};
services.openssh.enable = true;

View file

@ -63,10 +63,13 @@ Preferred entrypoint for publishable verification: `nix run ./nix/test-cluster#c
Preferred entrypoint for publishable matrix verification: `nix run ./nix/test-cluster#cluster -- fresh-matrix`
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS local-vs-shared-volume I/O, queued random-read behavior, cross-worker direct-I/O shared-volume reads, and LightningStor large/small-object S3 throughput and writes a report to `docs/storage-benchmarks.md`.
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS controller-export vs node-local-export I/O, worker-side materialization latency, and LightningStor large/small-object S3 throughput, then writes a report to `docs/storage-benchmarks.md`.
Preferred entrypoint for publishable storage numbers: `nix run ./nix/test-cluster#cluster -- fresh-storage-bench`
`nix run ./nix/test-cluster#cluster -- bench-coronafs-local-matrix` runs the local single-process CoronaFS export benchmark across the supported `cache`/`aio` combinations so software-path regressions can be separated from VM-lab network limits.
On the current lab hosts, `cache=none` with `aio=io_uring` is the strongest local-export profile and should be treated as the reference point when CoronaFS remote numbers are being distorted by the nested-QEMU/VDE network path.
## Advanced usage
Use the script entrypoint only for local debugging inside a prepared Nix shell:

View file

@ -27,6 +27,18 @@ in
default = "/tmp/photoncloud-test-cluster-vde.sock";
description = "VDE control socket path used for the east-west cluster NIC.";
};
# Shared ChainFire endpoint list consumed by the per-node service configs
# (iam, fiberlb, plasmavmc, …) so every client fails over across all three
# control-plane nodes instead of pinning to node01.
chainfireControlPlaneAddrs = lib.mkOption {
type = lib.types.str;
default = "10.100.0.11:2379,10.100.0.12:2379,10.100.0.13:2379";
description = "Comma-separated ChainFire client endpoints for multi-endpoint failover.";
};
# FlareDB counterpart to the ChainFire endpoint list: same three nodes,
# FlareDB API port 2479 instead of 2379.
flaredbControlPlaneAddrs = lib.mkOption {
type = lib.types.str;
default = "10.100.0.11:2479,10.100.0.12:2479,10.100.0.13:2479";
description = "Comma-separated FlareDB client endpoints for multi-endpoint failover.";
};
};
config = {
@ -84,10 +96,43 @@ in
system.stateVersion = "24.05";
# One-shot unit that disables NIC offloads on the east-west cluster
# interface so nested-QEMU/VDE storage benchmarks are not skewed by
# guest-side segmentation/checksum offloading.
systemd.services.photon-test-cluster-net-tuning = {
description = "Tune cluster NIC offloads for nested-QEMU storage tests";
wantedBy = [ "multi-user.target" ];
# Best-effort ordering after the network is up; the script additionally
# polls for the interface itself below.
after = [ "network-online.target" ];
wants = [ "network-online.target" ];
serviceConfig = {
# oneshot + RemainAfterExit: run once at boot and stay "active" so the
# tuning is not re-applied and dependents can order after it.
Type = "oneshot";
RemainAfterExit = true;
};
path = [ pkgs.ethtool pkgs.iproute2 pkgs.coreutils ];
script = ''
set -eu
# eth1 is the cluster (east-west) NIC — assumed from the surrounding
# test-cluster topology; confirm if NIC ordering changes.
iface="eth1"
# Wait up to 30s for the interface to appear (udev/QEMU hotplug races).
for _ in $(seq 1 30); do
if ip link show "$iface" >/dev/null 2>&1; then
break
fi
sleep 1
done
# Interface never showed up: skip (exit 0) rather than fail the boot —
# tuning is an optimization, not a requirement.
if ! ip link show "$iface" >/dev/null 2>&1; then
echo "photon-test-cluster-net-tuning: $iface not present, skipping" >&2
exit 0
fi
# Nested QEMU over VDE is sensitive to guest-side offloads; disabling
# them reduces retransmits and keeps the storage benchmarks closer to
# raw TCP throughput.
# "|| true": ethtool may reject flags the virtual NIC does not expose.
ethtool -K "$iface" tso off gso off gro off tx off rx off sg off || true
# Deeper tx queue absorbs bursts once offloads are off.
ip link set dev "$iface" txqueuelen 10000 || true
'';
};
environment.systemPackages = with pkgs; [
awscli2
curl
dnsutils
ethtool
fio
jq
grpcurl

View file

@ -115,12 +115,17 @@
curl
grpcurl
jq
llvmPackages.clang
llvmPackages.libclang
openssh
protobuf
clusterPython
qemu
sshpass
vde2
];
LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib";
PROTOC = "${pkgs.protobuf}/bin/protoc";
};
};
}

View file

@ -69,29 +69,29 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.prismnet = {
enable = true;
port = 50081;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.flashdns = {
enable = true;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.fiberlb = {
enable = true;
port = 50085;
iamAddr = "10.100.0.11:50080";
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
services.plasmavmc = {
@ -101,14 +101,17 @@
httpPort = 8084;
prismnetAddr = "10.100.0.11:50081";
iamAddr = "10.100.0.11:50080";
chainfireAddr = "10.100.0.11:2379";
flaredbAddr = "10.100.0.11:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
lightningstorAddr = "10.100.0.11:50086";
coronafsEndpoint = "http://10.100.0.11:50088";
coronafsControllerEndpoint = "http://127.0.0.1:50088";
coronafsNodeEndpoint = "http://127.0.0.1:50088";
};
services.coronafs = {
enable = true;
metadataBackend = "chainfire";
chainfireKeyPrefix = "/coronafs/test-cluster/control/volumes";
port = 50088;
advertiseHost = "10.100.0.11";
exportBasePort = 11000;
@ -138,9 +141,9 @@
readQuorum = 1;
writeQuorum = 2;
nodeMetricsPort = 9198;
chainfireAddr = "10.100.0.11:2379";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
iamAddr = "10.100.0.11:50080";
flaredbAddr = "10.100.0.11:2479";
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
zone = "zone-a";
region = "test";
};
@ -149,10 +152,10 @@
enable = true;
port = 50087;
iamAddr = "http://10.100.0.11:50080";
chainfireAddr = "http://10.100.0.11:2379";
chainfireAddr = "http://${config.photonTestCluster.chainfireControlPlaneAddrs}";
prismnetAddr = "http://10.100.0.11:50081";
flaredbPdAddr = "10.100.0.11:2379";
flaredbDirectAddr = "10.100.0.11:2479";
flaredbPdAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbDirectAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
fiberlbAddr = "http://10.100.0.11:50085";
flashdnsAddr = "http://10.100.0.11:50084";
};

View file

@ -41,7 +41,6 @@
nodeId = "node02";
raftAddr = "10.100.0.12:2480";
apiAddr = "10.100.0.12:2479";
pdAddr = "10.100.0.11:2379";
initialPeers = [
"node01=10.100.0.11:2479"
"node02=10.100.0.12:2479"
@ -63,8 +62,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.12:2379";
flaredbAddr = "10.100.0.12:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
systemd.services.iam.environment = {

View file

@ -41,7 +41,6 @@
nodeId = "node03";
raftAddr = "10.100.0.13:2480";
apiAddr = "10.100.0.13:2479";
pdAddr = "10.100.0.11:2379";
initialPeers = [
"node01=10.100.0.11:2479"
"node02=10.100.0.12:2479"
@ -63,8 +62,8 @@
services.iam = {
enable = true;
port = 50080;
chainfireAddr = "10.100.0.13:2379";
flaredbAddr = "10.100.0.13:2479";
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
};
systemd.services.iam.environment = {

Some files were not shown because too many files have changed in this diff Show more