Implement host lifecycle orchestration and distributed storage restructuring
This commit is contained in:
parent
a7d5cfa738
commit
6fa172eab1
124 changed files with 21742 additions and 4016 deletions
552
apigateway/Cargo.lock
generated
552
apigateway/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
434
chainfire/Cargo.lock
generated
434
chainfire/Cargo.lock
generated
|
|
@ -342,6 +342,12 @@ version = "1.0.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801"
|
||||
|
||||
[[package]]
|
||||
name = "cfg_aliases"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724"
|
||||
|
||||
[[package]]
|
||||
name = "chainfire-api"
|
||||
version = "0.1.0"
|
||||
|
|
@ -471,6 +477,7 @@ dependencies = [
|
|||
"http-body-util",
|
||||
"metrics",
|
||||
"metrics-exporter-prometheus",
|
||||
"reqwest",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"tempfile",
|
||||
|
|
@ -786,6 +793,17 @@ dependencies = [
|
|||
"crypto-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dlv-list"
|
||||
version = "0.3.0"
|
||||
|
|
@ -978,8 +996,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"wasi",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -989,9 +1009,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||
checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasip2",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1150,6 +1172,7 @@ dependencies = [
|
|||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tower-service",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -1171,6 +1194,7 @@ version = "0.1.19"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
|
|
@ -1178,7 +1202,9 @@ dependencies = [
|
|||
"http",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"ipnet",
|
||||
"libc",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"socket2 0.6.1",
|
||||
"tokio",
|
||||
|
|
@ -1210,6 +1236,108 @@ dependencies = [
|
|||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"potential_utf",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locale_core"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599"
|
||||
dependencies = [
|
||||
"icu_collections",
|
||||
"icu_normalizer_data",
|
||||
"icu_properties",
|
||||
"icu_provider",
|
||||
"smallvec",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer_data"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a"
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec"
|
||||
dependencies = [
|
||||
"icu_collections",
|
||||
"icu_locale_core",
|
||||
"icu_properties_data",
|
||||
"icu_provider",
|
||||
"zerotrie",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties_data"
|
||||
version = "2.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af"
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "2.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locale_core",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerotrie",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de"
|
||||
dependencies = [
|
||||
"idna_adapter",
|
||||
"smallvec",
|
||||
"utf8_iter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna_adapter"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344"
|
||||
dependencies = [
|
||||
"icu_normalizer",
|
||||
"icu_properties",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
|
|
@ -1236,6 +1364,16 @@ version = "2.11.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
|
||||
|
||||
[[package]]
|
||||
name = "iri-string"
|
||||
version = "0.7.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is-terminal"
|
||||
version = "0.4.17"
|
||||
|
|
@ -1367,6 +1505,12 @@ version = "0.11.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.14"
|
||||
|
|
@ -1382,6 +1526,12 @@ version = "0.4.29"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897"
|
||||
|
||||
[[package]]
|
||||
name = "lru-slab"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154"
|
||||
|
||||
[[package]]
|
||||
name = "lz4-sys"
|
||||
version = "1.11.1+lz4-1.10.0"
|
||||
|
|
@ -1730,6 +1880,15 @@ dependencies = [
|
|||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "potential_utf"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77"
|
||||
dependencies = [
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.21"
|
||||
|
|
@ -1889,6 +2048,61 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"cfg_aliases",
|
||||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"socket2 0.6.1",
|
||||
"thiserror 2.0.17",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-proto"
|
||||
version = "0.11.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"getrandom 0.3.4",
|
||||
"lru-slab",
|
||||
"rand 0.9.2",
|
||||
"ring",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"slab",
|
||||
"thiserror 2.0.17",
|
||||
"tinyvec",
|
||||
"tracing",
|
||||
"web-time",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn-udp"
|
||||
version = "0.5.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd"
|
||||
dependencies = [
|
||||
"cfg_aliases",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"socket2 0.6.1",
|
||||
"tracing",
|
||||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.42"
|
||||
|
|
@ -2030,6 +2244,44 @@ version = "0.8.8"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.12.28"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"http",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-util",
|
||||
"js-sys",
|
||||
"log",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"quinn",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tower 0.5.2",
|
||||
"tower-http",
|
||||
"tower-service",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
"webpki-roots",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
|
|
@ -2137,6 +2389,7 @@ version = "1.13.1"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "708c0f9d5f54ba0272468c1d306a52c495b31fa155e91bc25371e6df7996908c"
|
||||
dependencies = [
|
||||
"web-time",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
|
|
@ -2359,6 +2612,12 @@ dependencies = [
|
|||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.11.1"
|
||||
|
|
@ -2387,6 +2646,20 @@ name = "sync_wrapper"
|
|||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
|
|
@ -2450,6 +2723,16 @@ dependencies = [
|
|||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinytemplate"
|
||||
version = "1.2.1"
|
||||
|
|
@ -2460,6 +2743,21 @@ dependencies = [
|
|||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3"
|
||||
dependencies = [
|
||||
"tinyvec_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec_macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "tokio"
|
||||
version = "1.48.0"
|
||||
|
|
@ -2676,9 +2974,12 @@ checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8"
|
|||
dependencies = [
|
||||
"bitflags 2.10.0",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http-body",
|
||||
"iri-string",
|
||||
"pin-project-lite",
|
||||
"tower 0.5.2",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
|
|
@ -2788,6 +3089,24 @@ version = "0.9.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
"percent-encoding",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||
|
||||
[[package]]
|
||||
name = "utf8parse"
|
||||
version = "0.2.2"
|
||||
|
|
@ -2871,6 +3190,19 @@ dependencies = [
|
|||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.56"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.106"
|
||||
|
|
@ -2913,6 +3245,25 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-time"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "webpki-roots"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed"
|
||||
dependencies = [
|
||||
"rustls-pki-types",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
|
|
@ -3174,6 +3525,12 @@ version = "0.46.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.6.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9"
|
||||
|
||||
[[package]]
|
||||
name = "yaml-rust"
|
||||
version = "0.4.5"
|
||||
|
|
@ -3183,6 +3540,29 @@ dependencies = [
|
|||
"linked-hash-map",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954"
|
||||
dependencies = [
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.31"
|
||||
|
|
@ -3203,12 +3583,66 @@ dependencies = [
|
|||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5"
|
||||
dependencies = [
|
||||
"zerofrom-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom-derive"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0"
|
||||
|
||||
[[package]]
|
||||
name = "zerotrie"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.11.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.11.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-sys"
|
||||
version = "2.0.16+zstd.1.5.7"
|
||||
|
|
|
|||
|
|
@ -18,11 +18,17 @@ use chainfire_proto::proto::{
|
|||
StatusRequest,
|
||||
TxnRequest,
|
||||
};
|
||||
use std::time::Duration;
|
||||
use tonic::Code;
|
||||
use tonic::transport::Channel;
|
||||
use tracing::debug;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
/// Chainfire client
|
||||
pub struct Client {
|
||||
/// Configured client endpoints
|
||||
endpoints: Vec<String>,
|
||||
/// Preferred endpoint index
|
||||
current_endpoint: usize,
|
||||
/// gRPC channel
|
||||
channel: Channel,
|
||||
/// KV client
|
||||
|
|
@ -34,36 +40,187 @@ pub struct Client {
|
|||
impl Client {
|
||||
/// Connect to a Chainfire server
|
||||
pub async fn connect(addr: impl AsRef<str>) -> Result<Self> {
|
||||
let addr = addr.as_ref().to_string();
|
||||
debug!(addr = %addr, "Connecting to Chainfire");
|
||||
let endpoints = parse_endpoints(addr.as_ref())?;
|
||||
let mut last_error = None;
|
||||
|
||||
let channel = Channel::from_shared(addr)
|
||||
.map_err(|e| ClientError::Connection(e.to_string()))?
|
||||
.connect()
|
||||
.await?;
|
||||
|
||||
let kv = KvClient::new(channel.clone());
|
||||
let cluster = ClusterClient::new(channel.clone());
|
||||
|
||||
Ok(Self {
|
||||
for (index, endpoint) in endpoints.iter().enumerate() {
|
||||
match connect_endpoint(endpoint).await {
|
||||
Ok((channel, kv, cluster)) => {
|
||||
debug!(endpoint = %endpoint, "Connected to Chainfire");
|
||||
let mut client = Self {
|
||||
endpoints: endpoints.clone(),
|
||||
current_endpoint: index,
|
||||
channel,
|
||||
kv,
|
||||
cluster,
|
||||
})
|
||||
};
|
||||
client.promote_leader_endpoint().await?;
|
||||
return Ok(client);
|
||||
}
|
||||
Err(error) => {
|
||||
warn!(endpoint = %endpoint, error = %error, "Chainfire endpoint connect failed");
|
||||
last_error = Some(error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_error.unwrap_or_else(|| ClientError::Connection("no Chainfire endpoints configured".to_string())))
|
||||
}
|
||||
|
||||
async fn with_kv_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
|
||||
where
|
||||
F: FnMut(KvClient<Channel>) -> Fut,
|
||||
Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
|
||||
{
|
||||
let max_attempts = self.endpoints.len().max(1) * 3;
|
||||
let mut last_status = None;
|
||||
for attempt in 0..max_attempts {
|
||||
let client = self.kv.clone();
|
||||
match op(client).await {
|
||||
Ok(value) => return Ok(value),
|
||||
Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
|
||||
warn!(
|
||||
endpoint = %self.endpoints[self.current_endpoint],
|
||||
code = ?status.code(),
|
||||
message = %status.message(),
|
||||
attempt = attempt + 1,
|
||||
max_attempts,
|
||||
"retrying Chainfire KV RPC on alternate endpoint"
|
||||
);
|
||||
last_status = Some(status);
|
||||
self.recover_after_status(last_status.as_ref().unwrap()).await?;
|
||||
tokio::time::sleep(retry_delay(attempt)).await;
|
||||
}
|
||||
Err(status) => return Err(status.into()),
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire KV retry exhausted")).into())
|
||||
}
|
||||
|
||||
async fn with_cluster_retry<T, F, Fut>(&mut self, mut op: F) -> Result<T>
|
||||
where
|
||||
F: FnMut(ClusterClient<Channel>) -> Fut,
|
||||
Fut: std::future::Future<Output = std::result::Result<T, tonic::Status>>,
|
||||
{
|
||||
let max_attempts = self.endpoints.len().max(1) * 3;
|
||||
let mut last_status = None;
|
||||
for attempt in 0..max_attempts {
|
||||
let client = self.cluster.clone();
|
||||
match op(client).await {
|
||||
Ok(value) => return Ok(value),
|
||||
Err(status) if attempt + 1 < max_attempts && is_retryable_status(&status) => {
|
||||
warn!(
|
||||
endpoint = %self.endpoints[self.current_endpoint],
|
||||
code = ?status.code(),
|
||||
message = %status.message(),
|
||||
attempt = attempt + 1,
|
||||
max_attempts,
|
||||
"retrying Chainfire cluster RPC on alternate endpoint"
|
||||
);
|
||||
last_status = Some(status);
|
||||
self.recover_after_status(last_status.as_ref().unwrap()).await?;
|
||||
tokio::time::sleep(retry_delay(attempt)).await;
|
||||
}
|
||||
Err(status) => return Err(status.into()),
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_status.unwrap_or_else(|| tonic::Status::unavailable("Chainfire cluster retry exhausted")).into())
|
||||
}
|
||||
|
||||
async fn recover_after_status(&mut self, status: &tonic::Status) -> Result<()> {
|
||||
if let Some(leader_idx) = self.discover_leader_endpoint().await? {
|
||||
if leader_idx != self.current_endpoint {
|
||||
return self.reconnect_to_index(leader_idx).await;
|
||||
}
|
||||
}
|
||||
|
||||
if self.endpoints.len() > 1 {
|
||||
let next = (self.current_endpoint + 1) % self.endpoints.len();
|
||||
if next != self.current_endpoint {
|
||||
return self.reconnect_to_index(next).await;
|
||||
}
|
||||
}
|
||||
|
||||
Err(ClientError::Rpc(status.clone()))
|
||||
}
|
||||
|
||||
async fn reconnect_to_index(&mut self, index: usize) -> Result<()> {
|
||||
let endpoint = self
|
||||
.endpoints
|
||||
.get(index)
|
||||
.ok_or_else(|| ClientError::Connection(format!("invalid Chainfire endpoint index {index}")))?
|
||||
.clone();
|
||||
let (channel, kv, cluster) = connect_endpoint(&endpoint).await?;
|
||||
self.current_endpoint = index;
|
||||
self.channel = channel;
|
||||
self.kv = kv;
|
||||
self.cluster = cluster;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn promote_leader_endpoint(&mut self) -> Result<()> {
|
||||
if let Some(index) = self.discover_leader_endpoint().await? {
|
||||
if index != self.current_endpoint {
|
||||
self.reconnect_to_index(index).await?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn discover_leader_endpoint(&self) -> Result<Option<usize>> {
|
||||
for (index, endpoint) in self.endpoints.iter().enumerate() {
|
||||
let mut cluster = match ClusterClient::connect(endpoint.clone()).await {
|
||||
Ok(client) => client,
|
||||
Err(error) => {
|
||||
warn!(endpoint = %endpoint, error = %error, "failed to connect while probing Chainfire leader");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
match cluster.status(StatusRequest {}).await {
|
||||
Ok(response) => {
|
||||
let status = response.into_inner();
|
||||
let member_id = status.header.as_ref().map(|header| header.member_id).unwrap_or(0);
|
||||
if status.leader != 0 && status.leader == member_id {
|
||||
return Ok(Some(index));
|
||||
}
|
||||
}
|
||||
Err(status) => {
|
||||
warn!(
|
||||
endpoint = %endpoint,
|
||||
code = ?status.code(),
|
||||
message = %status.message(),
|
||||
"failed to query Chainfire leader status"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(None)
|
||||
}
|
||||
|
||||
/// Put a key-value pair
|
||||
pub async fn put(&mut self, key: impl AsRef<[u8]>, value: impl AsRef<[u8]>) -> Result<u64> {
|
||||
let key = key.as_ref().to_vec();
|
||||
let value = value.as_ref().to_vec();
|
||||
let resp = self
|
||||
.kv
|
||||
.put(PutRequest {
|
||||
key: key.as_ref().to_vec(),
|
||||
value: value.as_ref().to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = key.clone();
|
||||
let value = value.clone();
|
||||
async move {
|
||||
kv.put(PutRequest {
|
||||
key,
|
||||
value,
|
||||
lease: 0,
|
||||
prev_kv: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(resp.header.map(|h| h.revision as u64).unwrap_or(0))
|
||||
}
|
||||
|
|
@ -86,19 +243,25 @@ impl Client {
|
|||
&mut self,
|
||||
key: impl AsRef<[u8]>,
|
||||
) -> Result<Option<(Vec<u8>, u64)>> {
|
||||
let key = key.as_ref().to_vec();
|
||||
let resp = self
|
||||
.kv
|
||||
.range(RangeRequest {
|
||||
key: key.as_ref().to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = key.clone();
|
||||
async move {
|
||||
kv.range(RangeRequest {
|
||||
key,
|
||||
range_end: vec![],
|
||||
limit: 1,
|
||||
revision: 0,
|
||||
keys_only: false,
|
||||
count_only: false,
|
||||
serializable: false, // default: linearizable read
|
||||
serializable: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(resp.kvs.into_iter().next().map(|kv| (kv.value, kv.mod_revision as u64)))
|
||||
}
|
||||
|
|
@ -132,14 +295,20 @@ impl Client {
|
|||
})),
|
||||
};
|
||||
|
||||
self.kv
|
||||
.txn(TxnRequest {
|
||||
self.with_kv_retry(|mut kv| {
|
||||
let compare = compare.clone();
|
||||
let put_op = put_op.clone();
|
||||
async move {
|
||||
kv.txn(TxnRequest {
|
||||
compare: vec![compare],
|
||||
success: vec![put_op],
|
||||
failure: vec![],
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -152,15 +321,21 @@ impl Client {
|
|||
|
||||
/// Delete a key
|
||||
pub async fn delete(&mut self, key: impl AsRef<[u8]>) -> Result<bool> {
|
||||
let key = key.as_ref().to_vec();
|
||||
let resp = self
|
||||
.kv
|
||||
.delete(DeleteRangeRequest {
|
||||
key: key.as_ref().to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = key.clone();
|
||||
async move {
|
||||
kv.delete(DeleteRangeRequest {
|
||||
key,
|
||||
range_end: vec![],
|
||||
prev_kv: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(resp.deleted > 0)
|
||||
}
|
||||
|
|
@ -171,9 +346,12 @@ impl Client {
|
|||
let range_end = prefix_end(prefix);
|
||||
|
||||
let resp = self
|
||||
.kv
|
||||
.range(RangeRequest {
|
||||
key: prefix.to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = prefix.to_vec();
|
||||
let range_end = range_end.clone();
|
||||
async move {
|
||||
kv.range(RangeRequest {
|
||||
key,
|
||||
range_end,
|
||||
limit: 0,
|
||||
revision: 0,
|
||||
|
|
@ -181,8 +359,11 @@ impl Client {
|
|||
count_only: false,
|
||||
serializable: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(resp.kvs.into_iter().map(|kv| (kv.key, kv.value)).collect())
|
||||
}
|
||||
|
|
@ -197,9 +378,12 @@ impl Client {
|
|||
let range_end = prefix_end(prefix);
|
||||
|
||||
let resp = self
|
||||
.kv
|
||||
.range(RangeRequest {
|
||||
key: prefix.to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = prefix.to_vec();
|
||||
let range_end = range_end.clone();
|
||||
async move {
|
||||
kv.range(RangeRequest {
|
||||
key,
|
||||
range_end,
|
||||
limit,
|
||||
revision: 0,
|
||||
|
|
@ -207,8 +391,11 @@ impl Client {
|
|||
count_only: false,
|
||||
serializable: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
let more = resp.more;
|
||||
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
|
||||
|
|
@ -238,18 +425,24 @@ impl Client {
|
|||
limit: i64,
|
||||
) -> Result<(Vec<(Vec<u8>, Vec<u8>, u64)>, Option<Vec<u8>>)> {
|
||||
let resp = self
|
||||
.kv
|
||||
.range(RangeRequest {
|
||||
key: start.as_ref().to_vec(),
|
||||
range_end: end.as_ref().to_vec(),
|
||||
.with_kv_retry(|mut kv| {
|
||||
let key = start.as_ref().to_vec();
|
||||
let range_end = end.as_ref().to_vec();
|
||||
async move {
|
||||
kv.range(RangeRequest {
|
||||
key,
|
||||
range_end,
|
||||
limit,
|
||||
revision: 0,
|
||||
keys_only: false,
|
||||
count_only: false,
|
||||
serializable: false,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
let more = resp.more;
|
||||
let kvs: Vec<(Vec<u8>, Vec<u8>, u64)> = resp
|
||||
|
|
@ -309,14 +502,21 @@ impl Client {
|
|||
};
|
||||
|
||||
let resp = self
|
||||
.kv
|
||||
.txn(TxnRequest {
|
||||
.with_kv_retry(|mut kv| {
|
||||
let compare = compare.clone();
|
||||
let put_op = put_op.clone();
|
||||
let read_on_fail = read_on_fail.clone();
|
||||
async move {
|
||||
kv.txn(TxnRequest {
|
||||
compare: vec![compare],
|
||||
success: vec![put_op],
|
||||
failure: vec![read_on_fail],
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
if resp.succeeded {
|
||||
let new_version = resp
|
||||
|
|
@ -371,10 +571,13 @@ impl Client {
|
|||
/// Get cluster status
|
||||
pub async fn status(&mut self) -> Result<ClusterStatus> {
|
||||
let resp = self
|
||||
.cluster
|
||||
.with_cluster_retry(|mut cluster| async move {
|
||||
cluster
|
||||
.status(StatusRequest {})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
})
|
||||
.await?;
|
||||
|
||||
Ok(ClusterStatus {
|
||||
version: resp.version,
|
||||
|
|
@ -392,15 +595,22 @@ impl Client {
|
|||
/// # Returns
|
||||
/// The node ID of the added member
|
||||
pub async fn member_add(&mut self, node_id: u64, peer_url: impl AsRef<str>, is_learner: bool) -> Result<u64> {
|
||||
let peer_url = peer_url.as_ref().to_string();
|
||||
let resp = self
|
||||
.cluster
|
||||
.with_cluster_retry(|mut cluster| {
|
||||
let peer_url = peer_url.clone();
|
||||
async move {
|
||||
cluster
|
||||
.member_add(MemberAddRequest {
|
||||
node_id,
|
||||
peer_urls: vec![peer_url.as_ref().to_string()],
|
||||
peer_urls: vec![peer_url],
|
||||
is_learner,
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
.await
|
||||
.map(|resp| resp.into_inner())
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
// Extract the member ID from the response
|
||||
let member_id = resp
|
||||
|
|
@ -410,7 +620,7 @@ impl Client {
|
|||
|
||||
debug!(
|
||||
member_id = member_id,
|
||||
peer_url = peer_url.as_ref(),
|
||||
peer_url = peer_url.as_str(),
|
||||
is_learner = is_learner,
|
||||
"Added member to cluster"
|
||||
);
|
||||
|
|
@ -441,6 +651,64 @@ pub struct CasOutcome {
|
|||
pub new_version: u64,
|
||||
}
|
||||
|
||||
fn parse_endpoints(input: &str) -> Result<Vec<String>> {
|
||||
let endpoints: Vec<String> = input
|
||||
.split(',')
|
||||
.map(str::trim)
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(normalize_endpoint)
|
||||
.collect();
|
||||
|
||||
if endpoints.is_empty() {
|
||||
return Err(ClientError::Connection("no Chainfire endpoints configured".to_string()));
|
||||
}
|
||||
|
||||
Ok(endpoints)
|
||||
}
|
||||
|
||||
fn normalize_endpoint(endpoint: &str) -> String {
|
||||
if endpoint.contains("://") {
|
||||
endpoint.to_string()
|
||||
} else {
|
||||
format!("http://{endpoint}")
|
||||
}
|
||||
}
|
||||
|
||||
async fn connect_endpoint(endpoint: &str) -> Result<(Channel, KvClient<Channel>, ClusterClient<Channel>)> {
|
||||
let channel = Channel::from_shared(endpoint.to_string())
|
||||
.map_err(|e| ClientError::Connection(e.to_string()))?
|
||||
.connect()
|
||||
.await?;
|
||||
|
||||
let kv = KvClient::new(channel.clone());
|
||||
let cluster = ClusterClient::new(channel.clone());
|
||||
Ok((channel, kv, cluster))
|
||||
}
|
||||
|
||||
fn retry_delay(attempt: usize) -> Duration {
|
||||
let multiplier = 1u64 << attempt.min(3);
|
||||
Duration::from_millis((200 * multiplier).min(1_000))
|
||||
}
|
||||
|
||||
fn is_retryable_status(status: &tonic::Status) -> bool {
|
||||
matches!(
|
||||
status.code(),
|
||||
Code::Unavailable | Code::DeadlineExceeded | Code::Internal | Code::Aborted | Code::FailedPrecondition
|
||||
) || retryable_message(status.message())
|
||||
}
|
||||
|
||||
fn retryable_message(message: &str) -> bool {
|
||||
let lowercase = message.to_ascii_lowercase();
|
||||
lowercase.contains("not leader")
|
||||
|| lowercase.contains("leader_id")
|
||||
|| lowercase.contains("transport error")
|
||||
|| lowercase.contains("connection was not ready")
|
||||
|| lowercase.contains("deadline has elapsed")
|
||||
|| lowercase.contains("broken pipe")
|
||||
|| lowercase.contains("connection reset")
|
||||
|| lowercase.contains("connection refused")
|
||||
}
|
||||
|
||||
/// Calculate prefix end for range queries
|
||||
fn prefix_end(prefix: &[u8]) -> Vec<u8> {
|
||||
let mut end = prefix.to_vec();
|
||||
|
|
@ -463,4 +731,30 @@ mod tests {
|
|||
assert_eq!(prefix_end(b"abc"), b"abd");
|
||||
assert_eq!(prefix_end(b"/nodes/"), b"/nodes0");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_endpoint_adds_http_scheme() {
|
||||
assert_eq!(normalize_endpoint("127.0.0.1:2379"), "http://127.0.0.1:2379");
|
||||
assert_eq!(normalize_endpoint("http://127.0.0.1:2379"), "http://127.0.0.1:2379");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_endpoints_accepts_comma_separated_values() {
|
||||
let endpoints = parse_endpoints("127.0.0.1:2379, http://127.0.0.2:2379").unwrap();
|
||||
assert_eq!(
|
||||
endpoints,
|
||||
vec![
|
||||
"http://127.0.0.1:2379".to_string(),
|
||||
"http://127.0.0.2:2379".to_string()
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn retryable_message_covers_not_leader_and_transport() {
|
||||
assert!(retryable_message("NotLeader { leader_id: Some(1) }"));
|
||||
assert!(retryable_message("transport error"));
|
||||
assert!(retryable_message("connection was not ready"));
|
||||
assert!(!retryable_message("permission denied"));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,17 +27,25 @@ pub struct ClusterServiceImpl {
|
|||
rpc_client: Arc<crate::GrpcRaftClient>,
|
||||
/// Cluster ID
|
||||
cluster_id: u64,
|
||||
/// Configured members with client and peer URLs
|
||||
members: Vec<Member>,
|
||||
/// Server version
|
||||
version: String,
|
||||
}
|
||||
|
||||
impl ClusterServiceImpl {
|
||||
/// Create a new cluster service
|
||||
pub fn new(raft: Arc<RaftCore>, rpc_client: Arc<crate::GrpcRaftClient>, cluster_id: u64) -> Self {
|
||||
pub fn new(
|
||||
raft: Arc<RaftCore>,
|
||||
rpc_client: Arc<crate::GrpcRaftClient>,
|
||||
cluster_id: u64,
|
||||
members: Vec<Member>,
|
||||
) -> Self {
|
||||
Self {
|
||||
raft,
|
||||
rpc_client,
|
||||
cluster_id,
|
||||
members,
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
}
|
||||
}
|
||||
|
|
@ -47,16 +55,19 @@ impl ClusterServiceImpl {
|
|||
}
|
||||
|
||||
/// Get current members as proto Member list
|
||||
/// NOTE: Custom RaftCore doesn't track membership dynamically yet
|
||||
/// NOTE: Custom RaftCore doesn't track membership dynamically yet, so this returns
|
||||
/// the configured static membership that the server was booted with.
|
||||
async fn get_member_list(&self) -> Vec<Member> {
|
||||
// For now, return only the current node
|
||||
vec![Member {
|
||||
if self.members.is_empty() {
|
||||
return vec![Member {
|
||||
id: self.raft.node_id(),
|
||||
name: format!("node-{}", self.raft.node_id()),
|
||||
peer_urls: vec![],
|
||||
client_urls: vec![],
|
||||
is_learner: false,
|
||||
}]
|
||||
}];
|
||||
}
|
||||
self.members.clone()
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,6 +42,7 @@ http-body-util = { workspace = true }
|
|||
uuid = { version = "1.11", features = ["v4", "serde"] }
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
serde_json = "1.0"
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
|
||||
# Configuration
|
||||
clap.workspace = true
|
||||
|
|
|
|||
|
|
@ -11,13 +11,14 @@
|
|||
use axum::{
|
||||
extract::{Path, Query, State},
|
||||
http::StatusCode,
|
||||
routing::{delete, get, post, put},
|
||||
routing::{get, post},
|
||||
Json, Router,
|
||||
};
|
||||
use chainfire_api::GrpcRaftClient;
|
||||
use chainfire_raft::RaftCore;
|
||||
use chainfire_raft::{core::RaftError, RaftCore};
|
||||
use chainfire_types::command::RaftCommand;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// REST API state
|
||||
|
|
@ -26,16 +27,18 @@ pub struct RestApiState {
|
|||
pub raft: Arc<RaftCore>,
|
||||
pub cluster_id: u64,
|
||||
pub rpc_client: Option<Arc<GrpcRaftClient>>,
|
||||
pub http_client: reqwest::Client,
|
||||
pub peer_http_addrs: Arc<HashMap<u64, String>>,
|
||||
}
|
||||
|
||||
/// Standard REST error response
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ErrorResponse {
|
||||
pub error: ErrorDetail,
|
||||
pub meta: ResponseMeta,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ErrorDetail {
|
||||
pub code: String,
|
||||
pub message: String,
|
||||
|
|
@ -43,7 +46,7 @@ pub struct ErrorDetail {
|
|||
pub details: Option<serde_json::Value>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ResponseMeta {
|
||||
pub request_id: String,
|
||||
pub timestamp: String,
|
||||
|
|
@ -59,7 +62,7 @@ impl ResponseMeta {
|
|||
}
|
||||
|
||||
/// Standard REST success response
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct SuccessResponse<T> {
|
||||
pub data: T,
|
||||
pub meta: ResponseMeta,
|
||||
|
|
@ -75,25 +78,25 @@ impl<T> SuccessResponse<T> {
|
|||
}
|
||||
|
||||
/// KV Put request body
|
||||
#[derive(Debug, Deserialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct PutRequest {
|
||||
pub value: String,
|
||||
}
|
||||
|
||||
/// KV Get response
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct GetResponse {
|
||||
pub key: String,
|
||||
pub value: String,
|
||||
}
|
||||
|
||||
/// KV List response
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct ListResponse {
|
||||
pub items: Vec<KvItem>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
pub struct KvItem {
|
||||
pub key: String,
|
||||
pub value: String,
|
||||
|
|
@ -129,6 +132,13 @@ pub struct AddMemberRequestLegacy {
|
|||
#[derive(Debug, Deserialize)]
|
||||
pub struct PrefixQuery {
|
||||
pub prefix: Option<String>,
|
||||
pub consistency: Option<String>,
|
||||
}
|
||||
|
||||
/// Query parameters for key reads
|
||||
#[derive(Debug, Default, Deserialize)]
|
||||
pub struct ReadQuery {
|
||||
pub consistency: Option<String>,
|
||||
}
|
||||
|
||||
/// Build the REST API router
|
||||
|
|
@ -153,80 +163,11 @@ async fn health_check() -> (StatusCode, Json<SuccessResponse<serde_json::Value>>
|
|||
)
|
||||
}
|
||||
|
||||
/// GET /api/v1/kv/{key} - Get value
|
||||
async fn get_kv(
|
||||
State(state): State<RestApiState>,
|
||||
Path(key): Path<String>,
|
||||
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
|
||||
let sm = state.raft.state_machine();
|
||||
let key_bytes = key.as_bytes().to_vec();
|
||||
|
||||
let results = sm.kv()
|
||||
.get(&key_bytes)
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
|
||||
let value = results
|
||||
.into_iter()
|
||||
.next()
|
||||
.ok_or_else(|| error_response(StatusCode::NOT_FOUND, "NOT_FOUND", "Key not found"))?;
|
||||
|
||||
Ok(Json(SuccessResponse::new(GetResponse {
|
||||
key,
|
||||
value: String::from_utf8_lossy(&value.value).to_string(),
|
||||
})))
|
||||
}
|
||||
|
||||
/// PUT /api/v1/kv/{key} - Put value
|
||||
async fn put_kv(
|
||||
State(state): State<RestApiState>,
|
||||
Path(key): Path<String>,
|
||||
Json(req): Json<PutRequest>,
|
||||
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
|
||||
let command = RaftCommand::Put {
|
||||
key: key.as_bytes().to_vec(),
|
||||
value: req.value.as_bytes().to_vec(),
|
||||
lease_id: None,
|
||||
prev_kv: false,
|
||||
};
|
||||
|
||||
state
|
||||
.raft
|
||||
.client_write(command)
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
|
||||
))
|
||||
}
|
||||
|
||||
/// DELETE /api/v1/kv/{key} - Delete key
|
||||
async fn delete_kv(
|
||||
State(state): State<RestApiState>,
|
||||
Path(key): Path<String>,
|
||||
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
|
||||
let command = RaftCommand::Delete {
|
||||
key: key.as_bytes().to_vec(),
|
||||
prev_kv: false,
|
||||
};
|
||||
|
||||
state
|
||||
.raft
|
||||
.client_write(command)
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
Json(SuccessResponse::new(serde_json::json!({ "key": key, "success": true }))),
|
||||
))
|
||||
}
|
||||
|
||||
/// GET /api/v1/kv/*key - Get value (wildcard for all keys)
|
||||
async fn get_kv_wildcard(
|
||||
State(state): State<RestApiState>,
|
||||
Path(key): Path<String>,
|
||||
Query(query): Query<ReadQuery>,
|
||||
) -> Result<Json<SuccessResponse<GetResponse>>, (StatusCode, Json<ErrorResponse>)> {
|
||||
// Use key as-is for simple keys, prepend / for namespaced keys
|
||||
// Keys like "testkey" stay as "testkey", keys like "flaredb/stores/1" become "/flaredb/stores/1"
|
||||
|
|
@ -235,6 +176,14 @@ async fn get_kv_wildcard(
|
|||
} else {
|
||||
key.clone()
|
||||
};
|
||||
if should_proxy_read(query.consistency.as_deref(), &state).await {
|
||||
return proxy_read_to_leader(
|
||||
&state,
|
||||
&format!("/api/v1/kv/{}", full_key.trim_start_matches('/')),
|
||||
None,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
let sm = state.raft.state_machine();
|
||||
let key_bytes = full_key.as_bytes().to_vec();
|
||||
|
||||
|
|
@ -272,11 +221,7 @@ async fn put_kv_wildcard(
|
|||
prev_kv: false,
|
||||
};
|
||||
|
||||
state
|
||||
.raft
|
||||
.client_write(command)
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
submit_rest_write(&state, command, Some(&req), &full_key, reqwest::Method::PUT).await?;
|
||||
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
|
|
@ -300,11 +245,7 @@ async fn delete_kv_wildcard(
|
|||
prev_kv: false,
|
||||
};
|
||||
|
||||
state
|
||||
.raft
|
||||
.client_write(command)
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
submit_rest_write(&state, command, None, &full_key, reqwest::Method::DELETE).await?;
|
||||
|
||||
Ok((
|
||||
StatusCode::OK,
|
||||
|
|
@ -317,6 +258,13 @@ async fn list_kv(
|
|||
State(state): State<RestApiState>,
|
||||
Query(params): Query<PrefixQuery>,
|
||||
) -> Result<Json<SuccessResponse<ListResponse>>, (StatusCode, Json<ErrorResponse>)> {
|
||||
if should_proxy_read(params.consistency.as_deref(), &state).await {
|
||||
let query = params
|
||||
.prefix
|
||||
.as_ref()
|
||||
.map(|prefix| vec![("prefix", prefix.as_str())]);
|
||||
return proxy_read_to_leader(&state, "/api/v1/kv", query.as_deref()).await;
|
||||
}
|
||||
let prefix = params.prefix.unwrap_or_default();
|
||||
let sm = state.raft.state_machine();
|
||||
|
||||
|
|
@ -446,3 +394,169 @@ fn error_response(
|
|||
}),
|
||||
)
|
||||
}
|
||||
|
||||
async fn submit_rest_write(
|
||||
state: &RestApiState,
|
||||
command: RaftCommand,
|
||||
body: Option<&PutRequest>,
|
||||
key: &str,
|
||||
method: reqwest::Method,
|
||||
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
|
||||
match state.raft.client_write(command).await {
|
||||
Ok(()) => Ok(()),
|
||||
Err(RaftError::NotLeader { leader_id }) => {
|
||||
let resolved_leader = match leader_id {
|
||||
Some(leader_id) => Some(leader_id),
|
||||
None => state.raft.leader().await,
|
||||
};
|
||||
proxy_write_to_leader(state, resolved_leader, key, method, body).await
|
||||
}
|
||||
Err(err) => Err(error_response(
|
||||
StatusCode::INTERNAL_SERVER_ERROR,
|
||||
"INTERNAL_ERROR",
|
||||
&err.to_string(),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
async fn proxy_write_to_leader(
|
||||
state: &RestApiState,
|
||||
leader_id: Option<u64>,
|
||||
key: &str,
|
||||
method: reqwest::Method,
|
||||
body: Option<&PutRequest>,
|
||||
) -> Result<(), (StatusCode, Json<ErrorResponse>)> {
|
||||
let leader_id = leader_id.ok_or_else(|| {
|
||||
error_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"NOT_LEADER",
|
||||
"current node is not the leader and no leader is known yet",
|
||||
)
|
||||
})?;
|
||||
let leader_http_addr = state.peer_http_addrs.get(&leader_id).ok_or_else(|| {
|
||||
error_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"NOT_LEADER",
|
||||
&format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
|
||||
)
|
||||
})?;
|
||||
let url = format!(
|
||||
"{}/api/v1/kv/{}",
|
||||
leader_http_addr.trim_end_matches('/'),
|
||||
key.trim_start_matches('/')
|
||||
);
|
||||
let mut request = state.http_client.request(method, &url);
|
||||
if let Some(body) = body {
|
||||
request = request.json(body);
|
||||
}
|
||||
let response = request.send().await.map_err(|err| {
|
||||
error_response(
|
||||
StatusCode::BAD_GATEWAY,
|
||||
"LEADER_PROXY_FAILED",
|
||||
&format!("failed to forward write to leader {leader_id}: {err}"),
|
||||
)
|
||||
})?;
|
||||
if response.status().is_success() {
|
||||
return Ok(());
|
||||
}
|
||||
let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
|
||||
let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
|
||||
error: ErrorDetail {
|
||||
code: "LEADER_PROXY_FAILED".to_string(),
|
||||
message: format!("leader {leader_id} returned {status}: {err}"),
|
||||
details: None,
|
||||
},
|
||||
meta: ResponseMeta::new(),
|
||||
});
|
||||
Err((status, Json(payload)))
|
||||
}
|
||||
|
||||
async fn should_proxy_read(consistency: Option<&str>, state: &RestApiState) -> bool {
|
||||
let node_id = state.raft.node_id();
|
||||
let leader_id = state.raft.leader().await;
|
||||
read_requires_leader_proxy(consistency, node_id, leader_id)
|
||||
}
|
||||
|
||||
fn read_requires_leader_proxy(
|
||||
consistency: Option<&str>,
|
||||
node_id: u64,
|
||||
leader_id: Option<u64>,
|
||||
) -> bool {
|
||||
if matches!(consistency, Some(mode) if mode.eq_ignore_ascii_case("local")) {
|
||||
return false;
|
||||
}
|
||||
matches!(leader_id, Some(leader_id) if leader_id != node_id)
|
||||
}
|
||||
|
||||
async fn proxy_read_to_leader<T>(
|
||||
state: &RestApiState,
|
||||
path: &str,
|
||||
query: Option<&[(&str, &str)]>,
|
||||
) -> Result<Json<SuccessResponse<T>>, (StatusCode, Json<ErrorResponse>)>
|
||||
where
|
||||
T: for<'de> Deserialize<'de>,
|
||||
{
|
||||
let leader_id = state.raft.leader().await.ok_or_else(|| {
|
||||
error_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"NOT_LEADER",
|
||||
"current node is not the leader and no leader is known yet",
|
||||
)
|
||||
})?;
|
||||
let leader_http_addr = state.peer_http_addrs.get(&leader_id).ok_or_else(|| {
|
||||
error_response(
|
||||
StatusCode::SERVICE_UNAVAILABLE,
|
||||
"NOT_LEADER",
|
||||
&format!("leader {leader_id} is known but has no HTTP endpoint mapping"),
|
||||
)
|
||||
})?;
|
||||
let url = format!(
|
||||
"{}{}",
|
||||
leader_http_addr.trim_end_matches('/'),
|
||||
path
|
||||
);
|
||||
let mut request = state.http_client.get(&url);
|
||||
if let Some(query) = query {
|
||||
request = request.query(query);
|
||||
}
|
||||
let response = request.send().await.map_err(|err| {
|
||||
error_response(
|
||||
StatusCode::BAD_GATEWAY,
|
||||
"LEADER_PROXY_FAILED",
|
||||
&format!("failed to forward read to leader {leader_id}: {err}"),
|
||||
)
|
||||
})?;
|
||||
if response.status().is_success() {
|
||||
let payload = response.json::<SuccessResponse<T>>().await.map_err(|err| {
|
||||
error_response(
|
||||
StatusCode::BAD_GATEWAY,
|
||||
"LEADER_PROXY_FAILED",
|
||||
&format!("failed to decode leader {leader_id} response: {err}"),
|
||||
)
|
||||
})?;
|
||||
return Ok(Json(payload));
|
||||
}
|
||||
let status = StatusCode::from_u16(response.status().as_u16()).unwrap_or(StatusCode::BAD_GATEWAY);
|
||||
let payload = response.json::<ErrorResponse>().await.unwrap_or_else(|err| ErrorResponse {
|
||||
error: ErrorDetail {
|
||||
code: "LEADER_PROXY_FAILED".to_string(),
|
||||
message: format!("leader {leader_id} returned {status}: {err}"),
|
||||
details: None,
|
||||
},
|
||||
meta: ResponseMeta::new(),
|
||||
});
|
||||
Err((status, Json(payload)))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn read_requires_leader_proxy_defaults_to_leader_consistency() {
|
||||
assert!(read_requires_leader_proxy(None, 2, Some(1)));
|
||||
assert!(!read_requires_leader_proxy(Some("local"), 2, Some(1)));
|
||||
assert!(!read_requires_leader_proxy(None, 2, Some(2)));
|
||||
assert!(!read_requires_leader_proxy(None, 2, None));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,10 +11,11 @@ use crate::rest::{build_router, RestApiState};
|
|||
use anyhow::Result;
|
||||
use chainfire_api::internal_proto::raft_service_server::RaftServiceServer;
|
||||
use chainfire_api::proto::{
|
||||
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer,
|
||||
cluster_server::ClusterServer, kv_server::KvServer, watch_server::WatchServer, Member,
|
||||
};
|
||||
use chainfire_api::{ClusterServiceImpl, KvServiceImpl, RaftServiceImpl, WatchServiceImpl};
|
||||
use chainfire_types::RaftRole;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::signal;
|
||||
use tonic::transport::{Certificate, Identity, Server as TonicServer, ServerTlsConfig};
|
||||
|
|
@ -109,6 +110,7 @@ impl Server {
|
|||
Arc::clone(&raft),
|
||||
rpc_client,
|
||||
self.node.cluster_id(),
|
||||
configured_members(&self.config),
|
||||
);
|
||||
|
||||
// Internal Raft service for inter-node communication
|
||||
|
|
@ -166,10 +168,24 @@ impl Server {
|
|||
|
||||
// HTTP REST API server
|
||||
let http_addr = self.config.network.http_addr;
|
||||
let http_port = self.config.network.http_addr.port();
|
||||
let peer_http_addrs = Arc::new(
|
||||
self.config
|
||||
.cluster
|
||||
.initial_members
|
||||
.iter()
|
||||
.filter_map(|member| {
|
||||
http_endpoint_from_raft_addr(&member.raft_addr, http_port)
|
||||
.map(|http_addr| (member.id, http_addr))
|
||||
})
|
||||
.collect::<HashMap<_, _>>(),
|
||||
);
|
||||
let rest_state = RestApiState {
|
||||
raft: Arc::clone(&raft),
|
||||
cluster_id: self.node.cluster_id(),
|
||||
rpc_client: self.node.rpc_client().cloned(),
|
||||
http_client: reqwest::Client::new(),
|
||||
peer_http_addrs,
|
||||
};
|
||||
let rest_app = build_router(rest_state);
|
||||
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
|
||||
|
|
@ -286,3 +302,45 @@ impl Server {
|
|||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn http_endpoint_from_raft_addr(raft_addr: &str, http_port: u16) -> Option<String> {
|
||||
if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
|
||||
return Some(format!("http://{}:{}", addr.ip(), http_port));
|
||||
}
|
||||
let (host, _) = raft_addr.rsplit_once(':')?;
|
||||
Some(format!("http://{}:{}", host, http_port))
|
||||
}
|
||||
|
||||
fn grpc_endpoint_from_raft_addr(raft_addr: &str, api_port: u16) -> Option<String> {
|
||||
if let Ok(addr) = raft_addr.parse::<std::net::SocketAddr>() {
|
||||
return Some(format!("http://{}:{}", addr.ip(), api_port));
|
||||
}
|
||||
let (host, _) = raft_addr.rsplit_once(':')?;
|
||||
Some(format!("http://{}:{}", host, api_port))
|
||||
}
|
||||
|
||||
fn normalize_peer_url(raft_addr: &str) -> String {
|
||||
if raft_addr.contains("://") {
|
||||
raft_addr.to_string()
|
||||
} else {
|
||||
format!("http://{raft_addr}")
|
||||
}
|
||||
}
|
||||
|
||||
fn configured_members(config: &ServerConfig) -> Vec<Member> {
|
||||
let api_port = config.network.api_addr.port();
|
||||
config
|
||||
.cluster
|
||||
.initial_members
|
||||
.iter()
|
||||
.map(|member| Member {
|
||||
id: member.id,
|
||||
name: format!("node-{}", member.id),
|
||||
peer_urls: vec![normalize_peer_url(&member.raft_addr)],
|
||||
client_urls: grpc_endpoint_from_raft_addr(&member.raft_addr, api_port)
|
||||
.into_iter()
|
||||
.collect(),
|
||||
is_learner: false,
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
|
|
|||
1114
coronafs/Cargo.lock
generated
1114
coronafs/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -24,6 +24,7 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] }
|
|||
anyhow = "1.0"
|
||||
thiserror = "1.0"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] }
|
||||
|
||||
[workspace.lints.rust]
|
||||
unsafe_code = "deny"
|
||||
|
|
|
|||
|
|
@ -21,7 +21,11 @@ tracing-subscriber = { workspace = true }
|
|||
anyhow = { workspace = true }
|
||||
thiserror = { workspace = true }
|
||||
chrono = { workspace = true }
|
||||
reqwest = { workspace = true }
|
||||
futures-util = "0.3"
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
|
|
|||
|
|
@ -2,9 +2,40 @@ use serde::{Deserialize, Serialize};
|
|||
use std::net::SocketAddr;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum ServerMode {
|
||||
Combined,
|
||||
Controller,
|
||||
Node,
|
||||
}
|
||||
|
||||
impl Default for ServerMode {
|
||||
fn default() -> Self {
|
||||
Self::Combined
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum MetadataBackend {
|
||||
Filesystem,
|
||||
Chainfire,
|
||||
}
|
||||
|
||||
impl Default for MetadataBackend {
|
||||
fn default() -> Self {
|
||||
Self::Filesystem
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(default)]
|
||||
pub struct ServerConfig {
|
||||
pub mode: ServerMode,
|
||||
pub metadata_backend: MetadataBackend,
|
||||
pub chainfire_api_url: Option<String>,
|
||||
pub chainfire_key_prefix: String,
|
||||
pub listen_addr: SocketAddr,
|
||||
pub advertise_host: String,
|
||||
pub data_dir: PathBuf,
|
||||
|
|
@ -26,6 +57,10 @@ pub struct ServerConfig {
|
|||
impl Default for ServerConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
mode: ServerMode::Combined,
|
||||
metadata_backend: MetadataBackend::Filesystem,
|
||||
chainfire_api_url: None,
|
||||
chainfire_key_prefix: "/coronafs/volumes".to_string(),
|
||||
listen_addr: "0.0.0.0:50088".parse().expect("valid listen addr"),
|
||||
advertise_host: "127.0.0.1".to_string(),
|
||||
data_dir: PathBuf::from("/var/lib/coronafs"),
|
||||
|
|
@ -34,7 +69,7 @@ impl Default for ServerConfig {
|
|||
export_port_count: 512,
|
||||
export_shared_clients: 32,
|
||||
export_cache_mode: "none".to_string(),
|
||||
export_aio_mode: "io_uring".to_string(),
|
||||
export_aio_mode: "threads".to_string(),
|
||||
export_discard_mode: "unmap".to_string(),
|
||||
export_detect_zeroes_mode: "unmap".to_string(),
|
||||
preallocate: true,
|
||||
|
|
@ -47,6 +82,14 @@ impl Default for ServerConfig {
|
|||
}
|
||||
|
||||
impl ServerConfig {
|
||||
pub fn supports_controller_api(&self) -> bool {
|
||||
matches!(self.mode, ServerMode::Combined | ServerMode::Controller)
|
||||
}
|
||||
|
||||
pub fn supports_node_api(&self) -> bool {
|
||||
matches!(self.mode, ServerMode::Combined | ServerMode::Node)
|
||||
}
|
||||
|
||||
pub fn volume_dir(&self) -> PathBuf {
|
||||
self.data_dir.join("volumes")
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
231
coronafs/scripts/benchmark-local-export.sh
Executable file
231
coronafs/scripts/benchmark-local-export.sh
Executable file
|
|
@ -0,0 +1,231 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
|
||||
|
||||
require_cmd() {
|
||||
command -v "$1" >/dev/null 2>&1 || {
|
||||
echo "missing required command: $1" >&2
|
||||
exit 1
|
||||
}
|
||||
}
|
||||
|
||||
for cmd in curl qemu-io; do
|
||||
require_cmd "${cmd}"
|
||||
done
|
||||
|
||||
if ! command -v jq >/dev/null 2>&1 && ! command -v python3 >/dev/null 2>&1; then
|
||||
echo "missing required command: jq or python3" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
json_get() {
|
||||
local query="$1"
|
||||
if command -v jq >/dev/null 2>&1; then
|
||||
jq -r "${query}"
|
||||
else
|
||||
python3 -c 'import json,sys
|
||||
data=json.load(sys.stdin)
|
||||
value=data
|
||||
for part in sys.argv[1].split("."):
|
||||
if not part:
|
||||
continue
|
||||
value=value.get(part) if isinstance(value, dict) else None
|
||||
if value is None:
|
||||
break
|
||||
print("" if value is None else value)
|
||||
' "${query}"
|
||||
fi
|
||||
}
|
||||
|
||||
RUN_ID="${CORONAFS_BENCH_RUN_ID:-$$}"
|
||||
LISTEN_PORT="${CORONAFS_BENCH_PORT:-$((25088 + (RUN_ID % 1000)))}"
|
||||
EXPORT_BASE_PORT="${CORONAFS_BENCH_EXPORT_BASE_PORT:-$((26100 + (RUN_ID % 1000)))}"
|
||||
VOLUME_ID="${CORONAFS_BENCH_VOLUME_ID:-local-bench-${RUN_ID}}"
|
||||
SIZE_MIB="${CORONAFS_BENCH_SIZE_MIB:-${CORONAFS_BENCH_SIZE_MB:-512}}"
|
||||
SIZE_BYTES="${CORONAFS_BENCH_SIZE_BYTES:-$((SIZE_MIB * 1024 * 1024))}"
|
||||
WORKLOAD_MIB="${CORONAFS_BENCH_WORKLOAD_MIB:-${CORONAFS_BENCH_WORKLOAD_MB:-256}}"
|
||||
EXPORT_CACHE_MODE="${CORONAFS_BENCH_EXPORT_CACHE_MODE:-none}"
|
||||
EXPORT_AIO_MODE="${CORONAFS_BENCH_EXPORT_AIO_MODE:-threads}"
|
||||
EXPORT_DISCARD_MODE="${CORONAFS_BENCH_EXPORT_DISCARD_MODE:-ignore}"
|
||||
EXPORT_DETECT_ZEROES_MODE="${CORONAFS_BENCH_EXPORT_DETECT_ZEROES_MODE:-off}"
|
||||
SERVER_BIN="${CORONAFS_SERVER_BIN:-}"
|
||||
|
||||
if (( WORKLOAD_MIB > SIZE_MIB )); then
|
||||
echo "workload ${WORKLOAD_MIB} MiB exceeds volume size ${SIZE_MIB} MiB" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [[ -z "${SERVER_BIN}" ]]; then
|
||||
SERVER_CMD=(
|
||||
cargo run
|
||||
--manifest-path "${REPO_ROOT}/coronafs/Cargo.toml"
|
||||
-p coronafs-server
|
||||
--
|
||||
)
|
||||
else
|
||||
SERVER_CMD=("${SERVER_BIN}")
|
||||
fi
|
||||
|
||||
TMP_DIR="$(mktemp -d)"
|
||||
CONFIG_PATH="${TMP_DIR}/coronafs.toml"
|
||||
SERVER_LOG="${TMP_DIR}/coronafs.log"
|
||||
SERVER_PID=""
|
||||
|
||||
show_server_log() {
|
||||
if [[ -f "${SERVER_LOG}" ]]; then
|
||||
echo "--- coronafs server log ---" >&2
|
||||
tail -n 200 "${SERVER_LOG}" >&2 || true
|
||||
echo "--- end coronafs server log ---" >&2
|
||||
fi
|
||||
}
|
||||
|
||||
delete_volume_if_present() {
|
||||
curl -fsS -X DELETE "http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}" >/dev/null 2>&1 || true
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
delete_volume_if_present
|
||||
local pid_file="${TMP_DIR}/data/pids/${VOLUME_ID}.pid"
|
||||
if [[ -f "${pid_file}" ]]; then
|
||||
local export_pid=""
|
||||
export_pid="$(tr -d '\n' <"${pid_file}" 2>/dev/null || true)"
|
||||
if [[ -n "${export_pid}" ]] && kill -0 "${export_pid}" 2>/dev/null; then
|
||||
kill "${export_pid}" >/dev/null 2>&1 || true
|
||||
wait "${export_pid}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
rm -f "${pid_file}"
|
||||
fi
|
||||
if [[ -n "${SERVER_PID}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
|
||||
kill "${SERVER_PID}" >/dev/null 2>&1 || true
|
||||
wait "${SERVER_PID}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
rm -rf "${TMP_DIR}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
cat >"${CONFIG_PATH}" <<EOF
|
||||
listen_addr = "127.0.0.1:${LISTEN_PORT}"
|
||||
advertise_host = "127.0.0.1"
|
||||
data_dir = "${TMP_DIR}/data"
|
||||
export_bind_addr = "127.0.0.1"
|
||||
export_base_port = ${EXPORT_BASE_PORT}
|
||||
export_port_count = 8
|
||||
export_shared_clients = 32
|
||||
export_cache_mode = "${EXPORT_CACHE_MODE}"
|
||||
export_aio_mode = "${EXPORT_AIO_MODE}"
|
||||
export_discard_mode = "${EXPORT_DISCARD_MODE}"
|
||||
export_detect_zeroes_mode = "${EXPORT_DETECT_ZEROES_MODE}"
|
||||
preallocate = false
|
||||
sync_on_write = false
|
||||
log_level = "info"
|
||||
EOF
|
||||
|
||||
"${SERVER_CMD[@]}" --config "${CONFIG_PATH}" >"${SERVER_LOG}" 2>&1 &
|
||||
SERVER_PID="$!"
|
||||
|
||||
deadline=$((SECONDS + 60))
|
||||
until curl -fsS "http://127.0.0.1:${LISTEN_PORT}/healthz" >/dev/null 2>&1; do
|
||||
if (( SECONDS >= deadline )); then
|
||||
echo "timed out waiting for coronafs local bench server" >&2
|
||||
tail -n 200 "${SERVER_LOG}" >&2 || true
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
create_response_file="${TMP_DIR}/create-response.txt"
|
||||
create_status="$(
|
||||
curl -sS \
|
||||
-o "${create_response_file}" \
|
||||
-w '%{http_code}' \
|
||||
-X PUT \
|
||||
-H 'content-type: application/json' \
|
||||
-d "{\"size_bytes\":${SIZE_BYTES}}" \
|
||||
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}"
|
||||
)"
|
||||
if [[ "${create_status}" -lt 200 || "${create_status}" -ge 300 ]]; then
|
||||
echo "failed to create CoronaFS benchmark volume: HTTP ${create_status}" >&2
|
||||
cat "${create_response_file}" >&2 || true
|
||||
show_server_log
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export_response_file="${TMP_DIR}/export-response.txt"
|
||||
export_status="$(
|
||||
curl -sS \
|
||||
-o "${export_response_file}" \
|
||||
-w '%{http_code}' \
|
||||
-X POST \
|
||||
"http://127.0.0.1:${LISTEN_PORT}/v1/volumes/${VOLUME_ID}/export"
|
||||
)"
|
||||
if [[ "${export_status}" -lt 200 || "${export_status}" -ge 300 ]]; then
|
||||
echo "failed to export CoronaFS benchmark volume: HTTP ${export_status}" >&2
|
||||
cat "${export_response_file}" >&2 || true
|
||||
show_server_log
|
||||
exit 1
|
||||
fi
|
||||
EXPORT_JSON="$(cat "${export_response_file}")"
|
||||
EXPORT_URI="$(printf '%s' "${EXPORT_JSON}" | json_get '.export.uri')"
|
||||
[[ -n "${EXPORT_URI}" && "${EXPORT_URI}" != "null" ]] || {
|
||||
echo "failed to obtain CoronaFS export URI" >&2
|
||||
printf '%s\n' "${EXPORT_JSON}" >&2
|
||||
show_server_log
|
||||
exit 1
|
||||
}
|
||||
|
||||
run_qemu_io() {
|
||||
local extra=()
|
||||
local start_ns end_ns elapsed_ns
|
||||
local args=("$@")
|
||||
local cmd=()
|
||||
local qemu_cmd=""
|
||||
|
||||
if [[ "${#args[@]}" -eq 0 ]]; then
|
||||
echo "run_qemu_io requires at least one qemu-io command" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
while [[ "${#args[@]}" -gt 0 && "${args[0]}" == --* ]]; do
|
||||
extra+=("${args[0]}")
|
||||
args=("${args[@]:1}")
|
||||
done
|
||||
|
||||
cmd=(qemu-io -f raw "${extra[@]}")
|
||||
for qemu_cmd in "${args[@]}"; do
|
||||
cmd+=(-c "${qemu_cmd}")
|
||||
done
|
||||
cmd+=("${EXPORT_URI}")
|
||||
|
||||
start_ns="$(date +%s%N)"
|
||||
"${cmd[@]}" >/dev/null
|
||||
end_ns="$(date +%s%N)"
|
||||
elapsed_ns="$((end_ns - start_ns))"
|
||||
printf '%s\n' "${elapsed_ns}"
|
||||
}
|
||||
|
||||
calc_mib_per_s() {
|
||||
local bytes="$1"
|
||||
local elapsed_ns="$2"
|
||||
awk -v bytes="${bytes}" -v elapsed_ns="${elapsed_ns}" '
|
||||
BEGIN {
|
||||
if (elapsed_ns <= 0) {
|
||||
print "0.00"
|
||||
} else {
|
||||
printf "%.2f", (bytes / 1048576.0) / (elapsed_ns / 1000000000.0)
|
||||
}
|
||||
}
|
||||
'
|
||||
}
|
||||
|
||||
BYTES="$((WORKLOAD_MIB * 1024 * 1024))"
|
||||
WRITE_NS="$(run_qemu_io "write -P 0x5a 0 ${WORKLOAD_MIB}M" "flush")"
|
||||
READ_NS="$(run_qemu_io "read -P 0x5a 0 ${WORKLOAD_MIB}M")"
|
||||
WRITE_MIBPS="$(calc_mib_per_s "${BYTES}" "${WRITE_NS}")"
|
||||
READ_MIBPS="$(calc_mib_per_s "${BYTES}" "${READ_NS}")"
|
||||
|
||||
printf 'CoronaFS local export bench: uri=%s cache=%s aio=%s write=%s MiB/s read=%s MiB/s size=%s MiB\n' \
|
||||
"${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}" "${WORKLOAD_MIB}"
|
||||
|
||||
printf '%s\t%s\t%s\t%s\t%s\n' "${EXPORT_URI}" "${EXPORT_CACHE_MODE}" "${EXPORT_AIO_MODE}" "${WRITE_MIBPS}" "${READ_MIBPS}"
|
||||
561
creditservice/Cargo.lock
generated
561
creditservice/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
525
deployer/Cargo.lock
generated
525
deployer/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -12,8 +12,11 @@ tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
|
|||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
serde_yaml = "0.9"
|
||||
chrono = { version = "0.4", default-features = false, features = ["clock", "serde"] }
|
||||
|
||||
chainfire-client = { path = "../../../chainfire/chainfire-client" }
|
||||
deployer-types = { path = "../deployer-types" }
|
||||
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "json"] }
|
||||
|
||||
[dev-dependencies]
|
||||
axum = { version = "0.7", features = ["macros"] }
|
||||
|
|
|
|||
|
|
@ -4,7 +4,12 @@ use std::path::Path;
|
|||
|
||||
use anyhow::{Context, Result};
|
||||
use chainfire_client::{Client, ClientError};
|
||||
use deployer_types::{ClusterStateSpec, DesiredSystemSpec, InstallPlan, NodeConfig, NodeSpec};
|
||||
use chrono::Utc;
|
||||
use deployer_types::{
|
||||
ClusterNodeRecord, ClusterStateSpec, CommissionState, DesiredSystemSpec, HostDeploymentSpec,
|
||||
HostDeploymentStatus, InstallPlan, InstallState, NodeConfig, NodeSpec, ObservedSystemState,
|
||||
PowerState,
|
||||
};
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde_json::{json, Value};
|
||||
use tokio::fs;
|
||||
|
|
@ -49,6 +54,56 @@ fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str)
|
|||
.into_bytes()
|
||||
}
|
||||
|
||||
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
|
||||
format!(
|
||||
"{}nodes/{}/observed-system",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
node_id
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
fn key_host_deployment_spec(
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
) -> Vec<u8> {
|
||||
format!(
|
||||
"{}deployments/hosts/{}/spec",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
deployment_name
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
fn key_host_deployment_status(
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
) -> Vec<u8> {
|
||||
format!(
|
||||
"{}deployments/hosts/{}/status",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
deployment_name
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
fn parse_commission_state(value: &str) -> Result<CommissionState> {
|
||||
serde_json::from_str(&format!("\"{value}\""))
|
||||
.with_context(|| format!("invalid commission state {value}"))
|
||||
}
|
||||
|
||||
fn parse_install_state(value: &str) -> Result<InstallState> {
|
||||
serde_json::from_str(&format!("\"{value}\""))
|
||||
.with_context(|| format!("invalid install state {value}"))
|
||||
}
|
||||
|
||||
fn parse_power_state(value: &str) -> Result<PowerState> {
|
||||
serde_json::from_str(&format!("\"{value}\""))
|
||||
.with_context(|| format!("invalid power state {value}"))
|
||||
}
|
||||
|
||||
fn key_node_class(cluster_namespace: &str, cluster_id: &str, node_class: &str) -> Vec<u8> {
|
||||
format!(
|
||||
"{}node-classes/{}",
|
||||
|
|
@ -178,6 +233,9 @@ fn desired_system_from_spec(node: &NodeSpec) -> Option<DesiredSystemSpec> {
|
|||
if desired.rollback_on_failure.is_none() {
|
||||
desired.rollback_on_failure = Some(true);
|
||||
}
|
||||
if desired.drain_before_apply.is_none() {
|
||||
desired.drain_before_apply = Some(false);
|
||||
}
|
||||
if desired.nixos_configuration.is_some() {
|
||||
Some(desired)
|
||||
} else {
|
||||
|
|
@ -322,6 +380,30 @@ async fn merge_existing_node_observed_fields(
|
|||
if merged.state.is_none() {
|
||||
merged.state = existing_node.state;
|
||||
}
|
||||
if merged.machine_id.is_none() {
|
||||
merged.machine_id = existing_node.machine_id;
|
||||
}
|
||||
if merged.hardware_facts.is_none() {
|
||||
merged.hardware_facts = existing_node.hardware_facts;
|
||||
}
|
||||
if merged.commission_state.is_none() {
|
||||
merged.commission_state = existing_node.commission_state;
|
||||
}
|
||||
if merged.install_state.is_none() {
|
||||
merged.install_state = existing_node.install_state;
|
||||
}
|
||||
if merged.commissioned_at.is_none() {
|
||||
merged.commissioned_at = existing_node.commissioned_at;
|
||||
}
|
||||
if merged.last_inventory_hash.is_none() {
|
||||
merged.last_inventory_hash = existing_node.last_inventory_hash;
|
||||
}
|
||||
if merged.power_state.is_none() {
|
||||
merged.power_state = existing_node.power_state;
|
||||
}
|
||||
if merged.bmc_ref.is_none() {
|
||||
merged.bmc_ref = existing_node.bmc_ref;
|
||||
}
|
||||
if merged.last_heartbeat.is_none() {
|
||||
merged.last_heartbeat = existing_node.last_heartbeat;
|
||||
}
|
||||
|
|
@ -521,6 +603,13 @@ pub async fn bootstrap_cluster(
|
|||
info!(enrollment_rule = %rule.name, "upserted enrollment rule");
|
||||
}
|
||||
|
||||
for deployment in &spec.host_deployments {
|
||||
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
|
||||
let value = serde_json::to_vec(deployment)?;
|
||||
client.put(&key, &value).await?;
|
||||
info!(deployment = %deployment.name, "upserted host deployment");
|
||||
}
|
||||
|
||||
// 3. Service / Instance (必要であれば)
|
||||
for svc in &spec.services {
|
||||
let key = key_service(cluster_namespace, cluster_id, &svc.name);
|
||||
|
|
@ -627,6 +716,11 @@ pub async fn apply_cluster_state(
|
|||
let value = serde_json::to_vec(rule)?;
|
||||
client.put(&key, &value).await?;
|
||||
}
|
||||
for deployment in &spec.host_deployments {
|
||||
let key = key_host_deployment_spec(cluster_namespace, cluster_id, &deployment.name);
|
||||
let value = serde_json::to_vec(deployment)?;
|
||||
client.put(&key, &value).await?;
|
||||
}
|
||||
for svc in &spec.services {
|
||||
let key = key_service(cluster_namespace, cluster_id, &svc.name);
|
||||
let value = serde_json::to_vec(svc)?;
|
||||
|
|
@ -706,6 +800,421 @@ pub async fn dump_prefix(endpoint: &str, prefix: &str, json_output: bool) -> Res
|
|||
.await
|
||||
}
|
||||
|
||||
async fn get_json_key<T: DeserializeOwned>(client: &mut Client, key: &[u8]) -> Result<Option<T>> {
|
||||
client
|
||||
.get(key)
|
||||
.await?
|
||||
.map(|bytes| serde_json::from_slice::<T>(&bytes))
|
||||
.transpose()
|
||||
.with_context(|| format!("failed to decode key {}", String::from_utf8_lossy(key)))
|
||||
}
|
||||
|
||||
pub async fn inspect_node(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
node_id: &str,
|
||||
include_desired_system: bool,
|
||||
include_observed_system: bool,
|
||||
json_output: bool,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "inspect node", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let node_id = node_id.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let node = get_json_key::<ClusterNodeRecord>(
|
||||
&mut client,
|
||||
&key_node(&cluster_namespace, &cluster_id, &node_id),
|
||||
)
|
||||
.await?
|
||||
.with_context(|| format!("node {} not found", node_id))?;
|
||||
|
||||
let desired_system = if include_desired_system {
|
||||
get_json_key::<DesiredSystemSpec>(
|
||||
&mut client,
|
||||
&key_desired_system(&cluster_namespace, &cluster_id, &node_id),
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let observed_system = if include_observed_system {
|
||||
get_json_key::<ObservedSystemState>(
|
||||
&mut client,
|
||||
&key_observed_system(&cluster_namespace, &cluster_id, &node_id),
|
||||
)
|
||||
.await?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
if json_output {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&json!({
|
||||
"node": node,
|
||||
"desired_system": desired_system,
|
||||
"observed_system": observed_system,
|
||||
}))?
|
||||
);
|
||||
} else {
|
||||
println!("node_id={}", node.node_id);
|
||||
println!("hostname={}", node.hostname);
|
||||
println!("ip={}", node.ip);
|
||||
println!("state={}", node.state.as_deref().unwrap_or("unknown"));
|
||||
println!(
|
||||
"commission_state={}",
|
||||
node.commission_state
|
||||
.map(|value| serde_json::to_string(&value).unwrap_or_default())
|
||||
.unwrap_or_else(|| "\"unknown\"".to_string())
|
||||
);
|
||||
println!(
|
||||
"install_state={}",
|
||||
node.install_state
|
||||
.map(|value| serde_json::to_string(&value).unwrap_or_default())
|
||||
.unwrap_or_else(|| "\"unknown\"".to_string())
|
||||
);
|
||||
if let Some(observed_system) = observed_system {
|
||||
println!(
|
||||
"observed_status={}",
|
||||
observed_system.status.unwrap_or_else(|| "unknown".to_string())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn set_node_states(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
node_id: &str,
|
||||
state: Option<String>,
|
||||
commission_state: Option<String>,
|
||||
install_state: Option<String>,
|
||||
power_state: Option<String>,
|
||||
bmc_ref: Option<String>,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "set node state", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let node_id = node_id.to_string();
|
||||
let state = state.clone();
|
||||
let commission_state = commission_state.clone();
|
||||
let install_state = install_state.clone();
|
||||
let power_state = power_state.clone();
|
||||
let bmc_ref = bmc_ref.clone();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let key = key_node(&cluster_namespace, &cluster_id, &node_id);
|
||||
let mut node = get_json_key::<ClusterNodeRecord>(&mut client, &key)
|
||||
.await?
|
||||
.with_context(|| format!("node {} not found", node_id))?;
|
||||
|
||||
if let Some(state) = state {
|
||||
node.state = Some(state);
|
||||
}
|
||||
if let Some(commission_state) = commission_state {
|
||||
let parsed = parse_commission_state(&commission_state)?;
|
||||
if matches!(parsed, CommissionState::Commissioned) && node.commissioned_at.is_none()
|
||||
{
|
||||
node.commissioned_at = Some(Utc::now());
|
||||
}
|
||||
node.commission_state = Some(parsed);
|
||||
}
|
||||
if let Some(install_state) = install_state {
|
||||
node.install_state = Some(parse_install_state(&install_state)?);
|
||||
}
|
||||
if let Some(power_state) = power_state {
|
||||
node.power_state = Some(parse_power_state(&power_state)?);
|
||||
}
|
||||
if let Some(bmc_ref) = bmc_ref {
|
||||
node.bmc_ref = Some(bmc_ref);
|
||||
}
|
||||
|
||||
client.put(&key, &serde_json::to_vec(&node)?).await?;
|
||||
println!("{}", serde_json::to_string_pretty(&node)?);
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn set_observed_system(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
node_id: &str,
|
||||
status: Option<String>,
|
||||
nixos_configuration: Option<String>,
|
||||
target_system: Option<String>,
|
||||
current_system: Option<String>,
|
||||
configured_system: Option<String>,
|
||||
booted_system: Option<String>,
|
||||
rollback_system: Option<String>,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "set observed system", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let node_id = node_id.to_string();
|
||||
let status = status.clone();
|
||||
let nixos_configuration = nixos_configuration.clone();
|
||||
let target_system = target_system.clone();
|
||||
let current_system = current_system.clone();
|
||||
let configured_system = configured_system.clone();
|
||||
let booted_system = booted_system.clone();
|
||||
let rollback_system = rollback_system.clone();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let key = key_observed_system(&cluster_namespace, &cluster_id, &node_id);
|
||||
let mut observed = get_json_key::<ObservedSystemState>(&mut client, &key)
|
||||
.await?
|
||||
.unwrap_or_else(|| ObservedSystemState {
|
||||
node_id: node_id.clone(),
|
||||
..ObservedSystemState::default()
|
||||
});
|
||||
|
||||
observed.node_id = node_id.clone();
|
||||
if let Some(status) = status {
|
||||
observed.status = Some(status);
|
||||
}
|
||||
if let Some(nixos_configuration) = nixos_configuration {
|
||||
observed.nixos_configuration = Some(nixos_configuration);
|
||||
}
|
||||
if let Some(target_system) = target_system {
|
||||
observed.target_system = Some(target_system);
|
||||
}
|
||||
if let Some(current_system) = current_system {
|
||||
observed.current_system = Some(current_system);
|
||||
}
|
||||
if let Some(configured_system) = configured_system {
|
||||
observed.configured_system = Some(configured_system);
|
||||
}
|
||||
if let Some(booted_system) = booted_system {
|
||||
observed.booted_system = Some(booted_system);
|
||||
}
|
||||
if let Some(rollback_system) = rollback_system {
|
||||
observed.rollback_system = Some(rollback_system);
|
||||
}
|
||||
|
||||
client.put(&key, &serde_json::to_vec(&observed)?).await?;
|
||||
println!("{}", serde_json::to_string_pretty(&observed)?);
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn inspect_host_deployment(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
json_output: bool,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "inspect host deployment", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let deployment_name = deployment_name.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let spec = get_json_key::<HostDeploymentSpec>(
|
||||
&mut client,
|
||||
&key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name),
|
||||
)
|
||||
.await?
|
||||
.with_context(|| format!("host deployment {} not found", deployment_name))?;
|
||||
let status = get_json_key::<HostDeploymentStatus>(
|
||||
&mut client,
|
||||
&key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
|
||||
)
|
||||
.await?;
|
||||
|
||||
if json_output {
|
||||
println!(
|
||||
"{}",
|
||||
serde_json::to_string_pretty(&json!({
|
||||
"spec": spec,
|
||||
"status": status,
|
||||
}))?
|
||||
);
|
||||
} else {
|
||||
println!("name={}", spec.name);
|
||||
println!(
|
||||
"nixos_configuration={}",
|
||||
spec.nixos_configuration.as_deref().unwrap_or("unknown")
|
||||
);
|
||||
if let Some(status) = status {
|
||||
println!("phase={}", status.phase.as_deref().unwrap_or("unknown"));
|
||||
println!("paused={}", status.paused);
|
||||
println!("selected_nodes={}", status.selected_nodes.join(","));
|
||||
println!("completed_nodes={}", status.completed_nodes.join(","));
|
||||
println!("failed_nodes={}", status.failed_nodes.join(","));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn set_host_deployment_paused(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
paused: bool,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "set host deployment pause state", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let deployment_name = deployment_name.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
if client.get(&spec_key).await?.is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"host deployment {} not found",
|
||||
deployment_name
|
||||
));
|
||||
}
|
||||
|
||||
let status_key =
|
||||
key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
let mut status = get_json_key::<HostDeploymentStatus>(&mut client, &status_key)
|
||||
.await?
|
||||
.unwrap_or_else(|| HostDeploymentStatus {
|
||||
name: deployment_name.clone(),
|
||||
..HostDeploymentStatus::default()
|
||||
});
|
||||
status.name = deployment_name.clone();
|
||||
status.paused_by_operator = paused;
|
||||
status.paused = paused;
|
||||
status.phase = Some(if paused { "paused" } else { "ready" }.to_string());
|
||||
status.message = Some(if paused {
|
||||
"paused by operator".to_string()
|
||||
} else {
|
||||
"resumed by operator".to_string()
|
||||
});
|
||||
status.updated_at = Some(Utc::now());
|
||||
client.put(&status_key, &serde_json::to_vec(&status)?).await?;
|
||||
println!("{}", serde_json::to_string_pretty(&status)?);
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn abort_host_deployment(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
) -> Result<()> {
|
||||
let endpoints = chainfire_endpoints(endpoint);
|
||||
with_chainfire_endpoint_failover(&endpoints, "abort host deployment", |endpoint| {
|
||||
let endpoint = endpoint.to_string();
|
||||
let cluster_namespace = cluster_namespace.to_string();
|
||||
let cluster_id = cluster_id.to_string();
|
||||
let deployment_name = deployment_name.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
if client.get(&spec_key).await?.is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"host deployment {} not found",
|
||||
deployment_name
|
||||
));
|
||||
}
|
||||
|
||||
let node_prefix = format!("{}nodes/", cluster_prefix(&cluster_namespace, &cluster_id));
|
||||
let existing = client.get_prefix(node_prefix.as_bytes()).await?;
|
||||
let mut cleared_nodes = Vec::new();
|
||||
|
||||
for (key, value) in &existing {
|
||||
let key_str = String::from_utf8_lossy(&key);
|
||||
if key_str.ends_with("/desired-system") {
|
||||
let Ok(desired) = serde_json::from_slice::<DesiredSystemSpec>(value) else {
|
||||
continue;
|
||||
};
|
||||
if desired.deployment_id.as_deref() == Some(deployment_name.as_str()) {
|
||||
client.delete(&key).await?;
|
||||
cleared_nodes.push(desired.node_id.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (key, value) in existing {
|
||||
let key_str = String::from_utf8_lossy(&key);
|
||||
if key_str.ends_with("/desired-system") {
|
||||
continue;
|
||||
}
|
||||
|
||||
let node_suffix = key_str
|
||||
.strip_prefix(&node_prefix)
|
||||
.filter(|suffix| !suffix.contains('/'));
|
||||
let Some(node_id) = node_suffix else {
|
||||
continue;
|
||||
};
|
||||
let mut node = match serde_json::from_slice::<ClusterNodeRecord>(&value) {
|
||||
Ok(node) => node,
|
||||
Err(_) => continue,
|
||||
};
|
||||
if cleared_nodes.iter().any(|cleared| cleared == node_id)
|
||||
&& node.state.as_deref() == Some("draining")
|
||||
{
|
||||
node.state = Some("active".to_string());
|
||||
client.put(&key, &serde_json::to_vec(&node)?).await?;
|
||||
}
|
||||
}
|
||||
|
||||
let status = HostDeploymentStatus {
|
||||
name: deployment_name.clone(),
|
||||
phase: Some("aborted".to_string()),
|
||||
paused: true,
|
||||
paused_by_operator: true,
|
||||
selected_nodes: Vec::new(),
|
||||
completed_nodes: Vec::new(),
|
||||
in_progress_nodes: Vec::new(),
|
||||
failed_nodes: Vec::new(),
|
||||
message: Some(format!(
|
||||
"aborted by operator; cleared desired-system from {} node(s)",
|
||||
cleared_nodes.len()
|
||||
)),
|
||||
updated_at: Some(Utc::now()),
|
||||
};
|
||||
client
|
||||
.put(
|
||||
&key_host_deployment_status(&cluster_namespace, &cluster_id, &deployment_name),
|
||||
&serde_json::to_vec(&status)?,
|
||||
)
|
||||
.await?;
|
||||
println!("{}", serde_json::to_string_pretty(&status)?);
|
||||
Ok(())
|
||||
}
|
||||
})
|
||||
.await
|
||||
}
|
||||
|
||||
async fn prune_cluster_state(
|
||||
client: &mut Client,
|
||||
cluster_namespace: &str,
|
||||
|
|
@ -762,6 +1271,16 @@ async fn prune_cluster_state(
|
|||
.to_string(),
|
||||
);
|
||||
}
|
||||
for deployment in &spec.host_deployments {
|
||||
desired_keys.insert(
|
||||
String::from_utf8_lossy(&key_host_deployment_spec(
|
||||
cluster_namespace,
|
||||
cluster_id,
|
||||
&deployment.name,
|
||||
))
|
||||
.to_string(),
|
||||
);
|
||||
}
|
||||
for svc in &spec.services {
|
||||
desired_keys.insert(
|
||||
String::from_utf8_lossy(&key_service(cluster_namespace, cluster_id, &svc.name))
|
||||
|
|
@ -893,11 +1412,18 @@ mod tests {
|
|||
failure_domain: Some("rack-a".to_string()),
|
||||
nix_profile: None,
|
||||
install_plan: None,
|
||||
hardware_facts: None,
|
||||
desired_system: None,
|
||||
state: Some(match NodeState::Pending {
|
||||
NodeState::Pending => "pending".to_string(),
|
||||
_ => unreachable!(),
|
||||
}),
|
||||
commission_state: None,
|
||||
install_state: None,
|
||||
commissioned_at: None,
|
||||
last_inventory_hash: None,
|
||||
power_state: None,
|
||||
bmc_ref: None,
|
||||
last_heartbeat: None,
|
||||
}],
|
||||
node_classes: vec![deployer_types::NodeClassSpec {
|
||||
|
|
@ -922,6 +1448,7 @@ mod tests {
|
|||
labels: HashMap::from([("env".to_string(), "dev".to_string())]),
|
||||
}],
|
||||
enrollment_rules: vec![],
|
||||
host_deployments: vec![],
|
||||
services: vec![],
|
||||
instances: vec![],
|
||||
mtls_policies: vec![],
|
||||
|
|
@ -983,11 +1510,13 @@ mod tests {
|
|||
let mut spec = test_spec();
|
||||
spec.nodes[0].desired_system = Some(DesiredSystemSpec {
|
||||
node_id: String::new(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
flake_ref: Some("github:centra/cloud".to_string()),
|
||||
switch_action: Some("boot".to_string()),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
rollback_on_failure: Some(false),
|
||||
drain_before_apply: Some(false),
|
||||
});
|
||||
|
||||
let resolved = resolve_nodes(&spec).unwrap();
|
||||
|
|
@ -1012,6 +1541,14 @@ mod tests {
|
|||
&format!("{}nodes/node01/observed-system", prefix),
|
||||
&prefix
|
||||
));
|
||||
assert!(is_prunable_key(
|
||||
&format!("{}deployments/hosts/worker-rollout/spec", prefix),
|
||||
&prefix
|
||||
));
|
||||
assert!(!is_prunable_key(
|
||||
&format!("{}deployments/hosts/worker-rollout/status", prefix),
|
||||
&prefix
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1028,6 +1565,7 @@ fn is_prunable_key(key: &str, prefix: &str) -> bool {
|
|||
key.starts_with(&format!("{}node-classes/", prefix))
|
||||
|| key.starts_with(&format!("{}pools/", prefix))
|
||||
|| key.starts_with(&format!("{}enrollment-rules/", prefix))
|
||||
|| key.starts_with(&format!("{}deployments/hosts/", prefix)) && key.ends_with("/spec")
|
||||
|| key.starts_with(&format!("{}services/", prefix))
|
||||
|| key.starts_with(&format!("{}instances/", prefix))
|
||||
|| key.starts_with(&format!("{}mtls/policies/", prefix))
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ use clap::{Parser, Subcommand, ValueEnum};
|
|||
use tracing_subscriber::EnvFilter;
|
||||
|
||||
mod chainfire;
|
||||
mod power;
|
||||
mod remote;
|
||||
|
||||
/// Deployer control CLI for PhotonCloud.
|
||||
|
|
@ -82,6 +83,132 @@ enum Command {
|
|||
#[arg(long, default_value = "status")]
|
||||
action: String,
|
||||
},
|
||||
|
||||
/// ノード単位の inventory / lifecycle 状態を確認・更新する
|
||||
Node {
|
||||
#[command(subcommand)]
|
||||
command: NodeCommand,
|
||||
},
|
||||
|
||||
/// HostDeployment rollout object を確認・操作する
|
||||
Deployment {
|
||||
#[command(subcommand)]
|
||||
command: DeploymentCommand,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum NodeCommand {
|
||||
/// 指定ノードの記録と関連 state を表示する
|
||||
Inspect {
|
||||
#[arg(long)]
|
||||
node_id: String,
|
||||
|
||||
#[arg(long, default_value_t = false)]
|
||||
include_desired_system: bool,
|
||||
|
||||
#[arg(long, default_value_t = false)]
|
||||
include_observed_system: bool,
|
||||
|
||||
#[arg(long, value_enum, default_value_t = DumpFormat::Json)]
|
||||
format: DumpFormat,
|
||||
},
|
||||
|
||||
/// 指定ノードの lifecycle / commissioning 状態を更新する
|
||||
SetState {
|
||||
#[arg(long)]
|
||||
node_id: String,
|
||||
|
||||
#[arg(long, value_enum)]
|
||||
state: Option<NodeLifecycleStateArg>,
|
||||
|
||||
#[arg(long, value_enum)]
|
||||
commission_state: Option<CommissionStateArg>,
|
||||
|
||||
#[arg(long, value_enum)]
|
||||
install_state: Option<InstallStateArg>,
|
||||
|
||||
#[arg(long, value_enum)]
|
||||
power_state: Option<PowerStateArg>,
|
||||
|
||||
#[arg(long)]
|
||||
bmc_ref: Option<String>,
|
||||
},
|
||||
|
||||
/// 指定ノードの observed-system を更新する
|
||||
SetObserved {
|
||||
#[arg(long)]
|
||||
node_id: String,
|
||||
|
||||
#[arg(long)]
|
||||
status: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
nixos_configuration: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
target_system: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
current_system: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
configured_system: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
booted_system: Option<String>,
|
||||
|
||||
#[arg(long)]
|
||||
rollback_system: Option<String>,
|
||||
},
|
||||
|
||||
/// 指定ノードの電源操作を行う
|
||||
Power {
|
||||
#[arg(long)]
|
||||
node_id: String,
|
||||
|
||||
#[arg(long, value_enum)]
|
||||
action: PowerActionArg,
|
||||
},
|
||||
|
||||
/// 指定ノードに再インストールを要求する
|
||||
Reinstall {
|
||||
#[arg(long)]
|
||||
node_id: String,
|
||||
|
||||
#[arg(long, default_value_t = false)]
|
||||
power_cycle: bool,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Subcommand, Debug)]
|
||||
enum DeploymentCommand {
|
||||
/// HostDeployment の spec/status を表示する
|
||||
Inspect {
|
||||
#[arg(long)]
|
||||
name: String,
|
||||
|
||||
#[arg(long, value_enum, default_value_t = DumpFormat::Json)]
|
||||
format: DumpFormat,
|
||||
},
|
||||
|
||||
/// HostDeployment を一時停止する
|
||||
Pause {
|
||||
#[arg(long)]
|
||||
name: String,
|
||||
},
|
||||
|
||||
/// HostDeployment を再開する
|
||||
Resume {
|
||||
#[arg(long)]
|
||||
name: String,
|
||||
},
|
||||
|
||||
/// HostDeployment を中止し、配布済み desired-system を取り消す
|
||||
Abort {
|
||||
#[arg(long)]
|
||||
name: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
|
|
@ -90,6 +217,103 @@ enum DumpFormat {
|
|||
Json,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
enum NodeLifecycleStateArg {
|
||||
Pending,
|
||||
Provisioning,
|
||||
Active,
|
||||
Failed,
|
||||
Draining,
|
||||
}
|
||||
|
||||
impl NodeLifecycleStateArg {
|
||||
fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::Pending => "pending",
|
||||
Self::Provisioning => "provisioning",
|
||||
Self::Active => "active",
|
||||
Self::Failed => "failed",
|
||||
Self::Draining => "draining",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
enum CommissionStateArg {
|
||||
Discovered,
|
||||
Commissioning,
|
||||
Commissioned,
|
||||
}
|
||||
|
||||
impl CommissionStateArg {
|
||||
fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::Discovered => "discovered",
|
||||
Self::Commissioning => "commissioning",
|
||||
Self::Commissioned => "commissioned",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
enum InstallStateArg {
|
||||
Pending,
|
||||
Installing,
|
||||
Installed,
|
||||
Failed,
|
||||
ReinstallRequested,
|
||||
}
|
||||
|
||||
impl InstallStateArg {
|
||||
fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::Pending => "pending",
|
||||
Self::Installing => "installing",
|
||||
Self::Installed => "installed",
|
||||
Self::Failed => "failed",
|
||||
Self::ReinstallRequested => "reinstall_requested",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
enum PowerStateArg {
|
||||
On,
|
||||
Off,
|
||||
Cycling,
|
||||
Unknown,
|
||||
}
|
||||
|
||||
impl PowerStateArg {
|
||||
fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::On => "on",
|
||||
Self::Off => "off",
|
||||
Self::Cycling => "cycling",
|
||||
Self::Unknown => "unknown",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, ValueEnum)]
|
||||
enum PowerActionArg {
|
||||
On,
|
||||
Off,
|
||||
Cycle,
|
||||
Refresh,
|
||||
}
|
||||
|
||||
impl PowerActionArg {
|
||||
fn as_str(self) -> &'static str {
|
||||
match self {
|
||||
Self::On => "on",
|
||||
Self::Off => "off",
|
||||
Self::Cycle => "cycle",
|
||||
Self::Refresh => "refresh",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
let env_filter =
|
||||
|
|
@ -139,6 +363,149 @@ async fn main() -> Result<()> {
|
|||
Command::Deployer { endpoint, action } => {
|
||||
remote::run_deployer_command(&endpoint, &action).await?;
|
||||
}
|
||||
Command::Node { command } => {
|
||||
let cluster_id = cli
|
||||
.cluster_id
|
||||
.as_deref()
|
||||
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for node commands"))?;
|
||||
|
||||
match command {
|
||||
NodeCommand::Inspect {
|
||||
node_id,
|
||||
include_desired_system,
|
||||
include_observed_system,
|
||||
format,
|
||||
} => {
|
||||
chainfire::inspect_node(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&node_id,
|
||||
include_desired_system,
|
||||
include_observed_system,
|
||||
matches!(format, DumpFormat::Json),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
NodeCommand::SetState {
|
||||
node_id,
|
||||
state,
|
||||
commission_state,
|
||||
install_state,
|
||||
power_state,
|
||||
bmc_ref,
|
||||
} => {
|
||||
chainfire::set_node_states(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&node_id,
|
||||
state.map(|value| value.as_str().to_string()),
|
||||
commission_state.map(|value| value.as_str().to_string()),
|
||||
install_state.map(|value| value.as_str().to_string()),
|
||||
power_state.map(|value| value.as_str().to_string()),
|
||||
bmc_ref,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
NodeCommand::SetObserved {
|
||||
node_id,
|
||||
status,
|
||||
nixos_configuration,
|
||||
target_system,
|
||||
current_system,
|
||||
configured_system,
|
||||
booted_system,
|
||||
rollback_system,
|
||||
} => {
|
||||
chainfire::set_observed_system(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&node_id,
|
||||
status,
|
||||
nixos_configuration,
|
||||
target_system,
|
||||
current_system,
|
||||
configured_system,
|
||||
booted_system,
|
||||
rollback_system,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
NodeCommand::Power { node_id, action } => {
|
||||
power::power_node(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&node_id,
|
||||
action.as_str(),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
NodeCommand::Reinstall {
|
||||
node_id,
|
||||
power_cycle,
|
||||
} => {
|
||||
power::request_reinstall(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&node_id,
|
||||
power_cycle,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Command::Deployment { command } => {
|
||||
let cluster_id = cli
|
||||
.cluster_id
|
||||
.as_deref()
|
||||
.ok_or_else(|| anyhow::anyhow!("--cluster-id is required for deployment commands"))?;
|
||||
|
||||
match command {
|
||||
DeploymentCommand::Inspect { name, format } => {
|
||||
chainfire::inspect_host_deployment(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&name,
|
||||
matches!(format, DumpFormat::Json),
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
DeploymentCommand::Pause { name } => {
|
||||
chainfire::set_host_deployment_paused(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&name,
|
||||
true,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
DeploymentCommand::Resume { name } => {
|
||||
chainfire::set_host_deployment_paused(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&name,
|
||||
false,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
DeploymentCommand::Abort { name } => {
|
||||
chainfire::abort_host_deployment(
|
||||
&cli.chainfire_endpoint,
|
||||
&cli.cluster_namespace,
|
||||
cluster_id,
|
||||
&name,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
372
deployer/crates/deployer-ctl/src/power.rs
Normal file
372
deployer/crates/deployer-ctl/src/power.rs
Normal file
|
|
@ -0,0 +1,372 @@
|
|||
use anyhow::{Context, Result};
|
||||
use chainfire_client::Client;
|
||||
use deployer_types::{ClusterNodeRecord, InstallState, PowerState};
|
||||
use reqwest::{Client as HttpClient, Url};
|
||||
use serde::Deserialize;
|
||||
use serde_json::json;
|
||||
|
||||
/// Build the ChainFire key prefix under which all state for one cluster lives:
/// `<namespace>/clusters/<cluster_id>/`.
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    let mut prefix = String::with_capacity(
        cluster_namespace.len() + "/clusters/".len() + cluster_id.len() + 1,
    );
    prefix.push_str(cluster_namespace);
    prefix.push_str("/clusters/");
    prefix.push_str(cluster_id);
    prefix.push('/');
    prefix
}
|
||||
|
||||
/// ChainFire key that stores the `ClusterNodeRecord` for a single node:
/// `<namespace>/clusters/<cluster_id>/nodes/<node_id>`.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!(
        "{}/clusters/{}/nodes/{}",
        cluster_namespace, cluster_id, node_id
    );
    key.into_bytes()
}
|
||||
|
||||
/// ChainFire key holding the node's desired-system spec:
/// `<namespace>/clusters/<cluster_id>/nodes/<node_id>/desired-system`.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!(
        "{}/clusters/{}/nodes/{}/desired-system",
        cluster_namespace, cluster_id, node_id
    );
    key.into_bytes()
}
|
||||
|
||||
/// ChainFire key holding the node's observed-system state:
/// `<namespace>/clusters/<cluster_id>/nodes/<node_id>/observed-system`.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!(
        "{}/clusters/{}/nodes/{}/observed-system",
        cluster_namespace, cluster_id, node_id
    );
    key.into_bytes()
}
|
||||
|
||||
/// Split a comma-separated endpoint list into trimmed, non-empty entries.
fn chainfire_endpoints(raw: &str) -> Vec<String> {
    let mut endpoints = Vec::new();
    for candidate in raw.split(',') {
        let candidate = candidate.trim();
        if !candidate.is_empty() {
            endpoints.push(candidate.to_owned());
        }
    }
    endpoints
}
|
||||
|
||||
/// Power-management operations the CLI can run against a node's BMC.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum PowerAction {
    /// Power the node on (Redfish `ResetType: On`).
    On,
    /// Force the node off immediately (Redfish `ResetType: ForceOff`).
    Off,
    /// Power-cycle the node (Redfish `ResetType: PowerCycle`).
    Cycle,
    /// Send no reset; only re-read the current power state from the BMC.
    Refresh,
}
|
||||
|
||||
impl PowerAction {
|
||||
fn parse(value: &str) -> Result<Self> {
|
||||
match value {
|
||||
"on" => Ok(Self::On),
|
||||
"off" => Ok(Self::Off),
|
||||
"cycle" => Ok(Self::Cycle),
|
||||
"refresh" => Ok(Self::Refresh),
|
||||
other => Err(anyhow::anyhow!("unsupported power action {}", other)),
|
||||
}
|
||||
}
|
||||
|
||||
fn reset_type(self) -> Option<&'static str> {
|
||||
match self {
|
||||
Self::On => Some("On"),
|
||||
Self::Off => Some("ForceOff"),
|
||||
Self::Cycle => Some("PowerCycle"),
|
||||
Self::Refresh => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Connection details for one Redfish ComputerSystem resource, parsed from a
/// node's `bmc_ref` string by `RedfishTarget::parse`.
#[derive(Debug)]
struct RedfishTarget {
    /// Fully-qualified URL of the ComputerSystem resource, with credentials
    /// and query parameters stripped (safe to log).
    resource_url: Url,
    /// HTTP basic-auth username extracted from the reference URL, if any.
    username: Option<String>,
    /// HTTP basic-auth password extracted from the reference URL, if any.
    password: Option<String>,
    /// When true, TLS certificate validation is disabled; set via
    /// `?insecure=1` / `?insecure=true` on the reference.
    insecure: bool,
}
|
||||
|
||||
/// Minimal projection of a Redfish ComputerSystem response; only the
/// `PowerState` property is read, everything else is ignored.
#[derive(Debug, Deserialize)]
struct RedfishSystemView {
    // `PowerState` may be absent in the response; a missing value is mapped
    // downstream by `map_redfish_power_state`.
    #[serde(rename = "PowerState")]
    power_state: Option<String>,
}
|
||||
|
||||
impl RedfishTarget {
    /// Parse a `redfish://`, `redfish+http://`, or `redfish+https://` BMC
    /// reference into a target. A bare `redfish://` scheme defaults to HTTPS.
    /// Credentials and the `insecure` query flag are extracted and removed
    /// from the stored URL; the path is normalized to a ComputerSystem path.
    fn parse(reference: &str) -> Result<Self> {
        // Rewrite the custom scheme into plain http(s) so `Url::parse` works.
        let rewritten = if let Some(rest) = reference.strip_prefix("redfish+http://") {
            format!("http://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish+https://") {
            format!("https://{rest}")
        } else if let Some(rest) = reference.strip_prefix("redfish://") {
            format!("https://{rest}")
        } else {
            return Err(anyhow::anyhow!(
                "unsupported BMC reference {}; expected redfish:// or redfish+http(s)://",
                reference
            ));
        };

        let mut resource_url = Url::parse(&rewritten)
            .with_context(|| format!("failed to parse BMC reference {}", reference))?;
        // `?insecure=1` or `?insecure=true` disables TLS verification.
        let insecure = resource_url
            .query_pairs()
            .any(|(key, value)| key == "insecure" && (value == "1" || value == "true"));
        let username = if resource_url.username().is_empty() {
            None
        } else {
            Some(resource_url.username().to_string())
        };
        let password = resource_url.password().map(ToOwned::to_owned);
        let system_path = normalize_redfish_system_path(resource_url.path());
        // Strip credentials and query from the stored URL; auth is re-applied
        // per request via `with_auth`.
        resource_url
            .set_username("")
            .map_err(|_| anyhow::anyhow!("failed to clear username from BMC reference"))?;
        resource_url
            .set_password(None)
            .map_err(|_| anyhow::anyhow!("failed to clear password from BMC reference"))?;
        resource_url.set_query(None);
        resource_url.set_path(&system_path);

        Ok(Self {
            resource_url,
            username,
            password,
            insecure,
        })
    }

    /// URL of the Redfish `ComputerSystem.Reset` action for this system
    /// (`<system path>/Actions/ComputerSystem.Reset`).
    fn action_url(&self) -> Result<Url> {
        let mut action_url = self.resource_url.clone();
        let path = format!(
            "{}/Actions/ComputerSystem.Reset",
            self.resource_url.path().trim_end_matches('/')
        );
        action_url.set_path(&path);
        Ok(action_url)
    }

    /// Execute `action` against the BMC and return the resulting power state.
    ///
    /// For `Cycle`, `PowerState::Cycling` is returned without re-querying the
    /// BMC; for all other actions the system resource is re-read afterwards.
    /// NOTE(review): the post-reset refresh for On/Off reads the state
    /// immediately, so a slow BMC may still report the pre-reset state —
    /// confirm whether callers tolerate that.
    async fn perform(&self, action: PowerAction) -> Result<PowerState> {
        // Build a fresh client per call so the per-target `insecure` flag
        // stays scoped to this BMC.
        let client = HttpClient::builder()
            .danger_accept_invalid_certs(self.insecure)
            .build()
            .context("failed to create Redfish client")?;

        if let Some(reset_type) = action.reset_type() {
            let request = self
                .with_auth(client.post(self.action_url()?))
                .json(&json!({ "ResetType": reset_type }));
            request
                .send()
                .await
                .context("failed to send Redfish reset request")?
                .error_for_status()
                .context("Redfish reset request failed")?;
        }

        match action {
            PowerAction::Cycle => Ok(PowerState::Cycling),
            PowerAction::On | PowerAction::Off | PowerAction::Refresh => self.refresh(&client).await,
        }
    }

    /// GET the ComputerSystem resource and map its `PowerState` property
    /// into the internal `PowerState` enum.
    async fn refresh(&self, client: &HttpClient) -> Result<PowerState> {
        let response = self
            .with_auth(client.get(self.resource_url.clone()))
            .send()
            .await
            .context("failed to query Redfish system resource")?
            .error_for_status()
            .context("Redfish system query failed")?;
        let system: RedfishSystemView = response
            .json()
            .await
            .context("failed to decode Redfish system response")?;
        map_redfish_power_state(system.power_state.as_deref())
    }

    /// Attach HTTP basic auth when the BMC reference carried credentials;
    /// otherwise return the request builder unchanged.
    fn with_auth(&self, request: reqwest::RequestBuilder) -> reqwest::RequestBuilder {
        match self.username.as_deref() {
            Some(username) => request.basic_auth(username, self.password.clone()),
            None => request,
        }
    }
}
|
||||
|
||||
/// Normalize a parsed BMC reference path into a Redfish ComputerSystem path.
///
/// An empty path falls back to `System.Embedded.1` (a Dell-iDRAC-style system
/// id); a path already under `/redfish/` is kept verbatim; anything else is
/// treated as a system id under `/redfish/v1/Systems/`.
fn normalize_redfish_system_path(path: &str) -> String {
    let candidate = path.trim();
    match candidate {
        "" | "/" => "/redfish/v1/Systems/System.Embedded.1".to_string(),
        full if full.starts_with("/redfish/") => full.to_string(),
        short => format!("/redfish/v1/Systems/{}", short.trim_start_matches('/')),
    }
}
|
||||
|
||||
fn map_redfish_power_state(value: Option<&str>) -> Result<PowerState> {
|
||||
match value.unwrap_or("Unknown").to_ascii_lowercase().as_str() {
|
||||
"on" => Ok(PowerState::On),
|
||||
"off" => Ok(PowerState::Off),
|
||||
"poweringon" | "poweringoff" | "cycling" => Ok(PowerState::Cycling),
|
||||
"unknown" => Ok(PowerState::Unknown),
|
||||
other => Err(anyhow::anyhow!("unsupported Redfish power state {}", other)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Connect to the first reachable ChainFire endpoint and load the node record.
///
/// Returns the connected client, the decoded record, and the record's key so
/// callers can write a mutated record back over the same connection.
///
/// NOTE(review): failover only covers *connection* errors — once an endpoint
/// accepts the connection, a missing node or a get/decode error is returned
/// immediately instead of trying the remaining endpoints; confirm this is
/// intended.
async fn load_node_record(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
) -> Result<(Client, ClusterNodeRecord, Vec<u8>)> {
    // `endpoint` may be a comma-separated list; try each entry in order.
    let endpoints = chainfire_endpoints(endpoint);
    let mut last_error = None;

    for endpoint in endpoints {
        match Client::connect(endpoint.clone()).await {
            Ok(mut client) => {
                let key = key_node(cluster_namespace, cluster_id, node_id);
                let Some(bytes) = client.get(&key).await? else {
                    return Err(anyhow::anyhow!("node {} not found", node_id));
                };
                let node = serde_json::from_slice::<ClusterNodeRecord>(&bytes)
                    .context("failed to decode node record")?;
                return Ok((client, node, key));
            }
            // Remember the most recent connection failure for the final error.
            Err(error) => last_error = Some(anyhow::Error::new(error)),
        }
    }

    Err(last_error.unwrap_or_else(|| anyhow::anyhow!("no Chainfire endpoints configured")))
}
|
||||
|
||||
pub async fn power_node(
|
||||
endpoint: &str,
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
node_id: &str,
|
||||
action: &str,
|
||||
) -> Result<()> {
|
||||
let action = PowerAction::parse(action)?;
|
||||
let (mut client, mut node, key) =
|
||||
load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;
|
||||
let bmc_ref = node
|
||||
.bmc_ref
|
||||
.clone()
|
||||
.with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
|
||||
let target = RedfishTarget::parse(&bmc_ref)?;
|
||||
let power_state = target.perform(action).await?;
|
||||
|
||||
node.power_state = Some(power_state);
|
||||
client.put(&key, &serde_json::to_vec(&node)?).await?;
|
||||
println!("{}", serde_json::to_string_pretty(&node)?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// CLI entry point: mark `node_id` for reinstall and clear its system state.
///
/// Sets the node back to `"provisioning"` with
/// `InstallState::ReinstallRequested`, optionally power-cycles it via its BMC,
/// then persists the record and deletes the node's desired-system and
/// observed-system keys, and prints the updated record as pretty JSON.
pub async fn request_reinstall(
    endpoint: &str,
    cluster_namespace: &str,
    cluster_id: &str,
    node_id: &str,
    power_cycle: bool,
) -> Result<()> {
    let (mut client, mut node, key) =
        load_node_record(endpoint, cluster_namespace, cluster_id, node_id).await?;

    node.state = Some("provisioning".to_string());
    node.install_state = Some(InstallState::ReinstallRequested);

    if power_cycle {
        // Power cycling requires a BMC reference; fail before persisting
        // anything if it is missing.
        let bmc_ref = node
            .bmc_ref
            .clone()
            .with_context(|| format!("node {} does not have a bmc_ref", node_id))?;
        let target = RedfishTarget::parse(&bmc_ref)?;
        node.power_state = Some(target.perform(PowerAction::Cycle).await?);
    }

    // Persist the updated record first, then drop the per-node system keys.
    // NOTE(review): these three writes are not transactional — a failure
    // between them leaves partially-applied state; confirm that is acceptable
    // for the reconciliation loop.
    client.put(&key, &serde_json::to_vec(&node)?).await?;
    client
        .delete(&key_desired_system(cluster_namespace, cluster_id, node_id))
        .await?;
    client
        .delete(&key_observed_system(cluster_namespace, cluster_id, node_id))
        .await?;
    println!("{}", serde_json::to_string_pretty(&node)?);
    Ok(())
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use axum::{extract::State, http::StatusCode, routing::{get, post}, Json, Router};
    use serde_json::Value;
    use std::sync::{Arc, Mutex};
    use tokio::net::TcpListener;

    /// Bare `redfish://` references default to HTTPS and the short path is
    /// normalized under `/redfish/v1/Systems/`.
    #[test]
    fn parse_redfish_short_reference_defaults_to_https() {
        let parsed = RedfishTarget::parse("redfish://lab-bmc/node01").unwrap();
        assert_eq!(parsed.resource_url.as_str(), "https://lab-bmc/redfish/v1/Systems/node01");
    }

    /// Credentials and the `insecure` query flag are extracted into the
    /// target's fields and stripped from the stored resource URL.
    #[test]
    fn parse_redfish_explicit_http_reference_keeps_query_flags_local() {
        let parsed =
            RedfishTarget::parse("redfish+http://user:pass@127.0.0.1/system-1?insecure=1").unwrap();
        assert_eq!(
            parsed.resource_url.as_str(),
            "http://127.0.0.1/redfish/v1/Systems/system-1"
        );
        assert_eq!(parsed.username.as_deref(), Some("user"));
        assert_eq!(parsed.password.as_deref(), Some("pass"));
        assert!(parsed.insecure);
    }

    /// End-to-end against a local axum stub: `Refresh` reads the power state
    /// and `Off` posts a `ForceOff` reset, then re-reads the (stubbed) state.
    #[tokio::test]
    async fn redfish_adapter_refreshes_and_resets_power() {
        #[derive(Clone, Default)]
        struct TestState {
            // JSON bodies POSTed to the reset action, in order of arrival.
            seen_payloads: Arc<Mutex<Vec<String>>>,
        }

        // Stub system resource that always reports "On".
        async fn system_handler() -> Json<Value> {
            Json(json!({ "PowerState": "On" }))
        }

        // Records the reset payload and acknowledges with 204 No Content.
        async fn reset_handler(
            State(state): State<TestState>,
            Json(payload): Json<Value>,
        ) -> StatusCode {
            state
                .seen_payloads
                .lock()
                .unwrap()
                .push(payload.to_string());
            StatusCode::NO_CONTENT
        }

        let state = TestState::default();
        let app = Router::new()
            .route("/redfish/v1/Systems/node01", get(system_handler))
            .route(
                "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset",
                post(reset_handler),
            )
            .with_state(state.clone());
        // Bind an ephemeral port so the test never collides with other runs.
        let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
        let addr = listener.local_addr().unwrap();
        let server = tokio::spawn(async move {
            axum::serve(listener, app).await.unwrap();
        });

        let target = RedfishTarget::parse(&format!(
            "redfish+http://{}/redfish/v1/Systems/node01",
            addr
        ))
        .unwrap();
        assert_eq!(target.perform(PowerAction::Refresh).await.unwrap(), PowerState::On);
        // The stub always reports "On", so even after ForceOff the refreshed
        // state comes back as On.
        assert_eq!(target.perform(PowerAction::Off).await.unwrap(), PowerState::On);

        // Refresh sends no reset; only the ForceOff payload should be seen.
        let payloads = state.seen_payloads.lock().unwrap().clone();
        assert_eq!(payloads, vec![r#"{"ResetType":"ForceOff"}"#.to_string()]);

        server.abort();
    }
}
|
||||
|
|
@ -29,6 +29,7 @@ tracing-subscriber = { workspace = true }
|
|||
chrono = { workspace = true }
|
||||
rcgen = { workspace = true }
|
||||
clap = { workspace = true }
|
||||
sha2 = "0.10"
|
||||
|
||||
# ChainFire for state management
|
||||
chainfire-client = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -1,9 +1,11 @@
|
|||
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
|
||||
use chrono::Utc;
|
||||
use deployer_types::{
|
||||
EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo,
|
||||
NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse,
|
||||
CommissionState, EnrollmentRuleSpec, HardwareFacts, InstallPlan, InstallState,
|
||||
NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState, PhoneHomeRequest,
|
||||
PhoneHomeResponse, PowerState,
|
||||
};
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
|
|
@ -49,6 +51,14 @@ fn merge_hardware_summary_metadata(
|
|||
}
|
||||
}
|
||||
|
||||
fn inventory_hash(hardware_facts: Option<&HardwareFacts>) -> Option<String> {
|
||||
let hardware_facts = hardware_facts?;
|
||||
let payload = serde_json::to_vec(hardware_facts).ok()?;
|
||||
let mut hasher = Sha256::new();
|
||||
hasher.update(payload);
|
||||
Some(format!("{:x}", hasher.finalize()))
|
||||
}
|
||||
|
||||
/// POST /api/v1/phone-home
|
||||
///
|
||||
/// Handles node registration during first boot.
|
||||
|
|
@ -794,6 +804,21 @@ async fn store_cluster_node_if_configured(
|
|||
install_plan: node_config.install_plan.clone(),
|
||||
hardware_facts: hardware_facts.cloned(),
|
||||
state: Some(format!("{:?}", node_info.state).to_lowercase()),
|
||||
commission_state: hardware_facts.map(|_| CommissionState::Discovered),
|
||||
install_state: node_config.install_plan.as_ref().map(|_| InstallState::Pending),
|
||||
commissioned_at: None,
|
||||
last_inventory_hash: inventory_hash(hardware_facts),
|
||||
power_state: node_info
|
||||
.metadata
|
||||
.get("power_state")
|
||||
.and_then(|value| match value.as_str() {
|
||||
"on" => Some(PowerState::On),
|
||||
"off" => Some(PowerState::Off),
|
||||
"cycling" => Some(PowerState::Cycling),
|
||||
"unknown" => Some(PowerState::Unknown),
|
||||
_ => None,
|
||||
}),
|
||||
bmc_ref: node_info.metadata.get("bmc_ref").cloned(),
|
||||
last_heartbeat: Some(node_info.last_heartbeat),
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,62 @@ impl Default for NodeState {
|
|||
}
|
||||
}
|
||||
|
||||
/// Commissioning lifecycle for inventory-driven bare-metal onboarding.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommissionState {
    /// Node has been discovered and reported inventory but not yet approved.
    Discovered,
    /// Manual or automated commissioning is actively validating the node.
    Commissioning,
    /// Inventory has been accepted and the node can be installed or rolled out.
    Commissioned,
}

impl Default for CommissionState {
    /// Newly-seen nodes start as `Discovered` until explicitly commissioned.
    fn default() -> Self {
        CommissionState::Discovered
    }
}
|
||||
|
||||
/// Installation lifecycle for host provisioning and reprovisioning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum InstallState {
    /// No install is currently running, but an install may be planned.
    Pending,
    /// Bootstrap or reinstall is actively writing the target system.
    Installing,
    /// The desired system has been installed successfully.
    Installed,
    /// Installation failed and needs operator or controller intervention.
    Failed,
    /// A reinstall has been requested but not started yet.
    ReinstallRequested,
}

impl Default for InstallState {
    /// Absent install history is treated as `Pending`.
    fn default() -> Self {
        InstallState::Pending
    }
}
|
||||
|
||||
/// Best-effort power state tracked by external management adapters.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PowerState {
    /// Node is powered on.
    On,
    /// Node is powered off.
    Off,
    /// Node is mid-transition (adapters map Redfish `PoweringOn`,
    /// `PoweringOff`, and `Cycling` here).
    Cycling,
    /// Power state has not been observed or could not be determined.
    Unknown,
}

impl Default for PowerState {
    /// Until an adapter reports in, the power state is `Unknown`.
    fn default() -> Self {
        PowerState::Unknown
    }
}
|
||||
|
||||
/// Node information tracked by Deployer
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NodeInfo {
|
||||
|
|
@ -492,6 +548,18 @@ pub struct ClusterNodeRecord {
|
|||
pub hardware_facts: Option<HardwareFacts>,
|
||||
#[serde(default)]
|
||||
pub state: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub commission_state: Option<CommissionState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub install_state: Option<InstallState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub commissioned_at: Option<DateTime<Utc>>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub last_inventory_hash: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub power_state: Option<PowerState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub bmc_ref: Option<String>,
|
||||
#[serde(default)]
|
||||
pub last_heartbeat: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
|
@ -534,6 +602,8 @@ pub struct DesiredSystemSpec {
|
|||
#[serde(default)]
|
||||
pub node_id: String,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub deployment_id: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub nixos_configuration: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub flake_ref: Option<String>,
|
||||
|
|
@ -543,6 +613,8 @@ pub struct DesiredSystemSpec {
|
|||
pub health_check_command: Vec<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub rollback_on_failure: Option<bool>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub drain_before_apply: Option<bool>,
|
||||
}
|
||||
|
||||
/// Cluster metadata (PhotonCloud scope).
|
||||
|
|
@ -576,9 +648,23 @@ pub struct NodeSpec {
|
|||
#[serde(default)]
|
||||
pub install_plan: Option<InstallPlan>,
|
||||
#[serde(default)]
|
||||
pub hardware_facts: Option<HardwareFacts>,
|
||||
#[serde(default)]
|
||||
pub desired_system: Option<DesiredSystemSpec>,
|
||||
#[serde(default)]
|
||||
pub state: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub commission_state: Option<CommissionState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub install_state: Option<InstallState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub commissioned_at: Option<DateTime<Utc>>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub last_inventory_hash: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub power_state: Option<PowerState>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub bmc_ref: Option<String>,
|
||||
#[serde(default)]
|
||||
pub last_heartbeat: Option<DateTime<Utc>>,
|
||||
}
|
||||
|
|
@ -647,6 +733,74 @@ pub struct EnrollmentRuleSpec {
|
|||
pub node_id_prefix: Option<String>,
|
||||
}
|
||||
|
||||
/// Selector used by host deployments to target bare-metal nodes declaratively.
///
/// NOTE(review): the combination semantics (presumably AND across criteria,
/// with empty lists placing no constraint) are inferred from the field names —
/// confirm against the controller's selection logic.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentSelector {
    /// Explicit node ids to target.
    #[serde(default)]
    pub node_ids: Vec<String>,
    /// Node roles to match (e.g. "worker").
    #[serde(default)]
    pub roles: Vec<String>,
    /// Node pools to match.
    #[serde(default)]
    pub pools: Vec<String>,
    /// Node classes to match.
    #[serde(default)]
    pub node_classes: Vec<String>,
    /// Label key/value pairs the node must carry.
    #[serde(default)]
    pub match_labels: HashMap<String, String>,
}
|
||||
|
||||
/// Declarative rollout intent for host-level NixOS updates.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct HostDeploymentSpec {
    /// Deployment name (used as the identifier in CLI and status records).
    pub name: String,
    /// Which nodes this rollout targets; defaults to an empty selector.
    #[serde(default)]
    pub selector: HostDeploymentSelector,
    /// NixOS configuration name to deploy.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub nixos_configuration: Option<String>,
    /// Flake reference (e.g. a source path) the configuration is built from.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub flake_ref: Option<String>,
    /// Rollout batch size — presumably nodes updated per wave; confirm in
    /// the controller.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub batch_size: Option<u32>,
    /// Maximum nodes allowed to be unavailable during the rollout —
    /// presumably; confirm in the controller.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub max_unavailable: Option<u32>,
    /// Command run to verify health after applying the update.
    #[serde(default)]
    pub health_check_command: Vec<String>,
    /// Switch action, e.g. "switch" or "boot".
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub switch_action: Option<String>,
    /// Roll back if the post-apply health check fails.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub rollback_on_failure: Option<bool>,
    /// Drain the node before applying the update.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub drain_before_apply: Option<bool>,
    /// Reboot policy, e.g. "always".
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub reboot_policy: Option<String>,
    /// When true, the rollout is paused (also settable via the CLI
    /// pause/resume commands).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub paused: Option<bool>,
}
|
||||
|
||||
/// Controller-observed rollout state for a host deployment.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
pub struct HostDeploymentStatus {
    /// Name of the deployment this status belongs to.
    #[serde(default)]
    pub name: String,
    /// Current rollout phase (free-form string).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub phase: Option<String>,
    /// Whether the rollout is currently paused.
    #[serde(default)]
    pub paused: bool,
    /// True when the pause was requested by an operator — presumably as
    /// opposed to a controller-initiated pause; confirm in the controller.
    #[serde(default)]
    pub paused_by_operator: bool,
    /// Nodes matched by the deployment's selector.
    #[serde(default)]
    pub selected_nodes: Vec<String>,
    /// Nodes that finished the rollout.
    #[serde(default)]
    pub completed_nodes: Vec<String>,
    /// Nodes currently being updated.
    #[serde(default)]
    pub in_progress_nodes: Vec<String>,
    /// Nodes whose update failed.
    #[serde(default)]
    pub failed_nodes: Vec<String>,
    /// Human-readable status message, if any.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
    /// Timestamp of the last status update.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub updated_at: Option<DateTime<Utc>>,
}
|
||||
|
||||
/// Service ports for logical service definitions.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct ServicePorts {
|
||||
|
|
@ -807,6 +961,8 @@ pub struct ClusterStateSpec {
|
|||
#[serde(default)]
|
||||
pub enrollment_rules: Vec<EnrollmentRuleSpec>,
|
||||
#[serde(default)]
|
||||
pub host_deployments: Vec<HostDeploymentSpec>,
|
||||
#[serde(default)]
|
||||
pub services: Vec<ServiceSpec>,
|
||||
#[serde(default)]
|
||||
pub instances: Vec<ServiceInstanceSpec>,
|
||||
|
|
@ -1080,19 +1236,92 @@ mod tests {
|
|||
fn test_desired_system_spec_roundtrip() {
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: Some("worker-rollout".to_string()),
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
switch_action: Some("switch".to_string()),
|
||||
health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()],
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(true),
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&desired).unwrap();
|
||||
let decoded: DesiredSystemSpec = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(decoded.node_id, "node01");
|
||||
assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout"));
|
||||
assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01"));
|
||||
assert_eq!(decoded.health_check_command.len(), 2);
|
||||
assert_eq!(decoded.rollback_on_failure, Some(true));
|
||||
assert_eq!(decoded.drain_before_apply, Some(true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_host_deployment_roundtrip() {
|
||||
let spec = HostDeploymentSpec {
|
||||
name: "worker-rollout".to_string(),
|
||||
selector: HostDeploymentSelector {
|
||||
node_ids: vec![],
|
||||
roles: vec!["worker".to_string()],
|
||||
pools: vec!["general".to_string()],
|
||||
node_classes: vec!["worker-linux".to_string()],
|
||||
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
|
||||
},
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
batch_size: Some(1),
|
||||
max_unavailable: Some(1),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
switch_action: Some("boot".to_string()),
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(true),
|
||||
reboot_policy: Some("always".to_string()),
|
||||
paused: Some(false),
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&spec).unwrap();
|
||||
let decoded: HostDeploymentSpec = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(decoded.name, "worker-rollout");
|
||||
assert_eq!(decoded.batch_size, Some(1));
|
||||
assert_eq!(decoded.max_unavailable, Some(1));
|
||||
assert_eq!(decoded.selector.roles, vec!["worker".to_string()]);
|
||||
assert_eq!(
|
||||
decoded.selector.match_labels.get("tier").map(String::as_str),
|
||||
Some("general")
|
||||
);
|
||||
assert_eq!(decoded.drain_before_apply, Some(true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cluster_node_record_commissioning_roundtrip() {
|
||||
let node = ClusterNodeRecord {
|
||||
node_id: "node01".to_string(),
|
||||
machine_id: Some("machine-01".to_string()),
|
||||
ip: "10.0.0.11".to_string(),
|
||||
hostname: "node01".to_string(),
|
||||
roles: vec!["worker".to_string()],
|
||||
labels: HashMap::new(),
|
||||
pool: Some("general".to_string()),
|
||||
node_class: Some("worker-linux".to_string()),
|
||||
failure_domain: Some("rack-a".to_string()),
|
||||
nix_profile: Some("profiles/worker-linux".to_string()),
|
||||
install_plan: None,
|
||||
hardware_facts: None,
|
||||
state: Some("provisioning".to_string()),
|
||||
commission_state: Some(CommissionState::Commissioned),
|
||||
install_state: Some(InstallState::Installed),
|
||||
commissioned_at: Some(Utc::now()),
|
||||
last_inventory_hash: Some("abc123".to_string()),
|
||||
power_state: Some(PowerState::On),
|
||||
bmc_ref: Some("redfish://lab-rack-a/node01".to_string()),
|
||||
last_heartbeat: Some(Utc::now()),
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&node).unwrap();
|
||||
let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(decoded.commission_state, Some(CommissionState::Commissioned));
|
||||
assert_eq!(decoded.install_state, Some(InstallState::Installed));
|
||||
assert_eq!(decoded.power_state, Some(PowerState::On));
|
||||
assert_eq!(decoded.bmc_ref.as_deref(), Some("redfish://lab-rack-a/node01"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -899,6 +899,12 @@ mod tests {
|
|||
install_plan: None,
|
||||
hardware_facts: None,
|
||||
state: Some("active".to_string()),
|
||||
commission_state: None,
|
||||
install_state: None,
|
||||
commissioned_at: None,
|
||||
last_inventory_hash: None,
|
||||
power_state: None,
|
||||
bmc_ref: None,
|
||||
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ use std::fs;
|
|||
use std::path::Path;
|
||||
use std::process::Stdio;
|
||||
use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
|
||||
use anyhow::{anyhow, Context, Result};
|
||||
use chainfire_client::Client;
|
||||
|
|
@ -135,7 +136,15 @@ impl Agent {
|
|||
}
|
||||
|
||||
async fn tick(&self) -> Result<()> {
|
||||
info!(
|
||||
endpoint = %self.endpoint,
|
||||
cluster_namespace = %self.cluster_namespace,
|
||||
cluster_id = %self.cluster_id,
|
||||
node_id = %self.node_id,
|
||||
"starting reconciliation tick"
|
||||
);
|
||||
let mut client = Client::connect(self.endpoint.clone()).await?;
|
||||
info!("connected to ChainFire");
|
||||
let node_key = key_node(&self.cluster_namespace, &self.cluster_id, &self.node_id);
|
||||
let node_raw = client.get_with_revision(&node_key).await?;
|
||||
let Some((node_bytes, _revision)) = node_raw else {
|
||||
|
|
@ -149,6 +158,11 @@ impl Agent {
|
|||
|
||||
let node: ClusterNodeRecord =
|
||||
serde_json::from_slice(&node_bytes).context("failed to parse node record")?;
|
||||
info!(
|
||||
hostname = %node.hostname,
|
||||
state = node.state.as_deref().unwrap_or("unknown"),
|
||||
"loaded node record"
|
||||
);
|
||||
|
||||
let desired = client
|
||||
.get(key_desired_system(
|
||||
|
|
@ -160,6 +174,11 @@ impl Agent {
|
|||
.map(|bytes| serde_json::from_slice::<DesiredSystemSpec>(&bytes))
|
||||
.transpose()
|
||||
.context("failed to parse desired-system spec")?;
|
||||
info!(
|
||||
has_desired_system = desired.is_some(),
|
||||
has_install_plan = node.install_plan.is_some(),
|
||||
"resolved desired-state inputs"
|
||||
);
|
||||
|
||||
let previous_observed = client
|
||||
.get(key_observed_system(
|
||||
|
|
@ -173,24 +192,87 @@ impl Agent {
|
|||
.context("failed to parse observed-system state")?;
|
||||
|
||||
let mut observed = self.base_observed_state(&node);
|
||||
observed.status = Some("planning".to_string());
|
||||
info!(
|
||||
current_system = observed.current_system.as_deref().unwrap_or(""),
|
||||
configured_system = observed.configured_system.as_deref().unwrap_or(""),
|
||||
booted_system = observed.booted_system.as_deref().unwrap_or(""),
|
||||
"publishing planning status"
|
||||
);
|
||||
self.publish_observed_state(&mut client, &observed).await?;
|
||||
let reconcile_result = self
|
||||
.reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed)
|
||||
.reconcile_node(
|
||||
&node,
|
||||
desired.as_ref(),
|
||||
previous_observed.as_ref(),
|
||||
&mut observed,
|
||||
)
|
||||
.await;
|
||||
if let Err(error) = reconcile_result {
|
||||
observed.status = Some("failed".to_string());
|
||||
observed.last_error = Some(error.to_string());
|
||||
observed.last_error = Some(format!("{error:#}"));
|
||||
}
|
||||
|
||||
info!(
|
||||
status = observed.status.as_deref().unwrap_or("unknown"),
|
||||
"publishing final observed status"
|
||||
);
|
||||
self.publish_observed_state_with_retry(&observed).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn publish_observed_state(
|
||||
&self,
|
||||
client: &mut Client,
|
||||
observed: &ObservedSystemState,
|
||||
) -> Result<()> {
|
||||
info!(
|
||||
status = observed.status.as_deref().unwrap_or("unknown"),
|
||||
"writing observed-system state"
|
||||
);
|
||||
client
|
||||
.put(
|
||||
&key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id),
|
||||
&serde_json::to_vec(&observed)?,
|
||||
&serde_json::to_vec(observed)?,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn publish_observed_state_with_retry(
|
||||
&self,
|
||||
observed: &ObservedSystemState,
|
||||
) -> Result<()> {
|
||||
let payload = serde_json::to_vec(observed)?;
|
||||
let key = key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id);
|
||||
let deadline = Instant::now() + Duration::from_secs(30);
|
||||
let mut attempt = 1u32;
|
||||
|
||||
loop {
|
||||
let result = async {
|
||||
let mut client = Client::connect(self.endpoint.clone()).await?;
|
||||
client.put(&key, &payload).await?;
|
||||
Result::<()>::Ok(())
|
||||
}
|
||||
.await;
|
||||
|
||||
match result {
|
||||
Ok(()) => return Ok(()),
|
||||
Err(error) if Instant::now() < deadline => {
|
||||
warn!(
|
||||
attempt,
|
||||
error = %error,
|
||||
"failed to publish observed-system state; retrying with a fresh connection"
|
||||
);
|
||||
attempt += 1;
|
||||
sleep(Duration::from_secs(2)).await;
|
||||
}
|
||||
Err(error) => return Err(error),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
|
||||
ObservedSystemState {
|
||||
node_id: node.node_id.clone(),
|
||||
|
|
@ -209,7 +291,18 @@ impl Agent {
|
|||
observed: &mut ObservedSystemState,
|
||||
) -> Result<()> {
|
||||
match node.state.as_deref() {
|
||||
Some("failed") | Some("draining") => {
|
||||
Some("failed") => {
|
||||
observed.status = Some("paused".to_string());
|
||||
return Ok(());
|
||||
}
|
||||
Some("draining")
|
||||
if !desired
|
||||
.map(|spec| {
|
||||
spec.deployment_id.is_some()
|
||||
&& spec.drain_before_apply.unwrap_or(false)
|
||||
})
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
observed.status = Some("paused".to_string());
|
||||
return Ok(());
|
||||
}
|
||||
|
|
@ -227,6 +320,14 @@ impl Agent {
|
|||
observed.status = Some("idle".to_string());
|
||||
return Ok(());
|
||||
};
|
||||
info!(
|
||||
nixos_configuration = %desired.nixos_configuration,
|
||||
flake_ref = %desired.flake_ref,
|
||||
switch_action = %desired.switch_action,
|
||||
rollback_on_failure = desired.rollback_on_failure,
|
||||
health_check_command = ?desired.health_check_command,
|
||||
"resolved desired system"
|
||||
);
|
||||
|
||||
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
|
||||
observed.flake_root = Some(desired.flake_ref.clone());
|
||||
|
|
@ -236,6 +337,10 @@ impl Agent {
|
|||
.and_then(|state| state.rollback_system.clone())
|
||||
.or_else(|| observed.current_system.clone());
|
||||
observed.rollback_system = previous_system.clone();
|
||||
info!(
|
||||
previous_system = previous_system.as_deref().unwrap_or(""),
|
||||
"selected rollback baseline"
|
||||
);
|
||||
let target_system = self
|
||||
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
|
||||
.await
|
||||
|
|
@ -246,8 +351,10 @@ impl Agent {
|
|||
)
|
||||
})?;
|
||||
observed.target_system = Some(target_system.clone());
|
||||
info!(target_system = %target_system, "built target system");
|
||||
|
||||
if observed.current_system.as_deref() == Some(target_system.as_str()) {
|
||||
info!("target system already active");
|
||||
if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
|
||||
observed.status = Some("verifying".to_string());
|
||||
observed.last_attempt = Some(Utc::now());
|
||||
|
|
@ -279,8 +386,14 @@ impl Agent {
|
|||
|
||||
observed.status = Some("reconciling".to_string());
|
||||
observed.last_attempt = Some(Utc::now());
|
||||
info!(
|
||||
target_system = %target_system,
|
||||
switch_action = %desired.switch_action,
|
||||
"switching to target system"
|
||||
);
|
||||
self.switch_to_target(&target_system, &desired.switch_action)
|
||||
.await?;
|
||||
info!("switch-to-configuration completed");
|
||||
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
|
|
@ -327,15 +440,20 @@ impl Agent {
|
|||
|
||||
async fn build_target_system(&self, flake_ref: &str, configuration: &str) -> Result<String> {
|
||||
let flake_attr = target_flake_attr(flake_ref, configuration);
|
||||
let output = run_command(
|
||||
"nix",
|
||||
&["build", "--no-link", "--print-out-paths", flake_attr.as_str()],
|
||||
)
|
||||
.await?;
|
||||
info!(flake_attr = %flake_attr, "building target system");
|
||||
let mut build_args = vec![
|
||||
"build",
|
||||
"-L",
|
||||
"--no-link",
|
||||
"--no-write-lock-file",
|
||||
"--print-out-paths",
|
||||
];
|
||||
build_args.push(flake_attr.as_str());
|
||||
let output = run_command("nix", &build_args).await?;
|
||||
let path = output
|
||||
.lines()
|
||||
.find(|line| !line.trim().is_empty())
|
||||
.map(str::trim)
|
||||
.find(|line| line.starts_with("/nix/store/"))
|
||||
.ok_or_else(|| anyhow!("nix build returned no output path"))?;
|
||||
Ok(path.to_string())
|
||||
}
|
||||
|
|
@ -349,7 +467,12 @@ impl Agent {
|
|||
));
|
||||
}
|
||||
|
||||
run_command(
|
||||
info!(
|
||||
switch_bin = %switch_bin.display(),
|
||||
switch_action = %switch_action,
|
||||
"executing switch-to-configuration"
|
||||
);
|
||||
run_command_inherit_output(
|
||||
switch_bin
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("invalid switch path"))?,
|
||||
|
|
@ -369,9 +492,15 @@ impl Agent {
|
|||
return Ok(HealthCheckOutcome::Passed);
|
||||
}
|
||||
|
||||
info!(
|
||||
command = ?desired.health_check_command,
|
||||
rollback_on_failure = desired.rollback_on_failure,
|
||||
"running post-activation health check"
|
||||
);
|
||||
if let Err(error) = run_vec_command(&desired.health_check_command).await {
|
||||
let error_message = format!("health check failed after activation: {error}");
|
||||
if desired.rollback_on_failure {
|
||||
info!("health check failed; rolling back to previous system");
|
||||
self.rollback_to_previous(previous_system).await?;
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
|
|
@ -385,6 +514,7 @@ impl Agent {
|
|||
return Err(anyhow!(error_message));
|
||||
}
|
||||
|
||||
info!("post-activation health check passed");
|
||||
Ok(HealthCheckOutcome::Passed)
|
||||
}
|
||||
|
||||
|
|
@ -392,7 +522,42 @@ impl Agent {
|
|||
let previous_system = previous_system
|
||||
.filter(|value| !value.is_empty())
|
||||
.ok_or_else(|| anyhow!("rollback requested but no previous system is known"))?;
|
||||
self.switch_to_target(previous_system, "switch").await
|
||||
info!(previous_system = %previous_system, "rolling back to previous system");
|
||||
let switch_bin = Path::new(previous_system).join("bin/switch-to-configuration");
|
||||
if switch_bin.exists() {
|
||||
return self.switch_to_target(previous_system, "switch").await;
|
||||
}
|
||||
|
||||
let activate = Path::new(previous_system).join("activate");
|
||||
if !activate.exists() {
|
||||
return Err(anyhow!(
|
||||
"previous system {} does not contain switch-to-configuration or activate",
|
||||
previous_system
|
||||
));
|
||||
}
|
||||
|
||||
info!(
|
||||
previous_system = %previous_system,
|
||||
activate = %activate.display(),
|
||||
"previous system lacks switch-to-configuration; falling back to profile set + activate"
|
||||
);
|
||||
run_command(
|
||||
"nix-env",
|
||||
&[
|
||||
"--profile",
|
||||
"/nix/var/nix/profiles/system",
|
||||
"--set",
|
||||
previous_system,
|
||||
],
|
||||
)
|
||||
.await?;
|
||||
run_command_inherit_output(
|
||||
activate
|
||||
.to_str()
|
||||
.ok_or_else(|| anyhow!("invalid activate path"))?,
|
||||
&[],
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -458,6 +623,8 @@ fn read_symlink_target(path: &str) -> Option<String> {
|
|||
}
|
||||
|
||||
async fn run_command(program: &str, args: &[&str]) -> Result<String> {
|
||||
let started_at = Instant::now();
|
||||
info!(program = %program, args = ?args, "running command");
|
||||
let output = Command::new(program)
|
||||
.args(args)
|
||||
.stdin(Stdio::null())
|
||||
|
|
@ -468,10 +635,25 @@ async fn run_command(program: &str, args: &[&str]) -> Result<String> {
|
|||
.with_context(|| format!("failed to execute {}", program))?;
|
||||
|
||||
if output.status.success() {
|
||||
info!(
|
||||
program = %program,
|
||||
args = ?args,
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"command completed successfully"
|
||||
);
|
||||
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
|
||||
let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
||||
warn!(
|
||||
program = %program,
|
||||
args = ?args,
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
status = %output.status,
|
||||
stdout = %stdout,
|
||||
stderr = %stderr,
|
||||
"command failed"
|
||||
);
|
||||
Err(anyhow!(
|
||||
"{} {:?} failed with status {}: stdout='{}' stderr='{}'",
|
||||
program,
|
||||
|
|
@ -491,6 +673,47 @@ async fn run_vec_command(command: &[String]) -> Result<String> {
|
|||
run_command(program, &arg_refs).await
|
||||
}
|
||||
|
||||
async fn run_command_inherit_output(program: &str, args: &[&str]) -> Result<()> {
|
||||
let started_at = Instant::now();
|
||||
info!(
|
||||
program = %program,
|
||||
args = ?args,
|
||||
"running command with inherited output"
|
||||
);
|
||||
let status = Command::new(program)
|
||||
.args(args)
|
||||
.stdin(Stdio::null())
|
||||
.stdout(Stdio::inherit())
|
||||
.stderr(Stdio::inherit())
|
||||
.status()
|
||||
.await
|
||||
.with_context(|| format!("failed to execute {}", program))?;
|
||||
|
||||
if status.success() {
|
||||
info!(
|
||||
program = %program,
|
||||
args = ?args,
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
"command completed successfully"
|
||||
);
|
||||
Ok(())
|
||||
} else {
|
||||
warn!(
|
||||
program = %program,
|
||||
args = ?args,
|
||||
elapsed_ms = started_at.elapsed().as_millis(),
|
||||
status = %status,
|
||||
"command failed"
|
||||
);
|
||||
Err(anyhow!(
|
||||
"{} {:?} failed with status {}",
|
||||
program,
|
||||
args,
|
||||
status
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
tracing_subscriber::fmt()
|
||||
|
|
@ -543,6 +766,12 @@ mod tests {
|
|||
}),
|
||||
hardware_facts: None,
|
||||
state: Some("active".to_string()),
|
||||
commission_state: None,
|
||||
install_state: None,
|
||||
commissioned_at: None,
|
||||
last_inventory_hash: None,
|
||||
power_state: None,
|
||||
bmc_ref: None,
|
||||
last_heartbeat: None,
|
||||
}
|
||||
}
|
||||
|
|
@ -568,11 +797,13 @@ mod tests {
|
|||
fn resolve_desired_system_prefers_chainfire_spec() {
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
flake_ref: Some("github:centra/cloud".to_string()),
|
||||
switch_action: Some("boot".to_string()),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(false),
|
||||
};
|
||||
|
||||
let resolved = resolve_desired_system(
|
||||
|
|
@ -595,11 +826,13 @@ mod tests {
|
|||
fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
flake_ref: None,
|
||||
switch_action: None,
|
||||
health_check_command: Vec::new(),
|
||||
rollback_on_failure: None,
|
||||
drain_before_apply: None,
|
||||
};
|
||||
|
||||
let resolved = resolve_desired_system(
|
||||
|
|
@ -631,7 +864,10 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn read_symlink_target_returns_none_for_missing_path() {
|
||||
assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None);
|
||||
assert_eq!(
|
||||
read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -9,6 +9,8 @@ repository.workspace = true
|
|||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
chainfire-client.workspace = true
|
||||
chrono.workspace = true
|
||||
serde.workspace = true
|
||||
serde_json.workspace = true
|
||||
tokio.workspace = true
|
||||
|
|
@ -16,5 +18,6 @@ tracing.workspace = true
|
|||
tracing-subscriber.workspace = true
|
||||
fiberlb-api.workspace = true
|
||||
flashdns-api.workspace = true
|
||||
deployer-types.workspace = true
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
tonic = "0.12"
|
||||
|
|
|
|||
823
deployer/crates/plasmacloud-reconciler/src/hosts.rs
Normal file
823
deployer/crates/plasmacloud-reconciler/src/hosts.rs
Normal file
|
|
@ -0,0 +1,823 @@
|
|||
use anyhow::Result;
|
||||
use chainfire_client::Client;
|
||||
use chrono::Utc;
|
||||
use clap::Args;
|
||||
use deployer_types::{
|
||||
ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector,
|
||||
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState, ServiceInstanceSpec,
|
||||
};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::time::Duration;
|
||||
use tokio::time::sleep;
|
||||
use tracing::{info, warn};
|
||||
|
||||
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
|
||||
format!("{}/clusters/{}/", cluster_namespace, cluster_id)
|
||||
}
|
||||
|
||||
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
|
||||
format!(
|
||||
"{}nodes/{}",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
node_id
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
|
||||
format!(
|
||||
"{}nodes/{}/desired-system",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
node_id
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
fn key_host_deployment_status(
|
||||
cluster_namespace: &str,
|
||||
cluster_id: &str,
|
||||
deployment_name: &str,
|
||||
) -> Vec<u8> {
|
||||
format!(
|
||||
"{}deployments/hosts/{}/status",
|
||||
cluster_prefix(cluster_namespace, cluster_id),
|
||||
deployment_name
|
||||
)
|
||||
.into_bytes()
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Args)]
|
||||
pub struct HostsCommand {
|
||||
#[arg(long)]
|
||||
pub endpoint: String,
|
||||
|
||||
#[arg(long, default_value = "photoncloud")]
|
||||
pub cluster_namespace: String,
|
||||
|
||||
#[arg(long)]
|
||||
pub cluster_id: String,
|
||||
|
||||
#[arg(long, default_value_t = 15)]
|
||||
pub interval_secs: u64,
|
||||
|
||||
#[arg(long, default_value_t = 300)]
|
||||
pub heartbeat_timeout_secs: u64,
|
||||
|
||||
#[arg(long, default_value_t = false)]
|
||||
pub dry_run: bool,
|
||||
|
||||
#[arg(long, default_value_t = false)]
|
||||
pub once: bool,
|
||||
}
|
||||
|
||||
pub async fn run(command: HostsCommand) -> Result<()> {
|
||||
let controller = HostDeploymentController::new(command);
|
||||
if controller.once {
|
||||
controller.reconcile_once().await
|
||||
} else {
|
||||
loop {
|
||||
if let Err(error) = controller.reconcile_once().await {
|
||||
warn!(error = %error, "host deployment reconciliation failed");
|
||||
}
|
||||
sleep(controller.interval).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct HostDeploymentController {
|
||||
endpoint: String,
|
||||
cluster_namespace: String,
|
||||
cluster_id: String,
|
||||
interval: Duration,
|
||||
heartbeat_timeout_secs: u64,
|
||||
dry_run: bool,
|
||||
once: bool,
|
||||
}
|
||||
|
||||
impl HostDeploymentController {
|
||||
fn new(command: HostsCommand) -> Self {
|
||||
Self {
|
||||
endpoint: command.endpoint,
|
||||
cluster_namespace: command.cluster_namespace,
|
||||
cluster_id: command.cluster_id,
|
||||
interval: Duration::from_secs(command.interval_secs),
|
||||
heartbeat_timeout_secs: command.heartbeat_timeout_secs,
|
||||
dry_run: command.dry_run,
|
||||
once: command.once,
|
||||
}
|
||||
}
|
||||
|
||||
async fn reconcile_once(&self) -> Result<()> {
|
||||
let mut client = Client::connect(self.endpoint.clone()).await?;
|
||||
let nodes = self.load_nodes(&mut client).await?;
|
||||
let desired_systems = self.load_desired_systems(&mut client).await?;
|
||||
let observed_systems = self.load_observed_systems(&mut client).await?;
|
||||
let instances = self.load_instances(&mut client).await?;
|
||||
let deployments = self.load_host_deployments(&mut client).await?;
|
||||
let statuses = self.load_host_deployment_statuses(&mut client).await?;
|
||||
|
||||
info!(
|
||||
nodes = nodes.len(),
|
||||
deployments = deployments.len(),
|
||||
instances = instances.len(),
|
||||
"loaded host deployment inputs"
|
||||
);
|
||||
|
||||
for deployment in deployments {
|
||||
let existing_status = statuses.get(&deployment.name).cloned();
|
||||
let plan = plan_host_deployment(
|
||||
&deployment,
|
||||
existing_status.as_ref(),
|
||||
&nodes,
|
||||
&desired_systems,
|
||||
&observed_systems,
|
||||
&instances,
|
||||
self.heartbeat_timeout_secs,
|
||||
);
|
||||
|
||||
if self.dry_run {
|
||||
info!(
|
||||
deployment = %deployment.name,
|
||||
phase = plan.status.phase.as_deref().unwrap_or("unknown"),
|
||||
desired_upserts = plan.desired_upserts.len(),
|
||||
desired_deletes = plan.desired_deletes.len(),
|
||||
node_updates = plan.node_updates.len(),
|
||||
"would reconcile host deployment"
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
for desired in &plan.desired_upserts {
|
||||
client
|
||||
.put(
|
||||
&key_desired_system(
|
||||
&self.cluster_namespace,
|
||||
&self.cluster_id,
|
||||
&desired.node_id,
|
||||
),
|
||||
&serde_json::to_vec(desired)?,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
for node_id in &plan.desired_deletes {
|
||||
client
|
||||
.delete(&key_desired_system(
|
||||
&self.cluster_namespace,
|
||||
&self.cluster_id,
|
||||
node_id,
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
|
||||
for node in plan.node_updates.values() {
|
||||
client
|
||||
.put(
|
||||
&key_node(&self.cluster_namespace, &self.cluster_id, &node.node_id),
|
||||
&serde_json::to_vec(node)?,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
client
|
||||
.put(
|
||||
&key_host_deployment_status(
|
||||
&self.cluster_namespace,
|
||||
&self.cluster_id,
|
||||
&deployment.name,
|
||||
),
|
||||
&serde_json::to_vec(&plan.status)?,
|
||||
)
|
||||
.await?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn load_nodes(&self, client: &mut Client) -> Result<Vec<ClusterNodeRecord>> {
|
||||
let prefix = format!(
|
||||
"{}nodes/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut nodes = Vec::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
let Some(suffix) = key.strip_prefix(&prefix) else {
|
||||
continue;
|
||||
};
|
||||
if suffix.contains('/') {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_slice::<ClusterNodeRecord>(&value) {
|
||||
Ok(node) => nodes.push(node),
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode cluster node"),
|
||||
}
|
||||
}
|
||||
|
||||
nodes.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
|
||||
Ok(nodes)
|
||||
}
|
||||
|
||||
async fn load_desired_systems(
|
||||
&self,
|
||||
client: &mut Client,
|
||||
) -> Result<HashMap<String, DesiredSystemSpec>> {
|
||||
let prefix = format!(
|
||||
"{}nodes/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut desired = HashMap::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
if !key.ends_with("/desired-system") {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_slice::<DesiredSystemSpec>(&value) {
|
||||
Ok(spec) => {
|
||||
desired.insert(spec.node_id.clone(), spec);
|
||||
}
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode desired-system"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(desired)
|
||||
}
|
||||
|
||||
async fn load_observed_systems(
|
||||
&self,
|
||||
client: &mut Client,
|
||||
) -> Result<HashMap<String, ObservedSystemState>> {
|
||||
let prefix = format!(
|
||||
"{}nodes/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut observed = HashMap::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
if !key.ends_with("/observed-system") {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_slice::<ObservedSystemState>(&value) {
|
||||
Ok(state) => {
|
||||
observed.insert(state.node_id.clone(), state);
|
||||
}
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode observed-system"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(observed)
|
||||
}
|
||||
|
||||
async fn load_instances(&self, client: &mut Client) -> Result<Vec<ServiceInstanceSpec>> {
|
||||
let prefix = format!(
|
||||
"{}instances/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut instances = Vec::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
|
||||
Ok(instance) => instances.push(instance),
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode service instance"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(instances)
|
||||
}
|
||||
|
||||
async fn load_host_deployments(&self, client: &mut Client) -> Result<Vec<HostDeploymentSpec>> {
|
||||
let prefix = format!(
|
||||
"{}deployments/hosts/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut deployments = Vec::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
if !key.ends_with("/spec") {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_slice::<HostDeploymentSpec>(&value) {
|
||||
Ok(spec) => deployments.push(spec),
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment"),
|
||||
}
|
||||
}
|
||||
|
||||
deployments.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
|
||||
Ok(deployments)
|
||||
}
|
||||
|
||||
async fn load_host_deployment_statuses(
|
||||
&self,
|
||||
client: &mut Client,
|
||||
) -> Result<HashMap<String, HostDeploymentStatus>> {
|
||||
let prefix = format!(
|
||||
"{}deployments/hosts/",
|
||||
cluster_prefix(&self.cluster_namespace, &self.cluster_id)
|
||||
);
|
||||
let kvs = client.get_prefix(prefix.as_bytes()).await?;
|
||||
let mut statuses = HashMap::new();
|
||||
|
||||
for (key, value) in kvs {
|
||||
let key = String::from_utf8_lossy(&key);
|
||||
if !key.ends_with("/status") {
|
||||
continue;
|
||||
}
|
||||
match serde_json::from_slice::<HostDeploymentStatus>(&value) {
|
||||
Ok(status) => {
|
||||
statuses.insert(status.name.clone(), status);
|
||||
}
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment status"),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(statuses)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct HostDeploymentPlan {
|
||||
status: HostDeploymentStatus,
|
||||
desired_upserts: Vec<DesiredSystemSpec>,
|
||||
desired_deletes: Vec<String>,
|
||||
node_updates: BTreeMap<String, ClusterNodeRecord>,
|
||||
}
|
||||
|
||||
fn plan_host_deployment(
|
||||
deployment: &HostDeploymentSpec,
|
||||
existing_status: Option<&HostDeploymentStatus>,
|
||||
nodes: &[ClusterNodeRecord],
|
||||
desired_systems: &HashMap<String, DesiredSystemSpec>,
|
||||
observed_systems: &HashMap<String, ObservedSystemState>,
|
||||
instances: &[ServiceInstanceSpec],
|
||||
heartbeat_timeout_secs: u64,
|
||||
) -> HostDeploymentPlan {
|
||||
let now = Utc::now();
|
||||
let target_configuration = deployment.nixos_configuration.clone();
|
||||
let selector_matches = select_nodes(nodes, &deployment.selector);
|
||||
let selected_node_ids = selector_matches
|
||||
.iter()
|
||||
.map(|node| node.node_id.clone())
|
||||
.collect::<HashSet<_>>();
|
||||
let instance_counts = active_instances_per_node(instances);
|
||||
let mut completed = Vec::new();
|
||||
let mut in_progress = Vec::new();
|
||||
let mut failed = Vec::new();
|
||||
let mut eligible_candidates = Vec::new();
|
||||
let mut desired_upserts = Vec::new();
|
||||
let mut node_updates = BTreeMap::new();
|
||||
let batch_size = deployment.batch_size.unwrap_or(1).max(1) as usize;
|
||||
let max_unavailable = deployment.max_unavailable.unwrap_or(1).max(1) as usize;
|
||||
let operator_paused = existing_status
|
||||
.map(|status| status.paused_by_operator)
|
||||
.unwrap_or(false);
|
||||
let spec_paused = deployment.paused.unwrap_or(false);
|
||||
let mut desired_deletes = desired_systems
|
||||
.iter()
|
||||
.filter(|(node_id, desired)| {
|
||||
desired.deployment_id.as_deref() == Some(deployment.name.as_str())
|
||||
&& !selected_node_ids.contains(node_id.as_str())
|
||||
})
|
||||
.map(|(node_id, _)| node_id.clone())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
for node in &selector_matches {
|
||||
let desired = desired_systems.get(&node.node_id);
|
||||
let observed = observed_systems.get(&node.node_id);
|
||||
let is_completed =
|
||||
is_node_completed(deployment, node, desired, observed, target_configuration.as_deref());
|
||||
let is_failed = is_node_failed(deployment, desired, observed);
|
||||
let is_in_progress = is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
|
||||
|| (deployment.drain_before_apply == Some(true)
|
||||
&& node.state.as_deref() == Some("draining")
|
||||
&& instance_counts.get(&node.node_id).copied().unwrap_or_default() > 0);
|
||||
|
||||
if is_completed {
|
||||
completed.push(node.node_id.clone());
|
||||
if deployment.drain_before_apply == Some(true) && node.state.as_deref() == Some("draining")
|
||||
{
|
||||
let mut updated = (*node).clone();
|
||||
updated.state = Some("active".to_string());
|
||||
node_updates.insert(updated.node_id.clone(), updated);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if is_failed {
|
||||
failed.push(node.node_id.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
if is_in_progress {
|
||||
in_progress.push(node.node_id.clone());
|
||||
continue;
|
||||
}
|
||||
|
||||
if node_is_rollout_candidate(node, heartbeat_timeout_secs) {
|
||||
eligible_candidates.push((*node).clone());
|
||||
}
|
||||
}
|
||||
|
||||
let unavailable = in_progress.len() + failed.len();
|
||||
let paused = operator_paused || spec_paused || !failed.is_empty();
|
||||
let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable);
|
||||
let remaining_batch_budget = batch_size.saturating_sub(in_progress.len());
|
||||
let max_starts = if deployment.nixos_configuration.is_some() {
|
||||
remaining_unavailable_budget.min(remaining_batch_budget)
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let mut planned = 0usize;
|
||||
let mut newly_started = Vec::new();
|
||||
|
||||
if !paused && max_starts > 0 {
|
||||
for node in eligible_candidates {
|
||||
if planned >= max_starts {
|
||||
break;
|
||||
}
|
||||
|
||||
let remaining_instances = instance_counts.get(&node.node_id).copied().unwrap_or_default();
|
||||
if deployment.drain_before_apply == Some(true) && remaining_instances > 0 {
|
||||
let mut updated = node.clone();
|
||||
updated.state = Some("draining".to_string());
|
||||
node_updates.insert(updated.node_id.clone(), updated);
|
||||
in_progress.push(node.node_id.clone());
|
||||
newly_started.push(node.node_id.clone());
|
||||
planned += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: node.node_id.clone(),
|
||||
deployment_id: Some(deployment.name.clone()),
|
||||
nixos_configuration: deployment.nixos_configuration.clone(),
|
||||
flake_ref: deployment.flake_ref.clone(),
|
||||
switch_action: deployment.switch_action.clone().or_else(|| Some("switch".to_string())),
|
||||
health_check_command: deployment.health_check_command.clone(),
|
||||
rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)),
|
||||
drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)),
|
||||
};
|
||||
newly_started.push(node.node_id.clone());
|
||||
in_progress.push(node.node_id.clone());
|
||||
planned += 1;
|
||||
if deployment.drain_before_apply == Some(true) && node.state.as_deref() != Some("draining")
|
||||
{
|
||||
let mut updated = node.clone();
|
||||
updated.state = Some("draining".to_string());
|
||||
node_updates.insert(updated.node_id.clone(), updated);
|
||||
}
|
||||
desired_upserts.push(desired);
|
||||
}
|
||||
}
|
||||
|
||||
let mut status = existing_status.cloned().unwrap_or_default();
|
||||
status.name = deployment.name.clone();
|
||||
status.selected_nodes = selector_matches.iter().map(|node| node.node_id.clone()).collect();
|
||||
status.completed_nodes = dedup_sorted(completed);
|
||||
status.in_progress_nodes = dedup_sorted(in_progress);
|
||||
status.failed_nodes = dedup_sorted(failed);
|
||||
status.paused_by_operator = operator_paused;
|
||||
status.paused = paused;
|
||||
status.phase = Some(if status.selected_nodes.is_empty() {
|
||||
"idle"
|
||||
} else if deployment.nixos_configuration.is_none() {
|
||||
"invalid"
|
||||
} else if status.paused {
|
||||
"paused"
|
||||
} else if status.completed_nodes.len() == status.selected_nodes.len() {
|
||||
"completed"
|
||||
} else if !newly_started.is_empty() || !status.in_progress_nodes.is_empty() {
|
||||
"running"
|
||||
} else {
|
||||
"ready"
|
||||
}
|
||||
.to_string());
|
||||
status.message = Some(format!(
|
||||
"selected={} completed={} in_progress={} failed={} newly_started={}",
|
||||
status.selected_nodes.len(),
|
||||
status.completed_nodes.len(),
|
||||
status.in_progress_nodes.len(),
|
||||
status.failed_nodes.len(),
|
||||
newly_started.len()
|
||||
));
|
||||
status.updated_at = Some(now);
|
||||
|
||||
HostDeploymentPlan {
|
||||
status,
|
||||
desired_upserts,
|
||||
desired_deletes: {
|
||||
desired_deletes.sort();
|
||||
desired_deletes.dedup();
|
||||
desired_deletes
|
||||
},
|
||||
node_updates,
|
||||
}
|
||||
}
|
||||
|
||||
fn select_nodes<'a>(
|
||||
nodes: &'a [ClusterNodeRecord],
|
||||
selector: &HostDeploymentSelector,
|
||||
) -> Vec<&'a ClusterNodeRecord> {
|
||||
let explicit_nodes = selector.node_ids.iter().collect::<HashSet<_>>();
|
||||
let explicit_mode = !explicit_nodes.is_empty();
|
||||
let mut selected = nodes
|
||||
.iter()
|
||||
.filter(|node| {
|
||||
(!explicit_mode || explicit_nodes.contains(&node.node_id))
|
||||
&& (selector.roles.is_empty()
|
||||
|| node
|
||||
.roles
|
||||
.iter()
|
||||
.any(|role| selector.roles.iter().any(|expected| expected == role)))
|
||||
&& (selector.pools.is_empty()
|
||||
|| node
|
||||
.pool
|
||||
.as_deref()
|
||||
.map(|pool| selector.pools.iter().any(|expected| expected == pool))
|
||||
.unwrap_or(false))
|
||||
&& (selector.node_classes.is_empty()
|
||||
|| node
|
||||
.node_class
|
||||
.as_deref()
|
||||
.map(|node_class| {
|
||||
selector
|
||||
.node_classes
|
||||
.iter()
|
||||
.any(|expected| expected == node_class)
|
||||
})
|
||||
.unwrap_or(false))
|
||||
&& selector
|
||||
.match_labels
|
||||
.iter()
|
||||
.all(|(key, value)| node.labels.get(key) == Some(value))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
selected.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
|
||||
selected
|
||||
}
|
||||
|
||||
fn active_instances_per_node(instances: &[ServiceInstanceSpec]) -> HashMap<String, usize> {
|
||||
let mut counts = HashMap::new();
|
||||
for instance in instances {
|
||||
if matches!(instance.state.as_deref(), Some("failed") | Some("deleted")) {
|
||||
continue;
|
||||
}
|
||||
*counts.entry(instance.node_id.clone()).or_insert(0usize) += 1;
|
||||
}
|
||||
counts
|
||||
}
|
||||
|
||||
fn node_is_rollout_candidate(node: &ClusterNodeRecord, heartbeat_timeout_secs: u64) -> bool {
|
||||
if matches!(
|
||||
node.commission_state,
|
||||
Some(CommissionState::Discovered | CommissionState::Commissioning)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
if matches!(
|
||||
node.install_state,
|
||||
Some(
|
||||
InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
if !matches!(node.state.as_deref(), Some("active") | Some("draining")) {
|
||||
return false;
|
||||
}
|
||||
if heartbeat_timeout_secs == 0 {
|
||||
return true;
|
||||
}
|
||||
let Some(last) = node.last_heartbeat else {
|
||||
return true;
|
||||
};
|
||||
Utc::now().signed_duration_since(last).num_seconds() <= heartbeat_timeout_secs as i64
|
||||
}
|
||||
|
||||
fn is_node_completed(
|
||||
deployment: &HostDeploymentSpec,
|
||||
_node: &ClusterNodeRecord,
|
||||
desired: Option<&DesiredSystemSpec>,
|
||||
observed: Option<&ObservedSystemState>,
|
||||
target_configuration: Option<&str>,
|
||||
) -> bool {
|
||||
observed
|
||||
.filter(|observed| observed.status.as_deref() == Some("active"))
|
||||
.and_then(|observed| observed.nixos_configuration.as_deref())
|
||||
.zip(target_configuration)
|
||||
.map(|(observed_configuration, target)| observed_configuration == target)
|
||||
.unwrap_or(false)
|
||||
&& desired
|
||||
.and_then(|desired| desired.deployment_id.as_deref())
|
||||
.map(|deployment_id| deployment_id == deployment.name)
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn is_node_failed(
|
||||
deployment: &HostDeploymentSpec,
|
||||
desired: Option<&DesiredSystemSpec>,
|
||||
observed: Option<&ObservedSystemState>,
|
||||
) -> bool {
|
||||
desired
|
||||
.and_then(|desired| desired.deployment_id.as_deref())
|
||||
.map(|deployment_id| deployment_id == deployment.name)
|
||||
.unwrap_or(false)
|
||||
&& observed
|
||||
.and_then(|observed| observed.status.as_deref())
|
||||
.map(|status| matches!(status, "failed" | "rolled-back"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn is_node_in_progress(
|
||||
deployment: &HostDeploymentSpec,
|
||||
desired: Option<&DesiredSystemSpec>,
|
||||
observed: Option<&ObservedSystemState>,
|
||||
is_completed: bool,
|
||||
is_failed: bool,
|
||||
) -> bool {
|
||||
if is_completed || is_failed {
|
||||
return false;
|
||||
}
|
||||
desired
|
||||
.and_then(|desired| desired.deployment_id.as_deref())
|
||||
.map(|deployment_id| deployment_id == deployment.name)
|
||||
.unwrap_or(false)
|
||||
|| observed
|
||||
.and_then(|observed| observed.status.as_deref())
|
||||
.map(|status| matches!(status, "planning" | "pending" | "reconciling" | "verifying" | "staged"))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn dedup_sorted(mut values: Vec<String>) -> Vec<String> {
|
||||
values.sort();
|
||||
values.dedup();
|
||||
values
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn test_node(node_id: &str, failure_domain: &str) -> ClusterNodeRecord {
|
||||
ClusterNodeRecord {
|
||||
node_id: node_id.to_string(),
|
||||
machine_id: None,
|
||||
ip: "10.0.0.1".to_string(),
|
||||
hostname: node_id.to_string(),
|
||||
roles: vec!["worker".to_string()],
|
||||
labels: HashMap::from([
|
||||
("tier".to_string(), "general".to_string()),
|
||||
("failure_domain".to_string(), failure_domain.to_string()),
|
||||
]),
|
||||
pool: Some("general".to_string()),
|
||||
node_class: Some("worker-linux".to_string()),
|
||||
failure_domain: Some(failure_domain.to_string()),
|
||||
nix_profile: None,
|
||||
install_plan: None,
|
||||
hardware_facts: None,
|
||||
state: Some("active".to_string()),
|
||||
commission_state: Some(CommissionState::Commissioned),
|
||||
install_state: Some(InstallState::Installed),
|
||||
commissioned_at: None,
|
||||
last_inventory_hash: None,
|
||||
power_state: None,
|
||||
bmc_ref: None,
|
||||
last_heartbeat: Some(Utc::now()),
|
||||
}
|
||||
}
|
||||
|
||||
fn test_deployment() -> HostDeploymentSpec {
|
||||
HostDeploymentSpec {
|
||||
name: "worker-rollout".to_string(),
|
||||
selector: HostDeploymentSelector {
|
||||
node_ids: vec![],
|
||||
roles: vec!["worker".to_string()],
|
||||
pools: vec!["general".to_string()],
|
||||
node_classes: vec!["worker-linux".to_string()],
|
||||
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
|
||||
},
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
batch_size: Some(1),
|
||||
max_unavailable: Some(1),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
switch_action: Some("switch".to_string()),
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(false),
|
||||
reboot_policy: None,
|
||||
paused: Some(false),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn plan_rollout_starts_one_node_per_batch() {
|
||||
let deployment = test_deployment();
|
||||
let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
|
||||
let plan = plan_host_deployment(
|
||||
&deployment,
|
||||
None,
|
||||
&nodes,
|
||||
&HashMap::new(),
|
||||
&HashMap::new(),
|
||||
&[],
|
||||
300,
|
||||
);
|
||||
|
||||
assert_eq!(plan.desired_upserts.len(), 1);
|
||||
assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
|
||||
assert_eq!(plan.status.phase.as_deref(), Some("running"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn plan_rollout_pauses_on_failed_node() {
|
||||
let deployment = test_deployment();
|
||||
let nodes = vec![test_node("node01", "rack-a"), test_node("node02", "rack-b")];
|
||||
let desired = HashMap::from([(
|
||||
"node01".to_string(),
|
||||
DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: Some("worker-rollout".to_string()),
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
flake_ref: None,
|
||||
switch_action: Some("switch".to_string()),
|
||||
health_check_command: Vec::new(),
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(false),
|
||||
},
|
||||
)]);
|
||||
let observed = HashMap::from([(
|
||||
"node01".to_string(),
|
||||
ObservedSystemState {
|
||||
node_id: "node01".to_string(),
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
status: Some("rolled-back".to_string()),
|
||||
..ObservedSystemState::default()
|
||||
},
|
||||
)]);
|
||||
|
||||
let plan = plan_host_deployment(
|
||||
&deployment,
|
||||
None,
|
||||
&nodes,
|
||||
&desired,
|
||||
&observed,
|
||||
&[],
|
||||
300,
|
||||
);
|
||||
|
||||
assert!(plan.desired_upserts.is_empty());
|
||||
assert!(plan.status.paused);
|
||||
assert_eq!(plan.status.failed_nodes, vec!["node01".to_string()]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn plan_rollout_drains_before_apply_when_instances_exist() {
|
||||
let mut deployment = test_deployment();
|
||||
deployment.drain_before_apply = Some(true);
|
||||
let nodes = vec![test_node("node01", "rack-a")];
|
||||
let instances = vec![ServiceInstanceSpec {
|
||||
instance_id: "api-node01".to_string(),
|
||||
service: "api".to_string(),
|
||||
node_id: "node01".to_string(),
|
||||
ip: "10.0.0.1".to_string(),
|
||||
port: 8080,
|
||||
mesh_port: None,
|
||||
version: None,
|
||||
health_check: None,
|
||||
process: None,
|
||||
container: None,
|
||||
managed_by: Some("fleet-scheduler".to_string()),
|
||||
state: Some("active".to_string()),
|
||||
last_heartbeat: None,
|
||||
observed_at: None,
|
||||
}];
|
||||
|
||||
let plan = plan_host_deployment(
|
||||
&deployment,
|
||||
None,
|
||||
&nodes,
|
||||
&HashMap::new(),
|
||||
&HashMap::new(),
|
||||
&instances,
|
||||
300,
|
||||
);
|
||||
|
||||
assert!(plan.desired_upserts.is_empty());
|
||||
assert_eq!(
|
||||
plan.node_updates
|
||||
.get("node01")
|
||||
.and_then(|node| node.state.as_deref()),
|
||||
Some("draining")
|
||||
);
|
||||
assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
|
||||
}
|
||||
}
|
||||
|
|
@ -29,9 +29,9 @@ use fiberlb_api::{
|
|||
};
|
||||
|
||||
use flashdns_api::RecordServiceClient;
|
||||
use flashdns_api::ReverseZoneServiceClient;
|
||||
use flashdns_api::ZoneServiceClient;
|
||||
use flashdns_api::proto::{
|
||||
reverse_zone_service_client::ReverseZoneServiceClient,
|
||||
record_data, ARecord, AaaaRecord, CaaRecord, CnameRecord, CreateRecordRequest,
|
||||
CreateReverseZoneRequest, CreateZoneRequest, DeleteRecordRequest, DeleteReverseZoneRequest,
|
||||
DeleteZoneRequest, ListReverseZonesRequest, MxRecord, NsRecord, PtrRecord, RecordData,
|
||||
|
|
@ -39,6 +39,8 @@ use flashdns_api::proto::{
|
|||
ZoneInfo,
|
||||
};
|
||||
|
||||
mod hosts;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about)]
|
||||
struct Cli {
|
||||
|
|
@ -71,6 +73,9 @@ enum Command {
|
|||
#[arg(long, default_value_t = false)]
|
||||
prune: bool,
|
||||
},
|
||||
|
||||
/// Reconcile host deployments into per-node desired-system state
|
||||
Hosts(hosts::HostsCommand),
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
|
|
@ -294,6 +299,9 @@ async fn main() -> Result<()> {
|
|||
let spec: DnsConfig = read_json(&config).await?;
|
||||
reconcile_dns(spec, endpoint, prune).await?;
|
||||
}
|
||||
Command::Hosts(command) => {
|
||||
hosts::run(command).await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -7,6 +7,30 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
|
|||
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
|
||||
fi
|
||||
|
||||
run_chainfire_server_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
|
||||
else
|
||||
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_deployer_server_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_DEPLOYER_SERVER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_DEPLOYER_SERVER_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_deployer_ctl_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
tmp_dir="$(mktemp -d)"
|
||||
cf_pid=""
|
||||
deployer_pid=""
|
||||
|
|
@ -128,7 +152,7 @@ role = "voter"
|
|||
EOF
|
||||
|
||||
echo "Starting ChainFire on 127.0.0.1:${api_port}"
|
||||
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
|
||||
run_chainfire_server_bin \
|
||||
--config "$tmp_dir/chainfire.toml" \
|
||||
>"$tmp_dir/chainfire.log" 2>&1 &
|
||||
cf_pid="$!"
|
||||
|
|
@ -155,7 +179,7 @@ namespace = "deployer"
|
|||
EOF
|
||||
|
||||
echo "Starting Deployer on 127.0.0.1:${deployer_port}"
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-server -- \
|
||||
run_deployer_server_bin \
|
||||
--config "$tmp_dir/deployer.toml" \
|
||||
>"$tmp_dir/deployer.log" 2>&1 &
|
||||
deployer_pid="$!"
|
||||
|
|
@ -240,7 +264,7 @@ chainfire_endpoint="http://127.0.0.1:${api_port}"
|
|||
deployer_endpoint="http://127.0.0.1:${deployer_port}"
|
||||
|
||||
run_deployer_ctl() {
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
|
||||
run_deployer_ctl_bin \
|
||||
--chainfire-endpoint "$chainfire_endpoint" \
|
||||
--cluster-id test-cluster \
|
||||
--cluster-namespace photoncloud \
|
||||
|
|
|
|||
|
|
@ -7,6 +7,38 @@ if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
|
|||
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
|
||||
fi
|
||||
|
||||
run_chainfire_server_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
|
||||
else
|
||||
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_deployer_ctl_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_node_agent_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_NODE_AGENT_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_NODE_AGENT_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_fleet_scheduler_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_FLEET_SCHEDULER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_FLEET_SCHEDULER_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
tmp_dir="$(mktemp -d)"
|
||||
cf_pid=""
|
||||
|
||||
|
|
@ -104,7 +136,7 @@ EOF
|
|||
mkdir -p "$tmp_dir/pids"
|
||||
|
||||
echo "Starting ChainFire on 127.0.0.1:${api_port}"
|
||||
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- \
|
||||
run_chainfire_server_bin \
|
||||
--config "$tmp_dir/chainfire.toml" \
|
||||
>"$tmp_dir/chainfire.log" 2>&1 &
|
||||
cf_pid="$!"
|
||||
|
|
@ -256,7 +288,7 @@ EOF
|
|||
endpoint="http://127.0.0.1:${api_port}"
|
||||
|
||||
run_deployer_ctl() {
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- \
|
||||
run_deployer_ctl_bin \
|
||||
--chainfire-endpoint "$endpoint" \
|
||||
--cluster-id test-cluster \
|
||||
"$@"
|
||||
|
|
@ -266,7 +298,7 @@ run_node_agent_once() {
|
|||
local node_id="$1"
|
||||
local pid_dir="$tmp_dir/pids/$node_id"
|
||||
mkdir -p "$pid_dir"
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p node-agent -- \
|
||||
run_node_agent_bin \
|
||||
--chainfire-endpoint "$endpoint" \
|
||||
--cluster-id test-cluster \
|
||||
--node-id "$node_id" \
|
||||
|
|
@ -277,7 +309,7 @@ run_node_agent_once() {
|
|||
}
|
||||
|
||||
run_scheduler_once() {
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p fleet-scheduler -- \
|
||||
run_fleet_scheduler_bin \
|
||||
--chainfire-endpoint "$endpoint" \
|
||||
--cluster-id test-cluster \
|
||||
--interval-secs 1 \
|
||||
|
|
|
|||
431
deployer/scripts/verify-host-lifecycle-e2e.sh
Normal file
431
deployer/scripts/verify-host-lifecycle-e2e.sh
Normal file
|
|
@ -0,0 +1,431 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
|
||||
|
||||
if [[ -z "${PHOTONCLOUD_E2E_IN_NIX:-}" ]]; then
|
||||
exec nix develop "$ROOT" -c env PHOTONCLOUD_E2E_IN_NIX=1 bash "$0" "$@"
|
||||
fi
|
||||
|
||||
run_chainfire_server_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_CHAINFIRE_SERVER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_CHAINFIRE_SERVER_BIN" "$@"
|
||||
else
|
||||
cargo run --manifest-path "$ROOT/chainfire/Cargo.toml" -p chainfire-server -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_deployer_ctl_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_DEPLOYER_CTL_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_DEPLOYER_CTL_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p deployer-ctl -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
run_plasmacloud_reconciler_bin() {
|
||||
if [[ -n "${PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN:-}" ]]; then
|
||||
"$PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN" "$@"
|
||||
else
|
||||
cargo run --quiet --manifest-path "$ROOT/deployer/Cargo.toml" -p plasmacloud-reconciler -- "$@"
|
||||
fi
|
||||
}
|
||||
|
||||
tmp_dir="$(mktemp -d)"
|
||||
cf_pid=""
|
||||
redfish_pid=""
|
||||
|
||||
cleanup() {
|
||||
set +e
|
||||
if [[ -n "$redfish_pid" ]]; then
|
||||
kill "$redfish_pid" 2>/dev/null || true
|
||||
wait "$redfish_pid" 2>/dev/null || true
|
||||
fi
|
||||
if [[ -n "$cf_pid" ]]; then
|
||||
kill "$cf_pid" 2>/dev/null || true
|
||||
wait "$cf_pid" 2>/dev/null || true
|
||||
fi
|
||||
rm -rf "$tmp_dir"
|
||||
}
|
||||
|
||||
trap cleanup EXIT
|
||||
|
||||
free_port() {
|
||||
python3 - <<'PY'
|
||||
import socket
|
||||
s = socket.socket()
|
||||
s.bind(("127.0.0.1", 0))
|
||||
print(s.getsockname()[1])
|
||||
s.close()
|
||||
PY
|
||||
}
|
||||
|
||||
wait_for_port() {
|
||||
local host="$1"
|
||||
local port="$2"
|
||||
local timeout_secs="${3:-60}"
|
||||
local deadline=$((SECONDS + timeout_secs))
|
||||
|
||||
while (( SECONDS < deadline )); do
|
||||
if python3 - "$host" "$port" <<'PY'
|
||||
import socket
|
||||
import sys
|
||||
|
||||
host = sys.argv[1]
|
||||
port = int(sys.argv[2])
|
||||
|
||||
with socket.socket() as sock:
|
||||
sock.settimeout(0.5)
|
||||
try:
|
||||
sock.connect((host, port))
|
||||
except OSError:
|
||||
raise SystemExit(1)
|
||||
raise SystemExit(0)
|
||||
PY
|
||||
then
|
||||
return 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
|
||||
echo "timed out waiting for ${host}:${port}" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
api_port="$(free_port)"
|
||||
http_port="$(free_port)"
|
||||
raft_port="$(free_port)"
|
||||
gossip_port="$(free_port)"
|
||||
redfish_port="$(free_port)"
|
||||
|
||||
cat >"$tmp_dir/chainfire.toml" <<EOF
|
||||
[node]
|
||||
id = 1
|
||||
name = "chainfire-1"
|
||||
role = "control_plane"
|
||||
|
||||
[storage]
|
||||
data_dir = "$tmp_dir/chainfire-data"
|
||||
|
||||
[network]
|
||||
api_addr = "127.0.0.1:${api_port}"
|
||||
http_addr = "127.0.0.1:${http_port}"
|
||||
raft_addr = "127.0.0.1:${raft_port}"
|
||||
gossip_addr = "127.0.0.1:${gossip_port}"
|
||||
|
||||
[cluster]
|
||||
id = 1
|
||||
initial_members = []
|
||||
bootstrap = true
|
||||
|
||||
[raft]
|
||||
role = "voter"
|
||||
EOF
|
||||
|
||||
cat >"$tmp_dir/mock-redfish.py" <<'PY'
|
||||
import http.server
|
||||
import json
|
||||
import sys
|
||||
|
||||
port = int(sys.argv[1])
|
||||
log_path = sys.argv[2]
|
||||
|
||||
class Handler(http.server.BaseHTTPRequestHandler):
|
||||
def log_message(self, format, *args):
|
||||
pass
|
||||
|
||||
def do_GET(self):
|
||||
if self.path == "/redfish/v1/Systems/node01":
|
||||
body = json.dumps({"PowerState": "On"}).encode("utf-8")
|
||||
self.send_response(200)
|
||||
self.send_header("Content-Type", "application/json")
|
||||
self.send_header("Content-Length", str(len(body)))
|
||||
self.end_headers()
|
||||
self.wfile.write(body)
|
||||
return
|
||||
self.send_error(404)
|
||||
|
||||
def do_POST(self):
|
||||
if self.path != "/redfish/v1/Systems/node01/Actions/ComputerSystem.Reset":
|
||||
self.send_error(404)
|
||||
return
|
||||
length = int(self.headers.get("Content-Length", "0"))
|
||||
payload = self.rfile.read(length).decode("utf-8")
|
||||
with open(log_path, "a", encoding="utf-8") as handle:
|
||||
handle.write(payload + "\n")
|
||||
self.send_response(204)
|
||||
self.end_headers()
|
||||
|
||||
server = http.server.ThreadingHTTPServer(("127.0.0.1", port), Handler)
|
||||
server.serve_forever()
|
||||
PY
|
||||
|
||||
echo "Starting ChainFire on 127.0.0.1:${api_port}"
|
||||
run_chainfire_server_bin --config "$tmp_dir/chainfire.toml" >"$tmp_dir/chainfire.log" 2>&1 &
|
||||
cf_pid="$!"
|
||||
wait_for_port "127.0.0.1" "$api_port" 120
|
||||
wait_for_port "127.0.0.1" "$http_port" 120
|
||||
|
||||
echo "Starting mock Redfish on 127.0.0.1:${redfish_port}"
|
||||
python3 "$tmp_dir/mock-redfish.py" "$redfish_port" "$tmp_dir/redfish.log" >"$tmp_dir/redfish.stdout" 2>&1 &
|
||||
redfish_pid="$!"
|
||||
wait_for_port "127.0.0.1" "$redfish_port" 30
|
||||
|
||||
cat >"$tmp_dir/cluster.yaml" <<EOF
|
||||
cluster:
|
||||
cluster_id: test-cluster
|
||||
environment: dev
|
||||
|
||||
node_classes:
|
||||
- name: worker-linux
|
||||
roles:
|
||||
- worker
|
||||
labels:
|
||||
tier: general
|
||||
|
||||
pools:
|
||||
- name: general
|
||||
node_class: worker-linux
|
||||
labels:
|
||||
env: dev
|
||||
|
||||
nodes:
|
||||
- node_id: node01
|
||||
hostname: node01
|
||||
ip: 10.0.0.11
|
||||
roles:
|
||||
- worker
|
||||
labels:
|
||||
tier: general
|
||||
pool: general
|
||||
node_class: worker-linux
|
||||
state: active
|
||||
commission_state: commissioned
|
||||
install_state: installed
|
||||
bmc_ref: "redfish+http://127.0.0.1:${redfish_port}/redfish/v1/Systems/node01"
|
||||
- node_id: node02
|
||||
hostname: node02
|
||||
ip: 10.0.0.12
|
||||
roles:
|
||||
- worker
|
||||
labels:
|
||||
tier: general
|
||||
pool: general
|
||||
node_class: worker-linux
|
||||
state: active
|
||||
commission_state: commissioned
|
||||
install_state: installed
|
||||
|
||||
host_deployments:
|
||||
- name: worker-rollout
|
||||
selector:
|
||||
roles:
|
||||
- worker
|
||||
pools:
|
||||
- general
|
||||
node_classes:
|
||||
- worker-linux
|
||||
match_labels:
|
||||
tier: general
|
||||
nixos_configuration: worker-next
|
||||
flake_ref: "github:centra/cloud"
|
||||
batch_size: 1
|
||||
max_unavailable: 1
|
||||
health_check_command:
|
||||
- "true"
|
||||
switch_action: switch
|
||||
rollback_on_failure: true
|
||||
EOF
|
||||
|
||||
chainfire_endpoint="http://127.0.0.1:${api_port}"
|
||||
|
||||
run_deployer_ctl() {
|
||||
run_deployer_ctl_bin \
|
||||
--chainfire-endpoint "$chainfire_endpoint" \
|
||||
--cluster-id test-cluster \
|
||||
--cluster-namespace photoncloud \
|
||||
--deployer-namespace deployer \
|
||||
"$@"
|
||||
}
|
||||
|
||||
run_hosts_once() {
|
||||
run_plasmacloud_reconciler_bin \
|
||||
hosts \
|
||||
--endpoint "$chainfire_endpoint" \
|
||||
--cluster-namespace photoncloud \
|
||||
--cluster-id test-cluster \
|
||||
--heartbeat-timeout-secs 300 \
|
||||
--once
|
||||
}
|
||||
|
||||
echo "Applying host lifecycle cluster config"
|
||||
run_deployer_ctl apply --config "$tmp_dir/cluster.yaml" --prune
|
||||
|
||||
echo "Running host rollout controller"
|
||||
run_hosts_once
|
||||
|
||||
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-1.json"
|
||||
python3 - "$tmp_dir/deployment-1.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
status = payload["status"]
|
||||
assert status["phase"] == "running", payload
|
||||
assert status["in_progress_nodes"] == ["node01"], payload
|
||||
assert status["failed_nodes"] == [], payload
|
||||
print("initial rollout wave validated")
|
||||
PY
|
||||
|
||||
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-1.dump"
|
||||
python3 - "$tmp_dir/nodes-1.dump" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
desired = {}
|
||||
with open(sys.argv[1], "r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
if " key=" not in line or " value=" not in line:
|
||||
continue
|
||||
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
|
||||
if not key.endswith("/desired-system"):
|
||||
continue
|
||||
payload = json.loads(line.split(" value=", 1)[1])
|
||||
desired[payload["node_id"]] = payload
|
||||
|
||||
assert sorted(desired) == ["node01"], desired
|
||||
assert desired["node01"]["deployment_id"] == "worker-rollout", desired
|
||||
print("desired-system first wave validated")
|
||||
PY
|
||||
|
||||
echo "Pausing and resuming deployment via CLI"
|
||||
run_deployer_ctl deployment pause --name worker-rollout >"$tmp_dir/pause.json"
|
||||
python3 - "$tmp_dir/pause.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
assert payload["paused"] is True, payload
|
||||
assert payload["paused_by_operator"] is True, payload
|
||||
print("pause command validated")
|
||||
PY
|
||||
run_deployer_ctl deployment resume --name worker-rollout >"$tmp_dir/resume.json"
|
||||
python3 - "$tmp_dir/resume.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
assert payload["paused"] is False, payload
|
||||
assert payload["paused_by_operator"] is False, payload
|
||||
print("resume command validated")
|
||||
PY
|
||||
|
||||
echo "Marking node01 rollout complete and reconciling next wave"
|
||||
run_deployer_ctl node set-observed \
|
||||
--node-id node01 \
|
||||
--status active \
|
||||
--nixos-configuration worker-next >/dev/null
|
||||
run_hosts_once
|
||||
|
||||
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-2.json"
|
||||
python3 - "$tmp_dir/deployment-2.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
status = payload["status"]
|
||||
assert status["completed_nodes"] == ["node01"], payload
|
||||
assert status["in_progress_nodes"] == ["node02"], payload
|
||||
print("second rollout wave validated")
|
||||
PY
|
||||
|
||||
echo "Marking node02 rollout failed and validating auto-pause"
|
||||
run_deployer_ctl node set-observed \
|
||||
--node-id node02 \
|
||||
--status rolled-back \
|
||||
--nixos-configuration worker-next >/dev/null
|
||||
run_hosts_once
|
||||
|
||||
run_deployer_ctl deployment inspect --name worker-rollout --format json >"$tmp_dir/deployment-3.json"
|
||||
python3 - "$tmp_dir/deployment-3.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
status = payload["status"]
|
||||
assert status["paused"] is True, payload
|
||||
assert status["failed_nodes"] == ["node02"], payload
|
||||
print("auto-pause on failure validated")
|
||||
PY
|
||||
|
||||
echo "Refreshing power state through Redfish"
|
||||
run_deployer_ctl node power --node-id node01 --action refresh >"$tmp_dir/node-power.json"
|
||||
python3 - "$tmp_dir/node-power.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
assert payload["power_state"] == "on", payload
|
||||
print("power refresh validated")
|
||||
PY
|
||||
|
||||
echo "Requesting reinstall with power cycle"
|
||||
run_deployer_ctl node reinstall --node-id node01 --power-cycle >"$tmp_dir/node-reinstall.json"
|
||||
python3 - "$tmp_dir/node-reinstall.json" "$tmp_dir/redfish.log" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
assert payload["state"] == "provisioning", payload
|
||||
assert payload["install_state"] == "reinstall_requested", payload
|
||||
assert payload["power_state"] == "cycling", payload
|
||||
|
||||
lines = [line.strip() for line in open(sys.argv[2], "r", encoding="utf-8") if line.strip()]
|
||||
assert any('"ResetType":"PowerCycle"' in line for line in lines), lines
|
||||
print("reinstall orchestration validated")
|
||||
PY
|
||||
|
||||
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/node01" >"$tmp_dir/node01-post-reinstall.dump"
|
||||
python3 - "$tmp_dir/node01-post-reinstall.dump" <<'PY'
|
||||
import sys
|
||||
|
||||
lines = [line.strip() for line in open(sys.argv[1], "r", encoding="utf-8")]
|
||||
assert not any("/desired-system" in line for line in lines), lines
|
||||
assert not any("/observed-system" in line for line in lines), lines
|
||||
print("reinstall state cleanup validated")
|
||||
PY
|
||||
|
||||
echo "Aborting deployment and clearing desired-system"
|
||||
run_deployer_ctl deployment abort --name worker-rollout >"$tmp_dir/abort.json"
|
||||
python3 - "$tmp_dir/abort.json" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
|
||||
assert payload["phase"] == "aborted", payload
|
||||
assert payload["paused"] is True, payload
|
||||
print("abort command validated")
|
||||
PY
|
||||
|
||||
run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/nodes/" >"$tmp_dir/nodes-2.dump"
|
||||
python3 - "$tmp_dir/nodes-2.dump" <<'PY'
|
||||
import json
|
||||
import sys
|
||||
|
||||
desired_nodes = []
|
||||
with open(sys.argv[1], "r", encoding="utf-8") as handle:
|
||||
for line in handle:
|
||||
if " key=" not in line or " value=" not in line:
|
||||
continue
|
||||
key = line.split(" key=", 1)[1].split(" value=", 1)[0]
|
||||
if not key.endswith("/desired-system"):
|
||||
continue
|
||||
payload = json.loads(line.split(" value=", 1)[1])
|
||||
if payload.get("deployment_id") == "worker-rollout":
|
||||
desired_nodes.append(payload["node_id"])
|
||||
|
||||
assert desired_nodes == [], desired_nodes
|
||||
print("desired-system cleanup validated")
|
||||
PY
|
||||
|
||||
echo "Host lifecycle E2E verification passed"
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
# Storage Benchmarks
|
||||
|
||||
Generated on 2026-03-10T20:02:00+09:00 with:
|
||||
Generated on 2026-03-27T12:08:47+09:00 with:
|
||||
|
||||
```bash
|
||||
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
|
||||
nix run ./nix/test-cluster#cluster -- bench-storage
|
||||
```
|
||||
|
||||
## CoronaFS
|
||||
|
|
@ -12,30 +12,35 @@ Cluster network baseline, measured with `iperf3` from `node04` to `node01` befor
|
|||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| TCP throughput | 22.83 MiB/s |
|
||||
| TCP retransmits | 78 |
|
||||
| TCP throughput | 45.92 MiB/s |
|
||||
| TCP retransmits | 193 |
|
||||
|
||||
Measured from `node04`.
|
||||
Local worker disk is the baseline. CoronaFS is the shared block volume path used for mutable VM disks, exported from `node01` over NBD.
|
||||
Local worker disk is the baseline. CoronaFS now has two relevant data paths in the lab: the controller export sourced from `node01`, and the node-local export materialized onto the worker that actually attaches the mutable VM disk.
|
||||
|
||||
| Metric | Local Disk | CoronaFS |
|
||||
|---|---:|---:|
|
||||
| Sequential write | 26.36 MiB/s | 5.24 MiB/s |
|
||||
| Sequential read | 348.77 MiB/s | 10.08 MiB/s |
|
||||
| 4k random read | 1243 IOPS | 145 IOPS |
|
||||
| Metric | Local Disk | Controller Export | Node-local Export |
|
||||
|---|---:|---:|---:|
|
||||
| Sequential write | 679.05 MiB/s | 30.35 MiB/s | 395.06 MiB/s |
|
||||
| Sequential read | 2723.40 MiB/s | 42.70 MiB/s | 709.14 MiB/s |
|
||||
| 4k random read | 16958 IOPS | 2034 IOPS | 5087 IOPS |
|
||||
| 4k queued random read (`iodepth=32`) | 106026 IOPS | 14261 IOPS | 28898 IOPS |
|
||||
|
||||
Queue-depth profile (`libaio`, `iodepth=32`) from the same worker:
|
||||
|
||||
| Metric | Local Disk | CoronaFS |
|
||||
|---|---:|---:|
|
||||
| Depth-32 write | 27.12 MiB/s | 11.42 MiB/s |
|
||||
| Depth-32 read | 4797.47 MiB/s | 10.06 MiB/s |
|
||||
| Metric | Local Disk | Controller Export | Node-local Export |
|
||||
|---|---:|---:|---:|
|
||||
| Depth-32 write | 3417.45 MiB/s | 39.26 MiB/s | 178.04 MiB/s |
|
||||
| Depth-32 read | 12996.47 MiB/s | 55.71 MiB/s | 112.88 MiB/s |
|
||||
|
||||
Cross-worker shared-volume visibility, measured by writing on `node04` and reading from `node05` over the same CoronaFS NBD export:
|
||||
Node-local materialization timing and target-node steady-state read path:
|
||||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| Cross-worker sequential read | 17.72 MiB/s |
|
||||
| Node04 materialize latency | 9.23 s |
|
||||
| Node05 materialize latency | 5.82 s |
|
||||
| Node05 node-local sequential read | 709.14 MiB/s |
|
||||
|
||||
PlasmaVMC now prefers the worker-local CoronaFS export for mutable node-local volumes, even when the underlying materialization is a qcow2 overlay. The VM runtime section below is therefore the closest end-to-end proxy for real local-attach VM I/O, while the node-local export numbers remain useful for CoronaFS service consumers and for diagnosing exporter overhead.
|
||||
|
||||
## LightningStor
|
||||
|
||||
|
|
@ -46,16 +51,16 @@ Cluster network baseline for this client, measured with `iperf3` from `node03` t
|
|||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| TCP throughput | 18.35 MiB/s |
|
||||
| TCP retransmits | 78 |
|
||||
| TCP throughput | 45.99 MiB/s |
|
||||
| TCP retransmits | 207 |
|
||||
|
||||
### Large-object path
|
||||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| Object size | 256 MiB |
|
||||
| Upload throughput | 8.11 MiB/s |
|
||||
| Download throughput | 7.54 MiB/s |
|
||||
| Upload throughput | 18.20 MiB/s |
|
||||
| Download throughput | 39.21 MiB/s |
|
||||
|
||||
### Small-object batch
|
||||
|
||||
|
|
@ -63,10 +68,10 @@ Measured as 32 objects of 4 MiB each (128 MiB total).
|
|||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| Batch upload throughput | 0.81 MiB/s |
|
||||
| Batch download throughput | 0.83 MiB/s |
|
||||
| PUT rate | 0.20 objects/s |
|
||||
| GET rate | 0.21 objects/s |
|
||||
| Batch upload throughput | 18.96 MiB/s |
|
||||
| Batch download throughput | 39.88 MiB/s |
|
||||
| PUT rate | 4.74 objects/s |
|
||||
| GET rate | 9.97 objects/s |
|
||||
|
||||
### Parallel small-object batch
|
||||
|
||||
|
|
@ -74,34 +79,57 @@ Measured as the same 32 objects of 4 MiB each, but with 8 concurrent client jobs
|
|||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| Parallel batch upload throughput | 3.03 MiB/s |
|
||||
| Parallel batch download throughput | 2.89 MiB/s |
|
||||
| Parallel PUT rate | 0.76 objects/s |
|
||||
| Parallel GET rate | 0.72 objects/s |
|
||||
| Parallel batch upload throughput | 16.23 MiB/s |
|
||||
| Parallel batch download throughput | 26.07 MiB/s |
|
||||
| Parallel PUT rate | 4.06 objects/s |
|
||||
| Parallel GET rate | 6.52 objects/s |
|
||||
|
||||
## VM Image Path
|
||||
|
||||
Measured against the real `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` path on `node01`.
|
||||
Measured against the `PlasmaVMC -> LightningStor artifact -> CoronaFS-backed managed volume` clone path on `node01`.
|
||||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| Guest image artifact size | 2017 MiB |
|
||||
| Guest image virtual size | 4096 MiB |
|
||||
| `CreateImage` latency | 176.03 s |
|
||||
| First image-backed `CreateVolume` latency | 76.51 s |
|
||||
| Second image-backed `CreateVolume` latency | 170.49 s |
|
||||
| `CreateImage` latency | 66.49 s |
|
||||
| First image-backed `CreateVolume` latency | 16.86 s |
|
||||
| Second image-backed `CreateVolume` latency | 0.12 s |
|
||||
|
||||
## VM Runtime Path
|
||||
|
||||
Measured against the real `StartVm -> qemu attach -> guest boot -> guest fio` path on a worker node, using a CoronaFS-backed root disk and data disk.
|
||||
|
||||
| Metric | Result |
|
||||
|---|---:|
|
||||
| `StartVm` to qemu attach | 0.60 s |
|
||||
| `StartVm` to guest benchmark result | 35.69 s |
|
||||
| Guest sequential write | 123.49252223968506 MiB/s |
|
||||
| Guest sequential read | 1492.7113695144653 MiB/s |
|
||||
| Guest 4k random read | 25550 IOPS |
|
||||
|
||||
## Assessment
|
||||
|
||||
- CoronaFS shared-volume reads are currently 2.9% of the measured local-disk baseline on this nested-QEMU lab cluster.
|
||||
- CoronaFS 4k random reads are currently 11.7% of the measured local-disk baseline.
|
||||
- CoronaFS cross-worker reads are currently 5.1% of the measured local-disk sequential-read baseline, which is the more relevant signal for VM restart and migration paths.
|
||||
- CoronaFS sequential reads are currently 44.2% of the measured node04->node01 TCP baseline, which helps separate NBD/export overhead from raw cluster-network limits.
|
||||
- CoronaFS depth-32 reads are currently 0.2% of the local depth-32 baseline, which is a better proxy for queued guest I/O than the single-depth path.
|
||||
- The shared-volume path is functionally correct for mutable VM disks and migration tests, but its read-side throughput is still too low to call production-ready for heavier VM workloads.
|
||||
- LightningStor's replicated S3 path is working correctly, but 8.11 MiB/s upload and 7.54 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
|
||||
- LightningStor large-object downloads are currently 41.1% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
|
||||
- LightningStor's small-object batch path is also functional, but 0.20 PUT/s and 0.21 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
|
||||
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 0.76 PUT/s and 0.72 GET/s.
|
||||
- The VM image path is now measured directly rather than inferred. The cold `CreateVolume` path includes artifact fetch plus CoronaFS population; the warm `CreateVolume` path isolates repeated CoronaFS population from an already cached image.
|
||||
- CoronaFS controller-export reads are currently 1.6% of the measured local-disk baseline on this nested-QEMU lab cluster.
|
||||
- CoronaFS controller-export 4k random reads are currently 12.0% of the measured local-disk baseline.
|
||||
- CoronaFS controller-export queued 4k random reads are currently 13.5% of the measured local queued-random-read baseline.
|
||||
- CoronaFS controller-export sequential reads are currently 93.0% of the measured node04->node01 TCP baseline, which isolates the centralized source path from raw cluster-network limits.
|
||||
- CoronaFS controller-export depth-32 reads are currently 0.4% of the local depth-32 baseline.
|
||||
- CoronaFS node-local reads are currently 26.0% of the measured local-disk baseline, which is the more relevant steady-state signal for mutable VM disks after attachment.
|
||||
- CoronaFS node-local 4k random reads are currently 30.0% of the measured local-disk baseline.
|
||||
- CoronaFS node-local queued 4k random reads are currently 27.3% of the measured local queued-random-read baseline.
|
||||
- CoronaFS node-local depth-32 reads are currently 0.9% of the local depth-32 baseline.
|
||||
- The target worker's node-local read path is 26.0% of the measured local sequential-read baseline after materialization, which is the better proxy for restart and migration steady state than the old shared-export read.
|
||||
- PlasmaVMC now attaches writable node-local volumes through the worker-local CoronaFS export, so the guest-runtime section should be treated as the real local VM steady-state path rather than the node-local export numbers alone.
|
||||
- CoronaFS single-depth writes remain sensitive to the nested-QEMU/VDE lab transport, so the queued-depth and guest-runtime numbers are still the more reliable proxy for real VM workload behavior than the single-stream write figure alone.
|
||||
- The central export path is now best understood as a source/materialization path; the worker-local export is the path that should determine VM-disk readiness going forward.
|
||||
- LightningStor's replicated S3 path is working correctly, but 18.20 MiB/s upload and 39.21 MiB/s download are still lab-grade numbers rather than strong object-store throughput.
|
||||
- LightningStor large-object downloads are currently 85.3% of the same node04->node01 TCP baseline, which indicates how much of the headroom is being lost above the raw network path.
|
||||
- The current S3 frontend tuning baseline is the built-in 16 MiB streaming threshold with multipart PUT/FETCH concurrency of 8; that combination is the best default observed on this lab cluster so far.
|
||||
- LightningStor uploads should be read against the replication write quorum and the same ~45.99 MiB/s lab network ceiling; this environment still limits end-to-end throughput well before modern bare-metal NICs would.
|
||||
- LightningStor's small-object batch path is also functional, but 4.74 PUT/s and 9.97 GET/s still indicate a lab cluster rather than a tuned object-storage deployment.
|
||||
- The parallel small-object profile is the more relevant control-plane/object-ingest signal; it currently reaches 4.06 PUT/s and 6.52 GET/s.
|
||||
- The VM image section measures clone/materialization cost, not guest runtime I/O.
|
||||
- The PlasmaVMC local image-backed clone fast path is now active again; a 0.12 s second clone indicates the CoronaFS qcow2 backing-file path is being hit on node01 rather than falling back to eager raw materialization.
|
||||
- The VM runtime section is the real `PlasmaVMC + CoronaFS + QEMU virtio-blk + guest kernel` path; use it to judge whether QEMU/NBD tuning is helping.
|
||||
- The local sequential-write baseline is noisy in this environment, so the read and random-read deltas are the more reliable signal.
|
||||
|
|
|
|||
574
fiberlb/Cargo.lock
generated
574
fiberlb/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
17
flake.lock
generated
17
flake.lock
generated
|
|
@ -76,7 +76,8 @@
|
|||
"flake-utils": "flake-utils",
|
||||
"nix-nos": "nix-nos",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-overlay": "rust-overlay"
|
||||
"rust-overlay": "rust-overlay",
|
||||
"systems": "systems_2"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
|
|
@ -113,6 +114,20 @@
|
|||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_2": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "systems",
|
||||
"type": "indirect"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
|
|||
467
flake.nix
467
flake.nix
|
|
@ -33,7 +33,7 @@
|
|||
# ============================================================================
|
||||
# OUTPUTS: What this flake provides
|
||||
# ============================================================================
|
||||
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos }:
|
||||
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
# Apply rust-overlay to get rust-bin attribute
|
||||
|
|
@ -139,6 +139,301 @@
|
|||
);
|
||||
};
|
||||
|
||||
flakeInputsBlock = ''
|
||||
inputs = {
|
||||
# Use unstable nixpkgs for latest packages
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
|
||||
# Rust overlay for managing Rust toolchains
|
||||
rust-overlay = {
|
||||
url = "github:oxalica/rust-overlay";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
# Flake utilities for multi-system support
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
|
||||
# Disko for declarative disk partitioning
|
||||
disko = {
|
||||
url = "github:nix-community/disko";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
# Nix-NOS generic network operating system modules
|
||||
nix-nos = {
|
||||
url = "path:./nix-nos";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
'';
|
||||
|
||||
bundledInputsBlock = ''
|
||||
inputs = {
|
||||
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
|
||||
|
||||
rust-overlay = {
|
||||
url = "path:./.bundle-inputs/rust-overlay";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
flake-utils = {
|
||||
url = "path:./.bundle-inputs/flake-utils";
|
||||
inputs.systems.follows = "systems";
|
||||
};
|
||||
|
||||
systems.url = "path:./.bundle-inputs/systems";
|
||||
|
||||
disko = {
|
||||
url = "path:./.bundle-inputs/disko";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
nix-nos = {
|
||||
url = "path:./nix-nos";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
'';
|
||||
|
||||
flakeHeaderBlock = ''
|
||||
# ============================================================================
|
||||
# INPUTS: External dependencies
|
||||
# ============================================================================
|
||||
inputs = {
|
||||
# Use unstable nixpkgs for latest packages
|
||||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
|
||||
# Rust overlay for managing Rust toolchains
|
||||
rust-overlay = {
|
||||
url = "github:oxalica/rust-overlay";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
# Flake utilities for multi-system support
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
|
||||
# Disko for declarative disk partitioning
|
||||
disko = {
|
||||
url = "github:nix-community/disko";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
# Nix-NOS generic network operating system modules
|
||||
nix-nos = {
|
||||
url = "path:./nix-nos";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
|
||||
# ============================================================================
|
||||
# OUTPUTS: What this flake provides
|
||||
# ============================================================================
|
||||
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
|
||||
'';
|
||||
|
||||
bundledHeaderBlock = ''
|
||||
# ============================================================================
|
||||
# INPUTS: External dependencies
|
||||
# ============================================================================
|
||||
inputs = {
|
||||
nixpkgs.url = "path:./.bundle-inputs/nixpkgs";
|
||||
|
||||
rust-overlay = {
|
||||
url = "path:./.bundle-inputs/rust-overlay";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
flake-utils = {
|
||||
url = "path:./.bundle-inputs/flake-utils";
|
||||
inputs.systems.follows = "systems";
|
||||
};
|
||||
|
||||
systems.url = "path:./.bundle-inputs/systems";
|
||||
|
||||
disko = {
|
||||
url = "path:./.bundle-inputs/disko";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
|
||||
nix-nos = {
|
||||
url = "path:./nix-nos";
|
||||
inputs.nixpkgs.follows = "nixpkgs";
|
||||
};
|
||||
};
|
||||
|
||||
# ============================================================================
|
||||
# OUTPUTS: What this flake provides
|
||||
# ============================================================================
|
||||
outputs = { self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems ? null }:
|
||||
'';
|
||||
|
||||
bundledFlakeNix =
|
||||
pkgs.writeText
|
||||
"plasmacloud-bundled-flake.nix"
|
||||
(
|
||||
builtins.replaceStrings
|
||||
[ flakeHeaderBlock ]
|
||||
[ bundledHeaderBlock ]
|
||||
(builtins.readFile ./flake.nix)
|
||||
);
|
||||
|
||||
bundledFlakeHeaderFile =
|
||||
pkgs.writeText "plasmacloud-bundled-flake-header" bundledHeaderBlock;
|
||||
|
||||
baseFlakeLock = builtins.fromJSON (builtins.readFile ./flake.lock);
|
||||
|
||||
bundleInputRelPaths = {
|
||||
nixpkgs = "./.bundle-inputs/nixpkgs";
|
||||
"rust-overlay" = "./.bundle-inputs/rust-overlay";
|
||||
"flake-utils" = "./.bundle-inputs/flake-utils";
|
||||
disko = "./.bundle-inputs/disko";
|
||||
systems = "./.bundle-inputs/systems";
|
||||
};
|
||||
|
||||
fetchLockedInput =
|
||||
nodeName:
|
||||
let
|
||||
tree = builtins.fetchTree baseFlakeLock.nodes.${nodeName}.locked;
|
||||
in
|
||||
if builtins.isAttrs tree && tree ? outPath then tree.outPath else tree;
|
||||
|
||||
vendoredFlakeInputs = {
|
||||
nixpkgs = fetchLockedInput "nixpkgs";
|
||||
"rust-overlay" = fetchLockedInput "rust-overlay";
|
||||
"flake-utils" = fetchLockedInput "flake-utils";
|
||||
disko = fetchLockedInput "disko";
|
||||
systems = fetchLockedInput "systems";
|
||||
};
|
||||
|
||||
makeBundledLockNode =
|
||||
nodeName: relPath:
|
||||
let
|
||||
node = baseFlakeLock.nodes.${nodeName};
|
||||
in
|
||||
node
|
||||
// {
|
||||
locked = {
|
||||
type = "path";
|
||||
path = relPath;
|
||||
};
|
||||
original = {
|
||||
type = "path";
|
||||
path = relPath;
|
||||
};
|
||||
};
|
||||
|
||||
bundledFlakeLock = baseFlakeLock // {
|
||||
nodes =
|
||||
baseFlakeLock.nodes
|
||||
// {
|
||||
root =
|
||||
baseFlakeLock.nodes.root
|
||||
// {
|
||||
inputs =
|
||||
baseFlakeLock.nodes.root.inputs
|
||||
// {
|
||||
systems = "systems";
|
||||
};
|
||||
};
|
||||
nixpkgs = makeBundledLockNode "nixpkgs" bundleInputRelPaths.nixpkgs;
|
||||
"rust-overlay" = makeBundledLockNode "rust-overlay" bundleInputRelPaths."rust-overlay";
|
||||
"flake-utils" = makeBundledLockNode "flake-utils" bundleInputRelPaths."flake-utils";
|
||||
disko = makeBundledLockNode "disko" bundleInputRelPaths.disko;
|
||||
systems = makeBundledLockNode "systems" bundleInputRelPaths.systems;
|
||||
};
|
||||
};
|
||||
|
||||
bundledFlakeLockFile =
|
||||
pkgs.writeText "plasmacloud-bundled-flake.lock" (builtins.toJSON bundledFlakeLock);
|
||||
|
||||
inBundledEval = builtins.pathExists ./.bundle-eval-marker;
|
||||
|
||||
bundledFlakeRootDrv = pkgs.runCommand "plasmacloud-bundled-flake-root" {
|
||||
nativeBuildInputs = [
|
||||
pkgs.coreutils
|
||||
pkgs.python3
|
||||
];
|
||||
} ''
|
||||
mkdir -p "$out"
|
||||
cp -a ${flakeBundleSrc}/. "$out"/
|
||||
chmod -R u+w "$out"
|
||||
touch "$out/.bundle-eval-marker"
|
||||
mkdir -p "$out/.bundle-inputs"
|
||||
cp -a ${vendoredFlakeInputs.nixpkgs} "$out/.bundle-inputs/nixpkgs"
|
||||
cp -a ${vendoredFlakeInputs."rust-overlay"} "$out/.bundle-inputs/rust-overlay"
|
||||
cp -a ${vendoredFlakeInputs."flake-utils"} "$out/.bundle-inputs/flake-utils"
|
||||
cp -a ${vendoredFlakeInputs.disko} "$out/.bundle-inputs/disko"
|
||||
cp -a ${vendoredFlakeInputs.systems} "$out/.bundle-inputs/systems"
|
||||
cp ${bundledFlakeLockFile} "$out/flake.lock"
|
||||
python3 - <<'PY' "$out/flake.nix" ${bundledFlakeHeaderFile}
|
||||
from pathlib import Path
|
||||
import re
|
||||
import sys
|
||||
|
||||
flake_path = Path(sys.argv[1])
|
||||
header = Path(sys.argv[2]).read_text()
|
||||
source = flake_path.read_text()
|
||||
pattern = re.compile(
|
||||
r" # ============================================================================\n"
|
||||
r" # INPUTS: External dependencies\n"
|
||||
r" # ============================================================================\n"
|
||||
r" inputs = \{.*?\n"
|
||||
r" # ============================================================================\n"
|
||||
r" # OUTPUTS: What this flake provides\n"
|
||||
r" # ============================================================================\n"
|
||||
r" outputs = \{ self, nixpkgs, rust-overlay, flake-utils, disko, nix-nos, systems \? null \}:",
|
||||
re.S,
|
||||
)
|
||||
rewritten, count = pattern.subn(header.rstrip("\n"), source, count=1)
|
||||
if count != 1:
|
||||
raise SystemExit(f"expected to rewrite 1 flake header, rewrote {count}")
|
||||
flake_path.write_text(rewritten)
|
||||
PY
|
||||
'';
|
||||
|
||||
bundledFlakeRoot =
|
||||
if inBundledEval then
|
||||
null
|
||||
else
|
||||
builtins.path {
|
||||
path = bundledFlakeRootDrv;
|
||||
name = "plasmacloud-bundled-flake-root-src";
|
||||
};
|
||||
|
||||
bundledFlakeRootNarHashFile =
|
||||
if inBundledEval then
|
||||
null
|
||||
else
|
||||
pkgs.runCommand "plasmacloud-bundled-flake-root-narhash" {
|
||||
nativeBuildInputs = [ pkgs.nix ];
|
||||
} ''
|
||||
${pkgs.nix}/bin/nix \
|
||||
--extra-experimental-features nix-command \
|
||||
hash path --sri ${bundledFlakeRoot} \
|
||||
| tr -d '\n' > "$out"
|
||||
'';
|
||||
|
||||
bundledFlakeRootNarHash =
|
||||
if inBundledEval then
|
||||
null
|
||||
else
|
||||
builtins.readFile bundledFlakeRootNarHashFile;
|
||||
|
||||
bundledFlake =
|
||||
if inBundledEval then
|
||||
null
|
||||
else
|
||||
builtins.getFlake (
|
||||
builtins.unsafeDiscardStringContext
|
||||
"path:${toString bundledFlakeRoot}?narHash=${bundledFlakeRootNarHash}"
|
||||
);
|
||||
|
||||
bundledVmSmokeTargetToplevel =
|
||||
if inBundledEval then
|
||||
null
|
||||
else
|
||||
bundledFlake.nixosConfigurations.vm-smoke-target.config.system.build.toplevel;
|
||||
|
||||
# Helper function to build a Rust workspace package
|
||||
# Parameters:
|
||||
# name: package name (e.g., "chainfire-server")
|
||||
|
|
@ -434,16 +729,31 @@
|
|||
description = "Node-local NixOS reconciliation agent for PhotonCloud hosts";
|
||||
};
|
||||
|
||||
plasmacloud-reconciler = buildRustWorkspace {
|
||||
name = "plasmacloud-reconciler";
|
||||
workspaceSubdir = "deployer";
|
||||
mainCrate = "plasmacloud-reconciler";
|
||||
description = "Declarative reconciler for host rollouts and published resources";
|
||||
};
|
||||
|
||||
plasmacloudFlakeBundle = pkgs.runCommand "plasmacloud-flake-bundle.tar.gz" {
|
||||
nativeBuildInputs = [ pkgs.gnutar pkgs.gzip ];
|
||||
nativeBuildInputs = [
|
||||
pkgs.coreutils
|
||||
pkgs.gnutar
|
||||
pkgs.gzip
|
||||
];
|
||||
} ''
|
||||
bundle_root="$(mktemp -d)"
|
||||
cp -a ${bundledFlakeRootDrv}/. "$bundle_root"/
|
||||
chmod -R u+w "$bundle_root"
|
||||
|
||||
tar \
|
||||
--sort=name \
|
||||
--mtime='@1' \
|
||||
--owner=0 \
|
||||
--group=0 \
|
||||
--numeric-owner \
|
||||
-C ${flakeBundleSrc} \
|
||||
-C "$bundle_root" \
|
||||
-cf - . \
|
||||
| gzip -n > "$out"
|
||||
'';
|
||||
|
|
@ -462,6 +772,7 @@
|
|||
self.nixosConfigurations.node01.config.system.build.plasmacloudDeployerClusterState;
|
||||
|
||||
vmClusterFlakeBundle = self.packages.${system}.plasmacloudFlakeBundle;
|
||||
vmSmokeBundledTargetToplevel = bundledVmSmokeTargetToplevel;
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Default package: Build all servers
|
||||
|
|
@ -484,6 +795,7 @@
|
|||
self.packages.${system}.k8shost-server
|
||||
self.packages.${system}.deployer-server
|
||||
self.packages.${system}.deployer-ctl
|
||||
self.packages.${system}.plasmacloud-reconciler
|
||||
self.packages.${system}.nix-agent
|
||||
self.packages.${system}.node-agent
|
||||
self.packages.${system}.fleet-scheduler
|
||||
|
|
@ -556,6 +868,10 @@
|
|||
drv = self.packages.${system}.deployer-ctl;
|
||||
};
|
||||
|
||||
plasmacloud-reconciler = flake-utils.lib.mkApp {
|
||||
drv = self.packages.${system}.plasmacloud-reconciler;
|
||||
};
|
||||
|
||||
nix-agent = flake-utils.lib.mkApp {
|
||||
drv = self.packages.${system}.nix-agent;
|
||||
};
|
||||
|
|
@ -568,6 +884,144 @@
|
|||
drv = self.packages.${system}.fleet-scheduler;
|
||||
};
|
||||
};
|
||||
|
||||
checks = {
|
||||
deployer-vm-smoke = pkgs.testers.runNixOSTest (
|
||||
import ./nix/tests/deployer-vm-smoke.nix {
|
||||
inherit pkgs;
|
||||
photoncloudPackages = self.packages.${system};
|
||||
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
|
||||
}
|
||||
);
|
||||
|
||||
deployer-vm-rollback = pkgs.testers.runNixOSTest (
|
||||
import ./nix/tests/deployer-vm-smoke.nix {
|
||||
inherit pkgs;
|
||||
photoncloudPackages = self.packages.${system};
|
||||
smokeTargetToplevel = self.packages.${system}.vmSmokeBundledTargetToplevel;
|
||||
desiredSystemOverrides = {
|
||||
health_check_command = [ "false" ];
|
||||
rollback_on_failure = true;
|
||||
};
|
||||
expectedStatus = "rolled-back";
|
||||
expectCurrentSystemMatchesTarget = false;
|
||||
expectMarkerPresent = false;
|
||||
}
|
||||
);
|
||||
|
||||
deployer-bootstrap-e2e = pkgs.runCommand "deployer-bootstrap-e2e" {
|
||||
nativeBuildInputs = with pkgs; [
|
||||
bash
|
||||
coreutils
|
||||
curl
|
||||
findutils
|
||||
gawk
|
||||
gnugrep
|
||||
gnused
|
||||
procps
|
||||
python3
|
||||
];
|
||||
PHOTONCLOUD_E2E_IN_NIX = "1";
|
||||
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
|
||||
"${self.packages.${system}.chainfire-server}/bin/chainfire";
|
||||
PHOTONCLOUD_DEPLOYER_SERVER_BIN =
|
||||
"${self.packages.${system}.deployer-server}/bin/deployer-server";
|
||||
PHOTONCLOUD_DEPLOYER_CTL_BIN =
|
||||
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
|
||||
} ''
|
||||
export HOME="$TMPDIR/home"
|
||||
mkdir -p "$HOME"
|
||||
export PATH="${pkgs.lib.makeBinPath [
|
||||
pkgs.bash
|
||||
pkgs.coreutils
|
||||
pkgs.curl
|
||||
pkgs.findutils
|
||||
pkgs.gawk
|
||||
pkgs.gnugrep
|
||||
pkgs.gnused
|
||||
pkgs.procps
|
||||
pkgs.python3
|
||||
]}"
|
||||
bash ${./deployer/scripts/verify-deployer-bootstrap-e2e.sh}
|
||||
touch "$out"
|
||||
'';
|
||||
|
||||
host-lifecycle-e2e = pkgs.runCommand "host-lifecycle-e2e" {
|
||||
nativeBuildInputs = with pkgs; [
|
||||
bash
|
||||
coreutils
|
||||
curl
|
||||
findutils
|
||||
gawk
|
||||
gnugrep
|
||||
gnused
|
||||
procps
|
||||
python3
|
||||
];
|
||||
PHOTONCLOUD_E2E_IN_NIX = "1";
|
||||
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
|
||||
"${self.packages.${system}.chainfire-server}/bin/chainfire";
|
||||
PHOTONCLOUD_DEPLOYER_CTL_BIN =
|
||||
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
|
||||
PHOTONCLOUD_PLASMACLOUD_RECONCILER_BIN =
|
||||
"${self.packages.${system}.plasmacloud-reconciler}/bin/plasmacloud-reconciler";
|
||||
} ''
|
||||
export HOME="$TMPDIR/home"
|
||||
mkdir -p "$HOME"
|
||||
export PATH="${pkgs.lib.makeBinPath [
|
||||
pkgs.bash
|
||||
pkgs.coreutils
|
||||
pkgs.curl
|
||||
pkgs.findutils
|
||||
pkgs.gawk
|
||||
pkgs.gnugrep
|
||||
pkgs.gnused
|
||||
pkgs.procps
|
||||
pkgs.python3
|
||||
]}"
|
||||
bash ${./deployer/scripts/verify-host-lifecycle-e2e.sh}
|
||||
touch "$out"
|
||||
'';
|
||||
|
||||
fleet-scheduler-e2e = pkgs.runCommand "fleet-scheduler-e2e" {
|
||||
nativeBuildInputs = with pkgs; [
|
||||
bash
|
||||
coreutils
|
||||
curl
|
||||
findutils
|
||||
gawk
|
||||
gnugrep
|
||||
gnused
|
||||
procps
|
||||
python3
|
||||
];
|
||||
PHOTONCLOUD_E2E_IN_NIX = "1";
|
||||
PHOTONCLOUD_CHAINFIRE_SERVER_BIN =
|
||||
"${self.packages.${system}.chainfire-server}/bin/chainfire";
|
||||
PHOTONCLOUD_DEPLOYER_CTL_BIN =
|
||||
"${self.packages.${system}.deployer-ctl}/bin/deployer-ctl";
|
||||
PHOTONCLOUD_NODE_AGENT_BIN =
|
||||
"${self.packages.${system}.node-agent}/bin/node-agent";
|
||||
PHOTONCLOUD_FLEET_SCHEDULER_BIN =
|
||||
"${self.packages.${system}.fleet-scheduler}/bin/fleet-scheduler";
|
||||
} ''
|
||||
export HOME="$TMPDIR/home"
|
||||
mkdir -p "$HOME"
|
||||
export PATH="${pkgs.lib.makeBinPath [
|
||||
pkgs.bash
|
||||
pkgs.coreutils
|
||||
pkgs.curl
|
||||
pkgs.findutils
|
||||
pkgs.gawk
|
||||
pkgs.gnugrep
|
||||
pkgs.gnused
|
||||
pkgs.procps
|
||||
pkgs.python3
|
||||
]}"
|
||||
bash ${./deployer/scripts/verify-fleet-scheduler-e2e.sh}
|
||||
touch "$out"
|
||||
'';
|
||||
};
|
||||
}
|
||||
) // {
|
||||
# ========================================================================
|
||||
|
|
@ -606,6 +1060,12 @@
|
|||
modules = [ ./nix/images/netboot-base.nix ];
|
||||
};
|
||||
|
||||
# Offline-friendly target used by deployer VM smoke tests.
|
||||
vm-smoke-target = nixpkgs.lib.nixosSystem {
|
||||
system = "x86_64-linux";
|
||||
modules = [ ./nix/images/deployer-vm-smoke-target.nix ];
|
||||
};
|
||||
|
||||
# PlasmaCloud ISO (T061.S5 - bootable ISO with cluster-config embedding)
|
||||
plasmacloud-iso = nixpkgs.lib.nixosSystem {
|
||||
system = "x86_64-linux";
|
||||
|
|
@ -732,6 +1192,7 @@
|
|||
k8shost-server = self.packages.${final.system}.k8shost-server;
|
||||
deployer-server = self.packages.${final.system}.deployer-server;
|
||||
deployer-ctl = self.packages.${final.system}.deployer-ctl;
|
||||
plasmacloud-reconciler = self.packages.${final.system}.plasmacloud-reconciler;
|
||||
plasmacloudFlakeBundle = self.packages.${final.system}.plasmacloudFlakeBundle;
|
||||
nix-agent = self.packages.${final.system}.nix-agent;
|
||||
node-agent = self.packages.${final.system}.node-agent;
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ use flaredb_proto::kvrpc::{
|
|||
use flaredb_proto::pdpb::Store;
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
use std::time::{Instant, SystemTime, UNIX_EPOCH};
|
||||
use serde::Deserialize;
|
||||
use tokio::sync::Mutex;
|
||||
use tonic::transport::Channel;
|
||||
|
|
@ -35,6 +35,7 @@ pub struct RdbClient {
|
|||
chainfire_kv_client: Option<ChainfireKvClient<Channel>>,
|
||||
|
||||
region_cache: RegionCache,
|
||||
chainfire_route_cache: Arc<Mutex<Option<ChainfireRouteSnapshot>>>,
|
||||
namespace: String,
|
||||
}
|
||||
|
||||
|
|
@ -53,10 +54,18 @@ struct ChainfireRegionInfo {
|
|||
leader_id: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ChainfireRouteSnapshot {
|
||||
stores: HashMap<u64, ChainfireStoreInfo>,
|
||||
regions: Vec<ChainfireRegionInfo>,
|
||||
fetched_at: Instant,
|
||||
}
|
||||
|
||||
impl RdbClient {
|
||||
const ROUTE_RETRY_LIMIT: usize = 12;
|
||||
const ROUTE_RETRY_BASE_DELAY_MS: u64 = 100;
|
||||
const ROUTED_RPC_TIMEOUT: Duration = Duration::from_secs(1);
|
||||
const CHAINFIRE_ROUTE_CACHE_TTL: Duration = Duration::from_secs(2);
|
||||
|
||||
pub async fn connect_with_pd(
|
||||
_server_addr: String,
|
||||
|
|
@ -70,26 +79,43 @@ impl RdbClient {
|
|||
pd_addr: String,
|
||||
namespace: impl Into<String>,
|
||||
) -> Result<Self, tonic::transport::Error> {
|
||||
let pd_endpoints = parse_transport_endpoints(&pd_addr);
|
||||
let normalized_server_addr = normalize_transport_addr(&server_addr);
|
||||
// A number of in-repo callers still pass the same address for both server and PD.
|
||||
// In that case, prefer direct routing and skip the PD lookup path entirely.
|
||||
let direct_addr = if !server_addr.is_empty() && server_addr == pd_addr {
|
||||
Some(server_addr)
|
||||
let direct_addr = if !normalized_server_addr.is_empty()
|
||||
&& pd_endpoints
|
||||
.iter()
|
||||
.any(|endpoint| normalize_transport_addr(endpoint) == normalized_server_addr)
|
||||
{
|
||||
Some(normalized_server_addr.clone())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let (tso_client, pd_client, chainfire_kv_client) = if direct_addr.is_some() {
|
||||
(None, None, None)
|
||||
} else {
|
||||
let pd_channel = Channel::from_shared(transport_endpoint(&pd_addr))
|
||||
.unwrap()
|
||||
.connect()
|
||||
.await?;
|
||||
let mut last_error = None;
|
||||
let mut clients = None;
|
||||
for endpoint in &pd_endpoints {
|
||||
let pd_channel = match Channel::from_shared(transport_endpoint(endpoint)) {
|
||||
Ok(endpoint) => match endpoint.connect().await {
|
||||
Ok(channel) => channel,
|
||||
Err(error) => {
|
||||
last_error = Some(error);
|
||||
continue;
|
||||
}
|
||||
},
|
||||
Err(_) => {
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let mut probe_client = PdClient::new(pd_channel.clone());
|
||||
let probe = probe_client
|
||||
.get_region(GetRegionRequest { key: Vec::new() })
|
||||
.await;
|
||||
|
||||
match probe {
|
||||
clients = Some(match probe {
|
||||
Err(status) if status.code() == tonic::Code::Unimplemented => (
|
||||
None,
|
||||
None,
|
||||
|
|
@ -100,6 +126,21 @@ impl RdbClient {
|
|||
Some(PdClient::new(pd_channel)),
|
||||
None,
|
||||
),
|
||||
});
|
||||
break;
|
||||
}
|
||||
if let Some(clients) = clients {
|
||||
clients
|
||||
} else if let Some(error) = last_error {
|
||||
return Err(error);
|
||||
} else {
|
||||
return Err(
|
||||
Channel::from_shared("http://127.0.0.1:1".to_string())
|
||||
.unwrap()
|
||||
.connect()
|
||||
.await
|
||||
.expect_err("unreachable fallback endpoint should fail to connect"),
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
|
|
@ -111,6 +152,7 @@ impl RdbClient {
|
|||
chainfire_kv_client,
|
||||
region_cache: RegionCache::new(),
|
||||
namespace: namespace.into(),
|
||||
chainfire_route_cache: Arc::new(Mutex::new(None)),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -119,17 +161,51 @@ impl RdbClient {
|
|||
server_addr: String,
|
||||
namespace: impl Into<String>,
|
||||
) -> Result<Self, tonic::transport::Error> {
|
||||
let ep = transport_endpoint(&server_addr);
|
||||
let channel = Channel::from_shared(ep).unwrap().connect().await?;
|
||||
let direct_endpoints = parse_transport_endpoints(&server_addr);
|
||||
let mut last_error = None;
|
||||
let mut selected_addr = None;
|
||||
let mut channel = None;
|
||||
|
||||
for endpoint in &direct_endpoints {
|
||||
match Channel::from_shared(transport_endpoint(endpoint)) {
|
||||
Ok(endpoint_builder) => match endpoint_builder.connect().await {
|
||||
Ok(connected) => {
|
||||
selected_addr = Some(endpoint.clone());
|
||||
channel = Some(connected);
|
||||
break;
|
||||
}
|
||||
Err(error) => {
|
||||
last_error = Some(error);
|
||||
}
|
||||
},
|
||||
Err(_) => {}
|
||||
}
|
||||
}
|
||||
|
||||
let selected_addr = if let Some(addr) = selected_addr {
|
||||
addr
|
||||
} else if let Some(error) = last_error {
|
||||
return Err(error);
|
||||
} else {
|
||||
return Err(
|
||||
Channel::from_shared("http://127.0.0.1:1".to_string())
|
||||
.unwrap()
|
||||
.connect()
|
||||
.await
|
||||
.expect_err("unreachable fallback endpoint should fail to connect"),
|
||||
);
|
||||
};
|
||||
let channel = channel.expect("direct connect should produce a channel when selected");
|
||||
|
||||
Ok(Self {
|
||||
channels: Arc::new(Mutex::new(HashMap::new())),
|
||||
direct_addr: Some(server_addr),
|
||||
direct_addr: Some(selected_addr),
|
||||
tso_client: Some(TsoClient::new(channel.clone())),
|
||||
pd_client: Some(PdClient::new(channel)),
|
||||
chainfire_kv_client: None,
|
||||
region_cache: RegionCache::new(),
|
||||
namespace: namespace.into(),
|
||||
chainfire_route_cache: Arc::new(Mutex::new(None)),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -165,6 +241,7 @@ impl RdbClient {
|
|||
}
|
||||
|
||||
self.region_cache.clear().await;
|
||||
self.invalidate_chainfire_route_cache().await;
|
||||
|
||||
if let Some(chainfire_kv_client) = &self.chainfire_kv_client {
|
||||
return self.resolve_addr_via_chainfire(key, chainfire_kv_client.clone()).await;
|
||||
|
|
@ -183,10 +260,6 @@ impl RdbClient {
|
|||
Err(tonic::Status::not_found("region not found"))
|
||||
}
|
||||
|
||||
async fn get_channel(&self, addr: &str) -> Result<Channel, tonic::transport::Error> {
|
||||
Self::get_channel_from_map(&self.channels, addr).await
|
||||
}
|
||||
|
||||
async fn get_channel_from_map(
|
||||
channels: &Arc<Mutex<HashMap<String, Channel>>>,
|
||||
addr: &str,
|
||||
|
|
@ -207,6 +280,73 @@ impl RdbClient {
|
|||
map.remove(addr);
|
||||
}
|
||||
|
||||
async fn invalidate_chainfire_route_cache(&self) {
|
||||
let mut cache = self.chainfire_route_cache.lock().await;
|
||||
*cache = None;
|
||||
}
|
||||
|
||||
async fn chainfire_route_snapshot(
|
||||
&self,
|
||||
mut kv_client: ChainfireKvClient<Channel>,
|
||||
force_refresh: bool,
|
||||
) -> Result<ChainfireRouteSnapshot, tonic::Status> {
|
||||
if !force_refresh {
|
||||
if let Some(snapshot) = self.chainfire_route_cache.lock().await.clone() {
|
||||
if snapshot.fetched_at.elapsed() <= Self::CHAINFIRE_ROUTE_CACHE_TTL {
|
||||
return Ok(snapshot);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let regions = list_chainfire_regions(&mut kv_client).await?;
|
||||
let stores = list_chainfire_stores(&mut kv_client).await?;
|
||||
let snapshot = ChainfireRouteSnapshot {
|
||||
stores,
|
||||
regions,
|
||||
fetched_at: Instant::now(),
|
||||
};
|
||||
let mut cache = self.chainfire_route_cache.lock().await;
|
||||
*cache = Some(snapshot.clone());
|
||||
Ok(snapshot)
|
||||
}
|
||||
|
||||
fn resolve_addr_from_chainfire_snapshot(
|
||||
&self,
|
||||
key: &[u8],
|
||||
snapshot: &ChainfireRouteSnapshot,
|
||||
) -> Result<(Region, Store), tonic::Status> {
|
||||
let region = snapshot
|
||||
.regions
|
||||
.iter()
|
||||
.find(|region| {
|
||||
let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
|
||||
let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
|
||||
start_ok && end_ok
|
||||
})
|
||||
.cloned()
|
||||
.ok_or_else(|| tonic::Status::not_found("region not found"))?;
|
||||
|
||||
let leader = snapshot
|
||||
.stores
|
||||
.get(®ion.leader_id)
|
||||
.cloned()
|
||||
.ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
|
||||
|
||||
Ok((
|
||||
Region {
|
||||
id: region.id,
|
||||
start_key: region.start_key,
|
||||
end_key: region.end_key,
|
||||
peers: region.peers,
|
||||
leader_id: region.leader_id,
|
||||
},
|
||||
Store {
|
||||
id: leader.id,
|
||||
addr: leader.addr,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
async fn with_routed_addr<T, F, Fut>(&self, key: &[u8], mut op: F) -> Result<T, tonic::Status>
|
||||
where
|
||||
F: FnMut(String) -> Fut,
|
||||
|
|
@ -590,41 +730,21 @@ impl RdbClient {
|
|||
async fn resolve_addr_via_chainfire(
|
||||
&self,
|
||||
key: &[u8],
|
||||
mut kv_client: ChainfireKvClient<Channel>,
|
||||
kv_client: ChainfireKvClient<Channel>,
|
||||
) -> Result<String, tonic::Status> {
|
||||
let regions = list_chainfire_regions(&mut kv_client).await?;
|
||||
let stores = list_chainfire_stores(&mut kv_client).await?;
|
||||
for force_refresh in [false, true] {
|
||||
let snapshot = self
|
||||
.chainfire_route_snapshot(kv_client.clone(), force_refresh)
|
||||
.await?;
|
||||
if let Ok((region, leader)) =
|
||||
self.resolve_addr_from_chainfire_snapshot(key, &snapshot)
|
||||
{
|
||||
self.region_cache.update(region, leader.clone()).await;
|
||||
return Ok(leader.addr);
|
||||
}
|
||||
}
|
||||
|
||||
let region = regions
|
||||
.into_iter()
|
||||
.find(|region| {
|
||||
let start_ok = region.start_key.is_empty() || key >= region.start_key.as_slice();
|
||||
let end_ok = region.end_key.is_empty() || key < region.end_key.as_slice();
|
||||
start_ok && end_ok
|
||||
})
|
||||
.ok_or_else(|| tonic::Status::not_found("region not found"))?;
|
||||
|
||||
let leader = stores
|
||||
.get(®ion.leader_id)
|
||||
.ok_or_else(|| tonic::Status::not_found("leader store not found"))?;
|
||||
|
||||
self.region_cache
|
||||
.update(
|
||||
Region {
|
||||
id: region.id,
|
||||
start_key: region.start_key,
|
||||
end_key: region.end_key,
|
||||
peers: region.peers,
|
||||
leader_id: region.leader_id,
|
||||
},
|
||||
Store {
|
||||
id: leader.id,
|
||||
addr: leader.addr.clone(),
|
||||
},
|
||||
)
|
||||
.await;
|
||||
|
||||
Ok(leader.addr.clone())
|
||||
Err(tonic::Status::not_found("region not found"))
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -636,6 +756,23 @@ fn transport_endpoint(addr: &str) -> String {
|
|||
}
|
||||
}
|
||||
|
||||
fn normalize_transport_addr(addr: &str) -> String {
|
||||
addr.trim()
|
||||
.trim_start_matches("http://")
|
||||
.trim_start_matches("https://")
|
||||
.trim_end_matches('/')
|
||||
.to_string()
|
||||
}
|
||||
|
||||
fn parse_transport_endpoints(addrs: &str) -> Vec<String> {
|
||||
addrs
|
||||
.split(',')
|
||||
.map(str::trim)
|
||||
.filter(|item| !item.is_empty())
|
||||
.map(normalize_transport_addr)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn prefix_range_end(prefix: &str) -> Vec<u8> {
|
||||
let mut end = prefix.as_bytes().to_vec();
|
||||
if let Some(last) = end.last_mut() {
|
||||
|
|
@ -696,7 +833,7 @@ async fn list_chainfire_regions(
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::RdbClient;
|
||||
use super::{RdbClient, normalize_transport_addr, parse_transport_endpoints};
|
||||
|
||||
#[test]
|
||||
fn unknown_transport_errors_are_treated_as_retryable_routes() {
|
||||
|
|
@ -711,4 +848,20 @@ mod tests {
|
|||
assert!(RdbClient::is_retryable_route_error(&status));
|
||||
assert!(!RdbClient::is_transport_error(&status));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_transport_endpoints_accepts_comma_separated_values() {
|
||||
assert_eq!(
|
||||
parse_transport_endpoints("http://10.0.0.1:2379, 10.0.0.2:2379/"),
|
||||
vec!["10.0.0.1:2379".to_string(), "10.0.0.2:2379".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn normalize_transport_addr_strips_scheme_and_slashes() {
|
||||
assert_eq!(
|
||||
normalize_transport_addr("https://10.0.0.1:2479/"),
|
||||
"10.0.0.1:2479".to_string()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -10,6 +10,9 @@ struct Args {
|
|||
#[arg(long, default_value = "127.0.0.1:2479")]
|
||||
pd_addr: String,
|
||||
|
||||
#[arg(long, default_value = "")]
|
||||
namespace: String,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Commands,
|
||||
}
|
||||
|
|
@ -44,7 +47,8 @@ enum Commands {
|
|||
#[tokio::main]
|
||||
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||||
let args = Args::parse();
|
||||
let mut client = RdbClient::connect_with_pd(args.addr, args.pd_addr).await?;
|
||||
let mut client =
|
||||
RdbClient::connect_with_pd_namespace(args.addr, args.pd_addr, args.namespace).await?;
|
||||
|
||||
match args.command {
|
||||
Commands::RawPut { key, value } => {
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ impl Cluster {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn register_store(&self, addr: String) -> u64 {
|
||||
pub fn register_store(&self, addr: String, requested_id: Option<u64>) -> u64 {
|
||||
let mut state = self.inner.lock().unwrap();
|
||||
|
||||
// Dedup check? For now, always new ID.
|
||||
|
|
@ -39,8 +39,15 @@ impl Cluster {
|
|||
}
|
||||
}
|
||||
|
||||
let id = state.next_store_id;
|
||||
let id = requested_id
|
||||
.filter(|id| *id != 0 && !state.stores.contains_key(id))
|
||||
.unwrap_or_else(|| {
|
||||
while state.stores.contains_key(&state.next_store_id) {
|
||||
state.next_store_id += 1;
|
||||
}
|
||||
state.next_store_id
|
||||
});
|
||||
state.next_store_id = state.next_store_id.max(id.saturating_add(1));
|
||||
|
||||
state.stores.insert(id, Store { id, addr });
|
||||
|
||||
|
|
|
|||
|
|
@ -46,7 +46,8 @@ impl Pd for PdServiceImpl {
|
|||
request: Request<RegisterStoreRequest>,
|
||||
) -> Result<Response<RegisterStoreResponse>, Status> {
|
||||
let req = request.into_inner();
|
||||
let store_id = self.cluster.register_store(req.addr);
|
||||
let requested_store_id = (req.store_id != 0).then_some(req.store_id);
|
||||
let store_id = self.cluster.register_store(req.addr, requested_store_id);
|
||||
Ok(Response::new(RegisterStoreResponse {
|
||||
store_id,
|
||||
cluster_id: 1, // fixed for now
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@ service Pd {
|
|||
|
||||
message RegisterStoreRequest {
|
||||
string addr = 1; // e.g., "127.0.0.1:50051"
|
||||
uint64 store_id = 2; // Optional requested store ID (0 = auto-assign)
|
||||
}
|
||||
|
||||
message RegisterStoreResponse {
|
||||
|
|
|
|||
|
|
@ -1,23 +1,38 @@
|
|||
use crate::store::Store;
|
||||
use flaredb_proto::pdpb::pd_client::PdClient;
|
||||
use flaredb_proto::pdpb::ListRegionsRequest;
|
||||
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
|
||||
use flaredb_types::RegionMeta;
|
||||
use std::sync::Arc;
|
||||
use tokio::time::{sleep, Duration};
|
||||
|
||||
/// Periodically send region/store heartbeat to PD.
|
||||
pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
|
||||
pub async fn start_heartbeat(
|
||||
pd_addr: String,
|
||||
store: Arc<Store>,
|
||||
server_addr: String,
|
||||
requested_store_id: u64,
|
||||
) {
|
||||
tokio::spawn(async move {
|
||||
let endpoint = format!("http://{}", pd_addr);
|
||||
loop {
|
||||
if let Ok(mut client) = PdClient::connect(endpoint.clone()).await {
|
||||
if let Err(err) = client
|
||||
.register_store(RegisterStoreRequest {
|
||||
addr: server_addr.clone(),
|
||||
store_id: requested_store_id,
|
||||
})
|
||||
.await
|
||||
{
|
||||
tracing::warn!("failed to register store with legacy PD: {}", err);
|
||||
}
|
||||
|
||||
// list regions to keep routing fresh
|
||||
if let Ok(resp) = client.list_regions(ListRegionsRequest {}).await {
|
||||
let resp = resp.into_inner();
|
||||
let mut metas = Vec::new();
|
||||
for r in resp.regions {
|
||||
let voters = if r.peers.is_empty() {
|
||||
Vec::new()
|
||||
vec![store.store_id()]
|
||||
} else {
|
||||
r.peers.clone()
|
||||
};
|
||||
|
|
@ -27,11 +42,7 @@ pub async fn start_heartbeat(pd_addr: String, store: Arc<Store>) {
|
|||
start_key: r.start_key,
|
||||
end_key: r.end_key,
|
||||
},
|
||||
if voters.is_empty() {
|
||||
vec![store.store_id()]
|
||||
} else {
|
||||
voters
|
||||
},
|
||||
voters,
|
||||
));
|
||||
}
|
||||
if !metas.is_empty() {
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
use clap::Parser;
|
||||
use flaredb_proto::kvrpc::kv_cas_server::KvCasServer;
|
||||
use flaredb_proto::kvrpc::kv_raw_server::KvRawServer;
|
||||
use flaredb_proto::pdpb::pd_client::PdClient as LegacyPdClient;
|
||||
use flaredb_proto::pdpb::{ListRegionsRequest, RegisterStoreRequest};
|
||||
use flaredb_proto::raft_server::raft_service_server::RaftServiceServer;
|
||||
use flaredb_proto::sqlrpc::sql_service_server::SqlServiceServer;
|
||||
use flaredb_server::config::{self, Config, NamespaceManager};
|
||||
|
|
@ -12,7 +14,7 @@ use std::path::PathBuf;
|
|||
use std::sync::Arc;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tonic::transport::{Certificate, Identity, Server, ServerTlsConfig};
|
||||
use tonic::transport::{Certificate, Channel, Identity, Server, ServerTlsConfig};
|
||||
use tonic_health::server::health_reporter;
|
||||
use tracing::{info, warn}; // Import warn
|
||||
use tracing_subscriber::EnvFilter;
|
||||
|
|
@ -27,7 +29,7 @@ mod service;
|
|||
mod sql_service;
|
||||
mod store;
|
||||
|
||||
use pd_client::{PdClient, PdEvent};
|
||||
use pd_client::{PdClient as ChainfirePdClient, PdEvent};
|
||||
|
||||
const RAFT_GRPC_MESSAGE_SIZE: usize = 64 * 1024 * 1024;
|
||||
|
||||
|
|
@ -35,14 +37,18 @@ async fn connect_pd_with_retry(
|
|||
pd_endpoints: &[String],
|
||||
attempts: u32,
|
||||
delay: Duration,
|
||||
) -> Option<PdClient> {
|
||||
) -> Option<ChainfirePdClient> {
|
||||
let mut last_error = None;
|
||||
|
||||
for attempt in 1..=attempts {
|
||||
match PdClient::connect_any(pd_endpoints).await {
|
||||
match ChainfirePdClient::connect_any(pd_endpoints).await {
|
||||
Ok(client) => return Some(client),
|
||||
Err(err) => {
|
||||
last_error = Some(err.to_string());
|
||||
let protocol_mismatch = last_error
|
||||
.as_deref()
|
||||
.map(|msg| msg.contains("Unimplemented"))
|
||||
.unwrap_or(false);
|
||||
warn!(
|
||||
attempt,
|
||||
attempts,
|
||||
|
|
@ -50,6 +56,13 @@ async fn connect_pd_with_retry(
|
|||
error = last_error.as_deref().unwrap_or("unknown"),
|
||||
"Failed to connect to FlareDB PD"
|
||||
);
|
||||
if protocol_mismatch {
|
||||
warn!(
|
||||
?pd_endpoints,
|
||||
"PD endpoint does not speak ChainFire; falling back to legacy PD"
|
||||
);
|
||||
return None;
|
||||
}
|
||||
if attempt < attempts {
|
||||
sleep(delay).await;
|
||||
}
|
||||
|
|
@ -65,6 +78,49 @@ async fn connect_pd_with_retry(
|
|||
None
|
||||
}
|
||||
|
||||
async fn connect_legacy_pd_with_retry(
|
||||
pd_endpoints: &[String],
|
||||
attempts: u32,
|
||||
delay: Duration,
|
||||
) -> Option<(String, LegacyPdClient<Channel>)> {
|
||||
let mut last_error = None;
|
||||
|
||||
for attempt in 1..=attempts {
|
||||
for endpoint in pd_endpoints {
|
||||
let transport = if endpoint.starts_with("http") {
|
||||
endpoint.clone()
|
||||
} else {
|
||||
format!("http://{}", endpoint)
|
||||
};
|
||||
match LegacyPdClient::connect(transport.clone()).await {
|
||||
Ok(client) => return Some((endpoint.clone(), client)),
|
||||
Err(err) => {
|
||||
last_error = Some(format!("{}: {}", endpoint, err));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
warn!(
|
||||
attempt,
|
||||
attempts,
|
||||
?pd_endpoints,
|
||||
error = last_error.as_deref().unwrap_or("unknown"),
|
||||
"Failed to connect to legacy FlareDB PD"
|
||||
);
|
||||
|
||||
if attempt < attempts {
|
||||
sleep(delay).await;
|
||||
}
|
||||
}
|
||||
|
||||
warn!(
|
||||
?pd_endpoints,
|
||||
error = last_error.as_deref().unwrap_or("unknown"),
|
||||
"Exhausted legacy FlareDB PD connection retries"
|
||||
);
|
||||
None
|
||||
}
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Args {
|
||||
|
|
@ -334,7 +390,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
let server_addr_string = server_config.addr.to_string();
|
||||
tokio::spawn(async move {
|
||||
let client = Arc::new(Mutex::new(
|
||||
PdClient::connect_any(&pd_endpoints_for_task).await.ok(),
|
||||
ChainfirePdClient::connect_any(&pd_endpoints_for_task)
|
||||
.await
|
||||
.ok(),
|
||||
));
|
||||
|
||||
loop {
|
||||
|
|
@ -396,7 +454,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
}
|
||||
} else {
|
||||
// Try to reconnect
|
||||
if let Ok(new_client) = PdClient::connect_any(&pd_endpoints_for_task).await
|
||||
if let Ok(new_client) =
|
||||
ChainfirePdClient::connect_any(&pd_endpoints_for_task).await
|
||||
{
|
||||
info!("Reconnected to PD");
|
||||
*guard = Some(new_client);
|
||||
|
|
@ -406,6 +465,75 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
sleep(Duration::from_secs(10)).await;
|
||||
}
|
||||
});
|
||||
} else if let Some((legacy_pd_addr, mut legacy_pd_client)) =
|
||||
connect_legacy_pd_with_retry(&pd_endpoints, 3, Duration::from_secs(1)).await
|
||||
{
|
||||
info!(pd_addr = %legacy_pd_addr, "Connected to legacy FlareDB PD");
|
||||
|
||||
match legacy_pd_client
|
||||
.register_store(RegisterStoreRequest {
|
||||
addr: server_config.addr.to_string(),
|
||||
store_id: server_config.store_id,
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(resp) => {
|
||||
let resp = resp.into_inner();
|
||||
if resp.store_id != 0 && resp.store_id != server_config.store_id {
|
||||
warn!(
|
||||
expected_store_id = server_config.store_id,
|
||||
assigned_store_id = resp.store_id,
|
||||
"legacy PD assigned a different store id than local config"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(err) => warn!("failed to register with legacy PD: {}", err),
|
||||
}
|
||||
|
||||
let mut region_metas = Vec::new();
|
||||
match legacy_pd_client.list_regions(ListRegionsRequest {}).await {
|
||||
Ok(resp) => {
|
||||
for region in resp.into_inner().regions {
|
||||
let voters = if region.peers.is_empty() || region.peers.len() < voters.len() {
|
||||
voters.clone()
|
||||
} else {
|
||||
region.peers.clone()
|
||||
};
|
||||
region_metas.push((
|
||||
RegionMeta {
|
||||
id: region.id,
|
||||
start_key: region.start_key,
|
||||
end_key: region.end_key,
|
||||
},
|
||||
voters,
|
||||
));
|
||||
}
|
||||
}
|
||||
Err(err) => warn!("failed to list regions from legacy PD: {}", err),
|
||||
}
|
||||
|
||||
if region_metas.is_empty() {
|
||||
region_metas.push((
|
||||
RegionMeta {
|
||||
id: 1,
|
||||
start_key: Vec::new(),
|
||||
end_key: Vec::new(),
|
||||
},
|
||||
voters.clone(),
|
||||
));
|
||||
}
|
||||
|
||||
if let Err(e) = store.bootstrap_regions(region_metas).await {
|
||||
warn!("failed to bootstrap regions from legacy PD: {}", e);
|
||||
}
|
||||
|
||||
heartbeat::start_heartbeat(
|
||||
legacy_pd_addr,
|
||||
store.clone(),
|
||||
server_config.addr.to_string(),
|
||||
server_config.store_id,
|
||||
)
|
||||
.await;
|
||||
} else {
|
||||
info!("Starting in standalone mode with default region...");
|
||||
let _ = store
|
||||
|
|
@ -494,6 +622,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
server_addr: server_config.addr.to_string(),
|
||||
pd_endpoints: pd_endpoints.clone(),
|
||||
store_id: server_config.store_id,
|
||||
configured_peers: (*peer_addrs).clone(),
|
||||
};
|
||||
let rest_app = rest::build_router(rest_state);
|
||||
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
|
||||
|
|
|
|||
|
|
@ -16,8 +16,8 @@ use axum::{
|
|||
};
|
||||
use crate::pd_client::PdClient;
|
||||
use flaredb_client::RdbClient;
|
||||
use flaredb_sql::executor::{ExecutionResult, SqlExecutor};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
/// REST API state
|
||||
|
|
@ -26,6 +26,7 @@ pub struct RestApiState {
|
|||
pub server_addr: String,
|
||||
pub pd_endpoints: Vec<String>,
|
||||
pub store_id: u64,
|
||||
pub configured_peers: HashMap<u64, String>,
|
||||
}
|
||||
|
||||
/// Standard REST error response
|
||||
|
|
@ -136,6 +137,15 @@ pub struct AddPeerRequest {
|
|||
pub peer_id: u64,
|
||||
}
|
||||
|
||||
/// Legacy/admin add member request for first-boot compatibility.
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct AddMemberRequestLegacy {
|
||||
pub id: String,
|
||||
pub raft_addr: String,
|
||||
#[serde(default)]
|
||||
pub addr: Option<String>,
|
||||
}
|
||||
|
||||
/// Region info response
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct RegionResponse {
|
||||
|
|
@ -153,6 +163,7 @@ pub fn build_router(state: RestApiState) -> Router {
|
|||
.route("/api/v1/scan", get(scan_kv))
|
||||
.route("/api/v1/regions/{id}", get(get_region))
|
||||
.route("/api/v1/regions/{id}/add_peer", post(add_peer_to_region))
|
||||
.route("/admin/member/add", post(add_member_legacy))
|
||||
.route("/health", get(health_check))
|
||||
.with_state(state)
|
||||
}
|
||||
|
|
@ -320,6 +331,121 @@ async fn add_peer_to_region(
|
|||
})))
|
||||
}
|
||||
|
||||
/// POST /admin/member/add - first-boot compatible cluster join hook.
|
||||
async fn add_member_legacy(
|
||||
State(state): State<RestApiState>,
|
||||
Json(req): Json<AddMemberRequestLegacy>,
|
||||
) -> Result<(StatusCode, Json<SuccessResponse<serde_json::Value>>), (StatusCode, Json<ErrorResponse>)> {
|
||||
let (peer_id, peer_addr) = resolve_join_peer(&state, &req).ok_or_else(|| {
|
||||
error_response(
|
||||
StatusCode::BAD_REQUEST,
|
||||
"INVALID_MEMBER",
|
||||
"Unable to resolve FlareDB peer id/address from join request",
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut pd_client = PdClient::connect_any(&state.pd_endpoints)
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::SERVICE_UNAVAILABLE, "PD_UNAVAILABLE", &format!("Failed to connect to PD: {}", e)))?;
|
||||
|
||||
let stores = pd_client.list_stores().await;
|
||||
let already_registered = stores.iter().any(|store| store.id == peer_id);
|
||||
|
||||
pd_client
|
||||
.register_store(peer_id, peer_addr.clone())
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
|
||||
let mut regions = pd_client.list_regions().await;
|
||||
if regions.is_empty() {
|
||||
pd_client
|
||||
.init_default_region(vec![state.store_id, peer_id])
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
regions = vec![crate::pd_client::RegionInfo {
|
||||
id: 1,
|
||||
start_key: Vec::new(),
|
||||
end_key: Vec::new(),
|
||||
peers: vec![state.store_id, peer_id],
|
||||
leader_id: 0,
|
||||
}];
|
||||
}
|
||||
|
||||
let mut updated_regions = Vec::new();
|
||||
for mut region in regions {
|
||||
if !region.peers.contains(&peer_id) {
|
||||
region.peers.push(peer_id);
|
||||
region.peers.sort_unstable();
|
||||
pd_client
|
||||
.put_region(region.clone())
|
||||
.await
|
||||
.map_err(|e| error_response(StatusCode::INTERNAL_SERVER_ERROR, "INTERNAL_ERROR", &e.to_string()))?;
|
||||
updated_regions.push(region.id);
|
||||
}
|
||||
}
|
||||
|
||||
let status = if already_registered && updated_regions.is_empty() {
|
||||
StatusCode::CONFLICT
|
||||
} else if already_registered {
|
||||
StatusCode::OK
|
||||
} else {
|
||||
StatusCode::CREATED
|
||||
};
|
||||
|
||||
Ok((
|
||||
status,
|
||||
Json(SuccessResponse::new(serde_json::json!({
|
||||
"peer_id": peer_id,
|
||||
"addr": peer_addr,
|
||||
"updated_regions": updated_regions,
|
||||
"already_registered": already_registered,
|
||||
}))),
|
||||
))
|
||||
}
|
||||
|
||||
fn resolve_join_peer(
|
||||
state: &RestApiState,
|
||||
req: &AddMemberRequestLegacy,
|
||||
) -> Option<(u64, String)> {
|
||||
if let Ok(peer_id) = req.id.parse::<u64>() {
|
||||
if let Some(addr) = req
|
||||
.addr
|
||||
.clone()
|
||||
.or_else(|| state.configured_peers.get(&peer_id).cloned())
|
||||
{
|
||||
return Some((peer_id, addr));
|
||||
}
|
||||
}
|
||||
|
||||
let candidate_host = socket_host(req.addr.as_deref().unwrap_or(&req.raft_addr));
|
||||
state
|
||||
.configured_peers
|
||||
.iter()
|
||||
.find(|(_, addr)| socket_host(addr) == candidate_host)
|
||||
.map(|(peer_id, addr)| (*peer_id, addr.clone()))
|
||||
}
|
||||
|
||||
fn socket_host(addr: &str) -> String {
|
||||
let normalized = addr
|
||||
.trim()
|
||||
.trim_start_matches("http://")
|
||||
.trim_start_matches("https://")
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or(addr)
|
||||
.to_string();
|
||||
|
||||
normalized
|
||||
.parse::<std::net::SocketAddr>()
|
||||
.map(|socket_addr| socket_addr.ip().to_string())
|
||||
.unwrap_or_else(|_| {
|
||||
normalized
|
||||
.rsplit_once(':')
|
||||
.map(|(host, _)| host.trim_matches(['[', ']']).to_string())
|
||||
.unwrap_or(normalized)
|
||||
})
|
||||
}
|
||||
|
||||
/// Helper to create error response
|
||||
fn error_response(
|
||||
status: StatusCode,
|
||||
|
|
@ -338,3 +464,51 @@ fn error_response(
|
|||
}),
|
||||
)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn test_state() -> RestApiState {
|
||||
RestApiState {
|
||||
server_addr: "127.0.0.1:50052".to_string(),
|
||||
pd_endpoints: vec!["127.0.0.1:2479".to_string()],
|
||||
store_id: 1,
|
||||
configured_peers: HashMap::from([
|
||||
(1, "10.100.0.11:50052".to_string()),
|
||||
(2, "10.100.0.12:50052".to_string()),
|
||||
(3, "10.100.0.13:50052".to_string()),
|
||||
]),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_join_peer_uses_numeric_id_when_available() {
|
||||
let state = test_state();
|
||||
let req = AddMemberRequestLegacy {
|
||||
id: "2".to_string(),
|
||||
raft_addr: "10.100.0.12:2380".to_string(),
|
||||
addr: None,
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
resolve_join_peer(&state, &req),
|
||||
Some((2, "10.100.0.12:50052".to_string()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_join_peer_matches_host_from_raft_addr() {
|
||||
let state = test_state();
|
||||
let req = AddMemberRequestLegacy {
|
||||
id: "node02".to_string(),
|
||||
raft_addr: "10.100.0.12:2380".to_string(),
|
||||
addr: None,
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
resolve_join_peer(&state, &req),
|
||||
Some((2, "10.100.0.12:50052".to_string()))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@
|
|||
};
|
||||
|
||||
rustToolchain = pkgs.rust-bin.stable.latest.default.override {
|
||||
extensions = [ "rust-src" "rust-analyzer" ];
|
||||
extensions = [ "rust-src" "rust-analyzer" "rustfmt" ];
|
||||
};
|
||||
|
||||
in
|
||||
|
|
|
|||
|
|
@ -6,13 +6,43 @@ if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
|
|||
exec nix develop -c "$0" "$@"
|
||||
fi
|
||||
|
||||
WORKDIR=$(mktemp -d)
|
||||
PD_LOG="${WORKDIR}/flaredb-pd.log"
|
||||
SERVER_LOG="${WORKDIR}/flaredb-server.log"
|
||||
DATA_DIR="${WORKDIR}/data"
|
||||
|
||||
run_client() {
|
||||
local output=""
|
||||
local status=0
|
||||
local attempt=0
|
||||
while (( attempt < 20 )); do
|
||||
if output=$(cargo run --quiet --bin flaredb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 "$@" 2>&1); then
|
||||
printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
|
||||
return 0
|
||||
fi
|
||||
status=$?
|
||||
attempt=$((attempt + 1))
|
||||
sleep 1
|
||||
done
|
||||
printf '%s\n' "${output}" >&2
|
||||
return "${status}"
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
if [[ -n "${SERVER_PID:-}" ]]; then
|
||||
kill "$SERVER_PID" >/dev/null 2>&1 || true
|
||||
fi
|
||||
if [[ -n "${PD_PID:-}" ]]; then
|
||||
kill "$PD_PID" >/dev/null 2>&1 || true
|
||||
fi
|
||||
if (( exit_code != 0 )); then
|
||||
echo "verify-core failed; logs preserved at ${WORKDIR}" >&2
|
||||
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
|
||||
[[ -f "${SERVER_LOG}" ]] && { echo "--- ${SERVER_LOG} ---" >&2; tail -n 200 "${SERVER_LOG}" >&2; }
|
||||
return "${exit_code}"
|
||||
fi
|
||||
rm -rf "${WORKDIR}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
|
|
@ -23,30 +53,38 @@ echo "Running tests..."
|
|||
cargo test
|
||||
|
||||
echo "Starting PD..."
|
||||
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 >/tmp/rdb-pd.log 2>&1 &
|
||||
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
|
||||
PD_PID=$!
|
||||
sleep 2
|
||||
|
||||
echo "Starting Server..."
|
||||
cargo run --bin rdb-server -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 --data-dir /tmp/rdb-server >/tmp/rdb-server.log 2>&1 &
|
||||
cargo run --bin flaredb-server -- \
|
||||
--pd-addr 127.0.0.1:2479 \
|
||||
--addr 127.0.0.1:50052 \
|
||||
--data-dir "${DATA_DIR}" \
|
||||
--namespace-mode raw=eventual \
|
||||
--namespace-mode cas=strong \
|
||||
>"${SERVER_LOG}" 2>&1 &
|
||||
SERVER_PID=$!
|
||||
sleep 2
|
||||
|
||||
echo "Running Client Verification..."
|
||||
|
||||
echo "Testing TSO..."
|
||||
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 tso
|
||||
TSO_OUTPUT=$(run_client tso)
|
||||
[[ "${TSO_OUTPUT}" == Timestamp:* ]]
|
||||
|
||||
echo "Testing Raw Put/Get..."
|
||||
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-put --key foo --value bar
|
||||
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 raw-get --key foo
|
||||
run_client --namespace raw raw-put --key foo --value bar >/dev/null
|
||||
RAW_VALUE=$(run_client --namespace raw raw-get --key foo)
|
||||
[[ "${RAW_VALUE}" == "bar" ]]
|
||||
|
||||
echo "Testing CAS success..."
|
||||
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v1 --expected 0
|
||||
CAS_SUCCESS=$(run_client --namespace cas cas --key cas1 --value v1 --expected 0)
|
||||
[[ "${CAS_SUCCESS}" == Success,* ]]
|
||||
|
||||
echo "Testing CAS conflict..."
|
||||
set +e
|
||||
cargo run --bin rdb-client -- --pd-addr 127.0.0.1:2479 --addr 127.0.0.1:50052 cas --key cas1 --value v2 --expected 0
|
||||
set -e
|
||||
CAS_CONFLICT=$(run_client --namespace cas cas --key cas1 --value v2 --expected 0)
|
||||
[[ "${CAS_CONFLICT}" == Conflict!* ]]
|
||||
|
||||
echo "Verification Complete!"
|
||||
|
|
|
|||
|
|
@ -1,14 +1,17 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
# Run key Multi-Raft test suites.
|
||||
echo "[verify] Running multi-region routing tests..."
|
||||
nix develop -c cargo test -q rdb-server::tests::test_multi_region
|
||||
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
|
||||
exec nix develop -c "$0" "$@"
|
||||
fi
|
||||
|
||||
echo "[verify] Running split tests..."
|
||||
nix develop -c cargo test -q rdb-server::tests::test_split
|
||||
echo "[verify] Running persistent snapshot recovery tests..."
|
||||
cargo test -p flaredb-raft persistent_storage::tests::test_snapshot_persistence_and_recovery
|
||||
|
||||
echo "[verify] Running confchange/move tests..."
|
||||
nix develop -c cargo test -q rdb-server::tests::test_confchange_move
|
||||
echo "[verify] Running leader election tests..."
|
||||
cargo test -p flaredb-raft raft_node::tests::test_leader_election
|
||||
|
||||
echo "[verify] Running server read-path tests..."
|
||||
cargo test -p flaredb-server service::tests::scan_returns_decoded_cas_keys
|
||||
|
||||
echo "[verify] Done."
|
||||
|
|
|
|||
|
|
@ -1,12 +1,23 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
|
||||
exec nix develop -c "$0" "$@"
|
||||
fi
|
||||
|
||||
export LIBCLANG_PATH=${LIBCLANG_PATH:-/nix/store/0zn99g048j67syaq97rczq5z0j8dsvc8-clang-21.1.2-lib/lib}
|
||||
|
||||
echo "[verify] formatting..."
|
||||
cargo fmt --all
|
||||
if ! find . \
|
||||
-path ./target -prune -o \
|
||||
-name '*.rs' -print0 | xargs -0 rustfmt --check; then
|
||||
echo "[verify] rustfmt drift detected; continuing with runtime tests" >&2
|
||||
fi
|
||||
|
||||
echo "[verify] running rdb-server tests..."
|
||||
nix-shell -p protobuf --run "LIBCLANG_PATH=${LIBCLANG_PATH} cargo test -p rdb-server --tests"
|
||||
echo "[verify] running FlareDB server tests..."
|
||||
cargo test -p flaredb-server --tests
|
||||
|
||||
echo "[verify] running FlareDB raft tests..."
|
||||
cargo test -p flaredb-raft
|
||||
|
||||
echo "[verify] done."
|
||||
|
|
|
|||
|
|
@ -1,40 +1,103 @@
|
|||
#!/usr/bin/env bash
|
||||
set -e
|
||||
set -euo pipefail
|
||||
|
||||
if [[ -z "${IN_NIX_SHELL:-}" ]] && command -v nix >/dev/null 2>&1; then
|
||||
exec nix develop -c "$0" "$@"
|
||||
fi
|
||||
|
||||
WORKDIR=$(mktemp -d)
|
||||
PD_LOG="${WORKDIR}/flaredb-pd.log"
|
||||
S1_LOG="${WORKDIR}/flaredb-server-1.log"
|
||||
S2_LOG="${WORKDIR}/flaredb-server-2.log"
|
||||
|
||||
run_client() {
|
||||
local addr="$1"
|
||||
shift
|
||||
local output=""
|
||||
local status=0
|
||||
local attempt=0
|
||||
while (( attempt < 20 )); do
|
||||
if output=$(cargo run --quiet --bin flaredb-client -- --addr "${addr}" --pd-addr 127.0.0.1:2479 "$@" 2>&1); then
|
||||
printf '%s\n' "${output}" | awk 'NF { last = $0 } END { print last }'
|
||||
return 0
|
||||
fi
|
||||
status=$?
|
||||
attempt=$((attempt + 1))
|
||||
sleep 1
|
||||
done
|
||||
printf '%s\n' "${output}" >&2
|
||||
return "${status}"
|
||||
}
|
||||
|
||||
cleanup() {
|
||||
local exit_code=$?
|
||||
if [[ -n "${PD_PID:-}" ]]; then
|
||||
kill "${PD_PID}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
if [[ -n "${S1_PID:-}" ]]; then
|
||||
kill "${S1_PID}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
if [[ -n "${S2_PID:-}" ]]; then
|
||||
kill "${S2_PID}" >/dev/null 2>&1 || true
|
||||
fi
|
||||
if (( exit_code != 0 )); then
|
||||
echo "verify-sharding failed; logs preserved at ${WORKDIR}" >&2
|
||||
[[ -f "${PD_LOG}" ]] && { echo "--- ${PD_LOG} ---" >&2; tail -n 200 "${PD_LOG}" >&2; }
|
||||
[[ -f "${S1_LOG}" ]] && { echo "--- ${S1_LOG} ---" >&2; tail -n 200 "${S1_LOG}" >&2; }
|
||||
[[ -f "${S2_LOG}" ]] && { echo "--- ${S2_LOG} ---" >&2; tail -n 200 "${S2_LOG}" >&2; }
|
||||
return "${exit_code}"
|
||||
fi
|
||||
rm -rf "${WORKDIR}"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
echo "Building workspace..."
|
||||
cargo build
|
||||
|
||||
echo "Starting PD..."
|
||||
cargo run --bin rdb-pd -- --addr 127.0.0.1:2479 &
|
||||
cargo run --bin flaredb-pd -- --addr 127.0.0.1:2479 >"${PD_LOG}" 2>&1 &
|
||||
PD_PID=$!
|
||||
sleep 2
|
||||
|
||||
echo "Starting Server 1 (127.0.0.1:50001, data1)..."
|
||||
# Port 50001
|
||||
cargo run --bin rdb-server -- --addr 127.0.0.1:50001 --data-dir data1 --pd-addr 127.0.0.1:2479 &
|
||||
cargo run --bin flaredb-server -- \
|
||||
--store-id 1 \
|
||||
--addr 127.0.0.1:50001 \
|
||||
--http-addr 127.0.0.1:8083 \
|
||||
--data-dir "${WORKDIR}/data1" \
|
||||
--pd-addr 127.0.0.1:2479 \
|
||||
--metrics-port 9093 \
|
||||
--namespace-mode raw=eventual \
|
||||
>"${S1_LOG}" 2>&1 &
|
||||
S1_PID=$!
|
||||
sleep 4
|
||||
|
||||
echo "Starting Server 2 (127.0.0.1:50002, data2)..."
|
||||
# Port 50002
|
||||
cargo run --bin rdb-server -- --addr 127.0.0.1:50002 --data-dir data2 --pd-addr 127.0.0.1:2479 &
|
||||
cargo run --bin flaredb-server -- \
|
||||
--store-id 2 \
|
||||
--addr 127.0.0.1:50002 \
|
||||
--http-addr 127.0.0.1:8084 \
|
||||
--data-dir "${WORKDIR}/data2" \
|
||||
--pd-addr 127.0.0.1:2479 \
|
||||
--metrics-port 9094 \
|
||||
--namespace-mode raw=eventual \
|
||||
>"${S2_LOG}" 2>&1 &
|
||||
S2_PID=$!
|
||||
|
||||
sleep 5 # Wait for registration
|
||||
sleep 5 # Wait for registration and leader routing to settle
|
||||
|
||||
echo "Running Client Verification (Sharding)..."
|
||||
echo "Running Client Verification (multi-node routing smoke)..."
|
||||
|
||||
# Put 'a' (Should go to S1)
|
||||
echo "Testing Put 'a'..."
|
||||
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key a --value val_a
|
||||
run_client 127.0.0.1:50001 --namespace raw raw-put --key a --value val_a >/dev/null
|
||||
|
||||
# Put 'z' (Should go to S2)
|
||||
echo "Testing Put 'z'..."
|
||||
cargo run --bin rdb-client -- --addr 127.0.0.1:50001 --pd-addr 127.0.0.1:2479 raw-put --key z --value val_z
|
||||
run_client 127.0.0.1:50002 --namespace raw raw-put --key z --value val_z >/dev/null
|
||||
|
||||
# Cleanup
|
||||
kill $PD_PID
|
||||
kill $S1_PID
|
||||
kill $S2_PID
|
||||
rm -rf data1 data2
|
||||
echo "Testing reads from both nodes..."
|
||||
VALUE_A=$(run_client 127.0.0.1:50002 --namespace raw raw-get --key a)
|
||||
VALUE_Z=$(run_client 127.0.0.1:50001 --namespace raw raw-get --key z)
|
||||
[[ "${VALUE_A}" == "val_a" ]]
|
||||
[[ "${VALUE_Z}" == "val_z" ]]
|
||||
|
||||
echo "Sharding Verification Complete!"
|
||||
|
|
|
|||
607
flashdns/Cargo.lock
generated
607
flashdns/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
621
iam/Cargo.lock
generated
621
iam/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -23,6 +23,9 @@ prost = { workspace = true }
|
|||
base64 = { workspace = true }
|
||||
sha2 = { workspace = true }
|
||||
uuid = { workspace = true }
|
||||
aes-gcm = "0.10"
|
||||
argon2 = "0.5"
|
||||
rand_core = "0.6"
|
||||
|
||||
[dev-dependencies]
|
||||
tokio = { workspace = true, features = ["full", "test-util"] }
|
||||
|
|
|
|||
|
|
@ -8,12 +8,12 @@ use rand_core::{OsRng, RngCore};
|
|||
use tonic::{Request, Response, Status};
|
||||
|
||||
use iam_store::CredentialStore;
|
||||
use iam_types::{Argon2Params, CredentialRecord};
|
||||
use iam_types::{Argon2Params, CredentialRecord, PrincipalKind as TypesPrincipalKind};
|
||||
|
||||
use crate::proto::{
|
||||
iam_credential_server::IamCredential, CreateS3CredentialRequest,
|
||||
CreateS3CredentialResponse, Credential, GetSecretKeyRequest, GetSecretKeyResponse,
|
||||
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
|
||||
ListCredentialsRequest, ListCredentialsResponse, PrincipalKind, RevokeCredentialRequest,
|
||||
RevokeCredentialResponse,
|
||||
};
|
||||
|
||||
|
|
@ -95,6 +95,15 @@ impl IamCredentialService {
|
|||
}
|
||||
}
|
||||
|
||||
fn map_principal_kind(kind: i32) -> Result<TypesPrincipalKind, Status> {
|
||||
match PrincipalKind::try_from(kind).unwrap_or(PrincipalKind::Unspecified) {
|
||||
PrincipalKind::User => Ok(TypesPrincipalKind::User),
|
||||
PrincipalKind::ServiceAccount => Ok(TypesPrincipalKind::ServiceAccount),
|
||||
PrincipalKind::Group => Ok(TypesPrincipalKind::Group),
|
||||
PrincipalKind::Unspecified => Err(Status::invalid_argument("principal_kind is required")),
|
||||
}
|
||||
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl IamCredential for IamCredentialService {
|
||||
async fn create_s3_credential(
|
||||
|
|
@ -103,6 +112,7 @@ impl IamCredential for IamCredentialService {
|
|||
) -> Result<Response<CreateS3CredentialResponse>, Status> {
|
||||
let req = request.into_inner();
|
||||
let now = now_ts();
|
||||
let principal_kind = map_principal_kind(req.principal_kind)?;
|
||||
let (secret_b64, raw_secret) = Self::generate_secret();
|
||||
let (hash, kdf) = Self::hash_secret(&raw_secret);
|
||||
let secret_enc = self.encrypt_secret(&raw_secret)?;
|
||||
|
|
@ -111,6 +121,9 @@ impl IamCredential for IamCredentialService {
|
|||
let record = CredentialRecord {
|
||||
access_key_id: access_key_id.clone(),
|
||||
principal_id: req.principal_id.clone(),
|
||||
principal_kind,
|
||||
org_id: req.org_id.clone(),
|
||||
project_id: req.project_id.clone(),
|
||||
created_at: now,
|
||||
expires_at: req.expires_at,
|
||||
revoked: false,
|
||||
|
|
@ -168,6 +181,13 @@ impl IamCredential for IamCredentialService {
|
|||
secret_key: STANDARD.encode(secret),
|
||||
principal_id: record.principal_id,
|
||||
expires_at: record.expires_at,
|
||||
org_id: record.org_id,
|
||||
project_id: record.project_id,
|
||||
principal_kind: match record.principal_kind {
|
||||
TypesPrincipalKind::User => PrincipalKind::User as i32,
|
||||
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
|
||||
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
|
||||
},
|
||||
}))
|
||||
}
|
||||
|
||||
|
|
@ -190,6 +210,13 @@ impl IamCredential for IamCredentialService {
|
|||
expires_at: c.expires_at,
|
||||
revoked: c.revoked,
|
||||
description: c.description.unwrap_or_default(),
|
||||
org_id: c.org_id,
|
||||
project_id: c.project_id,
|
||||
principal_kind: match c.principal_kind {
|
||||
TypesPrincipalKind::User => PrincipalKind::User as i32,
|
||||
TypesPrincipalKind::ServiceAccount => PrincipalKind::ServiceAccount as i32,
|
||||
TypesPrincipalKind::Group => PrincipalKind::Group as i32,
|
||||
},
|
||||
})
|
||||
.collect();
|
||||
Ok(Response::new(ListCredentialsResponse { credentials: creds }))
|
||||
|
|
@ -230,6 +257,9 @@ mod tests {
|
|||
principal_id: "p1".into(),
|
||||
description: "".into(),
|
||||
expires_at: None,
|
||||
org_id: Some("org-a".into()),
|
||||
project_id: Some("project-a".into()),
|
||||
principal_kind: PrincipalKind::ServiceAccount as i32,
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
|
|
@ -247,6 +277,9 @@ mod tests {
|
|||
let fetched = STANDARD.decode(get.secret_key).unwrap();
|
||||
assert_eq!(orig, fetched);
|
||||
assert_eq!(get.principal_id, "p1");
|
||||
assert_eq!(get.org_id.as_deref(), Some("org-a"));
|
||||
assert_eq!(get.project_id.as_deref(), Some("project-a"));
|
||||
assert_eq!(get.principal_kind, PrincipalKind::ServiceAccount as i32);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -257,6 +290,9 @@ mod tests {
|
|||
principal_id: "pA".into(),
|
||||
description: "".into(),
|
||||
expires_at: None,
|
||||
org_id: Some("org-a".into()),
|
||||
project_id: Some("project-a".into()),
|
||||
principal_kind: PrincipalKind::ServiceAccount as i32,
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
|
|
@ -266,6 +302,9 @@ mod tests {
|
|||
principal_id: "pB".into(),
|
||||
description: "".into(),
|
||||
expires_at: None,
|
||||
org_id: Some("org-b".into()),
|
||||
project_id: Some("project-b".into()),
|
||||
principal_kind: PrincipalKind::ServiceAccount as i32,
|
||||
}))
|
||||
.await
|
||||
.unwrap();
|
||||
|
|
@ -289,6 +328,9 @@ mod tests {
|
|||
principal_id: "p1".into(),
|
||||
description: "".into(),
|
||||
expires_at: None,
|
||||
org_id: Some("org-a".into()),
|
||||
project_id: Some("project-a".into()),
|
||||
principal_kind: PrincipalKind::ServiceAccount as i32,
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
|
|
@ -297,7 +339,6 @@ mod tests {
|
|||
let revoke1 = svc
|
||||
.revoke_credential(Request::new(RevokeCredentialRequest {
|
||||
access_key_id: created.access_key_id.clone(),
|
||||
reason: "test".into(),
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
|
|
@ -307,7 +348,6 @@ mod tests {
|
|||
let revoke2 = svc
|
||||
.revoke_credential(Request::new(RevokeCredentialRequest {
|
||||
access_key_id: created.access_key_id.clone(),
|
||||
reason: "again".into(),
|
||||
}))
|
||||
.await
|
||||
.unwrap()
|
||||
|
|
@ -330,6 +370,9 @@ mod tests {
|
|||
let expired = CredentialRecord {
|
||||
access_key_id: "expired-ak".into(),
|
||||
principal_id: "p1".into(),
|
||||
principal_kind: TypesPrincipalKind::ServiceAccount,
|
||||
org_id: Some("org-a".into()),
|
||||
project_id: Some("project-a".into()),
|
||||
created_at: now_ts(),
|
||||
expires_at: Some(now_ts() - 10),
|
||||
revoked: false,
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
mod conversions;
|
||||
mod credential_service;
|
||||
mod gateway_auth_service;
|
||||
mod generated;
|
||||
pub mod iam_service;
|
||||
|
|
@ -8,7 +9,10 @@ pub mod proto {
|
|||
pub use crate::generated::iam::v1::*;
|
||||
}
|
||||
|
||||
pub use generated::iam::v1::{iam_admin_server, iam_authz_server, iam_token_server};
|
||||
pub use generated::iam::v1::{
|
||||
iam_admin_server, iam_authz_server, iam_credential_server, iam_token_server,
|
||||
};
|
||||
pub use credential_service::IamCredentialService;
|
||||
pub use gateway_auth_service::GatewayAuthServiceImpl;
|
||||
pub use iam_service::{IamAdminService, IamAuthzService};
|
||||
pub use token_service::IamTokenService;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
//!
|
||||
//! Provides a thin gRPC client for interacting with the IAM service.
|
||||
|
||||
use std::future::Future;
|
||||
use std::time::Duration;
|
||||
|
||||
use iam_api::proto::{
|
||||
|
|
@ -19,6 +20,10 @@ use iam_types::{
|
|||
};
|
||||
use tonic::transport::{Channel, ClientTlsConfig, Endpoint};
|
||||
|
||||
const TRANSIENT_RPC_RETRY_ATTEMPTS: usize = 3;
|
||||
const TRANSIENT_RPC_INITIAL_BACKOFF: Duration = Duration::from_millis(200);
|
||||
const TRANSIENT_RPC_MAX_BACKOFF: Duration = Duration::from_millis(1_000);
|
||||
|
||||
/// Configuration for the IAM client
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IamClientConfig {
|
||||
|
|
@ -100,6 +105,40 @@ impl IamClient {
|
|||
IamTokenClient::new(self.channel.clone())
|
||||
}
|
||||
|
||||
async fn call_with_retry<T, F, Fut>(operation: &'static str, mut op: F) -> Result<T>
|
||||
where
|
||||
F: FnMut() -> Fut,
|
||||
Fut: Future<Output = std::result::Result<T, tonic::Status>>,
|
||||
{
|
||||
let mut last_status = None;
|
||||
for attempt in 0..TRANSIENT_RPC_RETRY_ATTEMPTS {
|
||||
match op().await {
|
||||
Ok(value) => return Ok(value),
|
||||
Err(status)
|
||||
if attempt + 1 < TRANSIENT_RPC_RETRY_ATTEMPTS
|
||||
&& is_retryable_status(&status) =>
|
||||
{
|
||||
let delay = retry_delay(attempt);
|
||||
tracing::warn!(
|
||||
operation,
|
||||
attempt = attempt + 1,
|
||||
retry_after_ms = delay.as_millis() as u64,
|
||||
code = ?status.code(),
|
||||
message = status.message(),
|
||||
"retrying transient IAM RPC"
|
||||
);
|
||||
last_status = Some(status);
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
Err(status) => return Err(map_status(status)),
|
||||
}
|
||||
}
|
||||
|
||||
Err(map_status(last_status.unwrap_or_else(|| {
|
||||
tonic::Status::internal(format!("IAM RPC {operation} failed without a status"))
|
||||
})))
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Authorization APIs
|
||||
// ========================================================================
|
||||
|
|
@ -128,7 +167,6 @@ impl IamClient {
|
|||
resource: &Resource,
|
||||
context: std::collections::HashMap<String, String>,
|
||||
) -> Result<bool> {
|
||||
let mut client = self.authz_client();
|
||||
let request = AuthorizeRequest {
|
||||
principal: Some(to_proto_principal_ref(&principal.to_ref())),
|
||||
action: action.to_string(),
|
||||
|
|
@ -151,10 +189,12 @@ impl IamClient {
|
|||
}),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.authorize(request)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("authorize", || {
|
||||
let mut client = self.authz_client();
|
||||
let request = request.clone();
|
||||
async move { client.authorize(request).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
Ok(resp.allowed)
|
||||
|
|
@ -166,7 +206,6 @@ impl IamClient {
|
|||
|
||||
/// Create a new user
|
||||
pub async fn create_user(&self, id: &str, name: &str) -> Result<Principal> {
|
||||
let mut client = self.admin_client();
|
||||
let req = CreatePrincipalRequest {
|
||||
id: id.into(),
|
||||
kind: ProtoPrincipalKind::User as i32,
|
||||
|
|
@ -177,25 +216,31 @@ impl IamClient {
|
|||
metadata: Default::default(),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.create_principal(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("create_principal", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.create_principal(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(ProtoPrincipal::into(resp))
|
||||
}
|
||||
|
||||
/// Get a principal
|
||||
pub async fn get_principal(&self, principal_ref: &PrincipalRef) -> Result<Option<Principal>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = GetPrincipalRequest {
|
||||
principal: Some(to_proto_principal_ref(principal_ref)),
|
||||
};
|
||||
let resp = client.get_principal(req).await;
|
||||
let resp = Self::call_with_retry("get_principal", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.get_principal(req).await }
|
||||
})
|
||||
.await;
|
||||
match resp {
|
||||
Ok(r) => Ok(Some(ProtoPrincipal::into(r.into_inner()))),
|
||||
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
|
||||
Err(status) => Err(map_status(status)),
|
||||
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -206,7 +251,6 @@ impl IamClient {
|
|||
name: &str,
|
||||
project_id: &str,
|
||||
) -> Result<Principal> {
|
||||
let mut client = self.admin_client();
|
||||
let req = CreatePrincipalRequest {
|
||||
id: id.into(),
|
||||
kind: ProtoPrincipalKind::ServiceAccount as i32,
|
||||
|
|
@ -216,17 +260,18 @@ impl IamClient {
|
|||
email: None,
|
||||
metadata: Default::default(),
|
||||
};
|
||||
let resp = client
|
||||
.create_principal(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("create_service_account", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.create_principal(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(ProtoPrincipal::into(resp))
|
||||
}
|
||||
|
||||
/// List users
|
||||
pub async fn list_users(&self) -> Result<Vec<Principal>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = ListPrincipalsRequest {
|
||||
kind: Some(ProtoPrincipalKind::User as i32),
|
||||
org_id: None,
|
||||
|
|
@ -235,10 +280,12 @@ impl IamClient {
|
|||
page_token: String::new(),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.list_principals(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("list_principals", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.list_principals(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
Ok(resp
|
||||
|
|
@ -254,36 +301,40 @@ impl IamClient {
|
|||
|
||||
/// Get a role by name
|
||||
pub async fn get_role(&self, name: &str) -> Result<Option<Role>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = GetRoleRequest { name: name.into() };
|
||||
let resp = client.get_role(req).await;
|
||||
let resp = Self::call_with_retry("get_role", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.get_role(req).await }
|
||||
})
|
||||
.await;
|
||||
match resp {
|
||||
Ok(r) => Ok(Some(r.into_inner().into())),
|
||||
Err(status) if status.code() == tonic::Code::NotFound => Ok(None),
|
||||
Err(status) => Err(map_status(status)),
|
||||
Err(Error::Internal(message)) if tonic_not_found(&message) => Ok(None),
|
||||
Err(err) => Err(err),
|
||||
}
|
||||
}
|
||||
|
||||
/// List all roles
|
||||
pub async fn list_roles(&self) -> Result<Vec<Role>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = ListRolesRequest {
|
||||
scope: None,
|
||||
include_builtin: true,
|
||||
page_size: 0,
|
||||
page_token: String::new(),
|
||||
};
|
||||
let resp = client
|
||||
.list_roles(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("list_roles", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.list_roles(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.roles.into_iter().map(Into::into).collect())
|
||||
}
|
||||
|
||||
/// Create a custom role
|
||||
pub async fn create_role(&self, role: &Role) -> Result<Role> {
|
||||
let mut client = self.admin_client();
|
||||
let req = CreateRoleRequest {
|
||||
name: role.name.clone(),
|
||||
display_name: role.display_name.clone(),
|
||||
|
|
@ -297,10 +348,12 @@ impl IamClient {
|
|||
.collect(),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.create_role(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("create_role", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.create_role(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
|
@ -311,7 +364,6 @@ impl IamClient {
|
|||
|
||||
/// Create a policy binding
|
||||
pub async fn create_binding(&self, binding: &PolicyBinding) -> Result<PolicyBinding> {
|
||||
let mut client = self.admin_client();
|
||||
let req = CreateBindingRequest {
|
||||
principal: Some(to_proto_principal_ref(&binding.principal_ref)),
|
||||
role: binding.role_ref.clone(),
|
||||
|
|
@ -320,24 +372,27 @@ impl IamClient {
|
|||
expires_at: binding.expires_at,
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.create_binding(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("create_binding", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.create_binding(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.into())
|
||||
}
|
||||
|
||||
/// Delete a policy binding
|
||||
pub async fn delete_binding(&self, binding_id: &str) -> Result<bool> {
|
||||
let mut client = self.admin_client();
|
||||
let req = DeleteBindingRequest {
|
||||
id: binding_id.into(),
|
||||
};
|
||||
let resp = client
|
||||
.delete_binding(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("delete_binding", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.delete_binding(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.deleted)
|
||||
}
|
||||
|
|
@ -347,7 +402,6 @@ impl IamClient {
|
|||
&self,
|
||||
principal: &PrincipalRef,
|
||||
) -> Result<Vec<PolicyBinding>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = ListBindingsRequest {
|
||||
principal: Some(to_proto_principal_ref(principal)),
|
||||
role: None,
|
||||
|
|
@ -357,17 +411,18 @@ impl IamClient {
|
|||
page_token: String::new(),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.list_bindings(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("list_bindings_for_principal", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.list_bindings(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.bindings.into_iter().map(Into::into).collect())
|
||||
}
|
||||
|
||||
/// List bindings for a scope
|
||||
pub async fn list_bindings_for_scope(&self, scope: &Scope) -> Result<Vec<PolicyBinding>> {
|
||||
let mut client = self.admin_client();
|
||||
let req = ListBindingsRequest {
|
||||
principal: None,
|
||||
role: None,
|
||||
|
|
@ -377,10 +432,12 @@ impl IamClient {
|
|||
page_token: String::new(),
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.list_bindings(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("list_bindings_for_scope", || {
|
||||
let mut client = self.admin_client();
|
||||
let req = req.clone();
|
||||
async move { client.list_bindings(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.bindings.into_iter().map(Into::into).collect())
|
||||
}
|
||||
|
|
@ -397,7 +454,6 @@ impl IamClient {
|
|||
scope: Scope,
|
||||
ttl_seconds: u64,
|
||||
) -> Result<String> {
|
||||
let mut client = self.token_client();
|
||||
let req = IssueTokenRequest {
|
||||
principal_id: principal.id.clone(),
|
||||
principal_kind: match principal.kind {
|
||||
|
|
@ -410,24 +466,27 @@ impl IamClient {
|
|||
ttl_seconds,
|
||||
};
|
||||
|
||||
let resp = client
|
||||
.issue_token(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("issue_token", || {
|
||||
let mut client = self.token_client();
|
||||
let req = req.clone();
|
||||
async move { client.issue_token(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(resp.token)
|
||||
}
|
||||
|
||||
/// Validate a token
|
||||
pub async fn validate_token(&self, token: &str) -> Result<InternalTokenClaims> {
|
||||
let mut client = self.token_client();
|
||||
let req = ValidateTokenRequest {
|
||||
token: token.to_string(),
|
||||
};
|
||||
let resp = client
|
||||
.validate_token(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
let resp = Self::call_with_retry("validate_token", || {
|
||||
let mut client = self.token_client();
|
||||
let req = req.clone();
|
||||
async move { client.validate_token(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
|
||||
if !resp.valid {
|
||||
|
|
@ -479,20 +538,55 @@ impl IamClient {
|
|||
|
||||
/// Revoke a token
|
||||
pub async fn revoke_token(&self, token: &str) -> Result<()> {
|
||||
let mut client = self.token_client();
|
||||
let req = RevokeTokenRequest {
|
||||
token: token.to_string(),
|
||||
reason: "client revoke".into(),
|
||||
};
|
||||
client
|
||||
.revoke_token(req)
|
||||
.await
|
||||
.map_err(map_status)?
|
||||
Self::call_with_retry("revoke_token", || {
|
||||
let mut client = self.token_client();
|
||||
let req = req.clone();
|
||||
async move { client.revoke_token(req).await }
|
||||
})
|
||||
.await?
|
||||
.into_inner();
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn retry_delay(attempt: usize) -> Duration {
|
||||
TRANSIENT_RPC_INITIAL_BACKOFF
|
||||
.saturating_mul(1u32 << attempt.min(3))
|
||||
.min(TRANSIENT_RPC_MAX_BACKOFF)
|
||||
}
|
||||
|
||||
fn is_retryable_status(status: &tonic::Status) -> bool {
|
||||
matches!(
|
||||
status.code(),
|
||||
tonic::Code::Unavailable
|
||||
| tonic::Code::Cancelled
|
||||
| tonic::Code::DeadlineExceeded
|
||||
| tonic::Code::Unknown
|
||||
) || retryable_message(status.message())
|
||||
}
|
||||
|
||||
fn retryable_message(message: &str) -> bool {
|
||||
let lower = message.to_ascii_lowercase();
|
||||
[
|
||||
"transport error",
|
||||
"connection was not ready",
|
||||
"h2 protocol error",
|
||||
"broken pipe",
|
||||
"connection refused",
|
||||
"connection reset",
|
||||
]
|
||||
.iter()
|
||||
.any(|needle| lower.contains(needle))
|
||||
}
|
||||
|
||||
fn tonic_not_found(message: &str) -> bool {
|
||||
message.contains("status: NotFound") || message.contains("code: NotFound")
|
||||
}
|
||||
|
||||
fn map_status(status: tonic::Status) -> Error {
|
||||
Error::Internal(status.to_string())
|
||||
}
|
||||
|
|
@ -507,3 +601,75 @@ fn to_proto_principal_ref(principal_ref: &PrincipalRef) -> ProtoPrincipalRef {
|
|||
id: principal_ref.id.clone(),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
};
|
||||
|
||||
#[test]
|
||||
fn retryable_message_covers_connection_readiness() {
|
||||
assert!(retryable_message("transport error"));
|
||||
assert!(retryable_message("connection was not ready"));
|
||||
assert!(retryable_message("h2 protocol error"));
|
||||
assert!(!retryable_message("permission denied"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn retry_delay_is_capped() {
|
||||
assert_eq!(retry_delay(0), Duration::from_millis(200));
|
||||
assert_eq!(retry_delay(1), Duration::from_millis(400));
|
||||
assert_eq!(retry_delay(2), Duration::from_millis(800));
|
||||
assert_eq!(retry_delay(3), Duration::from_millis(1000));
|
||||
assert_eq!(retry_delay(7), Duration::from_millis(1000));
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn call_with_retry_retries_transient_statuses() {
|
||||
let attempts = Arc::new(AtomicUsize::new(0));
|
||||
let attempts_for_task = attempts.clone();
|
||||
let task = tokio::spawn(async move {
|
||||
IamClient::call_with_retry("test", || {
|
||||
let attempts = attempts_for_task.clone();
|
||||
async move {
|
||||
let attempt = attempts.fetch_add(1, Ordering::SeqCst);
|
||||
if attempt < 2 {
|
||||
Err(tonic::Status::unavailable("connection was not ready"))
|
||||
} else {
|
||||
Ok("ok")
|
||||
}
|
||||
}
|
||||
})
|
||||
.await
|
||||
});
|
||||
|
||||
tokio::time::advance(Duration::from_secs(3)).await;
|
||||
assert_eq!(task.await.unwrap().unwrap(), "ok");
|
||||
assert_eq!(attempts.load(Ordering::SeqCst), 3);
|
||||
}
|
||||
|
||||
#[tokio::test(start_paused = true)]
|
||||
async fn call_with_retry_stops_on_non_retryable_status() {
|
||||
let attempts = Arc::new(AtomicUsize::new(0));
|
||||
let attempts_for_task = attempts.clone();
|
||||
|
||||
let err = IamClient::call_with_retry("test", || {
|
||||
let attempts = attempts_for_task.clone();
|
||||
async move {
|
||||
attempts.fetch_add(1, Ordering::SeqCst);
|
||||
Err::<(), _>(tonic::Status::permission_denied("nope"))
|
||||
}
|
||||
})
|
||||
.await
|
||||
.unwrap_err();
|
||||
|
||||
assert_eq!(attempts.load(Ordering::SeqCst), 1);
|
||||
match err {
|
||||
Error::Internal(message) => assert!(message.contains("PermissionDenied")),
|
||||
other => panic!("unexpected error: {other:?}"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -20,12 +20,15 @@ use tracing::{info, warn};
|
|||
|
||||
use iam_api::{
|
||||
iam_admin_server::IamAdminServer, iam_authz_server::IamAuthzServer,
|
||||
iam_token_server::IamTokenServer, GatewayAuthServiceImpl, GatewayAuthServiceServer,
|
||||
IamAdminService, IamAuthzService, IamTokenService,
|
||||
iam_credential_server::IamCredentialServer, iam_token_server::IamTokenServer,
|
||||
GatewayAuthServiceImpl, GatewayAuthServiceServer, IamAdminService, IamAuthzService,
|
||||
IamCredentialService, IamTokenService,
|
||||
};
|
||||
use iam_authn::{InternalTokenConfig, InternalTokenService, SigningKey};
|
||||
use iam_authz::{PolicyCache, PolicyCacheConfig, PolicyEvaluator};
|
||||
use iam_store::{Backend, BackendConfig, BindingStore, PrincipalStore, RoleStore, TokenStore};
|
||||
use iam_store::{
|
||||
Backend, BackendConfig, BindingStore, CredentialStore, PrincipalStore, RoleStore, TokenStore,
|
||||
};
|
||||
|
||||
use config::{BackendKind, ServerConfig};
|
||||
|
||||
|
|
@ -190,6 +193,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
let principal_store = Arc::new(PrincipalStore::new(backend.clone()));
|
||||
let role_store = Arc::new(RoleStore::new(backend.clone()));
|
||||
let binding_store = Arc::new(BindingStore::new(backend.clone()));
|
||||
let credential_store = Arc::new(CredentialStore::new(backend.clone()));
|
||||
let token_store = Arc::new(TokenStore::new(backend.clone()));
|
||||
|
||||
// Initialize builtin roles
|
||||
|
|
@ -238,7 +242,8 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
)
|
||||
};
|
||||
|
||||
let token_config = InternalTokenConfig::new(signing_key, &config.authn.internal_token.issuer)
|
||||
let token_config =
|
||||
InternalTokenConfig::new(signing_key.clone(), &config.authn.internal_token.issuer)
|
||||
.with_default_ttl(Duration::from_secs(
|
||||
config.authn.internal_token.default_ttl_seconds,
|
||||
))
|
||||
|
|
@ -248,6 +253,16 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
|
||||
let token_service = Arc::new(InternalTokenService::new(token_config));
|
||||
let admin_token = load_admin_token();
|
||||
let credential_master_key = std::env::var("IAM_CRED_MASTER_KEY")
|
||||
.ok()
|
||||
.map(|value| value.into_bytes())
|
||||
.filter(|value| value.len() == 32)
|
||||
.unwrap_or_else(|| {
|
||||
warn!(
|
||||
"IAM_CRED_MASTER_KEY missing or not 32 bytes, deriving credential key from signing key",
|
||||
);
|
||||
signing_key.sign(b"iam-credential-master-key")
|
||||
});
|
||||
|
||||
// Create gRPC services
|
||||
let authz_service = IamAuthzService::new(evaluator.clone(), principal_store.clone());
|
||||
|
|
@ -262,6 +277,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
token_store.clone(),
|
||||
evaluator.clone(),
|
||||
);
|
||||
let credential_service =
|
||||
IamCredentialService::new(credential_store, &credential_master_key, "iam-cred-master")
|
||||
.map_err(|e| format!("Failed to initialize credential service: {}", e))?;
|
||||
let admin_service = IamAdminService::new(
|
||||
principal_store.clone(),
|
||||
role_store.clone(),
|
||||
|
|
@ -291,6 +309,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
health_reporter
|
||||
.set_serving::<IamTokenServer<IamTokenService>>()
|
||||
.await;
|
||||
health_reporter
|
||||
.set_serving::<IamCredentialServer<IamCredentialService>>()
|
||||
.await;
|
||||
health_reporter
|
||||
.set_serving::<IamAdminServer<IamAdminService>>()
|
||||
.await;
|
||||
|
|
@ -357,6 +378,7 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
.add_service(health_service)
|
||||
.add_service(IamAuthzServer::new(authz_service))
|
||||
.add_service(IamTokenServer::new(token_grpc_service))
|
||||
.add_service(IamCredentialServer::new(credential_service))
|
||||
.add_service(GatewayAuthServiceServer::new(gateway_auth_service))
|
||||
.add_service(admin_server)
|
||||
.serve(config.server.addr);
|
||||
|
|
|
|||
|
|
@ -9,5 +9,6 @@ iam-client = { path = "../iam-client" }
|
|||
iam-types = { path = "../iam-types" }
|
||||
tonic = { workspace = true }
|
||||
tracing = { workspace = true }
|
||||
tokio = { workspace = true }
|
||||
http = "1"
|
||||
serde_json = "1"
|
||||
|
|
|
|||
|
|
@ -16,6 +16,9 @@ use tracing::{debug, warn};
|
|||
const PHOTON_AUTH_TOKEN_HEADER: &str = "x-photon-auth-token";
|
||||
const DEFAULT_TOKEN_CACHE_TTL_MS: u64 = 5_000;
|
||||
const DEFAULT_AUTHZ_CACHE_TTL_MS: u64 = 3_000;
|
||||
const AUTH_CONNECT_RETRY_ATTEMPTS: usize = 6;
|
||||
const AUTH_CONNECT_INITIAL_BACKOFF: Duration = Duration::from_millis(500);
|
||||
const AUTH_CONNECT_MAX_BACKOFF: Duration = Duration::from_secs(5);
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct CacheEntry<T> {
|
||||
|
|
@ -64,9 +67,7 @@ impl AuthService {
|
|||
config = config.without_tls();
|
||||
}
|
||||
|
||||
let iam_client = IamClient::connect(config)
|
||||
.await
|
||||
.map_err(|e| format!("Failed to connect to IAM server: {}", e))?;
|
||||
let iam_client = connect_iam_with_retry(config).await?;
|
||||
|
||||
Ok(Self {
|
||||
iam_client: Arc::new(iam_client),
|
||||
|
|
@ -273,6 +274,59 @@ impl AuthService {
|
|||
}
|
||||
}
|
||||
|
||||
async fn connect_iam_with_retry(config: IamClientConfig) -> Result<IamClient, String> {
|
||||
let mut last_error = None;
|
||||
for attempt in 0..AUTH_CONNECT_RETRY_ATTEMPTS {
|
||||
match IamClient::connect(config.clone()).await {
|
||||
Ok(client) => return Ok(client),
|
||||
Err(err)
|
||||
if attempt + 1 < AUTH_CONNECT_RETRY_ATTEMPTS
|
||||
&& retryable_connect_error(&err.to_string()) =>
|
||||
{
|
||||
let delay = auth_connect_retry_delay(attempt);
|
||||
warn!(
|
||||
attempt = attempt + 1,
|
||||
retry_after_ms = delay.as_millis() as u64,
|
||||
error = %err,
|
||||
"retrying IAM auth service bootstrap connection"
|
||||
);
|
||||
last_error = Some(err.to_string());
|
||||
tokio::time::sleep(delay).await;
|
||||
}
|
||||
Err(err) => {
|
||||
return Err(format!("Failed to connect to IAM server: {}", err));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(format!(
|
||||
"Failed to connect to IAM server: {}",
|
||||
last_error.unwrap_or_else(|| "unknown connection error".to_string())
|
||||
))
|
||||
}
|
||||
|
||||
fn auth_connect_retry_delay(attempt: usize) -> Duration {
|
||||
AUTH_CONNECT_INITIAL_BACKOFF
|
||||
.saturating_mul(1u32 << attempt.min(4))
|
||||
.min(AUTH_CONNECT_MAX_BACKOFF)
|
||||
}
|
||||
|
||||
fn retryable_connect_error(message: &str) -> bool {
|
||||
let lower = message.to_ascii_lowercase();
|
||||
[
|
||||
"transport error",
|
||||
"connection refused",
|
||||
"connection was not ready",
|
||||
"operation timed out",
|
||||
"deadline has elapsed",
|
||||
"dns error",
|
||||
"broken pipe",
|
||||
"connection reset",
|
||||
]
|
||||
.iter()
|
||||
.any(|needle| lower.contains(needle))
|
||||
}
|
||||
|
||||
fn prune_expired<T>(cache: &mut HashMap<String, CacheEntry<T>>) {
|
||||
let now = Instant::now();
|
||||
cache.retain(|_, entry| entry.expires_at > now);
|
||||
|
|
@ -400,6 +454,29 @@ fn extract_token_from_metadata(metadata: &MetadataMap) -> Result<String, Status>
|
|||
))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn retryable_connect_error_matches_transport_failures() {
|
||||
assert!(retryable_connect_error("Internal error: transport error"));
|
||||
assert!(retryable_connect_error("connection was not ready"));
|
||||
assert!(retryable_connect_error("deadline has elapsed"));
|
||||
assert!(!retryable_connect_error("permission denied"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn auth_connect_retry_delay_is_capped() {
|
||||
assert_eq!(auth_connect_retry_delay(0), Duration::from_millis(500));
|
||||
assert_eq!(auth_connect_retry_delay(1), Duration::from_millis(1000));
|
||||
assert_eq!(auth_connect_retry_delay(2), Duration::from_millis(2000));
|
||||
assert_eq!(auth_connect_retry_delay(3), Duration::from_millis(4000));
|
||||
assert_eq!(auth_connect_retry_delay(4), Duration::from_secs(5));
|
||||
assert_eq!(auth_connect_retry_delay(8), Duration::from_secs(5));
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_token_from_headers(headers: &HeaderMap) -> Result<String, Status> {
|
||||
if let Some(auth_header) = headers.get(AUTHORIZATION) {
|
||||
let auth_str = auth_header
|
||||
|
|
|
|||
|
|
@ -1,24 +1,25 @@
|
|||
//! Credential storage (access/secret key metadata)
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use iam_types::{CredentialRecord, Result};
|
||||
|
||||
use crate::backend::JsonStore;
|
||||
use crate::{DynMetadataClient, MetadataClient};
|
||||
use crate::backend::{Backend, CasResult, JsonStore, StorageBackend};
|
||||
|
||||
/// Store for credentials (S3/API keys)
|
||||
pub struct CredentialStore {
|
||||
client: DynMetadataClient,
|
||||
backend: Arc<Backend>,
|
||||
}
|
||||
|
||||
impl JsonStore for CredentialStore {
|
||||
fn client(&self) -> &dyn MetadataClient {
|
||||
self.client.as_ref()
|
||||
fn backend(&self) -> &Backend {
|
||||
&self.backend
|
||||
}
|
||||
}
|
||||
|
||||
impl CredentialStore {
|
||||
pub fn new(client: DynMetadataClient) -> Self {
|
||||
Self { client }
|
||||
pub fn new(backend: Arc<Backend>) -> Self {
|
||||
Self { backend }
|
||||
}
|
||||
|
||||
pub async fn put(&self, record: &CredentialRecord) -> Result<u64> {
|
||||
|
|
@ -36,13 +37,17 @@ impl CredentialStore {
|
|||
principal_id: &str,
|
||||
limit: u32,
|
||||
) -> Result<Vec<CredentialRecord>> {
|
||||
// scan prefix and filter by principal_id; small cardinality expected
|
||||
let prefix = b"iam/credentials/";
|
||||
let items = self.scan_prefix_json::<CredentialRecord>(prefix, limit).await?;
|
||||
Ok(items
|
||||
.into_iter()
|
||||
.filter(|rec| rec.principal_id == principal_id)
|
||||
.collect())
|
||||
let items = self.backend.scan_prefix(prefix, limit).await?;
|
||||
let mut credentials = Vec::new();
|
||||
for pair in items {
|
||||
let record: CredentialRecord = serde_json::from_slice(&pair.value)
|
||||
.map_err(|e| iam_types::Error::Serialization(e.to_string()))?;
|
||||
if record.principal_id == principal_id {
|
||||
credentials.push(record);
|
||||
}
|
||||
}
|
||||
Ok(credentials)
|
||||
}
|
||||
|
||||
pub async fn revoke(&self, access_key_id: &str) -> Result<bool> {
|
||||
|
|
@ -56,13 +61,10 @@ impl CredentialStore {
|
|||
return Ok(false);
|
||||
}
|
||||
record.revoked = true;
|
||||
match self
|
||||
.cas_json(key.as_bytes(), version, &record)
|
||||
.await?
|
||||
{
|
||||
crate::CasResult::Success(_) => Ok(true),
|
||||
crate::CasResult::Conflict { .. } => Ok(false),
|
||||
crate::CasResult::NotFound => Ok(false),
|
||||
match self.cas_json(key.as_bytes(), version, &record).await? {
|
||||
CasResult::Success(_) => Ok(true),
|
||||
CasResult::Conflict { .. } => Ok(false),
|
||||
CasResult::NotFound => Ok(false),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
pub mod backend;
|
||||
pub mod binding_store;
|
||||
pub mod credential_store;
|
||||
pub mod group_store;
|
||||
pub mod principal_store;
|
||||
pub mod role_store;
|
||||
|
|
@ -14,6 +15,7 @@ pub mod token_store;
|
|||
|
||||
pub use backend::{Backend, BackendConfig, CasResult, KvPair, StorageBackend};
|
||||
pub use binding_store::BindingStore;
|
||||
pub use credential_store::CredentialStore;
|
||||
pub use group_store::GroupStore;
|
||||
pub use principal_store::PrincipalStore;
|
||||
pub use role_store::RoleStore;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,8 @@
|
|||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::PrincipalKind;
|
||||
|
||||
/// Argon2 parameters used to hash the secret key
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct Argon2Params {
|
||||
|
|
@ -17,6 +19,9 @@ pub struct Argon2Params {
|
|||
pub struct CredentialRecord {
|
||||
pub access_key_id: String,
|
||||
pub principal_id: String,
|
||||
pub principal_kind: PrincipalKind,
|
||||
pub org_id: Option<String>,
|
||||
pub project_id: Option<String>,
|
||||
pub created_at: u64,
|
||||
pub expires_at: Option<u64>,
|
||||
pub revoked: bool,
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@
|
|||
//! - Error types
|
||||
|
||||
pub mod condition;
|
||||
pub mod credential;
|
||||
pub mod error;
|
||||
pub mod policy;
|
||||
pub mod principal;
|
||||
|
|
@ -19,6 +20,7 @@ pub mod scope;
|
|||
pub mod token;
|
||||
|
||||
pub use condition::{Condition, ConditionExpr};
|
||||
pub use credential::{Argon2Params, CredentialRecord};
|
||||
pub use error::{Error, IamError, Result, StorageError};
|
||||
pub use policy::{CreateBindingRequest, EffectivePolicy, PolicyBinding};
|
||||
pub use principal::{Principal, PrincipalKind, PrincipalRef};
|
||||
|
|
|
|||
|
|
@ -89,6 +89,14 @@ service IamToken {
|
|||
rpc RefreshToken(RefreshTokenRequest) returns (RefreshTokenResponse);
|
||||
}
|
||||
|
||||
// IamCredential manages S3-style access/secret key credentials.
|
||||
service IamCredential {
|
||||
rpc CreateS3Credential(CreateS3CredentialRequest) returns (CreateS3CredentialResponse);
|
||||
rpc GetSecretKey(GetSecretKeyRequest) returns (GetSecretKeyResponse);
|
||||
rpc ListCredentials(ListCredentialsRequest) returns (ListCredentialsResponse);
|
||||
rpc RevokeCredential(RevokeCredentialRequest) returns (RevokeCredentialResponse);
|
||||
}
|
||||
|
||||
message IssueTokenRequest {
|
||||
// Principal to issue token for
|
||||
string principal_id = 1;
|
||||
|
|
@ -162,6 +170,63 @@ message RefreshTokenResponse {
|
|||
uint64 expires_at = 2;
|
||||
}
|
||||
|
||||
message CreateS3CredentialRequest {
|
||||
string principal_id = 1;
|
||||
string description = 2;
|
||||
optional uint64 expires_at = 3;
|
||||
optional string org_id = 4;
|
||||
optional string project_id = 5;
|
||||
PrincipalKind principal_kind = 6;
|
||||
}
|
||||
|
||||
message CreateS3CredentialResponse {
|
||||
string access_key_id = 1;
|
||||
string secret_key = 2;
|
||||
uint64 created_at = 3;
|
||||
optional uint64 expires_at = 4;
|
||||
}
|
||||
|
||||
message GetSecretKeyRequest {
|
||||
string access_key_id = 1;
|
||||
}
|
||||
|
||||
message GetSecretKeyResponse {
|
||||
string secret_key = 1;
|
||||
string principal_id = 2;
|
||||
optional uint64 expires_at = 3;
|
||||
optional string org_id = 4;
|
||||
optional string project_id = 5;
|
||||
PrincipalKind principal_kind = 6;
|
||||
}
|
||||
|
||||
message ListCredentialsRequest {
|
||||
string principal_id = 1;
|
||||
}
|
||||
|
||||
message Credential {
|
||||
string access_key_id = 1;
|
||||
string principal_id = 2;
|
||||
uint64 created_at = 3;
|
||||
optional uint64 expires_at = 4;
|
||||
bool revoked = 5;
|
||||
string description = 6;
|
||||
optional string org_id = 7;
|
||||
optional string project_id = 8;
|
||||
PrincipalKind principal_kind = 9;
|
||||
}
|
||||
|
||||
message ListCredentialsResponse {
|
||||
repeated Credential credentials = 1;
|
||||
}
|
||||
|
||||
message RevokeCredentialRequest {
|
||||
string access_key_id = 1;
|
||||
}
|
||||
|
||||
message RevokeCredentialResponse {
|
||||
bool success = 1;
|
||||
}
|
||||
|
||||
message InternalTokenClaims {
|
||||
string principal_id = 1;
|
||||
PrincipalKind principal_kind = 2;
|
||||
|
|
|
|||
796
k8shost/Cargo.lock
generated
796
k8shost/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
588
lightningstor/Cargo.lock
generated
588
lightningstor/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -10,6 +10,8 @@ use crate::node::{NodeClientTrait, NodeRegistry};
|
|||
use crate::placement::{ConsistentHashSelector, NodeSelector};
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
|
||||
use lightningstor_types::ObjectId;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
|
@ -336,7 +338,7 @@ impl ErasureCodedBackend {
|
|||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
|
||||
// Try to read all shards in parallel
|
||||
let mut shard_futures = Vec::with_capacity(self.total_shards());
|
||||
let mut shard_futures = FuturesUnordered::new();
|
||||
for shard_idx in 0..self.total_shards() {
|
||||
let is_parity = shard_idx >= self.data_shards;
|
||||
let chunk_id = ChunkId::new(object_id, chunk_index, shard_idx, is_parity);
|
||||
|
|
@ -345,35 +347,73 @@ impl ErasureCodedBackend {
|
|||
let chunk_key = chunk_id.to_key();
|
||||
|
||||
shard_futures.push(async move {
|
||||
// Try to read from the preferred node first
|
||||
if let Ok(node) = node_selector.select_for_read(&nodes, &chunk_key).await {
|
||||
if let Ok(data) = node
|
||||
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
|
||||
let preferred_id = node_selector
|
||||
.select_for_read(&nodes, &chunk_key)
|
||||
.await
|
||||
.ok()
|
||||
.map(|node| node.node_id().to_string());
|
||||
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
|
||||
FuturesUnordered::new();
|
||||
|
||||
if let Some(preferred_id) = preferred_id.as_ref() {
|
||||
if let Some(preferred) = nodes
|
||||
.iter()
|
||||
.find(|node| node.node_id() == preferred_id.as_str())
|
||||
.cloned()
|
||||
{
|
||||
return Some(data);
|
||||
let key = chunk_key.clone();
|
||||
readers.push(Box::pin(async move {
|
||||
preferred
|
||||
.get_chunk(&key, shard_idx as u32, is_parity)
|
||||
.await
|
||||
.ok()
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
// Try other nodes if preferred fails
|
||||
for node in &nodes {
|
||||
if let Ok(data) = node
|
||||
.get_chunk(&chunk_key, shard_idx as u32, is_parity)
|
||||
.await
|
||||
if preferred_id
|
||||
.as_ref()
|
||||
.is_some_and(|preferred| preferred == node.node_id())
|
||||
{
|
||||
return Some(data);
|
||||
continue;
|
||||
}
|
||||
let node = node.clone();
|
||||
let key = chunk_key.clone();
|
||||
readers.push(Box::pin(async move {
|
||||
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
|
||||
}));
|
||||
}
|
||||
|
||||
while let Some(result) = readers.next().await {
|
||||
if let Some(data) = result {
|
||||
return (shard_idx, Some(data));
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
(shard_idx, None)
|
||||
});
|
||||
}
|
||||
|
||||
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
|
||||
let mut shard_results = vec![None; self.total_shards()];
|
||||
let mut available_count = 0usize;
|
||||
|
||||
while let Some((shard_idx, shard)) = shard_futures.next().await {
|
||||
if shard.is_some() {
|
||||
available_count += 1;
|
||||
}
|
||||
shard_results[shard_idx] = shard;
|
||||
|
||||
if available_count >= self.data_shards {
|
||||
break;
|
||||
}
|
||||
|
||||
if available_count + shard_futures.len() < self.data_shards {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Count available shards
|
||||
let available_count = shard_results.iter().filter(|s| s.is_some()).count();
|
||||
|
||||
debug!(
|
||||
object_id = %object_id,
|
||||
chunk_index,
|
||||
|
|
@ -419,9 +459,9 @@ impl StorageBackend for ErasureCodedBackend {
|
|||
debug!(object_id = %object_id, size = original_size, "Putting object with erasure coding");
|
||||
|
||||
// Split data into chunks
|
||||
let chunks = self.chunk_manager.split(&data);
|
||||
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
|
||||
let chunks = self.chunk_manager.split_with_chunk_size(&data, chunk_size);
|
||||
let chunk_count = chunks.len();
|
||||
let chunk_size = self.chunk_manager.chunk_size();
|
||||
|
||||
// Write each chunk
|
||||
for (chunk_idx, chunk_data) in chunks.into_iter().enumerate() {
|
||||
|
|
@ -591,24 +631,78 @@ impl StorageBackend for ErasureCodedBackend {
|
|||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
|
||||
// Try to read shards
|
||||
let mut shard_futures = Vec::with_capacity(self.total_shards());
|
||||
let mut shard_futures = FuturesUnordered::new();
|
||||
for shard_idx in 0..self.total_shards() {
|
||||
let is_parity = shard_idx >= self.data_shards;
|
||||
let key = format!("{}_{}_{}", part_key, shard_idx, if is_parity { "p" } else { "d" });
|
||||
let nodes = nodes.clone();
|
||||
let node_selector = self.node_selector.clone();
|
||||
|
||||
shard_futures.push(async move {
|
||||
let preferred_id = node_selector
|
||||
.select_for_read(&nodes, &key)
|
||||
.await
|
||||
.ok()
|
||||
.map(|node| node.node_id().to_string());
|
||||
let mut readers: FuturesUnordered<BoxFuture<'static, Option<Bytes>>> =
|
||||
FuturesUnordered::new();
|
||||
|
||||
if let Some(preferred_id) = preferred_id.as_ref() {
|
||||
if let Some(preferred) = nodes
|
||||
.iter()
|
||||
.find(|node| node.node_id() == preferred_id.as_str())
|
||||
.cloned()
|
||||
{
|
||||
let key = key.clone();
|
||||
readers.push(Box::pin(async move {
|
||||
preferred
|
||||
.get_chunk(&key, shard_idx as u32, is_parity)
|
||||
.await
|
||||
.ok()
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
for node in &nodes {
|
||||
if let Ok(data) = node.get_chunk(&key, shard_idx as u32, is_parity).await {
|
||||
return Some(data);
|
||||
if preferred_id
|
||||
.as_ref()
|
||||
.is_some_and(|preferred| preferred == node.node_id())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
let node = node.clone();
|
||||
let key = key.clone();
|
||||
readers.push(Box::pin(async move {
|
||||
node.get_chunk(&key, shard_idx as u32, is_parity).await.ok()
|
||||
}));
|
||||
}
|
||||
|
||||
while let Some(result) = readers.next().await {
|
||||
if let Some(data) = result {
|
||||
return (shard_idx, Some(data));
|
||||
}
|
||||
}
|
||||
None
|
||||
(shard_idx, None)
|
||||
});
|
||||
}
|
||||
|
||||
let shard_results: Vec<Option<Bytes>> = futures::future::join_all(shard_futures).await;
|
||||
let available = shard_results.iter().filter(|s| s.is_some()).count();
|
||||
let mut shard_results = vec![None; self.total_shards()];
|
||||
let mut available = 0usize;
|
||||
|
||||
while let Some((shard_idx, shard)) = shard_futures.next().await {
|
||||
if shard.is_some() {
|
||||
available += 1;
|
||||
}
|
||||
shard_results[shard_idx] = shard;
|
||||
|
||||
if available >= self.data_shards {
|
||||
break;
|
||||
}
|
||||
|
||||
if available + shard_futures.len() < self.data_shards {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if available < self.data_shards {
|
||||
return Err(StorageError::Backend(format!(
|
||||
|
|
@ -674,7 +768,135 @@ impl StorageBackend for ErasureCodedBackend {
|
|||
mod tests {
|
||||
use super::*;
|
||||
use crate::config::{ChunkConfig, RedundancyMode};
|
||||
use crate::node::MockNodeRegistry;
|
||||
use crate::node::{MockNodeClient, MockNodeRegistry, NodeError, NodeResult};
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::sleep;
|
||||
|
||||
struct SlowReadNodeClient {
|
||||
node_id: String,
|
||||
endpoint: String,
|
||||
delay: Duration,
|
||||
chunks: DashMap<String, Vec<u8>>,
|
||||
}
|
||||
|
||||
impl SlowReadNodeClient {
|
||||
fn new(node_id: impl Into<String>, endpoint: impl Into<String>, delay: Duration) -> Self {
|
||||
Self {
|
||||
node_id: node_id.into(),
|
||||
endpoint: endpoint.into(),
|
||||
delay,
|
||||
chunks: DashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_chunk(&self, chunk_id: impl Into<String>, data: Vec<u8>) {
|
||||
self.chunks.insert(chunk_id.into(), data);
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl NodeClientTrait for SlowReadNodeClient {
|
||||
fn node_id(&self) -> &str {
|
||||
&self.node_id
|
||||
}
|
||||
|
||||
fn endpoint(&self) -> &str {
|
||||
&self.endpoint
|
||||
}
|
||||
|
||||
async fn is_healthy(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
async fn put_chunk(
|
||||
&self,
|
||||
chunk_id: &str,
|
||||
_shard_index: u32,
|
||||
_is_parity: bool,
|
||||
data: Bytes,
|
||||
) -> NodeResult<()> {
|
||||
self.chunks.insert(chunk_id.to_string(), data.to_vec());
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_chunk(
|
||||
&self,
|
||||
chunk_id: &str,
|
||||
_shard_index: u32,
|
||||
_is_parity: bool,
|
||||
) -> NodeResult<Bytes> {
|
||||
sleep(self.delay).await;
|
||||
self.chunks
|
||||
.get(chunk_id)
|
||||
.map(|value| Bytes::from(value.value().clone()))
|
||||
.ok_or_else(|| NodeError::NotFound(chunk_id.to_string()))
|
||||
}
|
||||
|
||||
async fn delete_chunk(&self, chunk_id: &str) -> NodeResult<()> {
|
||||
self.chunks.remove(chunk_id);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn chunk_exists(&self, chunk_id: &str) -> NodeResult<bool> {
|
||||
Ok(self.chunks.contains_key(chunk_id))
|
||||
}
|
||||
|
||||
async fn chunk_size(&self, chunk_id: &str) -> NodeResult<Option<u64>> {
|
||||
Ok(self
|
||||
.chunks
|
||||
.get(chunk_id)
|
||||
.map(|value| value.value().len() as u64))
|
||||
}
|
||||
|
||||
async fn ping(&self) -> NodeResult<Duration> {
|
||||
Ok(Duration::from_millis(1))
|
||||
}
|
||||
}
|
||||
|
||||
struct FixedNodeRegistry {
|
||||
nodes: Vec<Arc<dyn NodeClientTrait>>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl NodeRegistry for FixedNodeRegistry {
|
||||
async fn get_all_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
|
||||
Ok(self.nodes.clone())
|
||||
}
|
||||
|
||||
async fn get_healthy_nodes(&self) -> NodeResult<Vec<Arc<dyn NodeClientTrait>>> {
|
||||
Ok(self.nodes.clone())
|
||||
}
|
||||
|
||||
async fn register_node(&self, _info: crate::node::NodeInfo) -> NodeResult<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn deregister_node(&self, _node_id: &str) -> NodeResult<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn update_health(&self, _node_id: &str, _healthy: bool) -> NodeResult<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_node(&self, node_id: &str) -> NodeResult<Option<Arc<dyn NodeClientTrait>>> {
|
||||
Ok(self
|
||||
.nodes
|
||||
.iter()
|
||||
.find(|node| node.node_id() == node_id)
|
||||
.cloned())
|
||||
}
|
||||
|
||||
async fn node_count(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
|
||||
async fn healthy_node_count(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
}
|
||||
|
||||
fn create_ec_config(data_shards: usize, parity_shards: usize) -> DistributedConfig {
|
||||
DistributedConfig {
|
||||
|
|
@ -858,4 +1080,162 @@ mod tests {
|
|||
assert_eq!(retrieved.len(), data.len());
|
||||
assert_eq!(retrieved, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ec_backend_read_returns_after_minimum_shards() {
|
||||
let config = create_ec_config(4, 2);
|
||||
let mut fast_nodes = Vec::new();
|
||||
for index in 0..4 {
|
||||
fast_nodes.push(Arc::new(MockNodeClient::new(
|
||||
format!("fast-{index}"),
|
||||
format!("http://fast-{index}:9002"),
|
||||
)));
|
||||
}
|
||||
let slow_a = Arc::new(SlowReadNodeClient::new(
|
||||
"slow-a",
|
||||
"http://slow-a:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
let slow_b = Arc::new(SlowReadNodeClient::new(
|
||||
"slow-b",
|
||||
"http://slow-b:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
|
||||
let backend = ErasureCodedBackend::new(
|
||||
config,
|
||||
Arc::new(FixedNodeRegistry {
|
||||
nodes: vec![
|
||||
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
|
||||
slow_a.clone() as Arc<dyn NodeClientTrait>,
|
||||
slow_b.clone() as Arc<dyn NodeClientTrait>,
|
||||
],
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let object_id = ObjectId::new();
|
||||
let data = Bytes::from(vec![5u8; 512]);
|
||||
let metadata = ObjectMetadata::new(data.len() as u64, 1, data.len());
|
||||
let meta_key = ObjectMetadata::metadata_key(&object_id);
|
||||
let shards = backend.codec.encode(&data).unwrap();
|
||||
|
||||
for fast_node in &fast_nodes {
|
||||
fast_node
|
||||
.put_chunk(&meta_key, 0, false, Bytes::from(metadata.to_bytes()))
|
||||
.await
|
||||
.unwrap();
|
||||
}
|
||||
for slow_node in [&slow_a, &slow_b] {
|
||||
slow_node.insert_chunk(meta_key.clone(), metadata.to_bytes());
|
||||
}
|
||||
|
||||
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
|
||||
let is_parity = shard_idx >= backend.data_shards;
|
||||
let key = ChunkId::new(&object_id, 0, shard_idx, is_parity).to_key();
|
||||
if shard_idx < 4 {
|
||||
fast_nodes[shard_idx]
|
||||
.put_chunk(
|
||||
&key,
|
||||
shard_idx as u32,
|
||||
is_parity,
|
||||
Bytes::from(shard_data),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
} else if shard_idx == 4 {
|
||||
slow_a.insert_chunk(key, shard_data);
|
||||
} else {
|
||||
slow_b.insert_chunk(key, shard_data);
|
||||
}
|
||||
}
|
||||
|
||||
let started = Instant::now();
|
||||
let retrieved = backend.get_object(&object_id).await.unwrap();
|
||||
let elapsed = started.elapsed();
|
||||
|
||||
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
|
||||
assert_eq!(retrieved, data);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_ec_backend_get_part_returns_after_minimum_shards() {
|
||||
let config = create_ec_config(4, 2);
|
||||
let mut fast_nodes = Vec::new();
|
||||
for index in 0..4 {
|
||||
fast_nodes.push(Arc::new(MockNodeClient::new(
|
||||
format!("fast-{index}"),
|
||||
format!("http://fast-{index}:9002"),
|
||||
)));
|
||||
}
|
||||
let slow_a = Arc::new(SlowReadNodeClient::new(
|
||||
"slow-a",
|
||||
"http://slow-a:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
let slow_b = Arc::new(SlowReadNodeClient::new(
|
||||
"slow-b",
|
||||
"http://slow-b:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
|
||||
let backend = ErasureCodedBackend::new(
|
||||
config,
|
||||
Arc::new(FixedNodeRegistry {
|
||||
nodes: vec![
|
||||
fast_nodes[0].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[1].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[2].clone() as Arc<dyn NodeClientTrait>,
|
||||
fast_nodes[3].clone() as Arc<dyn NodeClientTrait>,
|
||||
slow_a.clone() as Arc<dyn NodeClientTrait>,
|
||||
slow_b.clone() as Arc<dyn NodeClientTrait>,
|
||||
],
|
||||
}),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let upload_id = "upload-latency";
|
||||
let part_number = 7;
|
||||
let data = Bytes::from(vec![9u8; 512]);
|
||||
let shards = backend.codec.encode(&data).unwrap();
|
||||
|
||||
for (shard_idx, shard_data) in shards.into_iter().enumerate() {
|
||||
let is_parity = shard_idx >= backend.data_shards;
|
||||
let key = format!(
|
||||
"part_{}_{}_{}_{}",
|
||||
upload_id,
|
||||
part_number,
|
||||
shard_idx,
|
||||
if is_parity { "p" } else { "d" }
|
||||
);
|
||||
|
||||
if shard_idx < 4 {
|
||||
fast_nodes[shard_idx]
|
||||
.put_chunk(
|
||||
&key,
|
||||
shard_idx as u32,
|
||||
is_parity,
|
||||
Bytes::from(shard_data),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
} else if shard_idx == 4 {
|
||||
slow_a.insert_chunk(key, shard_data);
|
||||
} else {
|
||||
slow_b.insert_chunk(key, shard_data);
|
||||
}
|
||||
}
|
||||
|
||||
let started = Instant::now();
|
||||
let retrieved = backend.get_part(upload_id, part_number).await.unwrap();
|
||||
let elapsed = started.elapsed();
|
||||
|
||||
assert!(elapsed < Duration::from_millis(200), "elapsed={elapsed:?}");
|
||||
assert_eq!(retrieved, data);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,13 +5,15 @@
|
|||
|
||||
use crate::chunk::ChunkManager;
|
||||
use crate::config::DistributedConfig;
|
||||
use crate::node::{NodeClientTrait, NodeError, NodeRegistry};
|
||||
use crate::node::{NodeClientTrait, NodeError, NodeRegistry, NodeResult};
|
||||
use crate::placement::{ConsistentHashSelector, NodeSelector};
|
||||
use crate::repair::{RepairQueue, ReplicatedRepairTask};
|
||||
use async_trait::async_trait;
|
||||
use bytes::{Bytes, BytesMut};
|
||||
use futures::stream::{FuturesUnordered, StreamExt};
|
||||
use lightningstor_storage::{StorageBackend, StorageError, StorageResult};
|
||||
use lightningstor_types::ObjectId;
|
||||
use std::net::IpAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use tracing::{debug, error, warn};
|
||||
|
|
@ -81,6 +83,8 @@ pub struct ReplicatedBackend {
|
|||
read_quorum: usize,
|
||||
/// Write quorum (minimum replicas for successful write)
|
||||
write_quorum: usize,
|
||||
/// Durable queue for repairing under-replicated chunks.
|
||||
repair_queue: Option<Arc<dyn RepairQueue>>,
|
||||
}
|
||||
|
||||
impl ReplicatedBackend {
|
||||
|
|
@ -92,6 +96,15 @@ impl ReplicatedBackend {
|
|||
pub async fn new(
|
||||
config: DistributedConfig,
|
||||
node_registry: Arc<dyn NodeRegistry>,
|
||||
) -> StorageResult<Self> {
|
||||
Self::new_with_repair_queue(config, node_registry, None).await
|
||||
}
|
||||
|
||||
/// Create a replicated backend with an optional durable repair queue.
|
||||
pub async fn new_with_repair_queue(
|
||||
config: DistributedConfig,
|
||||
node_registry: Arc<dyn NodeRegistry>,
|
||||
repair_queue: Option<Arc<dyn RepairQueue>>,
|
||||
) -> StorageResult<Self> {
|
||||
let (replica_count, read_quorum, write_quorum) = match &config.redundancy {
|
||||
crate::config::RedundancyMode::Replicated {
|
||||
|
|
@ -116,6 +129,7 @@ impl ReplicatedBackend {
|
|||
replica_count,
|
||||
read_quorum,
|
||||
write_quorum,
|
||||
repair_queue,
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -134,6 +148,89 @@ impl ReplicatedBackend {
|
|||
self.write_quorum
|
||||
}
|
||||
|
||||
async fn finalize_pending_replica_writes(
|
||||
repair_queue: Option<Arc<dyn RepairQueue>>,
|
||||
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
|
||||
key: String,
|
||||
shard_index: u32,
|
||||
mut success_count: usize,
|
||||
total_replicas: usize,
|
||||
reason: String,
|
||||
) {
|
||||
let mut errors = Vec::new();
|
||||
|
||||
while let Some(result) = pending_writes.next().await {
|
||||
match result {
|
||||
Ok((_, Ok(()))) => success_count += 1,
|
||||
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
|
||||
Err(join_err) => errors.push(format!("join error: {join_err}")),
|
||||
}
|
||||
}
|
||||
|
||||
if success_count >= total_replicas {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(queue) = repair_queue {
|
||||
queue
|
||||
.enqueue_repair(ReplicatedRepairTask::new(key.clone(), shard_index, reason))
|
||||
.await;
|
||||
}
|
||||
|
||||
warn!(
|
||||
chunk_key = %key,
|
||||
shard_index,
|
||||
success_count,
|
||||
total_replicas,
|
||||
errors = ?errors,
|
||||
"Replica write completed below desired replication; repair task queued"
|
||||
);
|
||||
}
|
||||
|
||||
async fn finalize_pending_chunked_write_repairs(
|
||||
repair_queue: Option<Arc<dyn RepairQueue>>,
|
||||
mut pending_writes: FuturesUnordered<tokio::task::JoinHandle<(String, NodeResult<()>)>>,
|
||||
repair_targets: Vec<(String, u32)>,
|
||||
object_id: String,
|
||||
mut success_count: usize,
|
||||
total_replicas: usize,
|
||||
reason: String,
|
||||
) {
|
||||
let mut errors = Vec::new();
|
||||
|
||||
while let Some(result) = pending_writes.next().await {
|
||||
match result {
|
||||
Ok((_, Ok(()))) => success_count += 1,
|
||||
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
|
||||
Err(join_err) => errors.push(format!("join error: {join_err}")),
|
||||
}
|
||||
}
|
||||
|
||||
if success_count >= total_replicas {
|
||||
return;
|
||||
}
|
||||
|
||||
if let Some(queue) = repair_queue {
|
||||
for (chunk_key, shard_index) in repair_targets {
|
||||
queue
|
||||
.enqueue_repair(ReplicatedRepairTask::new(
|
||||
chunk_key,
|
||||
shard_index,
|
||||
reason.clone(),
|
||||
))
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
warn!(
|
||||
object_id = %object_id,
|
||||
success_count,
|
||||
total_replicas,
|
||||
errors = ?errors,
|
||||
"Chunked replica write completed below desired replication; repair tasks queued"
|
||||
);
|
||||
}
|
||||
|
||||
fn chunk_write_parallelism(&self, chunk_count: usize) -> usize {
|
||||
chunk_count
|
||||
.min(
|
||||
|
|
@ -220,7 +317,13 @@ impl ReplicatedBackend {
|
|||
));
|
||||
}
|
||||
|
||||
if let Ok(preferred) = self.node_selector.select_for_read(nodes, key).await {
|
||||
let mut ordered_nodes = Self::ordered_read_nodes(nodes, self
|
||||
.node_selector
|
||||
.select_for_read(nodes, key)
|
||||
.await
|
||||
.ok());
|
||||
|
||||
if let Some(preferred) = ordered_nodes.first() {
|
||||
match preferred.get_chunk(key, shard_index, false).await {
|
||||
Ok(data) => return Ok(Some(data)),
|
||||
Err(NodeError::NotFound(_)) => {}
|
||||
|
|
@ -235,7 +338,7 @@ impl ReplicatedBackend {
|
|||
}
|
||||
}
|
||||
|
||||
for node in nodes {
|
||||
for node in ordered_nodes.drain(1..) {
|
||||
match node.get_chunk(key, shard_index, false).await {
|
||||
Ok(data) => return Ok(Some(data)),
|
||||
Err(NodeError::NotFound(_)) => continue,
|
||||
|
|
@ -383,6 +486,21 @@ impl ReplicatedBackend {
|
|||
Ok((_, Ok(()))) => {
|
||||
success_count += 1;
|
||||
if success_count >= self.write_quorum {
|
||||
if success_count < total_replicas {
|
||||
let pending_writes =
|
||||
std::mem::replace(&mut write_futures, FuturesUnordered::new());
|
||||
tokio::spawn(Self::finalize_pending_replica_writes(
|
||||
self.repair_queue.clone(),
|
||||
pending_writes,
|
||||
key.clone(),
|
||||
shard_index,
|
||||
success_count,
|
||||
total_replicas,
|
||||
format!(
|
||||
"replica write completed below desired replication after quorum ({success_count}/{total_replicas})"
|
||||
),
|
||||
));
|
||||
}
|
||||
debug!(
|
||||
chunk_key = %key,
|
||||
success_count,
|
||||
|
|
@ -427,13 +545,13 @@ impl ReplicatedBackend {
|
|||
}
|
||||
|
||||
async fn write_chunked_object(&self, object_id: &ObjectId, data: Bytes) -> StorageResult<()> {
|
||||
let chunk_size = self.chunk_manager.chunk_size();
|
||||
let chunk_count = self.chunk_manager.chunk_count(data.len());
|
||||
let chunk_size = self.chunk_manager.effective_chunk_size(data.len());
|
||||
let chunk_count = ChunkManager::chunk_count_for_size(data.len(), chunk_size);
|
||||
let metadata = ReplicatedObjectMetadata::new(data.len(), chunk_count, chunk_size);
|
||||
let mut requests = Vec::with_capacity(chunk_count + 1);
|
||||
for chunk_index in 0..chunk_count {
|
||||
let chunk_key = Self::object_chunk_key(object_id, chunk_index);
|
||||
let (start, len) = self.chunk_manager.chunk_range(data.len(), chunk_index);
|
||||
let (start, len) = ChunkManager::chunk_range_for_size(data.len(), chunk_index, chunk_size);
|
||||
let chunk_bytes = data.slice(start..start + len);
|
||||
requests.push((chunk_key, chunk_index as u32, false, chunk_bytes));
|
||||
}
|
||||
|
|
@ -464,6 +582,27 @@ impl ReplicatedBackend {
|
|||
Ok((_, Ok(()))) => {
|
||||
success_count += 1;
|
||||
if success_count >= self.write_quorum {
|
||||
if success_count < total_replicas {
|
||||
let repair_targets = requests
|
||||
.iter()
|
||||
.map(|(chunk_key, shard_index, _, _)| {
|
||||
(chunk_key.clone(), *shard_index)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let pending_writes =
|
||||
std::mem::replace(&mut write_futures, FuturesUnordered::new());
|
||||
tokio::spawn(Self::finalize_pending_chunked_write_repairs(
|
||||
self.repair_queue.clone(),
|
||||
pending_writes,
|
||||
repair_targets,
|
||||
object_id.to_string(),
|
||||
success_count,
|
||||
total_replicas,
|
||||
format!(
|
||||
"chunked object write completed below desired replication after quorum ({success_count}/{total_replicas})"
|
||||
),
|
||||
));
|
||||
}
|
||||
debug!(
|
||||
object_id = %object_id,
|
||||
chunk_count,
|
||||
|
|
@ -509,6 +648,150 @@ impl ReplicatedBackend {
|
|||
)))
|
||||
}
|
||||
|
||||
pub async fn repair_chunk(&self, task: &ReplicatedRepairTask) -> StorageResult<()> {
|
||||
let healthy_nodes = self
|
||||
.node_registry
|
||||
.get_healthy_nodes()
|
||||
.await
|
||||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
if healthy_nodes.is_empty() {
|
||||
return Err(StorageError::Backend(
|
||||
"No healthy storage nodes available for repair".to_string(),
|
||||
));
|
||||
}
|
||||
let desired_nodes = self
|
||||
.node_selector
|
||||
.select_nodes_for_key(&healthy_nodes, self.replica_count, &task.key)
|
||||
.await
|
||||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
|
||||
let mut present_nodes = Vec::new();
|
||||
let mut missing_nodes = Vec::new();
|
||||
for node in desired_nodes {
|
||||
match node.chunk_exists(&task.key).await {
|
||||
Ok(true) => present_nodes.push(node),
|
||||
Ok(false) => missing_nodes.push(node),
|
||||
Err(err) => {
|
||||
warn!(
|
||||
chunk_key = task.key,
|
||||
node_id = node.node_id(),
|
||||
error = ?err,
|
||||
"Failed to inspect chunk during repair; treating replica as missing"
|
||||
);
|
||||
missing_nodes.push(node);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if missing_nodes.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if present_nodes.is_empty() {
|
||||
let desired_node_ids = missing_nodes
|
||||
.iter()
|
||||
.map(|node| node.node_id().to_string())
|
||||
.collect::<std::collections::HashSet<_>>();
|
||||
for node in healthy_nodes {
|
||||
if desired_node_ids.contains(node.node_id()) {
|
||||
continue;
|
||||
}
|
||||
match node.chunk_exists(&task.key).await {
|
||||
Ok(true) => {
|
||||
present_nodes.push(node);
|
||||
break;
|
||||
}
|
||||
Ok(false) => {}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
chunk_key = task.key,
|
||||
node_id = node.node_id(),
|
||||
error = ?err,
|
||||
"Failed to inspect off-placement chunk during repair"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let source = present_nodes.first().ok_or_else(|| {
|
||||
StorageError::Backend(format!(
|
||||
"Cannot repair {} because no healthy source replica is available",
|
||||
task.key
|
||||
))
|
||||
})?;
|
||||
|
||||
let data = source
|
||||
.get_chunk(&task.key, task.shard_index, false)
|
||||
.await
|
||||
.map_err(|err| {
|
||||
StorageError::Backend(format!(
|
||||
"Failed to load repair source for {} from {}: {}",
|
||||
task.key,
|
||||
source.node_id(),
|
||||
err
|
||||
))
|
||||
})?;
|
||||
|
||||
let mut repair_futures = FuturesUnordered::new();
|
||||
for node in missing_nodes {
|
||||
let node_id = node.node_id().to_string();
|
||||
let key = task.key.clone();
|
||||
let chunk = data.clone();
|
||||
let shard_index = task.shard_index;
|
||||
repair_futures.push(tokio::spawn(async move {
|
||||
let result = node.put_chunk(&key, shard_index, false, chunk).await;
|
||||
(node_id, result)
|
||||
}));
|
||||
}
|
||||
|
||||
let mut repaired = 0usize;
|
||||
let mut errors = Vec::new();
|
||||
while let Some(result) = repair_futures.next().await {
|
||||
match result {
|
||||
Ok((_, Ok(()))) => repaired += 1,
|
||||
Ok((node_id, Err(err))) => errors.push(format!("{node_id}: {err}")),
|
||||
Err(join_err) => errors.push(format!("join error: {join_err}")),
|
||||
}
|
||||
}
|
||||
|
||||
if errors.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
Err(StorageError::Backend(format!(
|
||||
"Repair for {} only restored {} replicas: {}",
|
||||
task.key,
|
||||
repaired,
|
||||
errors.join(", ")
|
||||
)))
|
||||
}
|
||||
|
||||
pub async fn chunk_exists_anywhere(&self, key: &str) -> StorageResult<bool> {
|
||||
let nodes = self
|
||||
.node_registry
|
||||
.get_all_nodes()
|
||||
.await
|
||||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
|
||||
for node in nodes {
|
||||
match node.chunk_exists(key).await {
|
||||
Ok(true) => return Ok(true),
|
||||
Ok(false) => {}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
chunk_key = key,
|
||||
node_id = node.node_id(),
|
||||
error = ?err,
|
||||
"Failed to inspect chunk while probing global existence"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(false)
|
||||
}
|
||||
|
||||
async fn read_chunked_object(
|
||||
&self,
|
||||
object_id: &ObjectId,
|
||||
|
|
@ -521,24 +804,47 @@ impl ReplicatedBackend {
|
|||
.map_err(|e| StorageError::Backend(e.to_string()))?;
|
||||
|
||||
if !nodes.is_empty() {
|
||||
let mut ordered_nodes = Vec::with_capacity(nodes.len());
|
||||
if let Ok(preferred) = self
|
||||
let preferred = self
|
||||
.node_selector
|
||||
.select_for_read(&nodes, &Self::object_chunk_key(object_id, 0))
|
||||
.await
|
||||
.ok();
|
||||
let ordered_nodes = Self::ordered_read_nodes(&nodes, preferred);
|
||||
|
||||
if metadata.chunk_count > 1 {
|
||||
if let Some(local_node) = ordered_nodes.iter().find(|node| Self::is_local_node(node))
|
||||
{
|
||||
ordered_nodes.push(preferred.clone());
|
||||
let batch_requests: Vec<(String, u32, bool)> = (0..metadata.chunk_count)
|
||||
.map(|chunk_index| {
|
||||
(
|
||||
Self::object_chunk_key(object_id, chunk_index),
|
||||
chunk_index as u32,
|
||||
false,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
match local_node.batch_get_chunks(batch_requests).await {
|
||||
Ok(chunks) => {
|
||||
return Self::assemble_chunked_bytes(
|
||||
object_id,
|
||||
metadata.original_size,
|
||||
chunks,
|
||||
);
|
||||
}
|
||||
Err(err) => {
|
||||
warn!(
|
||||
object_id = %object_id,
|
||||
node_id = local_node.node_id(),
|
||||
error = ?err,
|
||||
"Local replica batch read failed, falling back to distributed reads"
|
||||
);
|
||||
}
|
||||
}
|
||||
for node in nodes {
|
||||
if ordered_nodes
|
||||
.iter()
|
||||
.all(|existing| existing.node_id() != node.node_id())
|
||||
{
|
||||
ordered_nodes.push(node);
|
||||
}
|
||||
}
|
||||
|
||||
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 {
|
||||
if ordered_nodes.len() > 1 && metadata.chunk_count > 1 && !Self::has_local_node(&ordered_nodes)
|
||||
{
|
||||
match self
|
||||
.read_chunked_object_from_distributed_batches(
|
||||
object_id,
|
||||
|
|
@ -783,6 +1089,74 @@ impl ReplicatedBackend {
|
|||
combined.truncate(original_size);
|
||||
Ok(combined.freeze())
|
||||
}
|
||||
|
||||
fn ordered_read_nodes(
|
||||
nodes: &[Arc<dyn NodeClientTrait>],
|
||||
preferred: Option<Arc<dyn NodeClientTrait>>,
|
||||
) -> Vec<Arc<dyn NodeClientTrait>> {
|
||||
let mut ordered = Vec::with_capacity(nodes.len());
|
||||
|
||||
if let Some(local) = nodes.iter().find(|node| Self::is_local_node(node)) {
|
||||
ordered.push(local.clone());
|
||||
}
|
||||
|
||||
if let Some(preferred) = preferred {
|
||||
if ordered
|
||||
.iter()
|
||||
.all(|existing| existing.node_id() != preferred.node_id())
|
||||
{
|
||||
ordered.push(preferred);
|
||||
}
|
||||
}
|
||||
|
||||
for node in nodes {
|
||||
if ordered
|
||||
.iter()
|
||||
.all(|existing| existing.node_id() != node.node_id())
|
||||
{
|
||||
ordered.push(node.clone());
|
||||
}
|
||||
}
|
||||
|
||||
ordered
|
||||
}
|
||||
|
||||
fn has_local_node(nodes: &[Arc<dyn NodeClientTrait>]) -> bool {
|
||||
nodes.iter().any(Self::is_local_node)
|
||||
}
|
||||
|
||||
fn is_local_node(node: &Arc<dyn NodeClientTrait>) -> bool {
|
||||
Self::endpoint_is_local(node.endpoint())
|
||||
}
|
||||
|
||||
fn endpoint_is_local(endpoint: &str) -> bool {
|
||||
let authority = endpoint
|
||||
.split_once("://")
|
||||
.map(|(_, rest)| rest)
|
||||
.unwrap_or(endpoint)
|
||||
.split('/')
|
||||
.next()
|
||||
.unwrap_or(endpoint);
|
||||
let host = if authority.starts_with('[') {
|
||||
authority
|
||||
.split_once(']')
|
||||
.map(|(host, _)| host.trim_start_matches('['))
|
||||
.unwrap_or(authority.trim_matches(['[', ']']))
|
||||
} else {
|
||||
authority
|
||||
.rsplit_once(':')
|
||||
.map(|(host, _)| host)
|
||||
.unwrap_or(authority)
|
||||
};
|
||||
|
||||
if host.eq_ignore_ascii_case("localhost") {
|
||||
return true;
|
||||
}
|
||||
|
||||
host.parse::<IpAddr>()
|
||||
.map(|ip| ip.is_loopback())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
|
|
@ -908,12 +1282,25 @@ mod tests {
|
|||
use super::*;
|
||||
use crate::config::RedundancyMode;
|
||||
use crate::node::{MockNodeRegistry, NodeError, NodeResult};
|
||||
use crate::repair::RepairQueue;
|
||||
use async_trait::async_trait;
|
||||
use dashmap::DashMap;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use tokio::time::sleep;
|
||||
|
||||
#[derive(Default)]
|
||||
struct CapturingRepairQueue {
|
||||
tasks: DashMap<String, ReplicatedRepairTask>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl RepairQueue for CapturingRepairQueue {
|
||||
async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
|
||||
self.tasks.insert(task.id.clone(), task);
|
||||
}
|
||||
}
|
||||
|
||||
struct SlowNodeClient {
|
||||
node_id: String,
|
||||
endpoint: String,
|
||||
|
|
@ -1196,6 +1583,115 @@ mod tests {
|
|||
assert!(result.is_err());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_under_replicated_write_enqueues_repair_task() {
|
||||
let config = create_replicated_config(3);
|
||||
let registry = Arc::new(MockNodeRegistry::with_nodes(3));
|
||||
let nodes = registry.all_mock_nodes();
|
||||
nodes[2].set_fail_puts(true);
|
||||
|
||||
let repair_queue = Arc::new(CapturingRepairQueue::default());
|
||||
let backend = ReplicatedBackend::new_with_repair_queue(
|
||||
config,
|
||||
registry,
|
||||
Some(repair_queue.clone()),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let object_id = ObjectId::new();
|
||||
backend
|
||||
.put_object(&object_id, Bytes::from_static(b"repair-me"))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut task = None;
|
||||
for _ in 0..20 {
|
||||
task = repair_queue
|
||||
.tasks
|
||||
.iter()
|
||||
.next()
|
||||
.map(|entry| entry.value().clone());
|
||||
if task.is_some() {
|
||||
break;
|
||||
}
|
||||
sleep(Duration::from_millis(10)).await;
|
||||
}
|
||||
let task = task.expect("repair task should be queued");
|
||||
assert_eq!(task.key, ReplicatedBackend::object_key(&object_id));
|
||||
assert_eq!(task.shard_index, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_repair_chunk_restores_missing_replica() {
|
||||
let config = create_replicated_config(3);
|
||||
let registry = Arc::new(MockNodeRegistry::with_nodes(3));
|
||||
let nodes = registry.all_mock_nodes();
|
||||
let backend = ReplicatedBackend::new(config, registry.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let object_id = ObjectId::new();
|
||||
let data = Bytes::from(vec![11u8; 128]);
|
||||
backend.put_object(&object_id, data.clone()).await.unwrap();
|
||||
|
||||
let key = ReplicatedBackend::object_key(&object_id);
|
||||
let mut missing = None;
|
||||
for node in &nodes {
|
||||
if node.chunk_exists(&key).await.unwrap() {
|
||||
missing = Some(node.clone());
|
||||
break;
|
||||
}
|
||||
}
|
||||
let missing = missing.expect("at least one replica should exist");
|
||||
missing.delete_chunk(&key).await.unwrap();
|
||||
assert!(!missing.chunk_exists(&key).await.unwrap());
|
||||
|
||||
let task = ReplicatedRepairTask::new(key.clone(), 0, "test");
|
||||
backend.repair_chunk(&task).await.unwrap();
|
||||
assert!(missing.chunk_exists(&key).await.unwrap());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_repair_chunk_can_source_from_off_placement_replica() {
|
||||
let config = create_replicated_config(2);
|
||||
let registry = Arc::new(MockNodeRegistry::with_nodes(3));
|
||||
let nodes = registry.all_mock_nodes();
|
||||
let backend = ReplicatedBackend::new(config, registry.clone())
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let object_id = ObjectId::new();
|
||||
let data = Bytes::from(vec![23u8; 128]);
|
||||
backend.put_object(&object_id, data.clone()).await.unwrap();
|
||||
|
||||
let key = ReplicatedBackend::object_key(&object_id);
|
||||
let desired_nodes = backend.select_replica_nodes_for_key(&key).await.unwrap();
|
||||
assert_eq!(desired_nodes.len(), 2);
|
||||
let off_placement = nodes
|
||||
.iter()
|
||||
.find(|node| {
|
||||
desired_nodes
|
||||
.iter()
|
||||
.all(|desired| desired.node_id() != node.node_id())
|
||||
})
|
||||
.cloned()
|
||||
.expect("off-placement node should exist");
|
||||
|
||||
let source_bytes = desired_nodes[0].get_chunk(&key, 0, false).await.unwrap();
|
||||
off_placement.put_chunk(&key, 0, false, source_bytes).await.unwrap();
|
||||
for node in &desired_nodes {
|
||||
node.delete_chunk(&key).await.unwrap();
|
||||
assert!(!node.chunk_exists(&key).await.unwrap());
|
||||
}
|
||||
|
||||
let task = ReplicatedRepairTask::new(key.clone(), 0, "off-placement-source");
|
||||
backend.repair_chunk(&task).await.unwrap();
|
||||
for node in &desired_nodes {
|
||||
assert!(node.chunk_exists(&key).await.unwrap());
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_replicated_backend_returns_after_quorum_without_waiting_for_slow_replica() {
|
||||
let config = create_replicated_config(3);
|
||||
|
|
@ -1333,6 +1829,43 @@ mod tests {
|
|||
.is_none());
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_replicated_backend_prefers_local_replica_for_chunked_reads() {
|
||||
let mut config = create_replicated_config(3);
|
||||
config.chunk.chunk_size = 64;
|
||||
let local = Arc::new(crate::node::MockNodeClient::new(
|
||||
"local",
|
||||
"http://127.0.0.1:9002",
|
||||
));
|
||||
let slow_a = Arc::new(SlowNodeClient::new(
|
||||
"slow-a",
|
||||
"http://slow-a:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
let slow_b = Arc::new(SlowNodeClient::new(
|
||||
"slow-b",
|
||||
"http://slow-b:9002",
|
||||
Duration::from_millis(250),
|
||||
));
|
||||
let registry = Arc::new(FixedNodeRegistry {
|
||||
nodes: vec![slow_a.clone(), slow_b.clone(), local.clone()],
|
||||
});
|
||||
|
||||
let backend = ReplicatedBackend::new(config, registry).await.unwrap();
|
||||
let object_id = ObjectId::new();
|
||||
let data = Bytes::from(vec![5u8; 256]);
|
||||
|
||||
backend.put_object(&object_id, data.clone()).await.unwrap();
|
||||
|
||||
let started = Instant::now();
|
||||
let retrieved = backend.get_object(&object_id).await.unwrap();
|
||||
let elapsed = started.elapsed();
|
||||
|
||||
assert_eq!(retrieved, data);
|
||||
assert!(elapsed < Duration::from_millis(150), "elapsed={elapsed:?}");
|
||||
assert!(local.get_count() >= 4);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_replicated_backend_object_size() {
|
||||
let config = create_replicated_config(3);
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@
|
|||
|
||||
use crate::config::ChunkConfig;
|
||||
|
||||
const TARGET_CHUNK_COUNT_PER_OBJECT: usize = 8;
|
||||
|
||||
/// Manages chunk operations for large objects
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ChunkManager {
|
||||
|
|
@ -27,18 +29,42 @@ impl ChunkManager {
|
|||
self.config.chunk_size
|
||||
}
|
||||
|
||||
/// Choose the effective chunk size for an object of the given size.
|
||||
///
|
||||
/// Small objects keep the configured default chunk size. Larger objects
|
||||
/// scale up to keep per-object chunk counts bounded without exceeding the
|
||||
/// configured maximum.
|
||||
pub fn effective_chunk_size(&self, total_size: usize) -> usize {
|
||||
if total_size == 0 {
|
||||
return self.config.chunk_size;
|
||||
}
|
||||
|
||||
let min_chunk_size = self.config.min_chunk_size.min(self.config.chunk_size).max(1);
|
||||
let max_chunk_size = self.config.max_chunk_size.max(self.config.chunk_size);
|
||||
let required = total_size.div_ceil(TARGET_CHUNK_COUNT_PER_OBJECT);
|
||||
let alignment = min_chunk_size;
|
||||
let aligned_required = required.div_ceil(alignment) * alignment;
|
||||
|
||||
aligned_required
|
||||
.max(self.config.chunk_size)
|
||||
.clamp(min_chunk_size, max_chunk_size)
|
||||
}
|
||||
|
||||
/// Split data into chunks
|
||||
///
|
||||
/// Returns a vector of chunks. Each chunk is at most `chunk_size` bytes,
|
||||
/// except the last chunk which may be smaller.
|
||||
pub fn split(&self, data: &[u8]) -> Vec<Vec<u8>> {
|
||||
self.split_with_chunk_size(data, self.config.chunk_size)
|
||||
}
|
||||
|
||||
/// Split data into chunks using an explicit chunk size.
|
||||
pub fn split_with_chunk_size(&self, data: &[u8], chunk_size: usize) -> Vec<Vec<u8>> {
|
||||
if data.is_empty() {
|
||||
return vec![vec![]];
|
||||
}
|
||||
|
||||
data.chunks(self.config.chunk_size)
|
||||
.map(|c| c.to_vec())
|
||||
.collect()
|
||||
data.chunks(chunk_size).map(|c| c.to_vec()).collect()
|
||||
}
|
||||
|
||||
/// Reassemble chunks into original data
|
||||
|
|
@ -50,21 +76,33 @@ impl ChunkManager {
|
|||
|
||||
/// Calculate the number of chunks for a given data size
|
||||
pub fn chunk_count(&self, size: usize) -> usize {
|
||||
Self::chunk_count_for_size(size, self.config.chunk_size)
|
||||
}
|
||||
|
||||
pub fn chunk_count_for_size(size: usize, chunk_size: usize) -> usize {
|
||||
if size == 0 {
|
||||
return 1;
|
||||
}
|
||||
(size + self.config.chunk_size - 1) / self.config.chunk_size
|
||||
size.div_ceil(chunk_size)
|
||||
}
|
||||
|
||||
/// Calculate the size of a specific chunk
|
||||
///
|
||||
/// Returns the size of the chunk at the given index for data of the given total size.
|
||||
pub fn chunk_size_at(&self, total_size: usize, chunk_index: usize) -> usize {
|
||||
let full_chunks = total_size / self.config.chunk_size;
|
||||
let remainder = total_size % self.config.chunk_size;
|
||||
Self::chunk_size_at_for_size(total_size, chunk_index, self.config.chunk_size)
|
||||
}
|
||||
|
||||
pub fn chunk_size_at_for_size(
|
||||
total_size: usize,
|
||||
chunk_index: usize,
|
||||
chunk_size: usize,
|
||||
) -> usize {
|
||||
let full_chunks = total_size / chunk_size;
|
||||
let remainder = total_size % chunk_size;
|
||||
|
||||
if chunk_index < full_chunks {
|
||||
self.config.chunk_size
|
||||
chunk_size
|
||||
} else if chunk_index == full_chunks && remainder > 0 {
|
||||
remainder
|
||||
} else {
|
||||
|
|
@ -76,8 +114,16 @@ impl ChunkManager {
|
|||
///
|
||||
/// Returns (start_offset, length) for the chunk at the given index.
|
||||
pub fn chunk_range(&self, total_size: usize, chunk_index: usize) -> (usize, usize) {
|
||||
let start = chunk_index * self.config.chunk_size;
|
||||
let length = self.chunk_size_at(total_size, chunk_index);
|
||||
Self::chunk_range_for_size(total_size, chunk_index, self.config.chunk_size)
|
||||
}
|
||||
|
||||
pub fn chunk_range_for_size(
|
||||
total_size: usize,
|
||||
chunk_index: usize,
|
||||
chunk_size: usize,
|
||||
) -> (usize, usize) {
|
||||
let start = chunk_index * chunk_size;
|
||||
let length = Self::chunk_size_at_for_size(total_size, chunk_index, chunk_size);
|
||||
(start, length)
|
||||
}
|
||||
}
|
||||
|
|
@ -257,6 +303,15 @@ mod tests {
|
|||
assert_eq!(manager.chunk_range(2500, 2), (2048, 452));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_effective_chunk_size_scales_large_objects_up_to_target_chunk_count() {
|
||||
let manager = ChunkManager::default();
|
||||
|
||||
assert_eq!(manager.effective_chunk_size(4 * 1024 * 1024), 8 * 1024 * 1024);
|
||||
assert_eq!(manager.effective_chunk_size(256 * 1024 * 1024), 32 * 1024 * 1024);
|
||||
assert_eq!(manager.effective_chunk_size(1024 * 1024 * 1024), 64 * 1024 * 1024);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_chunk_id_to_key() {
|
||||
let id = ChunkId::data_shard("obj123", 0, 2);
|
||||
|
|
|
|||
|
|
@ -65,12 +65,14 @@ pub mod config;
|
|||
pub mod erasure;
|
||||
pub mod node;
|
||||
pub mod placement;
|
||||
pub mod repair;
|
||||
|
||||
// Re-export commonly used types
|
||||
pub use backends::{ErasureCodedBackend, ReplicatedBackend};
|
||||
pub use config::{BucketStorageConfig, ChunkConfig, DistributedConfig, RedundancyMode};
|
||||
pub use node::{MockNodeClient, MockNodeRegistry, NodeRegistry, StaticNodeRegistry};
|
||||
pub use placement::{ConsistentHashSelector, NodeSelector, RandomSelector, RoundRobinSelector};
|
||||
pub use repair::{RepairQueue, ReplicatedRepairTask};
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
|
|
|||
58
lightningstor/crates/lightningstor-distributed/src/repair.rs
Normal file
58
lightningstor/crates/lightningstor-distributed/src/repair.rs
Normal file
|
|
@ -0,0 +1,58 @@
|
|||
use async_trait::async_trait;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
|
||||
pub struct ReplicatedRepairTask {
|
||||
pub id: String,
|
||||
pub key: String,
|
||||
pub shard_index: u32,
|
||||
pub reason: String,
|
||||
pub enqueued_at_millis: u64,
|
||||
#[serde(default)]
|
||||
pub attempt_count: u32,
|
||||
#[serde(default)]
|
||||
pub last_error: Option<String>,
|
||||
#[serde(default)]
|
||||
pub next_attempt_after_millis: u64,
|
||||
}
|
||||
|
||||
impl ReplicatedRepairTask {
|
||||
pub fn new(key: impl Into<String>, shard_index: u32, reason: impl Into<String>) -> Self {
|
||||
let key = key.into();
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_millis() as u64;
|
||||
Self {
|
||||
id: format!("replicated::{key}::{shard_index}"),
|
||||
key,
|
||||
shard_index,
|
||||
reason: reason.into(),
|
||||
enqueued_at_millis: now,
|
||||
attempt_count: 0,
|
||||
last_error: None,
|
||||
next_attempt_after_millis: now,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn schedule_retry(&mut self, error: impl Into<String>, backoff_millis: u64) {
|
||||
let now = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_millis() as u64;
|
||||
self.attempt_count = self.attempt_count.saturating_add(1);
|
||||
self.last_error = Some(error.into());
|
||||
self.next_attempt_after_millis = now.saturating_add(backoff_millis);
|
||||
}
|
||||
|
||||
pub fn is_due(&self, now_millis: u64) -> bool {
|
||||
now_millis >= self.next_attempt_after_millis
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
pub trait RepairQueue: Send + Sync {
|
||||
async fn enqueue_repair(&self, task: ReplicatedRepairTask);
|
||||
}
|
||||
|
||||
|
|
@ -1,13 +1,18 @@
|
|||
//! Local chunk storage
|
||||
|
||||
use dashmap::DashMap;
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use thiserror::Error;
|
||||
use tokio::fs;
|
||||
use tokio::io::AsyncWriteExt;
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::debug;
|
||||
|
||||
const WRITE_LOCK_STRIPES: usize = 256;
|
||||
|
||||
/// Errors from chunk storage operations
|
||||
#[derive(Debug, Error)]
|
||||
pub enum StorageError {
|
||||
|
|
@ -45,6 +50,12 @@ pub struct LocalChunkStore {
|
|||
|
||||
/// Whether writes should be flushed before they are acknowledged.
|
||||
sync_on_write: bool,
|
||||
|
||||
/// Monotonic nonce for per-write temporary paths.
|
||||
temp_file_nonce: AtomicU64,
|
||||
|
||||
/// Striped per-chunk write/delete locks to keep same-key updates coherent.
|
||||
write_locks: Vec<Mutex<()>>,
|
||||
}
|
||||
|
||||
impl LocalChunkStore {
|
||||
|
|
@ -65,6 +76,8 @@ impl LocalChunkStore {
|
|||
max_capacity,
|
||||
chunk_count: AtomicU64::new(0),
|
||||
sync_on_write,
|
||||
temp_file_nonce: AtomicU64::new(0),
|
||||
write_locks: (0..WRITE_LOCK_STRIPES).map(|_| Mutex::new(())).collect(),
|
||||
};
|
||||
|
||||
// Scan existing chunks
|
||||
|
|
@ -91,7 +104,7 @@ impl LocalChunkStore {
|
|||
|
||||
if metadata.is_file() {
|
||||
if let Some(name) = path.file_name().and_then(|n| n.to_str()) {
|
||||
if name.ends_with(".tmp") {
|
||||
if name.ends_with(".tmp") || name.starts_with(".tmp.") {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -131,6 +144,25 @@ impl LocalChunkStore {
|
|||
self.data_dir.join(safe_id)
|
||||
}
|
||||
|
||||
fn temporary_chunk_path(&self, path: &std::path::Path) -> PathBuf {
|
||||
let nonce = self.temp_file_nonce.fetch_add(1, Ordering::Relaxed);
|
||||
let pid = std::process::id();
|
||||
let file_name = path
|
||||
.file_name()
|
||||
.and_then(|name| name.to_str())
|
||||
.unwrap_or("chunk");
|
||||
path.parent()
|
||||
.unwrap_or(&self.data_dir)
|
||||
.join(format!(".tmp.{file_name}.{pid}.{nonce}"))
|
||||
}
|
||||
|
||||
fn write_lock(&self, chunk_id: &str) -> &Mutex<()> {
|
||||
let mut hasher = DefaultHasher::new();
|
||||
chunk_id.hash(&mut hasher);
|
||||
let slot = (hasher.finish() as usize) % self.write_locks.len().max(1);
|
||||
&self.write_locks[slot]
|
||||
}
|
||||
|
||||
async fn resolve_existing_chunk_path(&self, chunk_id: &str) -> StorageResult<PathBuf> {
|
||||
if let Some(path) = self.chunk_paths.get(chunk_id) {
|
||||
return Ok(path.clone());
|
||||
|
|
@ -154,6 +186,7 @@ impl LocalChunkStore {
|
|||
|
||||
/// Store a chunk
|
||||
pub async fn put(&self, chunk_id: &str, data: &[u8]) -> StorageResult<u64> {
|
||||
let _guard = self.write_lock(chunk_id).lock().await;
|
||||
let size = data.len() as u64;
|
||||
|
||||
// Check if replacing existing chunk
|
||||
|
|
@ -169,7 +202,7 @@ impl LocalChunkStore {
|
|||
}
|
||||
|
||||
let path = self.chunk_path(chunk_id);
|
||||
let temp_path = path.with_extension(".tmp");
|
||||
let temp_path = self.temporary_chunk_path(&path);
|
||||
if let Some(parent) = path.parent() {
|
||||
// Multipart uploads fan out concurrent writes into the same shard
|
||||
// directory. Create the parent path unconditionally so no writer can
|
||||
|
|
@ -217,6 +250,7 @@ impl LocalChunkStore {
|
|||
|
||||
/// Delete a chunk
|
||||
pub async fn delete(&self, chunk_id: &str) -> StorageResult<()> {
|
||||
let _guard = self.write_lock(chunk_id).lock().await;
|
||||
if let Some((_, size)) = self.chunk_sizes.remove(chunk_id) {
|
||||
let path = match self.chunk_paths.remove(chunk_id) {
|
||||
Some((_, path)) => path,
|
||||
|
|
@ -421,4 +455,34 @@ mod tests {
|
|||
|
||||
assert_eq!(store.chunk_count(), 16);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_concurrent_rewrites_same_chunk_use_unique_temp_paths() {
|
||||
let (store, _temp) = create_test_store().await;
|
||||
let store = Arc::new(store);
|
||||
let barrier = Arc::new(Barrier::new(9));
|
||||
let mut tasks = Vec::new();
|
||||
|
||||
for idx in 0..8u8 {
|
||||
let store = Arc::clone(&store);
|
||||
let barrier = Arc::clone(&barrier);
|
||||
tasks.push(tokio::spawn(async move {
|
||||
let payload = vec![idx; 2048];
|
||||
barrier.wait().await;
|
||||
store.put("shared-chunk", &payload).await.unwrap();
|
||||
payload
|
||||
}));
|
||||
}
|
||||
|
||||
barrier.wait().await;
|
||||
|
||||
let mut expected_payloads = Vec::new();
|
||||
for task in tasks {
|
||||
expected_payloads.push(task.await.unwrap());
|
||||
}
|
||||
|
||||
let stored = store.get("shared-chunk").await.unwrap();
|
||||
assert!(expected_payloads.iter().any(|payload| payload == &stored));
|
||||
assert_eq!(store.chunk_count(), 1);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ lightningstor-distributed = { workspace = true }
|
|||
lightningstor-storage = { workspace = true }
|
||||
chainfire-client = { path = "../../../chainfire/chainfire-client" }
|
||||
flaredb-client = { path = "../../../flaredb/crates/flaredb-client" }
|
||||
iam-api = { path = "../../../iam/crates/iam-api" }
|
||||
iam-service-auth = { path = "../../../iam/crates/iam-service-auth" }
|
||||
tonic = { workspace = true }
|
||||
tonic-health = { workspace = true }
|
||||
|
|
|
|||
|
|
@ -9,8 +9,11 @@ mod bucket_service;
|
|||
pub mod config;
|
||||
pub mod metadata;
|
||||
mod object_service;
|
||||
pub mod repair;
|
||||
pub mod s3;
|
||||
pub mod tenant;
|
||||
|
||||
pub use bucket_service::BucketServiceImpl;
|
||||
pub use config::ServerConfig;
|
||||
pub use object_service::ObjectServiceImpl;
|
||||
pub use repair::{MetadataRepairQueue, spawn_replicated_repair_worker};
|
||||
|
|
|
|||
|
|
@ -5,11 +5,13 @@ use clap::Parser;
|
|||
use iam_service_auth::AuthService;
|
||||
use lightningstor_api::{BucketServiceServer, ObjectServiceServer};
|
||||
use lightningstor_distributed::{
|
||||
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, StaticNodeRegistry,
|
||||
DistributedConfig, ErasureCodedBackend, RedundancyMode, ReplicatedBackend, RepairQueue,
|
||||
StaticNodeRegistry,
|
||||
};
|
||||
use lightningstor_server::{
|
||||
config::{MetadataBackend, ObjectStorageBackend},
|
||||
metadata::MetadataStore,
|
||||
repair::{spawn_replicated_repair_worker, MetadataRepairQueue},
|
||||
s3, BucketServiceImpl, ObjectServiceImpl, ServerConfig,
|
||||
};
|
||||
use lightningstor_storage::{LocalFsBackend, StorageBackend};
|
||||
|
|
@ -28,6 +30,12 @@ const OBJECT_GRPC_INITIAL_STREAM_WINDOW: u32 = 64 * 1024 * 1024;
|
|||
const OBJECT_GRPC_INITIAL_CONNECTION_WINDOW: u32 = 512 * 1024 * 1024;
|
||||
const OBJECT_GRPC_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(30);
|
||||
const OBJECT_GRPC_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(10);
|
||||
const REPLICATED_REPAIR_SCAN_INTERVAL: Duration = Duration::from_secs(5);
|
||||
|
||||
struct StorageRuntime {
|
||||
backend: Arc<dyn StorageBackend>,
|
||||
repair_worker: Option<tokio::task::JoinHandle<()>>,
|
||||
}
|
||||
|
||||
/// LightningStor object storage server
|
||||
#[derive(Parser, Debug)]
|
||||
|
|
@ -148,8 +156,6 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
metrics_addr
|
||||
);
|
||||
|
||||
let storage = create_storage_backend(&config).await?;
|
||||
|
||||
if let Some(endpoint) = &config.chainfire_endpoint {
|
||||
tracing::info!(" Cluster coordination: ChainFire @ {}", endpoint);
|
||||
let endpoint = endpoint.clone();
|
||||
|
|
@ -204,6 +210,10 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
}
|
||||
};
|
||||
|
||||
let storage_runtime = create_storage_backend(&config, metadata.clone()).await?;
|
||||
let storage = storage_runtime.backend.clone();
|
||||
let _repair_worker = storage_runtime.repair_worker;
|
||||
|
||||
// Initialize IAM authentication service
|
||||
tracing::info!(
|
||||
"Connecting to IAM server at {}",
|
||||
|
|
@ -253,7 +263,11 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
|||
let s3_addr: SocketAddr = config.s3_addr;
|
||||
|
||||
// Start S3 HTTP server with shared state
|
||||
let s3_router = s3::create_router_with_state(storage.clone(), metadata.clone());
|
||||
let s3_router = s3::create_router_with_auth(
|
||||
storage.clone(),
|
||||
metadata.clone(),
|
||||
Some(config.auth.iam_server_addr.clone()),
|
||||
);
|
||||
let s3_server = tokio::spawn(async move {
|
||||
tracing::info!("S3 HTTP server listening on {}", s3_addr);
|
||||
let listener = tokio::net::TcpListener::bind(s3_addr).await.unwrap();
|
||||
|
|
@ -422,24 +436,27 @@ async fn register_chainfire_membership(
|
|||
|
||||
async fn create_storage_backend(
|
||||
config: &ServerConfig,
|
||||
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
|
||||
metadata: Arc<MetadataStore>,
|
||||
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
|
||||
match config.object_storage_backend {
|
||||
ObjectStorageBackend::LocalFs => {
|
||||
tracing::info!("Object storage backend: local_fs");
|
||||
Ok(Arc::new(
|
||||
LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?,
|
||||
))
|
||||
Ok(StorageRuntime {
|
||||
backend: Arc::new(LocalFsBackend::new(&config.data_dir, config.sync_on_write).await?),
|
||||
repair_worker: None,
|
||||
})
|
||||
}
|
||||
ObjectStorageBackend::Distributed => {
|
||||
tracing::info!("Object storage backend: distributed");
|
||||
create_distributed_storage_backend(&config.distributed).await
|
||||
create_distributed_storage_backend(&config.distributed, metadata).await
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn create_distributed_storage_backend(
|
||||
config: &DistributedConfig,
|
||||
) -> Result<Arc<dyn StorageBackend>, Box<dyn std::error::Error>> {
|
||||
metadata: Arc<MetadataStore>,
|
||||
) -> Result<StorageRuntime, Box<dyn std::error::Error>> {
|
||||
let endpoints: Vec<String> = config
|
||||
.node_endpoints
|
||||
.iter()
|
||||
|
|
@ -501,9 +518,25 @@ async fn create_distributed_storage_backend(
|
|||
write_quorum,
|
||||
"Using replicated LightningStor storage backend"
|
||||
);
|
||||
Ok(Arc::new(
|
||||
ReplicatedBackend::new(config.clone(), registry).await?,
|
||||
))
|
||||
let repair_queue: Arc<dyn RepairQueue> =
|
||||
Arc::new(MetadataRepairQueue::new(metadata.clone()));
|
||||
let backend = Arc::new(
|
||||
ReplicatedBackend::new_with_repair_queue(
|
||||
config.clone(),
|
||||
registry,
|
||||
Some(repair_queue),
|
||||
)
|
||||
.await?,
|
||||
);
|
||||
let repair_worker = Some(spawn_replicated_repair_worker(
|
||||
metadata,
|
||||
backend.clone(),
|
||||
REPLICATED_REPAIR_SCAN_INTERVAL,
|
||||
));
|
||||
Ok(StorageRuntime {
|
||||
backend,
|
||||
repair_worker,
|
||||
})
|
||||
}
|
||||
RedundancyMode::ErasureCoded {
|
||||
data_shards,
|
||||
|
|
@ -514,9 +547,10 @@ async fn create_distributed_storage_backend(
|
|||
parity_shards,
|
||||
"Using erasure-coded LightningStor storage backend"
|
||||
);
|
||||
Ok(Arc::new(
|
||||
ErasureCodedBackend::new(config.clone(), registry).await?,
|
||||
))
|
||||
Ok(StorageRuntime {
|
||||
backend: Arc::new(ErasureCodedBackend::new(config.clone(), registry).await?),
|
||||
repair_worker: None,
|
||||
})
|
||||
}
|
||||
RedundancyMode::None => Err(std::io::Error::other(
|
||||
"distributed object storage does not support redundancy.type=none; use object_storage_backend=local_fs instead",
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
use dashmap::DashMap;
|
||||
use flaredb_client::RdbClient;
|
||||
use lightningstor_distributed::ReplicatedRepairTask;
|
||||
use lightningstor_types::{Bucket, BucketId, MultipartUpload, Object, ObjectId, Result};
|
||||
use serde_json;
|
||||
use sqlx::pool::PoolOptions;
|
||||
|
|
@ -215,6 +216,12 @@ impl MetadataStore {
|
|||
end_key
|
||||
}
|
||||
|
||||
fn exclusive_scan_start(key: &[u8]) -> Vec<u8> {
|
||||
let mut next = key.to_vec();
|
||||
next.push(0);
|
||||
next
|
||||
}
|
||||
|
||||
fn flaredb_client_for_key<'a>(
|
||||
clients: &'a [Arc<Mutex<RdbClient>>],
|
||||
key: &[u8],
|
||||
|
|
@ -422,6 +429,56 @@ impl MetadataStore {
|
|||
Ok(results)
|
||||
}
|
||||
|
||||
async fn flaredb_scan_page(
|
||||
clients: &[Arc<Mutex<RdbClient>>],
|
||||
prefix: &[u8],
|
||||
start_after: Option<&[u8]>,
|
||||
limit: u32,
|
||||
) -> Result<(Vec<(String, String)>, bool)> {
|
||||
let end_key = Self::prefix_end(prefix);
|
||||
let start_key = start_after
|
||||
.map(Self::exclusive_scan_start)
|
||||
.unwrap_or_else(|| prefix.to_vec());
|
||||
let fetch_limit = limit.saturating_add(1).max(1);
|
||||
let client = Self::flaredb_scan_client(clients);
|
||||
let (mut items, next) = match {
|
||||
let mut c = client.lock().await;
|
||||
c.raw_scan(start_key.clone(), end_key.clone(), fetch_limit).await
|
||||
} {
|
||||
Ok((keys, values, next)) => {
|
||||
let items = keys
|
||||
.into_iter()
|
||||
.zip(values.into_iter())
|
||||
.map(|(key, value)| {
|
||||
(
|
||||
String::from_utf8_lossy(&key).to_string(),
|
||||
String::from_utf8_lossy(&value).to_string(),
|
||||
)
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
(items, next)
|
||||
}
|
||||
Err(status) if Self::flaredb_requires_strong(&status) => {
|
||||
Self::flaredb_scan_strong(client, &start_key, &end_key, fetch_limit).await?
|
||||
}
|
||||
Err(error) => {
|
||||
return Err(lightningstor_types::Error::StorageError(format!(
|
||||
"FlareDB scan failed: {}",
|
||||
error
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
let has_more = if items.len() > limit as usize {
|
||||
items.truncate(limit as usize);
|
||||
true
|
||||
} else {
|
||||
next.is_some()
|
||||
};
|
||||
|
||||
Ok((items, has_more))
|
||||
}
|
||||
|
||||
async fn flaredb_has_prefix(clients: &[Arc<Mutex<RdbClient>>], prefix: &[u8]) -> Result<bool> {
|
||||
let end_key = Self::prefix_end(prefix);
|
||||
let client = Self::flaredb_scan_client(clients);
|
||||
|
|
@ -613,11 +670,146 @@ impl MetadataStore {
|
|||
results.push((entry.key().clone(), entry.value().clone()));
|
||||
}
|
||||
}
|
||||
results.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
|
||||
Ok(results)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn get_prefix_page(
|
||||
&self,
|
||||
prefix: &str,
|
||||
start_after: Option<&str>,
|
||||
limit: u32,
|
||||
) -> Result<(Vec<(String, String)>, bool)> {
|
||||
if limit == 0 {
|
||||
return Ok((Vec::new(), false));
|
||||
}
|
||||
|
||||
match &self.backend {
|
||||
StorageBackend::FlareDB(client) => {
|
||||
Self::flaredb_scan_page(
|
||||
client,
|
||||
prefix.as_bytes(),
|
||||
start_after.map(str::as_bytes),
|
||||
limit,
|
||||
)
|
||||
.await
|
||||
}
|
||||
StorageBackend::Sql(sql) => {
|
||||
let prefix_end = String::from_utf8(Self::prefix_end(prefix.as_bytes())).map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"Failed to encode prefix end: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
let fetch_limit = (limit.saturating_add(1)) as i64;
|
||||
match sql {
|
||||
SqlStorageBackend::Postgres(pool) => {
|
||||
let rows: Vec<(String, String)> = if let Some(after) = start_after {
|
||||
sqlx::query_as(
|
||||
"SELECT key, value FROM metadata_kv
|
||||
WHERE key >= $1 AND key < $2 AND key > $3
|
||||
ORDER BY key
|
||||
LIMIT $4",
|
||||
)
|
||||
.bind(prefix)
|
||||
.bind(&prefix_end)
|
||||
.bind(after)
|
||||
.bind(fetch_limit)
|
||||
.fetch_all(pool.as_ref())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"Postgres paged scan failed: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
} else {
|
||||
sqlx::query_as(
|
||||
"SELECT key, value FROM metadata_kv
|
||||
WHERE key >= $1 AND key < $2
|
||||
ORDER BY key
|
||||
LIMIT $3",
|
||||
)
|
||||
.bind(prefix)
|
||||
.bind(&prefix_end)
|
||||
.bind(fetch_limit)
|
||||
.fetch_all(pool.as_ref())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"Postgres paged scan failed: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
};
|
||||
let has_more = rows.len() > limit as usize;
|
||||
let items = rows.into_iter().take(limit as usize).collect();
|
||||
Ok((items, has_more))
|
||||
}
|
||||
SqlStorageBackend::Sqlite(pool) => {
|
||||
let rows: Vec<(String, String)> = if let Some(after) = start_after {
|
||||
sqlx::query_as(
|
||||
"SELECT key, value FROM metadata_kv
|
||||
WHERE key >= ?1 AND key < ?2 AND key > ?3
|
||||
ORDER BY key
|
||||
LIMIT ?4",
|
||||
)
|
||||
.bind(prefix)
|
||||
.bind(&prefix_end)
|
||||
.bind(after)
|
||||
.bind(fetch_limit)
|
||||
.fetch_all(pool.as_ref())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"SQLite paged scan failed: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
} else {
|
||||
sqlx::query_as(
|
||||
"SELECT key, value FROM metadata_kv
|
||||
WHERE key >= ?1 AND key < ?2
|
||||
ORDER BY key
|
||||
LIMIT ?3",
|
||||
)
|
||||
.bind(prefix)
|
||||
.bind(&prefix_end)
|
||||
.bind(fetch_limit)
|
||||
.fetch_all(pool.as_ref())
|
||||
.await
|
||||
.map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"SQLite paged scan failed: {}",
|
||||
e
|
||||
))
|
||||
})?
|
||||
};
|
||||
let has_more = rows.len() > limit as usize;
|
||||
let items = rows.into_iter().take(limit as usize).collect();
|
||||
Ok((items, has_more))
|
||||
}
|
||||
}
|
||||
}
|
||||
StorageBackend::InMemory(map) => {
|
||||
let mut rows: Vec<(String, String)> = map
|
||||
.iter()
|
||||
.filter(|entry| entry.key().starts_with(prefix))
|
||||
.map(|entry| (entry.key().clone(), entry.value().clone()))
|
||||
.collect();
|
||||
rows.sort_by(|lhs, rhs| lhs.0.cmp(&rhs.0));
|
||||
if let Some(after) = start_after {
|
||||
rows.retain(|(key, _)| key.as_str() > after);
|
||||
}
|
||||
let has_more = rows.len() > limit as usize;
|
||||
let items = rows.into_iter().take(limit as usize).collect();
|
||||
Ok((items, has_more))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Internal: check if any key exists with a prefix
|
||||
async fn has_prefix(&self, prefix: &str) -> Result<bool> {
|
||||
match &self.backend {
|
||||
|
|
@ -708,10 +900,64 @@ impl MetadataStore {
|
|||
"/lightningstor/multipart/uploads/"
|
||||
}
|
||||
|
||||
fn multipart_bucket_key(bucket_id: &str, object_key: &str, upload_id: &str) -> String {
|
||||
format!(
|
||||
"/lightningstor/multipart/by-bucket/{}/{}/{}",
|
||||
bucket_id, object_key, upload_id
|
||||
)
|
||||
}
|
||||
|
||||
fn multipart_bucket_prefix(bucket_id: &BucketId, prefix: &str) -> String {
|
||||
format!("/lightningstor/multipart/by-bucket/{}/{}", bucket_id, prefix)
|
||||
}
|
||||
|
||||
fn multipart_object_key(object_id: &ObjectId) -> String {
|
||||
format!("/lightningstor/multipart/objects/{}", object_id)
|
||||
}
|
||||
|
||||
fn replicated_repair_task_key(task_id: &str) -> String {
|
||||
format!("/lightningstor/repair/replicated/{}", task_id)
|
||||
}
|
||||
|
||||
fn replicated_repair_task_prefix() -> &'static str {
|
||||
"/lightningstor/repair/replicated/"
|
||||
}
|
||||
|
||||
pub async fn save_replicated_repair_task(&self, task: &ReplicatedRepairTask) -> Result<()> {
|
||||
let key = Self::replicated_repair_task_key(&task.id);
|
||||
let value = serde_json::to_string(task).map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"Failed to serialize replicated repair task: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
self.put(&key, &value).await
|
||||
}
|
||||
|
||||
pub async fn list_replicated_repair_tasks(
|
||||
&self,
|
||||
limit: u32,
|
||||
) -> Result<Vec<ReplicatedRepairTask>> {
|
||||
let (items, _) = self
|
||||
.get_prefix_page(Self::replicated_repair_task_prefix(), None, limit)
|
||||
.await?;
|
||||
let mut tasks = Vec::new();
|
||||
for (_, value) in items {
|
||||
let task: ReplicatedRepairTask = serde_json::from_str(&value).map_err(|e| {
|
||||
lightningstor_types::Error::StorageError(format!(
|
||||
"Failed to deserialize replicated repair task: {}",
|
||||
e
|
||||
))
|
||||
})?;
|
||||
tasks.push(task);
|
||||
}
|
||||
Ok(tasks)
|
||||
}
|
||||
|
||||
pub async fn delete_replicated_repair_task(&self, task_id: &str) -> Result<()> {
|
||||
self.delete_key(&Self::replicated_repair_task_key(task_id)).await
|
||||
}
|
||||
|
||||
/// Save bucket metadata
|
||||
pub async fn save_bucket(&self, bucket: &Bucket) -> Result<()> {
|
||||
let key = Self::bucket_key(&bucket.org_id, &bucket.project_id, bucket.name.as_str());
|
||||
|
|
@ -900,6 +1146,13 @@ impl MetadataStore {
|
|||
prefix: &str,
|
||||
max_keys: u32,
|
||||
) -> Result<Vec<Object>> {
|
||||
if max_keys > 0 {
|
||||
return self
|
||||
.list_objects_page(bucket_id, prefix, None, max_keys)
|
||||
.await
|
||||
.map(|(objects, _)| objects);
|
||||
}
|
||||
|
||||
let prefix_key = Self::object_prefix(bucket_id, prefix);
|
||||
|
||||
let items = self.get_prefix(&prefix_key).await?;
|
||||
|
|
@ -921,6 +1174,34 @@ impl MetadataStore {
|
|||
Ok(objects)
|
||||
}
|
||||
|
||||
pub async fn list_objects_page(
|
||||
&self,
|
||||
bucket_id: &BucketId,
|
||||
prefix: &str,
|
||||
start_after_key: Option<&str>,
|
||||
max_keys: u32,
|
||||
) -> Result<(Vec<Object>, bool)> {
|
||||
if max_keys == 0 {
|
||||
return Ok((Vec::new(), false));
|
||||
}
|
||||
|
||||
let prefix_key = Self::object_prefix(bucket_id, prefix);
|
||||
let start_after_storage_key =
|
||||
start_after_key.map(|key| Self::object_key(bucket_id, key, None));
|
||||
let (items, has_more) = self
|
||||
.get_prefix_page(&prefix_key, start_after_storage_key.as_deref(), max_keys)
|
||||
.await?;
|
||||
|
||||
let mut objects = Vec::new();
|
||||
for (_, value) in items {
|
||||
if let Ok(object) = serde_json::from_str::<Object>(&value) {
|
||||
objects.push(object);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((objects, has_more))
|
||||
}
|
||||
|
||||
pub async fn save_multipart_upload(&self, upload: &MultipartUpload) -> Result<()> {
|
||||
let key = Self::multipart_upload_key(upload.upload_id.as_str());
|
||||
let value = serde_json::to_string(upload).map_err(|e| {
|
||||
|
|
@ -929,7 +1210,16 @@ impl MetadataStore {
|
|||
e
|
||||
))
|
||||
})?;
|
||||
self.put(&key, &value).await
|
||||
self.put(&key, &value).await?;
|
||||
self.put(
|
||||
&Self::multipart_bucket_key(
|
||||
&upload.bucket_id,
|
||||
upload.key.as_str(),
|
||||
upload.upload_id.as_str(),
|
||||
),
|
||||
&value,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
pub async fn load_multipart_upload(&self, upload_id: &str) -> Result<Option<MultipartUpload>> {
|
||||
|
|
@ -948,6 +1238,14 @@ impl MetadataStore {
|
|||
}
|
||||
|
||||
pub async fn delete_multipart_upload(&self, upload_id: &str) -> Result<()> {
|
||||
if let Some(upload) = self.load_multipart_upload(upload_id).await? {
|
||||
self.delete_key(&Self::multipart_bucket_key(
|
||||
&upload.bucket_id,
|
||||
upload.key.as_str(),
|
||||
upload.upload_id.as_str(),
|
||||
))
|
||||
.await?;
|
||||
}
|
||||
self.delete_key(&Self::multipart_upload_key(upload_id)).await
|
||||
}
|
||||
|
||||
|
|
@ -957,9 +1255,24 @@ impl MetadataStore {
|
|||
prefix: &str,
|
||||
max_uploads: u32,
|
||||
) -> Result<Vec<MultipartUpload>> {
|
||||
let items = self.get_prefix(Self::multipart_upload_prefix()).await?;
|
||||
let index_prefix = Self::multipart_bucket_prefix(bucket_id, prefix);
|
||||
let items = if max_uploads > 0 {
|
||||
self.get_prefix_page(&index_prefix, None, max_uploads)
|
||||
.await?
|
||||
.0
|
||||
} else {
|
||||
self.get_prefix(&index_prefix).await?
|
||||
};
|
||||
let mut uploads = Vec::new();
|
||||
for (_, value) in items {
|
||||
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
|
||||
uploads.push(upload);
|
||||
}
|
||||
}
|
||||
|
||||
if uploads.is_empty() {
|
||||
let fallback_items = self.get_prefix(Self::multipart_upload_prefix()).await?;
|
||||
for (_, value) in fallback_items {
|
||||
if let Ok(upload) = serde_json::from_str::<MultipartUpload>(&value) {
|
||||
if upload.bucket_id == bucket_id.to_string()
|
||||
&& upload.key.as_str().starts_with(prefix)
|
||||
|
|
@ -968,6 +1281,7 @@ impl MetadataStore {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uploads.sort_by(|a, b| {
|
||||
a.key
|
||||
|
|
@ -1033,6 +1347,7 @@ fn normalize_transport_addr(endpoint: &str) -> String {
|
|||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use lightningstor_distributed::ReplicatedRepairTask;
|
||||
use lightningstor_types::{BucketName, ETag, ObjectKey};
|
||||
|
||||
#[tokio::test]
|
||||
|
|
@ -1119,4 +1434,123 @@ mod tests {
|
|||
.is_none()
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_objects_page_honors_start_after_and_has_more() {
|
||||
let store = MetadataStore::new_in_memory();
|
||||
let bucket = Bucket::new(
|
||||
BucketName::new("paged-bucket").unwrap(),
|
||||
"org-a",
|
||||
"project-a",
|
||||
"default",
|
||||
);
|
||||
store.save_bucket(&bucket).await.unwrap();
|
||||
|
||||
for key in ["a.txt", "b.txt", "c.txt"] {
|
||||
let mut object = Object::new(
|
||||
bucket.id.to_string(),
|
||||
ObjectKey::new(key).unwrap(),
|
||||
ETag::from_md5(&[7u8; 16]),
|
||||
128,
|
||||
Some("text/plain".to_string()),
|
||||
);
|
||||
object.version = lightningstor_types::ObjectVersion::null();
|
||||
store.save_object(&object).await.unwrap();
|
||||
}
|
||||
|
||||
let (first_page, first_has_more) = store
|
||||
.list_objects_page(&bucket.id, "", None, 2)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
first_page
|
||||
.iter()
|
||||
.map(|object| object.key.as_str().to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["a.txt".to_string(), "b.txt".to_string()]
|
||||
);
|
||||
assert!(first_has_more);
|
||||
|
||||
let (second_page, second_has_more) = store
|
||||
.list_objects_page(&bucket.id, "", Some("b.txt"), 2)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
second_page
|
||||
.iter()
|
||||
.map(|object| object.key.as_str().to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["c.txt".to_string()]
|
||||
);
|
||||
assert!(!second_has_more);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn list_multipart_uploads_uses_bucket_prefix_index() {
|
||||
let store = MetadataStore::new_in_memory();
|
||||
let bucket = Bucket::new(
|
||||
BucketName::new("multipart-bucket").unwrap(),
|
||||
"org-a",
|
||||
"project-a",
|
||||
"default",
|
||||
);
|
||||
store.save_bucket(&bucket).await.unwrap();
|
||||
|
||||
let upload_a = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/one.bin").unwrap());
|
||||
let upload_b = MultipartUpload::new(bucket.id.to_string(), ObjectKey::new("a/two.bin").unwrap());
|
||||
let other_bucket = Bucket::new(
|
||||
BucketName::new("other-bucket").unwrap(),
|
||||
"org-a",
|
||||
"project-a",
|
||||
"default",
|
||||
);
|
||||
store.save_bucket(&other_bucket).await.unwrap();
|
||||
let upload_other =
|
||||
MultipartUpload::new(other_bucket.id.to_string(), ObjectKey::new("a/three.bin").unwrap());
|
||||
|
||||
store.save_multipart_upload(&upload_a).await.unwrap();
|
||||
store.save_multipart_upload(&upload_b).await.unwrap();
|
||||
store.save_multipart_upload(&upload_other).await.unwrap();
|
||||
|
||||
let uploads = store
|
||||
.list_multipart_uploads(&bucket.id, "a/", 10)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(uploads.len(), 2);
|
||||
assert_eq!(
|
||||
uploads
|
||||
.iter()
|
||||
.map(|upload| upload.key.as_str().to_string())
|
||||
.collect::<Vec<_>>(),
|
||||
vec!["a/one.bin".to_string(), "a/two.bin".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn replicated_repair_tasks_round_trip() {
|
||||
let store = MetadataStore::new_in_memory();
|
||||
let mut task = ReplicatedRepairTask::new("obj_abc", 0, "quorum write");
|
||||
store.save_replicated_repair_task(&task).await.unwrap();
|
||||
|
||||
let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
|
||||
assert_eq!(tasks.len(), 1);
|
||||
assert_eq!(tasks[0].key, "obj_abc");
|
||||
|
||||
task.schedule_retry("transient failure", 5_000);
|
||||
store.save_replicated_repair_task(&task).await.unwrap();
|
||||
|
||||
let tasks = store.list_replicated_repair_tasks(10).await.unwrap();
|
||||
assert_eq!(tasks[0].attempt_count, 1);
|
||||
assert_eq!(tasks[0].last_error.as_deref(), Some("transient failure"));
|
||||
|
||||
store
|
||||
.delete_replicated_repair_task(&task.id)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(store
|
||||
.list_replicated_repair_tasks(10)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_empty());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -155,6 +155,10 @@ impl ObjectServiceImpl {
|
|||
.await
|
||||
.map_err(|e| Status::internal(format!("Failed to delete multipart part: {}", e)))?;
|
||||
}
|
||||
self.storage
|
||||
.delete_upload_parts(upload.upload_id.as_str())
|
||||
.await
|
||||
.map_err(|e| Status::internal(format!("Failed to clean multipart upload: {}", e)))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
|
@ -465,7 +469,6 @@ impl ObjectService for ObjectServiceImpl {
|
|||
let (start, end) =
|
||||
Self::resolve_range(object.size as usize, req.range_start, req.range_end);
|
||||
|
||||
if object.etag.is_multipart() {
|
||||
if let Some(upload) = self
|
||||
.metadata
|
||||
.load_object_multipart_upload(&object.id)
|
||||
|
|
@ -476,7 +479,6 @@ impl ObjectService for ObjectServiceImpl {
|
|||
self.multipart_object_stream(&object, upload, start, end),
|
||||
));
|
||||
}
|
||||
}
|
||||
|
||||
let data = self
|
||||
.storage
|
||||
|
|
@ -524,7 +526,6 @@ impl ObjectService for ObjectServiceImpl {
|
|||
.map_err(Self::to_status)?
|
||||
.ok_or_else(|| Status::not_found(format!("Object {} not found", req.key)))?;
|
||||
|
||||
if object.etag.is_multipart() {
|
||||
if let Some(upload) = self
|
||||
.metadata
|
||||
.load_object_multipart_upload(&object.id)
|
||||
|
|
@ -540,12 +541,6 @@ impl ObjectService for ObjectServiceImpl {
|
|||
.delete_multipart_upload(upload.upload_id.as_str())
|
||||
.await
|
||||
.map_err(Self::to_status)?;
|
||||
} else {
|
||||
self.storage
|
||||
.delete_object(&object.id)
|
||||
.await
|
||||
.map_err(|e| Status::internal(format!("Failed to delete object: {}", e)))?;
|
||||
}
|
||||
} else {
|
||||
self.storage
|
||||
.delete_object(&object.id)
|
||||
|
|
|
|||
182
lightningstor/crates/lightningstor-server/src/repair.rs
Normal file
182
lightningstor/crates/lightningstor-server/src/repair.rs
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
use crate::metadata::MetadataStore;
|
||||
use async_trait::async_trait;
|
||||
use lightningstor_distributed::{RepairQueue, ReplicatedBackend, ReplicatedRepairTask};
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, SystemTime, UNIX_EPOCH};
|
||||
use tokio::task::JoinHandle;
|
||||
use tokio::time::sleep;
|
||||
use tracing::{debug, warn};
|
||||
|
||||
const REPAIR_SCAN_LIMIT: u32 = 256;
|
||||
const REPAIR_BACKOFF_BASE_MILLIS: u64 = 1_000;
|
||||
const REPAIR_BACKOFF_MAX_MILLIS: u64 = 60_000;
|
||||
const ORPHAN_REPAIR_DROP_ATTEMPTS: u32 = 8;
|
||||
|
||||
pub struct MetadataRepairQueue {
|
||||
metadata: Arc<MetadataStore>,
|
||||
}
|
||||
|
||||
impl MetadataRepairQueue {
|
||||
pub fn new(metadata: Arc<MetadataStore>) -> Self {
|
||||
Self { metadata }
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl RepairQueue for MetadataRepairQueue {
|
||||
async fn enqueue_repair(&self, task: ReplicatedRepairTask) {
|
||||
if let Err(error) = self.metadata.save_replicated_repair_task(&task).await {
|
||||
warn!(
|
||||
task_id = task.id,
|
||||
chunk_key = task.key,
|
||||
error = %error,
|
||||
"failed to persist replicated repair task"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn spawn_replicated_repair_worker(
|
||||
metadata: Arc<MetadataStore>,
|
||||
backend: Arc<ReplicatedBackend>,
|
||||
interval: Duration,
|
||||
) -> JoinHandle<()> {
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
if let Err(error) = process_replicated_repair_queue(&metadata, &backend).await {
|
||||
if replicated_repair_queue_transiently_unready(&error) {
|
||||
debug!(error = %error, "replicated repair queue pass deferred until metadata becomes ready");
|
||||
} else {
|
||||
warn!(error = %error, "replicated repair queue pass failed");
|
||||
}
|
||||
}
|
||||
sleep(interval).await;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn process_replicated_repair_queue(
|
||||
metadata: &MetadataStore,
|
||||
backend: &ReplicatedBackend,
|
||||
) -> Result<(), lightningstor_types::Error> {
|
||||
let now = unix_time_millis();
|
||||
let tasks = metadata
|
||||
.list_replicated_repair_tasks(REPAIR_SCAN_LIMIT)
|
||||
.await?;
|
||||
for mut task in tasks {
|
||||
if !task.is_due(now) {
|
||||
continue;
|
||||
}
|
||||
match backend.repair_chunk(&task).await {
|
||||
Ok(()) => {
|
||||
metadata.delete_replicated_repair_task(&task.id).await?;
|
||||
debug!(
|
||||
task_id = task.id,
|
||||
chunk_key = task.key,
|
||||
"repaired replicated chunk"
|
||||
);
|
||||
}
|
||||
Err(error) => {
|
||||
if task.attempt_count >= ORPHAN_REPAIR_DROP_ATTEMPTS {
|
||||
match backend.chunk_exists_anywhere(&task.key).await {
|
||||
Ok(false) => {
|
||||
warn!(
|
||||
task_id = task.id,
|
||||
chunk_key = task.key,
|
||||
attempts = task.attempt_count,
|
||||
"dropping orphan replicated repair task with no remaining source replica"
|
||||
);
|
||||
metadata.delete_replicated_repair_task(&task.id).await?;
|
||||
continue;
|
||||
}
|
||||
Ok(true) => {}
|
||||
Err(probe_error) => {
|
||||
warn!(
|
||||
task_id = task.id,
|
||||
chunk_key = task.key,
|
||||
error = %probe_error,
|
||||
"failed to probe global chunk existence while evaluating orphan repair task"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
let backoff = repair_backoff_millis(task.attempt_count);
|
||||
task.schedule_retry(error.to_string(), backoff);
|
||||
metadata.save_replicated_repair_task(&task).await?;
|
||||
warn!(
|
||||
task_id = task.id,
|
||||
chunk_key = task.key,
|
||||
attempts = task.attempt_count,
|
||||
backoff_millis = backoff,
|
||||
error = %error,
|
||||
"replicated chunk repair failed"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn unix_time_millis() -> u64 {
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_millis() as u64
|
||||
}
|
||||
|
||||
fn repair_backoff_millis(attempt_count: u32) -> u64 {
|
||||
let exponent = attempt_count.min(6);
|
||||
let multiplier = 1u64 << exponent;
|
||||
(REPAIR_BACKOFF_BASE_MILLIS.saturating_mul(multiplier)).min(REPAIR_BACKOFF_MAX_MILLIS)
|
||||
}
|
||||
|
||||
fn replicated_repair_queue_transiently_unready(error: &lightningstor_types::Error) -> bool {
|
||||
let rendered = error.to_string().to_ascii_lowercase();
|
||||
let transient = rendered.contains("region not found")
|
||||
|| rendered.contains("status: notfound")
|
||||
|| rendered.contains("metadata backend not ready")
|
||||
|| rendered.contains("notleader");
|
||||
if transient {
|
||||
return true;
|
||||
}
|
||||
|
||||
match error {
|
||||
lightningstor_types::Error::StorageError(message)
|
||||
| lightningstor_types::Error::Internal(message) => {
|
||||
let message = message.to_ascii_lowercase();
|
||||
message.contains("region not found")
|
||||
|| message.contains("status: notfound")
|
||||
|| message.contains("metadata backend not ready")
|
||||
|| message.contains("notleader")
|
||||
}
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::replicated_repair_queue_transiently_unready;
|
||||
|
||||
#[test]
|
||||
fn treats_region_not_found_as_transient_startup_state() {
|
||||
let error = lightningstor_types::Error::StorageError(
|
||||
"FlareDB scan failed: status: NotFound, message: \"region not found\"".to_string(),
|
||||
);
|
||||
assert!(replicated_repair_queue_transiently_unready(&error));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn treats_wrapped_storage_error_rendering_as_transient_startup_state() {
|
||||
let error = lightningstor_types::Error::StorageError(
|
||||
"FlareDB scan failed: status: NotFound, message: \"region not found\", details: [], metadata: MetadataMap { headers: {} }".to_string(),
|
||||
);
|
||||
assert!(replicated_repair_queue_transiently_unready(&error));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn keeps_real_repair_failures_as_warnings() {
|
||||
let error =
|
||||
lightningstor_types::Error::StorageError("replication checksum mismatch".to_string());
|
||||
assert!(!replicated_repair_queue_transiently_unready(&error));
|
||||
}
|
||||
}
|
||||
|
|
@ -10,13 +10,17 @@ use axum::{
|
|||
middleware::Next,
|
||||
response::{IntoResponse, Response},
|
||||
};
|
||||
use crate::tenant::TenantContext;
|
||||
use hmac::{Hmac, Mac};
|
||||
use iam_api::proto::{iam_credential_client::IamCredentialClient, GetSecretKeyRequest};
|
||||
use sha2::{Digest, Sha256};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use tokio::sync::RwLock;
|
||||
use tokio::sync::{Mutex, RwLock};
|
||||
use tonic::transport::Channel;
|
||||
use tracing::{debug, warn};
|
||||
use url::form_urlencoded;
|
||||
use std::time::{Duration as StdDuration, Instant};
|
||||
|
||||
type HmacSha256 = Hmac<Sha256>;
|
||||
const DEFAULT_MAX_AUTH_BODY_BYTES: usize = 1024 * 1024 * 1024;
|
||||
|
|
@ -27,6 +31,13 @@ pub(crate) struct VerifiedBodyBytes(pub Bytes);
|
|||
#[derive(Clone, Debug)]
|
||||
pub(crate) struct VerifiedPayloadHash(pub String);
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct VerifiedTenantContext(pub TenantContext);
|
||||
|
||||
fn should_buffer_auth_body(payload_hash_header: Option<&str>) -> bool {
|
||||
payload_hash_header.is_none()
|
||||
}
|
||||
|
||||
/// SigV4 authentication state
|
||||
#[derive(Clone)]
|
||||
pub struct AuthState {
|
||||
|
|
@ -40,21 +51,73 @@ pub struct AuthState {
|
|||
aws_service: String,
|
||||
}
|
||||
|
||||
/// Placeholder IAM client (will integrate with real IAM later)
|
||||
pub struct IamClient {
|
||||
// Stores access_key_id -> secret_key mapping
|
||||
mode: IamClientMode,
|
||||
credential_cache: Arc<RwLock<HashMap<String, CachedCredential>>>,
|
||||
cache_ttl: StdDuration,
|
||||
}
|
||||
|
||||
enum IamClientMode {
|
||||
Env {
|
||||
credentials: std::collections::HashMap<String, String>,
|
||||
},
|
||||
Grpc {
|
||||
endpoint: String,
|
||||
channel: Arc<Mutex<Option<Channel>>>,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Eq)]
|
||||
pub(crate) struct ResolvedCredential {
|
||||
pub secret_key: String,
|
||||
pub principal_id: String,
|
||||
pub org_id: Option<String>,
|
||||
pub project_id: Option<String>,
|
||||
}
|
||||
|
||||
struct CachedCredential {
|
||||
credential: ResolvedCredential,
|
||||
cached_at: Instant,
|
||||
}
|
||||
|
||||
impl IamClient {
|
||||
/// Create a new IamClient loading credentials from environment variables for MVP.
|
||||
/// Create a new IAM client. If an endpoint is supplied, use the IAM gRPC API.
|
||||
pub fn new(iam_endpoint: Option<String>) -> Self {
|
||||
let cache_ttl = std::env::var("LIGHTNINGSTOR_S3_IAM_CACHE_TTL_SECS")
|
||||
.ok()
|
||||
.and_then(|value| value.parse::<u64>().ok())
|
||||
.map(StdDuration::from_secs)
|
||||
.unwrap_or_else(|| StdDuration::from_secs(30));
|
||||
|
||||
if let Some(endpoint) = iam_endpoint
|
||||
.map(|value| normalize_iam_endpoint(&value))
|
||||
.filter(|value| !value.is_empty())
|
||||
{
|
||||
return Self {
|
||||
mode: IamClientMode::Grpc {
|
||||
endpoint,
|
||||
channel: Arc::new(Mutex::new(None)),
|
||||
},
|
||||
credential_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
cache_ttl,
|
||||
};
|
||||
}
|
||||
|
||||
Self {
|
||||
mode: IamClientMode::Env {
|
||||
credentials: Self::load_env_credentials(),
|
||||
},
|
||||
credential_cache: Arc::new(RwLock::new(HashMap::new())),
|
||||
cache_ttl,
|
||||
}
|
||||
}
|
||||
|
||||
/// Load credentials from environment variables for fallback/testing.
|
||||
///
|
||||
/// Supports two formats:
|
||||
/// 1. Single credential: S3_ACCESS_KEY_ID + S3_SECRET_KEY
|
||||
/// 2. Multiple credentials: S3_CREDENTIALS="key1:secret1,key2:secret2,..."
|
||||
///
|
||||
/// TODO: Replace with proper IAM gRPC integration (see T060)
|
||||
pub fn new() -> Self {
|
||||
fn load_env_credentials() -> std::collections::HashMap<String, String> {
|
||||
let mut credentials = std::collections::HashMap::new();
|
||||
|
||||
// Option 1: Multiple credentials via S3_CREDENTIALS
|
||||
|
|
@ -87,28 +150,160 @@ impl IamClient {
|
|||
warn!("Set S3_CREDENTIALS or S3_ACCESS_KEY_ID/S3_SECRET_KEY to enable access.");
|
||||
}
|
||||
|
||||
Self { credentials }
|
||||
credentials
|
||||
}
|
||||
|
||||
/// Validate access key and return secret key
|
||||
pub async fn get_secret_key(&self, access_key_id: &str) -> Result<String, String> {
|
||||
self.credentials
|
||||
#[cfg(test)]
|
||||
fn env_credentials(&self) -> Option<&std::collections::HashMap<String, String>> {
|
||||
match &self.mode {
|
||||
IamClientMode::Env { credentials } => Some(credentials),
|
||||
IamClientMode::Grpc { .. } => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn env_default_tenant() -> (Option<String>, Option<String>) {
|
||||
let org_id = std::env::var("S3_TENANT_ORG_ID")
|
||||
.ok()
|
||||
.or_else(|| std::env::var("S3_ORG_ID").ok())
|
||||
.or_else(|| Some("default".to_string()));
|
||||
let project_id = std::env::var("S3_TENANT_PROJECT_ID")
|
||||
.ok()
|
||||
.or_else(|| std::env::var("S3_PROJECT_ID").ok())
|
||||
.or_else(|| Some("default".to_string()));
|
||||
(org_id, project_id)
|
||||
}
|
||||
|
||||
/// Validate access key and resolve the credential context.
|
||||
pub async fn get_credential(&self, access_key_id: &str) -> Result<ResolvedCredential, String> {
|
||||
match &self.mode {
|
||||
IamClientMode::Env { credentials } => {
|
||||
let secret_key = credentials
|
||||
.get(access_key_id)
|
||||
.cloned()
|
||||
.ok_or_else(|| "Access key ID not found".to_string())
|
||||
.ok_or_else(|| "Access key ID not found".to_string())?;
|
||||
let (org_id, project_id) = Self::env_default_tenant();
|
||||
Ok(ResolvedCredential {
|
||||
secret_key,
|
||||
principal_id: access_key_id.to_string(),
|
||||
org_id,
|
||||
project_id,
|
||||
})
|
||||
}
|
||||
IamClientMode::Grpc { endpoint, channel } => {
|
||||
if let Some(credential) = self.cached_credential(access_key_id).await {
|
||||
return Ok(credential);
|
||||
}
|
||||
|
||||
let response = self
|
||||
.grpc_get_secret_key(endpoint, channel, access_key_id)
|
||||
.await?;
|
||||
let response = response.into_inner();
|
||||
let credential = ResolvedCredential {
|
||||
secret_key: response.secret_key,
|
||||
principal_id: response.principal_id,
|
||||
org_id: response.org_id,
|
||||
project_id: response.project_id,
|
||||
};
|
||||
self.cache_credential(access_key_id, &credential).await;
|
||||
Ok(credential)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn cached_credential(&self, access_key_id: &str) -> Option<ResolvedCredential> {
|
||||
let cache = self.credential_cache.read().await;
|
||||
cache.get(access_key_id).and_then(|entry| {
|
||||
if entry.cached_at.elapsed() <= self.cache_ttl {
|
||||
Some(entry.credential.clone())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn cache_credential(&self, access_key_id: &str, credential: &ResolvedCredential) {
|
||||
let mut cache = self.credential_cache.write().await;
|
||||
cache.insert(
|
||||
access_key_id.to_string(),
|
||||
CachedCredential {
|
||||
credential: credential.clone(),
|
||||
cached_at: Instant::now(),
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
async fn grpc_channel(
|
||||
endpoint: &str,
|
||||
channel: &Arc<Mutex<Option<Channel>>>,
|
||||
) -> Result<Channel, String> {
|
||||
let mut cached = channel.lock().await;
|
||||
if let Some(existing) = cached.as_ref() {
|
||||
return Ok(existing.clone());
|
||||
}
|
||||
|
||||
let created = Channel::from_shared(endpoint.to_string())
|
||||
.map_err(|e| format!("failed to parse IAM credential endpoint: {}", e))?
|
||||
.connect()
|
||||
.await
|
||||
.map_err(|e| format!("failed to connect to IAM credential service: {}", e))?;
|
||||
*cached = Some(created.clone());
|
||||
Ok(created)
|
||||
}
|
||||
|
||||
async fn invalidate_grpc_channel(channel: &Arc<Mutex<Option<Channel>>>) {
|
||||
let mut cached = channel.lock().await;
|
||||
*cached = None;
|
||||
}
|
||||
|
||||
async fn grpc_get_secret_key(
|
||||
&self,
|
||||
endpoint: &str,
|
||||
channel: &Arc<Mutex<Option<Channel>>>,
|
||||
access_key_id: &str,
|
||||
) -> Result<tonic::Response<iam_api::proto::GetSecretKeyResponse>, String> {
|
||||
for attempt in 0..2 {
|
||||
let grpc_channel = Self::grpc_channel(endpoint, channel).await?;
|
||||
let mut client = IamCredentialClient::new(grpc_channel);
|
||||
match client
|
||||
.get_secret_key(GetSecretKeyRequest {
|
||||
access_key_id: access_key_id.to_string(),
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(response) => return Ok(response),
|
||||
Err(status)
|
||||
if attempt == 0
|
||||
&& matches!(
|
||||
status.code(),
|
||||
tonic::Code::Unavailable
|
||||
| tonic::Code::Cancelled
|
||||
| tonic::Code::Unknown
|
||||
| tonic::Code::DeadlineExceeded
|
||||
| tonic::Code::Internal
|
||||
) =>
|
||||
{
|
||||
Self::invalidate_grpc_channel(channel).await;
|
||||
}
|
||||
Err(status) => return Err(status.message().to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
Err("IAM credential lookup exhausted retries".to_string())
|
||||
}
|
||||
}
|
||||
|
||||
fn normalize_iam_endpoint(endpoint: &str) -> String {
|
||||
if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
|
||||
endpoint.to_string()
|
||||
} else {
|
||||
format!("http://{}", endpoint)
|
||||
}
|
||||
}
|
||||
|
||||
impl AuthState {
|
||||
/// Create new auth state with IAM integration
|
||||
pub fn new(iam_endpoint: Option<String>) -> Self {
|
||||
let iam_client = if let Some(_endpoint) = iam_endpoint {
|
||||
// TODO: Connect to real IAM gRPC service
|
||||
// For now, if an endpoint is provided, we still use our env var based client
|
||||
Some(Arc::new(RwLock::new(IamClient::new())))
|
||||
} else {
|
||||
Some(Arc::new(RwLock::new(IamClient::new())))
|
||||
};
|
||||
let iam_client = Some(Arc::new(RwLock::new(IamClient::new(iam_endpoint))));
|
||||
|
||||
Self {
|
||||
iam_client,
|
||||
|
|
@ -198,9 +393,9 @@ pub async fn sigv4_auth_middleware(
|
|||
};
|
||||
|
||||
// Get secret key from IAM (or use dummy for MVP)
|
||||
let secret_key = if let Some(ref iam) = auth_state.iam_client {
|
||||
match iam.read().await.get_secret_key(&access_key_id).await {
|
||||
Ok(key) => key,
|
||||
let credential = if let Some(ref iam) = auth_state.iam_client {
|
||||
match iam.read().await.get_credential(&access_key_id).await {
|
||||
Ok(credential) => credential,
|
||||
Err(e) => {
|
||||
warn!("IAM credential validation failed: {}", e);
|
||||
return error_response(
|
||||
|
|
@ -211,18 +406,22 @@ pub async fn sigv4_auth_middleware(
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// This case should ideally not be hit with the current IamClient::new() logic
|
||||
// but kept for safety.
|
||||
debug!("No IAM integration, using dummy secret key if IamClient wasn't initialized.");
|
||||
"dummy_secret_key_for_mvp".to_string()
|
||||
ResolvedCredential {
|
||||
secret_key: "dummy_secret_key_for_mvp".to_string(),
|
||||
principal_id: access_key_id.clone(),
|
||||
org_id: Some("default".to_string()),
|
||||
project_id: Some("default".to_string()),
|
||||
}
|
||||
};
|
||||
let secret_key = credential.secret_key.as_str();
|
||||
|
||||
let payload_hash_header = headers
|
||||
.get("x-amz-content-sha256")
|
||||
.and_then(|value| value.to_str().ok())
|
||||
.filter(|value| !value.is_empty())
|
||||
.map(str::to_string);
|
||||
let should_buffer_body = !matches!(payload_hash_header.as_deref(), Some(hash) if hash != "UNSIGNED-PAYLOAD");
|
||||
let should_buffer_body = should_buffer_auth_body(payload_hash_header.as_deref());
|
||||
|
||||
let body_bytes = if should_buffer_body {
|
||||
let max_body_bytes = std::env::var("S3_MAX_AUTH_BODY_BYTES")
|
||||
|
|
@ -282,7 +481,7 @@ pub async fn sigv4_auth_middleware(
|
|||
);
|
||||
|
||||
let expected_signature = match compute_sigv4_signature(
|
||||
&secret_key,
|
||||
secret_key,
|
||||
&method,
|
||||
&uri,
|
||||
&headers,
|
||||
|
|
@ -310,6 +509,21 @@ pub async fn sigv4_auth_middleware(
|
|||
);
|
||||
}
|
||||
|
||||
match (credential.org_id, credential.project_id) {
|
||||
(Some(org_id), Some(project_id)) => {
|
||||
request
|
||||
.extensions_mut()
|
||||
.insert(VerifiedTenantContext(TenantContext { org_id, project_id }));
|
||||
}
|
||||
_ => {
|
||||
return error_response(
|
||||
StatusCode::FORBIDDEN,
|
||||
"AccessDenied",
|
||||
"S3 credential is missing tenant scope",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Auth successful
|
||||
debug!("SigV4 auth successful for access_key={}", access_key_id);
|
||||
next.run(request).await
|
||||
|
|
@ -558,6 +772,97 @@ fn error_response(status: StatusCode, code: &str, message: &str) -> Response {
|
|||
mod tests {
|
||||
use super::*;
|
||||
use axum::http::HeaderValue;
|
||||
use iam_api::proto::{
|
||||
iam_credential_server::{IamCredential, IamCredentialServer},
|
||||
CreateS3CredentialRequest, CreateS3CredentialResponse, Credential, GetSecretKeyResponse,
|
||||
ListCredentialsRequest, ListCredentialsResponse, RevokeCredentialRequest,
|
||||
RevokeCredentialResponse,
|
||||
};
|
||||
use std::collections::HashMap;
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::{atomic::{AtomicUsize, Ordering}, Mutex};
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::time::{sleep, Duration};
|
||||
use tonic::{Request as TonicRequest, Response as TonicResponse, Status};
|
||||
use tonic::transport::Server;
|
||||
|
||||
static ENV_LOCK: Mutex<()> = Mutex::new(());
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
struct MockIamCredentialService {
|
||||
secrets: Arc<HashMap<String, String>>,
|
||||
get_secret_calls: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
#[tonic::async_trait]
|
||||
impl IamCredential for MockIamCredentialService {
|
||||
async fn create_s3_credential(
|
||||
&self,
|
||||
_request: TonicRequest<CreateS3CredentialRequest>,
|
||||
) -> Result<TonicResponse<CreateS3CredentialResponse>, Status> {
|
||||
Err(Status::unimplemented("not needed in test"))
|
||||
}
|
||||
|
||||
async fn get_secret_key(
|
||||
&self,
|
||||
request: TonicRequest<GetSecretKeyRequest>,
|
||||
) -> Result<TonicResponse<GetSecretKeyResponse>, Status> {
|
||||
let access_key_id = request.into_inner().access_key_id;
|
||||
self.get_secret_calls.fetch_add(1, Ordering::SeqCst);
|
||||
let Some(secret_key) = self.secrets.get(&access_key_id) else {
|
||||
return Err(Status::not_found("access key not found"));
|
||||
};
|
||||
Ok(TonicResponse::new(GetSecretKeyResponse {
|
||||
secret_key: secret_key.clone(),
|
||||
principal_id: "test-principal".to_string(),
|
||||
expires_at: None,
|
||||
org_id: Some("test-org".to_string()),
|
||||
project_id: Some("test-project".to_string()),
|
||||
principal_kind: iam_api::proto::PrincipalKind::ServiceAccount as i32,
|
||||
}))
|
||||
}
|
||||
|
||||
async fn list_credentials(
|
||||
&self,
|
||||
_request: TonicRequest<ListCredentialsRequest>,
|
||||
) -> Result<TonicResponse<ListCredentialsResponse>, Status> {
|
||||
Ok(TonicResponse::new(ListCredentialsResponse {
|
||||
credentials: Vec::<Credential>::new(),
|
||||
}))
|
||||
}
|
||||
|
||||
async fn revoke_credential(
|
||||
&self,
|
||||
_request: TonicRequest<RevokeCredentialRequest>,
|
||||
) -> Result<TonicResponse<RevokeCredentialResponse>, Status> {
|
||||
Ok(TonicResponse::new(RevokeCredentialResponse { success: true }))
|
||||
}
|
||||
}
|
||||
|
||||
async fn start_mock_iam(secrets: HashMap<String, String>) -> (SocketAddr, Arc<AtomicUsize>) {
|
||||
let listener = TcpListener::bind("127.0.0.1:0").await.unwrap();
|
||||
let addr = listener.local_addr().unwrap();
|
||||
let get_secret_calls = Arc::new(AtomicUsize::new(0));
|
||||
let service = MockIamCredentialService {
|
||||
secrets: Arc::new(secrets),
|
||||
get_secret_calls: get_secret_calls.clone(),
|
||||
};
|
||||
drop(listener);
|
||||
tokio::spawn(async move {
|
||||
Server::builder()
|
||||
.add_service(IamCredentialServer::new(service))
|
||||
.serve(addr)
|
||||
.await
|
||||
.unwrap();
|
||||
});
|
||||
for _ in 0..20 {
|
||||
if tokio::net::TcpStream::connect(addr).await.is_ok() {
|
||||
return (addr, get_secret_calls);
|
||||
}
|
||||
sleep(Duration::from_millis(25)).await;
|
||||
}
|
||||
panic!("mock IAM server did not start on {}", addr);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_parse_auth_header() {
|
||||
|
|
@ -657,6 +962,13 @@ mod tests {
|
|||
assert_eq!(hashed_payload, "signed-payload-hash");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_should_buffer_auth_body_only_when_hash_header_missing() {
|
||||
assert!(should_buffer_auth_body(None));
|
||||
assert!(!should_buffer_auth_body(Some("signed-payload-hash")));
|
||||
assert!(!should_buffer_auth_body(Some("UNSIGNED-PAYLOAD")));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_build_string_to_sign() {
|
||||
let amz_date = "20231201T000000Z";
|
||||
|
|
@ -677,34 +989,77 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_iam_client_multi_credentials() {
|
||||
let _guard = ENV_LOCK.lock().unwrap();
|
||||
// Test parsing S3_CREDENTIALS format
|
||||
std::env::set_var("S3_CREDENTIALS", "key1:secret1,key2:secret2,key3:secret3");
|
||||
let client = IamClient::new();
|
||||
let client = IamClient::new(None);
|
||||
let credentials = client.env_credentials().unwrap();
|
||||
|
||||
assert_eq!(client.credentials.len(), 3);
|
||||
assert_eq!(client.credentials.get("key1"), Some(&"secret1".to_string()));
|
||||
assert_eq!(client.credentials.get("key2"), Some(&"secret2".to_string()));
|
||||
assert_eq!(client.credentials.get("key3"), Some(&"secret3".to_string()));
|
||||
assert_eq!(credentials.len(), 3);
|
||||
assert_eq!(credentials.get("key1"), Some(&"secret1".to_string()));
|
||||
assert_eq!(credentials.get("key2"), Some(&"secret2".to_string()));
|
||||
assert_eq!(credentials.get("key3"), Some(&"secret3".to_string()));
|
||||
|
||||
std::env::remove_var("S3_CREDENTIALS");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_iam_client_single_credentials() {
|
||||
let _guard = ENV_LOCK.lock().unwrap();
|
||||
// Test legacy S3_ACCESS_KEY_ID/S3_SECRET_KEY format
|
||||
std::env::remove_var("S3_CREDENTIALS");
|
||||
std::env::set_var("S3_ACCESS_KEY_ID", "test_key");
|
||||
std::env::set_var("S3_SECRET_KEY", "test_secret");
|
||||
|
||||
let client = IamClient::new();
|
||||
let client = IamClient::new(None);
|
||||
let credentials = client.env_credentials().unwrap();
|
||||
|
||||
assert_eq!(client.credentials.len(), 1);
|
||||
assert_eq!(client.credentials.get("test_key"), Some(&"test_secret".to_string()));
|
||||
assert_eq!(credentials.len(), 1);
|
||||
assert_eq!(credentials.get("test_key"), Some(&"test_secret".to_string()));
|
||||
|
||||
std::env::remove_var("S3_ACCESS_KEY_ID");
|
||||
std::env::remove_var("S3_SECRET_KEY");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_iam_client_grpc_lookup() {
|
||||
let (addr, _calls) = start_mock_iam(HashMap::from([(
|
||||
"grpc_key".to_string(),
|
||||
"grpc_secret".to_string(),
|
||||
)]))
|
||||
.await;
|
||||
let client = IamClient::new(Some(addr.to_string()));
|
||||
|
||||
let credential = client.get_credential("grpc_key").await.unwrap();
|
||||
assert_eq!(credential.secret_key, "grpc_secret");
|
||||
assert_eq!(credential.org_id.as_deref(), Some("test-org"));
|
||||
assert_eq!(credential.project_id.as_deref(), Some("test-project"));
|
||||
assert_eq!(
|
||||
client.get_credential("missing").await.unwrap_err(),
|
||||
"access key not found"
|
||||
);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_iam_client_grpc_cache_reuses_secret() {
|
||||
let (addr, calls) = start_mock_iam(HashMap::from([(
|
||||
"grpc_key".to_string(),
|
||||
"grpc_secret".to_string(),
|
||||
)]))
|
||||
.await;
|
||||
let client = IamClient::new(Some(addr.to_string()));
|
||||
|
||||
assert_eq!(
|
||||
client.get_credential("grpc_key").await.unwrap().secret_key,
|
||||
"grpc_secret"
|
||||
);
|
||||
assert_eq!(
|
||||
client.get_credential("grpc_key").await.unwrap().secret_key,
|
||||
"grpc_secret"
|
||||
);
|
||||
assert_eq!(calls.load(Ordering::SeqCst), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_complete_sigv4_signature() {
|
||||
// Test with AWS example credentials (from AWS docs)
|
||||
|
|
@ -1039,18 +1394,20 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_security_credential_lookup_unknown_key() {
|
||||
let _guard = ENV_LOCK.lock().unwrap();
|
||||
// Test that unknown access keys return the correct result
|
||||
std::env::remove_var("S3_CREDENTIALS");
|
||||
std::env::set_var("S3_ACCESS_KEY_ID", "known_key");
|
||||
std::env::set_var("S3_SECRET_KEY", "known_secret");
|
||||
|
||||
let client = IamClient::new();
|
||||
let client = IamClient::new(None);
|
||||
let credentials = client.env_credentials().unwrap();
|
||||
|
||||
// Known key should be found in credentials map
|
||||
assert_eq!(client.credentials.get("known_key"), Some(&"known_secret".to_string()));
|
||||
assert_eq!(credentials.get("known_key"), Some(&"known_secret".to_string()));
|
||||
|
||||
// Unknown key should not be found
|
||||
assert_eq!(client.credentials.get("unknown_key"), None);
|
||||
assert_eq!(credentials.get("unknown_key"), None);
|
||||
|
||||
std::env::remove_var("S3_ACCESS_KEY_ID");
|
||||
std::env::remove_var("S3_SECRET_KEY");
|
||||
|
|
@ -1058,33 +1415,36 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_security_empty_credentials() {
|
||||
let _guard = ENV_LOCK.lock().unwrap();
|
||||
// Test that IamClient keeps credentials empty when none provided
|
||||
std::env::remove_var("S3_CREDENTIALS");
|
||||
std::env::remove_var("S3_ACCESS_KEY_ID");
|
||||
std::env::remove_var("S3_SECRET_KEY");
|
||||
|
||||
let client = IamClient::new();
|
||||
let client = IamClient::new(None);
|
||||
|
||||
// No credentials configured
|
||||
assert!(client.credentials.is_empty());
|
||||
assert!(client.env_credentials().unwrap().is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_security_malformed_s3_credentials_env() {
|
||||
let _guard = ENV_LOCK.lock().unwrap();
|
||||
// Test that malformed S3_CREDENTIALS are handled gracefully
|
||||
|
||||
// Missing colon separator
|
||||
std::env::set_var("S3_CREDENTIALS", "key1_secret1,key2:secret2");
|
||||
let client = IamClient::new();
|
||||
let client = IamClient::new(None);
|
||||
let credentials = client.env_credentials().unwrap();
|
||||
// Should only parse the valid pair (key2:secret2)
|
||||
assert_eq!(client.credentials.len(), 1);
|
||||
assert!(client.credentials.contains_key("key2"));
|
||||
assert_eq!(credentials.len(), 1);
|
||||
assert!(credentials.contains_key("key2"));
|
||||
|
||||
// Empty pairs
|
||||
std::env::set_var("S3_CREDENTIALS", "key1:secret1,,key2:secret2");
|
||||
let client2 = IamClient::new();
|
||||
let client2 = IamClient::new(None);
|
||||
// Should parse both valid pairs, skip empty
|
||||
assert_eq!(client2.credentials.len(), 2);
|
||||
assert_eq!(client2.env_credentials().unwrap().len(), 2);
|
||||
|
||||
std::env::remove_var("S3_CREDENTIALS");
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,4 +7,4 @@ mod router;
|
|||
mod xml;
|
||||
|
||||
pub use auth::{AuthState, sigv4_auth_middleware};
|
||||
pub use router::{create_router, create_router_with_state};
|
||||
pub use router::{create_router, create_router_with_auth, create_router_with_state};
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -66,6 +66,9 @@ pub struct ListBucketResult {
|
|||
pub name: String,
|
||||
#[serde(rename = "Prefix")]
|
||||
pub prefix: String,
|
||||
#[serde(rename = "Marker")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub marker: Option<String>,
|
||||
#[serde(rename = "Delimiter")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub delimiter: Option<String>,
|
||||
|
|
@ -73,6 +76,9 @@ pub struct ListBucketResult {
|
|||
pub max_keys: u32,
|
||||
#[serde(rename = "IsTruncated")]
|
||||
pub is_truncated: bool,
|
||||
#[serde(rename = "NextMarker")]
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub next_marker: Option<String>,
|
||||
#[serde(rename = "Contents", default)]
|
||||
pub contents: Vec<ObjectEntry>,
|
||||
#[serde(rename = "CommonPrefixes", default)]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
use tonic::{metadata::MetadataMap, Status};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
pub struct TenantContext {
|
||||
pub org_id: String,
|
||||
pub project_id: String,
|
||||
|
|
|
|||
146
nix/ci/flake.lock
generated
146
nix/ci/flake.lock
generated
|
|
@ -1,5 +1,26 @@
|
|||
{
|
||||
"nodes": {
|
||||
"disko": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"photoncloud",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1765326679,
|
||||
"narHash": "sha256-fTLX9kDwLr9Y0rH/nG+h1XG5UU+jBcy0PFYn5eneRX8=",
|
||||
"owner": "nix-community",
|
||||
"repo": "disko",
|
||||
"rev": "d64e5cdca35b5fad7c504f615357a7afe6d9c49e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-community",
|
||||
"repo": "disko",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils": {
|
||||
"inputs": {
|
||||
"systems": "systems"
|
||||
|
|
@ -18,6 +39,43 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"flake-utils_2": {
|
||||
"inputs": {
|
||||
"systems": "systems_2"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1731533236,
|
||||
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "numtide",
|
||||
"repo": "flake-utils",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nix-nos": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"photoncloud",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"path": "./nix-nos",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"path": "./nix-nos",
|
||||
"type": "path"
|
||||
},
|
||||
"parent": [
|
||||
"photoncloud"
|
||||
]
|
||||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1765186076,
|
||||
|
|
@ -34,14 +92,71 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1765186076,
|
||||
"narHash": "sha256-hM20uyap1a0M9d344I692r+ik4gTMyj60cQWO+hAYP8=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "addf7cf5f383a3101ecfba091b98d0a1263dc9b8",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "NixOS",
|
||||
"ref": "nixos-unstable",
|
||||
"repo": "nixpkgs",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"photoncloud": {
|
||||
"inputs": {
|
||||
"disko": "disko",
|
||||
"flake-utils": "flake-utils_2",
|
||||
"nix-nos": "nix-nos",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"rust-overlay": "rust-overlay",
|
||||
"systems": "systems_3"
|
||||
},
|
||||
"locked": {
|
||||
"path": "../..",
|
||||
"type": "path"
|
||||
},
|
||||
"original": {
|
||||
"path": "../..",
|
||||
"type": "path"
|
||||
},
|
||||
"parent": []
|
||||
},
|
||||
"root": {
|
||||
"inputs": {
|
||||
"flake-utils": "flake-utils",
|
||||
"nixpkgs": "nixpkgs",
|
||||
"rust-overlay": "rust-overlay"
|
||||
"photoncloud": "photoncloud",
|
||||
"rust-overlay": "rust-overlay_2"
|
||||
}
|
||||
},
|
||||
"rust-overlay": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"photoncloud",
|
||||
"nixpkgs"
|
||||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1765465581,
|
||||
"narHash": "sha256-fCXT0aZXmTalM3NPCTedVs9xb0egBG5BOZkcrYo5PGE=",
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"rev": "99cc5667eece98bb35dcf35f7e511031a8b7a125",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "oxalica",
|
||||
"repo": "rust-overlay",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"rust-overlay_2": {
|
||||
"inputs": {
|
||||
"nixpkgs": [
|
||||
"nixpkgs"
|
||||
|
|
@ -75,6 +190,35 @@
|
|||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_2": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"type": "github"
|
||||
}
|
||||
},
|
||||
"systems_3": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
||||
"owner": "nix-systems",
|
||||
"repo": "default",
|
||||
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
"id": "systems",
|
||||
"type": "indirect"
|
||||
}
|
||||
}
|
||||
},
|
||||
"root": "root",
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
|
|||
nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
|
||||
|
||||
flake-utils.url = "github:numtide/flake-utils";
|
||||
photoncloud.url = "path:../..";
|
||||
|
||||
rust-overlay = {
|
||||
url = "github:oxalica/rust-overlay";
|
||||
|
|
@ -12,7 +13,7 @@
|
|||
};
|
||||
};
|
||||
|
||||
outputs = { self, nixpkgs, flake-utils, rust-overlay }:
|
||||
outputs = { self, nixpkgs, flake-utils, photoncloud, rust-overlay }:
|
||||
flake-utils.lib.eachDefaultSystem (system:
|
||||
let
|
||||
overlays = [ (import rust-overlay) ];
|
||||
|
|
@ -201,7 +202,7 @@
|
|||
|
||||
if [[ "$no_logs" == "0" ]]; then
|
||||
local out
|
||||
out="$logdir/shared_${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
|
||||
out="$logdir/shared_''${crate}.$(echo "$title" | tr '[:upper:]' '[:lower:]' | tr ' ' '_' | tr -cd 'a-z0-9_').log"
|
||||
(cd "$repo_root" && bash -c "$cmd") 2>&1 | tee "$out"
|
||||
else
|
||||
(cd "$repo_root" && bash -c "$cmd")
|
||||
|
|
@ -291,6 +292,11 @@
|
|||
${gate}/bin/photoncloud-gate --tier 0 --no-logs
|
||||
touch $out/ok
|
||||
'';
|
||||
checks.deployer-vm-smoke = photoncloud.checks.${system}.deployer-vm-smoke;
|
||||
checks.deployer-vm-rollback = photoncloud.checks.${system}.deployer-vm-rollback;
|
||||
checks.deployer-bootstrap-e2e = photoncloud.checks.${system}.deployer-bootstrap-e2e;
|
||||
checks.host-lifecycle-e2e = photoncloud.checks.${system}.host-lifecycle-e2e;
|
||||
checks.fleet-scheduler-e2e = photoncloud.checks.${system}.fleet-scheduler-e2e;
|
||||
|
||||
devShells.default = pkgs.mkShell {
|
||||
name = "photoncloud-ci-dev";
|
||||
|
|
|
|||
67
nix/images/deployer-vm-smoke-target.nix
Normal file
67
nix/images/deployer-vm-smoke-target.nix
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
{ lib, modulesPath, ... }:
|
||||
|
||||
{
|
||||
imports = [
|
||||
"${modulesPath}/virtualisation/qemu-vm.nix"
|
||||
"${modulesPath}/testing/test-instrumentation.nix"
|
||||
];
|
||||
|
||||
boot.loader.grub = {
|
||||
enable = true;
|
||||
device = "/dev/vda";
|
||||
forceInstall = true;
|
||||
};
|
||||
|
||||
fileSystems."/" = {
|
||||
device = "/dev/disk/by-label/nixos";
|
||||
fsType = "ext4";
|
||||
};
|
||||
|
||||
networking.hostName = "worker";
|
||||
networking.firewall.enable = false;
|
||||
networking.useDHCP = lib.mkForce false;
|
||||
networking.dhcpcd.enable = lib.mkForce false;
|
||||
systemd.network = {
|
||||
enable = true;
|
||||
networks."10-eth0" = {
|
||||
matchConfig.Name = "eth0";
|
||||
networkConfig.DHCP = "yes";
|
||||
linkConfig.RequiredForOnline = "routable";
|
||||
};
|
||||
networks."20-eth1" = {
|
||||
matchConfig.Name = "eth1";
|
||||
address = [ "192.168.1.2/24" ];
|
||||
linkConfig.RequiredForOnline = "routable";
|
||||
};
|
||||
};
|
||||
|
||||
nix.registry = lib.mkForce { };
|
||||
nix.nixPath = lib.mkForce [ ];
|
||||
nix.channel.enable = false;
|
||||
nix.settings = {
|
||||
experimental-features = [
|
||||
"nix-command"
|
||||
"flakes"
|
||||
];
|
||||
flake-registry = "";
|
||||
};
|
||||
nixpkgs.flake = {
|
||||
source = lib.mkForce null;
|
||||
setFlakeRegistry = lib.mkForce false;
|
||||
setNixPath = lib.mkForce false;
|
||||
};
|
||||
|
||||
system.switch.enable = lib.mkForce true;
|
||||
system.nixos.label = lib.mkForce "vm-smoke-target";
|
||||
system.nixos.version = lib.mkForce "vm-smoke-target";
|
||||
system.nixos.versionSuffix = lib.mkForce "-vm-smoke-target";
|
||||
environment.etc."photon-vm-smoke-target".text = "vm-smoke-target\n";
|
||||
|
||||
documentation.enable = false;
|
||||
documentation.nixos.enable = false;
|
||||
documentation.man.enable = false;
|
||||
documentation.info.enable = false;
|
||||
documentation.doc.enable = false;
|
||||
|
||||
system.stateVersion = "24.11";
|
||||
}
|
||||
|
|
@ -33,6 +33,12 @@ let
|
|||
|
||||
mkDesiredSystemType = types: types.submodule {
|
||||
options = {
|
||||
deploymentId = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional host deployment identifier owning this desired system";
|
||||
};
|
||||
|
||||
nixosConfiguration = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
|
|
@ -62,6 +68,119 @@ let
|
|||
default = null;
|
||||
description = "Whether nix-agent should roll back when the health check fails";
|
||||
};
|
||||
|
||||
drainBeforeApply = mkOption {
|
||||
type = types.nullOr types.bool;
|
||||
default = null;
|
||||
description = "Whether the controller should drain the node before issuing this desired system";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
mkHostDeploymentSelectorType = types: types.submodule {
|
||||
options = {
|
||||
nodeIds = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Explicit node IDs targeted by the deployment";
|
||||
};
|
||||
|
||||
roles = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Node roles targeted by the deployment";
|
||||
};
|
||||
|
||||
pools = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Node pools targeted by the deployment";
|
||||
};
|
||||
|
||||
nodeClasses = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Node classes targeted by the deployment";
|
||||
};
|
||||
|
||||
matchLabels = mkOption {
|
||||
type = types.attrsOf types.str;
|
||||
default = { };
|
||||
description = "Label selectors applied to target nodes";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
mkHostDeploymentType = types:
|
||||
let
|
||||
selectorType = mkHostDeploymentSelectorType types;
|
||||
in types.submodule {
|
||||
options = {
|
||||
selector = mkOption {
|
||||
type = selectorType;
|
||||
default = { };
|
||||
description = "Node selector used by the host deployment";
|
||||
};
|
||||
|
||||
nixosConfiguration = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Name of the nixosConfigurations output to roll out";
|
||||
};
|
||||
|
||||
flakeRef = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Explicit flake reference used during rollout";
|
||||
};
|
||||
|
||||
batchSize = mkOption {
|
||||
type = types.nullOr types.int;
|
||||
default = null;
|
||||
description = "Maximum number of nodes started per reconciliation wave";
|
||||
};
|
||||
|
||||
maxUnavailable = mkOption {
|
||||
type = types.nullOr types.int;
|
||||
default = null;
|
||||
description = "Maximum number of unavailable nodes allowed during rollout";
|
||||
};
|
||||
|
||||
healthCheckCommand = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
description = "Health check command executed by nix-agent after activation";
|
||||
};
|
||||
|
||||
switchAction = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "switch-to-configuration action used by nix-agent";
|
||||
};
|
||||
|
||||
rollbackOnFailure = mkOption {
|
||||
type = types.nullOr types.bool;
|
||||
default = null;
|
||||
description = "Whether nodes should roll back when rollout health checks fail";
|
||||
};
|
||||
|
||||
drainBeforeApply = mkOption {
|
||||
type = types.nullOr types.bool;
|
||||
default = null;
|
||||
description = "Whether the controller should drain a node before applying the rollout";
|
||||
};
|
||||
|
||||
rebootPolicy = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Operator-facing reboot policy associated with the rollout";
|
||||
};
|
||||
|
||||
paused = mkOption {
|
||||
type = types.nullOr types.bool;
|
||||
default = null;
|
||||
description = "Whether the rollout should start in a paused state";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@ -159,6 +278,30 @@ let
|
|||
default = null;
|
||||
description = "Desired deployer node lifecycle state";
|
||||
};
|
||||
|
||||
commissionState = mkOption {
|
||||
type = types.nullOr (types.enum [ "discovered" "commissioning" "commissioned" ]);
|
||||
default = null;
|
||||
description = "Optional commissioning state exported into deployer cluster state";
|
||||
};
|
||||
|
||||
installState = mkOption {
|
||||
type = types.nullOr (types.enum [ "pending" "installing" "installed" "failed" "reinstall_requested" ]);
|
||||
default = null;
|
||||
description = "Optional install lifecycle state exported into deployer cluster state";
|
||||
};
|
||||
|
||||
powerState = mkOption {
|
||||
type = types.nullOr (types.enum [ "on" "off" "cycling" "unknown" ]);
|
||||
default = null;
|
||||
description = "Optional external power-management state associated with the node";
|
||||
};
|
||||
|
||||
bmcRef = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional BMC / Redfish reference associated with the node";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@ -339,7 +482,10 @@ let
|
|||
mkDesiredSystem = nodeName: desiredSystem:
|
||||
let
|
||||
rendered =
|
||||
optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
|
||||
optionalAttrs (desiredSystem != null && desiredSystem.deploymentId != null) {
|
||||
deployment_id = desiredSystem.deploymentId;
|
||||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
|
||||
nixos_configuration = desiredSystem.nixosConfiguration;
|
||||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) {
|
||||
|
|
@ -353,12 +499,60 @@ let
|
|||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.rollbackOnFailure != null) {
|
||||
rollback_on_failure = desiredSystem.rollbackOnFailure;
|
||||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.drainBeforeApply != null) {
|
||||
drain_before_apply = desiredSystem.drainBeforeApply;
|
||||
};
|
||||
in
|
||||
if desiredSystem == null || rendered == { } then null else {
|
||||
node_id = nodeName;
|
||||
} // rendered;
|
||||
|
||||
mkHostDeploymentSelector = selector:
|
||||
{
|
||||
node_ids = selector.nodeIds or [ ];
|
||||
roles = selector.roles or [ ];
|
||||
pools = selector.pools or [ ];
|
||||
node_classes = selector.nodeClasses or [ ];
|
||||
match_labels = selector.matchLabels or { };
|
||||
};
|
||||
|
||||
mkDeployerHostDeploymentSpec = name: deployment:
|
||||
{
|
||||
inherit name;
|
||||
selector = mkHostDeploymentSelector deployment.selector;
|
||||
}
|
||||
// optionalAttrs (deployment.nixosConfiguration != null) {
|
||||
nixos_configuration = deployment.nixosConfiguration;
|
||||
}
|
||||
// optionalAttrs (deployment.flakeRef != null) {
|
||||
flake_ref = deployment.flakeRef;
|
||||
}
|
||||
// optionalAttrs (deployment.batchSize != null) {
|
||||
batch_size = deployment.batchSize;
|
||||
}
|
||||
// optionalAttrs (deployment.maxUnavailable != null) {
|
||||
max_unavailable = deployment.maxUnavailable;
|
||||
}
|
||||
// optionalAttrs (deployment.healthCheckCommand != [ ]) {
|
||||
health_check_command = deployment.healthCheckCommand;
|
||||
}
|
||||
// optionalAttrs (deployment.switchAction != null) {
|
||||
switch_action = deployment.switchAction;
|
||||
}
|
||||
// optionalAttrs (deployment.rollbackOnFailure != null) {
|
||||
rollback_on_failure = deployment.rollbackOnFailure;
|
||||
}
|
||||
// optionalAttrs (deployment.drainBeforeApply != null) {
|
||||
drain_before_apply = deployment.drainBeforeApply;
|
||||
}
|
||||
// optionalAttrs (deployment.rebootPolicy != null) {
|
||||
reboot_policy = deployment.rebootPolicy;
|
||||
}
|
||||
// optionalAttrs (deployment.paused != null) {
|
||||
paused = deployment.paused;
|
||||
};
|
||||
|
||||
mkDeployerNodeSpec = nodeName: node:
|
||||
{
|
||||
node_id = nodeName;
|
||||
|
|
@ -390,6 +584,18 @@ let
|
|||
}
|
||||
// optionalAttrs (node.state != null) {
|
||||
state = node.state;
|
||||
}
|
||||
// optionalAttrs (node.commissionState != null) {
|
||||
commission_state = node.commissionState;
|
||||
}
|
||||
// optionalAttrs (node.installState != null) {
|
||||
install_state = node.installState;
|
||||
}
|
||||
// optionalAttrs (node.powerState != null) {
|
||||
power_state = node.powerState;
|
||||
}
|
||||
// optionalAttrs (node.bmcRef != null) {
|
||||
bmc_ref = node.bmcRef;
|
||||
};
|
||||
|
||||
mkDeployerNodeClassSpec = name: nodeClass:
|
||||
|
|
@ -522,6 +728,7 @@ let
|
|||
nodeClasses = deployer.nodeClasses or { };
|
||||
pools = deployer.pools or { };
|
||||
enrollmentRules = deployer.enrollmentRules or { };
|
||||
hostDeployments = deployer.hostDeployments or { };
|
||||
in {
|
||||
cluster = {
|
||||
cluster_id = clusterId;
|
||||
|
|
@ -532,6 +739,7 @@ let
|
|||
node_classes = map (name: mkDeployerNodeClassSpec name nodeClasses.${name}) (attrNames nodeClasses);
|
||||
pools = map (name: mkDeployerPoolSpec name pools.${name}) (attrNames pools);
|
||||
enrollment_rules = map (name: mkDeployerEnrollmentRuleSpec name enrollmentRules.${name}) (attrNames enrollmentRules);
|
||||
host_deployments = map (name: mkDeployerHostDeploymentSpec name hostDeployments.${name}) (attrNames hostDeployments);
|
||||
services = [ ];
|
||||
instances = [ ];
|
||||
mtls_policies = [ ];
|
||||
|
|
@ -541,6 +749,8 @@ in
|
|||
inherit
|
||||
mkInstallPlanType
|
||||
mkDesiredSystemType
|
||||
mkHostDeploymentSelectorType
|
||||
mkHostDeploymentType
|
||||
mkNodeType
|
||||
mkNodeClassType
|
||||
mkNodePoolType
|
||||
|
|
|
|||
|
|
@ -2,8 +2,61 @@
|
|||
|
||||
let
|
||||
cfg = config.services.coronafs;
|
||||
chainfireEnabled = lib.hasAttrByPath [ "services" "chainfire" "enable" ] config && config.services.chainfire.enable;
|
||||
chainfireApiUrls =
|
||||
if cfg.chainfireApiUrl != null then
|
||||
lib.filter (item: item != "") (map lib.strings.trim (lib.splitString "," cfg.chainfireApiUrl))
|
||||
else
|
||||
[ ];
|
||||
effectiveChainfireApiUrl =
|
||||
if cfg.chainfireApiUrl != null then cfg.chainfireApiUrl
|
||||
else if chainfireEnabled then "http://127.0.0.1:${toString config.services.chainfire.httpPort}"
|
||||
else null;
|
||||
localChainfireApiUrl =
|
||||
lib.any
|
||||
(url:
|
||||
lib.hasPrefix "http://127.0.0.1:" url
|
||||
|| lib.hasPrefix "http://localhost:" url
|
||||
)
|
||||
(
|
||||
if effectiveChainfireApiUrl == null then
|
||||
[ ]
|
||||
else if cfg.chainfireApiUrl != null then
|
||||
chainfireApiUrls
|
||||
else
|
||||
[ effectiveChainfireApiUrl ]
|
||||
);
|
||||
waitForChainfire =
|
||||
pkgs.writeShellScript "coronafs-wait-for-chainfire" ''
|
||||
set -eu
|
||||
deadline=$((SECONDS + 60))
|
||||
urls='${lib.concatStringsSep " " (
|
||||
if effectiveChainfireApiUrl == null then
|
||||
[ ]
|
||||
else if cfg.chainfireApiUrl != null then
|
||||
chainfireApiUrls
|
||||
else
|
||||
[ effectiveChainfireApiUrl ]
|
||||
)}'
|
||||
while true; do
|
||||
for url in $urls; do
|
||||
if curl -fsS "$url/health" >/dev/null 2>&1; then
|
||||
exit 0
|
||||
fi
|
||||
done
|
||||
if [ "$SECONDS" -ge "$deadline" ]; then
|
||||
echo "timed out waiting for ChainFire at ${if effectiveChainfireApiUrl == null then "(none)" else effectiveChainfireApiUrl}" >&2
|
||||
exit 1
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
'';
|
||||
tomlFormat = pkgs.formats.toml { };
|
||||
coronafsConfigFile = tomlFormat.generate "coronafs.toml" {
|
||||
coronafsConfigFile = tomlFormat.generate "coronafs.toml" (
|
||||
{
|
||||
mode = cfg.mode;
|
||||
metadata_backend = cfg.metadataBackend;
|
||||
chainfire_key_prefix = cfg.chainfireKeyPrefix;
|
||||
listen_addr = "0.0.0.0:${toString cfg.port}";
|
||||
advertise_host = cfg.advertiseHost;
|
||||
data_dir = toString cfg.dataDir;
|
||||
|
|
@ -20,12 +73,41 @@ let
|
|||
qemu_nbd_path = "${pkgs.qemu}/bin/qemu-nbd";
|
||||
qemu_img_path = "${pkgs.qemu}/bin/qemu-img";
|
||||
log_level = "info";
|
||||
};
|
||||
}
|
||||
// lib.optionalAttrs (effectiveChainfireApiUrl != null) {
|
||||
chainfire_api_url = effectiveChainfireApiUrl;
|
||||
}
|
||||
);
|
||||
in
|
||||
{
|
||||
options.services.coronafs = {
|
||||
enable = lib.mkEnableOption "CoronaFS block volume service";
|
||||
|
||||
mode = lib.mkOption {
|
||||
type = lib.types.enum [ "combined" "controller" "node" ];
|
||||
default = "combined";
|
||||
description = "CoronaFS operating mode: combined compatibility mode, controller-only API, or node-local export mode.";
|
||||
};
|
||||
|
||||
metadataBackend = lib.mkOption {
|
||||
type = lib.types.enum [ "filesystem" "chainfire" ];
|
||||
default = "filesystem";
|
||||
description = "Metadata backend for CoronaFS volume metadata. Use chainfire on controller nodes to replicate volume metadata.";
|
||||
};
|
||||
|
||||
chainfireApiUrl = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.str;
|
||||
default = null;
|
||||
description = "Optional ChainFire HTTP API URL used when metadataBackend = chainfire. Comma-separated endpoints are allowed for failover.";
|
||||
example = "http://127.0.0.1:8081";
|
||||
};
|
||||
|
||||
chainfireKeyPrefix = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "/coronafs/volumes";
|
||||
description = "ChainFire key prefix used to store CoronaFS metadata when metadataBackend = chainfire.";
|
||||
};
|
||||
|
||||
port = lib.mkOption {
|
||||
type = lib.types.port;
|
||||
default = 50088;
|
||||
|
|
@ -71,7 +153,7 @@ in
|
|||
|
||||
exportAioMode = lib.mkOption {
|
||||
type = lib.types.enum [ "native" "io_uring" "threads" ];
|
||||
default = "io_uring";
|
||||
default = "threads";
|
||||
description = "qemu-nbd AIO mode for CoronaFS exports.";
|
||||
};
|
||||
|
||||
|
|
@ -113,11 +195,22 @@ in
|
|||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
assertions = [
|
||||
{
|
||||
assertion = cfg.metadataBackend != "chainfire" || effectiveChainfireApiUrl != null;
|
||||
message = "services.coronafs.metadataBackend = \"chainfire\" requires services.coronafs.chainfireApiUrl or a local services.chainfire instance.";
|
||||
}
|
||||
];
|
||||
|
||||
users.users.coronafs = {
|
||||
isSystemUser = true;
|
||||
group = "coronafs";
|
||||
description = "CoronaFS service user";
|
||||
home = cfg.dataDir;
|
||||
extraGroups =
|
||||
lib.optional
|
||||
(lib.hasAttrByPath [ "services" "plasmavmc" "enable" ] config && config.services.plasmavmc.enable)
|
||||
"plasmavmc";
|
||||
};
|
||||
|
||||
users.groups.coronafs = { };
|
||||
|
|
@ -125,8 +218,9 @@ in
|
|||
systemd.services.coronafs = {
|
||||
description = "CoronaFS Block Volume Service";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network.target" ];
|
||||
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils ];
|
||||
after = [ "network.target" ] ++ lib.optionals chainfireEnabled [ "chainfire.service" ];
|
||||
wants = lib.optionals chainfireEnabled [ "chainfire.service" ];
|
||||
path = [ pkgs.qemu pkgs.util-linux pkgs.procps pkgs.coreutils pkgs.curl ];
|
||||
|
||||
serviceConfig = {
|
||||
Type = "simple";
|
||||
|
|
@ -138,13 +232,14 @@ in
|
|||
StateDirectory = "coronafs";
|
||||
StateDirectoryMode = "0750";
|
||||
ReadWritePaths = [ cfg.dataDir ];
|
||||
ExecStartPre = lib.optionals (cfg.metadataBackend == "chainfire" && localChainfireApiUrl) [ waitForChainfire ];
|
||||
ExecStart = "${cfg.package}/bin/coronafs-server --config ${coronafsConfigFile}";
|
||||
};
|
||||
};
|
||||
|
||||
systemd.tmpfiles.rules = [
|
||||
"d ${toString cfg.dataDir} 0750 coronafs coronafs -"
|
||||
"d ${toString cfg.dataDir}/volumes 0750 coronafs coronafs -"
|
||||
"d ${toString cfg.dataDir}/volumes 2770 coronafs coronafs -"
|
||||
"d ${toString cfg.dataDir}/metadata 0750 coronafs coronafs -"
|
||||
"d ${toString cfg.dataDir}/pids 0750 coronafs coronafs -"
|
||||
];
|
||||
|
|
|
|||
|
|
@ -3,6 +3,23 @@
|
|||
let
|
||||
cfg = config.services.deployer;
|
||||
tomlFormat = pkgs.formats.toml { };
|
||||
usesLocalChainfire =
|
||||
builtins.any
|
||||
(
|
||||
endpoint:
|
||||
lib.hasPrefix "http://127.0.0.1:" endpoint
|
||||
|| lib.hasPrefix "http://localhost:" endpoint
|
||||
|| lib.hasPrefix "http://[::1]:" endpoint
|
||||
)
|
||||
cfg.chainfireEndpoints;
|
||||
localChainfireDeps =
|
||||
lib.optionals
|
||||
(
|
||||
usesLocalChainfire
|
||||
&& lib.hasAttrByPath [ "services" "chainfire" "enable" ] config
|
||||
&& config.services.chainfire.enable
|
||||
)
|
||||
[ "chainfire.service" ];
|
||||
generatedConfig = {
|
||||
bind_addr = cfg.bindAddr;
|
||||
chainfire = {
|
||||
|
|
@ -226,7 +243,9 @@ in
|
|||
systemd.services.deployer = {
|
||||
description = "PlasmaCloud Deployer Server";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network.target" ];
|
||||
wants = [ "network-online.target" ] ++ localChainfireDeps;
|
||||
after = [ "network-online.target" ] ++ localChainfireDeps;
|
||||
requires = localChainfireDeps;
|
||||
|
||||
environment = {}
|
||||
// lib.optionalAttrs (cfg.bootstrapToken != null) {
|
||||
|
|
|
|||
|
|
@ -285,7 +285,7 @@ in
|
|||
healthUrl = "http://localhost:8082/health"; # Health endpoint on admin port
|
||||
leaderUrlKey = "flaredb_leader_url";
|
||||
defaultLeaderUrl = "http://localhost:8082";
|
||||
joinPath = null;
|
||||
joinPath = "/admin/member/add";
|
||||
port = cfg.flaredbPort;
|
||||
description = "FlareDB";
|
||||
} // {
|
||||
|
|
|
|||
|
|
@ -297,6 +297,30 @@ in
|
|||
description = "Prometheus metrics port for lightningstor-node.";
|
||||
};
|
||||
|
||||
s3StreamingPutThresholdBytes = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 64 * 1024 * 1024;
|
||||
description = "Streaming PUT multipart threshold for the S3 frontend.";
|
||||
};
|
||||
|
||||
s3InlinePutMaxBytes = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 128 * 1024 * 1024;
|
||||
description = "Maximum inline single-PUT size for the S3 frontend.";
|
||||
};
|
||||
|
||||
s3MultipartPutConcurrency = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 4;
|
||||
description = "Maximum in-flight multipart PUT part uploads.";
|
||||
};
|
||||
|
||||
s3MultipartFetchConcurrency = lib.mkOption {
|
||||
type = lib.types.int;
|
||||
default = 4;
|
||||
description = "Maximum concurrent multipart GET part fetches.";
|
||||
};
|
||||
|
||||
databaseUrl = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.str;
|
||||
default = null;
|
||||
|
|
@ -369,6 +393,14 @@ in
|
|||
|
||||
environment = {
|
||||
RUST_LOG = "info";
|
||||
LIGHTNINGSTOR_S3_STREAMING_PUT_THRESHOLD_BYTES =
|
||||
toString cfg.s3StreamingPutThresholdBytes;
|
||||
LIGHTNINGSTOR_S3_INLINE_PUT_MAX_BYTES =
|
||||
toString cfg.s3InlinePutMaxBytes;
|
||||
LIGHTNINGSTOR_S3_MULTIPART_PUT_CONCURRENCY =
|
||||
toString cfg.s3MultipartPutConcurrency;
|
||||
LIGHTNINGSTOR_S3_MULTIPART_FETCH_CONCURRENCY =
|
||||
toString cfg.s3MultipartFetchConcurrency;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ let
|
|||
nodeClassType = clusterConfigLib.mkNodeClassType types;
|
||||
nodePoolType = clusterConfigLib.mkNodePoolType types;
|
||||
enrollmentRuleType = clusterConfigLib.mkEnrollmentRuleType types;
|
||||
hostDeploymentType = clusterConfigLib.mkHostDeploymentType types;
|
||||
jsonFormat = pkgs.formats.json { };
|
||||
|
||||
# Generate cluster-config.json for the current node
|
||||
|
|
@ -98,6 +99,12 @@ in {
|
|||
default = { };
|
||||
description = "Deployer auto-enrollment rules derived from Nix";
|
||||
};
|
||||
|
||||
hostDeployments = mkOption {
|
||||
type = types.attrsOf hostDeploymentType;
|
||||
default = { };
|
||||
description = "Declarative host rollout objects derived from Nix";
|
||||
};
|
||||
};
|
||||
|
||||
generated = {
|
||||
|
|
@ -173,6 +180,16 @@ in {
|
|||
) (attrNames cfg.deployer.enrollmentRules);
|
||||
message = "All deployer enrollment rules must reference existing pools and node classes";
|
||||
}
|
||||
{
|
||||
assertion = all (deploymentName:
|
||||
let
|
||||
deployment = cfg.deployer.hostDeployments.${deploymentName};
|
||||
in
|
||||
all (pool: cfg.deployer.pools ? "${pool}") deployment.selector.pools
|
||||
&& all (nodeClass: cfg.deployer.nodeClasses ? "${nodeClass}") deployment.selector.nodeClasses
|
||||
) (attrNames cfg.deployer.hostDeployments);
|
||||
message = "All deployer host deployments must reference existing pools and node classes";
|
||||
}
|
||||
];
|
||||
|
||||
# Generate cluster-config.json for first-boot-automation
|
||||
|
|
|
|||
|
|
@ -2,11 +2,30 @@
|
|||
|
||||
let
|
||||
cfg = config.services.plasmavmc;
|
||||
localIamDeps = lib.optional (config.services.iam.enable or false) "iam.service";
|
||||
localIamHealthUrl =
|
||||
if config.services.iam.enable or false
|
||||
then "http://127.0.0.1:${toString config.services.iam.httpPort}/health"
|
||||
else null;
|
||||
remoteIamEndpoint =
|
||||
if !(config.services.iam.enable or false) && cfg.iamAddr != null
|
||||
then cfg.iamAddr
|
||||
else null;
|
||||
coronafsEnabled = lib.hasAttrByPath [ "services" "coronafs" "enable" ] config && config.services.coronafs.enable;
|
||||
coronafsDataDir =
|
||||
if coronafsEnabled && lib.hasAttrByPath [ "services" "coronafs" "dataDir" ] config
|
||||
then toString config.services.coronafs.dataDir
|
||||
else null;
|
||||
effectiveCoronafsControllerEndpoint =
|
||||
if cfg.coronafsControllerEndpoint != null then cfg.coronafsControllerEndpoint
|
||||
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
|
||||
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
|
||||
else null;
|
||||
effectiveCoronafsNodeEndpoint =
|
||||
if cfg.coronafsNodeEndpoint != null then cfg.coronafsNodeEndpoint
|
||||
else if coronafsEnabled then "http://127.0.0.1:${toString config.services.coronafs.port}"
|
||||
else if cfg.coronafsEndpoint != null then cfg.coronafsEndpoint
|
||||
else null;
|
||||
tomlFormat = pkgs.formats.toml { };
|
||||
plasmavmcConfigFile = tomlFormat.generate "plasmavmc.toml" {
|
||||
addr = "0.0.0.0:${toString cfg.port}";
|
||||
|
|
@ -94,10 +113,41 @@ in
|
|||
coronafsEndpoint = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.str;
|
||||
default = null;
|
||||
description = "CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
|
||||
description = "Deprecated combined CoronaFS HTTP endpoint used to provision and export managed VM volumes.";
|
||||
example = "http://10.0.0.11:50088";
|
||||
};
|
||||
|
||||
coronafsControllerEndpoint = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.str;
|
||||
default = null;
|
||||
description = "CoronaFS controller HTTP endpoint used to provision and resize managed VM volumes. Comma-separated endpoints are allowed for client-side failover.";
|
||||
example = "http://10.0.0.11:50088";
|
||||
};
|
||||
|
||||
coronafsNodeEndpoint = lib.mkOption {
|
||||
type = lib.types.nullOr lib.types.str;
|
||||
default = null;
|
||||
description = "CoronaFS node-local HTTP endpoint used to resolve local paths and exports for attached VM volumes. Comma-separated endpoints are allowed for client-side failover.";
|
||||
example = "http://127.0.0.1:50088";
|
||||
};
|
||||
|
||||
coronafsNodeLocalAttach = lib.mkOption {
|
||||
type = lib.types.bool;
|
||||
default = false;
|
||||
description = ''
|
||||
Enable writable VM attachment through node-local CoronaFS materialization.
|
||||
This requires services.plasmavmc.sharedLiveMigration = false because migrations use cold relocate plus flush-back.
|
||||
'';
|
||||
};
|
||||
|
||||
experimentalCoronafsNodeLocalAttach = lib.mkOption {
|
||||
type = lib.types.bool;
|
||||
default = false;
|
||||
description = ''
|
||||
Deprecated alias for services.plasmavmc.coronafsNodeLocalAttach.
|
||||
'';
|
||||
};
|
||||
|
||||
managedVolumeRoot = lib.mkOption {
|
||||
type = lib.types.path;
|
||||
default = "/var/lib/plasmavmc/managed-volumes";
|
||||
|
|
@ -173,6 +223,24 @@ in
|
|||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
assertions = [
|
||||
{
|
||||
assertion = !((cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) && cfg.sharedLiveMigration);
|
||||
message = ''
|
||||
services.plasmavmc.coronafsNodeLocalAttach requires services.plasmavmc.sharedLiveMigration = false
|
||||
because writable node-local CoronaFS attachment uses cold relocate plus flush-back instead of shared-storage live migration.
|
||||
'';
|
||||
}
|
||||
];
|
||||
|
||||
warnings =
|
||||
lib.optional (cfg.coronafsEndpoint != null) ''
|
||||
services.plasmavmc.coronafsEndpoint is deprecated; use services.plasmavmc.coronafsControllerEndpoint and services.plasmavmc.coronafsNodeEndpoint.
|
||||
''
|
||||
++ lib.optional (cfg.experimentalCoronafsNodeLocalAttach) ''
|
||||
services.plasmavmc.experimentalCoronafsNodeLocalAttach is deprecated; use services.plasmavmc.coronafsNodeLocalAttach.
|
||||
'';
|
||||
|
||||
# Create system user
|
||||
users.users.plasmavmc = {
|
||||
isSystemUser = true;
|
||||
|
|
@ -188,9 +256,35 @@ in
|
|||
systemd.services.plasmavmc = {
|
||||
description = "PlasmaVMC Virtual Machine Compute Service";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network.target" "prismnet.service" "flaredb.service" "chainfire.service" ];
|
||||
wants = [ "prismnet.service" "flaredb.service" "chainfire.service" ];
|
||||
path = [ pkgs.qemu pkgs.coreutils ];
|
||||
after = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
|
||||
wants = [ "network-online.target" "prismnet.service" "flaredb.service" "chainfire.service" ] ++ localIamDeps;
|
||||
path = [ pkgs.qemu pkgs.coreutils pkgs.curl ];
|
||||
preStart =
|
||||
lib.optionalString (localIamHealthUrl != null) ''
|
||||
for _ in $(seq 1 90); do
|
||||
if curl -fsS ${lib.escapeShellArg localIamHealthUrl} >/dev/null 2>&1; then
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "plasmavmc: timed out waiting for local IAM health at ${localIamHealthUrl}" >&2
|
||||
exit 1
|
||||
''
|
||||
+ lib.optionalString (remoteIamEndpoint != null) ''
|
||||
endpoint=${lib.escapeShellArg remoteIamEndpoint}
|
||||
endpoint="''${endpoint#http://}"
|
||||
endpoint="''${endpoint#https://}"
|
||||
host="''${endpoint%:*}"
|
||||
port="''${endpoint##*:}"
|
||||
for _ in $(${pkgs.coreutils}/bin/seq 1 90); do
|
||||
if ${pkgs.coreutils}/bin/timeout 1 ${pkgs.bash}/bin/bash -lc "</dev/tcp/''${host}/''${port}" >/dev/null 2>&1; then
|
||||
exit 0
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
echo "plasmavmc: timed out waiting for IAM gRPC at ''${host}:''${port}" >&2
|
||||
exit 1
|
||||
'';
|
||||
|
||||
environment = lib.mkMerge [
|
||||
{
|
||||
|
|
@ -213,6 +307,16 @@ in
|
|||
(lib.mkIf (cfg.lightningstorAddr != null) {
|
||||
PLASMAVMC_LIGHTNINGSTOR_ENDPOINT = cfg.lightningstorAddr;
|
||||
})
|
||||
(lib.mkIf (effectiveCoronafsControllerEndpoint != null) {
|
||||
PLASMAVMC_CORONAFS_CONTROLLER_ENDPOINT = effectiveCoronafsControllerEndpoint;
|
||||
})
|
||||
(lib.mkIf (effectiveCoronafsNodeEndpoint != null) {
|
||||
PLASMAVMC_CORONAFS_NODE_ENDPOINT = effectiveCoronafsNodeEndpoint;
|
||||
})
|
||||
(lib.mkIf (cfg.coronafsNodeLocalAttach || cfg.experimentalCoronafsNodeLocalAttach) {
|
||||
PLASMAVMC_CORONAFS_NODE_LOCAL_ATTACH = "1";
|
||||
PLASMAVMC_CORONAFS_ENABLE_EXPERIMENTAL_NODE_LOCAL_ATTACH = "1";
|
||||
})
|
||||
(lib.mkIf (cfg.coronafsEndpoint != null) {
|
||||
PLASMAVMC_CORONAFS_ENDPOINT = cfg.coronafsEndpoint;
|
||||
})
|
||||
|
|
@ -273,6 +377,8 @@ in
|
|||
systemd.tmpfiles.rules = [
|
||||
"d ${builtins.dirOf (toString cfg.managedVolumeRoot)} 0755 plasmavmc plasmavmc -"
|
||||
"d ${toString cfg.managedVolumeRoot} 0750 plasmavmc plasmavmc -"
|
||||
] ++ lib.optionals coronafsEnabled [
|
||||
"d ${toString cfg.dataDir}/images 2770 plasmavmc coronafs -"
|
||||
];
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -108,6 +108,19 @@
|
|||
};
|
||||
};
|
||||
};
|
||||
|
||||
hostDeployments = {
|
||||
control-plane-canary = {
|
||||
selector.nodeIds = [ "node01" ];
|
||||
nixosConfiguration = "node01";
|
||||
flakeRef = "github:centra/cloud";
|
||||
batchSize = 1;
|
||||
maxUnavailable = 1;
|
||||
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
|
||||
switchAction = "switch";
|
||||
rollbackOnFailure = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
bootstrap.initialPeers = [ "node01" "node02" "node03" ];
|
||||
|
|
|
|||
|
|
@ -32,8 +32,8 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "192.168.100.11:2379";
|
||||
flaredbAddr = "192.168.100.11:2479";
|
||||
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
|
||||
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
|
||||
};
|
||||
|
||||
services.openssh.enable = true;
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "192.168.100.11:2379";
|
||||
flaredbAddr = "192.168.100.11:2479";
|
||||
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
|
||||
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
|
||||
};
|
||||
|
||||
services.openssh.enable = true;
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "192.168.100.11:2379";
|
||||
flaredbAddr = "192.168.100.11:2479";
|
||||
chainfireAddr = "192.168.100.11:2379,192.168.100.12:2379,192.168.100.13:2379";
|
||||
flaredbAddr = "192.168.100.11:2479,192.168.100.12:2479,192.168.100.13:2479";
|
||||
};
|
||||
|
||||
services.openssh.enable = true;
|
||||
|
|
|
|||
|
|
@ -63,10 +63,13 @@ Preferred entrypoint for publishable verification: `nix run ./nix/test-cluster#c
|
|||
|
||||
Preferred entrypoint for publishable matrix verification: `nix run ./nix/test-cluster#cluster -- fresh-matrix`
|
||||
|
||||
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS local-vs-shared-volume I/O, queued random-read behavior, cross-worker direct-I/O shared-volume reads, and LightningStor large/small-object S3 throughput and writes a report to `docs/storage-benchmarks.md`.
|
||||
`nix run ./nix/test-cluster#cluster -- bench-storage` benchmarks CoronaFS controller-export vs node-local-export I/O, worker-side materialization latency, and LightningStor large/small-object S3 throughput, then writes a report to `docs/storage-benchmarks.md`.
|
||||
|
||||
Preferred entrypoint for publishable storage numbers: `nix run ./nix/test-cluster#cluster -- fresh-storage-bench`
|
||||
|
||||
`nix run ./nix/test-cluster#cluster -- bench-coronafs-local-matrix` runs the local single-process CoronaFS export benchmark across the supported `cache`/`aio` combinations so software-path regressions can be separated from VM-lab network limits.
|
||||
On the current lab hosts, `cache=none` with `aio=io_uring` is the strongest local-export profile and should be treated as the reference point when CoronaFS remote numbers are being distorted by the nested-QEMU/VDE network path.
|
||||
|
||||
## Advanced usage
|
||||
|
||||
Use the script entrypoint only for local debugging inside a prepared Nix shell:
|
||||
|
|
|
|||
|
|
@ -27,6 +27,18 @@ in
|
|||
default = "/tmp/photoncloud-test-cluster-vde.sock";
|
||||
description = "VDE control socket path used for the east-west cluster NIC.";
|
||||
};
|
||||
|
||||
chainfireControlPlaneAddrs = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "10.100.0.11:2379,10.100.0.12:2379,10.100.0.13:2379";
|
||||
description = "Comma-separated ChainFire client endpoints for multi-endpoint failover.";
|
||||
};
|
||||
|
||||
flaredbControlPlaneAddrs = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
default = "10.100.0.11:2479,10.100.0.12:2479,10.100.0.13:2479";
|
||||
description = "Comma-separated FlareDB client endpoints for multi-endpoint failover.";
|
||||
};
|
||||
};
|
||||
|
||||
config = {
|
||||
|
|
@ -84,10 +96,43 @@ in
|
|||
|
||||
system.stateVersion = "24.05";
|
||||
|
||||
systemd.services.photon-test-cluster-net-tuning = {
|
||||
description = "Tune cluster NIC offloads for nested-QEMU storage tests";
|
||||
wantedBy = [ "multi-user.target" ];
|
||||
after = [ "network-online.target" ];
|
||||
wants = [ "network-online.target" ];
|
||||
serviceConfig = {
|
||||
Type = "oneshot";
|
||||
RemainAfterExit = true;
|
||||
};
|
||||
path = [ pkgs.ethtool pkgs.iproute2 pkgs.coreutils ];
|
||||
script = ''
|
||||
set -eu
|
||||
iface="eth1"
|
||||
for _ in $(seq 1 30); do
|
||||
if ip link show "$iface" >/dev/null 2>&1; then
|
||||
break
|
||||
fi
|
||||
sleep 1
|
||||
done
|
||||
if ! ip link show "$iface" >/dev/null 2>&1; then
|
||||
echo "photon-test-cluster-net-tuning: $iface not present, skipping" >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Nested QEMU over VDE is sensitive to guest-side offloads; disabling
|
||||
# them reduces retransmits and keeps the storage benchmarks closer to
|
||||
# raw TCP throughput.
|
||||
ethtool -K "$iface" tso off gso off gro off tx off rx off sg off || true
|
||||
ip link set dev "$iface" txqueuelen 10000 || true
|
||||
'';
|
||||
};
|
||||
|
||||
environment.systemPackages = with pkgs; [
|
||||
awscli2
|
||||
curl
|
||||
dnsutils
|
||||
ethtool
|
||||
fio
|
||||
jq
|
||||
grpcurl
|
||||
|
|
|
|||
|
|
@ -115,12 +115,17 @@
|
|||
curl
|
||||
grpcurl
|
||||
jq
|
||||
llvmPackages.clang
|
||||
llvmPackages.libclang
|
||||
openssh
|
||||
protobuf
|
||||
clusterPython
|
||||
qemu
|
||||
sshpass
|
||||
vde2
|
||||
];
|
||||
LIBCLANG_PATH = "${pkgs.llvmPackages.libclang.lib}/lib";
|
||||
PROTOC = "${pkgs.protobuf}/bin/protoc";
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,29 +69,29 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "10.100.0.11:2379";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
services.prismnet = {
|
||||
enable = true;
|
||||
port = 50081;
|
||||
iamAddr = "10.100.0.11:50080";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
services.flashdns = {
|
||||
enable = true;
|
||||
iamAddr = "10.100.0.11:50080";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
services.fiberlb = {
|
||||
enable = true;
|
||||
port = 50085;
|
||||
iamAddr = "10.100.0.11:50080";
|
||||
chainfireAddr = "10.100.0.11:2379";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
services.plasmavmc = {
|
||||
|
|
@ -101,14 +101,17 @@
|
|||
httpPort = 8084;
|
||||
prismnetAddr = "10.100.0.11:50081";
|
||||
iamAddr = "10.100.0.11:50080";
|
||||
chainfireAddr = "10.100.0.11:2379";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
lightningstorAddr = "10.100.0.11:50086";
|
||||
coronafsEndpoint = "http://10.100.0.11:50088";
|
||||
coronafsControllerEndpoint = "http://127.0.0.1:50088";
|
||||
coronafsNodeEndpoint = "http://127.0.0.1:50088";
|
||||
};
|
||||
|
||||
services.coronafs = {
|
||||
enable = true;
|
||||
metadataBackend = "chainfire";
|
||||
chainfireKeyPrefix = "/coronafs/test-cluster/control/volumes";
|
||||
port = 50088;
|
||||
advertiseHost = "10.100.0.11";
|
||||
exportBasePort = 11000;
|
||||
|
|
@ -138,9 +141,9 @@
|
|||
readQuorum = 1;
|
||||
writeQuorum = 2;
|
||||
nodeMetricsPort = 9198;
|
||||
chainfireAddr = "10.100.0.11:2379";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
iamAddr = "10.100.0.11:50080";
|
||||
flaredbAddr = "10.100.0.11:2479";
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
zone = "zone-a";
|
||||
region = "test";
|
||||
};
|
||||
|
|
@ -149,10 +152,10 @@
|
|||
enable = true;
|
||||
port = 50087;
|
||||
iamAddr = "http://10.100.0.11:50080";
|
||||
chainfireAddr = "http://10.100.0.11:2379";
|
||||
chainfireAddr = "http://${config.photonTestCluster.chainfireControlPlaneAddrs}";
|
||||
prismnetAddr = "http://10.100.0.11:50081";
|
||||
flaredbPdAddr = "10.100.0.11:2379";
|
||||
flaredbDirectAddr = "10.100.0.11:2479";
|
||||
flaredbPdAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbDirectAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
fiberlbAddr = "http://10.100.0.11:50085";
|
||||
flashdnsAddr = "http://10.100.0.11:50084";
|
||||
};
|
||||
|
|
|
|||
|
|
@ -41,7 +41,6 @@
|
|||
nodeId = "node02";
|
||||
raftAddr = "10.100.0.12:2480";
|
||||
apiAddr = "10.100.0.12:2479";
|
||||
pdAddr = "10.100.0.11:2379";
|
||||
initialPeers = [
|
||||
"node01=10.100.0.11:2479"
|
||||
"node02=10.100.0.12:2479"
|
||||
|
|
@ -63,8 +62,8 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "10.100.0.12:2379";
|
||||
flaredbAddr = "10.100.0.12:2479";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
systemd.services.iam.environment = {
|
||||
|
|
|
|||
|
|
@ -41,7 +41,6 @@
|
|||
nodeId = "node03";
|
||||
raftAddr = "10.100.0.13:2480";
|
||||
apiAddr = "10.100.0.13:2479";
|
||||
pdAddr = "10.100.0.11:2379";
|
||||
initialPeers = [
|
||||
"node01=10.100.0.11:2479"
|
||||
"node02=10.100.0.12:2479"
|
||||
|
|
@ -63,8 +62,8 @@
|
|||
services.iam = {
|
||||
enable = true;
|
||||
port = 50080;
|
||||
chainfireAddr = "10.100.0.13:2379";
|
||||
flaredbAddr = "10.100.0.13:2479";
|
||||
chainfireAddr = config.photonTestCluster.chainfireControlPlaneAddrs;
|
||||
flaredbAddr = config.photonTestCluster.flaredbControlPlaneAddrs;
|
||||
};
|
||||
|
||||
systemd.services.iam.environment = {
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue