From a7d5cfa738c78be5e16fa835f975e9ca0e534fa4 Mon Sep 17 00:00:00 2001 From: centra Date: Sat, 21 Mar 2026 16:43:00 +0900 Subject: [PATCH] Wire bootstrap installers to explicit targets and reboot-aware rollout --- baremetal/image-builder/build-images.sh | 18 +- chainfire/baremetal/pxe-server/ipxe/boot.ipxe | 2 + .../baremetal/pxe-server/nixos-module.nix | 32 ++- deployer/crates/deployer-ctl/src/chainfire.rs | 2 + .../crates/deployer-server/src/cloud_init.rs | 2 + .../crates/deployer-server/src/phone_home.rs | 45 +++- deployer/crates/deployer-types/src/lib.rs | 150 ++++++++++++++ deployer/crates/fleet-scheduler/src/main.rs | 1 + deployer/crates/nix-agent/src/main.rs | 142 ++++++++++++- .../scripts/verify-deployer-bootstrap-e2e.sh | 48 +++++ nix/iso/plasmacloud-iso.nix | 196 ++++++++++++++++-- nix/modules/cluster-config-lib.nix | 18 ++ nix/modules/default.nix | 1 + nix/nodes/vm-cluster/cluster.nix | 3 + nix/nodes/vm-cluster/node01/disko.nix | 4 +- nix/nodes/vm-cluster/node02/disko.nix | 4 +- nix/nodes/vm-cluster/node03/disko.nix | 4 +- 17 files changed, 640 insertions(+), 32 deletions(-) diff --git a/baremetal/image-builder/build-images.sh b/baremetal/image-builder/build-images.sh index 9fb77de..8359e85 100755 --- a/baremetal/image-builder/build-images.sh +++ b/baremetal/image-builder/build-images.sh @@ -106,6 +106,11 @@ OUTPUT: - initrd Initial ramdisk - netboot.ipxe iPXE boot script +ENVIRONMENT: + PLASMACLOUD_DEPLOYER_URL Optional deployer endpoint embedded into generated netboot.ipxe + PLASMACLOUD_BOOTSTRAP_TOKEN Optional bootstrap token embedded into generated netboot.ipxe + PLASMACLOUD_CA_CERT_URL Optional CA certificate URL embedded into generated netboot.ipxe + EOF } @@ -151,6 +156,17 @@ build_profile() { print_warning "Failed to resolve init path for $profile; using /init" fi + local deployer_kernel_args="" + if [ -n "${PLASMACLOUD_DEPLOYER_URL:-}" ]; then + deployer_kernel_args+=" plasmacloud.deployer_url=${PLASMACLOUD_DEPLOYER_URL}" + fi + if [ -n "${PLASMACLOUD_BOOTSTRAP_TOKEN:-}" ]; then + deployer_kernel_args+=" plasmacloud.bootstrap_token=${PLASMACLOUD_BOOTSTRAP_TOKEN}" + fi + if [ -n "${PLASMACLOUD_CA_CERT_URL:-}" ]; then + deployer_kernel_args+=" plasmacloud.ca_cert_url=${PLASMACLOUD_CA_CERT_URL}" + fi + # Generate iPXE boot script print_info " Generating iPXE boot script..." cat > "$profile_dir/netboot.ipxe" << EOF @@ -169,7 +185,7 @@ echo Initrd: initrd echo # Load kernel and initrd -kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4 +kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4${deployer_kernel_args} initrd \${boot-server}/$profile/initrd # Boot diff --git a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe index 5b48b1a..28aa1ad 100644 --- a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe +++ b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe @@ -27,6 +27,7 @@ set boot-server 10.0.100.10 set boot-url http://${boot-server}/boot set nixos-url ${boot-url}/nixos set provisioning-server http://${boot-server} +set deployer-url http://${boot-server}:8080 # Detect network configuration echo Network Configuration: @@ -181,6 +182,7 @@ set kernel-params ${kernel-params} centra.profile=${profile} set kernel-params ${kernel-params} centra.hostname=${hostname} set kernel-params ${kernel-params} centra.mac=${mac} set kernel-params ${kernel-params} centra.provisioning-server=${provisioning-server} +set kernel-params ${kernel-params} plasmacloud.deployer_url=${deployer-url} set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8 # For debugging, enable these: diff --git a/chainfire/baremetal/pxe-server/nixos-module.nix b/chainfire/baremetal/pxe-server/nixos-module.nix index 136ec65..f45f210 100644 --- a/chainfire/baremetal/pxe-server/nixos-module.nix +++ b/chainfire/baremetal/pxe-server/nixos-module.nix @@ -60,7 +60,7 @@ let next-server ${cfg.serverAddress}; if exists user-class and option user-class = "iPXE" { - filename "http://${cfg.serverAddress}/boot/ipxe/boot.ipxe"; + filename "http://${cfg.serverAddress}:${toString cfg.http.port}/boot/ipxe/boot.ipxe"; } elsif option architecture-type = 00:00 { filename "undionly.kpxe"; } elsif option architecture-type = 00:06 { @@ -82,9 +82,10 @@ let #!ipxe set boot-server ${cfg.serverAddress} - set boot-url http://''${boot-server}/boot + set boot-url http://''${boot-server}:${toString cfg.http.port}/boot set nixos-url ''${boot-url}/nixos - set provisioning-server http://''${boot-server} + set provisioning-server http://''${boot-server}:${toString cfg.http.port} + set deployer-url ${if cfg.bootstrap.deployerUrl != null then cfg.bootstrap.deployerUrl else "http://${cfg.serverAddress}:8080"} echo Network Configuration: echo IP Address: ''${ip} @@ -145,6 +146,9 @@ let set kernel-params ''${kernel-params} centra.hostname=''${hostname} set kernel-params ''${kernel-params} centra.mac=''${mac} set kernel-params ''${kernel-params} centra.provisioning-server=''${provisioning-server} + set kernel-params ''${kernel-params} plasmacloud.deployer_url=''${deployer-url} + ${optionalString (cfg.bootstrap.bootstrapToken != null) "set kernel-params ''${kernel-params} plasmacloud.bootstrap_token=${cfg.bootstrap.bootstrapToken}"} + ${optionalString (cfg.bootstrap.caCertUrl != null) "set kernel-params ''${kernel-params} plasmacloud.ca_cert_url=${cfg.bootstrap.caCertUrl}"} set kernel-params ''${kernel-params} console=tty0 console=ttyS0,115200n8 kernel ''${nixos-url}/bzImage ''${kernel-params} || goto failed @@ -338,6 +342,28 @@ in { }; }; + bootstrap = { + deployerUrl = mkOption { + type = types.nullOr types.str; + default = null; + description = "Deployer endpoint passed to the bootstrap ISO/netboot environment"; + example = "https://deployer.example.com:8443"; + }; + + bootstrapToken = mkOption { + type = types.nullOr types.str; + default = null; + description = "Optional shared bootstrap token embedded in iPXE kernel arguments"; + }; + + caCertUrl = mkOption { + type = types.nullOr types.str; + default = null; + description = "Optional CA certificate URL fetched by the bootstrap environment before phone-home"; + example = "https://deployer.example.com/bootstrap-ca.crt"; + }; + }; + nodes = mkOption { type = types.attrsOf (types.submodule { options = { diff --git a/deployer/crates/deployer-ctl/src/chainfire.rs b/deployer/crates/deployer-ctl/src/chainfire.rs index 042f004..e09d5c7 100644 --- a/deployer/crates/deployer-ctl/src/chainfire.rs +++ b/deployer/crates/deployer-ctl/src/chainfire.rs @@ -907,6 +907,8 @@ mod tests { install_plan: Some(InstallPlan { nixos_configuration: Some("worker-golden".to_string()), disko_config_path: Some("profiles/worker-linux/disko.nix".to_string()), + target_disk: Some("/dev/disk/by-id/worker-golden".to_string()), + target_disk_by_id: None, }), roles: vec!["worker".to_string()], labels: HashMap::from([("tier".to_string(), "general".to_string())]), diff --git a/deployer/crates/deployer-server/src/cloud_init.rs b/deployer/crates/deployer-server/src/cloud_init.rs index 6d14721..592e02f 100644 --- a/deployer/crates/deployer-server/src/cloud_init.rs +++ b/deployer/crates/deployer-server/src/cloud_init.rs @@ -133,6 +133,8 @@ mod tests { install_plan: Some(InstallPlan { nixos_configuration: Some("worker-golden".to_string()), disko_config_path: Some("profiles/worker/disko.nix".to_string()), + target_disk: Some("/dev/vda".to_string()), + target_disk_by_id: None, }), } } diff --git a/deployer/crates/deployer-server/src/phone_home.rs b/deployer/crates/deployer-server/src/phone_home.rs index 0fd11e8..83ca165 100644 --- a/deployer/crates/deployer-server/src/phone_home.rs +++ b/deployer/crates/deployer-server/src/phone_home.rs @@ -1,8 +1,8 @@ use axum::{extract::State, http::HeaderMap, http::StatusCode, Json}; use chrono::Utc; use deployer_types::{ - EnrollmentRuleSpec, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState, - PhoneHomeRequest, PhoneHomeResponse, + EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo, + NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse, }; use std::sync::Arc; use tracing::{debug, error, info, warn}; @@ -19,6 +19,36 @@ fn merge_install_plan( InstallPlan::from_layers(preferred, fallback) } +fn merge_hardware_summary_metadata( + metadata: &mut std::collections::HashMap, + hardware_facts: Option<&HardwareFacts>, +) { + let Some(hardware_facts) = hardware_facts else { + return; + }; + + if let Some(cpu_threads) = hardware_facts.cpu_threads { + metadata.insert("hardware.cpu_threads".to_string(), cpu_threads.to_string()); + } + if let Some(cpu_cores) = hardware_facts.cpu_cores { + metadata.insert("hardware.cpu_cores".to_string(), cpu_cores.to_string()); + } + if let Some(memory_bytes) = hardware_facts.memory_bytes { + metadata.insert("hardware.memory_bytes".to_string(), memory_bytes.to_string()); + } + metadata.insert( + "hardware.disk_count".to_string(), + hardware_facts.disks.len().to_string(), + ); + metadata.insert( + "hardware.nic_count".to_string(), + hardware_facts.nics.len().to_string(), + ); + if let Some(architecture) = hardware_facts.architecture.as_deref() { + metadata.insert("hardware.architecture".to_string(), architecture.to_string()); + } +} + /// POST /api/v1/phone-home /// /// Handles node registration during first boot. @@ -164,6 +194,7 @@ pub async fn phone_home( let mut metadata = request.metadata.clone(); metadata.insert("role".to_string(), node_config.role.clone()); metadata.insert("services".to_string(), node_config.services.join(",")); + merge_hardware_summary_metadata(&mut metadata, request.hardware_facts.as_ref()); // Create NodeInfo for tracking let node_info = NodeInfo { @@ -210,6 +241,7 @@ pub async fn phone_home( &node_info, &node_config, &request.machine_id, + request.hardware_facts.as_ref(), ) .await { @@ -714,6 +746,7 @@ async fn store_cluster_node_if_configured( node_info: &NodeInfo, node_config: &NodeConfig, machine_id: &str, + hardware_facts: Option<&HardwareFacts>, ) -> anyhow::Result<()> { let Some(cluster_id) = state.config.cluster_id.as_deref() else { debug!("cluster_id not configured; skipping cluster node state write"); @@ -759,6 +792,7 @@ async fn store_cluster_node_if_configured( failure_domain: node_config.failure_domain.clone(), nix_profile: node_config.nix_profile.clone(), install_plan: node_config.install_plan.clone(), + hardware_facts: hardware_facts.cloned(), state: Some(format!("{:?}", node_info.state).to_lowercase()), last_heartbeat: Some(node_info.last_heartbeat), }; @@ -837,6 +871,7 @@ mod tests { ip: None, cluster_config_hash: None, metadata: HashMap::new(), + hardware_facts: None, }; let result = phone_home(State(state.clone()), test_headers(), Json(request)).await; @@ -872,6 +907,7 @@ mod tests { ip: Some("10.0.1.100".to_string()), cluster_config_hash: None, metadata: HashMap::new(), + hardware_facts: None, }; let result = phone_home(State(state.clone()), test_headers(), Json(request)).await; @@ -917,6 +953,7 @@ mod tests { ip: None, cluster_config_hash: None, metadata: HashMap::new(), + hardware_facts: None, }; let result = phone_home(State(state.clone()), test_headers(), Json(request)).await; @@ -957,6 +994,7 @@ mod tests { ip: Some("10.0.3.25".to_string()), cluster_config_hash: None, metadata: HashMap::from([("sku".to_string(), "gpu".to_string())]), + hardware_facts: None, }; assert!(enrollment_rule_matches(&rule, &request)); @@ -990,6 +1028,7 @@ mod tests { "topology.kubernetes.io/zone".to_string(), "rack-z".to_string(), )]), + hardware_facts: None, }; let node_classes = vec![NodeClassSpec { name: "gpu-worker".to_string(), @@ -998,6 +1037,8 @@ mod tests { install_plan: Some(InstallPlan { nixos_configuration: Some("gpu-worker".to_string()), disko_config_path: Some("profiles/gpu-worker/disko.nix".to_string()), + target_disk: Some("/dev/disk/by-id/nvme-gpu-worker".to_string()), + target_disk_by_id: None, }), roles: vec!["worker".to_string()], labels: HashMap::from([("tier".to_string(), "gpu".to_string())]), diff --git a/deployer/crates/deployer-types/src/lib.rs b/deployer/crates/deployer-types/src/lib.rs index f05965e..93bd480 100644 --- a/deployer/crates/deployer-types/src/lib.rs +++ b/deployer/crates/deployer-types/src/lib.rs @@ -55,6 +55,12 @@ pub struct InstallPlan { /// Repository-relative Disko file used during installation. #[serde(default, skip_serializing_if = "Option::is_none")] pub disko_config_path: Option, + /// Explicit disk device path used by bootstrap installers. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_disk: Option, + /// Stable `/dev/disk/by-id/...` selector preferred over volatile device names. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub target_disk_by_id: Option, } impl InstallPlan { @@ -66,6 +72,12 @@ impl InstallPlan { if self.disko_config_path.is_some() { merged.disko_config_path = self.disko_config_path.clone(); } + if self.target_disk.is_some() { + merged.target_disk = self.target_disk.clone(); + } + if self.target_disk_by_id.is_some() { + merged.target_disk_by_id = self.target_disk_by_id.clone(); + } merged } @@ -81,6 +93,66 @@ impl InstallPlan { } } +/// Basic inventory record for a physical disk observed during commissioning. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct DiskFact { + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub path: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub by_id: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub size_bytes: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub model: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub serial: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub rotational: Option, +} + +/// Basic inventory record for a network interface observed during commissioning. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct NicFact { + pub name: String, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub mac_address: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub oper_state: Option, +} + +/// DMI strings collected during commissioning. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct DmiFact { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub vendor: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub product_name: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub serial_number: Option, +} + +/// Hardware inventory captured during bootstrap / commissioning. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +pub struct HardwareFacts { + #[serde(default, skip_serializing_if = "Option::is_none")] + pub architecture: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cpu_model: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cpu_threads: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub cpu_cores: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub memory_bytes: Option, + #[serde(default)] + pub disks: Vec, + #[serde(default)] + pub nics: Vec, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub dmi: Option, +} + /// Node configuration returned by Deployer #[derive(Debug, Clone, Serialize, Deserialize)] pub struct NodeConfig { @@ -136,6 +208,9 @@ pub struct PhoneHomeRequest { /// Node metadata #[serde(default)] pub metadata: HashMap, + /// Hardware inventory gathered by the bootstrap environment. + #[serde(default, skip_serializing_if = "Option::is_none")] + pub hardware_facts: Option, } /// Phone Home response payload with secrets @@ -414,6 +489,8 @@ pub struct ClusterNodeRecord { #[serde(default)] pub install_plan: Option, #[serde(default)] + pub hardware_facts: Option, + #[serde(default)] pub state: Option, #[serde(default)] pub last_heartbeat: Option>, @@ -430,10 +507,18 @@ pub struct ObservedSystemState { #[serde(default, skip_serializing_if = "Option::is_none")] pub target_system: Option, #[serde(default, skip_serializing_if = "Option::is_none")] + pub configured_system: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] pub current_system: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub booted_system: Option, #[serde(default, skip_serializing_if = "Option::is_none")] + pub rollback_system: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub switch_action: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] + pub reboot_required: Option, + #[serde(default, skip_serializing_if = "Option::is_none")] pub status: Option, #[serde(default, skip_serializing_if = "Option::is_none")] pub last_attempt: Option>, @@ -760,6 +845,32 @@ mod tests { ip: Some("10.0.1.10".to_string()), cluster_config_hash: Some("abc123".to_string()), metadata, + hardware_facts: Some(HardwareFacts { + architecture: Some("x86_64".to_string()), + cpu_model: Some("Example CPU".to_string()), + cpu_threads: Some(16), + cpu_cores: Some(8), + memory_bytes: Some(64 * 1024 * 1024 * 1024), + disks: vec![DiskFact { + name: "nvme0n1".to_string(), + path: Some("/dev/nvme0n1".to_string()), + by_id: Some("/dev/disk/by-id/nvme-example".to_string()), + size_bytes: Some(1_000_000_000_000), + model: Some("Example Disk".to_string()), + serial: Some("disk-serial".to_string()), + rotational: Some(false), + }], + nics: vec![NicFact { + name: "eno1".to_string(), + mac_address: Some("52:54:00:12:34:56".to_string()), + oper_state: Some("up".to_string()), + }], + dmi: Some(DmiFact { + vendor: Some("ExampleVendor".to_string()), + product_name: Some("ExampleSystem".to_string()), + serial_number: Some("system-serial".to_string()), + }), + }), }; let json = serde_json::to_string(&request).unwrap(); @@ -767,6 +878,14 @@ mod tests { assert_eq!(deserialized.machine_id, "abc123def456"); assert_eq!(deserialized.node_id, Some("node01".to_string())); assert_eq!(deserialized.metadata.get("role").unwrap(), "control-plane"); + assert_eq!( + deserialized + .hardware_facts + .as_ref() + .and_then(|facts| facts.disks.first()) + .and_then(|disk| disk.by_id.as_deref()), + Some("/dev/disk/by-id/nvme-example") + ); } #[test] @@ -785,6 +904,8 @@ mod tests { install_plan: Some(InstallPlan { nixos_configuration: Some("node01".to_string()), disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()), + target_disk: Some("/dev/vda".to_string()), + target_disk_by_id: None, }), }; @@ -811,6 +932,7 @@ mod tests { .and_then(|config| config.install_plan.as_ref()) .expect("install_plan should round-trip"); assert_eq!(install_plan.nixos_configuration.as_deref(), Some("node01")); + assert_eq!(install_plan.target_disk.as_deref(), Some("/dev/vda")); } #[test] @@ -935,8 +1057,12 @@ mod tests { nixos_configuration: Some("node01".to_string()), flake_root: Some("/opt/plasmacloud-src".to_string()), target_system: Some("/nix/store/system-node01".to_string()), + configured_system: Some("/nix/store/system-node01".to_string()), current_system: Some("/nix/store/system-old".to_string()), booted_system: Some("/nix/store/system-old".to_string()), + rollback_system: Some("/nix/store/system-old".to_string()), + switch_action: Some("boot".to_string()), + reboot_required: Some(true), status: Some("pending".to_string()), last_attempt: None, last_success: None, @@ -968,4 +1094,28 @@ mod tests { assert_eq!(decoded.health_check_command.len(), 2); assert_eq!(decoded.rollback_on_failure, Some(true)); } + + #[test] + fn test_install_plan_merges_disk_preferences() { + let fallback = InstallPlan { + nixos_configuration: Some("fallback".to_string()), + disko_config_path: Some("fallback/disko.nix".to_string()), + target_disk: Some("/dev/sda".to_string()), + target_disk_by_id: None, + }; + let preferred = InstallPlan { + nixos_configuration: None, + disko_config_path: None, + target_disk: None, + target_disk_by_id: Some("/dev/disk/by-id/nvme-example".to_string()), + }; + + let merged = preferred.merged_with(Some(&fallback)); + assert_eq!(merged.nixos_configuration.as_deref(), Some("fallback")); + assert_eq!(merged.target_disk.as_deref(), Some("/dev/sda")); + assert_eq!( + merged.target_disk_by_id.as_deref(), + Some("/dev/disk/by-id/nvme-example") + ); + } } diff --git a/deployer/crates/fleet-scheduler/src/main.rs b/deployer/crates/fleet-scheduler/src/main.rs index 167e773..1271392 100644 --- a/deployer/crates/fleet-scheduler/src/main.rs +++ b/deployer/crates/fleet-scheduler/src/main.rs @@ -897,6 +897,7 @@ mod tests { failure_domain: Some(format!("rack-{}", &node_id[node_id.len() - 1..])), nix_profile: Some("profiles/worker-linux".to_string()), install_plan: None, + hardware_facts: None, state: Some("active".to_string()), last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)), } diff --git a/deployer/crates/nix-agent/src/main.rs b/deployer/crates/nix-agent/src/main.rs index 729ffbb..dd0d433 100644 --- a/deployer/crates/nix-agent/src/main.rs +++ b/deployer/crates/nix-agent/src/main.rs @@ -103,6 +103,12 @@ struct ResolvedDesiredSystem { rollback_on_failure: bool, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum HealthCheckOutcome { + Passed, + RolledBack, +} + impl Agent { fn new(cli: Cli) -> Self { Self { @@ -155,9 +161,20 @@ impl Agent { .transpose() .context("failed to parse desired-system spec")?; + let previous_observed = client + .get(key_observed_system( + &self.cluster_namespace, + &self.cluster_id, + &self.node_id, + )) + .await? + .map(|bytes| serde_json::from_slice::(&bytes)) + .transpose() + .context("failed to parse observed-system state")?; + let mut observed = self.base_observed_state(&node); let reconcile_result = self - .reconcile_node(&node, desired.as_ref(), &mut observed) + .reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed) .await; if let Err(error) = reconcile_result { observed.status = Some("failed".to_string()); @@ -177,6 +194,7 @@ impl Agent { fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState { ObservedSystemState { node_id: node.node_id.clone(), + configured_system: read_symlink_target("/nix/var/nix/profiles/system"), current_system: read_symlink_target("/run/current-system"), booted_system: read_symlink_target("/run/booted-system"), ..ObservedSystemState::default() @@ -187,6 +205,7 @@ impl Agent { &self, node: &ClusterNodeRecord, desired: Option<&DesiredSystemSpec>, + previous_observed: Option<&ObservedSystemState>, observed: &mut ObservedSystemState, ) -> Result<()> { match node.state.as_deref() { @@ -211,8 +230,12 @@ impl Agent { observed.nixos_configuration = Some(desired.nixos_configuration.clone()); observed.flake_root = Some(desired.flake_ref.clone()); + observed.switch_action = Some(desired.switch_action.clone()); - let previous_system = observed.current_system.clone(); + let previous_system = previous_observed + .and_then(|state| state.rollback_system.clone()) + .or_else(|| observed.current_system.clone()); + observed.rollback_system = previous_system.clone(); let target_system = self .build_target_system(&desired.flake_ref, &desired.nixos_configuration) .await @@ -225,6 +248,25 @@ impl Agent { observed.target_system = Some(target_system.clone()); if observed.current_system.as_deref() == Some(target_system.as_str()) { + if should_run_post_boot_health_check(previous_observed, &desired, &target_system) { + observed.status = Some("verifying".to_string()); + observed.last_attempt = Some(Utc::now()); + let outcome = self + .run_health_check_and_maybe_rollback( + &desired, + previous_system.as_deref(), + observed, + ) + .await?; + observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system"); + observed.current_system = read_symlink_target("/run/current-system"); + observed.booted_system = read_symlink_target("/run/booted-system"); + if outcome == HealthCheckOutcome::RolledBack { + return Ok(()); + } + } + + observed.reboot_required = Some(false); observed.status = Some("active".to_string()); observed.last_success = Some(Utc::now()); return Ok(()); @@ -240,9 +282,24 @@ impl Agent { self.switch_to_target(&target_system, &desired.switch_action) .await?; + observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system"); observed.current_system = read_symlink_target("/run/current-system"); observed.booted_system = read_symlink_target("/run/booted-system"); + if desired.switch_action == "boot" { + if observed.configured_system.as_deref() != Some(target_system.as_str()) { + return Err(anyhow!( + "boot switch completed but configured system does not match target {}", + target_system + )); + } + + observed.reboot_required = Some(true); + observed.status = Some("staged".to_string()); + observed.last_error = None; + return Ok(()); + } + if observed.current_system.as_deref() != Some(target_system.as_str()) { return Err(anyhow!( "switch completed but /run/current-system does not match target {}", @@ -250,9 +307,17 @@ impl Agent { )); } - self.run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed) + let outcome = self + .run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed) .await?; + observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system"); + observed.current_system = read_symlink_target("/run/current-system"); + observed.booted_system = read_symlink_target("/run/booted-system"); + if outcome == HealthCheckOutcome::RolledBack { + return Ok(()); + } + observed.reboot_required = Some(false); observed.status = Some("active".to_string()); observed.last_success = Some(Utc::now()); observed.last_error = None; @@ -299,26 +364,28 @@ impl Agent { desired: &ResolvedDesiredSystem, previous_system: Option<&str>, observed: &mut ObservedSystemState, - ) -> Result<()> { + ) -> Result { if desired.health_check_command.is_empty() { - return Ok(()); + return Ok(HealthCheckOutcome::Passed); } if let Err(error) = run_vec_command(&desired.health_check_command).await { let error_message = format!("health check failed after activation: {error}"); if desired.rollback_on_failure { self.rollback_to_previous(previous_system).await?; + observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system"); observed.current_system = read_symlink_target("/run/current-system"); observed.booted_system = read_symlink_target("/run/booted-system"); + observed.reboot_required = Some(false); observed.status = Some("rolled-back".to_string()); observed.last_error = Some(error_message); - return Ok(()); + return Ok(HealthCheckOutcome::RolledBack); } return Err(anyhow!(error_message)); } - Ok(()) + Ok(HealthCheckOutcome::Passed) } async fn rollback_to_previous(&self, previous_system: Option<&str>) -> Result<()> { @@ -370,6 +437,20 @@ fn target_flake_attr(flake_root: &str, configuration: &str) -> String { ) } +fn should_run_post_boot_health_check( + previous_observed: Option<&ObservedSystemState>, + desired: &ResolvedDesiredSystem, + target_system: &str, +) -> bool { + desired.switch_action == "boot" + && previous_observed + .map(|state| { + state.status.as_deref() == Some("staged") + && state.target_system.as_deref() == Some(target_system) + }) + .unwrap_or(false) +} + fn read_symlink_target(path: &str) -> Option { fs::read_link(path) .ok() @@ -457,7 +538,10 @@ mod tests { install_plan: Some(InstallPlan { nixos_configuration: Some("node01".to_string()), disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()), + target_disk: Some("/dev/vda".to_string()), + target_disk_by_id: None, }), + hardware_facts: None, state: Some("active".to_string()), last_heartbeat: None, } @@ -549,4 +633,48 @@ mod tests { fn read_symlink_target_returns_none_for_missing_path() { assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None); } + + #[test] + fn post_boot_health_check_is_requested_for_matching_staged_target() { + let desired = ResolvedDesiredSystem { + nixos_configuration: "node01".to_string(), + flake_ref: "/opt/plasmacloud-src".to_string(), + switch_action: "boot".to_string(), + health_check_command: vec!["true".to_string()], + rollback_on_failure: true, + }; + let previous = ObservedSystemState { + status: Some("staged".to_string()), + target_system: Some("/nix/store/example-system".to_string()), + ..ObservedSystemState::default() + }; + + assert!(should_run_post_boot_health_check( + Some(&previous), + &desired, + "/nix/store/example-system" + )); + } + + #[test] + fn post_boot_health_check_is_skipped_for_non_matching_state() { + let desired = ResolvedDesiredSystem { + nixos_configuration: "node01".to_string(), + flake_ref: "/opt/plasmacloud-src".to_string(), + switch_action: "boot".to_string(), + health_check_command: vec!["true".to_string()], + rollback_on_failure: true, + }; + let previous = ObservedSystemState { + status: Some("active".to_string()), + target_system: Some("/nix/store/example-system".to_string()), + ..ObservedSystemState::default() + }; + + assert!(!should_run_post_boot_health_check( + Some(&previous), + &desired, + "/nix/store/example-system" + )); + } } diff --git a/deployer/scripts/verify-deployer-bootstrap-e2e.sh b/deployer/scripts/verify-deployer-bootstrap-e2e.sh index 78dfcb3..d8e2cdd 100755 --- a/deployer/scripts/verify-deployer-bootstrap-e2e.sh +++ b/deployer/scripts/verify-deployer-bootstrap-e2e.sh @@ -173,6 +173,7 @@ node_classes: install_plan: nixos_configuration: worker-golden disko_config_path: profiles/worker-linux/disko.nix + target_disk_by_id: /dev/disk/by-id/worker-default roles: - worker labels: @@ -182,6 +183,7 @@ node_classes: install_plan: nixos_configuration: edge-metal disko_config_path: profiles/edge-metal/disko.nix + target_disk_by_id: /dev/disk/by-id/edge-default roles: - edge labels: @@ -208,6 +210,7 @@ nodes: install_plan: nixos_configuration: node01 disko_config_path: nix/nodes/vm-cluster/node01/disko.nix + target_disk: /dev/vda desired_system: flake_ref: "github:centra/cloud" health_check_command: @@ -273,6 +276,7 @@ assert payload["node_config"]["node_class"] == "general-worker" assert payload["node_config"]["nix_profile"] == "profiles/worker-linux" assert payload["node_config"]["install_plan"]["nixos_configuration"] == "node01" assert payload["node_config"]["install_plan"]["disko_config_path"] == "nix/nodes/vm-cluster/node01/disko.nix" +assert payload["node_config"]["install_plan"]["target_disk"] == "/dev/vda" assert payload["node_config"]["failure_domain"] == "rack-a" print("Seeded mapping validated") PY @@ -339,6 +343,36 @@ request = urllib.request.Request( "sku": "metal", "topology.kubernetes.io/zone": "rack-z", }, + "hardware_facts": { + "architecture": "x86_64", + "cpu_model": "Example CPU", + "cpu_threads": 32, + "cpu_cores": 16, + "memory_bytes": 137438953472, + "disks": [ + { + "name": "nvme0n1", + "path": "/dev/nvme0n1", + "by_id": "/dev/disk/by-id/nvme-dynamic-metal-01", + "size_bytes": 2000398934016, + "model": "Example NVMe", + "serial": "disk-serial-01", + "rotational": False + } + ], + "nics": [ + { + "name": "eno1", + "mac_address": "52:54:00:aa:bb:cc", + "oper_state": "up" + } + ], + "dmi": { + "vendor": "ExampleVendor", + "product_name": "ExampleMetal", + "serial_number": "dynamic-metal-serial" + } + }, } ).encode(), headers={ @@ -357,6 +391,7 @@ assert payload["node_config"]["node_class"] == "edge-metal" assert payload["node_config"]["nix_profile"] == "profiles/edge-metal" assert payload["node_config"]["install_plan"]["nixos_configuration"] == "edge-metal" assert payload["node_config"]["install_plan"]["disko_config_path"] == "profiles/edge-metal/disko.nix" +assert payload["node_config"]["install_plan"]["target_disk_by_id"] == "/dev/disk/by-id/edge-default" assert "prismnet" in payload["node_config"]["services"] assert payload["node_config"]["labels"]["managed-by"] == "deployer" print(payload["node_id"]) @@ -400,6 +435,19 @@ if dynamic.get("failure_domain") != "rack-z": raise SystemExit(f"unexpected dynamic failure domain: {dynamic}") if dynamic.get("labels", {}).get("lane") != "edge": raise SystemExit(f"missing pool label propagation: {dynamic}") +if seeded.get("install_plan", {}).get("target_disk") != "/dev/vda": + raise SystemExit(f"missing seeded target disk: {seeded}") +if dynamic.get("install_plan", {}).get("target_disk_by_id") != "/dev/disk/by-id/edge-default": + raise SystemExit(f"missing dynamic target disk by-id: {dynamic}") +facts = dynamic.get("hardware_facts") or {} +if facts.get("architecture") != "x86_64": + raise SystemExit(f"missing dynamic hardware architecture: {dynamic}") +if facts.get("disks", [{}])[0].get("by_id") != "/dev/disk/by-id/nvme-dynamic-metal-01": + raise SystemExit(f"missing dynamic hardware disk facts: {dynamic}") +if dynamic.get("labels", {}).get("hardware.architecture") != "x86_64": + raise SystemExit(f"missing hardware metadata labels: {dynamic}") +if dynamic.get("labels", {}).get("hardware.disk_count") != "1": + raise SystemExit(f"missing hardware disk count label: {dynamic}") print("Deployer bootstrap records validated") PY diff --git a/nix/iso/plasmacloud-iso.nix b/nix/iso/plasmacloud-iso.nix index 81df962..70eb23b 100644 --- a/nix/iso/plasmacloud-iso.nix +++ b/nix/iso/plasmacloud-iso.nix @@ -44,8 +44,30 @@ script = '' set -euo pipefail - # Discover Deployer via DNS or fallback - DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}" + cmdline_value() { + local key="$1" + local arg + for arg in $(cat /proc/cmdline); do + case "$arg" in + "$key"=*) + echo "''${arg#*=}" + return 0 + ;; + esac + done + return 1 + } + + mkdir -p /etc/plasmacloud + + # Discover Deployer via environment, kernel cmdline, or fallback. + DEPLOYER_URL="''${DEPLOYER_URL:-}" + if [ -z "$DEPLOYER_URL" ]; then + DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)" + fi + if [ -z "$DEPLOYER_URL" ]; then + DEPLOYER_URL="http://192.168.100.1:8080" + fi # Get machine identity MACHINE_ID=$(cat /etc/machine-id) @@ -61,14 +83,27 @@ DEPLOYER_TOKEN=$(cat "$TOKEN_FILE") elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}" + else + DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)" + fi + + DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}" + if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then + DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)" + if [ -n "$DEPLOYER_CA_CERT_URL" ]; then + DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt" + ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \ + "$DEPLOYER_CA_CERT_URL" \ + -o "$DEPLOYER_CA_CERT_PATH" + fi fi CURL_ARGS=(-sf --connect-timeout 5 --max-time 15) if [ -n "$DEPLOYER_TOKEN" ]; then CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN") fi - if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then - CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}") + if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then + CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH") fi NODE_IP=$(${pkgs.iproute2}/bin/ip -4 route get 1.1.1.1 2>/dev/null | ${pkgs.gawk}/bin/awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}') @@ -79,6 +114,76 @@ NODE_IP=$(hostname -I 2>/dev/null | ${pkgs.gawk}/bin/awk '{print $1}') fi NODE_HOSTNAME=$(hostname) + CPU_MODEL=$(${pkgs.gawk}/bin/awk -F: '/model name/ {gsub(/^[ \t]+/, "", $2); print $2; exit}' /proc/cpuinfo 2>/dev/null || true) + CPU_CORES=$(${pkgs.gawk}/bin/awk '/^cpu cores/ {print $4; exit}' /proc/cpuinfo 2>/dev/null || true) + CPU_THREADS=$(${pkgs.coreutils}/bin/nproc --all 2>/dev/null || true) + MEMORY_KIB=$(${pkgs.gawk}/bin/awk '/MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null || true) + MEMORY_BYTES="" + if [ -n "$MEMORY_KIB" ]; then + MEMORY_BYTES=$((MEMORY_KIB * 1024)) + fi + + DISKS_JSON=$(${pkgs.util-linux}/bin/lsblk -J -b -o NAME,PATH,SIZE,MODEL,SERIAL,ROTA,TYPE 2>/dev/null | ${pkgs.jq}/bin/jq ' + [.blockdevices[] | select(.type == "disk") | { + name: .name, + path: (.path // null), + size_bytes: (.size | tonumber?), + model: ((.model // "") | if . == "" then null else . end), + serial: ((.serial // "") | if . == "" then null else . end), + rotational: (if .rota == null then null else (.rota == 1) end) + }] + ') + NICS_JSON=$(${pkgs.iproute2}/bin/ip -j link 2>/dev/null | ${pkgs.jq}/bin/jq ' + [.[] | select(.ifname != "lo") | { + name: .ifname, + mac_address: ((.address // "") | if . == "" or . == "00:00:00:00:00:00" then null else . end), + oper_state: ((.operstate // "") | ascii_downcase | if . == "" then null else . end) + }] + ') + DMI_VENDOR=$(tr -d '\n' /dev/null || true) + DMI_PRODUCT=$(tr -d '\n' /dev/null || true) + DMI_SERIAL=$(tr -d '\n' /dev/null || true) + HARDWARE_FACTS=$(${pkgs.jq}/bin/jq -n \ + --arg architecture "$(${pkgs.coreutils}/bin/uname -m)" \ + --arg cpu_model "$CPU_MODEL" \ + --arg cpu_threads "$CPU_THREADS" \ + --arg cpu_cores "$CPU_CORES" \ + --arg memory_bytes "$MEMORY_BYTES" \ + --arg dmi_vendor "$DMI_VENDOR" \ + --arg dmi_product "$DMI_PRODUCT" \ + --arg dmi_serial "$DMI_SERIAL" \ + --argjson disks "$DISKS_JSON" \ + --argjson nics "$NICS_JSON" ' + { + architecture: (if $architecture == "" then null else $architecture end), + cpu_model: (if $cpu_model == "" then null else $cpu_model end), + cpu_threads: (if $cpu_threads == "" then null else ($cpu_threads | tonumber) end), + cpu_cores: (if $cpu_cores == "" then null else ($cpu_cores | tonumber) end), + memory_bytes: (if $memory_bytes == "" then null else ($memory_bytes | tonumber) end), + disks: $disks, + nics: $nics, + dmi: ({ + vendor: (if $dmi_vendor == "" then null else $dmi_vendor end), + product_name: (if $dmi_product == "" then null else $dmi_product end), + serial_number: (if $dmi_serial == "" then null else $dmi_serial end) + } | with_entries(select(.value != null))) + } + | if (.dmi | length) == 0 then del(.dmi) else . end + ') + REQUEST_JSON=$(${pkgs.jq}/bin/jq -n \ + --arg machine_id "$MACHINE_ID" \ + --arg node_id "$NODE_HOSTNAME" \ + --arg hostname "$NODE_HOSTNAME" \ + --arg ip "$NODE_IP" \ + --argjson hardware_facts "$HARDWARE_FACTS" ' + { + machine_id: $machine_id, + node_id: $node_id, + hostname: $hostname, + ip: $ip, + hardware_facts: $hardware_facts + } + ') # Phone Home request with retry for i in 1 2 3 4 5; do @@ -86,7 +191,7 @@ if RESPONSE=$(${pkgs.curl}/bin/curl "''${CURL_ARGS[@]}" -X POST \ -H "Content-Type: application/json" \ - -d "{\"machine_id\": \"$MACHINE_ID\", \"node_id\": \"$NODE_HOSTNAME\", \"hostname\": \"$NODE_HOSTNAME\", \"ip\": \"$NODE_IP\"}" \ + -d "$REQUEST_JSON" \ "$DEPLOYER_URL/api/v1/phone-home"); then echo "✓ Phone Home successful" @@ -177,6 +282,20 @@ script = '' set -euo pipefail + cmdline_value() { + local key="$1" + local arg + for arg in $(cat /proc/cmdline); do + case "$arg" in + "$key"=*) + echo "''${arg#*=}" + return 0 + ;; + esac + done + return 1 + } + if [ ! -s /etc/plasmacloud/node-config.json ]; then echo "ERROR: node-config.json missing (bootstrap not complete?)" exit 1 @@ -186,7 +305,15 @@ NODE_IP=$(${pkgs.jq}/bin/jq -r '.ip // empty' /etc/plasmacloud/node-config.json) NIXOS_CONFIGURATION=$(${pkgs.jq}/bin/jq -r '.install_plan.nixos_configuration // .hostname // empty' /etc/plasmacloud/node-config.json) DISKO_PATH=$(${pkgs.jq}/bin/jq -r '.install_plan.disko_config_path // empty' /etc/plasmacloud/node-config.json) - DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}" + TARGET_DISK=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk // empty' /etc/plasmacloud/node-config.json) + TARGET_DISK_BY_ID=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk_by_id // empty' /etc/plasmacloud/node-config.json) + DEPLOYER_URL="''${DEPLOYER_URL:-}" + if [ -z "$DEPLOYER_URL" ]; then + DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)" + fi + if [ -z "$DEPLOYER_URL" ]; then + DEPLOYER_URL="http://192.168.100.1:8080" + fi SRC_ROOT="/opt/plasmacloud-src" if [ -z "$NODE_ID" ] || [ -z "$NODE_IP" ]; then @@ -205,14 +332,27 @@ DEPLOYER_TOKEN=$(cat "$TOKEN_FILE") elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}" + else + DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)" + fi + + DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}" + if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then + DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)" + if [ -n "$DEPLOYER_CA_CERT_URL" ]; then + DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt" + ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \ + "$DEPLOYER_CA_CERT_URL" \ + -o "$DEPLOYER_CA_CERT_PATH" + fi fi CURL_ARGS=(-sfL --connect-timeout 5 --max-time 120) if [ -n "$DEPLOYER_TOKEN" ]; then CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN") fi - if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then - CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}") + if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then + CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH") fi BUNDLE_PATH="/run/plasmacloud/flake-bundle.tar.gz" @@ -247,18 +387,32 @@ echo "PlasmaCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, disko_path=$DISKO_PATH)" - # Find disk - DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk"{print $1; exit}') + # Resolve installation target disk. + if [ -n "$TARGET_DISK_BY_ID" ]; then + if [ ! -b "$TARGET_DISK_BY_ID" ]; then + echo "ERROR: target_disk_by_id does not exist: $TARGET_DISK_BY_ID" + exit 1 + fi + DISK="$TARGET_DISK_BY_ID" + elif [ -n "$TARGET_DISK" ]; then + if [ ! -b "$TARGET_DISK" ]; then + echo "ERROR: target_disk does not exist: $TARGET_DISK" + exit 1 + fi + DISK="$TARGET_DISK" + else + DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk"{print $1; exit}') + fi if [ -z "$DISK" ]; then echo "ERROR: No disk found" exit 1 fi - ROOT_PART="''${DISK}2" + ROOT_PART=$(${pkgs.util-linux}/bin/lsblk -lnpo NAME,TYPE "$DISK" 2>/dev/null | ${pkgs.gawk}/bin/awk '$2=="part"{print $1}' | sed -n '2p') mkdir -p /mnt # Skip if already installed - if ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then + if [ -n "$ROOT_PART" ] && ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then mount "$ROOT_PART" /mnt 2>/dev/null || true if [ -e /mnt/etc/NIXOS ]; then echo "✓ Existing NixOS detected; skipping install" @@ -271,9 +425,25 @@ echo "Validating NixOS configuration output..." nix eval --raw "$SRC_ROOT#nixosConfigurations.$NIXOS_CONFIGURATION.config.system.build.toplevel.drvPath" >/dev/null + EFFECTIVE_DISKO_PATH="$SRC_ROOT/$DISKO_PATH" + if [ -n "$DISK" ]; then + cat > /run/plasmacloud/disko-wrapper.nix <