From a7d5cfa738c78be5e16fa835f975e9ca0e534fa4 Mon Sep 17 00:00:00 2001
From: centra
Date: Sat, 21 Mar 2026 16:43:00 +0900
Subject: [PATCH] Wire bootstrap installers to explicit targets and
reboot-aware rollout
---
baremetal/image-builder/build-images.sh | 18 +-
chainfire/baremetal/pxe-server/ipxe/boot.ipxe | 2 +
.../baremetal/pxe-server/nixos-module.nix | 32 ++-
deployer/crates/deployer-ctl/src/chainfire.rs | 2 +
.../crates/deployer-server/src/cloud_init.rs | 2 +
.../crates/deployer-server/src/phone_home.rs | 45 +++-
deployer/crates/deployer-types/src/lib.rs | 150 ++++++++++++++
deployer/crates/fleet-scheduler/src/main.rs | 1 +
deployer/crates/nix-agent/src/main.rs | 142 ++++++++++++-
.../scripts/verify-deployer-bootstrap-e2e.sh | 48 +++++
nix/iso/plasmacloud-iso.nix | 196 ++++++++++++++++--
nix/modules/cluster-config-lib.nix | 18 ++
nix/modules/default.nix | 1 +
nix/nodes/vm-cluster/cluster.nix | 3 +
nix/nodes/vm-cluster/node01/disko.nix | 4 +-
nix/nodes/vm-cluster/node02/disko.nix | 4 +-
nix/nodes/vm-cluster/node03/disko.nix | 4 +-
17 files changed, 640 insertions(+), 32 deletions(-)
diff --git a/baremetal/image-builder/build-images.sh b/baremetal/image-builder/build-images.sh
index 9fb77de..8359e85 100755
--- a/baremetal/image-builder/build-images.sh
+++ b/baremetal/image-builder/build-images.sh
@@ -106,6 +106,11 @@ OUTPUT:
- initrd Initial ramdisk
- netboot.ipxe iPXE boot script
+ENVIRONMENT:
+ PLASMACLOUD_DEPLOYER_URL Optional deployer endpoint embedded into generated netboot.ipxe
+ PLASMACLOUD_BOOTSTRAP_TOKEN Optional bootstrap token embedded into generated netboot.ipxe
+ PLASMACLOUD_CA_CERT_URL Optional CA certificate URL embedded into generated netboot.ipxe
+
EOF
}
@@ -151,6 +156,17 @@ build_profile() {
print_warning "Failed to resolve init path for $profile; using /init"
fi
+ local deployer_kernel_args=""
+ if [ -n "${PLASMACLOUD_DEPLOYER_URL:-}" ]; then
+ deployer_kernel_args+=" plasmacloud.deployer_url=${PLASMACLOUD_DEPLOYER_URL}"
+ fi
+ if [ -n "${PLASMACLOUD_BOOTSTRAP_TOKEN:-}" ]; then
+ deployer_kernel_args+=" plasmacloud.bootstrap_token=${PLASMACLOUD_BOOTSTRAP_TOKEN}"
+ fi
+ if [ -n "${PLASMACLOUD_CA_CERT_URL:-}" ]; then
+ deployer_kernel_args+=" plasmacloud.ca_cert_url=${PLASMACLOUD_CA_CERT_URL}"
+ fi
+
# Generate iPXE boot script
print_info " Generating iPXE boot script..."
cat > "$profile_dir/netboot.ipxe" << EOF
@@ -169,7 +185,7 @@ echo Initrd: initrd
echo
# Load kernel and initrd
-kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4
+kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4${deployer_kernel_args}
initrd \${boot-server}/$profile/initrd
# Boot
diff --git a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
index 5b48b1a..28aa1ad 100644
--- a/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
+++ b/chainfire/baremetal/pxe-server/ipxe/boot.ipxe
@@ -27,6 +27,7 @@ set boot-server 10.0.100.10
set boot-url http://${boot-server}/boot
set nixos-url ${boot-url}/nixos
set provisioning-server http://${boot-server}
+set deployer-url http://${boot-server}:8080
# Detect network configuration
echo Network Configuration:
@@ -181,6 +182,7 @@ set kernel-params ${kernel-params} centra.profile=${profile}
set kernel-params ${kernel-params} centra.hostname=${hostname}
set kernel-params ${kernel-params} centra.mac=${mac}
set kernel-params ${kernel-params} centra.provisioning-server=${provisioning-server}
+set kernel-params ${kernel-params} plasmacloud.deployer_url=${deployer-url}
set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8
# For debugging, enable these:
diff --git a/chainfire/baremetal/pxe-server/nixos-module.nix b/chainfire/baremetal/pxe-server/nixos-module.nix
index 136ec65..f45f210 100644
--- a/chainfire/baremetal/pxe-server/nixos-module.nix
+++ b/chainfire/baremetal/pxe-server/nixos-module.nix
@@ -60,7 +60,7 @@ let
next-server ${cfg.serverAddress};
if exists user-class and option user-class = "iPXE" {
- filename "http://${cfg.serverAddress}/boot/ipxe/boot.ipxe";
+ filename "http://${cfg.serverAddress}:${toString cfg.http.port}/boot/ipxe/boot.ipxe";
} elsif option architecture-type = 00:00 {
filename "undionly.kpxe";
} elsif option architecture-type = 00:06 {
@@ -82,9 +82,10 @@ let
#!ipxe
set boot-server ${cfg.serverAddress}
- set boot-url http://''${boot-server}/boot
+ set boot-url http://''${boot-server}:${toString cfg.http.port}/boot
set nixos-url ''${boot-url}/nixos
- set provisioning-server http://''${boot-server}
+ set provisioning-server http://''${boot-server}:${toString cfg.http.port}
+ set deployer-url ${if cfg.bootstrap.deployerUrl != null then cfg.bootstrap.deployerUrl else "http://${cfg.serverAddress}:8080"}
echo Network Configuration:
echo IP Address: ''${ip}
@@ -145,6 +146,9 @@ let
set kernel-params ''${kernel-params} centra.hostname=''${hostname}
set kernel-params ''${kernel-params} centra.mac=''${mac}
set kernel-params ''${kernel-params} centra.provisioning-server=''${provisioning-server}
+ set kernel-params ''${kernel-params} plasmacloud.deployer_url=''${deployer-url}
+ ${optionalString (cfg.bootstrap.bootstrapToken != null) "set kernel-params ''${kernel-params} plasmacloud.bootstrap_token=${cfg.bootstrap.bootstrapToken}"}
+ ${optionalString (cfg.bootstrap.caCertUrl != null) "set kernel-params ''${kernel-params} plasmacloud.ca_cert_url=${cfg.bootstrap.caCertUrl}"}
set kernel-params ''${kernel-params} console=tty0 console=ttyS0,115200n8
kernel ''${nixos-url}/bzImage ''${kernel-params} || goto failed
@@ -338,6 +342,28 @@ in {
};
};
+ bootstrap = {
+ deployerUrl = mkOption {
+ type = types.nullOr types.str;
+ default = null;
+ description = "Deployer endpoint passed to the bootstrap ISO/netboot environment";
+ example = "https://deployer.example.com:8443";
+ };
+
+ bootstrapToken = mkOption {
+ type = types.nullOr types.str;
+ default = null;
+ description = "Optional shared bootstrap token embedded in iPXE kernel arguments";
+ };
+
+ caCertUrl = mkOption {
+ type = types.nullOr types.str;
+ default = null;
+ description = "Optional CA certificate URL fetched by the bootstrap environment before phone-home";
+ example = "https://deployer.example.com/bootstrap-ca.crt";
+ };
+ };
+
nodes = mkOption {
type = types.attrsOf (types.submodule {
options = {
diff --git a/deployer/crates/deployer-ctl/src/chainfire.rs b/deployer/crates/deployer-ctl/src/chainfire.rs
index 042f004..e09d5c7 100644
--- a/deployer/crates/deployer-ctl/src/chainfire.rs
+++ b/deployer/crates/deployer-ctl/src/chainfire.rs
@@ -907,6 +907,8 @@ mod tests {
install_plan: Some(InstallPlan {
nixos_configuration: Some("worker-golden".to_string()),
disko_config_path: Some("profiles/worker-linux/disko.nix".to_string()),
+ target_disk: Some("/dev/disk/by-id/worker-golden".to_string()),
+ target_disk_by_id: None,
}),
roles: vec!["worker".to_string()],
labels: HashMap::from([("tier".to_string(), "general".to_string())]),
diff --git a/deployer/crates/deployer-server/src/cloud_init.rs b/deployer/crates/deployer-server/src/cloud_init.rs
index 6d14721..592e02f 100644
--- a/deployer/crates/deployer-server/src/cloud_init.rs
+++ b/deployer/crates/deployer-server/src/cloud_init.rs
@@ -133,6 +133,8 @@ mod tests {
install_plan: Some(InstallPlan {
nixos_configuration: Some("worker-golden".to_string()),
disko_config_path: Some("profiles/worker/disko.nix".to_string()),
+ target_disk: Some("/dev/vda".to_string()),
+ target_disk_by_id: None,
}),
}
}
diff --git a/deployer/crates/deployer-server/src/phone_home.rs b/deployer/crates/deployer-server/src/phone_home.rs
index 0fd11e8..83ca165 100644
--- a/deployer/crates/deployer-server/src/phone_home.rs
+++ b/deployer/crates/deployer-server/src/phone_home.rs
@@ -1,8 +1,8 @@
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
use chrono::Utc;
use deployer_types::{
- EnrollmentRuleSpec, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState,
- PhoneHomeRequest, PhoneHomeResponse,
+ EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo,
+ NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse,
};
use std::sync::Arc;
use tracing::{debug, error, info, warn};
@@ -19,6 +19,36 @@ fn merge_install_plan(
InstallPlan::from_layers(preferred, fallback)
}
+fn merge_hardware_summary_metadata(
+ metadata: &mut std::collections::HashMap<String, String>,
+ hardware_facts: Option<&HardwareFacts>,
+) {
+ let Some(hardware_facts) = hardware_facts else {
+ return;
+ };
+
+ if let Some(cpu_threads) = hardware_facts.cpu_threads {
+ metadata.insert("hardware.cpu_threads".to_string(), cpu_threads.to_string());
+ }
+ if let Some(cpu_cores) = hardware_facts.cpu_cores {
+ metadata.insert("hardware.cpu_cores".to_string(), cpu_cores.to_string());
+ }
+ if let Some(memory_bytes) = hardware_facts.memory_bytes {
+ metadata.insert("hardware.memory_bytes".to_string(), memory_bytes.to_string());
+ }
+ metadata.insert(
+ "hardware.disk_count".to_string(),
+ hardware_facts.disks.len().to_string(),
+ );
+ metadata.insert(
+ "hardware.nic_count".to_string(),
+ hardware_facts.nics.len().to_string(),
+ );
+ if let Some(architecture) = hardware_facts.architecture.as_deref() {
+ metadata.insert("hardware.architecture".to_string(), architecture.to_string());
+ }
+}
+
/// POST /api/v1/phone-home
///
/// Handles node registration during first boot.
@@ -164,6 +194,7 @@ pub async fn phone_home(
let mut metadata = request.metadata.clone();
metadata.insert("role".to_string(), node_config.role.clone());
metadata.insert("services".to_string(), node_config.services.join(","));
+ merge_hardware_summary_metadata(&mut metadata, request.hardware_facts.as_ref());
// Create NodeInfo for tracking
let node_info = NodeInfo {
@@ -210,6 +241,7 @@ pub async fn phone_home(
&node_info,
&node_config,
&request.machine_id,
+ request.hardware_facts.as_ref(),
)
.await
{
@@ -714,6 +746,7 @@ async fn store_cluster_node_if_configured(
node_info: &NodeInfo,
node_config: &NodeConfig,
machine_id: &str,
+ hardware_facts: Option<&HardwareFacts>,
) -> anyhow::Result<()> {
let Some(cluster_id) = state.config.cluster_id.as_deref() else {
debug!("cluster_id not configured; skipping cluster node state write");
@@ -759,6 +792,7 @@ async fn store_cluster_node_if_configured(
failure_domain: node_config.failure_domain.clone(),
nix_profile: node_config.nix_profile.clone(),
install_plan: node_config.install_plan.clone(),
+ hardware_facts: hardware_facts.cloned(),
state: Some(format!("{:?}", node_info.state).to_lowercase()),
last_heartbeat: Some(node_info.last_heartbeat),
};
@@ -837,6 +871,7 @@ mod tests {
ip: None,
cluster_config_hash: None,
metadata: HashMap::new(),
+ hardware_facts: None,
};
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
@@ -872,6 +907,7 @@ mod tests {
ip: Some("10.0.1.100".to_string()),
cluster_config_hash: None,
metadata: HashMap::new(),
+ hardware_facts: None,
};
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
@@ -917,6 +953,7 @@ mod tests {
ip: None,
cluster_config_hash: None,
metadata: HashMap::new(),
+ hardware_facts: None,
};
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
@@ -957,6 +994,7 @@ mod tests {
ip: Some("10.0.3.25".to_string()),
cluster_config_hash: None,
metadata: HashMap::from([("sku".to_string(), "gpu".to_string())]),
+ hardware_facts: None,
};
assert!(enrollment_rule_matches(&rule, &request));
@@ -990,6 +1028,7 @@ mod tests {
"topology.kubernetes.io/zone".to_string(),
"rack-z".to_string(),
)]),
+ hardware_facts: None,
};
let node_classes = vec![NodeClassSpec {
name: "gpu-worker".to_string(),
@@ -998,6 +1037,8 @@ mod tests {
install_plan: Some(InstallPlan {
nixos_configuration: Some("gpu-worker".to_string()),
disko_config_path: Some("profiles/gpu-worker/disko.nix".to_string()),
+ target_disk: Some("/dev/disk/by-id/nvme-gpu-worker".to_string()),
+ target_disk_by_id: None,
}),
roles: vec!["worker".to_string()],
labels: HashMap::from([("tier".to_string(), "gpu".to_string())]),
diff --git a/deployer/crates/deployer-types/src/lib.rs b/deployer/crates/deployer-types/src/lib.rs
index f05965e..93bd480 100644
--- a/deployer/crates/deployer-types/src/lib.rs
+++ b/deployer/crates/deployer-types/src/lib.rs
@@ -55,6 +55,12 @@ pub struct InstallPlan {
/// Repository-relative Disko file used during installation.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub disko_config_path: Option<String>,
+ /// Explicit disk device path used by bootstrap installers.
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub target_disk: Option<String>,
+ /// Stable `/dev/disk/by-id/...` selector preferred over volatile device names.
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub target_disk_by_id: Option<String>,
}
impl InstallPlan {
@@ -66,6 +72,12 @@ impl InstallPlan {
if self.disko_config_path.is_some() {
merged.disko_config_path = self.disko_config_path.clone();
}
+ if self.target_disk.is_some() {
+ merged.target_disk = self.target_disk.clone();
+ }
+ if self.target_disk_by_id.is_some() {
+ merged.target_disk_by_id = self.target_disk_by_id.clone();
+ }
merged
}
@@ -81,6 +93,66 @@ impl InstallPlan {
}
}
+/// Basic inventory record for a physical disk observed during commissioning.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct DiskFact {
+ pub name: String,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub path: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub by_id: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub size_bytes: Option<u64>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub model: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub serial: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub rotational: Option<bool>,
+}
+
+/// Basic inventory record for a network interface observed during commissioning.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct NicFact {
+ pub name: String,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub mac_address: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub oper_state: Option<String>,
+}
+
+/// DMI strings collected during commissioning.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct DmiFact {
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub vendor: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub product_name: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub serial_number: Option<String>,
+}
+
+/// Hardware inventory captured during bootstrap / commissioning.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
+pub struct HardwareFacts {
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub architecture: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub cpu_model: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub cpu_threads: Option<u32>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub cpu_cores: Option<u32>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub memory_bytes: Option<u64>,
+ #[serde(default)]
+ pub disks: Vec<DiskFact>,
+ #[serde(default)]
+ pub nics: Vec<NicFact>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub dmi: Option<DmiFact>,
+}
+
/// Node configuration returned by Deployer
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct NodeConfig {
@@ -136,6 +208,9 @@ pub struct PhoneHomeRequest {
/// Node metadata
#[serde(default)]
pub metadata: HashMap<String, String>,
+ /// Hardware inventory gathered by the bootstrap environment.
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub hardware_facts: Option<HardwareFacts>,
}
/// Phone Home response payload with secrets
@@ -414,6 +489,8 @@ pub struct ClusterNodeRecord {
#[serde(default)]
pub install_plan: Option<InstallPlan>,
#[serde(default)]
+ pub hardware_facts: Option<HardwareFacts>,
+ #[serde(default)]
pub state: Option<String>,
#[serde(default)]
pub last_heartbeat: Option<DateTime<Utc>>,
@@ -430,10 +507,18 @@ pub struct ObservedSystemState {
#[serde(default, skip_serializing_if = "Option::is_none")]
pub target_system: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
+ pub configured_system: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
pub current_system: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub booted_system: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
+ pub rollback_system: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub switch_action: Option<String>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub reboot_required: Option<bool>,
+ #[serde(default, skip_serializing_if = "Option::is_none")]
pub status: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")]
pub last_attempt: Option<DateTime<Utc>>,
@@ -760,6 +845,32 @@ mod tests {
ip: Some("10.0.1.10".to_string()),
cluster_config_hash: Some("abc123".to_string()),
metadata,
+ hardware_facts: Some(HardwareFacts {
+ architecture: Some("x86_64".to_string()),
+ cpu_model: Some("Example CPU".to_string()),
+ cpu_threads: Some(16),
+ cpu_cores: Some(8),
+ memory_bytes: Some(64 * 1024 * 1024 * 1024),
+ disks: vec![DiskFact {
+ name: "nvme0n1".to_string(),
+ path: Some("/dev/nvme0n1".to_string()),
+ by_id: Some("/dev/disk/by-id/nvme-example".to_string()),
+ size_bytes: Some(1_000_000_000_000),
+ model: Some("Example Disk".to_string()),
+ serial: Some("disk-serial".to_string()),
+ rotational: Some(false),
+ }],
+ nics: vec![NicFact {
+ name: "eno1".to_string(),
+ mac_address: Some("52:54:00:12:34:56".to_string()),
+ oper_state: Some("up".to_string()),
+ }],
+ dmi: Some(DmiFact {
+ vendor: Some("ExampleVendor".to_string()),
+ product_name: Some("ExampleSystem".to_string()),
+ serial_number: Some("system-serial".to_string()),
+ }),
+ }),
};
let json = serde_json::to_string(&request).unwrap();
@@ -767,6 +878,14 @@ mod tests {
assert_eq!(deserialized.machine_id, "abc123def456");
assert_eq!(deserialized.node_id, Some("node01".to_string()));
assert_eq!(deserialized.metadata.get("role").unwrap(), "control-plane");
+ assert_eq!(
+ deserialized
+ .hardware_facts
+ .as_ref()
+ .and_then(|facts| facts.disks.first())
+ .and_then(|disk| disk.by_id.as_deref()),
+ Some("/dev/disk/by-id/nvme-example")
+ );
}
#[test]
@@ -785,6 +904,8 @@ mod tests {
install_plan: Some(InstallPlan {
nixos_configuration: Some("node01".to_string()),
disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()),
+ target_disk: Some("/dev/vda".to_string()),
+ target_disk_by_id: None,
}),
};
@@ -811,6 +932,7 @@ mod tests {
.and_then(|config| config.install_plan.as_ref())
.expect("install_plan should round-trip");
assert_eq!(install_plan.nixos_configuration.as_deref(), Some("node01"));
+ assert_eq!(install_plan.target_disk.as_deref(), Some("/dev/vda"));
}
#[test]
@@ -935,8 +1057,12 @@ mod tests {
nixos_configuration: Some("node01".to_string()),
flake_root: Some("/opt/plasmacloud-src".to_string()),
target_system: Some("/nix/store/system-node01".to_string()),
+ configured_system: Some("/nix/store/system-node01".to_string()),
current_system: Some("/nix/store/system-old".to_string()),
booted_system: Some("/nix/store/system-old".to_string()),
+ rollback_system: Some("/nix/store/system-old".to_string()),
+ switch_action: Some("boot".to_string()),
+ reboot_required: Some(true),
status: Some("pending".to_string()),
last_attempt: None,
last_success: None,
@@ -968,4 +1094,28 @@ mod tests {
assert_eq!(decoded.health_check_command.len(), 2);
assert_eq!(decoded.rollback_on_failure, Some(true));
}
+
+ #[test]
+ fn test_install_plan_merges_disk_preferences() {
+ let fallback = InstallPlan {
+ nixos_configuration: Some("fallback".to_string()),
+ disko_config_path: Some("fallback/disko.nix".to_string()),
+ target_disk: Some("/dev/sda".to_string()),
+ target_disk_by_id: None,
+ };
+ let preferred = InstallPlan {
+ nixos_configuration: None,
+ disko_config_path: None,
+ target_disk: None,
+ target_disk_by_id: Some("/dev/disk/by-id/nvme-example".to_string()),
+ };
+
+ let merged = preferred.merged_with(Some(&fallback));
+ assert_eq!(merged.nixos_configuration.as_deref(), Some("fallback"));
+ assert_eq!(merged.target_disk.as_deref(), Some("/dev/sda"));
+ assert_eq!(
+ merged.target_disk_by_id.as_deref(),
+ Some("/dev/disk/by-id/nvme-example")
+ );
+ }
}
diff --git a/deployer/crates/fleet-scheduler/src/main.rs b/deployer/crates/fleet-scheduler/src/main.rs
index 167e773..1271392 100644
--- a/deployer/crates/fleet-scheduler/src/main.rs
+++ b/deployer/crates/fleet-scheduler/src/main.rs
@@ -897,6 +897,7 @@ mod tests {
failure_domain: Some(format!("rack-{}", &node_id[node_id.len() - 1..])),
nix_profile: Some("profiles/worker-linux".to_string()),
install_plan: None,
+ hardware_facts: None,
state: Some("active".to_string()),
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
}
diff --git a/deployer/crates/nix-agent/src/main.rs b/deployer/crates/nix-agent/src/main.rs
index 729ffbb..dd0d433 100644
--- a/deployer/crates/nix-agent/src/main.rs
+++ b/deployer/crates/nix-agent/src/main.rs
@@ -103,6 +103,12 @@ struct ResolvedDesiredSystem {
rollback_on_failure: bool,
}
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum HealthCheckOutcome {
+ Passed,
+ RolledBack,
+}
+
impl Agent {
fn new(cli: Cli) -> Self {
Self {
@@ -155,9 +161,20 @@ impl Agent {
.transpose()
.context("failed to parse desired-system spec")?;
+ let previous_observed = client
+ .get(key_observed_system(
+ &self.cluster_namespace,
+ &self.cluster_id,
+ &self.node_id,
+ ))
+ .await?
+ .map(|bytes| serde_json::from_slice::<ObservedSystemState>(&bytes))
+ .transpose()
+ .context("failed to parse observed-system state")?;
+
let mut observed = self.base_observed_state(&node);
let reconcile_result = self
- .reconcile_node(&node, desired.as_ref(), &mut observed)
+ .reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed)
.await;
if let Err(error) = reconcile_result {
observed.status = Some("failed".to_string());
@@ -177,6 +194,7 @@ impl Agent {
fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
ObservedSystemState {
node_id: node.node_id.clone(),
+ configured_system: read_symlink_target("/nix/var/nix/profiles/system"),
current_system: read_symlink_target("/run/current-system"),
booted_system: read_symlink_target("/run/booted-system"),
..ObservedSystemState::default()
@@ -187,6 +205,7 @@ impl Agent {
&self,
node: &ClusterNodeRecord,
desired: Option<&DesiredSystemSpec>,
+ previous_observed: Option<&ObservedSystemState>,
observed: &mut ObservedSystemState,
) -> Result<()> {
match node.state.as_deref() {
@@ -211,8 +230,12 @@ impl Agent {
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
observed.flake_root = Some(desired.flake_ref.clone());
+ observed.switch_action = Some(desired.switch_action.clone());
- let previous_system = observed.current_system.clone();
+ let previous_system = previous_observed
+ .and_then(|state| state.rollback_system.clone())
+ .or_else(|| observed.current_system.clone());
+ observed.rollback_system = previous_system.clone();
let target_system = self
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
.await
@@ -225,6 +248,25 @@ impl Agent {
observed.target_system = Some(target_system.clone());
if observed.current_system.as_deref() == Some(target_system.as_str()) {
+ if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
+ observed.status = Some("verifying".to_string());
+ observed.last_attempt = Some(Utc::now());
+ let outcome = self
+ .run_health_check_and_maybe_rollback(
+ &desired,
+ previous_system.as_deref(),
+ observed,
+ )
+ .await?;
+ observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
+ observed.current_system = read_symlink_target("/run/current-system");
+ observed.booted_system = read_symlink_target("/run/booted-system");
+ if outcome == HealthCheckOutcome::RolledBack {
+ return Ok(());
+ }
+ }
+
+ observed.reboot_required = Some(false);
observed.status = Some("active".to_string());
observed.last_success = Some(Utc::now());
return Ok(());
@@ -240,9 +282,24 @@ impl Agent {
self.switch_to_target(&target_system, &desired.switch_action)
.await?;
+ observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
observed.booted_system = read_symlink_target("/run/booted-system");
+ if desired.switch_action == "boot" {
+ if observed.configured_system.as_deref() != Some(target_system.as_str()) {
+ return Err(anyhow!(
+ "boot switch completed but configured system does not match target {}",
+ target_system
+ ));
+ }
+
+ observed.reboot_required = Some(true);
+ observed.status = Some("staged".to_string());
+ observed.last_error = None;
+ return Ok(());
+ }
+
if observed.current_system.as_deref() != Some(target_system.as_str()) {
return Err(anyhow!(
"switch completed but /run/current-system does not match target {}",
@@ -250,9 +307,17 @@ impl Agent {
));
}
- self.run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed)
+ let outcome = self
+ .run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed)
.await?;
+ observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
+ observed.current_system = read_symlink_target("/run/current-system");
+ observed.booted_system = read_symlink_target("/run/booted-system");
+ if outcome == HealthCheckOutcome::RolledBack {
+ return Ok(());
+ }
+ observed.reboot_required = Some(false);
observed.status = Some("active".to_string());
observed.last_success = Some(Utc::now());
observed.last_error = None;
@@ -299,26 +364,28 @@ impl Agent {
desired: &ResolvedDesiredSystem,
previous_system: Option<&str>,
observed: &mut ObservedSystemState,
- ) -> Result<()> {
+ ) -> Result<HealthCheckOutcome> {
if desired.health_check_command.is_empty() {
- return Ok(());
+ return Ok(HealthCheckOutcome::Passed);
}
if let Err(error) = run_vec_command(&desired.health_check_command).await {
let error_message = format!("health check failed after activation: {error}");
if desired.rollback_on_failure {
self.rollback_to_previous(previous_system).await?;
+ observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
observed.current_system = read_symlink_target("/run/current-system");
observed.booted_system = read_symlink_target("/run/booted-system");
+ observed.reboot_required = Some(false);
observed.status = Some("rolled-back".to_string());
observed.last_error = Some(error_message);
- return Ok(());
+ return Ok(HealthCheckOutcome::RolledBack);
}
return Err(anyhow!(error_message));
}
- Ok(())
+ Ok(HealthCheckOutcome::Passed)
}
async fn rollback_to_previous(&self, previous_system: Option<&str>) -> Result<()> {
@@ -370,6 +437,20 @@ fn target_flake_attr(flake_root: &str, configuration: &str) -> String {
)
}
+fn should_run_post_boot_health_check(
+ previous_observed: Option<&ObservedSystemState>,
+ desired: &ResolvedDesiredSystem,
+ target_system: &str,
+) -> bool {
+ desired.switch_action == "boot"
+ && previous_observed
+ .map(|state| {
+ state.status.as_deref() == Some("staged")
+ && state.target_system.as_deref() == Some(target_system)
+ })
+ .unwrap_or(false)
+}
+
fn read_symlink_target(path: &str) -> Option<String> {
fs::read_link(path)
.ok()
@@ -457,7 +538,10 @@ mod tests {
install_plan: Some(InstallPlan {
nixos_configuration: Some("node01".to_string()),
disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()),
+ target_disk: Some("/dev/vda".to_string()),
+ target_disk_by_id: None,
}),
+ hardware_facts: None,
state: Some("active".to_string()),
last_heartbeat: None,
}
@@ -549,4 +633,48 @@ mod tests {
fn read_symlink_target_returns_none_for_missing_path() {
assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None);
}
+
+ #[test]
+ fn post_boot_health_check_is_requested_for_matching_staged_target() {
+ let desired = ResolvedDesiredSystem {
+ nixos_configuration: "node01".to_string(),
+ flake_ref: "/opt/plasmacloud-src".to_string(),
+ switch_action: "boot".to_string(),
+ health_check_command: vec!["true".to_string()],
+ rollback_on_failure: true,
+ };
+ let previous = ObservedSystemState {
+ status: Some("staged".to_string()),
+ target_system: Some("/nix/store/example-system".to_string()),
+ ..ObservedSystemState::default()
+ };
+
+ assert!(should_run_post_boot_health_check(
+ Some(&previous),
+ &desired,
+ "/nix/store/example-system"
+ ));
+ }
+
+ #[test]
+ fn post_boot_health_check_is_skipped_for_non_matching_state() {
+ let desired = ResolvedDesiredSystem {
+ nixos_configuration: "node01".to_string(),
+ flake_ref: "/opt/plasmacloud-src".to_string(),
+ switch_action: "boot".to_string(),
+ health_check_command: vec!["true".to_string()],
+ rollback_on_failure: true,
+ };
+ let previous = ObservedSystemState {
+ status: Some("active".to_string()),
+ target_system: Some("/nix/store/example-system".to_string()),
+ ..ObservedSystemState::default()
+ };
+
+ assert!(!should_run_post_boot_health_check(
+ Some(&previous),
+ &desired,
+ "/nix/store/example-system"
+ ));
+ }
}
diff --git a/deployer/scripts/verify-deployer-bootstrap-e2e.sh b/deployer/scripts/verify-deployer-bootstrap-e2e.sh
index 78dfcb3..d8e2cdd 100755
--- a/deployer/scripts/verify-deployer-bootstrap-e2e.sh
+++ b/deployer/scripts/verify-deployer-bootstrap-e2e.sh
@@ -173,6 +173,7 @@ node_classes:
install_plan:
nixos_configuration: worker-golden
disko_config_path: profiles/worker-linux/disko.nix
+ target_disk_by_id: /dev/disk/by-id/worker-default
roles:
- worker
labels:
@@ -182,6 +183,7 @@ node_classes:
install_plan:
nixos_configuration: edge-metal
disko_config_path: profiles/edge-metal/disko.nix
+ target_disk_by_id: /dev/disk/by-id/edge-default
roles:
- edge
labels:
@@ -208,6 +210,7 @@ nodes:
install_plan:
nixos_configuration: node01
disko_config_path: nix/nodes/vm-cluster/node01/disko.nix
+ target_disk: /dev/vda
desired_system:
flake_ref: "github:centra/cloud"
health_check_command:
@@ -273,6 +276,7 @@ assert payload["node_config"]["node_class"] == "general-worker"
assert payload["node_config"]["nix_profile"] == "profiles/worker-linux"
assert payload["node_config"]["install_plan"]["nixos_configuration"] == "node01"
assert payload["node_config"]["install_plan"]["disko_config_path"] == "nix/nodes/vm-cluster/node01/disko.nix"
+assert payload["node_config"]["install_plan"]["target_disk"] == "/dev/vda"
assert payload["node_config"]["failure_domain"] == "rack-a"
print("Seeded mapping validated")
PY
@@ -339,6 +343,36 @@ request = urllib.request.Request(
"sku": "metal",
"topology.kubernetes.io/zone": "rack-z",
},
+ "hardware_facts": {
+ "architecture": "x86_64",
+ "cpu_model": "Example CPU",
+ "cpu_threads": 32,
+ "cpu_cores": 16,
+ "memory_bytes": 137438953472,
+ "disks": [
+ {
+ "name": "nvme0n1",
+ "path": "/dev/nvme0n1",
+ "by_id": "/dev/disk/by-id/nvme-dynamic-metal-01",
+ "size_bytes": 2000398934016,
+ "model": "Example NVMe",
+ "serial": "disk-serial-01",
+ "rotational": False
+ }
+ ],
+ "nics": [
+ {
+ "name": "eno1",
+ "mac_address": "52:54:00:aa:bb:cc",
+ "oper_state": "up"
+ }
+ ],
+ "dmi": {
+ "vendor": "ExampleVendor",
+ "product_name": "ExampleMetal",
+ "serial_number": "dynamic-metal-serial"
+ }
+ },
}
).encode(),
headers={
@@ -357,6 +391,7 @@ assert payload["node_config"]["node_class"] == "edge-metal"
assert payload["node_config"]["nix_profile"] == "profiles/edge-metal"
assert payload["node_config"]["install_plan"]["nixos_configuration"] == "edge-metal"
assert payload["node_config"]["install_plan"]["disko_config_path"] == "profiles/edge-metal/disko.nix"
+assert payload["node_config"]["install_plan"]["target_disk_by_id"] == "/dev/disk/by-id/edge-default"
assert "prismnet" in payload["node_config"]["services"]
assert payload["node_config"]["labels"]["managed-by"] == "deployer"
print(payload["node_id"])
@@ -400,6 +435,19 @@ if dynamic.get("failure_domain") != "rack-z":
raise SystemExit(f"unexpected dynamic failure domain: {dynamic}")
if dynamic.get("labels", {}).get("lane") != "edge":
raise SystemExit(f"missing pool label propagation: {dynamic}")
+if seeded.get("install_plan", {}).get("target_disk") != "/dev/vda":
+ raise SystemExit(f"missing seeded target disk: {seeded}")
+if dynamic.get("install_plan", {}).get("target_disk_by_id") != "/dev/disk/by-id/edge-default":
+ raise SystemExit(f"missing dynamic target disk by-id: {dynamic}")
+facts = dynamic.get("hardware_facts") or {}
+if facts.get("architecture") != "x86_64":
+ raise SystemExit(f"missing dynamic hardware architecture: {dynamic}")
+if facts.get("disks", [{}])[0].get("by_id") != "/dev/disk/by-id/nvme-dynamic-metal-01":
+ raise SystemExit(f"missing dynamic hardware disk facts: {dynamic}")
+if dynamic.get("labels", {}).get("hardware.architecture") != "x86_64":
+ raise SystemExit(f"missing hardware metadata labels: {dynamic}")
+if dynamic.get("labels", {}).get("hardware.disk_count") != "1":
+ raise SystemExit(f"missing hardware disk count label: {dynamic}")
print("Deployer bootstrap records validated")
PY
diff --git a/nix/iso/plasmacloud-iso.nix b/nix/iso/plasmacloud-iso.nix
index 81df962..70eb23b 100644
--- a/nix/iso/plasmacloud-iso.nix
+++ b/nix/iso/plasmacloud-iso.nix
@@ -44,8 +44,30 @@
script = ''
set -euo pipefail
- # Discover Deployer via DNS or fallback
- DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}"
+ cmdline_value() {
+ local key="$1"
+ local arg
+ for arg in $(cat /proc/cmdline); do
+ case "$arg" in
+ "$key"=*)
+ echo "''${arg#*=}"
+ return 0
+ ;;
+ esac
+ done
+ return 1
+ }
+
+ mkdir -p /etc/plasmacloud
+
+ # Discover Deployer via environment, kernel cmdline, or fallback.
+ DEPLOYER_URL="''${DEPLOYER_URL:-}"
+ if [ -z "$DEPLOYER_URL" ]; then
+ DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
+ fi
+ if [ -z "$DEPLOYER_URL" ]; then
+ DEPLOYER_URL="http://192.168.100.1:8080"
+ fi
# Get machine identity
MACHINE_ID=$(cat /etc/machine-id)
@@ -61,14 +83,27 @@
DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
+ else
+ DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
+ fi
+
+ DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
+ if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
+ DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
+ if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
+ DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
+ ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
+ "$DEPLOYER_CA_CERT_URL" \
+ -o "$DEPLOYER_CA_CERT_PATH"
+ fi
fi
CURL_ARGS=(-sf --connect-timeout 5 --max-time 15)
if [ -n "$DEPLOYER_TOKEN" ]; then
CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
fi
- if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then
- CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}")
+ if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
+ CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
fi
NODE_IP=$(${pkgs.iproute2}/bin/ip -4 route get 1.1.1.1 2>/dev/null | ${pkgs.gawk}/bin/awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}')
@@ -79,6 +114,76 @@
NODE_IP=$(hostname -I 2>/dev/null | ${pkgs.gawk}/bin/awk '{print $1}')
fi
NODE_HOSTNAME=$(hostname)
+ CPU_MODEL=$(${pkgs.gawk}/bin/awk -F: '/model name/ {gsub(/^[ \t]+/, "", $2); print $2; exit}' /proc/cpuinfo 2>/dev/null || true)
+ CPU_CORES=$(${pkgs.gawk}/bin/awk '/^cpu cores/ {print $4; exit}' /proc/cpuinfo 2>/dev/null || true)
+ CPU_THREADS=$(${pkgs.coreutils}/bin/nproc --all 2>/dev/null || true)
+ MEMORY_KIB=$(${pkgs.gawk}/bin/awk '/MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null || true)
+ MEMORY_BYTES=""
+ if [ -n "$MEMORY_KIB" ]; then
+ MEMORY_BYTES=$((MEMORY_KIB * 1024))
+ fi
+
+ DISKS_JSON=$(${pkgs.util-linux}/bin/lsblk -J -b -o NAME,PATH,SIZE,MODEL,SERIAL,ROTA,TYPE 2>/dev/null | ${pkgs.jq}/bin/jq '
+ [.blockdevices[] | select(.type == "disk") | {
+ name: .name,
+ path: (.path // null),
+ size_bytes: (.size | tonumber?),
+ model: ((.model // "") | if . == "" then null else . end),
+ serial: ((.serial // "") | if . == "" then null else . end),
+ rotational: (if .rota == null then null else (.rota == 1) end)
+ }]
+ ')
+ NICS_JSON=$(${pkgs.iproute2}/bin/ip -j link 2>/dev/null | ${pkgs.jq}/bin/jq '
+ [.[] | select(.ifname != "lo") | {
+ name: .ifname,
+ mac_address: ((.address // "") | if . == "" or . == "00:00:00:00:00:00" then null else . end),
+ oper_state: ((.operstate // "") | ascii_downcase | if . == "" then null else . end)
+ }]
+ ')
+        DMI_VENDOR=$(tr -d '\n' < /sys/class/dmi/id/sys_vendor 2>/dev/null || true)
+        DMI_PRODUCT=$(tr -d '\n' < /sys/class/dmi/id/product_name 2>/dev/null || true)
+        DMI_SERIAL=$(tr -d '\n' < /sys/class/dmi/id/product_serial 2>/dev/null || true)
+ HARDWARE_FACTS=$(${pkgs.jq}/bin/jq -n \
+ --arg architecture "$(${pkgs.coreutils}/bin/uname -m)" \
+ --arg cpu_model "$CPU_MODEL" \
+ --arg cpu_threads "$CPU_THREADS" \
+ --arg cpu_cores "$CPU_CORES" \
+ --arg memory_bytes "$MEMORY_BYTES" \
+ --arg dmi_vendor "$DMI_VENDOR" \
+ --arg dmi_product "$DMI_PRODUCT" \
+ --arg dmi_serial "$DMI_SERIAL" \
+ --argjson disks "$DISKS_JSON" \
+ --argjson nics "$NICS_JSON" '
+ {
+ architecture: (if $architecture == "" then null else $architecture end),
+ cpu_model: (if $cpu_model == "" then null else $cpu_model end),
+ cpu_threads: (if $cpu_threads == "" then null else ($cpu_threads | tonumber) end),
+ cpu_cores: (if $cpu_cores == "" then null else ($cpu_cores | tonumber) end),
+ memory_bytes: (if $memory_bytes == "" then null else ($memory_bytes | tonumber) end),
+ disks: $disks,
+ nics: $nics,
+ dmi: ({
+ vendor: (if $dmi_vendor == "" then null else $dmi_vendor end),
+ product_name: (if $dmi_product == "" then null else $dmi_product end),
+ serial_number: (if $dmi_serial == "" then null else $dmi_serial end)
+ } | with_entries(select(.value != null)))
+ }
+ | if (.dmi | length) == 0 then del(.dmi) else . end
+ ')
+ REQUEST_JSON=$(${pkgs.jq}/bin/jq -n \
+ --arg machine_id "$MACHINE_ID" \
+ --arg node_id "$NODE_HOSTNAME" \
+ --arg hostname "$NODE_HOSTNAME" \
+ --arg ip "$NODE_IP" \
+ --argjson hardware_facts "$HARDWARE_FACTS" '
+ {
+ machine_id: $machine_id,
+ node_id: $node_id,
+ hostname: $hostname,
+ ip: $ip,
+ hardware_facts: $hardware_facts
+ }
+ ')
# Phone Home request with retry
for i in 1 2 3 4 5; do
@@ -86,7 +191,7 @@
if RESPONSE=$(${pkgs.curl}/bin/curl "''${CURL_ARGS[@]}" -X POST \
-H "Content-Type: application/json" \
- -d "{\"machine_id\": \"$MACHINE_ID\", \"node_id\": \"$NODE_HOSTNAME\", \"hostname\": \"$NODE_HOSTNAME\", \"ip\": \"$NODE_IP\"}" \
+ -d "$REQUEST_JSON" \
"$DEPLOYER_URL/api/v1/phone-home"); then
echo "✓ Phone Home successful"
@@ -177,6 +282,20 @@
script = ''
set -euo pipefail
+ cmdline_value() {
+ local key="$1"
+ local arg
+ for arg in $(cat /proc/cmdline); do
+ case "$arg" in
+ "$key"=*)
+ echo "''${arg#*=}"
+ return 0
+ ;;
+ esac
+ done
+ return 1
+ }
+
if [ ! -s /etc/plasmacloud/node-config.json ]; then
echo "ERROR: node-config.json missing (bootstrap not complete?)"
exit 1
@@ -186,7 +305,15 @@
NODE_IP=$(${pkgs.jq}/bin/jq -r '.ip // empty' /etc/plasmacloud/node-config.json)
NIXOS_CONFIGURATION=$(${pkgs.jq}/bin/jq -r '.install_plan.nixos_configuration // .hostname // empty' /etc/plasmacloud/node-config.json)
DISKO_PATH=$(${pkgs.jq}/bin/jq -r '.install_plan.disko_config_path // empty' /etc/plasmacloud/node-config.json)
- DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}"
+ TARGET_DISK=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk // empty' /etc/plasmacloud/node-config.json)
+ TARGET_DISK_BY_ID=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk_by_id // empty' /etc/plasmacloud/node-config.json)
+ DEPLOYER_URL="''${DEPLOYER_URL:-}"
+ if [ -z "$DEPLOYER_URL" ]; then
+ DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
+ fi
+ if [ -z "$DEPLOYER_URL" ]; then
+ DEPLOYER_URL="http://192.168.100.1:8080"
+ fi
SRC_ROOT="/opt/plasmacloud-src"
if [ -z "$NODE_ID" ] || [ -z "$NODE_IP" ]; then
@@ -205,14 +332,27 @@
DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
+ else
+ DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
+ fi
+
+ DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
+ if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
+ DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
+ if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
+ DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
+ ${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
+ "$DEPLOYER_CA_CERT_URL" \
+ -o "$DEPLOYER_CA_CERT_PATH"
+ fi
fi
CURL_ARGS=(-sfL --connect-timeout 5 --max-time 120)
if [ -n "$DEPLOYER_TOKEN" ]; then
CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
fi
- if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then
- CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}")
+ if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
+ CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
fi
BUNDLE_PATH="/run/plasmacloud/flake-bundle.tar.gz"
@@ -247,18 +387,32 @@
echo "PlasmaCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, disko_path=$DISKO_PATH)"
- # Find disk
- DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk"{print $1; exit}')
+ # Resolve installation target disk.
+ if [ -n "$TARGET_DISK_BY_ID" ]; then
+ if [ ! -b "$TARGET_DISK_BY_ID" ]; then
+ echo "ERROR: target_disk_by_id does not exist: $TARGET_DISK_BY_ID"
+ exit 1
+ fi
+ DISK="$TARGET_DISK_BY_ID"
+ elif [ -n "$TARGET_DISK" ]; then
+ if [ ! -b "$TARGET_DISK" ]; then
+ echo "ERROR: target_disk does not exist: $TARGET_DISK"
+ exit 1
+ fi
+ DISK="$TARGET_DISK"
+ else
+ DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk"{print $1; exit}')
+ fi
if [ -z "$DISK" ]; then
echo "ERROR: No disk found"
exit 1
fi
- ROOT_PART="''${DISK}2"
+ ROOT_PART=$(${pkgs.util-linux}/bin/lsblk -lnpo NAME,TYPE "$DISK" 2>/dev/null | ${pkgs.gawk}/bin/awk '$2=="part"{print $1}' | sed -n '2p')
mkdir -p /mnt
# Skip if already installed
- if ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then
+ if [ -n "$ROOT_PART" ] && ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then
mount "$ROOT_PART" /mnt 2>/dev/null || true
if [ -e /mnt/etc/NIXOS ]; then
echo "✓ Existing NixOS detected; skipping install"
@@ -271,9 +425,25 @@
echo "Validating NixOS configuration output..."
nix eval --raw "$SRC_ROOT#nixosConfigurations.$NIXOS_CONFIGURATION.config.system.build.toplevel.drvPath" >/dev/null
+ EFFECTIVE_DISKO_PATH="$SRC_ROOT/$DISKO_PATH"
+ if [ -n "$DISK" ]; then
+ cat > /run/plasmacloud/disko-wrapper.nix <