Wire bootstrap installers to explicit targets and reboot-aware rollout
This commit is contained in:
parent
88e78d1602
commit
a7d5cfa738
17 changed files with 640 additions and 32 deletions
|
|
@ -106,6 +106,11 @@ OUTPUT:
|
|||
- initrd Initial ramdisk
|
||||
- netboot.ipxe iPXE boot script
|
||||
|
||||
ENVIRONMENT:
|
||||
PLASMACLOUD_DEPLOYER_URL Optional deployer endpoint embedded into generated netboot.ipxe
|
||||
PLASMACLOUD_BOOTSTRAP_TOKEN Optional bootstrap token embedded into generated netboot.ipxe
|
||||
PLASMACLOUD_CA_CERT_URL Optional CA certificate URL embedded into generated netboot.ipxe
|
||||
|
||||
EOF
|
||||
}
|
||||
|
||||
|
|
@ -151,6 +156,17 @@ build_profile() {
|
|||
print_warning "Failed to resolve init path for $profile; using /init"
|
||||
fi
|
||||
|
||||
local deployer_kernel_args=""
|
||||
if [ -n "${PLASMACLOUD_DEPLOYER_URL:-}" ]; then
|
||||
deployer_kernel_args+=" plasmacloud.deployer_url=${PLASMACLOUD_DEPLOYER_URL}"
|
||||
fi
|
||||
if [ -n "${PLASMACLOUD_BOOTSTRAP_TOKEN:-}" ]; then
|
||||
deployer_kernel_args+=" plasmacloud.bootstrap_token=${PLASMACLOUD_BOOTSTRAP_TOKEN}"
|
||||
fi
|
||||
if [ -n "${PLASMACLOUD_CA_CERT_URL:-}" ]; then
|
||||
deployer_kernel_args+=" plasmacloud.ca_cert_url=${PLASMACLOUD_CA_CERT_URL}"
|
||||
fi
|
||||
|
||||
# Generate iPXE boot script
|
||||
print_info " Generating iPXE boot script..."
|
||||
cat > "$profile_dir/netboot.ipxe" << EOF
|
||||
|
|
@ -169,7 +185,7 @@ echo Initrd: initrd
|
|||
echo
|
||||
|
||||
# Load kernel and initrd
|
||||
kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4
|
||||
kernel \${boot-server}/$profile/bzImage init=${init_path} console=ttyS0,115200 console=tty0 loglevel=4${deployer_kernel_args}
|
||||
initrd \${boot-server}/$profile/initrd
|
||||
|
||||
# Boot
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ set boot-server 10.0.100.10
|
|||
set boot-url http://${boot-server}/boot
|
||||
set nixos-url ${boot-url}/nixos
|
||||
set provisioning-server http://${boot-server}
|
||||
set deployer-url http://${boot-server}:8080
|
||||
|
||||
# Detect network configuration
|
||||
echo Network Configuration:
|
||||
|
|
@ -181,6 +182,7 @@ set kernel-params ${kernel-params} centra.profile=${profile}
|
|||
set kernel-params ${kernel-params} centra.hostname=${hostname}
|
||||
set kernel-params ${kernel-params} centra.mac=${mac}
|
||||
set kernel-params ${kernel-params} centra.provisioning-server=${provisioning-server}
|
||||
set kernel-params ${kernel-params} plasmacloud.deployer_url=${deployer-url}
|
||||
set kernel-params ${kernel-params} console=tty0 console=ttyS0,115200n8
|
||||
|
||||
# For debugging, enable these:
|
||||
|
|
|
|||
|
|
@ -60,7 +60,7 @@ let
|
|||
next-server ${cfg.serverAddress};
|
||||
|
||||
if exists user-class and option user-class = "iPXE" {
|
||||
filename "http://${cfg.serverAddress}/boot/ipxe/boot.ipxe";
|
||||
filename "http://${cfg.serverAddress}:${toString cfg.http.port}/boot/ipxe/boot.ipxe";
|
||||
} elsif option architecture-type = 00:00 {
|
||||
filename "undionly.kpxe";
|
||||
} elsif option architecture-type = 00:06 {
|
||||
|
|
@ -82,9 +82,10 @@ let
|
|||
#!ipxe
|
||||
|
||||
set boot-server ${cfg.serverAddress}
|
||||
set boot-url http://''${boot-server}/boot
|
||||
set boot-url http://''${boot-server}:${toString cfg.http.port}/boot
|
||||
set nixos-url ''${boot-url}/nixos
|
||||
set provisioning-server http://''${boot-server}
|
||||
set provisioning-server http://''${boot-server}:${toString cfg.http.port}
|
||||
set deployer-url ${if cfg.bootstrap.deployerUrl != null then cfg.bootstrap.deployerUrl else "http://${cfg.serverAddress}:8080"}
|
||||
|
||||
echo Network Configuration:
|
||||
echo IP Address: ''${ip}
|
||||
|
|
@ -145,6 +146,9 @@ let
|
|||
set kernel-params ''${kernel-params} centra.hostname=''${hostname}
|
||||
set kernel-params ''${kernel-params} centra.mac=''${mac}
|
||||
set kernel-params ''${kernel-params} centra.provisioning-server=''${provisioning-server}
|
||||
set kernel-params ''${kernel-params} plasmacloud.deployer_url=''${deployer-url}
|
||||
${optionalString (cfg.bootstrap.bootstrapToken != null) "set kernel-params ''${kernel-params} plasmacloud.bootstrap_token=${cfg.bootstrap.bootstrapToken}"}
|
||||
${optionalString (cfg.bootstrap.caCertUrl != null) "set kernel-params ''${kernel-params} plasmacloud.ca_cert_url=${cfg.bootstrap.caCertUrl}"}
|
||||
set kernel-params ''${kernel-params} console=tty0 console=ttyS0,115200n8
|
||||
|
||||
kernel ''${nixos-url}/bzImage ''${kernel-params} || goto failed
|
||||
|
|
@ -338,6 +342,28 @@ in {
|
|||
};
|
||||
};
|
||||
|
||||
# Settings handed to the bootstrap environment through kernel arguments in the
# generated iPXE boot script (plasmacloud.deployer_url, plasmacloud.bootstrap_token,
# plasmacloud.ca_cert_url).
bootstrap = {
|
||||
# When null, the boot script falls back to http://<serverAddress>:8080.
deployerUrl = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Deployer endpoint passed to the bootstrap ISO/netboot environment";
|
||||
example = "https://deployer.example.com:8443";
|
||||
};
|
||||
|
||||
# When non-null, the value is written verbatim into the iPXE script as
# plasmacloud.bootstrap_token=<value>; when null, no such argument is emitted.
# NOTE(review): the script is assembled from a Nix string, so the token presumably
# ends up readable wherever the generated script is stored or served — confirm.
bootstrapToken = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional shared bootstrap token embedded in iPXE kernel arguments";
|
||||
};
|
||||
|
||||
# When non-null, emitted as plasmacloud.ca_cert_url=<value>; omitted otherwise.
caCertUrl = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional CA certificate URL fetched by the bootstrap environment before phone-home";
|
||||
example = "https://deployer.example.com/bootstrap-ca.crt";
|
||||
};
|
||||
};
|
||||
|
||||
nodes = mkOption {
|
||||
type = types.attrsOf (types.submodule {
|
||||
options = {
|
||||
|
|
|
|||
|
|
@ -907,6 +907,8 @@ mod tests {
|
|||
install_plan: Some(InstallPlan {
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
disko_config_path: Some("profiles/worker-linux/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/disk/by-id/worker-golden".to_string()),
|
||||
target_disk_by_id: None,
|
||||
}),
|
||||
roles: vec!["worker".to_string()],
|
||||
labels: HashMap::from([("tier".to_string(), "general".to_string())]),
|
||||
|
|
|
|||
|
|
@ -133,6 +133,8 @@ mod tests {
|
|||
install_plan: Some(InstallPlan {
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
disko_config_path: Some("profiles/worker/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/vda".to_string()),
|
||||
target_disk_by_id: None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
|
||||
use chrono::Utc;
|
||||
use deployer_types::{
|
||||
EnrollmentRuleSpec, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState,
|
||||
PhoneHomeRequest, PhoneHomeResponse,
|
||||
EnrollmentRuleSpec, HardwareFacts, InstallPlan, NodeClassSpec, NodeConfig, NodeInfo,
|
||||
NodePoolSpec, NodeState, PhoneHomeRequest, PhoneHomeResponse,
|
||||
};
|
||||
use std::sync::Arc;
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
|
@ -19,6 +19,36 @@ fn merge_install_plan(
|
|||
InstallPlan::from_layers(preferred, fallback)
|
||||
}
|
||||
|
||||
/// Adds a flat summary of the reported hardware inventory to `metadata`.
///
/// Does nothing when `hardware_facts` is `None`. Otherwise it inserts
/// `hardware.cpu_threads`, `hardware.cpu_cores`, `hardware.memory_bytes` and
/// `hardware.architecture` for whichever of those facts are present, and always
/// inserts `hardware.disk_count` and `hardware.nic_count` (the lengths of the
/// `disks` and `nics` lists). Entries already stored under those keys are
/// replaced.
fn merge_hardware_summary_metadata(
|
||||
metadata: &mut std::collections::HashMap<String, String>,
|
||||
hardware_facts: Option<&HardwareFacts>,
|
||||
) {
|
||||
// No inventory was reported: leave `metadata` untouched.
let Some(hardware_facts) = hardware_facts else {
|
||||
return;
|
||||
};
|
||||
|
||||
// Scalar facts are copied only when the reporter supplied them.
if let Some(cpu_threads) = hardware_facts.cpu_threads {
|
||||
metadata.insert("hardware.cpu_threads".to_string(), cpu_threads.to_string());
|
||||
}
|
||||
if let Some(cpu_cores) = hardware_facts.cpu_cores {
|
||||
metadata.insert("hardware.cpu_cores".to_string(), cpu_cores.to_string());
|
||||
}
|
||||
if let Some(memory_bytes) = hardware_facts.memory_bytes {
|
||||
metadata.insert("hardware.memory_bytes".to_string(), memory_bytes.to_string());
|
||||
}
|
||||
// Disk and NIC counts are written unconditionally, so empty lists are recorded as "0".
metadata.insert(
|
||||
"hardware.disk_count".to_string(),
|
||||
hardware_facts.disks.len().to_string(),
|
||||
);
|
||||
metadata.insert(
|
||||
"hardware.nic_count".to_string(),
|
||||
hardware_facts.nics.len().to_string(),
|
||||
);
|
||||
if let Some(architecture) = hardware_facts.architecture.as_deref() {
|
||||
metadata.insert("hardware.architecture".to_string(), architecture.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
/// POST /api/v1/phone-home
|
||||
///
|
||||
/// Handles node registration during first boot.
|
||||
|
|
@ -164,6 +194,7 @@ pub async fn phone_home(
|
|||
let mut metadata = request.metadata.clone();
|
||||
metadata.insert("role".to_string(), node_config.role.clone());
|
||||
metadata.insert("services".to_string(), node_config.services.join(","));
|
||||
merge_hardware_summary_metadata(&mut metadata, request.hardware_facts.as_ref());
|
||||
|
||||
// Create NodeInfo for tracking
|
||||
let node_info = NodeInfo {
|
||||
|
|
@ -210,6 +241,7 @@ pub async fn phone_home(
|
|||
&node_info,
|
||||
&node_config,
|
||||
&request.machine_id,
|
||||
request.hardware_facts.as_ref(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
|
|
@ -714,6 +746,7 @@ async fn store_cluster_node_if_configured(
|
|||
node_info: &NodeInfo,
|
||||
node_config: &NodeConfig,
|
||||
machine_id: &str,
|
||||
hardware_facts: Option<&HardwareFacts>,
|
||||
) -> anyhow::Result<()> {
|
||||
let Some(cluster_id) = state.config.cluster_id.as_deref() else {
|
||||
debug!("cluster_id not configured; skipping cluster node state write");
|
||||
|
|
@ -759,6 +792,7 @@ async fn store_cluster_node_if_configured(
|
|||
failure_domain: node_config.failure_domain.clone(),
|
||||
nix_profile: node_config.nix_profile.clone(),
|
||||
install_plan: node_config.install_plan.clone(),
|
||||
hardware_facts: hardware_facts.cloned(),
|
||||
state: Some(format!("{:?}", node_info.state).to_lowercase()),
|
||||
last_heartbeat: Some(node_info.last_heartbeat),
|
||||
};
|
||||
|
|
@ -837,6 +871,7 @@ mod tests {
|
|||
ip: None,
|
||||
cluster_config_hash: None,
|
||||
metadata: HashMap::new(),
|
||||
hardware_facts: None,
|
||||
};
|
||||
|
||||
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
|
||||
|
|
@ -872,6 +907,7 @@ mod tests {
|
|||
ip: Some("10.0.1.100".to_string()),
|
||||
cluster_config_hash: None,
|
||||
metadata: HashMap::new(),
|
||||
hardware_facts: None,
|
||||
};
|
||||
|
||||
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
|
||||
|
|
@ -917,6 +953,7 @@ mod tests {
|
|||
ip: None,
|
||||
cluster_config_hash: None,
|
||||
metadata: HashMap::new(),
|
||||
hardware_facts: None,
|
||||
};
|
||||
|
||||
let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
|
||||
|
|
@ -957,6 +994,7 @@ mod tests {
|
|||
ip: Some("10.0.3.25".to_string()),
|
||||
cluster_config_hash: None,
|
||||
metadata: HashMap::from([("sku".to_string(), "gpu".to_string())]),
|
||||
hardware_facts: None,
|
||||
};
|
||||
|
||||
assert!(enrollment_rule_matches(&rule, &request));
|
||||
|
|
@ -990,6 +1028,7 @@ mod tests {
|
|||
"topology.kubernetes.io/zone".to_string(),
|
||||
"rack-z".to_string(),
|
||||
)]),
|
||||
hardware_facts: None,
|
||||
};
|
||||
let node_classes = vec![NodeClassSpec {
|
||||
name: "gpu-worker".to_string(),
|
||||
|
|
@ -998,6 +1037,8 @@ mod tests {
|
|||
install_plan: Some(InstallPlan {
|
||||
nixos_configuration: Some("gpu-worker".to_string()),
|
||||
disko_config_path: Some("profiles/gpu-worker/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/disk/by-id/nvme-gpu-worker".to_string()),
|
||||
target_disk_by_id: None,
|
||||
}),
|
||||
roles: vec!["worker".to_string()],
|
||||
labels: HashMap::from([("tier".to_string(), "gpu".to_string())]),
|
||||
|
|
|
|||
|
|
@ -55,6 +55,12 @@ pub struct InstallPlan {
|
|||
/// Repository-relative Disko file used during installation.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub disko_config_path: Option<String>,
|
||||
/// Explicit disk device path used by bootstrap installers.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub target_disk: Option<String>,
|
||||
/// Stable `/dev/disk/by-id/...` selector preferred over volatile device names.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub target_disk_by_id: Option<String>,
|
||||
}
|
||||
|
||||
impl InstallPlan {
|
||||
|
|
@ -66,6 +72,12 @@ impl InstallPlan {
|
|||
if self.disko_config_path.is_some() {
|
||||
merged.disko_config_path = self.disko_config_path.clone();
|
||||
}
|
||||
if self.target_disk.is_some() {
|
||||
merged.target_disk = self.target_disk.clone();
|
||||
}
|
||||
if self.target_disk_by_id.is_some() {
|
||||
merged.target_disk_by_id = self.target_disk_by_id.clone();
|
||||
}
|
||||
merged
|
||||
}
|
||||
|
||||
|
|
@ -81,6 +93,66 @@ impl InstallPlan {
|
|||
}
|
||||
}
|
||||
|
||||
/// Basic inventory record for a physical disk observed during commissioning.
///
/// Only `name` is required. Every optional field defaults to `None` when it is
/// missing from the input and is left out of the serialized form when `None`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct DiskFact {
|
||||
/// Device name, e.g. `nvme0n1`.
pub name: String,
|
||||
/// Device node path, e.g. `/dev/nvme0n1`.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub path: Option<String>,
|
||||
/// `/dev/disk/by-id/...` path for the disk, when one was reported.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub by_id: Option<String>,
|
||||
/// Disk size in bytes.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub size_bytes: Option<u64>,
|
||||
/// Model string reported for the disk.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub model: Option<String>,
|
||||
/// Serial string reported for the disk.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub serial: Option<String>,
|
||||
/// Rotational flag; presumably `true` for spinning media and `false` for
/// solid-state devices — confirm against the collector that fills it in.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub rotational: Option<bool>,
|
||||
}
|
||||
|
||||
/// Basic inventory record for a network interface observed during commissioning.
///
/// Only `name` is required. Optional fields default to `None` when missing from
/// the input and are left out of the serialized form when `None`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct NicFact {
|
||||
/// Interface name, e.g. `eno1`.
pub name: String,
|
||||
/// MAC address as a string, e.g. `52:54:00:12:34:56`.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub mac_address: Option<String>,
|
||||
/// Operational state string, e.g. `up`.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub oper_state: Option<String>,
|
||||
}
|
||||
|
||||
/// DMI strings collected during commissioning.
///
/// Every field is optional: it defaults to `None` when missing from the input
/// and is left out of the serialized form when `None`.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct DmiFact {
|
||||
/// System vendor string.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub vendor: Option<String>,
|
||||
/// System product name string.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub product_name: Option<String>,
|
||||
/// System serial number string.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub serial_number: Option<String>,
|
||||
}
|
||||
|
||||
/// Hardware inventory captured during bootstrap / commissioning.
///
/// All fields are optional on input. `Option` fields are left out of the
/// serialized form when `None`; `disks` and `nics` default to empty lists when
/// missing and are always serialized, even when empty.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
|
||||
pub struct HardwareFacts {
|
||||
/// CPU architecture string, e.g. `x86_64`.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub architecture: Option<String>,
|
||||
/// CPU model string.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub cpu_model: Option<String>,
|
||||
/// Reported CPU thread count.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub cpu_threads: Option<u32>,
|
||||
/// Reported CPU core count.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub cpu_cores: Option<u32>,
|
||||
/// Reported memory size in bytes.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub memory_bytes: Option<u64>,
|
||||
/// One record per reported disk.
#[serde(default)]
|
||||
pub disks: Vec<DiskFact>,
|
||||
/// One record per reported network interface.
#[serde(default)]
|
||||
pub nics: Vec<NicFact>,
|
||||
/// DMI identification strings, when reported.
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub dmi: Option<DmiFact>,
|
||||
}
|
||||
|
||||
/// Node configuration returned by Deployer
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct NodeConfig {
|
||||
|
|
@ -136,6 +208,9 @@ pub struct PhoneHomeRequest {
|
|||
/// Node metadata
|
||||
#[serde(default)]
|
||||
pub metadata: HashMap<String, String>,
|
||||
/// Hardware inventory gathered by the bootstrap environment.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub hardware_facts: Option<HardwareFacts>,
|
||||
}
|
||||
|
||||
/// Phone Home response payload with secrets
|
||||
|
|
@ -414,6 +489,8 @@ pub struct ClusterNodeRecord {
|
|||
#[serde(default)]
|
||||
pub install_plan: Option<InstallPlan>,
|
||||
#[serde(default)]
|
||||
pub hardware_facts: Option<HardwareFacts>,
|
||||
#[serde(default)]
|
||||
pub state: Option<String>,
|
||||
#[serde(default)]
|
||||
pub last_heartbeat: Option<DateTime<Utc>>,
|
||||
|
|
@ -430,10 +507,18 @@ pub struct ObservedSystemState {
|
|||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub target_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub configured_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub current_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub booted_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub rollback_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub switch_action: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub reboot_required: Option<bool>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub status: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub last_attempt: Option<DateTime<Utc>>,
|
||||
|
|
@ -760,6 +845,32 @@ mod tests {
|
|||
ip: Some("10.0.1.10".to_string()),
|
||||
cluster_config_hash: Some("abc123".to_string()),
|
||||
metadata,
|
||||
hardware_facts: Some(HardwareFacts {
|
||||
architecture: Some("x86_64".to_string()),
|
||||
cpu_model: Some("Example CPU".to_string()),
|
||||
cpu_threads: Some(16),
|
||||
cpu_cores: Some(8),
|
||||
memory_bytes: Some(64 * 1024 * 1024 * 1024),
|
||||
disks: vec![DiskFact {
|
||||
name: "nvme0n1".to_string(),
|
||||
path: Some("/dev/nvme0n1".to_string()),
|
||||
by_id: Some("/dev/disk/by-id/nvme-example".to_string()),
|
||||
size_bytes: Some(1_000_000_000_000),
|
||||
model: Some("Example Disk".to_string()),
|
||||
serial: Some("disk-serial".to_string()),
|
||||
rotational: Some(false),
|
||||
}],
|
||||
nics: vec![NicFact {
|
||||
name: "eno1".to_string(),
|
||||
mac_address: Some("52:54:00:12:34:56".to_string()),
|
||||
oper_state: Some("up".to_string()),
|
||||
}],
|
||||
dmi: Some(DmiFact {
|
||||
vendor: Some("ExampleVendor".to_string()),
|
||||
product_name: Some("ExampleSystem".to_string()),
|
||||
serial_number: Some("system-serial".to_string()),
|
||||
}),
|
||||
}),
|
||||
};
|
||||
|
||||
let json = serde_json::to_string(&request).unwrap();
|
||||
|
|
@ -767,6 +878,14 @@ mod tests {
|
|||
assert_eq!(deserialized.machine_id, "abc123def456");
|
||||
assert_eq!(deserialized.node_id, Some("node01".to_string()));
|
||||
assert_eq!(deserialized.metadata.get("role").unwrap(), "control-plane");
|
||||
assert_eq!(
|
||||
deserialized
|
||||
.hardware_facts
|
||||
.as_ref()
|
||||
.and_then(|facts| facts.disks.first())
|
||||
.and_then(|disk| disk.by_id.as_deref()),
|
||||
Some("/dev/disk/by-id/nvme-example")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -785,6 +904,8 @@ mod tests {
|
|||
install_plan: Some(InstallPlan {
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/vda".to_string()),
|
||||
target_disk_by_id: None,
|
||||
}),
|
||||
};
|
||||
|
||||
|
|
@ -811,6 +932,7 @@ mod tests {
|
|||
.and_then(|config| config.install_plan.as_ref())
|
||||
.expect("install_plan should round-trip");
|
||||
assert_eq!(install_plan.nixos_configuration.as_deref(), Some("node01"));
|
||||
assert_eq!(install_plan.target_disk.as_deref(), Some("/dev/vda"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
@ -935,8 +1057,12 @@ mod tests {
|
|||
nixos_configuration: Some("node01".to_string()),
|
||||
flake_root: Some("/opt/plasmacloud-src".to_string()),
|
||||
target_system: Some("/nix/store/system-node01".to_string()),
|
||||
configured_system: Some("/nix/store/system-node01".to_string()),
|
||||
current_system: Some("/nix/store/system-old".to_string()),
|
||||
booted_system: Some("/nix/store/system-old".to_string()),
|
||||
rollback_system: Some("/nix/store/system-old".to_string()),
|
||||
switch_action: Some("boot".to_string()),
|
||||
reboot_required: Some(true),
|
||||
status: Some("pending".to_string()),
|
||||
last_attempt: None,
|
||||
last_success: None,
|
||||
|
|
@ -968,4 +1094,28 @@ mod tests {
|
|||
assert_eq!(decoded.health_check_command.len(), 2);
|
||||
assert_eq!(decoded.rollback_on_failure, Some(true));
|
||||
}
|
||||
|
||||
// Checks that merging layers disk fields like the other plan fields: values set
// on the preferred plan win, and unset ones are taken from the fallback.
#[test]
|
||||
fn test_install_plan_merges_disk_preferences() {
|
||||
// Fallback layer: sets everything except the by-id selector.
let fallback = InstallPlan {
|
||||
nixos_configuration: Some("fallback".to_string()),
|
||||
disko_config_path: Some("fallback/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/sda".to_string()),
|
||||
target_disk_by_id: None,
|
||||
};
|
||||
// Preferred layer: sets only the by-id selector.
let preferred = InstallPlan {
|
||||
nixos_configuration: None,
|
||||
disko_config_path: None,
|
||||
target_disk: None,
|
||||
target_disk_by_id: Some("/dev/disk/by-id/nvme-example".to_string()),
|
||||
};
|
||||
|
||||
let merged = preferred.merged_with(Some(&fallback));
|
||||
// Fields unset on the preferred plan come from the fallback.
assert_eq!(merged.nixos_configuration.as_deref(), Some("fallback"));
|
||||
assert_eq!(merged.target_disk.as_deref(), Some("/dev/sda"));
|
||||
// The by-id selector set on the preferred plan is kept.
assert_eq!(
|
||||
merged.target_disk_by_id.as_deref(),
|
||||
Some("/dev/disk/by-id/nvme-example")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -897,6 +897,7 @@ mod tests {
|
|||
failure_domain: Some(format!("rack-{}", &node_id[node_id.len() - 1..])),
|
||||
nix_profile: Some("profiles/worker-linux".to_string()),
|
||||
install_plan: None,
|
||||
hardware_facts: None,
|
||||
state: Some("active".to_string()),
|
||||
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,6 +103,12 @@ struct ResolvedDesiredSystem {
|
|||
rollback_on_failure: bool,
|
||||
}
|
||||
|
||||
/// Non-error result of `run_health_check_and_maybe_rollback`, telling the caller
/// whether the node stayed on the target system or was rolled back.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
enum HealthCheckOutcome {
|
||||
/// The health check command succeeded, or no health check command is configured.
Passed,
|
||||
/// The health check failed and the rollback to the previous system was performed.
RolledBack,
|
||||
}
|
||||
|
||||
impl Agent {
|
||||
fn new(cli: Cli) -> Self {
|
||||
Self {
|
||||
|
|
@ -155,9 +161,20 @@ impl Agent {
|
|||
.transpose()
|
||||
.context("failed to parse desired-system spec")?;
|
||||
|
||||
let previous_observed = client
|
||||
.get(key_observed_system(
|
||||
&self.cluster_namespace,
|
||||
&self.cluster_id,
|
||||
&self.node_id,
|
||||
))
|
||||
.await?
|
||||
.map(|bytes| serde_json::from_slice::<ObservedSystemState>(&bytes))
|
||||
.transpose()
|
||||
.context("failed to parse observed-system state")?;
|
||||
|
||||
let mut observed = self.base_observed_state(&node);
|
||||
let reconcile_result = self
|
||||
.reconcile_node(&node, desired.as_ref(), &mut observed)
|
||||
.reconcile_node(&node, desired.as_ref(), previous_observed.as_ref(), &mut observed)
|
||||
.await;
|
||||
if let Err(error) = reconcile_result {
|
||||
observed.status = Some("failed".to_string());
|
||||
|
|
@ -177,6 +194,7 @@ impl Agent {
|
|||
fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
|
||||
ObservedSystemState {
|
||||
node_id: node.node_id.clone(),
|
||||
configured_system: read_symlink_target("/nix/var/nix/profiles/system"),
|
||||
current_system: read_symlink_target("/run/current-system"),
|
||||
booted_system: read_symlink_target("/run/booted-system"),
|
||||
..ObservedSystemState::default()
|
||||
|
|
@ -187,6 +205,7 @@ impl Agent {
|
|||
&self,
|
||||
node: &ClusterNodeRecord,
|
||||
desired: Option<&DesiredSystemSpec>,
|
||||
previous_observed: Option<&ObservedSystemState>,
|
||||
observed: &mut ObservedSystemState,
|
||||
) -> Result<()> {
|
||||
match node.state.as_deref() {
|
||||
|
|
@ -211,8 +230,12 @@ impl Agent {
|
|||
|
||||
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
|
||||
observed.flake_root = Some(desired.flake_ref.clone());
|
||||
observed.switch_action = Some(desired.switch_action.clone());
|
||||
|
||||
let previous_system = observed.current_system.clone();
|
||||
let previous_system = previous_observed
|
||||
.and_then(|state| state.rollback_system.clone())
|
||||
.or_else(|| observed.current_system.clone());
|
||||
observed.rollback_system = previous_system.clone();
|
||||
let target_system = self
|
||||
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
|
||||
.await
|
||||
|
|
@ -225,6 +248,25 @@ impl Agent {
|
|||
observed.target_system = Some(target_system.clone());
|
||||
|
||||
if observed.current_system.as_deref() == Some(target_system.as_str()) {
|
||||
if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
|
||||
observed.status = Some("verifying".to_string());
|
||||
observed.last_attempt = Some(Utc::now());
|
||||
let outcome = self
|
||||
.run_health_check_and_maybe_rollback(
|
||||
&desired,
|
||||
previous_system.as_deref(),
|
||||
observed,
|
||||
)
|
||||
.await?;
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
observed.booted_system = read_symlink_target("/run/booted-system");
|
||||
if outcome == HealthCheckOutcome::RolledBack {
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
|
||||
observed.reboot_required = Some(false);
|
||||
observed.status = Some("active".to_string());
|
||||
observed.last_success = Some(Utc::now());
|
||||
return Ok(());
|
||||
|
|
@ -240,9 +282,24 @@ impl Agent {
|
|||
self.switch_to_target(&target_system, &desired.switch_action)
|
||||
.await?;
|
||||
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
observed.booted_system = read_symlink_target("/run/booted-system");
|
||||
|
||||
if desired.switch_action == "boot" {
|
||||
if observed.configured_system.as_deref() != Some(target_system.as_str()) {
|
||||
return Err(anyhow!(
|
||||
"boot switch completed but configured system does not match target {}",
|
||||
target_system
|
||||
));
|
||||
}
|
||||
|
||||
observed.reboot_required = Some(true);
|
||||
observed.status = Some("staged".to_string());
|
||||
observed.last_error = None;
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
if observed.current_system.as_deref() != Some(target_system.as_str()) {
|
||||
return Err(anyhow!(
|
||||
"switch completed but /run/current-system does not match target {}",
|
||||
|
|
@ -250,9 +307,17 @@ impl Agent {
|
|||
));
|
||||
}
|
||||
|
||||
self.run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed)
|
||||
let outcome = self
|
||||
.run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed)
|
||||
.await?;
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
observed.booted_system = read_symlink_target("/run/booted-system");
|
||||
if outcome == HealthCheckOutcome::RolledBack {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
observed.reboot_required = Some(false);
|
||||
observed.status = Some("active".to_string());
|
||||
observed.last_success = Some(Utc::now());
|
||||
observed.last_error = None;
|
||||
|
|
@ -299,26 +364,28 @@ impl Agent {
|
|||
desired: &ResolvedDesiredSystem,
|
||||
previous_system: Option<&str>,
|
||||
observed: &mut ObservedSystemState,
|
||||
) -> Result<()> {
|
||||
) -> Result<HealthCheckOutcome> {
|
||||
if desired.health_check_command.is_empty() {
|
||||
return Ok(());
|
||||
return Ok(HealthCheckOutcome::Passed);
|
||||
}
|
||||
|
||||
if let Err(error) = run_vec_command(&desired.health_check_command).await {
|
||||
let error_message = format!("health check failed after activation: {error}");
|
||||
if desired.rollback_on_failure {
|
||||
self.rollback_to_previous(previous_system).await?;
|
||||
observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
|
||||
observed.current_system = read_symlink_target("/run/current-system");
|
||||
observed.booted_system = read_symlink_target("/run/booted-system");
|
||||
observed.reboot_required = Some(false);
|
||||
observed.status = Some("rolled-back".to_string());
|
||||
observed.last_error = Some(error_message);
|
||||
return Ok(());
|
||||
return Ok(HealthCheckOutcome::RolledBack);
|
||||
}
|
||||
|
||||
return Err(anyhow!(error_message));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
Ok(HealthCheckOutcome::Passed)
|
||||
}
|
||||
|
||||
async fn rollback_to_previous(&self, previous_system: Option<&str>) -> Result<()> {
|
||||
|
|
@ -370,6 +437,20 @@ fn target_flake_attr(flake_root: &str, configuration: &str) -> String {
|
|||
)
|
||||
}
|
||||
|
||||
/// Decides whether a health check is still owed for a target that was staged
/// with the `boot` switch action.
///
/// Returns `true` only when all of the following hold:
/// - `desired.switch_action` is `"boot"`;
/// - a previously observed state exists and its `status` is `"staged"`;
/// - that state's `target_system` equals `target_system`.
///
/// Returns `false` in every other case, including when `previous_observed` is
/// `None`.
fn should_run_post_boot_health_check(
|
||||
previous_observed: Option<&ObservedSystemState>,
|
||||
desired: &ResolvedDesiredSystem,
|
||||
target_system: &str,
|
||||
) -> bool {
|
||||
desired.switch_action == "boot"
|
||||
&& previous_observed
|
||||
.map(|state| {
|
||||
state.status.as_deref() == Some("staged")
|
||||
&& state.target_system.as_deref() == Some(target_system)
|
||||
})
|
||||
// No previous observation means nothing was staged, so no check is owed.
.unwrap_or(false)
|
||||
}
|
||||
|
||||
fn read_symlink_target(path: &str) -> Option<String> {
|
||||
fs::read_link(path)
|
||||
.ok()
|
||||
|
|
@ -457,7 +538,10 @@ mod tests {
|
|||
install_plan: Some(InstallPlan {
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()),
|
||||
target_disk: Some("/dev/vda".to_string()),
|
||||
target_disk_by_id: None,
|
||||
}),
|
||||
hardware_facts: None,
|
||||
state: Some("active".to_string()),
|
||||
last_heartbeat: None,
|
||||
}
|
||||
|
|
@ -549,4 +633,48 @@ mod tests {
|
|||
fn read_symlink_target_returns_none_for_missing_path() {
|
||||
assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None);
|
||||
}
|
||||
|
||||
// A previous state that is "staged" for the same target, combined with the
// "boot" switch action, must request the post-boot health check.
#[test]
|
||||
fn post_boot_health_check_is_requested_for_matching_staged_target() {
|
||||
let desired = ResolvedDesiredSystem {
|
||||
nixos_configuration: "node01".to_string(),
|
||||
flake_ref: "/opt/plasmacloud-src".to_string(),
|
||||
switch_action: "boot".to_string(),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
rollback_on_failure: true,
|
||||
};
|
||||
// Previously observed state: staged, with the same target path as passed below.
let previous = ObservedSystemState {
|
||||
status: Some("staged".to_string()),
|
||||
target_system: Some("/nix/store/example-system".to_string()),
|
||||
..ObservedSystemState::default()
|
||||
};
|
||||
|
||||
assert!(should_run_post_boot_health_check(
|
||||
Some(&previous),
|
||||
&desired,
|
||||
"/nix/store/example-system"
|
||||
));
|
||||
}
|
||||
|
||||
// Same target and "boot" switch action, but the previous status is "active"
// rather than "staged", so no post-boot health check must be requested.
#[test]
|
||||
fn post_boot_health_check_is_skipped_for_non_matching_state() {
|
||||
let desired = ResolvedDesiredSystem {
|
||||
nixos_configuration: "node01".to_string(),
|
||||
flake_ref: "/opt/plasmacloud-src".to_string(),
|
||||
switch_action: "boot".to_string(),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
rollback_on_failure: true,
|
||||
};
|
||||
// Only the status differs from the matching-staged-target test.
let previous = ObservedSystemState {
|
||||
status: Some("active".to_string()),
|
||||
target_system: Some("/nix/store/example-system".to_string()),
|
||||
..ObservedSystemState::default()
|
||||
};
|
||||
|
||||
assert!(!should_run_post_boot_health_check(
|
||||
Some(&previous),
|
||||
&desired,
|
||||
"/nix/store/example-system"
|
||||
));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -173,6 +173,7 @@ node_classes:
|
|||
install_plan:
|
||||
nixos_configuration: worker-golden
|
||||
disko_config_path: profiles/worker-linux/disko.nix
|
||||
target_disk_by_id: /dev/disk/by-id/worker-default
|
||||
roles:
|
||||
- worker
|
||||
labels:
|
||||
|
|
@ -182,6 +183,7 @@ node_classes:
|
|||
install_plan:
|
||||
nixos_configuration: edge-metal
|
||||
disko_config_path: profiles/edge-metal/disko.nix
|
||||
target_disk_by_id: /dev/disk/by-id/edge-default
|
||||
roles:
|
||||
- edge
|
||||
labels:
|
||||
|
|
@ -208,6 +210,7 @@ nodes:
|
|||
install_plan:
|
||||
nixos_configuration: node01
|
||||
disko_config_path: nix/nodes/vm-cluster/node01/disko.nix
|
||||
target_disk: /dev/vda
|
||||
desired_system:
|
||||
flake_ref: "github:centra/cloud"
|
||||
health_check_command:
|
||||
|
|
@ -273,6 +276,7 @@ assert payload["node_config"]["node_class"] == "general-worker"
|
|||
assert payload["node_config"]["nix_profile"] == "profiles/worker-linux"
|
||||
assert payload["node_config"]["install_plan"]["nixos_configuration"] == "node01"
|
||||
assert payload["node_config"]["install_plan"]["disko_config_path"] == "nix/nodes/vm-cluster/node01/disko.nix"
|
||||
assert payload["node_config"]["install_plan"]["target_disk"] == "/dev/vda"
|
||||
assert payload["node_config"]["failure_domain"] == "rack-a"
|
||||
print("Seeded mapping validated")
|
||||
PY
|
||||
|
|
@ -339,6 +343,36 @@ request = urllib.request.Request(
|
|||
"sku": "metal",
|
||||
"topology.kubernetes.io/zone": "rack-z",
|
||||
},
|
||||
"hardware_facts": {
|
||||
"architecture": "x86_64",
|
||||
"cpu_model": "Example CPU",
|
||||
"cpu_threads": 32,
|
||||
"cpu_cores": 16,
|
||||
"memory_bytes": 137438953472,
|
||||
"disks": [
|
||||
{
|
||||
"name": "nvme0n1",
|
||||
"path": "/dev/nvme0n1",
|
||||
"by_id": "/dev/disk/by-id/nvme-dynamic-metal-01",
|
||||
"size_bytes": 2000398934016,
|
||||
"model": "Example NVMe",
|
||||
"serial": "disk-serial-01",
|
||||
"rotational": False
|
||||
}
|
||||
],
|
||||
"nics": [
|
||||
{
|
||||
"name": "eno1",
|
||||
"mac_address": "52:54:00:aa:bb:cc",
|
||||
"oper_state": "up"
|
||||
}
|
||||
],
|
||||
"dmi": {
|
||||
"vendor": "ExampleVendor",
|
||||
"product_name": "ExampleMetal",
|
||||
"serial_number": "dynamic-metal-serial"
|
||||
}
|
||||
},
|
||||
}
|
||||
).encode(),
|
||||
headers={
|
||||
|
|
@ -357,6 +391,7 @@ assert payload["node_config"]["node_class"] == "edge-metal"
|
|||
assert payload["node_config"]["nix_profile"] == "profiles/edge-metal"
|
||||
assert payload["node_config"]["install_plan"]["nixos_configuration"] == "edge-metal"
|
||||
assert payload["node_config"]["install_plan"]["disko_config_path"] == "profiles/edge-metal/disko.nix"
|
||||
assert payload["node_config"]["install_plan"]["target_disk_by_id"] == "/dev/disk/by-id/edge-default"
|
||||
assert "prismnet" in payload["node_config"]["services"]
|
||||
assert payload["node_config"]["labels"]["managed-by"] == "deployer"
|
||||
print(payload["node_id"])
|
||||
|
|
@ -400,6 +435,19 @@ if dynamic.get("failure_domain") != "rack-z":
|
|||
raise SystemExit(f"unexpected dynamic failure domain: {dynamic}")
|
||||
if dynamic.get("labels", {}).get("lane") != "edge":
|
||||
raise SystemExit(f"missing pool label propagation: {dynamic}")
|
||||
if seeded.get("install_plan", {}).get("target_disk") != "/dev/vda":
|
||||
raise SystemExit(f"missing seeded target disk: {seeded}")
|
||||
if dynamic.get("install_plan", {}).get("target_disk_by_id") != "/dev/disk/by-id/edge-default":
|
||||
raise SystemExit(f"missing dynamic target disk by-id: {dynamic}")
|
||||
# Validate the hardware facts stored for the dynamically enrolled node.
facts = dynamic.get("hardware_facts") or {}
if facts.get("architecture") != "x86_64":
    raise SystemExit(f"missing dynamic hardware architecture: {dynamic}")
# Use `or [{}]` rather than a .get() default: the default only applies when the
# key is absent, so a present-but-empty "disks" list would raise IndexError
# instead of reaching the descriptive SystemExit below.
disks = facts.get("disks") or [{}]
if disks[0].get("by_id") != "/dev/disk/by-id/nvme-dynamic-metal-01":
    raise SystemExit(f"missing dynamic hardware disk facts: {dynamic}")
|
||||
if dynamic.get("labels", {}).get("hardware.architecture") != "x86_64":
|
||||
raise SystemExit(f"missing hardware metadata labels: {dynamic}")
|
||||
if dynamic.get("labels", {}).get("hardware.disk_count") != "1":
|
||||
raise SystemExit(f"missing hardware disk count label: {dynamic}")
|
||||
|
||||
print("Deployer bootstrap records validated")
|
||||
PY
|
||||
|
|
|
|||
|
|
@ -44,8 +44,30 @@
|
|||
script = ''
|
||||
set -euo pipefail
|
||||
|
||||
# Discover Deployer via DNS or fallback
|
||||
DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}"
|
||||
# Print the value of the first kernel command-line argument of the form
# KEY=VALUE whose KEY equals $1. Returns 1 when no such argument is present.
cmdline_value() {
  local key="$1"
  local arg
  local -a args=()
  # Split /proc/cmdline on whitespace with `read -a` instead of an unquoted
  # command substitution, so values containing glob characters (for example a
  # URL with "?" or "*") are never subjected to pathname expansion.
  read -r -a args < /proc/cmdline || true
  for arg in "''${args[@]}"; do
    case "$arg" in
      "$key"=*)
        # Strip everything up to and including the first "="; printf avoids
        # echo misinterpreting values that look like options.
        printf '%s\n' "''${arg#*=}"
        return 0
        ;;
    esac
  done
  return 1
}
|
||||
|
||||
mkdir -p /etc/plasmacloud
|
||||
|
||||
# Discover Deployer via environment, kernel cmdline, or fallback.
|
||||
DEPLOYER_URL="''${DEPLOYER_URL:-}"
|
||||
if [ -z "$DEPLOYER_URL" ]; then
|
||||
DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
|
||||
fi
|
||||
if [ -z "$DEPLOYER_URL" ]; then
|
||||
DEPLOYER_URL="http://192.168.100.1:8080"
|
||||
fi
|
||||
|
||||
# Get machine identity
|
||||
MACHINE_ID=$(cat /etc/machine-id)
|
||||
|
|
@ -61,14 +83,27 @@
|
|||
DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
|
||||
elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
|
||||
DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
|
||||
else
|
||||
DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
|
||||
fi
|
||||
|
||||
DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
|
||||
if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
|
||||
DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
|
||||
if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
|
||||
DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
|
||||
${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
|
||||
"$DEPLOYER_CA_CERT_URL" \
|
||||
-o "$DEPLOYER_CA_CERT_PATH"
|
||||
fi
|
||||
fi
|
||||
|
||||
CURL_ARGS=(-sf --connect-timeout 5 --max-time 15)
|
||||
if [ -n "$DEPLOYER_TOKEN" ]; then
|
||||
CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
|
||||
fi
|
||||
if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then
|
||||
CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}")
|
||||
if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
|
||||
CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
|
||||
fi
|
||||
|
||||
NODE_IP=$(${pkgs.iproute2}/bin/ip -4 route get 1.1.1.1 2>/dev/null | ${pkgs.gawk}/bin/awk '{for(i=1;i<=NF;i++) if ($i=="src") {print $(i+1); exit}}')
|
||||
|
|
@ -79,6 +114,76 @@
|
|||
NODE_IP=$(hostname -I 2>/dev/null | ${pkgs.gawk}/bin/awk '{print $1}')
|
||||
fi
|
||||
NODE_HOSTNAME=$(hostname)
|
||||
CPU_MODEL=$(${pkgs.gawk}/bin/awk -F: '/model name/ {gsub(/^[ \t]+/, "", $2); print $2; exit}' /proc/cpuinfo 2>/dev/null || true)
|
||||
CPU_CORES=$(${pkgs.gawk}/bin/awk '/^cpu cores/ {print $4; exit}' /proc/cpuinfo 2>/dev/null || true)
|
||||
CPU_THREADS=$(${pkgs.coreutils}/bin/nproc --all 2>/dev/null || true)
|
||||
MEMORY_KIB=$(${pkgs.gawk}/bin/awk '/MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null || true)
|
||||
MEMORY_BYTES=""
|
||||
if [ -n "$MEMORY_KIB" ]; then
|
||||
MEMORY_BYTES=$((MEMORY_KIB * 1024))
|
||||
fi
|
||||
|
||||
# Collect an inventory of whole disks as a JSON array (sizes in bytes).
# - size_bytes falls back to null when the size is missing or not numeric;
#   a bare `tonumber?` would yield nothing and drop the entire disk entry.
# - lsblk reports ROTA as a JSON boolean on current util-linux and as the
#   string "0"/"1" on older releases, so every form is accepted; comparing
#   against the number 1 alone never matches and always yields false.
DISKS_JSON=$(${pkgs.util-linux}/bin/lsblk -J -b -o NAME,PATH,SIZE,MODEL,SERIAL,ROTA,TYPE 2>/dev/null | ${pkgs.jq}/bin/jq '
  [.blockdevices[] | select(.type == "disk") | {
    name: .name,
    path: (.path // null),
    size_bytes: ((.size | tonumber?) // null),
    model: ((.model // "") | if . == "" then null else . end),
    serial: ((.serial // "") | if . == "" then null else . end),
    rotational: (if .rota == null then null else (.rota == true or .rota == 1 or .rota == "1") end)
  }]
')
|
||||
NICS_JSON=$(${pkgs.iproute2}/bin/ip -j link 2>/dev/null | ${pkgs.jq}/bin/jq '
|
||||
[.[] | select(.ifname != "lo") | {
|
||||
name: .ifname,
|
||||
mac_address: ((.address // "") | if . == "" or . == "00:00:00:00:00:00" then null else . end),
|
||||
oper_state: ((.operstate // "") | ascii_downcase | if . == "" then null else . end)
|
||||
}]
|
||||
')
|
||||
DMI_VENDOR=$(tr -d '\n' </sys/class/dmi/id/sys_vendor 2>/dev/null || true)
|
||||
DMI_PRODUCT=$(tr -d '\n' </sys/class/dmi/id/product_name 2>/dev/null || true)
|
||||
DMI_SERIAL=$(tr -d '\n' </sys/class/dmi/id/product_serial 2>/dev/null || true)
|
||||
HARDWARE_FACTS=$(${pkgs.jq}/bin/jq -n \
|
||||
--arg architecture "$(${pkgs.coreutils}/bin/uname -m)" \
|
||||
--arg cpu_model "$CPU_MODEL" \
|
||||
--arg cpu_threads "$CPU_THREADS" \
|
||||
--arg cpu_cores "$CPU_CORES" \
|
||||
--arg memory_bytes "$MEMORY_BYTES" \
|
||||
--arg dmi_vendor "$DMI_VENDOR" \
|
||||
--arg dmi_product "$DMI_PRODUCT" \
|
||||
--arg dmi_serial "$DMI_SERIAL" \
|
||||
--argjson disks "$DISKS_JSON" \
|
||||
--argjson nics "$NICS_JSON" '
|
||||
{
|
||||
architecture: (if $architecture == "" then null else $architecture end),
|
||||
cpu_model: (if $cpu_model == "" then null else $cpu_model end),
|
||||
cpu_threads: (if $cpu_threads == "" then null else ($cpu_threads | tonumber) end),
|
||||
cpu_cores: (if $cpu_cores == "" then null else ($cpu_cores | tonumber) end),
|
||||
memory_bytes: (if $memory_bytes == "" then null else ($memory_bytes | tonumber) end),
|
||||
disks: $disks,
|
||||
nics: $nics,
|
||||
dmi: ({
|
||||
vendor: (if $dmi_vendor == "" then null else $dmi_vendor end),
|
||||
product_name: (if $dmi_product == "" then null else $dmi_product end),
|
||||
serial_number: (if $dmi_serial == "" then null else $dmi_serial end)
|
||||
} | with_entries(select(.value != null)))
|
||||
}
|
||||
| if (.dmi | length) == 0 then del(.dmi) else . end
|
||||
')
|
||||
REQUEST_JSON=$(${pkgs.jq}/bin/jq -n \
|
||||
--arg machine_id "$MACHINE_ID" \
|
||||
--arg node_id "$NODE_HOSTNAME" \
|
||||
--arg hostname "$NODE_HOSTNAME" \
|
||||
--arg ip "$NODE_IP" \
|
||||
--argjson hardware_facts "$HARDWARE_FACTS" '
|
||||
{
|
||||
machine_id: $machine_id,
|
||||
node_id: $node_id,
|
||||
hostname: $hostname,
|
||||
ip: $ip,
|
||||
hardware_facts: $hardware_facts
|
||||
}
|
||||
')
|
||||
|
||||
# Phone Home request with retry
|
||||
for i in 1 2 3 4 5; do
|
||||
|
|
@ -86,7 +191,7 @@
|
|||
|
||||
if RESPONSE=$(${pkgs.curl}/bin/curl "''${CURL_ARGS[@]}" -X POST \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"machine_id\": \"$MACHINE_ID\", \"node_id\": \"$NODE_HOSTNAME\", \"hostname\": \"$NODE_HOSTNAME\", \"ip\": \"$NODE_IP\"}" \
|
||||
-d "$REQUEST_JSON" \
|
||||
"$DEPLOYER_URL/api/v1/phone-home"); then
|
||||
|
||||
echo "✓ Phone Home successful"
|
||||
|
|
@ -177,6 +282,20 @@
|
|||
script = ''
|
||||
set -euo pipefail
|
||||
|
||||
# Print the value of the first kernel command-line argument of the form
# KEY=VALUE whose KEY equals $1. Returns 1 when no such argument is present.
cmdline_value() {
  local key="$1"
  local arg
  local -a args=()
  # Split /proc/cmdline on whitespace with `read -a` instead of an unquoted
  # command substitution, so values containing glob characters (for example a
  # URL with "?" or "*") are never subjected to pathname expansion.
  read -r -a args < /proc/cmdline || true
  for arg in "''${args[@]}"; do
    case "$arg" in
      "$key"=*)
        # Strip everything up to and including the first "="; printf avoids
        # echo misinterpreting values that look like options.
        printf '%s\n' "''${arg#*=}"
        return 0
        ;;
    esac
  done
  return 1
}
|
||||
|
||||
if [ ! -s /etc/plasmacloud/node-config.json ]; then
|
||||
echo "ERROR: node-config.json missing (bootstrap not complete?)"
|
||||
exit 1
|
||||
|
|
@ -186,7 +305,15 @@
|
|||
NODE_IP=$(${pkgs.jq}/bin/jq -r '.ip // empty' /etc/plasmacloud/node-config.json)
|
||||
NIXOS_CONFIGURATION=$(${pkgs.jq}/bin/jq -r '.install_plan.nixos_configuration // .hostname // empty' /etc/plasmacloud/node-config.json)
|
||||
DISKO_PATH=$(${pkgs.jq}/bin/jq -r '.install_plan.disko_config_path // empty' /etc/plasmacloud/node-config.json)
|
||||
DEPLOYER_URL="''${DEPLOYER_URL:-http://192.168.100.1:8080}"
|
||||
TARGET_DISK=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk // empty' /etc/plasmacloud/node-config.json)
|
||||
TARGET_DISK_BY_ID=$(${pkgs.jq}/bin/jq -r '.install_plan.target_disk_by_id // empty' /etc/plasmacloud/node-config.json)
|
||||
DEPLOYER_URL="''${DEPLOYER_URL:-}"
|
||||
if [ -z "$DEPLOYER_URL" ]; then
|
||||
DEPLOYER_URL="$(cmdline_value plasmacloud.deployer_url || true)"
|
||||
fi
|
||||
if [ -z "$DEPLOYER_URL" ]; then
|
||||
DEPLOYER_URL="http://192.168.100.1:8080"
|
||||
fi
|
||||
SRC_ROOT="/opt/plasmacloud-src"
|
||||
|
||||
if [ -z "$NODE_ID" ] || [ -z "$NODE_IP" ]; then
|
||||
|
|
@ -205,14 +332,27 @@
|
|||
DEPLOYER_TOKEN=$(cat "$TOKEN_FILE")
|
||||
elif [ -n "''${DEPLOYER_BOOTSTRAP_TOKEN:-}" ]; then
|
||||
DEPLOYER_TOKEN="''${DEPLOYER_BOOTSTRAP_TOKEN}"
|
||||
else
|
||||
DEPLOYER_TOKEN="$(cmdline_value plasmacloud.bootstrap_token || true)"
|
||||
fi
|
||||
|
||||
DEPLOYER_CA_CERT_PATH="''${DEPLOYER_CA_CERT:-}"
|
||||
if [ -z "$DEPLOYER_CA_CERT_PATH" ]; then
|
||||
DEPLOYER_CA_CERT_URL="$(cmdline_value plasmacloud.ca_cert_url || true)"
|
||||
if [ -n "$DEPLOYER_CA_CERT_URL" ]; then
|
||||
DEPLOYER_CA_CERT_PATH="/etc/plasmacloud/bootstrap-ca.crt"
|
||||
${pkgs.curl}/bin/curl -sfL --connect-timeout 5 --max-time 30 \
|
||||
"$DEPLOYER_CA_CERT_URL" \
|
||||
-o "$DEPLOYER_CA_CERT_PATH"
|
||||
fi
|
||||
fi
|
||||
|
||||
CURL_ARGS=(-sfL --connect-timeout 5 --max-time 120)
|
||||
if [ -n "$DEPLOYER_TOKEN" ]; then
|
||||
CURL_ARGS+=(-H "X-Deployer-Token: $DEPLOYER_TOKEN")
|
||||
fi
|
||||
if [ -n "''${DEPLOYER_CA_CERT:-}" ] && [ -f "''${DEPLOYER_CA_CERT}" ]; then
|
||||
CURL_ARGS+=(--cacert "''${DEPLOYER_CA_CERT}")
|
||||
if [ -n "$DEPLOYER_CA_CERT_PATH" ] && [ -f "$DEPLOYER_CA_CERT_PATH" ]; then
|
||||
CURL_ARGS+=(--cacert "$DEPLOYER_CA_CERT_PATH")
|
||||
fi
|
||||
|
||||
BUNDLE_PATH="/run/plasmacloud/flake-bundle.tar.gz"
|
||||
|
|
@ -247,18 +387,32 @@
|
|||
|
||||
echo "PlasmaCloud install starting for $NODE_ID (ip=$NODE_IP, nixos_configuration=$NIXOS_CONFIGURATION, disko_path=$DISKO_PATH)"
|
||||
|
||||
# Find disk
|
||||
# Resolve installation target disk.
|
||||
if [ -n "$TARGET_DISK_BY_ID" ]; then
|
||||
if [ ! -b "$TARGET_DISK_BY_ID" ]; then
|
||||
echo "ERROR: target_disk_by_id does not exist: $TARGET_DISK_BY_ID"
|
||||
exit 1
|
||||
fi
|
||||
DISK="$TARGET_DISK_BY_ID"
|
||||
elif [ -n "$TARGET_DISK" ]; then
|
||||
if [ ! -b "$TARGET_DISK" ]; then
|
||||
echo "ERROR: target_disk does not exist: $TARGET_DISK"
|
||||
exit 1
|
||||
fi
|
||||
DISK="$TARGET_DISK"
|
||||
else
|
||||
DISK=$(${pkgs.util-linux}/bin/lsblk -dpno NAME,TYPE | ${pkgs.gawk}/bin/awk '$2=="disk"{print $1; exit}')
|
||||
fi
|
||||
if [ -z "$DISK" ]; then
|
||||
echo "ERROR: No disk found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ROOT_PART="''${DISK}2"
|
||||
ROOT_PART=$(${pkgs.util-linux}/bin/lsblk -lnpo NAME,TYPE "$DISK" 2>/dev/null | ${pkgs.gawk}/bin/awk '$2=="part"{print $1}' | sed -n '2p')
|
||||
mkdir -p /mnt
|
||||
|
||||
# Skip if already installed
|
||||
if ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then
|
||||
if [ -n "$ROOT_PART" ] && ${pkgs.util-linux}/bin/lsblk -no FSTYPE "$ROOT_PART" 2>/dev/null | ${pkgs.gnugrep}/bin/grep -q '^ext4$'; then
|
||||
mount "$ROOT_PART" /mnt 2>/dev/null || true
|
||||
if [ -e /mnt/etc/NIXOS ]; then
|
||||
echo "✓ Existing NixOS detected; skipping install"
|
||||
|
|
@ -271,9 +425,25 @@
|
|||
echo "Validating NixOS configuration output..."
|
||||
nix eval --raw "$SRC_ROOT#nixosConfigurations.$NIXOS_CONFIGURATION.config.system.build.toplevel.drvPath" >/dev/null
|
||||
|
||||
EFFECTIVE_DISKO_PATH="$SRC_ROOT/$DISKO_PATH"
|
||||
if [ -n "$DISK" ]; then
|
||||
cat > /run/plasmacloud/disko-wrapper.nix <<EOF
|
||||
{ ... }:
|
||||
{
|
||||
imports = [
|
||||
"$SRC_ROOT/nix/modules/install-target.nix"
|
||||
"$SRC_ROOT/$DISKO_PATH"
|
||||
];
|
||||
|
||||
plasmacloud.install.diskDevice = "$DISK";
|
||||
}
|
||||
EOF
|
||||
EFFECTIVE_DISKO_PATH="/run/plasmacloud/disko-wrapper.nix"
|
||||
fi
|
||||
|
||||
echo "Running disko to partition $DISK..."
|
||||
export NIX_CONFIG="experimental-features = nix-command flakes"
|
||||
nix run github:nix-community/disko -- --mode disko "$SRC_ROOT/$DISKO_PATH"
|
||||
nix run github:nix-community/disko -- --mode disko "$EFFECTIVE_DISKO_PATH"
|
||||
|
||||
echo "Running nixos-install..."
|
||||
nixos-install --flake "$SRC_ROOT#$NIXOS_CONFIGURATION" --no-root-passwd
|
||||
|
|
|
|||
|
|
@ -16,6 +16,18 @@ let
|
|||
default = null;
|
||||
description = "Repository-relative Disko file used for installation";
|
||||
};
|
||||
|
||||
targetDisk = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Explicit disk device path selected for installation";
|
||||
};
|
||||
|
||||
targetDiskById = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Stable /dev/disk/by-id path selected for installation";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
|
|
@ -314,6 +326,12 @@ let
|
|||
}
|
||||
// optionalAttrs (plan != null && plan.diskoConfigPath != null) {
|
||||
disko_config_path = plan.diskoConfigPath;
|
||||
}
|
||||
// optionalAttrs (plan != null && plan.targetDisk != null) {
|
||||
target_disk = plan.targetDisk;
|
||||
}
|
||||
// optionalAttrs (plan != null && plan.targetDiskById != null) {
|
||||
target_disk_by_id = plan.targetDiskById;
|
||||
};
|
||||
in
|
||||
if plan == null || rendered == { } then null else rendered;
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
imports = [
|
||||
./chainfire.nix
|
||||
./plasmacloud-cluster.nix
|
||||
./install-target.nix
|
||||
./creditservice.nix
|
||||
./coronafs.nix
|
||||
./flaredb.nix
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@
|
|||
installPlan = {
|
||||
nixosConfiguration = "node01";
|
||||
diskoConfigPath = "nix/nodes/vm-cluster/node01/disko.nix";
|
||||
targetDisk = "/dev/vda";
|
||||
};
|
||||
desiredSystem = {
|
||||
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
|
||||
|
|
@ -45,6 +46,7 @@
|
|||
installPlan = {
|
||||
nixosConfiguration = "node02";
|
||||
diskoConfigPath = "nix/nodes/vm-cluster/node02/disko.nix";
|
||||
targetDisk = "/dev/vda";
|
||||
};
|
||||
desiredSystem = {
|
||||
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
|
||||
|
|
@ -69,6 +71,7 @@
|
|||
installPlan = {
|
||||
nixosConfiguration = "node03";
|
||||
diskoConfigPath = "nix/nodes/vm-cluster/node03/disko.nix";
|
||||
targetDisk = "/dev/vda";
|
||||
};
|
||||
desiredSystem = {
|
||||
healthCheckCommand = [ "systemctl" "is-system-running" "--wait" ];
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
{ lib, ... }:
|
||||
{ config, lib, ... }:
|
||||
|
||||
{
|
||||
disko.devices = {
|
||||
disk.main = {
|
||||
type = "disk";
|
||||
device = "/dev/vda";
|
||||
device = config.plasmacloud.install.diskDevice or "/dev/vda";
|
||||
content = {
|
||||
type = "gpt";
|
||||
partitions = {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
{ lib, ... }:
|
||||
{ config, lib, ... }:
|
||||
|
||||
{
|
||||
disko.devices = {
|
||||
disk.main = {
|
||||
type = "disk";
|
||||
device = "/dev/vda";
|
||||
device = config.plasmacloud.install.diskDevice or "/dev/vda";
|
||||
content = {
|
||||
type = "gpt";
|
||||
partitions = {
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
{ lib, ... }:
|
||||
{ config, lib, ... }:
|
||||
|
||||
{
|
||||
disko.devices = {
|
||||
disk.main = {
|
||||
type = "disk";
|
||||
device = "/dev/vda";
|
||||
device = config.plasmacloud.install.diskDevice or "/dev/vda";
|
||||
content = {
|
||||
type = "gpt";
|
||||
partitions = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue