Add prebuilt system closure support for host rollouts
Some checks failed
Nix CI / filter (push) Successful in 7s
Nix CI / gate () (push) Failing after 1s
Nix CI / gate (shared crates) (push) Has been skipped
Nix CI / build () (push) Has been skipped
Nix CI / ci-status (push) Failing after 1s

This commit is contained in:
centra 2026-03-30 13:54:14 +09:00
parent d6d96b8c37
commit 795b8ad70c
Signed by: centra
GPG key ID: 0C09689D20B25ACA
8 changed files with 230 additions and 78 deletions

View file

@ -48,3 +48,5 @@ nix run ./nix/test-cluster#cluster -- fresh-smoke
## Scope ## Scope
PhotonCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products. PhotonCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products.
Host-level NixOS rollout validation is also expected to stay reproducible: the `deployer-vm-smoke` VM test now proves that `nix-agent` can activate a prebuilt target system closure directly, without recompiling the stack inside the guest.

View file

@ -236,7 +236,7 @@ fn desired_system_from_spec(node: &NodeSpec) -> Option<DesiredSystemSpec> {
if desired.drain_before_apply.is_none() { if desired.drain_before_apply.is_none() {
desired.drain_before_apply = Some(false); desired.drain_before_apply = Some(false);
} }
if desired.nixos_configuration.is_some() { if desired.nixos_configuration.is_some() || desired.target_system.is_some() {
Some(desired) Some(desired)
} else { } else {
None None
@ -882,7 +882,9 @@ pub async fn inspect_node(
if let Some(observed_system) = observed_system { if let Some(observed_system) = observed_system {
println!( println!(
"observed_status={}", "observed_status={}",
observed_system.status.unwrap_or_else(|| "unknown".to_string()) observed_system
.status
.unwrap_or_else(|| "unknown".to_string())
); );
} }
} }
@ -1090,7 +1092,8 @@ pub async fn set_host_deployment_paused(
let deployment_name = deployment_name.to_string(); let deployment_name = deployment_name.to_string();
async move { async move {
let mut client = Client::connect(endpoint).await?; let mut client = Client::connect(endpoint).await?;
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name); let spec_key =
key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
if client.get(&spec_key).await?.is_none() { if client.get(&spec_key).await?.is_none() {
return Err(anyhow::anyhow!( return Err(anyhow::anyhow!(
"host deployment {} not found", "host deployment {} not found",
@ -1116,7 +1119,9 @@ pub async fn set_host_deployment_paused(
"resumed by operator".to_string() "resumed by operator".to_string()
}); });
status.updated_at = Some(Utc::now()); status.updated_at = Some(Utc::now());
client.put(&status_key, &serde_json::to_vec(&status)?).await?; client
.put(&status_key, &serde_json::to_vec(&status)?)
.await?;
println!("{}", serde_json::to_string_pretty(&status)?); println!("{}", serde_json::to_string_pretty(&status)?);
Ok(()) Ok(())
} }
@ -1138,7 +1143,8 @@ pub async fn abort_host_deployment(
let deployment_name = deployment_name.to_string(); let deployment_name = deployment_name.to_string();
async move { async move {
let mut client = Client::connect(endpoint).await?; let mut client = Client::connect(endpoint).await?;
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name); let spec_key =
key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
if client.get(&spec_key).await?.is_none() { if client.get(&spec_key).await?.is_none() {
return Err(anyhow::anyhow!( return Err(anyhow::anyhow!(
"host deployment {} not found", "host deployment {} not found",
@ -1512,6 +1518,7 @@ mod tests {
node_id: String::new(), node_id: String::new(),
deployment_id: None, deployment_id: None,
nixos_configuration: Some("node01-next".to_string()), nixos_configuration: Some("node01-next".to_string()),
target_system: Some("/nix/store/node01-next".to_string()),
flake_ref: Some("github:centra/cloud".to_string()), flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()), switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()], health_check_command: vec!["true".to_string()],
@ -1523,6 +1530,10 @@ mod tests {
let desired = desired_system_from_spec(&resolved[0]).expect("desired system should exist"); let desired = desired_system_from_spec(&resolved[0]).expect("desired system should exist");
assert_eq!(desired.node_id, "node01"); assert_eq!(desired.node_id, "node01");
assert_eq!(desired.nixos_configuration.as_deref(), Some("node01-next")); assert_eq!(desired.nixos_configuration.as_deref(), Some("node01-next"));
assert_eq!(
desired.target_system.as_deref(),
Some("/nix/store/node01-next")
);
assert_eq!(desired.flake_ref.as_deref(), Some("github:centra/cloud")); assert_eq!(desired.flake_ref.as_deref(), Some("github:centra/cloud"));
assert_eq!(desired.switch_action.as_deref(), Some("boot")); assert_eq!(desired.switch_action.as_deref(), Some("boot"));
assert_eq!(desired.health_check_command, vec!["true".to_string()]); assert_eq!(desired.health_check_command, vec!["true".to_string()]);

View file

@ -605,6 +605,9 @@ pub struct DesiredSystemSpec {
pub deployment_id: Option<String>, pub deployment_id: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub nixos_configuration: Option<String>, pub nixos_configuration: Option<String>,
/// Optional prebuilt NixOS system closure path to activate directly.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub target_system: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub flake_ref: Option<String>, pub flake_ref: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
@ -756,6 +759,9 @@ pub struct HostDeploymentSpec {
pub selector: HostDeploymentSelector, pub selector: HostDeploymentSelector,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub nixos_configuration: Option<String>, pub nixos_configuration: Option<String>,
/// Optional prebuilt NixOS system closure path handed directly to nix-agent.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub target_system: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
pub flake_ref: Option<String>, pub flake_ref: Option<String>,
#[serde(default, skip_serializing_if = "Option::is_none")] #[serde(default, skip_serializing_if = "Option::is_none")]
@ -1238,6 +1244,7 @@ mod tests {
node_id: "node01".to_string(), node_id: "node01".to_string(),
deployment_id: Some("worker-rollout".to_string()), deployment_id: Some("worker-rollout".to_string()),
nixos_configuration: Some("node01".to_string()), nixos_configuration: Some("node01".to_string()),
target_system: Some("/nix/store/system-node01".to_string()),
flake_ref: Some("/opt/plasmacloud-src".to_string()), flake_ref: Some("/opt/plasmacloud-src".to_string()),
switch_action: Some("switch".to_string()), switch_action: Some("switch".to_string()),
health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()], health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()],
@ -1250,6 +1257,10 @@ mod tests {
assert_eq!(decoded.node_id, "node01"); assert_eq!(decoded.node_id, "node01");
assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout")); assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout"));
assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01")); assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01"));
assert_eq!(
decoded.target_system.as_deref(),
Some("/nix/store/system-node01")
);
assert_eq!(decoded.health_check_command.len(), 2); assert_eq!(decoded.health_check_command.len(), 2);
assert_eq!(decoded.rollback_on_failure, Some(true)); assert_eq!(decoded.rollback_on_failure, Some(true));
assert_eq!(decoded.drain_before_apply, Some(true)); assert_eq!(decoded.drain_before_apply, Some(true));
@ -1267,6 +1278,7 @@ mod tests {
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]), match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
}, },
nixos_configuration: Some("worker-golden".to_string()), nixos_configuration: Some("worker-golden".to_string()),
target_system: Some("/nix/store/worker-golden".to_string()),
flake_ref: Some("/opt/plasmacloud-src".to_string()), flake_ref: Some("/opt/plasmacloud-src".to_string()),
batch_size: Some(1), batch_size: Some(1),
max_unavailable: Some(1), max_unavailable: Some(1),
@ -1283,9 +1295,17 @@ mod tests {
assert_eq!(decoded.name, "worker-rollout"); assert_eq!(decoded.name, "worker-rollout");
assert_eq!(decoded.batch_size, Some(1)); assert_eq!(decoded.batch_size, Some(1));
assert_eq!(decoded.max_unavailable, Some(1)); assert_eq!(decoded.max_unavailable, Some(1));
assert_eq!(
decoded.target_system.as_deref(),
Some("/nix/store/worker-golden")
);
assert_eq!(decoded.selector.roles, vec!["worker".to_string()]); assert_eq!(decoded.selector.roles, vec!["worker".to_string()]);
assert_eq!( assert_eq!(
decoded.selector.match_labels.get("tier").map(String::as_str), decoded
.selector
.match_labels
.get("tier")
.map(String::as_str),
Some("general") Some("general")
); );
assert_eq!(decoded.drain_before_apply, Some(true)); assert_eq!(decoded.drain_before_apply, Some(true));
@ -1318,10 +1338,16 @@ mod tests {
let json = serde_json::to_string(&node).unwrap(); let json = serde_json::to_string(&node).unwrap();
let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap(); let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap();
assert_eq!(decoded.commission_state, Some(CommissionState::Commissioned)); assert_eq!(
decoded.commission_state,
Some(CommissionState::Commissioned)
);
assert_eq!(decoded.install_state, Some(InstallState::Installed)); assert_eq!(decoded.install_state, Some(InstallState::Installed));
assert_eq!(decoded.power_state, Some(PowerState::On)); assert_eq!(decoded.power_state, Some(PowerState::On));
assert_eq!(decoded.bmc_ref.as_deref(), Some("redfish://lab-rack-a/node01")); assert_eq!(
decoded.bmc_ref.as_deref(),
Some("redfish://lab-rack-a/node01")
);
} }
#[test] #[test]

View file

@ -97,7 +97,8 @@ struct Agent {
#[derive(Debug, Clone, PartialEq, Eq)] #[derive(Debug, Clone, PartialEq, Eq)]
struct ResolvedDesiredSystem { struct ResolvedDesiredSystem {
nixos_configuration: String, nixos_configuration: Option<String>,
target_system: Option<String>,
flake_ref: String, flake_ref: String,
switch_action: String, switch_action: String,
health_check_command: Vec<String>, health_check_command: Vec<String>,
@ -298,8 +299,7 @@ impl Agent {
Some("draining") Some("draining")
if !desired if !desired
.map(|spec| { .map(|spec| {
spec.deployment_id.is_some() spec.deployment_id.is_some() && spec.drain_before_apply.unwrap_or(false)
&& spec.drain_before_apply.unwrap_or(false)
}) })
.unwrap_or(false) => .unwrap_or(false) =>
{ {
@ -321,7 +321,8 @@ impl Agent {
return Ok(()); return Ok(());
}; };
info!( info!(
nixos_configuration = %desired.nixos_configuration, nixos_configuration = desired.nixos_configuration.as_deref().unwrap_or(""),
target_system = desired.target_system.as_deref().unwrap_or(""),
flake_ref = %desired.flake_ref, flake_ref = %desired.flake_ref,
switch_action = %desired.switch_action, switch_action = %desired.switch_action,
rollback_on_failure = desired.rollback_on_failure, rollback_on_failure = desired.rollback_on_failure,
@ -329,7 +330,7 @@ impl Agent {
"resolved desired system" "resolved desired system"
); );
observed.nixos_configuration = Some(desired.nixos_configuration.clone()); observed.nixos_configuration = desired.nixos_configuration.clone();
observed.flake_root = Some(desired.flake_ref.clone()); observed.flake_root = Some(desired.flake_ref.clone());
observed.switch_action = Some(desired.switch_action.clone()); observed.switch_action = Some(desired.switch_action.clone());
@ -341,17 +342,25 @@ impl Agent {
previous_system = previous_system.as_deref().unwrap_or(""), previous_system = previous_system.as_deref().unwrap_or(""),
"selected rollback baseline" "selected rollback baseline"
); );
let target_system = self let target_system = match desired.target_system.as_deref() {
.build_target_system(&desired.flake_ref, &desired.nixos_configuration) Some(target_system) => {
info!(target_system, "using prebuilt target system");
target_system.to_string()
}
None => {
let configuration = desired
.nixos_configuration
.as_deref()
.ok_or_else(|| anyhow!("desired system did not specify nixos_configuration"))?;
self.build_target_system(&desired.flake_ref, configuration)
.await .await
.with_context(|| { .with_context(|| {
format!( format!("failed to build target system for {}", configuration)
"failed to build target system for {}", })?
desired.nixos_configuration }
) };
})?;
observed.target_system = Some(target_system.clone()); observed.target_system = Some(target_system.clone());
info!(target_system = %target_system, "built target system"); info!(target_system = %target_system, "resolved target system");
if observed.current_system.as_deref() == Some(target_system.as_str()) { if observed.current_system.as_deref() == Some(target_system.as_str()) {
info!("target system already active"); info!("target system already active");
@ -575,10 +584,16 @@ fn resolve_desired_system(
node.install_plan node.install_plan
.as_ref() .as_ref()
.and_then(|plan| plan.nixos_configuration.clone()) .and_then(|plan| plan.nixos_configuration.clone())
})?; });
let target_system = desired.and_then(|spec| spec.target_system.clone());
if nixos_configuration.is_none() && target_system.is_none() {
return None;
}
Some(ResolvedDesiredSystem { Some(ResolvedDesiredSystem {
nixos_configuration, nixos_configuration,
target_system,
flake_ref: desired flake_ref: desired
.and_then(|spec| spec.flake_ref.clone()) .and_then(|spec| spec.flake_ref.clone())
.unwrap_or_else(|| local_flake_root.to_string()), .unwrap_or_else(|| local_flake_root.to_string()),
@ -787,7 +802,8 @@ mod tests {
true, true,
) )
.expect("desired system should resolve"); .expect("desired system should resolve");
assert_eq!(resolved.nixos_configuration, "node01"); assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01"));
assert_eq!(resolved.target_system, None);
assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src"); assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src");
assert_eq!(resolved.switch_action, "switch"); assert_eq!(resolved.switch_action, "switch");
assert!(resolved.rollback_on_failure); assert!(resolved.rollback_on_failure);
@ -799,6 +815,7 @@ mod tests {
node_id: "node01".to_string(), node_id: "node01".to_string(),
deployment_id: None, deployment_id: None,
nixos_configuration: Some("node01-next".to_string()), nixos_configuration: Some("node01-next".to_string()),
target_system: None,
flake_ref: Some("github:centra/cloud".to_string()), flake_ref: Some("github:centra/cloud".to_string()),
switch_action: Some("boot".to_string()), switch_action: Some("boot".to_string()),
health_check_command: vec!["true".to_string()], health_check_command: vec!["true".to_string()],
@ -815,19 +832,52 @@ mod tests {
false, false,
) )
.expect("desired system should resolve"); .expect("desired system should resolve");
assert_eq!(resolved.nixos_configuration, "node01-next"); assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
assert_eq!(resolved.flake_ref, "github:centra/cloud"); assert_eq!(resolved.flake_ref, "github:centra/cloud");
assert_eq!(resolved.switch_action, "boot"); assert_eq!(resolved.switch_action, "boot");
assert_eq!(resolved.health_check_command, vec!["true".to_string()]); assert_eq!(resolved.health_check_command, vec!["true".to_string()]);
assert!(resolved.rollback_on_failure); assert!(resolved.rollback_on_failure);
} }
#[test]
// Verifies that a DesiredSystemSpec carrying a prebuilt `target_system` store
// path resolves successfully, and that the prebuilt path is surfaced on the
// ResolvedDesiredSystem alongside the (still present) nixos_configuration.
fn resolve_desired_system_accepts_prebuilt_target_system() {
let desired = DesiredSystemSpec {
node_id: "node01".to_string(),
deployment_id: None,
// Both a configuration name and a prebuilt closure path are provided;
// the agent is expected to prefer the prebuilt path at apply time.
nixos_configuration: Some("node01-next".to_string()),
target_system: Some("/nix/store/node01-next".to_string()),
// No flake_ref in the spec, so resolution should fall back to the
// local flake root passed to resolve_desired_system below.
flake_ref: None,
switch_action: Some("switch".to_string()),
health_check_command: Vec::new(),
rollback_on_failure: Some(true),
drain_before_apply: Some(false),
};
let resolved = resolve_desired_system(
&test_node(),
Some(&desired),
// Local defaults: flake root, switch action, health-check command,
// rollback-on-failure — used only where the spec leaves gaps.
"/opt/plasmacloud-src",
"switch",
&[],
true,
)
.expect("desired system should resolve");
assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
// The prebuilt closure path must pass through resolution untouched.
assert_eq!(
resolved.target_system.as_deref(),
Some("/nix/store/node01-next")
);
// flake_ref falls back to the local flake root when the spec omits it.
assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src");
}
#[test] #[test]
fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() { fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
let desired = DesiredSystemSpec { let desired = DesiredSystemSpec {
node_id: "node01".to_string(), node_id: "node01".to_string(),
deployment_id: None, deployment_id: None,
nixos_configuration: Some("node01-next".to_string()), nixos_configuration: Some("node01-next".to_string()),
target_system: None,
flake_ref: None, flake_ref: None,
switch_action: None, switch_action: None,
health_check_command: Vec::new(), health_check_command: Vec::new(),
@ -873,7 +923,8 @@ mod tests {
#[test] #[test]
fn post_boot_health_check_is_requested_for_matching_staged_target() { fn post_boot_health_check_is_requested_for_matching_staged_target() {
let desired = ResolvedDesiredSystem { let desired = ResolvedDesiredSystem {
nixos_configuration: "node01".to_string(), nixos_configuration: Some("node01".to_string()),
target_system: None,
flake_ref: "/opt/plasmacloud-src".to_string(), flake_ref: "/opt/plasmacloud-src".to_string(),
switch_action: "boot".to_string(), switch_action: "boot".to_string(),
health_check_command: vec!["true".to_string()], health_check_command: vec!["true".to_string()],
@ -895,7 +946,8 @@ mod tests {
#[test] #[test]
fn post_boot_health_check_is_skipped_for_non_matching_state() { fn post_boot_health_check_is_skipped_for_non_matching_state() {
let desired = ResolvedDesiredSystem { let desired = ResolvedDesiredSystem {
nixos_configuration: "node01".to_string(), nixos_configuration: Some("node01".to_string()),
target_system: None,
flake_ref: "/opt/plasmacloud-src".to_string(), flake_ref: "/opt/plasmacloud-src".to_string(),
switch_action: "boot".to_string(), switch_action: "boot".to_string(),
health_check_command: vec!["true".to_string()], health_check_command: vec!["true".to_string()],

View file

@ -4,7 +4,8 @@ use chrono::Utc;
use clap::Args; use clap::Args;
use deployer_types::{ use deployer_types::{
ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector, ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector,
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState, ServiceInstanceSpec, HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState,
ServiceInstanceSpec,
}; };
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
use std::time::Duration; use std::time::Duration;
@ -286,7 +287,9 @@ impl HostDeploymentController {
let key = String::from_utf8_lossy(&key); let key = String::from_utf8_lossy(&key);
match serde_json::from_slice::<ServiceInstanceSpec>(&value) { match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
Ok(instance) => instances.push(instance), Ok(instance) => instances.push(instance),
Err(error) => warn!(error = %error, key = %key, "failed to decode service instance"), Err(error) => {
warn!(error = %error, key = %key, "failed to decode service instance")
}
} }
} }
@ -336,7 +339,9 @@ impl HostDeploymentController {
Ok(status) => { Ok(status) => {
statuses.insert(status.name.clone(), status); statuses.insert(status.name.clone(), status);
} }
Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment status"), Err(error) => {
warn!(error = %error, key = %key, "failed to decode host deployment status")
}
} }
} }
@ -393,17 +398,28 @@ fn plan_host_deployment(
for node in &selector_matches { for node in &selector_matches {
let desired = desired_systems.get(&node.node_id); let desired = desired_systems.get(&node.node_id);
let observed = observed_systems.get(&node.node_id); let observed = observed_systems.get(&node.node_id);
let is_completed = let is_completed = is_node_completed(
is_node_completed(deployment, node, desired, observed, target_configuration.as_deref()); deployment,
node,
desired,
observed,
target_configuration.as_deref(),
);
let is_failed = is_node_failed(deployment, desired, observed); let is_failed = is_node_failed(deployment, desired, observed);
let is_in_progress = is_node_in_progress(deployment, desired, observed, is_completed, is_failed) let is_in_progress =
is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
|| (deployment.drain_before_apply == Some(true) || (deployment.drain_before_apply == Some(true)
&& node.state.as_deref() == Some("draining") && node.state.as_deref() == Some("draining")
&& instance_counts.get(&node.node_id).copied().unwrap_or_default() > 0); && instance_counts
.get(&node.node_id)
.copied()
.unwrap_or_default()
> 0);
if is_completed { if is_completed {
completed.push(node.node_id.clone()); completed.push(node.node_id.clone());
if deployment.drain_before_apply == Some(true) && node.state.as_deref() == Some("draining") if deployment.drain_before_apply == Some(true)
&& node.state.as_deref() == Some("draining")
{ {
let mut updated = (*node).clone(); let mut updated = (*node).clone();
updated.state = Some("active".to_string()); updated.state = Some("active".to_string());
@ -431,7 +447,8 @@ fn plan_host_deployment(
let paused = operator_paused || spec_paused || !failed.is_empty(); let paused = operator_paused || spec_paused || !failed.is_empty();
let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable); let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable);
let remaining_batch_budget = batch_size.saturating_sub(in_progress.len()); let remaining_batch_budget = batch_size.saturating_sub(in_progress.len());
let max_starts = if deployment.nixos_configuration.is_some() { let max_starts =
if deployment.nixos_configuration.is_some() || deployment.target_system.is_some() {
remaining_unavailable_budget.min(remaining_batch_budget) remaining_unavailable_budget.min(remaining_batch_budget)
} else { } else {
0 0
@ -445,7 +462,10 @@ fn plan_host_deployment(
break; break;
} }
let remaining_instances = instance_counts.get(&node.node_id).copied().unwrap_or_default(); let remaining_instances = instance_counts
.get(&node.node_id)
.copied()
.unwrap_or_default();
if deployment.drain_before_apply == Some(true) && remaining_instances > 0 { if deployment.drain_before_apply == Some(true) && remaining_instances > 0 {
let mut updated = node.clone(); let mut updated = node.clone();
updated.state = Some("draining".to_string()); updated.state = Some("draining".to_string());
@ -460,8 +480,12 @@ fn plan_host_deployment(
node_id: node.node_id.clone(), node_id: node.node_id.clone(),
deployment_id: Some(deployment.name.clone()), deployment_id: Some(deployment.name.clone()),
nixos_configuration: deployment.nixos_configuration.clone(), nixos_configuration: deployment.nixos_configuration.clone(),
target_system: deployment.target_system.clone(),
flake_ref: deployment.flake_ref.clone(), flake_ref: deployment.flake_ref.clone(),
switch_action: deployment.switch_action.clone().or_else(|| Some("switch".to_string())), switch_action: deployment
.switch_action
.clone()
.or_else(|| Some("switch".to_string())),
health_check_command: deployment.health_check_command.clone(), health_check_command: deployment.health_check_command.clone(),
rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)), rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)),
drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)), drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)),
@ -469,7 +493,8 @@ fn plan_host_deployment(
newly_started.push(node.node_id.clone()); newly_started.push(node.node_id.clone());
in_progress.push(node.node_id.clone()); in_progress.push(node.node_id.clone());
planned += 1; planned += 1;
if deployment.drain_before_apply == Some(true) && node.state.as_deref() != Some("draining") if deployment.drain_before_apply == Some(true)
&& node.state.as_deref() != Some("draining")
{ {
let mut updated = node.clone(); let mut updated = node.clone();
updated.state = Some("draining".to_string()); updated.state = Some("draining".to_string());
@ -481,15 +506,19 @@ fn plan_host_deployment(
let mut status = existing_status.cloned().unwrap_or_default(); let mut status = existing_status.cloned().unwrap_or_default();
status.name = deployment.name.clone(); status.name = deployment.name.clone();
status.selected_nodes = selector_matches.iter().map(|node| node.node_id.clone()).collect(); status.selected_nodes = selector_matches
.iter()
.map(|node| node.node_id.clone())
.collect();
status.completed_nodes = dedup_sorted(completed); status.completed_nodes = dedup_sorted(completed);
status.in_progress_nodes = dedup_sorted(in_progress); status.in_progress_nodes = dedup_sorted(in_progress);
status.failed_nodes = dedup_sorted(failed); status.failed_nodes = dedup_sorted(failed);
status.paused_by_operator = operator_paused; status.paused_by_operator = operator_paused;
status.paused = paused; status.paused = paused;
status.phase = Some(if status.selected_nodes.is_empty() { status.phase = Some(
if status.selected_nodes.is_empty() {
"idle" "idle"
} else if deployment.nixos_configuration.is_none() { } else if deployment.nixos_configuration.is_none() && deployment.target_system.is_none() {
"invalid" "invalid"
} else if status.paused { } else if status.paused {
"paused" "paused"
@ -500,7 +529,8 @@ fn plan_host_deployment(
} else { } else {
"ready" "ready"
} }
.to_string()); .to_string(),
);
status.message = Some(format!( status.message = Some(format!(
"selected={} completed={} in_progress={} failed={} newly_started={}", "selected={} completed={} in_progress={} failed={} newly_started={}",
status.selected_nodes.len(), status.selected_nodes.len(),
@ -585,9 +615,7 @@ fn node_is_rollout_candidate(node: &ClusterNodeRecord, heartbeat_timeout_secs: u
} }
if matches!( if matches!(
node.install_state, node.install_state,
Some( Some(InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested)
InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested
)
) { ) {
return false; return false;
} }
@ -612,9 +640,17 @@ fn is_node_completed(
) -> bool { ) -> bool {
observed observed
.filter(|observed| observed.status.as_deref() == Some("active")) .filter(|observed| observed.status.as_deref() == Some("active"))
.and_then(|observed| observed.nixos_configuration.as_deref()) .map(|observed| {
.zip(target_configuration) target_configuration
.map(|(observed_configuration, target)| observed_configuration == target) .map(|target| observed.nixos_configuration.as_deref() == Some(target))
.or_else(|| {
deployment
.target_system
.as_deref()
.map(|target| observed.target_system.as_deref() == Some(target))
})
.unwrap_or(false)
})
.unwrap_or(false) .unwrap_or(false)
&& desired && desired
.and_then(|desired| desired.deployment_id.as_deref()) .and_then(|desired| desired.deployment_id.as_deref())
@ -653,7 +689,12 @@ fn is_node_in_progress(
.unwrap_or(false) .unwrap_or(false)
|| observed || observed
.and_then(|observed| observed.status.as_deref()) .and_then(|observed| observed.status.as_deref())
.map(|status| matches!(status, "planning" | "pending" | "reconciling" | "verifying" | "staged")) .map(|status| {
matches!(
status,
"planning" | "pending" | "reconciling" | "verifying" | "staged"
)
})
.unwrap_or(false) .unwrap_or(false)
} }
@ -706,6 +747,7 @@ mod tests {
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]), match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
}, },
nixos_configuration: Some("worker-golden".to_string()), nixos_configuration: Some("worker-golden".to_string()),
target_system: Some("/nix/store/worker-golden".to_string()),
flake_ref: Some("/opt/plasmacloud-src".to_string()), flake_ref: Some("/opt/plasmacloud-src".to_string()),
batch_size: Some(1), batch_size: Some(1),
max_unavailable: Some(1), max_unavailable: Some(1),
@ -733,6 +775,10 @@ mod tests {
); );
assert_eq!(plan.desired_upserts.len(), 1); assert_eq!(plan.desired_upserts.len(), 1);
assert_eq!(
plan.desired_upserts[0].target_system.as_deref(),
Some("/nix/store/worker-golden")
);
assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]); assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
assert_eq!(plan.status.phase.as_deref(), Some("running")); assert_eq!(plan.status.phase.as_deref(), Some("running"));
} }
@ -747,6 +793,7 @@ mod tests {
node_id: "node01".to_string(), node_id: "node01".to_string(),
deployment_id: Some("worker-rollout".to_string()), deployment_id: Some("worker-rollout".to_string()),
nixos_configuration: Some("worker-golden".to_string()), nixos_configuration: Some("worker-golden".to_string()),
target_system: Some("/nix/store/worker-golden".to_string()),
flake_ref: None, flake_ref: None,
switch_action: Some("switch".to_string()), switch_action: Some("switch".to_string()),
health_check_command: Vec::new(), health_check_command: Vec::new(),
@ -764,15 +811,7 @@ mod tests {
}, },
)]); )]);
let plan = plan_host_deployment( let plan = plan_host_deployment(&deployment, None, &nodes, &desired, &observed, &[], 300);
&deployment,
None,
&nodes,
&desired,
&observed,
&[],
300,
);
assert!(plan.desired_upserts.is_empty()); assert!(plan.desired_upserts.is_empty());
assert!(plan.status.paused); assert!(plan.status.paused);

View file

@ -21,6 +21,7 @@ This flow:
nix run ./nix/test-cluster#cluster -- fresh-smoke nix run ./nix/test-cluster#cluster -- fresh-smoke
nix run ./nix/test-cluster#cluster -- fresh-matrix nix run ./nix/test-cluster#cluster -- fresh-matrix
nix run ./nix/test-cluster#cluster -- fresh-bench-storage nix run ./nix/test-cluster#cluster -- fresh-bench-storage
nix build .#checks.x86_64-linux.deployer-vm-smoke
``` ```
Use these three commands as the release-facing local proof set: Use these four commands as the release-facing local proof set:
@ -28,6 +29,7 @@ Use these three commands as the release-facing local proof set:
- `fresh-smoke`: whole-cluster readiness, core behavior, and fault injection - `fresh-smoke`: whole-cluster readiness, core behavior, and fault injection
- `fresh-matrix`: composed service scenarios such as `prismnet + flashdns + fiberlb` and PrismNet-backed VM hosting bundles with `plasmavmc + coronafs + lightningstor` - `fresh-matrix`: composed service scenarios such as `prismnet + flashdns + fiberlb` and PrismNet-backed VM hosting bundles with `plasmavmc + coronafs + lightningstor`
- `fresh-bench-storage`: CoronaFS local-vs-shared-volume throughput, cross-worker volume visibility, and LightningStor large/small-object throughput capture - `fresh-bench-storage`: CoronaFS local-vs-shared-volume throughput, cross-worker volume visibility, and LightningStor large/small-object throughput capture
- `deployer-vm-smoke`: prebuilt NixOS system closure handoff into `nix-agent`, proving host rollout can activate a host-built target without guest-side compilation
## Operational Commands ## Operational Commands

View file

@ -45,6 +45,12 @@ let
description = "Name of the nixosConfigurations output to activate"; description = "Name of the nixosConfigurations output to activate";
}; };
targetSystem = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional prebuilt NixOS system closure path activated directly by nix-agent";
};
flakeRef = mkOption { flakeRef = mkOption {
type = types.nullOr types.str; type = types.nullOr types.str;
default = null; default = null;
@ -128,6 +134,12 @@ let
description = "Name of the nixosConfigurations output to roll out"; description = "Name of the nixosConfigurations output to roll out";
}; };
targetSystem = mkOption {
type = types.nullOr types.str;
default = null;
description = "Optional prebuilt NixOS system closure path handed directly to nix-agent";
};
flakeRef = mkOption { flakeRef = mkOption {
type = types.nullOr types.str; type = types.nullOr types.str;
default = null; default = null;
@ -963,6 +975,9 @@ let
// optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) { // optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
nixos_configuration = desiredSystem.nixosConfiguration; nixos_configuration = desiredSystem.nixosConfiguration;
} }
// optionalAttrs (desiredSystem != null && desiredSystem.targetSystem != null) {
target_system = desiredSystem.targetSystem;
}
// optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) { // optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) {
flake_ref = desiredSystem.flakeRef; flake_ref = desiredSystem.flakeRef;
} }
@ -1000,6 +1015,9 @@ let
// optionalAttrs (deployment.nixosConfiguration != null) { // optionalAttrs (deployment.nixosConfiguration != null) {
nixos_configuration = deployment.nixosConfiguration; nixos_configuration = deployment.nixosConfiguration;
} }
// optionalAttrs (deployment.targetSystem != null) {
target_system = deployment.targetSystem;
}
// optionalAttrs (deployment.flakeRef != null) { // optionalAttrs (deployment.flakeRef != null) {
flake_ref = deployment.flakeRef; flake_ref = deployment.flakeRef;
} }

View file

@ -103,6 +103,7 @@ in
import time import time
desired_system_overrides = json.loads("""${desiredSystemOverridesJson}""") desired_system_overrides = json.loads("""${desiredSystemOverridesJson}""")
smoke_target_system = "${smokeTargetToplevel}"
def write_remote_json(machine, path, payload): def write_remote_json(machine, path, payload):
machine.succeed( machine.succeed(
@ -151,6 +152,7 @@ in
}, },
"desired_system": { "desired_system": {
"nixos_configuration": "vm-smoke-target", "nixos_configuration": "vm-smoke-target",
"target_system": smoke_target_system,
**desired_system_overrides, **desired_system_overrides,
}, },
"state": "pending", "state": "pending",
@ -387,7 +389,7 @@ in
assert observed["status"] == "${expectedStatus}", observed assert observed["status"] == "${expectedStatus}", observed
assert observed["nixos_configuration"] == "vm-smoke-target" assert observed["nixos_configuration"] == "vm-smoke-target"
assert observed["flake_root"] == "/var/lib/photon-src" assert observed["flake_root"] == "/var/lib/photon-src"
assert observed["target_system"].startswith("/nix/store/") assert observed["target_system"] == smoke_target_system
current_system = worker.succeed("readlink -f /run/current-system").strip() current_system = worker.succeed("readlink -f /run/current-system").strip()
print("worker_current_system=", current_system) print("worker_current_system=", current_system)
if ${if expectCurrentSystemMatchesTarget then "True" else "False"}: if ${if expectCurrentSystemMatchesTarget then "True" else "False"}: