Add prebuilt system closure support for host rollouts
This commit is contained in:
parent
d6d96b8c37
commit
795b8ad70c
8 changed files with 230 additions and 78 deletions
|
|
@ -48,3 +48,5 @@ nix run ./nix/test-cluster#cluster -- fresh-smoke
|
|||
## Scope
|
||||
|
||||
PhotonCloud is centered on reproducible infrastructure behavior rather than polished end-user product surfaces. Some services, such as `creditservice`, are intentionally minimal reference implementations that prove integration points rather than full products.
|
||||
|
||||
Host-level NixOS rollout validation is also expected to stay reproducible: the `deployer-vm-smoke` VM test now proves that `nix-agent` can activate a prebuilt target system closure directly, without recompiling the stack inside the guest.
|
||||
|
|
|
|||
|
|
@ -236,7 +236,7 @@ fn desired_system_from_spec(node: &NodeSpec) -> Option<DesiredSystemSpec> {
|
|||
if desired.drain_before_apply.is_none() {
|
||||
desired.drain_before_apply = Some(false);
|
||||
}
|
||||
if desired.nixos_configuration.is_some() {
|
||||
if desired.nixos_configuration.is_some() || desired.target_system.is_some() {
|
||||
Some(desired)
|
||||
} else {
|
||||
None
|
||||
|
|
@ -882,7 +882,9 @@ pub async fn inspect_node(
|
|||
if let Some(observed_system) = observed_system {
|
||||
println!(
|
||||
"observed_status={}",
|
||||
observed_system.status.unwrap_or_else(|| "unknown".to_string())
|
||||
observed_system
|
||||
.status
|
||||
.unwrap_or_else(|| "unknown".to_string())
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -1090,7 +1092,8 @@ pub async fn set_host_deployment_paused(
|
|||
let deployment_name = deployment_name.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
let spec_key =
|
||||
key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
if client.get(&spec_key).await?.is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"host deployment {} not found",
|
||||
|
|
@ -1116,7 +1119,9 @@ pub async fn set_host_deployment_paused(
|
|||
"resumed by operator".to_string()
|
||||
});
|
||||
status.updated_at = Some(Utc::now());
|
||||
client.put(&status_key, &serde_json::to_vec(&status)?).await?;
|
||||
client
|
||||
.put(&status_key, &serde_json::to_vec(&status)?)
|
||||
.await?;
|
||||
println!("{}", serde_json::to_string_pretty(&status)?);
|
||||
Ok(())
|
||||
}
|
||||
|
|
@ -1138,7 +1143,8 @@ pub async fn abort_host_deployment(
|
|||
let deployment_name = deployment_name.to_string();
|
||||
async move {
|
||||
let mut client = Client::connect(endpoint).await?;
|
||||
let spec_key = key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
let spec_key =
|
||||
key_host_deployment_spec(&cluster_namespace, &cluster_id, &deployment_name);
|
||||
if client.get(&spec_key).await?.is_none() {
|
||||
return Err(anyhow::anyhow!(
|
||||
"host deployment {} not found",
|
||||
|
|
@ -1512,6 +1518,7 @@ mod tests {
|
|||
node_id: String::new(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
target_system: Some("/nix/store/node01-next".to_string()),
|
||||
flake_ref: Some("github:centra/cloud".to_string()),
|
||||
switch_action: Some("boot".to_string()),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
|
|
@ -1523,6 +1530,10 @@ mod tests {
|
|||
let desired = desired_system_from_spec(&resolved[0]).expect("desired system should exist");
|
||||
assert_eq!(desired.node_id, "node01");
|
||||
assert_eq!(desired.nixos_configuration.as_deref(), Some("node01-next"));
|
||||
assert_eq!(
|
||||
desired.target_system.as_deref(),
|
||||
Some("/nix/store/node01-next")
|
||||
);
|
||||
assert_eq!(desired.flake_ref.as_deref(), Some("github:centra/cloud"));
|
||||
assert_eq!(desired.switch_action.as_deref(), Some("boot"));
|
||||
assert_eq!(desired.health_check_command, vec!["true".to_string()]);
|
||||
|
|
|
|||
|
|
@ -605,6 +605,9 @@ pub struct DesiredSystemSpec {
|
|||
pub deployment_id: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub nixos_configuration: Option<String>,
|
||||
/// Optional prebuilt NixOS system closure path to activate directly.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub target_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub flake_ref: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
|
|
@ -756,6 +759,9 @@ pub struct HostDeploymentSpec {
|
|||
pub selector: HostDeploymentSelector,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub nixos_configuration: Option<String>,
|
||||
/// Optional prebuilt NixOS system closure path handed directly to nix-agent.
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub target_system: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
pub flake_ref: Option<String>,
|
||||
#[serde(default, skip_serializing_if = "Option::is_none")]
|
||||
|
|
@ -1238,6 +1244,7 @@ mod tests {
|
|||
node_id: "node01".to_string(),
|
||||
deployment_id: Some("worker-rollout".to_string()),
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
target_system: Some("/nix/store/system-node01".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
switch_action: Some("switch".to_string()),
|
||||
health_check_command: vec!["systemctl".to_string(), "is-system-running".to_string()],
|
||||
|
|
@ -1250,6 +1257,10 @@ mod tests {
|
|||
assert_eq!(decoded.node_id, "node01");
|
||||
assert_eq!(decoded.deployment_id.as_deref(), Some("worker-rollout"));
|
||||
assert_eq!(decoded.nixos_configuration.as_deref(), Some("node01"));
|
||||
assert_eq!(
|
||||
decoded.target_system.as_deref(),
|
||||
Some("/nix/store/system-node01")
|
||||
);
|
||||
assert_eq!(decoded.health_check_command.len(), 2);
|
||||
assert_eq!(decoded.rollback_on_failure, Some(true));
|
||||
assert_eq!(decoded.drain_before_apply, Some(true));
|
||||
|
|
@ -1267,6 +1278,7 @@ mod tests {
|
|||
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
|
||||
},
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
target_system: Some("/nix/store/worker-golden".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
batch_size: Some(1),
|
||||
max_unavailable: Some(1),
|
||||
|
|
@ -1283,9 +1295,17 @@ mod tests {
|
|||
assert_eq!(decoded.name, "worker-rollout");
|
||||
assert_eq!(decoded.batch_size, Some(1));
|
||||
assert_eq!(decoded.max_unavailable, Some(1));
|
||||
assert_eq!(
|
||||
decoded.target_system.as_deref(),
|
||||
Some("/nix/store/worker-golden")
|
||||
);
|
||||
assert_eq!(decoded.selector.roles, vec!["worker".to_string()]);
|
||||
assert_eq!(
|
||||
decoded.selector.match_labels.get("tier").map(String::as_str),
|
||||
decoded
|
||||
.selector
|
||||
.match_labels
|
||||
.get("tier")
|
||||
.map(String::as_str),
|
||||
Some("general")
|
||||
);
|
||||
assert_eq!(decoded.drain_before_apply, Some(true));
|
||||
|
|
@ -1318,10 +1338,16 @@ mod tests {
|
|||
|
||||
let json = serde_json::to_string(&node).unwrap();
|
||||
let decoded: ClusterNodeRecord = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(decoded.commission_state, Some(CommissionState::Commissioned));
|
||||
assert_eq!(
|
||||
decoded.commission_state,
|
||||
Some(CommissionState::Commissioned)
|
||||
);
|
||||
assert_eq!(decoded.install_state, Some(InstallState::Installed));
|
||||
assert_eq!(decoded.power_state, Some(PowerState::On));
|
||||
assert_eq!(decoded.bmc_ref.as_deref(), Some("redfish://lab-rack-a/node01"));
|
||||
assert_eq!(
|
||||
decoded.bmc_ref.as_deref(),
|
||||
Some("redfish://lab-rack-a/node01")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
|
|
|||
|
|
@ -97,7 +97,8 @@ struct Agent {
|
|||
|
||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||
struct ResolvedDesiredSystem {
|
||||
nixos_configuration: String,
|
||||
nixos_configuration: Option<String>,
|
||||
target_system: Option<String>,
|
||||
flake_ref: String,
|
||||
switch_action: String,
|
||||
health_check_command: Vec<String>,
|
||||
|
|
@ -298,8 +299,7 @@ impl Agent {
|
|||
Some("draining")
|
||||
if !desired
|
||||
.map(|spec| {
|
||||
spec.deployment_id.is_some()
|
||||
&& spec.drain_before_apply.unwrap_or(false)
|
||||
spec.deployment_id.is_some() && spec.drain_before_apply.unwrap_or(false)
|
||||
})
|
||||
.unwrap_or(false) =>
|
||||
{
|
||||
|
|
@ -321,7 +321,8 @@ impl Agent {
|
|||
return Ok(());
|
||||
};
|
||||
info!(
|
||||
nixos_configuration = %desired.nixos_configuration,
|
||||
nixos_configuration = desired.nixos_configuration.as_deref().unwrap_or(""),
|
||||
target_system = desired.target_system.as_deref().unwrap_or(""),
|
||||
flake_ref = %desired.flake_ref,
|
||||
switch_action = %desired.switch_action,
|
||||
rollback_on_failure = desired.rollback_on_failure,
|
||||
|
|
@ -329,7 +330,7 @@ impl Agent {
|
|||
"resolved desired system"
|
||||
);
|
||||
|
||||
observed.nixos_configuration = Some(desired.nixos_configuration.clone());
|
||||
observed.nixos_configuration = desired.nixos_configuration.clone();
|
||||
observed.flake_root = Some(desired.flake_ref.clone());
|
||||
observed.switch_action = Some(desired.switch_action.clone());
|
||||
|
||||
|
|
@ -341,17 +342,25 @@ impl Agent {
|
|||
previous_system = previous_system.as_deref().unwrap_or(""),
|
||||
"selected rollback baseline"
|
||||
);
|
||||
let target_system = self
|
||||
.build_target_system(&desired.flake_ref, &desired.nixos_configuration)
|
||||
let target_system = match desired.target_system.as_deref() {
|
||||
Some(target_system) => {
|
||||
info!(target_system, "using prebuilt target system");
|
||||
target_system.to_string()
|
||||
}
|
||||
None => {
|
||||
let configuration = desired
|
||||
.nixos_configuration
|
||||
.as_deref()
|
||||
.ok_or_else(|| anyhow!("desired system did not specify nixos_configuration"))?;
|
||||
self.build_target_system(&desired.flake_ref, configuration)
|
||||
.await
|
||||
.with_context(|| {
|
||||
format!(
|
||||
"failed to build target system for {}",
|
||||
desired.nixos_configuration
|
||||
)
|
||||
})?;
|
||||
format!("failed to build target system for {}", configuration)
|
||||
})?
|
||||
}
|
||||
};
|
||||
observed.target_system = Some(target_system.clone());
|
||||
info!(target_system = %target_system, "built target system");
|
||||
info!(target_system = %target_system, "resolved target system");
|
||||
|
||||
if observed.current_system.as_deref() == Some(target_system.as_str()) {
|
||||
info!("target system already active");
|
||||
|
|
@ -575,10 +584,16 @@ fn resolve_desired_system(
|
|||
node.install_plan
|
||||
.as_ref()
|
||||
.and_then(|plan| plan.nixos_configuration.clone())
|
||||
})?;
|
||||
});
|
||||
let target_system = desired.and_then(|spec| spec.target_system.clone());
|
||||
|
||||
if nixos_configuration.is_none() && target_system.is_none() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(ResolvedDesiredSystem {
|
||||
nixos_configuration,
|
||||
target_system,
|
||||
flake_ref: desired
|
||||
.and_then(|spec| spec.flake_ref.clone())
|
||||
.unwrap_or_else(|| local_flake_root.to_string()),
|
||||
|
|
@ -787,7 +802,8 @@ mod tests {
|
|||
true,
|
||||
)
|
||||
.expect("desired system should resolve");
|
||||
assert_eq!(resolved.nixos_configuration, "node01");
|
||||
assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01"));
|
||||
assert_eq!(resolved.target_system, None);
|
||||
assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src");
|
||||
assert_eq!(resolved.switch_action, "switch");
|
||||
assert!(resolved.rollback_on_failure);
|
||||
|
|
@ -799,6 +815,7 @@ mod tests {
|
|||
node_id: "node01".to_string(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
target_system: None,
|
||||
flake_ref: Some("github:centra/cloud".to_string()),
|
||||
switch_action: Some("boot".to_string()),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
|
|
@ -815,19 +832,52 @@ mod tests {
|
|||
false,
|
||||
)
|
||||
.expect("desired system should resolve");
|
||||
assert_eq!(resolved.nixos_configuration, "node01-next");
|
||||
assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
|
||||
assert_eq!(resolved.flake_ref, "github:centra/cloud");
|
||||
assert_eq!(resolved.switch_action, "boot");
|
||||
assert_eq!(resolved.health_check_command, vec!["true".to_string()]);
|
||||
assert!(resolved.rollback_on_failure);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_desired_system_accepts_prebuilt_target_system() {
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
target_system: Some("/nix/store/node01-next".to_string()),
|
||||
flake_ref: None,
|
||||
switch_action: Some("switch".to_string()),
|
||||
health_check_command: Vec::new(),
|
||||
rollback_on_failure: Some(true),
|
||||
drain_before_apply: Some(false),
|
||||
};
|
||||
|
||||
let resolved = resolve_desired_system(
|
||||
&test_node(),
|
||||
Some(&desired),
|
||||
"/opt/plasmacloud-src",
|
||||
"switch",
|
||||
&[],
|
||||
true,
|
||||
)
|
||||
.expect("desired system should resolve");
|
||||
|
||||
assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
|
||||
assert_eq!(
|
||||
resolved.target_system.as_deref(),
|
||||
Some("/nix/store/node01-next")
|
||||
);
|
||||
assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
|
||||
let desired = DesiredSystemSpec {
|
||||
node_id: "node01".to_string(),
|
||||
deployment_id: None,
|
||||
nixos_configuration: Some("node01-next".to_string()),
|
||||
target_system: None,
|
||||
flake_ref: None,
|
||||
switch_action: None,
|
||||
health_check_command: Vec::new(),
|
||||
|
|
@ -873,7 +923,8 @@ mod tests {
|
|||
#[test]
|
||||
fn post_boot_health_check_is_requested_for_matching_staged_target() {
|
||||
let desired = ResolvedDesiredSystem {
|
||||
nixos_configuration: "node01".to_string(),
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
target_system: None,
|
||||
flake_ref: "/opt/plasmacloud-src".to_string(),
|
||||
switch_action: "boot".to_string(),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
|
|
@ -895,7 +946,8 @@ mod tests {
|
|||
#[test]
|
||||
fn post_boot_health_check_is_skipped_for_non_matching_state() {
|
||||
let desired = ResolvedDesiredSystem {
|
||||
nixos_configuration: "node01".to_string(),
|
||||
nixos_configuration: Some("node01".to_string()),
|
||||
target_system: None,
|
||||
flake_ref: "/opt/plasmacloud-src".to_string(),
|
||||
switch_action: "boot".to_string(),
|
||||
health_check_command: vec!["true".to_string()],
|
||||
|
|
|
|||
|
|
@ -4,7 +4,8 @@ use chrono::Utc;
|
|||
use clap::Args;
|
||||
use deployer_types::{
|
||||
ClusterNodeRecord, CommissionState, DesiredSystemSpec, HostDeploymentSelector,
|
||||
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState, ServiceInstanceSpec,
|
||||
HostDeploymentSpec, HostDeploymentStatus, InstallState, ObservedSystemState,
|
||||
ServiceInstanceSpec,
|
||||
};
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::time::Duration;
|
||||
|
|
@ -286,7 +287,9 @@ impl HostDeploymentController {
|
|||
let key = String::from_utf8_lossy(&key);
|
||||
match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
|
||||
Ok(instance) => instances.push(instance),
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode service instance"),
|
||||
Err(error) => {
|
||||
warn!(error = %error, key = %key, "failed to decode service instance")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -336,7 +339,9 @@ impl HostDeploymentController {
|
|||
Ok(status) => {
|
||||
statuses.insert(status.name.clone(), status);
|
||||
}
|
||||
Err(error) => warn!(error = %error, key = %key, "failed to decode host deployment status"),
|
||||
Err(error) => {
|
||||
warn!(error = %error, key = %key, "failed to decode host deployment status")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -393,17 +398,28 @@ fn plan_host_deployment(
|
|||
for node in &selector_matches {
|
||||
let desired = desired_systems.get(&node.node_id);
|
||||
let observed = observed_systems.get(&node.node_id);
|
||||
let is_completed =
|
||||
is_node_completed(deployment, node, desired, observed, target_configuration.as_deref());
|
||||
let is_completed = is_node_completed(
|
||||
deployment,
|
||||
node,
|
||||
desired,
|
||||
observed,
|
||||
target_configuration.as_deref(),
|
||||
);
|
||||
let is_failed = is_node_failed(deployment, desired, observed);
|
||||
let is_in_progress = is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
|
||||
let is_in_progress =
|
||||
is_node_in_progress(deployment, desired, observed, is_completed, is_failed)
|
||||
|| (deployment.drain_before_apply == Some(true)
|
||||
&& node.state.as_deref() == Some("draining")
|
||||
&& instance_counts.get(&node.node_id).copied().unwrap_or_default() > 0);
|
||||
&& instance_counts
|
||||
.get(&node.node_id)
|
||||
.copied()
|
||||
.unwrap_or_default()
|
||||
> 0);
|
||||
|
||||
if is_completed {
|
||||
completed.push(node.node_id.clone());
|
||||
if deployment.drain_before_apply == Some(true) && node.state.as_deref() == Some("draining")
|
||||
if deployment.drain_before_apply == Some(true)
|
||||
&& node.state.as_deref() == Some("draining")
|
||||
{
|
||||
let mut updated = (*node).clone();
|
||||
updated.state = Some("active".to_string());
|
||||
|
|
@ -431,7 +447,8 @@ fn plan_host_deployment(
|
|||
let paused = operator_paused || spec_paused || !failed.is_empty();
|
||||
let remaining_unavailable_budget = max_unavailable.saturating_sub(unavailable);
|
||||
let remaining_batch_budget = batch_size.saturating_sub(in_progress.len());
|
||||
let max_starts = if deployment.nixos_configuration.is_some() {
|
||||
let max_starts =
|
||||
if deployment.nixos_configuration.is_some() || deployment.target_system.is_some() {
|
||||
remaining_unavailable_budget.min(remaining_batch_budget)
|
||||
} else {
|
||||
0
|
||||
|
|
@ -445,7 +462,10 @@ fn plan_host_deployment(
|
|||
break;
|
||||
}
|
||||
|
||||
let remaining_instances = instance_counts.get(&node.node_id).copied().unwrap_or_default();
|
||||
let remaining_instances = instance_counts
|
||||
.get(&node.node_id)
|
||||
.copied()
|
||||
.unwrap_or_default();
|
||||
if deployment.drain_before_apply == Some(true) && remaining_instances > 0 {
|
||||
let mut updated = node.clone();
|
||||
updated.state = Some("draining".to_string());
|
||||
|
|
@ -460,8 +480,12 @@ fn plan_host_deployment(
|
|||
node_id: node.node_id.clone(),
|
||||
deployment_id: Some(deployment.name.clone()),
|
||||
nixos_configuration: deployment.nixos_configuration.clone(),
|
||||
target_system: deployment.target_system.clone(),
|
||||
flake_ref: deployment.flake_ref.clone(),
|
||||
switch_action: deployment.switch_action.clone().or_else(|| Some("switch".to_string())),
|
||||
switch_action: deployment
|
||||
.switch_action
|
||||
.clone()
|
||||
.or_else(|| Some("switch".to_string())),
|
||||
health_check_command: deployment.health_check_command.clone(),
|
||||
rollback_on_failure: Some(deployment.rollback_on_failure.unwrap_or(true)),
|
||||
drain_before_apply: Some(deployment.drain_before_apply.unwrap_or(false)),
|
||||
|
|
@ -469,7 +493,8 @@ fn plan_host_deployment(
|
|||
newly_started.push(node.node_id.clone());
|
||||
in_progress.push(node.node_id.clone());
|
||||
planned += 1;
|
||||
if deployment.drain_before_apply == Some(true) && node.state.as_deref() != Some("draining")
|
||||
if deployment.drain_before_apply == Some(true)
|
||||
&& node.state.as_deref() != Some("draining")
|
||||
{
|
||||
let mut updated = node.clone();
|
||||
updated.state = Some("draining".to_string());
|
||||
|
|
@ -481,15 +506,19 @@ fn plan_host_deployment(
|
|||
|
||||
let mut status = existing_status.cloned().unwrap_or_default();
|
||||
status.name = deployment.name.clone();
|
||||
status.selected_nodes = selector_matches.iter().map(|node| node.node_id.clone()).collect();
|
||||
status.selected_nodes = selector_matches
|
||||
.iter()
|
||||
.map(|node| node.node_id.clone())
|
||||
.collect();
|
||||
status.completed_nodes = dedup_sorted(completed);
|
||||
status.in_progress_nodes = dedup_sorted(in_progress);
|
||||
status.failed_nodes = dedup_sorted(failed);
|
||||
status.paused_by_operator = operator_paused;
|
||||
status.paused = paused;
|
||||
status.phase = Some(if status.selected_nodes.is_empty() {
|
||||
status.phase = Some(
|
||||
if status.selected_nodes.is_empty() {
|
||||
"idle"
|
||||
} else if deployment.nixos_configuration.is_none() {
|
||||
} else if deployment.nixos_configuration.is_none() && deployment.target_system.is_none() {
|
||||
"invalid"
|
||||
} else if status.paused {
|
||||
"paused"
|
||||
|
|
@ -500,7 +529,8 @@ fn plan_host_deployment(
|
|||
} else {
|
||||
"ready"
|
||||
}
|
||||
.to_string());
|
||||
.to_string(),
|
||||
);
|
||||
status.message = Some(format!(
|
||||
"selected={} completed={} in_progress={} failed={} newly_started={}",
|
||||
status.selected_nodes.len(),
|
||||
|
|
@ -585,9 +615,7 @@ fn node_is_rollout_candidate(node: &ClusterNodeRecord, heartbeat_timeout_secs: u
|
|||
}
|
||||
if matches!(
|
||||
node.install_state,
|
||||
Some(
|
||||
InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested
|
||||
)
|
||||
Some(InstallState::Installing | InstallState::Failed | InstallState::ReinstallRequested)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -612,9 +640,17 @@ fn is_node_completed(
|
|||
) -> bool {
|
||||
observed
|
||||
.filter(|observed| observed.status.as_deref() == Some("active"))
|
||||
.and_then(|observed| observed.nixos_configuration.as_deref())
|
||||
.zip(target_configuration)
|
||||
.map(|(observed_configuration, target)| observed_configuration == target)
|
||||
.map(|observed| {
|
||||
target_configuration
|
||||
.map(|target| observed.nixos_configuration.as_deref() == Some(target))
|
||||
.or_else(|| {
|
||||
deployment
|
||||
.target_system
|
||||
.as_deref()
|
||||
.map(|target| observed.target_system.as_deref() == Some(target))
|
||||
})
|
||||
.unwrap_or(false)
|
||||
})
|
||||
.unwrap_or(false)
|
||||
&& desired
|
||||
.and_then(|desired| desired.deployment_id.as_deref())
|
||||
|
|
@ -653,7 +689,12 @@ fn is_node_in_progress(
|
|||
.unwrap_or(false)
|
||||
|| observed
|
||||
.and_then(|observed| observed.status.as_deref())
|
||||
.map(|status| matches!(status, "planning" | "pending" | "reconciling" | "verifying" | "staged"))
|
||||
.map(|status| {
|
||||
matches!(
|
||||
status,
|
||||
"planning" | "pending" | "reconciling" | "verifying" | "staged"
|
||||
)
|
||||
})
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
|
|
@ -706,6 +747,7 @@ mod tests {
|
|||
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
|
||||
},
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
target_system: Some("/nix/store/worker-golden".to_string()),
|
||||
flake_ref: Some("/opt/plasmacloud-src".to_string()),
|
||||
batch_size: Some(1),
|
||||
max_unavailable: Some(1),
|
||||
|
|
@ -733,6 +775,10 @@ mod tests {
|
|||
);
|
||||
|
||||
assert_eq!(plan.desired_upserts.len(), 1);
|
||||
assert_eq!(
|
||||
plan.desired_upserts[0].target_system.as_deref(),
|
||||
Some("/nix/store/worker-golden")
|
||||
);
|
||||
assert_eq!(plan.status.in_progress_nodes, vec!["node01".to_string()]);
|
||||
assert_eq!(plan.status.phase.as_deref(), Some("running"));
|
||||
}
|
||||
|
|
@ -747,6 +793,7 @@ mod tests {
|
|||
node_id: "node01".to_string(),
|
||||
deployment_id: Some("worker-rollout".to_string()),
|
||||
nixos_configuration: Some("worker-golden".to_string()),
|
||||
target_system: Some("/nix/store/worker-golden".to_string()),
|
||||
flake_ref: None,
|
||||
switch_action: Some("switch".to_string()),
|
||||
health_check_command: Vec::new(),
|
||||
|
|
@ -764,15 +811,7 @@ mod tests {
|
|||
},
|
||||
)]);
|
||||
|
||||
let plan = plan_host_deployment(
|
||||
&deployment,
|
||||
None,
|
||||
&nodes,
|
||||
&desired,
|
||||
&observed,
|
||||
&[],
|
||||
300,
|
||||
);
|
||||
let plan = plan_host_deployment(&deployment, None, &nodes, &desired, &observed, &[], 300);
|
||||
|
||||
assert!(plan.desired_upserts.is_empty());
|
||||
assert!(plan.status.paused);
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ This flow:
|
|||
nix run ./nix/test-cluster#cluster -- fresh-smoke
|
||||
nix run ./nix/test-cluster#cluster -- fresh-matrix
|
||||
nix run ./nix/test-cluster#cluster -- fresh-bench-storage
|
||||
nix build .#checks.x86_64-linux.deployer-vm-smoke
|
||||
```
|
||||
|
||||
Use these four commands as the release-facing local proof set:
|
||||
|
|
@ -28,6 +29,7 @@ Use these three commands as the release-facing local proof set:
|
|||
- `fresh-smoke`: whole-cluster readiness, core behavior, and fault injection
|
||||
- `fresh-matrix`: composed service scenarios such as `prismnet + flashdns + fiberlb` and PrismNet-backed VM hosting bundles with `plasmavmc + coronafs + lightningstor`
|
||||
- `fresh-bench-storage`: CoronaFS local-vs-shared-volume throughput, cross-worker volume visibility, and LightningStor large/small-object throughput capture
|
||||
- `deployer-vm-smoke`: prebuilt NixOS system closure handoff into `nix-agent`, proving host rollout can activate a host-built target without guest-side compilation
|
||||
|
||||
## Operational Commands
|
||||
|
||||
|
|
|
|||
|
|
@ -45,6 +45,12 @@ let
|
|||
description = "Name of the nixosConfigurations output to activate";
|
||||
};
|
||||
|
||||
targetSystem = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional prebuilt NixOS system closure path activated directly by nix-agent";
|
||||
};
|
||||
|
||||
flakeRef = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
|
|
@ -128,6 +134,12 @@ let
|
|||
description = "Name of the nixosConfigurations output to roll out";
|
||||
};
|
||||
|
||||
targetSystem = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
description = "Optional prebuilt NixOS system closure path handed directly to nix-agent";
|
||||
};
|
||||
|
||||
flakeRef = mkOption {
|
||||
type = types.nullOr types.str;
|
||||
default = null;
|
||||
|
|
@ -963,6 +975,9 @@ let
|
|||
// optionalAttrs (desiredSystem != null && desiredSystem.nixosConfiguration != null) {
|
||||
nixos_configuration = desiredSystem.nixosConfiguration;
|
||||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.targetSystem != null) {
|
||||
target_system = desiredSystem.targetSystem;
|
||||
}
|
||||
// optionalAttrs (desiredSystem != null && desiredSystem.flakeRef != null) {
|
||||
flake_ref = desiredSystem.flakeRef;
|
||||
}
|
||||
|
|
@ -1000,6 +1015,9 @@ let
|
|||
// optionalAttrs (deployment.nixosConfiguration != null) {
|
||||
nixos_configuration = deployment.nixosConfiguration;
|
||||
}
|
||||
// optionalAttrs (deployment.targetSystem != null) {
|
||||
target_system = deployment.targetSystem;
|
||||
}
|
||||
// optionalAttrs (deployment.flakeRef != null) {
|
||||
flake_ref = deployment.flakeRef;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -103,6 +103,7 @@ in
|
|||
import time
|
||||
|
||||
desired_system_overrides = json.loads("""${desiredSystemOverridesJson}""")
|
||||
smoke_target_system = "${smokeTargetToplevel}"
|
||||
|
||||
def write_remote_json(machine, path, payload):
|
||||
machine.succeed(
|
||||
|
|
@ -151,6 +152,7 @@ in
|
|||
},
|
||||
"desired_system": {
|
||||
"nixos_configuration": "vm-smoke-target",
|
||||
"target_system": smoke_target_system,
|
||||
**desired_system_overrides,
|
||||
},
|
||||
"state": "pending",
|
||||
|
|
@ -387,7 +389,7 @@ in
|
|||
assert observed["status"] == "${expectedStatus}", observed
|
||||
assert observed["nixos_configuration"] == "vm-smoke-target"
|
||||
assert observed["flake_root"] == "/var/lib/photon-src"
|
||||
assert observed["target_system"].startswith("/nix/store/")
|
||||
assert observed["target_system"] == smoke_target_system
|
||||
current_system = worker.succeed("readlink -f /run/current-system").strip()
|
||||
print("worker_current_system=", current_system)
|
||||
if ${if expectCurrentSystemMatchesTarget then "True" else "False"}:
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue