//! nix-agent: reconciles a node's NixOS system against the desired-system
//! spec stored in ChainFire, optionally building, switching, health-checking
//! and rolling back, then publishes the observed system state.
use std::fs;
|
|
use std::path::Path;
|
|
use std::process::Stdio;
|
|
use std::time::Duration;
|
|
use std::time::Instant;
|
|
|
|
use anyhow::{anyhow, Context, Result};
|
|
use chainfire_client::Client;
|
|
use chrono::Utc;
|
|
use clap::Parser;
|
|
use deployer_types::{ClusterNodeRecord, DesiredSystemSpec, ObservedSystemState};
|
|
use tokio::process::Command;
|
|
use tokio::time::sleep;
|
|
use tracing::{info, warn};
|
|
use tracing_subscriber::EnvFilter;
|
|
|
|
/// Key prefix under which all state for a single cluster is stored.
fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    format!("{cluster_namespace}/clusters/{cluster_id}/")
}
|
|
|
|
/// ChainFire key holding the `ClusterNodeRecord` for one node.
fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}");
    key.into_bytes()
}
|
|
|
|
/// ChainFire key holding the `DesiredSystemSpec` for one node.
fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/desired-system");
    key.into_bytes()
}
|
|
|
|
/// ChainFire key holding the `ObservedSystemState` this agent publishes.
fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec<u8> {
    let key = format!("{cluster_namespace}/clusters/{cluster_id}/nodes/{node_id}/observed-system");
    key.into_bytes()
}
|
|
|
|
/// Command-line configuration for the nix-agent.
///
/// Most values are per-node defaults; a `DesiredSystemSpec` fetched from
/// ChainFire can override `switch_action`, `health_check_command`, and
/// `rollback_on_failure` (see `resolve_desired_system`).
#[derive(Parser, Debug)]
#[command(author, version, about)]
struct Cli {
    /// ChainFire endpoint used for all reads and writes.
    #[arg(long, default_value = "http://127.0.0.1:2379")]
    chainfire_endpoint: String,

    /// Top-level namespace segment of every ChainFire key.
    #[arg(long, default_value = "ultracloud")]
    cluster_namespace: String,

    /// Cluster this node belongs to (required).
    #[arg(long)]
    cluster_id: String,

    /// Identity of the node this agent manages (required).
    #[arg(long)]
    node_id: String,

    /// Local flake used when the desired spec does not name a flake_ref.
    #[arg(long, default_value = "/etc/nixos")]
    flake_root: String,

    /// Seconds to sleep between reconciliation ticks in loop mode.
    #[arg(long, default_value_t = 30)]
    interval_secs: u64,

    /// Default switch-to-configuration action ("switch" or "boot").
    #[arg(long, default_value = "switch")]
    switch_action: String,

    /// Default post-activation health check; allow_hyphen_values lets
    /// command flags like `--quiet` pass through unparsed.
    #[arg(long, allow_hyphen_values = true)]
    health_check_command: Vec<String>,

    /// Default for rolling back to the previous system on health-check failure.
    #[arg(long, default_value_t = false)]
    rollback_on_failure: bool,

    /// When false the agent only reports "pending" and never switches.
    #[arg(long, default_value_t = false)]
    apply: bool,

    /// Run a single tick and exit instead of looping.
    #[arg(long, default_value_t = false)]
    once: bool,
}
|
|
|
|
/// Runtime state of the reconciliation agent, built once from `Cli`.
struct Agent {
    // ChainFire endpoint; each tick (and each retry) opens a fresh connection.
    endpoint: String,
    cluster_namespace: String,
    cluster_id: String,
    node_id: String,
    // Local fallback flake ref when the desired spec omits one.
    flake_root: String,
    // Sleep between ticks in `run_loop`.
    interval: Duration,
    // Local default switch action, overridable per desired spec.
    switch_action: String,
    // Local default health check, used when the spec's command is empty.
    health_check_command: Vec<String>,
    rollback_on_failure: bool,
    // Gate: when false, reconcile stops at "pending" and never switches.
    apply: bool,
}
|
|
|
|
/// Desired system after merging the ChainFire spec, the node's install plan,
/// and the agent's local CLI defaults (see `resolve_desired_system`).
#[derive(Debug, Clone, PartialEq, Eq)]
struct ResolvedDesiredSystem {
    // Flake attribute name to build when no prebuilt path is given.
    nixos_configuration: Option<String>,
    // Prebuilt /nix/store path; when set, no build is performed.
    target_system: Option<String>,
    flake_ref: String,
    // switch-to-configuration action ("switch" applies now, "boot" stages).
    switch_action: String,
    // Post-activation health check; empty means "skip the check".
    health_check_command: Vec<String>,
    rollback_on_failure: bool,
}
|
|
|
|
/// Result of `run_health_check_and_maybe_rollback`: either the check passed
/// (or was skipped), or the check failed and the previous system was restored.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum HealthCheckOutcome {
    Passed,
    RolledBack,
}
|
|
|
|
impl Agent {
    /// Build the agent from parsed CLI arguments, converting the interval
    /// to a `Duration` and dropping the `once` flag (handled in `main`).
    fn new(cli: Cli) -> Self {
        Self {
            endpoint: cli.chainfire_endpoint,
            cluster_namespace: cli.cluster_namespace,
            cluster_id: cli.cluster_id,
            node_id: cli.node_id,
            flake_root: cli.flake_root,
            interval: Duration::from_secs(cli.interval_secs),
            switch_action: cli.switch_action,
            health_check_command: cli.health_check_command,
            rollback_on_failure: cli.rollback_on_failure,
            apply: cli.apply,
        }
    }

    /// Tick forever; a failed tick is logged and retried next interval
    /// rather than terminating the agent. Never returns Ok.
    async fn run_loop(&self) -> Result<()> {
        loop {
            if let Err(error) = self.tick().await {
                warn!(error = %error, "nix-agent tick failed");
            }
            sleep(self.interval).await;
        }
    }

    /// One reconciliation pass: load node record, desired spec and previous
    /// observed state from ChainFire, publish a "planning" status, run
    /// `reconcile_node`, then publish the final observed state (with retry).
    async fn tick(&self) -> Result<()> {
        info!(
            endpoint = %self.endpoint,
            cluster_namespace = %self.cluster_namespace,
            cluster_id = %self.cluster_id,
            node_id = %self.node_id,
            "starting reconciliation tick"
        );
        let mut client = Client::connect(self.endpoint.clone()).await?;
        info!("connected to ChainFire");
        let node_key = key_node(&self.cluster_namespace, &self.cluster_id, &self.node_id);
        let node_raw = client.get_with_revision(&node_key).await?;
        // A missing node record is not an error: the node may not be
        // registered yet, so skip this tick quietly.
        let Some((node_bytes, _revision)) = node_raw else {
            warn!(
                cluster_id = %self.cluster_id,
                node_id = %self.node_id,
                "node definition not found; skipping nix reconciliation"
            );
            return Ok(());
        };

        let node: ClusterNodeRecord =
            serde_json::from_slice(&node_bytes).context("failed to parse node record")?;
        info!(
            hostname = %node.hostname,
            state = node.state.as_deref().unwrap_or("unknown"),
            "loaded node record"
        );

        // Desired spec is optional; absence is handled by resolve_desired_system.
        let desired = client
            .get(key_desired_system(
                &self.cluster_namespace,
                &self.cluster_id,
                &self.node_id,
            ))
            .await?
            .map(|bytes| serde_json::from_slice::<DesiredSystemSpec>(&bytes))
            .transpose()
            .context("failed to parse desired-system spec")?;
        info!(
            has_desired_system = desired.is_some(),
            has_install_plan = node.install_plan.is_some(),
            "resolved desired-state inputs"
        );

        // Previous observed state carries the rollback baseline and the
        // "staged" marker used for post-boot health checks.
        let previous_observed = client
            .get(key_observed_system(
                &self.cluster_namespace,
                &self.cluster_id,
                &self.node_id,
            ))
            .await?
            .map(|bytes| serde_json::from_slice::<ObservedSystemState>(&bytes))
            .transpose()
            .context("failed to parse observed-system state")?;

        let mut observed = self.base_observed_state(&node);
        observed.status = Some("planning".to_string());
        info!(
            current_system = observed.current_system.as_deref().unwrap_or(""),
            configured_system = observed.configured_system.as_deref().unwrap_or(""),
            booted_system = observed.booted_system.as_deref().unwrap_or(""),
            "publishing planning status"
        );
        self.publish_observed_state(&mut client, &observed).await?;
        let reconcile_result = self
            .reconcile_node(
                &node,
                desired.as_ref(),
                previous_observed.as_ref(),
                &mut observed,
            )
            .await;
        // Reconcile failure is captured in the observed state rather than
        // aborting the tick, so the final publish below still happens.
        if let Err(error) = reconcile_result {
            observed.status = Some("failed".to_string());
            observed.last_error = Some(format!("{error:#}"));
        }

        info!(
            status = observed.status.as_deref().unwrap_or("unknown"),
            "publishing final observed status"
        );
        // Retrying variant: a switch may have disrupted the original
        // connection, so the final status is published on fresh connections.
        self.publish_observed_state_with_retry(&observed).await?;

        Ok(())
    }

    /// Write the observed-system state using an existing client connection.
    async fn publish_observed_state(
        &self,
        client: &mut Client,
        observed: &ObservedSystemState,
    ) -> Result<()> {
        info!(
            status = observed.status.as_deref().unwrap_or("unknown"),
            "writing observed-system state"
        );
        client
            .put(
                &key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id),
                &serde_json::to_vec(observed)?,
            )
            .await?;
        Ok(())
    }

    /// Publish observed state on a fresh connection, retrying every 2s for
    /// up to ~30s. Used after a system switch, which may have restarted
    /// networking and broken the tick's original connection.
    async fn publish_observed_state_with_retry(
        &self,
        observed: &ObservedSystemState,
    ) -> Result<()> {
        let payload = serde_json::to_vec(observed)?;
        let key = key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id);
        let deadline = Instant::now() + Duration::from_secs(30);
        let mut attempt = 1u32;

        loop {
            // Fresh connection per attempt: a stale connection is the
            // expected failure mode being worked around here.
            let result = async {
                let mut client = Client::connect(self.endpoint.clone()).await?;
                client.put(&key, &payload).await?;
                Result::<()>::Ok(())
            }
            .await;

            match result {
                Ok(()) => return Ok(()),
                Err(error) if Instant::now() < deadline => {
                    warn!(
                        attempt,
                        error = %error,
                        "failed to publish observed-system state; retrying with a fresh connection"
                    );
                    attempt += 1;
                    sleep(Duration::from_secs(2)).await;
                }
                // Deadline exhausted: surface the last error.
                Err(error) => return Err(error),
            }
        }
    }

    /// Snapshot of the locally visible system profile symlinks; fields that
    /// cannot be read (missing symlink) come back as None.
    fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState {
        ObservedSystemState {
            node_id: node.node_id.clone(),
            configured_system: read_symlink_target("/nix/var/nix/profiles/system"),
            current_system: read_symlink_target("/run/current-system"),
            booted_system: read_symlink_target("/run/booted-system"),
            ..ObservedSystemState::default()
        }
    }

    /// Core reconciliation state machine. Mutates `observed` as it
    /// progresses; callers publish `observed` regardless of the Result.
    ///
    /// Flow: pause on failed/draining node state → resolve the desired
    /// system (or go idle) → pick a rollback baseline → obtain a target
    /// store path (prebuilt or `nix build`) → if already active, optionally
    /// run the post-boot health check → otherwise switch (gated on
    /// `self.apply`), verify the symlinks, and health-check with optional
    /// rollback.
    async fn reconcile_node(
        &self,
        node: &ClusterNodeRecord,
        desired: Option<&DesiredSystemSpec>,
        previous_observed: Option<&ObservedSystemState>,
        observed: &mut ObservedSystemState,
    ) -> Result<()> {
        match node.state.as_deref() {
            // A failed node is never reconciled automatically.
            Some("failed") => {
                observed.status = Some("paused".to_string());
                return Ok(());
            }
            // A draining node is paused unless the desired spec is an
            // explicit deployment that declared drain_before_apply — that
            // deployment is the reason the node is draining, so proceed.
            Some("draining")
                if !desired
                    .map(|spec| {
                        spec.deployment_id.is_some() && spec.drain_before_apply.unwrap_or(false)
                    })
                    .unwrap_or(false) =>
            {
                observed.status = Some("paused".to_string());
                return Ok(());
            }
            _ => {}
        }

        // No spec and no install plan configuration: nothing to do.
        let Some(desired) = resolve_desired_system(
            node,
            desired,
            &self.flake_root,
            &self.switch_action,
            &self.health_check_command,
            self.rollback_on_failure,
        ) else {
            observed.status = Some("idle".to_string());
            return Ok(());
        };
        info!(
            nixos_configuration = desired.nixos_configuration.as_deref().unwrap_or(""),
            target_system = desired.target_system.as_deref().unwrap_or(""),
            flake_ref = %desired.flake_ref,
            switch_action = %desired.switch_action,
            rollback_on_failure = desired.rollback_on_failure,
            health_check_command = ?desired.health_check_command,
            "resolved desired system"
        );

        observed.nixos_configuration = desired.nixos_configuration.clone();
        observed.flake_root = Some(desired.flake_ref.clone());
        observed.switch_action = Some(desired.switch_action.clone());

        // Rollback baseline: prefer the baseline recorded in the previous
        // observed state, falling back to whatever is currently active.
        let previous_system = previous_observed
            .and_then(|state| state.rollback_system.clone())
            .or_else(|| observed.current_system.clone());
        observed.rollback_system = previous_system.clone();
        info!(
            previous_system = previous_system.as_deref().unwrap_or(""),
            "selected rollback baseline"
        );
        let target_system = match desired.target_system.as_deref() {
            // A prebuilt store path skips the build entirely.
            Some(target_system) => {
                info!(target_system, "using prebuilt target system");
                target_system.to_string()
            }
            None => {
                let configuration = desired
                    .nixos_configuration
                    .as_deref()
                    .ok_or_else(|| anyhow!("desired system did not specify nixos_configuration"))?;
                self.build_target_system(&desired.flake_ref, configuration)
                    .await
                    .with_context(|| {
                        format!("failed to build target system for {}", configuration)
                    })?
            }
        };
        observed.target_system = Some(target_system.clone());
        info!(target_system = %target_system, "resolved target system");

        if observed.current_system.as_deref() == Some(target_system.as_str()) {
            info!("target system already active");
            // A "boot" action staged this target on a previous tick; after
            // the reboot made it active, verify it once before declaring
            // the node active (possibly rolling back).
            if should_run_post_boot_health_check(previous_observed, &desired, &target_system) {
                observed.status = Some("verifying".to_string());
                observed.last_attempt = Some(Utc::now());
                let outcome = self
                    .run_health_check_and_maybe_rollback(
                        &desired,
                        previous_system.as_deref(),
                        observed,
                    )
                    .await?;
                // Re-read symlinks: a rollback just changed them.
                observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
                observed.current_system = read_symlink_target("/run/current-system");
                observed.booted_system = read_symlink_target("/run/booted-system");
                if outcome == HealthCheckOutcome::RolledBack {
                    return Ok(());
                }
            }

            observed.reboot_required = Some(false);
            observed.status = Some("active".to_string());
            observed.last_success = Some(Utc::now());
            return Ok(());
        }

        // Dry-run mode: report what would happen but never switch.
        if !self.apply {
            observed.status = Some("pending".to_string());
            return Ok(());
        }

        observed.status = Some("reconciling".to_string());
        observed.last_attempt = Some(Utc::now());
        info!(
            target_system = %target_system,
            switch_action = %desired.switch_action,
            "switching to target system"
        );
        self.switch_to_target(&target_system, &desired.switch_action)
            .await?;
        info!("switch-to-configuration completed");

        observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
        observed.current_system = read_symlink_target("/run/current-system");
        observed.booted_system = read_symlink_target("/run/booted-system");

        if desired.switch_action == "boot" {
            // "boot" only updates the system profile; the target becomes
            // active after a reboot, so verify the profile, not
            // /run/current-system.
            if observed.configured_system.as_deref() != Some(target_system.as_str()) {
                return Err(anyhow!(
                    "boot switch completed but configured system does not match target {}",
                    target_system
                ));
            }

            observed.reboot_required = Some(true);
            observed.status = Some("staged".to_string());
            observed.last_error = None;
            return Ok(());
        }

        // For "switch"-style actions the target must be live immediately.
        if observed.current_system.as_deref() != Some(target_system.as_str()) {
            return Err(anyhow!(
                "switch completed but /run/current-system does not match target {}",
                target_system
            ));
        }

        let outcome = self
            .run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed)
            .await?;
        observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
        observed.current_system = read_symlink_target("/run/current-system");
        observed.booted_system = read_symlink_target("/run/booted-system");
        if outcome == HealthCheckOutcome::RolledBack {
            return Ok(());
        }

        observed.reboot_required = Some(false);
        observed.status = Some("active".to_string());
        observed.last_success = Some(Utc::now());
        observed.last_error = None;

        Ok(())
    }

    /// Build the system closure for `configuration` out of `flake_ref` via
    /// `nix build --print-out-paths`, returning the /nix/store output path.
    async fn build_target_system(&self, flake_ref: &str, configuration: &str) -> Result<String> {
        let flake_attr = target_flake_attr(flake_ref, configuration);
        info!(flake_attr = %flake_attr, "building target system");
        let mut build_args = vec![
            "build",
            "-L",
            "--no-link",
            "--no-write-lock-file",
            "--print-out-paths",
        ];
        build_args.push(flake_attr.as_str());
        let output = run_command("nix", &build_args).await?;
        // Build logs may interleave with stdout; take the first line that
        // looks like a store path.
        let path = output
            .lines()
            .map(str::trim)
            .find(|line| line.starts_with("/nix/store/"))
            .ok_or_else(|| anyhow!("nix build returned no output path"))?;
        Ok(path.to_string())
    }

    /// Run `<target>/bin/switch-to-configuration <action>` with output
    /// streamed to the agent's own stdout/stderr.
    async fn switch_to_target(&self, target_system: &str, switch_action: &str) -> Result<()> {
        let switch_bin = Path::new(target_system).join("bin/switch-to-configuration");
        if !switch_bin.exists() {
            return Err(anyhow!(
                "target system {} does not contain switch-to-configuration",
                target_system
            ));
        }

        info!(
            switch_bin = %switch_bin.display(),
            switch_action = %switch_action,
            "executing switch-to-configuration"
        );
        run_command_inherit_output(
            switch_bin
                .to_str()
                .ok_or_else(|| anyhow!("invalid switch path"))?,
            &[switch_action],
        )
        .await?;
        Ok(())
    }

    /// Run the configured health check (if any). On failure: roll back and
    /// report `RolledBack` when rollback is enabled, otherwise propagate
    /// the failure as an error. An empty command always passes.
    async fn run_health_check_and_maybe_rollback(
        &self,
        desired: &ResolvedDesiredSystem,
        previous_system: Option<&str>,
        observed: &mut ObservedSystemState,
    ) -> Result<HealthCheckOutcome> {
        if desired.health_check_command.is_empty() {
            return Ok(HealthCheckOutcome::Passed);
        }

        info!(
            command = ?desired.health_check_command,
            rollback_on_failure = desired.rollback_on_failure,
            "running post-activation health check"
        );
        if let Err(error) = run_vec_command(&desired.health_check_command).await {
            let error_message = format!("health check failed after activation: {error}");
            if desired.rollback_on_failure {
                info!("health check failed; rolling back to previous system");
                self.rollback_to_previous(previous_system).await?;
                observed.configured_system = read_symlink_target("/nix/var/nix/profiles/system");
                observed.current_system = read_symlink_target("/run/current-system");
                observed.booted_system = read_symlink_target("/run/booted-system");
                observed.reboot_required = Some(false);
                // RolledBack is a non-error outcome: the tick reports
                // "rolled-back" with the health failure in last_error.
                observed.status = Some("rolled-back".to_string());
                observed.last_error = Some(error_message);
                return Ok(HealthCheckOutcome::RolledBack);
            }

            return Err(anyhow!(error_message));
        }

        info!("post-activation health check passed");
        Ok(HealthCheckOutcome::Passed)
    }

    /// Reactivate `previous_system`, preferring its own
    /// switch-to-configuration; older closures without it fall back to
    /// `nix-env --set` on the system profile followed by `activate`.
    async fn rollback_to_previous(&self, previous_system: Option<&str>) -> Result<()> {
        let previous_system = previous_system
            .filter(|value| !value.is_empty())
            .ok_or_else(|| anyhow!("rollback requested but no previous system is known"))?;
        info!(previous_system = %previous_system, "rolling back to previous system");
        let switch_bin = Path::new(previous_system).join("bin/switch-to-configuration");
        if switch_bin.exists() {
            // Rollbacks always apply immediately ("switch"), even if the
            // original action was "boot".
            return self.switch_to_target(previous_system, "switch").await;
        }

        let activate = Path::new(previous_system).join("activate");
        if !activate.exists() {
            return Err(anyhow!(
                "previous system {} does not contain switch-to-configuration or activate",
                previous_system
            ));
        }

        info!(
            previous_system = %previous_system,
            activate = %activate.display(),
            "previous system lacks switch-to-configuration; falling back to profile set + activate"
        );
        // Point the system profile at the previous closure, then run its
        // activation script directly.
        run_command(
            "nix-env",
            &[
                "--profile",
                "/nix/var/nix/profiles/system",
                "--set",
                previous_system,
            ],
        )
        .await?;
        run_command_inherit_output(
            activate
                .to_str()
                .ok_or_else(|| anyhow!("invalid activate path"))?,
            &[],
        )
        .await
    }
}
|
|
|
|
fn resolve_desired_system(
|
|
node: &ClusterNodeRecord,
|
|
desired: Option<&DesiredSystemSpec>,
|
|
local_flake_root: &str,
|
|
local_switch_action: &str,
|
|
local_health_check_command: &[String],
|
|
local_rollback_on_failure: bool,
|
|
) -> Option<ResolvedDesiredSystem> {
|
|
let nixos_configuration = desired
|
|
.and_then(|spec| spec.nixos_configuration.clone())
|
|
.or_else(|| {
|
|
node.install_plan
|
|
.as_ref()
|
|
.and_then(|plan| plan.nixos_configuration.clone())
|
|
});
|
|
let target_system = desired.and_then(|spec| spec.target_system.clone());
|
|
|
|
if nixos_configuration.is_none() && target_system.is_none() {
|
|
return None;
|
|
}
|
|
|
|
Some(ResolvedDesiredSystem {
|
|
nixos_configuration,
|
|
target_system,
|
|
flake_ref: desired
|
|
.and_then(|spec| spec.flake_ref.clone())
|
|
.unwrap_or_else(|| local_flake_root.to_string()),
|
|
switch_action: desired
|
|
.and_then(|spec| spec.switch_action.clone())
|
|
.unwrap_or_else(|| local_switch_action.to_string()),
|
|
health_check_command: desired
|
|
.map(|spec| spec.health_check_command.clone())
|
|
.filter(|command| !command.is_empty())
|
|
.unwrap_or_else(|| local_health_check_command.to_vec()),
|
|
rollback_on_failure: desired
|
|
.and_then(|spec| spec.rollback_on_failure)
|
|
.unwrap_or(local_rollback_on_failure),
|
|
})
|
|
}
|
|
|
|
/// Flake attribute path for a NixOS configuration's top-level system closure.
fn target_flake_attr(flake_root: &str, configuration: &str) -> String {
    format!("{flake_root}#nixosConfigurations.{configuration}.config.system.build.toplevel")
}
|
|
|
|
fn should_run_post_boot_health_check(
|
|
previous_observed: Option<&ObservedSystemState>,
|
|
desired: &ResolvedDesiredSystem,
|
|
target_system: &str,
|
|
) -> bool {
|
|
desired.switch_action == "boot"
|
|
&& previous_observed
|
|
.map(|state| {
|
|
state.status.as_deref() == Some("staged")
|
|
&& state.target_system.as_deref() == Some(target_system)
|
|
})
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Resolve a symlink one level, returning `None` when the path is missing
/// or is not a symlink (non-UTF-8 targets are replaced lossily).
fn read_symlink_target(path: &str) -> Option<String> {
    match fs::read_link(path) {
        Ok(target) => Some(target.to_string_lossy().into_owned()),
        Err(_) => None,
    }
}
|
|
|
|
async fn run_command(program: &str, args: &[&str]) -> Result<String> {
|
|
let started_at = Instant::now();
|
|
info!(program = %program, args = ?args, "running command");
|
|
let output = Command::new(program)
|
|
.args(args)
|
|
.stdin(Stdio::null())
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::piped())
|
|
.output()
|
|
.await
|
|
.with_context(|| format!("failed to execute {}", program))?;
|
|
|
|
if output.status.success() {
|
|
info!(
|
|
program = %program,
|
|
args = ?args,
|
|
elapsed_ms = started_at.elapsed().as_millis(),
|
|
"command completed successfully"
|
|
);
|
|
Ok(String::from_utf8_lossy(&output.stdout).trim().to_string())
|
|
} else {
|
|
let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string();
|
|
let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string();
|
|
warn!(
|
|
program = %program,
|
|
args = ?args,
|
|
elapsed_ms = started_at.elapsed().as_millis(),
|
|
status = %output.status,
|
|
stdout = %stdout,
|
|
stderr = %stderr,
|
|
"command failed"
|
|
);
|
|
Err(anyhow!(
|
|
"{} {:?} failed with status {}: stdout='{}' stderr='{}'",
|
|
program,
|
|
args,
|
|
output.status,
|
|
stdout,
|
|
stderr
|
|
))
|
|
}
|
|
}
|
|
|
|
async fn run_vec_command(command: &[String]) -> Result<String> {
|
|
let (program, args) = command
|
|
.split_first()
|
|
.ok_or_else(|| anyhow!("command vector is empty"))?;
|
|
let arg_refs = args.iter().map(String::as_str).collect::<Vec<_>>();
|
|
run_command(program, &arg_refs).await
|
|
}
|
|
|
|
async fn run_command_inherit_output(program: &str, args: &[&str]) -> Result<()> {
|
|
let started_at = Instant::now();
|
|
info!(
|
|
program = %program,
|
|
args = ?args,
|
|
"running command with inherited output"
|
|
);
|
|
let status = Command::new(program)
|
|
.args(args)
|
|
.stdin(Stdio::null())
|
|
.stdout(Stdio::inherit())
|
|
.stderr(Stdio::inherit())
|
|
.status()
|
|
.await
|
|
.with_context(|| format!("failed to execute {}", program))?;
|
|
|
|
if status.success() {
|
|
info!(
|
|
program = %program,
|
|
args = ?args,
|
|
elapsed_ms = started_at.elapsed().as_millis(),
|
|
"command completed successfully"
|
|
);
|
|
Ok(())
|
|
} else {
|
|
warn!(
|
|
program = %program,
|
|
args = ?args,
|
|
elapsed_ms = started_at.elapsed().as_millis(),
|
|
status = %status,
|
|
"command failed"
|
|
);
|
|
Err(anyhow!(
|
|
"{} {:?} failed with status {}",
|
|
program,
|
|
args,
|
|
status
|
|
))
|
|
}
|
|
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(EnvFilter::from_default_env().add_directive("info".parse()?))
|
|
.init();
|
|
|
|
let cli = Cli::parse();
|
|
let once = cli.once;
|
|
let agent = Agent::new(cli);
|
|
|
|
info!(
|
|
cluster_id = %agent.cluster_id,
|
|
node_id = %agent.node_id,
|
|
flake_root = %agent.flake_root,
|
|
apply = agent.apply,
|
|
"starting nix-agent"
|
|
);
|
|
|
|
if once {
|
|
agent.tick().await?;
|
|
} else {
|
|
agent.run_loop().await?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use deployer_types::{DesiredSystemSpec, InstallPlan};

    /// Fixture: an "active" node whose install plan names configuration
    /// "node01" — the fallback source for resolve_desired_system.
    fn test_node() -> ClusterNodeRecord {
        ClusterNodeRecord {
            node_id: "node01".to_string(),
            machine_id: None,
            ip: "10.0.0.10".to_string(),
            hostname: "node01".to_string(),
            roles: vec!["control-plane".to_string()],
            labels: Default::default(),
            pool: Some("control".to_string()),
            node_class: Some("control-plane".to_string()),
            failure_domain: Some("rack-a".to_string()),
            nix_profile: Some("profiles/control-plane".to_string()),
            install_plan: Some(InstallPlan {
                nixos_configuration: Some("node01".to_string()),
                disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()),
                disko_script_path: None,
                target_system_path: None,
                target_disk: Some("/dev/vda".to_string()),
                target_disk_by_id: None,
            }),
            hardware_facts: None,
            state: Some("active".to_string()),
            commission_state: None,
            install_state: None,
            commissioned_at: None,
            last_inventory_hash: None,
            power_state: None,
            bmc_ref: None,
            last_heartbeat: None,
        }
    }

    /// With no ChainFire spec, the configuration comes from the install
    /// plan and everything else from the agent's local defaults.
    #[test]
    fn resolve_desired_system_falls_back_to_install_plan() {
        let resolved = resolve_desired_system(
            &test_node(),
            None,
            "/opt/ultracloud-src",
            "switch",
            &[],
            true,
        )
        .expect("desired system should resolve");
        assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01"));
        assert_eq!(resolved.target_system, None);
        assert_eq!(resolved.flake_ref, "/opt/ultracloud-src");
        assert_eq!(resolved.switch_action, "switch");
        assert!(resolved.rollback_on_failure);
    }

    /// Every field set in the ChainFire spec overrides both the install
    /// plan and the local defaults.
    #[test]
    fn resolve_desired_system_prefers_chainfire_spec() {
        let desired = DesiredSystemSpec {
            node_id: "node01".to_string(),
            deployment_id: None,
            nixos_configuration: Some("node01-next".to_string()),
            target_system: None,
            flake_ref: Some("github:centra/cloud".to_string()),
            switch_action: Some("boot".to_string()),
            health_check_command: vec!["true".to_string()],
            rollback_on_failure: Some(true),
            drain_before_apply: Some(false),
        };

        let resolved = resolve_desired_system(
            &test_node(),
            Some(&desired),
            "/opt/ultracloud-src",
            "switch",
            &[],
            false,
        )
        .expect("desired system should resolve");
        assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
        assert_eq!(resolved.flake_ref, "github:centra/cloud");
        assert_eq!(resolved.switch_action, "boot");
        assert_eq!(resolved.health_check_command, vec!["true".to_string()]);
        assert!(resolved.rollback_on_failure);
    }

    /// A prebuilt target_system from the spec is carried through, and a
    /// missing spec flake_ref falls back to the local flake root.
    #[test]
    fn resolve_desired_system_accepts_prebuilt_target_system() {
        let desired = DesiredSystemSpec {
            node_id: "node01".to_string(),
            deployment_id: None,
            nixos_configuration: Some("node01-next".to_string()),
            target_system: Some("/nix/store/node01-next".to_string()),
            flake_ref: None,
            switch_action: Some("switch".to_string()),
            health_check_command: Vec::new(),
            rollback_on_failure: Some(true),
            drain_before_apply: Some(false),
        };

        let resolved = resolve_desired_system(
            &test_node(),
            Some(&desired),
            "/opt/ultracloud-src",
            "switch",
            &[],
            true,
        )
        .expect("desired system should resolve");

        assert_eq!(resolved.nixos_configuration.as_deref(), Some("node01-next"));
        assert_eq!(
            resolved.target_system.as_deref(),
            Some("/nix/store/node01-next")
        );
        assert_eq!(resolved.flake_ref, "/opt/ultracloud-src");
    }

    /// An empty health_check_command in the spec counts as unset: the
    /// agent's local default command is used instead.
    #[test]
    fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() {
        let desired = DesiredSystemSpec {
            node_id: "node01".to_string(),
            deployment_id: None,
            nixos_configuration: Some("node01-next".to_string()),
            target_system: None,
            flake_ref: None,
            switch_action: None,
            health_check_command: Vec::new(),
            rollback_on_failure: None,
            drain_before_apply: None,
        };

        let resolved = resolve_desired_system(
            &test_node(),
            Some(&desired),
            "/opt/ultracloud-src",
            "switch",
            &["systemctl".to_string(), "is-system-running".to_string()],
            true,
        )
        .expect("desired system should resolve");

        assert_eq!(resolved.flake_ref, "/opt/ultracloud-src");
        assert_eq!(resolved.switch_action, "switch");
        assert_eq!(
            resolved.health_check_command,
            vec!["systemctl".to_string(), "is-system-running".to_string()]
        );
        assert!(resolved.rollback_on_failure);
    }

    #[test]
    fn target_flake_attr_is_rendered_from_root_and_configuration() {
        assert_eq!(
            target_flake_attr("/opt/ultracloud-src", "node01"),
            "/opt/ultracloud-src#nixosConfigurations.node01.config.system.build.toplevel"
        );
    }

    #[test]
    fn read_symlink_target_returns_none_for_missing_path() {
        assert_eq!(
            read_symlink_target("/tmp/ultracloud-nix-agent-missing-link"),
            None
        );
    }

    /// Post-boot verification fires only when the previous observed state
    /// staged exactly this target under a "boot" action.
    #[test]
    fn post_boot_health_check_is_requested_for_matching_staged_target() {
        let desired = ResolvedDesiredSystem {
            nixos_configuration: Some("node01".to_string()),
            target_system: None,
            flake_ref: "/opt/ultracloud-src".to_string(),
            switch_action: "boot".to_string(),
            health_check_command: vec!["true".to_string()],
            rollback_on_failure: true,
        };
        let previous = ObservedSystemState {
            status: Some("staged".to_string()),
            target_system: Some("/nix/store/example-system".to_string()),
            ..ObservedSystemState::default()
        };

        assert!(should_run_post_boot_health_check(
            Some(&previous),
            &desired,
            "/nix/store/example-system"
        ));
    }

    /// A previous status other than "staged" (here "active") must not
    /// trigger another post-boot verification.
    #[test]
    fn post_boot_health_check_is_skipped_for_non_matching_state() {
        let desired = ResolvedDesiredSystem {
            nixos_configuration: Some("node01".to_string()),
            target_system: None,
            flake_ref: "/opt/ultracloud-src".to_string(),
            switch_action: "boot".to_string(),
            health_check_command: vec!["true".to_string()],
            rollback_on_failure: true,
        };
        let previous = ObservedSystemState {
            status: Some("active".to_string()),
            target_system: Some("/nix/store/example-system".to_string()),
            ..ObservedSystemState::default()
        };

        assert!(!should_run_post_boot_health_check(
            Some(&previous),
            &desired,
            "/nix/store/example-system"
        ));
    }
}
|