use std::fs; use std::path::Path; use std::process::Stdio; use std::time::Duration; use anyhow::{anyhow, Context, Result}; use chainfire_client::Client; use chrono::Utc; use clap::Parser; use deployer_types::{ClusterNodeRecord, DesiredSystemSpec, ObservedSystemState}; use tokio::process::Command; use tokio::time::sleep; use tracing::{info, warn}; use tracing_subscriber::EnvFilter; fn cluster_prefix(cluster_namespace: &str, cluster_id: &str) -> String { format!("{}/clusters/{}/", cluster_namespace, cluster_id) } fn key_node(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec { format!( "{}nodes/{}", cluster_prefix(cluster_namespace, cluster_id), node_id ) .into_bytes() } fn key_desired_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec { format!( "{}nodes/{}/desired-system", cluster_prefix(cluster_namespace, cluster_id), node_id ) .into_bytes() } fn key_observed_system(cluster_namespace: &str, cluster_id: &str, node_id: &str) -> Vec { format!( "{}nodes/{}/observed-system", cluster_prefix(cluster_namespace, cluster_id), node_id ) .into_bytes() } #[derive(Parser, Debug)] #[command(author, version, about)] struct Cli { #[arg(long, default_value = "http://127.0.0.1:7000")] chainfire_endpoint: String, #[arg(long, default_value = "photoncloud")] cluster_namespace: String, #[arg(long)] cluster_id: String, #[arg(long)] node_id: String, #[arg(long, default_value = "/etc/nixos")] flake_root: String, #[arg(long, default_value_t = 30)] interval_secs: u64, #[arg(long, default_value = "switch")] switch_action: String, #[arg(long, allow_hyphen_values = true)] health_check_command: Vec, #[arg(long, default_value_t = false)] rollback_on_failure: bool, #[arg(long, default_value_t = false)] apply: bool, #[arg(long, default_value_t = false)] once: bool, } struct Agent { endpoint: String, cluster_namespace: String, cluster_id: String, node_id: String, flake_root: String, interval: Duration, switch_action: String, health_check_command: Vec, rollback_on_failure: bool, apply: bool, } #[derive(Debug, Clone, PartialEq, Eq)] struct ResolvedDesiredSystem { nixos_configuration: String, flake_ref: String, switch_action: String, health_check_command: Vec, rollback_on_failure: bool, } impl Agent { fn new(cli: Cli) -> Self { Self { endpoint: cli.chainfire_endpoint, cluster_namespace: cli.cluster_namespace, cluster_id: cli.cluster_id, node_id: cli.node_id, flake_root: cli.flake_root, interval: Duration::from_secs(cli.interval_secs), switch_action: cli.switch_action, health_check_command: cli.health_check_command, rollback_on_failure: cli.rollback_on_failure, apply: cli.apply, } } async fn run_loop(&self) -> Result<()> { loop { if let Err(error) = self.tick().await { warn!(error = %error, "nix-agent tick failed"); } sleep(self.interval).await; } } async fn tick(&self) -> Result<()> { let mut client = Client::connect(self.endpoint.clone()).await?; let node_key = key_node(&self.cluster_namespace, &self.cluster_id, &self.node_id); let node_raw = client.get_with_revision(&node_key).await?; let Some((node_bytes, _revision)) = node_raw else { warn!( cluster_id = %self.cluster_id, node_id = %self.node_id, "node definition not found; skipping nix reconciliation" ); return Ok(()); }; let node: ClusterNodeRecord = serde_json::from_slice(&node_bytes).context("failed to parse node record")?; let desired = client .get(key_desired_system( &self.cluster_namespace, &self.cluster_id, &self.node_id, )) .await? .map(|bytes| serde_json::from_slice::(&bytes)) .transpose() .context("failed to parse desired-system spec")?; let mut observed = self.base_observed_state(&node); let reconcile_result = self .reconcile_node(&node, desired.as_ref(), &mut observed) .await; if let Err(error) = reconcile_result { observed.status = Some("failed".to_string()); observed.last_error = Some(error.to_string()); } client .put( &key_observed_system(&self.cluster_namespace, &self.cluster_id, &self.node_id), &serde_json::to_vec(&observed)?, ) .await?; Ok(()) } fn base_observed_state(&self, node: &ClusterNodeRecord) -> ObservedSystemState { ObservedSystemState { node_id: node.node_id.clone(), current_system: read_symlink_target("/run/current-system"), booted_system: read_symlink_target("/run/booted-system"), ..ObservedSystemState::default() } } async fn reconcile_node( &self, node: &ClusterNodeRecord, desired: Option<&DesiredSystemSpec>, observed: &mut ObservedSystemState, ) -> Result<()> { match node.state.as_deref() { Some("failed") | Some("draining") => { observed.status = Some("paused".to_string()); return Ok(()); } _ => {} } let Some(desired) = resolve_desired_system( node, desired, &self.flake_root, &self.switch_action, &self.health_check_command, self.rollback_on_failure, ) else { observed.status = Some("idle".to_string()); return Ok(()); }; observed.nixos_configuration = Some(desired.nixos_configuration.clone()); observed.flake_root = Some(desired.flake_ref.clone()); let previous_system = observed.current_system.clone(); let target_system = self .build_target_system(&desired.flake_ref, &desired.nixos_configuration) .await .with_context(|| { format!( "failed to build target system for {}", desired.nixos_configuration ) })?; observed.target_system = Some(target_system.clone()); if observed.current_system.as_deref() == Some(target_system.as_str()) { observed.status = Some("active".to_string()); observed.last_success = Some(Utc::now()); return Ok(()); } if !self.apply { observed.status = Some("pending".to_string()); return Ok(()); } observed.status = Some("reconciling".to_string()); observed.last_attempt = Some(Utc::now()); self.switch_to_target(&target_system, &desired.switch_action) .await?; observed.current_system = read_symlink_target("/run/current-system"); observed.booted_system = read_symlink_target("/run/booted-system"); if observed.current_system.as_deref() != Some(target_system.as_str()) { return Err(anyhow!( "switch completed but /run/current-system does not match target {}", target_system )); } self.run_health_check_and_maybe_rollback(&desired, previous_system.as_deref(), observed) .await?; observed.status = Some("active".to_string()); observed.last_success = Some(Utc::now()); observed.last_error = None; Ok(()) } async fn build_target_system(&self, flake_ref: &str, configuration: &str) -> Result { let flake_attr = target_flake_attr(flake_ref, configuration); let output = run_command( "nix", &["build", "--no-link", "--print-out-paths", flake_attr.as_str()], ) .await?; let path = output .lines() .find(|line| !line.trim().is_empty()) .map(str::trim) .ok_or_else(|| anyhow!("nix build returned no output path"))?; Ok(path.to_string()) } async fn switch_to_target(&self, target_system: &str, switch_action: &str) -> Result<()> { let switch_bin = Path::new(target_system).join("bin/switch-to-configuration"); if !switch_bin.exists() { return Err(anyhow!( "target system {} does not contain switch-to-configuration", target_system )); } run_command( switch_bin .to_str() .ok_or_else(|| anyhow!("invalid switch path"))?, &[switch_action], ) .await?; Ok(()) } async fn run_health_check_and_maybe_rollback( &self, desired: &ResolvedDesiredSystem, previous_system: Option<&str>, observed: &mut ObservedSystemState, ) -> Result<()> { if desired.health_check_command.is_empty() { return Ok(()); } if let Err(error) = run_vec_command(&desired.health_check_command).await { let error_message = format!("health check failed after activation: {error}"); if desired.rollback_on_failure { self.rollback_to_previous(previous_system).await?; observed.current_system = read_symlink_target("/run/current-system"); observed.booted_system = read_symlink_target("/run/booted-system"); observed.status = Some("rolled-back".to_string()); observed.last_error = Some(error_message); return Ok(()); } return Err(anyhow!(error_message)); } Ok(()) } async fn rollback_to_previous(&self, previous_system: Option<&str>) -> Result<()> { let previous_system = previous_system .filter(|value| !value.is_empty()) .ok_or_else(|| anyhow!("rollback requested but no previous system is known"))?; self.switch_to_target(previous_system, "switch").await } } fn resolve_desired_system( node: &ClusterNodeRecord, desired: Option<&DesiredSystemSpec>, local_flake_root: &str, local_switch_action: &str, local_health_check_command: &[String], local_rollback_on_failure: bool, ) -> Option { let nixos_configuration = desired .and_then(|spec| spec.nixos_configuration.clone()) .or_else(|| { node.install_plan .as_ref() .and_then(|plan| plan.nixos_configuration.clone()) })?; Some(ResolvedDesiredSystem { nixos_configuration, flake_ref: desired .and_then(|spec| spec.flake_ref.clone()) .unwrap_or_else(|| local_flake_root.to_string()), switch_action: desired .and_then(|spec| spec.switch_action.clone()) .unwrap_or_else(|| local_switch_action.to_string()), health_check_command: desired .map(|spec| spec.health_check_command.clone()) .filter(|command| !command.is_empty()) .unwrap_or_else(|| local_health_check_command.to_vec()), rollback_on_failure: desired .and_then(|spec| spec.rollback_on_failure) .unwrap_or(local_rollback_on_failure), }) } fn target_flake_attr(flake_root: &str, configuration: &str) -> String { format!( "{}#nixosConfigurations.{}.config.system.build.toplevel", flake_root, configuration ) } fn read_symlink_target(path: &str) -> Option { fs::read_link(path) .ok() .map(|value| value.to_string_lossy().into_owned()) } async fn run_command(program: &str, args: &[&str]) -> Result { let output = Command::new(program) .args(args) .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()) .output() .await .with_context(|| format!("failed to execute {}", program))?; if output.status.success() { Ok(String::from_utf8_lossy(&output.stdout).trim().to_string()) } else { let stderr = String::from_utf8_lossy(&output.stderr).trim().to_string(); let stdout = String::from_utf8_lossy(&output.stdout).trim().to_string(); Err(anyhow!( "{} {:?} failed with status {}: stdout='{}' stderr='{}'", program, args, output.status, stdout, stderr )) } } async fn run_vec_command(command: &[String]) -> Result { let (program, args) = command .split_first() .ok_or_else(|| anyhow!("command vector is empty"))?; let arg_refs = args.iter().map(String::as_str).collect::>(); run_command(program, &arg_refs).await } #[tokio::main] async fn main() -> Result<()> { tracing_subscriber::fmt() .with_env_filter(EnvFilter::from_default_env().add_directive("info".parse()?)) .init(); let cli = Cli::parse(); let once = cli.once; let agent = Agent::new(cli); info!( cluster_id = %agent.cluster_id, node_id = %agent.node_id, flake_root = %agent.flake_root, apply = agent.apply, "starting nix-agent" ); if once { agent.tick().await?; } else { agent.run_loop().await?; } Ok(()) } #[cfg(test)] mod tests { use super::*; use deployer_types::{DesiredSystemSpec, InstallPlan}; fn test_node() -> ClusterNodeRecord { ClusterNodeRecord { node_id: "node01".to_string(), machine_id: None, ip: "10.0.0.10".to_string(), hostname: "node01".to_string(), roles: vec!["control-plane".to_string()], labels: Default::default(), pool: Some("control".to_string()), node_class: Some("control-plane".to_string()), failure_domain: Some("rack-a".to_string()), nix_profile: Some("profiles/control-plane".to_string()), install_plan: Some(InstallPlan { nixos_configuration: Some("node01".to_string()), disko_config_path: Some("nix/nodes/vm-cluster/node01/disko.nix".to_string()), }), state: Some("active".to_string()), last_heartbeat: None, } } #[test] fn resolve_desired_system_falls_back_to_install_plan() { let resolved = resolve_desired_system( &test_node(), None, "/opt/plasmacloud-src", "switch", &[], true, ) .expect("desired system should resolve"); assert_eq!(resolved.nixos_configuration, "node01"); assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src"); assert_eq!(resolved.switch_action, "switch"); assert!(resolved.rollback_on_failure); } #[test] fn resolve_desired_system_prefers_chainfire_spec() { let desired = DesiredSystemSpec { node_id: "node01".to_string(), nixos_configuration: Some("node01-next".to_string()), flake_ref: Some("github:centra/cloud".to_string()), switch_action: Some("boot".to_string()), health_check_command: vec!["true".to_string()], rollback_on_failure: Some(true), }; let resolved = resolve_desired_system( &test_node(), Some(&desired), "/opt/plasmacloud-src", "switch", &[], false, ) .expect("desired system should resolve"); assert_eq!(resolved.nixos_configuration, "node01-next"); assert_eq!(resolved.flake_ref, "github:centra/cloud"); assert_eq!(resolved.switch_action, "boot"); assert_eq!(resolved.health_check_command, vec!["true".to_string()]); assert!(resolved.rollback_on_failure); } #[test] fn resolve_desired_system_uses_local_health_check_defaults_when_spec_omits_them() { let desired = DesiredSystemSpec { node_id: "node01".to_string(), nixos_configuration: Some("node01-next".to_string()), flake_ref: None, switch_action: None, health_check_command: Vec::new(), rollback_on_failure: None, }; let resolved = resolve_desired_system( &test_node(), Some(&desired), "/opt/plasmacloud-src", "switch", &["systemctl".to_string(), "is-system-running".to_string()], true, ) .expect("desired system should resolve"); assert_eq!(resolved.flake_ref, "/opt/plasmacloud-src"); assert_eq!(resolved.switch_action, "switch"); assert_eq!( resolved.health_check_command, vec!["systemctl".to_string(), "is-system-running".to_string()] ); assert!(resolved.rollback_on_failure); } #[test] fn target_flake_attr_is_rendered_from_root_and_configuration() { assert_eq!( target_flake_attr("/opt/plasmacloud-src", "node01"), "/opt/plasmacloud-src#nixosConfigurations.node01.config.system.build.toplevel" ); } #[test] fn read_symlink_target_returns_none_for_missing_path() { assert_eq!(read_symlink_target("/tmp/photoncloud-nix-agent-missing-link"), None); } }