//! Phone-home registration handlers: node bootstrap, enrollment-rule
//! resolution, and persistence across local / ChainFire / in-memory storage.
use axum::{extract::State, http::HeaderMap, http::StatusCode, Json};
|
|
use chrono::Utc;
|
|
use deployer_types::{
|
|
CommissionState, EnrollmentRuleSpec, HardwareFacts, InstallPlan, InstallState,
|
|
NodeClassSpec, NodeConfig, NodeInfo, NodePoolSpec, NodeState, PhoneHomeRequest,
|
|
PhoneHomeResponse, PowerState,
|
|
};
|
|
use sha2::{Digest, Sha256};
|
|
use std::sync::Arc;
|
|
use tracing::{debug, error, info, warn};
|
|
|
|
use crate::auth::require_bootstrap_auth;
|
|
use crate::cluster::ClusterNodeRecord;
|
|
use crate::state::AppState;
|
|
use crate::validation::{validate_identifier, validate_ip};
|
|
|
|
/// Overlay two optional install plans.
///
/// `preferred` (e.g. an enrollment rule's plan) takes precedence over
/// `fallback` (e.g. a node class's plan). The actual field-level merge
/// semantics live in `InstallPlan::from_layers`; this wrapper only names
/// the layering used by enrollment. Presumably returns `None` when both
/// layers are `None` — confirm against `InstallPlan::from_layers`.
fn merge_install_plan(
    preferred: Option<&InstallPlan>,
    fallback: Option<&InstallPlan>,
) -> Option<InstallPlan> {
    InstallPlan::from_layers(preferred, fallback)
}
|
|
|
|
fn merge_hardware_summary_metadata(
|
|
metadata: &mut std::collections::HashMap<String, String>,
|
|
hardware_facts: Option<&HardwareFacts>,
|
|
) {
|
|
let Some(hardware_facts) = hardware_facts else {
|
|
return;
|
|
};
|
|
|
|
if let Some(cpu_threads) = hardware_facts.cpu_threads {
|
|
metadata.insert("hardware.cpu_threads".to_string(), cpu_threads.to_string());
|
|
}
|
|
if let Some(cpu_cores) = hardware_facts.cpu_cores {
|
|
metadata.insert("hardware.cpu_cores".to_string(), cpu_cores.to_string());
|
|
}
|
|
if let Some(memory_bytes) = hardware_facts.memory_bytes {
|
|
metadata.insert("hardware.memory_bytes".to_string(), memory_bytes.to_string());
|
|
}
|
|
metadata.insert(
|
|
"hardware.disk_count".to_string(),
|
|
hardware_facts.disks.len().to_string(),
|
|
);
|
|
metadata.insert(
|
|
"hardware.nic_count".to_string(),
|
|
hardware_facts.nics.len().to_string(),
|
|
);
|
|
if let Some(architecture) = hardware_facts.architecture.as_deref() {
|
|
metadata.insert("hardware.architecture".to_string(), architecture.to_string());
|
|
}
|
|
}
|
|
|
|
fn inventory_hash(hardware_facts: Option<&HardwareFacts>) -> Option<String> {
|
|
let hardware_facts = hardware_facts?;
|
|
let payload = serde_json::to_vec(hardware_facts).ok()?;
|
|
let mut hasher = Sha256::new();
|
|
hasher.update(payload);
|
|
Some(format!("{:x}", hasher.finalize()))
|
|
}
|
|
|
|
/// POST /api/v1/phone-home
///
/// Handles node registration during first boot.
/// Nodes send their machine-id, and Deployer returns:
/// - Node configuration (hostname, role, IP, services)
/// - SSH host key
/// - TLS certificates (optional)
///
/// Uses ChainFire storage when available, falls back to in-memory.
///
/// Flow: bootstrap auth → input validation → config lookup (pre-registered,
/// then enrollment rules, then dev-only default) → IP / node-id consistency
/// checks → persistence (best-effort, multi-backend) → credential issuance.
///
/// Errors: 403 for unknown machines when `allow_unknown_nodes` is off,
/// 400 for IP/node-id mismatches or a missing IP, 500 when no storage
/// backend accepted the node record.
pub async fn phone_home(
    State(state): State<Arc<AppState>>,
    headers: HeaderMap,
    Json(request): Json<PhoneHomeRequest>,
) -> Result<Json<PhoneHomeResponse>, (StatusCode, String)> {
    // Auth first, then validate all caller-supplied identifiers/addresses
    // before they reach storage or logs.
    require_bootstrap_auth(&state, &headers)?;
    validate_identifier(&request.machine_id, "machine_id")?;
    if let Some(node_id) = request.node_id.as_ref() {
        validate_identifier(node_id, "node_id")?;
    }
    if let Some(ip) = request.ip.as_ref() {
        validate_ip(ip, "ip")?;
    }

    info!(
        machine_id = %request.machine_id,
        "Phone home request received"
    );

    // Lookup node configuration (ChainFire or fallback), then enrollment
    // rules; as a last resort synthesize a default config (dev-only path).
    let (node_id, mut node_config) = match lookup_node_config(&state, &request.machine_id).await {
        Some((id, config)) => (id, config),
        None => {
            if let Some((id, config)) = resolve_enrollment_config(&state, &request).await? {
                info!(
                    machine_id = %request.machine_id,
                    node_id = %id,
                    "Resolved unknown machine through enrollment rules"
                );
                (id, config)
            } else {
                if !state.config.allow_unknown_nodes {
                    warn!(
                        machine_id = %request.machine_id,
                        "Unknown machine-id rejected (pre-registration required)"
                    );
                    return Err((
                        StatusCode::FORBIDDEN,
                        "machine-id not registered".to_string(),
                    ));
                }

                warn!(
                    machine_id = %request.machine_id,
                    "Unknown machine-id, assigning default configuration (unsafe)"
                );
                // Assign default configuration for unknown machines (dev-only).
                // Prefer explicit node_id, then DHCP-provided hostname, then machine-id suffix.
                let node_id = request
                    .node_id
                    .as_ref()
                    .map(|v| v.trim())
                    .filter(|v| !v.is_empty())
                    .map(|v| v.to_string())
                    .or_else(|| {
                        request
                            .hostname
                            .as_ref()
                            .map(|v| v.trim())
                            .filter(|v| !v.is_empty())
                            .map(|v| v.to_string())
                    })
                    .unwrap_or_else(|| {
                        // Cap the generated id at 128 chars total ("node-" + suffix).
                        // NOTE(review): byte-slicing the machine_id assumes it is
                        // ASCII; validate_identifier above presumably guarantees
                        // that — confirm, otherwise this can panic on a char
                        // boundary.
                        let max_suffix_len = 128usize.saturating_sub("node-".len());
                        let suffix_len = std::cmp::min(max_suffix_len, request.machine_id.len());
                        format!("node-{}", &request.machine_id[..suffix_len])
                    });
                let config = NodeConfig {
                    hostname: node_id.clone(),
                    role: "worker".to_string(),
                    ip: request.ip.clone().unwrap_or_default(),
                    services: vec![],
                    ssh_authorized_keys: vec![],
                    labels: std::collections::HashMap::new(),
                    pool: None,
                    node_class: None,
                    failure_domain: request.metadata.get("failure_domain").cloned(),
                    nix_profile: None,
                    install_plan: None,
                };
                (node_id, config)
            }
        }
    };

    // Reject a caller whose claimed IP conflicts with the registered one
    // (an empty configured IP means "not pinned", so no check).
    if let Some(request_ip) = request.ip.as_ref() {
        if !node_config.ip.is_empty() && node_config.ip != *request_ip {
            warn!(
                machine_id = %request.machine_id,
                requested_ip = %request_ip,
                expected_ip = %node_config.ip,
                "Node IP mismatch in phone-home"
            );
            return Err((StatusCode::BAD_REQUEST, "node ip mismatch".to_string()));
        }
    }

    // Likewise reject a claimed node_id that disagrees with the resolved one.
    if let Some(requested_id) = request.node_id.as_ref() {
        if requested_id != &node_id {
            warn!(
                machine_id = %request.machine_id,
                requested_id = %requested_id,
                expected_id = %node_id,
                "Node ID mismatch in phone-home"
            );
            return Err((StatusCode::BAD_REQUEST, "node_id mismatch".to_string()));
        }
    }

    // Fill in hostname/IP gaps from the request; an IP is mandatory.
    if node_config.hostname.is_empty() {
        if let Some(hostname) = request.hostname.as_ref() {
            node_config.hostname = hostname.clone();
        } else {
            node_config.hostname = node_id.clone();
        }
    }

    if node_config.ip.is_empty() {
        if let Some(ip) = request.ip.clone() {
            node_config.ip = ip;
        } else {
            warn!(
                machine_id = %request.machine_id,
                node_id = %node_id,
                "Node config missing IP; refusing registration"
            );
            return Err((StatusCode::BAD_REQUEST, "node ip missing".to_string()));
        }
    }

    // Re-validate: the effective IP may have come from stored config rather
    // than the (already validated) request.
    validate_ip(&node_config.ip, "node_config.ip")?;

    // Ensure metadata contains authoritative role/service info
    let mut metadata = request.metadata.clone();
    metadata.insert("role".to_string(), node_config.role.clone());
    metadata.insert("services".to_string(), node_config.services.join(","));
    merge_hardware_summary_metadata(&mut metadata, request.hardware_facts.as_ref());

    // Create NodeInfo for tracking
    let node_info = NodeInfo {
        id: node_id.clone(),
        machine_id: Some(request.machine_id.clone()),
        hostname: node_config.hostname.clone(),
        ip: node_config.ip.clone(),
        state: NodeState::Provisioning,
        cluster_config_hash: request.cluster_config_hash.unwrap_or_default(),
        last_heartbeat: Utc::now(),
        metadata,
    };

    // Persist config mapping for this machine (best-effort)
    if let Err(e) = persist_node_config(&state, &request.machine_id, &node_id, &node_config).await {
        warn!(
            machine_id = %request.machine_id,
            node_id = %node_id,
            error = %e,
            "Failed to persist node configuration"
        );
    }

    // Store in ChainFire or in-memory
    match store_node_info(&state, &node_info).await {
        Ok(_) => {
            // Label which backend class accepted the record, for the log line only.
            let storage = if state.has_local_storage() {
                "local"
            } else if state.has_storage() {
                "chainfire"
            } else {
                "in-memory"
            };
            info!(
                node_id = %node_info.id,
                hostname = %node_info.hostname,
                role = %node_config.role,
                storage = storage,
                "Node registered successfully"
            );

            // Cluster-level node record is best-effort; failure does not
            // abort registration.
            if let Err(e) = store_cluster_node_if_configured(
                &state,
                &node_info,
                &node_config,
                &request.machine_id,
                request.hardware_facts.as_ref(),
            )
            .await
            {
                warn!(
                    node_id = %node_info.id,
                    error = %e,
                    "Failed to store cluster node state"
                );
            }

            // SSH host key issuance requires local storage; without it the
            // response simply omits the key.
            let ssh_host_key = if let Some(local_storage) = &state.local_storage {
                let mut storage = local_storage.lock().await;
                match storage.get_or_generate_ssh_host_key(&node_info.id) {
                    Ok(key) => Some(key),
                    Err(e) => {
                        warn!(error = %e, "Failed to generate ssh host key");
                        None
                    }
                }
            } else {
                None
            };

            // TLS issuance runs when self-signed mode is on OR a CA cert+key
            // pair is configured; prefers cached certs from local storage,
            // otherwise issues a fresh one. Failures degrade to no cert.
            let (tls_cert, tls_key) = if state.config.tls_self_signed
                || (state.config.tls_ca_cert_path.is_some()
                    && state.config.tls_ca_key_path.is_some())
            {
                if let Some(local_storage) = &state.local_storage {
                    let mut storage = local_storage.lock().await;
                    match storage.get_or_generate_tls_cert(
                        &node_info.id,
                        &node_config.hostname,
                        &node_config.ip,
                        state.config.tls_ca_cert_path.as_deref(),
                        state.config.tls_ca_key_path.as_deref(),
                    ) {
                        Ok((cert, key)) => (Some(cert), Some(key)),
                        Err(e) => {
                            warn!(error = %e, "Failed to issue node TLS certificate");
                            (None, None)
                        }
                    }
                } else {
                    match crate::tls::issue_node_cert(
                        &node_info.id,
                        &node_config.hostname,
                        &node_config.ip,
                        state.config.tls_ca_cert_path.as_deref(),
                        state.config.tls_ca_key_path.as_deref(),
                    ) {
                        Ok((cert, key)) => (Some(cert), Some(key)),
                        Err(e) => {
                            warn!(error = %e, "Failed to issue node TLS certificate");
                            (None, None)
                        }
                    }
                }
            } else {
                (None, None)
            };

            Ok(Json(PhoneHomeResponse {
                success: true,
                message: Some(format!("Node {} registered successfully", node_info.id)),
                node_id: node_id.clone(),
                state: NodeState::Provisioning,
                node_config: Some(node_config),
                ssh_host_key,
                tls_cert,
                tls_key,
            }))
        }
        Err(e) => {
            error!(
                machine_id = %request.machine_id,
                error = %e,
                "Failed to store node info"
            );

            Err((
                StatusCode::INTERNAL_SERVER_ERROR,
                format!("Failed to register node: {}", e),
            ))
        }
    }
}
|
|
|
|
/// Lookup node configuration by machine-id
///
/// Backends are consulted in order: local storage, then ChainFire, then the
/// in-memory map, then (when `allow_test_mappings` is enabled) hardcoded
/// development fixtures. Returns the first hit as `(node_id, config)`, or
/// `None` when no backend knows the machine.
pub(crate) async fn lookup_node_config(
    state: &AppState,
    machine_id: &str,
) -> Option<(String, NodeConfig)> {
    debug!(machine_id = %machine_id, "Looking up node configuration");

    // 1. Local storage (highest priority when configured).
    if let Some(local_storage) = &state.local_storage {
        let storage = local_storage.lock().await;
        if let Some((node_id, config)) = storage.get_node_config(machine_id) {
            debug!(
                machine_id = %machine_id,
                node_id = %node_id,
                "Found config in local storage"
            );
            return Some((node_id, config));
        }
    }

    // 2. ChainFire storage; a lookup error is non-fatal and falls through.
    if let Some(storage_mutex) = &state.storage {
        let mut storage = storage_mutex.lock().await;
        match storage.get_node_config(machine_id).await {
            Ok(Some((node_id, config))) => {
                debug!(
                    machine_id = %machine_id,
                    node_id = %node_id,
                    "Found config in ChainFire"
                );
                return Some((node_id, config));
            }
            Ok(None) => {
                debug!(machine_id = %machine_id, "Not found in ChainFire");
            }
            Err(e) => {
                warn!(
                    machine_id = %machine_id,
                    error = %e,
                    "ChainFire lookup failed, trying fallback"
                );
            }
        }
    }

    // 3. In-memory fallback cache.
    let configs = state.machine_configs.read().await;
    if let Some((node_id, config)) = configs.get(machine_id) {
        debug!(
            machine_id = %machine_id,
            node_id = %node_id,
            "Found config in in-memory storage"
        );
        return Some((node_id.clone(), config.clone()));
    }

    // 4. Hardcoded test mappings (for development/testing only; gated by config).
    if state.config.allow_test_mappings {
        match machine_id {
            "test-machine-01" => {
                return Some((
                    "node01".to_string(),
                    NodeConfig {
                        hostname: "node01".to_string(),
                        role: "control-plane".to_string(),
                        ip: "10.0.1.10".to_string(),
                        services: vec!["chainfire".to_string(), "flaredb".to_string()],
                        ssh_authorized_keys: vec![],
                        labels: std::collections::HashMap::new(),
                        pool: None,
                        node_class: None,
                        failure_domain: None,
                        nix_profile: None,
                        install_plan: None,
                    },
                ));
            }
            "test-machine-02" => {
                return Some((
                    "node02".to_string(),
                    NodeConfig {
                        hostname: "node02".to_string(),
                        role: "worker".to_string(),
                        ip: "10.0.1.11".to_string(),
                        services: vec!["chainfire".to_string()],
                        ssh_authorized_keys: vec![],
                        labels: std::collections::HashMap::new(),
                        pool: None,
                        node_class: None,
                        failure_domain: None,
                        nix_profile: None,
                        install_plan: None,
                    },
                ));
            }
            _ => {}
        }
    }

    None
}
|
|
|
|
/// Resolve a config for an unknown machine via cluster enrollment rules.
///
/// Returns `Ok(None)` when enrollment is not applicable (no cluster id,
/// no ChainFire storage, empty namespace, or no rules / no matching rule);
/// returns a 500-style error only when loading rules/classes/pools fails.
/// Rules are evaluated highest-priority first, name as tiebreaker, and the
/// first match wins.
async fn resolve_enrollment_config(
    state: &AppState,
    request: &PhoneHomeRequest,
) -> Result<Option<(String, NodeConfig)>, (StatusCode, String)> {
    // Enrollment requires a configured cluster identity and ChainFire storage.
    let Some(cluster_id) = state.config.cluster_id.as_deref() else {
        return Ok(None);
    };
    let Some(storage_mutex) = &state.storage else {
        return Ok(None);
    };

    let cluster_namespace = state.config.cluster_namespace.trim();
    if cluster_namespace.is_empty() {
        return Ok(None);
    }

    // Load rules first so we can bail before fetching classes/pools when
    // there is nothing to match against.
    let mut storage = storage_mutex.lock().await;
    let mut rules = storage
        .list_enrollment_rules(cluster_namespace, cluster_id)
        .await
        .map_err(|e| {
            (
                StatusCode::INTERNAL_SERVER_ERROR,
                format!("failed to load enrollment rules: {}", e),
            )
        })?;
    if rules.is_empty() {
        return Ok(None);
    }

    let node_classes = storage
        .list_node_classes(cluster_namespace, cluster_id)
        .await
        .map_err(|e| {
            (
                StatusCode::INTERNAL_SERVER_ERROR,
                format!("failed to load node classes: {}", e),
            )
        })?;
    let pools = storage
        .list_pools(cluster_namespace, cluster_id)
        .await
        .map_err(|e| {
            (
                StatusCode::INTERNAL_SERVER_ERROR,
                format!("failed to load pools: {}", e),
            )
        })?;
    // Release the storage lock before the pure matching/building phase.
    drop(storage);

    // Descending priority; stable name order makes ties deterministic.
    rules.sort_by(|lhs, rhs| {
        rhs.priority
            .cmp(&lhs.priority)
            .then_with(|| lhs.name.cmp(&rhs.name))
    });

    let Some(rule) = rules
        .iter()
        .find(|rule| enrollment_rule_matches(rule, request))
    else {
        return Ok(None);
    };

    Ok(Some(build_node_config_from_rule(
        rule,
        request,
        &node_classes,
        &pools,
    )))
}
|
|
|
|
/// Decide whether an enrollment rule applies to a phone-home request.
///
/// All configured criteria must hold (logical AND): the hostname-prefix
/// match, at least one IP-prefix match, and every `match_labels` pair being
/// present in the request metadata. A criterion left unset on the rule is
/// skipped; a criterion that is set but has no corresponding request field
/// (missing hostname/ip) fails the match.
fn enrollment_rule_matches(rule: &EnrollmentRuleSpec, request: &PhoneHomeRequest) -> bool {
    if let Some(prefix) = rule.match_hostname_prefix.as_deref() {
        // Hostname must be present AND carry the prefix.
        let hostname_ok = request
            .hostname
            .as_deref()
            .map_or(false, |hostname| hostname.starts_with(prefix));
        if !hostname_ok {
            return false;
        }
    }

    if !rule.match_ip_prefixes.is_empty() {
        // At least one configured prefix must match the request IP.
        let ip_ok = request.ip.as_deref().map_or(false, |ip| {
            rule.match_ip_prefixes
                .iter()
                .any(|prefix| ip.starts_with(prefix))
        });
        if !ip_ok {
            return false;
        }
    }

    // Every required label must be present with the exact value.
    rule.match_labels
        .iter()
        .all(|(key, value)| request.metadata.get(key) == Some(value))
}
|
|
|
|
/// Materialize a `(node_id, NodeConfig)` from a matched enrollment rule.
///
/// Precedence throughout is: explicit request value > rule > pool > node
/// class. The node id comes from the request's `node_id`, then its hostname,
/// then `<node_id_prefix>-<machine_id[..12]>`. The node class may be named
/// directly by the rule or inherited from the pool; class labels are the
/// base layer, pool labels fill gaps, and rule labels override both.
fn build_node_config_from_rule(
    rule: &EnrollmentRuleSpec,
    request: &PhoneHomeRequest,
    node_classes: &[NodeClassSpec],
    pools: &[NodePoolSpec],
) -> (String, NodeConfig) {
    // Caller-supplied identity, trimmed; empty strings count as absent.
    let requested_id = request
        .node_id
        .as_ref()
        .map(|value| value.trim())
        .filter(|value| !value.is_empty())
        .map(str::to_string)
        .or_else(|| {
            request
                .hostname
                .as_ref()
                .map(|value| value.trim())
                .filter(|value| !value.is_empty())
                .map(str::to_string)
        });

    let node_id = requested_id.unwrap_or_else(|| {
        let prefix = rule.node_id_prefix.as_deref().unwrap_or("node");
        // NOTE(review): byte-slicing assumes machine_id is ASCII; the
        // handler validates it via validate_identifier — confirm that
        // guarantees single-byte chars, otherwise this can panic.
        let suffix_len = std::cmp::min(12usize, request.machine_id.len());
        format!("{prefix}-{}", &request.machine_id[..suffix_len])
    });

    // Resolve pool spec, then node class (rule wins over pool default).
    let pool = rule.pool.clone();
    let pool_spec = pool
        .as_deref()
        .and_then(|name| pools.iter().find(|pool| pool.name == name));
    let node_class = rule
        .node_class
        .clone()
        .or_else(|| pool_spec.and_then(|pool| pool.node_class.clone()));
    let node_class_spec = node_class.as_deref().and_then(|name| {
        node_classes
            .iter()
            .find(|node_class| node_class.name == name)
    });

    // Role: rule > first role of the node class > "worker".
    let role = rule
        .role
        .clone()
        .or_else(|| node_class_spec.and_then(|node_class| node_class.roles.first().cloned()))
        .unwrap_or_else(|| "worker".to_string());

    // Label layering: class first, pool fills gaps, rule overrides all.
    let mut labels = std::collections::HashMap::new();
    if let Some(node_class) = node_class_spec {
        labels.extend(node_class.labels.clone());
    }
    if let Some(pool) = pool_spec {
        for (key, value) in &pool.labels {
            labels.entry(key.clone()).or_insert_with(|| value.clone());
        }
    }
    for (key, value) in &rule.labels {
        labels.insert(key.clone(), value.clone());
    }
    // Convenience labels; never clobber an explicitly set value.
    if let Some(pool_name) = pool.as_deref() {
        labels
            .entry("pool".to_string())
            .or_insert_with(|| pool_name.to_string());
    }
    if let Some(node_class_name) = node_class.as_deref() {
        labels
            .entry("node_class".to_string())
            .or_insert_with(|| node_class_name.to_string());
    }

    // Failure domain: explicit metadata key first, then the Kubernetes
    // topology zone annotation as a fallback.
    let failure_domain = request
        .metadata
        .get("failure_domain")
        .cloned()
        .or_else(|| request.metadata.get("topology.kubernetes.io/zone").cloned());

    (
        node_id.clone(),
        NodeConfig {
            hostname: request
                .hostname
                .clone()
                .filter(|value| !value.trim().is_empty())
                .unwrap_or_else(|| node_id.clone()),
            role,
            ip: request.ip.clone().unwrap_or_default(),
            services: rule.services.clone(),
            ssh_authorized_keys: rule.ssh_authorized_keys.clone(),
            labels,
            pool,
            node_class,
            failure_domain,
            // nix profile and install plan inherit from the class when the
            // rule does not set them (install plan merges field-by-field).
            nix_profile: rule
                .nix_profile
                .clone()
                .or_else(|| node_class_spec.and_then(|node_class| node_class.nix_profile.clone())),
            install_plan: merge_install_plan(
                rule.install_plan.as_ref(),
                node_class_spec.and_then(|node_class| node_class.install_plan.as_ref()),
            ),
        },
    )
}
|
|
|
|
/// Store NodeInfo in ChainFire or in-memory
|
|
async fn store_node_info(state: &AppState, node_info: &NodeInfo) -> anyhow::Result<()> {
|
|
let mut stored = false;
|
|
|
|
// Prefer local storage when configured.
|
|
if let Some(local_storage) = &state.local_storage {
|
|
let mut storage = local_storage.lock().await;
|
|
match storage.store_node_info(node_info) {
|
|
Ok(()) => {
|
|
stored = true;
|
|
debug!(node_id = %node_info.id, "Stored node info in local storage");
|
|
}
|
|
Err(e) => {
|
|
warn!(error = %e, "Failed to store node info in local storage");
|
|
}
|
|
}
|
|
}
|
|
|
|
// Also try ChainFire if available.
|
|
if let Some(storage_mutex) = &state.storage {
|
|
let mut chainfire = storage_mutex.lock().await;
|
|
match chainfire.store_node_info(node_info).await {
|
|
Ok(()) => {
|
|
stored = true;
|
|
debug!(node_id = %node_info.id, "Stored node info in ChainFire");
|
|
}
|
|
Err(e) => {
|
|
warn!(error = %e, "Failed to store node info in ChainFire");
|
|
}
|
|
}
|
|
}
|
|
|
|
if stored {
|
|
return Ok(());
|
|
}
|
|
|
|
// Fallback to in-memory storage when all configured backends fail.
|
|
state
|
|
.nodes
|
|
.write()
|
|
.await
|
|
.insert(node_info.id.clone(), node_info.clone());
|
|
|
|
debug!(
|
|
node_id = %node_info.id,
|
|
"Stored node info in-memory (all backends unavailable)"
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Persist node config mapping in ChainFire and in-memory fallback
|
|
async fn persist_node_config(
|
|
state: &AppState,
|
|
machine_id: &str,
|
|
node_id: &str,
|
|
config: &NodeConfig,
|
|
) -> anyhow::Result<()> {
|
|
if let Some(local_storage) = &state.local_storage {
|
|
let mut storage = local_storage.lock().await;
|
|
if let Err(e) = storage.register_node(machine_id, node_id, config) {
|
|
warn!(
|
|
machine_id = %machine_id,
|
|
node_id = %node_id,
|
|
error = %e,
|
|
"Failed to persist node config to local storage"
|
|
);
|
|
}
|
|
}
|
|
|
|
if let Some(storage_mutex) = &state.storage {
|
|
let mut storage = storage_mutex.lock().await;
|
|
if let Err(e) = storage.register_node(machine_id, node_id, config).await {
|
|
warn!(
|
|
machine_id = %machine_id,
|
|
node_id = %node_id,
|
|
error = %e,
|
|
"Failed to persist node config to ChainFire"
|
|
);
|
|
}
|
|
}
|
|
|
|
// Keep in-memory mapping in sync as a fallback cache
|
|
{
|
|
let mut map = state.machine_configs.write().await;
|
|
if let Some((existing_node, _)) = map.get(machine_id) {
|
|
if existing_node != node_id {
|
|
warn!(
|
|
machine_id = %machine_id,
|
|
existing_node = %existing_node,
|
|
requested_node = %node_id,
|
|
"Skipping in-memory mapping update due to conflict"
|
|
);
|
|
return Ok(());
|
|
}
|
|
}
|
|
map.insert(
|
|
machine_id.to_string(),
|
|
(node_id.to_string(), config.clone()),
|
|
);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Write a `ClusterNodeRecord` for the node when cluster identity is configured.
///
/// No-op (returns `Ok`) when `cluster_id` is unset/empty or the cluster
/// namespace is blank. Builds the record from the node config plus phone-home
/// metadata and writes it best-effort to local storage and ChainFire;
/// individual backend failures are logged, never propagated.
async fn store_cluster_node_if_configured(
    state: &AppState,
    node_info: &NodeInfo,
    node_config: &NodeConfig,
    machine_id: &str,
    hardware_facts: Option<&HardwareFacts>,
) -> anyhow::Result<()> {
    let Some(cluster_id) = state.config.cluster_id.as_deref() else {
        debug!("cluster_id not configured; skipping cluster node state write");
        return Ok(());
    };

    if cluster_id.trim().is_empty() {
        debug!("cluster_id is empty; skipping cluster node state write");
        return Ok(());
    }

    let cluster_namespace = state.config.cluster_namespace.trim();
    if cluster_namespace.is_empty() {
        debug!("cluster_namespace is empty; skipping cluster node state write");
        return Ok(());
    }

    // Merge config labels with phone-home metadata (metadata wins), then drop
    // the role/services entries that the handler injected into metadata —
    // those are represented by dedicated record fields instead.
    let mut labels = node_config.labels.clone();
    for (key, value) in &node_info.metadata {
        labels.insert(key.clone(), value.clone());
    }
    labels.remove("role");
    labels.remove("services");

    // Single-role list: config role first, metadata role as fallback;
    // blank roles are omitted entirely.
    let mut roles = Vec::new();
    if !node_config.role.trim().is_empty() {
        roles.push(node_config.role.clone());
    } else if let Some(role) = node_info.metadata.get("role") {
        if !role.trim().is_empty() {
            roles.push(role.clone());
        }
    }

    let record = ClusterNodeRecord {
        node_id: node_info.id.clone(),
        machine_id: Some(machine_id.to_string()),
        ip: node_info.ip.clone(),
        hostname: node_info.hostname.clone(),
        roles,
        labels,
        pool: node_config.pool.clone(),
        node_class: node_config.node_class.clone(),
        failure_domain: node_config.failure_domain.clone(),
        nix_profile: node_config.nix_profile.clone(),
        install_plan: node_config.install_plan.clone(),
        hardware_facts: hardware_facts.cloned(),
        // Debug-format of NodeState, lowercased (e.g. "provisioning").
        state: Some(format!("{:?}", node_info.state).to_lowercase()),
        // Presence of hardware facts marks the node as discovered; an
        // install plan marks the install as pending.
        commission_state: hardware_facts.map(|_| CommissionState::Discovered),
        install_state: node_config.install_plan.as_ref().map(|_| InstallState::Pending),
        commissioned_at: None,
        last_inventory_hash: inventory_hash(hardware_facts),
        // Power state parsed from metadata; unrecognized strings map to None.
        power_state: node_info
            .metadata
            .get("power_state")
            .and_then(|value| match value.as_str() {
                "on" => Some(PowerState::On),
                "off" => Some(PowerState::Off),
                "cycling" => Some(PowerState::Cycling),
                "unknown" => Some(PowerState::Unknown),
                _ => None,
            }),
        bmc_ref: node_info.metadata.get("bmc_ref").cloned(),
        last_heartbeat: Some(node_info.last_heartbeat),
    };

    if let Some(local_storage) = &state.local_storage {
        let mut storage = local_storage.lock().await;
        if let Err(e) =
            storage.store_cluster_node(cluster_namespace, cluster_id, &node_info.id, &record)
        {
            warn!(error = %e, "Failed to store cluster node in local storage");
        }
    }

    if let Some(storage_mutex) = &state.storage {
        let mut storage = storage_mutex.lock().await;
        if let Err(e) = storage
            .store_cluster_node(cluster_namespace, cluster_id, &node_info.id, &record)
            .await
        {
            warn!(error = %e, "Failed to store cluster node in ChainFire");
        }
    } else if state.local_storage.is_none() {
        debug!("ChainFire storage unavailable; skipping cluster node state write");
    }

    Ok(())
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::Config;
    use crate::state::AppState;
    use axum::http::HeaderMap;
    use std::collections::HashMap;

    /// Headers carrying the bootstrap token expected by `test_state()`.
    fn test_headers() -> HeaderMap {
        let mut headers = HeaderMap::new();
        headers.insert("x-deployer-token", "test-token".parse().unwrap());
        headers
    }

    /// Minimal in-memory AppState with only the bootstrap token configured.
    fn test_state() -> Arc<AppState> {
        let mut config = Config::default();
        config.bootstrap_token = Some("test-token".to_string());
        Arc::new(AppState::with_config(config))
    }

    /// Pre-registered machine: phone-home returns the stored config and
    /// records the node in the in-memory node map.
    #[tokio::test]
    async fn test_phone_home_known_machine() {
        let state = test_state();

        // Pre-register a machine
        let config = NodeConfig {
            hostname: "node01".to_string(),
            role: "control-plane".to_string(),
            ip: "10.0.1.10".to_string(),
            services: vec!["chainfire".to_string(), "flaredb".to_string()],
            ssh_authorized_keys: vec![],
            labels: HashMap::new(),
            pool: None,
            node_class: None,
            failure_domain: None,
            nix_profile: None,
            install_plan: None,
        };
        state.machine_configs.write().await.insert(
            "test-machine-01".to_string(),
            ("node01".to_string(), config),
        );

        let request = PhoneHomeRequest {
            machine_id: "test-machine-01".to_string(),
            node_id: None,
            hostname: None,
            ip: None,
            cluster_config_hash: None,
            metadata: HashMap::new(),
            hardware_facts: None,
        };

        let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
        assert!(result.is_ok());

        let response = result.unwrap().0;
        assert!(response.success);
        assert_eq!(response.node_id, "node01");
        assert_eq!(response.state, NodeState::Provisioning);
        assert!(response.node_config.is_some());
        // No local storage configured, so no SSH host key is issued.
        assert!(response.ssh_host_key.is_none());

        let config = response.node_config.unwrap();
        assert_eq!(config.hostname, "node01");
        assert_eq!(config.role, "control-plane");

        // Verify node was stored
        let nodes = state.nodes.read().await;
        assert!(nodes.contains_key("node01"));
    }

    /// Unknown machine with `allow_unknown_nodes = true`: gets a synthesized
    /// "node-<machine-id>" identity and the default worker role.
    #[tokio::test]
    async fn test_phone_home_unknown_machine() {
        let mut config = Config::default();
        config.bootstrap_token = Some("test-token".to_string());
        config.allow_unknown_nodes = true;
        let state = Arc::new(AppState::with_config(config));

        let request = PhoneHomeRequest {
            machine_id: "unknown-machine-xyz".to_string(),
            node_id: None,
            hostname: None,
            ip: Some("10.0.1.100".to_string()),
            cluster_config_hash: None,
            metadata: HashMap::new(),
            hardware_facts: None,
        };

        let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
        assert!(result.is_ok());

        let response = result.unwrap().0;
        assert!(response.success);
        assert!(response.node_id.starts_with("node-"));
        assert_eq!(response.state, NodeState::Provisioning);
        assert!(response.node_config.is_some());

        let config = response.node_config.unwrap();
        assert_eq!(config.role, "worker"); // Default role
    }

    /// Pre-registered config with role/IP set: the stored values win over
    /// request defaults.
    #[tokio::test]
    async fn test_phone_home_with_preregistered_config() {
        let state = test_state();

        // Pre-register a machine
        let config = NodeConfig {
            hostname: "my-node".to_string(),
            role: "storage".to_string(),
            ip: "10.0.2.50".to_string(),
            services: vec!["lightningstor".to_string()],
            ssh_authorized_keys: vec![],
            labels: HashMap::new(),
            pool: None,
            node_class: None,
            failure_domain: None,
            nix_profile: None,
            install_plan: None,
        };
        state.machine_configs.write().await.insert(
            "preregistered-123".to_string(),
            ("my-node".to_string(), config),
        );

        let request = PhoneHomeRequest {
            machine_id: "preregistered-123".to_string(),
            node_id: None,
            hostname: None,
            ip: None,
            cluster_config_hash: None,
            metadata: HashMap::new(),
            hardware_facts: None,
        };

        let result = phone_home(State(state.clone()), test_headers(), Json(request)).await;
        assert!(result.is_ok());

        let response = result.unwrap().0;
        assert!(response.success);
        assert_eq!(response.node_id, "my-node");

        let config = response.node_config.unwrap();
        assert_eq!(config.role, "storage");
        assert_eq!(config.ip, "10.0.2.50");
    }

    /// A rule with hostname-prefix, IP-prefix, and label criteria matches a
    /// request satisfying all three (AND semantics).
    #[test]
    fn test_enrollment_rule_matching() {
        let rule = EnrollmentRuleSpec {
            name: "gpu".to_string(),
            priority: 10,
            match_labels: HashMap::from([("sku".to_string(), "gpu".to_string())]),
            match_hostname_prefix: Some("gpu-".to_string()),
            match_ip_prefixes: vec!["10.0.3.".to_string()],
            pool: Some("gpu".to_string()),
            node_class: Some("gpu-worker".to_string()),
            role: None,
            labels: HashMap::new(),
            nix_profile: None,
            install_plan: None,
            services: vec![],
            ssh_authorized_keys: vec![],
            node_id_prefix: Some("gpu".to_string()),
        };

        let request = PhoneHomeRequest {
            machine_id: "machine-1".to_string(),
            node_id: None,
            hostname: Some("gpu-node-01".to_string()),
            ip: Some("10.0.3.25".to_string()),
            cluster_config_hash: None,
            metadata: HashMap::from([("sku".to_string(), "gpu".to_string())]),
            hardware_facts: None,
        };

        assert!(enrollment_rule_matches(&rule, &request));
    }

    /// Rule → pool → node class inheritance: role, nix profile, install plan,
    /// label layering, and failure-domain fallback from the topology zone.
    #[test]
    fn test_build_node_config_from_rule_inherits_class_and_pool() {
        let rule = EnrollmentRuleSpec {
            name: "gpu".to_string(),
            priority: 10,
            match_labels: HashMap::new(),
            match_hostname_prefix: None,
            match_ip_prefixes: vec![],
            pool: Some("gpu".to_string()),
            node_class: None,
            role: None,
            labels: HashMap::from([("accelerator".to_string(), "nvidia".to_string())]),
            nix_profile: None,
            install_plan: None,
            services: vec!["gpu-agent".to_string()],
            ssh_authorized_keys: vec!["ssh-ed25519 test".to_string()],
            node_id_prefix: Some("gpu".to_string()),
        };
        let request = PhoneHomeRequest {
            machine_id: "abcdef123456".to_string(),
            node_id: None,
            hostname: Some("gpu-dyn-01".to_string()),
            ip: Some("10.0.9.10".to_string()),
            cluster_config_hash: None,
            metadata: HashMap::from([(
                "topology.kubernetes.io/zone".to_string(),
                "rack-z".to_string(),
            )]),
            hardware_facts: None,
        };
        let node_classes = vec![NodeClassSpec {
            name: "gpu-worker".to_string(),
            description: None,
            nix_profile: Some("profiles/gpu-worker".to_string()),
            install_plan: Some(InstallPlan {
                nixos_configuration: Some("gpu-worker".to_string()),
                disko_config_path: Some("profiles/gpu-worker/disko.nix".to_string()),
                target_disk: Some("/dev/disk/by-id/nvme-gpu-worker".to_string()),
                target_disk_by_id: None,
            }),
            roles: vec!["worker".to_string()],
            labels: HashMap::from([("tier".to_string(), "gpu".to_string())]),
        }];
        let pools = vec![NodePoolSpec {
            name: "gpu".to_string(),
            description: None,
            node_class: Some("gpu-worker".to_string()),
            min_size: None,
            max_size: None,
            labels: HashMap::from([("pool-kind".to_string(), "accelerated".to_string())]),
        }];

        let (node_id, config) = build_node_config_from_rule(&rule, &request, &node_classes, &pools);

        assert_eq!(node_id, "gpu-dyn-01");
        assert_eq!(config.role, "worker");
        assert_eq!(config.pool.as_deref(), Some("gpu"));
        assert_eq!(config.node_class.as_deref(), Some("gpu-worker"));
        assert_eq!(config.nix_profile.as_deref(), Some("profiles/gpu-worker"));
        let install_plan = config
            .install_plan
            .expect("install_plan should inherit from class");
        assert_eq!(
            install_plan.nixos_configuration.as_deref(),
            Some("gpu-worker")
        );
        assert_eq!(
            install_plan.disko_config_path.as_deref(),
            Some("profiles/gpu-worker/disko.nix")
        );
        assert_eq!(config.labels.get("tier").map(String::as_str), Some("gpu"));
        assert_eq!(
            config.labels.get("pool-kind").map(String::as_str),
            Some("accelerated")
        );
        assert_eq!(
            config.labels.get("accelerator").map(String::as_str),
            Some("nvidia")
        );
        assert_eq!(config.failure_domain.as_deref(), Some("rack-z"));
    }
}
|