Refresh service status from node agent

This commit is contained in:
centra 2026-04-01 23:07:42 +09:00
parent 0a5c823134
commit faabcbfc2e
Signed by: centra
GPG key ID: 0C09689D20B25ACA
5 changed files with 834 additions and 333 deletions

View file

@ -1,6 +1,6 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::collections::HashMap; use std::collections::{HashMap, HashSet};
/// Node lifecycle state /// Node lifecycle state
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@ -980,6 +980,391 @@ pub struct ServiceStatusRecord {
pub observed_at: Option<DateTime<Utc>>, pub observed_at: Option<DateTime<Utc>>,
} }
/// Summarized readiness of a service's declared dependencies.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct ServiceDependencySummary {
    // True exactly when `blockers` is empty, i.e. every declared dependency is satisfied.
    pub dependencies_ready: bool,
    // Human-readable reasons why the service cannot start yet (one entry per unmet requirement).
    pub blockers: Vec<String>,
}
/// Resolve the pool a node belongs to.
///
/// Resolution order: the explicit `pool` field first, then the plain
/// `pool` label, finally the namespaced `pool.photoncloud.io/name` label.
pub fn cluster_node_pool(node: &ClusterNodeRecord) -> Option<&str> {
    if let Some(pool) = node.pool.as_deref() {
        return Some(pool);
    }
    for label_key in ["pool", "pool.photoncloud.io/name"] {
        if let Some(pool) = node.labels.get(label_key) {
            return Some(pool.as_str());
        }
    }
    None
}
/// Resolve the node class of a node.
///
/// Resolution order: the explicit `node_class` field first, then the
/// plain `node_class` label, finally the namespaced
/// `nodeclass.photoncloud.io/name` label.
pub fn cluster_node_class(node: &ClusterNodeRecord) -> Option<&str> {
    if let Some(class) = node.node_class.as_deref() {
        return Some(class);
    }
    for label_key in ["node_class", "nodeclass.photoncloud.io/name"] {
        if let Some(class) = node.labels.get(label_key) {
            return Some(class.as_str());
        }
    }
    None
}
/// Whether `node` may host instances under `placement`.
///
/// A node qualifies only when all of the following hold:
/// - its lifecycle state is `"active"`;
/// - its last heartbeat is within `heartbeat_timeout_secs` (a timeout of 0
///   disables the freshness check; a missing heartbeat fails it);
/// - it satisfies every non-empty placement constraint: roles, pools,
///   node classes, and exact `match_labels` entries.
pub fn cluster_node_is_eligible(
    node: &ClusterNodeRecord,
    placement: &PlacementPolicy,
    heartbeat_timeout_secs: u64,
) -> bool {
    // Only active nodes are schedulable at all.
    if node.state.as_deref() != Some("active") {
        return false;
    }
    if heartbeat_timeout_secs > 0 {
        // A node that never reported a heartbeat is treated as stale.
        let Some(last) = node.last_heartbeat else {
            return false;
        };
        let age = Utc::now().signed_duration_since(last).num_seconds();
        if age > heartbeat_timeout_secs as i64 {
            return false;
        }
    }
    // Role constraint: the node must carry at least one of the requested roles.
    if !placement.roles.is_empty()
        && !node
            .roles
            .iter()
            .any(|role| placement.roles.iter().any(|expected| expected == role))
    {
        return false;
    }
    // Pool constraint: resolved from field or labels (see cluster_node_pool);
    // a node without a resolvable pool fails a non-empty pool constraint.
    if !placement.pools.is_empty()
        && !cluster_node_pool(node)
            .map(|pool| placement.pools.iter().any(|expected| expected == pool))
            .unwrap_or(false)
    {
        return false;
    }
    // Node-class constraint: same shape as the pool check.
    if !placement.node_classes.is_empty()
        && !cluster_node_class(node)
            .map(|node_class| {
                placement
                    .node_classes
                    .iter()
                    .any(|expected| expected == node_class)
            })
            .unwrap_or(false)
    {
        return false;
    }
    // Finally, every match_labels entry must be present with an equal value.
    placement
        .match_labels
        .iter()
        .all(|(key, value)| node.labels.get(key) == Some(value))
}
/// Return references to the nodes that satisfy `placement` and have a
/// sufficiently fresh heartbeat (see [`cluster_node_is_eligible`]).
pub fn eligible_cluster_nodes<'a>(
    nodes: &'a [ClusterNodeRecord],
    placement: &PlacementPolicy,
    heartbeat_timeout_secs: u64,
) -> Vec<&'a ClusterNodeRecord> {
    let mut eligible = Vec::new();
    for node in nodes {
        if cluster_node_is_eligible(node, placement, heartbeat_timeout_secs) {
            eligible.push(node);
        }
    }
    eligible
}
/// Whether `instance` reported a heartbeat (falling back to its
/// observation time) within the allowed window.
///
/// A timeout of zero disables the freshness check entirely; an instance
/// with neither timestamp is considered stale.
pub fn service_instance_has_fresh_heartbeat(
    instance: &ServiceInstanceSpec,
    heartbeat_timeout_secs: u64,
) -> bool {
    if heartbeat_timeout_secs == 0 {
        return true;
    }
    match instance.last_heartbeat.or(instance.observed_at) {
        Some(seen_at) => {
            let age_secs = Utc::now().signed_duration_since(seen_at).num_seconds();
            age_secs <= heartbeat_timeout_secs as i64
        }
        None => false,
    }
}
/// An instance counts as available when it is both reported `"healthy"`
/// and its heartbeat is fresh per
/// [`service_instance_has_fresh_heartbeat`].
pub fn service_instance_is_available(
    instance: &ServiceInstanceSpec,
    heartbeat_timeout_secs: u64,
) -> bool {
    let reported_healthy = instance.state.as_deref() == Some("healthy");
    reported_healthy && service_instance_has_fresh_heartbeat(instance, heartbeat_timeout_secs)
}
/// A publication is ready once the service is reachable through DNS
/// (at least one record value) or through a load balancer that has been
/// assigned a VIP address.
pub fn service_publication_is_ready(state: &ServicePublicationState) -> bool {
    let dns_ready = matches!(state.dns.as_ref(), Some(dns) if !dns.values.is_empty());
    let lb_ready = state
        .load_balancer
        .as_ref()
        .map(|load_balancer| load_balancer.vip_address.is_some())
        .unwrap_or(false);
    dns_ready || lb_ready
}
/// Return the names of all services that participate in a dependency cycle.
///
/// Dependencies on services that are not declared in `services` are ignored
/// (they cannot form a cycle within this set). Detection is a depth-first
/// search where `visiting` holds the current DFS path and `permanent` the
/// set of fully-explored services.
pub fn compute_service_dependency_cycles(services: &[ServiceSpec]) -> HashSet<String> {
    let service_names = services
        .iter()
        .map(|service| service.name.clone())
        .collect::<HashSet<_>>();
    // Adjacency list restricted to dependencies that actually exist.
    let dependencies = services
        .iter()
        .map(|service| {
            let deps = service
                .depends_on
                .iter()
                .filter(|dependency| service_names.contains(&dependency.service))
                .map(|dependency| dependency.service.clone())
                .collect::<Vec<_>>();
            (service.name.clone(), deps)
        })
        .collect::<HashMap<_, _>>();
    let mut permanent = HashSet::new();
    let mut visiting = Vec::<String>::new();
    let mut cycles = HashSet::new();
    // Start a DFS from every service so disconnected subgraphs are covered.
    for service in services {
        visit_service_dependency_cycles(
            &service.name,
            &dependencies,
            &mut permanent,
            &mut visiting,
            &mut cycles,
        );
    }
    cycles
}
/// Recursive DFS step for cycle detection.
///
/// `visiting` is the current DFS path; finding `service` already on it
/// means a back edge, and everything from its first occurrence onward is
/// part of a cycle. Fully-explored services are recorded in `permanent`
/// so they are never revisited.
fn visit_service_dependency_cycles(
    service: &str,
    dependencies: &HashMap<String, Vec<String>>,
    permanent: &mut HashSet<String>,
    visiting: &mut Vec<String>,
    cycles: &mut HashSet<String>,
) {
    // Already fully explored — nothing new to discover.
    if permanent.contains(service) {
        return;
    }
    // Back edge: the tail of the path from the first occurrence is cyclic.
    if let Some(start) = visiting.iter().position(|name| name == service) {
        for name in &visiting[start..] {
            cycles.insert(name.clone());
        }
        return;
    }
    visiting.push(service.to_owned());
    for dependency in dependencies.get(service).into_iter().flatten() {
        visit_service_dependency_cycles(dependency, dependencies, permanent, visiting, cycles);
    }
    visiting.pop();
    permanent.insert(service.to_owned());
}
/// Evaluate the declared dependencies of `service` and report whether it
/// may start, with a human-readable blocker for every unmet requirement.
///
/// Checks per dependency, in declaration order (first failure per
/// dependency wins via `continue`):
/// - self-dependency (always a blocker);
/// - dependency missing from `services_by_name`;
/// - dependency participating in a cycle (per `dependency_cycles`);
/// - `Healthy`: fewer than `min_ready` (floored at 1) available instances;
/// - `Published`: no ready publication for the dependency.
pub fn summarize_service_dependencies(
    service: &ServiceSpec,
    services_by_name: &HashMap<&str, &ServiceSpec>,
    instances_by_service: &HashMap<String, Vec<ServiceInstanceSpec>>,
    publications: &HashMap<String, ServicePublicationState>,
    dependency_cycles: &HashSet<String>,
    heartbeat_timeout_secs: u64,
) -> ServiceDependencySummary {
    let mut blockers = Vec::new();
    // The service itself being part of a cycle blocks it outright.
    if dependency_cycles.contains(&service.name) {
        blockers.push("cyclic dependency graph".to_string());
    }
    for dependency in &service.depends_on {
        if dependency.service == service.name {
            blockers.push(format!(
                "dependency {} points to itself",
                dependency.service
            ));
            continue;
        }
        if !services_by_name.contains_key(dependency.service.as_str()) {
            blockers.push(format!("dependency {} is not defined", dependency.service));
            continue;
        }
        if dependency_cycles.contains(&dependency.service) {
            blockers.push(format!(
                "dependency {} is part of a dependency cycle",
                dependency.service
            ));
            continue;
        }
        match dependency.condition {
            ServiceDependencyCondition::Healthy => {
                // Count instances that are healthy AND have a fresh heartbeat.
                let ready = instances_by_service
                    .get(&dependency.service)
                    .map(|instances| {
                        instances
                            .iter()
                            .filter(|instance| {
                                service_instance_is_available(instance, heartbeat_timeout_secs)
                            })
                            .count() as u32
                    })
                    .unwrap_or(0);
                // A min_ready of 0 is treated as 1: "healthy" implies at least one instance.
                let min_ready = dependency.min_ready.max(1);
                if ready < min_ready {
                    blockers.push(format!(
                        "dependency {} has {ready}/{min_ready} healthy instance(s)",
                        dependency.service
                    ));
                }
            }
            ServiceDependencyCondition::Published => {
                let ready = publications
                    .get(&dependency.service)
                    .map(service_publication_is_ready)
                    .unwrap_or(false);
                if !ready {
                    blockers.push(format!(
                        "dependency {} is not published",
                        dependency.service
                    ));
                }
            }
        }
    }
    ServiceDependencySummary {
        dependencies_ready: blockers.is_empty(),
        blockers,
    }
}
/// How many instances the scheduler should run for `service`.
///
/// Unscheduled services want zero. Replicated services want the
/// configured replica count; daemon services want one instance per
/// currently eligible node.
pub fn desired_service_instance_count(
    service: &ServiceSpec,
    nodes: &[ClusterNodeRecord],
    heartbeat_timeout_secs: u64,
) -> u32 {
    match service.schedule.as_ref() {
        None => 0,
        Some(schedule) => match schedule.mode {
            ServiceScheduleMode::Replicated => schedule.replicas,
            ServiceScheduleMode::Daemon => {
                let eligible =
                    eligible_cluster_nodes(nodes, &schedule.placement, heartbeat_timeout_secs);
                eligible.len() as u32
            }
        },
    }
}
/// Build the persisted status record for one service from current cluster
/// observations.
///
/// The phase is the first matching rule, in priority order:
/// `unmanaged` (no schedule) -> `blocked` (unmet dependencies) ->
/// `unschedulable` (no eligible nodes) -> `idle` (nothing desired or
/// scheduled) -> `healthy` (desired count satisfied) -> `degraded`
/// (partial progress) -> `pending` (nothing running yet).
pub fn build_service_status_record(
    service: &ServiceSpec,
    nodes: &[ClusterNodeRecord],
    services_by_name: &HashMap<&str, &ServiceSpec>,
    instances_by_service: &HashMap<String, Vec<ServiceInstanceSpec>>,
    publications: &HashMap<String, ServicePublicationState>,
    dependency_cycles: &HashSet<String>,
    heartbeat_timeout_secs: u64,
) -> ServiceStatusRecord {
    let instances = instances_by_service
        .get(&service.name)
        .map(Vec::as_slice)
        .unwrap_or(&[]);
    // Healthy = reported healthy AND heartbeat within the timeout window.
    let healthy_instances = instances
        .iter()
        .filter(|instance| service_instance_is_available(instance, heartbeat_timeout_secs))
        .count() as u32;
    let scheduled_instances = instances.len() as u32;
    let desired_instances = desired_service_instance_count(service, nodes, heartbeat_timeout_secs);
    let publish_ready = publications
        .get(&service.name)
        .map(service_publication_is_ready)
        .unwrap_or(false);
    let dependencies = summarize_service_dependencies(
        service,
        services_by_name,
        instances_by_service,
        publications,
        dependency_cycles,
        heartbeat_timeout_secs,
    );
    // Unscheduled services have no placement policy, hence zero eligible nodes.
    let eligible_node_count = service
        .schedule
        .as_ref()
        .map(|schedule| {
            eligible_cluster_nodes(nodes, &schedule.placement, heartbeat_timeout_secs).len()
        })
        .unwrap_or(0);
    let (phase, message) = if service.schedule.is_none() {
        (
            "unmanaged".to_string(),
            Some("service is not managed by fleet-scheduler".to_string()),
        )
    } else if !dependencies.dependencies_ready {
        (
            "blocked".to_string(),
            Some(dependencies.blockers.join("; ")),
        )
    } else if eligible_node_count == 0 {
        (
            "unschedulable".to_string(),
            Some("no eligible nodes match the service placement policy".to_string()),
        )
    } else if desired_instances == 0 && scheduled_instances == 0 {
        (
            "idle".to_string(),
            Some("service has no desired instances".to_string()),
        )
    } else if desired_instances > 0 && healthy_instances >= desired_instances {
        (
            "healthy".to_string(),
            Some(format!(
                "healthy instances satisfy desired count ({healthy_instances}/{desired_instances})"
            )),
        )
    } else if scheduled_instances > 0 || healthy_instances > 0 {
        // Some instances exist but the desired count is not (yet) met.
        (
            "degraded".to_string(),
            Some(format!(
                "healthy={healthy_instances} scheduled={scheduled_instances} desired={desired_instances}"
            )),
        )
    } else {
        (
            "pending".to_string(),
            Some(format!(
                "waiting for instances to reach desired count ({healthy_instances}/{desired_instances})"
            )),
        )
    };
    ServiceStatusRecord {
        service: service.name.clone(),
        phase,
        desired_instances,
        scheduled_instances,
        healthy_instances,
        publish_ready,
        dependencies_ready: dependencies.dependencies_ready,
        blockers: dependencies.blockers,
        message,
        observed_at: Some(Utc::now()),
    }
}
/// mTLS policy definition. /// mTLS policy definition.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct MtlsPolicySpec { pub struct MtlsPolicySpec {
@ -1253,6 +1638,207 @@ mod tests {
assert_eq!(decoded.dns.unwrap().fqdn, "api.test.cluster.local"); assert_eq!(decoded.dns.unwrap().fqdn, "api.test.cluster.local");
} }
// Test fixture: an active worker node with a fresh heartbeat so it passes
// eligibility checks. node01 gets 10.0.0.11; any other id gets 10.0.0.12.
fn active_cluster_node(node_id: &str) -> ClusterNodeRecord {
    ClusterNodeRecord {
        node_id: node_id.to_string(),
        machine_id: None,
        ip: format!("10.0.0.{}", if node_id == "node01" { 11 } else { 12 }),
        hostname: node_id.to_string(),
        roles: vec!["worker".to_string()],
        labels: HashMap::from([("tier".to_string(), "general".to_string())]),
        pool: Some("general".to_string()),
        node_class: Some("worker-linux".to_string()),
        failure_domain: None,
        nix_profile: None,
        install_plan: None,
        hardware_facts: None,
        state: Some("active".to_string()),
        commission_state: None,
        install_state: None,
        commissioned_at: None,
        last_inventory_hash: None,
        power_state: None,
        bmc_ref: None,
        last_heartbeat: Some(Utc::now()),
    }
}
// Test fixture: a replicated service (2 replicas) placed on worker nodes,
// serving HTTP on 8080, with no dependencies or publication.
fn scheduled_service_named(name: &str) -> ServiceSpec {
    ServiceSpec {
        name: name.to_string(),
        ports: Some(ServicePorts {
            http: Some(8080),
            grpc: None,
        }),
        protocol: None,
        mtls_required: None,
        mesh_mode: None,
        depends_on: Vec::new(),
        schedule: Some(ServiceScheduleSpec {
            mode: ServiceScheduleMode::Replicated,
            replicas: 2,
            placement: PlacementPolicy {
                roles: vec!["worker".to_string()],
                pools: Vec::new(),
                node_classes: Vec::new(),
                match_labels: HashMap::new(),
                spread_by_label: None,
                max_instances_per_node: 1,
            },
            rollout: RolloutStrategySpec::default(),
            instance_port: Some(8080),
            mesh_port: None,
            process: None,
            container: None,
            health_check: None,
        }),
        publish: None,
    }
}
// Test fixture: a healthy, freshly-heartbeating instance of `service` on
// `node_id`, with the IP matching active_cluster_node's addressing scheme.
fn healthy_service_instance(service: &str, node_id: &str) -> ServiceInstanceSpec {
    ServiceInstanceSpec {
        instance_id: format!("{service}-{node_id}"),
        service: service.to_string(),
        node_id: node_id.to_string(),
        ip: if node_id == "node01" {
            "10.0.0.11".to_string()
        } else {
            "10.0.0.12".to_string()
        },
        port: 8080,
        mesh_port: None,
        version: None,
        health_check: None,
        process: None,
        container: None,
        managed_by: Some("fleet-scheduler".to_string()),
        state: Some("healthy".to_string()),
        last_heartbeat: Some(Utc::now()),
        observed_at: Some(Utc::now()),
    }
}
// api -> worker -> edge -> api forms a 3-cycle; all three services must
// be reported as cyclic.
#[test]
fn test_compute_service_dependency_cycles_detects_full_cycle() {
    let mut api = scheduled_service_named("api");
    api.depends_on = vec![ServiceDependencySpec {
        service: "worker".to_string(),
        condition: ServiceDependencyCondition::Healthy,
        min_ready: 1,
    }];
    let mut worker = scheduled_service_named("worker");
    worker.depends_on = vec![ServiceDependencySpec {
        service: "edge".to_string(),
        condition: ServiceDependencyCondition::Healthy,
        min_ready: 1,
    }];
    let mut edge = scheduled_service_named("edge");
    edge.depends_on = vec![ServiceDependencySpec {
        service: "api".to_string(),
        condition: ServiceDependencyCondition::Healthy,
        min_ready: 1,
    }];
    let cycles = compute_service_dependency_cycles(&[api, worker, edge]);
    assert_eq!(
        cycles,
        HashSet::from(["api".to_string(), "worker".to_string(), "edge".to_string()])
    );
}
// worker requires 2 healthy api instances but only 1 of 2 is healthy
// (the other is "starting"), so the summary must report a blocker.
#[test]
fn test_summarize_service_dependencies_blocks_until_dependency_is_healthy() {
    let api = scheduled_service_named("api");
    let mut worker = scheduled_service_named("worker");
    worker.depends_on = vec![ServiceDependencySpec {
        service: "api".to_string(),
        condition: ServiceDependencyCondition::Healthy,
        min_ready: 2,
    }];
    let services = vec![api.clone(), worker.clone()];
    let services_by_name = services
        .iter()
        .map(|service| (service.name.as_str(), service))
        .collect::<HashMap<_, _>>();
    let instances_by_service = HashMap::from([(
        "api".to_string(),
        vec![
            healthy_service_instance("api", "node01"),
            // Second instance exists but is not yet healthy.
            ServiceInstanceSpec {
                state: Some("starting".to_string()),
                ..healthy_service_instance("api", "node02")
            },
        ],
    )]);
    let summary = summarize_service_dependencies(
        &worker,
        &services_by_name,
        &instances_by_service,
        &HashMap::new(),
        &HashSet::new(),
        300,
    );
    assert!(!summary.dependencies_ready);
    assert_eq!(
        summary.blockers,
        vec!["dependency api has 1/2 healthy instance(s)".to_string()]
    );
}
// worker depends on api being Published; a publication with a VIP (no DNS)
// satisfies it, so the status is unblocked ("pending", no worker instances).
#[test]
fn test_build_service_status_record_recognizes_published_dependency() {
    let api = scheduled_service_named("api");
    let mut worker = scheduled_service_named("worker");
    worker.depends_on = vec![ServiceDependencySpec {
        service: "api".to_string(),
        condition: ServiceDependencyCondition::Published,
        min_ready: 1,
    }];
    let services = vec![api.clone(), worker.clone()];
    let services_by_name = services
        .iter()
        .map(|service| (service.name.as_str(), service))
        .collect::<HashMap<_, _>>();
    let publications = HashMap::from([(
        "api".to_string(),
        ServicePublicationState {
            service: "api".to_string(),
            org_id: "default-org".to_string(),
            project_id: "default-project".to_string(),
            load_balancer: Some(PublishedLoadBalancerState {
                id: "lb-1".to_string(),
                pool_id: "pool-1".to_string(),
                listener_id: "listener-1".to_string(),
                vip_address: Some("203.0.113.10".to_string()),
            }),
            dns: None,
            observed_at: Some(Utc::now()),
        },
    )]);
    let nodes = vec![active_cluster_node("node01")];
    let status = build_service_status_record(
        &worker,
        &nodes,
        &services_by_name,
        &HashMap::new(),
        &publications,
        &HashSet::new(),
        300,
    );
    assert_eq!(status.phase, "pending");
    assert!(status.dependencies_ready);
    assert!(status.blockers.is_empty());
}
#[test] #[test]
fn test_observed_system_state_roundtrip() { fn test_observed_system_state_roundtrip() {
let observed = ObservedSystemState { let observed = ObservedSystemState {

View file

@ -3,11 +3,13 @@ mod publish;
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use chainfire_client::Client; use chainfire_client::Client;
use chrono::Utc;
use clap::Parser; use clap::Parser;
use deployer_types::{ use deployer_types::{
ClusterNodeRecord, PlacementPolicy, ServiceDependencyCondition, ServiceInstanceSpec, build_service_status_record, cluster_node_class, cluster_node_pool,
ServicePublicationState, ServiceScheduleMode, ServiceSpec, ServiceStatusRecord, compute_service_dependency_cycles, eligible_cluster_nodes, service_instance_is_available,
summarize_service_dependencies, ClusterNodeRecord, PlacementPolicy, ServiceDependencySummary,
ServiceInstanceSpec, ServicePublicationState, ServiceScheduleMode, ServiceSpec,
ServiceStatusRecord,
}; };
use publish::{PublicationConfig, PublicationReconciler}; use publish::{PublicationConfig, PublicationReconciler};
use serde_json::Value; use serde_json::Value;
@ -17,6 +19,11 @@ use tokio::time::sleep;
use tracing::{debug, info, warn}; use tracing::{debug, info, warn};
use tracing_subscriber::EnvFilter; use tracing_subscriber::EnvFilter;
#[cfg(test)]
use chrono::Utc;
#[cfg(test)]
use deployer_types::cluster_node_is_eligible;
const MANAGED_BY: &str = "fleet-scheduler"; const MANAGED_BY: &str = "fleet-scheduler";
#[derive(Debug, Parser)] #[derive(Debug, Parser)]
@ -91,11 +98,7 @@ struct ReconcilePlan {
deferred_deletes: usize, deferred_deletes: usize,
} }
#[derive(Debug, Default)] type DependencySummary = ServiceDependencySummary;
struct DependencySummary {
dependencies_ready: bool,
blockers: Vec<String>,
}
impl Scheduler { impl Scheduler {
fn new(cli: Cli) -> Self { fn new(cli: Cli) -> Self {
@ -519,64 +522,7 @@ fn service_status_key(cluster_namespace: &str, cluster_id: &str, service: &str)
} }
fn dependency_cycle_services(services: &[ServiceSpec]) -> HashSet<String> { fn dependency_cycle_services(services: &[ServiceSpec]) -> HashSet<String> {
let service_names = services compute_service_dependency_cycles(services)
.iter()
.map(|service| service.name.clone())
.collect::<HashSet<_>>();
let dependencies = services
.iter()
.map(|service| {
let deps = service
.depends_on
.iter()
.filter(|dependency| service_names.contains(&dependency.service))
.map(|dependency| dependency.service.clone())
.collect::<Vec<_>>();
(service.name.clone(), deps)
})
.collect::<HashMap<_, _>>();
let mut permanent = HashSet::new();
let mut visiting = Vec::<String>::new();
let mut cycles = HashSet::new();
for service in services {
visit_dependency_cycles(
&service.name,
&dependencies,
&mut permanent,
&mut visiting,
&mut cycles,
);
}
cycles
}
fn visit_dependency_cycles(
service: &str,
dependencies: &HashMap<String, Vec<String>>,
permanent: &mut HashSet<String>,
visiting: &mut Vec<String>,
cycles: &mut HashSet<String>,
) {
if permanent.contains(service) {
return;
}
if let Some(position) = visiting.iter().position(|current| current == service) {
cycles.extend(visiting[position..].iter().cloned());
return;
}
visiting.push(service.to_string());
if let Some(depends_on) = dependencies.get(service) {
for dependency in depends_on {
visit_dependency_cycles(dependency, dependencies, permanent, visiting, cycles);
}
}
visiting.pop();
permanent.insert(service.to_string());
} }
fn dependency_summary( fn dependency_summary(
@ -587,103 +533,14 @@ fn dependency_summary(
dependency_cycles: &HashSet<String>, dependency_cycles: &HashSet<String>,
heartbeat_timeout_secs: u64, heartbeat_timeout_secs: u64,
) -> DependencySummary { ) -> DependencySummary {
let mut blockers = Vec::new(); summarize_service_dependencies(
service,
if dependency_cycles.contains(&service.name) { services_by_name,
blockers.push("cyclic dependency graph".to_string()); instances_by_service,
} publications,
dependency_cycles,
for dependency in &service.depends_on { heartbeat_timeout_secs,
if dependency.service == service.name { )
blockers.push(format!(
"dependency {} points to itself",
dependency.service
));
continue;
}
if !services_by_name.contains_key(dependency.service.as_str()) {
blockers.push(format!("dependency {} is not defined", dependency.service));
continue;
}
if dependency_cycles.contains(&dependency.service) {
blockers.push(format!(
"dependency {} is part of a dependency cycle",
dependency.service
));
continue;
}
match dependency.condition {
ServiceDependencyCondition::Healthy => {
let ready = instances_by_service
.get(&dependency.service)
.map(|instances| {
instances
.iter()
.filter(|instance| {
instance_is_available(instance, heartbeat_timeout_secs)
})
.count() as u32
})
.unwrap_or(0);
let min_ready = dependency.min_ready.max(1);
if ready < min_ready {
blockers.push(format!(
"dependency {} has {ready}/{min_ready} healthy instance(s)",
dependency.service
));
}
}
ServiceDependencyCondition::Published => {
let ready = publications
.get(&dependency.service)
.map(publication_ready)
.unwrap_or(false);
if !ready {
blockers.push(format!(
"dependency {} is not published",
dependency.service
));
}
}
}
}
DependencySummary {
dependencies_ready: blockers.is_empty(),
blockers,
}
}
fn publication_ready(state: &ServicePublicationState) -> bool {
state
.dns
.as_ref()
.map(|dns| !dns.values.is_empty())
.unwrap_or(false)
|| state
.load_balancer
.as_ref()
.and_then(|load_balancer| load_balancer.vip_address.as_ref())
.is_some()
}
fn desired_instance_count(
service: &ServiceSpec,
nodes: &[ClusterNodeRecord],
heartbeat_timeout_secs: u64,
) -> u32 {
let Some(schedule) = service.schedule.as_ref() else {
return 0;
};
match schedule.mode {
ServiceScheduleMode::Replicated => schedule.replicas,
ServiceScheduleMode::Daemon => {
eligible_nodes(nodes, &schedule.placement, heartbeat_timeout_secs).len() as u32
}
}
} }
fn build_service_status( fn build_service_status(
@ -695,89 +552,15 @@ fn build_service_status(
dependency_cycles: &HashSet<String>, dependency_cycles: &HashSet<String>,
heartbeat_timeout_secs: u64, heartbeat_timeout_secs: u64,
) -> ServiceStatusRecord { ) -> ServiceStatusRecord {
let instances = instances_by_service build_service_status_record(
.get(&service.name)
.map(Vec::as_slice)
.unwrap_or(&[]);
let healthy_instances = instances
.iter()
.filter(|instance| instance_is_available(instance, heartbeat_timeout_secs))
.count() as u32;
let scheduled_instances = instances.len() as u32;
let desired_instances = desired_instance_count(service, nodes, heartbeat_timeout_secs);
let publish_ready = publications
.get(&service.name)
.map(publication_ready)
.unwrap_or(false);
let dependencies = dependency_summary(
service, service,
nodes,
services_by_name, services_by_name,
instances_by_service, instances_by_service,
publications, publications,
dependency_cycles, dependency_cycles,
heartbeat_timeout_secs, heartbeat_timeout_secs,
);
let eligible_node_count = service
.schedule
.as_ref()
.map(|schedule| eligible_nodes(nodes, &schedule.placement, heartbeat_timeout_secs).len())
.unwrap_or(0);
let (phase, message) = if service.schedule.is_none() {
(
"unmanaged".to_string(),
Some("service is not managed by fleet-scheduler".to_string()),
) )
} else if !dependencies.dependencies_ready {
(
"blocked".to_string(),
Some(dependencies.blockers.join("; ")),
)
} else if eligible_node_count == 0 {
(
"unschedulable".to_string(),
Some("no eligible nodes match the service placement policy".to_string()),
)
} else if desired_instances == 0 && scheduled_instances == 0 {
(
"idle".to_string(),
Some("service has no desired instances".to_string()),
)
} else if desired_instances > 0 && healthy_instances >= desired_instances {
(
"healthy".to_string(),
Some(format!(
"healthy instances satisfy desired count ({healthy_instances}/{desired_instances})"
)),
)
} else if scheduled_instances > 0 || healthy_instances > 0 {
(
"degraded".to_string(),
Some(format!(
"healthy={healthy_instances} scheduled={scheduled_instances} desired={desired_instances}"
)),
)
} else {
(
"pending".to_string(),
Some(format!(
"waiting for instances to reach desired count ({healthy_instances}/{desired_instances})"
)),
)
};
ServiceStatusRecord {
service: service.name.clone(),
phase,
desired_instances,
scheduled_instances,
healthy_instances,
publish_ready,
dependencies_ready: dependencies.dependencies_ready,
blockers: dependencies.blockers,
message,
observed_at: Some(Utc::now()),
}
} }
fn eligible_nodes<'a>( fn eligible_nodes<'a>(
@ -785,65 +568,16 @@ fn eligible_nodes<'a>(
placement: &PlacementPolicy, placement: &PlacementPolicy,
heartbeat_timeout_secs: u64, heartbeat_timeout_secs: u64,
) -> Vec<&'a ClusterNodeRecord> { ) -> Vec<&'a ClusterNodeRecord> {
nodes eligible_cluster_nodes(nodes, placement, heartbeat_timeout_secs)
.iter()
.filter(|node| node_is_eligible(node, placement, heartbeat_timeout_secs))
.collect()
} }
#[cfg(test)]
fn node_is_eligible( fn node_is_eligible(
node: &ClusterNodeRecord, node: &ClusterNodeRecord,
placement: &PlacementPolicy, placement: &PlacementPolicy,
heartbeat_timeout_secs: u64, heartbeat_timeout_secs: u64,
) -> bool { ) -> bool {
if node.state.as_deref() != Some("active") { cluster_node_is_eligible(node, placement, heartbeat_timeout_secs)
return false;
}
if heartbeat_timeout_secs > 0 {
let Some(last) = node.last_heartbeat else {
return false;
};
let age = Utc::now().signed_duration_since(last).num_seconds();
if age > heartbeat_timeout_secs as i64 {
return false;
}
}
if !placement.roles.is_empty()
&& !node
.roles
.iter()
.any(|role| placement.roles.iter().any(|expected| expected == role))
{
return false;
}
if !placement.pools.is_empty()
&& !node_pool(node)
.map(|pool| placement.pools.iter().any(|expected| expected == pool))
.unwrap_or(false)
{
return false;
}
if !placement.node_classes.is_empty()
&& !node_class(node)
.map(|node_class| {
placement
.node_classes
.iter()
.any(|expected| expected == node_class)
})
.unwrap_or(false)
{
return false;
}
placement
.match_labels
.iter()
.all(|(key, value)| node.labels.get(key) == Some(value))
} }
fn build_desired_instances( fn build_desired_instances(
@ -1104,25 +838,11 @@ fn spread_value(node: &ClusterNodeRecord, spread_by_label: Option<&str>) -> Stri
} }
fn node_pool(node: &ClusterNodeRecord) -> Option<&str> { fn node_pool(node: &ClusterNodeRecord) -> Option<&str> {
node.pool cluster_node_pool(node)
.as_deref()
.or_else(|| node.labels.get("pool").map(String::as_str))
.or_else(|| {
node.labels
.get("pool.photoncloud.io/name")
.map(String::as_str)
})
} }
fn node_class(node: &ClusterNodeRecord) -> Option<&str> { fn node_class(node: &ClusterNodeRecord) -> Option<&str> {
node.node_class cluster_node_class(node)
.as_deref()
.or_else(|| node.labels.get("node_class").map(String::as_str))
.or_else(|| {
node.labels
.get("nodeclass.photoncloud.io/name")
.map(String::as_str)
})
} }
fn resolve_instance_port(service: &ServiceSpec) -> Option<u16> { fn resolve_instance_port(service: &ServiceSpec) -> Option<u16> {
@ -1288,26 +1008,7 @@ fn plan_managed_reconciliation(
} }
fn instance_is_available(instance: &ServiceInstanceSpec, heartbeat_timeout_secs: u64) -> bool { fn instance_is_available(instance: &ServiceInstanceSpec, heartbeat_timeout_secs: u64) -> bool {
matches!(instance.state.as_deref(), Some("healthy")) service_instance_is_available(instance, heartbeat_timeout_secs)
&& instance_has_fresh_heartbeat(instance, heartbeat_timeout_secs)
}
fn instance_has_fresh_heartbeat(
instance: &ServiceInstanceSpec,
heartbeat_timeout_secs: u64,
) -> bool {
if heartbeat_timeout_secs == 0 {
return true;
}
let Some(last_heartbeat) = instance.last_heartbeat.or(instance.observed_at) else {
return false;
};
Utc::now()
.signed_duration_since(last_heartbeat)
.num_seconds()
<= heartbeat_timeout_secs as i64
} }
fn instance_is_reusable(instance: &ServiceInstanceSpec) -> bool { fn instance_is_reusable(instance: &ServiceInstanceSpec) -> bool {

View file

@ -7,7 +7,11 @@ use std::time::Duration;
use anyhow::{Context, Result}; use anyhow::{Context, Result};
use chainfire_client::{Client, ClientError}; use chainfire_client::{Client, ClientError};
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use deployer_types::{ContainerSpec, HealthCheckSpec, ProcessSpec, ServiceInstanceSpec}; use deployer_types::{
build_service_status_record, compute_service_dependency_cycles, ClusterNodeRecord,
ContainerSpec, HealthCheckSpec, ProcessSpec, ServiceInstanceSpec, ServicePublicationState,
ServiceSpec,
};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use tokio::process::Command; use tokio::process::Command;
@ -44,6 +48,22 @@ fn key_instance(
.into_bytes() .into_bytes()
} }
/// Key prefix under which per-service status records are stored for a
/// cluster: `<namespace>/clusters/<cluster_id>/service-statuses/`.
fn service_status_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    format!("{cluster_namespace}/clusters/{cluster_id}/service-statuses/")
}
/// Full storage key for a single service's status record, as raw bytes:
/// the cluster status prefix followed by the service name.
fn key_service_status(cluster_namespace: &str, cluster_id: &str, service: &str) -> Vec<u8> {
    let mut key = service_status_prefix(cluster_namespace, cluster_id);
    key.push_str(service);
    key.into_bytes()
}
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize)]
pub struct NodeState { pub struct NodeState {
pub node_id: String, pub node_id: String,
@ -67,6 +87,7 @@ pub struct Agent {
cluster_id: String, cluster_id: String,
node_id: String, node_id: String,
interval: Duration, interval: Duration,
heartbeat_timeout_secs: u64,
apply: bool, apply: bool,
allow_local_instance_upsert: bool, allow_local_instance_upsert: bool,
process_manager: ProcessManager, process_manager: ProcessManager,
@ -98,6 +119,7 @@ impl Agent {
cluster_id: String, cluster_id: String,
node_id: String, node_id: String,
interval: Duration, interval: Duration,
heartbeat_timeout_secs: u64,
apply: bool, apply: bool,
allow_local_instance_upsert: bool, allow_local_instance_upsert: bool,
pid_dir: PathBuf, pid_dir: PathBuf,
@ -108,6 +130,7 @@ impl Agent {
cluster_id, cluster_id,
node_id, node_id,
interval, interval,
heartbeat_timeout_secs,
apply, apply,
allow_local_instance_upsert, allow_local_instance_upsert,
process_manager: ProcessManager::new(pid_dir), process_manager: ProcessManager::new(pid_dir),
@ -193,6 +216,10 @@ impl Agent {
info!("apply=false; skipping process reconciliation and health checks"); info!("apply=false; skipping process reconciliation and health checks");
} }
if let Err(e) = self.persist_service_statuses(&mut client).await {
warn!(error = %e, "failed to persist aggregated service statuses");
}
self.log_node_only(&node); self.log_node_only(&node);
Ok(()) Ok(())
@ -457,6 +484,156 @@ impl Agent {
Ok(()) Ok(())
} }
/// Fetch and decode every cluster node record under `<cluster>/nodes/`.
///
/// Entries that fail to decode are logged and skipped rather than
/// aborting the load. Results are sorted by node id so downstream
/// iteration is deterministic.
async fn load_cluster_nodes(&self, client: &mut Client) -> Result<Vec<ClusterNodeRecord>> {
    let prefix = format!(
        "{}nodes/",
        cluster_prefix(&self.cluster_namespace, &self.cluster_id)
    );
    let kvs = client.get_prefix(prefix.as_bytes()).await?;
    let mut nodes = Vec::with_capacity(kvs.len());
    for (_key, value) in kvs {
        match serde_json::from_slice::<ClusterNodeRecord>(&value) {
            Ok(node) => nodes.push(node),
            Err(error) => warn!(error = %error, "failed to decode cluster node"),
        }
    }
    nodes.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
    Ok(nodes)
}
/// Fetch and decode every service spec under `<cluster>/services/`.
///
/// Undecodable entries are logged and skipped; results are sorted by
/// service name for deterministic processing order.
async fn load_services(&self, client: &mut Client) -> Result<Vec<ServiceSpec>> {
    let prefix = format!(
        "{}services/",
        cluster_prefix(&self.cluster_namespace, &self.cluster_id)
    );
    let kvs = client.get_prefix(prefix.as_bytes()).await?;
    let mut services = Vec::with_capacity(kvs.len());
    for (_key, value) in kvs {
        match serde_json::from_slice::<ServiceSpec>(&value) {
            Ok(service) => services.push(service),
            Err(error) => warn!(error = %error, "failed to decode service spec"),
        }
    }
    services.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
    Ok(services)
}
/// Fetch every service instance under `<cluster>/instances/` and group
/// them by service name.
///
/// Undecodable entries are logged and skipped. Each service's instance
/// list is sorted by instance id for deterministic iteration.
async fn load_instances_by_service(
    &self,
    client: &mut Client,
) -> Result<HashMap<String, Vec<ServiceInstanceSpec>>> {
    let prefix = format!(
        "{}instances/",
        cluster_prefix(&self.cluster_namespace, &self.cluster_id)
    );
    let kvs = client.get_prefix(prefix.as_bytes()).await?;
    let mut instances = HashMap::<String, Vec<ServiceInstanceSpec>>::new();
    for (_key, value) in kvs {
        match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
            Ok(instance) => {
                // Group by the service name embedded in the instance record.
                instances
                    .entry(instance.service.clone())
                    .or_default()
                    .push(instance);
            }
            Err(error) => warn!(error = %error, "failed to decode service instance"),
        }
    }
    for service_instances in instances.values_mut() {
        service_instances.sort_by(|lhs, rhs| lhs.instance_id.cmp(&rhs.instance_id));
    }
    Ok(instances)
}
/// Fetch every publication state under `<cluster>/publications/`, keyed
/// by service name. Undecodable entries are logged and skipped.
async fn load_publications(
    &self,
    client: &mut Client,
) -> Result<HashMap<String, ServicePublicationState>> {
    let prefix = format!(
        "{}publications/",
        cluster_prefix(&self.cluster_namespace, &self.cluster_id)
    );
    let kvs = client.get_prefix(prefix.as_bytes()).await?;
    let mut publications = HashMap::with_capacity(kvs.len());
    for (_key, value) in kvs {
        match serde_json::from_slice::<ServicePublicationState>(&value) {
            Ok(publication) => {
                publications.insert(publication.service.clone(), publication);
            }
            Err(error) => warn!(error = %error, "failed to decode service publication"),
        }
    }
    Ok(publications)
}
async fn persist_service_statuses(&self, client: &mut Client) -> Result<()> {
let nodes = self.load_cluster_nodes(client).await?;
let services = self.load_services(client).await?;
let instances_by_service = self.load_instances_by_service(client).await?;
let publications = self.load_publications(client).await?;
let dependency_cycles = compute_service_dependency_cycles(&services);
let services_by_name = services
.iter()
.map(|service| (service.name.as_str(), service))
.collect::<HashMap<_, _>>();
for service in &services {
let status = build_service_status_record(
service,
&nodes,
&services_by_name,
&instances_by_service,
&publications,
&dependency_cycles,
self.heartbeat_timeout_secs,
);
let key = key_service_status(&self.cluster_namespace, &self.cluster_id, &service.name);
client.put(&key, serde_json::to_vec(&status)?).await?;
}
self.cleanup_stale_service_statuses(client, &services)
.await?;
Ok(())
}
async fn cleanup_stale_service_statuses(
&self,
client: &mut Client,
services: &[ServiceSpec],
) -> Result<()> {
let desired = services
.iter()
.map(|service| service.name.as_str())
.collect::<HashSet<_>>();
let prefix = service_status_prefix(&self.cluster_namespace, &self.cluster_id);
let existing = client.get_prefix(prefix.as_bytes()).await?;
for (key, _) in existing {
let key_str = String::from_utf8_lossy(&key);
let Some(service_name) = key_str.strip_prefix(&prefix) else {
continue;
};
if service_name.is_empty()
|| service_name.contains('/')
|| desired.contains(service_name)
{
continue;
}
let _ = client.delete(&key).await?;
}
Ok(())
}
/// Desired Stateに基づいてプロセスを起動/停止する /// Desired Stateに基づいてプロセスを起動/停止する
async fn reconcile_processes(&mut self, client: &mut Client) -> Result<()> { async fn reconcile_processes(&mut self, client: &mut Client) -> Result<()> {
let prefix = format!( let prefix = format!(
@ -771,6 +948,7 @@ mod tests {
"test-cluster".to_string(), "test-cluster".to_string(),
"node01".to_string(), "node01".to_string(),
Duration::from_secs(1), Duration::from_secs(1),
300,
false, false,
false, false,
PathBuf::from("/tmp/photoncloud-node-agent-tests"), PathBuf::from("/tmp/photoncloud-node-agent-tests"),

View file

@ -36,6 +36,10 @@ struct Cli {
#[arg(long, default_value_t = 15)] #[arg(long, default_value_t = 15)]
interval_secs: u64, interval_secs: u64,
/// service status 集約時に利用する heartbeat 許容秒数
#[arg(long, default_value_t = 300)]
heartbeat_timeout_secs: u64,
/// PIDファイル出力ディレクトリ /// PIDファイル出力ディレクトリ
#[arg(long, default_value = "/var/run/photoncloud")] #[arg(long, default_value = "/var/run/photoncloud")]
pid_dir: String, pid_dir: String,
@ -73,6 +77,7 @@ async fn main() -> Result<()> {
cli.cluster_id, cli.cluster_id,
cli.node_id, cli.node_id,
Duration::from_secs(cli.interval_secs), Duration::from_secs(cli.interval_secs),
cli.heartbeat_timeout_secs,
cli.apply, cli.apply,
cli.allow_local_instance_upsert, cli.allow_local_instance_upsert,
std::path::PathBuf::from(cli.pid_dir), std::path::PathBuf::from(cli.pid_dir),

View file

@ -461,6 +461,23 @@ for _ in 1 2 3; do
sleep 1 sleep 1
done done
run_deployer_ctl service inspect --name api --include-instances >"$tmp_dir/api-inspect-healthy.json"
python3 - "$tmp_dir/api-inspect-healthy.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
instances = payload.get("instances") or []
if status.get("phase") != "healthy":
raise SystemExit(f"expected api inspect phase=healthy before scheduler rerun, found {status.get('phase')}")
if len(instances) != 2:
raise SystemExit(f"expected 2 api instances from service inspect, found {len(instances)}")
print("api service inspect refreshed to healthy from node-agent updates")
PY
echo "Re-running scheduler after api became healthy" echo "Re-running scheduler after api became healthy"
run_scheduler_once run_scheduler_once
@ -557,9 +574,6 @@ if states != ["healthy", "healthy"]:
print("Observed two healthy dependent worker instances across node01 and node02") print("Observed two healthy dependent worker instances across node01 and node02")
PY PY
echo "Refreshing aggregated service status after worker became healthy"
run_scheduler_once
run_deployer_ctl service inspect --name worker --include-instances >"$tmp_dir/worker-inspect-healthy.json" run_deployer_ctl service inspect --name worker --include-instances >"$tmp_dir/worker-inspect-healthy.json"
python3 - "$tmp_dir/worker-inspect-healthy.json" <<'PY' python3 - "$tmp_dir/worker-inspect-healthy.json" <<'PY'
import json import json
@ -663,6 +677,23 @@ if instance.get("state") != "healthy":
print("Observed one healthy dependent worker instance on node01 after scale-down") print("Observed one healthy dependent worker instance on node01 after scale-down")
PY PY
run_deployer_ctl service inspect --name worker --include-instances >"$tmp_dir/worker-inspect-scaled.json"
python3 - "$tmp_dir/worker-inspect-scaled.json" <<'PY'
import json
import sys
payload = json.load(open(sys.argv[1], "r", encoding="utf-8"))
status = payload.get("status") or {}
instances = payload.get("instances") or []
if status.get("phase") != "healthy":
raise SystemExit(f"expected scaled worker inspect phase=healthy, found {status.get('phase')}")
if len(instances) != 1:
raise SystemExit(f"expected 1 scaled worker instance from service inspect, found {len(instances)}")
print("service inspect reports scaled healthy worker state without waiting for scheduler status refresh")
PY
echo "Validating endpoint convergence after scale-down" echo "Validating endpoint convergence after scale-down"
python3 - <<'PY' python3 - <<'PY'
import socket import socket