Add daemon scheduling for native services
Some checks failed
Nix CI / filter (push) Failing after 1s
Nix CI / gate () (push) Has been skipped
Nix CI / gate (shared crates) (push) Has been skipped
Nix CI / build () (push) Has been skipped
Nix CI / ci-status (push) Failing after 1s

This commit is contained in:
centra 2026-03-30 21:31:32 +09:00
parent ce979d8f26
commit 37f5479ab8
Signed by: centra
GPG key ID: 0C09689D20B25ACA
8 changed files with 355 additions and 8 deletions

View file

@ -303,6 +303,10 @@ fn default_service_replicas() -> u32 {
1 1
} }
/// Serde default for `ServiceScheduleSpec.mode`.
///
/// Delegates to the enum's derived `Default` (`Replicated`), so the serde
/// default and `ServiceScheduleMode::default()` can never disagree.
fn default_service_schedule_mode() -> ServiceScheduleMode {
    ServiceScheduleMode::default()
}
fn default_rollout_max_unavailable() -> u32 { fn default_rollout_max_unavailable() -> u32 {
1 1
} }
@ -433,9 +437,22 @@ impl Default for RolloutStrategySpec {
} }
} }
/// Scheduling mode for native runtime services.
///
/// Serialized in `snake_case`, so the on-the-wire / config values are
/// `"replicated"` and `"daemon"`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "snake_case")]
pub enum ServiceScheduleMode {
    /// Keep a fixed replica count and place instances across eligible nodes.
    #[default]
    Replicated,
    /// Run one instance on every eligible node.
    Daemon,
}
/// Scheduler-specific service intent used by the non-Kubernetes fleet scheduler. /// Scheduler-specific service intent used by the non-Kubernetes fleet scheduler.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ServiceScheduleSpec { pub struct ServiceScheduleSpec {
#[serde(default = "default_service_schedule_mode")]
pub mode: ServiceScheduleMode,
#[serde(default = "default_service_replicas")] #[serde(default = "default_service_replicas")]
pub replicas: u32, pub replicas: u32,
#[serde(default)] #[serde(default)]
@ -457,6 +474,7 @@ pub struct ServiceScheduleSpec {
impl Default for ServiceScheduleSpec { impl Default for ServiceScheduleSpec {
fn default() -> Self { fn default() -> Self {
Self { Self {
mode: default_service_schedule_mode(),
replicas: default_service_replicas(), replicas: default_service_replicas(),
placement: PlacementPolicy::default(), placement: PlacementPolicy::default(),
rollout: RolloutStrategySpec::default(), rollout: RolloutStrategySpec::default(),
@ -1100,6 +1118,7 @@ mod tests {
#[test] #[test]
fn test_service_schedule_defaults() { fn test_service_schedule_defaults() {
let schedule = ServiceScheduleSpec::default(); let schedule = ServiceScheduleSpec::default();
assert_eq!(schedule.mode, ServiceScheduleMode::Replicated);
assert_eq!(schedule.replicas, 1); assert_eq!(schedule.replicas, 1);
assert_eq!(schedule.placement.max_instances_per_node, 1); assert_eq!(schedule.placement.max_instances_per_node, 1);
} }

View file

@ -5,7 +5,9 @@ use anyhow::{Context, Result};
use chainfire_client::Client; use chainfire_client::Client;
use chrono::Utc; use chrono::Utc;
use clap::Parser; use clap::Parser;
use deployer_types::{ClusterNodeRecord, PlacementPolicy, ServiceInstanceSpec, ServiceSpec}; use deployer_types::{
ClusterNodeRecord, PlacementPolicy, ServiceInstanceSpec, ServiceScheduleMode, ServiceSpec,
};
use publish::{PublicationConfig, PublicationReconciler}; use publish::{PublicationConfig, PublicationReconciler};
use serde_json::Value; use serde_json::Value;
use std::collections::{BTreeMap, HashMap, HashSet}; use std::collections::{BTreeMap, HashMap, HashSet};
@ -220,12 +222,14 @@ impl Scheduler {
let existing_instances = decode_managed_instances(&existing); let existing_instances = decode_managed_instances(&existing);
let desired_instances = let desired_instances =
build_desired_instances(service, &eligible_nodes, &existing_instances)?; build_desired_instances(service, &eligible_nodes, &existing_instances)?;
if desired_instances.len() < schedule.replicas as usize { let target_instances = schedule_target_count(schedule.mode, eligible_nodes.len(), schedule.replicas);
if desired_instances.len() < target_instances {
warn!( warn!(
service = %service.name, service = %service.name,
requested = schedule.replicas, requested = target_instances,
scheduled = desired_instances.len(), scheduled = desired_instances.len(),
"insufficient eligible node capacity for requested replicas" mode = ?schedule.mode,
"insufficient eligible node capacity for requested service instances"
); );
} }
@ -407,6 +411,26 @@ fn build_desired_instances(
service: &ServiceSpec, service: &ServiceSpec,
eligible_nodes: &[&ClusterNodeRecord], eligible_nodes: &[&ClusterNodeRecord],
existing_instances: &[ServiceInstanceSpec], existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> {
let schedule = service
.schedule
.as_ref()
.context("scheduled service missing schedule block")?;
match schedule.mode {
ServiceScheduleMode::Replicated => {
build_replicated_desired_instances(service, eligible_nodes, existing_instances)
}
ServiceScheduleMode::Daemon => {
build_daemon_desired_instances(service, eligible_nodes, existing_instances)
}
}
}
fn build_replicated_desired_instances(
service: &ServiceSpec,
eligible_nodes: &[&ClusterNodeRecord],
existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> { ) -> Result<Vec<ServiceInstanceSpec>> {
let schedule = service let schedule = service
.schedule .schedule
@ -486,6 +510,59 @@ fn build_desired_instances(
Ok(desired) Ok(desired)
} }
/// Builds the desired instance set for a daemon-mode service: exactly one
/// instance per eligible node, reusing an existing instance id on a node when
/// a reusable one is already recorded there.
///
/// Returns an error when the service defines neither `instance_port` nor any
/// service ports.
fn build_daemon_desired_instances(
    service: &ServiceSpec,
    eligible_nodes: &[&ClusterNodeRecord],
    existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> {
    let port = resolve_instance_port(service).with_context(|| {
        format!(
            "service {} is missing instance_port and service ports",
            service.name
        )
    })?;

    let eligible_node_ids: HashSet<&str> = eligible_nodes
        .iter()
        .map(|node| node.node_id.as_str())
        .collect();

    // Candidates for reuse: instances still sitting on an eligible node and
    // considered reusable. Sorted so that, within a node, the best-ranked
    // (then lexically smallest) instance id comes first.
    let mut candidates: Vec<&ServiceInstanceSpec> = existing_instances
        .iter()
        .filter(|instance| {
            eligible_node_ids.contains(instance.node_id.as_str())
                && instance_is_reusable(instance)
        })
        .collect();
    candidates.sort_by(|a, b| {
        a.node_id
            .cmp(&b.node_id)
            .then_with(|| instance_state_rank(a).cmp(&instance_state_rank(b)))
            .then_with(|| a.instance_id.cmp(&b.instance_id))
    });

    // Group reusable instance ids by node, preserving the preference order.
    let mut pool: BTreeMap<String, Vec<String>> = BTreeMap::new();
    for candidate in candidates {
        pool.entry(candidate.node_id.clone())
            .or_default()
            .push(candidate.instance_id.clone());
    }

    // One desired instance per eligible node: take the preferred existing id
    // when the node has one, otherwise mint the ordinal-0 id.
    let mut desired = Vec::with_capacity(eligible_nodes.len());
    for node in eligible_nodes {
        let instance_id = pool
            .get_mut(&node.node_id)
            .filter(|ids| !ids.is_empty())
            .map(|ids| ids.remove(0))
            .unwrap_or_else(|| render_instance_id(&service.name, &node.node_id, 0));
        desired.push(desired_instance(service, node, &instance_id, port));
    }
    Ok(desired)
}
fn desired_instance( fn desired_instance(
service: &ServiceSpec, service: &ServiceSpec,
node: &ClusterNodeRecord, node: &ClusterNodeRecord,
@ -618,6 +695,13 @@ fn resolve_instance_port(service: &ServiceSpec) -> Option<u16> {
.or_else(|| service.ports.as_ref().and_then(|ports| ports.grpc)) .or_else(|| service.ports.as_ref().and_then(|ports| ports.grpc))
} }
/// Number of instances the scheduler should aim for under the given mode.
///
/// * `Replicated` — the configured replica count, independent of node count.
/// * `Daemon` — one instance per currently eligible node, ignoring `replicas`.
fn schedule_target_count(mode: ServiceScheduleMode, eligible_nodes: usize, replicas: u32) -> usize {
    match mode {
        ServiceScheduleMode::Replicated => replicas as usize,
        ServiceScheduleMode::Daemon => eligible_nodes,
    }
}
fn render_instance_id(service: &str, node_id: &str, ordinal: u32) -> String { fn render_instance_id(service: &str, node_id: &str, ordinal: u32) -> String {
if ordinal == 0 { if ordinal == 0 {
format!("{service}-{node_id}") format!("{service}-{node_id}")
@ -680,7 +764,7 @@ fn plan_managed_reconciliation(
.schedule .schedule
.as_ref() .as_ref()
.context("scheduled service missing schedule block")?; .context("scheduled service missing schedule block")?;
let desired_replicas = schedule.replicas as usize; let desired_replicas = desired_instances.len();
let min_available = desired_replicas.saturating_sub(schedule.rollout.max_unavailable as usize); let min_available = desired_replicas.saturating_sub(schedule.rollout.max_unavailable as usize);
let max_total_instances = desired_replicas + schedule.rollout.max_surge as usize; let max_total_instances = desired_replicas + schedule.rollout.max_surge as usize;
let mut available_count = existing_instances let mut available_count = existing_instances
@ -878,7 +962,7 @@ mod tests {
use chrono::Duration as ChronoDuration; use chrono::Duration as ChronoDuration;
use deployer_types::{ use deployer_types::{
ClusterNodeRecord, HealthCheckSpec, PlacementPolicy, ProcessSpec, RolloutStrategySpec, ClusterNodeRecord, HealthCheckSpec, PlacementPolicy, ProcessSpec, RolloutStrategySpec,
ServicePorts, ServiceScheduleSpec, ServicePorts, ServiceScheduleMode, ServiceScheduleSpec,
}; };
fn active_node(node_id: &str, roles: &[&str], labels: &[(&str, &str)]) -> ClusterNodeRecord { fn active_node(node_id: &str, roles: &[&str], labels: &[(&str, &str)]) -> ClusterNodeRecord {
@ -920,6 +1004,7 @@ mod tests {
mtls_required: None, mtls_required: None,
mesh_mode: None, mesh_mode: None,
schedule: Some(ServiceScheduleSpec { schedule: Some(ServiceScheduleSpec {
mode: ServiceScheduleMode::Replicated,
replicas: 2, replicas: 2,
placement: PlacementPolicy { placement: PlacementPolicy {
roles: vec!["worker".to_string()], roles: vec!["worker".to_string()],
@ -1013,6 +1098,79 @@ mod tests {
assert_eq!(desired[1].instance_id, "api-node01-2"); assert_eq!(desired[1].instance_id, "api-node01-2");
} }
#[test]
fn test_build_desired_instances_daemon_places_on_every_eligible_node() {
    // Two eligible worker nodes, no pre-existing instances.
    let nodes = vec![
        active_node("node01", &["worker"], &[("tier", "general")]),
        active_node("node02", &["worker"], &[("tier", "general")]),
    ];
    let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
    let mut service = scheduled_service();
    let schedule = service.schedule.as_mut().unwrap();
    schedule.mode = ServiceScheduleMode::Daemon;
    schedule.replicas = 1;
    // Daemon mode must ignore `replicas` (1) and place one instance per node,
    // minting the ordinal-0 id form `<service>-<node_id>` for each.
    let desired = build_desired_instances(&service, &refs, &[]).unwrap();
    assert_eq!(desired.len(), 2);
    assert_eq!(desired[0].instance_id, "api-node01");
    assert_eq!(desired[1].instance_id, "api-node02");
}
#[test]
fn test_build_desired_instances_daemon_reuses_existing_instance_per_node() {
    let nodes = vec![
        active_node("node01", &["worker"], &[("tier", "general")]),
        active_node("node02", &["worker"], &[("tier", "general")]),
    ];
    let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
    let mut service = scheduled_service();
    let schedule = service.schedule.as_mut().unwrap();
    schedule.mode = ServiceScheduleMode::Daemon;
    schedule.replicas = 1;
    // Healthy managed instances already exist on both nodes. node01's carries
    // a non-zero-ordinal id ("api-node01-2") to prove that the existing id is
    // reused verbatim rather than regenerated as "api-node01".
    let existing = vec![
        ServiceInstanceSpec {
            instance_id: "api-node01-2".to_string(),
            service: "api".to_string(),
            node_id: "node01".to_string(),
            ip: "10.0.0.1".to_string(),
            port: 8080,
            mesh_port: Some(18080),
            version: None,
            health_check: None,
            process: None,
            container: None,
            managed_by: Some(MANAGED_BY.to_string()),
            state: Some("healthy".to_string()),
            last_heartbeat: None,
            observed_at: None,
        },
        ServiceInstanceSpec {
            instance_id: "api-node02".to_string(),
            service: "api".to_string(),
            node_id: "node02".to_string(),
            ip: "10.0.0.2".to_string(),
            port: 8080,
            mesh_port: Some(18080),
            version: None,
            health_check: None,
            process: None,
            container: None,
            managed_by: Some(MANAGED_BY.to_string()),
            state: Some("healthy".to_string()),
            last_heartbeat: None,
            observed_at: None,
        },
    ];
    // Each node keeps its existing instance id; nothing new is minted.
    let desired = build_desired_instances(&service, &refs, &existing).unwrap();
    assert_eq!(desired.len(), 2);
    assert_eq!(desired[0].instance_id, "api-node01-2");
    assert_eq!(desired[1].instance_id, "api-node02");
}
#[test] #[test]
fn test_pick_next_node_prefers_less_used_failure_domain() { fn test_pick_next_node_prefers_less_used_failure_domain() {
let nodes = vec![ let nodes = vec![
@ -1281,6 +1439,76 @@ mod tests {
assert_eq!(plan.deletes, vec!["api-node01".to_string()]); assert_eq!(plan.deletes, vec!["api-node01".to_string()]);
} }
#[test]
fn test_plan_reconciliation_for_daemon_uses_current_eligible_node_count() {
    let mut service = scheduled_service();
    let schedule = service.schedule.as_mut().unwrap();
    schedule.mode = ServiceScheduleMode::Daemon;
    // Deliberately absurd replica count: daemon-mode reconciliation must size
    // its availability math from the desired instance set, not `replicas`.
    schedule.replicas = 99;
    schedule.rollout = RolloutStrategySpec {
        max_unavailable: 0,
        max_surge: 0,
    };
    // Two healthy managed instances are currently recorded (node01, node02)...
    let existing_instances = vec![
        ServiceInstanceSpec {
            instance_id: "api-node01".to_string(),
            service: "api".to_string(),
            node_id: "node01".to_string(),
            ip: "10.0.0.1".to_string(),
            port: 8080,
            mesh_port: Some(18080),
            version: None,
            health_check: None,
            process: schedule.process.clone(),
            container: None,
            managed_by: Some(MANAGED_BY.to_string()),
            state: Some("healthy".to_string()),
            last_heartbeat: None,
            observed_at: None,
        },
        ServiceInstanceSpec {
            instance_id: "api-node02".to_string(),
            service: "api".to_string(),
            node_id: "node02".to_string(),
            ip: "10.0.0.2".to_string(),
            port: 8080,
            mesh_port: Some(18080),
            version: None,
            health_check: None,
            process: schedule.process.clone(),
            container: None,
            managed_by: Some(MANAGED_BY.to_string()),
            state: Some("healthy".to_string()),
            last_heartbeat: None,
            observed_at: None,
        },
    ];
    let existing = existing_instances
        .iter()
        .map(|instance| {
            (
                instance.instance_id.clone(),
                serde_json::to_value(instance).unwrap(),
            )
        })
        .collect::<HashMap<_, _>>();
    // ...but only node02 remains desired (as if node01 became ineligible).
    let desired_instances = vec![existing_instances[1].clone()];
    let plan = plan_managed_reconciliation(
        &service,
        &desired_instances,
        &existing,
        &existing_instances,
        0,
    )
    .unwrap();
    // With max_unavailable = 0, deleting node01 is only allowed because the
    // target count is 1 (len of the desired set) rather than 99; deleting it
    // still leaves one available instance, so nothing is deferred.
    assert!(plan.upserts.is_empty());
    assert_eq!(plan.deletes, vec!["api-node01".to_string()]);
    assert_eq!(plan.deferred_deletes, 0);
}
#[test] #[test]
fn test_merge_preserved_fields_replaces_null_status_fields() { fn test_merge_preserved_fields_replaces_null_status_fields() {
let desired = serde_json::json!({ let desired = serde_json::json!({

View file

@ -545,6 +545,7 @@ async fn ensure_load_balancer(
org_id: org_id.to_string(), org_id: org_id.to_string(),
project_id: project_id.to_string(), project_id: project_id.to_string(),
description: format!("native runtime service {name}"), description: format!("native runtime service {name}"),
vip_address: String::new(),
}, },
auth_token, auth_token,
)) ))

View file

@ -423,6 +423,7 @@ async fn ensure_load_balancer(
org_id: spec.org_id.clone(), org_id: spec.org_id.clone(),
project_id: spec.project_id.clone().unwrap_or_default(), project_id: spec.project_id.clone().unwrap_or_default(),
description: spec.description.clone().unwrap_or_default(), description: spec.description.clone().unwrap_or_default(),
vip_address: String::new(),
}) })
.await? .await?
.into_inner(); .into_inner();

View file

@ -170,6 +170,7 @@ impl FiberLbController {
org_id: org_id.to_string(), org_id: org_id.to_string(),
project_id: project_id.to_string(), project_id: project_id.to_string(),
description: format!("k8s service {}/{}", namespace, name), description: format!("k8s service {}/{}", namespace, name),
vip_address: String::new(),
}; };
let lb_id = match lb_client let lb_id = match lb_client

View file

@ -445,6 +445,12 @@ let
healthCheckType = mkHealthCheckType types; healthCheckType = mkHealthCheckType types;
in types.submodule { in types.submodule {
options = { options = {
mode = mkOption {
type = types.enum [ "replicated" "daemon" ];
default = "replicated";
description = "Scheduling mode used by the native runtime service";
};
replicas = mkOption { replicas = mkOption {
type = types.ints.positive; type = types.ints.positive;
default = 1; default = 1;
@ -1253,6 +1259,7 @@ let
mkServiceScheduleSpec = schedule: mkServiceScheduleSpec = schedule:
{ {
mode = schedule.mode;
replicas = schedule.replicas; replicas = schedule.replicas;
placement = mkPlacementPolicySpec schedule.placement; placement = mkPlacementPolicySpec schedule.placement;
rollout = mkRolloutStrategySpec schedule.rollout; rollout = mkRolloutStrategySpec schedule.rollout;

View file

@ -364,6 +364,41 @@ in
}; };
}; };
}; };
# E2E fixture for daemon scheduling: with mode = "daemon" the scheduler is
# expected to run one instance of this service on every eligible node.
native-daemon = {
  protocol = "http";
  ports.http = 18193;
  schedule = {
    mode = "daemon";
    # Kept at 1 to show daemon placement is driven by node count, not by
    # the replica setting (see the scheduler's daemon-mode target count).
    replicas = 1;
    placement = {
      roles = [ "worker" ];
      pools = [ "general" ];
      nodeClasses = [ "worker-linux" ];
      matchLabels = {
        runtime = "native";
      };
      maxInstancesPerNode = 1;
    };
    instancePort = 18193;
    # Minimal HTTP server; ${...} placeholders are escaped so they are
    # substituted at instance launch time, not by Nix.
    process = {
      command = "python3";
      args = [
        "-m"
        "http.server"
        "\${INSTANCE_PORT}"
        "--bind"
        "\${INSTANCE_IP}"
      ];
    };
    healthCheck = {
      type = "http";
      path = "/";
      intervalSecs = 5;
      timeoutSecs = 3;
    };
  };
};
};
}; };
}; };

View file

@ -4969,8 +4969,8 @@ validate_native_runtime_flow() {
wait_for_native_dump_count \ wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/services/" \ "photoncloud/clusters/test-cluster/services/" \
'map(select(.name == "native-web" or .name == "native-container")) | length' \ 'map(select(.name == "native-web" or .name == "native-container" or .name == "native-daemon")) | length' \
"2" \ "3" \
180 180
wait_for_native_dump_count \ wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/nodes/" \ "photoncloud/clusters/test-cluster/nodes/" \
@ -4997,9 +4997,21 @@ validate_native_runtime_flow() {
"${native_fresh_healthy_count_expr}" \ "${native_fresh_healthy_count_expr}" \
"1" \ "1" \
360 360
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
'length' \
"2" \
300
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
"${native_fresh_healthy_count_expr}" \
"2" \
300
wait_for_http node04 "http://10.100.0.21:18190/" 240 wait_for_http node04 "http://10.100.0.21:18190/" 240
wait_for_http node05 "http://10.100.0.22:18190/" 240 wait_for_http node05 "http://10.100.0.22:18190/" 240
wait_for_http node04 "http://10.100.0.21:18193/" 240
wait_for_http node05 "http://10.100.0.22:18193/" 240
local container_value container_node container_ip container_port local container_value container_node container_ip container_port
container_value="$(native_first_healthy_instance "native-container")" container_value="$(native_first_healthy_instance "native-container")"
container_node="$(printf '%s' "${container_value}" | jq -r '.node_id')" container_node="$(printf '%s' "${container_value}" | jq -r '.node_id')"
@ -5067,6 +5079,16 @@ validate_native_runtime_flow() {
"${native_fresh_healthy_count_expr}" \ "${native_fresh_healthy_count_expr}" \
"1" \ "1" \
240 240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
'length' \
"1" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
"${native_fresh_healthy_count_expr}" \
"1" \
240
local drained_web_value drained_web_node drained_container_value drained_container_node local drained_web_value drained_web_node drained_container_value drained_container_node
drained_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)" drained_web_value="$(wait_for_native_instance_node "native-web" "node05" 240)"
drained_web_node="$(printf '%s' "${drained_web_value}" | jq -r '.node_id')" drained_web_node="$(printf '%s' "${drained_web_value}" | jq -r '.node_id')"
@ -5074,8 +5096,10 @@ validate_native_runtime_flow() {
drained_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)" drained_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
drained_container_node="$(printf '%s' "${drained_container_value}" | jq -r '.node_id')" drained_container_node="$(printf '%s' "${drained_container_value}" | jq -r '.node_id')"
[[ "${drained_container_node}" == "node05" ]] || die "native-container did not relocate to node05 after draining node04" [[ "${drained_container_node}" == "node05" ]] || die "native-container did not relocate to node05 after draining node04"
wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
wait_for_http node05 "http://10.100.0.22:18190/" 240 wait_for_http node05 "http://10.100.0.22:18190/" 240
wait_for_http node05 "http://10.100.0.22:18192/" 240 wait_for_http node05 "http://10.100.0.22:18192/" 240
wait_for_http node05 "http://10.100.0.22:18193/" 240
wait_for_http node01 "http://127.0.0.1:18191/" 240 wait_for_http node01 "http://127.0.0.1:18191/" 240
publication_value="$(native_publication_state)" publication_value="$(native_publication_state)"
publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')" publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
@ -5105,8 +5129,20 @@ validate_native_runtime_flow() {
"${native_fresh_healthy_count_expr}" \ "${native_fresh_healthy_count_expr}" \
"1" \ "1" \
240 240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
'length' \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
"${native_fresh_healthy_count_expr}" \
"2" \
240
wait_for_native_instance_node "native-web" "node04" 240 >/dev/null wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
wait_for_native_instance_node "native-web" "node05" 240 >/dev/null wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
local restored_container_value restored_container_node local restored_container_value restored_container_node
restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)" restored_container_value="$(wait_for_native_instance_node "native-container" "node05" 240)"
restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')" restored_container_node="$(printf '%s' "${restored_container_value}" | jq -r '.node_id')"
@ -5132,6 +5168,11 @@ validate_native_runtime_flow() {
"${native_fresh_healthy_count_expr}" \ "${native_fresh_healthy_count_expr}" \
"1" \ "1" \
240 240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
"${native_fresh_healthy_count_expr}" \
"1" \
240
local failover_web_value failover_web_node failover_container_value failover_container_node local failover_web_value failover_web_node failover_container_value failover_container_node
failover_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)" failover_web_value="$(wait_for_native_instance_node "native-web" "node04" 240)"
@ -5140,6 +5181,7 @@ validate_native_runtime_flow() {
failover_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)" failover_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
failover_container_node="$(printf '%s' "${failover_container_value}" | jq -r '.node_id')" failover_container_node="$(printf '%s' "${failover_container_value}" | jq -r '.node_id')"
[[ "${failover_container_node}" == "node04" ]] || die "native-container did not fail over to node04 after node05 stopped" [[ "${failover_container_node}" == "node04" ]] || die "native-container did not fail over to node04 after node05 stopped"
wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
publication_value="$(native_publication_state)" publication_value="$(native_publication_state)"
publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')" publication_pool_id="$(printf '%s' "${publication_value}" | jq -r '.load_balancer.pool_id')"
publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')" publication_ip="$(printf '%s' "${publication_value}" | jq -r '.dns.value')"
@ -5147,6 +5189,7 @@ validate_native_runtime_flow() {
wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180 wait_for_native_dns_record "${publication_fqdn}" "${publication_ip}" 180
wait_for_http node04 "http://10.100.0.21:18190/" 240 wait_for_http node04 "http://10.100.0.21:18190/" 240
wait_for_http node04 "http://10.100.0.21:18192/" 240 wait_for_http node04 "http://10.100.0.21:18192/" 240
wait_for_http node04 "http://10.100.0.21:18193/" 240
wait_for_http node01 "http://127.0.0.1:18191/" 240 wait_for_http node01 "http://127.0.0.1:18191/" 240
log "Restarting native worker and ensuring declarative replica count is restored" log "Restarting native worker and ensuring declarative replica count is restored"
@ -5175,8 +5218,20 @@ validate_native_runtime_flow() {
"${native_fresh_healthy_count_expr}" \ "${native_fresh_healthy_count_expr}" \
"1" \ "1" \
240 240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
'length' \
"2" \
240
wait_for_native_dump_count \
"photoncloud/clusters/test-cluster/instances/native-daemon/" \
"${native_fresh_healthy_count_expr}" \
"2" \
240
wait_for_native_instance_node "native-web" "node04" 240 >/dev/null wait_for_native_instance_node "native-web" "node04" 240 >/dev/null
wait_for_native_instance_node "native-web" "node05" 240 >/dev/null wait_for_native_instance_node "native-web" "node05" 240 >/dev/null
wait_for_native_instance_node "native-daemon" "node04" 240 >/dev/null
wait_for_native_instance_node "native-daemon" "node05" 240 >/dev/null
local recovered_container_value recovered_container_node local recovered_container_value recovered_container_node
recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)" recovered_container_value="$(wait_for_native_instance_node "native-container" "node04" 240)"
recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')" recovered_container_node="$(printf '%s' "${recovered_container_value}" | jq -r '.node_id')"