Add dependency-aware fleet scheduling

2026-04-01 22:22:18 +09:00 · 2026-04-01 22:22:18 +09:00 · 7450832bff
commit 7450832bff
parent b07bcb3772
4 changed files with 879 additions and 19 deletions
--- a/deployer/crates/deployer-types/src/lib.rs
+++ b/deployer/crates/deployer-types/src/lib.rs
@ -319,6 +319,14 @@ fn default_dns_ttl() -> u32 {
    30
 }
 /// Serde default for `ServiceDependencySpec::condition`: a dependency must be
 /// healthy unless the declaration explicitly asks for something else.
 fn default_service_dependency_condition() -> ServiceDependencyCondition {
    ServiceDependencyCondition::Healthy
 }
 /// Serde default for `ServiceDependencySpec::min_ready`: a single ready
 /// dependency instance is enough when the declaration does not say otherwise.
 fn default_service_dependency_min_ready() -> u32 {
    const DEFAULT_MIN_READY: u32 = 1;
    DEFAULT_MIN_READY
 }
 /// Process specification executed by node-agent for a scheduled instance.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
 pub struct ProcessSpec {
@ -542,6 +550,37 @@ pub struct ServicePublicationSpec {
    pub load_balancer: Option<LoadBalancerPublicationSpec>,
 }
 /// Readiness condition required from another service before this service can reconcile.
 ///
 /// Serialized in `snake_case` (`healthy`, `published`).
 #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
 #[serde(rename_all = "snake_case")]
 pub enum ServiceDependencyCondition {
    /// Require a minimum number of healthy dependency instances
    /// (see `ServiceDependencySpec::min_ready`). This is the default condition.
    #[default]
    Healthy,
    /// Require the dependency to publish at least one DNS or load-balancer artifact.
    Published,
 }
 /// Dependency edge between scheduler-managed services.
 ///
 /// Declared under a service's `depends_on` list; `condition` and `min_ready`
 /// may be omitted and then fall back to their serde defaults.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct ServiceDependencySpec {
    /// Name of the service this edge points at.
    pub service: String,
    /// What must be true of the dependency; defaults to `Healthy`.
    #[serde(default = "default_service_dependency_condition")]
    pub condition: ServiceDependencyCondition,
    /// Minimum number of ready dependency instances; defaults to 1.
    #[serde(default = "default_service_dependency_min_ready")]
    pub min_ready: u32,
 }
 impl Default for ServiceDependencySpec {
    /// Builds an edge with an empty target name and the same `condition` /
    /// `min_ready` values serde substitutes for omitted fields, so the
    /// programmatic default and the deserialized default cannot drift apart.
    fn default() -> Self {
        let condition = default_service_dependency_condition();
        let min_ready = default_service_dependency_min_ready();
        Self {
            service: String::new(),
            condition,
            min_ready,
        }
    }
 }
 /// Cluster node record stored under photoncloud/clusters/{cluster_id}/nodes/{node_id}.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct ClusterNodeRecord {
@ -849,6 +888,8 @@ pub struct ServiceSpec {
    #[serde(default)]
    pub mesh_mode: Option<String>,
    #[serde(default)]
    pub depends_on: Vec<ServiceDependencySpec>,
    #[serde(default)]
    pub schedule: Option<ServiceScheduleSpec>,
    #[serde(default)]
    pub publish: Option<ServicePublicationSpec>,
@ -915,6 +956,30 @@ pub struct ServicePublicationState {
    pub observed_at: Option<DateTime<Utc>>,
 }
 /// Scheduler-observed readiness and dependency state for a logical service.
 ///
 /// Every field except `service` is optional on input (`#[serde(default)]`);
 /// `message` and `observed_at` are omitted from the output when unset.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)]
 pub struct ServiceStatusRecord {
    /// Name of the service this status describes.
    pub service: String,
    /// Coarse lifecycle phase as a free-form string chosen by the writer.
    #[serde(default)]
    pub phase: String,
    /// Number of instances the service is expected to run.
    #[serde(default)]
    pub desired_instances: u32,
    /// Number of instance records that currently exist for the service.
    #[serde(default)]
    pub scheduled_instances: u32,
    /// Number of instances currently counted as healthy.
    #[serde(default)]
    pub healthy_instances: u32,
    /// Whether the service itself has a usable publication.
    #[serde(default)]
    pub publish_ready: bool,
    /// Whether every declared dependency is satisfied (no blockers).
    #[serde(default)]
    pub dependencies_ready: bool,
    /// Human-readable reasons the service's dependencies are not satisfied.
    #[serde(default)]
    pub blockers: Vec<String>,
    /// Optional human-readable explanation of the current phase.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub message: Option<String>,
    /// When this status was computed, if recorded.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub observed_at: Option<DateTime<Utc>>,
 }
 /// mTLS policy definition.
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
 pub struct MtlsPolicySpec {
--- a/deployer/crates/fleet-scheduler/src/main.rs
+++ b/deployer/crates/fleet-scheduler/src/main.rs
@ -6,7 +6,8 @@ use chainfire_client::Client;
 use chrono::Utc;
 use clap::Parser;
 use deployer_types::{
-    ClusterNodeRecord, PlacementPolicy, ServiceInstanceSpec, ServiceScheduleMode, ServiceSpec,
+    ClusterNodeRecord, PlacementPolicy, ServiceDependencyCondition, ServiceInstanceSpec,
    ServicePublicationState, ServiceScheduleMode, ServiceSpec, ServiceStatusRecord,
 };
 use publish::{PublicationConfig, PublicationReconciler};
 use serde_json::Value;
@ -90,6 +91,12 @@ struct ReconcilePlan {
    deferred_deletes: usize,
 }
 /// Outcome of evaluating one service's `depends_on` edges.
 #[derive(Debug, Default)]
 struct DependencySummary {
    /// `true` exactly when `blockers` is empty.
    dependencies_ready: bool,
    /// One human-readable entry per unmet dependency or graph problem.
    blockers: Vec<String>,
 }
 impl Scheduler {
    fn new(cli: Cli) -> Self {
        let cluster_namespace = cli.cluster_namespace;
@ -135,6 +142,18 @@ impl Scheduler {
        let mut client = Client::connect(self.endpoint.clone()).await?;
        let nodes = self.load_cluster_nodes(&mut client).await?;
        let services = self.load_services(&mut client).await?;
        let instances_before = self.load_instances_by_service(&mut client).await?;
        let publications_before = publish::load_publication_states(
            &mut client,
            &self.cluster_namespace,
            &self.cluster_id,
        )
        .await?;
        let dependency_cycles = dependency_cycle_services(&services);
        let services_by_name = services
            .iter()
            .map(|service| (service.name.as_str(), service))
            .collect::<HashMap<_, _>>();
        debug!(
            nodes = nodes.len(),
@ -146,6 +165,22 @@ impl Scheduler {
            if service.schedule.is_none() {
                continue;
            }
            let dependencies = dependency_summary(
                service,
                &services_by_name,
                &instances_before,
                &publications_before,
                &dependency_cycles,
                self.heartbeat_timeout_secs,
            );
            if !dependencies.dependencies_ready {
                warn!(
                    service = %service.name,
                    blockers = ?dependencies.blockers,
                    "skipping service reconciliation until dependencies are ready"
                );
                continue;
            }
            self.reconcile_service(&mut client, &nodes, service).await?;
        }
@ -159,6 +194,25 @@ impl Scheduler {
            )
            .await?;
        let instances_after = self.load_instances_by_service(&mut client).await?;
        let publications_after = publish::load_publication_states(
            &mut client,
            &self.cluster_namespace,
            &self.cluster_id,
        )
        .await?;
        self.persist_service_statuses(
            &mut client,
            &nodes,
            &services,
            &instances_after,
            &publications_after,
            &dependency_cycles,
        )
        .await?;
        self.cleanup_stale_service_statuses(&mut client, &services)
            .await?;
        Ok(())
    }
@ -200,6 +254,112 @@ impl Scheduler {
        Ok(services)
    }
    async fn load_instances_by_service(
        &self,
        client: &mut Client,
    ) -> Result<HashMap<String, Vec<ServiceInstanceSpec>>> {
        let prefix = format!(
            "{}/clusters/{}/instances/",
            self.cluster_namespace, self.cluster_id
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut instances = HashMap::<String, Vec<ServiceInstanceSpec>>::new();
        for (_key, value) in kvs {
            match serde_json::from_slice::<ServiceInstanceSpec>(&value) {
                Ok(instance) => {
                    instances
                        .entry(instance.service.clone())
                        .or_default()
                        .push(instance);
                }
                Err(error) => warn!(error = %error, "failed to decode service instance"),
            }
        }
        for service_instances in instances.values_mut() {
            service_instances.sort_by(|lhs, rhs| lhs.instance_id.cmp(&rhs.instance_id));
        }
        Ok(instances)
    }
    /// Computes a status record for every service and writes it to the
    /// service-status key of that service.
    ///
    /// In dry-run mode nothing is written; the status that would have been
    /// stored is logged instead.
    ///
    /// # Errors
    /// Returns an error when a status cannot be serialized or a write fails.
    async fn persist_service_statuses(
        &self,
        client: &mut Client,
        nodes: &[ClusterNodeRecord],
        services: &[ServiceSpec],
        instances_by_service: &HashMap<String, Vec<ServiceInstanceSpec>>,
        publications: &HashMap<String, ServicePublicationState>,
        dependency_cycles: &HashSet<String>,
    ) -> Result<()> {
        // Name -> spec index used while evaluating dependency edges.
        let services_by_name: HashMap<&str, &ServiceSpec> = services
            .iter()
            .map(|spec| (spec.name.as_str(), spec))
            .collect();

        for service in services {
            let status = build_service_status(
                service,
                nodes,
                &services_by_name,
                instances_by_service,
                publications,
                dependency_cycles,
                self.heartbeat_timeout_secs,
            );

            if self.dry_run {
                info!(
                    service = %service.name,
                    phase = %status.phase,
                    healthy_instances = status.healthy_instances,
                    desired_instances = status.desired_instances,
                    blockers = ?status.blockers,
                    "would update service status"
                );
                continue;
            }

            let key = service_status_key(&self.cluster_namespace, &self.cluster_id, &service.name);
            let payload = serde_json::to_vec(&status)?;
            client.put(&key, payload).await?;
        }

        Ok(())
    }
    /// Deletes stored service statuses whose service is no longer declared.
    ///
    /// Only keys directly below the status prefix are considered; keys with
    /// an empty or nested (`/`-containing) suffix are left untouched. In
    /// dry-run mode the deletions are only logged.
    ///
    /// # Errors
    /// Returns an error when listing the prefix or deleting a key fails.
    async fn cleanup_stale_service_statuses(
        &self,
        client: &mut Client,
        services: &[ServiceSpec],
    ) -> Result<()> {
        let desired: HashSet<&str> = services.iter().map(|spec| spec.name.as_str()).collect();
        let prefix = service_status_prefix(&self.cluster_namespace, &self.cluster_id);
        let existing = client.get_prefix(prefix.as_bytes()).await?;

        for (key, _) in existing {
            let key_str = String::from_utf8_lossy(&key);
            let Some(service_name) = key_str.strip_prefix(&prefix) else {
                continue;
            };

            let is_direct_child = !service_name.is_empty() && !service_name.contains('/');
            if !is_direct_child || desired.contains(service_name) {
                continue;
            }

            if self.dry_run {
                info!(service = %service_name, "would delete stale service status");
                continue;
            }
            if client.delete(&key).await? {
                info!(service = %service_name, "deleted stale service status");
            }
        }

        Ok(())
    }
    async fn reconcile_service(
        &self,
        client: &mut Client,
@ -222,7 +382,8 @@ impl Scheduler {
        let existing_instances = decode_managed_instances(&existing);
        let desired_instances =
            build_desired_instances(service, &eligible_nodes, &existing_instances)?;
-        let target_instances = schedule_target_count(schedule.mode, eligible_nodes.len(), schedule.replicas);
+        let target_instances =
            schedule_target_count(schedule.mode, eligible_nodes.len(), schedule.replicas);
        if desired_instances.len() < target_instances {
            warn!(
                service = %service.name,
@ -341,6 +502,284 @@ impl Scheduler {
    }
 }
 /// Key prefix (with trailing `/`) under which a cluster's service statuses
 /// are stored.
 fn service_status_prefix(cluster_namespace: &str, cluster_id: &str) -> String {
    format!("{cluster_namespace}/clusters/{cluster_id}/service-statuses/")
 }
 /// Full store key, as bytes, for one service's status record: the cluster's
 /// status prefix followed by the service name.
 fn service_status_key(cluster_namespace: &str, cluster_id: &str, service: &str) -> Vec<u8> {
    let mut key = service_status_prefix(cluster_namespace, cluster_id);
    key.push_str(service);
    key.into_bytes()
 }
 /// Returns the names of the services that `visit_dependency_cycles` reports
 /// as cyclic within the `depends_on` graph of `services`.
 ///
 /// Edges that point at services missing from `services` are dropped before
 /// the graph is walked, so undefined dependencies never count as cycles.
 fn dependency_cycle_services(services: &[ServiceSpec]) -> HashSet<String> {
    let known: HashSet<&str> = services
        .iter()
        .map(|service| service.name.as_str())
        .collect();

    // Adjacency list: service name -> names of the known services it depends on.
    let mut graph: HashMap<String, Vec<String>> = HashMap::new();
    for service in services {
        let edges = service
            .depends_on
            .iter()
            .map(|dependency| dependency.service.as_str())
            .filter(|target| known.contains(target))
            .map(str::to_string)
            .collect();
        graph.insert(service.name.clone(), edges);
    }

    let mut finished = HashSet::new();
    let mut in_progress = Vec::<String>::new();
    let mut cyclic = HashSet::new();
    for service in services {
        visit_dependency_cycles(
            &service.name,
            &graph,
            &mut finished,
            &mut in_progress,
            &mut cyclic,
        );
    }
    cyclic
 }
 /// Records `service` in `cycles` when it lies on a dependency cycle, i.e.
 /// when following `dependencies` edges from it eventually leads back to it
 /// (a self-dependency counts).
 ///
 /// The check is done per service rather than with a single shared DFS stack:
 /// a stack-based walk only marks the nodes on the path that closes a cycle
 /// and misses services that rejoin the cycle through an already-finished
 /// node (e.g. `a -> b -> c -> a` plus `a -> d -> c` would leave `d` out).
 ///
 /// * `service` - name of the service to examine.
 /// * `dependencies` - adjacency list: service name -> names it depends on.
 /// * `permanent` - services already examined; each is processed only once.
 /// * `visiting` - scratch worklist of services still to explore; it is
 ///   cleared on entry and left empty on return.
 /// * `cycles` - output set of services found to be on a cycle.
 fn visit_dependency_cycles(
    service: &str,
    dependencies: &HashMap<String, Vec<String>>,
    permanent: &mut HashSet<String>,
    visiting: &mut Vec<String>,
    cycles: &mut HashSet<String>,
 ) {
    // `insert` returns false when the service was examined before.
    if !permanent.insert(service.to_string()) {
        return;
    }
    let Some(direct) = dependencies.get(service) else {
        return;
    };

    visiting.clear();
    visiting.extend(direct.iter().cloned());
    // Services already expanded during this search; keeps the walk finite
    // when the graph contains cycles that do not pass through `service`.
    let mut expanded = HashSet::new();
    while let Some(current) = visiting.pop() {
        if current == service {
            cycles.insert(current);
            break;
        }
        if expanded.contains(&current) {
            continue;
        }
        if let Some(next) = dependencies.get(&current) {
            visiting.extend(next.iter().cloned());
        }
        expanded.insert(current);
    }
    visiting.clear();
 }
 fn dependency_summary(
    service: &ServiceSpec,
    services_by_name: &HashMap<&str, &ServiceSpec>,
    instances_by_service: &HashMap<String, Vec<ServiceInstanceSpec>>,
    publications: &HashMap<String, ServicePublicationState>,
    dependency_cycles: &HashSet<String>,
    heartbeat_timeout_secs: u64,
 ) -> DependencySummary {
    let mut blockers = Vec::new();
    if dependency_cycles.contains(&service.name) {
        blockers.push("cyclic dependency graph".to_string());
    }
    for dependency in &service.depends_on {
        if dependency.service == service.name {
            blockers.push(format!(
                "dependency {} points to itself",
                dependency.service
            ));
            continue;
        }
        if !services_by_name.contains_key(dependency.service.as_str()) {
            blockers.push(format!("dependency {} is not defined", dependency.service));
            continue;
        }
        if dependency_cycles.contains(&dependency.service) {
            blockers.push(format!(
                "dependency {} is part of a dependency cycle",
                dependency.service
            ));
            continue;
        }
        match dependency.condition {
            ServiceDependencyCondition::Healthy => {
                let ready = instances_by_service
                    .get(&dependency.service)
                    .map(|instances| {
                        instances
                            .iter()
                            .filter(|instance| {
                                instance_is_available(instance, heartbeat_timeout_secs)
                            })
                            .count() as u32
                    })
                    .unwrap_or(0);
                let min_ready = dependency.min_ready.max(1);
                if ready < min_ready {
                    blockers.push(format!(
                        "dependency {} has {ready}/{min_ready} healthy instance(s)",
                        dependency.service
                    ));
                }
            }
            ServiceDependencyCondition::Published => {
                let ready = publications
                    .get(&dependency.service)
                    .map(publication_ready)
                    .unwrap_or(false);
                if !ready {
                    blockers.push(format!(
                        "dependency {} is not published",
                        dependency.service
                    ));
                }
            }
        }
    }
    DependencySummary {
        dependencies_ready: blockers.is_empty(),
        blockers,
    }
 }
 /// Whether a publication state exposes something usable: a DNS record with
 /// at least one value, or a load balancer that has a VIP address.
 fn publication_ready(state: &ServicePublicationState) -> bool {
    let has_dns_values = match state.dns.as_ref() {
        Some(dns) => !dns.values.is_empty(),
        None => false,
    };
    let has_vip = match state.load_balancer.as_ref() {
        Some(load_balancer) => load_balancer.vip_address.is_some(),
        None => false,
    };
    has_dns_values || has_vip
 }
 /// Number of instances the service should run: zero when it has no
 /// schedule, the configured replica count in replicated mode, and one per
 /// eligible node in daemon mode.
 fn desired_instance_count(
    service: &ServiceSpec,
    nodes: &[ClusterNodeRecord],
    heartbeat_timeout_secs: u64,
 ) -> u32 {
    match service.schedule.as_ref() {
        None => 0,
        Some(schedule) => match schedule.mode {
            ServiceScheduleMode::Replicated => schedule.replicas,
            ServiceScheduleMode::Daemon => {
                let eligible = eligible_nodes(nodes, &schedule.placement, heartbeat_timeout_secs);
                eligible.len() as u32
            }
        },
    }
 }
 fn build_service_status(
    service: &ServiceSpec,
    nodes: &[ClusterNodeRecord],
    services_by_name: &HashMap<&str, &ServiceSpec>,
    instances_by_service: &HashMap<String, Vec<ServiceInstanceSpec>>,
    publications: &HashMap<String, ServicePublicationState>,
    dependency_cycles: &HashSet<String>,
    heartbeat_timeout_secs: u64,
 ) -> ServiceStatusRecord {
    let instances = instances_by_service
        .get(&service.name)
        .map(Vec::as_slice)
        .unwrap_or(&[]);
    let healthy_instances = instances
        .iter()
        .filter(|instance| instance_is_available(instance, heartbeat_timeout_secs))
        .count() as u32;
    let scheduled_instances = instances.len() as u32;
    let desired_instances = desired_instance_count(service, nodes, heartbeat_timeout_secs);
    let publish_ready = publications
        .get(&service.name)
        .map(publication_ready)
        .unwrap_or(false);
    let dependencies = dependency_summary(
        service,
        services_by_name,
        instances_by_service,
        publications,
        dependency_cycles,
        heartbeat_timeout_secs,
    );
    let eligible_node_count = service
        .schedule
        .as_ref()
        .map(|schedule| eligible_nodes(nodes, &schedule.placement, heartbeat_timeout_secs).len())
        .unwrap_or(0);
    let (phase, message) = if service.schedule.is_none() {
        (
            "unmanaged".to_string(),
            Some("service is not managed by fleet-scheduler".to_string()),
        )
    } else if !dependencies.dependencies_ready {
        (
            "blocked".to_string(),
            Some(dependencies.blockers.join("; ")),
        )
    } else if eligible_node_count == 0 {
        (
            "unschedulable".to_string(),
            Some("no eligible nodes match the service placement policy".to_string()),
        )
    } else if desired_instances == 0 && scheduled_instances == 0 {
        (
            "idle".to_string(),
            Some("service has no desired instances".to_string()),
        )
    } else if desired_instances > 0 && healthy_instances >= desired_instances {
        (
            "healthy".to_string(),
            Some(format!(
                "healthy instances satisfy desired count ({healthy_instances}/{desired_instances})"
            )),
        )
    } else if scheduled_instances > 0 || healthy_instances > 0 {
        (
            "degraded".to_string(),
            Some(format!(
                "healthy={healthy_instances} scheduled={scheduled_instances} desired={desired_instances}"
            )),
        )
    } else {
        (
            "pending".to_string(),
            Some(format!(
                "waiting for instances to reach desired count ({healthy_instances}/{desired_instances})"
            )),
        )
    };
    ServiceStatusRecord {
        service: service.name.clone(),
        phase,
        desired_instances,
        scheduled_instances,
        healthy_instances,
        publish_ready,
        dependencies_ready: dependencies.dependencies_ready,
        blockers: dependencies.blockers,
        message,
        observed_at: Some(Utc::now()),
    }
 }
 fn eligible_nodes<'a>(
    nodes: &'a [ClusterNodeRecord],
    placement: &PlacementPolicy,
@ -853,7 +1292,10 @@ fn instance_is_available(instance: &ServiceInstanceSpec, heartbeat_timeout_secs:
        && instance_has_fresh_heartbeat(instance, heartbeat_timeout_secs)
 }
-fn instance_has_fresh_heartbeat(instance: &ServiceInstanceSpec, heartbeat_timeout_secs: u64) -> bool {
+fn instance_has_fresh_heartbeat(
    instance: &ServiceInstanceSpec,
    heartbeat_timeout_secs: u64,
 ) -> bool {
    if heartbeat_timeout_secs == 0 {
        return true;
    }
@ -961,8 +1403,9 @@ mod tests {
    use super::*;
    use chrono::Duration as ChronoDuration;
    use deployer_types::{
-        ClusterNodeRecord, HealthCheckSpec, PlacementPolicy, ProcessSpec, RolloutStrategySpec,
+        ClusterNodeRecord, HealthCheckSpec, PlacementPolicy, ProcessSpec,
-        ServicePorts, ServiceScheduleMode, ServiceScheduleSpec,
+        PublishedLoadBalancerState, RolloutStrategySpec, ServiceDependencyCondition,
        ServiceDependencySpec, ServicePorts, ServiceScheduleMode, ServiceScheduleSpec,
    };
    fn active_node(node_id: &str, roles: &[&str], labels: &[(&str, &str)]) -> ClusterNodeRecord {
@ -1003,6 +1446,7 @@ mod tests {
            protocol: Some("http".to_string()),
            mtls_required: None,
            mesh_mode: None,
            depends_on: Vec::new(),
            schedule: Some(ServiceScheduleSpec {
                mode: ServiceScheduleMode::Replicated,
                replicas: 2,
@ -1036,6 +1480,31 @@ mod tests {
        }
    }
    fn service_named(name: &str) -> ServiceSpec {
        let mut service = scheduled_service();
        service.name = name.to_string();
        service
    }
    /// Test fixture: a scheduler-managed instance of `service` on `node_id`
    /// that reports state `healthy` with a heartbeat stamped at call time.
    fn healthy_instance(service: &str, node_id: &str) -> ServiceInstanceSpec {
        ServiceInstanceSpec {
            // Instance ids are derived from service and node for uniqueness.
            instance_id: format!("{service}-{node_id}"),
            service: service.to_string(),
            node_id: node_id.to_string(),
            ip: "10.0.0.10".to_string(),
            port: 8080,
            mesh_port: Some(18080),
            version: None,
            health_check: None,
            process: None,
            container: None,
            managed_by: Some(MANAGED_BY.to_string()),
            state: Some("healthy".to_string()),
            last_heartbeat: Some(Utc::now()),
            observed_at: None,
        }
    }
    #[test]
    fn test_node_eligibility_matches_roles_and_labels() {
        let node = active_node("node01", &["worker"], &[("tier", "general")]);
@ -1083,6 +1552,126 @@ mod tests {
        assert_eq!(desired[0].process.as_ref().unwrap().command, "/usr/bin/api");
    }
    /// A three-service ring (api -> worker -> edge -> api) must report all
    /// three services as cyclic.
    #[test]
    fn test_dependency_cycles_detect_all_services_in_cycle() {
        let mut api = service_named("api");
        api.depends_on = vec![ServiceDependencySpec {
            service: "worker".to_string(),
            condition: ServiceDependencyCondition::Healthy,
            min_ready: 1,
        }];
        let mut worker = service_named("worker");
        worker.depends_on = vec![ServiceDependencySpec {
            service: "edge".to_string(),
            condition: ServiceDependencyCondition::Healthy,
            min_ready: 1,
        }];
        let mut edge = service_named("edge");
        // This edge closes the ring back to `api`.
        edge.depends_on = vec![ServiceDependencySpec {
            service: "api".to_string(),
            condition: ServiceDependencyCondition::Healthy,
            min_ready: 1,
        }];
        let cycles = dependency_cycle_services(&[api, worker, edge]);
        assert_eq!(
            cycles,
            HashSet::from(["api".to_string(), "worker".to_string(), "edge".to_string(),])
        );
    }
    /// `worker` needs two healthy `api` instances; with one healthy and one
    /// still starting, the summary must report a single 1/2 blocker.
    #[test]
    fn test_dependency_summary_blocks_until_dependency_is_healthy() {
        let api = service_named("api");
        let mut worker = service_named("worker");
        worker.depends_on = vec![ServiceDependencySpec {
            service: "api".to_string(),
            condition: ServiceDependencyCondition::Healthy,
            min_ready: 2,
        }];
        let services = vec![api.clone(), worker.clone()];
        let services_by_name = services
            .iter()
            .map(|service| (service.name.as_str(), service))
            .collect::<HashMap<_, _>>();
        let instances_by_service = HashMap::from([(
            "api".to_string(),
            vec![
                healthy_instance("api", "node01"),
                // Second instance exists but has not reached `healthy` yet.
                ServiceInstanceSpec {
                    state: Some("starting".to_string()),
                    ..healthy_instance("api", "node02")
                },
            ],
        )]);
        let summary = dependency_summary(
            &worker,
            &services_by_name,
            &instances_by_service,
            &HashMap::new(),
            &HashSet::new(),
            300,
        );
        assert!(!summary.dependencies_ready);
        assert_eq!(
            summary.blockers,
            vec!["dependency api has 1/2 healthy instance(s)".to_string()]
        );
    }
    /// A `Published` dependency is satisfied by a load balancer with a VIP,
    /// so `worker` is unblocked; with no instances yet, its phase is `pending`.
    #[test]
    fn test_build_service_status_recognizes_published_dependency() {
        let api = service_named("api");
        let mut worker = service_named("worker");
        worker.depends_on = vec![ServiceDependencySpec {
            service: "api".to_string(),
            condition: ServiceDependencyCondition::Published,
            min_ready: 1,
        }];
        let services = vec![api.clone(), worker.clone()];
        let services_by_name = services
            .iter()
            .map(|service| (service.name.as_str(), service))
            .collect::<HashMap<_, _>>();
        // Publication for `api` carries only a load balancer, no DNS record.
        let publications = HashMap::from([(
            "api".to_string(),
            ServicePublicationState {
                service: "api".to_string(),
                org_id: "default-org".to_string(),
                project_id: "default-project".to_string(),
                load_balancer: Some(PublishedLoadBalancerState {
                    id: "lb-1".to_string(),
                    pool_id: "pool-1".to_string(),
                    listener_id: "listener-1".to_string(),
                    vip_address: Some("203.0.113.10".to_string()),
                }),
                dns: None,
                observed_at: Some(Utc::now()),
            },
        )]);
        let nodes = vec![active_node("node01", &["worker"], &[("tier", "general")])];
        let status = build_service_status(
            &worker,
            &nodes,
            &services_by_name,
            &HashMap::new(),
            &publications,
            &HashSet::new(),
            300,
        );
        assert_eq!(status.phase, "pending");
        assert!(status.dependencies_ready);
        assert!(status.blockers.is_empty());
    }
    #[test]
    fn test_build_desired_instances_honors_max_instances_per_node() {
        let nodes = vec![active_node("node01", &["worker"], &[("tier", "general")])];
--- a/deployer/crates/fleet-scheduler/src/publish.rs
+++ b/deployer/crates/fleet-scheduler/src/publish.rs
@ -1129,7 +1129,7 @@ fn publication_key(cluster_namespace: &str, cluster_id: &str, service: &str) ->
    .into_bytes()
 }
-async fn load_publication_states(
+pub(crate) async fn load_publication_states(
    client: &mut Client,
    cluster_namespace: &str,
    cluster_id: &str,
@ -1194,6 +1194,7 @@ mod tests {
            protocol: Some("http".to_string()),
            mtls_required: None,
            mesh_mode: None,
            depends_on: Vec::new(),
            schedule: Some(ServiceScheduleSpec::default()),
            publish: Some(ServicePublicationSpec {
                org_id: Some("default-org".to_string()),
--- a/deployer/scripts/verify-fleet-scheduler-e2e.sh
+++ b/deployer/scripts/verify-fleet-scheduler-e2e.sh
@ -212,6 +212,41 @@ services:
        path: /
        interval_secs: 1
        timeout_secs: 2
  - name: worker
    ports:
      http: 18081
    protocol: http
    depends_on:
      - service: api
        condition: healthy
        min_ready: 2
    schedule:
      replicas: 2
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18081
      process:
        command: python3
        args:
          - -m
          - http.server
          - ${INSTANCE_PORT}
          - --bind
          - ${INSTANCE_IP}
      health_check:
        type: http
        path: /
        interval_secs: 1
        timeout_secs: 2
 EOF
 cat >"$tmp_dir/cluster-scaled.yaml" <<'EOF'
@ -283,6 +318,41 @@ services:
        path: /
        interval_secs: 1
        timeout_secs: 2
  - name: worker
    ports:
      http: 18081
    protocol: http
    depends_on:
      - service: api
        condition: healthy
        min_ready: 1
    schedule:
      replicas: 1
      placement:
        roles:
          - worker
        pools:
          - general
        node_classes:
          - worker-linux
        match_labels:
          tier: general
        spread_by_label: failure_domain
        max_instances_per_node: 1
      instance_port: 18081
      process:
        command: python3
        args:
          - -m
          - http.server
          - ${INSTANCE_PORT}
          - --bind
          - ${INSTANCE_IP}
      health_check:
        type: http
        path: /
        interval_secs: 1
        timeout_secs: 2
 EOF
 endpoint="http://127.0.0.1:${api_port}"
@ -326,7 +396,59 @@ run_node_agent_once node02
 echo "Scheduling managed instances"
 run_scheduler_once
-echo "Reconciling processes and health"
+echo "Validating dependency block before api is healthy"
 run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-blocked.dump"
 python3 - "$tmp_dir/worker-blocked.dump" <<'PY'
 import sys
 path = sys.argv[1]
 lines = [line.strip() for line in open(path, "r", encoding="utf-8") if line.strip()]
 if lines:
    raise SystemExit(f"expected no worker instances before api is healthy, found {len(lines)}")
 print("worker instances correctly blocked before dependency becomes healthy")
 PY
 run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/service-statuses/worker" >"$tmp_dir/worker-status-blocked.dump"
 python3 - "$tmp_dir/worker-status-blocked.dump" <<'PY'
 import json
 import sys
 path = sys.argv[1]
 statuses = []
 with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        statuses.append(json.loads(line.split(marker, 1)[1]))
 if len(statuses) != 1:
    raise SystemExit(f"expected exactly one worker service status, found {len(statuses)}")
 status = statuses[0]
 if status.get("phase") != "blocked":
    raise SystemExit(f"expected worker phase=blocked, found {status.get('phase')}")
 blockers = status.get("blockers") or []
 if not blockers or "dependency api has 0/2 healthy instance(s)" not in blockers[0]:
    raise SystemExit(f"unexpected blockers: {blockers}")
 print("worker service status reports dependency block")
 PY
 echo "Reconciling processes and health for api"
 for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
 done
 echo "Re-running scheduler after api became healthy"
 run_scheduler_once
 echo "Reconciling processes and health for dependent worker service"
 for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
@ -337,7 +459,12 @@ echo "Validating HTTP endpoints"
 python3 - <<'PY'
 import urllib.request
-for address in ("http://127.0.0.2:18080/", "http://127.0.0.3:18080/"):
+for address in (
    "http://127.0.0.2:18080/",
    "http://127.0.0.3:18080/",
    "http://127.0.0.2:18081/",
    "http://127.0.0.3:18081/",
 ):
    with urllib.request.urlopen(address, timeout=5) as response:
        body = response.read().decode("utf-8")
        if response.status != 200:
@ -381,13 +508,56 @@ if states != ["healthy", "healthy"]:
 print("Observed two healthy scheduled instances across node01 and node02")
 PY
 run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-instances.dump"
 python3 - "$tmp_dir/worker-instances.dump" <<'PY'
 import json
 import sys
 path = sys.argv[1]
 instances = []
 with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
 if len(instances) != 2:
    raise SystemExit(f"expected 2 worker instances, found {len(instances)}")
 node_ids = sorted(instance["node_id"] for instance in instances)
 states = sorted(instance.get("state") for instance in instances)
 if node_ids != ["node01", "node02"]:
    raise SystemExit(f"unexpected worker placement: {node_ids}")
 if states != ["healthy", "healthy"]:
    raise SystemExit(f"unexpected worker states: {states}")
 print("Observed two healthy dependent worker instances across node01 and node02")
 PY
 echo "Applying scaled declaration"
 run_deployer_ctl apply --config "$tmp_dir/cluster-scaled.yaml" --prune
 echo "Re-running scheduler after scale-down"
 run_scheduler_once
-echo "Reconciling processes and health after scale-down"
+echo "Reconciling api after scale-down"
 for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
  sleep 1
 done
 echo "Re-running scheduler after scaled api became healthy"
 run_scheduler_once
 echo "Reconciling dependent worker service after scale-down"
 for _ in 1 2 3; do
  run_node_agent_once node01
  run_node_agent_once node02
@ -426,6 +596,37 @@ if instance.get("state") != "healthy":
 print("Observed one healthy scheduled instance on node01 after scale-down")
 PY
 run_deployer_ctl dump --prefix "photoncloud/clusters/test-cluster/instances/worker/" >"$tmp_dir/worker-instances-scaled.dump"
 python3 - "$tmp_dir/worker-instances-scaled.dump" <<'PY'
 import json
 import sys
 path = sys.argv[1]
 instances = []
 with open(path, "r", encoding="utf-8") as handle:
    for line in handle:
        line = line.strip()
        if not line:
            continue
        marker = " value="
        if marker not in line:
            continue
        value = line.split(marker, 1)[1]
        instances.append(json.loads(value))
 if len(instances) != 1:
    raise SystemExit(f"expected 1 worker instance after scale-down, found {len(instances)}")
 instance = instances[0]
 if instance["node_id"] != "node01":
    raise SystemExit(f"expected remaining worker instance on node01, found {instance['node_id']}")
 if instance.get("state") != "healthy":
    raise SystemExit(f"expected remaining worker instance to be healthy, found {instance.get('state')}")
 print("Observed one healthy dependent worker instance on node01 after scale-down")
 PY
 echo "Validating endpoint convergence after scale-down"
 python3 - <<'PY'
 import socket
@ -434,17 +635,21 @@ import urllib.request
 with urllib.request.urlopen("http://127.0.0.2:18080/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 endpoint returned {response.status}")
 with urllib.request.urlopen("http://127.0.0.2:18081/", timeout=5) as response:
    if response.status != 200:
        raise SystemExit(f"node01 worker endpoint returned {response.status}")
-sock = socket.socket()
+for port, label in ((18080, "api"), (18081, "worker")):
-sock.settimeout(1.5)
+    sock = socket.socket()
-try:
+    sock.settimeout(1.5)
-    sock.connect(("127.0.0.3", 18080))
+    try:
-except OSError:
+        sock.connect(("127.0.0.3", port))
-    pass
+    except OSError:
-else:
+        pass
-    raise SystemExit("node02 endpoint still accepts connections after scale-down")
+    else:
-finally:
+        raise SystemExit(f"node02 {label} endpoint still accepts connections after scale-down")
-    sock.close()
+    finally:
        sock.close()
 print("Endpoint convergence validated")
 PY