photoncloud-monorepo/deployer/crates/fleet-scheduler/src/main.rs
centra 37f5479ab8
Some checks failed
Nix CI / filter (push) Failing after 1s
Nix CI / gate () (push) Has been skipped
Nix CI / gate (shared crates) (push) Has been skipped
Nix CI / build () (push) Has been skipped
Nix CI / ci-status (push) Failing after 1s
Add daemon scheduling for native services
2026-03-30 21:31:32 +09:00

1549 lines
51 KiB
Rust

mod auth;
mod publish;
use anyhow::{Context, Result};
use chainfire_client::Client;
use chrono::Utc;
use clap::Parser;
use deployer_types::{
ClusterNodeRecord, PlacementPolicy, ServiceInstanceSpec, ServiceScheduleMode, ServiceSpec,
};
use publish::{PublicationConfig, PublicationReconciler};
use serde_json::Value;
use std::collections::{BTreeMap, HashMap, HashSet};
use std::time::Duration;
use tokio::time::sleep;
use tracing::{debug, info, warn};
use tracing_subscriber::EnvFilter;
/// Ownership marker stored in the `managed_by` field of every instance record
/// this scheduler writes; `decode_managed_instances` only considers records
/// carrying this value, so foreign records are never reconciled or deleted.
const MANAGED_BY: &str = "fleet-scheduler";
#[derive(Debug, Parser)]
#[command(author, version, about = "PhotonCloud non-Kubernetes fleet scheduler")]
struct Cli {
    // NOTE: plain `//` comments on purpose — `///` doc comments on clap fields
    // become per-flag help text and would change the CLI's runtime output.

    // Chainfire KV endpoint the scheduler connects to on every pass.
    #[arg(long, default_value = "http://127.0.0.1:7000")]
    chainfire_endpoint: String,
    // Key-prefix namespace under which all cluster state is stored.
    #[arg(long, default_value = "photoncloud")]
    cluster_namespace: String,
    // Required: identifier of the cluster this scheduler manages.
    #[arg(long)]
    cluster_id: String,
    // Seconds to sleep between reconciliation passes (ignored with --once).
    #[arg(long, default_value_t = 15)]
    interval_secs: u64,
    // Heartbeats older than this many seconds are stale; 0 disables the check.
    #[arg(long, default_value_t = 300)]
    heartbeat_timeout_secs: u64,
    // Log intended writes/deletes without touching the store.
    #[arg(long, default_value_t = false)]
    dry_run: bool,
    // Optional endpoints handed to the publication reconciler.
    #[arg(long)]
    iam_endpoint: Option<String>,
    #[arg(long)]
    fiberlb_endpoint: Option<String>,
    #[arg(long)]
    flashdns_endpoint: Option<String>,
    // Address handed to the publication reconciler (presumably the externally
    // advertised address — confirm against the publish module).
    #[arg(long)]
    publish_address: Option<String>,
    // Fallback org/project identities used by the publication reconciler.
    #[arg(long, default_value = "default-org")]
    default_org_id: String,
    #[arg(long, default_value = "default-project")]
    default_project_id: String,
    // Principal id the publication reconciler acts as; defaults to the
    // scheduler's own managed-by marker.
    #[arg(long, default_value = MANAGED_BY)]
    controller_principal_id: String,
    // Run exactly one reconciliation pass, then exit.
    #[arg(long, default_value_t = false)]
    once: bool,
}
/// Long-running reconciler: reads node and service state from the chainfire
/// KV store and writes back managed service-instance records, then delegates
/// publication to the embedded `PublicationReconciler`.
struct Scheduler {
    endpoint: String,                   // chainfire endpoint, connected to each pass
    cluster_namespace: String,          // key-prefix namespace for all cluster keys
    cluster_id: String,                 // cluster whose services are scheduled
    interval: Duration,                 // delay between reconciliation passes
    heartbeat_timeout_secs: u64,        // staleness cutoff for heartbeats (0 = disabled)
    dry_run: bool,                      // log planned changes instead of applying them
    publication: PublicationReconciler, // sub-reconciler from the publish module
    once: bool,                         // run a single pass instead of looping
}
/// One instance write the plan decided to perform: the decoded spec (used for
/// key construction and logging) plus the exact JSON value to store.
#[derive(Debug)]
struct PlannedUpsert {
    instance: ServiceInstanceSpec, // desired instance being written
    desired_value: Value,          // JSON payload (with preserved status fields merged in)
}
/// Result of planning one service's reconciliation: the writes and deletes to
/// apply now, plus counts of changes deferred by the rollout budget.
#[derive(Debug, Default)]
struct ReconcilePlan {
    upserts: Vec<PlannedUpsert>, // records to create or update this pass
    deletes: Vec<String>,        // instance ids to remove this pass
    deferred_creates: usize,     // creates blocked by the surge ceiling
    deferred_updates: usize,     // updates blocked by the availability floor
    deferred_deletes: usize,     // deletes blocked by the availability floor
}
impl Scheduler {
    /// Builds a scheduler from parsed CLI options. The publication reconciler
    /// receives the same cluster id, heartbeat timeout, and dry-run flag so
    /// both halves operate on consistent settings.
    fn new(cli: Cli) -> Self {
        let cluster_namespace = cli.cluster_namespace;
        let cluster_id = cli.cluster_id;
        Self {
            endpoint: cli.chainfire_endpoint,
            cluster_namespace,
            // Cloned because the PublicationConfig below consumes the other copy.
            cluster_id: cluster_id.clone(),
            interval: Duration::from_secs(cli.interval_secs),
            heartbeat_timeout_secs: cli.heartbeat_timeout_secs,
            dry_run: cli.dry_run,
            publication: PublicationReconciler::new(PublicationConfig {
                cluster_id,
                heartbeat_timeout_secs: cli.heartbeat_timeout_secs,
                iam_endpoint: cli.iam_endpoint,
                fiberlb_endpoint: cli.fiberlb_endpoint,
                flashdns_endpoint: cli.flashdns_endpoint,
                publish_address: cli.publish_address,
                controller_principal_id: cli.controller_principal_id,
                default_org_id: cli.default_org_id,
                default_project_id: cli.default_project_id,
                dry_run: cli.dry_run,
            }),
            once: cli.once,
        }
    }

    /// Entry point: a single pass when `--once` was given, otherwise an
    /// endless loop in which per-pass failures are logged, not propagated.
    async fn run_loop(&self) -> Result<()> {
        if self.once {
            return self.reconcile_once().await;
        }
        loop {
            if let Err(error) = self.reconcile_once().await {
                warn!(error = %error, "fleet scheduler reconciliation failed");
            }
            sleep(self.interval).await;
        }
    }

    /// One full reconciliation pass: load nodes and services, place instances
    /// for every service that has a `schedule` block, then hand the complete
    /// service list to the publication reconciler.
    async fn reconcile_once(&self) -> Result<()> {
        // Fresh connection each pass, so a broken connection from one pass
        // cannot poison later ones.
        let mut client = Client::connect(self.endpoint.clone()).await?;
        let nodes = self.load_cluster_nodes(&mut client).await?;
        let services = self.load_services(&mut client).await?;
        debug!(
            nodes = nodes.len(),
            services = services.len(),
            "loaded scheduler inputs"
        );
        for service in &services {
            // Services without a schedule block are not placed by this
            // scheduler; they may still be handled by publication below.
            if service.schedule.is_none() {
                continue;
            }
            self.reconcile_service(&mut client, &nodes, service).await?;
        }
        self.publication
            .reconcile_all(
                &mut client,
                &self.cluster_namespace,
                &self.cluster_id,
                &services,
                self.dry_run,
            )
            .await?;
        Ok(())
    }

    /// Loads every node record under `<ns>/clusters/<id>/nodes/`. Undecodable
    /// values are logged and skipped; results are sorted by node id so later
    /// placement decisions are deterministic.
    async fn load_cluster_nodes(&self, client: &mut Client) -> Result<Vec<ClusterNodeRecord>> {
        let prefix = format!(
            "{}/clusters/{}/nodes/",
            self.cluster_namespace, self.cluster_id
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut nodes = Vec::with_capacity(kvs.len());
        for (_key, value) in kvs {
            // A corrupt record must not fail the whole pass.
            match serde_json::from_slice::<ClusterNodeRecord>(&value) {
                Ok(node) => nodes.push(node),
                Err(error) => warn!(error = %error, "failed to decode cluster node"),
            }
        }
        nodes.sort_by(|lhs, rhs| lhs.node_id.cmp(&rhs.node_id));
        Ok(nodes)
    }

    /// Loads every service spec under `<ns>/clusters/<id>/services/`.
    /// Undecodable values are logged and skipped; results are sorted by name.
    async fn load_services(&self, client: &mut Client) -> Result<Vec<ServiceSpec>> {
        let prefix = format!(
            "{}/clusters/{}/services/",
            self.cluster_namespace, self.cluster_id
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut services = Vec::with_capacity(kvs.len());
        for (_key, value) in kvs {
            match serde_json::from_slice::<ServiceSpec>(&value) {
                Ok(service) => services.push(service),
                Err(error) => warn!(error = %error, "failed to decode service spec"),
            }
        }
        services.sort_by(|lhs, rhs| lhs.name.cmp(&rhs.name));
        Ok(services)
    }

    /// Reconciles one scheduled service: filters eligible nodes, computes the
    /// desired instance set, plans rollout-budgeted changes, and applies the
    /// resulting upserts/deletes (or just logs them under --dry-run).
    async fn reconcile_service(
        &self,
        client: &mut Client,
        nodes: &[ClusterNodeRecord],
        service: &ServiceSpec,
    ) -> Result<()> {
        let schedule = service
            .schedule
            .as_ref()
            .context("service marked schedulable without schedule block")?;
        let eligible_nodes =
            eligible_nodes(nodes, &schedule.placement, self.heartbeat_timeout_secs);
        if eligible_nodes.is_empty() {
            // Nothing can be placed; leave all existing records untouched.
            warn!(service = %service.name, "no eligible nodes for scheduled service");
            return Ok(());
        }
        let existing = self.load_instance_values(client, &service.name).await?;
        let existing_instances = decode_managed_instances(&existing);
        let desired_instances =
            build_desired_instances(service, &eligible_nodes, &existing_instances)?;
        // Daemon mode targets every eligible node; replicated targets `replicas`.
        let target_instances = schedule_target_count(schedule.mode, eligible_nodes.len(), schedule.replicas);
        if desired_instances.len() < target_instances {
            warn!(
                service = %service.name,
                requested = target_instances,
                scheduled = desired_instances.len(),
                mode = ?schedule.mode,
                "insufficient eligible node capacity for requested service instances"
            );
        }
        let plan = plan_managed_reconciliation(
            service,
            &desired_instances,
            &existing,
            &existing_instances,
            self.heartbeat_timeout_secs,
        )?;
        for upsert in plan.upserts {
            let key = instance_key(
                &self.cluster_namespace,
                &self.cluster_id,
                &upsert.instance.service,
                &upsert.instance.instance_id,
            );
            if self.dry_run {
                info!(
                    service = %service.name,
                    instance_id = %upsert.instance.instance_id,
                    node_id = %upsert.instance.node_id,
                    "would upsert managed instance"
                );
            } else {
                client
                    .put(&key, serde_json::to_vec(&upsert.desired_value)?)
                    .await?;
                info!(
                    service = %service.name,
                    instance_id = %upsert.instance.instance_id,
                    node_id = %upsert.instance.node_id,
                    "upserted managed instance"
                );
            }
        }
        if plan.deferred_creates > 0 || plan.deferred_updates > 0 || plan.deferred_deletes > 0 {
            // Deferred work is retried on the next pass when budget frees up.
            info!(
                service = %service.name,
                deferred_creates = plan.deferred_creates,
                deferred_updates = plan.deferred_updates,
                deferred_deletes = plan.deferred_deletes,
                "deferring managed instance changes until rollout budget frees"
            );
        }
        for instance_id in plan.deletes {
            let key = instance_key(
                &self.cluster_namespace,
                &self.cluster_id,
                &service.name,
                &instance_id,
            );
            if self.dry_run {
                info!(
                    service = %service.name,
                    instance_id = %instance_id,
                    "would delete stale managed instance"
                );
            } else if client.delete(&key).await? {
                // Only log when the key actually existed.
                info!(
                    service = %service.name,
                    instance_id = %instance_id,
                    "deleted stale managed instance"
                );
            }
        }
        Ok(())
    }

    /// Loads the raw JSON instance records for `service`, keyed by their
    /// `instance_id` field. Records that fail to parse or lack an
    /// `instance_id` string are logged and skipped.
    async fn load_instance_values(
        &self,
        client: &mut Client,
        service: &str,
    ) -> Result<HashMap<String, Value>> {
        let prefix = format!(
            "{}/clusters/{}/instances/{}/",
            self.cluster_namespace, self.cluster_id, service
        );
        let kvs = client.get_prefix(prefix.as_bytes()).await?;
        let mut instances = HashMap::with_capacity(kvs.len());
        for (_key, value) in kvs {
            let parsed: Value = match serde_json::from_slice(&value) {
                Ok(value) => value,
                Err(error) => {
                    warn!(service = %service, error = %error, "failed to decode instance value");
                    continue;
                }
            };
            let Some(instance_id) = parsed
                .get("instance_id")
                .and_then(|value| value.as_str())
                .map(|value| value.to_string())
            else {
                warn!(service = %service, "instance record missing instance_id");
                continue;
            };
            instances.insert(instance_id, parsed);
        }
        Ok(instances)
    }
}
/// Returns the subset of `nodes` that currently satisfy `placement` and the
/// heartbeat freshness requirement, preserving the input order.
fn eligible_nodes<'a>(
    nodes: &'a [ClusterNodeRecord],
    placement: &PlacementPolicy,
    heartbeat_timeout_secs: u64,
) -> Vec<&'a ClusterNodeRecord> {
    let mut matching = Vec::new();
    for node in nodes {
        if node_is_eligible(node, placement, heartbeat_timeout_secs) {
            matching.push(node);
        }
    }
    matching
}
/// Decides whether a single node may host instances under `placement`.
///
/// A node qualifies when it is marked "active", its heartbeat is fresh (unless
/// the timeout is 0, which disables the check), it carries at least one of the
/// requested roles/pools/node-classes (each filter only applies when the
/// corresponding policy list is non-empty), and every `match_labels` entry is
/// present with the exact value.
fn node_is_eligible(
    node: &ClusterNodeRecord,
    placement: &PlacementPolicy,
    heartbeat_timeout_secs: u64,
) -> bool {
    // Only nodes explicitly marked active are schedulable.
    if node.state.as_deref() != Some("active") {
        return false;
    }
    // Heartbeat freshness; a zero timeout disables this check entirely.
    if heartbeat_timeout_secs > 0 {
        let fresh = match node.last_heartbeat {
            None => false,
            Some(last) => {
                Utc::now().signed_duration_since(last).num_seconds()
                    <= heartbeat_timeout_secs as i64
            }
        };
        if !fresh {
            return false;
        }
    }
    // Role filter: the node must carry at least one requested role.
    if !placement.roles.is_empty() {
        let has_role = placement
            .roles
            .iter()
            .any(|expected| node.roles.iter().any(|role| role == expected));
        if !has_role {
            return false;
        }
    }
    // Pool filter: the node's resolved pool must be one of the requested pools.
    if !placement.pools.is_empty() {
        let pool_ok = matches!(
            node_pool(node),
            Some(pool) if placement.pools.iter().any(|expected| expected == pool)
        );
        if !pool_ok {
            return false;
        }
    }
    // Node-class filter, same shape as the pool filter.
    if !placement.node_classes.is_empty() {
        let class_ok = matches!(
            node_class(node),
            Some(class) if placement.node_classes.iter().any(|expected| expected == class)
        );
        if !class_ok {
            return false;
        }
    }
    // Finally, every requested label must match exactly.
    placement
        .match_labels
        .iter()
        .all(|(key, value)| node.labels.get(key) == Some(value))
}
/// Dispatches desired-instance computation to the strategy selected by the
/// service's schedule mode (replicated vs. one-per-node daemon).
fn build_desired_instances(
    service: &ServiceSpec,
    eligible_nodes: &[&ClusterNodeRecord],
    existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> {
    let mode = service
        .schedule
        .as_ref()
        .context("scheduled service missing schedule block")?
        .mode;
    match mode {
        ServiceScheduleMode::Daemon => {
            build_daemon_desired_instances(service, eligible_nodes, existing_instances)
        }
        ServiceScheduleMode::Replicated => {
            build_replicated_desired_instances(service, eligible_nodes, existing_instances)
        }
    }
}
/// Computes the desired instance set for a `Replicated` service: first keeps
/// reusable existing placements (healthiest first), then fills the remaining
/// replica slots on the least-loaded eligible nodes via `pick_next_node`.
/// May return fewer than `replicas` instances when capacity runs out; the
/// caller logs that shortfall.
fn build_replicated_desired_instances(
    service: &ServiceSpec,
    eligible_nodes: &[&ClusterNodeRecord],
    existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> {
    let schedule = service
        .schedule
        .as_ref()
        .context("scheduled service missing schedule block")?;
    let port = resolve_instance_port(service).with_context(|| {
        format!(
            "service {} is missing instance_port and service ports",
            service.name
        )
    })?;
    // A configured 0 would forbid placement everywhere; treat it as 1.
    let max_instances_per_node = schedule.placement.max_instances_per_node.max(1);
    let eligible_by_node: HashMap<&str, &ClusterNodeRecord> = eligible_nodes
        .iter()
        .copied()
        .map(|node| (node.node_id.as_str(), node))
        .collect();
    // Instances placed per node so far; every eligible node starts at 0.
    let mut counts: BTreeMap<String, u32> = eligible_nodes
        .iter()
        .map(|node| (node.node_id.clone(), 0))
        .collect();
    // Ordinals already claimed per node, so freshly minted ids cannot collide
    // with the ids of instances kept below.
    let mut used_ordinals: BTreeMap<String, HashSet<u32>> = BTreeMap::new();
    let mut desired = Vec::new();
    // Keep existing placements when possible: only non-unhealthy/failed
    // instances on still-eligible nodes qualify for reuse.
    let mut reusable = existing_instances
        .iter()
        .filter(|instance| eligible_by_node.contains_key(instance.node_id.as_str()))
        .filter(|instance| instance_is_reusable(instance))
        .collect::<Vec<_>>();
    // Healthiest first; instance id as a deterministic tie-breaker.
    reusable.sort_by(|lhs, rhs| {
        instance_state_rank(lhs)
            .cmp(&instance_state_rank(rhs))
            .then_with(|| lhs.instance_id.cmp(&rhs.instance_id))
    });
    for instance in reusable {
        if desired.len() >= schedule.replicas as usize {
            break;
        }
        let Some(node) = eligible_by_node.get(instance.node_id.as_str()).copied() else {
            continue;
        };
        let ordinal = counts.get(&node.node_id).copied().unwrap_or(0);
        if ordinal >= max_instances_per_node {
            // Node is already at its per-node cap.
            continue;
        }
        counts.insert(node.node_id.clone(), ordinal + 1);
        // Record the kept instance's ordinal (when its id follows the
        // render_instance_id scheme) so new ids skip it.
        if let Some(parsed_ordinal) =
            parse_instance_ordinal(&service.name, &node.node_id, &instance.instance_id)
        {
            used_ordinals
                .entry(node.node_id.clone())
                .or_default()
                .insert(parsed_ordinal);
        }
        desired.push(desired_instance(service, node, &instance.instance_id, port));
    }
    // Fill the remaining replica slots with fresh placements; stops early when
    // no node has capacity left.
    while desired.len() < schedule.replicas as usize {
        let Some(node) = pick_next_node(
            eligible_nodes,
            &counts,
            max_instances_per_node,
            schedule.placement.spread_by_label.as_deref(),
            existing_instances,
        ) else {
            break;
        };
        let ordinal = counts.get(&node.node_id).copied().unwrap_or(0);
        counts.insert(node.node_id.clone(), ordinal + 1);
        let instance_id = render_next_instance_id(&service.name, &node.node_id, &mut used_ordinals);
        desired.push(desired_instance(service, node, &instance_id, port));
    }
    Ok(desired)
}
/// Computes the desired instance set for a `Daemon` service: exactly one
/// instance per eligible node, reusing the best existing instance id on a
/// node when one exists, otherwise minting the ordinal-zero id.
fn build_daemon_desired_instances(
    service: &ServiceSpec,
    eligible_nodes: &[&ClusterNodeRecord],
    existing_instances: &[ServiceInstanceSpec],
) -> Result<Vec<ServiceInstanceSpec>> {
    let port = resolve_instance_port(service).with_context(|| {
        format!(
            "service {} is missing instance_port and service ports",
            service.name
        )
    })?;
    let eligible_ids: HashSet<&str> = eligible_nodes
        .iter()
        .map(|node| node.node_id.as_str())
        .collect();
    // Reusable instances on still-eligible nodes, ordered so the healthiest
    // (then lexically smallest) instance id comes first within each node.
    let mut candidates: Vec<&ServiceInstanceSpec> = existing_instances
        .iter()
        .filter(|instance| eligible_ids.contains(instance.node_id.as_str()))
        .filter(|instance| instance_is_reusable(instance))
        .collect();
    candidates.sort_by(|lhs, rhs| {
        lhs.node_id
            .cmp(&rhs.node_id)
            .then_with(|| instance_state_rank(lhs).cmp(&instance_state_rank(rhs)))
            .then_with(|| lhs.instance_id.cmp(&rhs.instance_id))
    });
    let mut per_node: BTreeMap<String, Vec<String>> = BTreeMap::new();
    for candidate in candidates {
        per_node
            .entry(candidate.node_id.clone())
            .or_default()
            .push(candidate.instance_id.clone());
    }
    // One placement per eligible node, in input order.
    let desired: Vec<ServiceInstanceSpec> = eligible_nodes
        .iter()
        .map(|node| {
            let reused = per_node
                .get_mut(&node.node_id)
                .filter(|ids| !ids.is_empty())
                .map(|ids| ids.remove(0));
            let instance_id =
                reused.unwrap_or_else(|| render_instance_id(&service.name, &node.node_id, 0));
            desired_instance(service, node, &instance_id, port)
        })
        .collect();
    Ok(desired)
}
/// Materializes the `ServiceInstanceSpec` for one placement, copying runtime
/// configuration (mesh port, health check, process/container) from the
/// service's schedule block. Status fields are left `None`; on writes,
/// `merge_preserved_fields` restores them from any stored record.
///
/// Panics if the service lacks a schedule block — callers only reach this for
/// scheduled services.
fn desired_instance(
    service: &ServiceSpec,
    node: &ClusterNodeRecord,
    instance_id: &str,
    port: u16,
) -> ServiceInstanceSpec {
    let schedule = service
        .schedule
        .as_ref()
        .expect("scheduled service missing schedule block");
    ServiceInstanceSpec {
        instance_id: instance_id.to_string(),
        service: service.name.clone(),
        node_id: node.node_id.clone(),
        ip: node.ip.clone(),
        port,
        mesh_port: schedule.mesh_port,
        version: None,
        health_check: schedule.health_check.clone(),
        process: schedule.process.clone(),
        container: schedule.container.clone(),
        managed_by: Some(MANAGED_BY.to_string()),
        // Status fields intentionally unset; they are preserved from the
        // stored record at write time rather than authored here.
        state: None,
        last_heartbeat: None,
        observed_at: None,
    }
}
/// Picks the best node for the next fresh placement among nodes still under
/// their per-node cap. Candidates are ranked (ascending) by: total instances
/// in the node's spread domain, the node's own instance count, preference for
/// nodes with reusable history, the spread-domain value, and finally node id
/// for determinism. Returns `None` when every node is at capacity.
fn pick_next_node<'a>(
    eligible_nodes: &'a [&ClusterNodeRecord],
    counts: &BTreeMap<String, u32>,
    max_instances_per_node: u32,
    spread_by_label: Option<&str>,
    existing_instances: &[ServiceInstanceSpec],
) -> Option<&'a ClusterNodeRecord> {
    let count_of = |node: &ClusterNodeRecord| counts.get(&node.node_id).copied().unwrap_or(0);
    eligible_nodes
        .iter()
        .copied()
        .filter(|node| count_of(node) < max_instances_per_node)
        .min_by_key(|node| {
            // Lexicographic tuple ordering reproduces the original chained
            // comparator exactly; ties are impossible past the unique node id.
            (
                spread_count_for_node(eligible_nodes, counts, node, spread_by_label),
                count_of(node),
                node_preference_rank(existing_instances, &node.node_id),
                spread_value(node, spread_by_label),
                node.node_id.clone(),
            )
        })
}
/// Sums the scheduled-instance counts of every eligible node that shares
/// `node`'s spread-domain value. Without a spread label all nodes report 0,
/// letting the caller's remaining tie-breakers decide.
fn spread_count_for_node(
    eligible_nodes: &[&ClusterNodeRecord],
    counts: &BTreeMap<String, u32>,
    node: &ClusterNodeRecord,
    spread_by_label: Option<&str>,
) -> u32 {
    let Some(label) = spread_by_label else {
        return 0;
    };
    let domain = spread_value(node, Some(label));
    let mut total = 0;
    for candidate in eligible_nodes {
        if spread_value(candidate, Some(label)) == domain {
            total += counts.get(&candidate.node_id).copied().unwrap_or(0);
        }
    }
    total
}
/// Resolves the spread-domain value for a node under the given label.
/// Recognized labels ("pool", "node_class", "failure_domain") consult the
/// dedicated node fields with label fallbacks; any other label is looked up
/// directly in the node's labels. `None` yields an empty string (one shared
/// domain for all nodes).
fn spread_value(node: &ClusterNodeRecord, spread_by_label: Option<&str>) -> String {
    let Some(label) = spread_by_label else {
        return String::new();
    };
    let resolved = match label {
        "pool" => node_pool(node).map(ToOwned::to_owned),
        "node_class" => node_class(node).map(ToOwned::to_owned),
        "failure_domain" => node
            .failure_domain
            .clone()
            .or_else(|| node.labels.get("failure_domain").cloned())
            .or_else(|| node.labels.get("topology.kubernetes.io/zone").cloned()),
        other => node.labels.get(other).cloned(),
    };
    // Nodes lacking the attribute fall back to their own id, which places
    // each of them in a singleton spread domain.
    resolved.unwrap_or_else(|| node.node_id.clone())
}
/// Resolves a node's pool: the explicit `pool` field wins, then the "pool"
/// label, then the namespaced "pool.photoncloud.io/name" label.
fn node_pool(node: &ClusterNodeRecord) -> Option<&str> {
    if let Some(pool) = node.pool.as_deref() {
        return Some(pool);
    }
    for key in ["pool", "pool.photoncloud.io/name"] {
        if let Some(value) = node.labels.get(key) {
            return Some(value.as_str());
        }
    }
    None
}
/// Resolves a node's class: the explicit `node_class` field wins, then the
/// "node_class" label, then the namespaced "nodeclass.photoncloud.io/name".
fn node_class(node: &ClusterNodeRecord) -> Option<&str> {
    if let Some(class) = node.node_class.as_deref() {
        return Some(class);
    }
    for key in ["node_class", "nodeclass.photoncloud.io/name"] {
        if let Some(value) = node.labels.get(key) {
            return Some(value.as_str());
        }
    }
    None
}
/// Resolves the port each instance should listen on: the schedule's explicit
/// `instance_port` wins, then the service's HTTP port, then its gRPC port.
fn resolve_instance_port(service: &ServiceSpec) -> Option<u16> {
    if let Some(port) = service.schedule.as_ref().and_then(|schedule| schedule.instance_port) {
        return Some(port);
    }
    let ports = service.ports.as_ref()?;
    ports.http.or(ports.grpc)
}
/// Number of instances a schedule aims for: daemons target every eligible
/// node, replicated services target the configured replica count.
fn schedule_target_count(mode: ServiceScheduleMode, eligible_nodes: usize, replicas: u32) -> usize {
    if matches!(mode, ServiceScheduleMode::Daemon) {
        eligible_nodes
    } else {
        replicas as usize
    }
}
/// Renders the canonical instance id for a (service, node, ordinal) slot.
/// Ordinal 0 is the bare "<service>-<node>"; higher ordinals append a
/// 1-based human-facing index, so ordinal 1 renders as "...-2".
fn render_instance_id(service: &str, node_id: &str, ordinal: u32) -> String {
    match ordinal {
        0 => format!("{service}-{node_id}"),
        n => format!("{service}-{node_id}-{}", n + 1),
    }
}
/// Claims the lowest free ordinal for `node_id`, records it in
/// `used_ordinals`, and renders the corresponding instance id.
fn render_next_instance_id(
    service: &str,
    node_id: &str,
    used_ordinals: &mut BTreeMap<String, HashSet<u32>>,
) -> String {
    let used = used_ordinals.entry(node_id.to_string()).or_default();
    let mut ordinal = 0u32;
    // `insert` returns true exactly when the ordinal was free; the successful
    // probe both reserves the ordinal and ends the scan.
    while !used.insert(ordinal) {
        ordinal += 1;
    }
    render_instance_id(service, node_id, ordinal)
}
/// Inverse of `render_instance_id`: extracts the 0-based ordinal from an
/// instance id. The bare "<service>-<node>" is ordinal 0; "...-N" maps to
/// N-1. Returns `None` for ids that don't follow the scheme (including a
/// "-0" suffix, which the renderer never emits).
fn parse_instance_ordinal(service: &str, node_id: &str, instance_id: &str) -> Option<u32> {
    let base = format!("{service}-{node_id}");
    let suffix = match instance_id.strip_prefix(&base) {
        Some("") => return Some(0),
        Some(rest) => rest.strip_prefix('-')?,
        None => return None,
    };
    suffix.parse::<u32>().ok()?.checked_sub(1)
}
fn decode_managed_instances(existing: &HashMap<String, Value>) -> Vec<ServiceInstanceSpec> {
let mut decoded = Vec::new();
for value in existing.values() {
if !is_managed_by_scheduler(value) {
continue;
}
match serde_json::from_value::<ServiceInstanceSpec>(value.clone()) {
Ok(instance) => decoded.push(instance),
Err(error) => warn!(error = %error, "failed to decode managed instance"),
}
}
decoded.sort_by(|lhs, rhs| lhs.instance_id.cmp(&rhs.instance_id));
decoded
}
/// Plans the writes and deletes needed to converge `service` from its current
/// managed instances to `desired_instances` while honoring the rollout
/// budget: at least `desired - max_unavailable` instances stay available, and
/// at most `desired + max_surge` records exist at once. Changes that would
/// violate those bounds are only counted as deferred for a later pass.
fn plan_managed_reconciliation(
    service: &ServiceSpec,
    desired_instances: &[ServiceInstanceSpec],
    existing_values: &HashMap<String, Value>,
    existing_instances: &[ServiceInstanceSpec],
    heartbeat_timeout_secs: u64,
) -> Result<ReconcilePlan> {
    let schedule = service
        .schedule
        .as_ref()
        .context("scheduled service missing schedule block")?;
    let desired_replicas = desired_instances.len();
    // Availability floor and total-record ceiling from the rollout strategy.
    let min_available = desired_replicas.saturating_sub(schedule.rollout.max_unavailable as usize);
    let max_total_instances = desired_replicas + schedule.rollout.max_surge as usize;
    let mut available_count = existing_instances
        .iter()
        .filter(|instance| instance_is_available(instance, heartbeat_timeout_secs))
        .count();
    let desired_ids: HashSet<_> = desired_instances
        .iter()
        .map(|instance| instance.instance_id.clone())
        .collect();
    let mut plan = ReconcilePlan::default();
    let mut managed_count = existing_instances.len();
    // Instances no longer in the desired set; delete the least healthy first.
    let mut stale_instances = existing_instances
        .iter()
        .filter(|instance| !desired_ids.contains(&instance.instance_id))
        .collect::<Vec<_>>();
    stale_instances.sort_by(|lhs, rhs| {
        instance_state_rank(lhs)
            .cmp(&instance_state_rank(rhs))
            .then_with(|| lhs.instance_id.cmp(&rhs.instance_id))
    });
    for instance in stale_instances {
        // Deleting an available instance must not drop us below the floor.
        if instance_is_available(instance, heartbeat_timeout_secs)
            && available_count.saturating_sub(1) < min_available
        {
            plan.deferred_deletes += 1;
            continue;
        }
        if instance_is_available(instance, heartbeat_timeout_secs) {
            available_count = available_count.saturating_sub(1);
        }
        managed_count = managed_count.saturating_sub(1);
        plan.deletes.push(instance.instance_id.clone());
    }
    // Remaining headroom for new records and for disrupting available ones.
    let mut create_budget = max_total_instances.saturating_sub(managed_count);
    let mut disruption_budget = available_count.saturating_sub(min_available);
    for instance in desired_instances {
        let existing_value = existing_values.get(&instance.instance_id);
        // Carry status fields over from the stored record so a rewrite does
        // not wipe externally-reported state/last_heartbeat/observed_at.
        let desired_value = merge_preserved_fields(serde_json::to_value(instance)?, existing_value);
        if existing_value == Some(&desired_value) {
            // Already up to date; no write needed.
            continue;
        }
        let existing_instance = existing_instances
            .iter()
            .find(|current| current.instance_id == instance.instance_id);
        match existing_instance {
            None => {
                // Brand-new record: consumes surge headroom.
                if create_budget == 0 {
                    plan.deferred_creates += 1;
                    continue;
                }
                create_budget -= 1;
                plan.upserts.push(PlannedUpsert {
                    instance: instance.clone(),
                    desired_value,
                });
            }
            Some(current) => {
                // Rewriting an available instance counts as a disruption.
                if instance_is_available(current, heartbeat_timeout_secs) {
                    if disruption_budget == 0 {
                        plan.deferred_updates += 1;
                        continue;
                    }
                    disruption_budget -= 1;
                }
                plan.upserts.push(PlannedUpsert {
                    instance: instance.clone(),
                    desired_value,
                });
            }
        }
    }
    Ok(plan)
}
/// An instance counts toward the availability floor only when it both reports
/// the "healthy" state and has a fresh heartbeat.
fn instance_is_available(instance: &ServiceInstanceSpec, heartbeat_timeout_secs: u64) -> bool {
    if instance.state.as_deref() != Some("healthy") {
        return false;
    }
    instance_has_fresh_heartbeat(instance, heartbeat_timeout_secs)
}
/// Heartbeat freshness for an instance. A zero timeout disables the check.
/// Uses `last_heartbeat` when present, falling back to `observed_at`; an
/// instance with neither timestamp is considered stale.
fn instance_has_fresh_heartbeat(instance: &ServiceInstanceSpec, heartbeat_timeout_secs: u64) -> bool {
    if heartbeat_timeout_secs == 0 {
        return true;
    }
    match instance.last_heartbeat.or(instance.observed_at) {
        None => false,
        Some(seen) => {
            let age_secs = Utc::now().signed_duration_since(seen).num_seconds();
            age_secs <= heartbeat_timeout_secs as i64
        }
    }
}
/// Whether an existing instance may keep its placement: anything except an
/// explicitly "unhealthy" or "failed" state qualifies (including no state).
fn instance_is_reusable(instance: &ServiceInstanceSpec) -> bool {
    !matches!(instance.state.as_deref(), Some("unhealthy" | "failed"))
}
/// Sort rank for instance states, lower = healthier. Used to prefer keeping
/// healthy instances and to delete the least healthy ones first. Unrecognized
/// states rank worst.
fn instance_state_rank(instance: &ServiceInstanceSpec) -> u8 {
    match instance.state.as_deref() {
        Some("healthy") => 0,
        None | Some("pending" | "provisioning" | "starting") => 1,
        Some("unknown") => 2,
        Some("unhealthy" | "failed") => 3,
        Some(_) => 4,
    }
}
fn node_preference_rank(existing_instances: &[ServiceInstanceSpec], node_id: &str) -> u8 {
let mut saw_failed = false;
for instance in existing_instances
.iter()
.filter(|instance| instance.node_id == node_id)
{
if instance_is_reusable(instance) {
return 0;
}
saw_failed = true;
}
if saw_failed {
2
} else {
1
}
}
fn merge_preserved_fields(mut desired: Value, existing: Option<&Value>) -> Value {
let Some(existing) = existing else {
return desired;
};
let (Some(desired_obj), Some(existing_obj)) = (desired.as_object_mut(), existing.as_object())
else {
return desired;
};
for preserve_key in ["state", "last_heartbeat", "observed_at"] {
if let Some(value) = existing_obj.get(preserve_key) {
match desired_obj.get_mut(preserve_key) {
Some(slot) if slot.is_null() => *slot = value.clone(),
Some(_) => {}
None => {
desired_obj.insert(preserve_key.to_string(), value.clone());
}
}
}
}
desired
}
fn is_managed_by_scheduler(value: &Value) -> bool {
value.get("managed_by").and_then(|value| value.as_str()) == Some(MANAGED_BY)
}
/// Builds the KV key for one instance record:
/// `<namespace>/clusters/<cluster>/instances/<service>/<instance>`.
fn instance_key(
    cluster_namespace: &str,
    cluster_id: &str,
    service: &str,
    instance_id: &str,
) -> String {
    format!("{cluster_namespace}/clusters/{cluster_id}/instances/{service}/{instance_id}")
}
/// Process entry point: installs an env-filtered `tracing` subscriber with an
/// added `info` directive, parses CLI flags, and runs the scheduler loop
/// until it exits (only under --once) or fails fatally.
#[tokio::main]
async fn main() -> Result<()> {
    tracing_subscriber::fmt()
        .with_env_filter(EnvFilter::from_default_env().add_directive("info".parse()?))
        .init();
    let scheduler = Scheduler::new(Cli::parse());
    scheduler.run_loop().await
}
#[cfg(test)]
mod tests {
use super::*;
use chrono::Duration as ChronoDuration;
use deployer_types::{
ClusterNodeRecord, HealthCheckSpec, PlacementPolicy, ProcessSpec, RolloutStrategySpec,
ServicePorts, ServiceScheduleMode, ServiceScheduleSpec,
};
fn active_node(node_id: &str, roles: &[&str], labels: &[(&str, &str)]) -> ClusterNodeRecord {
ClusterNodeRecord {
node_id: node_id.to_string(),
machine_id: None,
ip: format!("10.0.0.{}", &node_id[node_id.len() - 1..]),
hostname: node_id.to_string(),
roles: roles.iter().map(|role| role.to_string()).collect(),
labels: labels
.iter()
.map(|(key, value)| (key.to_string(), value.to_string()))
.collect(),
pool: Some("general".to_string()),
node_class: Some("worker-linux".to_string()),
failure_domain: Some(format!("rack-{}", &node_id[node_id.len() - 1..])),
nix_profile: Some("profiles/worker-linux".to_string()),
install_plan: None,
hardware_facts: None,
state: Some("active".to_string()),
commission_state: None,
install_state: None,
commissioned_at: None,
last_inventory_hash: None,
power_state: None,
bmc_ref: None,
last_heartbeat: Some(Utc::now() - ChronoDuration::seconds(10)),
}
}
fn scheduled_service() -> ServiceSpec {
ServiceSpec {
name: "api".to_string(),
ports: Some(ServicePorts {
http: Some(8080),
grpc: None,
}),
protocol: Some("http".to_string()),
mtls_required: None,
mesh_mode: None,
schedule: Some(ServiceScheduleSpec {
mode: ServiceScheduleMode::Replicated,
replicas: 2,
placement: PlacementPolicy {
roles: vec!["worker".to_string()],
pools: vec!["general".to_string()],
node_classes: vec!["worker-linux".to_string()],
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
spread_by_label: Some("failure_domain".to_string()),
max_instances_per_node: 1,
},
rollout: RolloutStrategySpec::default(),
instance_port: None,
mesh_port: Some(18080),
process: Some(ProcessSpec {
command: "/usr/bin/api".to_string(),
args: vec!["serve".to_string()],
working_dir: None,
env: HashMap::new(),
}),
container: None,
health_check: Some(HealthCheckSpec {
check_type: "http".to_string(),
path: Some("/health".to_string()),
interval_secs: Some(10),
timeout_secs: Some(5),
startup_grace_secs: Some(30),
}),
}),
publish: None,
}
}
#[test]
fn test_node_eligibility_matches_roles_and_labels() {
let node = active_node("node01", &["worker"], &[("tier", "general")]);
let placement = PlacementPolicy {
roles: vec!["worker".to_string()],
pools: vec!["general".to_string()],
node_classes: vec!["worker-linux".to_string()],
match_labels: HashMap::from([("tier".to_string(), "general".to_string())]),
spread_by_label: Some("failure_domain".to_string()),
max_instances_per_node: 1,
};
assert!(node_is_eligible(&node, &placement, 300));
}
#[test]
fn test_node_eligibility_rejects_stale_or_wrong_label() {
let mut stale = active_node("node01", &["worker"], &[("tier", "general")]);
stale.last_heartbeat = Some(Utc::now() - ChronoDuration::seconds(600));
let placement = PlacementPolicy {
roles: vec!["worker".to_string()],
pools: vec!["gpu".to_string()],
node_classes: vec!["gpu-worker".to_string()],
match_labels: HashMap::from([("tier".to_string(), "gpu".to_string())]),
spread_by_label: Some("failure_domain".to_string()),
max_instances_per_node: 1,
};
assert!(!node_is_eligible(&stale, &placement, 300));
}
#[test]
fn test_build_desired_instances_spreads_across_matching_nodes() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let desired = build_desired_instances(&scheduled_service(), &refs, &[]).unwrap();
assert_eq!(desired.len(), 2);
assert_eq!(desired[0].instance_id, "api-node01");
assert_eq!(desired[1].instance_id, "api-node02");
assert_eq!(desired[0].process.as_ref().unwrap().command, "/usr/bin/api");
}
#[test]
fn test_build_desired_instances_honors_max_instances_per_node() {
let nodes = vec![active_node("node01", &["worker"], &[("tier", "general")])];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.replicas = 3;
schedule.placement.max_instances_per_node = 2;
let desired = build_desired_instances(&service, &refs, &[]).unwrap();
assert_eq!(desired.len(), 2);
assert_eq!(desired[0].instance_id, "api-node01");
assert_eq!(desired[1].instance_id, "api-node01-2");
}
#[test]
fn test_build_desired_instances_daemon_places_on_every_eligible_node() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.mode = ServiceScheduleMode::Daemon;
schedule.replicas = 1;
let desired = build_desired_instances(&service, &refs, &[]).unwrap();
assert_eq!(desired.len(), 2);
assert_eq!(desired[0].instance_id, "api-node01");
assert_eq!(desired[1].instance_id, "api-node02");
}
#[test]
fn test_build_desired_instances_daemon_reuses_existing_instance_per_node() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.mode = ServiceScheduleMode::Daemon;
schedule.replicas = 1;
let existing = vec![
ServiceInstanceSpec {
instance_id: "api-node01-2".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: None,
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
service: "api".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: None,
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
];
let desired = build_desired_instances(&service, &refs, &existing).unwrap();
assert_eq!(desired.len(), 2);
assert_eq!(desired[0].instance_id, "api-node01-2");
assert_eq!(desired[1].instance_id, "api-node02");
}
#[test]
fn test_pick_next_node_prefers_less_used_failure_domain() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
active_node("node03", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let counts = BTreeMap::from([
("node01".to_string(), 1),
("node02".to_string(), 0),
("node03".to_string(), 1),
]);
let picked = pick_next_node(&refs, &counts, 2, Some("failure_domain"), &[]).unwrap();
assert_eq!(picked.node_id, "node02");
}
#[test]
fn test_build_desired_instances_preserves_existing_healthy_placement() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let mut service = scheduled_service();
service.schedule.as_mut().unwrap().replicas = 1;
let existing = vec![ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
service: "api".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: None,
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
}];
let desired = build_desired_instances(&service, &refs, &existing).unwrap();
assert_eq!(desired.len(), 1);
assert_eq!(desired[0].instance_id, "api-node02");
assert_eq!(desired[0].node_id, "node02");
}
#[test]
fn test_build_desired_instances_avoids_unhealthy_node_when_spare_exists() {
let nodes = vec![
active_node("node01", &["worker"], &[("tier", "general")]),
active_node("node02", &["worker"], &[("tier", "general")]),
];
let refs: Vec<&ClusterNodeRecord> = nodes.iter().collect();
let mut service = scheduled_service();
service.schedule.as_mut().unwrap().replicas = 1;
let existing = vec![ServiceInstanceSpec {
instance_id: "api-node01".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: None,
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("unhealthy".to_string()),
last_heartbeat: None,
observed_at: None,
}];
let desired = build_desired_instances(&service, &refs, &existing).unwrap();
assert_eq!(desired.len(), 1);
assert_eq!(desired[0].node_id, "node02");
assert_eq!(desired[0].instance_id, "api-node02");
}
#[test]
fn test_plan_reconciliation_defers_delete_until_replacement_is_healthy() {
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.replicas = 1;
schedule.rollout = RolloutStrategySpec {
max_unavailable: 0,
max_surge: 1,
};
let existing_instance = ServiceInstanceSpec {
instance_id: "api-node01".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: schedule.process.clone(),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
};
let existing = HashMap::from([(
existing_instance.instance_id.clone(),
serde_json::to_value(&existing_instance).unwrap(),
)]);
let desired_instances = vec![ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
..existing_instance.clone()
}];
let plan = plan_managed_reconciliation(
&service,
&desired_instances,
&existing,
std::slice::from_ref(&existing_instance),
0,
)
.unwrap();
assert_eq!(plan.upserts.len(), 1);
assert_eq!(plan.upserts[0].instance.instance_id, "api-node02");
assert!(plan.deletes.is_empty());
assert_eq!(plan.deferred_deletes, 1);
}
#[test]
fn test_plan_reconciliation_limits_healthy_updates_by_rollout_budget() {
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.replicas = 2;
schedule.rollout = RolloutStrategySpec {
max_unavailable: 1,
max_surge: 0,
};
let old_process = schedule.process.clone().unwrap();
let mut new_process = old_process.clone();
new_process.args.push("--new-flag".to_string());
schedule.process = Some(new_process.clone());
let existing_instances = vec![
ServiceInstanceSpec {
instance_id: "api-node01".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: Some(old_process.clone()),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
service: "api".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: Some(old_process),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
];
let existing = existing_instances
.iter()
.map(|instance| {
(
instance.instance_id.clone(),
serde_json::to_value(instance).unwrap(),
)
})
.collect::<HashMap<_, _>>();
let desired_instances = existing_instances
.iter()
.map(|instance| ServiceInstanceSpec {
process: Some(new_process.clone()),
..instance.clone()
})
.collect::<Vec<_>>();
let plan = plan_managed_reconciliation(
&service,
&desired_instances,
&existing,
&existing_instances,
0,
)
.unwrap();
assert_eq!(plan.upserts.len(), 1);
assert_eq!(plan.deferred_updates, 1);
}
#[test]
fn test_plan_reconciliation_treats_stale_healthy_instance_as_unavailable() {
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.replicas = 1;
schedule.rollout = RolloutStrategySpec {
max_unavailable: 0,
max_surge: 0,
};
let existing_instance = ServiceInstanceSpec {
instance_id: "api-node01".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: schedule.process.clone(),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: Some(Utc::now() - chrono::Duration::seconds(600)),
observed_at: None,
};
let existing = HashMap::from([(
existing_instance.instance_id.clone(),
serde_json::to_value(&existing_instance).unwrap(),
)]);
let desired_instances = vec![ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
last_heartbeat: Some(Utc::now()),
..existing_instance.clone()
}];
let plan = plan_managed_reconciliation(
&service,
&desired_instances,
&existing,
std::slice::from_ref(&existing_instance),
300,
)
.unwrap();
assert_eq!(plan.upserts.len(), 1);
assert_eq!(plan.deferred_updates, 0);
assert_eq!(plan.deferred_deletes, 0);
assert_eq!(plan.deletes, vec!["api-node01".to_string()]);
}
#[test]
fn test_plan_reconciliation_for_daemon_uses_current_eligible_node_count() {
let mut service = scheduled_service();
let schedule = service.schedule.as_mut().unwrap();
schedule.mode = ServiceScheduleMode::Daemon;
schedule.replicas = 99;
schedule.rollout = RolloutStrategySpec {
max_unavailable: 0,
max_surge: 0,
};
let existing_instances = vec![
ServiceInstanceSpec {
instance_id: "api-node01".to_string(),
service: "api".to_string(),
node_id: "node01".to_string(),
ip: "10.0.0.1".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: schedule.process.clone(),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
ServiceInstanceSpec {
instance_id: "api-node02".to_string(),
service: "api".to_string(),
node_id: "node02".to_string(),
ip: "10.0.0.2".to_string(),
port: 8080,
mesh_port: Some(18080),
version: None,
health_check: None,
process: schedule.process.clone(),
container: None,
managed_by: Some(MANAGED_BY.to_string()),
state: Some("healthy".to_string()),
last_heartbeat: None,
observed_at: None,
},
];
let existing = existing_instances
.iter()
.map(|instance| {
(
instance.instance_id.clone(),
serde_json::to_value(instance).unwrap(),
)
})
.collect::<HashMap<_, _>>();
let desired_instances = vec![existing_instances[1].clone()];
let plan = plan_managed_reconciliation(
&service,
&desired_instances,
&existing,
&existing_instances,
0,
)
.unwrap();
assert!(plan.upserts.is_empty());
assert_eq!(plan.deletes, vec!["api-node01".to_string()]);
assert_eq!(plan.deferred_deletes, 0);
}
#[test]
fn test_merge_preserved_fields_replaces_null_status_fields() {
let desired = serde_json::json!({
"instance_id": "api-node01",
"state": null,
"last_heartbeat": null,
"observed_at": null,
});
let existing = serde_json::json!({
"instance_id": "api-node01",
"state": "healthy",
"last_heartbeat": "2026-03-11T06:59:50Z",
"observed_at": "2026-03-11T06:59:51Z",
});
let merged = merge_preserved_fields(desired, Some(&existing));
assert_eq!(merged.get("state").and_then(Value::as_str), Some("healthy"));
assert_eq!(
merged.get("last_heartbeat").and_then(Value::as_str),
Some("2026-03-11T06:59:50Z")
);
assert_eq!(
merged.get("observed_at").and_then(Value::as_str),
Some("2026-03-11T06:59:51Z")
);
}
#[test]
fn test_render_next_instance_id_skips_used_ordinals() {
let mut used =
BTreeMap::from([("node01".to_string(), HashSet::from([0_u32, 1_u32, 3_u32]))]);
let instance_id = render_next_instance_id("api", "node01", &mut used);
assert_eq!(instance_id, "api-node01-3");
}
}