photoncloud-monorepo/k8shost/crates/k8shost-server/src/scheduler.rs
centra 3eeb303dcb feat: Batch commit for T039.S3 deployment
Includes all pending changes needed for nixos-anywhere:
- fiberlb: L7 policy, rule, certificate types
- deployer: New service for cluster management
- nix-nos: Generic network modules
- Various service updates and fixes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 04:34:51 +09:00

517 lines
17 KiB
Rust

//! Kubernetes scheduler implementation
//!
//! Assigns pending pods to available nodes based on resource availability and scheduling policies.
//! Implements tenant-aware scheduling with quota enforcement via CreditService.
use crate::storage::Storage;
use creditservice_client::Client as CreditServiceClient;
use k8shost_types::{Node, Pod};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::RwLock;
use tokio::time::sleep;
use tracing::{debug, info, warn};
/// Scheduler assigns pods to nodes with tenant-aware quota enforcement
pub struct Scheduler {
/// Shared cluster state store (pods and nodes) backing all scheduling queries.
storage: Arc<Storage>,
/// Delay between scheduling cycles (a `Duration`, constructed from seconds).
interval: Duration,
/// CreditService client for quota enforcement; `None` disables quota checks entirely.
credit_service: Option<Arc<RwLock<CreditServiceClient>>>,
}
impl Scheduler {
/// Create a new scheduler without quota enforcement
///
/// Uses a fixed 5-second polling interval and leaves the CreditService
/// client unset, so no quota checks are performed.
pub fn new(storage: Arc<Storage>) -> Self {
    // Check for pending pods every 5 seconds.
    let interval = Duration::from_secs(5);
    Self {
        storage,
        interval,
        credit_service: None,
    }
}
/// Create a new scheduler with CreditService quota enforcement
///
/// Reads `CREDITSERVICE_ENDPOINT`; when it is set, attempts to connect.
/// Both an unset variable and a connection failure fall back to running
/// with quota enforcement disabled rather than aborting startup.
pub async fn new_with_credit_service(storage: Arc<Storage>) -> Self {
    let credit_service = if let Ok(endpoint) = std::env::var("CREDITSERVICE_ENDPOINT") {
        match CreditServiceClient::connect(&endpoint).await {
            Ok(client) => {
                info!("Scheduler: CreditService quota enforcement enabled: {}", endpoint);
                Some(Arc::new(RwLock::new(client)))
            }
            Err(e) => {
                warn!(
                    "Scheduler: Failed to connect to CreditService (quota enforcement disabled): {}",
                    e
                );
                None
            }
        }
    } else {
        info!("Scheduler: CREDITSERVICE_ENDPOINT not set, quota enforcement disabled");
        None
    };
    Self {
        storage,
        interval: Duration::from_secs(5),
        credit_service,
    }
}
/// Start the scheduler loop
///
/// Runs forever: each cycle attempts to schedule all pending pods, logs
/// any cycle-level failure as a warning, then sleeps for the configured
/// interval before trying again.
pub async fn run(self: Arc<Self>) {
    info!("Scheduler started (spread algorithm, {}s interval)", self.interval.as_secs());
    loop {
        match self.schedule_pending_pods().await {
            Ok(()) => {}
            Err(e) => warn!("Scheduler cycle failed: {}", e),
        }
        sleep(self.interval).await;
    }
}
/// Schedule all pending pods across all tenants
///
/// Discovers active tenants, then schedules each tenant independently so
/// that one tenant's failure cannot block the others.
async fn schedule_pending_pods(&self) -> anyhow::Result<()> {
    // Get list of active tenants from storage (query pods for unique org_id/project_id)
    let tenants = self.get_active_tenants().await?;
    if tenants.is_empty() {
        debug!("No active tenants found");
        return Ok(());
    }
    info!("Scheduling for {} active tenant(s)", tenants.len());
    for (org_id, project_id) in &tenants {
        // A per-tenant failure is logged and skipped, not propagated.
        if let Err(e) = self.schedule_tenant_pods(org_id, project_id).await {
            warn!(
                "Failed to schedule pods for tenant {}/{}: {}",
                org_id, project_id, e
            );
        }
    }
    Ok(())
}
/// Get list of active tenants from storage (unique org_id/project_id pairs)
async fn get_active_tenants(&self) -> anyhow::Result<Vec<(String, String)>> {
// Query all pods to find unique (org_id, project_id) combinations
// This is a pragmatic approach that doesn't require IAM changes
let all_pods = self
.storage
.list_all_pods()
.await
.unwrap_or_else(|e| {
warn!("Failed to query all pods for tenant discovery: {}", e);
vec![]
});
let mut tenants: HashSet<(String, String)> = HashSet::new();
for pod in all_pods {
if let (Some(org_id), Some(project_id)) =
(pod.metadata.org_id.clone(), pod.metadata.project_id.clone())
{
tenants.insert((org_id, project_id));
}
}
// Fall back to default tenant if no pods found
if tenants.is_empty() {
tenants.insert(("default-org".to_string(), "default-project".to_string()));
}
Ok(tenants.into_iter().collect())
}
/// Schedule pending pods for a specific tenant
///
/// Lists the tenant's pods, filters to pending ones, filters nodes to
/// those reporting Ready, then assigns each pending pod using the spread
/// (least-pods) algorithm with a quota pre-check per pod.
async fn schedule_tenant_pods(&self, org_id: &str, project_id: &str) -> anyhow::Result<()> {
    // Get all pods in all namespaces for this tenant
    let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
    // Filter to pending pods that need scheduling. A pod is pending if:
    // 1. node_name is None (not yet assigned)
    // 2. status.phase is "Pending"
    let pending_pods: Vec<Pod> = all_pods
        .into_iter()
        .filter(|pod| {
            pod.spec.node_name.is_none()
                && pod
                    .status
                    .as_ref()
                    .and_then(|s| s.phase.as_ref())
                    .map(|p| p == "Pending")
                    .unwrap_or(false)
        })
        .collect();
    if pending_pods.is_empty() {
        debug!("No pending pods for tenant {}/{}", org_id, project_id);
        return Ok(());
    }
    info!("Scheduling {} pending pod(s) for tenant {}/{}",
        pending_pods.len(), org_id, project_id);
    // Get all nodes for this tenant
    let nodes = self.storage.list_nodes(org_id, project_id).await?;
    if nodes.is_empty() {
        warn!("No nodes available for tenant {}/{}. {} pod(s) remain pending.",
            org_id, project_id, pending_pods.len());
        return Ok(());
    }
    // Filter to ready nodes
    let ready_nodes: Vec<Node> = nodes
        .into_iter()
        .filter(|node| self.is_node_ready(node))
        .collect();
    if ready_nodes.is_empty() {
        warn!("No ready nodes available for tenant {}/{}. {} pod(s) remain pending.",
            org_id, project_id, pending_pods.len());
        return Ok(());
    }
    info!("Found {} ready node(s) for scheduling", ready_nodes.len());
    // Get current pod count per node for spread algorithm.
    // BUG FIX: this map must be kept up to date as pods are assigned below;
    // previously it was computed once and never updated, so every pending
    // pod in a single cycle was piled onto the same least-loaded node.
    let mut pod_counts = self.count_pods_per_node(org_id, project_id, &ready_nodes).await?;
    // Schedule each pending pod
    for pod in pending_pods {
        // Check quota before scheduling (if CreditService enabled)
        if let Err(e) = self.check_quota_for_pod(&pod, org_id, project_id).await {
            warn!(
                "Skipping pod {}/{} due to quota: {}",
                pod.metadata.namespace.as_deref().unwrap_or("default"),
                pod.metadata.name,
                e
            );
            continue;
        }
        match self.select_node_spread(&ready_nodes, &pod_counts) {
            Some(selected_node) => {
                // Clone the name so we can update pod_counts after the
                // immutable borrow of ready_nodes/selected_node is done with.
                let node_name = selected_node.metadata.name.clone();
                info!(
                    "Scheduling pod {}/{} to node {}",
                    pod.metadata.namespace.as_deref().unwrap_or("default"),
                    pod.metadata.name,
                    node_name
                );
                if let Err(e) = self.assign_pod_to_node(pod, &node_name).await {
                    warn!("Failed to assign pod to node: {}", e);
                } else {
                    // Reflect the new assignment so the spread algorithm
                    // distributes the remaining pods of this cycle.
                    *pod_counts.entry(node_name).or_insert(0) += 1;
                }
            }
            None => {
                warn!(
                    "No suitable node found for pod {}/{}",
                    pod.metadata.namespace.as_deref().unwrap_or("default"),
                    pod.metadata.name
                );
            }
        }
    }
    Ok(())
}
/// Check if a node is ready for scheduling
///
/// Returns true only when the node has a status containing a condition
/// with `type == "Ready"` and `status == "True"`; a node without any
/// status is treated as not ready.
fn is_node_ready(&self, node: &Node) -> bool {
    let Some(status) = node.status.as_ref() else {
        return false;
    };
    status
        .conditions
        .iter()
        .any(|cond| cond.r#type == "Ready" && cond.status == "True")
}
/// Count pods currently assigned to each node
///
/// Seeds every given node with a count of 0 so nodes without pods still
/// appear in the map, then tallies all assigned pods of the tenant.
/// Pods bound to nodes outside `nodes` still get entries; callers only
/// look up the nodes they passed in.
async fn count_pods_per_node(
    &self,
    org_id: &str,
    project_id: &str,
    nodes: &[Node],
) -> anyhow::Result<HashMap<String, usize>> {
    let mut counts: HashMap<String, usize> = HashMap::with_capacity(nodes.len());
    for node in nodes {
        counts.insert(node.metadata.name.clone(), 0);
    }
    // Tally every pod that already has a node assignment.
    let assigned_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
    for pod in assigned_pods {
        if let Some(node_name) = pod.spec.node_name.as_ref() {
            *counts.entry(node_name.clone()).or_insert(0) += 1;
        }
    }
    Ok(counts)
}
/// Select node using spread algorithm (least pods)
///
/// Picks the node with the fewest assigned pods; a node missing from
/// `pod_counts` counts as 0. Ties resolve to the earliest node in the
/// slice (`min_by_key` keeps the first minimum).
fn select_node_spread<'a>(
    &self,
    ready_nodes: &'a [Node],
    pod_counts: &HashMap<String, usize>,
) -> Option<&'a Node> {
    let load_of = |node: &Node| pod_counts.get(&node.metadata.name).copied().unwrap_or(0);
    ready_nodes.iter().min_by_key(|node| load_of(node))
}
/// Assign a pod to a specific node
///
/// Sets `spec.node_name`, bumps the resource version (a missing or
/// unparseable version counts as 0), and persists the pod.
async fn assign_pod_to_node(&self, mut pod: Pod, node_name: &str) -> anyhow::Result<()> {
    // Record the node assignment.
    pod.spec.node_name = Some(node_name.to_string());
    // Increment the resource version for optimistic-concurrency tracking.
    let next_version = pod
        .metadata
        .resource_version
        .as_deref()
        .and_then(|v| v.parse::<u64>().ok())
        .unwrap_or(0)
        + 1;
    pod.metadata.resource_version = Some(next_version.to_string());
    // Persist the updated pod.
    self.storage.put_pod(&pod).await?;
    info!(
        "Assigned pod {}/{} to node {}",
        pod.metadata.namespace.as_deref().unwrap_or("default"),
        pod.metadata.name,
        node_name
    );
    Ok(())
}
/// Check if tenant has quota for scheduling this pod
///
/// Returns Ok(()) when enforcement is disabled, when the quota check
/// allows the pod, or when CreditService itself errors (fail-open so an
/// outage does not block scheduling). Returns Err only on an explicit
/// quota denial.
async fn check_quota_for_pod(
    &self,
    pod: &Pod,
    org_id: &str,
    project_id: &str,
) -> anyhow::Result<()> {
    use creditservice_client::ResourceType;
    // If CreditService is not enabled, skip quota check.
    let Some(ref credit_svc) = self.credit_service else {
        return Ok(());
    };
    // Estimate this pod's cost before touching the client.
    let estimated_cost = Self::calculate_pod_cost(pod);
    let mut client = credit_svc.write().await;
    // NOTE(review): ResourceType::K8sNode is used for a pod-level check —
    // confirm this is the intended resource type for pod quota.
    let quota_result = client
        .check_quota(project_id, ResourceType::K8sNode, 1, estimated_cost as i64)
        .await;
    match quota_result {
        Ok(response) => {
            if response.allowed {
                return Ok(());
            }
            let reason = if response.reason.is_empty() {
                "Quota exceeded".to_string()
            } else {
                response.reason
            };
            Err(anyhow::anyhow!(
                "Quota check failed for tenant {}/{}: {}",
                org_id,
                project_id,
                reason
            ))
        }
        Err(e) => {
            // Log error but don't block scheduling if CreditService is unavailable.
            warn!(
                "CreditService check_quota failed for tenant {}/{} (allowing scheduling): {}",
                org_id, project_id, e
            );
            Ok(())
        }
    }
}
/// Calculate estimated cost for a pod based on resource requests
/// This matches the calculation in PodServiceImpl for consistency
///
/// Cost model: 10 base credits per pod, plus 100 credits per requested
/// CPU core and 50 credits per requested GB of memory.
fn calculate_pod_cost(pod: &Pod) -> u64 {
    // Base cost per pod
    let mut cost: u64 = 10;
    // Add cost based on resource requests if present
    for container in &pod.spec.containers {
        if let Some(ref resources) = container.resources {
            // CPU: 1 core = 100 credits.
            // BUG FIX: accept Kubernetes millicore notation ("500m") as
            // well as plain core counts ("0.5", "2"); the previous bare
            // f64 parse silently dropped millicore requests, undercounting.
            // NOTE(review): if PodServiceImpl still ignores millicores,
            // update it to match this calculation.
            if let Some(cpu) = resources.requests.get("cpu") {
                if let Some(cores) = Self::parse_cpu_to_cores(cpu) {
                    cost += (cores * 100.0) as u64;
                }
            }
            // Memory: 1 GB = 50 credits
            if let Some(memory) = resources.requests.get("memory") {
                // Parse memory (e.g., "512Mi", "1Gi")
                if let Some(gb) = Self::parse_memory_to_gb(memory) {
                    cost += (gb * 50.0) as u64;
                }
            }
        }
    }
    cost
}
/// Parse a Kubernetes CPU quantity into fractional cores
/// (e.g. "500m" -> 0.5, "2" -> 2.0). Returns None for malformed input.
fn parse_cpu_to_cores(cpu: &str) -> Option<f64> {
    if let Some(millis) = cpu.strip_suffix('m') {
        // Millicores: 1000m == 1 core.
        millis.parse::<f64>().ok().map(|m| m / 1000.0)
    } else {
        cpu.parse::<f64>().ok()
    }
}
/// Parse memory string to GB (e.g., "512Mi" -> 0.5, "2Gi" -> 2.0)
///
/// Supports binary suffixes (Gi, Mi, Ki), decimal SI suffixes
/// (G, M, K/k — previously rejected, returning None even though they are
/// valid Kubernetes quantities), and plain byte counts. All inputs are
/// normalized through bytes and divided by 2^30, matching the original
/// Gi/Mi/Ki/bytes arithmetic exactly. Returns None for malformed input.
fn parse_memory_to_gb(memory: &str) -> Option<f64> {
    const GIB: f64 = 1024.0 * 1024.0 * 1024.0;
    // (suffix, bytes per unit). Binary suffixes are listed first so that
    // e.g. "Gi" is matched before a bare "G" could be considered.
    const UNITS: [(&str, f64); 7] = [
        ("Gi", 1024.0 * 1024.0 * 1024.0),
        ("Mi", 1024.0 * 1024.0),
        ("Ki", 1024.0),
        ("G", 1e9),
        ("M", 1e6),
        ("K", 1e3),
        ("k", 1e3),
    ];
    for &(suffix, bytes_per_unit) in UNITS.iter() {
        if let Some(number) = memory.strip_suffix(suffix) {
            return number.parse::<f64>().ok().map(|v| v * bytes_per_unit / GIB);
        }
    }
    // No recognized suffix: assume a plain byte count.
    memory.parse::<f64>().ok().map(|bytes| bytes / GIB)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use k8shost_types::{NodeCondition, NodeStatus, ObjectMeta, PodSpec, PodStatus};

    /// Build a node with the given name and optional status.
    fn make_node(name: &str, status: Option<NodeStatus>) -> Node {
        Node {
            metadata: ObjectMeta {
                name: name.to_string(),
                namespace: None,
                uid: None,
                resource_version: None,
                creation_timestamp: None,
                labels: HashMap::new(),
                annotations: HashMap::new(),
                org_id: None,
                project_id: None,
            },
            spec: k8shost_types::NodeSpec { pod_cidr: None },
            status,
        }
    }

    /// Build a scheduler backed by in-memory storage.
    async fn make_scheduler() -> Scheduler {
        let storage = Arc::new(
            Storage::new("memory://test".to_string())
                .await
                .expect("Failed to create storage"),
        );
        Scheduler::new(storage)
    }

    #[tokio::test]
    async fn test_is_node_ready() {
        let scheduler = make_scheduler().await;
        // A node with an explicit Ready=True condition is schedulable.
        let ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![NodeCondition {
                    r#type: "Ready".to_string(),
                    status: "True".to_string(),
                    reason: None,
                    message: None,
                }],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(scheduler.is_node_ready(&ready_node));
        // A status without any Ready condition is not ready.
        let not_ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(!scheduler.is_node_ready(&not_ready_node));
    }

    #[tokio::test]
    async fn test_select_node_spread() {
        let scheduler = make_scheduler().await;
        let nodes = vec![make_node("node1", None), make_node("node2", None)];
        // node1 carries 2 pods, node2 only 1 -> node2 should be selected.
        let mut pod_counts = HashMap::new();
        pod_counts.insert("node1".to_string(), 2);
        pod_counts.insert("node2".to_string(), 1);
        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert_eq!(selected.unwrap().metadata.name, "node2");
        // With an even distribution some node must still be chosen.
        pod_counts.insert("node1".to_string(), 1);
        pod_counts.insert("node2".to_string(), 1);
        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert!(selected.is_some());
    }
}