//! Kubernetes scheduler implementation //! //! Assigns pending pods to available nodes based on resource availability and scheduling policies. //! Implements tenant-aware scheduling with quota enforcement via CreditService. use crate::storage::Storage; use creditservice_client::Client as CreditServiceClient; use k8shost_types::{Node, Pod}; use std::collections::{HashMap, HashSet}; use std::sync::Arc; use std::time::Duration; use tokio::sync::RwLock; use tokio::time::sleep; use tracing::{debug, info, warn}; /// Scheduler assigns pods to nodes with tenant-aware quota enforcement pub struct Scheduler { storage: Arc, /// Scheduling interval in seconds interval: Duration, /// CreditService client for quota enforcement (optional) credit_service: Option>>, } impl Scheduler { /// Create a new scheduler without quota enforcement pub fn new(storage: Arc) -> Self { Self { storage, interval: Duration::from_secs(5), // Check for pending pods every 5 seconds credit_service: None, } } /// Create a new scheduler with CreditService quota enforcement pub async fn new_with_credit_service(storage: Arc) -> Self { // Initialize CreditService client if endpoint is configured let credit_service = match std::env::var("CREDITSERVICE_ENDPOINT") { Ok(endpoint) => match CreditServiceClient::connect(&endpoint).await { Ok(client) => { info!("Scheduler: CreditService quota enforcement enabled: {}", endpoint); Some(Arc::new(RwLock::new(client))) } Err(e) => { warn!( "Scheduler: Failed to connect to CreditService (quota enforcement disabled): {}", e ); None } }, Err(_) => { info!("Scheduler: CREDITSERVICE_ENDPOINT not set, quota enforcement disabled"); None } }; Self { storage, interval: Duration::from_secs(5), credit_service, } } /// Start the scheduler loop pub async fn run(self: Arc) { info!("Scheduler started (spread algorithm, {}s interval)", self.interval.as_secs()); loop { if let Err(e) = self.schedule_pending_pods().await { warn!("Scheduler cycle failed: {}", e); } sleep(self.interval).await; } } /// Schedule all pending pods across all tenants async fn schedule_pending_pods(&self) -> anyhow::Result<()> { // Get list of active tenants from storage (query pods for unique org_id/project_id) let tenants = self.get_active_tenants().await?; if tenants.is_empty() { debug!("No active tenants found"); return Ok(()); } info!("Scheduling for {} active tenant(s)", tenants.len()); for (org_id, project_id) in tenants { if let Err(e) = self.schedule_tenant_pods(&org_id, &project_id).await { warn!( "Failed to schedule pods for tenant {}/{}: {}", org_id, project_id, e ); } } Ok(()) } /// Get list of active tenants from storage (unique org_id/project_id pairs) async fn get_active_tenants(&self) -> anyhow::Result> { // Query all pods to find unique (org_id, project_id) combinations // This is a pragmatic approach that doesn't require IAM changes let all_pods = self .storage .list_all_pods() .await .unwrap_or_else(|e| { warn!("Failed to query all pods for tenant discovery: {}", e); vec![] }); let mut tenants: HashSet<(String, String)> = HashSet::new(); for pod in all_pods { if let (Some(org_id), Some(project_id)) = (pod.metadata.org_id.clone(), pod.metadata.project_id.clone()) { tenants.insert((org_id, project_id)); } } // Fall back to default tenant if no pods found if tenants.is_empty() { tenants.insert(("default-org".to_string(), "default-project".to_string())); } Ok(tenants.into_iter().collect()) } /// Schedule pending pods for a specific tenant async fn schedule_tenant_pods(&self, org_id: &str, project_id: &str) -> anyhow::Result<()> { // Get all pods in all namespaces for this tenant let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?; // Filter to pending pods that need scheduling let pending_pods: Vec = all_pods .into_iter() .filter(|pod| { // Pod is pending if: // 1. node_name is None (not yet assigned) // 2. status.phase is "Pending" pod.spec.node_name.is_none() && pod .status .as_ref() .and_then(|s| s.phase.as_ref()) .map(|p| p == "Pending") .unwrap_or(false) }) .collect(); if pending_pods.is_empty() { debug!("No pending pods for tenant {}/{}", org_id, project_id); return Ok(()); } info!("Scheduling {} pending pod(s) for tenant {}/{}", pending_pods.len(), org_id, project_id); // Get all nodes for this tenant let nodes = self.storage.list_nodes(org_id, project_id).await?; if nodes.is_empty() { warn!("No nodes available for tenant {}/{}. {} pod(s) remain pending.", org_id, project_id, pending_pods.len()); return Ok(()); } // Filter to ready nodes let ready_nodes: Vec = nodes .into_iter() .filter(|node| self.is_node_ready(node)) .collect(); if ready_nodes.is_empty() { warn!("No ready nodes available for tenant {}/{}. {} pod(s) remain pending.", org_id, project_id, pending_pods.len()); return Ok(()); } info!("Found {} ready node(s) for scheduling", ready_nodes.len()); // Get current pod count per node for spread algorithm let pod_counts = self.count_pods_per_node(org_id, project_id, &ready_nodes).await?; // Schedule each pending pod for pod in pending_pods { // Check quota before scheduling (if CreditService enabled) if let Err(e) = self.check_quota_for_pod(&pod, org_id, project_id).await { warn!( "Skipping pod {}/{} due to quota: {}", pod.metadata.namespace.as_deref().unwrap_or("default"), pod.metadata.name, e ); continue; } match self.select_node_spread(&ready_nodes, &pod_counts) { Some(selected_node) => { info!( "Scheduling pod {}/{} to node {}", pod.metadata.namespace.as_deref().unwrap_or("default"), pod.metadata.name, selected_node.metadata.name ); if let Err(e) = self .assign_pod_to_node(pod, &selected_node.metadata.name) .await { warn!("Failed to assign pod to node: {}", e); } } None => { warn!( "No suitable node found for pod {}/{}", pod.metadata.namespace.as_deref().unwrap_or("default"), pod.metadata.name ); } } } Ok(()) } /// Check if a node is ready for scheduling fn is_node_ready(&self, node: &Node) -> bool { node.status .as_ref() .map(|status| { status.conditions.iter().any(|cond| { cond.r#type == "Ready" && cond.status == "True" }) }) .unwrap_or(false) } /// Count pods currently assigned to each node async fn count_pods_per_node( &self, org_id: &str, project_id: &str, nodes: &[Node], ) -> anyhow::Result> { let mut counts: HashMap = nodes .iter() .map(|n| (n.metadata.name.clone(), 0)) .collect(); // Get all assigned pods let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?; // Count pods per node for pod in all_pods { if let Some(node_name) = &pod.spec.node_name { *counts.entry(node_name.clone()).or_insert(0) += 1; } } Ok(counts) } /// Select node using spread algorithm (least pods) fn select_node_spread<'a>( &self, ready_nodes: &'a [Node], pod_counts: &HashMap, ) -> Option<&'a Node> { // Simple spread: pick node with fewest pods ready_nodes .iter() .min_by_key(|node| pod_counts.get(&node.metadata.name).unwrap_or(&0)) } /// Assign a pod to a specific node async fn assign_pod_to_node(&self, mut pod: Pod, node_name: &str) -> anyhow::Result<()> { // Set the node assignment pod.spec.node_name = Some(node_name.to_string()); // Update resource version let current_version = pod .metadata .resource_version .as_ref() .and_then(|v| v.parse::().ok()) .unwrap_or(0); pod.metadata.resource_version = Some((current_version + 1).to_string()); // Store the updated pod self.storage.put_pod(&pod).await?; info!( "Assigned pod {}/{} to node {}", pod.metadata.namespace.as_deref().unwrap_or("default"), pod.metadata.name, node_name ); Ok(()) } /// Check if tenant has quota for scheduling this pod async fn check_quota_for_pod( &self, pod: &Pod, org_id: &str, project_id: &str, ) -> anyhow::Result<()> { // If CreditService is not enabled, skip quota check let Some(ref credit_svc) = self.credit_service else { return Ok(()); }; let mut client = credit_svc.write().await; // Calculate estimated cost for this pod let estimated_cost = Self::calculate_pod_cost(pod); // Check if tenant has sufficient quota use creditservice_client::ResourceType; match client .check_quota( project_id, ResourceType::K8sNode, 1, estimated_cost as i64, ) .await { Ok(response) if !response.allowed => { let reason = if response.reason.is_empty() { "Quota exceeded".to_string() } else { response.reason }; return Err(anyhow::anyhow!( "Quota check failed for tenant {}/{}: {}", org_id, project_id, reason )); } Ok(_) => Ok(()), Err(e) => { // Log error but don't block scheduling if CreditService is unavailable warn!( "CreditService check_quota failed for tenant {}/{} (allowing scheduling): {}", org_id, project_id, e ); Ok(()) } } } /// Calculate estimated cost for a pod based on resource requests /// This matches the calculation in PodServiceImpl for consistency fn calculate_pod_cost(pod: &Pod) -> u64 { // Base cost per pod let mut cost: u64 = 10; // Add cost based on resource requests if present for container in &pod.spec.containers { if let Some(ref resources) = container.resources { // CPU: 1 core = 100 credits if let Some(cpu) = resources.requests.get("cpu") { if let Ok(cores) = cpu.parse::() { cost += (cores * 100.0) as u64; } } // Memory: 1 GB = 50 credits if let Some(memory) = resources.requests.get("memory") { // Parse memory (e.g., "512Mi", "1Gi") if let Some(gb) = Self::parse_memory_to_gb(memory) { cost += (gb * 50.0) as u64; } } } } cost } /// Parse memory string to GB (e.g., "512Mi" -> 0.5, "2Gi" -> 2.0) fn parse_memory_to_gb(memory: &str) -> Option { if memory.ends_with("Gi") { memory .trim_end_matches("Gi") .parse::() .ok() } else if memory.ends_with("Mi") { memory .trim_end_matches("Mi") .parse::() .ok() .map(|mi| mi / 1024.0) } else if memory.ends_with("Ki") { memory .trim_end_matches("Ki") .parse::() .ok() .map(|ki| ki / (1024.0 * 1024.0)) } else { // Assume bytes memory.parse::().ok().map(|bytes| bytes / (1024.0 * 1024.0 * 1024.0)) } } } #[cfg(test)] mod tests { use super::*; use k8shost_types::{NodeCondition, NodeStatus, ObjectMeta, PodSpec, PodStatus}; #[tokio::test] async fn test_is_node_ready() { let storage = Arc::new(Storage::new("memory://test".to_string()).await.expect("Failed to create storage")); let scheduler = Scheduler::new(storage); // Node with Ready=True condition let ready_node = Node { metadata: ObjectMeta { name: "node1".to_string(), namespace: None, uid: None, resource_version: None, creation_timestamp: None, labels: HashMap::new(), annotations: HashMap::new(), org_id: None, project_id: None, }, spec: k8shost_types::NodeSpec { pod_cidr: None }, status: Some(NodeStatus { addresses: vec![], conditions: vec![NodeCondition { r#type: "Ready".to_string(), status: "True".to_string(), reason: None, message: None, }], capacity: HashMap::new(), allocatable: HashMap::new(), }), }; assert!(scheduler.is_node_ready(&ready_node)); // Node without Ready condition let not_ready_node = Node { metadata: ready_node.metadata.clone(), spec: k8shost_types::NodeSpec { pod_cidr: None }, status: Some(NodeStatus { addresses: vec![], conditions: vec![], capacity: HashMap::new(), allocatable: HashMap::new(), }), }; assert!(!scheduler.is_node_ready(¬_ready_node)); } #[tokio::test] async fn test_select_node_spread() { let storage = Arc::new(Storage::new("memory://test".to_string()).await.expect("Failed to create storage")); let scheduler = Scheduler::new(storage); let node1 = Node { metadata: ObjectMeta { name: "node1".to_string(), namespace: None, uid: None, resource_version: None, creation_timestamp: None, labels: HashMap::new(), annotations: HashMap::new(), org_id: None, project_id: None, }, spec: k8shost_types::NodeSpec { pod_cidr: None }, status: None, }; let node2 = Node { metadata: ObjectMeta { name: "node2".to_string(), ..node1.metadata.clone() }, spec: k8shost_types::NodeSpec { pod_cidr: None }, status: None, }; let nodes = vec![node1, node2]; // Node1 has 2 pods, node2 has 1 pod let mut pod_counts = HashMap::new(); pod_counts.insert("node1".to_string(), 2); pod_counts.insert("node2".to_string(), 1); // Should select node2 (fewer pods) let selected = scheduler.select_node_spread(&nodes, &pod_counts); assert_eq!(selected.unwrap().metadata.name, "node2"); // Equal distribution - should select first node pod_counts.insert("node1".to_string(), 1); pod_counts.insert("node2".to_string(), 1); let selected = scheduler.select_node_spread(&nodes, &pod_counts); assert!(selected.is_some()); } }