Includes all pending changes needed for nixos-anywhere: - fiberlb: L7 policy, rule, certificate types - deployer: New service for cluster management - nix-nos: Generic network modules - Various service updates and fixes 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
517 lines
17 KiB
Rust
517 lines
17 KiB
Rust
//! Kubernetes scheduler implementation
|
|
//!
|
|
//! Assigns pending pods to available nodes based on resource availability and scheduling policies.
|
|
//! Implements tenant-aware scheduling with quota enforcement via CreditService.
|
|
|
|
use crate::storage::Storage;
|
|
use creditservice_client::Client as CreditServiceClient;
|
|
use k8shost_types::{Node, Pod};
|
|
use std::collections::{HashMap, HashSet};
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
use tokio::sync::RwLock;
|
|
use tokio::time::sleep;
|
|
use tracing::{debug, info, warn};
|
|
|
|
/// Scheduler assigns pods to nodes with tenant-aware quota enforcement
pub struct Scheduler {
    /// Backing store: read for tenant/pod/node discovery, written on pod assignment
    storage: Arc<Storage>,
    /// Delay between scheduling passes (set to 5s by both constructors)
    interval: Duration,
    /// CreditService client for quota enforcement (optional; None = quota checks disabled)
    credit_service: Option<Arc<RwLock<CreditServiceClient>>>,
}
|
|
|
|
impl Scheduler {
|
|
/// Create a new scheduler without quota enforcement
|
|
pub fn new(storage: Arc<Storage>) -> Self {
|
|
Self {
|
|
storage,
|
|
interval: Duration::from_secs(5), // Check for pending pods every 5 seconds
|
|
credit_service: None,
|
|
}
|
|
}
|
|
|
|
/// Create a new scheduler with CreditService quota enforcement
|
|
pub async fn new_with_credit_service(storage: Arc<Storage>) -> Self {
|
|
// Initialize CreditService client if endpoint is configured
|
|
let credit_service = match std::env::var("CREDITSERVICE_ENDPOINT") {
|
|
Ok(endpoint) => match CreditServiceClient::connect(&endpoint).await {
|
|
Ok(client) => {
|
|
info!("Scheduler: CreditService quota enforcement enabled: {}", endpoint);
|
|
Some(Arc::new(RwLock::new(client)))
|
|
}
|
|
Err(e) => {
|
|
warn!(
|
|
"Scheduler: Failed to connect to CreditService (quota enforcement disabled): {}",
|
|
e
|
|
);
|
|
None
|
|
}
|
|
},
|
|
Err(_) => {
|
|
info!("Scheduler: CREDITSERVICE_ENDPOINT not set, quota enforcement disabled");
|
|
None
|
|
}
|
|
};
|
|
|
|
Self {
|
|
storage,
|
|
interval: Duration::from_secs(5),
|
|
credit_service,
|
|
}
|
|
}
|
|
|
|
/// Start the scheduler loop
|
|
pub async fn run(self: Arc<Self>) {
|
|
info!("Scheduler started (spread algorithm, {}s interval)", self.interval.as_secs());
|
|
|
|
loop {
|
|
if let Err(e) = self.schedule_pending_pods().await {
|
|
warn!("Scheduler cycle failed: {}", e);
|
|
}
|
|
|
|
sleep(self.interval).await;
|
|
}
|
|
}
|
|
|
|
/// Schedule all pending pods across all tenants
|
|
async fn schedule_pending_pods(&self) -> anyhow::Result<()> {
|
|
// Get list of active tenants from storage (query pods for unique org_id/project_id)
|
|
let tenants = self.get_active_tenants().await?;
|
|
|
|
if tenants.is_empty() {
|
|
debug!("No active tenants found");
|
|
return Ok(());
|
|
}
|
|
|
|
info!("Scheduling for {} active tenant(s)", tenants.len());
|
|
|
|
for (org_id, project_id) in tenants {
|
|
if let Err(e) = self.schedule_tenant_pods(&org_id, &project_id).await {
|
|
warn!(
|
|
"Failed to schedule pods for tenant {}/{}: {}",
|
|
org_id, project_id, e
|
|
);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get list of active tenants from storage (unique org_id/project_id pairs)
|
|
async fn get_active_tenants(&self) -> anyhow::Result<Vec<(String, String)>> {
|
|
// Query all pods to find unique (org_id, project_id) combinations
|
|
// This is a pragmatic approach that doesn't require IAM changes
|
|
let all_pods = self
|
|
.storage
|
|
.list_all_pods()
|
|
.await
|
|
.unwrap_or_else(|e| {
|
|
warn!("Failed to query all pods for tenant discovery: {}", e);
|
|
vec![]
|
|
});
|
|
|
|
let mut tenants: HashSet<(String, String)> = HashSet::new();
|
|
|
|
for pod in all_pods {
|
|
if let (Some(org_id), Some(project_id)) =
|
|
(pod.metadata.org_id.clone(), pod.metadata.project_id.clone())
|
|
{
|
|
tenants.insert((org_id, project_id));
|
|
}
|
|
}
|
|
|
|
// Fall back to default tenant if no pods found
|
|
if tenants.is_empty() {
|
|
tenants.insert(("default-org".to_string(), "default-project".to_string()));
|
|
}
|
|
|
|
Ok(tenants.into_iter().collect())
|
|
}
|
|
|
|
/// Schedule pending pods for a specific tenant
|
|
async fn schedule_tenant_pods(&self, org_id: &str, project_id: &str) -> anyhow::Result<()> {
|
|
// Get all pods in all namespaces for this tenant
|
|
let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
|
|
|
|
// Filter to pending pods that need scheduling
|
|
let pending_pods: Vec<Pod> = all_pods
|
|
.into_iter()
|
|
.filter(|pod| {
|
|
// Pod is pending if:
|
|
// 1. node_name is None (not yet assigned)
|
|
// 2. status.phase is "Pending"
|
|
pod.spec.node_name.is_none()
|
|
&& pod
|
|
.status
|
|
.as_ref()
|
|
.and_then(|s| s.phase.as_ref())
|
|
.map(|p| p == "Pending")
|
|
.unwrap_or(false)
|
|
})
|
|
.collect();
|
|
|
|
if pending_pods.is_empty() {
|
|
debug!("No pending pods for tenant {}/{}", org_id, project_id);
|
|
return Ok(());
|
|
}
|
|
|
|
info!("Scheduling {} pending pod(s) for tenant {}/{}",
|
|
pending_pods.len(), org_id, project_id);
|
|
|
|
// Get all nodes for this tenant
|
|
let nodes = self.storage.list_nodes(org_id, project_id).await?;
|
|
|
|
if nodes.is_empty() {
|
|
warn!("No nodes available for tenant {}/{}. {} pod(s) remain pending.",
|
|
org_id, project_id, pending_pods.len());
|
|
return Ok(());
|
|
}
|
|
|
|
// Filter to ready nodes
|
|
let ready_nodes: Vec<Node> = nodes
|
|
.into_iter()
|
|
.filter(|node| self.is_node_ready(node))
|
|
.collect();
|
|
|
|
if ready_nodes.is_empty() {
|
|
warn!("No ready nodes available for tenant {}/{}. {} pod(s) remain pending.",
|
|
org_id, project_id, pending_pods.len());
|
|
return Ok(());
|
|
}
|
|
|
|
info!("Found {} ready node(s) for scheduling", ready_nodes.len());
|
|
|
|
// Get current pod count per node for spread algorithm
|
|
let pod_counts = self.count_pods_per_node(org_id, project_id, &ready_nodes).await?;
|
|
|
|
// Schedule each pending pod
|
|
for pod in pending_pods {
|
|
// Check quota before scheduling (if CreditService enabled)
|
|
if let Err(e) = self.check_quota_for_pod(&pod, org_id, project_id).await {
|
|
warn!(
|
|
"Skipping pod {}/{} due to quota: {}",
|
|
pod.metadata.namespace.as_deref().unwrap_or("default"),
|
|
pod.metadata.name,
|
|
e
|
|
);
|
|
continue;
|
|
}
|
|
|
|
match self.select_node_spread(&ready_nodes, &pod_counts) {
|
|
Some(selected_node) => {
|
|
info!(
|
|
"Scheduling pod {}/{} to node {}",
|
|
pod.metadata.namespace.as_deref().unwrap_or("default"),
|
|
pod.metadata.name,
|
|
selected_node.metadata.name
|
|
);
|
|
|
|
if let Err(e) = self
|
|
.assign_pod_to_node(pod, &selected_node.metadata.name)
|
|
.await
|
|
{
|
|
warn!("Failed to assign pod to node: {}", e);
|
|
}
|
|
}
|
|
None => {
|
|
warn!(
|
|
"No suitable node found for pod {}/{}",
|
|
pod.metadata.namespace.as_deref().unwrap_or("default"),
|
|
pod.metadata.name
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Check if a node is ready for scheduling
|
|
fn is_node_ready(&self, node: &Node) -> bool {
|
|
node.status
|
|
.as_ref()
|
|
.map(|status| {
|
|
status.conditions.iter().any(|cond| {
|
|
cond.r#type == "Ready" && cond.status == "True"
|
|
})
|
|
})
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
/// Count pods currently assigned to each node
|
|
async fn count_pods_per_node(
|
|
&self,
|
|
org_id: &str,
|
|
project_id: &str,
|
|
nodes: &[Node],
|
|
) -> anyhow::Result<HashMap<String, usize>> {
|
|
let mut counts: HashMap<String, usize> = nodes
|
|
.iter()
|
|
.map(|n| (n.metadata.name.clone(), 0))
|
|
.collect();
|
|
|
|
// Get all assigned pods
|
|
let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
|
|
|
|
// Count pods per node
|
|
for pod in all_pods {
|
|
if let Some(node_name) = &pod.spec.node_name {
|
|
*counts.entry(node_name.clone()).or_insert(0) += 1;
|
|
}
|
|
}
|
|
|
|
Ok(counts)
|
|
}
|
|
|
|
/// Select node using spread algorithm (least pods)
|
|
fn select_node_spread<'a>(
|
|
&self,
|
|
ready_nodes: &'a [Node],
|
|
pod_counts: &HashMap<String, usize>,
|
|
) -> Option<&'a Node> {
|
|
// Simple spread: pick node with fewest pods
|
|
ready_nodes
|
|
.iter()
|
|
.min_by_key(|node| pod_counts.get(&node.metadata.name).unwrap_or(&0))
|
|
}
|
|
|
|
/// Assign a pod to a specific node
|
|
async fn assign_pod_to_node(&self, mut pod: Pod, node_name: &str) -> anyhow::Result<()> {
|
|
// Set the node assignment
|
|
pod.spec.node_name = Some(node_name.to_string());
|
|
|
|
// Update resource version
|
|
let current_version = pod
|
|
.metadata
|
|
.resource_version
|
|
.as_ref()
|
|
.and_then(|v| v.parse::<u64>().ok())
|
|
.unwrap_or(0);
|
|
pod.metadata.resource_version = Some((current_version + 1).to_string());
|
|
|
|
// Store the updated pod
|
|
self.storage.put_pod(&pod).await?;
|
|
|
|
info!(
|
|
"Assigned pod {}/{} to node {}",
|
|
pod.metadata.namespace.as_deref().unwrap_or("default"),
|
|
pod.metadata.name,
|
|
node_name
|
|
);
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Check whether the tenant has quota headroom to schedule `pod`.
///
/// Returns `Ok(())` to allow scheduling and `Err` only when CreditService
/// explicitly denies the request. Policy is fail-open: when CreditService
/// is not configured, or the quota call itself errors, the pod is allowed.
async fn check_quota_for_pod(
    &self,
    pod: &Pod,
    org_id: &str,
    project_id: &str,
) -> anyhow::Result<()> {
    // If CreditService is not enabled, skip quota check (fail-open).
    let Some(ref credit_svc) = self.credit_service else {
        return Ok(());
    };

    // Take the write lock on the shared client.
    // NOTE(review): the lock is held across the network call below, which
    // serializes all quota checks — confirm this is acceptable under load.
    let mut client = credit_svc.write().await;

    // Calculate estimated cost for this pod
    let estimated_cost = Self::calculate_pod_cost(pod);

    // Check if tenant has sufficient quota
    use creditservice_client::ResourceType;
    match client
        .check_quota(
            project_id,
            ResourceType::K8sNode,
            1,
            estimated_cost as i64,
        )
        .await
    {
        // Explicit denial from CreditService: refuse to schedule this pod.
        Ok(response) if !response.allowed => {
            let reason = if response.reason.is_empty() {
                "Quota exceeded".to_string()
            } else {
                response.reason
            };
            return Err(anyhow::anyhow!(
                "Quota check failed for tenant {}/{}: {}",
                org_id,
                project_id,
                reason
            ));
        }
        // Quota granted.
        Ok(_) => Ok(()),
        Err(e) => {
            // Log error but don't block scheduling if CreditService is unavailable
            warn!(
                "CreditService check_quota failed for tenant {}/{} (allowing scheduling): {}",
                org_id, project_id, e
            );
            Ok(())
        }
    }
}
|
|
|
|
/// Calculate estimated cost for a pod based on resource requests
|
|
/// This matches the calculation in PodServiceImpl for consistency
|
|
fn calculate_pod_cost(pod: &Pod) -> u64 {
|
|
// Base cost per pod
|
|
let mut cost: u64 = 10;
|
|
|
|
// Add cost based on resource requests if present
|
|
for container in &pod.spec.containers {
|
|
if let Some(ref resources) = container.resources {
|
|
// CPU: 1 core = 100 credits
|
|
if let Some(cpu) = resources.requests.get("cpu") {
|
|
if let Ok(cores) = cpu.parse::<f64>() {
|
|
cost += (cores * 100.0) as u64;
|
|
}
|
|
}
|
|
|
|
// Memory: 1 GB = 50 credits
|
|
if let Some(memory) = resources.requests.get("memory") {
|
|
// Parse memory (e.g., "512Mi", "1Gi")
|
|
if let Some(gb) = Self::parse_memory_to_gb(memory) {
|
|
cost += (gb * 50.0) as u64;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
cost
|
|
}
|
|
|
|
/// Parse a Kubernetes memory quantity into GiB
/// (e.g. "512Mi" -> 0.5, "2Gi" -> 2.0, "1Ti" -> 1024.0).
///
/// Supports binary suffixes ("Ki", "Mi", "Gi", "Ti"), decimal SI suffixes
/// ("K", "M", "G", "T" — powers of 1000 bytes, which the previous version
/// rejected), and plain byte counts with no suffix. Returns `None` when
/// the numeric portion fails to parse.
fn parse_memory_to_gb(memory: &str) -> Option<f64> {
    const GIB: f64 = 1024.0 * 1024.0 * 1024.0;
    // (suffix, bytes per unit). No entry's suffix is a suffix of another
    // entry's suffix, so scan order between the two families is safe.
    const UNITS: [(&str, f64); 8] = [
        ("Ti", 1024.0 * 1024.0 * 1024.0 * 1024.0),
        ("Gi", 1024.0 * 1024.0 * 1024.0),
        ("Mi", 1024.0 * 1024.0),
        ("Ki", 1024.0),
        ("T", 1e12),
        ("G", 1e9),
        ("M", 1e6),
        ("K", 1e3),
    ];

    for (suffix, bytes_per_unit) in UNITS {
        if let Some(number) = memory.strip_suffix(suffix) {
            return number.parse::<f64>().ok().map(|n| n * bytes_per_unit / GIB);
        }
    }

    // No recognized suffix: assume raw bytes.
    memory.parse::<f64>().ok().map(|bytes| bytes / GIB)
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use k8shost_types::{NodeCondition, NodeStatus, ObjectMeta};

    /// Build a scheduler backed by in-memory storage for unit tests.
    async fn test_scheduler() -> Scheduler {
        let storage = Arc::new(
            Storage::new("memory://test".to_string())
                .await
                .expect("Failed to create storage"),
        );
        Scheduler::new(storage)
    }

    /// Build a minimal Node with the given name and optional status.
    fn make_node(name: &str, status: Option<NodeStatus>) -> Node {
        Node {
            metadata: ObjectMeta {
                name: name.to_string(),
                namespace: None,
                uid: None,
                resource_version: None,
                creation_timestamp: None,
                labels: HashMap::new(),
                annotations: HashMap::new(),
                org_id: None,
                project_id: None,
            },
            spec: k8shost_types::NodeSpec { pod_cidr: None },
            status,
        }
    }

    #[tokio::test]
    async fn test_is_node_ready() {
        let scheduler = test_scheduler().await;

        // Node with Ready=True condition is schedulable.
        let ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![NodeCondition {
                    r#type: "Ready".to_string(),
                    status: "True".to_string(),
                    reason: None,
                    message: None,
                }],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(scheduler.is_node_ready(&ready_node));

        // Node with a status but no Ready condition is not schedulable.
        let not_ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(!scheduler.is_node_ready(&not_ready_node));
    }

    #[tokio::test]
    async fn test_select_node_spread() {
        let scheduler = test_scheduler().await;

        let nodes = vec![make_node("node1", None), make_node("node2", None)];

        // node1 has 2 pods, node2 has 1 pod -> node2 (fewer pods) wins.
        let mut pod_counts = HashMap::new();
        pod_counts.insert("node1".to_string(), 2);
        pod_counts.insert("node2".to_string(), 1);

        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert_eq!(selected.unwrap().metadata.name, "node2");

        // Equal distribution: Iterator::min_by_key returns the first of
        // several equal minima, so the first node in the slice is selected.
        pod_counts.insert("node1".to_string(), 1);
        pod_counts.insert("node2".to_string(), 1);

        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert_eq!(selected.unwrap().metadata.name, "node1");
    }
}
|