photoncloud-monorepo/k8shost/crates/k8shost-server/src/scheduler.rs
centra 3eeb303dcb feat: Batch commit for T039.S3 deployment
Includes all pending changes needed for nixos-anywhere:
- fiberlb: L7 policy, rule, certificate types
- deployer: New service for cluster management
- nix-nos: Generic network modules
- Various service updates and fixes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-13 04:34:51 +09:00

517 lines
17 KiB
Rust

//! Kubernetes scheduler implementation
//!
//! Assigns pending pods to available nodes based on resource availability and scheduling policies.
//! Implements tenant-aware scheduling with quota enforcement via CreditService.
use crate::storage::Storage;
use creditservice_client::Client as CreditServiceClient;
use k8shost_types::{Node, Pod};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use std::time::Duration;
use tokio::sync::RwLock;
use tokio::time::sleep;
use tracing::{debug, info, warn};
/// Scheduler assigns pods to nodes with tenant-aware quota enforcement
pub struct Scheduler {
/// Shared cluster state store (pods and nodes) backing all scheduling queries.
storage: Arc<Storage>,
/// Delay between scheduling cycles (a `Duration`, constructed from seconds).
interval: Duration,
/// CreditService client for quota enforcement; `None` disables quota checks entirely.
credit_service: Option<Arc<RwLock<CreditServiceClient>>>,
}
impl Scheduler {
/// Create a new scheduler without quota enforcement
///
/// Uses a fixed 5-second polling interval and leaves the CreditService
/// client unset, so no quota checks are performed.
pub fn new(storage: Arc<Storage>) -> Self {
    // Check for pending pods every 5 seconds.
    let interval = Duration::from_secs(5);
    Self {
        storage,
        interval,
        credit_service: None,
    }
}
/// Create a new scheduler with CreditService quota enforcement
///
/// Reads `CREDITSERVICE_ENDPOINT`; when it is set, attempts to connect.
/// Both an unset variable and a connection failure fall back to running
/// with quota enforcement disabled rather than aborting startup.
pub async fn new_with_credit_service(storage: Arc<Storage>) -> Self {
    let credit_service = if let Ok(endpoint) = std::env::var("CREDITSERVICE_ENDPOINT") {
        match CreditServiceClient::connect(&endpoint).await {
            Ok(client) => {
                info!("Scheduler: CreditService quota enforcement enabled: {}", endpoint);
                Some(Arc::new(RwLock::new(client)))
            }
            Err(e) => {
                warn!(
                    "Scheduler: Failed to connect to CreditService (quota enforcement disabled): {}",
                    e
                );
                None
            }
        }
    } else {
        info!("Scheduler: CREDITSERVICE_ENDPOINT not set, quota enforcement disabled");
        None
    };
    Self {
        storage,
        interval: Duration::from_secs(5),
        credit_service,
    }
}
/// Start the scheduler loop
///
/// Runs forever: each cycle attempts to schedule all pending pods, logs
/// any cycle-level failure as a warning, then sleeps for the configured
/// interval before trying again.
pub async fn run(self: Arc<Self>) {
    info!("Scheduler started (spread algorithm, {}s interval)", self.interval.as_secs());
    loop {
        match self.schedule_pending_pods().await {
            Ok(()) => {}
            Err(e) => warn!("Scheduler cycle failed: {}", e),
        }
        sleep(self.interval).await;
    }
}
/// Schedule all pending pods across all tenants
///
/// Discovers active tenants, then schedules each tenant independently so
/// that one tenant's failure cannot block the others.
async fn schedule_pending_pods(&self) -> anyhow::Result<()> {
    // Get list of active tenants from storage (query pods for unique org_id/project_id)
    let tenants = self.get_active_tenants().await?;
    if tenants.is_empty() {
        debug!("No active tenants found");
        return Ok(());
    }
    info!("Scheduling for {} active tenant(s)", tenants.len());
    for (org_id, project_id) in &tenants {
        // A per-tenant failure is logged and skipped, not propagated.
        if let Err(e) = self.schedule_tenant_pods(org_id, project_id).await {
            warn!(
                "Failed to schedule pods for tenant {}/{}: {}",
                org_id, project_id, e
            );
        }
    }
    Ok(())
}
/// Get list of active tenants from storage (unique org_id/project_id pairs)
async fn get_active_tenants(&self) -> anyhow::Result<Vec<(String, String)>> {
// Query all pods to find unique (org_id, project_id) combinations
// This is a pragmatic approach that doesn't require IAM changes
let all_pods = self
.storage
.list_all_pods()
.await
.unwrap_or_else(|e| {
warn!("Failed to query all pods for tenant discovery: {}", e);
vec![]
});
let mut tenants: HashSet<(String, String)> = HashSet::new();
for pod in all_pods {
if let (Some(org_id), Some(project_id)) =
(pod.metadata.org_id.clone(), pod.metadata.project_id.clone())
{
tenants.insert((org_id, project_id));
}
}
// Fall back to default tenant if no pods found
if tenants.is_empty() {
tenants.insert(("default-org".to_string(), "default-project".to_string()));
}
Ok(tenants.into_iter().collect())
}
/// Schedule pending pods for a specific tenant
///
/// Lists the tenant's pods, filters to pending ones, filters nodes to
/// those reporting Ready, then assigns each pending pod using the spread
/// (least-pods) algorithm with a quota pre-check per pod.
async fn schedule_tenant_pods(&self, org_id: &str, project_id: &str) -> anyhow::Result<()> {
    // Get all pods in all namespaces for this tenant
    let all_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
    // Filter to pending pods that need scheduling. A pod is pending if:
    // 1. node_name is None (not yet assigned)
    // 2. status.phase is "Pending"
    let pending_pods: Vec<Pod> = all_pods
        .into_iter()
        .filter(|pod| {
            pod.spec.node_name.is_none()
                && pod
                    .status
                    .as_ref()
                    .and_then(|s| s.phase.as_ref())
                    .map(|p| p == "Pending")
                    .unwrap_or(false)
        })
        .collect();
    if pending_pods.is_empty() {
        debug!("No pending pods for tenant {}/{}", org_id, project_id);
        return Ok(());
    }
    info!("Scheduling {} pending pod(s) for tenant {}/{}",
        pending_pods.len(), org_id, project_id);
    // Get all nodes for this tenant
    let nodes = self.storage.list_nodes(org_id, project_id).await?;
    if nodes.is_empty() {
        warn!("No nodes available for tenant {}/{}. {} pod(s) remain pending.",
            org_id, project_id, pending_pods.len());
        return Ok(());
    }
    // Filter to ready nodes
    let ready_nodes: Vec<Node> = nodes
        .into_iter()
        .filter(|node| self.is_node_ready(node))
        .collect();
    if ready_nodes.is_empty() {
        warn!("No ready nodes available for tenant {}/{}. {} pod(s) remain pending.",
            org_id, project_id, pending_pods.len());
        return Ok(());
    }
    info!("Found {} ready node(s) for scheduling", ready_nodes.len());
    // Get current pod count per node for spread algorithm.
    // BUG FIX: this map must be kept up to date as pods are assigned below;
    // previously it was computed once and never updated, so every pending
    // pod in a single cycle was piled onto the same least-loaded node.
    let mut pod_counts = self.count_pods_per_node(org_id, project_id, &ready_nodes).await?;
    // Schedule each pending pod
    for pod in pending_pods {
        // Check quota before scheduling (if CreditService enabled)
        if let Err(e) = self.check_quota_for_pod(&pod, org_id, project_id).await {
            warn!(
                "Skipping pod {}/{} due to quota: {}",
                pod.metadata.namespace.as_deref().unwrap_or("default"),
                pod.metadata.name,
                e
            );
            continue;
        }
        match self.select_node_spread(&ready_nodes, &pod_counts) {
            Some(selected_node) => {
                // Clone the name so we can update pod_counts after the
                // immutable borrow of ready_nodes/selected_node is done with.
                let node_name = selected_node.metadata.name.clone();
                info!(
                    "Scheduling pod {}/{} to node {}",
                    pod.metadata.namespace.as_deref().unwrap_or("default"),
                    pod.metadata.name,
                    node_name
                );
                if let Err(e) = self.assign_pod_to_node(pod, &node_name).await {
                    warn!("Failed to assign pod to node: {}", e);
                } else {
                    // Reflect the new assignment so the spread algorithm
                    // distributes the remaining pods of this cycle.
                    *pod_counts.entry(node_name).or_insert(0) += 1;
                }
            }
            None => {
                warn!(
                    "No suitable node found for pod {}/{}",
                    pod.metadata.namespace.as_deref().unwrap_or("default"),
                    pod.metadata.name
                );
            }
        }
    }
    Ok(())
}
/// Check if a node is ready for scheduling
///
/// Returns true only when the node has a status containing a condition
/// with `type == "Ready"` and `status == "True"`; a node without any
/// status is treated as not ready.
fn is_node_ready(&self, node: &Node) -> bool {
    let Some(status) = node.status.as_ref() else {
        return false;
    };
    status
        .conditions
        .iter()
        .any(|cond| cond.r#type == "Ready" && cond.status == "True")
}
/// Count pods currently assigned to each node
///
/// Seeds every given node with a count of 0 so nodes without pods still
/// appear in the map, then tallies all assigned pods of the tenant.
/// Pods bound to nodes outside `nodes` still get entries; callers only
/// look up the nodes they passed in.
async fn count_pods_per_node(
    &self,
    org_id: &str,
    project_id: &str,
    nodes: &[Node],
) -> anyhow::Result<HashMap<String, usize>> {
    let mut counts: HashMap<String, usize> = HashMap::with_capacity(nodes.len());
    for node in nodes {
        counts.insert(node.metadata.name.clone(), 0);
    }
    // Tally every pod that already has a node assignment.
    let assigned_pods = self.storage.list_pods(org_id, project_id, None, None).await?;
    for pod in assigned_pods {
        if let Some(node_name) = pod.spec.node_name.as_ref() {
            *counts.entry(node_name.clone()).or_insert(0) += 1;
        }
    }
    Ok(counts)
}
/// Select node using spread algorithm (least pods)
///
/// Picks the node with the fewest assigned pods; a node missing from
/// `pod_counts` counts as 0. Ties resolve to the earliest node in the
/// slice (`min_by_key` keeps the first minimum).
fn select_node_spread<'a>(
    &self,
    ready_nodes: &'a [Node],
    pod_counts: &HashMap<String, usize>,
) -> Option<&'a Node> {
    let load_of = |node: &Node| pod_counts.get(&node.metadata.name).copied().unwrap_or(0);
    ready_nodes.iter().min_by_key(|node| load_of(node))
}
/// Assign a pod to a specific node
///
/// Sets `spec.node_name`, bumps the resource version (a missing or
/// unparseable version counts as 0), and persists the pod.
async fn assign_pod_to_node(&self, mut pod: Pod, node_name: &str) -> anyhow::Result<()> {
    // Record the node assignment.
    pod.spec.node_name = Some(node_name.to_string());
    // Increment the resource version for optimistic-concurrency tracking.
    let next_version = pod
        .metadata
        .resource_version
        .as_deref()
        .and_then(|v| v.parse::<u64>().ok())
        .unwrap_or(0)
        + 1;
    pod.metadata.resource_version = Some(next_version.to_string());
    // Persist the updated pod.
    self.storage.put_pod(&pod).await?;
    info!(
        "Assigned pod {}/{} to node {}",
        pod.metadata.namespace.as_deref().unwrap_or("default"),
        pod.metadata.name,
        node_name
    );
    Ok(())
}
/// Check if tenant has quota for scheduling this pod
///
/// Returns Ok(()) when enforcement is disabled, when the quota check
/// allows the pod, or when CreditService itself errors (fail-open so an
/// outage does not block scheduling). Returns Err only on an explicit
/// quota denial.
async fn check_quota_for_pod(
    &self,
    pod: &Pod,
    org_id: &str,
    project_id: &str,
) -> anyhow::Result<()> {
    use creditservice_client::ResourceType;
    // If CreditService is not enabled, skip quota check.
    let Some(ref credit_svc) = self.credit_service else {
        return Ok(());
    };
    // Estimate this pod's cost before touching the client.
    let estimated_cost = Self::calculate_pod_cost(pod);
    let mut client = credit_svc.write().await;
    // NOTE(review): ResourceType::K8sNode is used for a pod-level check —
    // confirm this is the intended resource type for pod quota.
    let quota_result = client
        .check_quota(project_id, ResourceType::K8sNode, 1, estimated_cost as i64)
        .await;
    match quota_result {
        Ok(response) => {
            if response.allowed {
                return Ok(());
            }
            let reason = if response.reason.is_empty() {
                "Quota exceeded".to_string()
            } else {
                response.reason
            };
            Err(anyhow::anyhow!(
                "Quota check failed for tenant {}/{}: {}",
                org_id,
                project_id,
                reason
            ))
        }
        Err(e) => {
            // Log error but don't block scheduling if CreditService is unavailable.
            warn!(
                "CreditService check_quota failed for tenant {}/{} (allowing scheduling): {}",
                org_id, project_id, e
            );
            Ok(())
        }
    }
}
/// Calculate estimated cost for a pod based on resource requests
/// This matches the calculation in PodServiceImpl for consistency
///
/// Cost model: 10 base credits per pod, plus 100 credits per requested
/// CPU core and 50 credits per requested GB of memory.
fn calculate_pod_cost(pod: &Pod) -> u64 {
    // Base cost per pod
    let mut cost: u64 = 10;
    // Add cost based on resource requests if present
    for container in &pod.spec.containers {
        if let Some(ref resources) = container.resources {
            // CPU: 1 core = 100 credits.
            // BUG FIX: accept Kubernetes millicore notation ("500m") as
            // well as plain core counts ("0.5", "2"); the previous bare
            // f64 parse silently dropped millicore requests, undercounting.
            // NOTE(review): if PodServiceImpl still ignores millicores,
            // update it to match this calculation.
            if let Some(cpu) = resources.requests.get("cpu") {
                if let Some(cores) = Self::parse_cpu_to_cores(cpu) {
                    cost += (cores * 100.0) as u64;
                }
            }
            // Memory: 1 GB = 50 credits
            if let Some(memory) = resources.requests.get("memory") {
                // Parse memory (e.g., "512Mi", "1Gi")
                if let Some(gb) = Self::parse_memory_to_gb(memory) {
                    cost += (gb * 50.0) as u64;
                }
            }
        }
    }
    cost
}
/// Parse a Kubernetes CPU quantity into fractional cores
/// (e.g. "500m" -> 0.5, "2" -> 2.0). Returns None for malformed input.
fn parse_cpu_to_cores(cpu: &str) -> Option<f64> {
    if let Some(millis) = cpu.strip_suffix('m') {
        // Millicores: 1000m == 1 core.
        millis.parse::<f64>().ok().map(|m| m / 1000.0)
    } else {
        cpu.parse::<f64>().ok()
    }
}
/// Parse memory string to GB (e.g., "512Mi" -> 0.5, "2Gi" -> 2.0)
///
/// Supports binary suffixes (Gi, Mi, Ki), decimal SI suffixes
/// (G, M, K/k — previously rejected, returning None even though they are
/// valid Kubernetes quantities), and plain byte counts. All inputs are
/// normalized through bytes and divided by 2^30, matching the original
/// Gi/Mi/Ki/bytes arithmetic exactly. Returns None for malformed input.
fn parse_memory_to_gb(memory: &str) -> Option<f64> {
    const GIB: f64 = 1024.0 * 1024.0 * 1024.0;
    // (suffix, bytes per unit). Binary suffixes are listed first so that
    // e.g. "Gi" is matched before a bare "G" could be considered.
    const UNITS: [(&str, f64); 7] = [
        ("Gi", 1024.0 * 1024.0 * 1024.0),
        ("Mi", 1024.0 * 1024.0),
        ("Ki", 1024.0),
        ("G", 1e9),
        ("M", 1e6),
        ("K", 1e3),
        ("k", 1e3),
    ];
    for &(suffix, bytes_per_unit) in UNITS.iter() {
        if let Some(number) = memory.strip_suffix(suffix) {
            return number.parse::<f64>().ok().map(|v| v * bytes_per_unit / GIB);
        }
    }
    // No recognized suffix: assume a plain byte count.
    memory.parse::<f64>().ok().map(|bytes| bytes / GIB)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use k8shost_types::{NodeCondition, NodeStatus, ObjectMeta, PodSpec, PodStatus};

    /// Build a node with the given name and optional status.
    fn make_node(name: &str, status: Option<NodeStatus>) -> Node {
        Node {
            metadata: ObjectMeta {
                name: name.to_string(),
                namespace: None,
                uid: None,
                resource_version: None,
                creation_timestamp: None,
                labels: HashMap::new(),
                annotations: HashMap::new(),
                org_id: None,
                project_id: None,
            },
            spec: k8shost_types::NodeSpec { pod_cidr: None },
            status,
        }
    }

    /// Build a scheduler backed by in-memory storage.
    async fn make_scheduler() -> Scheduler {
        let storage = Arc::new(
            Storage::new("memory://test".to_string())
                .await
                .expect("Failed to create storage"),
        );
        Scheduler::new(storage)
    }

    #[tokio::test]
    async fn test_is_node_ready() {
        let scheduler = make_scheduler().await;
        // A node with an explicit Ready=True condition is schedulable.
        let ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![NodeCondition {
                    r#type: "Ready".to_string(),
                    status: "True".to_string(),
                    reason: None,
                    message: None,
                }],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(scheduler.is_node_ready(&ready_node));
        // A status without any Ready condition is not ready.
        let not_ready_node = make_node(
            "node1",
            Some(NodeStatus {
                addresses: vec![],
                conditions: vec![],
                capacity: HashMap::new(),
                allocatable: HashMap::new(),
            }),
        );
        assert!(!scheduler.is_node_ready(&not_ready_node));
    }

    #[tokio::test]
    async fn test_select_node_spread() {
        let scheduler = make_scheduler().await;
        let nodes = vec![make_node("node1", None), make_node("node2", None)];
        // node1 carries 2 pods, node2 only 1 -> node2 should be selected.
        let mut pod_counts = HashMap::new();
        pod_counts.insert("node1".to_string(), 2);
        pod_counts.insert("node2".to_string(), 1);
        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert_eq!(selected.unwrap().metadata.name, "node2");
        // With an even distribution some node must still be chosen.
        pod_counts.insert("node1".to_string(), 1);
        pod_counts.insert("node2".to_string(), 1);
        let selected = scheduler.select_node_spread(&nodes, &pod_counts);
        assert!(selected.is_some());
    }
}