//! FlashDNS Controller - Manages cluster.local DNS records for Services //! //! This controller watches for Services and automatically creates DNS records //! in the format: {service}.{namespace}.svc.cluster.local → ClusterIP use crate::auth::{authorized_request, issue_controller_token}; use crate::storage::Storage; use anyhow::Result; use flashdns_api::proto::record_service_client::RecordServiceClient; use flashdns_api::proto::zone_service_client::ZoneServiceClient; use flashdns_api::proto::{ get_zone_request, record_data, ARecord, CreateRecordRequest, CreateZoneRequest, DeleteRecordRequest, GetZoneRequest, ListZonesRequest, RecordData, }; use std::collections::HashMap; use std::sync::Arc; use std::time::Duration; use tokio::time::sleep; use tonic::Code; use tracing::{debug, info, warn}; const CLUSTER_DOMAIN: &str = "cluster.local"; const DNS_RECORD_TTL: u32 = 60; // 60 seconds for dynamic cluster services const CONTROLLER_PRINCIPAL_ID: &str = "k8shost-controller"; /// FlashDNS controller for managing cluster.local DNS records pub struct FlashDnsController { storage: Arc, flashdns_addr: String, iam_server_addr: String, interval: Duration, /// Cache of zone_id per tenant (org_id/project_id -> zone_id) zone_cache: Arc>>, } impl FlashDnsController { /// Create a new FlashDNS controller pub fn new(storage: Arc, flashdns_addr: String, iam_server_addr: String) -> Self { Self { storage, flashdns_addr, iam_server_addr, interval: Duration::from_secs(10), // Check every 10 seconds zone_cache: Arc::new(tokio::sync::RwLock::new(HashMap::new())), } } /// Start the controller loop pub async fn run(self: Arc) { info!( "FlashDNS controller started (FlashDNS at {}, {}s interval)", self.flashdns_addr, self.interval.as_secs() ); loop { if let Err(e) = self.reconcile_dns_records().await { warn!("FlashDNS controller cycle failed: {}", e); } sleep(self.interval).await; } } /// Reconcile DNS records across all tenants async fn reconcile_dns_records(&self) -> Result<()> { // For MVP, iterate through known tenants let tenants = vec![("default-org".to_string(), "default-project".to_string())]; for (org_id, project_id) in tenants { if let Err(e) = self.reconcile_tenant_dns(&org_id, &project_id).await { warn!( "Failed to reconcile DNS for tenant {}/{}: {}", org_id, project_id, e ); } } Ok(()) } /// Reconcile DNS records for a specific tenant async fn reconcile_tenant_dns(&self, org_id: &str, project_id: &str) -> Result<()> { let auth_token = issue_controller_token( &self.iam_server_addr, CONTROLLER_PRINCIPAL_ID, org_id, project_id, ) .await?; // Ensure cluster.local zone exists for this tenant let zone_id = match self .ensure_zone_exists(org_id, project_id, &auth_token) .await { Ok(id) => id, Err(e) => { warn!( "Failed to ensure zone exists for tenant {}/{}: {}", org_id, project_id, e ); return Ok(()); } }; // Get all services for this tenant let services = self.storage.list_services(org_id, project_id, None).await?; // Filter for services that need DNS records let services_needing_dns: Vec<_> = services .into_iter() .filter(|svc| { // Service needs DNS if: // 1. Has cluster_ip allocated // 2. Does NOT have flashdns.plasmacloud.io/record-id annotation (not yet provisioned) svc.spec.cluster_ip.is_some() && !svc .metadata .annotations .contains_key("flashdns.plasmacloud.io/record-id") }) .collect(); if services_needing_dns.is_empty() { debug!( "No services need DNS records for tenant {}/{}", org_id, project_id ); return Ok(()); } info!( "Found {} service(s) needing DNS records for tenant {}/{}", services_needing_dns.len(), org_id, project_id ); // Connect to FlashDNS let mut record_client = match RecordServiceClient::connect(self.flashdns_addr.clone()).await { Ok(client) => client, Err(e) => { warn!( "Failed to connect to FlashDNS at {}: {}", self.flashdns_addr, e ); return Ok(()); } }; // Create DNS records for each service for mut service in services_needing_dns { let namespace = service .metadata .namespace .clone() .unwrap_or_else(|| "default".to_string()); let name = service.metadata.name.clone(); let cluster_ip = service.spec.cluster_ip.as_ref().unwrap(); // Construct DNS name: {service}.{namespace}.svc // Full FQDN will be: {service}.{namespace}.svc.cluster.local let record_name = format!("{}.{}.svc", name, namespace); info!( "Creating DNS record {} → {} for service {}/{}", record_name, cluster_ip, namespace, name ); // Create A record let create_req = CreateRecordRequest { zone_id: zone_id.clone(), name: record_name.clone(), record_type: "A".to_string(), ttl: DNS_RECORD_TTL, data: Some(RecordData { data: Some(record_data::Data::A(ARecord { address: cluster_ip.clone(), })), }), }; match record_client .create_record(authorized_request(create_req, &auth_token)) .await { Ok(response) => { let record = response.into_inner().record; if let Some(record) = record { info!( "Created DNS record {} → {} (record_id: {})", record_name, cluster_ip, record.id ); // Store record_id in service annotations service .metadata .annotations .insert("flashdns.plasmacloud.io/record-id".to_string(), record.id); service.metadata.annotations.insert( "flashdns.plasmacloud.io/zone-id".to_string(), zone_id.clone(), ); // Merge with the latest stored version so the FiberLB controller does not // lose its status/annotations when both controllers reconcile together. if let Ok(Some(mut current)) = self .storage .get_service(org_id, project_id, &namespace, &name) .await { current.status = current.status.or(service.status.clone()); current .metadata .annotations .extend(service.metadata.annotations.clone()); service = current; } let current_version = service .metadata .resource_version .as_ref() .and_then(|v| v.parse::().ok()) .unwrap_or(0); service.metadata.resource_version = Some((current_version + 1).to_string()); // Save updated service if let Err(e) = self.storage.put_service(&service).await { warn!( "Failed to update service {}/{} with DNS record ID: {}", namespace, name, e ); } } } Err(e) => { warn!( "Failed to create DNS record {} for service {}/{}: {}", record_name, namespace, name, e ); } } } Ok(()) } /// Ensure cluster.local zone exists for tenant, return zone_id async fn ensure_zone_exists( &self, org_id: &str, project_id: &str, auth_token: &str, ) -> Result { let cache_key = format!("{}/{}", org_id, project_id); // Check cache first { let cache = self.zone_cache.read().await; if let Some(zone_id) = cache.get(&cache_key) { return Ok(zone_id.clone()); } } // Connect to FlashDNS let mut zone_client = ZoneServiceClient::connect(self.flashdns_addr.clone()).await?; if let Some(zone_id) = self .lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token) .await? { info!( "Found existing zone {} for tenant {}/{} (zone_id: {})", CLUSTER_DOMAIN, org_id, project_id, zone_id ); let mut cache = self.zone_cache.write().await; cache.insert(cache_key.clone(), zone_id.clone()); return Ok(zone_id); } // Create zone info!( "Creating zone {} for tenant {}/{}", CLUSTER_DOMAIN, org_id, project_id ); let create_req = CreateZoneRequest { name: CLUSTER_DOMAIN.to_string(), org_id: org_id.to_string(), project_id: project_id.to_string(), primary_ns: "ns1.plasmacloud.io".to_string(), admin_email: "admin@plasmacloud.io".to_string(), }; let response = match zone_client .create_zone(authorized_request(create_req, auth_token)) .await { Ok(response) => response, Err(status) if status.code() == Code::AlreadyExists => { debug!( "Zone {} already exists for tenant {}/{}; retrying lookup", CLUSTER_DOMAIN, org_id, project_id ); for _ in 0..5 { if let Some(zone_id) = self .lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token) .await? { let mut cache = self.zone_cache.write().await; cache.insert(cache_key.clone(), zone_id.clone()); return Ok(zone_id); } sleep(Duration::from_millis(200)).await; } return Err(anyhow::anyhow!( "zone {} already exists for tenant {}/{} but could not be listed", CLUSTER_DOMAIN, org_id, project_id )); } Err(status) => return Err(status.into()), }; let zone = response .into_inner() .zone .ok_or_else(|| anyhow::anyhow!("FlashDNS returned empty zone"))?; info!( "Created zone {} for tenant {}/{} (zone_id: {})", CLUSTER_DOMAIN, org_id, project_id, zone.id ); // Cache zone_id let mut cache = self.zone_cache.write().await; cache.insert(cache_key, zone.id.clone()); Ok(zone.id) } async fn lookup_zone_id( &self, zone_client: &mut ZoneServiceClient, zone_name: &str, auth_token: &str, ) -> Result> { let get_req = GetZoneRequest { identifier: Some(get_zone_request::Identifier::Name(zone_name.to_string())), }; match zone_client .get_zone(authorized_request(get_req, auth_token)) .await { Ok(response) => Ok(response.into_inner().zone.map(|z| z.id)), Err(e) if e.code() == Code::NotFound => Ok(None), Err(e) => { debug!("Exact zone lookup failed for {}: {}", zone_name, e); let list_req = ListZonesRequest { org_id: String::new(), project_id: String::new(), name_filter: zone_name.to_string(), page_size: 100, page_token: String::new(), }; match zone_client .list_zones(authorized_request(list_req, auth_token)) .await { Ok(response) => Ok(response .into_inner() .zones .into_iter() .find(|z| z.name.trim_end_matches('.') == zone_name.trim_end_matches('.')) .map(|z| z.id)), Err(list_error) => { debug!( "Zone list fallback failed for {}: {}", zone_name, list_error ); Ok(None) } } } } } /// Cleanup DNS record when Service is deleted (not automatically triggered in MVP) #[allow(dead_code)] async fn cleanup_dns_record( &self, org_id: &str, project_id: &str, record_id: &str, _zone_id: &str, ) -> Result<()> { let mut record_client = RecordServiceClient::connect(self.flashdns_addr.clone()).await?; let auth_token = issue_controller_token( &self.iam_server_addr, CONTROLLER_PRINCIPAL_ID, org_id, project_id, ) .await?; let delete_req = DeleteRecordRequest { id: record_id.to_string(), }; record_client .delete_record(authorized_request(delete_req, &auth_token)) .await?; info!("Deleted DNS record {} from FlashDNS", record_id); Ok(()) } }