photoncloud-monorepo/k8shost/crates/k8shost-server/src/flashdns_controller.rs

427 lines
15 KiB
Rust

//! FlashDNS Controller - Manages cluster.local DNS records for Services
//!
//! This controller watches for Services and automatically creates DNS records
//! in the format: {service}.{namespace}.svc.cluster.local → ClusterIP
use crate::auth::{authorized_request, issue_controller_token};
use crate::storage::Storage;
use anyhow::Result;
use flashdns_api::proto::record_service_client::RecordServiceClient;
use flashdns_api::proto::zone_service_client::ZoneServiceClient;
use flashdns_api::proto::{
get_zone_request, record_data, ARecord, CreateRecordRequest, CreateZoneRequest,
DeleteRecordRequest, GetZoneRequest, ListZonesRequest, RecordData,
};
use std::collections::HashMap;
use std::sync::Arc;
use std::time::Duration;
use tokio::time::sleep;
use tonic::Code;
use tracing::{debug, info, warn};
/// Name of the DNS zone the controller looks up or creates for each tenant.
const CLUSTER_DOMAIN: &str = "cluster.local";
/// TTL placed on every A record created by this controller.
const DNS_RECORD_TTL: u32 = 60; // 60 seconds for dynamic cluster services
/// Principal ID passed to `issue_controller_token` when requesting a token.
const CONTROLLER_PRINCIPAL_ID: &str = "k8shost-controller";
/// FlashDNS controller for managing cluster.local DNS records
///
/// Periodically lists Services from storage, creates an A record in FlashDNS
/// for each Service that has a cluster IP but no record-id annotation, and
/// writes the resulting record/zone IDs back onto the Service as annotations.
pub struct FlashDnsController {
/// Service store used to list, re-read and persist Services.
storage: Arc<Storage>,
/// Endpoint handed to the FlashDNS zone/record gRPC clients on connect.
flashdns_addr: String,
/// IAM endpoint handed to `issue_controller_token`.
iam_server_addr: String,
/// Pause between two reconcile cycles in `run`.
interval: Duration,
/// Cache of zone_id per tenant (org_id/project_id -> zone_id)
///
/// Keys are built as `"{org_id}/{project_id}"`; entries are only ever added.
zone_cache: Arc<tokio::sync::RwLock<HashMap<String, String>>>,
}
impl FlashDnsController {
/// Create a new FlashDNS controller
pub fn new(storage: Arc<Storage>, flashdns_addr: String, iam_server_addr: String) -> Self {
Self {
storage,
flashdns_addr,
iam_server_addr,
interval: Duration::from_secs(10), // Check every 10 seconds
zone_cache: Arc::new(tokio::sync::RwLock::new(HashMap::new())),
}
}
/// Drive the controller forever: reconcile, sleep for the configured
/// interval, repeat.
///
/// A failed cycle is logged and does not stop the loop; this future never
/// completes on its own.
pub async fn run(self: Arc<Self>) {
    let period = self.interval;
    info!(
        "FlashDNS controller started (FlashDNS at {}, {}s interval)",
        self.flashdns_addr,
        period.as_secs()
    );

    loop {
        match self.reconcile_dns_records().await {
            Ok(()) => {}
            Err(e) => warn!("FlashDNS controller cycle failed: {}", e),
        }
        sleep(period).await;
    }
}
/// Run one reconcile pass over every known tenant.
///
/// A failure for one tenant is logged and does not prevent the remaining
/// tenants from being processed, so this currently always returns `Ok(())`.
async fn reconcile_dns_records(&self) -> Result<()> {
    // For MVP the tenant list is static: (org_id, project_id) pairs.
    const TENANTS: [(&str, &str); 1] = [("default-org", "default-project")];

    for (org_id, project_id) in TENANTS {
        let outcome = self.reconcile_tenant_dns(org_id, project_id).await;
        if let Err(e) = outcome {
            warn!(
                "Failed to reconcile DNS for tenant {}/{}: {}",
                org_id, project_id, e
            );
        }
    }

    Ok(())
}
/// Reconcile DNS records for a specific tenant
///
/// Steps:
/// 1. Obtain a controller token for the tenant.
/// 2. Make sure the `cluster.local` zone exists (see `ensure_zone_exists`).
/// 3. List the tenant's Services and pick those that have a cluster IP but
///    no `flashdns.plasmacloud.io/record-id` annotation yet.
/// 4. Create an A record `{service}.{namespace}.svc` for each of them and
///    persist the returned record ID and the zone ID as annotations.
///
/// # Errors
///
/// Only token issuance and the Service listing propagate errors. Failing to
/// ensure the zone, to connect to FlashDNS, to create an individual record or
/// to persist an individual Service is logged and swallowed so that the next
/// cycle can retry.
async fn reconcile_tenant_dns(&self, org_id: &str, project_id: &str) -> Result<()> {
    // Annotation keys owned by this controller.
    const RECORD_ID_ANNOTATION: &str = "flashdns.plasmacloud.io/record-id";
    const ZONE_ID_ANNOTATION: &str = "flashdns.plasmacloud.io/zone-id";

    let auth_token = issue_controller_token(
        &self.iam_server_addr,
        CONTROLLER_PRINCIPAL_ID,
        org_id,
        project_id,
    )
    .await?;

    // Ensure cluster.local zone exists for this tenant
    let zone_id = match self
        .ensure_zone_exists(org_id, project_id, &auth_token)
        .await
    {
        Ok(id) => id,
        Err(e) => {
            warn!(
                "Failed to ensure zone exists for tenant {}/{}: {}",
                org_id, project_id, e
            );
            return Ok(());
        }
    };

    // Get all services for this tenant
    let services = self.storage.list_services(org_id, project_id, None).await?;

    // A Service needs DNS when it has a cluster IP allocated and has not been
    // provisioned yet (no record-id annotation). The IP is carried alongside
    // the Service so the loop below never has to unwrap it.
    let services_needing_dns: Vec<_> = services
        .into_iter()
        .filter_map(|svc| {
            if svc.metadata.annotations.contains_key(RECORD_ID_ANNOTATION) {
                return None;
            }
            let cluster_ip = svc.spec.cluster_ip.clone()?;
            Some((svc, cluster_ip))
        })
        .collect();

    if services_needing_dns.is_empty() {
        debug!(
            "No services need DNS records for tenant {}/{}",
            org_id, project_id
        );
        return Ok(());
    }

    info!(
        "Found {} service(s) needing DNS records for tenant {}/{}",
        services_needing_dns.len(),
        org_id,
        project_id
    );

    // Connect to FlashDNS
    let mut record_client = match RecordServiceClient::connect(self.flashdns_addr.clone()).await
    {
        Ok(client) => client,
        Err(e) => {
            warn!(
                "Failed to connect to FlashDNS at {}: {}",
                self.flashdns_addr, e
            );
            return Ok(());
        }
    };

    // Create DNS records for each service
    for (mut service, cluster_ip) in services_needing_dns {
        let namespace = service
            .metadata
            .namespace
            .clone()
            .unwrap_or_else(|| "default".to_string());
        let name = service.metadata.name.clone();

        // Construct DNS name: {service}.{namespace}.svc
        // Full FQDN will be: {service}.{namespace}.svc.cluster.local
        let record_name = format!("{}.{}.svc", name, namespace);
        info!(
            "Creating DNS record {} → {} for service {}/{}",
            record_name, cluster_ip, namespace, name
        );

        // Create A record
        let create_req = CreateRecordRequest {
            zone_id: zone_id.clone(),
            name: record_name.clone(),
            record_type: "A".to_string(),
            ttl: DNS_RECORD_TTL,
            data: Some(RecordData {
                data: Some(record_data::Data::A(ARecord {
                    address: cluster_ip.clone(),
                })),
            }),
        };

        let record = match record_client
            .create_record(authorized_request(create_req, &auth_token))
            .await
        {
            Ok(response) => match response.into_inner().record {
                Some(record) => record,
                None => {
                    // Without a record ID there is nothing to annotate; the
                    // Service stays unannotated and is retried next cycle.
                    warn!(
                        "FlashDNS returned no record for {} (service {}/{})",
                        record_name, namespace, name
                    );
                    continue;
                }
            },
            Err(e) => {
                warn!(
                    "Failed to create DNS record {} for service {}/{}: {}",
                    record_name, namespace, name, e
                );
                continue;
            }
        };

        info!(
            "Created DNS record {} → {} (record_id: {})",
            record_name, cluster_ip, record.id
        );

        // Start from the latest stored version so the FiberLB controller does not
        // lose its status/annotations when both controllers reconcile together.
        let mut updated = match self
            .storage
            .get_service(org_id, project_id, &namespace, &name)
            .await
        {
            Ok(Some(mut current)) => {
                // Keep the stored status; fall back to the listed one only
                // when the stored version has none.
                if current.status.is_none() {
                    current.status = service.status.take();
                }
                current
            }
            // NOTE(review): the Service vanished between list and re-read.
            // The listed snapshot is still written (as before); consider
            // skipping the write so a deleted Service is not re-created.
            Ok(None) => service,
            Err(e) => {
                warn!(
                    "Failed to re-read service {}/{} before annotating, using listed snapshot: {}",
                    namespace, name, e
                );
                service
            }
        };

        // Add only the annotations this controller owns. Copying the whole
        // listed annotation map over the fresh one would roll back any
        // annotation that changed after the Service list was taken.
        updated
            .metadata
            .annotations
            .insert(RECORD_ID_ANNOTATION.to_string(), record.id);
        updated
            .metadata
            .annotations
            .insert(ZONE_ID_ANNOTATION.to_string(), zone_id.clone());

        // Bump the resource version; a missing or unparsable version counts as 0.
        let current_version = updated
            .metadata
            .resource_version
            .as_ref()
            .and_then(|v| v.parse::<u64>().ok())
            .unwrap_or(0);
        updated.metadata.resource_version = Some((current_version + 1).to_string());

        // Save updated service
        if let Err(e) = self.storage.put_service(&updated).await {
            warn!(
                "Failed to update service {}/{} with DNS record ID: {}",
                namespace, name, e
            );
        }
    }

    Ok(())
}
/// Return the ID of the tenant's `cluster.local` zone, creating the zone in
/// FlashDNS when it cannot be found.
///
/// Resolution order: in-memory cache, lookup in FlashDNS, zone creation. When
/// creation reports `AlreadyExists`, the lookup is retried up to five times
/// with a 200 ms pause after each miss. Every successfully resolved ID is
/// stored in the cache under `"{org_id}/{project_id}"`.
///
/// # Errors
///
/// Fails when the FlashDNS connection cannot be established, when zone
/// creation fails with any status other than `AlreadyExists`, when the
/// creation response carries no zone, or when an already-existing zone is
/// still not found after the retries.
async fn ensure_zone_exists(
    &self,
    org_id: &str,
    project_id: &str,
    auth_token: &str,
) -> Result<String> {
    let cache_key = format!("{}/{}", org_id, project_id);

    // Fast path: the read guard is released at the end of this statement.
    let cached = self.zone_cache.read().await.get(&cache_key).cloned();
    if let Some(zone_id) = cached {
        return Ok(zone_id);
    }

    // Connect to FlashDNS
    let mut zone_client = ZoneServiceClient::connect(self.flashdns_addr.clone()).await?;

    let existing = self
        .lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token)
        .await?;

    let zone_id = if let Some(found) = existing {
        info!(
            "Found existing zone {} for tenant {}/{} (zone_id: {})",
            CLUSTER_DOMAIN, org_id, project_id, found
        );
        found
    } else {
        info!(
            "Creating zone {} for tenant {}/{}",
            CLUSTER_DOMAIN, org_id, project_id
        );
        let request = CreateZoneRequest {
            name: CLUSTER_DOMAIN.to_string(),
            org_id: org_id.to_string(),
            project_id: project_id.to_string(),
            primary_ns: "ns1.plasmacloud.io".to_string(),
            admin_email: "admin@plasmacloud.io".to_string(),
        };

        match zone_client
            .create_zone(authorized_request(request, auth_token))
            .await
        {
            Ok(response) => {
                let zone = response
                    .into_inner()
                    .zone
                    .ok_or_else(|| anyhow::anyhow!("FlashDNS returned empty zone"))?;
                info!(
                    "Created zone {} for tenant {}/{} (zone_id: {})",
                    CLUSTER_DOMAIN, org_id, project_id, zone.id
                );
                zone.id
            }
            Err(status) if status.code() == Code::AlreadyExists => {
                debug!(
                    "Zone {} already exists for tenant {}/{}; retrying lookup",
                    CLUSTER_DOMAIN, org_id, project_id
                );
                // The zone exists but the first lookup missed it: poll for it.
                let mut recovered = None;
                for _ in 0..5 {
                    recovered = self
                        .lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token)
                        .await?;
                    if recovered.is_some() {
                        break;
                    }
                    sleep(Duration::from_millis(200)).await;
                }
                recovered.ok_or_else(|| {
                    anyhow::anyhow!(
                        "zone {} already exists for tenant {}/{} but could not be listed",
                        CLUSTER_DOMAIN,
                        org_id,
                        project_id
                    )
                })?
            }
            Err(status) => return Err(status.into()),
        }
    };

    // Cache zone_id
    self.zone_cache
        .write()
        .await
        .insert(cache_key, zone_id.clone());
    Ok(zone_id)
}
/// Look up the ID of the zone called `zone_name`.
///
/// First asks FlashDNS for the zone by exact name. `NotFound` yields
/// `Ok(None)`. Any other failure triggers a fallback: list zones filtered by
/// name and pick the first whose name matches once trailing dots are ignored.
/// A failing fallback is logged at debug level and also yields `Ok(None)`,
/// so this function never returns `Err` as written.
///
/// NOTE(review): the list request sends empty `org_id`/`project_id`;
/// presumably the server scopes results by the auth token — confirm.
async fn lookup_zone_id(
    &self,
    zone_client: &mut ZoneServiceClient<tonic::transport::Channel>,
    zone_name: &str,
    auth_token: &str,
) -> Result<Option<String>> {
    let by_name = GetZoneRequest {
        identifier: Some(get_zone_request::Identifier::Name(zone_name.to_string())),
    };
    let lookup_error = match zone_client
        .get_zone(authorized_request(by_name, auth_token))
        .await
    {
        Ok(response) => return Ok(response.into_inner().zone.map(|z| z.id)),
        Err(status) if status.code() == Code::NotFound => return Ok(None),
        Err(status) => status,
    };
    debug!("Exact zone lookup failed for {}: {}", zone_name, lookup_error);

    // Fallback: filtered listing.
    let listing = ListZonesRequest {
        org_id: String::new(),
        project_id: String::new(),
        name_filter: zone_name.to_string(),
        page_size: 100,
        page_token: String::new(),
    };
    let zones = match zone_client
        .list_zones(authorized_request(listing, auth_token))
        .await
    {
        Ok(response) => response.into_inner().zones,
        Err(list_error) => {
            debug!(
                "Zone list fallback failed for {}: {}",
                zone_name, list_error
            );
            return Ok(None);
        }
    };

    // Compare names without trailing dots so "cluster.local." matches too.
    let wanted = zone_name.trim_end_matches('.');
    let matched = zones
        .into_iter()
        .find(|zone| zone.name.trim_end_matches('.') == wanted);
    Ok(matched.map(|zone| zone.id))
}
/// Delete the FlashDNS record `record_id` on behalf of the given tenant.
///
/// Intended for Service deletion; nothing triggers it automatically in the
/// MVP. `_zone_id` is accepted but not used by the delete request.
///
/// # Errors
///
/// Fails when the FlashDNS connection, the token issuance or the delete call
/// itself fails.
// Kept for the upcoming deletion flow; no caller exists yet.
#[allow(dead_code)]
async fn cleanup_dns_record(
    &self,
    org_id: &str,
    project_id: &str,
    record_id: &str,
    _zone_id: &str,
) -> Result<()> {
    let endpoint = self.flashdns_addr.clone();
    let mut client = RecordServiceClient::connect(endpoint).await?;

    let token = issue_controller_token(
        &self.iam_server_addr,
        CONTROLLER_PRINCIPAL_ID,
        org_id,
        project_id,
    )
    .await?;

    let request = authorized_request(
        DeleteRecordRequest {
            id: record_id.to_string(),
        },
        &token,
    );
    client.delete_record(request).await?;

    info!("Deleted DNS record {} from FlashDNS", record_id);
    Ok(())
}
}