427 lines
15 KiB
Rust
427 lines
15 KiB
Rust
//! FlashDNS Controller - Manages cluster.local DNS records for Services
//!
//! This controller watches for Services and automatically creates DNS records
//! in the format: {service}.{namespace}.svc.cluster.local → ClusterIP
|
|
|
|
use crate::auth::{authorized_request, issue_controller_token};
|
|
use crate::storage::Storage;
|
|
use anyhow::Result;
|
|
use flashdns_api::proto::record_service_client::RecordServiceClient;
|
|
use flashdns_api::proto::zone_service_client::ZoneServiceClient;
|
|
use flashdns_api::proto::{
|
|
get_zone_request, record_data, ARecord, CreateRecordRequest, CreateZoneRequest,
|
|
DeleteRecordRequest, GetZoneRequest, ListZonesRequest, RecordData,
|
|
};
|
|
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
use tokio::time::sleep;
|
|
use tonic::Code;
|
|
use tracing::{debug, info, warn};
|
|
|
|
/// Name of the DNS zone that holds every cluster-internal service record.
const CLUSTER_DOMAIN: &str = "cluster.local";
/// TTL, in seconds, set on each service A record; kept short because cluster
/// services are dynamic.
const DNS_RECORD_TTL: u32 = 60;
/// Principal ID passed to `issue_controller_token` when this controller
/// requests credentials.
const CONTROLLER_PRINCIPAL_ID: &str = "k8shost-controller";
|
|
|
|
/// FlashDNS controller for managing cluster.local DNS records
|
|
pub struct FlashDnsController {
|
|
storage: Arc<Storage>,
|
|
flashdns_addr: String,
|
|
iam_server_addr: String,
|
|
interval: Duration,
|
|
/// Cache of zone_id per tenant (org_id/project_id -> zone_id)
|
|
zone_cache: Arc<tokio::sync::RwLock<HashMap<String, String>>>,
|
|
}
|
|
|
|
impl FlashDnsController {
|
|
/// Create a new FlashDNS controller
|
|
pub fn new(storage: Arc<Storage>, flashdns_addr: String, iam_server_addr: String) -> Self {
|
|
Self {
|
|
storage,
|
|
flashdns_addr,
|
|
iam_server_addr,
|
|
interval: Duration::from_secs(10), // Check every 10 seconds
|
|
zone_cache: Arc::new(tokio::sync::RwLock::new(HashMap::new())),
|
|
}
|
|
}
|
|
|
|
/// Start the controller loop
|
|
pub async fn run(self: Arc<Self>) {
|
|
info!(
|
|
"FlashDNS controller started (FlashDNS at {}, {}s interval)",
|
|
self.flashdns_addr,
|
|
self.interval.as_secs()
|
|
);
|
|
|
|
loop {
|
|
if let Err(e) = self.reconcile_dns_records().await {
|
|
warn!("FlashDNS controller cycle failed: {}", e);
|
|
}
|
|
|
|
sleep(self.interval).await;
|
|
}
|
|
}
|
|
|
|
/// Reconcile DNS records across all tenants
|
|
async fn reconcile_dns_records(&self) -> Result<()> {
|
|
// For MVP, iterate through known tenants
|
|
let tenants = vec![("default-org".to_string(), "default-project".to_string())];
|
|
|
|
for (org_id, project_id) in tenants {
|
|
if let Err(e) = self.reconcile_tenant_dns(&org_id, &project_id).await {
|
|
warn!(
|
|
"Failed to reconcile DNS for tenant {}/{}: {}",
|
|
org_id, project_id, e
|
|
);
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Reconcile DNS records for a specific tenant
|
|
async fn reconcile_tenant_dns(&self, org_id: &str, project_id: &str) -> Result<()> {
|
|
let auth_token = issue_controller_token(
|
|
&self.iam_server_addr,
|
|
CONTROLLER_PRINCIPAL_ID,
|
|
org_id,
|
|
project_id,
|
|
)
|
|
.await?;
|
|
|
|
// Ensure cluster.local zone exists for this tenant
|
|
let zone_id = match self
|
|
.ensure_zone_exists(org_id, project_id, &auth_token)
|
|
.await
|
|
{
|
|
Ok(id) => id,
|
|
Err(e) => {
|
|
warn!(
|
|
"Failed to ensure zone exists for tenant {}/{}: {}",
|
|
org_id, project_id, e
|
|
);
|
|
return Ok(());
|
|
}
|
|
};
|
|
|
|
// Get all services for this tenant
|
|
let services = self.storage.list_services(org_id, project_id, None).await?;
|
|
|
|
// Filter for services that need DNS records
|
|
let services_needing_dns: Vec<_> = services
|
|
.into_iter()
|
|
.filter(|svc| {
|
|
// Service needs DNS if:
|
|
// 1. Has cluster_ip allocated
|
|
// 2. Does NOT have flashdns.plasmacloud.io/record-id annotation (not yet provisioned)
|
|
svc.spec.cluster_ip.is_some()
|
|
&& !svc
|
|
.metadata
|
|
.annotations
|
|
.contains_key("flashdns.plasmacloud.io/record-id")
|
|
})
|
|
.collect();
|
|
|
|
if services_needing_dns.is_empty() {
|
|
debug!(
|
|
"No services need DNS records for tenant {}/{}",
|
|
org_id, project_id
|
|
);
|
|
return Ok(());
|
|
}
|
|
|
|
info!(
|
|
"Found {} service(s) needing DNS records for tenant {}/{}",
|
|
services_needing_dns.len(),
|
|
org_id,
|
|
project_id
|
|
);
|
|
|
|
// Connect to FlashDNS
|
|
let mut record_client = match RecordServiceClient::connect(self.flashdns_addr.clone()).await
|
|
{
|
|
Ok(client) => client,
|
|
Err(e) => {
|
|
warn!(
|
|
"Failed to connect to FlashDNS at {}: {}",
|
|
self.flashdns_addr, e
|
|
);
|
|
return Ok(());
|
|
}
|
|
};
|
|
|
|
// Create DNS records for each service
|
|
for mut service in services_needing_dns {
|
|
let namespace = service
|
|
.metadata
|
|
.namespace
|
|
.clone()
|
|
.unwrap_or_else(|| "default".to_string());
|
|
let name = service.metadata.name.clone();
|
|
let cluster_ip = service.spec.cluster_ip.as_ref().unwrap();
|
|
|
|
// Construct DNS name: {service}.{namespace}.svc
|
|
// Full FQDN will be: {service}.{namespace}.svc.cluster.local
|
|
let record_name = format!("{}.{}.svc", name, namespace);
|
|
|
|
info!(
|
|
"Creating DNS record {} → {} for service {}/{}",
|
|
record_name, cluster_ip, namespace, name
|
|
);
|
|
|
|
// Create A record
|
|
let create_req = CreateRecordRequest {
|
|
zone_id: zone_id.clone(),
|
|
name: record_name.clone(),
|
|
record_type: "A".to_string(),
|
|
ttl: DNS_RECORD_TTL,
|
|
data: Some(RecordData {
|
|
data: Some(record_data::Data::A(ARecord {
|
|
address: cluster_ip.clone(),
|
|
})),
|
|
}),
|
|
};
|
|
|
|
match record_client
|
|
.create_record(authorized_request(create_req, &auth_token))
|
|
.await
|
|
{
|
|
Ok(response) => {
|
|
let record = response.into_inner().record;
|
|
if let Some(record) = record {
|
|
info!(
|
|
"Created DNS record {} → {} (record_id: {})",
|
|
record_name, cluster_ip, record.id
|
|
);
|
|
|
|
// Store record_id in service annotations
|
|
service
|
|
.metadata
|
|
.annotations
|
|
.insert("flashdns.plasmacloud.io/record-id".to_string(), record.id);
|
|
service.metadata.annotations.insert(
|
|
"flashdns.plasmacloud.io/zone-id".to_string(),
|
|
zone_id.clone(),
|
|
);
|
|
|
|
// Merge with the latest stored version so the FiberLB controller does not
|
|
// lose its status/annotations when both controllers reconcile together.
|
|
if let Ok(Some(mut current)) = self
|
|
.storage
|
|
.get_service(org_id, project_id, &namespace, &name)
|
|
.await
|
|
{
|
|
current.status = current.status.or(service.status.clone());
|
|
current
|
|
.metadata
|
|
.annotations
|
|
.extend(service.metadata.annotations.clone());
|
|
service = current;
|
|
}
|
|
|
|
let current_version = service
|
|
.metadata
|
|
.resource_version
|
|
.as_ref()
|
|
.and_then(|v| v.parse::<u64>().ok())
|
|
.unwrap_or(0);
|
|
service.metadata.resource_version = Some((current_version + 1).to_string());
|
|
|
|
// Save updated service
|
|
if let Err(e) = self.storage.put_service(&service).await {
|
|
warn!(
|
|
"Failed to update service {}/{} with DNS record ID: {}",
|
|
namespace, name, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
Err(e) => {
|
|
warn!(
|
|
"Failed to create DNS record {} for service {}/{}: {}",
|
|
record_name, namespace, name, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Ensure cluster.local zone exists for tenant, return zone_id
|
|
async fn ensure_zone_exists(
|
|
&self,
|
|
org_id: &str,
|
|
project_id: &str,
|
|
auth_token: &str,
|
|
) -> Result<String> {
|
|
let cache_key = format!("{}/{}", org_id, project_id);
|
|
|
|
// Check cache first
|
|
{
|
|
let cache = self.zone_cache.read().await;
|
|
if let Some(zone_id) = cache.get(&cache_key) {
|
|
return Ok(zone_id.clone());
|
|
}
|
|
}
|
|
|
|
// Connect to FlashDNS
|
|
let mut zone_client = ZoneServiceClient::connect(self.flashdns_addr.clone()).await?;
|
|
|
|
if let Some(zone_id) = self
|
|
.lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token)
|
|
.await?
|
|
{
|
|
info!(
|
|
"Found existing zone {} for tenant {}/{} (zone_id: {})",
|
|
CLUSTER_DOMAIN, org_id, project_id, zone_id
|
|
);
|
|
|
|
let mut cache = self.zone_cache.write().await;
|
|
cache.insert(cache_key.clone(), zone_id.clone());
|
|
|
|
return Ok(zone_id);
|
|
}
|
|
|
|
// Create zone
|
|
info!(
|
|
"Creating zone {} for tenant {}/{}",
|
|
CLUSTER_DOMAIN, org_id, project_id
|
|
);
|
|
|
|
let create_req = CreateZoneRequest {
|
|
name: CLUSTER_DOMAIN.to_string(),
|
|
org_id: org_id.to_string(),
|
|
project_id: project_id.to_string(),
|
|
primary_ns: "ns1.plasmacloud.io".to_string(),
|
|
admin_email: "admin@plasmacloud.io".to_string(),
|
|
};
|
|
|
|
let response = match zone_client
|
|
.create_zone(authorized_request(create_req, auth_token))
|
|
.await
|
|
{
|
|
Ok(response) => response,
|
|
Err(status) if status.code() == Code::AlreadyExists => {
|
|
debug!(
|
|
"Zone {} already exists for tenant {}/{}; retrying lookup",
|
|
CLUSTER_DOMAIN, org_id, project_id
|
|
);
|
|
|
|
for _ in 0..5 {
|
|
if let Some(zone_id) = self
|
|
.lookup_zone_id(&mut zone_client, CLUSTER_DOMAIN, auth_token)
|
|
.await?
|
|
{
|
|
let mut cache = self.zone_cache.write().await;
|
|
cache.insert(cache_key.clone(), zone_id.clone());
|
|
return Ok(zone_id);
|
|
}
|
|
sleep(Duration::from_millis(200)).await;
|
|
}
|
|
|
|
return Err(anyhow::anyhow!(
|
|
"zone {} already exists for tenant {}/{} but could not be listed",
|
|
CLUSTER_DOMAIN,
|
|
org_id,
|
|
project_id
|
|
));
|
|
}
|
|
Err(status) => return Err(status.into()),
|
|
};
|
|
let zone = response
|
|
.into_inner()
|
|
.zone
|
|
.ok_or_else(|| anyhow::anyhow!("FlashDNS returned empty zone"))?;
|
|
|
|
info!(
|
|
"Created zone {} for tenant {}/{} (zone_id: {})",
|
|
CLUSTER_DOMAIN, org_id, project_id, zone.id
|
|
);
|
|
|
|
// Cache zone_id
|
|
let mut cache = self.zone_cache.write().await;
|
|
cache.insert(cache_key, zone.id.clone());
|
|
|
|
Ok(zone.id)
|
|
}
|
|
|
|
async fn lookup_zone_id(
|
|
&self,
|
|
zone_client: &mut ZoneServiceClient<tonic::transport::Channel>,
|
|
zone_name: &str,
|
|
auth_token: &str,
|
|
) -> Result<Option<String>> {
|
|
let get_req = GetZoneRequest {
|
|
identifier: Some(get_zone_request::Identifier::Name(zone_name.to_string())),
|
|
};
|
|
|
|
match zone_client
|
|
.get_zone(authorized_request(get_req, auth_token))
|
|
.await
|
|
{
|
|
Ok(response) => Ok(response.into_inner().zone.map(|z| z.id)),
|
|
Err(e) if e.code() == Code::NotFound => Ok(None),
|
|
Err(e) => {
|
|
debug!("Exact zone lookup failed for {}: {}", zone_name, e);
|
|
|
|
let list_req = ListZonesRequest {
|
|
org_id: String::new(),
|
|
project_id: String::new(),
|
|
name_filter: zone_name.to_string(),
|
|
page_size: 100,
|
|
page_token: String::new(),
|
|
};
|
|
|
|
match zone_client
|
|
.list_zones(authorized_request(list_req, auth_token))
|
|
.await
|
|
{
|
|
Ok(response) => Ok(response
|
|
.into_inner()
|
|
.zones
|
|
.into_iter()
|
|
.find(|z| z.name.trim_end_matches('.') == zone_name.trim_end_matches('.'))
|
|
.map(|z| z.id)),
|
|
Err(list_error) => {
|
|
debug!(
|
|
"Zone list fallback failed for {}: {}",
|
|
zone_name, list_error
|
|
);
|
|
Ok(None)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Cleanup DNS record when Service is deleted (not automatically triggered in MVP)
|
|
#[allow(dead_code)]
|
|
async fn cleanup_dns_record(
|
|
&self,
|
|
org_id: &str,
|
|
project_id: &str,
|
|
record_id: &str,
|
|
_zone_id: &str,
|
|
) -> Result<()> {
|
|
let mut record_client = RecordServiceClient::connect(self.flashdns_addr.clone()).await?;
|
|
let auth_token = issue_controller_token(
|
|
&self.iam_server_addr,
|
|
CONTROLLER_PRINCIPAL_ID,
|
|
org_id,
|
|
project_id,
|
|
)
|
|
.await?;
|
|
|
|
let delete_req = DeleteRecordRequest {
|
|
id: record_id.to_string(),
|
|
};
|
|
|
|
record_client
|
|
.delete_record(authorized_request(delete_req, &auth_token))
|
|
.await?;
|
|
|
|
info!("Deleted DNS record {} from FlashDNS", record_id);
|
|
Ok(())
|
|
}
|
|
}
|