mod auth; mod cni; mod config; mod fiberlb_controller; mod flashdns_controller; mod ipam_client; mod rest; mod scheduler; mod services; mod storage; use anyhow::Result; use auth::AuthService; use chainfire_client::Client as ChainFireClient; use clap::Parser; use config::Config; use ipam_client::IpamClient; use metrics_exporter_prometheus::PrometheusBuilder; use k8shost_proto::{ deployment_service_server::{DeploymentService, DeploymentServiceServer}, node_service_server::NodeServiceServer, pod_service_server::PodServiceServer, service_service_server::ServiceServiceServer, *, }; use services::{node::NodeServiceImpl, pod::PodServiceImpl, service::ServiceServiceImpl}; use std::{path::PathBuf, sync::Arc}; use std::time::{SystemTime, UNIX_EPOCH}; use storage::Storage; use tonic::{transport::Server, Request, Response, Status}; use tracing::{info, warn}; use tracing_subscriber::EnvFilter; /// k8shost API Server #[derive(Parser, Debug)] #[command(name = "k8shost-server")] #[command(about = "Kubernetes API server for PlasmaCloud's k8shost component")] struct Args { /// Configuration file path #[arg(short, long)] config: Option, /// Listen address for gRPC server (e.g., "[::]:6443") #[arg(long)] addr: Option, /// Log level (e.g., "info", "debug", "trace") #[arg(long)] log_level: Option, /// FlareDB Placement Driver address (e.g., "127.0.0.1:2479") #[arg(long)] flaredb_pd_addr: Option, /// FlareDB direct address (e.g., "127.0.0.1:50052") #[arg(long)] flaredb_direct_addr: Option, /// ChainFire endpoint for cluster coordination (e.g., "http://127.0.0.1:2379") #[arg(long)] chainfire_endpoint: Option, /// IAM server address (e.g., "http://127.0.0.1:50051") #[arg(long)] iam_server_addr: Option, /// FiberLB server address (e.g., "http://127.0.0.1:50082") #[arg(long)] fiberlb_server_addr: Option, /// FlashDNS server address (e.g., "http://127.0.0.1:50053") #[arg(long)] flashdns_server_addr: Option, /// PrismNET server address (e.g., "http://127.0.0.1:50081") #[arg(long)] prismnet_server_addr: Option, /// Metrics port for Prometheus scraping #[arg(long, default_value = "9094")] metrics_port: u16, } #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); // Load configuration let mut settings = ::config::Config::builder() .add_source(::config::File::from_str( toml::to_string(&Config::default())?.as_str(), ::config::FileFormat::Toml, )) .add_source(::config::Environment::with_prefix("K8SHOST").separator("_")); // Add config file if specified if let Some(config_path) = &args.config { info!("Loading config from file: {}", config_path.display()); settings = settings.add_source(::config::File::from(config_path.as_path())); } let loaded_config: Config = settings .build()? .try_deserialize() .map_err(|e| anyhow::anyhow!("Failed to load configuration: {}", e))?; // Apply CLI overrides to the loaded configuration let config = Config { server: config::ServerConfig { addr: args .addr .map(|s| s.parse().unwrap_or(loaded_config.server.addr)) .unwrap_or(loaded_config.server.addr), http_addr: loaded_config.server.http_addr, log_level: args.log_level.unwrap_or(loaded_config.server.log_level), }, flaredb: config::FlareDbConfig { pd_addr: args.flaredb_pd_addr.or(loaded_config.flaredb.pd_addr), direct_addr: args.flaredb_direct_addr.or(loaded_config.flaredb.direct_addr), }, chainfire: config::ChainFireConfig { endpoint: args.chainfire_endpoint.or(loaded_config.chainfire.endpoint), }, iam: config::IamConfig { server_addr: args.iam_server_addr.unwrap_or(loaded_config.iam.server_addr), }, fiberlb: config::FiberLbConfig { server_addr: args.fiberlb_server_addr.unwrap_or(loaded_config.fiberlb.server_addr), }, flashdns: config::FlashDnsConfig { server_addr: args.flashdns_server_addr.unwrap_or(loaded_config.flashdns.server_addr), }, prismnet: config::PrismNetConfig { server_addr: args .prismnet_server_addr .unwrap_or(loaded_config.prismnet.server_addr), }, }; // Initialize tracing init_logging(&config.server.log_level); info!("Starting k8shost API server on {}", config.server.addr); if let Some(endpoint) = &config.chainfire.endpoint { let endpoint = endpoint.clone(); let addr = config.server.addr.to_string(); tokio::spawn(async move { if let Err(error) = register_chainfire_membership(&endpoint, "k8shost", addr).await { warn!(error = %error, "ChainFire membership registration failed"); } }); } // Initialize Prometheus metrics exporter let metrics_addr = format!("0.0.0.0:{}", args.metrics_port); let builder = PrometheusBuilder::new(); builder .with_http_listener(metrics_addr.parse::()?) .install() .expect("Failed to install Prometheus metrics exporter"); info!( "Prometheus metrics available at http://{}/metrics", metrics_addr ); // Initialize FlareDB storage. Prefer direct access when configured, but fall back to // the placement-driver path so service startup is resilient to early direct endpoint races. let storage = if let Some(addr) = &config.flaredb.direct_addr { info!("Connecting to FlareDB directly at {}", addr); match Storage::new_direct(addr.clone()).await { Ok(s) => { info!("Successfully connected to FlareDB (direct)"); Arc::new(s) } Err(e) => { warn!("Failed direct FlareDB connection: {}", e); if let Some(pd_addr) = &config.flaredb.pd_addr { info!("Falling back to FlareDB PD at {}", pd_addr); match Storage::new(pd_addr.clone()).await { Ok(s) => { info!("Successfully connected to FlareDB via PD fallback"); Arc::new(s) } Err(pd_error) => { warn!( "Failed to connect to FlareDB via PD fallback: {}. Server will not start.", pd_error ); return Err(anyhow::anyhow!( "Failed to connect to FlareDB directly ({}) and via PD fallback ({}).", e, pd_error ) .into()); } } } else { return Err( anyhow::anyhow!("Failed to connect to FlareDB (direct): {}", e).into() ); } } } } else if let Some(addr) = &config.flaredb.pd_addr { info!("Connecting to FlareDB PD at {}", addr); match Storage::new(addr.clone()).await { Ok(s) => { info!("Successfully connected to FlareDB"); Arc::new(s) } Err(e) => { warn!("Failed to connect to FlareDB: {}. Server will not start.", e); return Err(anyhow::anyhow!("Failed to connect to FlareDB: {}", e).into()); } } } else { return Err(anyhow::anyhow!("No FlareDB address configured.").into()); }; // Initialize IAM authentication service info!("Connecting to IAM server at {}", config.iam.server_addr); let auth_service = match AuthService::new(&config.iam.server_addr).await { Ok(s) => { info!("Successfully connected to IAM server"); Arc::new(s) } Err(e) => { warn!("Failed to connect to IAM server: {}. Server will not start.", e); return Err(anyhow::anyhow!("Failed to connect to IAM server: {}", e).into()); } }; // Dedicated runtime for auth interceptors to avoid blocking the main async runtime let auth_runtime = Arc::new(tokio::runtime::Runtime::new()?); let make_interceptor = |auth: Arc| { let rt = auth_runtime.clone(); move |mut req: Request<()>| -> Result, Status> { let auth = auth.clone(); tokio::task::block_in_place(|| { rt.block_on(async move { let tenant_context = auth.authenticate_request(&req).await?; req.extensions_mut().insert(tenant_context); Ok(req) }) }) } }; // Create IPAM client let ipam_client = Arc::new(IpamClient::new(config.prismnet.server_addr.clone())); // Create service implementations with storage let pod_service = Arc::new( PodServiceImpl::new_with_credit_service(storage.clone(), auth_service.clone()).await, ); let service_service = Arc::new(ServiceServiceImpl::new( storage.clone(), ipam_client, auth_service.clone(), )); let node_service = Arc::new(NodeServiceImpl::new(storage.clone(), auth_service.clone())); let deployment_service = DeploymentServiceImpl; // Still unimplemented // Start scheduler in background with CreditService integration let scheduler = Arc::new(scheduler::Scheduler::new_with_credit_service(storage.clone()).await); tokio::spawn(async move { scheduler.run().await; }); info!("Scheduler started - tenant-aware with quota enforcement"); // Start FiberLB controller in background let fiberlb_controller = Arc::new(fiberlb_controller::FiberLbController::new( storage.clone(), config.fiberlb.server_addr.clone(), config.iam.server_addr.clone(), )); tokio::spawn(async move { fiberlb_controller.run().await; }); info!("FiberLB controller started - monitoring LoadBalancer services with per-tenant IAM tokens"); // Start FlashDNS controller in background let flashdns_controller = Arc::new(flashdns_controller::FlashDnsController::new( storage.clone(), config.flashdns.server_addr.clone(), config.iam.server_addr.clone(), )); tokio::spawn(async move { flashdns_controller.run().await; }); info!("FlashDNS controller started - managing cluster.local DNS records with per-tenant IAM tokens"); info!("Starting gRPC server with authentication..."); // Build gRPC server with authentication layer let grpc_server = Server::builder() .add_service( tonic::codegen::InterceptedService::new( PodServiceServer::new(pod_service.as_ref().clone()), make_interceptor(auth_service.clone()), ), ) .add_service( tonic::codegen::InterceptedService::new( ServiceServiceServer::new(service_service.as_ref().clone()), make_interceptor(auth_service.clone()), ), ) .add_service( tonic::codegen::InterceptedService::new( NodeServiceServer::new(node_service.as_ref().clone()), make_interceptor(auth_service.clone()), ), ) .add_service(DeploymentServiceServer::new(deployment_service)) .serve(config.server.addr); // HTTP REST API server let http_addr = config.server.http_addr; let rest_state = rest::RestApiState { pod_service: pod_service.clone(), service_service: service_service.clone(), node_service: node_service.clone(), auth_service: auth_service.clone(), }; let rest_app = rest::build_router(rest_state); let http_listener = tokio::net::TcpListener::bind(&http_addr).await?; info!("k8shost HTTP REST API server starting on {}", http_addr); let http_server = async move { axum::serve(http_listener, rest_app) .await .map_err(|e| format!("HTTP server error: {}", e)) }; // Run both servers concurrently tokio::select! { result = grpc_server => { result?; } result = http_server => { result?; } } Ok(()) } // Deployment Service Implementation (placeholder - not part of MVP) #[derive(Debug, Default)] struct DeploymentServiceImpl; #[tonic::async_trait] impl DeploymentService for DeploymentServiceImpl { async fn create_deployment( &self, _request: Request, ) -> Result, Status> { Err(Status::unimplemented("create_deployment not yet implemented")) } async fn get_deployment( &self, _request: Request, ) -> Result, Status> { Err(Status::unimplemented("get_deployment not yet implemented")) } async fn list_deployments( &self, _request: Request, ) -> Result, Status> { Err(Status::unimplemented("list_deployments not yet implemented")) } async fn update_deployment( &self, _request: Request, ) -> Result, Status> { Err(Status::unimplemented("update_deployment not yet implemented")) } async fn delete_deployment( &self, _request: Request, ) -> Result, Status> { Err(Status::unimplemented("delete_deployment not yet implemented")) } } fn init_logging(level: &str) { tracing_subscriber::fmt() .with_env_filter(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level))) .init(); } async fn register_chainfire_membership( endpoint: &str, service: &str, addr: String, ) -> Result<()> { let node_id = std::env::var("HOSTNAME").unwrap_or_else(|_| format!("{}-{}", service, std::process::id())); let ts = SystemTime::now() .duration_since(UNIX_EPOCH) .unwrap_or_default() .as_secs(); let key = format!("/cluster/{}/members/{}", service, node_id); let value = format!(r#"{{"addr":"{}","ts":{}}}"#, addr, ts); let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(120); let mut attempt = 0usize; let mut last_error = String::new(); loop { attempt += 1; match ChainFireClient::connect(endpoint).await { Ok(mut client) => match client.put_str(&key, &value).await { Ok(_) => return Ok(()), Err(error) => last_error = format!("put failed: {}", error), }, Err(error) => last_error = format!("connect failed: {}", error), } if tokio::time::Instant::now() >= deadline { break; } tracing::warn!( attempt, endpoint, service, error = %last_error, "retrying ChainFire membership registration" ); tokio::time::sleep(std::time::Duration::from_secs(2)).await; } anyhow::bail!( "failed to register ChainFire membership for {} via {} after {} attempts: {}", service, endpoint, attempt, last_error ) }