//! k8shost-server: Kubernetes-style API server binary for PhotonCloud's k8shost component.
//! (photoncloud-monorepo/k8shost/crates/k8shost-server/src/main.rs)

mod auth;
mod cni;
mod config;
mod fiberlb_controller;
mod flashdns_controller;
mod ipam_client;
mod rest;
mod scheduler;
mod services;
mod storage;
use anyhow::Result;
use auth::AuthService;
use chainfire_client::Client as ChainFireClient;
use clap::Parser;
use config::Config;
use ipam_client::IpamClient;
use metrics_exporter_prometheus::PrometheusBuilder;
use k8shost_proto::{
deployment_service_server::{DeploymentService, DeploymentServiceServer},
node_service_server::NodeServiceServer,
pod_service_server::PodServiceServer,
service_service_server::ServiceServiceServer,
*,
};
use services::{node::NodeServiceImpl, pod::PodServiceImpl, service::ServiceServiceImpl};
use std::{path::PathBuf, sync::Arc};
use std::time::{SystemTime, UNIX_EPOCH};
use storage::Storage;
use tonic::{transport::Server, Request, Response, Status};
use tracing::{info, warn};
use tracing_subscriber::EnvFilter;
/// k8shost API Server
// NOTE: the `///` doc comments below are rendered by clap as `--help` text,
// so they are user-facing output and must not be reworded casually.
// CLI flags are the highest-priority configuration layer: every Option field
// here overrides the corresponding value from the config file / environment
// when present (see the override logic in `main`).
#[derive(Parser, Debug)]
#[command(name = "k8shost-server")]
#[command(about = "Kubernetes API server for PlasmaCloud's k8shost component")]
struct Args {
/// Configuration file path
#[arg(short, long)]
config: Option<PathBuf>,
/// Listen address for gRPC server (e.g., "[::]:6443")
#[arg(long)]
addr: Option<String>,
/// Log level (e.g., "info", "debug", "trace")
#[arg(long)]
log_level: Option<String>,
/// FlareDB Placement Driver address (e.g., "127.0.0.1:2479")
#[arg(long)]
flaredb_pd_addr: Option<String>,
/// FlareDB direct address (e.g., "127.0.0.1:50052")
#[arg(long)]
flaredb_direct_addr: Option<String>,
/// ChainFire endpoint for cluster coordination (e.g., "http://127.0.0.1:2379")
#[arg(long)]
chainfire_endpoint: Option<String>,
/// IAM server address (e.g., "http://127.0.0.1:50051")
#[arg(long)]
iam_server_addr: Option<String>,
/// FiberLB server address (e.g., "http://127.0.0.1:50082")
#[arg(long)]
fiberlb_server_addr: Option<String>,
/// FlashDNS server address (e.g., "http://127.0.0.1:50053")
#[arg(long)]
flashdns_server_addr: Option<String>,
/// PrismNET server address (e.g., "http://127.0.0.1:50081")
#[arg(long)]
prismnet_server_addr: Option<String>,
/// Metrics port for Prometheus scraping
// Not Option: metrics are always exported; only the port is configurable here.
#[arg(long, default_value = "9094")]
metrics_port: u16,
}
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();
// Load configuration
let mut settings = ::config::Config::builder()
.add_source(::config::File::from_str(
toml::to_string(&Config::default())?.as_str(),
::config::FileFormat::Toml,
))
.add_source(::config::Environment::with_prefix("K8SHOST").separator("_"));
// Add config file if specified
if let Some(config_path) = &args.config {
info!("Loading config from file: {}", config_path.display());
settings = settings.add_source(::config::File::from(config_path.as_path()));
}
let loaded_config: Config = settings
.build()?
.try_deserialize()
.map_err(|e| anyhow::anyhow!("Failed to load configuration: {}", e))?;
// Apply CLI overrides to the loaded configuration
let config = Config {
server: config::ServerConfig {
addr: args
.addr
.map(|s| s.parse().unwrap_or(loaded_config.server.addr))
.unwrap_or(loaded_config.server.addr),
http_addr: loaded_config.server.http_addr,
log_level: args.log_level.unwrap_or(loaded_config.server.log_level),
},
flaredb: config::FlareDbConfig {
pd_addr: args.flaredb_pd_addr.or(loaded_config.flaredb.pd_addr),
direct_addr: args.flaredb_direct_addr.or(loaded_config.flaredb.direct_addr),
},
chainfire: config::ChainFireConfig {
endpoint: args.chainfire_endpoint.or(loaded_config.chainfire.endpoint),
},
iam: config::IamConfig {
server_addr: args.iam_server_addr.unwrap_or(loaded_config.iam.server_addr),
},
fiberlb: config::FiberLbConfig {
server_addr: args.fiberlb_server_addr.unwrap_or(loaded_config.fiberlb.server_addr),
},
flashdns: config::FlashDnsConfig {
server_addr: args.flashdns_server_addr.unwrap_or(loaded_config.flashdns.server_addr),
},
prismnet: config::PrismNetConfig {
server_addr: args
.prismnet_server_addr
.unwrap_or(loaded_config.prismnet.server_addr),
},
};
// Initialize tracing
init_logging(&config.server.log_level);
info!("Starting k8shost API server on {}", config.server.addr);
if let Some(endpoint) = &config.chainfire.endpoint {
let endpoint = endpoint.clone();
let addr = config.server.addr.to_string();
tokio::spawn(async move {
if let Err(error) = register_chainfire_membership(&endpoint, "k8shost", addr).await {
warn!(error = %error, "ChainFire membership registration failed");
}
});
}
// Initialize Prometheus metrics exporter
let metrics_addr = format!("0.0.0.0:{}", args.metrics_port);
let builder = PrometheusBuilder::new();
builder
.with_http_listener(metrics_addr.parse::<std::net::SocketAddr>()?)
.install()
.expect("Failed to install Prometheus metrics exporter");
info!(
"Prometheus metrics available at http://{}/metrics",
metrics_addr
);
// Initialize FlareDB storage. Prefer direct access when configured, but fall back to
// the placement-driver path so service startup is resilient to early direct endpoint races.
let storage = if let Some(addr) = &config.flaredb.direct_addr {
info!("Connecting to FlareDB directly at {}", addr);
match Storage::new_direct(addr.clone()).await {
Ok(s) => {
info!("Successfully connected to FlareDB (direct)");
Arc::new(s)
}
Err(e) => {
warn!("Failed direct FlareDB connection: {}", e);
if let Some(pd_addr) = &config.flaredb.pd_addr {
info!("Falling back to FlareDB PD at {}", pd_addr);
match Storage::new(pd_addr.clone()).await {
Ok(s) => {
info!("Successfully connected to FlareDB via PD fallback");
Arc::new(s)
}
Err(pd_error) => {
warn!(
"Failed to connect to FlareDB via PD fallback: {}. Server will not start.",
pd_error
);
return Err(anyhow::anyhow!(
"Failed to connect to FlareDB directly ({}) and via PD fallback ({}).",
e,
pd_error
)
.into());
}
}
} else {
return Err(
anyhow::anyhow!("Failed to connect to FlareDB (direct): {}", e).into()
);
}
}
}
} else if let Some(addr) = &config.flaredb.pd_addr {
info!("Connecting to FlareDB PD at {}", addr);
match Storage::new(addr.clone()).await {
Ok(s) => {
info!("Successfully connected to FlareDB");
Arc::new(s)
}
Err(e) => {
warn!("Failed to connect to FlareDB: {}. Server will not start.", e);
return Err(anyhow::anyhow!("Failed to connect to FlareDB: {}", e).into());
}
}
} else {
return Err(anyhow::anyhow!("No FlareDB address configured.").into());
};
// Initialize IAM authentication service
info!("Connecting to IAM server at {}", config.iam.server_addr);
let auth_service = match AuthService::new(&config.iam.server_addr).await {
Ok(s) => {
info!("Successfully connected to IAM server");
Arc::new(s)
}
Err(e) => {
warn!("Failed to connect to IAM server: {}. Server will not start.", e);
return Err(anyhow::anyhow!("Failed to connect to IAM server: {}", e).into());
}
};
// Dedicated runtime for auth interceptors to avoid blocking the main async runtime
let auth_runtime = Arc::new(tokio::runtime::Runtime::new()?);
let make_interceptor = |auth: Arc<AuthService>| {
let rt = auth_runtime.clone();
move |mut req: Request<()>| -> Result<Request<()>, Status> {
let auth = auth.clone();
tokio::task::block_in_place(|| {
rt.block_on(async move {
let tenant_context = auth.authenticate_request(&req).await?;
req.extensions_mut().insert(tenant_context);
Ok(req)
})
})
}
};
// Create IPAM client
let ipam_client = Arc::new(IpamClient::new(config.prismnet.server_addr.clone()));
// Create service implementations with storage
let pod_service = Arc::new(
PodServiceImpl::new_with_credit_service(storage.clone(), auth_service.clone()).await,
);
let service_service = Arc::new(ServiceServiceImpl::new(
storage.clone(),
ipam_client,
auth_service.clone(),
));
let node_service = Arc::new(NodeServiceImpl::new(storage.clone(), auth_service.clone()));
let deployment_service = DeploymentServiceImpl; // Still unimplemented
// Start scheduler in background with CreditService integration
let scheduler = Arc::new(scheduler::Scheduler::new_with_credit_service(storage.clone()).await);
tokio::spawn(async move {
scheduler.run().await;
});
info!("Scheduler started - tenant-aware with quota enforcement");
// Start FiberLB controller in background
let fiberlb_controller = Arc::new(fiberlb_controller::FiberLbController::new(
storage.clone(),
config.fiberlb.server_addr.clone(),
config.iam.server_addr.clone(),
));
tokio::spawn(async move {
fiberlb_controller.run().await;
});
info!("FiberLB controller started - monitoring LoadBalancer services with per-tenant IAM tokens");
// Start FlashDNS controller in background
let flashdns_controller = Arc::new(flashdns_controller::FlashDnsController::new(
storage.clone(),
config.flashdns.server_addr.clone(),
config.iam.server_addr.clone(),
));
tokio::spawn(async move {
flashdns_controller.run().await;
});
info!("FlashDNS controller started - managing cluster.local DNS records with per-tenant IAM tokens");
info!("Starting gRPC server with authentication...");
// Build gRPC server with authentication layer
let grpc_server = Server::builder()
.add_service(
tonic::codegen::InterceptedService::new(
PodServiceServer::new(pod_service.as_ref().clone()),
make_interceptor(auth_service.clone()),
),
)
.add_service(
tonic::codegen::InterceptedService::new(
ServiceServiceServer::new(service_service.as_ref().clone()),
make_interceptor(auth_service.clone()),
),
)
.add_service(
tonic::codegen::InterceptedService::new(
NodeServiceServer::new(node_service.as_ref().clone()),
make_interceptor(auth_service.clone()),
),
)
.add_service(DeploymentServiceServer::new(deployment_service))
.serve(config.server.addr);
// HTTP REST API server
let http_addr = config.server.http_addr;
let rest_state = rest::RestApiState {
pod_service: pod_service.clone(),
service_service: service_service.clone(),
node_service: node_service.clone(),
auth_service: auth_service.clone(),
};
let rest_app = rest::build_router(rest_state);
let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
info!("k8shost HTTP REST API server starting on {}", http_addr);
let http_server = async move {
axum::serve(http_listener, rest_app)
.await
.map_err(|e| format!("HTTP server error: {}", e))
};
// Run both servers concurrently
tokio::select! {
result = grpc_server => {
result?;
}
result = http_server => {
result?;
}
}
Ok(())
}
// Deployment Service placeholder — deployments are not part of the MVP; every
// RPC answers with gRPC UNIMPLEMENTED.
#[derive(Debug, Default)]
struct DeploymentServiceImpl;

impl DeploymentServiceImpl {
    /// Builds the UNIMPLEMENTED status returned by every deployment RPC.
    fn not_ready(rpc: &str) -> Status {
        Status::unimplemented(format!("{} not yet implemented", rpc))
    }
}

#[tonic::async_trait]
impl DeploymentService for DeploymentServiceImpl {
    async fn create_deployment(
        &self,
        _request: Request<CreateDeploymentRequest>,
    ) -> Result<Response<CreateDeploymentResponse>, Status> {
        Err(Self::not_ready("create_deployment"))
    }

    async fn get_deployment(
        &self,
        _request: Request<GetDeploymentRequest>,
    ) -> Result<Response<GetDeploymentResponse>, Status> {
        Err(Self::not_ready("get_deployment"))
    }

    async fn list_deployments(
        &self,
        _request: Request<ListDeploymentsRequest>,
    ) -> Result<Response<ListDeploymentsResponse>, Status> {
        Err(Self::not_ready("list_deployments"))
    }

    async fn update_deployment(
        &self,
        _request: Request<UpdateDeploymentRequest>,
    ) -> Result<Response<UpdateDeploymentResponse>, Status> {
        Err(Self::not_ready("update_deployment"))
    }

    async fn delete_deployment(
        &self,
        _request: Request<DeleteDeploymentRequest>,
    ) -> Result<Response<DeleteDeploymentResponse>, Status> {
        Err(Self::not_ready("delete_deployment"))
    }
}
/// Install the global tracing subscriber.
///
/// A `RUST_LOG` environment filter, when present and valid, takes precedence
/// over the configured `level` string.
fn init_logging(level: &str) {
    let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level));
    tracing_subscriber::fmt().with_env_filter(filter).init();
}
/// Register this server in the ChainFire membership registry.
///
/// Writes `/cluster/{service}/members/{node_id}` with the advertised address
/// and a unix-seconds timestamp, retrying every 2s for up to ~120s before
/// giving up with an error describing the last failure.
async fn register_chainfire_membership(
    endpoint: &str,
    service: &str,
    addr: String,
) -> Result<()> {
    // Node identity: pod hostname when available, otherwise "<service>-<pid>".
    let node_id =
        std::env::var("HOSTNAME").unwrap_or_else(|_| format!("{}-{}", service, std::process::id()));
    let ts = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .unwrap_or_default()
        .as_secs();
    let key = format!("/cluster/{}/members/{}", service, node_id);
    let value = format!(r#"{{"addr":"{}","ts":{}}}"#, addr, ts);

    let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(120);
    let mut attempt = 0usize;
    let mut last_error = String::new();
    loop {
        attempt += 1;
        // Collapse connect + put into a single outcome for this attempt.
        let outcome = match ChainFireClient::connect(endpoint).await {
            Ok(mut client) => client
                .put_str(&key, &value)
                .await
                .map(|_| ())
                .map_err(|error| format!("put failed: {}", error)),
            Err(error) => Err(format!("connect failed: {}", error)),
        };
        match outcome {
            Ok(()) => return Ok(()),
            Err(message) => last_error = message,
        }
        if tokio::time::Instant::now() >= deadline {
            break;
        }
        tracing::warn!(
            attempt,
            endpoint,
            service,
            error = %last_error,
            "retrying ChainFire membership registration"
        );
        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
    }
    anyhow::bail!(
        "failed to register ChainFire membership for {} via {} after {} attempts: {}",
        service,
        endpoint,
        attempt,
        last_error
    )
}