//! k8shost API server binary: loads layered configuration, connects to
//! FlareDB/IAM, and serves the gRPC and HTTP REST APIs concurrently.
mod auth;
|
|
mod cni;
|
|
mod config;
|
|
mod fiberlb_controller;
|
|
mod flashdns_controller;
|
|
mod ipam_client;
|
|
mod rest;
|
|
mod scheduler;
|
|
mod services;
|
|
mod storage;
|
|
|
|
use anyhow::Result;
|
|
use auth::AuthService;
|
|
use chainfire_client::Client as ChainFireClient;
|
|
use clap::Parser;
|
|
use config::Config;
|
|
use ipam_client::IpamClient;
|
|
use metrics_exporter_prometheus::PrometheusBuilder;
|
|
use k8shost_proto::{
|
|
deployment_service_server::{DeploymentService, DeploymentServiceServer},
|
|
node_service_server::NodeServiceServer,
|
|
pod_service_server::PodServiceServer,
|
|
service_service_server::ServiceServiceServer,
|
|
*,
|
|
};
|
|
use services::{node::NodeServiceImpl, pod::PodServiceImpl, service::ServiceServiceImpl};
|
|
use std::{path::PathBuf, sync::Arc};
|
|
use std::time::{SystemTime, UNIX_EPOCH};
|
|
use storage::Storage;
|
|
use tonic::{transport::Server, Request, Response, Status};
|
|
use tracing::{info, warn};
|
|
use tracing_subscriber::EnvFilter;
|
|
|
|
/// k8shost API Server
///
/// Command-line arguments. Every option is optional: unset options fall
/// back to the layered configuration built in `main` (built-in defaults,
/// then `K8SHOST_*` environment variables, then an optional config file).
#[derive(Parser, Debug)]
#[command(name = "k8shost-server")]
#[command(about = "Kubernetes API server for PlasmaCloud's k8shost component")]
struct Args {
    /// Configuration file path
    #[arg(short, long)]
    config: Option<PathBuf>,

    /// Listen address for gRPC server (e.g., "[::]:6443")
    #[arg(long)]
    addr: Option<String>,

    /// Log level (e.g., "info", "debug", "trace")
    #[arg(long)]
    log_level: Option<String>,

    /// FlareDB Placement Driver address (e.g., "127.0.0.1:2479")
    #[arg(long)]
    flaredb_pd_addr: Option<String>,

    /// FlareDB direct address (e.g., "127.0.0.1:50052")
    #[arg(long)]
    flaredb_direct_addr: Option<String>,

    /// ChainFire endpoint for cluster coordination (e.g., "http://127.0.0.1:2379")
    #[arg(long)]
    chainfire_endpoint: Option<String>,

    /// IAM server address (e.g., "http://127.0.0.1:50051")
    #[arg(long)]
    iam_server_addr: Option<String>,

    /// FiberLB server address (e.g., "http://127.0.0.1:50082")
    #[arg(long)]
    fiberlb_server_addr: Option<String>,

    /// FlashDNS server address (e.g., "http://127.0.0.1:50053")
    #[arg(long)]
    flashdns_server_addr: Option<String>,

    /// PrismNET server address (e.g., "http://127.0.0.1:50081")
    #[arg(long)]
    prismnet_server_addr: Option<String>,

    /// Metrics port for Prometheus scraping
    #[arg(long, default_value = "9094")]
    metrics_port: u16,
}
|
|
|
|
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Args::parse();

    // Load configuration in layers: serialized `Config::default()` as the
    // base, then `K8SHOST_*` environment variables on top.
    // NOTE(review): with separator "_", nested keys like `flaredb.pd_addr`
    // map from `K8SHOST_FLAREDB_PD_ADDR` ambiguously (every `_` splits) —
    // confirm the env mapping actually round-trips for multi-word fields.
    let mut settings = ::config::Config::builder()
        .add_source(::config::File::from_str(
            toml::to_string(&Config::default())?.as_str(),
            ::config::FileFormat::Toml,
        ))
        .add_source(::config::Environment::with_prefix("K8SHOST").separator("_"));

    // Add config file if specified (highest-priority file source).
    if let Some(config_path) = &args.config {
        // NOTE(review): this fires before init_logging() below, so it is
        // dropped unless a subscriber is already installed — confirm ordering.
        info!("Loading config from file: {}", config_path.display());
        settings = settings.add_source(::config::File::from(config_path.as_path()));
    }

    let loaded_config: Config = settings
        .build()?
        .try_deserialize()
        .map_err(|e| anyhow::anyhow!("Failed to load configuration: {}", e))?;

    // Apply CLI overrides to the loaded configuration. CLI flags win over
    // file/env values when present.
    let config = Config {
        server: config::ServerConfig {
            // NOTE(review): an unparseable --addr silently falls back to the
            // loaded address instead of erroring — confirm that is intended.
            addr: args
                .addr
                .map(|s| s.parse().unwrap_or(loaded_config.server.addr))
                .unwrap_or(loaded_config.server.addr),
            http_addr: loaded_config.server.http_addr,
            log_level: args.log_level.unwrap_or(loaded_config.server.log_level),
        },
        flaredb: config::FlareDbConfig {
            pd_addr: args.flaredb_pd_addr.or(loaded_config.flaredb.pd_addr),
            direct_addr: args.flaredb_direct_addr.or(loaded_config.flaredb.direct_addr),
        },
        chainfire: config::ChainFireConfig {
            endpoint: args.chainfire_endpoint.or(loaded_config.chainfire.endpoint),
        },
        iam: config::IamConfig {
            server_addr: args.iam_server_addr.unwrap_or(loaded_config.iam.server_addr),
        },
        fiberlb: config::FiberLbConfig {
            server_addr: args.fiberlb_server_addr.unwrap_or(loaded_config.fiberlb.server_addr),
        },
        flashdns: config::FlashDnsConfig {
            server_addr: args.flashdns_server_addr.unwrap_or(loaded_config.flashdns.server_addr),
        },
        prismnet: config::PrismNetConfig {
            server_addr: args
                .prismnet_server_addr
                .unwrap_or(loaded_config.prismnet.server_addr),
        },
    };

    // Initialize tracing (everything logged before this point was dropped).
    init_logging(&config.server.log_level);

    info!("Starting k8shost API server on {}", config.server.addr);

    // Best-effort ChainFire membership registration in the background; a
    // failure is logged but does not prevent the server from starting.
    if let Some(endpoint) = &config.chainfire.endpoint {
        let endpoint = endpoint.clone();
        let addr = config.server.addr.to_string();
        tokio::spawn(async move {
            if let Err(error) = register_chainfire_membership(&endpoint, "k8shost", addr).await {
                warn!(error = %error, "ChainFire membership registration failed");
            }
        });
    }

    // Initialize Prometheus metrics exporter on its own HTTP listener.
    let metrics_addr = format!("0.0.0.0:{}", args.metrics_port);
    let builder = PrometheusBuilder::new();
    builder
        .with_http_listener(metrics_addr.parse::<std::net::SocketAddr>()?)
        .install()
        .expect("Failed to install Prometheus metrics exporter");

    info!(
        "Prometheus metrics available at http://{}/metrics",
        metrics_addr
    );

    // Initialize FlareDB storage. Prefer direct access when configured, but fall back to
    // the placement-driver path so service startup is resilient to early direct endpoint races.
    // Startup is aborted only when every configured path fails.
    let storage = if let Some(addr) = &config.flaredb.direct_addr {
        info!("Connecting to FlareDB directly at {}", addr);
        match Storage::new_direct(addr.clone()).await {
            Ok(s) => {
                info!("Successfully connected to FlareDB (direct)");
                Arc::new(s)
            }
            Err(e) => {
                warn!("Failed direct FlareDB connection: {}", e);
                if let Some(pd_addr) = &config.flaredb.pd_addr {
                    info!("Falling back to FlareDB PD at {}", pd_addr);
                    match Storage::new(pd_addr.clone()).await {
                        Ok(s) => {
                            info!("Successfully connected to FlareDB via PD fallback");
                            Arc::new(s)
                        }
                        Err(pd_error) => {
                            warn!(
                                "Failed to connect to FlareDB via PD fallback: {}. Server will not start.",
                                pd_error
                            );
                            // Both paths failed: report both errors to the operator.
                            return Err(anyhow::anyhow!(
                                "Failed to connect to FlareDB directly ({}) and via PD fallback ({}).",
                                e,
                                pd_error
                            )
                            .into());
                        }
                    }
                } else {
                    return Err(
                        anyhow::anyhow!("Failed to connect to FlareDB (direct): {}", e).into()
                    );
                }
            }
        }
    } else if let Some(addr) = &config.flaredb.pd_addr {
        info!("Connecting to FlareDB PD at {}", addr);
        match Storage::new(addr.clone()).await {
            Ok(s) => {
                info!("Successfully connected to FlareDB");
                Arc::new(s)
            }
            Err(e) => {
                warn!("Failed to connect to FlareDB: {}. Server will not start.", e);
                return Err(anyhow::anyhow!("Failed to connect to FlareDB: {}", e).into());
            }
        }
    } else {
        return Err(anyhow::anyhow!("No FlareDB address configured.").into());
    };

    // Initialize IAM authentication service; a reachable IAM server is a
    // hard startup requirement.
    info!("Connecting to IAM server at {}", config.iam.server_addr);

    let auth_service = match AuthService::new(&config.iam.server_addr).await {
        Ok(s) => {
            info!("Successfully connected to IAM server");
            Arc::new(s)
        }
        Err(e) => {
            warn!("Failed to connect to IAM server: {}. Server will not start.", e);
            return Err(anyhow::anyhow!("Failed to connect to IAM server: {}", e).into());
        }
    };

    // Dedicated runtime for auth interceptors to avoid blocking the main async runtime.
    // The interceptor closure is synchronous (tonic interceptors are not async), so
    // the async authenticate_request is bridged via block_in_place + block_on.
    // NOTE(review): block_in_place requires the multi-thread flavor of the outer
    // runtime (the #[tokio::main] default) — confirm no flavor override elsewhere.
    let auth_runtime = Arc::new(tokio::runtime::Runtime::new()?);
    let make_interceptor = |auth: Arc<AuthService>| {
        let rt = auth_runtime.clone();
        move |mut req: Request<()>| -> Result<Request<()>, Status> {
            let auth = auth.clone();
            tokio::task::block_in_place(|| {
                rt.block_on(async move {
                    // On success the tenant context is attached to the request
                    // extensions for downstream handlers to read.
                    let tenant_context = auth.authenticate_request(&req).await?;
                    req.extensions_mut().insert(tenant_context);
                    Ok(req)
                })
            })
        }
    };

    // Create IPAM client (address allocation for Services via PrismNET).
    let ipam_client = Arc::new(IpamClient::new(config.prismnet.server_addr.clone()));

    // Create service implementations with storage.
    let pod_service = Arc::new(
        PodServiceImpl::new_with_credit_service(storage.clone(), auth_service.clone()).await,
    );
    let service_service = Arc::new(ServiceServiceImpl::new(
        storage.clone(),
        ipam_client,
        auth_service.clone(),
    ));
    let node_service = Arc::new(NodeServiceImpl::new(storage.clone(), auth_service.clone()));
    let deployment_service = DeploymentServiceImpl; // Still unimplemented

    // Start scheduler in background with CreditService integration.
    let scheduler = Arc::new(scheduler::Scheduler::new_with_credit_service(storage.clone()).await);
    tokio::spawn(async move {
        scheduler.run().await;
    });
    info!("Scheduler started - tenant-aware with quota enforcement");

    // Start FiberLB controller in background (LoadBalancer service reconciliation).
    let fiberlb_controller = Arc::new(fiberlb_controller::FiberLbController::new(
        storage.clone(),
        config.fiberlb.server_addr.clone(),
        config.iam.server_addr.clone(),
    ));
    tokio::spawn(async move {
        fiberlb_controller.run().await;
    });
    info!("FiberLB controller started - monitoring LoadBalancer services with per-tenant IAM tokens");

    // Start FlashDNS controller in background (cluster DNS reconciliation).
    let flashdns_controller = Arc::new(flashdns_controller::FlashDnsController::new(
        storage.clone(),
        config.flashdns.server_addr.clone(),
        config.iam.server_addr.clone(),
    ));
    tokio::spawn(async move {
        flashdns_controller.run().await;
    });
    info!("FlashDNS controller started - managing cluster.local DNS records with per-tenant IAM tokens");

    info!("Starting gRPC server with authentication...");

    // Build gRPC server with the auth interceptor on every tenant-facing
    // service; the placeholder DeploymentService is registered without auth.
    let grpc_server = Server::builder()
        .add_service(
            tonic::codegen::InterceptedService::new(
                PodServiceServer::new(pod_service.as_ref().clone()),
                make_interceptor(auth_service.clone()),
            ),
        )
        .add_service(
            tonic::codegen::InterceptedService::new(
                ServiceServiceServer::new(service_service.as_ref().clone()),
                make_interceptor(auth_service.clone()),
            ),
        )
        .add_service(
            tonic::codegen::InterceptedService::new(
                NodeServiceServer::new(node_service.as_ref().clone()),
                make_interceptor(auth_service.clone()),
            ),
        )
        .add_service(DeploymentServiceServer::new(deployment_service))
        .serve(config.server.addr);

    // HTTP REST API server sharing the same service implementations.
    let http_addr = config.server.http_addr;
    let rest_state = rest::RestApiState {
        pod_service: pod_service.clone(),
        service_service: service_service.clone(),
        node_service: node_service.clone(),
        auth_service: auth_service.clone(),
    };
    let rest_app = rest::build_router(rest_state);
    let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;

    info!("k8shost HTTP REST API server starting on {}", http_addr);

    let http_server = async move {
        axum::serve(http_listener, rest_app)
            .await
            .map_err(|e| format!("HTTP server error: {}", e))
    };

    // Run both servers concurrently; if either exits (normally or with an
    // error), the whole process winds down.
    tokio::select! {
        result = grpc_server => {
            result?;
        }
        result = http_server => {
            result?;
        }
    }

    Ok(())
}
|
|
|
|
// Deployment Service Implementation (placeholder - not part of MVP)
|
|
#[derive(Debug, Default)]
|
|
struct DeploymentServiceImpl;
|
|
|
|
#[tonic::async_trait]
|
|
impl DeploymentService for DeploymentServiceImpl {
|
|
async fn create_deployment(
|
|
&self,
|
|
_request: Request<CreateDeploymentRequest>,
|
|
) -> Result<Response<CreateDeploymentResponse>, Status> {
|
|
Err(Status::unimplemented("create_deployment not yet implemented"))
|
|
}
|
|
|
|
async fn get_deployment(
|
|
&self,
|
|
_request: Request<GetDeploymentRequest>,
|
|
) -> Result<Response<GetDeploymentResponse>, Status> {
|
|
Err(Status::unimplemented("get_deployment not yet implemented"))
|
|
}
|
|
|
|
async fn list_deployments(
|
|
&self,
|
|
_request: Request<ListDeploymentsRequest>,
|
|
) -> Result<Response<ListDeploymentsResponse>, Status> {
|
|
Err(Status::unimplemented("list_deployments not yet implemented"))
|
|
}
|
|
|
|
async fn update_deployment(
|
|
&self,
|
|
_request: Request<UpdateDeploymentRequest>,
|
|
) -> Result<Response<UpdateDeploymentResponse>, Status> {
|
|
Err(Status::unimplemented("update_deployment not yet implemented"))
|
|
}
|
|
|
|
async fn delete_deployment(
|
|
&self,
|
|
_request: Request<DeleteDeploymentRequest>,
|
|
) -> Result<Response<DeleteDeploymentResponse>, Status> {
|
|
Err(Status::unimplemented("delete_deployment not yet implemented"))
|
|
}
|
|
}
|
|
|
|
fn init_logging(level: &str) {
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(level)))
|
|
.init();
|
|
}
|
|
|
|
async fn register_chainfire_membership(
|
|
endpoint: &str,
|
|
service: &str,
|
|
addr: String,
|
|
) -> Result<()> {
|
|
let node_id =
|
|
std::env::var("HOSTNAME").unwrap_or_else(|_| format!("{}-{}", service, std::process::id()));
|
|
let ts = SystemTime::now()
|
|
.duration_since(UNIX_EPOCH)
|
|
.unwrap_or_default()
|
|
.as_secs();
|
|
let key = format!("/cluster/{}/members/{}", service, node_id);
|
|
let value = format!(r#"{{"addr":"{}","ts":{}}}"#, addr, ts);
|
|
let deadline = tokio::time::Instant::now() + std::time::Duration::from_secs(120);
|
|
let mut attempt = 0usize;
|
|
let mut last_error = String::new();
|
|
|
|
loop {
|
|
attempt += 1;
|
|
match ChainFireClient::connect(endpoint).await {
|
|
Ok(mut client) => match client.put_str(&key, &value).await {
|
|
Ok(_) => return Ok(()),
|
|
Err(error) => last_error = format!("put failed: {}", error),
|
|
},
|
|
Err(error) => last_error = format!("connect failed: {}", error),
|
|
}
|
|
|
|
if tokio::time::Instant::now() >= deadline {
|
|
break;
|
|
}
|
|
|
|
tracing::warn!(
|
|
attempt,
|
|
endpoint,
|
|
service,
|
|
error = %last_error,
|
|
"retrying ChainFire membership registration"
|
|
);
|
|
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
|
|
}
|
|
|
|
anyhow::bail!(
|
|
"failed to register ChainFire membership for {} via {} after {} attempts: {}",
|
|
service,
|
|
endpoint,
|
|
attempt,
|
|
last_error
|
|
)
|
|
}
|