photoncloud-monorepo/plasmavmc/crates/plasmavmc-server/src/main.rs

453 lines
16 KiB
Rust

//! PlasmaVMC control plane server binary
use clap::Parser;
use iam_service_auth::AuthService;
use metrics_exporter_prometheus::PrometheusBuilder;
use plasmavmc_api::proto::image_service_server::ImageServiceServer;
use plasmavmc_api::proto::node_service_client::NodeServiceClient;
use plasmavmc_api::proto::node_service_server::NodeServiceServer;
use plasmavmc_api::proto::vm_service_server::VmServiceServer;
use plasmavmc_api::proto::volume_service_server::VolumeServiceServer;
use plasmavmc_api::proto::{
HeartbeatNodeRequest, HypervisorType as ProtoHypervisorType, NodeCapacity,
NodeState as ProtoNodeState, VolumeDriverKind as ProtoVolumeDriverKind,
};
use plasmavmc_firecracker::FireCrackerBackend;
use plasmavmc_hypervisor::HypervisorRegistry;
use plasmavmc_kvm::KvmBackend;
use plasmavmc_server::config::ServerConfig;
use plasmavmc_server::watcher::{StateSynchronizer, StateWatcher, WatcherConfig};
use plasmavmc_server::VmServiceImpl;
use std::net::SocketAddr;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use std::{collections::HashMap, fs};
use tonic::transport::{Certificate, Endpoint, Identity, Server, ServerTlsConfig};
use tonic::{Request, Status};
use tonic_health::server::health_reporter;
use tracing_subscriber::EnvFilter;
/// PlasmaVMC control plane server
// NOTE: the `///` docs on this struct and its fields double as clap help
// text (derive(Parser)); edit them with the rendered CLI output in mind.
#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
    /// Configuration file path
    // If the file does not exist, main() falls back to ServerConfig::default().
    #[arg(short, long, default_value = "plasmavmc.toml")]
    config: PathBuf,
    /// Address to listen on (overrides config)
    // Kept as a String and parsed in main() so a bad value surfaces as a
    // startup error rather than a clap parse failure.
    #[arg(short, long)]
    addr: Option<String>,
    /// Log level (overrides config)
    // Only consulted when RUST_LOG is unset (see tracing init in main()).
    #[arg(short, long)]
    log_level: Option<String>,
    /// Path to the Firecracker kernel image (overrides config)
    #[arg(long)]
    firecracker_kernel_path: Option<PathBuf>,
    /// Path to the Firecracker rootfs image (overrides config)
    #[arg(long)]
    firecracker_rootfs_path: Option<PathBuf>,
    /// Metrics port for Prometheus scraping
    // The exporter binds 0.0.0.0:<metrics_port>, separate from the gRPC/HTTP
    // listeners configured via ServerConfig.
    #[arg(long, default_value = "9102")]
    metrics_port: u16,
}
/// Ensure an endpoint string carries a URI scheme.
///
/// Strings already beginning with `http://` or `https://` are returned
/// unchanged; anything else (e.g. a bare `host:port`) is prefixed with
/// `http://` so it can be handed to `Endpoint::from_shared`.
fn normalize_endpoint(endpoint: &str) -> String {
    let has_scheme = ["http://", "https://"]
        .iter()
        .any(|scheme| endpoint.starts_with(scheme));
    match has_scheme {
        true => endpoint.to_string(),
        false => format!("http://{endpoint}"),
    }
}
/// Best-effort total system memory in MiB, parsed from `/proc/meminfo`.
///
/// Returns 0 when the file is unreadable (e.g. non-Linux hosts) or when the
/// `MemTotal:` line is missing or malformed.
fn available_memory_mib() -> u64 {
    let contents = match fs::read_to_string("/proc/meminfo") {
        Ok(text) => text,
        Err(_) => return 0,
    };
    for line in contents.lines() {
        if let Some(rest) = line.strip_prefix("MemTotal:") {
            // The kernel reports the value in KiB, e.g. "MemTotal:  16384 kB";
            // take the first whitespace-separated token and convert to MiB.
            return rest
                .split_whitespace()
                .next()
                .and_then(|value| value.parse::<u64>().ok())
                .map(|kib| kib / 1024)
                .unwrap_or(0);
        }
    }
    0
}
/// Spawn a background task that periodically heartbeats this node to the
/// PlasmaVMC control plane.
///
/// The task only starts when both `PLASMAVMC_CONTROL_PLANE_ADDR` and
/// `PLASMAVMC_NODE_ID` are set to non-empty values; otherwise this is a
/// no-op. Additional environment knobs:
/// - `PLASMAVMC_ENDPOINT_ADVERTISE`: endpoint advertised to the control
///   plane (defaults to `local_addr`).
/// - `PLASMAVMC_NODE_NAME`: human-readable node name (defaults to node id).
/// - `PLASMAVMC_NODE_HEARTBEAT_INTERVAL_SECS`: heartbeat period (default 5s).
///
/// Connection or RPC failures are logged and retried on the next tick; the
/// spawned task never terminates on its own.
async fn start_agent_heartbeat(
    local_addr: SocketAddr,
    supported_volume_drivers: Vec<i32>,
    supported_storage_classes: Vec<String>,
    shared_live_migration: bool,
) {
    let Some(control_plane_addr) = std::env::var("PLASMAVMC_CONTROL_PLANE_ADDR")
        .ok()
        .map(|value| value.trim().to_string())
        .filter(|value| !value.is_empty())
    else {
        return;
    };
    let Some(node_id) = std::env::var("PLASMAVMC_NODE_ID")
        .ok()
        .map(|value| value.trim().to_string())
        .filter(|value| !value.is_empty())
    else {
        return;
    };
    let endpoint = normalize_endpoint(&control_plane_addr);
    let advertise_endpoint = std::env::var("PLASMAVMC_ENDPOINT_ADVERTISE")
        .ok()
        .map(|value| value.trim().to_string())
        .filter(|value| !value.is_empty())
        .unwrap_or_else(|| local_addr.to_string());
    let node_name = std::env::var("PLASMAVMC_NODE_NAME")
        .ok()
        .filter(|value| !value.trim().is_empty())
        .unwrap_or_else(|| node_id.clone());
    let heartbeat_secs = std::env::var("PLASMAVMC_NODE_HEARTBEAT_INTERVAL_SECS")
        .ok()
        .and_then(|value| value.parse::<u64>().ok())
        .unwrap_or(5);
    tokio::spawn(async move {
        // vCPU count and MemTotal do not change while the process runs, so
        // compute the capacity once instead of calling available_parallelism()
        // and re-reading /proc/meminfo twice per tick.
        let capacity = NodeCapacity {
            vcpus: std::thread::available_parallelism()
                .map(|parallelism| parallelism.get() as u32)
                .unwrap_or(1),
            memory_mib: available_memory_mib(),
            storage_gib: 0,
        };
        // Likewise, the advertised labels are constant for the task lifetime.
        let mut labels = HashMap::new();
        labels.insert("plasmavmc_endpoint".to_string(), advertise_endpoint.clone());
        let mut ticker = tokio::time::interval(Duration::from_secs(heartbeat_secs));
        loop {
            ticker.tick().await;
            // A fresh connection per tick keeps the task resilient to control
            // plane restarts at the cost of one handshake per heartbeat.
            let channel = match Endpoint::from_shared(endpoint.clone()) {
                Ok(endpoint) => match endpoint.connect().await {
                    Ok(channel) => channel,
                    Err(error) => {
                        tracing::warn!(%error, "Failed to connect to PlasmaVMC control plane for heartbeat");
                        continue;
                    }
                },
                Err(error) => {
                    tracing::warn!(%error, "Invalid PlasmaVMC control plane endpoint for heartbeat");
                    continue;
                }
            };
            let mut client = NodeServiceClient::new(channel);
            let request = HeartbeatNodeRequest {
                node_id: node_id.clone(),
                name: node_name.clone(),
                state: ProtoNodeState::Ready as i32,
                // Capacity and allocatable are reported as identical; nothing
                // here subtracts reserved resources.
                capacity: Some(capacity.clone()),
                allocatable: Some(capacity.clone()),
                // NOTE(review): only KVM is advertised even when a Firecracker
                // backend is registered in main() — confirm this is intended.
                hypervisors: vec![ProtoHypervisorType::Kvm as i32],
                labels: labels.clone(),
                agent_version: env!("CARGO_PKG_VERSION").to_string(),
                supported_volume_drivers: supported_volume_drivers.clone(),
                supported_storage_classes: supported_storage_classes.clone(),
                shared_live_migration,
            };
            if let Err(error) = client.heartbeat_node(request).await {
                tracing::warn!(%error, "Failed to heartbeat PlasmaVMC node");
            }
        }
    });
}
/// Entry point: loads configuration, wires up hypervisor backends, IAM auth,
/// optional background monitors, and then runs the gRPC and HTTP REST
/// servers concurrently until either one exits.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let args = Args::parse();
    // Load configuration from file or use defaults
    let mut config = if args.config.exists() {
        let contents = tokio::fs::read_to_string(&args.config).await?;
        toml::from_str(&contents)?
    } else {
        tracing::info!(
            "Config file not found: {}, using defaults",
            args.config.display()
        );
        ServerConfig::default()
    };
    // Apply command line overrides (CLI flags win over the config file)
    if let Some(addr_str) = args.addr {
        config.addr = addr_str.parse()?;
    }
    if let Some(log_level) = args.log_level {
        config.log_level = log_level;
    }
    if let Some(kernel_path) = args.firecracker_kernel_path {
        config.firecracker.kernel_path = Some(kernel_path);
    }
    if let Some(rootfs_path) = args.firecracker_rootfs_path {
        config.firecracker.rootfs_path = Some(rootfs_path);
    }
    // Initialize tracing; RUST_LOG (if set) takes precedence over the
    // configured log level.
    tracing_subscriber::fmt()
        .with_env_filter(
            EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&config.log_level)),
        )
        .init();
    tracing::info!("Starting PlasmaVMC server on {}", config.addr);
    // Initialize Prometheus metrics exporter on its own HTTP listener,
    // separate from the gRPC and REST ports.
    let metrics_addr = format!("0.0.0.0:{}", args.metrics_port);
    let builder = PrometheusBuilder::new();
    builder
        .with_http_listener(metrics_addr.parse::<std::net::SocketAddr>()?)
        .install()
        .expect("Failed to install Prometheus metrics exporter");
    tracing::info!(
        "Prometheus metrics available at http://{}/metrics",
        metrics_addr
    );
    // Create hypervisor registry and register backends
    let registry = Arc::new(HypervisorRegistry::new());
    // Register KVM backend (always available)
    let kvm_backend = Arc::new(KvmBackend::with_defaults());
    registry.register(kvm_backend);
    // Register FireCracker backend if kernel/rootfs paths are configured (config or env)
    let has_kernel = config.firecracker.kernel_path.is_some()
        || std::env::var_os("PLASMAVMC_FIRECRACKER_KERNEL_PATH").is_some();
    let has_rootfs = config.firecracker.rootfs_path.is_some()
        || std::env::var_os("PLASMAVMC_FIRECRACKER_ROOTFS_PATH").is_some();
    if has_kernel && has_rootfs {
        match FireCrackerBackend::from_config(&config.firecracker) {
            Ok(firecracker_backend) => {
                registry.register(Arc::new(firecracker_backend));
                tracing::info!("Registered FireCracker backend");
            }
            Err(err) => {
                // Non-fatal: the server keeps running with the KVM backend only.
                tracing::warn!("Failed to initialize FireCracker backend: {}", err);
            }
        }
    } else if has_kernel || has_rootfs {
        tracing::warn!(
            "FireCracker backend configuration incomplete: kernel_path/rootfs_path must both be set (config or env)"
        );
    } else {
        tracing::debug!("FireCracker backend not available (missing kernel/rootfs paths)");
    }
    tracing::info!("Registered hypervisors: {:?}", registry.available());
    // Initialize IAM authentication service; startup fails hard if the IAM
    // server is unreachable.
    tracing::info!(
        "Connecting to IAM server at {}",
        config.auth.iam_server_addr
    );
    let auth_service = AuthService::new(&config.auth.iam_server_addr)
        .await
        .map_err(|e| format!("Failed to connect to IAM server: {}", e))?;
    let auth_service = Arc::new(auth_service);
    // gRPC interceptors are synchronous, so bridge into the current Tokio runtime
    // from a blocking section instead of creating a nested runtime that would
    // later be dropped from async context during shutdown.
    let auth_handle = tokio::runtime::Handle::current();
    let make_interceptor = |auth: Arc<AuthService>| {
        let handle = auth_handle.clone();
        move |mut req: Request<()>| -> Result<Request<()>, Status> {
            let auth = auth.clone();
            // block_in_place keeps block_on from stalling the runtime's
            // worker thread while the async auth check runs.
            tokio::task::block_in_place(|| {
                handle.block_on(async move {
                    let tenant_context = auth.authenticate_request(&req).await?;
                    // Stash the tenant context on the request so handlers can
                    // read it from extensions later.
                    req.extensions_mut().insert(tenant_context);
                    Ok(req)
                })
            })
        }
    };
    // Create services
    let vm_service = Arc::new(
        VmServiceImpl::new(
            registry,
            auth_service.clone(),
            config.auth.iam_server_addr.clone(),
        )
        .await?,
    );
    // Optional: start state watcher for multi-instance HA sync
    if std::env::var("PLASMAVMC_STATE_WATCHER")
        .map(|v| matches!(v.as_str(), "1" | "true" | "yes"))
        .unwrap_or(false)
    {
        let config = WatcherConfig::default();
        let (watcher, rx) = StateWatcher::new(vm_service.store(), config);
        let synchronizer = StateSynchronizer::new(vm_service.clone());
        tokio::spawn(async move {
            if let Err(e) = watcher.start().await {
                tracing::error!(error = %e, "State watcher failed to start");
            }
        });
        tokio::spawn(async move {
            synchronizer.run(rx).await;
        });
        tracing::info!("State watcher enabled (PLASMAVMC_STATE_WATCHER)");
    }
    // Optional: start health monitor to refresh VM status periodically.
    // Unset, unparsable, or zero values disable the monitor.
    if let Some(secs) = std::env::var("PLASMAVMC_HEALTH_MONITOR_INTERVAL_SECS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
    {
        if secs > 0 {
            vm_service
                .clone()
                .start_health_monitor(Duration::from_secs(secs));
        }
    }
    // Optional: start node health monitor to detect stale heartbeats
    if let Some(interval_secs) = std::env::var("PLASMAVMC_NODE_HEALTH_MONITOR_INTERVAL_SECS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
    {
        if interval_secs > 0 {
            // Nodes whose last heartbeat is older than this are considered stale.
            let timeout_secs = std::env::var("PLASMAVMC_NODE_HEARTBEAT_TIMEOUT_SECS")
                .ok()
                .and_then(|v| v.parse::<u64>().ok())
                .unwrap_or(60);
            vm_service.clone().start_node_health_monitor(
                Duration::from_secs(interval_secs),
                Duration::from_secs(timeout_secs),
            );
        }
    }
    // Setup health service: mark every gRPC service as SERVING up front.
    let (mut health_reporter, health_service) = health_reporter();
    health_reporter
        .set_serving::<VmServiceServer<VmServiceImpl>>()
        .await;
    health_reporter
        .set_serving::<ImageServiceServer<VmServiceImpl>>()
        .await;
    health_reporter
        .set_serving::<VolumeServiceServer<VmServiceImpl>>()
        .await;
    health_reporter
        .set_serving::<NodeServiceServer<VmServiceImpl>>()
        .await;
    // Parse address
    let addr: SocketAddr = config.addr;
    // Translate internal volume driver kinds into their proto encoding for
    // the heartbeat payload.
    let heartbeat_volume_drivers = vm_service
        .supported_volume_drivers()
        .into_iter()
        .map(|driver| match driver {
            plasmavmc_types::VolumeDriverKind::Managed => ProtoVolumeDriverKind::Managed as i32,
            plasmavmc_types::VolumeDriverKind::CephRbd => ProtoVolumeDriverKind::CephRbd as i32,
        })
        .collect();
    let heartbeat_storage_classes = vm_service.supported_storage_classes();
    let shared_live_migration = vm_service.shared_live_migration();
    // No-op unless the heartbeat environment variables are set; see
    // start_agent_heartbeat.
    start_agent_heartbeat(
        addr,
        heartbeat_volume_drivers,
        heartbeat_storage_classes,
        shared_live_migration,
    )
    .await;
    tracing::info!("PlasmaVMC gRPC server listening on {}", addr);
    // Configure TLS if enabled
    let mut server = Server::builder();
    if let Some(tls_config) = &config.tls {
        tracing::info!("TLS enabled, loading certificates...");
        let cert = tokio::fs::read(&tls_config.cert_file).await?;
        let key = tokio::fs::read(&tls_config.key_file).await?;
        let server_identity = Identity::from_pem(cert, key);
        let tls = if tls_config.require_client_cert {
            tracing::info!("mTLS enabled");
            // mTLS additionally requires a CA bundle to verify client certs.
            let ca_cert = tokio::fs::read(
                tls_config
                    .ca_file
                    .as_ref()
                    .ok_or("ca_file required for mTLS")?,
            )
            .await?;
            let ca = Certificate::from_pem(ca_cert);
            ServerTlsConfig::new()
                .identity(server_identity)
                .client_ca_root(ca)
        } else {
            ServerTlsConfig::new().identity(server_identity)
        };
        server = server.tls_config(tls)?;
    }
    // gRPC server (clone Arc for gRPC service)
    let grpc_vm_service = Arc::clone(&vm_service);
    let grpc_server = server
        .add_service(health_service)
        .add_service(tonic::codegen::InterceptedService::new(
            VmServiceServer::from_arc(grpc_vm_service),
            make_interceptor(auth_service.clone()),
        ))
        .add_service(tonic::codegen::InterceptedService::new(
            ImageServiceServer::from_arc(Arc::clone(&vm_service)),
            make_interceptor(auth_service.clone()),
        ))
        .add_service(tonic::codegen::InterceptedService::new(
            VolumeServiceServer::from_arc(Arc::clone(&vm_service)),
            make_interceptor(auth_service.clone()),
        ))
        // NOTE(review): NodeService is registered without the auth
        // interceptor, unlike the other services — presumably so node agents
        // can heartbeat without IAM credentials; confirm this is intentional.
        .add_service(NodeServiceServer::from_arc(Arc::clone(&vm_service)))
        .serve(addr);
    // HTTP REST API server
    let http_addr = config.http_addr;
    let rest_state = plasmavmc_server::rest::RestApiState {
        vm_service: vm_service,
        auth_service: auth_service.clone(),
    };
    let rest_app = plasmavmc_server::rest::build_router(rest_state);
    let http_listener = tokio::net::TcpListener::bind(&http_addr).await?;
    tracing::info!("PlasmaVMC HTTP REST API server starting on {}", http_addr);
    let http_server = async move {
        axum::serve(http_listener, rest_app)
            .await
            .map_err(|e| format!("HTTP server error: {}", e))
    };
    // Run both servers concurrently; when either future completes (normally
    // only on error) select! drops the other, shutting it down.
    tokio::select! {
        result = grpc_server => {
            result?;
        }
        result = http_server => {
            result?;
        }
    }
    Ok(())
}