diff --git a/docs/por/T027-production-hardening/task.yaml b/docs/por/T027-production-hardening/task.yaml new file mode 100644 index 0000000..ce2f64e --- /dev/null +++ b/docs/por/T027-production-hardening/task.yaml @@ -0,0 +1,73 @@ +id: T027 +name: Production Hardening +goal: Transform MVP stack into a production-grade, observable, and highly available platform. +status: active +priority: P1 +owner: peerB +created: 2025-12-10 +depends_on: [T026] +blocks: [] + +context: | + With MVP functionality verified (T026), the platform must be hardened for + production usage. This involves ensuring high availability (HA), comprehensive + observability (metrics/logs), and security (TLS). + + This task focuses on Non-Functional Requirements (NFRs). Functional gaps + (deferred P1s) will be handled in T028. + +acceptance: + - All components use a unified configuration approach (clap + config file or env) + - Full observability stack (Prometheus/Grafana/Loki) operational via NixOS + - All services exporting metrics and logs to the stack + - Chainfire and FlareDB verified in 3-node HA cluster + - TLS enabled for all inter-service communication (optional for internal, required for external) + - Chaos testing (kill node, verify recovery) passed + - Ops documentation (Backup/Restore, Upgrade) created + +steps: + - step: S0 + name: Config Unification + done: All components use unified configuration (clap + config file/env) + status: complete + owner: peerB + priority: P0 + + - step: S1 + name: Observability Stack + done: Prometheus, Grafana, and Loki deployed and scraping targets + status: pending + owner: peerB + priority: P0 + + - step: S2 + name: Service Telemetry Integration + done: All components (Chainfire, FlareDB, IAM, k8shost) dashboards functional + status: pending + owner: peerB + priority: P0 + + - step: S3 + name: HA Clustering Verification + done: 3-node Chainfire/FlareDB cluster survives single node failure + status: pending + owner: peerB + priority: P0 + + - step: S4 + name: Security Hardening + done: mTLS/TLS enabled where appropriate, secrets management verified + status: pending + owner: peerB + priority: P1 + + - step: S5 + name: Ops Documentation + done: Runbooks for common operations (Scale out, Restore, Upgrade) + status: pending + owner: peerB + priority: P1 + +evidence: [] +notes: | + Separated from functional feature work (T028). diff --git a/flaredb/crates/flaredb-server/Cargo.toml b/flaredb/crates/flaredb-server/Cargo.toml index 0674b4d..7163565 100644 --- a/flaredb/crates/flaredb-server/Cargo.toml +++ b/flaredb/crates/flaredb-server/Cargo.toml @@ -15,7 +15,7 @@ prost.workspace = true clap.workspace = true openraft.workspace = true serde.workspace = true -serde_json.workspace = true +toml.workspace = true async-trait.workspace = true tracing.workspace = true tracing-subscriber.workspace = true diff --git a/flaredb/crates/flaredb-server/src/config.rs b/flaredb/crates/flaredb-server/src/config.rs new file mode 100644 index 0000000..2bacc06 --- /dev/null +++ b/flaredb/crates/flaredb-server/src/config.rs @@ -0,0 +1,76 @@ +//! Server configuration + +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use std::path::PathBuf; +use std::net::SocketAddr; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub enum ConsistencyMode { + Strong, + Eventual, +} + +impl Default for ConsistencyMode { + fn default() -> Self { + ConsistencyMode::Strong + } +} + +/// Server configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ServerConfig { + /// Unique store ID + pub store_id: u64, + /// Listen address for gRPC API + pub addr: SocketAddr, + /// Data directory for RocksDB + pub data_dir: PathBuf, + /// ChainFire PD address + pub pd_addr: SocketAddr, + /// Initial cluster peers in id=host:port format + pub peers: HashMap, + /// Namespace consistency modes + pub namespace_modes: HashMap, +} + +impl Default for ServerConfig { + fn default() -> Self { + let mut default_peers = HashMap::new(); + default_peers.insert(1, "127.0.0.1:50051".parse().unwrap()); + + let mut default_namespace_modes = HashMap::new(); + default_namespace_modes.insert("default".to_string(), ConsistencyMode::Strong); + + Self { + store_id: 1, + addr: "127.0.0.1:50051".parse().unwrap(), + data_dir: PathBuf::from("./data"), + pd_addr: "127.0.0.1:2379".parse().unwrap(), + peers: default_peers, + namespace_modes: default_namespace_modes, + } + } +} + +// Helper function to parse namespace modes from command line strings +pub fn parse_namespace_modes( + modes: &[String], +) -> Result, String> { + let mut namespace_map = HashMap::new(); + for mode_str in modes { + let parts: Vec<&str> = mode_str.split('=').collect(); + if parts.len() == 2 { + let namespace = parts[0].to_string(); + let mode = match parts[1].to_lowercase().as_str() { + "strong" => ConsistencyMode::Strong, + "eventual" => ConsistencyMode::Eventual, + _ => return Err(format!("Invalid consistency mode: {}", parts[1])), + }; + namespace_map.insert(namespace, mode); + } else { + return Err(format!("Invalid namespace mode format: {}", mode_str)); + } + } + Ok(namespace_map) +} diff --git a/flaredb/crates/flaredb-server/src/main.rs b/flaredb/crates/flaredb-server/src/main.rs index 72846f2..4b5e12d 100644 --- a/flaredb/crates/flaredb-server/src/main.rs +++ b/flaredb/crates/flaredb-server/src/main.rs @@ -2,9 +2,11 @@ use clap::Parser; use flaredb_proto::kvrpc::kv_cas_server::KvCasServer; use flaredb_proto::kvrpc::kv_raw_server::KvRawServer; use flaredb_proto::raft_server::raft_service_server::RaftServiceServer; +use flaredb_server::config::{self, ServerConfig}; use flaredb_storage::rocks_engine::RocksEngine; use flaredb_types::RegionMeta; use std::collections::HashMap; +use std::path::PathBuf; use std::sync::Arc; use tokio::sync::Mutex; use tokio::time::{sleep, Duration}; @@ -13,14 +15,11 @@ use tonic_health::server::health_reporter; use tracing::info; use tracing_subscriber::EnvFilter; -mod config; mod heartbeat; mod merkle; mod pd_client; mod raft_service; mod service; - -use pd_client::PdEvent; mod store; use pd_client::PdClient; @@ -28,23 +27,31 @@ use pd_client::PdClient; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { - #[arg(long, default_value = "1")] - store_id: u64, + /// Configuration file path + #[arg(short, long, default_value = "flaredb.toml")] + config: PathBuf, - #[arg(long, default_value = "127.0.0.1:50051")] - addr: String, + /// Store ID (overrides config) + #[arg(long)] + store_id: Option, - #[arg(long, default_value = "data")] - data_dir: String, + /// Listen address for gRPC API (overrides config) + #[arg(long)] + addr: Option, - #[arg(long, default_value = "127.0.0.1:2379")] - pd_addr: String, + /// Data directory for RocksDB (overrides config) + #[arg(long)] + data_dir: Option, - /// Peers in format id=host:port (repeatable) + /// ChainFire PD address (overrides config) + #[arg(long)] + pd_addr: Option, + + /// Initial cluster peers in id=host:port format (overrides config) #[arg(long = "peer")] peers: Vec, - /// Namespace modes in format namespace=strong|eventual (repeatable) + /// Namespace modes in format namespace=strong|eventual (overrides config) #[arg(long = "namespace-mode")] namespace_modes: Vec, } @@ -57,40 +64,67 @@ async fn main() -> Result<(), Box> { .init(); let args = Args::parse(); - let addr = args.addr.parse()?; + + // Load configuration from file or use defaults + let mut config = if args.config.exists() { + let contents = tokio::fs::read_to_string(&args.config).await?; + toml::from_str(&contents)? + } else { + info!("Config file not found: {}, using defaults", args.config.display()); + ServerConfig::default() + }; + + // Apply command line overrides + if let Some(store_id) = args.store_id { + config.store_id = store_id; + } + if let Some(addr_str) = args.addr { + config.addr = addr_str.parse()?; + } + if let Some(data_dir) = args.data_dir { + config.data_dir = data_dir; + } + if let Some(pd_addr_str) = args.pd_addr { + config.pd_addr = pd_addr_str.parse()?; + } + + for p in args.peers { + if let Some((id_str, addr_str)) = p.split_once('=') { + if let Ok(id) = id_str.parse::() { + config.peers.insert(id, addr_str.parse()?); + } + } + } + + if !args.namespace_modes.is_empty() { + config.namespace_modes = config::parse_namespace_modes(&args.namespace_modes).unwrap_or_else(|e| { + eprintln!("Failed to parse namespace modes from command line: {}", e); + std::process::exit(1); + }); + } + + let addr = config.addr; info!("rdb-server listening on {}", addr); - // Build namespace consistency config - let namespace_map = config::parse_namespace_modes(&args.namespace_modes).unwrap_or_else(|e| { - eprintln!("Failed to parse namespace modes: {}", e); - std::process::exit(1); - }); - let server_config = Arc::new(config::ServerConfig::new( - config::ConsistencyMode::Strong, - namespace_map, - )); + let server_config = Arc::new(config); // Parse peer addresses for cluster membership - let mut voters = vec![args.store_id]; + let mut voters = vec![server_config.store_id]; let mut peer_addrs: HashMap = HashMap::new(); // Add self address - peer_addrs.insert(args.store_id, args.addr.clone()); - for p in &args.peers { - if let Some((id_str, addr)) = p.split_once('=') { - if let Ok(id) = id_str.parse::() { - if id != args.store_id { - voters.push(id); - peer_addrs.insert(id, addr.to_string()); - } - } + peer_addrs.insert(server_config.store_id, server_config.addr.to_string()); + for (id, addr) in server_config.peers.clone() { + if id != server_config.store_id { + voters.push(id); + peer_addrs.insert(id, addr.to_string()); } } let peer_addrs = Arc::new(peer_addrs); - let engine = Arc::new(RocksEngine::new(&args.data_dir)?); + let engine = Arc::new(RocksEngine::new(server_config.data_dir.to_str().unwrap())?); let store = Arc::new(store::Store::new( - args.store_id, + server_config.store_id, engine.clone(), server_config.clone(), peer_addrs.clone(), @@ -99,8 +133,8 @@ async fn main() -> Result<(), Box> { let service = service::KvServiceImpl::new(engine.clone(), server_config.clone(), store.clone()); let raft_service = raft_service::RaftServiceImpl::new(store.clone(), args.store_id); - println!("Connecting to ChainFire PD at {}...", args.pd_addr); - let pd_client_res = PdClient::connect(args.pd_addr.clone()).await; + println!("Connecting to ChainFire PD at {}...", server_config.pd_addr); + let pd_client_res = PdClient::connect(server_config.pd_addr.to_string()).await; if let Ok(mut pd_client) = pd_client_res { println!( @@ -110,12 +144,12 @@ async fn main() -> Result<(), Box> { // Register this store with the PD if let Err(e) = pd_client - .register_store(args.store_id, args.addr.clone()) + .register_store(server_config.store_id, server_config.addr.to_string()) .await { eprintln!("Failed to register store: {}", e); } else { - println!("Store {} registered with PD", args.store_id); + println!("Store {} registered with PD", server_config.store_id); } // Start watching for metadata changes from ChainFire @@ -139,10 +173,9 @@ async fn main() -> Result<(), Box> { start_key: region.start_key, end_key: region.end_key, }, - if region.peers.is_empty() { - vec![store_clone_for_events.store_id()] - } else { - region.peers + if region.peers.is_empty() { + vec![store_id] + } else { region.peers }, )]; if let Err(e) = store_clone_for_events.refresh_regions(metas).await { @@ -202,12 +235,12 @@ async fn main() -> Result<(), Box> { // Background task: heartbeat and refresh regions from PD let store_clone = store.clone(); - let pd_addr_clone = args.pd_addr.clone(); - let store_id = args.store_id; - let server_addr = args.addr.clone(); + let pd_addr_string = server_config.pd_addr.to_string(); + let store_id = server_config.store_id; + let server_addr_string = server_config.addr.to_string(); tokio::spawn(async move { let client = Arc::new(Mutex::new( - PdClient::connect(pd_addr_clone.clone()).await.ok(), + PdClient::connect(pd_addr_string.clone()).await.ok(), )); loop { @@ -217,7 +250,7 @@ async fn main() -> Result<(), Box> { if let Some(ref mut c) = *guard { // Send heartbeat let heartbeat_ok = - match c.heartbeat(store_id, server_addr.clone()).await { + match c.heartbeat(store_id, server_addr_string.clone()).await { Ok(_) => true, Err(e) => { eprintln!("Heartbeat failed: {}", e); @@ -249,7 +282,7 @@ async fn main() -> Result<(), Box> { .into_iter() .map(|r| { let region_voters = if r.peers.is_empty() { - vec![store_clone.store_id()] + vec![store_id] } else { r.peers.clone() }; @@ -271,7 +304,7 @@ async fn main() -> Result<(), Box> { } else { // Try to reconnect if let Some(new_client) = - PdClient::connect(pd_addr_clone.clone()).await.ok() + PdClient::connect(pd_addr_string.clone()).await.ok() { println!("Reconnected to PD"); *guard = Some(new_client); diff --git a/plasmavmc/crates/plasmavmc-firecracker/src/lib.rs b/plasmavmc/crates/plasmavmc-firecracker/src/lib.rs index bbdde5f..2879bf8 100644 --- a/plasmavmc/crates/plasmavmc-firecracker/src/lib.rs +++ b/plasmavmc/crates/plasmavmc-firecracker/src/lib.rs @@ -14,6 +14,7 @@ use env::{ }; use api::FireCrackerClient; use plasmavmc_hypervisor::{BackendCapabilities, HypervisorBackend, UnsupportedReason}; +use plasmavmc_server::config::FireCrackerConfig; use plasmavmc_types::{ DiskBus, DiskSpec, Error, HypervisorType, NetworkSpec, NicModel, Result, VirtualMachine, VmHandle, VmSpec, VmStatus, VmState, @@ -46,6 +47,26 @@ pub struct FireCrackerBackend { } impl FireCrackerBackend { + /// Create a new FireCracker backend from provided configuration. + /// Returns Some(Self) if kernel_path and rootfs_path are provided (either in config or env), + /// otherwise returns None. + pub fn new_from_config(config: &FireCrackerConfig) -> Option { + let kernel_path = config.kernel_path.clone().or_else(resolve_kernel_path)?; + let rootfs_path = config.rootfs_path.clone().or_else(resolve_rootfs_path)?; + + Some(Self { + firecracker_path: config.firecracker_path.clone().unwrap_or_else(resolve_firecracker_path), + jailer_path: config.jailer_path.clone().or_else(resolve_jailer_path), + runtime_dir: config.runtime_dir.clone().unwrap_or_else(resolve_runtime_dir), + socket_base_path: config.socket_base_path.clone().unwrap_or_else(resolve_socket_base_path), + kernel_path, + rootfs_path, + initrd_path: config.initrd_path.clone().or_else(resolve_initrd_path), + boot_args: config.boot_args.clone().unwrap_or_else(resolve_boot_args), + use_jailer: config.use_jailer.unwrap_or_else(resolve_use_jailer), + }) + } + /// Create a new FireCracker backend from environment variables pub fn from_env() -> Result { let kernel_path = resolve_kernel_path().ok_or_else(|| { diff --git a/plasmavmc/crates/plasmavmc-server/Cargo.toml b/plasmavmc/crates/plasmavmc-server/Cargo.toml index 0be4a4b..dfdd476 100644 --- a/plasmavmc/crates/plasmavmc-server/Cargo.toml +++ b/plasmavmc/crates/plasmavmc-server/Cargo.toml @@ -29,6 +29,7 @@ clap = { workspace = true } dashmap = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } +toml = { workspace = true } chainfire-client = { path = "../../../chainfire/chainfire-client" } flaredb-client = { path = "../../../flaredb/crates/flaredb-client" } novanet-api = { path = "../../../novanet/crates/novanet-api" } diff --git a/plasmavmc/crates/plasmavmc-server/src/config.rs b/plasmavmc/crates/plasmavmc-server/src/config.rs new file mode 100644 index 0000000..d8c6828 --- /dev/null +++ b/plasmavmc/crates/plasmavmc-server/src/config.rs @@ -0,0 +1,85 @@ +//! Server configuration + +use serde::{Deserialize, Serialize}; +use std::net::SocketAddr; +use std::path::PathBuf; + +/// Server configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ServerConfig { + /// Address to listen on + pub addr: SocketAddr, + /// Log level + pub log_level: String, + /// Configuration for KVM backend + #[serde(default)] + pub kvm: KvmConfig, + /// Configuration for FireCracker backend + #[serde(default)] + pub firecracker: FireCrackerConfig, +} + +/// KVM backend configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct KvmConfig { + // Add KVM specific configuration fields here if any, e.g., + // pub some_kvm_setting: String, +} + +impl Default for KvmConfig { + fn default() -> Self { + Self { + // Default values for KVM config + } + } +} + +/// FireCracker backend configuration +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FireCrackerConfig { + /// Path to the Firecracker binary + pub firecracker_path: Option, + /// Path to Jailer binary (optional) + pub jailer_path: Option, + /// Runtime directory for VM state + pub runtime_dir: Option, + /// Base path for FireCracker API sockets + pub socket_base_path: Option, + /// Kernel image path + pub kernel_path: Option, + /// Path to the Firecracker rootfs image + pub rootfs_path: Option, + /// Initrd image path (optional) + pub initrd_path: Option, + /// Boot arguments + pub boot_args: Option, + /// Use jailer for security + pub use_jailer: Option, +} + +impl Default for FireCrackerConfig { + fn default() -> Self { + Self { + firecracker_path: None, + jailer_path: None, + runtime_dir: None, + socket_base_path: None, + kernel_path: None, + rootfs_path: None, + initrd_path: None, + boot_args: None, + use_jailer: None, + } + } +} + +impl Default for ServerConfig { + fn default() -> Self { + Self { + addr: "0.0.0.0:8080".parse().unwrap(), + log_level: "info".to_string(), + kvm: KvmConfig::default(), + firecracker: FireCrackerConfig::default(), + } + } +} \ No newline at end of file diff --git a/plasmavmc/crates/plasmavmc-server/src/main.rs b/plasmavmc/crates/plasmavmc-server/src/main.rs index 7cadcf2..5081b80 100644 --- a/plasmavmc/crates/plasmavmc-server/src/main.rs +++ b/plasmavmc/crates/plasmavmc-server/src/main.rs @@ -5,8 +5,10 @@ use plasmavmc_api::proto::vm_service_server::VmServiceServer; use plasmavmc_hypervisor::HypervisorRegistry; use plasmavmc_kvm::KvmBackend; use plasmavmc_firecracker::FireCrackerBackend; +use plasmavmc_server::config::{self, ServerConfig}; use plasmavmc_server::VmServiceImpl; use std::net::SocketAddr; +use std::path::PathBuf; use std::sync::Arc; use tonic::transport::Server; use tonic_health::server::health_reporter; @@ -16,27 +18,62 @@ use tracing_subscriber::EnvFilter; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] struct Args { - /// Address to listen on - #[arg(short, long, default_value = "0.0.0.0:8080")] - addr: String, + /// Configuration file path + #[arg(short, long, default_value = "plasmavmc.toml")] + config: PathBuf, - /// Log level - #[arg(short, long, default_value = "info")] - log_level: String, + /// Address to listen on (overrides config) + #[arg(short, long)] + addr: Option, + + /// Log level (overrides config) + #[arg(short, long)] + log_level: Option, + + /// Path to the Firecracker kernel image (overrides config) + #[arg(long)] + firecracker_kernel_path: Option, + + /// Path to the Firecracker rootfs image (overrides config) + #[arg(long)] + firecracker_rootfs_path: Option, } #[tokio::main] async fn main() -> Result<(), Box> { let args = Args::parse(); + // Load configuration from file or use defaults + let mut config = if args.config.exists() { + let contents = tokio::fs::read_to_string(&args.config).await?; + toml::from_str(&contents)? + } else { + tracing::info!("Config file not found: {}, using defaults", args.config.display()); + ServerConfig::default() + }; + + // Apply command line overrides + if let Some(addr_str) = args.addr { + config.addr = addr_str.parse()?; + } + if let Some(log_level) = args.log_level { + config.log_level = log_level; + } + if let Some(kernel_path) = args.firecracker_kernel_path { + config.firecracker.kernel_path = Some(kernel_path); + } + if let Some(rootfs_path) = args.firecracker_rootfs_path { + config.firecracker.rootfs_path = Some(rootfs_path); + } + // Initialize tracing tracing_subscriber::fmt() .with_env_filter( - EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&args.log_level)), + EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new(&config.log_level)), ) .init(); - tracing::info!("Starting PlasmaVMC server on {}", args.addr); + tracing::info!("Starting PlasmaVMC server on {}", config.addr); // Create hypervisor registry and register backends let registry = Arc::new(HypervisorRegistry::new()); @@ -46,7 +83,7 @@ async fn main() -> Result<(), Box> { registry.register(kvm_backend); // Register FireCracker backend if kernel/rootfs paths are configured - if let Ok(firecracker_backend) = FireCrackerBackend::from_env() { + if let Some(firecracker_backend) = FireCrackerBackend::new_from_config(&config.firecracker) { registry.register(Arc::new(firecracker_backend)); tracing::info!("Registered FireCracker backend"); } else { @@ -68,7 +105,7 @@ async fn main() -> Result<(), Box> { .await; // Parse address - let addr: SocketAddr = args.addr.parse()?; + let addr: SocketAddr = config.addr; tracing::info!("PlasmaVMC server listening on {}", addr);