// Source: photoncloud-monorepo/plasmavmc/crates/plasmavmc-kvm/src/lib.rs

//! KVM/QEMU hypervisor backend for PlasmaVMC
//!
//! This crate provides the KVM backend implementation for the HypervisorBackend trait.
//! It uses QEMU with KVM acceleration to run virtual machines.
mod env;
mod qmp;
use async_trait::async_trait;
use env::{
resolve_kernel_initrd, resolve_nbd_aio_mode, resolve_nbd_max_queues, resolve_qcow2_path,
resolve_qemu_path, resolve_qmp_timeout_secs, resolve_runtime_dir, ENV_QCOW2_PATH,
};
use nix::sys::signal::{kill as nix_kill, Signal};
use nix::unistd::Pid;
use plasmavmc_hypervisor::{BackendCapabilities, HypervisorBackend, UnsupportedReason};
use plasmavmc_types::{
AttachedDisk, DiskAttachment, DiskBus, DiskCache, Error, HypervisorType, NetworkSpec, NicModel,
Result, VirtualMachine, VmHandle, VmSpec, VmState, VmStatus, VolumeFormat,
};
use qmp::QmpClient;
use serde_json::{json, Value};
use std::path::{Path, PathBuf};
use std::time::Duration;
use tokio::process::Command;
use tokio::{net::UnixStream, time::Instant};
/// KVM/QEMU hypervisor backend
///
/// Spawns one QEMU process per VM and drives its lifecycle over a per-VM
/// QMP Unix socket stored under `runtime_dir`.
pub struct KvmBackend {
    /// Path to QEMU binary (passed through `env::resolve_qemu_path` before spawning,
    /// which may substitute an environment override)
    qemu_path: PathBuf,
    /// Runtime directory for VM state (QMP socket, console log), one subdir per VM id
    runtime_dir: PathBuf,
}
impl KvmBackend {
/// Create a new KVM backend
pub fn new(qemu_path: impl Into<PathBuf>, runtime_dir: impl Into<PathBuf>) -> Self {
Self {
qemu_path: qemu_path.into(),
runtime_dir: runtime_dir.into(),
}
}
/// Create with default paths
pub fn with_defaults() -> Self {
Self::new("/usr/bin/qemu-system-x86_64", resolve_runtime_dir())
}
fn qmp_socket_path(&self, handle: &VmHandle) -> PathBuf {
if let Some(path) = handle.backend_state.get("qmp_socket") {
PathBuf::from(path)
} else {
PathBuf::from(&handle.runtime_dir).join("qmp.sock")
}
}
}
/// Map a volume format to the driver name QEMU's blockdev layer expects.
fn volume_format_name(format: VolumeFormat) -> &'static str {
    match format {
        VolumeFormat::Qcow2 => "qcow2",
        VolumeFormat::Raw => "raw",
    }
}
/// Cache mode actually passed to QEMU for this disk.
///
/// Shared NBD-backed volumes perform better and behave more predictably
/// with direct I/O than with host-side writeback caching, so writeback is
/// coerced to none for them; every other combination is passed through.
fn effective_disk_cache(disk: &AttachedDisk) -> DiskCache {
    let nbd_writeback = matches!(disk.attachment, DiskAttachment::Nbd { .. })
        && matches!(disk.cache, DiskCache::Writeback);
    if nbd_writeback {
        DiskCache::None
    } else {
        disk.cache
    }
}
/// Choose the QEMU `aio` mode for a disk, or `None` when the driver
/// (Ceph RBD) manages I/O itself.
fn disk_aio_mode(disk: &AttachedDisk) -> Option<&'static str> {
    match &disk.attachment {
        // Local files: native AIO only works with direct I/O (cache=none).
        DiskAttachment::File { .. } => {
            if matches!(disk.cache, DiskCache::None) {
                Some("native")
            } else {
                Some("threads")
            }
        }
        DiskAttachment::Nbd { .. } => Some(resolve_nbd_aio_mode()),
        DiskAttachment::CephRbd { .. } => None,
    }
}
/// True when the disk gets its own QEMU iothread: only NBD-backed disks on
/// the virtio bus qualify.
fn disk_uses_dedicated_iothread(disk: &AttachedDisk) -> bool {
    let is_nbd = matches!(disk.attachment, DiskAttachment::Nbd { .. });
    let is_virtio = matches!(disk.bus, DiskBus::Virtio);
    is_nbd && is_virtio
}
/// Number of virtio queues for a disk: 1 unless it runs on a dedicated
/// iothread, in which case it scales with vCPU count up to the configured
/// NBD queue cap (never below 1).
fn disk_queue_count(vm: &VirtualMachine, disk: &AttachedDisk) -> u16 {
    if !disk_uses_dedicated_iothread(disk) {
        return 1;
    }
    let cap = resolve_nbd_max_queues().max(1) as u32;
    vm.spec.cpu.vcpus.clamp(1, cap) as u16
}
/// Normalize `value` into a QEMU-safe device-id component: every character
/// that is not ASCII alphanumeric becomes `-`; an empty input yields the
/// fallback `disk-<fallback_index>`.
fn sanitize_device_component(value: &str, fallback_index: usize) -> String {
    let mut sanitized = String::with_capacity(value.len());
    for ch in value.chars() {
        sanitized.push(if ch.is_ascii_alphanumeric() { ch } else { '-' });
    }
    if sanitized.is_empty() {
        return format!("disk-{fallback_index}");
    }
    sanitized
}
/// Render the `,bootindex=N` device suffix; indices of 0 or `None` produce
/// an empty string (no boot ordering requested).
fn bootindex_suffix(boot_index: Option<u32>) -> String {
    match boot_index {
        Some(index) if index > 0 => format!(",bootindex={index}"),
        _ => String::new(),
    }
}
/// Timeout applied to QMP socket waits, resolved from the environment.
fn qmp_timeout() -> Duration {
    let secs = resolve_qmp_timeout_secs();
    Duration::from_secs(secs)
}
fn disk_cache_json(cache: DiskCache) -> Value {
json!({
"direct": matches!(cache, DiskCache::None),
"no-flush": false
})
}
/// Validate a Ceph identifier (pool/image/user name) before embedding it in
/// a QEMU blockdev definition.
///
/// Rules: non-empty, starts with an ASCII alphanumeric character, and the
/// remainder may only contain ASCII alphanumerics plus `.`, `_`, `-`.
///
/// # Errors
/// Returns `Error::HypervisorError` describing which rule was violated for
/// `field_name`.
fn validate_ceph_component(field_name: &str, value: &str) -> Result<()> {
    let mut chars = value.chars();
    // An empty value has no first character, so the previous separate
    // `is_empty()` guard (which returned the identical error) is folded in here.
    let Some(first) = chars.next() else {
        return Err(Error::HypervisorError(format!("{field_name} is required")));
    };
    if !first.is_ascii_alphanumeric() {
        return Err(Error::HypervisorError(format!(
            "{field_name} must start with an ASCII alphanumeric character"
        )));
    }
    if chars.any(|ch| !(ch.is_ascii_alphanumeric() || matches!(ch, '.' | '_' | '-'))) {
        return Err(Error::HypervisorError(format!(
            "{field_name} contains unsupported characters"
        )));
    }
    Ok(())
}
fn parse_host_port(authority: &str, default_port: u16) -> Result<(String, u16)> {
if let Some(rest) = authority.strip_prefix('[') {
let (host, tail) = rest
.split_once(']')
.ok_or_else(|| Error::HypervisorError("invalid IPv6 authority".into()))?;
let port = tail
.strip_prefix(':')
.and_then(|value| value.parse::<u16>().ok())
.unwrap_or(default_port);
return Ok((host.to_string(), port));
}
if let Some((host, port)) = authority.rsplit_once(':') {
if !host.is_empty() {
if let Ok(port) = port.parse::<u16>() {
return Ok((host.to_string(), port));
}
}
}
Ok((authority.to_string(), default_port))
}
fn parse_nbd_uri(uri: &str) -> Result<(String, u16, Option<String>)> {
let remainder = uri
.strip_prefix("nbd://")
.ok_or_else(|| Error::HypervisorError(format!("unsupported NBD URI: {uri}")))?;
if remainder.contains('@') || remainder.contains('?') || remainder.contains('#') {
return Err(Error::HypervisorError(format!(
"unsupported NBD URI components: {uri}"
)));
}
let (authority, path) = remainder.split_once('/').unwrap_or((remainder, ""));
let (host, port) = parse_host_port(authority, 10809)?;
if host.is_empty() {
return Err(Error::HypervisorError(format!(
"missing NBD host in URI: {uri}"
)));
}
let export = (!path.is_empty()).then(|| path.to_string());
Ok((host, port, export))
}
fn parse_ceph_monitor(monitor: &str) -> Result<Value> {
let (host, port) = parse_host_port(monitor, 6789)?;
if host.is_empty() {
return Err(Error::HypervisorError(format!(
"invalid Ceph monitor address: {monitor}"
)));
}
Ok(json!({
"type": "inet",
"host": host,
"port": port.to_string()
}))
}
/// Build the serialized JSON for one `-blockdev` argument.
///
/// The result is a format-layer node named `drive-{disk_id}` (qcow2/raw)
/// whose `file` child is the protocol layer (local file, NBD client, or
/// Ceph RBD); the matching `-device` entry references the node name.
///
/// # Errors
/// Returns `Error::HypervisorError` for malformed NBD URIs, invalid Ceph
/// identifiers, or unparseable monitor addresses.
fn disk_blockdev_arg(disk: &AttachedDisk, disk_id: &str) -> Result<String> {
    let effective_cache = effective_disk_cache(disk);
    let aio_mode = disk_aio_mode(disk);
    // Protocol layer: depends on where the disk's bytes actually live.
    let file = match &disk.attachment {
        DiskAttachment::File { path, .. } => {
            let mut file = json!({
                "driver": "file",
                "filename": path
            });
            // aio is only set when a mode applies (see disk_aio_mode).
            if let Some(aio_mode) = aio_mode {
                file["aio"] = json!(aio_mode);
            }
            file
        }
        DiskAttachment::Nbd { uri, .. } => {
            let (host, port, export) = parse_nbd_uri(uri)?;
            let mut nbd = json!({
                "driver": "nbd",
                "server": {
                    "type": "inet",
                    "host": host,
                    "port": port.to_string()
                }
            });
            // Export name is optional in the URI; omit the key when absent.
            if let Some(export) = export {
                nbd["export"] = json!(export);
            }
            if let Some(aio_mode) = aio_mode {
                nbd["aio"] = json!(aio_mode);
            }
            nbd
        }
        DiskAttachment::CephRbd {
            pool,
            image,
            monitors,
            user,
            ..
        } => {
            // Identifiers are interpolated into the blockdev definition, so
            // validate them before use.
            validate_ceph_component("ceph pool", pool)?;
            validate_ceph_component("ceph image", image)?;
            if !user.is_empty() {
                validate_ceph_component("ceph user", user)?;
            }
            // Fail on the first bad monitor address.
            let servers: Vec<Value> = monitors
                .iter()
                .map(|monitor| parse_ceph_monitor(monitor))
                .collect::<Result<Vec<_>>>()?;
            let mut rbd = json!({
                "driver": "rbd",
                "pool": pool,
                "image": image,
                "server": servers
            });
            if !user.is_empty() {
                rbd["user"] = json!(user);
            }
            rbd
        }
    };
    // Format layer: RBD images are always treated as raw; file/NBD use the
    // format recorded on the attachment.
    let format_driver = match &disk.attachment {
        DiskAttachment::File { format, .. } | DiskAttachment::Nbd { format, .. } => {
            volume_format_name(*format)
        }
        DiskAttachment::CephRbd { .. } => "raw",
    };
    Ok(json!({
        "node-name": format!("drive-{disk_id}"),
        "driver": format_driver,
        "read-only": disk.read_only,
        "cache": disk_cache_json(effective_cache),
        "file": file
    })
    .to_string())
}
fn build_disk_args(vm: &VirtualMachine, disks: &[AttachedDisk]) -> Result<Vec<String>> {
if disks.is_empty() && vm.spec.disks.is_empty() {
let qcow_path = resolve_qcow2_path().ok_or_else(|| {
Error::HypervisorError(format!(
"{ENV_QCOW2_PATH} not set; provide qcow2 image to spawn VM"
))
})?;
if !qcow_path.exists() {
return Err(Error::HypervisorError(format!(
"Primary disk is not materialized at {}",
qcow_path.display()
)));
}
return Ok(vec![
"-blockdev".into(),
json!({
"node-name": "drive-root",
"driver": "qcow2",
"read-only": false,
"cache": disk_cache_json(DiskCache::Writeback),
"file": {
"driver": "file",
"filename": qcow_path.display().to_string()
}
})
.to_string(),
"-device".into(),
"virtio-blk-pci,drive=drive-root,id=disk-root".into(),
]);
}
let mut args = Vec::new();
let has_scsi = vm
.spec
.disks
.iter()
.any(|disk| matches!(disk.bus, DiskBus::Scsi));
let has_ahci = vm
.spec
.disks
.iter()
.any(|disk| matches!(disk.bus, DiskBus::Ide | DiskBus::Sata));
if has_scsi {
args.push("-device".into());
args.push("virtio-scsi-pci,id=scsi0".into());
}
if has_ahci {
args.push("-device".into());
args.push("ich9-ahci,id=ahci0".into());
}
let mut disks: Vec<&AttachedDisk> = disks.iter().collect();
disks.sort_by(|lhs, rhs| {
lhs.boot_index
.unwrap_or(u32::MAX)
.cmp(&rhs.boot_index.unwrap_or(u32::MAX))
.then_with(|| lhs.id.cmp(&rhs.id))
});
let mut scsi_slot = 0usize;
let mut ahci_slot = 0usize;
for (index, disk) in disks.into_iter().enumerate() {
let disk_id = sanitize_device_component(&disk.id, index);
if disk_uses_dedicated_iothread(disk) {
args.push("-object".into());
args.push(format!("iothread,id=iothread-{disk_id}"));
}
args.push("-blockdev".into());
args.push(disk_blockdev_arg(disk, &disk_id)?);
let bootindex = bootindex_suffix(disk.boot_index);
let device_arg = match disk.bus {
DiskBus::Virtio => {
let mut device_arg =
format!("virtio-blk-pci,drive=drive-{disk_id},id=disk-{disk_id}");
if disk_uses_dedicated_iothread(disk) {
let queues = disk_queue_count(vm, disk);
device_arg.push_str(&format!(
",iothread=iothread-{disk_id},num-queues={queues},queue-size=1024"
));
}
device_arg.push_str(&bootindex);
device_arg
}
DiskBus::Scsi => {
let slot = scsi_slot;
scsi_slot += 1;
format!(
"scsi-hd,drive=drive-{disk_id},id=disk-{disk_id},bus=scsi0.0,channel=0,scsi-id={slot},lun=0{bootindex}"
)
}
DiskBus::Ide | DiskBus::Sata => {
if ahci_slot >= 6 {
return Err(Error::HypervisorError(
"Too many IDE/SATA disks for a single AHCI controller".into(),
));
}
let slot = ahci_slot;
ahci_slot += 1;
format!(
"ide-hd,drive=drive-{disk_id},id=disk-{disk_id},bus=ahci0.{slot}{bootindex}"
)
}
};
args.push("-device".into());
args.push(device_arg);
}
Ok(args)
}
/// Build a minimal QEMU argument list for paused launch with QMP socket.
///
/// The VM is started with `-S` (paused) so the caller can finish setup and
/// resume it via QMP `cont`. Serial output is captured to `console_log`;
/// the QMP server listens on `qmp_socket` without blocking startup
/// (`wait=off`). Direct kernel boot args are emitted only when `kernel` is
/// provided; `initrd` is ignored unless `kernel` is also set.
fn build_qemu_args(
    vm: &VirtualMachine,
    disks: &[AttachedDisk],
    qmp_socket: &Path,
    console_log: &Path,
    kernel: Option<&Path>,
    initrd: Option<&Path>,
) -> Result<Vec<String>> {
    let mut args = vec![
        "-machine".into(),
        "q35,accel=kvm".into(),
        "-name".into(),
        vm.name.clone(),
        "-m".into(),
        vm.spec.memory.size_mib.to_string(),
        "-smp".into(),
        vm.spec.cpu.vcpus.to_string(),
        "-cpu".into(),
        // Default to host CPU passthrough when no explicit model is set.
        vm.spec
            .cpu
            .cpu_model
            .clone()
            .unwrap_or_else(|| "host".into()),
        "-enable-kvm".into(),
        "-nographic".into(),
        "-display".into(),
        "none".into(),
        "-monitor".into(),
        "none".into(),
        "-qmp".into(),
        format!("unix:{},server=on,wait=off", qmp_socket.display()),
        "-serial".into(),
        format!("file:{}", console_log.display()),
        "-S".into(),
    ];
    args.extend(build_disk_args(vm, disks)?);
    if let Some(kernel) = kernel {
        args.push("-kernel".into());
        args.push(kernel.display().to_string());
        if let Some(initrd) = initrd {
            args.push("-initrd".into());
            args.push(initrd.display().to_string());
        }
        // Route the guest kernel console to the serial port captured above.
        args.push("-append".into());
        args.push("console=ttyS0".into());
    }
    Ok(args)
}
/// Build QEMU args for an incoming migration listener.
///
/// Reuses the paused-launch argument list, strips the `-S` flag (incoming
/// migration manages CPU start itself), and appends `-incoming <uri>`.
fn build_qemu_args_incoming(
    vm: &VirtualMachine,
    disks: &[AttachedDisk],
    qmp_socket: &Path,
    console_log: &Path,
    kernel: Option<&Path>,
    initrd: Option<&Path>,
    listen_uri: &str,
) -> Result<Vec<String>> {
    let mut args = build_qemu_args(vm, disks, qmp_socket, console_log, kernel, initrd)?;
    let paused_flag = args.iter().position(|arg| arg == "-S");
    if let Some(index) = paused_flag {
        args.remove(index);
    }
    args.push("-incoming".into());
    args.push(listen_uri.to_owned());
    Ok(args)
}
/// Wait for QMP socket to become available.
///
/// Polls the Unix socket every 50 ms until a connection succeeds; the probe
/// connection is dropped immediately. Fails with `Error::HypervisorError`
/// (carrying the last connect error) once `timeout` has elapsed.
async fn wait_for_qmp(qmp_socket: &Path, timeout: Duration) -> Result<()> {
    let started = Instant::now();
    loop {
        let attempt = UnixStream::connect(qmp_socket).await;
        match attempt {
            Ok(_probe) => return Ok(()),
            Err(e) if started.elapsed() >= timeout => {
                return Err(Error::HypervisorError(format!(
                    "Timed out waiting for QMP socket {}: {e}",
                    qmp_socket.display()
                )));
            }
            Err(_) => tokio::time::sleep(Duration::from_millis(50)).await,
        }
    }
}
/// Send SIGKILL to `pid`. A process that is already gone (ESRCH) counts as
/// success; any other errno is surfaced as a hypervisor error.
fn kill_pid(pid: u32) -> Result<()> {
    let target = Pid::from_raw(pid as i32);
    match nix_kill(target, Signal::SIGKILL) {
        Ok(()) | Err(nix::errno::Errno::ESRCH) => Ok(()),
        Err(error) => Err(Error::HypervisorError(format!(
            "failed to send SIGKILL to pid {}: {error}",
            target.as_raw()
        ))),
    }
}
/// Probe whether `pid` is alive using signal 0. EPERM means the process
/// exists but we lack permission to signal it, so it still counts as
/// running; every other error is treated as not running.
fn pid_running(pid: u32) -> bool {
    matches!(
        nix_kill(Pid::from_raw(pid as i32), None::<Signal>),
        Ok(()) | Err(nix::errno::Errno::EPERM)
    )
}
/// Detect a VM that exited outside our control: if we know the PID, check
/// the process; otherwise fall back to the QMP socket's existence.
fn vm_stopped_out_of_band(handle: &VmHandle, qmp_socket: &Path) -> bool {
    match handle.pid {
        Some(pid) => !pid_running(pid),
        None => !qmp_socket.exists(),
    }
}
/// A default `VmStatus` whose actual state is `Stopped`.
fn stopped_status() -> VmStatus {
    let mut status = VmStatus::default();
    status.actual_state = VmState::Stopped;
    status
}
#[async_trait]
impl HypervisorBackend for KvmBackend {
    fn backend_type(&self) -> HypervisorType {
        HypervisorType::Kvm
    }

    fn capabilities(&self) -> BackendCapabilities {
        BackendCapabilities {
            live_migration: true,
            hot_plug_cpu: true,
            hot_plug_memory: true,
            hot_plug_disk: true,
            hot_plug_nic: true,
            vnc_console: true,
            serial_console: true,
            nested_virtualization: true,
            gpu_passthrough: true,
            max_vcpus: 256,
            max_memory_gib: 4096,
            supported_disk_buses: vec![DiskBus::Virtio, DiskBus::Scsi, DiskBus::Ide, DiskBus::Sata],
            supported_nic_models: vec![NicModel::VirtioNet, NicModel::E1000],
        }
    }

    fn supports(&self, _spec: &VmSpec) -> std::result::Result<(), UnsupportedReason> {
        // KVM supports all features, so no limitations
        Ok(())
    }

    /// Prepare the per-VM runtime directory and spawn QEMU paused (`-S`).
    ///
    /// Waits for the QMP socket to come up before detaching the child; on
    /// timeout the spawned process is killed and reaped so no orphan is left.
    async fn create(&self, vm: &VirtualMachine, disks: &[AttachedDisk]) -> Result<VmHandle> {
        tracing::info!(
            vm_id = %vm.id,
            name = %vm.name,
            "Creating VM (runtime prep + spawn)"
        );
        let runtime_dir = self.runtime_dir.join(vm.id.to_string());
        tokio::fs::create_dir_all(&runtime_dir)
            .await
            .map_err(|e| Error::HypervisorError(format!("Failed to create runtime dir: {e}")))?;
        let qmp_socket = runtime_dir.join("qmp.sock");
        let console_log = runtime_dir.join("console.log");
        // Remove stale socket if it exists from a previous run.
        let _ = tokio::fs::remove_file(&qmp_socket).await;
        let _ = tokio::fs::remove_file(&console_log).await;
        let qemu_bin = resolve_qemu_path(&self.qemu_path);
        let (kernel_path, initrd_path) = resolve_kernel_initrd();
        let args = build_qemu_args(
            vm,
            disks,
            &qmp_socket,
            &console_log,
            kernel_path.as_deref(),
            initrd_path.as_deref(),
        )?;
        let mut cmd = Command::new(&qemu_bin);
        cmd.args(&args);
        tracing::debug!(
            vm_id = %vm.id,
            qemu_bin = %qemu_bin.display(),
            runtime_dir = %runtime_dir.display(),
            qmp_socket = %qmp_socket.display(),
            ?args,
            "Spawning KVM QEMU"
        );
        let mut child = cmd
            .spawn()
            .map_err(|e| Error::HypervisorError(format!("Failed to spawn QEMU: {e}")))?;
        // FIX: was `child.id().map(|p| p)` — a redundant identity map.
        let pid = child.id();
        // Wait for QMP readiness before detaching so slow nested workers do not leave orphans.
        if let Err(err) = wait_for_qmp(&qmp_socket, qmp_timeout()).await {
            tracing::warn!(
                vm_id = %vm.id,
                qmp_socket = %qmp_socket.display(),
                ?pid,
                error = %err,
                "QMP socket did not become ready; cleaning up spawned QEMU"
            );
            let _ = child.start_kill();
            let _ = child.wait().await;
            let _ = tokio::fs::remove_file(&qmp_socket).await;
            return Err(err);
        }
        // Detach process; lifecycle managed via QMP/kill later.
        tokio::spawn(async move {
            let _ = child.wait().await;
        });
        let mut handle = VmHandle::new(vm.id, runtime_dir.to_string_lossy().to_string());
        handle
            .backend_state
            .insert("qmp_socket".into(), qmp_socket.display().to_string());
        handle
            .backend_state
            .insert("console_log".into(), console_log.display().to_string());
        handle.pid = pid;
        handle.attached_disks = disks.to_vec();
        Ok(handle)
    }

    /// Resume a paused VM via QMP `cont`.
    async fn start(&self, handle: &VmHandle) -> Result<()> {
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        tracing::info!(
            vm_id = %handle.vm_id,
            qmp_socket = %qmp_socket.display(),
            "Starting VM via QMP cont"
        );
        let mut client = QmpClient::connect(&qmp_socket).await?;
        client.command::<Value>("cont", None::<Value>).await?;
        Ok(())
    }

    /// Graceful stop: QMP `system_powerdown`, then poll until the VM reports
    /// stopped/failed or the process disappears. Falls back to SIGKILL when
    /// QMP is unavailable or the timeout elapses and a PID is known.
    async fn stop(&self, handle: &VmHandle, timeout: Duration) -> Result<()> {
        let qmp_socket = self.qmp_socket_path(handle);
        if let Err(e) = wait_for_qmp(&qmp_socket, qmp_timeout()).await {
            if vm_stopped_out_of_band(handle, &qmp_socket) {
                tracing::info!(vm_id = %handle.vm_id, "VM already stopped before QMP stop");
                return Ok(());
            }
            if let Some(pid) = handle.pid {
                tracing::warn!(vm_id = %handle.vm_id, pid, "QMP unavailable; sending SIGKILL");
                return kill_pid(pid);
            }
            return Err(e);
        }
        tracing::info!(
            vm_id = %handle.vm_id,
            timeout_secs = timeout.as_secs(),
            qmp_socket = %qmp_socket.display(),
            "Stopping VM via QMP system_powerdown"
        );
        let mut client = QmpClient::connect(&qmp_socket).await?;
        if let Err(e) = client
            .command::<Value>("system_powerdown", None::<Value>)
            .await
        {
            if vm_stopped_out_of_band(handle, &qmp_socket) {
                tracing::info!(
                    vm_id = %handle.vm_id,
                    error = %e,
                    "VM exited while handling system_powerdown; treating stop as successful"
                );
                return Ok(());
            }
            tracing::warn!(
                vm_id = %handle.vm_id,
                error = %e,
                "QMP powerdown command raced with shutdown; waiting for VM to stop"
            );
        }
        let start = Instant::now();
        loop {
            if vm_stopped_out_of_band(handle, &qmp_socket) {
                break;
            }
            match QmpClient::connect(&qmp_socket).await {
                Ok(mut client) => match client.query_status().await {
                    Ok(status)
                        if matches!(status.actual_state, VmState::Stopped | VmState::Failed) =>
                    {
                        break;
                    }
                    Ok(_) => {}
                    // FIX: unused `e` bindings on these guard arms produced
                    // warnings; the error value is irrelevant once the VM is
                    // confirmed gone.
                    Err(_) if vm_stopped_out_of_band(handle, &qmp_socket) => break,
                    Err(e) => {
                        tracing::debug!(
                            vm_id = %handle.vm_id,
                            error = %e,
                            "QMP query failed while waiting for shutdown"
                        );
                    }
                },
                Err(_) if vm_stopped_out_of_band(handle, &qmp_socket) => break,
                Err(e) => {
                    tracing::debug!(
                        vm_id = %handle.vm_id,
                        error = %e,
                        "QMP reconnect failed while waiting for shutdown"
                    );
                }
            }
            if start.elapsed() >= timeout {
                if let Some(pid) = handle.pid {
                    tracing::warn!(vm_id = %handle.vm_id, pid, "Stop timed out; sending SIGKILL");
                    kill_pid(pid)?;
                    break;
                }
                return Err(Error::HypervisorError(format!(
                    "Timeout waiting for VM {} to stop",
                    handle.vm_id
                )));
            }
            tokio::time::sleep(Duration::from_millis(100)).await;
        }
        Ok(())
    }

    /// Force-kill: QMP `quit`, with SIGKILL as fallback when QMP is down or
    /// the command fails and a PID is known.
    async fn kill(&self, handle: &VmHandle) -> Result<()> {
        tracing::info!(vm_id = %handle.vm_id, "Force killing VM via QMP quit");
        let qmp_socket = self.qmp_socket_path(handle);
        match wait_for_qmp(&qmp_socket, qmp_timeout()).await {
            Ok(_) => {
                let mut client = QmpClient::connect(&qmp_socket).await?;
                if let Err(e) = client.command::<Value>("quit", None::<Value>).await {
                    tracing::warn!(vm_id = %handle.vm_id, error = %e, "QMP quit failed; attempting SIGKILL");
                    if let Some(pid) = handle.pid {
                        return kill_pid(pid);
                    }
                    return Err(e);
                }
            }
            Err(e) => {
                if let Some(pid) = handle.pid {
                    tracing::warn!(vm_id = %handle.vm_id, pid, "QMP unavailable; attempting SIGKILL");
                    return kill_pid(pid);
                }
                return Err(e);
            }
        }
        Ok(())
    }

    /// Hard reboot via QMP `system_reset` (no guest-cooperative shutdown).
    async fn reboot(&self, handle: &VmHandle) -> Result<()> {
        tracing::info!(vm_id = %handle.vm_id, "Rebooting VM via QMP system_reset");
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        client
            .command::<Value>("system_reset", None::<Value>)
            .await?;
        Ok(())
    }

    /// Spawn a destination-side QEMU with `-incoming <listen_uri>` (not
    /// paused: incoming migration manages CPU start) and wait for its QMP
    /// socket, cleaning up on timeout as in `create`.
    async fn prepare_incoming(
        &self,
        vm: &VirtualMachine,
        listen_uri: &str,
        disks: &[AttachedDisk],
    ) -> Result<VmHandle> {
        tracing::info!(
            vm_id = %vm.id,
            listen_uri,
            "Preparing incoming migration listener"
        );
        let runtime_dir = self.runtime_dir.join(vm.id.to_string());
        tokio::fs::create_dir_all(&runtime_dir)
            .await
            .map_err(|e| Error::HypervisorError(format!("Failed to create runtime dir: {e}")))?;
        let qmp_socket = runtime_dir.join("qmp.sock");
        let console_log = runtime_dir.join("console.log");
        let _ = tokio::fs::remove_file(&qmp_socket).await;
        let _ = tokio::fs::remove_file(&console_log).await;
        let qemu_bin = resolve_qemu_path(&self.qemu_path);
        let (kernel_path, initrd_path) = resolve_kernel_initrd();
        let args = build_qemu_args_incoming(
            vm,
            disks,
            &qmp_socket,
            &console_log,
            kernel_path.as_deref(),
            initrd_path.as_deref(),
            listen_uri,
        )?;
        let mut cmd = Command::new(&qemu_bin);
        cmd.args(&args);
        tracing::debug!(
            vm_id = %vm.id,
            qemu_bin = %qemu_bin.display(),
            runtime_dir = %runtime_dir.display(),
            qmp_socket = %qmp_socket.display(),
            ?args,
            "Spawning QEMU for incoming migration"
        );
        let mut child = cmd
            .spawn()
            .map_err(|e| Error::HypervisorError(format!("Failed to spawn QEMU: {e}")))?;
        // FIX: was `child.id().map(|p| p)` — a redundant identity map.
        let pid = child.id();
        if let Err(err) = wait_for_qmp(&qmp_socket, qmp_timeout()).await {
            tracing::warn!(
                vm_id = %vm.id,
                qmp_socket = %qmp_socket.display(),
                ?pid,
                error = %err,
                "Incoming migration QMP socket did not become ready; cleaning up spawned QEMU"
            );
            let _ = child.start_kill();
            let _ = child.wait().await;
            let _ = tokio::fs::remove_file(&qmp_socket).await;
            return Err(err);
        }
        tokio::spawn(async move {
            let _ = child.wait().await;
        });
        let mut handle = VmHandle::new(vm.id, runtime_dir.to_string_lossy().to_string());
        handle
            .backend_state
            .insert("qmp_socket".into(), qmp_socket.display().to_string());
        handle
            .backend_state
            .insert("console_log".into(), console_log.display().to_string());
        handle.pid = pid;
        handle.attached_disks = disks.to_vec();
        Ok(handle)
    }

    /// Kick off a live migration via QMP `migrate`; when `wait` is set, poll
    /// `query-migrate` every 200 ms until completed/failed/cancelled or
    /// `timeout` elapses.
    async fn migrate(
        &self,
        handle: &VmHandle,
        destination_uri: &str,
        timeout: Duration,
        wait: bool,
    ) -> Result<()> {
        tracing::info!(
            vm_id = %handle.vm_id,
            destination_uri,
            wait,
            "Initiating live migration via QMP"
        );
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        client
            .command("migrate", Some(json!({ "uri": destination_uri })))
            .await?;
        if !wait {
            return Ok(());
        }
        let start = Instant::now();
        loop {
            let resp = client
                .command::<Value>("query-migrate", None::<Value>)
                .await?;
            let status = resp
                .get("status")
                .and_then(Value::as_str)
                .unwrap_or("unknown");
            match status {
                "completed" => return Ok(()),
                "failed" | "cancelled" => {
                    let err = resp
                        .get("error")
                        .and_then(Value::as_str)
                        .unwrap_or("migration failed");
                    return Err(Error::HypervisorError(format!("Migration failed: {err}")));
                }
                _ => {}
            }
            if start.elapsed() >= timeout {
                return Err(Error::HypervisorError(format!(
                    "Timeout waiting for migration of VM {}",
                    handle.vm_id
                )));
            }
            tokio::time::sleep(Duration::from_millis(200)).await;
        }
    }

    /// Tear down VM resources: best-effort kill, wait up to 5 s for the
    /// process to exit, then remove the runtime directory.
    async fn delete(&self, handle: &VmHandle) -> Result<()> {
        tracing::info!(vm_id = %handle.vm_id, "Deleting VM resources");
        if handle.pid.is_some() || self.qmp_socket_path(handle).exists() {
            let _ = self.kill(handle).await;
        }
        if let Some(pid) = handle.pid {
            let deadline = Instant::now() + Duration::from_secs(5);
            while pid_running(pid) {
                if Instant::now() >= deadline {
                    return Err(Error::HypervisorError(format!(
                        "Timed out waiting for VM {} process {} to exit",
                        handle.vm_id, pid
                    )));
                }
                tokio::time::sleep(Duration::from_millis(100)).await;
            }
        }
        let runtime_dir = PathBuf::from(&handle.runtime_dir);
        if tokio::fs::try_exists(&runtime_dir)
            .await
            .map_err(|e| Error::HypervisorError(format!("Failed to inspect runtime dir: {e}")))?
        {
            tokio::fs::remove_dir_all(&runtime_dir).await.map_err(|e| {
                Error::HypervisorError(format!("Failed to remove runtime dir: {e}"))
            })?;
        }
        tracing::info!(vm_id = %handle.vm_id, "Deleted VM resources");
        Ok(())
    }

    /// Query live status via QMP; if QMP is unreachable but the process/socket
    /// is confirmed gone, report a stopped status instead of an error.
    async fn status(&self, handle: &VmHandle) -> Result<VmStatus> {
        let qmp_socket = self.qmp_socket_path(handle);
        tracing::debug!(
            vm_id = %handle.vm_id,
            qmp_socket = %qmp_socket.display(),
            "Querying VM status via QMP"
        );
        match QmpClient::connect(&qmp_socket).await {
            Ok(mut client) => match client.query_status().await {
                Ok(status) => Ok(status),
                // FIX: was `Err(e) if ...` with `e` unused (warning).
                Err(_) if vm_stopped_out_of_band(handle, &qmp_socket) => Ok(stopped_status()),
                Err(e) => Err(e),
            },
            Err(_) if vm_stopped_out_of_band(handle, &qmp_socket) => Ok(stopped_status()),
            Err(e) => Err(e),
        }
    }

    /// Hot-plug a file-backed disk: QMP `blockdev-add` then `device_add` of a
    /// virtio-blk-pci frontend. NBD and Ceph RBD hot-plug are not implemented.
    async fn attach_disk(&self, handle: &VmHandle, disk: &AttachedDisk) -> Result<()> {
        tracing::info!(
            vm_id = %handle.vm_id,
            disk_id = %disk.id,
            "Attaching disk via QMP device_add"
        );
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        let blockdev_args = match &disk.attachment {
            DiskAttachment::File { path, format } => serde_json::json!({
                "node-name": format!("drive-{}", disk.id),
                "driver": volume_format_name(*format),
                "read-only": disk.read_only,
                "file": {
                    "driver": "file",
                    "filename": path
                }
            }),
            DiskAttachment::Nbd { .. } => {
                return Err(Error::UnsupportedFeature(
                    "KVM hot-plug for NBD-backed disks is not implemented".into(),
                ));
            }
            DiskAttachment::CephRbd { .. } => {
                return Err(Error::UnsupportedFeature(
                    "KVM hot-plug for Ceph RBD-backed disks is not implemented".into(),
                ));
            }
        };
        client.command("blockdev-add", Some(blockdev_args)).await?;
        // Step 2: Add virtio-blk-pci frontend device
        let device_args = serde_json::json!({
            "driver": "virtio-blk-pci",
            "id": format!("disk-{}", disk.id),
            "drive": format!("drive-{}", disk.id)
        });
        client.command("device_add", Some(device_args)).await?;
        tracing::info!(
            vm_id = %handle.vm_id,
            disk_id = %disk.id,
            "Disk attached successfully"
        );
        Ok(())
    }

    /// Hot-unplug a disk's frontend via QMP `device_del`.
    async fn detach_disk(&self, handle: &VmHandle, disk_id: &str) -> Result<()> {
        tracing::info!(
            vm_id = %handle.vm_id,
            disk_id = disk_id,
            "Detaching disk via QMP device_del"
        );
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        // Remove the virtio-blk-pci device (backend will be cleaned up automatically)
        let device_args = serde_json::json!({
            "id": format!("disk-{}", disk_id)
        });
        client.command("device_del", Some(device_args)).await?;
        tracing::info!(
            vm_id = %handle.vm_id,
            disk_id = disk_id,
            "Disk detached successfully"
        );
        Ok(())
    }

    /// Hot-plug a NIC: QMP `netdev_add` (tap backend) then `device_add` of a
    /// virtio-net-pci frontend.
    async fn attach_nic(&self, handle: &VmHandle, nic: &NetworkSpec) -> Result<()> {
        tracing::info!(
            vm_id = %handle.vm_id,
            nic_id = %nic.id,
            "Attaching NIC via QMP device_add"
        );
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        // Generate MAC address if not provided
        let mac_addr = nic
            .mac_address
            .as_ref()
            .map(|s| s.as_str())
            .unwrap_or_else(|| {
                // Generate a simple MAC (should be more sophisticated in production)
                // NOTE(review): this constant is shared by every NIC without an
                // explicit MAC — two such NICs on one segment will collide.
                "52:54:00:12:34:56"
            });
        // Step 1: Add network backend via netdev_add
        let netdev_args = serde_json::json!({
            "type": "tap",
            "id": format!("netdev-{}", nic.id),
            "ifname": format!("tap-{}", nic.id),
            "script": "no",
            "downscript": "no"
        });
        client.command("netdev_add", Some(netdev_args)).await?;
        // Step 2: Add virtio-net-pci frontend device
        let device_args = serde_json::json!({
            "driver": "virtio-net-pci",
            "id": format!("net-{}", nic.id),
            "netdev": format!("netdev-{}", nic.id),
            "mac": mac_addr
        });
        client.command("device_add", Some(device_args)).await?;
        tracing::info!(
            vm_id = %handle.vm_id,
            nic_id = %nic.id,
            mac = mac_addr,
            "NIC attached successfully"
        );
        Ok(())
    }

    /// Hot-unplug a NIC's frontend via QMP `device_del`.
    async fn detach_nic(&self, handle: &VmHandle, nic_id: &str) -> Result<()> {
        tracing::info!(
            vm_id = %handle.vm_id,
            nic_id = nic_id,
            "Detaching NIC via QMP device_del"
        );
        let qmp_socket = self.qmp_socket_path(handle);
        wait_for_qmp(&qmp_socket, qmp_timeout()).await?;
        let mut client = QmpClient::connect(&qmp_socket).await?;
        // Remove the virtio-net-pci device (netdev backend will be cleaned up automatically)
        let device_args = serde_json::json!({
            "id": format!("net-{}", nic_id)
        });
        client.command("device_del", Some(device_args)).await?;
        tracing::info!(
            vm_id = %handle.vm_id,
            nic_id = nic_id,
            "NIC detached successfully"
        );
        Ok(())
    }
}
#[cfg(test)]
mod tests {
use super::*;
use plasmavmc_types::DiskSpec;
use tokio::net::UnixListener;
#[test]
fn test_kvm_backend_creation() {
let backend = KvmBackend::with_defaults();
assert_eq!(backend.backend_type(), HypervisorType::Kvm);
}
#[test]
fn test_kvm_capabilities() {
let backend = KvmBackend::with_defaults();
let caps = backend.capabilities();
assert!(caps.live_migration);
assert!(caps.vnc_console);
assert!(caps.serial_console);
assert_eq!(caps.max_vcpus, 256);
}
#[test]
fn test_kvm_supports_all_specs() {
let backend = KvmBackend::with_defaults();
let spec = VmSpec::default();
assert!(backend.supports(&spec).is_ok());
}
#[test]
fn build_qemu_args_contains_qmp_and_memory() {
let _guard = crate::env::env_test_lock().lock().unwrap();
let vm = VirtualMachine::new("vm1", "org", "proj", VmSpec::default());
let qmp = PathBuf::from("/tmp/qmp.sock");
let temp = tempfile::tempdir().unwrap();
let qcow = temp.path().join("image.qcow2");
std::fs::write(&qcow, b"image").unwrap();
std::env::set_var(env::ENV_QCOW2_PATH, &qcow);
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &[], &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("qmp.sock"));
assert!(args_joined.contains("512")); // default memory MiB
assert!(args_joined.contains("image.qcow2"));
assert!(args_joined.contains("console.log"));
std::env::remove_var(env::ENV_QCOW2_PATH);
}
#[test]
fn build_qemu_args_includes_all_materialized_disks() {
let _guard = crate::env::env_test_lock().lock().unwrap();
let temp = tempfile::tempdir().unwrap();
let volume_dir = temp.path().join("volumes");
std::fs::create_dir_all(&volume_dir).unwrap();
std::fs::write(volume_dir.join("vm-root.qcow2"), b"root").unwrap();
std::fs::write(volume_dir.join("vm-data.qcow2"), b"data").unwrap();
let mut spec = VmSpec::default();
spec.disks = vec![
DiskSpec {
id: "root".into(),
source: plasmavmc_types::DiskSource::Volume {
volume_id: "vm-root".into(),
},
size_gib: 4,
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
},
DiskSpec {
id: "data".into(),
source: plasmavmc_types::DiskSource::Volume {
volume_id: "vm-data".into(),
},
size_gib: 2,
bus: DiskBus::Virtio,
cache: DiskCache::Writeback,
boot_index: None,
},
];
let vm = VirtualMachine::new("vm1", "org", "proj", spec);
let disks = vec![
AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::File {
path: volume_dir.join("vm-root.qcow2").display().to_string(),
format: VolumeFormat::Qcow2,
},
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
read_only: false,
},
AttachedDisk {
id: "data".into(),
attachment: DiskAttachment::File {
path: volume_dir.join("vm-data.qcow2").display().to_string(),
format: VolumeFormat::Qcow2,
},
bus: DiskBus::Virtio,
cache: DiskCache::Writeback,
boot_index: None,
read_only: false,
},
];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("vm-root.qcow2"));
assert!(args_joined.contains("vm-data.qcow2"));
assert!(args_joined.contains("-blockdev"));
assert!(args_joined.contains("bootindex=1"));
assert!(args_joined.contains("\"cache\":{\"direct\":true,\"no-flush\":false}"));
assert!(args_joined.contains("\"cache\":{\"direct\":false,\"no-flush\":false}"));
assert!(args_joined.contains("\"aio\":\"native\""));
assert!(args_joined.contains("\"aio\":\"threads\""));
}
#[test]
fn build_qemu_args_assigns_iothread_to_nbd_virtio_disks() {
let mut spec = VmSpec::default();
spec.cpu.vcpus = 4;
let vm = VirtualMachine::new("vm1", "org", "proj", spec);
let disks = vec![AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::Nbd {
uri: "nbd://10.100.0.11:11000".into(),
format: VolumeFormat::Raw,
},
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
read_only: false,
}];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("\"driver\":\"nbd\""));
assert!(args_joined.contains("-object iothread,id=iothread-root"));
assert!(args_joined.contains("virtio-blk-pci,drive=drive-root,id=disk-root,iothread=iothread-root,num-queues=4,queue-size=1024,bootindex=1"));
}
#[test]
fn build_qemu_args_coerces_writeback_cache_to_none_for_nbd_disks() {
let _guard = crate::env::env_test_lock().lock().unwrap();
std::env::remove_var(crate::env::ENV_NBD_AIO_MODE);
let vm = VirtualMachine::new("vm1", "org", "proj", VmSpec::default());
let disks = vec![AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::Nbd {
uri: "nbd://10.100.0.11:11000".into(),
format: VolumeFormat::Raw,
},
bus: DiskBus::Virtio,
cache: DiskCache::Writeback,
boot_index: Some(1),
read_only: false,
}];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("\"cache\":{\"direct\":true,\"no-flush\":false}"));
assert!(args_joined.contains("\"aio\":\"io_uring\""));
}
#[test]
fn build_qemu_args_uses_io_uring_for_nbd_none_cache_by_default() {
let _guard = crate::env::env_test_lock().lock().unwrap();
std::env::remove_var(crate::env::ENV_NBD_AIO_MODE);
let vm = VirtualMachine::new("vm1", "org", "proj", VmSpec::default());
let disks = vec![AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::Nbd {
uri: "nbd://10.100.0.11:11000".into(),
format: VolumeFormat::Raw,
},
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
read_only: false,
}];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("\"cache\":{\"direct\":true,\"no-flush\":false}"));
assert!(args_joined.contains("\"aio\":\"io_uring\""));
}
#[test]
fn build_qemu_args_honors_nbd_aio_override() {
let _guard = crate::env::env_test_lock().lock().unwrap();
std::env::set_var(crate::env::ENV_NBD_AIO_MODE, "threads");
let vm = VirtualMachine::new("vm1", "org", "proj", VmSpec::default());
let disks = vec![AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::Nbd {
uri: "nbd://10.100.0.11:11000".into(),
format: VolumeFormat::Raw,
},
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
read_only: false,
}];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let args = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap();
let args_joined = args.join(" ");
assert!(args_joined.contains("\"cache\":{\"direct\":true,\"no-flush\":false}"));
assert!(args_joined.contains("\"aio\":\"threads\""));
std::env::remove_var(crate::env::ENV_NBD_AIO_MODE);
}
#[test]
fn build_qemu_args_rejects_invalid_ceph_identifiers() {
let vm = VirtualMachine::new("vm1", "org", "proj", VmSpec::default());
let disks = vec![AttachedDisk {
id: "root".into(),
attachment: DiskAttachment::CephRbd {
pool: "pool,inject".into(),
image: "image".into(),
monitors: vec!["10.0.0.10:6789".into()],
user: "admin".into(),
secret: None,
},
bus: DiskBus::Virtio,
cache: DiskCache::None,
boot_index: Some(1),
read_only: false,
}];
let qmp = PathBuf::from("/tmp/qmp.sock");
let console = PathBuf::from("/tmp/console.log");
let error = build_qemu_args(&vm, &disks, &qmp, &console, None, None).unwrap_err();
assert!(error.to_string().contains("unsupported characters"));
}
#[tokio::test]
async fn wait_for_qmp_succeeds_after_socket_created() {
let dir = tempfile::tempdir().unwrap();
let socket_path = dir.path().join("qmp.sock");
let socket_clone = socket_path.clone();
tokio::spawn(async move {
tokio::time::sleep(Duration::from_millis(100)).await;
let _listener = UnixListener::bind(socket_clone).expect("bind socket");
// Keep listener alive briefly
tokio::time::sleep(Duration::from_millis(200)).await;
});
wait_for_qmp(&socket_path, Duration::from_secs(1))
.await
.expect("qmp became ready");
}
// Integration smoke: requires env to point to QEMU and a qcow2 image.
#[tokio::test]
#[ignore]
async fn integration_create_start_status_stop() {
let _guard = crate::env::env_test_lock().lock().unwrap();
let qemu = std::env::var(env::ENV_QEMU_PATH)
.unwrap_or_else(|_| "/usr/bin/qemu-system-x86_64".into());
let qcow = match std::env::var(env::ENV_QCOW2_PATH) {
Ok(path) => path,
Err(_) => {
eprintln!("Skipping integration: {} not set", env::ENV_QCOW2_PATH);
return;
}
};
if !Path::new(&qemu).exists() || !Path::new(&qcow).exists() {
eprintln!("Skipping integration: qemu or qcow2 path missing");
return;
}
let backend = KvmBackend::new(qemu, tempfile::tempdir().unwrap().into_path());
let vm = VirtualMachine::new("int", "org", "proj", VmSpec::default());
let handle = backend.create(&vm, &[]).await.expect("create vm");
backend.start(&handle).await.expect("start vm");
let status = backend.status(&handle).await.expect("status vm");
assert!(
matches!(
status.actual_state,
VmState::Running | VmState::Stopped | VmState::Error
),
"unexpected state: {:?}",
status.actual_state
);
backend
.stop(&handle, Duration::from_secs(2))
.await
.expect("stop vm");
}
}