From 27c7be6ed1384b7fab2ddd042aff55ca7a5885be Mon Sep 17 00:00:00 2001 From: Babis Chalios Date: Mon, 15 Dec 2025 11:19:38 +0100 Subject: [PATCH] fix: saving/restoring async IO engine transport state When saving the state of a microVM with one or more block devices backed by the async IO engine, we need to take a few extra steps before serializing the state to the disk, as we need to make sure that there aren't any pending io_uring requests that have not been handled by the kernel yet. For the types of devices that need this, we call a prepare_save() hook before serializing the device state. If there are indeed pending requests, once we handle them we need to let the guest know, by adding the corresponding VirtIO descriptors to the used ring. Moreover, since we use notification suppression, this might or might not require us to send an interrupt to the guest. Now, when we save the state of a VirtIO device, we save the device specific state **and** the transport (MMIO or PCI) state along with it. There were a few issues with how we were doing the serialization: 1. We were saving the transport state before we run the prepare_save() hook. The transport state includes information such as the `interrupt_status` in MMIO or `MSI-X config` in PCI. prepare_save() in the case of async IO might change this state, so us running it after saving the transport state essentially loses information. 2. We were saving the device states after saving the KVM state. This is problematic because, if prepare_save() sends an interrupt to the guest we don't save that "pending interrupt" bit of information in the snapshot. These two issues were making microVMs with block devices backed by async IO freeze in some cases post snapshot resume, since the guest is stuck in the kernel waiting for some notification for the device emulation which never arrives. Currently, this is only a problem with virtio-block with async IO engine. 
The only other device using the prepare_save() hook is currently virtio-net, but this one doesn't modify any VirtIO state, nor does it send interrupts. Fix this by ensuring the correct ordering of operations during the snapshot phase. Signed-off-by: Babis Chalios --- src/vmm/src/device_manager/pci_mngr.rs | 8 +++-- src/vmm/src/device_manager/persist.rs | 8 +++-- src/vmm/src/devices/virtio/block/device.rs | 14 ++++---- src/vmm/src/devices/virtio/device.rs | 3 ++ src/vmm/src/devices/virtio/net/device.rs | 40 +++++++++++----------- src/vmm/src/lib.rs | 7 +++- 6 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/vmm/src/device_manager/pci_mngr.rs b/src/vmm/src/device_manager/pci_mngr.rs index d465482d8c1..219a558c2a2 100644 --- a/src/vmm/src/device_manager/pci_mngr.rs +++ b/src/vmm/src/device_manager/pci_mngr.rs @@ -283,9 +283,13 @@ impl<'a> Persist<'a> for PciDevices { for pci_dev in self.virtio_devices.values() { let locked_pci_dev = pci_dev.lock().expect("Poisoned lock"); - let transport_state = locked_pci_dev.state(); let virtio_dev = locked_pci_dev.virtio_device(); + // We need to call `prepare_save()` on the device before saving the transport + // so that, if we modify the transport state while preparing the device, e.g. sending + // an interrupt to the guest, this is correctly captured in the saved transport state. 
let mut locked_virtio_dev = virtio_dev.lock().expect("Poisoned lock"); + locked_virtio_dev.prepare_save(); + let transport_state = locked_pci_dev.state(); let pci_device_bdf = transport_state.pci_device_bdf.into(); @@ -316,7 +320,6 @@ impl<'a> Persist<'a> for PciDevices { snapshotting yet" ); } else { - block_dev.prepare_save(); let device_state = block_dev.save(); state.block_devices.push(VirtioDeviceState { device_id: block_dev.id().to_string(), @@ -338,7 +341,6 @@ impl<'a> Persist<'a> for PciDevices { imds_compat: mmds_guard.imds_compat(), }); } - net_dev.prepare_save(); let device_state = net_dev.save(); state.net_devices.push(VirtioDeviceState { diff --git a/src/vmm/src/device_manager/persist.rs b/src/vmm/src/device_manager/persist.rs index 2a0393e57f2..16e65a67845 100644 --- a/src/vmm/src/device_manager/persist.rs +++ b/src/vmm/src/device_manager/persist.rs @@ -227,11 +227,15 @@ impl<'a> Persist<'a> for MMIODeviceManager { let _: Result<(), ()> = self.for_each_virtio_device(|_, devid, device| { let mmio_transport_locked = device.inner.lock().expect("Poisoned lock"); + let mut locked_device = mmio_transport_locked.locked_device(); + // We need to call `prepare_save()` on the device before saving the transport + // so that, if we modify the transport state while preparing the device, e.g. sending + // an interrupt to the guest, this is correctly captured in the saved transport state. 
+ locked_device.prepare_save(); let transport_state = mmio_transport_locked.save(); let device_info = device.resources; let device_id = devid.clone(); - let mut locked_device = mmio_transport_locked.locked_device(); match locked_device.device_type() { virtio_ids::VIRTIO_ID_BALLOON => { let device_state = locked_device @@ -255,7 +259,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { snapshotting yet" ); } else { - block.prepare_save(); let device_state = block.save(); states.block_devices.push(VirtioDeviceState { device_id, @@ -275,7 +278,6 @@ impl<'a> Persist<'a> for MMIODeviceManager { }); } - net.prepare_save(); let device_state = net.save(); states.net_devices.push(VirtioDeviceState { device_id, diff --git a/src/vmm/src/devices/virtio/block/device.rs b/src/vmm/src/devices/virtio/block/device.rs index 13155efb31d..d3f3b10f3b3 100644 --- a/src/vmm/src/devices/virtio/block/device.rs +++ b/src/vmm/src/devices/virtio/block/device.rs @@ -82,13 +82,6 @@ impl Block { } } - pub fn prepare_save(&mut self) { - match self { - Self::Virtio(b) => b.prepare_save(), - Self::VhostUser(b) => b.prepare_save(), - } - } - pub fn process_virtio_queues(&mut self) -> Result<(), InvalidAvailIdx> { match self { Self::Virtio(b) => b.process_virtio_queues(), @@ -227,6 +220,13 @@ impl VirtioDevice for Block { self.process_virtio_queues(); } } + + fn prepare_save(&mut self) { + match self { + Self::Virtio(b) => b.prepare_save(), + Self::VhostUser(b) => b.prepare_save(), + } + } } impl MutEventSubscriber for Block { diff --git a/src/vmm/src/devices/virtio/device.rs b/src/vmm/src/devices/virtio/device.rs index 5f7d2170f21..a9f04346e26 100644 --- a/src/vmm/src/devices/virtio/device.rs +++ b/src/vmm/src/devices/virtio/device.rs @@ -169,6 +169,9 @@ pub trait VirtioDevice: AsAny + Send { /// Kick the device, as if it had received external events. 
fn kick(&mut self) {} + + /// Prepare the device for saving its state + fn prepare_save(&mut self) {} } impl fmt::Debug for dyn VirtioDevice { diff --git a/src/vmm/src/devices/virtio/net/device.rs b/src/vmm/src/devices/virtio/net/device.rs index 4ce89c342c7..a27f29824ee 100755 --- a/src/vmm/src/devices/virtio/net/device.rs +++ b/src/vmm/src/devices/virtio/net/device.rs @@ -938,26 +938,6 @@ impl Net { Ok(()) } - - /// Prepare saving state - pub fn prepare_save(&mut self) { - // We shouldn't be messing with the queue if the device is not activated. - // Anyways, if it isn't there's nothing to prepare; we haven't parsed any - // descriptors yet from it and we can't have a deferred frame. - if !self.is_activated() { - return; - } - - // Give potential deferred RX frame to guest - self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); - // Reset the parsed available descriptors, so we will re-parse them - self.queues[RX_INDEX].next_avail -= - Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); - self.rx_buffer.parsed_descriptors.clear(); - self.rx_buffer.iovec.clear(); - self.rx_buffer.used_bytes = 0; - self.rx_buffer.used_descriptors = 0; - } } impl VirtioDevice for Net { @@ -1069,6 +1049,26 @@ impl VirtioDevice for Net { self.process_virtio_queues(); } } + + /// Prepare saving state + fn prepare_save(&mut self) { + // We shouldn't be messing with the queue if the device is not activated. + // Anyways, if it isn't there's nothing to prepare; we haven't parsed any + // descriptors yet from it and we can't have a deferred frame. 
+ if !self.is_activated() { + return; + } + + // Give potential deferred RX frame to guest + self.rx_buffer.finish_frame(&mut self.queues[RX_INDEX]); + // Reset the parsed available descriptors, so we will re-parse them + self.queues[RX_INDEX].next_avail -= + Wrapping(u16::try_from(self.rx_buffer.parsed_descriptors.len()).unwrap()); + self.rx_buffer.parsed_descriptors.clear(); + self.rx_buffer.iovec.clear(); + self.rx_buffer.used_bytes = 0; + self.rx_buffer.used_descriptors = 0; + } } #[cfg(test)] diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index f3cd9bad211..845ec3efd55 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -436,6 +436,12 @@ impl Vmm { /// Saves the state of a paused Microvm. pub fn save_state(&mut self, vm_info: &VmInfo) -> Result { use self::MicrovmStateError::SaveVmState; + // We need to save device state before saving KVM state. + // Some devices, (at the time of writing this comment block device with async engine) + // might modify the VirtIO transport and send an interrupt to the guest. If we save KVM + // state before we save device state, that interrupt will never be delivered to the guest + // upon resuming from the snapshot. + let device_states = self.device_manager.save(); let vcpu_states = self.save_vcpu_states()?; let kvm_state = self.kvm.save_state(); let vm_state = { @@ -450,7 +456,6 @@ impl Vmm { self.vm.save_state(&mpidrs).map_err(SaveVmState)? } }; - let device_states = self.device_manager.save(); Ok(MicrovmState { vm_info: vm_info.clone(),