Diffstat (limited to 'hw/vfio/pci.c')
 hw/vfio/pci.c | 846 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 499 insertions(+), 347 deletions(-)
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 4feaa1cb68..64780d1b79 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -19,6 +19,7 @@
*/
#include "qemu/osdep.h"
+#include CONFIG_DEVICES /* CONFIG_IOMMUFD */
#include <linux/vfio.h>
#include <sys/ioctl.h>
@@ -29,10 +30,10 @@
#include "hw/qdev-properties.h"
#include "hw/qdev-properties-system.h"
#include "migration/vmstate.h"
+#include "qapi/qmp/qdict.h"
#include "qemu/error-report.h"
#include "qemu/main-loop.h"
#include "qemu/module.h"
-#include "qemu/option.h"
#include "qemu/range.h"
#include "qemu/units.h"
#include "sysemu/kvm.h"
@@ -42,11 +43,16 @@
#include "qapi/error.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
+#include "sysemu/iommufd.h"
#define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug"
+/* Protected by BQL */
+static KVMRouteChange vfio_route_change;
+
static void vfio_disable_interrupts(VFIOPCIDevice *vdev);
static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled);
+static void vfio_msi_disable_common(VFIOPCIDevice *vdev);
/*
* Disabling BAR mmapping can be slow, but toggling it around INTx can
@@ -365,12 +371,56 @@ static void vfio_msi_interrupt(void *opaque)
notify(&vdev->pdev, nr);
}
+/*
+ * Enable MSI-X without enabling any vector, by setting vector 0 with an
+ * invalid fd passed to the kernel.
+ */
+static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev)
+{
+ g_autofree struct vfio_irq_set *irq_set = NULL;
+ int ret = 0, argsz;
+ int32_t *fd;
+
+ argsz = sizeof(*irq_set) + sizeof(*fd);
+
+ irq_set = g_malloc0(argsz);
+ irq_set->argsz = argsz;
+ irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
+ VFIO_IRQ_SET_ACTION_TRIGGER;
+ irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
+ irq_set->start = 0;
+ irq_set->count = 1;
+ fd = (int32_t *)&irq_set->data;
+ *fd = -1;
+
+ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set);
+
+ return ret;
+}
+
static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
{
struct vfio_irq_set *irq_set;
int ret = 0, i, argsz;
int32_t *fds;
+ /*
+ * If dynamic MSI-X allocation is supported, the vectors to be allocated
+ * and enabled can be scattered. Setting nr_vectors before the kernel
+ * enables MSI-X causes all of these vectors to be allocated on the host.
+ *
+ * To allocate only as needed, use vector 0 with an invalid fd to get
+ * MSI-X enabled first, then set vectors with a potentially sparse set of
+ * eventfds to enable interrupts only when enabled in the guest.
+ */
+ if (msix && !vdev->msix->noresize) {
+ ret = vfio_enable_msix_no_vec(vdev);
+
+ if (ret) {
+ return ret;
+ }
+ }
+
argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds));
irq_set = g_malloc0(argsz);
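For reference, the sparse-eventfd scheme described in the comment above can be exercised directly against the vfio uAPI. A minimal standalone sketch (hypothetical helper, not part of this patch), where a -1 entry leaves that vector unallocated on hosts with dynamic MSI-X allocation:

    /* Sketch: set MSI-X triggers for vectors [0, count); a -1 eventfd leaves
     * the slot unallocated when the host supports dynamic MSI-X allocation. */
    #include <errno.h>
    #include <linux/vfio.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/ioctl.h>

    static int set_msix_triggers(int device_fd, const int32_t *eventfds,
                                 uint32_t count)
    {
        size_t argsz = sizeof(struct vfio_irq_set) + count * sizeof(int32_t);
        struct vfio_irq_set *set = calloc(1, argsz);
        int ret;

        if (!set) {
            return -ENOMEM;
        }

        set->argsz = argsz;
        set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
        set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        set->start = 0;
        set->count = count;
        memcpy(set->data, eventfds, count * sizeof(int32_t));

        ret = ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);
        free(set);
        return ret;
    }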
@@ -412,30 +462,36 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix)
static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector,
int vector_n, bool msix)
{
- int virq;
-
if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) {
return;
}
- if (event_notifier_init(&vector->kvm_interrupt, 0)) {
+ vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change,
+ vector_n, &vdev->pdev);
+}
+
+static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector)
+{
+ if (vector->virq < 0) {
return;
}
- virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev);
- if (virq < 0) {
- event_notifier_cleanup(&vector->kvm_interrupt);
- return;
+ if (event_notifier_init(&vector->kvm_interrupt, 0)) {
+ goto fail_notifier;
}
if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt,
- NULL, virq) < 0) {
- kvm_irqchip_release_virq(kvm_state, virq);
- event_notifier_cleanup(&vector->kvm_interrupt);
- return;
+ NULL, vector->virq) < 0) {
+ goto fail_kvm;
}
- vector->virq = virq;
+ return;
+
+fail_kvm:
+ event_notifier_cleanup(&vector->kvm_interrupt);
+fail_notifier:
+ kvm_irqchip_release_virq(kvm_state, vector->virq);
+ vector->virq = -1;
}
static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector)
@@ -460,6 +516,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
VFIOPCIDevice *vdev = VFIO_PCI(pdev);
VFIOMSIVector *vector;
int ret;
+ bool resizing = !!(vdev->nr_vectors < nr + 1);
trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr);
@@ -490,36 +547,54 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr,
}
} else {
if (msg) {
- vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+ if (vdev->defer_kvm_irq_routing) {
+ vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+ } else {
+ vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+ vfio_add_kvm_msi_virq(vdev, vector, nr, true);
+ kvm_irqchip_commit_route_changes(&vfio_route_change);
+ vfio_connect_kvm_msi_virq(vector);
+ }
}
}
/*
- * We don't want to have the host allocate all possible MSI vectors
- * for a device if they're not in use, so we shutdown and incrementally
- * increase them as needed.
+ * When dynamic allocation is not supported, we don't want to have the
+ * host allocate all possible MSI vectors for a device if they're not
+ * in use, so we shut down and incrementally increase them as needed.
+ * nr_vectors represents the total number of vectors allocated.
+ *
+ * When dynamic allocation is supported, let the host only allocate
+ * and enable a vector when it is in use in the guest. nr_vectors
+ * represents the upper bound of vectors being enabled (but not all of
+ * that range is allocated or enabled).
*/
- if (vdev->nr_vectors < nr + 1) {
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+ if (resizing) {
vdev->nr_vectors = nr + 1;
- ret = vfio_enable_vectors(vdev, true);
- if (ret) {
- error_report("vfio: failed to enable vectors, %d", ret);
- }
- } else {
- Error *err = NULL;
- int32_t fd;
+ }
- if (vector->virq >= 0) {
- fd = event_notifier_get_fd(&vector->kvm_interrupt);
+ if (!vdev->defer_kvm_irq_routing) {
+ if (vdev->msix->noresize && resizing) {
+ vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
+ ret = vfio_enable_vectors(vdev, true);
+ if (ret) {
+ error_report("vfio: failed to enable vectors, %d", ret);
+ }
} else {
- fd = event_notifier_get_fd(&vector->interrupt);
- }
+ Error *err = NULL;
+ int32_t fd;
- if (vfio_set_irq_signaling(&vdev->vbasedev,
- VFIO_PCI_MSIX_IRQ_INDEX, nr,
- VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
- error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+ if (vector->virq >= 0) {
+ fd = event_notifier_get_fd(&vector->kvm_interrupt);
+ } else {
+ fd = event_notifier_get_fd(&vector->interrupt);
+ }
+
+ if (vfio_set_irq_signaling(&vdev->vbasedev,
+ VFIO_PCI_MSIX_IRQ_INDEX, nr,
+ VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) {
+ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+ }
}
}
@@ -566,10 +641,30 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr)
}
}
+static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
+{
+ assert(!vdev->defer_kvm_irq_routing);
+ vdev->defer_kvm_irq_routing = true;
+ vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state);
+}
+
+static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev)
+{
+ int i;
+
+ assert(vdev->defer_kvm_irq_routing);
+ vdev->defer_kvm_irq_routing = false;
+
+ kvm_irqchip_commit_route_changes(&vfio_route_change);
+
+ for (i = 0; i < vdev->nr_vectors; i++) {
+ vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]);
+ }
+}
+
static void vfio_msix_enable(VFIOPCIDevice *vdev)
{
- PCIDevice *pdev = &vdev->pdev;
- unsigned int nr, max_vec = 0;
+ int ret;
vfio_disable_interrupts(vdev);
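For context, a sketch of the KVM route-change API that the two helpers above batch (variable names illustrative): begin a route change, record MSI routes into it, then commit once so the routing table is pushed to KVM a single time.

    /* Sketch: the KVM route-batching calls wrapped by the helpers above. */
    KVMRouteChange c = kvm_irqchip_begin_route_changes(kvm_state);
    int virq0 = kvm_irqchip_add_msi_route(&c, 0, pdev); /* record only */
    int virq1 = kvm_irqchip_add_msi_route(&c, 1, pdev); /* record only */
    kvm_irqchip_commit_route_changes(&c); /* one KVM_SET_GSI_ROUTING */

After the commit, vfio_connect_kvm_msi_virq() can attach an irqfd to each recorded virq.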
@@ -578,37 +673,44 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
vdev->interrupt = VFIO_INT_MSIX;
/*
- * Some communication channels between VF & PF or PF & fw rely on the
- * physical state of the device and expect that enabling MSI-X from the
- * guest enables the same on the host. When our guest is Linux, the
- * guest driver call to pci_enable_msix() sets the enabling bit in the
- * MSI-X capability, but leaves the vector table masked. We therefore
- * can't rely on a vector_use callback (from request_irq() in the guest)
- * to switch the physical device into MSI-X mode because that may come a
- * long time after pci_enable_msix(). This code enables vector 0 with
- * triggering to userspace, then immediately release the vector, leaving
- * the physical device with no vectors enabled, but MSI-X enabled, just
- * like the guest view.
- * If there are already unmasked vectors (in migration resume phase and
- * some guest startups) which will be enabled soon, we can allocate all
- * of them here to avoid inefficiently disabling and enabling vectors
- * repeatedly later.
+ * Setting vector notifiers triggers synchronous vector-use
+ * callbacks for each active vector. Deferring the KVM route commit
+ * so it happens once, rather than per vector, provides a substantial
+ * performance improvement.
*/
- if (!pdev->msix_function_masked) {
- for (nr = 0; nr < msix_nr_vectors_allocated(pdev); nr++) {
- if (!msix_is_masked(pdev, nr)) {
- max_vec = nr;
- }
- }
- }
- vfio_msix_vector_do_use(pdev, max_vec, NULL, NULL);
- vfio_msix_vector_release(pdev, max_vec);
+ vfio_prepare_kvm_msi_virq_batch(vdev);
- if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use,
+ if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use,
vfio_msix_vector_release, NULL)) {
error_report("vfio: msix_set_vector_notifiers failed");
}
+ vfio_commit_kvm_msi_virq_batch(vdev);
+
+ if (vdev->nr_vectors) {
+ ret = vfio_enable_vectors(vdev, true);
+ if (ret) {
+ error_report("vfio: failed to enable vectors, %d", ret);
+ }
+ } else {
+ /*
+ * Some communication channels between VF & PF or PF & fw rely on the
+ * physical state of the device and expect that enabling MSI-X from the
+ * guest enables the same on the host. When our guest is Linux, the
+ * guest driver call to pci_enable_msix() sets the enabling bit in the
+ * MSI-X capability, but leaves the vector table masked. We therefore
+ * can't rely on a vector_use callback (from request_irq() in the guest)
+ * to switch the physical device into MSI-X mode because that may come a
+ * long time after pci_enable_msix(). This code sets vector 0 with an
+ * invalid fd to leave the physical device with MSI-X enabled but no
+ * vectors enabled, just like the guest view.
+ */
+ ret = vfio_enable_msix_no_vec(vdev);
+ if (ret) {
+ error_report("vfio: failed to enable MSI-X, %d", ret);
+ }
+ }
+
trace_vfio_msix_enable(vdev->vbasedev.name);
}
@@ -620,6 +722,13 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
retry:
+ /*
+ * Setting vector notifiers needs to enable a route for each vector.
+ * Deferring the KVM route commit so it happens once, rather than per
+ * vector, provides a substantial performance improvement.
+ */
+ vfio_prepare_kvm_msi_virq_batch(vdev);
+
vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
for (i = 0; i < vdev->nr_vectors; i++) {
@@ -643,6 +752,8 @@ retry:
vfio_add_kvm_msi_virq(vdev, vector, i, false);
}
+ vfio_commit_kvm_msi_virq_batch(vdev);
+
/* Set interrupt type prior to possible interrupts */
vdev->interrupt = VFIO_INT_MSI;
@@ -650,29 +761,17 @@ retry:
if (ret) {
if (ret < 0) {
error_report("vfio: Error: Failed to setup MSI fds: %m");
- } else if (ret != vdev->nr_vectors) {
+ } else {
error_report("vfio: Error: Failed to enable %d "
"MSI vectors, retry with %d", vdev->nr_vectors, ret);
}
- for (i = 0; i < vdev->nr_vectors; i++) {
- VFIOMSIVector *vector = &vdev->msi_vectors[i];
- if (vector->virq >= 0) {
- vfio_remove_kvm_msi_virq(vector);
- }
- qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt),
- NULL, NULL, NULL);
- event_notifier_cleanup(&vector->interrupt);
- }
-
- g_free(vdev->msi_vectors);
- vdev->msi_vectors = NULL;
+ vfio_msi_disable_common(vdev);
- if (ret > 0 && ret != vdev->nr_vectors) {
+ if (ret > 0) {
vdev->nr_vectors = ret;
goto retry;
}
- vdev->nr_vectors = 0;
/*
* Failing to setup MSI doesn't really fall within any specification.
@@ -680,7 +779,6 @@ retry:
* out to fall back to INTx for this device.
*/
error_report("vfio: Error: Failed to enable MSI");
- vdev->interrupt = VFIO_INT_NONE;
return;
}
@@ -690,7 +788,6 @@ retry:
static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
{
- Error *err = NULL;
int i;
for (i = 0; i < vdev->nr_vectors; i++) {
@@ -709,15 +806,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev)
vdev->msi_vectors = NULL;
vdev->nr_vectors = 0;
vdev->interrupt = VFIO_INT_NONE;
-
- vfio_intx_enable(vdev, &err);
- if (err) {
- error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
- }
}
static void vfio_msix_disable(VFIOPCIDevice *vdev)
{
+ Error *err = NULL;
int i;
msix_unset_vector_notifiers(&vdev->pdev);
@@ -733,11 +826,17 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
}
}
- if (vdev->nr_vectors) {
- vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
- }
+ /*
+ * Always clear the MSI-X IRQ index. A PF device could have enabled
+ * MSI-X with no vectors. See vfio_msix_enable().
+ */
+ vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX);
vfio_msi_disable_common(vdev);
+ vfio_intx_enable(vdev, &err);
+ if (err) {
+ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+ }
memset(vdev->msix->pending, 0,
BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long));
@@ -747,8 +846,14 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev)
static void vfio_msi_disable(VFIOPCIDevice *vdev)
{
+ Error *err = NULL;
+
vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX);
vfio_msi_disable_common(vdev);
+ vfio_intx_enable(vdev, &err);
+ if (err) {
+ error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+ }
trace_vfio_msi_disable(vdev->vbasedev.name);
}
@@ -941,7 +1046,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev)
}
if (vfio_opt_rom_in_denylist(vdev)) {
- if (dev->opts && qemu_opt_get(dev->opts, "rombar")) {
+ if (dev->opts && qdict_haskey(dev->opts, "rombar")) {
warn_report("Device at %s is known to cause system instability"
" issues during option rom execution",
vdev->vbasedev.name);
@@ -1084,8 +1189,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar)
/* If BAR is mapped and page aligned, update to fill PAGE_SIZE */
if (bar_addr != PCI_BAR_UNMAPPED &&
- !(bar_addr & ~qemu_real_host_page_mask)) {
- size = qemu_real_host_page_size;
+ !(bar_addr & ~qemu_real_host_page_mask())) {
+ size = qemu_real_host_page_size();
}
memory_region_transaction_begin();
@@ -1201,7 +1306,7 @@ void vfio_pci_write_config(PCIDevice *pdev,
for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
if (old_addr[bar] != pdev->io_regions[bar].addr &&
vdev->bars[bar].region.size > 0 &&
- vdev->bars[bar].region.size < qemu_real_host_page_size) {
+ vdev->bars[bar].region.size < qemu_real_host_page_size()) {
vfio_sub_page_bar_update_mapping(pdev, bar);
}
}
@@ -1289,7 +1394,7 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev)
}
/* MSI-X table start and end aligned to host page size */
- start = vdev->msix->table_offset & qemu_real_host_page_mask;
+ start = vdev->msix->table_offset & qemu_real_host_page_mask();
end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset +
(vdev->msix->entries * PCI_MSIX_ENTRY_SIZE));
@@ -1447,7 +1552,9 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
uint8_t pos;
uint16_t ctrl;
uint32_t table, pba;
- int fd = vdev->vbasedev.fd;
+ int ret, fd = vdev->vbasedev.fd;
+ struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info),
+ .index = VFIO_PCI_MSIX_IRQ_INDEX };
VFIOMSIXInfo *msix;
pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX);
@@ -1484,6 +1591,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK;
msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1;
+ ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "failed to get MSI-X irq info");
+ g_free(msix);
+ return;
+ }
+
+ msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE);
+
/*
* Test the size of the pba_offset variable and catch if it extends outside
* of the specified BAR. If it is the case, we need to apply a hardware
@@ -1516,7 +1632,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp)
}
trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar,
- msix->table_offset, msix->entries);
+ msix->table_offset, msix->entries,
+ msix->noresize);
vdev->msix = msix;
vfio_pci_fixup_msix_region(vdev);
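The NORESIZE probe added above is plain vfio uAPI; a minimal standalone sketch (hypothetical helper) of the same check:

    /* Sketch: true when the host supports dynamic MSI-X allocation, i.e.
     * VFIO_IRQ_INFO_NORESIZE is clear for the MSI-X IRQ index. */
    #include <linux/vfio.h>
    #include <stdbool.h>
    #include <sys/ioctl.h>

    static bool msix_is_resizable(int device_fd)
    {
        struct vfio_irq_info info = {
            .argsz = sizeof(info),
            .index = VFIO_PCI_MSIX_IRQ_INDEX,
        };

        if (ioctl(device_fd, VFIO_DEVICE_GET_IRQ_INFO, &info) < 0) {
            return false; /* assume the conservative no-resize behaviour */
        }
        return !(info.flags & VFIO_IRQ_INFO_NORESIZE);
    }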
@@ -1529,8 +1646,8 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp)
int ret;
Error *err = NULL;
- vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) *
- sizeof(unsigned long));
+ vdev->msix->pending = g_new0(unsigned long,
+ BITS_TO_LONGS(vdev->msix->entries));
ret = msix_init(&vdev->pdev, vdev->msix->entries,
vdev->bars[vdev->msix->table_bar].mr,
vdev->msix->table_bar, vdev->msix->table_offset,
@@ -1706,9 +1823,11 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
vfio_bar_quirk_finalize(vdev, i);
vfio_region_finalize(&bar->region);
- if (bar->size) {
+ if (bar->mr) {
+ assert(bar->size);
object_unparent(OBJECT(bar->mr));
g_free(bar->mr);
+ bar->mr = NULL;
}
}
@@ -1780,6 +1899,81 @@ static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos,
vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask);
}
+static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev)
+{
+ struct vfio_device_info_cap_pci_atomic_comp *cap;
+ g_autofree struct vfio_device_info *info = NULL;
+ PCIBus *bus = pci_get_bus(&vdev->pdev);
+ PCIDevice *parent = bus->parent_dev;
+ struct vfio_info_cap_header *hdr;
+ uint32_t mask = 0;
+ uint8_t *pos;
+
+ /*
+ * PCIe Atomic Ops completer support is only added automatically for single
+ * function devices downstream of a root port supporting DEVCAP2. Support
+ * is added during realize and, if added, removed during device exit. The
+ * single function requirement avoids conflicting requirements should a
+ * slot be composed of multiple devices with differing capabilities.
+ */
+ if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap ||
+ pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT ||
+ pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 ||
+ vdev->pdev.devfn ||
+ vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) {
+ return;
+ }
+
+ pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
+
+ /* Abort if there's already an Atomic Ops configuration on the root port */
+ if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP128)) {
+ return;
+ }
+
+ info = vfio_get_device_info(vdev->vbasedev.fd);
+ if (!info) {
+ return;
+ }
+
+ hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP);
+ if (!hdr) {
+ return;
+ }
+
+ cap = (void *)hdr;
+ if (cap->flags & VFIO_PCI_ATOMIC_COMP32) {
+ mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32;
+ }
+ if (cap->flags & VFIO_PCI_ATOMIC_COMP64) {
+ mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64;
+ }
+ if (cap->flags & VFIO_PCI_ATOMIC_COMP128) {
+ mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128;
+ }
+
+ if (!mask) {
+ return;
+ }
+
+ pci_long_test_and_set_mask(pos, mask);
+ vdev->clear_parent_atomics_on_exit = true;
+}
+
+static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev)
+{
+ if (vdev->clear_parent_atomics_on_exit) {
+ PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev;
+ uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2;
+
+ pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP64 |
+ PCI_EXP_DEVCAP2_ATOMIC_COMP128);
+ }
+}
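For reference, the Device Capabilities 2 AtomicOp completer bits tested and set above decode as follows (per the PCIe spec: bit 7 = 32-bit, bit 8 = 64-bit, bit 9 = 128-bit CAS completer); a standalone sketch:

    /* Sketch: decode AtomicOp completer support from Device Capabilities 2. */
    #include <stdbool.h>
    #include <stdint.h>

    #define DEVCAP2_ATOMIC_COMP32   0x00000080u /* bit 7 */
    #define DEVCAP2_ATOMIC_COMP64   0x00000100u /* bit 8 */
    #define DEVCAP2_ATOMIC_COMP128  0x00000200u /* bit 9 */

    static bool devcap2_has_atomic_completer(uint32_t devcap2)
    {
        return devcap2 & (DEVCAP2_ATOMIC_COMP32 |
                          DEVCAP2_ATOMIC_COMP64 |
                          DEVCAP2_ATOMIC_COMP128);
    }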
+
static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
Error **errp)
{
@@ -1883,6 +2077,8 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size,
QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0);
vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0);
}
+
+ vfio_pci_enable_rp_atomics(vdev);
}
/*
@@ -1940,6 +2136,7 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos)
static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
{
+ ERRP_GUARD();
PCIDevice *pdev = &vdev->pdev;
uint8_t cap_id, next, size;
int ret;
@@ -2020,6 +2217,54 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp)
return 0;
}
+static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos)
+{
+ uint32_t ctrl;
+ int i, nbar;
+
+ ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL);
+ nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT;
+
+ for (i = 0; i < nbar; i++) {
+ uint32_t cap;
+ int size;
+
+ ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8));
+ size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT;
+
+ /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */
+ cap = size <= 27 ? 1U << (size + 4) : 0;
+
+ /*
+ * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one
+ * size in the range 1MB to 512GB. We intend to mask all sizes except
+ * the one currently enabled in the size field; therefore, if it's
+ * outside the range, hide the whole capability as this virtualization
+ * trick won't work. If >512GB resizable BARs start to appear, we
+ * might need an opt-in or reservation scheme in the kernel.
+ */
+ if (!(cap & PCI_REBAR_CAP_SIZES)) {
+ return -EINVAL;
+ }
+
+ /* Hide all sizes reported in the ctrl reg per above requirement. */
+ ctrl &= (PCI_REBAR_CTRL_BAR_SIZE |
+ PCI_REBAR_CTRL_NBAR_MASK |
+ PCI_REBAR_CTRL_BAR_IDX);
+
+ /*
+ * The BAR size field is RW; however, we've mangled the capability
+ * register such that we only report a single size, i.e. the current
+ * BAR size. A write of an unsupported value is undefined, therefore
+ * the register field is essentially RO.
+ */
+ vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0);
+ vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0);
+ }
+
+ return 0;
+}
+
static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
{
PCIDevice *pdev = &vdev->pdev;
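The encoding used by vfio_setup_rebar_ecap() above can be sanity-checked in isolation: the control register Size field value s encodes a 2^(s+20)-byte BAR, and capability bit s+4 advertises exactly that size. A standalone sketch:

    /* Sketch: map a Resizable BAR ctrl Size field to the single cap bit
     * re-advertised above: 0 (1MB) -> bit 4, ..., 27 (128TB) -> bit 31. */
    #include <assert.h>
    #include <stdint.h>

    static uint32_t rebar_size_to_cap(unsigned int size)
    {
        return size <= 27 ? UINT32_C(1) << (size + 4) : 0;
    }

    int main(void)
    {
        assert(rebar_size_to_cap(0)  == 0x00000010u); /* 1MB          */
        assert(rebar_size_to_cap(9)  == 0x00002000u); /* 512MB        */
        assert(rebar_size_to_cap(27) == 0x80000000u); /* 128TB        */
        assert(rebar_size_to_cap(28) == 0);           /* out of range */
        return 0;
    }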
@@ -2093,9 +2338,13 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev)
case 0: /* kernel masked capability */
case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */
case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */
- case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */
trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next);
break;
+ case PCI_EXT_CAP_ID_REBAR:
+ if (!vfio_setup_rebar_ecap(vdev, next)) {
+ pcie_add_capability(pdev, cap_id, cap_ver, next, size);
+ }
+ break;
default:
pcie_add_capability(pdev, cap_id, cap_ver, next, size);
}
@@ -2130,7 +2379,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp)
return 0;
}
-static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
+void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
{
PCIDevice *pdev = &vdev->pdev;
uint16_t cmd;
@@ -2167,7 +2416,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev)
vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2);
}
-static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
+void vfio_pci_post_reset(VFIOPCIDevice *vdev)
{
Error *err = NULL;
int nr;
@@ -2191,7 +2440,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev)
vfio_quirk_reset(vdev);
}
-static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
+bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
{
char tmp[13];
@@ -2201,22 +2450,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name)
return (strcmp(tmp, name) == 0);
}
-static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
+int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev,
+ struct vfio_pci_hot_reset_info **info_p)
{
- VFIOGroup *group;
struct vfio_pci_hot_reset_info *info;
- struct vfio_pci_dependent_device *devices;
- struct vfio_pci_hot_reset *reset;
- int32_t *fds;
- int ret, i, count;
- bool multi = false;
-
- trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
+ int ret, count;
- if (!single) {
- vfio_pci_pre_reset(vdev);
- }
- vdev->vbasedev.needs_reset = false;
+ assert(info_p && !*info_p);
info = g_malloc0(sizeof(*info));
info->argsz = sizeof(*info);
@@ -2224,163 +2464,36 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret && errno != ENOSPC) {
ret = -errno;
+ g_free(info);
if (!vdev->has_pm_reset) {
error_report("vfio: Cannot reset device %s, "
"no available reset mechanism.", vdev->vbasedev.name);
}
- goto out_single;
+ return ret;
}
count = info->count;
- info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices)));
- info->argsz = sizeof(*info) + (count * sizeof(*devices));
- devices = &info->devices[0];
+ info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0])));
+ info->argsz = sizeof(*info) + (count * sizeof(info->devices[0]));
ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info);
if (ret) {
ret = -errno;
+ g_free(info);
error_report("vfio: hot reset info failed: %m");
- goto out_single;
- }
-
- trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name);
-
- /* Verify that we have all the groups required */
- for (i = 0; i < info->count; i++) {
- PCIHostDeviceAddress host;
- VFIOPCIDevice *tmp;
- VFIODevice *vbasedev_iter;
-
- host.domain = devices[i].segment;
- host.bus = devices[i].bus;
- host.slot = PCI_SLOT(devices[i].devfn);
- host.function = PCI_FUNC(devices[i].devfn);
-
- trace_vfio_pci_hot_reset_dep_devices(host.domain,
- host.bus, host.slot, host.function, devices[i].group_id);
-
- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
- continue;
- }
-
- QLIST_FOREACH(group, &vfio_group_list, next) {
- if (group->groupid == devices[i].group_id) {
- break;
- }
- }
-
- if (!group) {
- if (!vdev->has_pm_reset) {
- error_report("vfio: Cannot reset device %s, "
- "depends on group %d which is not owned.",
- vdev->vbasedev.name, devices[i].group_id);
- }
- ret = -EPERM;
- goto out;
- }
-
- /* Prep dependent devices for reset and clear our marker. */
- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
- if (!vbasedev_iter->dev->realized ||
- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
- continue;
- }
- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
- if (single) {
- ret = -EINVAL;
- goto out_single;
- }
- vfio_pci_pre_reset(tmp);
- tmp->vbasedev.needs_reset = false;
- multi = true;
- break;
- }
- }
- }
-
- if (!single && !multi) {
- ret = -EINVAL;
- goto out_single;
- }
-
- /* Determine how many group fds need to be passed */
- count = 0;
- QLIST_FOREACH(group, &vfio_group_list, next) {
- for (i = 0; i < info->count; i++) {
- if (group->groupid == devices[i].group_id) {
- count++;
- break;
- }
- }
- }
-
- reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds)));
- reset->argsz = sizeof(*reset) + (count * sizeof(*fds));
- fds = &reset->group_fds[0];
-
- /* Fill in group fds */
- QLIST_FOREACH(group, &vfio_group_list, next) {
- for (i = 0; i < info->count; i++) {
- if (group->groupid == devices[i].group_id) {
- fds[reset->count++] = group->fd;
- break;
- }
- }
+ return ret;
}
- /* Bus reset! */
- ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset);
- g_free(reset);
-
- trace_vfio_pci_hot_reset_result(vdev->vbasedev.name,
- ret ? "%m" : "Success");
-
-out:
- /* Re-enable INTx on affected devices */
- for (i = 0; i < info->count; i++) {
- PCIHostDeviceAddress host;
- VFIOPCIDevice *tmp;
- VFIODevice *vbasedev_iter;
-
- host.domain = devices[i].segment;
- host.bus = devices[i].bus;
- host.slot = PCI_SLOT(devices[i].devfn);
- host.function = PCI_FUNC(devices[i].devfn);
-
- if (vfio_pci_host_match(&host, vdev->vbasedev.name)) {
- continue;
- }
-
- QLIST_FOREACH(group, &vfio_group_list, next) {
- if (group->groupid == devices[i].group_id) {
- break;
- }
- }
-
- if (!group) {
- break;
- }
+ *info_p = info;
+ return 0;
+}
- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
- if (!vbasedev_iter->dev->realized ||
- vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) {
- continue;
- }
- tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev);
- if (vfio_pci_host_match(&host, tmp->vbasedev.name)) {
- vfio_pci_post_reset(tmp);
- break;
- }
- }
- }
-out_single:
- if (!single) {
- vfio_pci_post_reset(vdev);
- }
- g_free(info);
+static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single)
+{
+ VFIODevice *vbasedev = &vdev->vbasedev;
+ const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops;
- return ret;
+ return ops->pci_hot_reset(vbasedev, single);
}
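vfio_pci_get_pci_hot_reset_info() above wraps the standard two-call idiom for variable-size vfio ioctls; as a standalone sketch (hypothetical function, allocation checks elided for brevity):

    /* Sketch: the first ioctl fails with ENOSPC but fills in info->count;
     * the second uses a buffer sized for count dependent devices. */
    #include <errno.h>
    #include <linux/vfio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>

    static struct vfio_pci_hot_reset_info *get_hot_reset_info(int device_fd)
    {
        struct vfio_pci_hot_reset_info *info = calloc(1, sizeof(*info));
        size_t argsz;

        info->argsz = sizeof(*info);
        if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info) &&
            errno != ENOSPC) {
            free(info);
            return NULL;
        }

        argsz = sizeof(*info) + info->count * sizeof(info->devices[0]);
        info = realloc(info, argsz);
        info->argsz = argsz;

        if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info)) {
            free(info);
            return NULL;
        }
        return info;
    }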
/*
@@ -2431,14 +2544,45 @@ static bool vfio_msix_present(void *opaque, int version_id)
return msix_present(pdev);
}
-const VMStateDescription vmstate_vfio_pci_config = {
+static bool vfio_display_migration_needed(void *opaque)
+{
+ VFIOPCIDevice *vdev = opaque;
+
+ /*
+ * We need to migrate the VFIODisplay object if ramfb *migration* was
+ * explicitly requested (in which case we enforced both ramfb=on and
+ * display=on), or ramfb migration was left at the default "auto"
+ * setting, and *ramfb* was explicitly requested (in which case we
+ * enforced display=on).
+ */
+ return vdev->ramfb_migrate == ON_OFF_AUTO_ON ||
+ (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb);
+}
+
+static const VMStateDescription vmstate_vfio_display = {
+ .name = "VFIOPCIDevice/VFIODisplay",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .needed = vfio_display_migration_needed,
+ .fields = (const VMStateField[]){
+ VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate,
+ VFIODisplay),
+ VMSTATE_END_OF_LIST()
+ }
+};
+
+static const VMStateDescription vmstate_vfio_pci_config = {
.name = "VFIOPCIDevice",
.version_id = 1,
.minimum_version_id = 1,
- .fields = (VMStateField[]) {
+ .fields = (const VMStateField[]) {
VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice),
VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present),
VMSTATE_END_OF_LIST()
+ },
+ .subsections = (const VMStateDescription * const []) {
+ &vmstate_vfio_display,
+ NULL
}
};
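For background, a subsection with a .needed callback is only put on the wire when the callback returns true, so older streams without the VFIODisplay state still load. The shape of the pattern, as a sketch with hypothetical names:

    /* Sketch: an optional VMState subsection gated by a .needed callback. */
    static bool my_state_needed(void *opaque)
    {
        MyDevice *s = opaque;
        return s->feature_enabled; /* subsection omitted when false */
    }

    static const VMStateDescription vmstate_my_sub = {
        .name = "MyDevice/sub",
        .version_id = 1,
        .minimum_version_id = 1,
        .needed = my_state_needed,
        .fields = (const VMStateField[]) {
            VMSTATE_UINT32(counter, MyDevice),
            VMSTATE_END_OF_LIST()
        }
    };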
@@ -2453,7 +2597,12 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
{
VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev);
PCIDevice *pdev = &vdev->pdev;
- int ret;
+ pcibus_t old_addr[PCI_NUM_REGIONS - 1];
+ int bar, ret;
+
+ for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+ old_addr[bar] = pdev->io_regions[bar].addr;
+ }
ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1);
if (ret) {
@@ -2463,6 +2612,18 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f)
vfio_pci_write_config(pdev, PCI_COMMAND,
pci_get_word(pdev->config + PCI_COMMAND), 2);
+ for (bar = 0; bar < PCI_ROM_SLOT; bar++) {
+ /*
+ * The address may not have changed in some scenarios
+ * (e.g. the VF driver isn't loaded in the VM).
+ */
+ if (old_addr[bar] != pdev->io_regions[bar].addr &&
+ vdev->bars[bar].region.size > 0 &&
+ vdev->bars[bar].region.size < qemu_real_host_page_size()) {
+ vfio_sub_page_bar_update_mapping(pdev, bar);
+ }
+ }
+
if (msi_enabled(pdev)) {
vfio_msi_enable(vdev);
} else if (msix_enabled(pdev)) {
@@ -2632,12 +2793,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp)
}
}
-static void vfio_put_device(VFIOPCIDevice *vdev)
+static void vfio_pci_put_device(VFIOPCIDevice *vdev)
{
+ vfio_detach_device(&vdev->vbasedev);
+
g_free(vdev->vbasedev.name);
g_free(vdev->msix);
-
- vfio_put_base_device(&vdev->vbasedev);
}
static void vfio_err_notifier_handler(void *opaque)
@@ -2782,99 +2943,68 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev)
static void vfio_realize(PCIDevice *pdev, Error **errp)
{
+ ERRP_GUARD();
VFIOPCIDevice *vdev = VFIO_PCI(pdev);
- VFIODevice *vbasedev_iter;
- VFIOGroup *group;
- char *tmp, *subsys, group_path[PATH_MAX], *group_name;
+ VFIODevice *vbasedev = &vdev->vbasedev;
+ char *tmp, *subsys;
Error *err = NULL;
- ssize_t len;
- struct stat st;
- int groupid;
int i, ret;
bool is_mdev;
+ char uuid[UUID_STR_LEN];
+ char *name;
- if (!vdev->vbasedev.sysfsdev) {
+ if (vbasedev->fd < 0 && !vbasedev->sysfsdev) {
if (!(~vdev->host.domain || ~vdev->host.bus ||
~vdev->host.slot || ~vdev->host.function)) {
error_setg(errp, "No provided host device");
error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F "
+#ifdef CONFIG_IOMMUFD
+ "or -device vfio-pci,fd=DEVICE_FD "
+#endif
"or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n");
return;
}
- vdev->vbasedev.sysfsdev =
+ vbasedev->sysfsdev =
g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x",
vdev->host.domain, vdev->host.bus,
vdev->host.slot, vdev->host.function);
}
- if (stat(vdev->vbasedev.sysfsdev, &st) < 0) {
- error_setg_errno(errp, errno, "no such host device");
- error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev);
+ if (vfio_device_get_name(vbasedev, errp) < 0) {
return;
}
- vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev);
- vdev->vbasedev.ops = &vfio_pci_ops;
- vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI;
- vdev->vbasedev.dev = DEVICE(vdev);
-
- tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev);
- len = readlink(tmp, group_path, sizeof(group_path));
- g_free(tmp);
-
- if (len <= 0 || len >= sizeof(group_path)) {
- error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG,
- "no iommu_group found");
- goto error;
- }
-
- group_path[len] = 0;
-
- group_name = basename(group_path);
- if (sscanf(group_name, "%d", &groupid) != 1) {
- error_setg_errno(errp, errno, "failed to read %s", group_path);
- goto error;
- }
-
- trace_vfio_realize(vdev->vbasedev.name, groupid);
-
- group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
- if (!group) {
- goto error;
- }
-
- QLIST_FOREACH(vbasedev_iter, &group->device_list, next) {
- if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) {
- error_setg(errp, "device is already attached");
- vfio_put_group(group);
- goto error;
- }
- }
-
/*
* Mediated devices *might* operate compatibly with discarding of RAM, but
* we cannot know for certain, it depends on whether the mdev vendor driver
* stays in sync with the active working set of the guest driver. Prevent
* the x-balloon-allowed option unless this is minimally an mdev device.
*/
- tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev);
+ tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev);
subsys = realpath(tmp, NULL);
g_free(tmp);
is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0);
free(subsys);
- trace_vfio_mdev(vdev->vbasedev.name, is_mdev);
+ trace_vfio_mdev(vbasedev->name, is_mdev);
- if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) {
+ if (vbasedev->ram_block_discard_allowed && !is_mdev) {
error_setg(errp, "x-balloon-allowed only potentially compatible "
"with mdev devices");
- vfio_put_group(group);
goto error;
}
- ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp);
+ if (!qemu_uuid_is_null(&vdev->vf_token)) {
+ qemu_uuid_unparse(&vdev->vf_token, uuid);
+ name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid);
+ } else {
+ name = g_strdup(vbasedev->name);
+ }
+
+ ret = vfio_attach_device(name, vbasedev,
+ pci_device_iommu_address_space(pdev), errp);
+ g_free(name);
if (ret) {
- vfio_put_group(group);
goto error;
}
@@ -2885,7 +3015,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
/* Get a copy of config space */
- ret = pread(vdev->vbasedev.fd, vdev->pdev.config,
+ ret = pread(vbasedev->fd, vdev->pdev.config,
MIN(pci_config_size(&vdev->pdev), vdev->config_size),
vdev->config_offset);
if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) {
@@ -2913,7 +3043,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0);
- trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id);
+ trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id);
} else {
vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID);
}
@@ -2924,7 +3054,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0);
- trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id);
+ trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id);
} else {
vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID);
}
@@ -2936,7 +3066,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID,
vdev->sub_vendor_id, ~0);
- trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name,
+ trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name,
vdev->sub_vendor_id);
}
@@ -2946,7 +3076,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto error;
}
vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0);
- trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name,
+ trace_vfio_pci_emulated_sub_device_id(vbasedev->name,
vdev->sub_device_id);
}
@@ -3005,7 +3135,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
goto out_teardown;
}
- ret = vfio_get_dev_region_info(&vdev->vbasedev,
+ ret = vfio_get_dev_region_info(vbasedev,
VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL,
VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion);
if (ret) {
@@ -3066,24 +3196,23 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
}
- if (vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) {
- ret = vfio_pci_nvidia_v100_ram_init(vdev, errp);
- if (ret && ret != -ENODEV) {
- error_report("Failed to setup NVIDIA V100 GPU RAM");
- }
+ if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) {
+ warn_report("x-ramfb-migrate=on but ramfb=off. "
+ "Forcing x-ramfb-migrate to off.");
+ vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
}
-
- if (vfio_pci_is(vdev, PCI_VENDOR_ID_IBM, PCI_ANY_ID)) {
- ret = vfio_pci_nvlink2_init(vdev, errp);
- if (ret && ret != -ENODEV) {
- error_report("Failed to setup NVlink2 bridge");
+ if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) {
+ if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) {
+ vdev->ramfb_migrate = ON_OFF_AUTO_OFF;
+ } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) {
+ error_setg(errp, "x-ramfb-migrate requires enable-migration");
+ goto out_deregister;
}
}
if (!pdev->failover_pair_id) {
- ret = vfio_migration_probe(&vdev->vbasedev, errp);
- if (ret) {
- error_report("%s: Migration disabled", vdev->vbasedev.name);
+ if (!vfio_migration_realize(vbasedev, errp)) {
+ goto out_deregister;
}
}
@@ -3094,19 +3223,26 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
return;
out_deregister:
+ if (vdev->interrupt == VFIO_INT_INTx) {
+ vfio_intx_disable(vdev);
+ }
pci_device_set_intx_routing_notifier(&vdev->pdev, NULL);
- kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+ if (vdev->irqchip_change_notifier.notify) {
+ kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier);
+ }
+ if (vdev->intx.mmap_timer) {
+ timer_free(vdev->intx.mmap_timer);
+ }
out_teardown:
vfio_teardown_msi(vdev);
vfio_bars_exit(vdev);
error:
- error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name);
+ error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name);
}
static void vfio_instance_finalize(Object *obj)
{
VFIOPCIDevice *vdev = VFIO_PCI(obj);
- VFIOGroup *group = vdev->vbasedev.group;
vfio_display_finalize(vdev);
vfio_bars_finalize(vdev);
@@ -3119,8 +3255,7 @@ static void vfio_instance_finalize(Object *obj)
*
* g_free(vdev->igd_opregion);
*/
- vfio_put_device(vdev);
- vfio_put_group(group);
+ vfio_pci_put_device(vdev);
}
static void vfio_exitfn(PCIDevice *pdev)
@@ -3138,8 +3273,9 @@ static void vfio_exitfn(PCIDevice *pdev)
timer_free(vdev->intx.mmap_timer);
}
vfio_teardown_msi(vdev);
+ vfio_pci_disable_rp_atomics(vdev);
vfio_bars_exit(vdev);
- vfio_migration_finalize(&vdev->vbasedev);
+ vfio_migration_exit(&vdev->vbasedev);
}
static void vfio_pci_reset(DeviceState *dev)
@@ -3185,6 +3321,7 @@ static void vfio_instance_init(Object *obj)
{
PCIDevice *pci_dev = PCI_DEVICE(obj);
VFIOPCIDevice *vdev = VFIO_PCI(obj);
+ VFIODevice *vbasedev = &vdev->vbasedev;
device_add_bootindex_property(obj, &vdev->bootindex,
"bootindex", NULL,
@@ -3194,6 +3331,9 @@ static void vfio_instance_init(Object *obj)
vdev->host.slot = ~0U;
vdev->host.function = ~0U;
+ vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops,
+ DEVICE(vdev), false);
+
vdev->nv_gpudirect_clique = 0xFF;
/* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command
@@ -3203,6 +3343,7 @@ static void vfio_instance_init(Object *obj)
static Property vfio_pci_dev_properties[] = {
DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host),
+ DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token),
DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev),
DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice,
vbasedev.pre_copy_dirty_page_tracking,
@@ -3219,8 +3360,8 @@ static Property vfio_pci_dev_properties[] = {
VFIO_FEATURE_ENABLE_REQ_BIT, true),
DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features,
VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false),
- DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice,
- vbasedev.enable_migration, false),
+ DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice,
+ vbasedev.enable_migration, ON_OFF_AUTO_AUTO),
DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false),
DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice,
vbasedev.ram_block_discard_allowed, false),
@@ -3245,14 +3386,20 @@ static Property vfio_pci_dev_properties[] = {
qdev_prop_nv_gpudirect_clique, uint8_t),
DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo,
OFF_AUTOPCIBAR_OFF),
- /*
- * TODO - support passed fds... is this necessary?
- * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name),
- * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name),
- */
+#ifdef CONFIG_IOMMUFD
+ DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd,
+ TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *),
+#endif
DEFINE_PROP_END_OF_LIST(),
};
+#ifdef CONFIG_IOMMUFD
+static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp)
+{
+ vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp);
+}
+#endif
+
static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
{
DeviceClass *dc = DEVICE_CLASS(klass);
@@ -3260,6 +3407,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data)
dc->reset = vfio_pci_reset;
device_class_set_props(dc, vfio_pci_dev_properties);
+#ifdef CONFIG_IOMMUFD
+ object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd);
+#endif
dc->desc = "VFIO-based PCI device assignment";
set_bit(DEVICE_CATEGORY_MISC, dc->categories);
pdc->realize = vfio_realize;
@@ -3284,6 +3434,8 @@ static const TypeInfo vfio_pci_dev_info = {
static Property vfio_pci_dev_nohotplug_properties[] = {
DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false),
+ DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate,
+ ON_OFF_AUTO_AUTO),
DEFINE_PROP_END_OF_LIST(),
};
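Taken together with the IOMMUFD backend object, the new properties would be exercised along these lines (illustrative placeholders, matching the hint text in vfio_realize()):

    -object iommufd,id=iommufd0
    -device vfio-pci,host=DDDD:BB:DD.F,iommufd=iommufd0

or, handing QEMU a pre-opened vfio cdev file descriptor:

    -device vfio-pci,fd=DEVICE_FD,iommufd=iommufd0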