diff options
Diffstat (limited to 'hw/vfio/pci.c')
-rw-r--r-- | hw/vfio/pci.c | 846 |
1 files changed, 499 insertions, 347 deletions
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c index 4feaa1cb68..64780d1b79 100644 --- a/hw/vfio/pci.c +++ b/hw/vfio/pci.c @@ -19,6 +19,7 @@ */ #include "qemu/osdep.h" +#include CONFIG_DEVICES /* CONFIG_IOMMUFD */ #include <linux/vfio.h> #include <sys/ioctl.h> @@ -29,10 +30,10 @@ #include "hw/qdev-properties.h" #include "hw/qdev-properties-system.h" #include "migration/vmstate.h" +#include "qapi/qmp/qdict.h" #include "qemu/error-report.h" #include "qemu/main-loop.h" #include "qemu/module.h" -#include "qemu/option.h" #include "qemu/range.h" #include "qemu/units.h" #include "sysemu/kvm.h" @@ -42,11 +43,16 @@ #include "qapi/error.h" #include "migration/blocker.h" #include "migration/qemu-file.h" +#include "sysemu/iommufd.h" #define TYPE_VFIO_PCI_NOHOTPLUG "vfio-pci-nohotplug" +/* Protected by BQL */ +static KVMRouteChange vfio_route_change; + static void vfio_disable_interrupts(VFIOPCIDevice *vdev); static void vfio_mmap_set_enabled(VFIOPCIDevice *vdev, bool enabled); +static void vfio_msi_disable_common(VFIOPCIDevice *vdev); /* * Disabling BAR mmaping can be slow, but toggling it around INTx can @@ -365,12 +371,56 @@ static void vfio_msi_interrupt(void *opaque) notify(&vdev->pdev, nr); } +/* + * Get MSI-X enabled, but no vector enabled, by setting vector 0 with an invalid + * fd to kernel. + */ +static int vfio_enable_msix_no_vec(VFIOPCIDevice *vdev) +{ + g_autofree struct vfio_irq_set *irq_set = NULL; + int ret = 0, argsz; + int32_t *fd; + + argsz = sizeof(*irq_set) + sizeof(*fd); + + irq_set = g_malloc0(argsz); + irq_set->argsz = argsz; + irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | + VFIO_IRQ_SET_ACTION_TRIGGER; + irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX; + irq_set->start = 0; + irq_set->count = 1; + fd = (int32_t *)&irq_set->data; + *fd = -1; + + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_SET_IRQS, irq_set); + + return ret; +} + static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) { struct vfio_irq_set *irq_set; int ret = 0, i, argsz; int32_t *fds; + /* + * If dynamic MSI-X allocation is supported, the vectors to be allocated + * and enabled can be scattered. Before kernel enabling MSI-X, setting + * nr_vectors causes all these vectors to be allocated on host. + * + * To keep allocation as needed, use vector 0 with an invalid fd to get + * MSI-X enabled first, then set vectors with a potentially sparse set of + * eventfds to enable interrupts only when enabled in guest. + */ + if (msix && !vdev->msix->noresize) { + ret = vfio_enable_msix_no_vec(vdev); + + if (ret) { + return ret; + } + } + argsz = sizeof(*irq_set) + (vdev->nr_vectors * sizeof(*fds)); irq_set = g_malloc0(argsz); @@ -412,30 +462,36 @@ static int vfio_enable_vectors(VFIOPCIDevice *vdev, bool msix) static void vfio_add_kvm_msi_virq(VFIOPCIDevice *vdev, VFIOMSIVector *vector, int vector_n, bool msix) { - int virq; - if ((msix && vdev->no_kvm_msix) || (!msix && vdev->no_kvm_msi)) { return; } - if (event_notifier_init(&vector->kvm_interrupt, 0)) { + vector->virq = kvm_irqchip_add_msi_route(&vfio_route_change, + vector_n, &vdev->pdev); +} + +static void vfio_connect_kvm_msi_virq(VFIOMSIVector *vector) +{ + if (vector->virq < 0) { return; } - virq = kvm_irqchip_add_msi_route(kvm_state, vector_n, &vdev->pdev); - if (virq < 0) { - event_notifier_cleanup(&vector->kvm_interrupt); - return; + if (event_notifier_init(&vector->kvm_interrupt, 0)) { + goto fail_notifier; } if (kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &vector->kvm_interrupt, - NULL, virq) < 0) { - kvm_irqchip_release_virq(kvm_state, virq); - event_notifier_cleanup(&vector->kvm_interrupt); - return; + NULL, vector->virq) < 0) { + goto fail_kvm; } - vector->virq = virq; + return; + +fail_kvm: + event_notifier_cleanup(&vector->kvm_interrupt); +fail_notifier: + kvm_irqchip_release_virq(kvm_state, vector->virq); + vector->virq = -1; } static void vfio_remove_kvm_msi_virq(VFIOMSIVector *vector) @@ -460,6 +516,7 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, VFIOPCIDevice *vdev = VFIO_PCI(pdev); VFIOMSIVector *vector; int ret; + bool resizing = !!(vdev->nr_vectors < nr + 1); trace_vfio_msix_vector_do_use(vdev->vbasedev.name, nr); @@ -490,36 +547,54 @@ static int vfio_msix_vector_do_use(PCIDevice *pdev, unsigned int nr, } } else { if (msg) { - vfio_add_kvm_msi_virq(vdev, vector, nr, true); + if (vdev->defer_kvm_irq_routing) { + vfio_add_kvm_msi_virq(vdev, vector, nr, true); + } else { + vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); + vfio_add_kvm_msi_virq(vdev, vector, nr, true); + kvm_irqchip_commit_route_changes(&vfio_route_change); + vfio_connect_kvm_msi_virq(vector); + } } } /* - * We don't want to have the host allocate all possible MSI vectors - * for a device if they're not in use, so we shutdown and incrementally - * increase them as needed. + * When dynamic allocation is not supported, we don't want to have the + * host allocate all possible MSI vectors for a device if they're not + * in use, so we shutdown and incrementally increase them as needed. + * nr_vectors represents the total number of vectors allocated. + * + * When dynamic allocation is supported, let the host only allocate + * and enable a vector when it is in use in guest. nr_vectors represents + * the upper bound of vectors being enabled (but not all of the ranges + * is allocated or enabled). */ - if (vdev->nr_vectors < nr + 1) { - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + if (resizing) { vdev->nr_vectors = nr + 1; - ret = vfio_enable_vectors(vdev, true); - if (ret) { - error_report("vfio: failed to enable vectors, %d", ret); - } - } else { - Error *err = NULL; - int32_t fd; + } - if (vector->virq >= 0) { - fd = event_notifier_get_fd(&vector->kvm_interrupt); + if (!vdev->defer_kvm_irq_routing) { + if (vdev->msix->noresize && resizing) { + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); + ret = vfio_enable_vectors(vdev, true); + if (ret) { + error_report("vfio: failed to enable vectors, %d", ret); + } } else { - fd = event_notifier_get_fd(&vector->interrupt); - } + Error *err = NULL; + int32_t fd; - if (vfio_set_irq_signaling(&vdev->vbasedev, - VFIO_PCI_MSIX_IRQ_INDEX, nr, - VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + if (vector->virq >= 0) { + fd = event_notifier_get_fd(&vector->kvm_interrupt); + } else { + fd = event_notifier_get_fd(&vector->interrupt); + } + + if (vfio_set_irq_signaling(&vdev->vbasedev, + VFIO_PCI_MSIX_IRQ_INDEX, nr, + VFIO_IRQ_SET_ACTION_TRIGGER, fd, &err)) { + error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } } } @@ -566,10 +641,30 @@ static void vfio_msix_vector_release(PCIDevice *pdev, unsigned int nr) } } +static void vfio_prepare_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +{ + assert(!vdev->defer_kvm_irq_routing); + vdev->defer_kvm_irq_routing = true; + vfio_route_change = kvm_irqchip_begin_route_changes(kvm_state); +} + +static void vfio_commit_kvm_msi_virq_batch(VFIOPCIDevice *vdev) +{ + int i; + + assert(vdev->defer_kvm_irq_routing); + vdev->defer_kvm_irq_routing = false; + + kvm_irqchip_commit_route_changes(&vfio_route_change); + + for (i = 0; i < vdev->nr_vectors; i++) { + vfio_connect_kvm_msi_virq(&vdev->msi_vectors[i]); + } +} + static void vfio_msix_enable(VFIOPCIDevice *vdev) { - PCIDevice *pdev = &vdev->pdev; - unsigned int nr, max_vec = 0; + int ret; vfio_disable_interrupts(vdev); @@ -578,37 +673,44 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev) vdev->interrupt = VFIO_INT_MSIX; /* - * Some communication channels between VF & PF or PF & fw rely on the - * physical state of the device and expect that enabling MSI-X from the - * guest enables the same on the host. When our guest is Linux, the - * guest driver call to pci_enable_msix() sets the enabling bit in the - * MSI-X capability, but leaves the vector table masked. We therefore - * can't rely on a vector_use callback (from request_irq() in the guest) - * to switch the physical device into MSI-X mode because that may come a - * long time after pci_enable_msix(). This code enables vector 0 with - * triggering to userspace, then immediately release the vector, leaving - * the physical device with no vectors enabled, but MSI-X enabled, just - * like the guest view. - * If there are already unmasked vectors (in migration resume phase and - * some guest startups) which will be enabled soon, we can allocate all - * of them here to avoid inefficiently disabling and enabling vectors - * repeatedly later. + * Setting vector notifiers triggers synchronous vector-use + * callbacks for each active vector. Deferring to commit the KVM + * routes once rather than per vector provides a substantial + * performance improvement. */ - if (!pdev->msix_function_masked) { - for (nr = 0; nr < msix_nr_vectors_allocated(pdev); nr++) { - if (!msix_is_masked(pdev, nr)) { - max_vec = nr; - } - } - } - vfio_msix_vector_do_use(pdev, max_vec, NULL, NULL); - vfio_msix_vector_release(pdev, max_vec); + vfio_prepare_kvm_msi_virq_batch(vdev); - if (msix_set_vector_notifiers(pdev, vfio_msix_vector_use, + if (msix_set_vector_notifiers(&vdev->pdev, vfio_msix_vector_use, vfio_msix_vector_release, NULL)) { error_report("vfio: msix_set_vector_notifiers failed"); } + vfio_commit_kvm_msi_virq_batch(vdev); + + if (vdev->nr_vectors) { + ret = vfio_enable_vectors(vdev, true); + if (ret) { + error_report("vfio: failed to enable vectors, %d", ret); + } + } else { + /* + * Some communication channels between VF & PF or PF & fw rely on the + * physical state of the device and expect that enabling MSI-X from the + * guest enables the same on the host. When our guest is Linux, the + * guest driver call to pci_enable_msix() sets the enabling bit in the + * MSI-X capability, but leaves the vector table masked. We therefore + * can't rely on a vector_use callback (from request_irq() in the guest) + * to switch the physical device into MSI-X mode because that may come a + * long time after pci_enable_msix(). This code sets vector 0 with an + * invalid fd to make the physical device MSI-X enabled, but with no + * vectors enabled, just like the guest view. + */ + ret = vfio_enable_msix_no_vec(vdev); + if (ret) { + error_report("vfio: failed to enable MSI-X, %d", ret); + } + } + trace_vfio_msix_enable(vdev->vbasedev.name); } @@ -620,6 +722,13 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev) vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev); retry: + /* + * Setting vector notifiers needs to enable route for each vector. + * Deferring to commit the KVM routes once rather than per vector + * provides a substantial performance improvement. + */ + vfio_prepare_kvm_msi_virq_batch(vdev); + vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors); for (i = 0; i < vdev->nr_vectors; i++) { @@ -643,6 +752,8 @@ retry: vfio_add_kvm_msi_virq(vdev, vector, i, false); } + vfio_commit_kvm_msi_virq_batch(vdev); + /* Set interrupt type prior to possible interrupts */ vdev->interrupt = VFIO_INT_MSI; @@ -650,29 +761,17 @@ retry: if (ret) { if (ret < 0) { error_report("vfio: Error: Failed to setup MSI fds: %m"); - } else if (ret != vdev->nr_vectors) { + } else { error_report("vfio: Error: Failed to enable %d " "MSI vectors, retry with %d", vdev->nr_vectors, ret); } - for (i = 0; i < vdev->nr_vectors; i++) { - VFIOMSIVector *vector = &vdev->msi_vectors[i]; - if (vector->virq >= 0) { - vfio_remove_kvm_msi_virq(vector); - } - qemu_set_fd_handler(event_notifier_get_fd(&vector->interrupt), - NULL, NULL, NULL); - event_notifier_cleanup(&vector->interrupt); - } - - g_free(vdev->msi_vectors); - vdev->msi_vectors = NULL; + vfio_msi_disable_common(vdev); - if (ret > 0 && ret != vdev->nr_vectors) { + if (ret > 0) { vdev->nr_vectors = ret; goto retry; } - vdev->nr_vectors = 0; /* * Failing to setup MSI doesn't really fall within any specification. @@ -680,7 +779,6 @@ retry: * out to fall back to INTx for this device. */ error_report("vfio: Error: Failed to enable MSI"); - vdev->interrupt = VFIO_INT_NONE; return; } @@ -690,7 +788,6 @@ retry: static void vfio_msi_disable_common(VFIOPCIDevice *vdev) { - Error *err = NULL; int i; for (i = 0; i < vdev->nr_vectors; i++) { @@ -709,15 +806,11 @@ static void vfio_msi_disable_common(VFIOPCIDevice *vdev) vdev->msi_vectors = NULL; vdev->nr_vectors = 0; vdev->interrupt = VFIO_INT_NONE; - - vfio_intx_enable(vdev, &err); - if (err) { - error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); - } } static void vfio_msix_disable(VFIOPCIDevice *vdev) { + Error *err = NULL; int i; msix_unset_vector_notifiers(&vdev->pdev); @@ -733,11 +826,17 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) } } - if (vdev->nr_vectors) { - vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); - } + /* + * Always clear MSI-X IRQ index. A PF device could have enabled + * MSI-X with no vectors. See vfio_msix_enable(). + */ + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSIX_IRQ_INDEX); vfio_msi_disable_common(vdev); + vfio_intx_enable(vdev, &err); + if (err) { + error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } memset(vdev->msix->pending, 0, BITS_TO_LONGS(vdev->msix->entries) * sizeof(unsigned long)); @@ -747,8 +846,14 @@ static void vfio_msix_disable(VFIOPCIDevice *vdev) static void vfio_msi_disable(VFIOPCIDevice *vdev) { + Error *err = NULL; + vfio_disable_irqindex(&vdev->vbasedev, VFIO_PCI_MSI_IRQ_INDEX); vfio_msi_disable_common(vdev); + vfio_intx_enable(vdev, &err); + if (err) { + error_reportf_err(err, VFIO_MSG_PREFIX, vdev->vbasedev.name); + } trace_vfio_msi_disable(vdev->vbasedev.name); } @@ -941,7 +1046,7 @@ static void vfio_pci_size_rom(VFIOPCIDevice *vdev) } if (vfio_opt_rom_in_denylist(vdev)) { - if (dev->opts && qemu_opt_get(dev->opts, "rombar")) { + if (dev->opts && qdict_haskey(dev->opts, "rombar")) { warn_report("Device at %s is known to cause system instability" " issues during option rom execution", vdev->vbasedev.name); @@ -1084,8 +1189,8 @@ static void vfio_sub_page_bar_update_mapping(PCIDevice *pdev, int bar) /* If BAR is mapped and page aligned, update to fill PAGE_SIZE */ if (bar_addr != PCI_BAR_UNMAPPED && - !(bar_addr & ~qemu_real_host_page_mask)) { - size = qemu_real_host_page_size; + !(bar_addr & ~qemu_real_host_page_mask())) { + size = qemu_real_host_page_size(); } memory_region_transaction_begin(); @@ -1201,7 +1306,7 @@ void vfio_pci_write_config(PCIDevice *pdev, for (bar = 0; bar < PCI_ROM_SLOT; bar++) { if (old_addr[bar] != pdev->io_regions[bar].addr && vdev->bars[bar].region.size > 0 && - vdev->bars[bar].region.size < qemu_real_host_page_size) { + vdev->bars[bar].region.size < qemu_real_host_page_size()) { vfio_sub_page_bar_update_mapping(pdev, bar); } } @@ -1289,7 +1394,7 @@ static void vfio_pci_fixup_msix_region(VFIOPCIDevice *vdev) } /* MSI-X table start and end aligned to host page size */ - start = vdev->msix->table_offset & qemu_real_host_page_mask; + start = vdev->msix->table_offset & qemu_real_host_page_mask(); end = REAL_HOST_PAGE_ALIGN((uint64_t)vdev->msix->table_offset + (vdev->msix->entries * PCI_MSIX_ENTRY_SIZE)); @@ -1447,7 +1552,9 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) uint8_t pos; uint16_t ctrl; uint32_t table, pba; - int fd = vdev->vbasedev.fd; + int ret, fd = vdev->vbasedev.fd; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info), + .index = VFIO_PCI_MSIX_IRQ_INDEX }; VFIOMSIXInfo *msix; pos = pci_find_capability(&vdev->pdev, PCI_CAP_ID_MSIX); @@ -1484,6 +1591,15 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) msix->pba_offset = pba & ~PCI_MSIX_FLAGS_BIRMASK; msix->entries = (ctrl & PCI_MSIX_FLAGS_QSIZE) + 1; + ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_IRQ_INFO, &irq_info); + if (ret < 0) { + error_setg_errno(errp, -ret, "failed to get MSI-X irq info"); + g_free(msix); + return; + } + + msix->noresize = !!(irq_info.flags & VFIO_IRQ_INFO_NORESIZE); + /* * Test the size of the pba_offset variable and catch if it extends outside * of the specified BAR. If it is the case, we need to apply a hardware @@ -1516,7 +1632,8 @@ static void vfio_msix_early_setup(VFIOPCIDevice *vdev, Error **errp) } trace_vfio_msix_early_setup(vdev->vbasedev.name, pos, msix->table_bar, - msix->table_offset, msix->entries); + msix->table_offset, msix->entries, + msix->noresize); vdev->msix = msix; vfio_pci_fixup_msix_region(vdev); @@ -1529,8 +1646,8 @@ static int vfio_msix_setup(VFIOPCIDevice *vdev, int pos, Error **errp) int ret; Error *err = NULL; - vdev->msix->pending = g_malloc0(BITS_TO_LONGS(vdev->msix->entries) * - sizeof(unsigned long)); + vdev->msix->pending = g_new0(unsigned long, + BITS_TO_LONGS(vdev->msix->entries)); ret = msix_init(&vdev->pdev, vdev->msix->entries, vdev->bars[vdev->msix->table_bar].mr, vdev->msix->table_bar, vdev->msix->table_offset, @@ -1706,9 +1823,11 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev) vfio_bar_quirk_finalize(vdev, i); vfio_region_finalize(&bar->region); - if (bar->size) { + if (bar->mr) { + assert(bar->size); object_unparent(OBJECT(bar->mr)); g_free(bar->mr); + bar->mr = NULL; } } @@ -1780,6 +1899,81 @@ static void vfio_add_emulated_long(VFIOPCIDevice *vdev, int pos, vfio_set_long_bits(vdev->emulated_config_bits + pos, mask, mask); } +static void vfio_pci_enable_rp_atomics(VFIOPCIDevice *vdev) +{ + struct vfio_device_info_cap_pci_atomic_comp *cap; + g_autofree struct vfio_device_info *info = NULL; + PCIBus *bus = pci_get_bus(&vdev->pdev); + PCIDevice *parent = bus->parent_dev; + struct vfio_info_cap_header *hdr; + uint32_t mask = 0; + uint8_t *pos; + + /* + * PCIe Atomic Ops completer support is only added automatically for single + * function devices downstream of a root port supporting DEVCAP2. Support + * is added during realize and, if added, removed during device exit. The + * single function requirement avoids conflicting requirements should a + * slot be composed of multiple devices with differing capabilities. + */ + if (pci_bus_is_root(bus) || !parent || !parent->exp.exp_cap || + pcie_cap_get_type(parent) != PCI_EXP_TYPE_ROOT_PORT || + pcie_cap_get_version(parent) != PCI_EXP_FLAGS_VER2 || + vdev->pdev.devfn || + vdev->pdev.cap_present & QEMU_PCI_CAP_MULTIFUNCTION) { + return; + } + + pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2; + + /* Abort if there'a already an Atomic Ops configuration on the root port */ + if (pci_get_long(pos) & (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64 | + PCI_EXP_DEVCAP2_ATOMIC_COMP128)) { + return; + } + + info = vfio_get_device_info(vdev->vbasedev.fd); + if (!info) { + return; + } + + hdr = vfio_get_device_info_cap(info, VFIO_DEVICE_INFO_CAP_PCI_ATOMIC_COMP); + if (!hdr) { + return; + } + + cap = (void *)hdr; + if (cap->flags & VFIO_PCI_ATOMIC_COMP32) { + mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP32; + } + if (cap->flags & VFIO_PCI_ATOMIC_COMP64) { + mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP64; + } + if (cap->flags & VFIO_PCI_ATOMIC_COMP128) { + mask |= PCI_EXP_DEVCAP2_ATOMIC_COMP128; + } + + if (!mask) { + return; + } + + pci_long_test_and_set_mask(pos, mask); + vdev->clear_parent_atomics_on_exit = true; +} + +static void vfio_pci_disable_rp_atomics(VFIOPCIDevice *vdev) +{ + if (vdev->clear_parent_atomics_on_exit) { + PCIDevice *parent = pci_get_bus(&vdev->pdev)->parent_dev; + uint8_t *pos = parent->config + parent->exp.exp_cap + PCI_EXP_DEVCAP2; + + pci_long_test_and_clear_mask(pos, PCI_EXP_DEVCAP2_ATOMIC_COMP32 | + PCI_EXP_DEVCAP2_ATOMIC_COMP64 | + PCI_EXP_DEVCAP2_ATOMIC_COMP128); + } +} + static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, Error **errp) { @@ -1883,6 +2077,8 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size, QEMU_PCI_EXP_LNKCAP_MLS(QEMU_PCI_EXP_LNK_2_5GT), ~0); vfio_add_emulated_word(vdev, pos + PCI_EXP_LNKCTL, 0, ~0); } + + vfio_pci_enable_rp_atomics(vdev); } /* @@ -1940,6 +2136,7 @@ static void vfio_check_af_flr(VFIOPCIDevice *vdev, uint8_t pos) static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) { + ERRP_GUARD(); PCIDevice *pdev = &vdev->pdev; uint8_t cap_id, next, size; int ret; @@ -2020,6 +2217,54 @@ static int vfio_add_std_cap(VFIOPCIDevice *vdev, uint8_t pos, Error **errp) return 0; } +static int vfio_setup_rebar_ecap(VFIOPCIDevice *vdev, uint16_t pos) +{ + uint32_t ctrl; + int i, nbar; + + ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL); + nbar = (ctrl & PCI_REBAR_CTRL_NBAR_MASK) >> PCI_REBAR_CTRL_NBAR_SHIFT; + + for (i = 0; i < nbar; i++) { + uint32_t cap; + int size; + + ctrl = pci_get_long(vdev->pdev.config + pos + PCI_REBAR_CTRL + (i * 8)); + size = (ctrl & PCI_REBAR_CTRL_BAR_SIZE) >> PCI_REBAR_CTRL_BAR_SHIFT; + + /* The cap register reports sizes 1MB to 128TB, with 4 reserved bits */ + cap = size <= 27 ? 1U << (size + 4) : 0; + + /* + * The PCIe spec (v6.0.1, 7.8.6) requires HW to support at least one + * size in the range 1MB to 512GB. We intend to mask all sizes except + * the one currently enabled in the size field, therefore if it's + * outside the range, hide the whole capability as this virtualization + * trick won't work. If >512GB resizable BARs start to appear, we + * might need an opt-in or reservation scheme in the kernel. + */ + if (!(cap & PCI_REBAR_CAP_SIZES)) { + return -EINVAL; + } + + /* Hide all sizes reported in the ctrl reg per above requirement. */ + ctrl &= (PCI_REBAR_CTRL_BAR_SIZE | + PCI_REBAR_CTRL_NBAR_MASK | + PCI_REBAR_CTRL_BAR_IDX); + + /* + * The BAR size field is RW, however we've mangled the capability + * register such that we only report a single size, ie. the current + * BAR size. A write of an unsupported value is undefined, therefore + * the register field is essentially RO. + */ + vfio_add_emulated_long(vdev, pos + PCI_REBAR_CAP + (i * 8), cap, ~0); + vfio_add_emulated_long(vdev, pos + PCI_REBAR_CTRL + (i * 8), ctrl, ~0); + } + + return 0; +} + static void vfio_add_ext_cap(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; @@ -2093,9 +2338,13 @@ static void vfio_add_ext_cap(VFIOPCIDevice *vdev) case 0: /* kernel masked capability */ case PCI_EXT_CAP_ID_SRIOV: /* Read-only VF BARs confuse OVMF */ case PCI_EXT_CAP_ID_ARI: /* XXX Needs next function virtualization */ - case PCI_EXT_CAP_ID_REBAR: /* Can't expose read-only */ trace_vfio_add_ext_cap_dropped(vdev->vbasedev.name, cap_id, next); break; + case PCI_EXT_CAP_ID_REBAR: + if (!vfio_setup_rebar_ecap(vdev, next)) { + pcie_add_capability(pdev, cap_id, cap_ver, next, size); + } + break; default: pcie_add_capability(pdev, cap_id, cap_ver, next, size); } @@ -2130,7 +2379,7 @@ static int vfio_add_capabilities(VFIOPCIDevice *vdev, Error **errp) return 0; } -static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) +void vfio_pci_pre_reset(VFIOPCIDevice *vdev) { PCIDevice *pdev = &vdev->pdev; uint16_t cmd; @@ -2167,7 +2416,7 @@ static void vfio_pci_pre_reset(VFIOPCIDevice *vdev) vfio_pci_write_config(pdev, PCI_COMMAND, cmd, 2); } -static void vfio_pci_post_reset(VFIOPCIDevice *vdev) +void vfio_pci_post_reset(VFIOPCIDevice *vdev) { Error *err = NULL; int nr; @@ -2191,7 +2440,7 @@ static void vfio_pci_post_reset(VFIOPCIDevice *vdev) vfio_quirk_reset(vdev); } -static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) +bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) { char tmp[13]; @@ -2201,22 +2450,13 @@ static bool vfio_pci_host_match(PCIHostDeviceAddress *addr, const char *name) return (strcmp(tmp, name) == 0); } -static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +int vfio_pci_get_pci_hot_reset_info(VFIOPCIDevice *vdev, + struct vfio_pci_hot_reset_info **info_p) { - VFIOGroup *group; struct vfio_pci_hot_reset_info *info; - struct vfio_pci_dependent_device *devices; - struct vfio_pci_hot_reset *reset; - int32_t *fds; - int ret, i, count; - bool multi = false; - - trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi"); + int ret, count; - if (!single) { - vfio_pci_pre_reset(vdev); - } - vdev->vbasedev.needs_reset = false; + assert(info_p && !*info_p); info = g_malloc0(sizeof(*info)); info->argsz = sizeof(*info); @@ -2224,163 +2464,36 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret && errno != ENOSPC) { ret = -errno; + g_free(info); if (!vdev->has_pm_reset) { error_report("vfio: Cannot reset device %s, " "no available reset mechanism.", vdev->vbasedev.name); } - goto out_single; + return ret; } count = info->count; - info = g_realloc(info, sizeof(*info) + (count * sizeof(*devices))); - info->argsz = sizeof(*info) + (count * sizeof(*devices)); - devices = &info->devices[0]; + info = g_realloc(info, sizeof(*info) + (count * sizeof(info->devices[0]))); + info->argsz = sizeof(*info) + (count * sizeof(info->devices[0])); ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, info); if (ret) { ret = -errno; + g_free(info); error_report("vfio: hot reset info failed: %m"); - goto out_single; - } - - trace_vfio_pci_hot_reset_has_dep_devices(vdev->vbasedev.name); - - /* Verify that we have all the groups required */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - trace_vfio_pci_hot_reset_dep_devices(host.domain, - host.bus, host.slot, host.function, devices[i].group_id); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - if (!vdev->has_pm_reset) { - error_report("vfio: Cannot reset device %s, " - "depends on group %d which is not owned.", - vdev->vbasedev.name, devices[i].group_id); - } - ret = -EPERM; - goto out; - } - - /* Prep dependent devices for reset and clear our marker. */ - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - if (single) { - ret = -EINVAL; - goto out_single; - } - vfio_pci_pre_reset(tmp); - tmp->vbasedev.needs_reset = false; - multi = true; - break; - } - } - } - - if (!single && !multi) { - ret = -EINVAL; - goto out_single; - } - - /* Determine how many group fds need to be passed */ - count = 0; - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - count++; - break; - } - } - } - - reset = g_malloc0(sizeof(*reset) + (count * sizeof(*fds))); - reset->argsz = sizeof(*reset) + (count * sizeof(*fds)); - fds = &reset->group_fds[0]; - - /* Fill in group fds */ - QLIST_FOREACH(group, &vfio_group_list, next) { - for (i = 0; i < info->count; i++) { - if (group->groupid == devices[i].group_id) { - fds[reset->count++] = group->fd; - break; - } - } + return ret; } - /* Bus reset! */ - ret = ioctl(vdev->vbasedev.fd, VFIO_DEVICE_PCI_HOT_RESET, reset); - g_free(reset); - - trace_vfio_pci_hot_reset_result(vdev->vbasedev.name, - ret ? "%m" : "Success"); - -out: - /* Re-enable INTx on affected devices */ - for (i = 0; i < info->count; i++) { - PCIHostDeviceAddress host; - VFIOPCIDevice *tmp; - VFIODevice *vbasedev_iter; - - host.domain = devices[i].segment; - host.bus = devices[i].bus; - host.slot = PCI_SLOT(devices[i].devfn); - host.function = PCI_FUNC(devices[i].devfn); - - if (vfio_pci_host_match(&host, vdev->vbasedev.name)) { - continue; - } - - QLIST_FOREACH(group, &vfio_group_list, next) { - if (group->groupid == devices[i].group_id) { - break; - } - } - - if (!group) { - break; - } + *info_p = info; + return 0; +} - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (!vbasedev_iter->dev->realized || - vbasedev_iter->type != VFIO_DEVICE_TYPE_PCI) { - continue; - } - tmp = container_of(vbasedev_iter, VFIOPCIDevice, vbasedev); - if (vfio_pci_host_match(&host, tmp->vbasedev.name)) { - vfio_pci_post_reset(tmp); - break; - } - } - } -out_single: - if (!single) { - vfio_pci_post_reset(vdev); - } - g_free(info); +static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool single) +{ + VFIODevice *vbasedev = &vdev->vbasedev; + const VFIOIOMMUClass *ops = vbasedev->bcontainer->ops; - return ret; + return ops->pci_hot_reset(vbasedev, single); } /* @@ -2431,14 +2544,45 @@ static bool vfio_msix_present(void *opaque, int version_id) return msix_present(pdev); } -const VMStateDescription vmstate_vfio_pci_config = { +static bool vfio_display_migration_needed(void *opaque) +{ + VFIOPCIDevice *vdev = opaque; + + /* + * We need to migrate the VFIODisplay object if ramfb *migration* was + * explicitly requested (in which case we enforced both ramfb=on and + * display=on), or ramfb migration was left at the default "auto" + * setting, and *ramfb* was explicitly requested (in which case we + * enforced display=on). + */ + return vdev->ramfb_migrate == ON_OFF_AUTO_ON || + (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO && vdev->enable_ramfb); +} + +static const VMStateDescription vmstate_vfio_display = { + .name = "VFIOPCIDevice/VFIODisplay", + .version_id = 1, + .minimum_version_id = 1, + .needed = vfio_display_migration_needed, + .fields = (const VMStateField[]){ + VMSTATE_STRUCT_POINTER(dpy, VFIOPCIDevice, vfio_display_vmstate, + VFIODisplay), + VMSTATE_END_OF_LIST() + } +}; + +static const VMStateDescription vmstate_vfio_pci_config = { .name = "VFIOPCIDevice", .version_id = 1, .minimum_version_id = 1, - .fields = (VMStateField[]) { + .fields = (const VMStateField[]) { VMSTATE_PCI_DEVICE(pdev, VFIOPCIDevice), VMSTATE_MSIX_TEST(pdev, VFIOPCIDevice, vfio_msix_present), VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription * const []) { + &vmstate_vfio_display, + NULL } }; @@ -2453,7 +2597,12 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) { VFIOPCIDevice *vdev = container_of(vbasedev, VFIOPCIDevice, vbasedev); PCIDevice *pdev = &vdev->pdev; - int ret; + pcibus_t old_addr[PCI_NUM_REGIONS - 1]; + int bar, ret; + + for (bar = 0; bar < PCI_ROM_SLOT; bar++) { + old_addr[bar] = pdev->io_regions[bar].addr; + } ret = vmstate_load_state(f, &vmstate_vfio_pci_config, vdev, 1); if (ret) { @@ -2463,6 +2612,18 @@ static int vfio_pci_load_config(VFIODevice *vbasedev, QEMUFile *f) vfio_pci_write_config(pdev, PCI_COMMAND, pci_get_word(pdev->config + PCI_COMMAND), 2); + for (bar = 0; bar < PCI_ROM_SLOT; bar++) { + /* + * The address may not be changed in some scenarios + * (e.g. the VF driver isn't loaded in VM). + */ + if (old_addr[bar] != pdev->io_regions[bar].addr && + vdev->bars[bar].region.size > 0 && + vdev->bars[bar].region.size < qemu_real_host_page_size()) { + vfio_sub_page_bar_update_mapping(pdev, bar); + } + } + if (msi_enabled(pdev)) { vfio_msi_enable(vdev); } else if (msix_enabled(pdev)) { @@ -2632,12 +2793,12 @@ static void vfio_populate_device(VFIOPCIDevice *vdev, Error **errp) } } -static void vfio_put_device(VFIOPCIDevice *vdev) +static void vfio_pci_put_device(VFIOPCIDevice *vdev) { + vfio_detach_device(&vdev->vbasedev); + g_free(vdev->vbasedev.name); g_free(vdev->msix); - - vfio_put_base_device(&vdev->vbasedev); } static void vfio_err_notifier_handler(void *opaque) @@ -2782,99 +2943,68 @@ static void vfio_unregister_req_notifier(VFIOPCIDevice *vdev) static void vfio_realize(PCIDevice *pdev, Error **errp) { + ERRP_GUARD(); VFIOPCIDevice *vdev = VFIO_PCI(pdev); - VFIODevice *vbasedev_iter; - VFIOGroup *group; - char *tmp, *subsys, group_path[PATH_MAX], *group_name; + VFIODevice *vbasedev = &vdev->vbasedev; + char *tmp, *subsys; Error *err = NULL; - ssize_t len; - struct stat st; - int groupid; int i, ret; bool is_mdev; + char uuid[UUID_STR_LEN]; + char *name; - if (!vdev->vbasedev.sysfsdev) { + if (vbasedev->fd < 0 && !vbasedev->sysfsdev) { if (!(~vdev->host.domain || ~vdev->host.bus || ~vdev->host.slot || ~vdev->host.function)) { error_setg(errp, "No provided host device"); error_append_hint(errp, "Use -device vfio-pci,host=DDDD:BB:DD.F " +#ifdef CONFIG_IOMMUFD + "or -device vfio-pci,fd=DEVICE_FD " +#endif "or -device vfio-pci,sysfsdev=PATH_TO_DEVICE\n"); return; } - vdev->vbasedev.sysfsdev = + vbasedev->sysfsdev = g_strdup_printf("/sys/bus/pci/devices/%04x:%02x:%02x.%01x", vdev->host.domain, vdev->host.bus, vdev->host.slot, vdev->host.function); } - if (stat(vdev->vbasedev.sysfsdev, &st) < 0) { - error_setg_errno(errp, errno, "no such host device"); - error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.sysfsdev); + if (vfio_device_get_name(vbasedev, errp) < 0) { return; } - vdev->vbasedev.name = g_path_get_basename(vdev->vbasedev.sysfsdev); - vdev->vbasedev.ops = &vfio_pci_ops; - vdev->vbasedev.type = VFIO_DEVICE_TYPE_PCI; - vdev->vbasedev.dev = DEVICE(vdev); - - tmp = g_strdup_printf("%s/iommu_group", vdev->vbasedev.sysfsdev); - len = readlink(tmp, group_path, sizeof(group_path)); - g_free(tmp); - - if (len <= 0 || len >= sizeof(group_path)) { - error_setg_errno(errp, len < 0 ? errno : ENAMETOOLONG, - "no iommu_group found"); - goto error; - } - - group_path[len] = 0; - - group_name = basename(group_path); - if (sscanf(group_name, "%d", &groupid) != 1) { - error_setg_errno(errp, errno, "failed to read %s", group_path); - goto error; - } - - trace_vfio_realize(vdev->vbasedev.name, groupid); - - group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp); - if (!group) { - goto error; - } - - QLIST_FOREACH(vbasedev_iter, &group->device_list, next) { - if (strcmp(vbasedev_iter->name, vdev->vbasedev.name) == 0) { - error_setg(errp, "device is already attached"); - vfio_put_group(group); - goto error; - } - } - /* * Mediated devices *might* operate compatibly with discarding of RAM, but * we cannot know for certain, it depends on whether the mdev vendor driver * stays in sync with the active working set of the guest driver. Prevent * the x-balloon-allowed option unless this is minimally an mdev device. */ - tmp = g_strdup_printf("%s/subsystem", vdev->vbasedev.sysfsdev); + tmp = g_strdup_printf("%s/subsystem", vbasedev->sysfsdev); subsys = realpath(tmp, NULL); g_free(tmp); is_mdev = subsys && (strcmp(subsys, "/sys/bus/mdev") == 0); free(subsys); - trace_vfio_mdev(vdev->vbasedev.name, is_mdev); + trace_vfio_mdev(vbasedev->name, is_mdev); - if (vdev->vbasedev.ram_block_discard_allowed && !is_mdev) { + if (vbasedev->ram_block_discard_allowed && !is_mdev) { error_setg(errp, "x-balloon-allowed only potentially compatible " "with mdev devices"); - vfio_put_group(group); goto error; } - ret = vfio_get_device(group, vdev->vbasedev.name, &vdev->vbasedev, errp); + if (!qemu_uuid_is_null(&vdev->vf_token)) { + qemu_uuid_unparse(&vdev->vf_token, uuid); + name = g_strdup_printf("%s vf_token=%s", vbasedev->name, uuid); + } else { + name = g_strdup(vbasedev->name); + } + + ret = vfio_attach_device(name, vbasedev, + pci_device_iommu_address_space(pdev), errp); + g_free(name); if (ret) { - vfio_put_group(group); goto error; } @@ -2885,7 +3015,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } /* Get a copy of config space */ - ret = pread(vdev->vbasedev.fd, vdev->pdev.config, + ret = pread(vbasedev->fd, vdev->pdev.config, MIN(pci_config_size(&vdev->pdev), vdev->config_size), vdev->config_offset); if (ret < (int)MIN(pci_config_size(&vdev->pdev), vdev->config_size)) { @@ -2913,7 +3043,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } vfio_add_emulated_word(vdev, PCI_VENDOR_ID, vdev->vendor_id, ~0); - trace_vfio_pci_emulated_vendor_id(vdev->vbasedev.name, vdev->vendor_id); + trace_vfio_pci_emulated_vendor_id(vbasedev->name, vdev->vendor_id); } else { vdev->vendor_id = pci_get_word(pdev->config + PCI_VENDOR_ID); } @@ -2924,7 +3054,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } vfio_add_emulated_word(vdev, PCI_DEVICE_ID, vdev->device_id, ~0); - trace_vfio_pci_emulated_device_id(vdev->vbasedev.name, vdev->device_id); + trace_vfio_pci_emulated_device_id(vbasedev->name, vdev->device_id); } else { vdev->device_id = pci_get_word(pdev->config + PCI_DEVICE_ID); } @@ -2936,7 +3066,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_VENDOR_ID, vdev->sub_vendor_id, ~0); - trace_vfio_pci_emulated_sub_vendor_id(vdev->vbasedev.name, + trace_vfio_pci_emulated_sub_vendor_id(vbasedev->name, vdev->sub_vendor_id); } @@ -2946,7 +3076,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto error; } vfio_add_emulated_word(vdev, PCI_SUBSYSTEM_ID, vdev->sub_device_id, ~0); - trace_vfio_pci_emulated_sub_device_id(vdev->vbasedev.name, + trace_vfio_pci_emulated_sub_device_id(vbasedev->name, vdev->sub_device_id); } @@ -3005,7 +3135,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) goto out_teardown; } - ret = vfio_get_dev_region_info(&vdev->vbasedev, + ret = vfio_get_dev_region_info(vbasedev, VFIO_REGION_TYPE_PCI_VENDOR_TYPE | PCI_VENDOR_ID_INTEL, VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION, &opregion); if (ret) { @@ -3066,24 +3196,23 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) } } - if (vfio_pci_is(vdev, PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID)) { - ret = vfio_pci_nvidia_v100_ram_init(vdev, errp); - if (ret && ret != -ENODEV) { - error_report("Failed to setup NVIDIA V100 GPU RAM"); - } + if (vdev->ramfb_migrate == ON_OFF_AUTO_ON && !vdev->enable_ramfb) { + warn_report("x-ramfb-migrate=on but ramfb=off. " + "Forcing x-ramfb-migrate to off."); + vdev->ramfb_migrate = ON_OFF_AUTO_OFF; } - - if (vfio_pci_is(vdev, PCI_VENDOR_ID_IBM, PCI_ANY_ID)) { - ret = vfio_pci_nvlink2_init(vdev, errp); - if (ret && ret != -ENODEV) { - error_report("Failed to setup NVlink2 bridge"); + if (vbasedev->enable_migration == ON_OFF_AUTO_OFF) { + if (vdev->ramfb_migrate == ON_OFF_AUTO_AUTO) { + vdev->ramfb_migrate = ON_OFF_AUTO_OFF; + } else if (vdev->ramfb_migrate == ON_OFF_AUTO_ON) { + error_setg(errp, "x-ramfb-migrate requires enable-migration"); + goto out_deregister; } } if (!pdev->failover_pair_id) { - ret = vfio_migration_probe(&vdev->vbasedev, errp); - if (ret) { - error_report("%s: Migration disabled", vdev->vbasedev.name); + if (!vfio_migration_realize(vbasedev, errp)) { + goto out_deregister; } } @@ -3094,19 +3223,26 @@ static void vfio_realize(PCIDevice *pdev, Error **errp) return; out_deregister: + if (vdev->interrupt == VFIO_INT_INTx) { + vfio_intx_disable(vdev); + } pci_device_set_intx_routing_notifier(&vdev->pdev, NULL); - kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + if (vdev->irqchip_change_notifier.notify) { + kvm_irqchip_remove_change_notifier(&vdev->irqchip_change_notifier); + } + if (vdev->intx.mmap_timer) { + timer_free(vdev->intx.mmap_timer); + } out_teardown: vfio_teardown_msi(vdev); vfio_bars_exit(vdev); error: - error_prepend(errp, VFIO_MSG_PREFIX, vdev->vbasedev.name); + error_prepend(errp, VFIO_MSG_PREFIX, vbasedev->name); } static void vfio_instance_finalize(Object *obj) { VFIOPCIDevice *vdev = VFIO_PCI(obj); - VFIOGroup *group = vdev->vbasedev.group; vfio_display_finalize(vdev); vfio_bars_finalize(vdev); @@ -3119,8 +3255,7 @@ static void vfio_instance_finalize(Object *obj) * * g_free(vdev->igd_opregion); */ - vfio_put_device(vdev); - vfio_put_group(group); + vfio_pci_put_device(vdev); } static void vfio_exitfn(PCIDevice *pdev) @@ -3138,8 +3273,9 @@ static void vfio_exitfn(PCIDevice *pdev) timer_free(vdev->intx.mmap_timer); } vfio_teardown_msi(vdev); + vfio_pci_disable_rp_atomics(vdev); vfio_bars_exit(vdev); - vfio_migration_finalize(&vdev->vbasedev); + vfio_migration_exit(&vdev->vbasedev); } static void vfio_pci_reset(DeviceState *dev) @@ -3185,6 +3321,7 @@ static void vfio_instance_init(Object *obj) { PCIDevice *pci_dev = PCI_DEVICE(obj); VFIOPCIDevice *vdev = VFIO_PCI(obj); + VFIODevice *vbasedev = &vdev->vbasedev; device_add_bootindex_property(obj, &vdev->bootindex, "bootindex", NULL, @@ -3194,6 +3331,9 @@ static void vfio_instance_init(Object *obj) vdev->host.slot = ~0U; vdev->host.function = ~0U; + vfio_device_init(vbasedev, VFIO_DEVICE_TYPE_PCI, &vfio_pci_ops, + DEVICE(vdev), false); + vdev->nv_gpudirect_clique = 0xFF; /* QEMU_PCI_CAP_EXPRESS initialization does not depend on QEMU command @@ -3203,6 +3343,7 @@ static void vfio_instance_init(Object *obj) static Property vfio_pci_dev_properties[] = { DEFINE_PROP_PCI_HOST_DEVADDR("host", VFIOPCIDevice, host), + DEFINE_PROP_UUID_NODEFAULT("vf-token", VFIOPCIDevice, vf_token), DEFINE_PROP_STRING("sysfsdev", VFIOPCIDevice, vbasedev.sysfsdev), DEFINE_PROP_ON_OFF_AUTO("x-pre-copy-dirty-page-tracking", VFIOPCIDevice, vbasedev.pre_copy_dirty_page_tracking, @@ -3219,8 +3360,8 @@ static Property vfio_pci_dev_properties[] = { VFIO_FEATURE_ENABLE_REQ_BIT, true), DEFINE_PROP_BIT("x-igd-opregion", VFIOPCIDevice, features, VFIO_FEATURE_ENABLE_IGD_OPREGION_BIT, false), - DEFINE_PROP_BOOL("x-enable-migration", VFIOPCIDevice, - vbasedev.enable_migration, false), + DEFINE_PROP_ON_OFF_AUTO("enable-migration", VFIOPCIDevice, + vbasedev.enable_migration, ON_OFF_AUTO_AUTO), DEFINE_PROP_BOOL("x-no-mmap", VFIOPCIDevice, vbasedev.no_mmap, false), DEFINE_PROP_BOOL("x-balloon-allowed", VFIOPCIDevice, vbasedev.ram_block_discard_allowed, false), @@ -3245,14 +3386,20 @@ static Property vfio_pci_dev_properties[] = { qdev_prop_nv_gpudirect_clique, uint8_t), DEFINE_PROP_OFF_AUTO_PCIBAR("x-msix-relocation", VFIOPCIDevice, msix_relo, OFF_AUTOPCIBAR_OFF), - /* - * TODO - support passed fds... is this necessary? - * DEFINE_PROP_STRING("vfiofd", VFIOPCIDevice, vfiofd_name), - * DEFINE_PROP_STRING("vfiogroupfd, VFIOPCIDevice, vfiogroupfd_name), - */ +#ifdef CONFIG_IOMMUFD + DEFINE_PROP_LINK("iommufd", VFIOPCIDevice, vbasedev.iommufd, + TYPE_IOMMUFD_BACKEND, IOMMUFDBackend *), +#endif DEFINE_PROP_END_OF_LIST(), }; +#ifdef CONFIG_IOMMUFD +static void vfio_pci_set_fd(Object *obj, const char *str, Error **errp) +{ + vfio_device_set_fd(&VFIO_PCI(obj)->vbasedev, str, errp); +} +#endif + static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) { DeviceClass *dc = DEVICE_CLASS(klass); @@ -3260,6 +3407,9 @@ static void vfio_pci_dev_class_init(ObjectClass *klass, void *data) dc->reset = vfio_pci_reset; device_class_set_props(dc, vfio_pci_dev_properties); +#ifdef CONFIG_IOMMUFD + object_class_property_add_str(klass, "fd", NULL, vfio_pci_set_fd); +#endif dc->desc = "VFIO-based PCI device assignment"; set_bit(DEVICE_CATEGORY_MISC, dc->categories); pdc->realize = vfio_realize; @@ -3284,6 +3434,8 @@ static const TypeInfo vfio_pci_dev_info = { static Property vfio_pci_dev_nohotplug_properties[] = { DEFINE_PROP_BOOL("ramfb", VFIOPCIDevice, enable_ramfb, false), + DEFINE_PROP_ON_OFF_AUTO("x-ramfb-migrate", VFIOPCIDevice, ramfb_migrate, + ON_OFF_AUTO_AUTO), DEFINE_PROP_END_OF_LIST(), }; |