aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Peter Maydell <peter.maydell@linaro.org> 2018-05-24 14:22:23 +0100
committer Peter Maydell <peter.maydell@linaro.org> 2018-05-24 14:22:23 +0100
commit 45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d (patch)
tree ff8ceaa4ff25dcc341bd03ce64b7d5a40e13028c
parent 37cbe4da617e87778ea324c3f5d08ba780bed1ea (diff)
parent 63b88968f139b6a77f2f81e6f1eedf70c0170a85 (diff)
download qemu-arm-45eabb2ede0caf2f3dd0e1ace045d0915c53ef4d.tar.gz
Merge remote-tracking branch 'remotes/mst/tags/for_upstream' into staging
pc, pci, virtio, vhost: fixes, features

Beginning of merging vDPA, new PCI ID, a new virtio balloon stat, intel iommu rework fixing a couple of security problems (no CVEs yet), fixes all over the place.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

# gpg: Signature made Wed 23 May 2018 15:41:32 BST
# gpg: using RSA key 281F0DB8D28D5469
# gpg: Good signature from "Michael S. Tsirkin <mst@kernel.org>"
# gpg: aka "Michael S. Tsirkin <mst@redhat.com>"
# Primary key fingerprint: 0270 606B 6F3C DF3D 0B17 0970 C350 3912 AFBE 8E67
# Subkey fingerprint: 5D09 FD08 71C8 F85B 94CA 8A0D 281F 0DB8 D28D 5469

* remotes/mst/tags/for_upstream: (28 commits)
  intel-iommu: rework the page walk logic
  util: implement simple iova tree
  intel-iommu: trace domain id during page walk
  intel-iommu: pass in address space when page walk
  intel-iommu: introduce vtd_page_walk_info
  intel-iommu: only do page walk for MAP notifiers
  intel-iommu: add iommu lock
  intel-iommu: remove IntelIOMMUNotifierNode
  intel-iommu: send PSI always even if across PDEs
  nvdimm: fix typo in label-size definition
  contrib/vhost-user-blk: enable protocol feature for vhost-user-blk
  hw/virtio: Fix brace Werror with clang 6.0.0
  libvhost-user: Send messages with no data
  vhost-user+postcopy: Use qemu_set_nonblock
  virtio: support setting memory region based host notifier
  vhost-user: support receiving file descriptors in slave_read
  vhost-user: add Net prefix to internal state structure
  linux-headers: add kvm header for mips
  linux-headers: add unistd.h on all arches
  update-linux-headers.sh: unistd.h, kvm consistency
  ...

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
-rw-r--r-- MAINTAINERS 6
-rw-r--r-- contrib/libvhost-user/libvhost-user.c 16
-rw-r--r-- contrib/vhost-user-blk/vhost-user-blk.c 7
-rw-r--r-- docs/specs/pci-ids.txt 1
-rw-r--r-- docs/virtio-balloon-stats.txt 2
-rw-r--r-- hw/i386/intel_iommu.c 396
-rw-r--r-- hw/i386/kvm/clock.c 2
-rw-r--r-- hw/i386/trace-events 5
-rw-r--r-- hw/mem/nvdimm.c 2
-rw-r--r-- hw/pci-host/q35.c 17
-rw-r--r-- hw/virtio/trace-events 1
-rw-r--r-- hw/virtio/vhost-user.c 45
-rw-r--r-- hw/virtio/vhost.c 7
-rw-r--r-- hw/virtio/virtio-balloon.c 2
-rw-r--r-- hw/virtio/virtio-pci.c 22
-rw-r--r-- hw/virtio/virtio.c 13
-rw-r--r-- include/hw/i386/intel_iommu.h 19
-rw-r--r-- include/hw/mem/nvdimm.h 2
-rw-r--r-- include/hw/pci/pci.h 1
-rw-r--r-- include/hw/virtio/virtio-bus.h 2
-rw-r--r-- include/hw/virtio/virtio.h 2
-rw-r--r-- include/qemu/iova-tree.h 134
-rw-r--r-- include/standard-headers/asm-x86/kvm_para.h (renamed from linux-headers/asm-x86/kvm_para.h) 49
-rw-r--r-- include/standard-headers/linux/virtio_balloon.h 4
-rw-r--r-- include/sysemu/kvm.h 1
-rw-r--r-- linux-headers/asm-arm/bitsperlong.h 1
-rw-r--r-- linux-headers/asm-arm/kvm_para.h 2
-rw-r--r-- linux-headers/asm-arm64/bitsperlong.h 24
-rw-r--r-- linux-headers/asm-arm64/kvm_para.h 1
-rw-r--r-- linux-headers/asm-generic/bitsperlong.h 16
-rw-r--r-- linux-headers/asm-generic/kvm_para.h 4
-rw-r--r-- linux-headers/asm-generic/unistd.h 781
-rw-r--r-- linux-headers/asm-mips/bitsperlong.h 9
-rw-r--r-- linux-headers/asm-mips/kvm.h 25
-rw-r--r-- linux-headers/asm-mips/kvm_para.h 5
-rw-r--r-- linux-headers/asm-mips/sgidefs.h 45
-rw-r--r-- linux-headers/asm-mips/unistd.h 44
-rw-r--r-- linux-headers/asm-powerpc/bitsperlong.h 13
-rw-r--r-- linux-headers/asm-powerpc/epapr_hcalls.h 99
-rw-r--r-- linux-headers/asm-powerpc/kvm_para.h 98
-rw-r--r-- linux-headers/asm-s390/bitsperlong.h 14
-rw-r--r-- linux-headers/asm-s390/kvm_para.h 8
-rw-r--r-- linux-headers/asm-s390/unistd_32.h 1
-rw-r--r-- linux-headers/asm-s390/unistd_64.h 1
-rw-r--r-- linux-headers/asm-x86/bitsperlong.h 14
-rw-r--r-- linux-headers/linux/kvm_para.h 35
-rw-r--r-- net/vhost-user.c 38
-rwxr-xr-x scripts/update-linux-headers.sh 20
-rw-r--r-- target/i386/cpu.c 4
-rw-r--r-- target/i386/cpu.h 2
-rw-r--r-- target/i386/kvm.c 4
-rw-r--r-- target/i386/kvm_i386.h 6
-rw-r--r-- util/Makefile.objs 1
-rw-r--r-- util/iova-tree.c 114
54 files changed, 1731 insertions, 456 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index 4a0bc1e71b..5b335aa948 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1787,6 +1787,12 @@ F: include/sysemu/replay.h
F: docs/replay.txt
F: stubs/replay.c
+IOVA Tree
+M: Peter Xu <peterx@redhat.com>
+S: Maintained
+F: include/qemu/iova-tree.h
+F: util/iova-tree.c
+
Usermode Emulation
------------------
Overall
diff --git a/contrib/libvhost-user/libvhost-user.c b/contrib/libvhost-user/libvhost-user.c
index beeed0c43f..54e643d871 100644
--- a/contrib/libvhost-user/libvhost-user.c
+++ b/contrib/libvhost-user/libvhost-user.c
@@ -323,13 +323,15 @@ vu_message_write(VuDev *dev, int conn_fd, VhostUserMsg *vmsg)
rc = sendmsg(conn_fd, &msg, 0);
} while (rc < 0 && (errno == EINTR || errno == EAGAIN));
- do {
- if (vmsg->data) {
- rc = write(conn_fd, vmsg->data, vmsg->size);
- } else {
- rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
- }
- } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+ if (vmsg->size) {
+ do {
+ if (vmsg->data) {
+ rc = write(conn_fd, vmsg->data, vmsg->size);
+ } else {
+ rc = write(conn_fd, p + VHOST_USER_HDR_SIZE, vmsg->size);
+ }
+ } while (rc < 0 && (errno == EINTR || errno == EAGAIN));
+ }
if (rc <= 0) {
vu_panic(dev, "Error while writing: %s", strerror(errno));
diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c
index 67dac8155a..a6a132a492 100644
--- a/contrib/vhost-user-blk/vhost-user-blk.c
+++ b/contrib/vhost-user-blk/vhost-user-blk.c
@@ -311,6 +311,12 @@ vub_get_features(VuDev *dev)
1ull << VHOST_USER_F_PROTOCOL_FEATURES;
}
+static uint64_t
+vub_get_protocol_features(VuDev *dev)
+{
+ return 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
+}
+
static int
vub_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
{
@@ -373,6 +379,7 @@ vub_set_config(VuDev *vu_dev, const uint8_t *data,
static const VuDevIface vub_iface = {
.get_features = vub_get_features,
.queue_set_started = vub_queue_set_started,
+ .get_protocol_features = vub_get_protocol_features,
.get_config = vub_get_config,
.set_config = vub_set_config,
};
diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt
index bb99a0257e..4d53e5c7d9 100644
--- a/docs/specs/pci-ids.txt
+++ b/docs/specs/pci-ids.txt
@@ -62,6 +62,7 @@ PCI devices (other than virtio):
1b36:000a PCI-PCI bridge (multiseat)
1b36:000b PCIe Expander Bridge (-device pxb-pcie)
1b36:000d PCI xhci usb host adapter
+1b36:000f mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c
All these devices are documented in docs/specs.
diff --git a/docs/virtio-balloon-stats.txt b/docs/virtio-balloon-stats.txt
index 7a66d25da5..9985e1dffc 100644
--- a/docs/virtio-balloon-stats.txt
+++ b/docs/virtio-balloon-stats.txt
@@ -34,6 +34,8 @@ which will return a dictionary containing:
- stat-total-memory
- stat-available-memory
- stat-disk-caches
+ - stat-htlb-pgalloc
+ - stat-htlb-pgfail
o A key named last-update, which contains the last stats update
timestamp in seconds. Since this timestamp is generated by the host,
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index fb31de9416..b5a09b7908 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -128,6 +128,22 @@ static uint64_t vtd_set_clear_mask_quad(IntelIOMMUState *s, hwaddr addr,
return new_val;
}
+static inline void vtd_iommu_lock(IntelIOMMUState *s)
+{
+ qemu_mutex_lock(&s->iommu_lock);
+}
+
+static inline void vtd_iommu_unlock(IntelIOMMUState *s)
+{
+ qemu_mutex_unlock(&s->iommu_lock);
+}
+
+/* Whether the address space needs to notify new mappings */
+static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as)
+{
+ return as->notifier_flags & IOMMU_NOTIFIER_MAP;
+}
+
/* GHashTable functions */
static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2)
{
@@ -172,9 +188,9 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value,
}
/* Reset all the gen of VTDAddressSpace to zero and set the gen of
- * IntelIOMMUState to 1.
+ * IntelIOMMUState to 1. Must be called with IOMMU lock held.
*/
-static void vtd_reset_context_cache(IntelIOMMUState *s)
+static void vtd_reset_context_cache_locked(IntelIOMMUState *s)
{
VTDAddressSpace *vtd_as;
VTDBus *vtd_bus;
@@ -197,12 +213,20 @@ static void vtd_reset_context_cache(IntelIOMMUState *s)
s->context_cache_gen = 1;
}
-static void vtd_reset_iotlb(IntelIOMMUState *s)
+/* Must be called with IOMMU lock held. */
+static void vtd_reset_iotlb_locked(IntelIOMMUState *s)
{
assert(s->iotlb);
g_hash_table_remove_all(s->iotlb);
}
+static void vtd_reset_iotlb(IntelIOMMUState *s)
+{
+ vtd_iommu_lock(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_iommu_unlock(s);
+}
+
static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
uint32_t level)
{
@@ -215,6 +239,7 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level)
return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K;
}
+/* Must be called with IOMMU lock held */
static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id,
hwaddr addr)
{
@@ -235,6 +260,7 @@ out:
return entry;
}
+/* Must be called with IOMMU lock held */
static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
uint16_t domain_id, hwaddr addr, uint64_t slpte,
uint8_t access_flags, uint32_t level)
@@ -246,7 +272,7 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id,
trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id);
if (g_hash_table_size(s->iotlb) >= VTD_IOTLB_MAX_SIZE) {
trace_vtd_iotlb_reset("iotlb exceeds size limit");
- vtd_reset_iotlb(s);
+ vtd_reset_iotlb_locked(s);
}
entry->gfn = gfn;
@@ -723,22 +749,116 @@ static int vtd_iova_to_slpte(VTDContextEntry *ce, uint64_t iova, bool is_write,
typedef int (*vtd_page_walk_hook)(IOMMUTLBEntry *entry, void *private);
/**
+ * Constant information used during page walking
+ *
+ * @hook_fn: hook func to be called when detected page
+ * @private: private data to be passed into hook func
+ * @notify_unmap: whether we should notify invalid entries
+ * @as: VT-d address space of the device
+ * @aw: maximum address width
+ * @domain_id: domain ID of the page walk
+ */
+typedef struct {
+ VTDAddressSpace *as;
+ vtd_page_walk_hook hook_fn;
+ void *private;
+ bool notify_unmap;
+ uint8_t aw;
+ uint16_t domain_id;
+} vtd_page_walk_info;
+
+static int vtd_page_walk_one(IOMMUTLBEntry *entry, vtd_page_walk_info *info)
+{
+ VTDAddressSpace *as = info->as;
+ vtd_page_walk_hook hook_fn = info->hook_fn;
+ void *private = info->private;
+ DMAMap target = {
+ .iova = entry->iova,
+ .size = entry->addr_mask,
+ .translated_addr = entry->translated_addr,
+ .perm = entry->perm,
+ };
+ DMAMap *mapped = iova_tree_find(as->iova_tree, &target);
+
+ if (entry->perm == IOMMU_NONE && !info->notify_unmap) {
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+
+ assert(hook_fn);
+
+ /* Update local IOVA mapped ranges */
+ if (entry->perm) {
+ if (mapped) {
+ /* If it's exactly the same translation, skip */
+ if (!memcmp(mapped, &target, sizeof(target))) {
+ trace_vtd_page_walk_one_skip_map(entry->iova, entry->addr_mask,
+ entry->translated_addr);
+ return 0;
+ } else {
+ /*
+ * Translation changed. Normally this should not
+ * happen, but it can happen with buggy guest
+ * OSes. Note that there will be a small window that
+ * we don't have map at all. But that's the best
+ * effort we can do. The ideal way to emulate this is
+ * atomically modify the PTE to follow what has
+ * changed, but we can't. One example is that vfio
+ * driver only has VFIO_IOMMU_[UN]MAP_DMA but no
+ * interface to modify a mapping (meanwhile it seems
+ * meaningless to even provide one). Anyway, let's
+ * mark this as a TODO in case one day we'll have
+ * a better solution.
+ */
+ IOMMUAccessFlags cache_perm = entry->perm;
+ int ret;
+
+ /* Emulate an UNMAP */
+ entry->perm = IOMMU_NONE;
+ trace_vtd_page_walk_one(info->domain_id,
+ entry->iova,
+ entry->translated_addr,
+ entry->addr_mask,
+ entry->perm);
+ ret = hook_fn(entry, private);
+ if (ret) {
+ return ret;
+ }
+ /* Drop any existing mapping */
+ iova_tree_remove(as->iova_tree, &target);
+ /* Recover the correct permission */
+ entry->perm = cache_perm;
+ }
+ }
+ iova_tree_insert(as->iova_tree, &target);
+ } else {
+ if (!mapped) {
+ /* Skip since we didn't map this range at all */
+ trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask);
+ return 0;
+ }
+ iova_tree_remove(as->iova_tree, &target);
+ }
+
+ trace_vtd_page_walk_one(info->domain_id, entry->iova,
+ entry->translated_addr, entry->addr_mask,
+ entry->perm);
+ return hook_fn(entry, private);
+}
+
+/**
* vtd_page_walk_level - walk over specific level for IOVA range
*
* @addr: base GPA addr to start the walk
* @start: IOVA range start address
* @end: IOVA range end address (start <= addr < end)
- * @hook_fn: hook func to be called when detected page
- * @private: private data to be passed into hook func
* @read: whether parent level has read permission
* @write: whether parent level has write permission
- * @notify_unmap: whether we should notify invalid entries
- * @aw: maximum address width
+ * @info: constant information for the page walk
*/
static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
- uint64_t end, vtd_page_walk_hook hook_fn,
- void *private, uint32_t level, bool read,
- bool write, bool notify_unmap, uint8_t aw)
+ uint64_t end, uint32_t level, bool read,
+ bool write, vtd_page_walk_info *info)
{
bool read_cur, write_cur, entry_valid;
uint32_t offset;
@@ -781,37 +901,34 @@ static int vtd_page_walk_level(dma_addr_t addr, uint64_t start,
*/
entry_valid = read_cur | write_cur;
- if (vtd_is_last_slpte(slpte, level)) {
+ if (!vtd_is_last_slpte(slpte, level) && entry_valid) {
+ /*
+ * This is a valid PDE (or even bigger than PDE). We need
+ * to walk one further level.
+ */
+ ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, info->aw),
+ iova, MIN(iova_next, end), level - 1,
+ read_cur, write_cur, info);
+ } else {
+ /*
+ * This means we are either:
+ *
+ * (1) the real page entry (either 4K page, or huge page)
+ * (2) the whole range is invalid
+ *
+ * In either case, we send an IOTLB notification down.
+ */
entry.target_as = &address_space_memory;
entry.iova = iova & subpage_mask;
- /* NOTE: this is only meaningful if entry_valid == true */
- entry.translated_addr = vtd_get_slpte_addr(slpte, aw);
- entry.addr_mask = ~subpage_mask;
entry.perm = IOMMU_ACCESS_FLAG(read_cur, write_cur);
- if (!entry_valid && !notify_unmap) {
- trace_vtd_page_walk_skip_perm(iova, iova_next);
- goto next;
- }
- trace_vtd_page_walk_one(level, entry.iova, entry.translated_addr,
- entry.addr_mask, entry.perm);
- if (hook_fn) {
- ret = hook_fn(&entry, private);
- if (ret < 0) {
- return ret;
- }
- }
- } else {
- if (!entry_valid) {
- trace_vtd_page_walk_skip_perm(iova, iova_next);
- goto next;
- }
- ret = vtd_page_walk_level(vtd_get_slpte_addr(slpte, aw), iova,
- MIN(iova_next, end), hook_fn, private,
- level - 1, read_cur, write_cur,
- notify_unmap, aw);
- if (ret < 0) {
- return ret;
- }
+ entry.addr_mask = ~subpage_mask;
+ /* NOTE: this is only meaningful if entry_valid == true */
+ entry.translated_addr = vtd_get_slpte_addr(slpte, info->aw);
+ ret = vtd_page_walk_one(&entry, info);
+ }
+
+ if (ret < 0) {
+ return ret;
}
next:
@@ -827,28 +944,24 @@ next:
* @ce: context entry to walk upon
* @start: IOVA address to start the walk
* @end: IOVA range end address (start <= addr < end)
- * @hook_fn: the hook that to be called for each detected area
- * @private: private data for the hook function
- * @aw: maximum address width
+ * @info: page walking information struct
*/
static int vtd_page_walk(VTDContextEntry *ce, uint64_t start, uint64_t end,
- vtd_page_walk_hook hook_fn, void *private,
- bool notify_unmap, uint8_t aw)
+ vtd_page_walk_info *info)
{
dma_addr_t addr = vtd_ce_get_slpt_base(ce);
uint32_t level = vtd_ce_get_level(ce);
- if (!vtd_iova_range_check(start, ce, aw)) {
+ if (!vtd_iova_range_check(start, ce, info->aw)) {
return -VTD_FR_ADDR_BEYOND_MGAW;
}
- if (!vtd_iova_range_check(end, ce, aw)) {
+ if (!vtd_iova_range_check(end, ce, info->aw)) {
/* Fix end so that it reaches the maximum */
- end = vtd_iova_limit(ce, aw);
+ end = vtd_iova_limit(ce, info->aw);
}
- return vtd_page_walk_level(addr, start, end, hook_fn, private,
- level, true, true, notify_unmap, aw);
+ return vtd_page_walk_level(addr, start, end, level, true, true, info);
}
/* Map a device to its corresponding domain (context-entry) */
@@ -907,6 +1020,58 @@ static int vtd_dev_to_context_entry(IntelIOMMUState *s, uint8_t bus_num,
return 0;
}
+static int vtd_sync_shadow_page_hook(IOMMUTLBEntry *entry,
+ void *private)
+{
+ memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
+ return 0;
+}
+
+/* If context entry is NULL, we'll try to fetch it on our own. */
+static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as,
+ VTDContextEntry *ce,
+ hwaddr addr, hwaddr size)
+{
+ IntelIOMMUState *s = vtd_as->iommu_state;
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_sync_shadow_page_hook,
+ .private = (void *)&vtd_as->iommu,
+ .notify_unmap = true,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ };
+ VTDContextEntry ce_cache;
+ int ret;
+
+ if (ce) {
+ /* If the caller provided context entry, use it */
+ ce_cache = *ce;
+ } else {
+ /* If the caller didn't provide ce, try to fetch */
+ ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
+ vtd_as->devfn, &ce_cache);
+ if (ret) {
+ /*
+ * This should not really happen, but in case it happens,
+ * we just skip the sync for this time. After all we even
+ * don't have the root table pointer!
+ */
+ trace_vtd_err("Detected invalid context entry when "
+ "trying to sync shadow page table");
+ return 0;
+ }
+ }
+
+ info.domain_id = VTD_CONTEXT_ENTRY_DID(ce_cache.hi);
+
+ return vtd_page_walk(&ce_cache, addr, addr + size, &info);
+}
+
+static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as)
+{
+ return vtd_sync_shadow_page_table_range(vtd_as, NULL, 0, UINT64_MAX);
+}
+
/*
* Fetch translation type for specific device. Returns <0 if error
* happens, otherwise return the shifted type to check against
@@ -1088,7 +1253,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
IntelIOMMUState *s = vtd_as->iommu_state;
VTDContextEntry ce;
uint8_t bus_num = pci_bus_num(bus);
- VTDContextCacheEntry *cc_entry = &vtd_as->context_cache_entry;
+ VTDContextCacheEntry *cc_entry;
uint64_t slpte, page_mask;
uint32_t level;
uint16_t source_id = vtd_make_source_id(bus_num, devfn);
@@ -1105,6 +1270,10 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
*/
assert(!vtd_is_interrupt_addr(addr));
+ vtd_iommu_lock(s);
+
+ cc_entry = &vtd_as->context_cache_entry;
+
/* Try to fetch slpte form IOTLB */
iotlb_entry = vtd_lookup_iotlb(s, source_id, addr);
if (iotlb_entry) {
@@ -1164,7 +1333,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
* IOMMU region can be swapped back.
*/
vtd_pt_enable_fast_path(s, source_id);
-
+ vtd_iommu_unlock(s);
return true;
}
@@ -1185,6 +1354,7 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus,
vtd_update_iotlb(s, source_id, VTD_CONTEXT_ENTRY_DID(ce.hi), addr, slpte,
access_flags, level);
out:
+ vtd_iommu_unlock(s);
entry->iova = addr & page_mask;
entry->translated_addr = vtd_get_slpte_addr(slpte, s->aw_bits) & page_mask;
entry->addr_mask = ~page_mask;
@@ -1192,6 +1362,7 @@ out:
return true;
error:
+ vtd_iommu_unlock(s);
entry->iova = 0;
entry->translated_addr = 0;
entry->addr_mask = 0;
@@ -1230,20 +1401,23 @@ static void vtd_interrupt_remap_table_setup(IntelIOMMUState *s)
static void vtd_iommu_replay_all(IntelIOMMUState *s)
{
- IntelIOMMUNotifierNode *node;
+ VTDAddressSpace *vtd_as;
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- memory_region_iommu_replay_all(&node->vtd_as->iommu);
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
+ vtd_sync_shadow_page_table(vtd_as);
}
}
static void vtd_context_global_invalidate(IntelIOMMUState *s)
{
trace_vtd_inv_desc_cc_global();
+ /* Protects context cache */
+ vtd_iommu_lock(s);
s->context_cache_gen++;
if (s->context_cache_gen == VTD_CONTEXT_CACHE_GEN_MAX) {
- vtd_reset_context_cache(s);
+ vtd_reset_context_cache_locked(s);
}
+ vtd_iommu_unlock(s);
vtd_switch_address_space_all(s);
/*
* From VT-d spec 6.5.2.1, a global context entry invalidation
@@ -1295,7 +1469,9 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
if (vtd_as && ((devfn_it & mask) == (devfn & mask))) {
trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it),
VTD_PCI_FUNC(devfn_it));
+ vtd_iommu_lock(s);
vtd_as->context_cache_entry.context_cache_gen = 0;
+ vtd_iommu_unlock(s);
/*
* Do switch address space when needed, in case if the
* device passthrough bit is switched.
@@ -1303,14 +1479,13 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
vtd_switch_address_space(vtd_as);
/*
* So a device is moving out of (or moving into) a
- * domain, a replay() suites here to notify all the
- * IOMMU_NOTIFIER_MAP registers about this change.
+ * domain, resync the shadow page table.
* This won't bring bad even if we have no such
* notifier registered - the IOMMU notification
* framework will skip MAP notifications if that
* happened.
*/
- memory_region_iommu_replay_all(&vtd_as->iommu);
+ vtd_sync_shadow_page_table(vtd_as);
}
}
}
@@ -1354,48 +1529,60 @@ static void vtd_iotlb_global_invalidate(IntelIOMMUState *s)
static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id)
{
- IntelIOMMUNotifierNode *node;
VTDContextEntry ce;
VTDAddressSpace *vtd_as;
trace_vtd_inv_desc_iotlb_domain(domain_id);
+ vtd_iommu_lock(s);
g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_domain,
&domain_id);
+ vtd_iommu_unlock(s);
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce) &&
domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
- memory_region_iommu_replay_all(&vtd_as->iommu);
+ vtd_sync_shadow_page_table(vtd_as);
}
}
}
-static int vtd_page_invalidate_notify_hook(IOMMUTLBEntry *entry,
- void *private)
-{
- memory_region_notify_iommu((IOMMUMemoryRegion *)private, *entry);
- return 0;
-}
-
static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s,
uint16_t domain_id, hwaddr addr,
uint8_t am)
{
- IntelIOMMUNotifierNode *node;
+ VTDAddressSpace *vtd_as;
VTDContextEntry ce;
int ret;
+ hwaddr size = (1 << am) * VTD_PAGE_SIZE;
- QLIST_FOREACH(node, &(s->notifiers_list), next) {
- VTDAddressSpace *vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) {
ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus),
vtd_as->devfn, &ce);
if (!ret && domain_id == VTD_CONTEXT_ENTRY_DID(ce.hi)) {
- vtd_page_walk(&ce, addr, addr + (1 << am) * VTD_PAGE_SIZE,
- vtd_page_invalidate_notify_hook,
- (void *)&vtd_as->iommu, true, s->aw_bits);
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /*
+ * As long as we have MAP notifications registered in
+ * any of our IOMMU notifiers, we need to sync the
+ * shadow page table.
+ */
+ vtd_sync_shadow_page_table_range(vtd_as, &ce, addr, size);
+ } else {
+ /*
+ * For UNMAP-only notifiers, we don't need to walk the
+ * page tables. We just deliver the PSI down to
+ * invalidate caches.
+ */
+ IOMMUTLBEntry entry = {
+ .target_as = &address_space_memory,
+ .iova = addr,
+ .translated_addr = 0,
+ .addr_mask = size - 1,
+ .perm = IOMMU_NONE,
+ };
+ memory_region_notify_iommu(&vtd_as->iommu, entry);
+ }
}
}
}
@@ -1411,7 +1598,9 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
info.domain_id = domain_id;
info.addr = addr;
info.mask = ~((1 << am) - 1);
+ vtd_iommu_lock(s);
g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info);
+ vtd_iommu_unlock(s);
vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am);
}
@@ -2326,8 +2515,6 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
{
VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu);
IntelIOMMUState *s = vtd_as->iommu_state;
- IntelIOMMUNotifierNode *node = NULL;
- IntelIOMMUNotifierNode *next_node = NULL;
if (!s->caching_mode && new & IOMMU_NOTIFIER_MAP) {
error_report("We need to set caching-mode=1 for intel-iommu to enable "
@@ -2335,22 +2522,13 @@ static void vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu,
exit(1);
}
- if (old == IOMMU_NOTIFIER_NONE) {
- node = g_malloc0(sizeof(*node));
- node->vtd_as = vtd_as;
- QLIST_INSERT_HEAD(&s->notifiers_list, node, next);
- return;
- }
+ /* Update per-address-space notifier flags */
+ vtd_as->notifier_flags = new;
- /* update notifier node with new flags */
- QLIST_FOREACH_SAFE(node, &s->notifiers_list, next, next_node) {
- if (node->vtd_as == vtd_as) {
- if (new == IOMMU_NOTIFIER_NONE) {
- QLIST_REMOVE(node, next);
- g_free(node);
- }
- return;
- }
+ if (old == IOMMU_NOTIFIER_NONE) {
+ QLIST_INSERT_HEAD(&s->vtd_as_with_notifiers, vtd_as, next);
+ } else if (new == IOMMU_NOTIFIER_NONE) {
+ QLIST_REMOVE(vtd_as, next);
}
}
@@ -2719,6 +2897,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
vtd_dev_as->devfn = (uint8_t)devfn;
vtd_dev_as->iommu_state = s;
vtd_dev_as->context_cache_entry.context_cache_gen = 0;
+ vtd_dev_as->iova_tree = iova_tree_new();
/*
* Memory region relationships looks like (Address range shows
@@ -2771,6 +2950,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
hwaddr start = n->start;
hwaddr end = n->end;
IntelIOMMUState *s = as->iommu_state;
+ DMAMap map;
/*
* Note: all the codes in this function has a assumption that IOVA
@@ -2815,17 +2995,19 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
VTD_PCI_FUNC(as->devfn),
entry.iova, size);
+ map.iova = entry.iova;
+ map.size = entry.addr_mask;
+ iova_tree_remove(as->iova_tree, &map);
+
memory_region_notify_one(n, &entry);
}
static void vtd_address_space_unmap_all(IntelIOMMUState *s)
{
- IntelIOMMUNotifierNode *node;
VTDAddressSpace *vtd_as;
IOMMUNotifier *n;
- QLIST_FOREACH(node, &s->notifiers_list, next) {
- vtd_as = node->vtd_as;
+ QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) {
IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) {
vtd_address_space_unmap(vtd_as, n);
}
@@ -2857,8 +3039,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n)
PCI_FUNC(vtd_as->devfn),
VTD_CONTEXT_ENTRY_DID(ce.hi),
ce.hi, ce.lo);
- vtd_page_walk(&ce, 0, ~0ULL, vtd_replay_hook, (void *)n, false,
- s->aw_bits);
+ if (vtd_as_has_map_notifier(vtd_as)) {
+ /* This is required only for MAP typed notifiers */
+ vtd_page_walk_info info = {
+ .hook_fn = vtd_replay_hook,
+ .private = (void *)n,
+ .notify_unmap = false,
+ .aw = s->aw_bits,
+ .as = vtd_as,
+ .domain_id = VTD_CONTEXT_ENTRY_DID(ce.hi),
+ };
+
+ vtd_page_walk(&ce, 0, ~0ULL, &info);
+ }
} else {
trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn),
PCI_FUNC(vtd_as->devfn));
@@ -2930,8 +3123,10 @@ static void vtd_init(IntelIOMMUState *s)
s->cap |= VTD_CAP_CM;
}
- vtd_reset_context_cache(s);
- vtd_reset_iotlb(s);
+ vtd_iommu_lock(s);
+ vtd_reset_context_cache_locked(s);
+ vtd_reset_iotlb_locked(s);
+ vtd_iommu_unlock(s);
/* Define registers with default values and bit semantics */
vtd_define_long(s, DMAR_VER_REG, 0x10UL, 0, 0);
@@ -3070,7 +3265,8 @@ static void vtd_realize(DeviceState *dev, Error **errp)
return;
}
- QLIST_INIT(&s->notifiers_list);
+ QLIST_INIT(&s->vtd_as_with_notifiers);
+ qemu_mutex_init(&s->iommu_lock);
memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num));
memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s,
"intel_iommu", DMAR_REG_SIZE);
diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index 7dac319403..0bf1c60a06 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -26,7 +26,7 @@
#include "qapi/error.h"
#include <linux/kvm.h>
-#include <linux/kvm_para.h>
+#include "standard-headers/asm-x86/kvm_para.h"
#define TYPE_KVM_CLOCK "kvmclock"
#define KVM_CLOCK(obj) OBJECT_CHECK(KVMClockState, (obj), TYPE_KVM_CLOCK)
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 22d44648af..e14d06ec83 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -39,9 +39,10 @@ vtd_fault_disabled(void) "Fault processing disabled for context entry"
vtd_replay_ce_valid(uint8_t bus, uint8_t dev, uint8_t fn, uint16_t domain, uint64_t hi, uint64_t lo) "replay valid context device %02"PRIx8":%02"PRIx8".%02"PRIx8" domain 0x%"PRIx16" hi 0x%"PRIx64" lo 0x%"PRIx64
vtd_replay_ce_invalid(uint8_t bus, uint8_t dev, uint8_t fn) "replay invalid context device %02"PRIx8":%02"PRIx8".%02"PRIx8
vtd_page_walk_level(uint64_t addr, uint32_t level, uint64_t start, uint64_t end) "walk (base=0x%"PRIx64", level=%"PRIu32") iova range 0x%"PRIx64" - 0x%"PRIx64
-vtd_page_walk_one(uint32_t level, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "detected page level 0x%"PRIx32" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one(uint16_t domain, uint64_t iova, uint64_t gpa, uint64_t mask, int perm) "domain 0x%"PRIu16" iova 0x%"PRIx64" -> gpa 0x%"PRIx64" mask 0x%"PRIx64" perm %d"
+vtd_page_walk_one_skip_map(uint64_t iova, uint64_t mask, uint64_t translated) "iova 0x%"PRIx64" mask 0x%"PRIx64" translated 0x%"PRIx64
+vtd_page_walk_one_skip_unmap(uint64_t iova, uint64_t mask) "iova 0x%"PRIx64" mask 0x%"PRIx64
vtd_page_walk_skip_read(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to unable to read"
-vtd_page_walk_skip_perm(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to perm empty"
vtd_page_walk_skip_reserve(uint64_t iova, uint64_t next) "Page walk skip iova 0x%"PRIx64" - 0x%"PRIx64" due to rsrv set"
vtd_switch_address_space(uint8_t bus, uint8_t slot, uint8_t fn, bool on) "Device %02x:%02x.%x switching address space (iommu enabled=%d)"
vtd_as_unmap_whole(uint8_t bus, uint8_t slot, uint8_t fn, uint64_t iova, uint64_t size) "Device %02x:%02x.%x start 0x%"PRIx64" size 0x%"PRIx64
diff --git a/hw/mem/nvdimm.c b/hw/mem/nvdimm.c
index acb656b672..4087aca25e 100644
--- a/hw/mem/nvdimm.c
+++ b/hw/mem/nvdimm.c
@@ -89,7 +89,7 @@ static void nvdimm_set_unarmed(Object *obj, bool value, Error **errp)
static void nvdimm_init(Object *obj)
{
- object_property_add(obj, NVDIMM_LABLE_SIZE_PROP, "int",
+ object_property_add(obj, NVDIMM_LABEL_SIZE_PROP, "int",
nvdimm_get_label_size, nvdimm_set_label_size, NULL,
NULL, NULL);
object_property_add_bool(obj, NVDIMM_UNARMED_PROP,
diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c
index a36a1195e4..02f9576588 100644
--- a/hw/pci-host/q35.c
+++ b/hw/pci-host/q35.c
@@ -535,13 +535,15 @@ static void mch_realize(PCIDevice *d, Error **errp)
/* if *disabled* show SMRAM to all CPUs */
memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region",
- mch->pci_address_space, 0xa0000, 0x20000);
- memory_region_add_subregion_overlap(mch->system_memory, 0xa0000,
+ mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+ MCH_HOST_BRIDGE_SMRAM_C_SIZE);
+ memory_region_add_subregion_overlap(mch->system_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
&mch->smram_region, 1);
memory_region_set_enabled(&mch->smram_region, true);
memory_region_init_alias(&mch->open_high_smram, OBJECT(mch), "smram-open-high",
- mch->ram_memory, 0xa0000, 0x20000);
+ mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+ MCH_HOST_BRIDGE_SMRAM_C_SIZE);
memory_region_add_subregion_overlap(mch->system_memory, 0xfeda0000,
&mch->open_high_smram, 1);
memory_region_set_enabled(&mch->open_high_smram, false);
@@ -550,11 +552,14 @@ static void mch_realize(PCIDevice *d, Error **errp)
memory_region_init(&mch->smram, OBJECT(mch), "smram", 1ull << 32);
memory_region_set_enabled(&mch->smram, true);
memory_region_init_alias(&mch->low_smram, OBJECT(mch), "smram-low",
- mch->ram_memory, 0xa0000, 0x20000);
+ mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+ MCH_HOST_BRIDGE_SMRAM_C_SIZE);
memory_region_set_enabled(&mch->low_smram, true);
- memory_region_add_subregion(&mch->smram, 0xa0000, &mch->low_smram);
+ memory_region_add_subregion(&mch->smram, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+ &mch->low_smram);
memory_region_init_alias(&mch->high_smram, OBJECT(mch), "smram-high",
- mch->ram_memory, 0xa0000, 0x20000);
+ mch->ram_memory, MCH_HOST_BRIDGE_SMRAM_C_BASE,
+ MCH_HOST_BRIDGE_SMRAM_C_SIZE);
memory_region_set_enabled(&mch->high_smram, true);
memory_region_add_subregion(&mch->smram, 0xfeda0000, &mch->high_smram);
diff --git a/hw/virtio/trace-events b/hw/virtio/trace-events
index 1422ff03ab..07bcbe9e85 100644
--- a/hw/virtio/trace-events
+++ b/hw/virtio/trace-events
@@ -6,6 +6,7 @@ vhost_region_add_section(const char *name, uint64_t gpa, uint64_t size, uint64_t
vhost_region_add_section_merge(const char *name, uint64_t new_size, uint64_t gpa, uint64_t owr) "%s: size: 0x%"PRIx64 " gpa: 0x%"PRIx64 " owr: 0x%"PRIx64
vhost_region_add_section_aligned(const char *name, uint64_t gpa, uint64_t size, uint64_t host) "%s: 0x%"PRIx64"+0x%"PRIx64" @ 0x%"PRIx64
vhost_section(const char *name, int r) "%s:%d"
+vhost_iotlb_miss(void *dev, int step) "%p step %d"
# hw/virtio/vhost-user.c
vhost_user_postcopy_end_entry(void) ""
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 38da8692bb..ca554d4ff1 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -852,14 +852,44 @@ static void slave_read(void *opaque)
VhostUserHeader hdr = { 0, };
VhostUserPayload payload = { 0, };
int size, ret = 0;
+ struct iovec iov;
+ struct msghdr msgh;
+ int fd = -1;
+ char control[CMSG_SPACE(sizeof(fd))];
+ struct cmsghdr *cmsg;
+ size_t fdsize;
+
+ memset(&msgh, 0, sizeof(msgh));
+ msgh.msg_iov = &iov;
+ msgh.msg_iovlen = 1;
+ msgh.msg_control = control;
+ msgh.msg_controllen = sizeof(control);
/* Read header */
- size = read(u->slave_fd, &hdr, VHOST_USER_HDR_SIZE);
+ iov.iov_base = &hdr;
+ iov.iov_len = VHOST_USER_HDR_SIZE;
+
+ size = recvmsg(u->slave_fd, &msgh, 0);
if (size != VHOST_USER_HDR_SIZE) {
error_report("Failed to read from slave.");
goto err;
}
+ if (msgh.msg_flags & MSG_CTRUNC) {
+ error_report("Truncated message.");
+ goto err;
+ }
+
+ for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
+ cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
+ if (cmsg->cmsg_level == SOL_SOCKET &&
+ cmsg->cmsg_type == SCM_RIGHTS) {
+ fdsize = cmsg->cmsg_len - CMSG_LEN(0);
+ memcpy(&fd, CMSG_DATA(cmsg), fdsize);
+ break;
+ }
+ }
+
if (hdr.size > VHOST_USER_PAYLOAD_SIZE) {
error_report("Failed to read msg header."
" Size %d exceeds the maximum %zu.", hdr.size,
@@ -883,9 +913,15 @@ static void slave_read(void *opaque)
break;
default:
error_report("Received unexpected msg type.");
+ if (fd != -1) {
+ close(fd);
+ }
ret = -EINVAL;
}
+ /* Message handlers need to make sure that fd will be consumed. */
+ fd = -1;
+
/*
* REPLY_ACK feature handling. Other reply types has to be managed
* directly in their request handlers.
@@ -918,6 +954,9 @@ err:
qemu_set_fd_handler(u->slave_fd, NULL, NULL, NULL);
close(u->slave_fd);
u->slave_fd = -1;
+ if (fd != -1) {
+ close(fd);
+ }
return;
}
@@ -1076,7 +1115,7 @@ static int vhost_user_postcopy_advise(struct vhost_dev *dev, Error **errp)
error_setg(errp, "%s: Failed to get ufd", __func__);
return -1;
}
- fcntl(ufd, F_SETFL, O_NONBLOCK);
+ qemu_set_nonblock(ufd);
/* register ufd with userfault thread */
u->postcopy_fd.fd = ufd;
@@ -1316,7 +1355,7 @@ static bool vhost_user_requires_shm_log(struct vhost_dev *dev)
static int vhost_user_migration_done(struct vhost_dev *dev, char* mac_addr)
{
- VhostUserMsg msg = { 0 };
+ VhostUserMsg msg = { };
assert(dev->vhost_ops->backend_type == VHOST_BACKEND_TYPE_USER);
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index 9d5850a7d7..b08290036d 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -894,12 +894,15 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
rcu_read_lock();
+ trace_vhost_iotlb_miss(dev, 1);
+
iotlb = address_space_get_iotlb_entry(dev->vdev->dma_as,
iova, write);
if (iotlb.target_as != NULL) {
ret = vhost_memory_region_lookup(dev, iotlb.translated_addr,
&uaddr, &len);
if (ret) {
+ trace_vhost_iotlb_miss(dev, 3);
error_report("Fail to lookup the translated address "
"%"PRIx64, iotlb.translated_addr);
goto out;
@@ -911,10 +914,14 @@ int vhost_device_iotlb_miss(struct vhost_dev *dev, uint64_t iova, int write)
ret = vhost_backend_update_device_iotlb(dev, iova, uaddr,
len, iotlb.perm);
if (ret) {
+ trace_vhost_iotlb_miss(dev, 4);
error_report("Fail to update device iotlb");
goto out;
}
}
+
+ trace_vhost_iotlb_miss(dev, 2);
+
out:
rcu_read_unlock();
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index f456cea2e7..1f7a87f094 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -52,6 +52,8 @@ static const char *balloon_stat_names[] = {
[VIRTIO_BALLOON_S_MEMTOT] = "stat-total-memory",
[VIRTIO_BALLOON_S_AVAIL] = "stat-available-memory",
[VIRTIO_BALLOON_S_CACHES] = "stat-disk-caches",
+ [VIRTIO_BALLOON_S_HTLB_PGALLOC] = "stat-htlb-pgalloc",
+ [VIRTIO_BALLOON_S_HTLB_PGFAIL] = "stat-htlb-pgfail",
[VIRTIO_BALLOON_S_NR] = NULL
};
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index 1e8ab7bbc5..5eb0c323ca 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -1037,6 +1037,27 @@ assign_error:
return r;
}
+static int virtio_pci_set_host_notifier_mr(DeviceState *d, int n,
+ MemoryRegion *mr, bool assign)
+{
+ VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+ int offset;
+
+ if (n >= VIRTIO_QUEUE_MAX || !virtio_pci_modern(proxy) ||
+ virtio_pci_queue_mem_mult(proxy) != memory_region_size(mr)) {
+ return -1;
+ }
+
+ if (assign) {
+ offset = virtio_pci_queue_mem_mult(proxy) * n;
+ memory_region_add_subregion_overlap(&proxy->notify.mr, offset, mr, 1);
+ } else {
+ memory_region_del_subregion(&proxy->notify.mr, mr);
+ }
+
+ return 0;
+}
+
static void virtio_pci_vmstate_change(DeviceState *d, bool running)
{
VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -2652,6 +2673,7 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
k->has_extra_state = virtio_pci_has_extra_state;
k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
+ k->set_host_notifier_mr = virtio_pci_set_host_notifier_mr;
k->vmstate_change = virtio_pci_vmstate_change;
k->pre_plugged = virtio_pci_pre_plugged;
k->device_plugged = virtio_pci_device_plugged;
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 006d3d1148..1debb0147b 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2454,6 +2454,19 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue *vq)
return &vq->host_notifier;
}
+int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n,
+ MemoryRegion *mr, bool assign)
+{
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ if (k->set_host_notifier_mr) {
+ return k->set_host_notifier_mr(qbus->parent, n, mr, assign);
+ }
+
+ return -1;
+}
+
void virtio_device_set_child_bus_name(VirtIODevice *vdev, char *bus_name)
{
g_free(vdev->bus_name);
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 45ec8919b6..fbfedcb1c0 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -27,6 +27,7 @@
#include "hw/i386/ioapic.h"
#include "hw/pci/msi.h"
#include "hw/sysbus.h"
+#include "qemu/iova-tree.h"
#define TYPE_INTEL_IOMMU_DEVICE "intel-iommu"
#define INTEL_IOMMU_DEVICE(obj) \
@@ -67,7 +68,6 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
typedef struct VTDIrq VTDIrq;
typedef struct VTD_MSIMessage VTD_MSIMessage;
-typedef struct IntelIOMMUNotifierNode IntelIOMMUNotifierNode;
/* Context-Entry */
struct VTDContextEntry {
@@ -93,6 +93,10 @@ struct VTDAddressSpace {
MemoryRegion iommu_ir; /* Interrupt region: 0xfeeXXXXX */
IntelIOMMUState *iommu_state;
VTDContextCacheEntry context_cache_entry;
+ QLIST_ENTRY(VTDAddressSpace) next;
+ /* Superset of notifier flags that this address space has */
+ IOMMUNotifierFlag notifier_flags;
+ IOVATree *iova_tree; /* Traces mapped IOVA ranges */
};
struct VTDBus {
@@ -253,11 +257,6 @@ struct VTD_MSIMessage {
/* When IR is enabled, all MSI/MSI-X data bits should be zero */
#define VTD_IR_MSI_DATA (0)
-struct IntelIOMMUNotifierNode {
- VTDAddressSpace *vtd_as;
- QLIST_ENTRY(IntelIOMMUNotifierNode) next;
-};
-
/* The iommu (DMAR) device state struct */
struct IntelIOMMUState {
X86IOMMUState x86_iommu;
@@ -295,7 +294,7 @@ struct IntelIOMMUState {
GHashTable *vtd_as_by_busptr; /* VTDBus objects indexed by PCIBus* reference */
VTDBus *vtd_as_by_bus_num[VTD_PCI_BUS_MAX]; /* VTDBus objects indexed by bus number */
/* list of registered notifiers */
- QLIST_HEAD(, IntelIOMMUNotifierNode) notifiers_list;
+ QLIST_HEAD(, VTDAddressSpace) vtd_as_with_notifiers;
/* interrupt remapping */
bool intr_enabled; /* Whether guest enabled IR */
@@ -305,6 +304,12 @@ struct IntelIOMMUState {
OnOffAuto intr_eim; /* Toggle for EIM cabability */
bool buggy_eim; /* Force buggy EIM unless eim=off */
uint8_t aw_bits; /* Host/IOVA address width (in bits) */
+
+ /*
+ * Protects IOMMU states in general. Currently it protects the
+ * per-IOMMU IOTLB cache, and context entry cache in VTDAddressSpace.
+ */
+ QemuMutex iommu_lock;
};
/* Find the VTD Address space associated with the given bus pointer,
diff --git a/include/hw/mem/nvdimm.h b/include/hw/mem/nvdimm.h
index 7fd87c4e1c..74c60332e1 100644
--- a/include/hw/mem/nvdimm.h
+++ b/include/hw/mem/nvdimm.h
@@ -48,7 +48,7 @@
#define NVDIMM_GET_CLASS(obj) OBJECT_GET_CLASS(NVDIMMClass, (obj), \
TYPE_NVDIMM)
-#define NVDIMM_LABLE_SIZE_PROP "label-size"
+#define NVDIMM_LABEL_SIZE_PROP "label-size"
#define NVDIMM_UNARMED_PROP "unarmed"
struct NVDIMMDevice {
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index a9c3ee5aa2..990d6fcbde 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -101,6 +101,7 @@ extern bool pci_available;
#define PCI_DEVICE_ID_REDHAT_PCIE_RP 0x000c
#define PCI_DEVICE_ID_REDHAT_XHCI 0x000d
#define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e
+#define PCI_DEVICE_ID_REDHAT_MDPY 0x000f
#define PCI_DEVICE_ID_REDHAT_QXL 0x0100
#define FMT_PCIBUS PRIx64
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index ced3d2d2b0..7fec9dc929 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -52,6 +52,8 @@ typedef struct VirtioBusClass {
bool (*has_extra_state)(DeviceState *d);
bool (*query_guest_notifiers)(DeviceState *d);
int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign);
+ int (*set_host_notifier_mr)(DeviceState *d, int n,
+ MemoryRegion *mr, bool assign);
void (*vmstate_change)(DeviceState *d, bool running);
/*
* Expose the features the transport layer supports before
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 098bdaaea3..9c1fa07d6d 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -239,6 +239,8 @@ void virtio_queue_set_align(VirtIODevice *vdev, int n, int align);
void virtio_queue_notify(VirtIODevice *vdev, int n);
uint16_t virtio_queue_vector(VirtIODevice *vdev, int n);
void virtio_queue_set_vector(VirtIODevice *vdev, int n, uint16_t vector);
+int virtio_queue_set_host_notifier_mr(VirtIODevice *vdev, int n,
+ MemoryRegion *mr, bool assign);
int virtio_set_status(VirtIODevice *vdev, uint8_t val);
void virtio_reset(void *opaque);
void virtio_update_irq(VirtIODevice *vdev);
diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h
new file mode 100644
index 0000000000..b061932097
--- /dev/null
+++ b/include/qemu/iova-tree.h
@@ -0,0 +1,134 @@
+/*
+ * A very simplified iova tree implementation based on GTree.
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */
+#ifndef IOVA_TREE_H
+#define IOVA_TREE_H
+
+/*
+ * Currently the iova tree only keeps range information; no extra
+ * user data is allowed for each element. A benefit is that we can
+ * merge adjacent ranges internally within the tree. This can save a
+ * lot of memory when the ranges are split up but mostly
+ * contiguous.
+ *
+ * Note that current implementation does not provide any thread
+ * protections. Callers of the iova tree should be responsible
+ * for the thread safety issue.
+ */
+
+#include "qemu/osdep.h"
+#include "exec/memory.h"
+#include "exec/hwaddr.h"
+
+#define IOVA_OK (0)
+#define IOVA_ERR_INVALID (-1) /* Invalid parameters */
+#define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */
+
+typedef struct IOVATree IOVATree;
+typedef struct DMAMap {
+ hwaddr iova;
+ hwaddr translated_addr;
+ hwaddr size; /* Inclusive */
+ IOMMUAccessFlags perm;
+} QEMU_PACKED DMAMap;
+typedef gboolean (*iova_tree_iterator)(DMAMap *map);
+
+/**
+ * iova_tree_new:
+ *
+ * Create a new iova tree.
+ *
+ * Returns: the tree pointer when succeeded, or NULL if error.
+ */
+IOVATree *iova_tree_new(void);
+
+/**
+ * iova_tree_insert:
+ *
+ * @tree: the iova tree to insert
+ * @map: the mapping to insert
+ *
+ * Insert an iova range into the tree. If it overlaps an existing
+ * range, IOVA_ERR_OVERLAP will be returned.
+ *
+ * Return: 0 if succeeded, or <0 if error.
+ */
+int iova_tree_insert(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_remove:
+ *
+ * @tree: the iova tree to remove range from
+ * @map: the map range to remove
+ *
+ * Remove mappings from the tree that are covered by the map range
+ * provided. The range does not need to be exactly what was inserted;
+ * all the mappings that are included in the provided range will be
+ * removed from the tree. Here map->translated_addr is meaningless.
+ *
+ * Return: 0 if succeeded, or <0 if error.
+ */
+int iova_tree_remove(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_find:
+ *
+ * @tree: the iova tree to search from
+ * @map: the mapping to search
+ *
+ * Search for a mapping in the iova tree that overlaps with the
+ * mapping range specified. Only the first found mapping will be
+ * returned.
+ *
+ * Return: DMAMap pointer if found, or NULL if not found. Note that
+ * the returned DMAMap pointer is maintained internally. User should
+ * only read the content but never modify or free the content. Also,
+ * user is responsible to make sure the pointer is valid (say, no
+ * concurrent deletion in progress).
+ */
+DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map);
+
+/**
+ * iova_tree_find_address:
+ *
+ * @tree: the iova tree to search from
+ * @iova: the iova address to find
+ *
+ * Similar to iova_tree_find(), but it tries to find mapping with
+ * range iova=iova & size=0.
+ *
+ * Return: same as iova_tree_find().
+ */
+DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova);
+
+/**
+ * iova_tree_foreach:
+ *
+ * @tree: the iova tree to iterate on
+ * @iterator: the iterator for the mappings, return true to stop
+ *
+ * Iterate over the iova tree.
+ *
+ * Return: None.
+ */
+void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator);
+
+/**
+ * iova_tree_destroy:
+ *
+ * @tree: the iova tree to destroy
+ *
+ * Destroy an existing iova tree.
+ *
+ * Return: None.
+ */
+void iova_tree_destroy(IOVATree *tree);
+
+#endif
diff --git a/linux-headers/asm-x86/kvm_para.h b/include/standard-headers/asm-x86/kvm_para.h
index 4c58184395..53a85ae3ed 100644
--- a/linux-headers/asm-x86/kvm_para.h
+++ b/include/standard-headers/asm-x86/kvm_para.h
@@ -2,16 +2,17 @@
#ifndef _ASM_X86_KVM_PARA_H
#define _ASM_X86_KVM_PARA_H
-#include <linux/types.h>
-#include <asm/hyperv.h>
+#include "standard-headers/linux/types.h"
/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
* should be used to determine that a VM is running under KVM.
*/
#define KVM_CPUID_SIGNATURE 0x40000000
-/* This CPUID returns a feature bitmap in eax. Before enabling a particular
- * paravirtualization, the appropriate feature bit should be checked.
+/* This CPUID returns two feature bitmaps in eax, edx. Before enabling
+ * a particular paravirtualization, the appropriate feature bit should
+ * be checked in eax. The performance hint feature bit should be checked
+ * in edx.
*/
#define KVM_CPUID_FEATURES 0x40000001
#define KVM_FEATURE_CLOCKSOURCE 0
@@ -28,6 +29,8 @@
#define KVM_FEATURE_PV_TLB_FLUSH 9
#define KVM_FEATURE_ASYNC_PF_VMEXIT 10
+#define KVM_HINTS_DEDICATED 0
+
/* The last 8 bits are used to indicate how to interpret the flags field
* in pvclock structure. If no bits are set, all flags are ignored.
*/
@@ -45,12 +48,12 @@
#define MSR_KVM_PV_EOI_EN 0x4b564d04
struct kvm_steal_time {
- __u64 steal;
- __u32 version;
- __u32 flags;
- __u8 preempted;
- __u8 u8_pad[3];
- __u32 pad[11];
+ uint64_t steal;
+ uint32_t version;
+ uint32_t flags;
+ uint8_t preempted;
+ uint8_t uint8_t_pad[3];
+ uint32_t pad[11];
};
#define KVM_VCPU_PREEMPTED (1 << 0)
@@ -58,11 +61,11 @@ struct kvm_steal_time {
#define KVM_CLOCK_PAIRING_WALLCLOCK 0
struct kvm_clock_pairing {
- __s64 sec;
- __s64 nsec;
- __u64 tsc;
- __u32 flags;
- __u32 pad[9];
+ int64_t sec;
+ int64_t nsec;
+ uint64_t tsc;
+ uint32_t flags;
+ uint32_t pad[9];
};
#define KVM_STEAL_ALIGNMENT_BITS 5
@@ -82,14 +85,14 @@ struct kvm_clock_pairing {
/* Payload for KVM_HC_MMU_OP */
struct kvm_mmu_op_header {
- __u32 op;
- __u32 pad;
+ uint32_t op;
+ uint32_t pad;
};
struct kvm_mmu_op_write_pte {
struct kvm_mmu_op_header header;
- __u64 pte_phys;
- __u64 pte_val;
+ uint64_t pte_phys;
+ uint64_t pte_val;
};
struct kvm_mmu_op_flush_tlb {
@@ -98,16 +101,16 @@ struct kvm_mmu_op_flush_tlb {
struct kvm_mmu_op_release_pt {
struct kvm_mmu_op_header header;
- __u64 pt_phys;
+ uint64_t pt_phys;
};
#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
#define KVM_PV_REASON_PAGE_READY 2
struct kvm_vcpu_pv_apf_data {
- __u32 reason;
- __u8 pad[60];
- __u32 enabled;
+ uint32_t reason;
+ uint8_t pad[60];
+ uint32_t enabled;
};
#define KVM_PV_EOI_BIT 0
diff --git a/include/standard-headers/linux/virtio_balloon.h b/include/standard-headers/linux/virtio_balloon.h
index 7b0a41b8fc..e446805ae9 100644
--- a/include/standard-headers/linux/virtio_balloon.h
+++ b/include/standard-headers/linux/virtio_balloon.h
@@ -53,7 +53,9 @@ struct virtio_balloon_config {
#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */
#define VIRTIO_BALLOON_S_AVAIL 6 /* Available memory as in /proc */
#define VIRTIO_BALLOON_S_CACHES 7 /* Disk caches */
-#define VIRTIO_BALLOON_S_NR 8
+#define VIRTIO_BALLOON_S_HTLB_PGALLOC 8 /* Hugetlb page allocations */
+#define VIRTIO_BALLOON_S_HTLB_PGFAIL 9 /* Hugetlb page allocation failures */
+#define VIRTIO_BALLOON_S_NR 10
/*
* Memory statistics structure.
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 23669c4d5a..0b64b8e067 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -22,7 +22,6 @@
#ifdef NEED_CPU_H
# ifdef CONFIG_KVM
# include <linux/kvm.h>
-# include <linux/kvm_para.h>
# define CONFIG_KVM_IS_POSSIBLE
# endif
#else
diff --git a/linux-headers/asm-arm/bitsperlong.h b/linux-headers/asm-arm/bitsperlong.h
new file mode 100644
index 0000000000..6dc0bb0c13
--- /dev/null
+++ b/linux-headers/asm-arm/bitsperlong.h
@@ -0,0 +1 @@
+#include <asm-generic/bitsperlong.h>
diff --git a/linux-headers/asm-arm/kvm_para.h b/linux-headers/asm-arm/kvm_para.h
deleted file mode 100644
index baacc4996d..0000000000
--- a/linux-headers/asm-arm/kvm_para.h
+++ /dev/null
@@ -1,2 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#include <asm-generic/kvm_para.h>
diff --git a/linux-headers/asm-arm64/bitsperlong.h b/linux-headers/asm-arm64/bitsperlong.h
new file mode 100644
index 0000000000..485d60bee2
--- /dev/null
+++ b/linux-headers/asm-arm64/bitsperlong.h
@@ -0,0 +1,24 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#ifndef __ASM_BITSPERLONG_H
+#define __ASM_BITSPERLONG_H
+
+#define __BITS_PER_LONG 64
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_BITSPERLONG_H */
diff --git a/linux-headers/asm-arm64/kvm_para.h b/linux-headers/asm-arm64/kvm_para.h
deleted file mode 100644
index 14fab8f0b9..0000000000
--- a/linux-headers/asm-arm64/kvm_para.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <asm-generic/kvm_para.h>
diff --git a/linux-headers/asm-generic/bitsperlong.h b/linux-headers/asm-generic/bitsperlong.h
new file mode 100644
index 0000000000..0aac245b6b
--- /dev/null
+++ b/linux-headers/asm-generic/bitsperlong.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_GENERIC_BITS_PER_LONG
+#define __ASM_GENERIC_BITS_PER_LONG
+
+/*
+ * There seems to be no way of detecting this automatically from user
+ * space, so 64 bit architectures should override this in their
+ * bitsperlong.h. In particular, an architecture that supports
+ * both 32 and 64 bit user space must not rely on CONFIG_64BIT
+ * to decide it, but rather check a compiler provided macro.
+ */
+#ifndef __BITS_PER_LONG
+#define __BITS_PER_LONG 32
+#endif
+
+#endif /* __ASM_GENERIC_BITS_PER_LONG */
diff --git a/linux-headers/asm-generic/kvm_para.h b/linux-headers/asm-generic/kvm_para.h
deleted file mode 100644
index 486f0af73c..0000000000
--- a/linux-headers/asm-generic/kvm_para.h
+++ /dev/null
@@ -1,4 +0,0 @@
-/*
- * There isn't anything here, but the file must not be empty or patch
- * will delete it.
- */
diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h
new file mode 100644
index 0000000000..8bcb186c6f
--- /dev/null
+++ b/linux-headers/asm-generic/unistd.h
@@ -0,0 +1,781 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#include <asm/bitsperlong.h>
+
+/*
+ * This file contains the system call numbers, based on the
+ * layout of the x86-64 architecture, which embeds the
+ * pointer to the syscall in the table.
+ *
+ * As a basic principle, no duplication of functionality
+ * should be added, e.g. we don't use lseek when llseek
+ * is present. New architectures should use this file
+ * and implement the less feature-full calls in user space.
+ */
+
+#ifndef __SYSCALL
+#define __SYSCALL(x, y)
+#endif
+
+#if __BITS_PER_LONG == 32 || defined(__SYSCALL_COMPAT)
+#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _32)
+#else
+#define __SC_3264(_nr, _32, _64) __SYSCALL(_nr, _64)
+#endif
+
+#ifdef __SYSCALL_COMPAT
+#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _comp)
+#define __SC_COMP_3264(_nr, _32, _64, _comp) __SYSCALL(_nr, _comp)
+#else
+#define __SC_COMP(_nr, _sys, _comp) __SYSCALL(_nr, _sys)
+#define __SC_COMP_3264(_nr, _32, _64, _comp) __SC_3264(_nr, _32, _64)
+#endif
+
+#define __NR_io_setup 0
+__SC_COMP(__NR_io_setup, sys_io_setup, compat_sys_io_setup)
+#define __NR_io_destroy 1
+__SYSCALL(__NR_io_destroy, sys_io_destroy)
+#define __NR_io_submit 2
+__SC_COMP(__NR_io_submit, sys_io_submit, compat_sys_io_submit)
+#define __NR_io_cancel 3
+__SYSCALL(__NR_io_cancel, sys_io_cancel)
+#define __NR_io_getevents 4
+__SC_COMP(__NR_io_getevents, sys_io_getevents, compat_sys_io_getevents)
+
+/* fs/xattr.c */
+#define __NR_setxattr 5
+__SYSCALL(__NR_setxattr, sys_setxattr)
+#define __NR_lsetxattr 6
+__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
+#define __NR_fsetxattr 7
+__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
+#define __NR_getxattr 8
+__SYSCALL(__NR_getxattr, sys_getxattr)
+#define __NR_lgetxattr 9
+__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
+#define __NR_fgetxattr 10
+__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
+#define __NR_listxattr 11
+__SYSCALL(__NR_listxattr, sys_listxattr)
+#define __NR_llistxattr 12
+__SYSCALL(__NR_llistxattr, sys_llistxattr)
+#define __NR_flistxattr 13
+__SYSCALL(__NR_flistxattr, sys_flistxattr)
+#define __NR_removexattr 14
+__SYSCALL(__NR_removexattr, sys_removexattr)
+#define __NR_lremovexattr 15
+__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
+#define __NR_fremovexattr 16
+__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
+
+/* fs/dcache.c */
+#define __NR_getcwd 17
+__SYSCALL(__NR_getcwd, sys_getcwd)
+
+/* fs/cookies.c */
+#define __NR_lookup_dcookie 18
+__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie)
+
+/* fs/eventfd.c */
+#define __NR_eventfd2 19
+__SYSCALL(__NR_eventfd2, sys_eventfd2)
+
+/* fs/eventpoll.c */
+#define __NR_epoll_create1 20
+__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
+#define __NR_epoll_ctl 21
+__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
+#define __NR_epoll_pwait 22
+__SC_COMP(__NR_epoll_pwait, sys_epoll_pwait, compat_sys_epoll_pwait)
+
+/* fs/fcntl.c */
+#define __NR_dup 23
+__SYSCALL(__NR_dup, sys_dup)
+#define __NR_dup3 24
+__SYSCALL(__NR_dup3, sys_dup3)
+#define __NR3264_fcntl 25
+__SC_COMP_3264(__NR3264_fcntl, sys_fcntl64, sys_fcntl, compat_sys_fcntl64)
+
+/* fs/inotify_user.c */
+#define __NR_inotify_init1 26
+__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
+#define __NR_inotify_add_watch 27
+__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
+#define __NR_inotify_rm_watch 28
+__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+
+/* fs/ioctl.c */
+#define __NR_ioctl 29
+__SC_COMP(__NR_ioctl, sys_ioctl, compat_sys_ioctl)
+
+/* fs/ioprio.c */
+#define __NR_ioprio_set 30
+__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
+#define __NR_ioprio_get 31
+__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
+
+/* fs/locks.c */
+#define __NR_flock 32
+__SYSCALL(__NR_flock, sys_flock)
+
+/* fs/namei.c */
+#define __NR_mknodat 33
+__SYSCALL(__NR_mknodat, sys_mknodat)
+#define __NR_mkdirat 34
+__SYSCALL(__NR_mkdirat, sys_mkdirat)
+#define __NR_unlinkat 35
+__SYSCALL(__NR_unlinkat, sys_unlinkat)
+#define __NR_symlinkat 36
+__SYSCALL(__NR_symlinkat, sys_symlinkat)
+#define __NR_linkat 37
+__SYSCALL(__NR_linkat, sys_linkat)
+#ifdef __ARCH_WANT_RENAMEAT
+/* renameat is superseded with flags by renameat2 */
+#define __NR_renameat 38
+__SYSCALL(__NR_renameat, sys_renameat)
+#endif /* __ARCH_WANT_RENAMEAT */
+
+/* fs/namespace.c */
+#define __NR_umount2 39
+__SYSCALL(__NR_umount2, sys_umount)
+#define __NR_mount 40
+__SC_COMP(__NR_mount, sys_mount, compat_sys_mount)
+#define __NR_pivot_root 41
+__SYSCALL(__NR_pivot_root, sys_pivot_root)
+
+/* fs/nfsctl.c */
+#define __NR_nfsservctl 42
+__SYSCALL(__NR_nfsservctl, sys_ni_syscall)
+
+/* fs/open.c */
+#define __NR3264_statfs 43
+__SC_COMP_3264(__NR3264_statfs, sys_statfs64, sys_statfs, \
+ compat_sys_statfs64)
+#define __NR3264_fstatfs 44
+__SC_COMP_3264(__NR3264_fstatfs, sys_fstatfs64, sys_fstatfs, \
+ compat_sys_fstatfs64)
+#define __NR3264_truncate 45
+__SC_COMP_3264(__NR3264_truncate, sys_truncate64, sys_truncate, \
+ compat_sys_truncate64)
+#define __NR3264_ftruncate 46
+__SC_COMP_3264(__NR3264_ftruncate, sys_ftruncate64, sys_ftruncate, \
+ compat_sys_ftruncate64)
+
+#define __NR_fallocate 47
+__SC_COMP(__NR_fallocate, sys_fallocate, compat_sys_fallocate)
+#define __NR_faccessat 48
+__SYSCALL(__NR_faccessat, sys_faccessat)
+#define __NR_chdir 49
+__SYSCALL(__NR_chdir, sys_chdir)
+#define __NR_fchdir 50
+__SYSCALL(__NR_fchdir, sys_fchdir)
+#define __NR_chroot 51
+__SYSCALL(__NR_chroot, sys_chroot)
+#define __NR_fchmod 52
+__SYSCALL(__NR_fchmod, sys_fchmod)
+#define __NR_fchmodat 53
+__SYSCALL(__NR_fchmodat, sys_fchmodat)
+#define __NR_fchownat 54
+__SYSCALL(__NR_fchownat, sys_fchownat)
+#define __NR_fchown 55
+__SYSCALL(__NR_fchown, sys_fchown)
+#define __NR_openat 56
+__SC_COMP(__NR_openat, sys_openat, compat_sys_openat)
+#define __NR_close 57
+__SYSCALL(__NR_close, sys_close)
+#define __NR_vhangup 58
+__SYSCALL(__NR_vhangup, sys_vhangup)
+
+/* fs/pipe.c */
+#define __NR_pipe2 59
+__SYSCALL(__NR_pipe2, sys_pipe2)
+
+/* fs/quota.c */
+#define __NR_quotactl 60
+__SYSCALL(__NR_quotactl, sys_quotactl)
+
+/* fs/readdir.c */
+#define __NR_getdents64 61
+__SYSCALL(__NR_getdents64, sys_getdents64)
+
+/* fs/read_write.c */
+#define __NR3264_lseek 62
+__SC_3264(__NR3264_lseek, sys_llseek, sys_lseek)
+#define __NR_read 63
+__SYSCALL(__NR_read, sys_read)
+#define __NR_write 64
+__SYSCALL(__NR_write, sys_write)
+#define __NR_readv 65
+__SC_COMP(__NR_readv, sys_readv, compat_sys_readv)
+#define __NR_writev 66
+__SC_COMP(__NR_writev, sys_writev, compat_sys_writev)
+#define __NR_pread64 67
+__SC_COMP(__NR_pread64, sys_pread64, compat_sys_pread64)
+#define __NR_pwrite64 68
+__SC_COMP(__NR_pwrite64, sys_pwrite64, compat_sys_pwrite64)
+#define __NR_preadv 69
+__SC_COMP(__NR_preadv, sys_preadv, compat_sys_preadv)
+#define __NR_pwritev 70
+__SC_COMP(__NR_pwritev, sys_pwritev, compat_sys_pwritev)
+
+/* fs/sendfile.c */
+#define __NR3264_sendfile 71
+__SYSCALL(__NR3264_sendfile, sys_sendfile64)
+
+/* fs/select.c */
+#define __NR_pselect6 72
+__SC_COMP(__NR_pselect6, sys_pselect6, compat_sys_pselect6)
+#define __NR_ppoll 73
+__SC_COMP(__NR_ppoll, sys_ppoll, compat_sys_ppoll)
+
+/* fs/signalfd.c */
+#define __NR_signalfd4 74
+__SC_COMP(__NR_signalfd4, sys_signalfd4, compat_sys_signalfd4)
+
+/* fs/splice.c */
+#define __NR_vmsplice 75
+__SC_COMP(__NR_vmsplice, sys_vmsplice, compat_sys_vmsplice)
+#define __NR_splice 76
+__SYSCALL(__NR_splice, sys_splice)
+#define __NR_tee 77
+__SYSCALL(__NR_tee, sys_tee)
+
+/* fs/stat.c */
+#define __NR_readlinkat 78
+__SYSCALL(__NR_readlinkat, sys_readlinkat)
+#define __NR3264_fstatat 79
+__SC_3264(__NR3264_fstatat, sys_fstatat64, sys_newfstatat)
+#define __NR3264_fstat 80
+__SC_3264(__NR3264_fstat, sys_fstat64, sys_newfstat)
+
+/* fs/sync.c */
+#define __NR_sync 81
+__SYSCALL(__NR_sync, sys_sync)
+#define __NR_fsync 82
+__SYSCALL(__NR_fsync, sys_fsync)
+#define __NR_fdatasync 83
+__SYSCALL(__NR_fdatasync, sys_fdatasync)
+#ifdef __ARCH_WANT_SYNC_FILE_RANGE2
+#define __NR_sync_file_range2 84
+__SC_COMP(__NR_sync_file_range2, sys_sync_file_range2, \
+ compat_sys_sync_file_range2)
+#else
+#define __NR_sync_file_range 84
+__SC_COMP(__NR_sync_file_range, sys_sync_file_range, \
+ compat_sys_sync_file_range)
+#endif
+
+/* fs/timerfd.c */
+#define __NR_timerfd_create 85
+__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
+#define __NR_timerfd_settime 86
+__SC_COMP(__NR_timerfd_settime, sys_timerfd_settime, \
+ compat_sys_timerfd_settime)
+#define __NR_timerfd_gettime 87
+__SC_COMP(__NR_timerfd_gettime, sys_timerfd_gettime, \
+ compat_sys_timerfd_gettime)
+
+/* fs/utimes.c */
+#define __NR_utimensat 88
+__SC_COMP(__NR_utimensat, sys_utimensat, compat_sys_utimensat)
+
+/* kernel/acct.c */
+#define __NR_acct 89
+__SYSCALL(__NR_acct, sys_acct)
+
+/* kernel/capability.c */
+#define __NR_capget 90
+__SYSCALL(__NR_capget, sys_capget)
+#define __NR_capset 91
+__SYSCALL(__NR_capset, sys_capset)
+
+/* kernel/exec_domain.c */
+#define __NR_personality 92
+__SYSCALL(__NR_personality, sys_personality)
+
+/* kernel/exit.c */
+#define __NR_exit 93
+__SYSCALL(__NR_exit, sys_exit)
+#define __NR_exit_group 94
+__SYSCALL(__NR_exit_group, sys_exit_group)
+#define __NR_waitid 95
+__SC_COMP(__NR_waitid, sys_waitid, compat_sys_waitid)
+
+/* kernel/fork.c */
+#define __NR_set_tid_address 96
+__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
+#define __NR_unshare 97
+__SYSCALL(__NR_unshare, sys_unshare)
+
+/* kernel/futex.c */
+#define __NR_futex 98
+__SC_COMP(__NR_futex, sys_futex, compat_sys_futex)
+#define __NR_set_robust_list 99
+__SC_COMP(__NR_set_robust_list, sys_set_robust_list, \
+ compat_sys_set_robust_list)
+#define __NR_get_robust_list 100
+__SC_COMP(__NR_get_robust_list, sys_get_robust_list, \
+ compat_sys_get_robust_list)
+
+/* kernel/hrtimer.c */
+#define __NR_nanosleep 101
+__SC_COMP(__NR_nanosleep, sys_nanosleep, compat_sys_nanosleep)
+
+/* kernel/itimer.c */
+#define __NR_getitimer 102
+__SC_COMP(__NR_getitimer, sys_getitimer, compat_sys_getitimer)
+#define __NR_setitimer 103
+__SC_COMP(__NR_setitimer, sys_setitimer, compat_sys_setitimer)
+
+/* kernel/kexec.c */
+#define __NR_kexec_load 104
+__SC_COMP(__NR_kexec_load, sys_kexec_load, compat_sys_kexec_load)
+
+/* kernel/module.c */
+#define __NR_init_module 105
+__SYSCALL(__NR_init_module, sys_init_module)
+#define __NR_delete_module 106
+__SYSCALL(__NR_delete_module, sys_delete_module)
+
+/* kernel/posix-timers.c */
+#define __NR_timer_create 107
+__SC_COMP(__NR_timer_create, sys_timer_create, compat_sys_timer_create)
+#define __NR_timer_gettime 108
+__SC_COMP(__NR_timer_gettime, sys_timer_gettime, compat_sys_timer_gettime)
+#define __NR_timer_getoverrun 109
+__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
+#define __NR_timer_settime 110
+__SC_COMP(__NR_timer_settime, sys_timer_settime, compat_sys_timer_settime)
+#define __NR_timer_delete 111
+__SYSCALL(__NR_timer_delete, sys_timer_delete)
+#define __NR_clock_settime 112
+__SC_COMP(__NR_clock_settime, sys_clock_settime, compat_sys_clock_settime)
+#define __NR_clock_gettime 113
+__SC_COMP(__NR_clock_gettime, sys_clock_gettime, compat_sys_clock_gettime)
+#define __NR_clock_getres 114
+__SC_COMP(__NR_clock_getres, sys_clock_getres, compat_sys_clock_getres)
+#define __NR_clock_nanosleep 115
+__SC_COMP(__NR_clock_nanosleep, sys_clock_nanosleep, \
+ compat_sys_clock_nanosleep)
+
+/* kernel/printk.c */
+#define __NR_syslog 116
+__SYSCALL(__NR_syslog, sys_syslog)
+
+/* kernel/ptrace.c */
+#define __NR_ptrace 117
+__SYSCALL(__NR_ptrace, sys_ptrace)
+
+/* kernel/sched/core.c */
+#define __NR_sched_setparam 118
+__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
+#define __NR_sched_setscheduler 119
+__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
+#define __NR_sched_getscheduler 120
+__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
+#define __NR_sched_getparam 121
+__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
+#define __NR_sched_setaffinity 122
+__SC_COMP(__NR_sched_setaffinity, sys_sched_setaffinity, \
+ compat_sys_sched_setaffinity)
+#define __NR_sched_getaffinity 123
+__SC_COMP(__NR_sched_getaffinity, sys_sched_getaffinity, \
+ compat_sys_sched_getaffinity)
+#define __NR_sched_yield 124
+__SYSCALL(__NR_sched_yield, sys_sched_yield)
+#define __NR_sched_get_priority_max 125
+__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
+#define __NR_sched_get_priority_min 126
+__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
+#define __NR_sched_rr_get_interval 127
+__SC_COMP(__NR_sched_rr_get_interval, sys_sched_rr_get_interval, \
+ compat_sys_sched_rr_get_interval)
+
+/* kernel/signal.c */
+#define __NR_restart_syscall 128
+__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
+#define __NR_kill 129
+__SYSCALL(__NR_kill, sys_kill)
+#define __NR_tkill 130
+__SYSCALL(__NR_tkill, sys_tkill)
+#define __NR_tgkill 131
+__SYSCALL(__NR_tgkill, sys_tgkill)
+#define __NR_sigaltstack 132
+__SC_COMP(__NR_sigaltstack, sys_sigaltstack, compat_sys_sigaltstack)
+#define __NR_rt_sigsuspend 133
+__SC_COMP(__NR_rt_sigsuspend, sys_rt_sigsuspend, compat_sys_rt_sigsuspend)
+#define __NR_rt_sigaction 134
+__SC_COMP(__NR_rt_sigaction, sys_rt_sigaction, compat_sys_rt_sigaction)
+#define __NR_rt_sigprocmask 135
+__SC_COMP(__NR_rt_sigprocmask, sys_rt_sigprocmask, compat_sys_rt_sigprocmask)
+#define __NR_rt_sigpending 136
+__SC_COMP(__NR_rt_sigpending, sys_rt_sigpending, compat_sys_rt_sigpending)
+#define __NR_rt_sigtimedwait 137
+__SC_COMP(__NR_rt_sigtimedwait, sys_rt_sigtimedwait, \
+ compat_sys_rt_sigtimedwait)
+#define __NR_rt_sigqueueinfo 138
+__SC_COMP(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo, \
+ compat_sys_rt_sigqueueinfo)
+#define __NR_rt_sigreturn 139
+__SC_COMP(__NR_rt_sigreturn, sys_rt_sigreturn, compat_sys_rt_sigreturn)
+
+/* kernel/sys.c */
+#define __NR_setpriority 140
+__SYSCALL(__NR_setpriority, sys_setpriority)
+#define __NR_getpriority 141
+__SYSCALL(__NR_getpriority, sys_getpriority)
+#define __NR_reboot 142
+__SYSCALL(__NR_reboot, sys_reboot)
+#define __NR_setregid 143
+__SYSCALL(__NR_setregid, sys_setregid)
+#define __NR_setgid 144
+__SYSCALL(__NR_setgid, sys_setgid)
+#define __NR_setreuid 145
+__SYSCALL(__NR_setreuid, sys_setreuid)
+#define __NR_setuid 146
+__SYSCALL(__NR_setuid, sys_setuid)
+#define __NR_setresuid 147
+__SYSCALL(__NR_setresuid, sys_setresuid)
+#define __NR_getresuid 148
+__SYSCALL(__NR_getresuid, sys_getresuid)
+#define __NR_setresgid 149
+__SYSCALL(__NR_setresgid, sys_setresgid)
+#define __NR_getresgid 150
+__SYSCALL(__NR_getresgid, sys_getresgid)
+#define __NR_setfsuid 151
+__SYSCALL(__NR_setfsuid, sys_setfsuid)
+#define __NR_setfsgid 152
+__SYSCALL(__NR_setfsgid, sys_setfsgid)
+#define __NR_times 153
+__SC_COMP(__NR_times, sys_times, compat_sys_times)
+#define __NR_setpgid 154
+__SYSCALL(__NR_setpgid, sys_setpgid)
+#define __NR_getpgid 155
+__SYSCALL(__NR_getpgid, sys_getpgid)
+#define __NR_getsid 156
+__SYSCALL(__NR_getsid, sys_getsid)
+#define __NR_setsid 157
+__SYSCALL(__NR_setsid, sys_setsid)
+#define __NR_getgroups 158
+__SYSCALL(__NR_getgroups, sys_getgroups)
+#define __NR_setgroups 159
+__SYSCALL(__NR_setgroups, sys_setgroups)
+#define __NR_uname 160
+__SYSCALL(__NR_uname, sys_newuname)
+#define __NR_sethostname 161
+__SYSCALL(__NR_sethostname, sys_sethostname)
+#define __NR_setdomainname 162
+__SYSCALL(__NR_setdomainname, sys_setdomainname)
+#define __NR_getrlimit 163
+__SC_COMP(__NR_getrlimit, sys_getrlimit, compat_sys_getrlimit)
+#define __NR_setrlimit 164
+__SC_COMP(__NR_setrlimit, sys_setrlimit, compat_sys_setrlimit)
+#define __NR_getrusage 165
+__SC_COMP(__NR_getrusage, sys_getrusage, compat_sys_getrusage)
+#define __NR_umask 166
+__SYSCALL(__NR_umask, sys_umask)
+#define __NR_prctl 167
+__SYSCALL(__NR_prctl, sys_prctl)
+#define __NR_getcpu 168
+__SYSCALL(__NR_getcpu, sys_getcpu)
+
+/* kernel/time.c */
+#define __NR_gettimeofday 169
+__SC_COMP(__NR_gettimeofday, sys_gettimeofday, compat_sys_gettimeofday)
+#define __NR_settimeofday 170
+__SC_COMP(__NR_settimeofday, sys_settimeofday, compat_sys_settimeofday)
+#define __NR_adjtimex 171
+__SC_COMP(__NR_adjtimex, sys_adjtimex, compat_sys_adjtimex)
+
+/* kernel/timer.c */
+#define __NR_getpid 172
+__SYSCALL(__NR_getpid, sys_getpid)
+#define __NR_getppid 173
+__SYSCALL(__NR_getppid, sys_getppid)
+#define __NR_getuid 174
+__SYSCALL(__NR_getuid, sys_getuid)
+#define __NR_geteuid 175
+__SYSCALL(__NR_geteuid, sys_geteuid)
+#define __NR_getgid 176
+__SYSCALL(__NR_getgid, sys_getgid)
+#define __NR_getegid 177
+__SYSCALL(__NR_getegid, sys_getegid)
+#define __NR_gettid 178
+__SYSCALL(__NR_gettid, sys_gettid)
+#define __NR_sysinfo 179
+__SC_COMP(__NR_sysinfo, sys_sysinfo, compat_sys_sysinfo)
+
+/* ipc/mqueue.c */
+#define __NR_mq_open 180
+__SC_COMP(__NR_mq_open, sys_mq_open, compat_sys_mq_open)
+#define __NR_mq_unlink 181
+__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
+#define __NR_mq_timedsend 182
+__SC_COMP(__NR_mq_timedsend, sys_mq_timedsend, compat_sys_mq_timedsend)
+#define __NR_mq_timedreceive 183
+__SC_COMP(__NR_mq_timedreceive, sys_mq_timedreceive, \
+ compat_sys_mq_timedreceive)
+#define __NR_mq_notify 184
+__SC_COMP(__NR_mq_notify, sys_mq_notify, compat_sys_mq_notify)
+#define __NR_mq_getsetattr 185
+__SC_COMP(__NR_mq_getsetattr, sys_mq_getsetattr, compat_sys_mq_getsetattr)
+
+/* ipc/msg.c */
+#define __NR_msgget 186
+__SYSCALL(__NR_msgget, sys_msgget)
+#define __NR_msgctl 187
+__SC_COMP(__NR_msgctl, sys_msgctl, compat_sys_msgctl)
+#define __NR_msgrcv 188
+__SC_COMP(__NR_msgrcv, sys_msgrcv, compat_sys_msgrcv)
+#define __NR_msgsnd 189
+__SC_COMP(__NR_msgsnd, sys_msgsnd, compat_sys_msgsnd)
+
+/* ipc/sem.c */
+#define __NR_semget 190
+__SYSCALL(__NR_semget, sys_semget)
+#define __NR_semctl 191
+__SC_COMP(__NR_semctl, sys_semctl, compat_sys_semctl)
+#define __NR_semtimedop 192
+__SC_COMP(__NR_semtimedop, sys_semtimedop, compat_sys_semtimedop)
+#define __NR_semop 193
+__SYSCALL(__NR_semop, sys_semop)
+
+/* ipc/shm.c */
+#define __NR_shmget 194
+__SYSCALL(__NR_shmget, sys_shmget)
+#define __NR_shmctl 195
+__SC_COMP(__NR_shmctl, sys_shmctl, compat_sys_shmctl)
+#define __NR_shmat 196
+__SC_COMP(__NR_shmat, sys_shmat, compat_sys_shmat)
+#define __NR_shmdt 197
+__SYSCALL(__NR_shmdt, sys_shmdt)
+
+/* net/socket.c */
+#define __NR_socket 198
+__SYSCALL(__NR_socket, sys_socket)
+#define __NR_socketpair 199
+__SYSCALL(__NR_socketpair, sys_socketpair)
+#define __NR_bind 200
+__SYSCALL(__NR_bind, sys_bind)
+#define __NR_listen 201
+__SYSCALL(__NR_listen, sys_listen)
+#define __NR_accept 202
+__SYSCALL(__NR_accept, sys_accept)
+#define __NR_connect 203
+__SYSCALL(__NR_connect, sys_connect)
+#define __NR_getsockname 204
+__SYSCALL(__NR_getsockname, sys_getsockname)
+#define __NR_getpeername 205
+__SYSCALL(__NR_getpeername, sys_getpeername)
+#define __NR_sendto 206
+__SYSCALL(__NR_sendto, sys_sendto)
+#define __NR_recvfrom 207
+__SC_COMP(__NR_recvfrom, sys_recvfrom, compat_sys_recvfrom)
+#define __NR_setsockopt 208
+__SC_COMP(__NR_setsockopt, sys_setsockopt, compat_sys_setsockopt)
+#define __NR_getsockopt 209
+__SC_COMP(__NR_getsockopt, sys_getsockopt, compat_sys_getsockopt)
+#define __NR_shutdown 210
+__SYSCALL(__NR_shutdown, sys_shutdown)
+#define __NR_sendmsg 211
+__SC_COMP(__NR_sendmsg, sys_sendmsg, compat_sys_sendmsg)
+#define __NR_recvmsg 212
+__SC_COMP(__NR_recvmsg, sys_recvmsg, compat_sys_recvmsg)
+
+/* mm/filemap.c */
+#define __NR_readahead 213
+__SC_COMP(__NR_readahead, sys_readahead, compat_sys_readahead)
+
+/* mm/nommu.c, also with MMU */
+#define __NR_brk 214
+__SYSCALL(__NR_brk, sys_brk)
+#define __NR_munmap 215
+__SYSCALL(__NR_munmap, sys_munmap)
+#define __NR_mremap 216
+__SYSCALL(__NR_mremap, sys_mremap)
+
+/* security/keys/keyctl.c */
+#define __NR_add_key 217
+__SYSCALL(__NR_add_key, sys_add_key)
+#define __NR_request_key 218
+__SYSCALL(__NR_request_key, sys_request_key)
+#define __NR_keyctl 219
+__SC_COMP(__NR_keyctl, sys_keyctl, compat_sys_keyctl)
+
+/* arch/example/kernel/sys_example.c */
+#define __NR_clone 220
+__SYSCALL(__NR_clone, sys_clone)
+#define __NR_execve 221
+__SC_COMP(__NR_execve, sys_execve, compat_sys_execve)
+
+#define __NR3264_mmap 222
+__SC_3264(__NR3264_mmap, sys_mmap2, sys_mmap)
+/* mm/fadvise.c */
+#define __NR3264_fadvise64 223
+__SC_COMP(__NR3264_fadvise64, sys_fadvise64_64, compat_sys_fadvise64_64)
+
+/* mm/, CONFIG_MMU only */
+#ifndef __ARCH_NOMMU
+#define __NR_swapon 224
+__SYSCALL(__NR_swapon, sys_swapon)
+#define __NR_swapoff 225
+__SYSCALL(__NR_swapoff, sys_swapoff)
+#define __NR_mprotect 226
+__SYSCALL(__NR_mprotect, sys_mprotect)
+#define __NR_msync 227
+__SYSCALL(__NR_msync, sys_msync)
+#define __NR_mlock 228
+__SYSCALL(__NR_mlock, sys_mlock)
+#define __NR_munlock 229
+__SYSCALL(__NR_munlock, sys_munlock)
+#define __NR_mlockall 230
+__SYSCALL(__NR_mlockall, sys_mlockall)
+#define __NR_munlockall 231
+__SYSCALL(__NR_munlockall, sys_munlockall)
+#define __NR_mincore 232
+__SYSCALL(__NR_mincore, sys_mincore)
+#define __NR_madvise 233
+__SYSCALL(__NR_madvise, sys_madvise)
+#define __NR_remap_file_pages 234
+__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
+#define __NR_mbind 235
+__SC_COMP(__NR_mbind, sys_mbind, compat_sys_mbind)
+#define __NR_get_mempolicy 236
+__SC_COMP(__NR_get_mempolicy, sys_get_mempolicy, compat_sys_get_mempolicy)
+#define __NR_set_mempolicy 237
+__SC_COMP(__NR_set_mempolicy, sys_set_mempolicy, compat_sys_set_mempolicy)
+#define __NR_migrate_pages 238
+__SC_COMP(__NR_migrate_pages, sys_migrate_pages, compat_sys_migrate_pages)
+#define __NR_move_pages 239
+__SC_COMP(__NR_move_pages, sys_move_pages, compat_sys_move_pages)
+#endif
+
+#define __NR_rt_tgsigqueueinfo 240
+__SC_COMP(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo, \
+ compat_sys_rt_tgsigqueueinfo)
+#define __NR_perf_event_open 241
+__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+#define __NR_accept4 242
+__SYSCALL(__NR_accept4, sys_accept4)
+#define __NR_recvmmsg 243
+__SC_COMP(__NR_recvmmsg, sys_recvmmsg, compat_sys_recvmmsg)
+
+/*
+ * Architectures may provide up to 16 syscalls of their own
+ * starting with this value.
+ */
+#define __NR_arch_specific_syscall 244
+
+#define __NR_wait4 260
+__SC_COMP(__NR_wait4, sys_wait4, compat_sys_wait4)
+#define __NR_prlimit64 261
+__SYSCALL(__NR_prlimit64, sys_prlimit64)
+#define __NR_fanotify_init 262
+__SYSCALL(__NR_fanotify_init, sys_fanotify_init)
+#define __NR_fanotify_mark 263
+__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
+#define __NR_name_to_handle_at 264
+__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
+#define __NR_open_by_handle_at 265
+__SC_COMP(__NR_open_by_handle_at, sys_open_by_handle_at, \
+ compat_sys_open_by_handle_at)
+#define __NR_clock_adjtime 266
+__SC_COMP(__NR_clock_adjtime, sys_clock_adjtime, compat_sys_clock_adjtime)
+#define __NR_syncfs 267
+__SYSCALL(__NR_syncfs, sys_syncfs)
+#define __NR_setns 268
+__SYSCALL(__NR_setns, sys_setns)
+#define __NR_sendmmsg 269
+__SC_COMP(__NR_sendmmsg, sys_sendmmsg, compat_sys_sendmmsg)
+#define __NR_process_vm_readv 270
+__SC_COMP(__NR_process_vm_readv, sys_process_vm_readv, \
+ compat_sys_process_vm_readv)
+#define __NR_process_vm_writev 271
+__SC_COMP(__NR_process_vm_writev, sys_process_vm_writev, \
+ compat_sys_process_vm_writev)
+#define __NR_kcmp 272
+__SYSCALL(__NR_kcmp, sys_kcmp)
+#define __NR_finit_module 273
+__SYSCALL(__NR_finit_module, sys_finit_module)
+#define __NR_sched_setattr 274
+__SYSCALL(__NR_sched_setattr, sys_sched_setattr)
+#define __NR_sched_getattr 275
+__SYSCALL(__NR_sched_getattr, sys_sched_getattr)
+#define __NR_renameat2 276
+__SYSCALL(__NR_renameat2, sys_renameat2)
+#define __NR_seccomp 277
+__SYSCALL(__NR_seccomp, sys_seccomp)
+#define __NR_getrandom 278
+__SYSCALL(__NR_getrandom, sys_getrandom)
+#define __NR_memfd_create 279
+__SYSCALL(__NR_memfd_create, sys_memfd_create)
+#define __NR_bpf 280
+__SYSCALL(__NR_bpf, sys_bpf)
+#define __NR_execveat 281
+__SC_COMP(__NR_execveat, sys_execveat, compat_sys_execveat)
+#define __NR_userfaultfd 282
+__SYSCALL(__NR_userfaultfd, sys_userfaultfd)
+#define __NR_membarrier 283
+__SYSCALL(__NR_membarrier, sys_membarrier)
+#define __NR_mlock2 284
+__SYSCALL(__NR_mlock2, sys_mlock2)
+#define __NR_copy_file_range 285
+__SYSCALL(__NR_copy_file_range, sys_copy_file_range)
+#define __NR_preadv2 286
+__SC_COMP(__NR_preadv2, sys_preadv2, compat_sys_preadv2)
+#define __NR_pwritev2 287
+__SC_COMP(__NR_pwritev2, sys_pwritev2, compat_sys_pwritev2)
+#define __NR_pkey_mprotect 288
+__SYSCALL(__NR_pkey_mprotect, sys_pkey_mprotect)
+#define __NR_pkey_alloc 289
+__SYSCALL(__NR_pkey_alloc, sys_pkey_alloc)
+#define __NR_pkey_free 290
+__SYSCALL(__NR_pkey_free, sys_pkey_free)
+#define __NR_statx 291
+__SYSCALL(__NR_statx, sys_statx)
+
+#undef __NR_syscalls
+#define __NR_syscalls 292
+
+/*
+ * 32 bit systems traditionally used different
+ * syscalls for off_t and loff_t arguments, while
+ * 64 bit systems only need the off_t version.
+ * For new 32 bit platforms, there is no need to
+ * implement the old 32 bit off_t syscalls, so
+ * they take different names.
+ * Here we map the numbers so that both versions
+ * use the same syscall table layout.
+ */
+#if __BITS_PER_LONG == 64 && !defined(__SYSCALL_COMPAT)
+#define __NR_fcntl __NR3264_fcntl
+#define __NR_statfs __NR3264_statfs
+#define __NR_fstatfs __NR3264_fstatfs
+#define __NR_truncate __NR3264_truncate
+#define __NR_ftruncate __NR3264_ftruncate
+#define __NR_lseek __NR3264_lseek
+#define __NR_sendfile __NR3264_sendfile
+#define __NR_newfstatat __NR3264_fstatat
+#define __NR_fstat __NR3264_fstat
+#define __NR_mmap __NR3264_mmap
+#define __NR_fadvise64 __NR3264_fadvise64
+#ifdef __NR3264_stat
+#define __NR_stat __NR3264_stat
+#define __NR_lstat __NR3264_lstat
+#endif
+#else
+#define __NR_fcntl64 __NR3264_fcntl
+#define __NR_statfs64 __NR3264_statfs
+#define __NR_fstatfs64 __NR3264_fstatfs
+#define __NR_truncate64 __NR3264_truncate
+#define __NR_ftruncate64 __NR3264_ftruncate
+#define __NR_llseek __NR3264_lseek
+#define __NR_sendfile64 __NR3264_sendfile
+#define __NR_fstatat64 __NR3264_fstatat
+#define __NR_fstat64 __NR3264_fstat
+#define __NR_mmap2 __NR3264_mmap
+#define __NR_fadvise64_64 __NR3264_fadvise64
+#ifdef __NR3264_stat
+#define __NR_stat64 __NR3264_stat
+#define __NR_lstat64 __NR3264_lstat
+#endif
+#endif
diff --git a/linux-headers/asm-mips/bitsperlong.h b/linux-headers/asm-mips/bitsperlong.h
new file mode 100644
index 0000000000..7268380d8d
--- /dev/null
+++ b/linux-headers/asm-mips/bitsperlong.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_MIPS_BITSPERLONG_H
+#define __ASM_MIPS_BITSPERLONG_H
+
+#define __BITS_PER_LONG _MIPS_SZLONG
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_MIPS_BITSPERLONG_H */
diff --git a/linux-headers/asm-mips/kvm.h b/linux-headers/asm-mips/kvm.h
index 6985eb59b0..edcf717c43 100644
--- a/linux-headers/asm-mips/kvm.h
+++ b/linux-headers/asm-mips/kvm.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
@@ -19,6 +20,10 @@
* Some parts derived from the x86 version of this file.
*/
+#define __KVM_HAVE_READONLY_MEM
+
+#define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+
/*
* for KVM_GET_REGS and KVM_SET_REGS
*
@@ -52,9 +57,14 @@ struct kvm_fpu {
* Register set = 0: GP registers from kvm_regs (see definitions below).
*
* Register set = 1: CP0 registers.
- * bits[15..8] - Must be zero.
- * bits[7..3] - Register 'rd' index.
- * bits[2..0] - Register 'sel' index.
+ * bits[15..8] - COP0 register set.
+ *
+ * COP0 register set = 0: Main CP0 registers.
+ * bits[7..3] - Register 'rd' index.
+ * bits[2..0] - Register 'sel' index.
+ *
+ * COP0 register set = 1: MAARs.
+ * bits[7..0] - MAAR index.
*
* Register set = 2: KVM specific registers (see definitions below).
*
@@ -113,6 +123,15 @@ struct kvm_fpu {
/*
+ * KVM_REG_MIPS_CP0 - Coprocessor 0 registers.
+ */
+
+#define KVM_REG_MIPS_MAAR (KVM_REG_MIPS_CP0 | (1 << 8))
+#define KVM_REG_MIPS_CP0_MAAR(n) (KVM_REG_MIPS_MAAR | \
+ KVM_REG_SIZE_U64 | (n))
+
+
+/*
* KVM_REG_MIPS_KVM - KVM specific control registers.
*/
diff --git a/linux-headers/asm-mips/kvm_para.h b/linux-headers/asm-mips/kvm_para.h
deleted file mode 100644
index dbb2464f3b..0000000000
--- a/linux-headers/asm-mips/kvm_para.h
+++ /dev/null
@@ -1,5 +0,0 @@
-#ifndef _ASM_MIPS_KVM_PARA_H
-#define _ASM_MIPS_KVM_PARA_H
-
-
-#endif /* _ASM_MIPS_KVM_PARA_H */
diff --git a/linux-headers/asm-mips/sgidefs.h b/linux-headers/asm-mips/sgidefs.h
new file mode 100644
index 0000000000..26143e3b7c
--- /dev/null
+++ b/linux-headers/asm-mips/sgidefs.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1996, 1999, 2001 Ralf Baechle
+ * Copyright (C) 1999 Silicon Graphics, Inc.
+ * Copyright (C) 2001 MIPS Technologies, Inc.
+ */
+#ifndef __ASM_SGIDEFS_H
+#define __ASM_SGIDEFS_H
+
+/*
+ * Using a Linux compiler for building Linux seems logic but not to
+ * everybody.
+ */
+#ifndef __linux__
+#error Use a Linux compiler or give up.
+#endif
+
+/*
+ * Definitions for the ISA levels
+ *
+ * With the introduction of MIPS32 / MIPS64 instruction sets definitions
+ * MIPS ISAs are no longer subsets of each other. Therefore comparisons
+ * on these symbols except with == may result in unexpected results and
+ * are forbidden!
+ */
+#define _MIPS_ISA_MIPS1 1
+#define _MIPS_ISA_MIPS2 2
+#define _MIPS_ISA_MIPS3 3
+#define _MIPS_ISA_MIPS4 4
+#define _MIPS_ISA_MIPS5 5
+#define _MIPS_ISA_MIPS32 6
+#define _MIPS_ISA_MIPS64 7
+
+/*
+ * Subprogram calling convention
+ */
+#define _MIPS_SIM_ABI32 1
+#define _MIPS_SIM_NABI32 2
+#define _MIPS_SIM_ABI64 3
+
+#endif /* __ASM_SGIDEFS_H */
diff --git a/linux-headers/asm-mips/unistd.h b/linux-headers/asm-mips/unistd.h
index 2a2020938e..9bfef7f764 100644
--- a/linux-headers/asm-mips/unistd.h
+++ b/linux-headers/asm-mips/unistd.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
/*
* This file is subject to the terms and conditions of the GNU General Public
* License. See the file "COPYING" in the main directory of this archive
@@ -377,16 +378,27 @@
#define __NR_memfd_create (__NR_Linux + 354)
#define __NR_bpf (__NR_Linux + 355)
#define __NR_execveat (__NR_Linux + 356)
+#define __NR_userfaultfd (__NR_Linux + 357)
+#define __NR_membarrier (__NR_Linux + 358)
+#define __NR_mlock2 (__NR_Linux + 359)
+#define __NR_copy_file_range (__NR_Linux + 360)
+#define __NR_preadv2 (__NR_Linux + 361)
+#define __NR_pwritev2 (__NR_Linux + 362)
+#define __NR_pkey_mprotect (__NR_Linux + 363)
+#define __NR_pkey_alloc (__NR_Linux + 364)
+#define __NR_pkey_free (__NR_Linux + 365)
+#define __NR_statx (__NR_Linux + 366)
+
/*
* Offset of the last Linux o32 flavoured syscall
*/
-#define __NR_Linux_syscalls 356
+#define __NR_Linux_syscalls 366
#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
#define __NR_O32_Linux 4000
-#define __NR_O32_Linux_syscalls 356
+#define __NR_O32_Linux_syscalls 366
#if _MIPS_SIM == _MIPS_SIM_ABI64
@@ -711,16 +723,26 @@
#define __NR_memfd_create (__NR_Linux + 314)
#define __NR_bpf (__NR_Linux + 315)
#define __NR_execveat (__NR_Linux + 316)
+#define __NR_userfaultfd (__NR_Linux + 317)
+#define __NR_membarrier (__NR_Linux + 318)
+#define __NR_mlock2 (__NR_Linux + 319)
+#define __NR_copy_file_range (__NR_Linux + 320)
+#define __NR_preadv2 (__NR_Linux + 321)
+#define __NR_pwritev2 (__NR_Linux + 322)
+#define __NR_pkey_mprotect (__NR_Linux + 323)
+#define __NR_pkey_alloc (__NR_Linux + 324)
+#define __NR_pkey_free (__NR_Linux + 325)
+#define __NR_statx (__NR_Linux + 326)
/*
* Offset of the last Linux 64-bit flavoured syscall
*/
-#define __NR_Linux_syscalls 316
+#define __NR_Linux_syscalls 326
#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */
#define __NR_64_Linux 5000
-#define __NR_64_Linux_syscalls 316
+#define __NR_64_Linux_syscalls 326
#if _MIPS_SIM == _MIPS_SIM_NABI32
@@ -1049,15 +1071,25 @@
#define __NR_memfd_create (__NR_Linux + 318)
#define __NR_bpf (__NR_Linux + 319)
#define __NR_execveat (__NR_Linux + 320)
+#define __NR_userfaultfd (__NR_Linux + 321)
+#define __NR_membarrier (__NR_Linux + 322)
+#define __NR_mlock2 (__NR_Linux + 323)
+#define __NR_copy_file_range (__NR_Linux + 324)
+#define __NR_preadv2 (__NR_Linux + 325)
+#define __NR_pwritev2 (__NR_Linux + 326)
+#define __NR_pkey_mprotect (__NR_Linux + 327)
+#define __NR_pkey_alloc (__NR_Linux + 328)
+#define __NR_pkey_free (__NR_Linux + 329)
+#define __NR_statx (__NR_Linux + 330)
/*
* Offset of the last N32 flavoured syscall
*/
-#define __NR_Linux_syscalls 320
+#define __NR_Linux_syscalls 330
#endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */
#define __NR_N32_Linux 6000
-#define __NR_N32_Linux_syscalls 320
+#define __NR_N32_Linux_syscalls 330
#endif /* _ASM_UNISTD_H */
diff --git a/linux-headers/asm-powerpc/bitsperlong.h b/linux-headers/asm-powerpc/bitsperlong.h
new file mode 100644
index 0000000000..46ece3ecff
--- /dev/null
+++ b/linux-headers/asm-powerpc/bitsperlong.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_POWERPC_BITSPERLONG_H
+#define __ASM_POWERPC_BITSPERLONG_H
+
+#if defined(__powerpc64__)
+# define __BITS_PER_LONG 64
+#else
+# define __BITS_PER_LONG 32
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_POWERPC_BITSPERLONG_H */
diff --git a/linux-headers/asm-powerpc/epapr_hcalls.h b/linux-headers/asm-powerpc/epapr_hcalls.h
deleted file mode 100644
index 6cca559993..0000000000
--- a/linux-headers/asm-powerpc/epapr_hcalls.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: ((GPL-2.0+ WITH Linux-syscall-note) OR BSD-3-Clause) */
-/*
- * ePAPR hcall interface
- *
- * Copyright 2008-2011 Freescale Semiconductor, Inc.
- *
- * Author: Timur Tabi <timur@freescale.com>
- *
- * This file is provided under a dual BSD/GPL license. When using or
- * redistributing this file, you may do so under either license.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of Freescale Semiconductor nor the
- * names of its contributors may be used to endorse or promote products
- * derived from this software without specific prior written permission.
- *
- *
- * ALTERNATIVELY, this software may be distributed under the terms of the
- * GNU General Public License ("GPL") as published by the Free Software
- * Foundation, either version 2 of that License or (at your option) any
- * later version.
- *
- * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY
- * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-#ifndef _ASM_POWERPC_EPAPR_HCALLS_H
-#define _ASM_POWERPC_EPAPR_HCALLS_H
-
-#define EV_BYTE_CHANNEL_SEND 1
-#define EV_BYTE_CHANNEL_RECEIVE 2
-#define EV_BYTE_CHANNEL_POLL 3
-#define EV_INT_SET_CONFIG 4
-#define EV_INT_GET_CONFIG 5
-#define EV_INT_SET_MASK 6
-#define EV_INT_GET_MASK 7
-#define EV_INT_IACK 9
-#define EV_INT_EOI 10
-#define EV_INT_SEND_IPI 11
-#define EV_INT_SET_TASK_PRIORITY 12
-#define EV_INT_GET_TASK_PRIORITY 13
-#define EV_DOORBELL_SEND 14
-#define EV_MSGSND 15
-#define EV_IDLE 16
-
-/* vendor ID: epapr */
-#define EV_LOCAL_VENDOR_ID 0 /* for private use */
-#define EV_EPAPR_VENDOR_ID 1
-#define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */
-#define EV_IBM_VENDOR_ID 3 /* IBM */
-#define EV_GHS_VENDOR_ID 4 /* Green Hills Software */
-#define EV_ENEA_VENDOR_ID 5 /* Enea */
-#define EV_WR_VENDOR_ID 6 /* Wind River Systems */
-#define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */
-#define EV_KVM_VENDOR_ID 42 /* KVM */
-
-/* The max number of bytes that a byte channel can send or receive per call */
-#define EV_BYTE_CHANNEL_MAX_BYTES 16
-
-
-#define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num))
-#define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num)
-
-/* epapr return codes */
-#define EV_SUCCESS 0
-#define EV_EPERM 1 /* Operation not permitted */
-#define EV_ENOENT 2 /* Entry Not Found */
-#define EV_EIO 3 /* I/O error occurred */
-#define EV_EAGAIN 4 /* The operation had insufficient
- * resources to complete and should be
- * retried
- */
-#define EV_ENOMEM 5 /* There was insufficient memory to
- * complete the operation */
-#define EV_EFAULT 6 /* Bad guest address */
-#define EV_ENODEV 7 /* No such device */
-#define EV_EINVAL 8 /* An argument supplied to the hcall
- was out of range or invalid */
-#define EV_INTERNAL 9 /* An internal error occurred */
-#define EV_CONFIG 10 /* A configuration error was detected */
-#define EV_INVALID_STATE 11 /* The object is in an invalid state */
-#define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */
-#define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */
-
-#endif /* _ASM_POWERPC_EPAPR_HCALLS_H */
diff --git a/linux-headers/asm-powerpc/kvm_para.h b/linux-headers/asm-powerpc/kvm_para.h
deleted file mode 100644
index 9beb49cc10..0000000000
--- a/linux-headers/asm-powerpc/kvm_para.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- *
- * Copyright IBM Corp. 2008
- *
- * Authors: Hollis Blanchard <hollisb@us.ibm.com>
- */
-
-#ifndef __POWERPC_KVM_PARA_H__
-#define __POWERPC_KVM_PARA_H__
-
-#include <linux/types.h>
-
-/*
- * Additions to this struct must only occur at the end, and should be
- * accompanied by a KVM_MAGIC_FEAT flag to advertise that they are present
- * (albeit not necessarily relevant to the current target hardware platform).
- *
- * Struct fields are always 32 or 64 bit aligned, depending on them being 32
- * or 64 bit wide respectively.
- *
- * See Documentation/virtual/kvm/ppc-pv.txt
- */
-struct kvm_vcpu_arch_shared {
- __u64 scratch1;
- __u64 scratch2;
- __u64 scratch3;
- __u64 critical; /* Guest may not get interrupts if == r1 */
- __u64 sprg0;
- __u64 sprg1;
- __u64 sprg2;
- __u64 sprg3;
- __u64 srr0;
- __u64 srr1;
- __u64 dar; /* dear on BookE */
- __u64 msr;
- __u32 dsisr;
- __u32 int_pending; /* Tells the guest if we have an interrupt */
- __u32 sr[16];
- __u32 mas0;
- __u32 mas1;
- __u64 mas7_3;
- __u64 mas2;
- __u32 mas4;
- __u32 mas6;
- __u32 esr;
- __u32 pir;
-
- /*
- * SPRG4-7 are user-readable, so we can only keep these consistent
- * between the shared area and the real registers when there's an
- * intervening exit to KVM. This also applies to SPRG3 on some
- * chips.
- *
- * This suffices for access by guest userspace, since in PR-mode
- * KVM, an exit must occur when changing the guest's MSR[PR].
- * If the guest kernel writes to SPRG3-7 via the shared area, it
- * must also use the shared area for reading while in kernel space.
- */
- __u64 sprg4;
- __u64 sprg5;
- __u64 sprg6;
- __u64 sprg7;
-};
-
-#define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */
-
-#define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num)
-
-#include <asm/epapr_hcalls.h>
-
-#define KVM_FEATURE_MAGIC_PAGE 1
-
-/* Magic page flags from host to guest */
-
-#define KVM_MAGIC_FEAT_SR (1 << 0)
-
-/* MASn, ESR, PIR, and high SPRGs */
-#define KVM_MAGIC_FEAT_MAS0_TO_SPRG7 (1 << 1)
-
-/* Magic page flags from guest to host */
-
-#define MAGIC_PAGE_FLAG_NOT_MAPPED_NX (1 << 0)
-
-
-#endif /* __POWERPC_KVM_PARA_H__ */
diff --git a/linux-headers/asm-s390/bitsperlong.h b/linux-headers/asm-s390/bitsperlong.h
new file mode 100644
index 0000000000..cceaf47b02
--- /dev/null
+++ b/linux-headers/asm-s390/bitsperlong.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_S390_BITSPERLONG_H
+#define __ASM_S390_BITSPERLONG_H
+
+#ifndef __s390x__
+#define __BITS_PER_LONG 32
+#else
+#define __BITS_PER_LONG 64
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_S390_BITSPERLONG_H */
+
diff --git a/linux-headers/asm-s390/kvm_para.h b/linux-headers/asm-s390/kvm_para.h
deleted file mode 100644
index b9ab584adf..0000000000
--- a/linux-headers/asm-s390/kvm_para.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * User API definitions for paravirtual devices on s390
- *
- * Copyright IBM Corp. 2008
- *
- * Author(s): Christian Borntraeger <borntraeger@de.ibm.com>
- */
diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h
index 1ae66a263b..d0f97cd0a4 100644
--- a/linux-headers/asm-s390/unistd_32.h
+++ b/linux-headers/asm-s390/unistd_32.h
@@ -360,5 +360,6 @@
#define __NR_s390_guarded_storage 378
#define __NR_statx 379
#define __NR_s390_sthyi 380
+#define __NR_kexec_file_load 381
#endif /* _ASM_S390_UNISTD_32_H */
diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h
index 8aa9d046a9..23ffb97746 100644
--- a/linux-headers/asm-s390/unistd_64.h
+++ b/linux-headers/asm-s390/unistd_64.h
@@ -327,5 +327,6 @@
#define __NR_s390_guarded_storage 378
#define __NR_statx 379
#define __NR_s390_sthyi 380
+#define __NR_kexec_file_load 381
#endif /* _ASM_S390_UNISTD_64_H */
diff --git a/linux-headers/asm-x86/bitsperlong.h b/linux-headers/asm-x86/bitsperlong.h
new file mode 100644
index 0000000000..5d72c84588
--- /dev/null
+++ b/linux-headers/asm-x86/bitsperlong.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef __ASM_X86_BITSPERLONG_H
+#define __ASM_X86_BITSPERLONG_H
+
+#if defined(__x86_64__) && !defined(__ILP32__)
+# define __BITS_PER_LONG 64
+#else
+# define __BITS_PER_LONG 32
+#endif
+
+#include <asm-generic/bitsperlong.h>
+
+#endif /* __ASM_X86_BITSPERLONG_H */
+
diff --git a/linux-headers/linux/kvm_para.h b/linux-headers/linux/kvm_para.h
deleted file mode 100644
index 8bcd0aa853..0000000000
--- a/linux-headers/linux/kvm_para.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef __LINUX_KVM_PARA_H
-#define __LINUX_KVM_PARA_H
-
-/*
- * This header file provides a method for making a hypercall to the host
- * Architectures should define:
- * - kvm_hypercall0, kvm_hypercall1...
- * - kvm_arch_para_features
- * - kvm_para_available
- */
-
-/* Return values for hypercalls */
-#define KVM_ENOSYS 1000
-#define KVM_EFAULT EFAULT
-#define KVM_E2BIG E2BIG
-#define KVM_EPERM EPERM
-#define KVM_EOPNOTSUPP 95
-
-#define KVM_HC_VAPIC_POLL_IRQ 1
-#define KVM_HC_MMU_OP 2
-#define KVM_HC_FEATURES 3
-#define KVM_HC_PPC_MAP_MAGIC_PAGE 4
-#define KVM_HC_KICK_CPU 5
-#define KVM_HC_MIPS_GET_CLOCK_FREQ 6
-#define KVM_HC_MIPS_EXIT_VM 7
-#define KVM_HC_MIPS_CONSOLE_OUTPUT 8
-#define KVM_HC_CLOCK_PAIRING 9
-
-/*
- * hypercalls use architecture specific
- */
-#include <asm/kvm_para.h>
-
-#endif /* __LINUX_KVM_PARA_H */
diff --git a/net/vhost-user.c b/net/vhost-user.c
index e0f16c895b..fa28aad12d 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -20,38 +20,38 @@
#include "qemu/option.h"
#include "trace.h"
-typedef struct VhostUserState {
+typedef struct NetVhostUserState {
NetClientState nc;
CharBackend chr; /* only queue index 0 */
VHostNetState *vhost_net;
guint watch;
uint64_t acked_features;
bool started;
-} VhostUserState;
+} NetVhostUserState;
VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
{
- VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
+ NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc);
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
return s->vhost_net;
}
uint64_t vhost_user_get_acked_features(NetClientState *nc)
{
- VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
+ NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc);
assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
return s->acked_features;
}
static void vhost_user_stop(int queues, NetClientState *ncs[])
{
- VhostUserState *s;
+ NetVhostUserState *s;
int i;
for (i = 0; i < queues; i++) {
assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER);
- s = DO_UPCAST(VhostUserState, nc, ncs[i]);
+ s = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
if (s->vhost_net) {
/* save acked features */
@@ -68,7 +68,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be)
{
VhostNetOptions options;
struct vhost_net *net = NULL;
- VhostUserState *s;
+ NetVhostUserState *s;
int max_queues;
int i;
@@ -77,7 +77,7 @@ static int vhost_user_start(int queues, NetClientState *ncs[], CharBackend *be)
for (i = 0; i < queues; i++) {
assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER);
- s = DO_UPCAST(VhostUserState, nc, ncs[i]);
+ s = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
options.net_backend = ncs[i];
options.opaque = be;
@@ -123,7 +123,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf,
without GUEST_ANNOUNCE capability.
*/
if (size == 60) {
- VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
+ NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc);
int r;
static int display_rarp_failure = 1;
char mac_addr[6];
@@ -146,7 +146,7 @@ static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf,
static void vhost_user_cleanup(NetClientState *nc)
{
- VhostUserState *s = DO_UPCAST(VhostUserState, nc, nc);
+ NetVhostUserState *s = DO_UPCAST(NetVhostUserState, nc, nc);
if (s->vhost_net) {
vhost_net_cleanup(s->vhost_net);
@@ -180,7 +180,7 @@ static bool vhost_user_has_ufo(NetClientState *nc)
static NetClientInfo net_vhost_user_info = {
.type = NET_CLIENT_DRIVER_VHOST_USER,
- .size = sizeof(VhostUserState),
+ .size = sizeof(NetVhostUserState),
.receive = vhost_user_receive,
.cleanup = vhost_user_cleanup,
.has_vnet_hdr = vhost_user_has_vnet_hdr,
@@ -190,7 +190,7 @@ static NetClientInfo net_vhost_user_info = {
static gboolean net_vhost_user_watch(GIOChannel *chan, GIOCondition cond,
void *opaque)
{
- VhostUserState *s = opaque;
+ NetVhostUserState *s = opaque;
qemu_chr_fe_disconnect(&s->chr);
@@ -203,7 +203,7 @@ static void chr_closed_bh(void *opaque)
{
const char *name = opaque;
NetClientState *ncs[MAX_QUEUE_NUM];
- VhostUserState *s;
+ NetVhostUserState *s;
Error *err = NULL;
int queues;
@@ -212,7 +212,7 @@ static void chr_closed_bh(void *opaque)
MAX_QUEUE_NUM);
assert(queues < MAX_QUEUE_NUM);
- s = DO_UPCAST(VhostUserState, nc, ncs[0]);
+ s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
qmp_set_link(name, false, &err);
vhost_user_stop(queues, ncs);
@@ -229,7 +229,7 @@ static void net_vhost_user_event(void *opaque, int event)
{
const char *name = opaque;
NetClientState *ncs[MAX_QUEUE_NUM];
- VhostUserState *s;
+ NetVhostUserState *s;
Chardev *chr;
Error *err = NULL;
int queues;
@@ -239,7 +239,7 @@ static void net_vhost_user_event(void *opaque, int event)
MAX_QUEUE_NUM);
assert(queues < MAX_QUEUE_NUM);
- s = DO_UPCAST(VhostUserState, nc, ncs[0]);
+ s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
chr = qemu_chr_fe_get_driver(&s->chr);
trace_vhost_user_event(chr->label, event);
switch (event) {
@@ -283,7 +283,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
{
Error *err = NULL;
NetClientState *nc, *nc0 = NULL;
- VhostUserState *s;
+ NetVhostUserState *s;
int i;
assert(name);
@@ -296,7 +296,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
nc->queue_index = i;
if (!nc0) {
nc0 = nc;
- s = DO_UPCAST(VhostUserState, nc, nc);
+ s = DO_UPCAST(NetVhostUserState, nc, nc);
if (!qemu_chr_fe_init(&s->chr, chr, &err)) {
error_report_err(err);
return -1;
@@ -305,7 +305,7 @@ static int net_vhost_user_init(NetClientState *peer, const char *device,
}
- s = DO_UPCAST(VhostUserState, nc, nc0);
+ s = DO_UPCAST(NetVhostUserState, nc, nc0);
do {
if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) {
error_report_err(err);
diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index b65c03f0ae..947dec2852 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -43,6 +43,7 @@ cp_portable() {
-e 'limits' \
-e 'linux/kernel' \
-e 'linux/sysinfo' \
+ -e 'asm-generic/kvm_para' \
> /dev/null
then
echo "Unexpected #include in input file $f".
@@ -83,11 +84,6 @@ for arch in $ARCHLIST; do
continue
fi
- # Blacklist architectures which have KVM headers but are actually dead
- if [ "$arch" = "ia64" -o "$arch" = "mips" ]; then
- continue
- fi
-
if [ "$arch" = x86 ]; then
arch_var=SRCARCH
else
@@ -98,11 +94,12 @@ for arch in $ARCHLIST; do
rm -rf "$output/linux-headers/asm-$arch"
mkdir -p "$output/linux-headers/asm-$arch"
- for header in kvm.h kvm_para.h unistd.h; do
+ for header in kvm.h unistd.h bitsperlong.h; do
cp "$tmpdir/include/asm/$header" "$output/linux-headers/asm-$arch"
done
- if [ $arch = powerpc ]; then
- cp "$tmpdir/include/asm/epapr_hcalls.h" "$output/linux-headers/asm-powerpc/"
+
+ if [ $arch = mips ]; then
+ cp "$tmpdir/include/asm/sgidefs.h" "$output/linux-headers/asm-mips/"
fi
rm -rf "$output/include/standard-headers/asm-$arch"
@@ -121,20 +118,23 @@ for arch in $ARCHLIST; do
cp "$tmpdir/include/asm/unistd_32.h" "$output/linux-headers/asm-x86/"
cp "$tmpdir/include/asm/unistd_x32.h" "$output/linux-headers/asm-x86/"
cp "$tmpdir/include/asm/unistd_64.h" "$output/linux-headers/asm-x86/"
+ cp_portable "$tmpdir/include/asm/kvm_para.h" "$output/include/standard-headers/asm-$arch"
fi
done
rm -rf "$output/linux-headers/linux"
mkdir -p "$output/linux-headers/linux"
-for header in kvm.h kvm_para.h vfio.h vfio_ccw.h vhost.h \
+for header in kvm.h vfio.h vfio_ccw.h vhost.h \
psci.h psp-sev.h userfaultfd.h; do
cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
done
+
rm -rf "$output/linux-headers/asm-generic"
mkdir -p "$output/linux-headers/asm-generic"
-for header in kvm_para.h; do
+for header in unistd.h bitsperlong.h; do
cp "$tmpdir/include/asm-generic/$header" "$output/linux-headers/asm-generic"
done
+
if [ -L "$linux/source" ]; then
cp "$linux/source/COPYING" "$output/linux-headers"
else
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index d95310ffd4..94260412e2 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -40,9 +40,7 @@
#include "qom/qom-qobject.h"
#include "sysemu/arch_init.h"
-#if defined(CONFIG_KVM)
-#include <linux/kvm_para.h>
-#endif
+#include "standard-headers/asm-x86/kvm_para.h"
#include "sysemu/sysemu.h"
#include "hw/qdev-properties.h"
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 8ac13f6c2c..664504610e 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -688,8 +688,6 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
#define CPUID_7_0_EDX_SPEC_CTRL (1U << 26) /* Speculation Control */
#define CPUID_7_0_EDX_SPEC_CTRL_SSBD (1U << 31) /* Speculative Store Bypass Disable */
-#define KVM_HINTS_DEDICATED (1U << 0)
-
#define CPUID_8000_0008_EBX_IBPB (1U << 12) /* Indirect Branch Prediction Barrier */
#define CPUID_XSAVE_XSAVEOPT (1U << 0)
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 0c656a91a4..6511329d11 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -18,7 +18,7 @@
#include <sys/utsname.h>
#include <linux/kvm.h>
-#include <linux/kvm_para.h>
+#include "standard-headers/asm-x86/kvm_para.h"
#include "qemu-common.h"
#include "cpu.h"
@@ -387,7 +387,7 @@ uint32_t kvm_arch_get_supported_cpuid(KVMState *s, uint32_t function,
ret &= ~(1U << KVM_FEATURE_PV_UNHALT);
}
} else if (function == KVM_CPUID_FEATURES && reg == R_EDX) {
- ret |= KVM_HINTS_DEDICATED;
+ ret |= 1U << KVM_HINTS_DEDICATED;
found = 1;
}
diff --git a/target/i386/kvm_i386.h b/target/i386/kvm_i386.h
index 1de9876cd9..e5df24cad1 100644
--- a/target/i386/kvm_i386.h
+++ b/target/i386/kvm_i386.h
@@ -30,12 +30,6 @@
#define kvm_pic_in_kernel() 0
#define kvm_ioapic_in_kernel() 0
-/* These constants must never be used at runtime if kvm_enabled() is false.
- * They exist so we don't need #ifdefs around KVM-specific code that already
- * checks kvm_enabled() properly.
- */
-#define KVM_CPUID_FEATURES 0
-
#endif /* CONFIG_KVM */
bool kvm_allows_irq0_override(void);
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 728c3541db..e1c3fed4dc 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -47,4 +47,5 @@ util-obj-y += qht.o
util-obj-y += range.o
util-obj-y += stats64.o
util-obj-y += systemd.o
+util-obj-y += iova-tree.o
util-obj-$(CONFIG_LINUX) += vfio-helpers.o
diff --git a/util/iova-tree.c b/util/iova-tree.c
new file mode 100644
index 0000000000..2d9cebfc89
--- /dev/null
+++ b/util/iova-tree.c
@@ -0,0 +1,114 @@
+/*
+ * IOVA tree implementation based on GTree.
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */
+
+#include <glib.h>
+#include "qemu/iova-tree.h"
+
+/*
+ * IOVA tree handle.  The layout is defined only in this file; it wraps
+ * the GTree that holds the DMAMap entries.
+ */
+struct IOVATree {
+ GTree *tree;
+};
+
+/*
+ * Ordering callback handed to g_tree_new_full().
+ *
+ * Treats each DMAMap as the address range [iova, iova + size] and returns
+ * 1 if @a starts after the end of @b, -1 if @a ends before the start of
+ * @b, and 0 otherwise (the ranges overlap).  @data is unused.
+ */
+static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
+{
+    const DMAMap *lhs = a;
+    const DMAMap *rhs = b;
+
+    /* @a lies entirely above @b */
+    if (lhs->iova > rhs->iova + rhs->size) {
+        return 1;
+    }
+
+    /* @a lies entirely below @b; anything else is an overlap */
+    return (lhs->iova + lhs->size < rhs->iova) ? -1 : 0;
+}
+
+/*
+ * Allocate and return an empty IOVA tree.  Release it with
+ * iova_tree_destroy().
+ */
+IOVATree *iova_tree_new(void)
+{
+    /*
+     * Every entry uses one DMAMap allocation as both key and value, so
+     * only a key destructor (g_free) is registered; the value destructor
+     * stays NULL to avoid freeing the same pointer twice.
+     */
+    GTree *gtree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL);
+    IOVATree *iova_tree = g_new0(IOVATree, 1);
+
+    iova_tree->tree = gtree;
+
+    return iova_tree;
+}
+
+/*
+ * Look up a stored range that overlaps @map.
+ *
+ * Returns the result of g_tree_lookup() for @map under
+ * iova_tree_compare(): a stored DMAMap overlapping @map, or NULL when no
+ * stored range overlaps it.  The returned pointer is the tree's own entry.
+ */
+DMAMap *iova_tree_find(IOVATree *tree, DMAMap *map)
+{
+ return g_tree_lookup(tree->tree, map);
+}
+
+/*
+ * Look up the stored range that contains the single address @iova.
+ *
+ * Returns the matching DMAMap, or NULL if no stored range covers @iova.
+ */
+DMAMap *iova_tree_find_address(IOVATree *tree, hwaddr iova)
+{
+    /*
+     * Probe with a zero-sized range starting at @iova (presumably "size"
+     * is an inclusive length, so this denotes exactly one address --
+     * confirm against the DMAMap declaration).
+     */
+    DMAMap *probe = &(DMAMap){ .iova = iova, .size = 0 };
+
+    return iova_tree_find(tree, probe);
+}
+
+/*
+ * Insert @range into @gtree, using the same pointer as both key and
+ * value.  No overlap or validity checks are done here.
+ */
+static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range)
+{
+ /* Key and value are sharing the same range data */
+ g_tree_insert(gtree, range, range);
+}
+
+/*
+ * Insert a copy of @map into @tree.
+ *
+ * Returns IOVA_ERR_INVALID if the range wraps around the address space or
+ * carries no permission (IOMMU_NONE), IOVA_ERR_OVERLAP if it overlaps a
+ * range already stored, and IOVA_OK on success.  The tree stores its own
+ * heap copy, so the caller keeps ownership of @map.
+ */
+int iova_tree_insert(IOVATree *tree, DMAMap *map)
+{
+    DMAMap *entry;
+
+    /* iova + size wrapped around: not a valid range */
+    if (map->iova + map->size < map->iova) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /* Mappings without any access permission are rejected */
+    if (map->perm == IOMMU_NONE) {
+        return IOVA_ERR_INVALID;
+    }
+
+    /* Ranges overlapping an existing entry may not be inserted */
+    if (iova_tree_find(tree, map) != NULL) {
+        return IOVA_ERR_OVERLAP;
+    }
+
+    entry = g_new0(DMAMap, 1);
+    *entry = *map;
+    iova_tree_insert_internal(tree->tree, entry);
+
+    return IOVA_OK;
+}
+
+/*
+ * Per-node callback used with g_tree_foreach(): forwards the node's
+ * DMAMap to the iova_tree_iterator passed in through @data and returns
+ * the iterator's result.
+ */
+static gboolean iova_tree_traverse(gpointer key, gpointer value,
+                                   gpointer data)
+{
+    iova_tree_iterator callback = data;
+    DMAMap *entry = key;
+
+    /* Entries are always inserted with the key doubling as the value */
+    g_assert(key == value);
+
+    return callback(entry);
+}
+
+/*
+ * Call @iterator on the stored DMAMap entries by walking the tree with
+ * g_tree_foreach().  The iterator's return value is passed back to GLib
+ * as the traversal callback result (GLib stops the walk when a callback
+ * returns TRUE).
+ */
+void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator)
+{
+ g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
+}
+
+/*
+ * Remove every stored range that overlaps @map.
+ *
+ * Keeps looking up and removing overlapping entries until none is left.
+ * Always returns IOVA_OK, including when nothing overlapped.
+ */
+int iova_tree_remove(IOVATree *tree, DMAMap *map)
+{
+    for (;;) {
+        DMAMap *hit = iova_tree_find(tree, map);
+
+        if (!hit) {
+            break;
+        }
+
+        /* The tree's key destructor releases the entry */
+        g_tree_remove(tree->tree, hit);
+    }
+
+    return IOVA_OK;
+}
+
+/*
+ * Tear down @tree: destroy the underlying GTree, then free the IOVATree
+ * wrapper itself.  @tree must not be used afterwards.
+ */
+void iova_tree_destroy(IOVATree *tree)
+{
+ g_tree_destroy(tree->tree);
+ g_free(tree);
+}