diff options
Diffstat (limited to 'hw/i386/intel_iommu.c')
-rw-r--r-- | hw/i386/intel_iommu.c | 1115 |
1 files changed, 750 insertions, 365 deletions
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index 75f075547f..cc8e59674e 100644 --- a/hw/i386/intel_iommu.c +++ b/hw/i386/intel_iommu.c @@ -49,17 +49,24 @@ /* pe operations */ #define VTD_PE_GET_TYPE(pe) ((pe)->val[0] & VTD_SM_PASID_ENTRY_PGTT) #define VTD_PE_GET_LEVEL(pe) (2 + (((pe)->val[0] >> 2) & VTD_SM_PASID_ENTRY_AW)) -#define VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write) {\ - if (ret_fr) { \ - ret_fr = -ret_fr; \ - if (is_fpd_set && vtd_is_qualified_fault(ret_fr)) { \ - trace_vtd_fault_disabled(); \ - } else { \ - vtd_report_dmar_fault(s, source_id, addr, ret_fr, is_write); \ - } \ - goto error; \ - } \ -} + +/* + * PCI bus number (or SID) is not reliable since the device is usaully + * initialized before guest can configure the PCI bridge + * (SECONDARY_BUS_NUMBER). + */ +struct vtd_as_key { + PCIBus *bus; + uint8_t devfn; + uint32_t pasid; +}; + +struct vtd_iotlb_key { + uint64_t gfn; + uint32_t pasid; + uint16_t sid; + uint8_t level; +}; static void vtd_address_space_refresh_all(IntelIOMMUState *s); static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n); @@ -181,6 +188,18 @@ static void vtd_update_scalable_state(IntelIOMMUState *s) } } +static void vtd_update_iq_dw(IntelIOMMUState *s) +{ + uint64_t val = vtd_get_quad_raw(s, DMAR_IQA_REG); + + if (s->ecap & VTD_ECAP_SMTS && + val & VTD_IQA_DW_MASK) { + s->iq_dw = true; + } else { + s->iq_dw = false; + } +} + /* Whether the address space needs to notify new mappings */ static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) { @@ -188,14 +207,47 @@ static inline gboolean vtd_as_has_map_notifier(VTDAddressSpace *as) } /* GHashTable functions */ -static gboolean vtd_uint64_equal(gconstpointer v1, gconstpointer v2) +static gboolean vtd_iotlb_equal(gconstpointer v1, gconstpointer v2) +{ + const struct vtd_iotlb_key *key1 = v1; + const struct vtd_iotlb_key *key2 = v2; + + return key1->sid == key2->sid && + key1->pasid == key2->pasid && + key1->level == key2->level && + key1->gfn == key2->gfn; +} + +static guint vtd_iotlb_hash(gconstpointer v) { - return *((const uint64_t *)v1) == *((const uint64_t *)v2); + const struct vtd_iotlb_key *key = v; + uint64_t hash64 = key->gfn | ((uint64_t)(key->sid) << VTD_IOTLB_SID_SHIFT) | + (uint64_t)(key->level - 1) << VTD_IOTLB_LVL_SHIFT | + (uint64_t)(key->pasid) << VTD_IOTLB_PASID_SHIFT; + + return (guint)((hash64 >> 32) ^ (hash64 & 0xffffffffU)); } -static guint vtd_uint64_hash(gconstpointer v) +static gboolean vtd_as_equal(gconstpointer v1, gconstpointer v2) { - return (guint)*(const uint64_t *)v; + const struct vtd_as_key *key1 = v1; + const struct vtd_as_key *key2 = v2; + + return (key1->bus == key2->bus) && (key1->devfn == key2->devfn) && + (key1->pasid == key2->pasid); +} + +/* + * Note that we use pointer to PCIBus as the key, so hashing/shifting + * based on the pointer value is intended. Note that we deal with + * collisions through vtd_as_equal(). + */ +static guint vtd_as_hash(gconstpointer v) +{ + const struct vtd_as_key *key = v; + guint value = (guint)(uintptr_t)key->bus; + + return (guint)(value << 8 | key->devfn); } static gboolean vtd_hash_remove_by_domain(gpointer key, gpointer value, @@ -236,22 +288,14 @@ static gboolean vtd_hash_remove_by_page(gpointer key, gpointer value, static void vtd_reset_context_cache_locked(IntelIOMMUState *s) { VTDAddressSpace *vtd_as; - VTDBus *vtd_bus; - GHashTableIter bus_it; - uint32_t devfn_it; + GHashTableIter as_it; trace_vtd_context_cache_reset(); - g_hash_table_iter_init(&bus_it, s->vtd_as_by_busptr); + g_hash_table_iter_init(&as_it, s->vtd_address_spaces); - while (g_hash_table_iter_next (&bus_it, NULL, (void**)&vtd_bus)) { - for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { - vtd_as = vtd_bus->dev_as[devfn_it]; - if (!vtd_as) { - continue; - } - vtd_as->context_cache_entry.context_cache_gen = 0; - } + while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) { + vtd_as->context_cache_entry.context_cache_gen = 0; } s->context_cache_gen = 1; } @@ -278,13 +322,6 @@ static void vtd_reset_caches(IntelIOMMUState *s) vtd_iommu_unlock(s); } -static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id, - uint32_t level) -{ - return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) | - ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT); -} - static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) { return (addr & vtd_slpt_level_page_mask(level)) >> VTD_PAGE_SHIFT_4K; @@ -292,15 +329,17 @@ static uint64_t vtd_get_iotlb_gfn(hwaddr addr, uint32_t level) /* Must be called with IOMMU lock held */ static VTDIOTLBEntry *vtd_lookup_iotlb(IntelIOMMUState *s, uint16_t source_id, - hwaddr addr) + uint32_t pasid, hwaddr addr) { + struct vtd_iotlb_key key; VTDIOTLBEntry *entry; - uint64_t key; int level; for (level = VTD_SL_PT_LEVEL; level < VTD_SL_PML4_LEVEL; level++) { - key = vtd_get_iotlb_key(vtd_get_iotlb_gfn(addr, level), - source_id, level); + key.gfn = vtd_get_iotlb_gfn(addr, level); + key.level = level; + key.sid = source_id; + key.pasid = pasid; entry = g_hash_table_lookup(s->iotlb, &key); if (entry) { goto out; @@ -314,10 +353,11 @@ out: /* Must be with IOMMU lock held */ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, uint16_t domain_id, hwaddr addr, uint64_t slpte, - uint8_t access_flags, uint32_t level) + uint8_t access_flags, uint32_t level, + uint32_t pasid) { VTDIOTLBEntry *entry = g_malloc(sizeof(*entry)); - uint64_t *key = g_malloc(sizeof(*key)); + struct vtd_iotlb_key *key = g_malloc(sizeof(*key)); uint64_t gfn = vtd_get_iotlb_gfn(addr, level); trace_vtd_iotlb_page_update(source_id, addr, slpte, domain_id); @@ -331,7 +371,13 @@ static void vtd_update_iotlb(IntelIOMMUState *s, uint16_t source_id, entry->slpte = slpte; entry->access_flags = access_flags; entry->mask = vtd_slpt_level_page_mask(level); - *key = vtd_get_iotlb_key(gfn, source_id, level); + entry->pasid = pasid; + + key->gfn = gfn; + key->sid = source_id; + key->level = level; + key->pasid = pasid; + g_hash_table_replace(s->iotlb, key, entry); } @@ -351,7 +397,7 @@ static void vtd_generate_interrupt(IntelIOMMUState *s, hwaddr mesg_addr_reg, trace_vtd_irq_generate(msi.address, msi.data); - apic_get_class()->send_msi(&msi); + apic_get_class(NULL)->send_msi(&msi); } /* Generate a fault event to software via MSI if conditions are met. @@ -423,19 +469,12 @@ static void vtd_set_frcd_and_update_ppf(IntelIOMMUState *s, uint16_t index) /* Must not update F field now, should be done later */ static void vtd_record_frcd(IntelIOMMUState *s, uint16_t index, - uint16_t source_id, hwaddr addr, - VTDFaultReason fault, bool is_write) + uint64_t hi, uint64_t lo) { - uint64_t hi = 0, lo; hwaddr frcd_reg_addr = DMAR_FRCD_REG_OFFSET + (((uint64_t)index) << 4); assert(index < DMAR_FRCD_REG_NR); - lo = VTD_FRCD_FI(addr); - hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); - if (!is_write) { - hi |= VTD_FRCD_T; - } vtd_set_quad_raw(s, frcd_reg_addr, lo); vtd_set_quad_raw(s, frcd_reg_addr + 8, hi); @@ -461,21 +500,11 @@ static bool vtd_try_collapse_fault(IntelIOMMUState *s, uint16_t source_id) } /* Log and report an DMAR (address translation) fault to software */ -static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, - hwaddr addr, VTDFaultReason fault, - bool is_write) +static void vtd_report_frcd_fault(IntelIOMMUState *s, uint64_t source_id, + uint64_t hi, uint64_t lo) { uint32_t fsts_reg = vtd_get_long_raw(s, DMAR_FSTS_REG); - assert(fault < VTD_FR_MAX); - - if (fault == VTD_FR_RESERVED_ERR) { - /* This is not a normal fault reason case. Drop it. */ - return; - } - - trace_vtd_dmar_fault(source_id, fault, addr, is_write); - if (fsts_reg & VTD_FSTS_PFO) { error_report_once("New fault is not recorded due to " "Primary Fault Overflow"); @@ -495,7 +524,7 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, return; } - vtd_record_frcd(s, s->next_frcd_reg, source_id, addr, fault, is_write); + vtd_record_frcd(s, s->next_frcd_reg, hi, lo); if (fsts_reg & VTD_FSTS_PPF) { error_report_once("There are pending faults already, " @@ -520,6 +549,40 @@ static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, } } +/* Log and report an DMAR (address translation) fault to software */ +static void vtd_report_dmar_fault(IntelIOMMUState *s, uint16_t source_id, + hwaddr addr, VTDFaultReason fault, + bool is_write, bool is_pasid, + uint32_t pasid) +{ + uint64_t hi, lo; + + assert(fault < VTD_FR_MAX); + + trace_vtd_dmar_fault(source_id, fault, addr, is_write); + + lo = VTD_FRCD_FI(addr); + hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault) | + VTD_FRCD_PV(pasid) | VTD_FRCD_PP(is_pasid); + if (!is_write) { + hi |= VTD_FRCD_T; + } + + vtd_report_frcd_fault(s, source_id, hi, lo); +} + + +static void vtd_report_ir_fault(IntelIOMMUState *s, uint64_t source_id, + VTDFaultReason fault, uint16_t index) +{ + uint64_t hi, lo; + + lo = VTD_FRCD_IR_IDX(index); + hi = VTD_FRCD_SID(source_id) | VTD_FRCD_FR(fault); + + vtd_report_frcd_fault(s, source_id, hi, lo); +} + /* Handle Invalidation Queue Errors of queued invalidation interface error * conditions. */ @@ -569,7 +632,8 @@ static int vtd_get_root_entry(IntelIOMMUState *s, uint8_t index, dma_addr_t addr; addr = s->root + index * sizeof(*re); - if (dma_memory_read(&address_space_memory, addr, re, sizeof(*re))) { + if (dma_memory_read(&address_space_memory, addr, + re, sizeof(*re), MEMTXATTRS_UNSPECIFIED)) { re->lo = 0; return -VTD_FR_ROOT_TABLE_INV; } @@ -602,7 +666,8 @@ static int vtd_get_context_entry_from_root(IntelIOMMUState *s, } addr = addr + index * ce_size; - if (dma_memory_read(&address_space_memory, addr, ce, ce_size)) { + if (dma_memory_read(&address_space_memory, addr, + ce, ce_size, MEMTXATTRS_UNSPECIFIED)) { return -VTD_FR_CONTEXT_TABLE_INV; } @@ -639,8 +704,8 @@ static uint64_t vtd_get_slpte(dma_addr_t base_addr, uint32_t index) assert(index < VTD_SL_PT_ENTRY_NR); if (dma_memory_read(&address_space_memory, - base_addr + index * sizeof(slpte), &slpte, - sizeof(slpte))) { + base_addr + index * sizeof(slpte), + &slpte, sizeof(slpte), MEMTXATTRS_UNSPECIFIED)) { slpte = (uint64_t)-1; return slpte; } @@ -704,10 +769,13 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t pasid_dir_base, index = VTD_PASID_DIR_INDEX(pasid); entry_size = VTD_PASID_DIR_ENTRY_SIZE; addr = pasid_dir_base + index * entry_size; - if (dma_memory_read(&address_space_memory, addr, pdire, entry_size)) { + if (dma_memory_read(&address_space_memory, addr, + pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) { return -VTD_FR_PASID_TABLE_INV; } + pdire->val = le64_to_cpu(pdire->val); + return 0; } @@ -728,9 +796,13 @@ static int vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState *s, index = VTD_PASID_TABLE_INDEX(pasid); entry_size = VTD_PASID_ENTRY_SIZE; addr = addr + index * entry_size; - if (dma_memory_read(&address_space_memory, addr, pe, entry_size)) { + if (dma_memory_read(&address_space_memory, addr, + pe, entry_size, MEMTXATTRS_UNSPECIFIED)) { return -VTD_FR_PASID_TABLE_INV; } + for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) { + pe->val[i] = le64_to_cpu(pe->val[i]); + } /* Do translation type check */ if (!vtd_pe_type_check(x86_iommu, pe)) { @@ -796,13 +868,15 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState *s, static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, VTDContextEntry *ce, - VTDPASIDEntry *pe) + VTDPASIDEntry *pe, + uint32_t pasid) { - uint32_t pasid; dma_addr_t pasid_dir_base; int ret = 0; - pasid = VTD_CE_GET_RID2PASID(ce); + if (pasid == PCI_NO_PASID) { + pasid = VTD_CE_GET_RID2PASID(ce); + } pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); ret = vtd_get_pe_from_pasid_table(s, pasid_dir_base, pasid, pe); @@ -811,15 +885,17 @@ static int vtd_ce_get_rid2pasid_entry(IntelIOMMUState *s, static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s, VTDContextEntry *ce, - bool *pe_fpd_set) + bool *pe_fpd_set, + uint32_t pasid) { int ret; - uint32_t pasid; dma_addr_t pasid_dir_base; VTDPASIDDirEntry pdire; VTDPASIDEntry pe; - pasid = VTD_CE_GET_RID2PASID(ce); + if (pasid == PCI_NO_PASID) { + pasid = VTD_CE_GET_RID2PASID(ce); + } pasid_dir_base = VTD_CE_GET_PASID_DIR_TABLE(ce); /* @@ -865,12 +941,13 @@ static inline uint32_t vtd_ce_get_level(VTDContextEntry *ce) } static uint32_t vtd_get_iova_level(IntelIOMMUState *s, - VTDContextEntry *ce) + VTDContextEntry *ce, + uint32_t pasid) { VTDPASIDEntry pe; if (s->root_scalable) { - vtd_ce_get_rid2pasid_entry(s, ce, &pe); + vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); return VTD_PE_GET_LEVEL(&pe); } @@ -883,12 +960,13 @@ static inline uint32_t vtd_ce_get_agaw(VTDContextEntry *ce) } static uint32_t vtd_get_iova_agaw(IntelIOMMUState *s, - VTDContextEntry *ce) + VTDContextEntry *ce, + uint32_t pasid) { VTDPASIDEntry pe; if (s->root_scalable) { - vtd_ce_get_rid2pasid_entry(s, ce, &pe); + vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); return 30 + ((pe.val[0] >> 2) & VTD_SM_PASID_ENTRY_AW) * 9; } @@ -930,31 +1008,33 @@ static inline bool vtd_ce_type_check(X86IOMMUState *x86_iommu, } static inline uint64_t vtd_iova_limit(IntelIOMMUState *s, - VTDContextEntry *ce, uint8_t aw) + VTDContextEntry *ce, uint8_t aw, + uint32_t pasid) { - uint32_t ce_agaw = vtd_get_iova_agaw(s, ce); + uint32_t ce_agaw = vtd_get_iova_agaw(s, ce, pasid); return 1ULL << MIN(ce_agaw, aw); } /* Return true if IOVA passes range check, otherwise false. */ static inline bool vtd_iova_range_check(IntelIOMMUState *s, uint64_t iova, VTDContextEntry *ce, - uint8_t aw) + uint8_t aw, uint32_t pasid) { /* * Check if @iova is above 2^X-1, where X is the minimum of MGAW * in CAP_REG and AW in context-entry. */ - return !(iova & ~(vtd_iova_limit(s, ce, aw) - 1)); + return !(iova & ~(vtd_iova_limit(s, ce, aw, pasid) - 1)); } static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, - VTDContextEntry *ce) + VTDContextEntry *ce, + uint32_t pasid) { VTDPASIDEntry pe; if (s->root_scalable) { - vtd_ce_get_rid2pasid_entry(s, ce, &pe); + vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); return pe.val[0] & VTD_SM_PASID_ENTRY_SLPTPTR; } @@ -965,66 +1045,59 @@ static dma_addr_t vtd_get_iova_pgtbl_base(IntelIOMMUState *s, * Rsvd field masks for spte: * vtd_spte_rsvd 4k pages * vtd_spte_rsvd_large large pages + * + * We support only 3-level and 4-level page tables (see vtd_init() which + * sets only VTD_CAP_SAGAW_39bit and maybe VTD_CAP_SAGAW_48bit bits in s->cap). */ -static uint64_t vtd_spte_rsvd[5]; -static uint64_t vtd_spte_rsvd_large[5]; +#define VTD_SPTE_RSVD_LEN 5 +static uint64_t vtd_spte_rsvd[VTD_SPTE_RSVD_LEN]; +static uint64_t vtd_spte_rsvd_large[VTD_SPTE_RSVD_LEN]; static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level) { - uint64_t rsvd_mask = vtd_spte_rsvd[level]; + uint64_t rsvd_mask; + + /* + * We should have caught a guest-mis-programmed level earlier, + * via vtd_is_level_supported. + */ + assert(level < VTD_SPTE_RSVD_LEN); + /* + * Zero level doesn't exist. The smallest level is VTD_SL_PT_LEVEL=1 and + * checked by vtd_is_last_slpte(). + */ + assert(level); if ((level == VTD_SL_PD_LEVEL || level == VTD_SL_PDP_LEVEL) && (slpte & VTD_SL_PT_PAGE_SIZE_MASK)) { /* large page */ rsvd_mask = vtd_spte_rsvd_large[level]; + } else { + rsvd_mask = vtd_spte_rsvd[level]; } return slpte & rsvd_mask; } -/* Find the VTD address space associated with a given bus number */ -static VTDBus *vtd_find_as_from_bus_num(IntelIOMMUState *s, uint8_t bus_num) -{ - VTDBus *vtd_bus = s->vtd_as_by_bus_num[bus_num]; - GHashTableIter iter; - - if (vtd_bus) { - return vtd_bus; - } - - /* - * Iterate over the registered buses to find the one which - * currently holds this bus number and update the bus_num - * lookup table. - */ - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { - if (pci_bus_num(vtd_bus->bus) == bus_num) { - s->vtd_as_by_bus_num[bus_num] = vtd_bus; - return vtd_bus; - } - } - - return NULL; -} - /* Given the @iova, get relevant @slptep. @slpte_level will be the last level * of the translation, can be used for deciding the size of large page. */ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, uint64_t iova, bool is_write, uint64_t *slptep, uint32_t *slpte_level, - bool *reads, bool *writes, uint8_t aw_bits) + bool *reads, bool *writes, uint8_t aw_bits, + uint32_t pasid) { - dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); - uint32_t level = vtd_get_iova_level(s, ce); + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid); + uint32_t level = vtd_get_iova_level(s, ce, pasid); uint32_t offset; uint64_t slpte; uint64_t access_right_check; + uint64_t xlat, size; - if (!vtd_iova_range_check(s, iova, ce, aw_bits)) { - error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 ")", - __func__, iova); + if (!vtd_iova_range_check(s, iova, ce, aw_bits, pasid)) { + error_report_once("%s: detected IOVA overflow (iova=0x%" PRIx64 "," + "pasid=0x%" PRIx32 ")", __func__, iova, pasid); return -VTD_FR_ADDR_BEYOND_MGAW; } @@ -1037,8 +1110,9 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, if (slpte == (uint64_t)-1) { error_report_once("%s: detected read error on DMAR slpte " - "(iova=0x%" PRIx64 ")", __func__, iova); - if (level == vtd_get_iova_level(s, ce)) { + "(iova=0x%" PRIx64 ", pasid=0x%" PRIx32 ")", + __func__, iova, pasid); + if (level == vtd_get_iova_level(s, ce, pasid)) { /* Invalid programming of context-entry */ return -VTD_FR_CONTEXT_ENTRY_INV; } else { @@ -1050,26 +1124,50 @@ static int vtd_iova_to_slpte(IntelIOMMUState *s, VTDContextEntry *ce, if (!(slpte & access_right_check)) { error_report_once("%s: detected slpte permission error " "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " - "slpte=0x%" PRIx64 ", write=%d)", __func__, - iova, level, slpte, is_write); + "slpte=0x%" PRIx64 ", write=%d, pasid=0x%" + PRIx32 ")", __func__, iova, level, + slpte, is_write, pasid); return is_write ? -VTD_FR_WRITE : -VTD_FR_READ; } if (vtd_slpte_nonzero_rsvd(slpte, level)) { error_report_once("%s: detected splte reserve non-zero " "iova=0x%" PRIx64 ", level=0x%" PRIx32 - "slpte=0x%" PRIx64 ")", __func__, iova, - level, slpte); + "slpte=0x%" PRIx64 ", pasid=0x%" PRIX32 ")", + __func__, iova, level, slpte, pasid); return -VTD_FR_PAGING_ENTRY_RSVD; } if (vtd_is_last_slpte(slpte, level)) { *slptep = slpte; *slpte_level = level; - return 0; + break; } addr = vtd_get_slpte_addr(slpte, aw_bits); level--; } + + xlat = vtd_get_slpte_addr(*slptep, aw_bits); + size = ~vtd_slpt_level_page_mask(level) + 1; + + /* + * From VT-d spec 3.14: Untranslated requests and translation + * requests that result in an address in the interrupt range will be + * blocked with condition code LGN.4 or SGN.8. + */ + if ((xlat > VTD_INTERRUPT_ADDR_LAST || + xlat + size - 1 < VTD_INTERRUPT_ADDR_FIRST)) { + return 0; + } else { + error_report_once("%s: xlat address is in interrupt range " + "(iova=0x%" PRIx64 ", level=0x%" PRIx32 ", " + "slpte=0x%" PRIx64 ", write=%d, " + "xlat=0x%" PRIx64 ", size=0x%" PRIx64 ", " + "pasid=0x%" PRIx32 ")", + __func__, iova, level, slpte, is_write, + xlat, size, pasid); + return s->scalable_mode ? -VTD_FR_SM_INTERRUPT_ADDR : + -VTD_FR_INTERRUPT_ADDR; + } } typedef int (*vtd_page_walk_hook)(IOMMUTLBEvent *event, void *private); @@ -1105,7 +1203,7 @@ static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) .translated_addr = entry->translated_addr, .perm = entry->perm, }; - DMAMap *mapped = iova_tree_find(as->iova_tree, &target); + const DMAMap *mapped = iova_tree_find(as->iova_tree, &target); if (event->type == IOMMU_NOTIFIER_UNMAP && !info->notify_unmap) { trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); @@ -1153,7 +1251,7 @@ static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) return ret; } /* Drop any existing mapping */ - iova_tree_remove(as->iova_tree, &target); + iova_tree_remove(as->iova_tree, target); /* Recover the correct type */ event->type = IOMMU_NOTIFIER_MAP; entry->perm = cache_perm; @@ -1166,7 +1264,7 @@ static int vtd_page_walk_one(IOMMUTLBEvent *event, vtd_page_walk_info *info) trace_vtd_page_walk_one_skip_unmap(entry->iova, entry->addr_mask); return 0; } - iova_tree_remove(as->iova_tree, &target); + iova_tree_remove(as->iova_tree, target); } trace_vtd_page_walk_one(info->domain_id, entry->iova, @@ -1280,18 +1378,19 @@ next: */ static int vtd_page_walk(IntelIOMMUState *s, VTDContextEntry *ce, uint64_t start, uint64_t end, - vtd_page_walk_info *info) + vtd_page_walk_info *info, + uint32_t pasid) { - dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce); - uint32_t level = vtd_get_iova_level(s, ce); + dma_addr_t addr = vtd_get_iova_pgtbl_base(s, ce, pasid); + uint32_t level = vtd_get_iova_level(s, ce, pasid); - if (!vtd_iova_range_check(s, start, ce, info->aw)) { + if (!vtd_iova_range_check(s, start, ce, info->aw, pasid)) { return -VTD_FR_ADDR_BEYOND_MGAW; } - if (!vtd_iova_range_check(s, end, ce, info->aw)) { + if (!vtd_iova_range_check(s, end, ce, info->aw, pasid)) { /* Fix end so that it reaches the maximum */ - end = vtd_iova_limit(s, ce, info->aw); + end = vtd_iova_limit(s, ce, info->aw, pasid); } return vtd_page_walk_level(addr, start, end, level, true, true, info); @@ -1359,7 +1458,7 @@ static int vtd_ce_rid2pasid_check(IntelIOMMUState *s, * has valid rid2pasid setting, which includes valid * rid2pasid field and corresponding pasid entry setting */ - return vtd_ce_get_rid2pasid_entry(s, ce, &pe); + return vtd_ce_get_rid2pasid_entry(s, ce, &pe, PCI_NO_PASID); } /* Map a device to its corresponding domain (context-entry) */ @@ -1442,12 +1541,13 @@ static int vtd_sync_shadow_page_hook(IOMMUTLBEvent *event, } static uint16_t vtd_get_domain_id(IntelIOMMUState *s, - VTDContextEntry *ce) + VTDContextEntry *ce, + uint32_t pasid) { VTDPASIDEntry pe; if (s->root_scalable) { - vtd_ce_get_rid2pasid_entry(s, ce, &pe); + vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); return VTD_SM_PASID_ENTRY_DID(pe.val[1]); } @@ -1465,19 +1565,23 @@ static int vtd_sync_shadow_page_table_range(VTDAddressSpace *vtd_as, .notify_unmap = true, .aw = s->aw_bits, .as = vtd_as, - .domain_id = vtd_get_domain_id(s, ce), + .domain_id = vtd_get_domain_id(s, ce, vtd_as->pasid), }; - return vtd_page_walk(s, ce, addr, addr + size, &info); + return vtd_page_walk(s, ce, addr, addr + size, &info, vtd_as->pasid); } -static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) +static int vtd_address_space_sync(VTDAddressSpace *vtd_as) { int ret; VTDContextEntry ce; IOMMUNotifier *n; - if (!(vtd_as->iommu.iommu_notify_flags & IOMMU_NOTIFIER_IOTLB_EVENTS)) { + /* If no MAP notifier registered, we simply invalidate all the cache */ + if (!vtd_as_has_map_notifier(vtd_as)) { + IOMMU_NOTIFIER_FOREACH(n, &vtd_as->iommu) { + memory_region_unmap_iommu_notifier_range(n); + } return 0; } @@ -1512,19 +1616,38 @@ static int vtd_sync_shadow_page_table(VTDAddressSpace *vtd_as) * 1st-level translation or 2nd-level translation, it depends * on PGTT setting. */ -static bool vtd_dev_pt_enabled(VTDAddressSpace *as) +static bool vtd_dev_pt_enabled(IntelIOMMUState *s, VTDContextEntry *ce, + uint32_t pasid) { - IntelIOMMUState *s; - VTDContextEntry ce; VTDPASIDEntry pe; int ret; + if (s->root_scalable) { + ret = vtd_ce_get_rid2pasid_entry(s, ce, &pe, pasid); + if (ret) { + /* + * This error is guest triggerable. We should assumt PT + * not enabled for safety. + */ + return false; + } + return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); + } + + return (vtd_ce_get_type(ce) == VTD_CONTEXT_TT_PASS_THROUGH); + +} + +static bool vtd_as_pt_enabled(VTDAddressSpace *as) +{ + IntelIOMMUState *s; + VTDContextEntry ce; + assert(as); s = as->iommu_state; - ret = vtd_dev_to_context_entry(s, pci_bus_num(as->bus), - as->devfn, &ce); - if (ret) { + if (vtd_dev_to_context_entry(s, pci_bus_num(as->bus), as->devfn, + &ce)) { /* * Possibly failed to parse the context entry for some reason * (e.g., during init, or any guest configuration errors on @@ -1534,29 +1657,20 @@ static bool vtd_dev_pt_enabled(VTDAddressSpace *as) return false; } - if (s->root_scalable) { - ret = vtd_ce_get_rid2pasid_entry(s, &ce, &pe); - if (ret) { - error_report_once("%s: vtd_ce_get_rid2pasid_entry error: %"PRId32, - __func__, ret); - return false; - } - return (VTD_PE_GET_TYPE(&pe) == VTD_SM_PASID_ENTRY_PT); - } - - return (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH); + return vtd_dev_pt_enabled(s, &ce, as->pasid); } /* Return whether the device is using IOMMU translation. */ static bool vtd_switch_address_space(VTDAddressSpace *as) { - bool use_iommu; + bool use_iommu, pt; /* Whether we need to take the BQL on our own */ - bool take_bql = !qemu_mutex_iothread_locked(); + bool take_bql = !bql_locked(); assert(as); - use_iommu = as->iommu_state->dmar_enabled && !vtd_dev_pt_enabled(as); + use_iommu = as->iommu_state->dmar_enabled && !vtd_as_pt_enabled(as); + pt = as->iommu_state->dmar_enabled && vtd_as_pt_enabled(as); trace_vtd_switch_address_space(pci_bus_num(as->bus), VTD_PCI_SLOT(as->devfn), @@ -1569,20 +1683,62 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) * it. We'd better make sure we have had it already, or, take it. */ if (take_bql) { - qemu_mutex_lock_iothread(); + bql_lock(); } /* Turn off first then on the other */ if (use_iommu) { memory_region_set_enabled(&as->nodmar, false); memory_region_set_enabled(MEMORY_REGION(&as->iommu), true); + /* + * vt-d spec v3.4 3.14: + * + * """ + * Requests-with-PASID with input address in range 0xFEEx_xxxx + * are translated normally like any other request-with-PASID + * through DMA-remapping hardware. + * """ + * + * Need to disable ir for as with PASID. + */ + if (as->pasid != PCI_NO_PASID) { + memory_region_set_enabled(&as->iommu_ir, false); + } else { + memory_region_set_enabled(&as->iommu_ir, true); + } } else { memory_region_set_enabled(MEMORY_REGION(&as->iommu), false); memory_region_set_enabled(&as->nodmar, true); } + /* + * vtd-spec v3.4 3.14: + * + * """ + * Requests-with-PASID with input address in range 0xFEEx_xxxx are + * translated normally like any other request-with-PASID through + * DMA-remapping hardware. However, if such a request is processed + * using pass-through translation, it will be blocked as described + * in the paragraph below. + * + * Software must not program paging-structure entries to remap any + * address to the interrupt address range. Untranslated requests + * and translation requests that result in an address in the + * interrupt range will be blocked with condition code LGN.4 or + * SGN.8. + * """ + * + * We enable per as memory region (iommu_ir_fault) for catching + * the translation for interrupt range through PASID + PT. + */ + if (pt && as->pasid != PCI_NO_PASID) { + memory_region_set_enabled(&as->iommu_ir_fault, true); + } else { + memory_region_set_enabled(&as->iommu_ir_fault, false); + } + if (take_bql) { - qemu_mutex_unlock_iothread(); + bql_unlock(); } return use_iommu; @@ -1590,24 +1746,13 @@ static bool vtd_switch_address_space(VTDAddressSpace *as) static void vtd_switch_address_space_all(IntelIOMMUState *s) { + VTDAddressSpace *vtd_as; GHashTableIter iter; - VTDBus *vtd_bus; - int i; - - g_hash_table_iter_init(&iter, s->vtd_as_by_busptr); - while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_bus)) { - for (i = 0; i < PCI_DEVFN_MAX; i++) { - if (!vtd_bus->dev_as[i]) { - continue; - } - vtd_switch_address_space(vtd_bus->dev_as[i]); - } - } -} -static inline uint16_t vtd_make_source_id(uint8_t bus_num, uint8_t devfn) -{ - return ((bus_num & 0xffUL) << 8) | (devfn & 0xffUL); + g_hash_table_iter_init(&iter, s->vtd_address_spaces); + while (g_hash_table_iter_next(&iter, NULL, (void **)&vtd_as)) { + vtd_switch_address_space(vtd_as); + } } static const bool vtd_qualified_faults[] = { @@ -1621,11 +1766,12 @@ static const bool vtd_qualified_faults[] = { [VTD_FR_PAGING_ENTRY_INV] = true, [VTD_FR_ROOT_TABLE_INV] = false, [VTD_FR_CONTEXT_TABLE_INV] = false, + [VTD_FR_INTERRUPT_ADDR] = true, [VTD_FR_ROOT_ENTRY_RSVD] = false, [VTD_FR_PAGING_ENTRY_RSVD] = true, [VTD_FR_CONTEXT_ENTRY_TT] = true, [VTD_FR_PASID_TABLE_INV] = false, - [VTD_FR_RESERVED_ERR] = false, + [VTD_FR_SM_INTERRUPT_ADDR] = true, [VTD_FR_MAX] = false, }; @@ -1643,18 +1789,37 @@ static inline bool vtd_is_interrupt_addr(hwaddr addr) return VTD_INTERRUPT_ADDR_FIRST <= addr && addr <= VTD_INTERRUPT_ADDR_LAST; } +static gboolean vtd_find_as_by_sid(gpointer key, gpointer value, + gpointer user_data) +{ + struct vtd_as_key *as_key = (struct vtd_as_key *)key; + uint16_t target_sid = *(uint16_t *)user_data; + uint16_t sid = PCI_BUILD_BDF(pci_bus_num(as_key->bus), as_key->devfn); + return sid == target_sid; +} + +static VTDAddressSpace *vtd_get_as_by_sid(IntelIOMMUState *s, uint16_t sid) +{ + uint8_t bus_num = PCI_BUS_NUM(sid); + VTDAddressSpace *vtd_as = s->vtd_as_cache[bus_num]; + + if (vtd_as && + (sid == PCI_BUILD_BDF(pci_bus_num(vtd_as->bus), vtd_as->devfn))) { + return vtd_as; + } + + vtd_as = g_hash_table_find(s->vtd_address_spaces, vtd_find_as_by_sid, &sid); + s->vtd_as_cache[bus_num] = vtd_as; + + return vtd_as; +} + static void vtd_pt_enable_fast_path(IntelIOMMUState *s, uint16_t source_id) { - VTDBus *vtd_bus; VTDAddressSpace *vtd_as; bool success = false; - vtd_bus = vtd_find_as_from_bus_num(s, VTD_SID_TO_BUS(source_id)); - if (!vtd_bus) { - goto out; - } - - vtd_as = vtd_bus->dev_as[VTD_SID_TO_DEVFN(source_id)]; + vtd_as = vtd_get_as_by_sid(s, source_id); if (!vtd_as) { goto out; } @@ -1668,6 +1833,22 @@ out: trace_vtd_pt_enable_fast_path(source_id, success); } +static void vtd_report_fault(IntelIOMMUState *s, + int err, bool is_fpd_set, + uint16_t source_id, + hwaddr addr, + bool is_write, + bool is_pasid, + uint32_t pasid) +{ + if (is_fpd_set && vtd_is_qualified_fault(err)) { + trace_vtd_fault_disabled(); + } else { + vtd_report_dmar_fault(s, source_id, addr, err, is_write, + is_pasid, pasid); + } +} + /* Map dev to context-entry then do a paging-structures walk to do a iommu * translation. * @@ -1689,13 +1870,14 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, uint8_t bus_num = pci_bus_num(bus); VTDContextCacheEntry *cc_entry; uint64_t slpte, page_mask; - uint32_t level; - uint16_t source_id = vtd_make_source_id(bus_num, devfn); + uint32_t level, pasid = vtd_as->pasid; + uint16_t source_id = PCI_BUILD_BDF(bus_num, devfn); int ret_fr; bool is_fpd_set = false; bool reads = true; bool writes = true; uint8_t access_flags; + bool rid2pasid = (pasid == PCI_NO_PASID) && s->root_scalable; VTDIOTLBEntry *iotlb_entry; /* @@ -1708,15 +1890,17 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry = &vtd_as->context_cache_entry; - /* Try to fetch slpte form IOTLB */ - iotlb_entry = vtd_lookup_iotlb(s, source_id, addr); - if (iotlb_entry) { - trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, - iotlb_entry->domain_id); - slpte = iotlb_entry->slpte; - access_flags = iotlb_entry->access_flags; - page_mask = iotlb_entry->mask; - goto out; + /* Try to fetch slpte form IOTLB, we don't need RID2PASID logic */ + if (!rid2pasid) { + iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr); + if (iotlb_entry) { + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); + slpte = iotlb_entry->slpte; + access_flags = iotlb_entry->access_flags; + page_mask = iotlb_entry->mask; + goto out; + } } /* Try to fetch context-entry from cache first */ @@ -1727,16 +1911,26 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, ce = cc_entry->context_entry; is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; if (!is_fpd_set && s->root_scalable) { - ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); - VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, pasid); + if (ret_fr) { + vtd_report_fault(s, -ret_fr, is_fpd_set, + source_id, addr, is_write, + false, 0); + goto error; + } } } else { ret_fr = vtd_dev_to_context_entry(s, bus_num, devfn, &ce); is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; if (!ret_fr && !is_fpd_set && s->root_scalable) { - ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set); + ret_fr = vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, pasid); + } + if (ret_fr) { + vtd_report_fault(s, -ret_fr, is_fpd_set, + source_id, addr, is_write, + false, 0); + goto error; } - VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); /* Update context-cache */ trace_vtd_iotlb_cc_update(bus_num, devfn, ce.hi, ce.lo, cc_entry->context_cache_gen, @@ -1745,11 +1939,15 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, cc_entry->context_cache_gen = s->context_cache_gen; } + if (rid2pasid) { + pasid = VTD_CE_GET_RID2PASID(&ce); + } + /* * We don't need to translate for pass-through context entries. * Also, let's ignore IOTLB caching as well for PT devices. */ - if (vtd_ce_get_type(&ce) == VTD_CONTEXT_TT_PASS_THROUGH) { + if (vtd_dev_pt_enabled(s, &ce, pasid)) { entry->iova = addr & VTD_PAGE_MASK_4K; entry->translated_addr = entry->iova; entry->addr_mask = ~VTD_PAGE_MASK_4K; @@ -1770,14 +1968,31 @@ static bool vtd_do_iommu_translate(VTDAddressSpace *vtd_as, PCIBus *bus, return true; } + /* Try to fetch slpte form IOTLB for RID2PASID slow path */ + if (rid2pasid) { + iotlb_entry = vtd_lookup_iotlb(s, source_id, pasid, addr); + if (iotlb_entry) { + trace_vtd_iotlb_page_hit(source_id, addr, iotlb_entry->slpte, + iotlb_entry->domain_id); + slpte = iotlb_entry->slpte; + access_flags = iotlb_entry->access_flags; + page_mask = iotlb_entry->mask; + goto out; + } + } + ret_fr = vtd_iova_to_slpte(s, &ce, addr, is_write, &slpte, &level, - &reads, &writes, s->aw_bits); - VTD_PE_GET_FPD_ERR(ret_fr, is_fpd_set, s, source_id, addr, is_write); + &reads, &writes, s->aw_bits, pasid); + if (ret_fr) { + vtd_report_fault(s, -ret_fr, is_fpd_set, source_id, + addr, is_write, pasid != PCI_NO_PASID, pasid); + goto error; + } page_mask = vtd_slpt_level_page_mask(level); access_flags = IOMMU_ACCESS_FLAG(reads, writes); - vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce), addr, slpte, - access_flags, level); + vtd_update_iotlb(s, source_id, vtd_get_domain_id(s, &ce, pasid), + addr, slpte, access_flags, level, pasid); out: vtd_iommu_unlock(s); entry->iova = addr & page_mask; @@ -1830,7 +2045,7 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s) VTDAddressSpace *vtd_as; QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { - vtd_sync_shadow_page_table(vtd_as); + vtd_address_space_sync(vtd_as); } } @@ -1862,11 +2077,10 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, uint16_t source_id, uint16_t func_mask) { + GHashTableIter as_it; uint16_t mask; - VTDBus *vtd_bus; VTDAddressSpace *vtd_as; uint8_t bus_n, devfn; - uint16_t devfn_it; trace_vtd_inv_desc_cc_devices(source_id, func_mask); @@ -1889,32 +2103,31 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s, mask = ~mask; bus_n = VTD_SID_TO_BUS(source_id); - vtd_bus = vtd_find_as_from_bus_num(s, bus_n); - if (vtd_bus) { - devfn = VTD_SID_TO_DEVFN(source_id); - for (devfn_it = 0; devfn_it < PCI_DEVFN_MAX; ++devfn_it) { - vtd_as = vtd_bus->dev_as[devfn_it]; - if (vtd_as && ((devfn_it & mask) == (devfn & mask))) { - trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(devfn_it), - VTD_PCI_FUNC(devfn_it)); - vtd_iommu_lock(s); - vtd_as->context_cache_entry.context_cache_gen = 0; - vtd_iommu_unlock(s); - /* - * Do switch address space when needed, in case if the - * device passthrough bit is switched. - */ - vtd_switch_address_space(vtd_as); - /* - * So a device is moving out of (or moving into) a - * domain, resync the shadow page table. - * This won't bring bad even if we have no such - * notifier registered - the IOMMU notification - * framework will skip MAP notifications if that - * happened. - */ - vtd_sync_shadow_page_table(vtd_as); - } + devfn = VTD_SID_TO_DEVFN(source_id); + + g_hash_table_iter_init(&as_it, s->vtd_address_spaces); + while (g_hash_table_iter_next(&as_it, NULL, (void **)&vtd_as)) { + if ((pci_bus_num(vtd_as->bus) == bus_n) && + (vtd_as->devfn & mask) == (devfn & mask)) { + trace_vtd_inv_desc_cc_device(bus_n, VTD_PCI_SLOT(vtd_as->devfn), + VTD_PCI_FUNC(vtd_as->devfn)); + vtd_iommu_lock(s); + vtd_as->context_cache_entry.context_cache_gen = 0; + vtd_iommu_unlock(s); + /* + * Do switch address space when needed, in case if the + * device passthrough bit is switched. + */ + vtd_switch_address_space(vtd_as); + /* + * So a device is moving out of (or moving into) a + * domain, resync the shadow page table. + * This won't bring bad even if we have no such + * notifier registered - the IOMMU notification + * framework will skip MAP notifications if that + * happened. + */ + vtd_address_space_sync(vtd_as); } } } @@ -1971,15 +2184,15 @@ static void vtd_iotlb_domain_invalidate(IntelIOMMUState *s, uint16_t domain_id) QLIST_FOREACH(vtd_as, &s->vtd_as_with_notifiers, next) { if (!vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce) && - domain_id == vtd_get_domain_id(s, &ce)) { - vtd_sync_shadow_page_table(vtd_as); + domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) { + vtd_address_space_sync(vtd_as); } } } static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, uint16_t domain_id, hwaddr addr, - uint8_t am) + uint8_t am, uint32_t pasid) { VTDAddressSpace *vtd_as; VTDContextEntry ce; @@ -1987,9 +2200,12 @@ static void vtd_iotlb_page_invalidate_notify(IntelIOMMUState *s, hwaddr size = (1 << am) * VTD_PAGE_SIZE; QLIST_FOREACH(vtd_as, &(s->vtd_as_with_notifiers), next) { + if (pasid != PCI_NO_PASID && pasid != vtd_as->pasid) { + continue; + } ret = vtd_dev_to_context_entry(s, pci_bus_num(vtd_as->bus), vtd_as->devfn, &ce); - if (!ret && domain_id == vtd_get_domain_id(s, &ce)) { + if (!ret && domain_id == vtd_get_domain_id(s, &ce, vtd_as->pasid)) { if (vtd_as_has_map_notifier(vtd_as)) { /* * As long as we have MAP notifications registered in @@ -2033,7 +2249,7 @@ static void vtd_iotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id, vtd_iommu_lock(s); g_hash_table_foreach_remove(s->iotlb, vtd_hash_remove_by_page, &info); vtd_iommu_unlock(s); - vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am); + vtd_iotlb_page_invalidate_notify(s, domain_id, addr, am, PCI_NO_PASID); } /* Flush IOTLB @@ -2197,12 +2413,13 @@ static void vtd_handle_gcmd_ire(IntelIOMMUState *s, bool en) /* Handle write to Global Command Register */ static void vtd_handle_gcmd_write(IntelIOMMUState *s) { + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); uint32_t status = vtd_get_long_raw(s, DMAR_GSTS_REG); uint32_t val = vtd_get_long_raw(s, DMAR_GCMD_REG); uint32_t changed = status ^ val; trace_vtd_reg_write_gcmd(status, val); - if (changed & VTD_GCMD_TE) { + if ((changed & VTD_GCMD_TE) && s->dma_translation) { /* Translation enable/disable */ vtd_handle_gcmd_te(s, val & VTD_GCMD_TE); } @@ -2218,7 +2435,8 @@ static void vtd_handle_gcmd_write(IntelIOMMUState *s) /* Set/update the interrupt remapping root-table pointer */ vtd_handle_gcmd_sirtp(s); } - if (changed & VTD_GCMD_IRE) { + if ((changed & VTD_GCMD_IRE) && + x86_iommu_ir_supported(x86_iommu)) { /* Interrupt remap enable/disable */ vtd_handle_gcmd_ire(s, val & VTD_GCMD_IRE); } @@ -2275,7 +2493,8 @@ static bool vtd_get_inv_desc(IntelIOMMUState *s, uint32_t dw = s->iq_dw ? 32 : 16; dma_addr_t addr = base_addr + offset * dw; - if (dma_memory_read(&address_space_memory, addr, inv_desc, dw)) { + if (dma_memory_read(&address_space_memory, addr, + inv_desc, dw, MEMTXATTRS_UNSPECIFIED)) { error_report_once("Read INV DESC failed."); return false; } @@ -2308,8 +2527,9 @@ static bool vtd_process_wait_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc) dma_addr_t status_addr = inv_desc->hi; trace_vtd_inv_desc_wait_sw(status_addr, status_data); status_data = cpu_to_le32(status_data); - if (dma_memory_write(&address_space_memory, status_addr, &status_data, - sizeof(status_data))) { + if (dma_memory_write(&address_space_memory, status_addr, + &status_data, sizeof(status_data), + MEMTXATTRS_UNSPECIFIED)) { trace_vtd_inv_desc_wait_write_fail(inv_desc->hi, inv_desc->lo); return false; } @@ -2426,18 +2646,13 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, { VTDAddressSpace *vtd_dev_as; IOMMUTLBEvent event; - struct VTDBus *vtd_bus; hwaddr addr; uint64_t sz; uint16_t sid; - uint8_t devfn; bool size; - uint8_t bus_num; addr = VTD_INV_DESC_DEVICE_IOTLB_ADDR(inv_desc->hi); sid = VTD_INV_DESC_DEVICE_IOTLB_SID(inv_desc->lo); - devfn = sid & 0xff; - bus_num = sid >> 8; size = VTD_INV_DESC_DEVICE_IOTLB_SIZE(inv_desc->hi); if ((inv_desc->lo & VTD_INV_DESC_DEVICE_IOTLB_RSVD_LO) || @@ -2448,12 +2663,11 @@ static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s, return false; } - vtd_bus = vtd_find_as_from_bus_num(s, bus_num); - if (!vtd_bus) { - goto done; - } - - vtd_dev_as = vtd_bus->dev_as[devfn]; + /* + * Using sid is OK since the guest should have finished the + * initialization of both the bus and device. + */ + vtd_dev_as = vtd_get_as_by_sid(s, sid); if (!vtd_dev_as) { goto done; } @@ -2869,12 +3083,7 @@ static void vtd_mem_write(void *opaque, hwaddr addr, } else { vtd_set_quad(s, addr, val); } - if (s->ecap & VTD_ECAP_SMTS && - val & VTD_IQA_DW_MASK) { - s->iq_dw = true; - } else { - s->iq_dw = false; - } + vtd_update_iq_dw(s); break; case DMAR_IQA_REG_HI: @@ -3015,6 +3224,28 @@ static int vtd_iommu_notify_flag_changed(IOMMUMemoryRegion *iommu, { VTDAddressSpace *vtd_as = container_of(iommu, VTDAddressSpace, iommu); IntelIOMMUState *s = vtd_as->iommu_state; + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); + + /* TODO: add support for VFIO and vhost users */ + if (s->snoop_control) { + error_setg_errno(errp, ENOTSUP, + "Snoop Control with vhost or VFIO is not supported"); + return -ENOTSUP; + } + if (!s->caching_mode && (new & IOMMU_NOTIFIER_MAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires caching mode", + pci_bus_num(vtd_as->bus), PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + return -ENOTSUP; + } + if (!x86_iommu->dt_supported && (new & IOMMU_NOTIFIER_DEVIOTLB_UNMAP)) { + error_setg_errno(errp, ENOTSUP, + "device %02x.%02x.%x requires device IOTLB mode", + pci_bus_num(vtd_as->bus), PCI_SLOT(vtd_as->devfn), + PCI_FUNC(vtd_as->devfn)); + return -ENOTSUP; + } /* Update per-address-space notifier flags */ vtd_as->notifier_flags = new; @@ -3032,13 +3263,6 @@ static int vtd_post_load(void *opaque, int version_id) IntelIOMMUState *iommu = opaque; /* - * Memory regions are dynamically turned on/off depending on - * context entry configurations from the guest. After migration, - * we need to make sure the memory regions are still correct. - */ - vtd_switch_address_space_all(iommu); - - /* * We don't need to migrate the root_scalable because we can * simply do the calculation after the loading is complete. We * can actually do similar things with root, dmar_enabled, etc. @@ -3047,6 +3271,15 @@ static int vtd_post_load(void *opaque, int version_id) */ vtd_update_scalable_state(iommu); + vtd_update_iq_dw(iommu); + + /* + * Memory regions are dynamically turned on/off depending on + * context entry configurations from the guest. After migration, + * we need to make sure the memory regions are still correct. + */ + vtd_switch_address_space_all(iommu); + return 0; } @@ -3056,7 +3289,7 @@ static const VMStateDescription vtd_vmstate = { .minimum_version_id = 1, .priority = MIG_PRI_IOMMU, .post_load = vtd_post_load, - .fields = (VMStateField[]) { + .fields = (const VMStateField[]) { VMSTATE_UINT64(root, IntelIOMMUState), VMSTATE_UINT64(intr_root, IntelIOMMUState), VMSTATE_UINT64(iq, IntelIOMMUState), @@ -3099,13 +3332,17 @@ static Property vtd_properties[] = { VTD_HOST_ADDRESS_WIDTH), DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE), DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE), + DEFINE_PROP_BOOL("snoop-control", IntelIOMMUState, snoop_control, false), + DEFINE_PROP_BOOL("x-pasid-mode", IntelIOMMUState, pasid, false), DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true), + DEFINE_PROP_BOOL("dma-translation", IntelIOMMUState, dma_translation, true), DEFINE_PROP_END_OF_LIST(), }; /* Read IRTE entry with specific index */ -static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, - VTD_IR_TableEntry *entry, uint16_t sid) +static bool vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, + VTD_IR_TableEntry *entry, uint16_t sid, + bool do_fault) { static const uint16_t vtd_svt_mask[VTD_SQ_MAX] = \ {0xffff, 0xfffb, 0xfff9, 0xfff8}; @@ -3116,40 +3353,62 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, if (index >= iommu->intr_size) { error_report_once("%s: index too large: ind=0x%x", __func__, index); - return -VTD_FR_IR_INDEX_OVER; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_INDEX_OVER, index); + } + return false; } addr = iommu->intr_root + index * sizeof(*entry); - if (dma_memory_read(&address_space_memory, addr, entry, - sizeof(*entry))) { + if (dma_memory_read(&address_space_memory, addr, + entry, sizeof(*entry), MEMTXATTRS_UNSPECIFIED)) { error_report_once("%s: read failed: ind=0x%x addr=0x%" PRIx64, __func__, index, addr); - return -VTD_FR_IR_ROOT_INVAL; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_ROOT_INVAL, index); + } + return false; } - trace_vtd_ir_irte_get(index, le64_to_cpu(entry->data[1]), - le64_to_cpu(entry->data[0])); + entry->data[0] = le64_to_cpu(entry->data[0]); + entry->data[1] = le64_to_cpu(entry->data[1]); + + trace_vtd_ir_irte_get(index, entry->data[1], entry->data[0]); + + /* + * The remaining potential fault conditions are "qualified" by the + * Fault Processing Disable bit in the IRTE. Even "not present". + * So just clear the do_fault flag if PFD is set, which will + * prevent faults being raised. + */ + if (entry->irte.fault_disable) { + do_fault = false; + } if (!entry->irte.present) { error_report_once("%s: detected non-present IRTE " "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", - __func__, index, le64_to_cpu(entry->data[1]), - le64_to_cpu(entry->data[0])); - return -VTD_FR_IR_ENTRY_P; + __func__, index, entry->data[1], entry->data[0]); + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_ENTRY_P, index); + } + return false; } if (entry->irte.__reserved_0 || entry->irte.__reserved_1 || entry->irte.__reserved_2) { error_report_once("%s: detected non-zero reserved IRTE " "(index=%u, high=0x%" PRIx64 ", low=0x%" PRIx64 ")", - __func__, index, le64_to_cpu(entry->data[1]), - le64_to_cpu(entry->data[0])); - return -VTD_FR_IR_IRTE_RSVD; + __func__, index, entry->data[1], entry->data[0]); + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_IRTE_RSVD, index); + } + return false; } if (sid != X86_IOMMU_SID_INVALID) { /* Validate IRTE SID */ - source_id = le32_to_cpu(entry->irte.source_id); + source_id = entry->irte.source_id; switch (entry->irte.sid_vtype) { case VTD_SVT_NONE: break; @@ -3160,7 +3419,10 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, error_report_once("%s: invalid IRTE SID " "(index=%u, sid=%u, source_id=%u)", __func__, index, sid, source_id); - return -VTD_FR_IR_SID_ERR; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index); + } + return false; } break; @@ -3172,7 +3434,10 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, error_report_once("%s: invalid SVT_BUS " "(index=%u, bus=%u, min=%u, max=%u)", __func__, index, bus, bus_min, bus_max); - return -VTD_FR_IR_SID_ERR; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index); + } + return false; } break; @@ -3181,29 +3446,30 @@ static int vtd_irte_get(IntelIOMMUState *iommu, uint16_t index, "(index=%u, type=%d)", __func__, index, entry->irte.sid_vtype); /* Take this as verification failure. */ - return -VTD_FR_IR_SID_ERR; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_SID_ERR, index); + } + return false; } } - return 0; + return true; } /* Fetch IRQ information of specific IR index */ -static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, - X86IOMMUIrq *irq, uint16_t sid) +static bool vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, + X86IOMMUIrq *irq, uint16_t sid, bool do_fault) { VTD_IR_TableEntry irte = {}; - int ret = 0; - ret = vtd_irte_get(iommu, index, &irte, sid); - if (ret) { - return ret; + if (!vtd_irte_get(iommu, index, &irte, sid, do_fault)) { + return false; } irq->trigger_mode = irte.irte.trigger_mode; irq->vector = irte.irte.vector; irq->delivery_mode = irte.irte.delivery_mode; - irq->dest = le32_to_cpu(irte.irte.dest_id); + irq->dest = irte.irte.dest_id; if (!iommu->intr_eime) { #define VTD_IR_APIC_DEST_MASK (0xff00ULL) #define VTD_IR_APIC_DEST_SHIFT (8) @@ -3216,16 +3482,15 @@ static int vtd_remap_irq_get(IntelIOMMUState *iommu, uint16_t index, trace_vtd_ir_remap(index, irq->trigger_mode, irq->vector, irq->delivery_mode, irq->dest, irq->dest_mode); - return 0; + return true; } /* Interrupt remapping for MSI/MSI-X entry */ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, MSIMessage *origin, MSIMessage *translated, - uint16_t sid) + uint16_t sid, bool do_fault) { - int ret = 0; VTD_IR_MSIAddress addr; uint16_t index; X86IOMMUIrq irq = {}; @@ -3242,14 +3507,20 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, if (origin->address & VTD_MSI_ADDR_HI_MASK) { error_report_once("%s: MSI address high 32 bits non-zero detected: " "address=0x%" PRIx64, __func__, origin->address); - return -VTD_FR_IR_REQ_RSVD; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0); + } + return -EINVAL; } addr.data = origin->address & VTD_MSI_ADDR_LO_MASK; if (addr.addr.__head != 0xfee) { error_report_once("%s: MSI address low 32 bit invalid: 0x%" PRIx32, __func__, addr.data); - return -VTD_FR_IR_REQ_RSVD; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0); + } + return -EINVAL; } /* This is compatible mode. */ @@ -3258,7 +3529,7 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, goto out; } - index = addr.addr.index_h << 15 | le16_to_cpu(addr.addr.index_l); + index = addr.addr.index_h << 15 | addr.addr.index_l; #define VTD_IR_MSI_DATA_SUBHANDLE (0x0000ffff) #define VTD_IR_MSI_DATA_RESERVED (0xffff0000) @@ -3268,9 +3539,8 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, index += origin->data & VTD_IR_MSI_DATA_SUBHANDLE; } - ret = vtd_remap_irq_get(iommu, index, &irq, sid); - if (ret) { - return ret; + if (!vtd_remap_irq_get(iommu, index, &irq, sid, do_fault)) { + return -EINVAL; } if (addr.addr.sub_valid) { @@ -3280,7 +3550,10 @@ static int vtd_interrupt_remap_msi(IntelIOMMUState *iommu, "(sid=%u, address=0x%" PRIx64 ", data=0x%" PRIx32 ")", __func__, sid, origin->address, origin->data); - return -VTD_FR_IR_REQ_RSVD; + if (do_fault) { + vtd_report_ir_fault(iommu, sid, VTD_FR_IR_REQ_RSVD, 0); + } + return -EINVAL; } } else { uint8_t vector = origin->data & 0xff; @@ -3320,7 +3593,7 @@ static int vtd_int_remap(X86IOMMUState *iommu, MSIMessage *src, MSIMessage *dst, uint16_t sid) { return vtd_interrupt_remap_msi(INTEL_IOMMU_DEVICE(iommu), - src, dst, sid); + src, dst, sid, false); } static MemTxResult vtd_mem_ir_read(void *opaque, hwaddr addr, @@ -3346,14 +3619,13 @@ static MemTxResult vtd_mem_ir_write(void *opaque, hwaddr addr, sid = attrs.requester_id; } - ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid); + ret = vtd_interrupt_remap_msi(opaque, &from, &to, sid, true); if (ret) { - /* TODO: report error */ /* Drop this interrupt */ return MEMTX_ERROR; } - apic_get_class()->send_msi(&to); + apic_get_class(NULL)->send_msi(&to); return MEMTX_OK; } @@ -3372,32 +3644,98 @@ static const MemoryRegionOps vtd_mem_ir_ops = { }, }; -VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) +static void vtd_report_ir_illegal_access(VTDAddressSpace *vtd_as, + hwaddr addr, bool is_write) { - uintptr_t key = (uintptr_t)bus; - VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key); - VTDAddressSpace *vtd_dev_as; - char name[128]; + IntelIOMMUState *s = vtd_as->iommu_state; + uint8_t bus_n = pci_bus_num(vtd_as->bus); + uint16_t sid = PCI_BUILD_BDF(bus_n, vtd_as->devfn); + bool is_fpd_set = false; + VTDContextEntry ce; + + assert(vtd_as->pasid != PCI_NO_PASID); - if (!vtd_bus) { - uintptr_t *new_key = g_malloc(sizeof(*new_key)); - *new_key = (uintptr_t)bus; - /* No corresponding free() */ - vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \ - PCI_DEVFN_MAX); - vtd_bus->bus = bus; - g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus); + /* Try out best to fetch FPD, we can't do anything more */ + if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { + is_fpd_set = ce.lo & VTD_CONTEXT_ENTRY_FPD; + if (!is_fpd_set && s->root_scalable) { + vtd_ce_get_pasid_fpd(s, &ce, &is_fpd_set, vtd_as->pasid); + } } - vtd_dev_as = vtd_bus->dev_as[devfn]; + vtd_report_fault(s, VTD_FR_SM_INTERRUPT_ADDR, + is_fpd_set, sid, addr, is_write, + true, vtd_as->pasid); +} + +static MemTxResult vtd_mem_ir_fault_read(void *opaque, hwaddr addr, + uint64_t *data, unsigned size, + MemTxAttrs attrs) +{ + vtd_report_ir_illegal_access(opaque, addr, false); + + return MEMTX_ERROR; +} + +static MemTxResult vtd_mem_ir_fault_write(void *opaque, hwaddr addr, + uint64_t value, unsigned size, + MemTxAttrs attrs) +{ + vtd_report_ir_illegal_access(opaque, addr, true); + + return MEMTX_ERROR; +} + +static const MemoryRegionOps vtd_mem_ir_fault_ops = { + .read_with_attrs = vtd_mem_ir_fault_read, + .write_with_attrs = vtd_mem_ir_fault_write, + .endianness = DEVICE_LITTLE_ENDIAN, + .impl = { + .min_access_size = 1, + .max_access_size = 8, + }, + .valid = { + .min_access_size = 1, + .max_access_size = 8, + }, +}; + +VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, + int devfn, unsigned int pasid) +{ + /* + * We can't simply use sid here since the bus number might not be + * initialized by the guest. + */ + struct vtd_as_key key = { + .bus = bus, + .devfn = devfn, + .pasid = pasid, + }; + VTDAddressSpace *vtd_dev_as; + char name[128]; + vtd_dev_as = g_hash_table_lookup(s->vtd_address_spaces, &key); if (!vtd_dev_as) { - snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), - PCI_FUNC(devfn)); - vtd_bus->dev_as[devfn] = vtd_dev_as = g_malloc0(sizeof(VTDAddressSpace)); + struct vtd_as_key *new_key = g_malloc(sizeof(*new_key)); + + new_key->bus = bus; + new_key->devfn = devfn; + new_key->pasid = pasid; + + if (pasid == PCI_NO_PASID) { + snprintf(name, sizeof(name), "vtd-%02x.%x", PCI_SLOT(devfn), + PCI_FUNC(devfn)); + } else { + snprintf(name, sizeof(name), "vtd-%02x.%x-pasid-%x", PCI_SLOT(devfn), + PCI_FUNC(devfn), pasid); + } + + vtd_dev_as = g_new0(VTDAddressSpace, 1); vtd_dev_as->bus = bus; vtd_dev_as->devfn = (uint8_t)devfn; + vtd_dev_as->pasid = pasid; vtd_dev_as->iommu_state = s; vtd_dev_as->context_cache_entry.context_cache_gen = 0; vtd_dev_as->iova_tree = iova_tree_new(); @@ -3439,6 +3777,24 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) &vtd_dev_as->iommu_ir, 1); /* + * This region is used for catching fault to access interrupt + * range via passthrough + PASID. See also + * vtd_switch_address_space(). We can't use alias since we + * need to know the sid which is valid for MSI who uses + * bus_master_as (see msi_send_message()). + */ + memory_region_init_io(&vtd_dev_as->iommu_ir_fault, OBJECT(s), + &vtd_mem_ir_fault_ops, vtd_dev_as, "vtd-no-ir", + VTD_INTERRUPT_ADDR_SIZE); + /* + * Hook to root since when PT is enabled vtd_dev_as->iommu + * will be disabled. + */ + memory_region_add_subregion_overlap(MEMORY_REGION(&vtd_dev_as->root), + VTD_INTERRUPT_ADDR_FIRST, + &vtd_dev_as->iommu_ir_fault, 2); + + /* * Hook both the containers under the root container, we * switch between DMAR & noDMAR by enable/disable * corresponding sub-containers @@ -3450,6 +3806,8 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) &vtd_dev_as->nodmar, 0); vtd_switch_address_space(vtd_dev_as); + + g_hash_table_insert(s->vtd_address_spaces, new_key, vtd_dev_as); } return vtd_dev_as; } @@ -3457,7 +3815,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn) /* Unmap the whole range in the notifier's scope. */ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) { - hwaddr size, remain; + hwaddr total, remain; hwaddr start = n->start; hwaddr end = n->end; IntelIOMMUState *s = as->iommu_state; @@ -3478,7 +3836,7 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) } assert(start <= end); - size = remain = end - start + 1; + total = remain = end - start + 1; while (remain >= VTD_PAGE_SIZE) { IOMMUTLBEvent event; @@ -3506,11 +3864,11 @@ static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n) trace_vtd_as_unmap_whole(pci_bus_num(as->bus), VTD_PCI_SLOT(as->devfn), VTD_PCI_FUNC(as->devfn), - n->start, size); + n->start, total); map.iova = n->start; - map.size = size; - iova_tree_remove(as->iova_tree, &map); + map.size = total - 1; /* Inclusive */ + iova_tree_remove(as->iova_tree, map); } static void vtd_address_space_unmap_all(IntelIOMMUState *s) @@ -3543,22 +3901,19 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) IntelIOMMUState *s = vtd_as->iommu_state; uint8_t bus_n = pci_bus_num(vtd_as->bus); VTDContextEntry ce; + DMAMap map = { .iova = 0, .size = HWADDR_MAX }; - /* - * The replay can be triggered by either a invalidation or a newly - * created entry. No matter what, we release existing mappings - * (it means flushing caches for UNMAP-only registers). - */ - vtd_address_space_unmap(vtd_as, n); + /* replay is protected by BQL, page walk will re-setup it safely */ + iova_tree_remove(vtd_as->iova_tree, map); if (vtd_dev_to_context_entry(s, bus_n, vtd_as->devfn, &ce) == 0) { trace_vtd_replay_ce_valid(s->root_scalable ? "scalable mode" : "legacy mode", bus_n, PCI_SLOT(vtd_as->devfn), PCI_FUNC(vtd_as->devfn), - vtd_get_domain_id(s, &ce), + vtd_get_domain_id(s, &ce, vtd_as->pasid), ce.hi, ce.lo); - if (vtd_as_has_map_notifier(vtd_as)) { + if (n->notifier_flags & IOMMU_NOTIFIER_MAP) { /* This is required only for MAP typed notifiers */ vtd_page_walk_info info = { .hook_fn = vtd_replay_hook, @@ -3566,10 +3921,10 @@ static void vtd_iommu_replay(IOMMUMemoryRegion *iommu_mr, IOMMUNotifier *n) .notify_unmap = false, .aw = s->aw_bits, .as = vtd_as, - .domain_id = vtd_get_domain_id(s, &ce), + .domain_id = vtd_get_domain_id(s, &ce, vtd_as->pasid), }; - vtd_page_walk(s, &ce, 0, ~0ULL, &info); + vtd_page_walk(s, &ce, 0, ~0ULL, &info, vtd_as->pasid); } } else { trace_vtd_replay_ce_invalid(bus_n, PCI_SLOT(vtd_as->devfn), @@ -3605,12 +3960,17 @@ static void vtd_init(IntelIOMMUState *s) s->next_frcd_reg = 0; s->cap = VTD_CAP_FRO | VTD_CAP_NFR | VTD_CAP_ND | VTD_CAP_MAMV | VTD_CAP_PSI | VTD_CAP_SLLPS | - VTD_CAP_SAGAW_39bit | VTD_CAP_MGAW(s->aw_bits); + VTD_CAP_MGAW(s->aw_bits); if (s->dma_drain) { s->cap |= VTD_CAP_DRAIN; } - if (s->aw_bits == VTD_HOST_AW_48BIT) { - s->cap |= VTD_CAP_SAGAW_48bit; + if (s->dma_translation) { + if (s->aw_bits >= VTD_HOST_AW_39BIT) { + s->cap |= VTD_CAP_SAGAW_39bit; + } + if (s->aw_bits >= VTD_HOST_AW_48BIT) { + s->cap |= VTD_CAP_SAGAW_48bit; + } } s->ecap = VTD_ECAP_QI | VTD_ECAP_IRO; @@ -3629,6 +3989,12 @@ static void vtd_init(IntelIOMMUState *s) vtd_spte_rsvd_large[3] = VTD_SPTE_LPAGE_L3_RSVD_MASK(s->aw_bits, x86_iommu->dt_supported); + if (s->scalable_mode || s->snoop_control) { + vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; + vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; + } + if (x86_iommu_ir_supported(x86_iommu)) { s->ecap |= VTD_ECAP_IR | VTD_ECAP_MHMV; if (s->intr_eim == ON_OFF_AUTO_ON) { @@ -3654,6 +4020,14 @@ static void vtd_init(IntelIOMMUState *s) s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS; } + if (s->snoop_control) { + s->ecap |= VTD_ECAP_SC; + } + + if (s->pasid) { + s->ecap |= VTD_ECAP_PASID; + } + vtd_reset_caches(s); /* Define registers with default values and bit semantics */ @@ -3727,10 +4101,14 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn) assert(0 <= devfn && devfn < PCI_DEVFN_MAX); - vtd_as = vtd_find_add_as(s, bus, devfn); + vtd_as = vtd_find_add_as(s, bus, devfn, PCI_NO_PASID); return &vtd_as->as; } +static PCIIOMMUOps vtd_iommu_ops = { + .get_address_space = vtd_host_dma_iommu, +}; + static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) { X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); @@ -3746,11 +4124,7 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF; } if (s->intr_eim == ON_OFF_AUTO_ON && !s->buggy_eim) { - if (!kvm_irqchip_in_kernel()) { - error_setg(errp, "eim=on requires accel=kvm,kernel-irqchip=split"); - return false; - } - if (!kvm_enable_x2apic()) { + if (kvm_irqchip_is_split() && !kvm_enable_x2apic()) { error_setg(errp, "eim=on requires support on the KVM side" "(X2APIC_API, first shipped in v4.7)"); return false; @@ -3770,6 +4144,11 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp) return false; } + if (s->pasid && !s->scalable_mode) { + error_setg(errp, "Need to set scalable mode for PASID"); + return false; + } + return true; } @@ -3804,11 +4183,19 @@ static void vtd_realize(DeviceState *dev, Error **errp) MachineState *ms = MACHINE(qdev_get_machine()); PCMachineState *pcms = PC_MACHINE(ms); X86MachineState *x86ms = X86_MACHINE(ms); - PCIBus *bus = pcms->bus; + PCIBus *bus = pcms->pcibus; IntelIOMMUState *s = INTEL_IOMMU_DEVICE(dev); - X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(dev); + X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s); - x86_iommu->type = TYPE_INTEL; + if (s->pasid && x86_iommu->dt_supported) { + /* + * PASID-based-Device-TLB Invalidate Descriptor is not + * implemented and it requires support from vhost layer which + * needs to be implemented in the future. + */ + error_setg(errp, "PASID based device IOTLB is not supported"); + return; + } if (!vtd_decide_config(s, errp)) { return; @@ -3816,9 +4203,10 @@ static void vtd_realize(DeviceState *dev, Error **errp) QLIST_INIT(&s->vtd_as_with_notifiers); qemu_mutex_init(&s->iommu_lock); - memset(s->vtd_as_by_bus_num, 0, sizeof(s->vtd_as_by_bus_num)); memory_region_init_io(&s->csrmem, OBJECT(s), &vtd_mem_ops, s, "intel_iommu", DMAR_REG_SIZE); + memory_region_add_subregion(get_system_memory(), + Q35_HOST_BRIDGE_IOMMU_ADDR, &s->csrmem); /* Create the shared memory regions by all devices */ memory_region_init(&s->mr_nodmar, OBJECT(s), "vtd-nodmar", @@ -3833,16 +4221,13 @@ static void vtd_realize(DeviceState *dev, Error **errp) memory_region_add_subregion_overlap(&s->mr_nodmar, VTD_INTERRUPT_ADDR_FIRST, &s->mr_ir, 1); - - sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->csrmem); /* No corresponding destroy */ - s->iotlb = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, + s->iotlb = g_hash_table_new_full(vtd_iotlb_hash, vtd_iotlb_equal, g_free, g_free); - s->vtd_as_by_busptr = g_hash_table_new_full(vtd_uint64_hash, vtd_uint64_equal, - g_free, g_free); + s->vtd_address_spaces = g_hash_table_new_full(vtd_as_hash, vtd_as_equal, + g_free, g_free); vtd_init(s); - sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, Q35_HOST_BRIDGE_IOMMU_ADDR); - pci_setup_iommu(bus, vtd_host_dma_iommu, dev); + pci_setup_iommu(bus, &vtd_iommu_ops, dev); /* Pseudo address space under root PCI bus. */ x86ms->ioapic_as = vtd_host_dma_iommu(bus, s, Q35_PSEUDO_DEVFN_IOAPIC); qemu_add_machine_init_done_notifier(&vtd_machine_done_notify); |