aboutsummaryrefslogtreecommitdiff
path: root/arch/x86/kvm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kvm')
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/cpuid.c2
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h5
-rw-r--r--arch/x86/kvm/mmu.c41
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/svm.c54
-rw-r--r--arch/x86/kvm/vmx.c264
-rw-r--r--arch/x86/kvm/x86.c51
8 files changed, 265 insertions, 155 deletions
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 2688c7dc5323..3ea624452f93 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -89,6 +89,5 @@ config KVM_MMU_AUDIT
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
-source drivers/lguest/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 59ca2eea522c..19adbb418443 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -469,7 +469,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
cpuid_mask(&entry->ecx, CPUID_7_ECX);
/* PKU is not yet implemented for shadow paging. */
- if (!tdp_enabled)
+ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
entry->edx &= kvm_cpuid_7_0_edx_x86_features;
entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX);
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 762cdf2595f9..e1e89ee4af75 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -84,11 +84,6 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
}
-static inline u32 kvm_read_pkru(struct kvm_vcpu *vcpu)
-{
- return kvm_x86_ops->get_pkru(vcpu);
-}
-
static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
{
vcpu->arch.hflags |= HF_GUEST_MASK;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9b1dd114956a..04d750813c9d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -108,7 +108,7 @@ module_param(dbg, bool, 0644);
(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
-#define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))
+#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
#define PT64_DIR_BASE_ADDR_MASK \
(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
#define PT64_LVL_ADDR_MASK(level) \
@@ -126,7 +126,7 @@ module_param(dbg, bool, 0644);
* PT32_LEVEL_BITS))) - 1))
#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
- | shadow_x_mask | shadow_nx_mask)
+ | shadow_x_mask | shadow_nx_mask | shadow_me_mask)
#define ACC_EXEC_MASK 1
#define ACC_WRITE_MASK PT_WRITABLE_MASK
@@ -186,6 +186,7 @@ static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
static u64 __read_mostly shadow_mmio_value;
static u64 __read_mostly shadow_present_mask;
+static u64 __read_mostly shadow_me_mask;
/*
* SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
@@ -349,7 +350,7 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
*/
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
- u64 acc_track_mask)
+ u64 acc_track_mask, u64 me_mask)
{
BUG_ON(!dirty_mask != !accessed_mask);
BUG_ON(!accessed_mask && !acc_track_mask);
@@ -362,6 +363,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
shadow_x_mask = x_mask;
shadow_present_mask = p_mask;
shadow_acc_track_mask = acc_track_mask;
+ shadow_me_mask = me_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -2433,7 +2435,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
- shadow_user_mask | shadow_x_mask;
+ shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (sp_ad_disabled(sp))
spte |= shadow_acc_track_value;
@@ -2745,6 +2747,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
pte_access &= ~ACC_WRITE_MASK;
spte |= (u64)pfn << PAGE_SHIFT;
+ spte |= shadow_me_mask;
if (pte_access & ACC_WRITE_MASK) {
@@ -4106,16 +4109,28 @@ void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
bool uses_nx = context->nx || context->base_role.smep_andnot_wp;
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
/*
* Passing "true" to the last argument is okay; it adds a check
* on bit 8 of the SPTEs which KVM doesn't use anyway.
*/
- __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ shadow_zero_check = &context->shadow_zero_check;
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, uses_nx,
guest_cpuid_has_gbpages(vcpu), is_pse(vcpu),
true);
+
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
+
}
EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
@@ -4133,17 +4148,29 @@ static void
reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
+ struct rsvd_bits_validate *shadow_zero_check;
+ int i;
+
+ shadow_zero_check = &context->shadow_zero_check;
+
if (boot_cpu_is_amd())
- __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check,
+ __reset_rsvds_bits_mask(vcpu, shadow_zero_check,
boot_cpu_data.x86_phys_bits,
context->shadow_root_level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
true, true);
else
- __reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+ __reset_rsvds_bits_mask_ept(shadow_zero_check,
boot_cpu_data.x86_phys_bits,
false);
+ if (!shadow_me_mask)
+ return;
+
+ for (i = context->shadow_root_level; --i >= 0;) {
+ shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+ shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+ }
}
/*
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d7d248a000dd..4b9a3ae6b725 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -185,7 +185,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
* index of the protection domain, so pte_pkey * 2 is
* is the index of the first bit for the domain.
*/
- pkru_bits = (kvm_read_pkru(vcpu) >> (pte_pkey * 2)) & 3;
+ pkru_bits = (vcpu->arch.pkru >> (pte_pkey * 2)) & 3;
/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
offset = (pfec & ~1) +
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 4d8141e533c3..8dbd8dbc83eb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1100,7 +1100,7 @@ static __init int svm_hardware_setup(void)
if (vls) {
if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_VIRTUAL_VMLOAD_VMSAVE) ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
!IS_ENABLED(CONFIG_X86_64)) {
vls = false;
} else {
@@ -1167,9 +1167,9 @@ static void avic_init_vmcb(struct vcpu_svm *svm)
{
struct vmcb *vmcb = svm->vmcb;
struct kvm_arch *vm_data = &svm->vcpu.kvm->arch;
- phys_addr_t bpa = page_to_phys(svm->avic_backing_page);
- phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page);
- phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page);
+ phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
+ phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page));
+ phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page));
vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK;
vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK;
@@ -1232,8 +1232,8 @@ static void init_vmcb(struct vcpu_svm *svm)
set_intercept(svm, INTERCEPT_MWAIT);
}
- control->iopm_base_pa = iopm_base;
- control->msrpm_base_pa = __pa(svm->msrpm);
+ control->iopm_base_pa = __sme_set(iopm_base);
+ control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
control->int_ctl = V_INTR_MASKING_MASK;
init_seg(&save->es);
@@ -1377,9 +1377,9 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu)
return -EINVAL;
new_entry = READ_ONCE(*entry);
- new_entry = (page_to_phys(svm->avic_backing_page) &
- AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
- AVIC_PHYSICAL_ID_ENTRY_VALID_MASK;
+ new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
+ AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
+ AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
WRITE_ONCE(*entry, new_entry);
svm->avic_physical_id_cache = entry;
@@ -1647,7 +1647,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
svm->vmcb = page_address(page);
clear_page(svm->vmcb);
- svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
+ svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT);
svm->asid_generation = 0;
init_vmcb(svm);
@@ -1675,7 +1675,7 @@ static void svm_free_vcpu(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT));
+ __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
__free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER);
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
@@ -1777,11 +1777,6 @@ static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_svm(vcpu)->vmcb->save.rflags = rflags;
}
-static u32 svm_get_pkru(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
-
static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
switch (reg) {
@@ -2335,7 +2330,7 @@ static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
u64 pdpte;
int ret;
- ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte,
+ ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte,
offset_in_page(cr3) + index * 8, 8);
if (ret)
return 0;
@@ -2347,7 +2342,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.nested_cr3 = root;
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
svm_flush_tlb(vcpu);
}
@@ -2430,6 +2425,16 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
svm->vmcb->control.exit_code_hi = 0;
svm->vmcb->control.exit_info_1 = error_code;
+
+ /*
+ * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
+ * written only when inject_pending_event runs (DR6 would written here
+ * too). This should be conditional on a new capability---if the
+ * capability is disabled, kvm_multiple_exception would write the
+ * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ */
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else
@@ -2868,7 +2873,7 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
svm->nested.msrpm[p] = svm->msrpm[p] | value;
}
- svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm);
+ svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm));
return true;
}
@@ -4501,7 +4506,7 @@ get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
pr_debug("SVM: %s: use GA mode for irq %u\n", __func__,
irq.vector);
*svm = to_svm(vcpu);
- vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page);
+ vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page));
vcpu_info->vector = irq.vector;
return 0;
@@ -4552,7 +4557,8 @@ static int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
struct amd_iommu_pi_data pi;
/* Try to enable guest_mode in IRTE */
- pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK;
+ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) &
+ AVIC_HPA_MASK);
pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id,
svm->vcpu.vcpu_id);
pi.is_guest_mode = true;
@@ -5001,7 +5007,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->save.cr3 = root;
+ svm->vmcb->save.cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_CR);
svm_flush_tlb(vcpu);
}
@@ -5010,7 +5016,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
{
struct vcpu_svm *svm = to_svm(vcpu);
- svm->vmcb->control.nested_cr3 = root;
+ svm->vmcb->control.nested_cr3 = __sme_set(root);
mark_dirty(svm->vmcb, VMCB_NPT);
/* Also sync guest cr3 here in case we live migrate */
@@ -5403,8 +5409,6 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.get_rflags = svm_get_rflags,
.set_rflags = svm_set_rflags,
- .get_pkru = svm_get_pkru,
-
.tlb_flush = svm_flush_tlb,
.run = svm_vcpu_run,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 39a6222bf968..d40900914a72 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -416,13 +416,10 @@ struct nested_vmx {
/* The guest-physical address of the current VMCS L1 keeps for L2 */
gpa_t current_vmptr;
- /* The host-usable pointer to the above */
- struct page *current_vmcs12_page;
- struct vmcs12 *current_vmcs12;
/*
* Cache of the guest's VMCS, existing outside of guest memory.
* Loaded from guest memory during VMPTRLD. Flushed to guest
- * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ * memory during VMCLEAR and VMPTRLD.
*/
struct vmcs12 *cached_vmcs12;
/*
@@ -639,8 +636,6 @@ struct vcpu_vmx {
u64 current_tsc_ratio;
- bool guest_pkru_valid;
- u32 guest_pkru;
u32 host_pkru;
/*
@@ -927,6 +922,10 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var);
static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
static int alloc_identity_pagetable(struct kvm *kvm);
+static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
+static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
+static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
+ u16 error_code);
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -2382,11 +2381,6 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
}
-static u32 vmx_get_pkru(struct kvm_vcpu *vcpu)
-{
- return to_vmx(vcpu)->guest_pkru;
-}
-
static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -2428,6 +2422,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
vmx_set_interrupt_shadow(vcpu, 0);
}
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
+ unsigned long exit_qual)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned int nr = vcpu->arch.exception.nr;
+ u32 intr_info = nr | INTR_INFO_VALID_MASK;
+
+ if (vcpu->arch.exception.has_error_code) {
+ vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ intr_info |= INTR_INFO_DELIVER_CODE_MASK;
+ }
+
+ if (kvm_exception_is_soft(nr))
+ intr_info |= INTR_TYPE_SOFT_EXCEPTION;
+ else
+ intr_info |= INTR_TYPE_HARD_EXCEPTION;
+
+ if (!(vmcs12->idt_vectoring_info_field & VECTORING_INFO_VALID_MASK) &&
+ vmx_get_nmi_mask(vcpu))
+ intr_info |= INTR_INFO_UNBLOCK_NMI;
+
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
+}
+
/*
* KVM wants to inject page-faults which it got to the guest. This function
* checks whether in a nested guest, we need to inject them to L1 or L2.
@@ -2437,23 +2455,38 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu)
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
unsigned int nr = vcpu->arch.exception.nr;
- if (!((vmcs12->exception_bitmap & (1u << nr)) ||
- (nr == PF_VECTOR && vcpu->arch.exception.nested_apf)))
- return 0;
+ if (nr == PF_VECTOR) {
+ if (vcpu->arch.exception.nested_apf) {
+ nested_vmx_inject_exception_vmexit(vcpu,
+ vcpu->arch.apf.nested_apf_token);
+ return 1;
+ }
+ /*
+ * FIXME: we must not write CR2 when L1 intercepts an L2 #PF exception.
+ * The fix is to add the ancillary datum (CR2 or DR6) to structs
+ * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6
+ * can be written only when inject_pending_event runs. This should be
+ * conditional on a new capability---if the capability is disabled,
+ * kvm_multiple_exception would write the ancillary information to
+ * CR2 or DR6, for backwards ABI-compatibility.
+ */
+ if (nested_vmx_is_page_fault_vmexit(vmcs12,
+ vcpu->arch.exception.error_code)) {
+ nested_vmx_inject_exception_vmexit(vcpu, vcpu->arch.cr2);
+ return 1;
+ }
+ } else {
+ unsigned long exit_qual = 0;
+ if (nr == DB_VECTOR)
+ exit_qual = vcpu->arch.dr6;
- if (vcpu->arch.exception.nested_apf) {
- vmcs_write32(VM_EXIT_INTR_ERROR_CODE, vcpu->arch.exception.error_code);
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
- INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
- vcpu->arch.apf.nested_apf_token);
- return 1;
+ if (vmcs12->exception_bitmap & (1u << nr)) {
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 1;
+ }
}
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
+ return 0;
}
static void vmx_queue_exception(struct kvm_vcpu *vcpu)
@@ -2667,7 +2700,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_handled() will not pass related exits to L1.
+ * nested_vmx_exit_reflected() will not pass related exits to L1.
* These rules have exceptions below.
*/
@@ -4955,6 +4988,28 @@ static bool vmx_get_enable_apicv(void)
return enable_apicv;
}
+static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ gfn_t gfn;
+
+ /*
+ * Don't need to mark the APIC access page dirty; it is never
+ * written to by the CPU during APIC virtualization.
+ */
+
+ if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
+ gfn = vmcs12->virtual_apic_page_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+
+ if (nested_cpu_has_posted_intr(vmcs12)) {
+ gfn = vmcs12->posted_intr_desc_addr >> PAGE_SHIFT;
+ kvm_vcpu_mark_page_dirty(vcpu, gfn);
+ }
+}
+
+
static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4962,18 +5017,15 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
void *vapic_page;
u16 status;
- if (vmx->nested.pi_desc &&
- vmx->nested.pi_pending) {
- vmx->nested.pi_pending = false;
- if (!pi_test_and_clear_on(vmx->nested.pi_desc))
- return;
-
- max_irr = find_last_bit(
- (unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (!vmx->nested.pi_desc || !vmx->nested.pi_pending)
+ return;
- if (max_irr == 256)
- return;
+ vmx->nested.pi_pending = false;
+ if (!pi_test_and_clear_on(vmx->nested.pi_desc))
+ return;
+ max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256);
+ if (max_irr != 256) {
vapic_page = kmap(vmx->nested.virtual_apic_page);
__kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page);
kunmap(vmx->nested.virtual_apic_page);
@@ -4985,6 +5037,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
vmcs_write16(GUEST_INTR_STATUS, status);
}
}
+
+ nested_mark_vmcs12_pages_dirty(vcpu);
}
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -6502,7 +6556,7 @@ void vmx_enable_tdp(void)
enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull,
0ull, VMX_EPT_EXECUTABLE_MASK,
cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK,
- VMX_EPT_RWX_MASK);
+ VMX_EPT_RWX_MASK, 0ull);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
@@ -7134,34 +7188,32 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
return 1;
}
+static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
+{
+ vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+ vmcs_write64(VMCS_LINK_POINTER, -1ull);
+}
+
static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
{
if (vmx->nested.current_vmptr == -1ull)
return;
- /* current_vmptr and current_vmcs12 are always set/reset together */
- if (WARN_ON(vmx->nested.current_vmcs12 == NULL))
- return;
-
if (enable_shadow_vmcs) {
/* copy to memory all shadowed fields in case
they were modified */
copy_shadow_to_vmcs12(vmx);
vmx->nested.sync_shadow_vmcs = false;
- vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
- SECONDARY_EXEC_SHADOW_VMCS);
- vmcs_write64(VMCS_LINK_POINTER, -1ull);
+ vmx_disable_shadow_vmcs(vmx);
}
vmx->nested.posted_intr_nv = -1;
/* Flush VMCS12 to guest memory */
- memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
- VMCS12_SIZE);
+ kvm_vcpu_write_guest_page(&vmx->vcpu,
+ vmx->nested.current_vmptr >> PAGE_SHIFT,
+ vmx->nested.cached_vmcs12, 0, VMCS12_SIZE);
- kunmap(vmx->nested.current_vmcs12_page);
- nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
}
/*
@@ -7175,12 +7227,14 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.vmxon = false;
free_vpid(vmx->nested.vpid02);
- nested_release_vmcs12(vmx);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
if (vmx->nested.msr_bitmap) {
free_page((unsigned long)vmx->nested.msr_bitmap);
vmx->nested.msr_bitmap = NULL;
}
if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
free_vmcs(vmx->vmcs01.shadow_vmcs);
vmx->vmcs01.shadow_vmcs = NULL;
@@ -7579,14 +7633,14 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
}
nested_release_vmcs12(vmx);
- vmx->nested.current_vmcs12 = new_vmcs12;
- vmx->nested.current_vmcs12_page = page;
/*
* Load VMCS12 from guest memory since it is not already
* cached.
*/
- memcpy(vmx->nested.cached_vmcs12,
- vmx->nested.current_vmcs12, VMCS12_SIZE);
+ memcpy(vmx->nested.cached_vmcs12, new_vmcs12, VMCS12_SIZE);
+ kunmap(page);
+ nested_release_page_clean(page);
+
set_current_vmptr(vmx, vmptr);
}
@@ -8019,12 +8073,11 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
* should handle it ourselves in L0 (and then continue L2). Only call this
* when in is_guest_mode (L2).
*/
-static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
+static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
{
u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- u32 exit_reason = vmx->exit_reason;
trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
vmcs_readl(EXIT_QUALIFICATION),
@@ -8033,6 +8086,18 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
+ * may write to these pages via their host physical address while
+ * L2 is running, bypassing any address-translation-based dirty
+ * tracking (e.g. EPT write protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
+
if (vmx->nested.nested_run_pending)
return false;
@@ -8169,6 +8234,29 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
}
}
+static int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason)
+{
+ u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ /*
+ * At this point, the exit interruption info in exit_intr_info
+ * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
+ * we need to query the in-kernel LAPIC.
+ */
+ WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
+ if ((exit_intr_info &
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
+ (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+
+ nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
+ vmcs_readl(EXIT_QUALIFICATION));
+ return 1;
+}
+
static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
{
*info1 = vmcs_readl(EXIT_QUALIFICATION);
@@ -8415,12 +8503,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_handled(vcpu)) {
- nested_vmx_vmexit(vcpu, exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
- }
+ if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
+ return nested_vmx_reflect_vmexit(vcpu, exit_reason);
if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
dump_vmcs();
@@ -8929,8 +9013,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- if (vmx->guest_pkru_valid)
- __write_pkru(vmx->guest_pkru);
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
+ vcpu->arch.pkru != vmx->host_pkru)
+ __write_pkru(vcpu->arch.pkru);
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
@@ -9078,13 +9164,11 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
* back on host, so it is safe to read guest PKRU from current
* XSAVE.
*/
- if (boot_cpu_has(X86_FEATURE_OSPKE)) {
- vmx->guest_pkru = __read_pkru();
- if (vmx->guest_pkru != vmx->host_pkru) {
- vmx->guest_pkru_valid = true;
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
+ vcpu->arch.pkru = __read_pkru();
+ if (vcpu->arch.pkru != vmx->host_pkru)
__write_pkru(vmx->host_pkru);
- } else
- vmx->guest_pkru_valid = false;
}
/*
@@ -9223,7 +9307,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->nested.current_vmcs12 = NULL;
vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
@@ -9509,12 +9592,15 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
WARN_ON(!is_guest_mode(vcpu));
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code))
- nested_vmx_vmexit(vcpu, to_vmx(vcpu)->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
- else
+ if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code)) {
+ vmcs12->vm_exit_intr_error_code = fault->error_code;
+ nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
+ PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
+ INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
+ fault->address);
+ } else {
kvm_inject_page_fault(vcpu, fault);
+ }
}
static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
@@ -10094,12 +10180,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
- *
- * A problem with this approach (when !enable_ept) is that L1 may be
- * injected with more page faults than it asked for. This could have
- * caused problems, but in practice existing hypervisors don't care.
- * To fix this, we will need to emulate the PFEC checking (on the L1
- * page tables), using walk_addr(), when injecting PFs to L1.
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
enable_ept ? vmcs12->page_fault_error_code_mask : 0);
@@ -10847,13 +10927,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->vm_exit_reason = exit_reason;
vmcs12->exit_qualification = exit_qualification;
-
vmcs12->vm_exit_intr_info = exit_intr_info;
- if ((vmcs12->vm_exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK))
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+
vmcs12->idt_vectoring_info_field = 0;
vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
@@ -11049,8 +11124,15 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
- && nested_exit_intr_ack_set(vcpu)) {
+ /*
+ * TODO: SDM says that with acknowledge interrupt on exit, bit 31 of
+ * the VM-exit interrupt information (valid interrupt) is always set to
+ * 1 on EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't need
+ * kvm_cpu_has_interrupt(). See the commit message for details.
+ */
+ if (nested_exit_intr_ack_set(vcpu) &&
+ exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ kvm_cpu_has_interrupt(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -11593,8 +11675,6 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
.get_rflags = vmx_get_rflags,
.set_rflags = vmx_set_rflags,
- .get_pkru = vmx_get_pkru,
-
.tlb_flush = vmx_flush_tlb,
.run = vmx_vcpu_run,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6c97c82814c4..ef5102f80497 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -54,6 +54,7 @@
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
+#include <linux/mem_encrypt.h>
#include <trace/events/kvm.h>
@@ -3159,15 +3160,18 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
kvm_set_hflags(vcpu, hflags);
vcpu->arch.smi_pending = events->smi.pending;
- if (events->smi.smm_inside_nmi)
- vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
- else
- vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+
+ if (events->smi.smm) {
+ if (events->smi.smm_inside_nmi)
+ vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ }
}
}
@@ -3242,7 +3246,12 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest + offset, src, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(dest + offset, &vcpu->arch.pkru,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest + offset, src, size);
+
}
valid -= feature;
@@ -3280,7 +3289,11 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
u32 size, offset, ecx, edx;
cpuid_count(XSTATE_CPUID, index,
&size, &offset, &ecx, &edx);
- memcpy(dest, src + offset, size);
+ if (feature == XFEATURE_MASK_PKRU)
+ memcpy(&vcpu->arch.pkru, src + offset,
+ sizeof(vcpu->arch.pkru));
+ else
+ memcpy(dest, src + offset, size);
}
valid -= feature;
@@ -6113,7 +6126,7 @@ int kvm_arch_init(void *opaque)
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
- PT_PRESENT_MASK, 0);
+ PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6215,6 +6228,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
lapic_irq.shorthand = 0;
lapic_irq.dest_mode = 0;
+ lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -6721,17 +6735,6 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
-void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
- unsigned long address)
-{
- /*
- * The physical address of apic access page is stored in the VMCS.
- * Update it when it becomes invalid.
- */
- if (address == gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT))
- kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-}
-
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -7629,7 +7632,9 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+ /* PKRU is separately restored in kvm_x86_ops->run. */
+ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
+ ~XFEATURE_MASK_PKRU);
trace_kvm_fpu(1);
}