aboutsummaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/include/asm/kvm_host.h3
-rw-r--r--arch/arm64/kvm/arm.c6
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c6
-rw-r--r--arch/x86/include/asm/cpufeatures.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h1
-rw-r--r--arch/x86/include/asm/kvm-x86-pmu-ops.h31
-rw-r--r--arch/x86/include/asm/kvm_host.h88
-rw-r--r--arch/x86/include/asm/uaccess.h142
-rw-r--r--arch/x86/include/asm/vmx.h10
-rw-r--r--arch/x86/include/uapi/asm/kvm.h11
-rw-r--r--arch/x86/kernel/asm-offsets_64.c4
-rw-r--r--arch/x86/kernel/kvm.c77
-rw-r--r--arch/x86/kernel/kvmclock.c2
-rw-r--r--arch/x86/kvm/i8259.c1
-rw-r--r--arch/x86/kvm/irq.c10
-rw-r--r--arch/x86/kvm/irq_comm.c2
-rw-r--r--arch/x86/kvm/mmu.h2
-rw-r--r--arch/x86/kvm/mmu/mmu.c468
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h64
-rw-r--r--arch/x86/kvm/mmu/spte.c35
-rw-r--r--arch/x86/kvm/mmu/spte.h8
-rw-r--r--arch/x86/kvm/mmu/tdp_iter.h34
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c86
-rw-r--r--arch/x86/kvm/pmu.c66
-rw-r--r--arch/x86/kvm/pmu.h7
-rw-r--r--arch/x86/kvm/svm/avic.c84
-rw-r--r--arch/x86/kvm/svm/nested.c308
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c16
-rw-r--r--arch/x86/kvm/svm/svm.c212
-rw-r--r--arch/x86/kvm/svm/svm.h55
-rw-r--r--arch/x86/kvm/trace.h20
-rw-r--r--arch/x86/kvm/vmx/nested.c63
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c2
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c2
-rw-r--r--arch/x86/kvm/vmx/vmcs.h5
-rw-r--r--arch/x86/kvm/vmx/vmx.c12
-rw-r--r--arch/x86/kvm/x86.c311
-rw-r--r--arch/x86/kvm/xen.c1245
-rw-r--r--arch/x86/kvm/xen.h62
40 files changed, 2535 insertions, 1029 deletions
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 94a27a7520f4..27ebb2929e0c 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -107,9 +107,6 @@ struct kvm_arch {
/* VTCR_EL2 value for this VM */
u64 vtcr;
- /* The maximum number of vCPUs depends on the used GIC model */
- int max_vcpus;
-
/* Interrupt controller */
struct vgic_dist vgic;
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 523bc934fe2f..7fceb855fa71 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -153,7 +153,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_vgic_early_init(kvm);
/* The maximum number of VCPUs is limited by the host's GIC model */
- kvm->arch.max_vcpus = kvm_arm_default_max_vcpus();
+ kvm->max_vcpus = kvm_arm_default_max_vcpus();
set_default_spectre(kvm);
@@ -230,7 +230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_MAX_VCPUS:
case KVM_CAP_MAX_VCPU_ID:
if (kvm)
- r = kvm->arch.max_vcpus;
+ r = kvm->max_vcpus;
else
r = kvm_arm_default_max_vcpus();
break;
@@ -306,7 +306,7 @@ int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
if (irqchip_in_kernel(kvm) && vgic_initialized(kvm))
return -EBUSY;
- if (id >= kvm->arch.max_vcpus)
+ if (id >= kvm->max_vcpus)
return -EINVAL;
return 0;
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index fc00304fe7d8..77feafd5c0e3 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -98,11 +98,11 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
ret = 0;
if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
- kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
+ kvm->max_vcpus = VGIC_V2_MAX_CPUS;
else
- kvm->arch.max_vcpus = VGIC_V3_MAX_CPUS;
+ kvm->max_vcpus = VGIC_V3_MAX_CPUS;
- if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus) {
+ if (atomic_read(&kvm->online_vcpus) > kvm->max_vcpus) {
ret = -E2BIG;
goto out_unlock;
}
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 73e643ae94b6..1d6826eac3e6 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -405,6 +405,7 @@
#define X86_FEATURE_SEV (19*32+ 1) /* AMD Secure Encrypted Virtualization */
#define X86_FEATURE_VM_PAGE_FLUSH (19*32+ 2) /* "" VM Page Flush MSR is supported */
#define X86_FEATURE_SEV_ES (19*32+ 3) /* AMD Secure Encrypted Virtualization - Encrypted State */
+#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
/*
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 3c368b639c04..96e4e9842dfc 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -126,6 +126,7 @@ KVM_X86_OP_OPTIONAL(migrate_timers)
KVM_X86_OP(msr_filter_changed)
KVM_X86_OP(complete_emulated_msr)
KVM_X86_OP(vcpu_deliver_sipi_vector)
+KVM_X86_OP_OPTIONAL_RET0(vcpu_get_apicv_inhibit_reasons);
#undef KVM_X86_OP
#undef KVM_X86_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm-x86-pmu-ops.h b/arch/x86/include/asm/kvm-x86-pmu-ops.h
new file mode 100644
index 000000000000..fdfd8e06fee6
--- /dev/null
+++ b/arch/x86/include/asm/kvm-x86-pmu-ops.h
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#if !defined(KVM_X86_PMU_OP) || !defined(KVM_X86_PMU_OP_OPTIONAL)
+BUILD_BUG_ON(1)
+#endif
+
+/*
+ * KVM_X86_PMU_OP() and KVM_X86_PMU_OP_OPTIONAL() are used to help generate
+ * both DECLARE/DEFINE_STATIC_CALL() invocations and
+ * "static_call_update()" calls.
+ *
+ * KVM_X86_PMU_OP_OPTIONAL() can be used for those functions that can have
+ * a NULL definition, for example if "static_call_cond()" will be used
+ * at the call sites.
+ */
+KVM_X86_PMU_OP(pmc_perf_hw_id)
+KVM_X86_PMU_OP(pmc_is_enabled)
+KVM_X86_PMU_OP(pmc_idx_to_pmc)
+KVM_X86_PMU_OP(rdpmc_ecx_to_pmc)
+KVM_X86_PMU_OP(msr_idx_to_pmc)
+KVM_X86_PMU_OP(is_valid_rdpmc_ecx)
+KVM_X86_PMU_OP(is_valid_msr)
+KVM_X86_PMU_OP(get_msr)
+KVM_X86_PMU_OP(set_msr)
+KVM_X86_PMU_OP(refresh)
+KVM_X86_PMU_OP(init)
+KVM_X86_PMU_OP(reset)
+KVM_X86_PMU_OP_OPTIONAL(deliver_pmi)
+KVM_X86_PMU_OP_OPTIONAL(cleanup)
+
+#undef KVM_X86_PMU_OP
+#undef KVM_X86_PMU_OP_OPTIONAL
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index e0c0f0e1f754..f164c6c1514a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -281,11 +281,11 @@ struct kvm_kernel_irq_routing_entry;
/*
* kvm_mmu_page_role tracks the properties of a shadow page (where shadow page
* also includes TDP pages) to determine whether or not a page can be used in
- * the given MMU context. This is a subset of the overall kvm_mmu_role to
+ * the given MMU context. This is a subset of the overall kvm_cpu_role to
* minimize the size of kvm_memory_slot.arch.gfn_track, i.e. allows allocating
* 2 bytes per gfn instead of 4 bytes per gfn.
*
- * Indirect upper-level shadow pages are tracked for write-protection via
+ * Upper-level shadow pages having gptes are tracked for write-protection via
* gfn_track. As above, gfn_track is a 16 bit counter, so KVM must not create
* more than 2^16-1 upper-level shadow pages at a single gfn, otherwise
* gfn_track will overflow and explosions will ensure.
@@ -331,7 +331,8 @@ union kvm_mmu_page_role {
unsigned smap_andnot_wp:1;
unsigned ad_disabled:1;
unsigned guest_mode:1;
- unsigned :6;
+ unsigned passthrough:1;
+ unsigned :5;
/*
* This is left at the top of the word so that
@@ -367,8 +368,6 @@ union kvm_mmu_extended_role {
struct {
unsigned int valid:1;
unsigned int execonly:1;
- unsigned int cr0_pg:1;
- unsigned int cr4_pae:1;
unsigned int cr4_pse:1;
unsigned int cr4_pke:1;
unsigned int cr4_smap:1;
@@ -378,7 +377,7 @@ union kvm_mmu_extended_role {
};
};
-union kvm_mmu_role {
+union kvm_cpu_role {
u64 as_u64;
struct {
union kvm_mmu_page_role base;
@@ -438,19 +437,8 @@ struct kvm_mmu {
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
struct kvm_mmu_root_info root;
- union kvm_mmu_role mmu_role;
- u8 root_level;
- u8 shadow_root_level;
- u8 ept_ad;
- bool direct_map;
- struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
-
- /*
- * Bitmap; bit set = permission fault
- * Byte index: page fault error code [4:1]
- * Bit index: pte permissions in ACC_* format
- */
- u8 permissions[16];
+ union kvm_cpu_role cpu_role;
+ union kvm_mmu_page_role root_role;
/*
* The pkru_mask indicates if protection key checks are needed. It
@@ -460,6 +448,15 @@ struct kvm_mmu {
*/
u32 pkru_mask;
+ struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
+
+ /*
+ * Bitmap; bit set = permission fault
+ * Byte index: page fault error code [4:1]
+ * Bit index: pte permissions in ACC_* format
+ */
+ u8 permissions[16];
+
u64 *pae_root;
u64 *pml4_root;
u64 *pml5_root;
@@ -607,16 +604,21 @@ struct kvm_vcpu_hv {
struct kvm_vcpu_xen {
u64 hypercall_rip;
u32 current_runstate;
- bool vcpu_info_set;
- bool vcpu_time_info_set;
- bool runstate_set;
- struct gfn_to_hva_cache vcpu_info_cache;
- struct gfn_to_hva_cache vcpu_time_info_cache;
- struct gfn_to_hva_cache runstate_cache;
+ u8 upcall_vector;
+ struct gfn_to_pfn_cache vcpu_info_cache;
+ struct gfn_to_pfn_cache vcpu_time_info_cache;
+ struct gfn_to_pfn_cache runstate_cache;
u64 last_steal;
u64 runstate_entry_time;
u64 runstate_times[4];
unsigned long evtchn_pending_sel;
+ u32 vcpu_id; /* The Xen / ACPI vCPU ID */
+ u32 timer_virq;
+ u64 timer_expires; /* In guest epoch */
+ atomic_t timer_pending;
+ struct hrtimer timer;
+ int poll_evtchn;
+ struct timer_list poll_timer;
};
struct kvm_vcpu_arch {
@@ -753,8 +755,7 @@ struct kvm_vcpu_arch {
gpa_t time;
struct pvclock_vcpu_time_info hv_clock;
unsigned int hw_tsc_khz;
- struct gfn_to_hva_cache pv_time;
- bool pv_time_enabled;
+ struct gfn_to_pfn_cache pv_time;
/* set guest stopped flag in pvclock flags field */
bool pvclock_set_guest_stopped_request;
@@ -1024,9 +1025,12 @@ struct msr_bitmap_range {
/* Xen emulation context */
struct kvm_xen {
+ u32 xen_version;
bool long_mode;
u8 upcall_vector;
struct gfn_to_pfn_cache shinfo_cache;
+ struct idr evtchn_ports;
+ unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)];
};
enum kvm_irqchip_mode {
@@ -1119,6 +1123,8 @@ struct kvm_arch {
u64 cur_tsc_generation;
int nr_vcpus_matched_tsc;
+ u32 default_tsc_khz;
+
seqcount_raw_spinlock_t pvclock_sc;
bool use_master_clock;
u64 master_kernel_ns;
@@ -1455,8 +1461,6 @@ struct kvm_x86_ops {
int cpu_dirty_log_size;
void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu);
- /* pmu operations of sub-arch */
- const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
@@ -1498,11 +1502,18 @@ struct kvm_x86_ops {
int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err);
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
+
+ /*
+ * Returns vCPU specific APICv inhibit reasons
+ */
+ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu);
};
struct kvm_x86_nested_ops {
void (*leave_nested)(struct kvm_vcpu *vcpu);
int (*check_events)(struct kvm_vcpu *vcpu);
+ bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
@@ -1527,6 +1538,7 @@ struct kvm_x86_init_ops {
unsigned int (*handle_intel_pt_intr)(void);
struct kvm_x86_ops *runtime_ops;
+ struct kvm_pmu_ops *pmu_ops;
};
struct kvm_arch_async_pf {
@@ -1548,20 +1560,6 @@ extern struct kvm_x86_ops kvm_x86_ops;
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
-static inline void kvm_ops_static_call_update(void)
-{
-#define __KVM_X86_OP(func) \
- static_call_update(kvm_x86_##func, kvm_x86_ops.func);
-#define KVM_X86_OP(func) \
- WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
-#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
-#define KVM_X86_OP_OPTIONAL_RET0(func) \
- static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
- (void *)__static_call_return0);
-#include <asm/kvm-x86-ops.h>
-#undef __KVM_X86_OP
-}
-
#define __KVM_HAVE_ARCH_VM_ALLOC
static inline struct kvm *kvm_arch_alloc_vm(void)
{
@@ -1799,6 +1797,7 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception);
bool kvm_apicv_activated(struct kvm *kvm);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu);
void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu);
void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm,
enum kvm_apicv_inhibit reason, bool set);
@@ -1988,6 +1987,7 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages);
KVM_X86_QUIRK_CD_NW_CLEARED | \
KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \
KVM_X86_QUIRK_OUT_7E_INC_RIP | \
- KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)
+ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \
+ KVM_X86_QUIRK_FIX_HYPERCALL_INSN)
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index f78e2b3501a1..35f222aa66bf 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -382,6 +382,103 @@ do { \
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+#ifdef CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm_volatile_goto("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ _ASM_EXTABLE_UA(1b, %l[label]) \
+ : CC_OUT(z) (success), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory" \
+ : label); \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+#endif // CONFIG_X86_32
+#else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+#define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \
+ int __err = 0; \
+ bool success; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\
+ CC_SET(z) \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[errout]) \
+ : CC_OUT(z) (success), \
+ [errout] "+r" (__err), \
+ [ptr] "+m" (*_ptr), \
+ [old] "+a" (__old) \
+ : [new] ltype (__new) \
+ : "memory", "cc"); \
+ if (unlikely(__err)) \
+ goto label; \
+ if (unlikely(!success)) \
+ *_old = __old; \
+ likely(success); })
+
+#ifdef CONFIG_X86_32
+/*
+ * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
+ * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses
+ * both ESI and EDI for the memory operand, compilation will fail if the error
+ * is an input+output as there will be no register available for input.
+ */
+#define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \
+ int __result; \
+ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \
+ __typeof__(*(_ptr)) __old = *_old; \
+ __typeof__(*(_ptr)) __new = (_new); \
+ asm volatile("\n" \
+ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
+ "mov $0, %%ecx\n\t" \
+ "setz %%cl\n" \
+ "2:\n" \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
+ : [result]"=c" (__result), \
+ "+A" (__old), \
+ [ptr] "+m" (*_ptr) \
+ : "b" ((u32)__new), \
+ "c" ((u32)((u64)__new >> 32)) \
+ : "memory", "cc"); \
+ if (unlikely(__result < 0)) \
+ goto label; \
+ if (unlikely(!__result)) \
+ *_old = __old; \
+ likely(__result); })
+#endif // CONFIG_X86_32
+#endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT
+
/* FIXME: this hack is definitely wrong -AK */
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))
@@ -474,6 +571,51 @@ do { \
} while (0)
#endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT
+extern void __try_cmpxchg_user_wrong_size(void);
+
+#ifndef CONFIG_X86_32
+#define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \
+ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label)
+#endif
+
+/*
+ * Force the pointer to u<size> to match the size expected by the asm helper.
+ * clang/LLVM compiles all cases and only discards the unused paths after
+ * processing errors, which breaks i386 if the pointer is an 8-byte value.
+ */
+#define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ bool __ret; \
+ __chk_user_ptr(_ptr); \
+ switch (sizeof(*(_ptr))) { \
+ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \
+ (__force u8 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \
+ (__force u16 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \
+ (__force u32 *)(_ptr), (_oldp), \
+ (_nval), _label); \
+ break; \
+ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\
+ (_nval), _label); \
+ break; \
+ default: __try_cmpxchg_user_wrong_size(); \
+ } \
+ __ret; })
+
+/* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */
+#define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \
+ int __ret = -EFAULT; \
+ __uaccess_begin_nospec(); \
+ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \
+_label: \
+ __uaccess_end(); \
+ __ret; \
+ })
+
/*
* We want the unsafe accessors to always be inlined and use
* the error labels - thus the macro games.
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 0ffaa3156a4e..6c343c6a1855 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -543,16 +543,14 @@ enum vm_entry_failure_code {
#define EPT_VIOLATION_ACC_READ_BIT 0
#define EPT_VIOLATION_ACC_WRITE_BIT 1
#define EPT_VIOLATION_ACC_INSTR_BIT 2
-#define EPT_VIOLATION_READABLE_BIT 3
-#define EPT_VIOLATION_WRITABLE_BIT 4
-#define EPT_VIOLATION_EXECUTABLE_BIT 5
+#define EPT_VIOLATION_RWX_SHIFT 3
+#define EPT_VIOLATION_GVA_IS_VALID_BIT 7
#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8
#define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT)
#define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT)
#define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT)
-#define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT)
-#define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT)
-#define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT)
+#define EPT_VIOLATION_RWX_MASK (VMX_EPT_RWX_MASK << EPT_VIOLATION_RWX_SHIFT)
+#define EPT_VIOLATION_GVA_IS_VALID (1 << EPT_VIOLATION_GVA_IS_VALID_BIT)
#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT)
/*
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index bf6e96011dfe..21614807a2cb 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -428,11 +428,12 @@ struct kvm_sync_regs {
struct kvm_vcpu_events events;
};
-#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
-#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
-#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
-#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
-#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0)
+#define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1)
+#define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2)
+#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3)
+#define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4)
+#define KVM_X86_QUIRK_FIX_HYPERCALL_INSN (1 << 5)
#define KVM_STATE_NESTED_FORMAT_VMX 0
#define KVM_STATE_NESTED_FORMAT_SVM 1
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index b14533af7676..9b698215d261 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -5,7 +5,7 @@
#include <asm/ia32.h>
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
#include <asm/kvm_para.h>
#endif
@@ -20,7 +20,7 @@ int main(void)
BLANK();
#endif
-#if defined(CONFIG_KVM_GUEST) && defined(CONFIG_PARAVIRT_SPINLOCKS)
+#if defined(CONFIG_KVM_GUEST)
OFFSET(KVM_STEAL_TIME_preempted, kvm_steal_time, preempted);
BLANK();
#endif
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a22deb58f86d..d0bb2b3fb305 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -752,6 +752,42 @@ static void kvm_crash_shutdown(struct pt_regs *regs)
}
#endif
+#if defined(CONFIG_X86_32) || !defined(CONFIG_SMP)
+bool __kvm_vcpu_is_preempted(long cpu);
+
+__visible bool __kvm_vcpu_is_preempted(long cpu)
+{
+ struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
+
+ return !!(src->preempted & KVM_VCPU_PREEMPTED);
+}
+PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
+
+#else
+
+#include <asm/asm-offsets.h>
+
+extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
+
+/*
+ * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
+ * restoring to/from the stack.
+ */
+asm(
+".pushsection .text;"
+".global __raw_callee_save___kvm_vcpu_is_preempted;"
+".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
+"__raw_callee_save___kvm_vcpu_is_preempted:"
+ASM_ENDBR
+"movq __per_cpu_offset(,%rdi,8), %rax;"
+"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"setne %al;"
+ASM_RET
+".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
+".popsection");
+
+#endif
+
static void __init kvm_guest_init(void)
{
int i;
@@ -764,6 +800,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
static_call_update(pv_steal_clock, kvm_steal_clock);
+
+ pv_ops.lock.vcpu_is_preempted =
+ PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
@@ -1005,40 +1044,6 @@ static void kvm_wait(u8 *ptr, u8 val)
}
}
-#ifdef CONFIG_X86_32
-__visible bool __kvm_vcpu_is_preempted(long cpu)
-{
- struct kvm_steal_time *src = &per_cpu(steal_time, cpu);
-
- return !!(src->preempted & KVM_VCPU_PREEMPTED);
-}
-PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);
-
-#else
-
-#include <asm/asm-offsets.h>
-
-extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);
-
-/*
- * Hand-optimize version for x86-64 to avoid 8 64-bit register saving and
- * restoring to/from the stack.
- */
-asm(
-".pushsection .text;"
-".global __raw_callee_save___kvm_vcpu_is_preempted;"
-".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
-"__raw_callee_save___kvm_vcpu_is_preempted:"
-ASM_ENDBR
-"movq __per_cpu_offset(,%rdi,8), %rax;"
-"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
-"setne %al;"
-ASM_RET
-".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
-".popsection");
-
-#endif
-
/*
* Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
*/
@@ -1082,10 +1087,6 @@ void __init kvm_spinlock_init(void)
pv_ops.lock.wait = kvm_wait;
pv_ops.lock.kick = kvm_kick_cpu;
- if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
- pv_ops.lock.vcpu_is_preempted =
- PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
- }
/*
* When PV spinlock is enabled which is preferred over
* virt_spin_lock(), virt_spin_lock_key's value is meaningless.
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index c5caa7311bd8..16333ba1904b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -239,7 +239,7 @@ static void __init kvmclock_init_mem(void)
static int __init kvm_setup_vsyscall_timeinfo(void)
{
- if (!kvm_para_available() || !kvmclock)
+ if (!kvm_para_available() || !kvmclock || nopv)
return 0;
kvmclock_init_mem();
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index be99dc86293d..e1bb6218bb96 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -252,7 +252,6 @@ int kvm_pic_read_irq(struct kvm *kvm)
*/
irq2 = 7;
intno = s->pics[1].irq_base + irq2;
- irq = irq2 + 8;
} else
intno = s->pics[0].irq_base + irq;
} else {
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 172b05343cfd..f371f1292ca3 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -22,10 +22,14 @@
*/
int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
{
+ int r = 0;
+
if (lapic_in_kernel(vcpu))
- return apic_has_pending_timer(vcpu);
+ r = apic_has_pending_timer(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ r += kvm_xen_has_pending_timer(vcpu);
- return 0;
+ return r;
}
EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
@@ -143,6 +147,8 @@ void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu))
kvm_inject_apic_timer_irqs(vcpu);
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_inject_timer_irqs(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 6e0dab04320e..0687162c4f22 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -181,7 +181,7 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (!level)
return -1;
- return kvm_xen_set_evtchn_fast(e, kvm);
+ return kvm_xen_set_evtchn_fast(&e->xen_evtchn, kvm);
#endif
default:
break;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index a335e7f1f69e..c242e26cfa80 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -138,7 +138,7 @@ static inline void kvm_mmu_load_pgd(struct kvm_vcpu *vcpu)
return;
static_call(kvm_x86_load_mmu_pgd)(vcpu, root_hpa,
- vcpu->arch.mmu->shadow_root_level);
+ vcpu->arch.mmu->root_role.level);
}
struct kvm_page_fault {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 64a2a7e2be90..9826c62f8e40 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -193,11 +193,12 @@ struct kvm_mmu_role_regs {
/*
* Yes, lot's of underscores. They're a hint that you probably shouldn't be
- * reading from the role_regs. Once the mmu_role is constructed, it becomes
+ * reading from the role_regs. Once the root_role is constructed, it becomes
* the single source of truth for the MMU's state.
*/
#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
-static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\
+static inline bool __maybe_unused \
+____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
{ \
return !!(regs->reg & flag); \
}
@@ -221,17 +222,26 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
{ \
- return !!(mmu->mmu_role. base_or_ext . reg##_##name); \
+ return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
}
-BUILD_MMU_ROLE_ACCESSOR(ext, cr0, pg);
BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
-BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pae);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
+BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
+
+static inline bool is_cr0_pg(struct kvm_mmu *mmu)
+{
+ return mmu->cpu_role.base.level > 0;
+}
+
+static inline bool is_cr4_pae(struct kvm_mmu *mmu)
+{
+ return !mmu->cpu_role.base.has_4_byte_gpte;
+}
static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
{
@@ -244,19 +254,6 @@ static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
return regs;
}
-static int role_regs_to_root_level(struct kvm_mmu_role_regs *regs)
-{
- if (!____is_cr0_pg(regs))
- return 0;
- else if (____is_efer_lma(regs))
- return ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL :
- PT64_ROOT_4LEVEL;
- else if (____is_cr4_pae(regs))
- return PT32E_ROOT_LEVEL;
- else
- return PT32_ROOT_LEVEL;
-}
-
static inline bool kvm_available_flush_tlb_with_range(void)
{
return kvm_x86_ops.tlb_remote_flush_with_range;
@@ -473,30 +470,6 @@ retry:
}
#endif
-static bool spte_has_volatile_bits(u64 spte)
-{
- if (!is_shadow_present_pte(spte))
- return false;
-
- /*
- * Always atomically update spte if it can be updated
- * out of mmu-lock, it can ensure dirty bit is not lost,
- * also, it can help us to get a stable is_writable_pte()
- * to ensure tlb flush is not missed.
- */
- if (spte_can_locklessly_be_made_writable(spte) ||
- is_access_track_spte(spte))
- return true;
-
- if (spte_ad_enabled(spte)) {
- if ((spte & shadow_accessed_mask) == 0 ||
- (is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
- return true;
- }
-
- return false;
-}
-
/* Rules for using mmu_spte_set:
* Set the sptep from nonpresent to present.
* Note: the sptep being assigned *must* be either not present
@@ -557,7 +530,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
* we always atomically update it, see the comments in
* spte_has_volatile_bits().
*/
- if (spte_can_locklessly_be_made_writable(old_spte) &&
+ if (is_mmu_writable_spte(old_spte) &&
!is_writable_pte(new_spte))
flush = true;
@@ -591,7 +564,8 @@ static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep)
u64 old_spte = *sptep;
int level = sptep_to_sp(sptep)->role.level;
- if (!spte_has_volatile_bits(old_spte))
+ if (!is_shadow_present_pte(old_spte) ||
+ !spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, 0ull);
else
old_spte = __update_clear_spte_slow(sptep, 0ull);
@@ -737,6 +711,9 @@ static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
{
+ if (sp->role.passthrough)
+ return sp->gfn;
+
if (!sp->role.direct)
return sp->gfns[index];
@@ -745,6 +722,11 @@ static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
{
+ if (sp->role.passthrough) {
+ WARN_ON_ONCE(gfn != sp->gfn);
+ return;
+ }
+
if (!sp->role.direct) {
sp->gfns[index] = gfn;
return;
@@ -1187,7 +1169,7 @@ static bool spte_write_protect(u64 *sptep, bool pt_protect)
u64 spte = *sptep;
if (!is_writable_pte(spte) &&
- !(pt_protect && spte_can_locklessly_be_made_writable(spte)))
+ !(pt_protect && is_mmu_writable_spte(spte)))
return false;
rmap_printk("spte %p %llx\n", sptep, *sptep);
@@ -1856,27 +1838,35 @@ static bool kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list);
+static bool sp_has_gptes(struct kvm_mmu_page *sp)
+{
+ if (sp->role.direct)
+ return false;
+
+ if (sp->role.passthrough)
+ return false;
+
+ return true;
+}
+
#define for_each_valid_sp(_kvm, _sp, _list) \
hlist_for_each_entry(_sp, _list, hash_link) \
if (is_obsolete_sp((_kvm), (_sp))) { \
} else
-#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
+#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn) \
for_each_valid_sp(_kvm, _sp, \
&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)]) \
- if ((_sp)->gfn != (_gfn) || (_sp)->role.direct) {} else
+ if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
-static bool kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
struct list_head *invalid_list)
{
int ret = vcpu->arch.mmu->sync_page(vcpu, sp);
- if (ret < 0) {
+ if (ret < 0)
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
- return false;
- }
-
- return !!ret;
+ return ret;
}
static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
@@ -1998,7 +1988,7 @@ static int mmu_sync_children(struct kvm_vcpu *vcpu,
for_each_sp(pages, sp, parents, i) {
kvm_unlink_unsync_page(vcpu->kvm, sp);
- flush |= kvm_sync_page(vcpu, sp, &invalid_list);
+ flush |= kvm_sync_page(vcpu, sp, &invalid_list) > 0;
mmu_pages_clear_parents(&parents);
}
if (need_resched() || rwlock_needbreak(&vcpu->kvm->mmu_lock)) {
@@ -2034,15 +2024,16 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
int direct,
unsigned int access)
{
- bool direct_mmu = vcpu->arch.mmu->direct_map;
+ bool direct_mmu = vcpu->arch.mmu->root_role.direct;
union kvm_mmu_page_role role;
struct hlist_head *sp_list;
unsigned quadrant;
struct kvm_mmu_page *sp;
+ int ret;
int collisions = 0;
LIST_HEAD(invalid_list);
- role = vcpu->arch.mmu->mmu_role.base;
+ role = vcpu->arch.mmu->root_role;
role.level = level;
role.direct = direct;
role.access = access;
@@ -2051,6 +2042,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
role.quadrant = quadrant;
}
+ if (level <= vcpu->arch.mmu->cpu_role.base.level)
+ role.passthrough = 0;
sp_list = &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
for_each_valid_sp(vcpu->kvm, sp, sp_list) {
@@ -2091,11 +2084,13 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
* If the sync fails, the page is zapped. If so, break
* in order to rebuild it.
*/
- if (!kvm_sync_page(vcpu, sp, &invalid_list))
+ ret = kvm_sync_page(vcpu, sp, &invalid_list);
+ if (ret < 0)
break;
WARN_ON(!list_empty(&invalid_list));
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (ret > 0)
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
__clear_sp_write_flooding_count(sp);
@@ -2112,7 +2107,7 @@ trace_get_page:
sp->gfn = gfn;
sp->role = role;
hlist_add_head(&sp->hash_link, sp_list);
- if (!direct) {
+ if (sp_has_gptes(sp)) {
account_shadowed(vcpu->kvm, sp);
if (level == PG_LEVEL_4K && kvm_vcpu_write_protect_gfn(vcpu, gfn))
kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1);
@@ -2132,11 +2127,11 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
{
iterator->addr = addr;
iterator->shadow_addr = root;
- iterator->level = vcpu->arch.mmu->shadow_root_level;
+ iterator->level = vcpu->arch.mmu->root_role.level;
if (iterator->level >= PT64_ROOT_4LEVEL &&
- vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
- !vcpu->arch.mmu->direct_map)
+ vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
+ !vcpu->arch.mmu->root_role.direct)
iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
@@ -2321,7 +2316,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
/* Zapping children means active_mmu_pages has become unstable. */
list_unstable = *nr_zapped;
- if (!sp->role.invalid && !sp->role.direct)
+ if (!sp->role.invalid && sp_has_gptes(sp))
unaccount_shadowed(kvm, sp);
if (sp->unsync)
@@ -2501,7 +2496,7 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
r = 0;
write_lock(&kvm->mmu_lock);
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
sp->role.word);
r = 1;
@@ -2518,7 +2513,7 @@ static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
gpa_t gpa;
int r;
- if (vcpu->arch.mmu->direct_map)
+ if (vcpu->arch.mmu->root_role.direct)
return 0;
gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2563,7 +2558,7 @@ int mmu_try_to_unsync_pages(struct kvm *kvm, const struct kvm_memory_slot *slot,
* that case, KVM must complete emulation of the guest TLB flush before
* allowing shadow pages to become unsync (writable by the guest).
*/
- for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
+ for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
if (!can_unsync)
return -EPERM;
@@ -3036,7 +3031,7 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, struct kvm_page_fault *fa
* and only if L1's MAXPHYADDR is inaccurate with respect to
* the hardware's).
*/
- if (unlikely(!shadow_mmio_value) ||
+ if (unlikely(!enable_mmio_caching) ||
unlikely(fault->gfn > kvm_mmu_max_gfn())) {
*ret_val = RET_PF_EMULATE;
return true;
@@ -3196,8 +3191,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
* be removed in the fast path only if the SPTE was
* write-protected for dirty-logging or access tracking.
*/
- if (fault->write &&
- spte_can_locklessly_be_made_writable(spte)) {
+ if (fault->write && is_mmu_writable_spte(spte)) {
new_spte |= PT_WRITABLE_MASK;
/*
@@ -3327,7 +3321,7 @@ void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu)
* This should not be called while L2 is active, L2 can't invalidate
* _only_ its own roots, e.g. INVVPID unconditionally exits.
*/
- WARN_ON_ONCE(mmu->mmu_role.base.guest_mode);
+ WARN_ON_ONCE(mmu->root_role.guest_mode);
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
root_hpa = mmu->prev_roots[i].hpa;
@@ -3370,7 +3364,7 @@ static hpa_t mmu_alloc_root(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gva,
static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- u8 shadow_root_level = mmu->shadow_root_level;
+ u8 shadow_root_level = mmu->root_role.level;
hpa_t root;
unsigned i;
int r;
@@ -3494,7 +3488,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* On SVM, reading PDPTRs might access guest memory, which might fault
* and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
*/
- if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
for (i = 0; i < 4; ++i) {
pdptrs[i] = mmu->get_pdptr(vcpu, i);
if (!(pdptrs[i] & PT_PRESENT_MASK))
@@ -3518,9 +3512,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* Do we shadow a long mode page table? If so we need to
* write-protect the guests page table root.
*/
- if (mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
root = mmu_alloc_root(vcpu, root_gfn, 0,
- mmu->shadow_root_level, false);
+ mmu->root_role.level, false);
mmu->root.hpa = root;
goto set_root_pgd;
}
@@ -3536,7 +3530,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
* the shadow page table may be a PAE or a long mode page table.
*/
pm_mask = PT_PRESENT_MASK | shadow_me_mask;
- if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+ if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
if (WARN_ON_ONCE(!mmu->pml4_root)) {
@@ -3545,7 +3539,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask;
- if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) {
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
if (WARN_ON_ONCE(!mmu->pml5_root)) {
r = -EIO;
goto out_unlock;
@@ -3557,7 +3551,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
for (i = 0; i < 4; ++i) {
WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
- if (mmu->root_level == PT32E_ROOT_LEVEL) {
+ if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
if (!(pdptrs[i] & PT_PRESENT_MASK)) {
mmu->pae_root[i] = INVALID_PAE_ROOT;
continue;
@@ -3570,9 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
mmu->pae_root[i] = root | pm_mask;
}
- if (mmu->shadow_root_level == PT64_ROOT_5LEVEL)
+ if (mmu->root_role.level == PT64_ROOT_5LEVEL)
mmu->root.hpa = __pa(mmu->pml5_root);
- else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL)
+ else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
mmu->root.hpa = __pa(mmu->pml4_root);
else
mmu->root.hpa = __pa(mmu->pae_root);
@@ -3588,7 +3582,7 @@ out_unlock:
static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL;
+ bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
u64 *pml5_root = NULL;
u64 *pml4_root = NULL;
u64 *pae_root;
@@ -3599,8 +3593,9 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
* equivalent level in the guest's NPT to shadow. Allocate the tables
* on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
*/
- if (mmu->direct_map || mmu->root_level >= PT64_ROOT_4LEVEL ||
- mmu->shadow_root_level < PT64_ROOT_4LEVEL)
+ if (mmu->root_role.direct ||
+ mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+ mmu->root_role.level < PT64_ROOT_4LEVEL)
return 0;
/*
@@ -3696,7 +3691,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
int i;
struct kvm_mmu_page *sp;
- if (vcpu->arch.mmu->direct_map)
+ if (vcpu->arch.mmu->root_role.direct)
return;
if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
@@ -3704,7 +3699,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
- if (vcpu->arch.mmu->root_level >= PT64_ROOT_4LEVEL) {
+ if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
hpa_t root = vcpu->arch.mmu->root.hpa;
sp = to_shadow_page(root);
@@ -3926,7 +3921,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
arch.token = alloc_apf_token(vcpu);
arch.gfn = gfn;
- arch.direct_map = vcpu->arch.mmu->direct_map;
+ arch.direct_map = vcpu->arch.mmu->root_role.direct;
arch.cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu);
return kvm_setup_async_pf(vcpu, cr2_or_gpa,
@@ -4144,7 +4139,6 @@ static void nonpaging_init_context(struct kvm_mmu *context)
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->direct_map = true;
}
static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
@@ -4238,7 +4232,7 @@ static bool fast_pgd_switch(struct kvm *kvm, struct kvm_mmu *mmu,
void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
- union kvm_mmu_page_role new_role = mmu->mmu_role.base;
+ union kvm_mmu_page_role new_role = mmu->root_role;
if (!fast_pgd_switch(vcpu->kvm, mmu, new_pgd, new_role)) {
/* kvm_mmu_ensure_valid_pgd will set up a new root. */
@@ -4420,7 +4414,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
{
__reset_rsvds_bits_mask(&context->guest_rsvd_check,
vcpu->arch.reserved_gpa_bits,
- context->root_level, is_efer_nx(context),
+ context->cpu_role.base.level, is_efer_nx(context),
guest_can_use_gbpages(vcpu),
is_cr4_pse(context),
guest_cpuid_is_amd_or_hygon(vcpu));
@@ -4485,16 +4479,6 @@ static inline u64 reserved_hpa_bits(void)
static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct kvm_mmu *context)
{
- /*
- * KVM uses NX when TDP is disabled to handle a variety of scenarios,
- * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
- * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
- * The iTLB multi-hit workaround can be toggled at any time, so assume
- * NX can be used by any non-nested shadow MMU to avoid having to reset
- * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
- */
- bool uses_nx = is_efer_nx(context) || !tdp_enabled;
-
/* @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. */
bool is_amd = true;
/* KVM doesn't use 2-level page tables for the shadow MMU. */
@@ -4502,17 +4486,18 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
struct rsvd_bits_validate *shadow_zero_check;
int i;
- WARN_ON_ONCE(context->shadow_root_level < PT32E_ROOT_LEVEL);
+ WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
shadow_zero_check = &context->shadow_zero_check;
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
- context->shadow_root_level, uses_nx,
+ context->root_role.level,
+ context->root_role.efer_nx,
guest_can_use_gbpages(vcpu), is_pse, is_amd);
if (!shadow_me_mask)
return;
- for (i = context->shadow_root_level; --i >= 0;) {
+ for (i = context->root_role.level; --i >= 0;) {
shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
}
@@ -4539,7 +4524,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
if (boot_cpu_is_amd())
__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
- context->shadow_root_level, false,
+ context->root_role.level, false,
boot_cpu_has(X86_FEATURE_GBPAGES),
false, true);
else
@@ -4550,7 +4535,7 @@ reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
if (!shadow_me_mask)
return;
- for (i = context->shadow_root_level; --i >= 0;) {
+ for (i = context->root_role.level; --i >= 0;) {
shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
}
@@ -4735,7 +4720,6 @@ static void paging64_init_context(struct kvm_mmu *context)
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->direct_map = false;
}
static void paging32_init_context(struct kvm_mmu *context)
@@ -4744,51 +4728,45 @@ static void paging32_init_context(struct kvm_mmu *context)
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->direct_map = false;
}
-static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs)
+static union kvm_cpu_role
+kvm_calc_cpu_role(struct kvm_vcpu *vcpu, const struct kvm_mmu_role_regs *regs)
{
- union kvm_mmu_extended_role ext = {0};
-
- if (____is_cr0_pg(regs)) {
- ext.cr0_pg = 1;
- ext.cr4_pae = ____is_cr4_pae(regs);
- ext.cr4_smep = ____is_cr4_smep(regs);
- ext.cr4_smap = ____is_cr4_smap(regs);
- ext.cr4_pse = ____is_cr4_pse(regs);
-
- /* PKEY and LA57 are active iff long mode is active. */
- ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
- ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
- ext.efer_lma = ____is_efer_lma(regs);
- }
-
- ext.valid = 1;
-
- return ext;
-}
-
-static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs,
- bool base_only)
-{
- union kvm_mmu_role role = {0};
+ union kvm_cpu_role role = {0};
role.base.access = ACC_ALL;
- if (____is_cr0_pg(regs)) {
- role.base.efer_nx = ____is_efer_nx(regs);
- role.base.cr0_wp = ____is_cr0_wp(regs);
- }
role.base.smm = is_smm(vcpu);
role.base.guest_mode = is_guest_mode(vcpu);
+ role.ext.valid = 1;
- if (base_only)
+ if (!____is_cr0_pg(regs)) {
+ role.base.direct = 1;
return role;
+ }
- role.ext = kvm_calc_mmu_role_ext(vcpu, regs);
+ role.base.efer_nx = ____is_efer_nx(regs);
+ role.base.cr0_wp = ____is_cr0_wp(regs);
+ role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
+ role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
+ role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
+ if (____is_efer_lma(regs))
+ role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
+ : PT64_ROOT_4LEVEL;
+ else if (____is_cr4_pae(regs))
+ role.base.level = PT32E_ROOT_LEVEL;
+ else
+ role.base.level = PT32_ROOT_LEVEL;
+
+ role.ext.cr4_smep = ____is_cr4_smep(regs);
+ role.ext.cr4_smap = ____is_cr4_smap(regs);
+ role.ext.cr4_pse = ____is_cr4_pse(regs);
+
+ /* PKEY and LA57 are active iff long mode is active. */
+ role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
+ role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
+ role.ext.efer_lma = ____is_efer_lma(regs);
return role;
}
@@ -4805,40 +4783,43 @@ static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
return max_tdp_level;
}
-static union kvm_mmu_role
+static union kvm_mmu_page_role
kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs, bool base_only)
+ union kvm_cpu_role cpu_role)
{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
+ union kvm_mmu_page_role role = {0};
- role.base.ad_disabled = (shadow_accessed_mask == 0);
- role.base.level = kvm_mmu_get_tdp_level(vcpu);
- role.base.direct = true;
- role.base.has_4_byte_gpte = false;
+ role.access = ACC_ALL;
+ role.cr0_wp = true;
+ role.efer_nx = true;
+ role.smm = cpu_role.base.smm;
+ role.guest_mode = cpu_role.base.guest_mode;
+ role.ad_disabled = (shadow_accessed_mask == 0);
+ role.level = kvm_mmu_get_tdp_level(vcpu);
+ role.direct = true;
+ role.has_4_byte_gpte = false;
return role;
}
-static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- union kvm_mmu_role new_role =
- kvm_calc_tdp_mmu_root_page_role(vcpu, &regs, false);
+ union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
- if (new_role.as_u64 == context->mmu_role.as_u64)
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
return;
- context->mmu_role.as_u64 = new_role.as_u64;
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
context->page_fault = kvm_tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = NULL;
- context->shadow_root_level = kvm_mmu_get_tdp_level(vcpu);
- context->direct_map = true;
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
- context->root_level = role_regs_to_root_level(&regs);
if (!is_cr0_pg(context))
context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -4851,46 +4832,16 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
reset_tdp_shadow_zero_bits_mask(context);
}
-static union kvm_mmu_role
-kvm_calc_shadow_root_page_role_common(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs, bool base_only)
-{
- union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, regs, base_only);
-
- role.base.smep_andnot_wp = role.ext.cr4_smep && !____is_cr0_wp(regs);
- role.base.smap_andnot_wp = role.ext.cr4_smap && !____is_cr0_wp(regs);
- role.base.has_4_byte_gpte = ____is_cr0_pg(regs) && !____is_cr4_pae(regs);
-
- return role;
-}
-
-static union kvm_mmu_role
-kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs, bool base_only)
-{
- union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, regs, base_only);
-
- role.base.direct = !____is_cr0_pg(regs);
-
- if (!____is_efer_lma(regs))
- role.base.level = PT32E_ROOT_LEVEL;
- else if (____is_cr4_la57(regs))
- role.base.level = PT64_ROOT_5LEVEL;
- else
- role.base.level = PT64_ROOT_4LEVEL;
-
- return role;
-}
-
static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
- struct kvm_mmu_role_regs *regs,
- union kvm_mmu_role new_role)
+ union kvm_cpu_role cpu_role,
+ union kvm_mmu_page_role root_role)
{
- if (new_role.as_u64 == context->mmu_role.as_u64)
+ if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
+ root_role.word == context->root_role.word)
return;
- context->mmu_role.as_u64 = new_role.as_u64;
+ context->cpu_role.as_u64 = cpu_role.as_u64;
+ context->root_role.word = root_role.word;
if (!is_cr0_pg(context))
nonpaging_init_context(context);
@@ -4898,35 +4849,34 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
paging64_init_context(context);
else
paging32_init_context(context);
- context->root_level = role_regs_to_root_level(regs);
reset_guest_paging_metadata(vcpu, context);
- context->shadow_root_level = new_role.base.level;
-
reset_shadow_zero_bits_mask(vcpu, context);
}
static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs)
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- union kvm_mmu_role new_role =
- kvm_calc_shadow_mmu_root_page_role(vcpu, regs, false);
+ union kvm_mmu_page_role root_role;
- shadow_mmu_init_context(vcpu, context, regs, new_role);
-}
+ root_role = cpu_role.base;
-static union kvm_mmu_role
-kvm_calc_shadow_npt_root_page_role(struct kvm_vcpu *vcpu,
- struct kvm_mmu_role_regs *regs)
-{
- union kvm_mmu_role role =
- kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
+ /* KVM uses PAE paging whenever the guest isn't using 64-bit paging. */
+ root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
- role.base.direct = false;
- role.base.level = kvm_mmu_get_tdp_level(vcpu);
+ /*
+ * KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts.
+ */
+ root_role.efer_nx = true;
- return role;
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
}
void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -4938,24 +4888,34 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
.cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
- union kvm_mmu_role new_role;
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+ union kvm_mmu_page_role root_role;
+
+ /* NPT requires CR0.PG=1. */
+ WARN_ON_ONCE(cpu_role.base.direct);
- new_role = kvm_calc_shadow_npt_root_page_role(vcpu, &regs);
+ root_role = cpu_role.base;
+ root_role.level = kvm_mmu_get_tdp_level(vcpu);
+ if (root_role.level == PT64_ROOT_5LEVEL &&
+ cpu_role.base.level == PT64_ROOT_4LEVEL)
+ root_role.passthrough = 1;
- shadow_mmu_init_context(vcpu, context, &regs, new_role);
+ shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
kvm_mmu_new_pgd(vcpu, nested_cr3);
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
-static union kvm_mmu_role
+static union kvm_cpu_role
kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
bool execonly, u8 level)
{
- union kvm_mmu_role role = {0};
-
- /* SMM flag is inherited from root_mmu */
- role.base.smm = vcpu->arch.root_mmu.mmu_role.base.smm;
+ union kvm_cpu_role role = {0};
+ /*
+ * KVM does not support SMM transfer monitors, and consequently does not
+ * support the "entry to SMM" control either. role.base.smm is always 0.
+ */
+ WARN_ON_ONCE(is_smm(vcpu));
role.base.level = level;
role.base.has_4_byte_gpte = false;
role.base.direct = false;
@@ -4963,7 +4923,6 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
role.base.guest_mode = true;
role.base.access = ACC_ALL;
- /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
role.ext.word = 0;
role.ext.execonly = execonly;
role.ext.valid = 1;
@@ -4977,22 +4936,20 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
{
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
u8 level = vmx_eptp_page_walk_level(new_eptp);
- union kvm_mmu_role new_role =
+ union kvm_cpu_role new_mode =
kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
execonly, level);
- if (new_role.as_u64 != context->mmu_role.as_u64) {
- context->mmu_role.as_u64 = new_role.as_u64;
+ if (new_mode.as_u64 != context->cpu_role.as_u64) {
+ /* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
+ context->cpu_role.as_u64 = new_mode.as_u64;
+ context->root_role.word = new_mode.base.word;
- context->shadow_root_level = level;
-
- context->ept_ad = accessed_dirty;
context->page_fault = ept_page_fault;
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
- context->root_level = level;
- context->direct_map = false;
+
update_permission_bitmask(context, true);
context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
@@ -5003,49 +4960,30 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
}
EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
+static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role cpu_role)
{
struct kvm_mmu *context = &vcpu->arch.root_mmu;
- struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- kvm_init_shadow_mmu(vcpu, &regs);
+ kvm_init_shadow_mmu(vcpu, cpu_role);
context->get_guest_pgd = get_cr3;
context->get_pdptr = kvm_pdptr_read;
context->inject_page_fault = kvm_inject_page_fault;
}
-static union kvm_mmu_role
-kvm_calc_nested_mmu_role(struct kvm_vcpu *vcpu, struct kvm_mmu_role_regs *regs)
-{
- union kvm_mmu_role role;
-
- role = kvm_calc_shadow_root_page_role_common(vcpu, regs, false);
-
- /*
- * Nested MMUs are used only for walking L2's gva->gpa, they never have
- * shadow pages of their own and so "direct" has no meaning. Set it
- * to "true" to try to detect bogus usage of the nested MMU.
- */
- role.base.direct = true;
- role.base.level = role_regs_to_root_level(regs);
- return role;
-}
-
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
+static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+ union kvm_cpu_role new_mode)
{
- struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
- union kvm_mmu_role new_role = kvm_calc_nested_mmu_role(vcpu, &regs);
struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
- if (new_role.as_u64 == g_context->mmu_role.as_u64)
+ if (new_mode.as_u64 == g_context->cpu_role.as_u64)
return;
- g_context->mmu_role.as_u64 = new_role.as_u64;
+ g_context->cpu_role.as_u64 = new_mode.as_u64;
g_context->get_guest_pgd = get_cr3;
g_context->get_pdptr = kvm_pdptr_read;
g_context->inject_page_fault = kvm_inject_page_fault;
- g_context->root_level = new_role.base.level;
/*
* L2 page tables are never shadowed, so there is no need to sync
@@ -5075,12 +5013,15 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
void kvm_init_mmu(struct kvm_vcpu *vcpu)
{
+ struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
+ union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
+
if (mmu_is_nested(vcpu))
- init_kvm_nested_mmu(vcpu);
+ init_kvm_nested_mmu(vcpu, cpu_role);
else if (tdp_enabled)
- init_kvm_tdp_mmu(vcpu);
+ init_kvm_tdp_mmu(vcpu, cpu_role);
else
- init_kvm_softmmu(vcpu);
+ init_kvm_softmmu(vcpu, cpu_role);
}
EXPORT_SYMBOL_GPL(kvm_init_mmu);
@@ -5098,9 +5039,12 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
* problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
- vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
- vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
- vcpu->arch.nested_mmu.mmu_role.ext.valid = 0;
+ vcpu->arch.root_mmu.root_role.word = 0;
+ vcpu->arch.guest_mmu.root_role.word = 0;
+ vcpu->arch.nested_mmu.root_role.word = 0;
+ vcpu->arch.root_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.guest_mmu.cpu_role.ext.valid = 0;
+ vcpu->arch.nested_mmu.cpu_role.ext.valid = 0;
kvm_mmu_reset_context(vcpu);
/*
@@ -5121,13 +5065,13 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
{
int r;
- r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->direct_map);
+ r = mmu_topup_memory_caches(vcpu, !vcpu->arch.mmu->root_role.direct);
if (r)
goto out;
r = mmu_alloc_special_roots(vcpu);
if (r)
goto out;
- if (vcpu->arch.mmu->direct_map)
+ if (vcpu->arch.mmu->root_role.direct)
r = mmu_alloc_direct_roots(vcpu);
else
r = mmu_alloc_shadow_roots(vcpu);
@@ -5354,7 +5298,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
++vcpu->kvm->stat.mmu_pte_write;
- for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
+ for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
if (detect_write_misaligned(sp, gpa, bytes) ||
detect_write_flooding(sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
@@ -5384,7 +5328,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = EMULTYPE_PF;
- bool direct = vcpu->arch.mmu->direct_map;
+ bool direct = vcpu->arch.mmu->root_role.direct;
if (WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
return RET_PF_RETRY;
@@ -5415,7 +5359,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
* paging in both guests. If true, we simply unprotect the page
* and resume the guest.
*/
- if (vcpu->arch.mmu->direct_map &&
+ if (vcpu->arch.mmu->root_role.direct &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
return 1;
@@ -6307,7 +6251,7 @@ int kvm_mmu_vendor_module_init(void)
*/
BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
- BUILD_BUG_ON(sizeof(union kvm_mmu_role) != sizeof(u64));
+ BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
kvm_mmu_reset_all_pte_masks();
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 01fee5f67ac3..b025decf610d 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -63,7 +63,7 @@
#define PT_LEVEL_BITS PT64_LEVEL_BITS
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
- #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
+ #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled)
#ifdef CONFIG_X86_64
#define CMPXCHG "cmpxchgq"
#endif
@@ -144,42 +144,6 @@ static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level)
FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte);
}
-static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- pt_element_t __user *ptep_user, unsigned index,
- pt_element_t orig_pte, pt_element_t new_pte)
-{
- signed char r;
-
- if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
- return -EFAULT;
-
-#ifdef CMPXCHG
- asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
- "setnz %b[r]\n"
- "2:"
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
- : [ptr] "+m" (*ptep_user),
- [old] "+a" (orig_pte),
- [r] "=q" (r)
- : [new] "r" (new_pte)
- : "memory");
-#else
- asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
- "setnz %b[r]\n"
- "2:"
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %k[r])
- : [ptr] "+m" (*ptep_user),
- [old] "+A" (orig_pte),
- [r] "=q" (r)
- : [new_lo] "b" ((u32)new_pte),
- [new_hi] "c" ((u32)(new_pte >> 32))
- : "memory");
-#endif
-
- user_access_end();
- return r;
-}
-
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
u64 gpte)
@@ -187,7 +151,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
if (!FNAME(is_present_gpte)(gpte))
goto no_present;
- /* if accessed bit is not supported prefetch non accessed gpte */
+ /* Prefetch only accessed entries (unless A/D bits are disabled). */
if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
!(gpte & PT_GUEST_ACCESSED_MASK))
goto no_present;
@@ -278,7 +242,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
if (unlikely(!walker->pte_writable[level - 1]))
continue;
- ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
+ ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault);
if (ret)
return ret;
@@ -317,7 +281,7 @@ static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu,
* is not reserved and does not indicate a large page at this level,
* so clear PT_PAGE_SIZE_MASK in gpte if that is the case.
*/
- gpte &= level - (PT32_ROOT_LEVEL + mmu->mmu_role.ext.cr4_pse);
+ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse);
#endif
/*
* PG_LEVEL_4K always terminates. The RHS has bit 7 set
@@ -355,7 +319,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
- walker->level = mmu->root_level;
+ walker->level = mmu->cpu_role.base.level;
pte = mmu->get_guest_pgd(vcpu);
have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
@@ -515,14 +479,21 @@ error:
* The other bits are set to 0.
*/
if (!(errcode & PFERR_RSVD_MASK)) {
- vcpu->arch.exit_qualification &= 0x180;
+ vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID |
+ EPT_VIOLATION_GVA_TRANSLATED);
if (write_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
if (user_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
if (fetch_fault)
vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
- vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
+
+ /*
+ * Note, pte_access holds the raw RWX bits from the EPTE, not
+ * ACC_*_MASK flags!
+ */
+ vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) <<
+ EPT_VIOLATION_RWX_SHIFT;
}
#endif
walker->fault.address = addr;
@@ -650,7 +621,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
WARN_ON_ONCE(gw->gfn != base_gfn);
direct_access = gw->pte_access;
- top_level = vcpu->arch.mmu->root_level;
+ top_level = vcpu->arch.mmu->cpu_role.base.level;
if (top_level == PT32E_ROOT_LEVEL)
top_level = PT32_ROOT_LEVEL;
/*
@@ -1017,7 +988,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
- union kvm_mmu_page_role mmu_role = vcpu->arch.mmu->mmu_role.base;
+ union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
int i;
bool host_writable;
gpa_t first_pte_gpa;
@@ -1036,6 +1007,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
.level = 0xf,
.access = 0x7,
.quadrant = 0x3,
+ .passthrough = 0x1,
};
/*
@@ -1045,7 +1017,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
* reserved bits checks will be wrong, etc...
*/
if (WARN_ON_ONCE(sp->role.direct ||
- (sp->role.word ^ mmu_role.word) & ~sync_role_ign.word))
+ (sp->role.word ^ root_role.word) & ~sync_role_ign.word))
return -1;
first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 4739b53c9734..800b857b3a53 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -19,7 +19,7 @@
#include <asm/memtype.h>
#include <asm/vmx.h>
-static bool __read_mostly enable_mmio_caching = true;
+bool __read_mostly enable_mmio_caching = true;
module_param_named(mmio_caching, enable_mmio_caching, bool, 0444);
u64 __read_mostly shadow_host_writable_mask;
@@ -90,6 +90,34 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
E820_TYPE_RAM);
}
+/*
+ * Returns true if the SPTE has bits that may be set without holding mmu_lock.
+ * The caller is responsible for checking if the SPTE is shadow-present, and
+ * for determining whether or not the caller cares about non-leaf SPTEs.
+ */
+bool spte_has_volatile_bits(u64 spte)
+{
+ /*
+ * Always atomically update spte if it can be updated
+ * out of mmu-lock, it can ensure dirty bit is not lost,
+ * also, it can help us to get a stable is_writable_pte()
+ * to ensure tlb flush is not missed.
+ */
+ if (!is_writable_pte(spte) && is_mmu_writable_spte(spte))
+ return true;
+
+ if (is_access_track_spte(spte))
+ return true;
+
+ if (spte_ad_enabled(spte)) {
+ if (!(spte & shadow_accessed_mask) ||
+ (is_writable_pte(spte) && !(spte & shadow_dirty_mask)))
+ return true;
+ }
+
+ return false;
+}
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
@@ -139,7 +167,7 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
else
pte_access &= ~ACC_WRITE_MASK;
- if (!kvm_is_mmio_pfn(pfn))
+ if (shadow_me_mask && !kvm_is_mmio_pfn(pfn))
spte |= shadow_me_mask;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -351,6 +379,9 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask)
WARN_ON(mmio_value && (REMOVED_SPTE & mmio_mask) == mmio_value))
mmio_value = 0;
+ if (!mmio_value)
+ enable_mmio_caching = false;
+
shadow_mmio_value = mmio_value;
shadow_mmio_mask = mmio_mask;
shadow_mmio_access_mask = access_mask;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index e4abeb5df1b1..76f12b375b5a 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -5,6 +5,8 @@
#include "mmu_internal.h"
+extern bool __read_mostly enable_mmio_caching;
+
/*
* A MMU present SPTE is backed by actual memory and may or may not be present
* in hardware. E.g. MMIO SPTEs are not considered present. Use bit 11, as it
@@ -204,7 +206,7 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
static inline bool is_mmio_spte(u64 spte)
{
return (spte & shadow_mmio_mask) == shadow_mmio_value &&
- likely(shadow_mmio_value);
+ likely(enable_mmio_caching);
}
static inline bool is_shadow_present_pte(u64 pte)
@@ -390,7 +392,7 @@ static inline void check_spte_writable_invariants(u64 spte)
"kvm: Writable SPTE is not MMU-writable: %llx", spte);
}
-static inline bool spte_can_locklessly_be_made_writable(u64 spte)
+static inline bool is_mmu_writable_spte(u64 spte)
{
return spte & shadow_mmu_writable_mask;
}
@@ -404,6 +406,8 @@ static inline u64 get_mmio_spte_generation(u64 spte)
return gen;
}
+bool spte_has_volatile_bits(u64 spte);
+
bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
const struct kvm_memory_slot *slot,
unsigned int pte_access, gfn_t gfn, kvm_pfn_t pfn,
diff --git a/arch/x86/kvm/mmu/tdp_iter.h b/arch/x86/kvm/mmu/tdp_iter.h
index b1eaf6ec0e0b..f0af385c56e0 100644
--- a/arch/x86/kvm/mmu/tdp_iter.h
+++ b/arch/x86/kvm/mmu/tdp_iter.h
@@ -6,6 +6,7 @@
#include <linux/kvm_host.h>
#include "mmu.h"
+#include "spte.h"
/*
* TDP MMU SPTEs are RCU protected to allow paging structures (non-leaf SPTEs)
@@ -17,9 +18,38 @@ static inline u64 kvm_tdp_mmu_read_spte(tdp_ptep_t sptep)
{
return READ_ONCE(*rcu_dereference(sptep));
}
-static inline void kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 val)
+
+static inline u64 kvm_tdp_mmu_write_spte_atomic(tdp_ptep_t sptep, u64 new_spte)
+{
+ return xchg(rcu_dereference(sptep), new_spte);
+}
+
+static inline void __kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 new_spte)
+{
+ WRITE_ONCE(*rcu_dereference(sptep), new_spte);
+}
+
+static inline u64 kvm_tdp_mmu_write_spte(tdp_ptep_t sptep, u64 old_spte,
+ u64 new_spte, int level)
{
- WRITE_ONCE(*rcu_dereference(sptep), val);
+ /*
+ * Atomically write the SPTE if it is a shadow-present, leaf SPTE with
+ * volatile bits, i.e. has bits that can be set outside of mmu_lock.
+ * The Writable bit can be set by KVM's fast page fault handler, and
+ * Accessed and Dirty bits can be set by the CPU.
+ *
+ * Note, non-leaf SPTEs do have Accessed bits and those bits are
+ * technically volatile, but KVM doesn't consume the Accessed bit of
+ * non-leaf SPTEs, i.e. KVM doesn't care if it clobbers the bit. This
+ * logic needs to be reassessed if KVM were to use non-leaf Accessed
+ * bits, e.g. to skip stepping down into child SPTEs when aging SPTEs.
+ */
+ if (is_shadow_present_pte(old_spte) && is_last_spte(old_spte, level) &&
+ spte_has_volatile_bits(old_spte))
+ return kvm_tdp_mmu_write_spte_atomic(sptep, new_spte);
+
+ __kvm_tdp_mmu_write_spte(sptep, new_spte);
+ return old_spte;
}
/*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index edc68538819b..f84133856063 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -310,7 +310,7 @@ static void tdp_mmu_init_child_sp(struct kvm_mmu_page *child_sp,
hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu)
{
- union kvm_mmu_page_role role = vcpu->arch.mmu->mmu_role.base;
+ union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *root;
@@ -426,9 +426,9 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
tdp_mmu_unlink_sp(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- u64 *sptep = rcu_dereference(pt) + i;
+ tdp_ptep_t sptep = pt + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
- u64 old_child_spte;
+ u64 old_spte;
if (shared) {
/*
@@ -440,8 +440,8 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* value to the removed SPTE value.
*/
for (;;) {
- old_child_spte = xchg(sptep, REMOVED_SPTE);
- if (!is_removed_spte(old_child_spte))
+ old_spte = kvm_tdp_mmu_write_spte_atomic(sptep, REMOVED_SPTE);
+ if (!is_removed_spte(old_spte))
break;
cpu_relax();
}
@@ -455,23 +455,43 @@ static void handle_removed_pt(struct kvm *kvm, tdp_ptep_t pt, bool shared)
* are guarded by the memslots generation, not by being
* unreachable.
*/
- old_child_spte = READ_ONCE(*sptep);
- if (!is_shadow_present_pte(old_child_spte))
+ old_spte = kvm_tdp_mmu_read_spte(sptep);
+ if (!is_shadow_present_pte(old_spte))
continue;
/*
- * Marking the SPTE as a removed SPTE is not
- * strictly necessary here as the MMU lock will
- * stop other threads from concurrently modifying
- * this SPTE. Using the removed SPTE value keeps
- * the two branches consistent and simplifies
- * the function.
+ * Use the common helper instead of a raw WRITE_ONCE as
+ * the SPTE needs to be updated atomically if it can be
+ * modified by a different vCPU outside of mmu_lock.
+ * Even though the parent SPTE is !PRESENT, the TLB
+ * hasn't yet been flushed, and both Intel and AMD
+ * document that A/D assists can use upper-level PxE
+ * entries that are cached in the TLB, i.e. the CPU can
+ * still access the page and mark it dirty.
+ *
+ * No retry is needed in the atomic update path as the
+ * sole concern is dropping a Dirty bit, i.e. no other
+ * task can zap/remove the SPTE as mmu_lock is held for
+ * write. Marking the SPTE as a removed SPTE is not
+ * strictly necessary for the same reason, but using
+ * the remove SPTE value keeps the shared/exclusive
+ * paths consistent and allows the handle_changed_spte()
+ * call below to hardcode the new value to REMOVED_SPTE.
+ *
+ * Note, even though dropping a Dirty bit is the only
+ * scenario where a non-atomic update could result in a
+ * functional bug, simply checking the Dirty bit isn't
+ * sufficient as a fast page fault could read the upper
+ * level SPTE before it is zapped, and then make this
+ * target SPTE writable, resume the guest, and set the
+ * Dirty bit between reading the SPTE above and writing
+ * it here.
*/
- WRITE_ONCE(*sptep, REMOVED_SPTE);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte,
+ REMOVED_SPTE, level);
}
handle_changed_spte(kvm, kvm_mmu_page_as_id(sp), gfn,
- old_child_spte, REMOVED_SPTE, level,
- shared);
+ old_spte, REMOVED_SPTE, level, shared);
}
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -667,14 +687,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
KVM_PAGES_PER_HPAGE(iter->level));
/*
- * No other thread can overwrite the removed SPTE as they
- * must either wait on the MMU lock or use
- * tdp_mmu_set_spte_atomic which will not overwrite the
- * special removed SPTE value. No bookkeeping is needed
- * here since the SPTE is going from non-present
- * to non-present.
+ * No other thread can overwrite the removed SPTE as they must either
+ * wait on the MMU lock or use tdp_mmu_set_spte_atomic() which will not
+ * overwrite the special removed SPTE value. No bookkeeping is needed
+ * here since the SPTE is going from non-present to non-present. Use
+ * the raw write helper to avoid an unnecessary check on volatile bits.
*/
- kvm_tdp_mmu_write_spte(iter->sptep, 0);
+ __kvm_tdp_mmu_write_spte(iter->sptep, 0);
return 0;
}
@@ -699,10 +718,13 @@ static inline int tdp_mmu_zap_spte_atomic(struct kvm *kvm,
* unless performing certain dirty logging operations.
* Leaving record_dirty_log unset in that case prevents page
* writes from being double counted.
+ *
+ * Returns the old SPTE value, which _may_ be different than @old_spte if the
+ * SPTE had voldatile bits.
*/
-static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
- u64 old_spte, u64 new_spte, gfn_t gfn, int level,
- bool record_acc_track, bool record_dirty_log)
+static u64 __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
+ u64 old_spte, u64 new_spte, gfn_t gfn, int level,
+ bool record_acc_track, bool record_dirty_log)
{
lockdep_assert_held_write(&kvm->mmu_lock);
@@ -715,7 +737,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
*/
WARN_ON(is_removed_spte(old_spte) || is_removed_spte(new_spte));
- kvm_tdp_mmu_write_spte(sptep, new_spte);
+ old_spte = kvm_tdp_mmu_write_spte(sptep, old_spte, new_spte, level);
__handle_changed_spte(kvm, as_id, gfn, old_spte, new_spte, level, false);
@@ -724,6 +746,7 @@ static void __tdp_mmu_set_spte(struct kvm *kvm, int as_id, tdp_ptep_t sptep,
if (record_dirty_log)
handle_changed_spte_dirty_log(kvm, as_id, gfn, old_spte,
new_spte, level);
+ return old_spte;
}
static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -732,9 +755,10 @@ static inline void _tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
{
WARN_ON_ONCE(iter->yielded);
- __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep, iter->old_spte,
- new_spte, iter->gfn, iter->level,
- record_acc_track, record_dirty_log);
+ iter->old_spte = __tdp_mmu_set_spte(kvm, iter->as_id, iter->sptep,
+ iter->old_spte, new_spte,
+ iter->gfn, iter->level,
+ record_acc_track, record_dirty_log);
}
static inline void tdp_mmu_set_spte(struct kvm *kvm, struct tdp_iter *iter,
@@ -1835,7 +1859,7 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
gfn_t gfn = addr >> PAGE_SHIFT;
int leaf = -1;
- *root_level = vcpu->arch.mmu->shadow_root_level;
+ *root_level = vcpu->arch.mmu->root_role.level;
tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) {
leaf = iter.level;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index eca39f56c231..618f529f1c4d 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -49,6 +49,32 @@
* * AMD: [0 .. AMD64_NUM_COUNTERS-1] <=> gp counters
*/
+static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
+
+#define KVM_X86_PMU_OP(func) \
+ DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func, \
+ *(((struct kvm_pmu_ops *)0)->func));
+#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
+{
+ memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));
+
+#define __KVM_X86_PMU_OP(func) \
+ static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
+#define KVM_X86_PMU_OP(func) \
+ WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
+#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
+#include <asm/kvm-x86-pmu-ops.h>
+#undef __KVM_X86_PMU_OP
+}
+
+static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
+{
+ return static_call(kvm_x86_pmu_pmc_is_enabled)(pmc);
+}
+
static void kvm_pmi_trigger_fn(struct irq_work *irq_work)
{
struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, irq_work);
@@ -213,7 +239,7 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
ARCH_PERFMON_EVENTSEL_CMASK |
HSW_IN_TX |
HSW_IN_TX_CHECKPOINTED))) {
- config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
if (config != PERF_COUNT_HW_MAX)
type = PERF_TYPE_HARDWARE;
}
@@ -263,7 +289,7 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
pmc->current_config = (u64)ctrl;
pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
- kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc),
+ static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc),
!(en_field & 0x2), /* exclude user */
!(en_field & 0x1), /* exclude kernel */
pmi);
@@ -272,7 +298,7 @@ EXPORT_SYMBOL_GPL(reprogram_fixed_counter);
void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx)
{
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, pmc_idx);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, pmc_idx);
if (!pmc)
return;
@@ -294,7 +320,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
int bit;
for_each_set_bit(bit, pmu->reprogram_pmi, X86_PMC_IDX_MAX) {
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, bit);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, bit);
if (unlikely(!pmc || !pmc->perf_event)) {
clear_bit(bit, pmu->reprogram_pmi);
@@ -316,7 +342,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
/* check if idx is a valid index to access PMU */
bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
- return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx);
+ return static_call(kvm_x86_pmu_is_valid_rdpmc_ecx)(vcpu, idx);
}
bool is_vmware_backdoor_pmc(u32 pmc_idx)
@@ -366,7 +392,7 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
- pmc = kvm_x86_ops.pmu_ops->rdpmc_ecx_to_pmc(vcpu, idx, &mask);
+ pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
if (!pmc)
return 1;
@@ -382,22 +408,21 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
if (lapic_in_kernel(vcpu)) {
- if (kvm_x86_ops.pmu_ops->deliver_pmi)
- kvm_x86_ops.pmu_ops->deliver_pmi(vcpu);
+ static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
}
}
bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
- return kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr) ||
- kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, msr);
+ return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
+ static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}
static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc = kvm_x86_ops.pmu_ops->msr_idx_to_pmc(vcpu, msr);
+ struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);
if (pmc)
__set_bit(pmc->idx, pmu->pmc_in_use);
@@ -405,13 +430,13 @@ static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
- return kvm_x86_ops.pmu_ops->get_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
}
int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
- return kvm_x86_ops.pmu_ops->set_msr(vcpu, msr_info);
+ return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}
/* refresh PMU settings. This function generally is called when underlying
@@ -420,7 +445,7 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
*/
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
- kvm_x86_ops.pmu_ops->refresh(vcpu);
+ static_call(kvm_x86_pmu_refresh)(vcpu);
}
void kvm_pmu_reset(struct kvm_vcpu *vcpu)
@@ -428,7 +453,7 @@ void kvm_pmu_reset(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
irq_work_sync(&pmu->irq_work);
- kvm_x86_ops.pmu_ops->reset(vcpu);
+ static_call(kvm_x86_pmu_reset)(vcpu);
}
void kvm_pmu_init(struct kvm_vcpu *vcpu)
@@ -436,7 +461,7 @@ void kvm_pmu_init(struct kvm_vcpu *vcpu)
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
memset(pmu, 0, sizeof(*pmu));
- kvm_x86_ops.pmu_ops->init(vcpu);
+ static_call(kvm_x86_pmu_init)(vcpu);
init_irq_work(&pmu->irq_work, kvm_pmi_trigger_fn);
pmu->event_count = 0;
pmu->need_cleanup = false;
@@ -468,14 +493,13 @@ void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
pmu->pmc_in_use, X86_PMC_IDX_MAX);
for_each_set_bit(i, bitmask, X86_PMC_IDX_MAX) {
- pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (pmc && pmc->perf_event && !pmc_speculative_in_use(pmc))
pmc_stop_counter(pmc);
}
- if (kvm_x86_ops.pmu_ops->cleanup)
- kvm_x86_ops.pmu_ops->cleanup(vcpu);
+ static_call_cond(kvm_x86_pmu_cleanup)(vcpu);
bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}
@@ -505,7 +529,7 @@ static inline bool eventsel_match_perf_hw_id(struct kvm_pmc *pmc,
unsigned int config;
pmc->eventsel &= (ARCH_PERFMON_EVENTSEL_EVENT | ARCH_PERFMON_EVENTSEL_UMASK);
- config = kvm_x86_ops.pmu_ops->pmc_perf_hw_id(pmc);
+ config = static_call(kvm_x86_pmu_pmc_perf_hw_id)(pmc);
pmc->eventsel = old_eventsel;
return config == perf_hw_id;
}
@@ -533,7 +557,7 @@ void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 perf_hw_id)
int i;
for_each_set_bit(i, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX) {
- pmc = kvm_x86_ops.pmu_ops->pmc_idx_to_pmc(pmu, i);
+ pmc = static_call(kvm_x86_pmu_pmc_idx_to_pmc)(pmu, i);
if (!pmc || !pmc_is_enabled(pmc) || !pmc_speculative_in_use(pmc))
continue;
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 9e66fba1d6a3..2a53b6c9495c 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -39,6 +39,8 @@ struct kvm_pmu_ops {
void (*cleanup)(struct kvm_vcpu *vcpu);
};
+void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops);
+
static inline u64 pmc_bitmask(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
@@ -86,11 +88,6 @@ static inline bool pmc_is_fixed(struct kvm_pmc *pmc)
return pmc->type == KVM_PMC_FIXED;
}
-static inline bool pmc_is_enabled(struct kvm_pmc *pmc)
-{
- return kvm_x86_ops.pmu_ops->pmc_is_enabled(pmc);
-}
-
static inline bool kvm_valid_perf_global_ctrl(struct kvm_pmu *pmu,
u64 data)
{
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 421619540ff9..54fe03714f8a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -165,9 +165,8 @@ free_avic:
return err;
}
-void avic_init_vmcb(struct vcpu_svm *svm)
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb)
{
- struct vmcb *vmcb = svm->vmcb;
struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm);
phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page));
phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page));
@@ -285,11 +284,77 @@ void avic_ring_doorbell(struct kvm_vcpu *vcpu)
put_cpu();
}
-static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
- u32 icrl, u32 icrh)
+/*
+ * A fast-path version of avic_kick_target_vcpus(), which attempts to match
+ * destination APIC ID to vCPU without looping through all vCPUs.
+ */
+static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
{
+ u32 dest, apic_id;
struct kvm_vcpu *vcpu;
+ int dest_mode = icrl & APIC_DEST_MASK;
+ int shorthand = icrl & APIC_SHORT_MASK;
+ struct kvm_svm *kvm_svm = to_kvm_svm(kvm);
+ u32 *avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page);
+
+ if (shorthand != APIC_DEST_NOSHORT)
+ return -EINVAL;
+
+ /*
+ * The AVIC incomplete IPI #vmexit info provides index into
+ * the physical APIC ID table, which can be used to derive
+ * guest physical APIC ID.
+ */
+ if (dest_mode == APIC_DEST_PHYSICAL) {
+ apic_id = index;
+ } else {
+ if (!apic_x2apic_mode(source)) {
+ /* For xAPIC logical mode, the index is for logical APIC table. */
+ apic_id = avic_logical_id_table[index] & 0x1ff;
+ } else {
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * Assuming vcpu ID is the same as physical apic ID,
+ * and use it to retrieve the target vCPU.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, apic_id);
+ if (!vcpu)
+ return -EINVAL;
+
+ if (apic_x2apic_mode(vcpu->arch.apic))
+ dest = icrh;
+ else
+ dest = GET_APIC_DEST_FIELD(icrh);
+
+ /*
+ * Try matching the destination APIC ID with the vCPU.
+ */
+ if (kvm_apic_match_dest(vcpu, source, shorthand, dest, dest_mode)) {
+ vcpu->arch.apic->irr_pending = true;
+ svm_complete_interrupt_delivery(vcpu,
+ icrl & APIC_MODE_MASK,
+ icrl & APIC_INT_LEVELTRIG,
+ icrl & APIC_VECTOR_MASK);
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
+ u32 icrl, u32 icrh, u32 index)
+{
unsigned long i;
+ struct kvm_vcpu *vcpu;
+
+ if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index))
+ return;
+
+ trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index);
/*
* Wake any target vCPUs that are blocking, i.e. waiting for a wake
@@ -316,7 +381,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
u32 icrh = svm->vmcb->control.exit_info_1 >> 32;
u32 icrl = svm->vmcb->control.exit_info_1;
u32 id = svm->vmcb->control.exit_info_2 >> 32;
- u32 index = svm->vmcb->control.exit_info_2 & 0xFF;
+ u32 index = svm->vmcb->control.exit_info_2 & 0x1FF;
struct kvm_lapic *apic = vcpu->arch.apic;
trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index);
@@ -343,7 +408,7 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
* set the appropriate IRR bits on the valid target
* vcpus. So, we just need to kick the appropriate vcpu.
*/
- avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh);
+ avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index);
break;
case AVIC_IPI_FAILURE_INVALID_TARGET:
break;
@@ -357,6 +422,13 @@ int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu)
return 1;
}
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu))
+ return APICV_INHIBIT_REASON_NESTED;
+ return 0;
+}
+
static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat)
{
struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 96bab464967f..bed5e1692cef 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -36,40 +36,45 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
- if (svm->vmcb->control.exit_code != SVM_EXIT_NPF) {
+ if (vmcb->control.exit_code != SVM_EXIT_NPF) {
/*
* TODO: track the cause of the nested page fault, and
* correctly fill in the high bits of exit_info_1.
*/
- svm->vmcb->control.exit_code = SVM_EXIT_NPF;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = (1ULL << 32);
- svm->vmcb->control.exit_info_2 = fault->address;
+ vmcb->control.exit_code = SVM_EXIT_NPF;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = (1ULL << 32);
+ vmcb->control.exit_info_2 = fault->address;
}
- svm->vmcb->control.exit_info_1 &= ~0xffffffffULL;
- svm->vmcb->control.exit_info_1 |= fault->error_code;
+ vmcb->control.exit_info_1 &= ~0xffffffffULL;
+ vmcb->control.exit_info_1 |= fault->error_code;
nested_svm_vmexit(svm);
}
-static void svm_inject_page_fault_nested(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(!is_guest_mode(vcpu));
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb *vmcb = svm->vmcb;
+
+ WARN_ON(!is_guest_mode(vcpu));
if (vmcb12_is_intercept(&svm->nested.ctl,
INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !svm->nested.nested_run_pending) {
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = fault->error_code;
- svm->vmcb->control.exit_info_2 = fault->address;
- nested_svm_vmexit(svm);
- } else {
- kvm_inject_page_fault(vcpu, fault);
- }
+ !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
+ vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_info_1 = fault->error_code;
+ vmcb->control.exit_info_2 = fault->address;
+ nested_svm_vmexit(svm);
+ return true;
+ }
+
+ return false;
}
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
@@ -121,6 +126,20 @@ static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
+static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
+{
+ if (!svm->v_vmload_vmsave_enabled)
+ return true;
+
+ if (!nested_npt_enabled(svm))
+ return true;
+
+ if (!(svm->nested.ctl.virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK))
+ return true;
+
+ return false;
+}
+
void recalc_intercepts(struct vcpu_svm *svm)
{
struct vmcb_control_area *c, *h;
@@ -162,8 +181,17 @@ void recalc_intercepts(struct vcpu_svm *svm)
if (!intercept_smi)
vmcb_clr_intercept(c, INTERCEPT_SMI);
- vmcb_set_intercept(c, INTERCEPT_VMLOAD);
- vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ if (nested_vmcb_needs_vls_intercept(svm)) {
+ /*
+ * If the virtual VMLOAD/VMSAVE is not enabled for the L2,
+ * we must intercept these instructions to correctly
+ * emulate them in case L1 doesn't intercept them.
+ */
+ vmcb_set_intercept(c, INTERCEPT_VMLOAD);
+ vmcb_set_intercept(c, INTERCEPT_VMSAVE);
+ } else {
+ WARN_ON(!(c->virt_ext & VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK));
+ }
}
/*
@@ -413,6 +441,10 @@ void nested_sync_control_from_vmcb02(struct vcpu_svm *svm)
*/
mask &= ~V_IRQ_MASK;
}
+
+ if (nested_vgif_enabled(svm))
+ mask |= V_GIF_MASK;
+
svm->nested.ctl.int_ctl &= ~mask;
svm->nested.ctl.int_ctl |= svm->vmcb->control.int_ctl & mask;
}
@@ -454,11 +486,6 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
vmcb12->control.exit_int_info = exit_int_info;
}
-static inline bool nested_npt_enabled(struct vcpu_svm *svm)
-{
- return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
-}
-
static void nested_svm_transition_tlb_flush(struct kvm_vcpu *vcpu)
{
/*
@@ -515,6 +542,8 @@ void nested_vmcb02_compute_g_pat(struct vcpu_svm *svm)
static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12)
{
bool new_vmcb12 = false;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
nested_vmcb02_compute_g_pat(svm);
@@ -526,18 +555,18 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_SEG))) {
- svm->vmcb->save.es = vmcb12->save.es;
- svm->vmcb->save.cs = vmcb12->save.cs;
- svm->vmcb->save.ss = vmcb12->save.ss;
- svm->vmcb->save.ds = vmcb12->save.ds;
- svm->vmcb->save.cpl = vmcb12->save.cpl;
- vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
+ vmcb02->save.es = vmcb12->save.es;
+ vmcb02->save.cs = vmcb12->save.cs;
+ vmcb02->save.ss = vmcb12->save.ss;
+ vmcb02->save.ds = vmcb12->save.ds;
+ vmcb02->save.cpl = vmcb12->save.cpl;
+ vmcb_mark_dirty(vmcb02, VMCB_SEG);
}
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DT))) {
- svm->vmcb->save.gdtr = vmcb12->save.gdtr;
- svm->vmcb->save.idtr = vmcb12->save.idtr;
- vmcb_mark_dirty(svm->vmcb, VMCB_DT);
+ vmcb02->save.gdtr = vmcb12->save.gdtr;
+ vmcb02->save.idtr = vmcb12->save.idtr;
+ vmcb_mark_dirty(vmcb02, VMCB_DT);
}
kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags | X86_EFLAGS_FIXED);
@@ -554,47 +583,59 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
kvm_rip_write(&svm->vcpu, vmcb12->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
- svm->vmcb->save.rax = vmcb12->save.rax;
- svm->vmcb->save.rsp = vmcb12->save.rsp;
- svm->vmcb->save.rip = vmcb12->save.rip;
+ vmcb02->save.rax = vmcb12->save.rax;
+ vmcb02->save.rsp = vmcb12->save.rsp;
+ vmcb02->save.rip = vmcb12->save.rip;
/* These bits will be set properly on the first execution when new_vmc12 is true */
if (unlikely(new_vmcb12 || vmcb_is_dirty(vmcb12, VMCB_DR))) {
- svm->vmcb->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
+ vmcb02->save.dr7 = svm->nested.save.dr7 | DR7_FIXED_1;
svm->vcpu.arch.dr6 = svm->nested.save.dr6 | DR6_ACTIVE_LOW;
- vmcb_mark_dirty(svm->vmcb, VMCB_DR);
+ vmcb_mark_dirty(vmcb02, VMCB_DR);
+ }
+
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ /*
+ * Reserved bits of DEBUGCTL are ignored. Be consistent with
+ * svm_set_msr's definition of reserved bits.
+ */
+ svm_copy_lbrs(vmcb02, vmcb12);
+ vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
+ svm_update_lbrv(&svm->vcpu);
+
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb02, vmcb01);
}
}
static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
{
- const u32 int_ctl_vmcb01_bits =
- V_INTR_MASKING_MASK | V_GIF_MASK | V_GIF_ENABLE_MASK;
-
- const u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
+ u32 int_ctl_vmcb01_bits = V_INTR_MASKING_MASK;
+ u32 int_ctl_vmcb12_bits = V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK;
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
/*
* Filled at exit: exit_code, exit_code_hi, exit_info_1, exit_info_2,
* exit_int_info, exit_int_info_err, next_rip, insn_len, insn_bytes.
*/
- /*
- * Also covers avic_vapic_bar, avic_backing_page, avic_logical_id,
- * avic_physical_id.
- */
- WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+ if (svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK))
+ int_ctl_vmcb12_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
+ else
+ int_ctl_vmcb01_bits |= (V_GIF_MASK | V_GIF_ENABLE_MASK);
/* Copied from vmcb01. msrpm_base can be overwritten later. */
- svm->vmcb->control.nested_ctl = svm->vmcb01.ptr->control.nested_ctl;
- svm->vmcb->control.iopm_base_pa = svm->vmcb01.ptr->control.iopm_base_pa;
- svm->vmcb->control.msrpm_base_pa = svm->vmcb01.ptr->control.msrpm_base_pa;
+ vmcb02->control.nested_ctl = vmcb01->control.nested_ctl;
+ vmcb02->control.iopm_base_pa = vmcb01->control.iopm_base_pa;
+ vmcb02->control.msrpm_base_pa = vmcb01->control.msrpm_base_pa;
/* Done at vmrun: asid. */
/* Also overwritten later if necessary. */
- svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
+ vmcb02->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
/* nested_cr3. */
if (nested_npt_enabled(svm))
@@ -605,21 +646,53 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm)
svm->nested.ctl.tsc_offset,
svm->tsc_ratio_msr);
- svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
+ vmcb02->control.tsc_offset = vcpu->arch.tsc_offset;
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
WARN_ON(!svm->tsc_scaling_enabled);
nested_svm_update_tsc_ratio_msr(vcpu);
}
- svm->vmcb->control.int_ctl =
+ vmcb02->control.int_ctl =
(svm->nested.ctl.int_ctl & int_ctl_vmcb12_bits) |
- (svm->vmcb01.ptr->control.int_ctl & int_ctl_vmcb01_bits);
-
- svm->vmcb->control.int_vector = svm->nested.ctl.int_vector;
- svm->vmcb->control.int_state = svm->nested.ctl.int_state;
- svm->vmcb->control.event_inj = svm->nested.ctl.event_inj;
- svm->vmcb->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ (vmcb01->control.int_ctl & int_ctl_vmcb01_bits);
+
+ vmcb02->control.int_vector = svm->nested.ctl.int_vector;
+ vmcb02->control.int_state = svm->nested.ctl.int_state;
+ vmcb02->control.event_inj = svm->nested.ctl.event_inj;
+ vmcb02->control.event_inj_err = svm->nested.ctl.event_inj_err;
+
+ vmcb02->control.virt_ext = vmcb01->control.virt_ext &
+ LBR_CTL_ENABLE_MASK;
+ if (svm->lbrv_enabled)
+ vmcb02->control.virt_ext |=
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+
+ if (!nested_vmcb_needs_vls_intercept(svm))
+ vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
+
+ if (kvm_pause_in_guest(svm->vcpu.kvm)) {
+ /* use guest values since host doesn't use them */
+ vmcb02->control.pause_filter_count =
+ svm->pause_filter_enabled ?
+ svm->nested.ctl.pause_filter_count : 0;
+
+ vmcb02->control.pause_filter_thresh =
+ svm->pause_threshold_enabled ?
+ svm->nested.ctl.pause_filter_thresh : 0;
+
+ } else if (!vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_PAUSE)) {
+ /* use host values when guest doesn't use them */
+ vmcb02->control.pause_filter_count = vmcb01->control.pause_filter_count;
+ vmcb02->control.pause_filter_thresh = vmcb01->control.pause_filter_thresh;
+ } else {
+ /*
+ * Intercept every PAUSE otherwise and
+ * ignore both host and guest values
+ */
+ vmcb02->control.pause_filter_count = 0;
+ vmcb02->control.pause_filter_thresh = 0;
+ }
nested_svm_transition_tlb_flush(vcpu);
@@ -680,14 +753,14 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
if (ret)
return ret;
- if (!npt_enabled)
- vcpu->arch.mmu->inject_page_fault = svm_inject_page_fault_nested;
-
if (!from_vmrun)
kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
svm_set_gif(svm, true);
+ if (kvm_vcpu_apicv_active(vcpu))
+ kvm_make_request(KVM_REQ_APICV_UPDATE, vcpu);
+
return 0;
}
@@ -698,6 +771,7 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
struct vmcb *vmcb12;
struct kvm_host_map map;
u64 vmcb12_gpa;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
if (!svm->nested.hsave_msr) {
kvm_inject_gp(vcpu, 0);
@@ -741,14 +815,14 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu)
* Since vmcb01 is not in use, we can use it to store some of the L1
* state.
*/
- svm->vmcb01.ptr->save.efer = vcpu->arch.efer;
- svm->vmcb01.ptr->save.cr0 = kvm_read_cr0(vcpu);
- svm->vmcb01.ptr->save.cr4 = vcpu->arch.cr4;
- svm->vmcb01.ptr->save.rflags = kvm_get_rflags(vcpu);
- svm->vmcb01.ptr->save.rip = kvm_rip_read(vcpu);
+ vmcb01->save.efer = vcpu->arch.efer;
+ vmcb01->save.cr0 = kvm_read_cr0(vcpu);
+ vmcb01->save.cr4 = vcpu->arch.cr4;
+ vmcb01->save.rflags = kvm_get_rflags(vcpu);
+ vmcb01->save.rip = kvm_rip_read(vcpu);
if (!npt_enabled)
- svm->vmcb01.ptr->save.cr3 = kvm_read_cr3(vcpu);
+ vmcb01->save.cr3 = kvm_read_cr3(vcpu);
svm->nested.nested_run_pending = 1;
@@ -814,14 +888,12 @@ void svm_copy_vmloadsave_state(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
int nested_svm_vmexit(struct vcpu_svm *svm)
{
struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vmcb *vmcb01 = svm->vmcb01.ptr;
+ struct vmcb *vmcb02 = svm->nested.vmcb02.ptr;
struct vmcb *vmcb12;
- struct vmcb *vmcb = svm->vmcb;
struct kvm_host_map map;
int rc;
- /* Triple faults in L2 should never escape. */
- WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
rc = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.vmcb12_gpa), &map);
if (rc) {
if (rc == -EINVAL)
@@ -843,57 +915,68 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/* Give the current vmcb to the guest */
- vmcb12->save.es = vmcb->save.es;
- vmcb12->save.cs = vmcb->save.cs;
- vmcb12->save.ss = vmcb->save.ss;
- vmcb12->save.ds = vmcb->save.ds;
- vmcb12->save.gdtr = vmcb->save.gdtr;
- vmcb12->save.idtr = vmcb->save.idtr;
+ vmcb12->save.es = vmcb02->save.es;
+ vmcb12->save.cs = vmcb02->save.cs;
+ vmcb12->save.ss = vmcb02->save.ss;
+ vmcb12->save.ds = vmcb02->save.ds;
+ vmcb12->save.gdtr = vmcb02->save.gdtr;
+ vmcb12->save.idtr = vmcb02->save.idtr;
vmcb12->save.efer = svm->vcpu.arch.efer;
vmcb12->save.cr0 = kvm_read_cr0(vcpu);
vmcb12->save.cr3 = kvm_read_cr3(vcpu);
- vmcb12->save.cr2 = vmcb->save.cr2;
+ vmcb12->save.cr2 = vmcb02->save.cr2;
vmcb12->save.cr4 = svm->vcpu.arch.cr4;
vmcb12->save.rflags = kvm_get_rflags(vcpu);
vmcb12->save.rip = kvm_rip_read(vcpu);
vmcb12->save.rsp = kvm_rsp_read(vcpu);
vmcb12->save.rax = kvm_rax_read(vcpu);
- vmcb12->save.dr7 = vmcb->save.dr7;
+ vmcb12->save.dr7 = vmcb02->save.dr7;
vmcb12->save.dr6 = svm->vcpu.arch.dr6;
- vmcb12->save.cpl = vmcb->save.cpl;
+ vmcb12->save.cpl = vmcb02->save.cpl;
- vmcb12->control.int_state = vmcb->control.int_state;
- vmcb12->control.exit_code = vmcb->control.exit_code;
- vmcb12->control.exit_code_hi = vmcb->control.exit_code_hi;
- vmcb12->control.exit_info_1 = vmcb->control.exit_info_1;
- vmcb12->control.exit_info_2 = vmcb->control.exit_info_2;
+ vmcb12->control.int_state = vmcb02->control.int_state;
+ vmcb12->control.exit_code = vmcb02->control.exit_code;
+ vmcb12->control.exit_code_hi = vmcb02->control.exit_code_hi;
+ vmcb12->control.exit_info_1 = vmcb02->control.exit_info_1;
+ vmcb12->control.exit_info_2 = vmcb02->control.exit_info_2;
if (vmcb12->control.exit_code != SVM_EXIT_ERR)
nested_save_pending_event_to_vmcb12(svm, vmcb12);
if (svm->nrips_enabled)
- vmcb12->control.next_rip = vmcb->control.next_rip;
+ vmcb12->control.next_rip = vmcb02->control.next_rip;
vmcb12->control.int_ctl = svm->nested.ctl.int_ctl;
vmcb12->control.tlb_ctl = svm->nested.ctl.tlb_ctl;
vmcb12->control.event_inj = svm->nested.ctl.event_inj;
vmcb12->control.event_inj_err = svm->nested.ctl.event_inj_err;
+ if (!kvm_pause_in_guest(vcpu->kvm) && vmcb02->control.pause_filter_count)
+ vmcb01->control.pause_filter_count = vmcb02->control.pause_filter_count;
+
nested_svm_copy_common_state(svm->nested.vmcb02.ptr, svm->vmcb01.ptr);
svm_switch_vmcb(svm, &svm->vmcb01);
+ if (unlikely(svm->lbrv_enabled && (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ svm_copy_lbrs(vmcb12, vmcb02);
+ svm_update_lbrv(vcpu);
+ } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ svm_copy_lbrs(vmcb01, vmcb02);
+ svm_update_lbrv(vcpu);
+ }
+
/*
* On vmexit the GIF is set to false and
* no event can be injected in L1.
*/
svm_set_gif(svm, false);
- svm->vmcb->control.exit_int_info = 0;
+ vmcb01->control.exit_int_info = 0;
svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset;
- if (svm->vmcb->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
- svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset;
- vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+ if (vmcb01->control.tsc_offset != svm->vcpu.arch.tsc_offset) {
+ vmcb01->control.tsc_offset = svm->vcpu.arch.tsc_offset;
+ vmcb_mark_dirty(vmcb01, VMCB_INTERCEPTS);
}
if (svm->tsc_ratio_msr != kvm_default_tsc_scaling_ratio) {
@@ -907,13 +990,13 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
/*
* Restore processor state that had been saved in vmcb01
*/
- kvm_set_rflags(vcpu, svm->vmcb->save.rflags);
- svm_set_efer(vcpu, svm->vmcb->save.efer);
- svm_set_cr0(vcpu, svm->vmcb->save.cr0 | X86_CR0_PE);
- svm_set_cr4(vcpu, svm->vmcb->save.cr4);
- kvm_rax_write(vcpu, svm->vmcb->save.rax);
- kvm_rsp_write(vcpu, svm->vmcb->save.rsp);
- kvm_rip_write(vcpu, svm->vmcb->save.rip);
+ kvm_set_rflags(vcpu, vmcb01->save.rflags);
+ svm_set_efer(vcpu, vmcb01->save.efer);
+ svm_set_cr0(vcpu, vmcb01->save.cr0 | X86_CR0_PE);
+ svm_set_cr4(vcpu, vmcb01->save.cr4);
+ kvm_rax_write(vcpu, vmcb01->save.rax);
+ kvm_rsp_write(vcpu, vmcb01->save.rsp);
+ kvm_rip_write(vcpu, vmcb01->save.rip);
svm->vcpu.arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(&svm->vcpu);
@@ -931,7 +1014,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
nested_svm_uninit_mmu_context(vcpu);
- rc = nested_svm_load_cr3(vcpu, svm->vmcb->save.cr3, false, true);
+ rc = nested_svm_load_cr3(vcpu, vmcb01->save.cr3, false, true);
if (rc)
return 1;
@@ -949,9 +1032,16 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
* right now so that it an be accounted for before we execute
* L1's next instruction.
*/
- if (unlikely(svm->vmcb->save.rflags & X86_EFLAGS_TF))
+ if (unlikely(vmcb01->save.rflags & X86_EFLAGS_TF))
kvm_queue_exception(&(svm->vcpu), DB_VECTOR);
+ /*
+ * Un-inhibit the AVIC right away, so that other vCPUs can start
+ * to benefit from it right away.
+ */
+ if (kvm_apicv_activated(vcpu->kvm))
+ kvm_vcpu_update_apicv(vcpu);
+
return 0;
}
@@ -1162,12 +1252,13 @@ static bool nested_exit_on_exception(struct vcpu_svm *svm)
static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
{
unsigned int nr = svm->vcpu.arch.exception.nr;
+ struct vmcb *vmcb = svm->vmcb;
- svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
- svm->vmcb->control.exit_code_hi = 0;
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ vmcb->control.exit_code_hi = 0;
if (svm->vcpu.arch.exception.has_error_code)
- svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+ vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
/*
* EXITINFO2 is undefined for all exception intercepts other
@@ -1175,11 +1266,11 @@ static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
*/
if (nr == PF_VECTOR) {
if (svm->vcpu.arch.exception.nested_apf)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
else if (svm->vcpu.arch.exception.has_payload)
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else
- svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
+ vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
} else if (nr == DB_VECTOR) {
/* See inject_pending_event. */
kvm_deliver_exception_payload(&svm->vcpu);
@@ -1567,6 +1658,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
struct kvm_x86_nested_ops svm_nested_ops = {
.leave_nested = svm_leave_nested,
.check_events = svm_check_nested_events,
+ .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
.triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 311cbaa0c3dd..47e8eaca1e90 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -341,7 +341,7 @@ static void amd_pmu_reset(struct kvm_vcpu *vcpu)
}
}
-struct kvm_pmu_ops amd_pmu_ops = {
+struct kvm_pmu_ops amd_pmu_ops __initdata = {
.pmc_perf_hw_id = amd_pmc_perf_hw_id,
.pmc_is_enabled = amd_pmc_is_enabled,
.pmc_idx_to_pmc = amd_pmc_idx_to_pmc,
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 537aaddc852f..b67ce873d5d2 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2738,8 +2738,12 @@ static int sev_handle_vmgexit_msr_protocol(struct vcpu_svm *svm)
pr_info("SEV-ES guest requested termination: %#llx:%#llx\n",
reason_set, reason_code);
- ret = -EINVAL;
- break;
+ vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+ vcpu->run->system_event.type = KVM_SYSTEM_EVENT_SEV_TERM;
+ vcpu->run->system_event.ndata = 1;
+ vcpu->run->system_event.data[0] = control->ghcb_gpa;
+
+ return 0;
}
default:
/* Error, keep GHCB MSR value as-is */
@@ -2922,6 +2926,14 @@ void sev_es_init_vmcb(struct vcpu_svm *svm)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) &&
+ (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP) ||
+ guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDPID))) {
+ set_msr_interception(vcpu, svm->msrpm, MSR_TSC_AUX, 1, 1);
+ if (guest_cpuid_has(&svm->vcpu, X86_FEATURE_RDTSCP))
+ svm_clr_intercept(svm, INTERCEPT_RDTSCP);
+ }
}
void sev_es_vcpu_reset(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index bd4c64b362d2..75b4f3ac8b1a 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -62,8 +62,6 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
-
static bool erratum_383_found __read_mostly;
u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
@@ -101,6 +99,7 @@ static const struct svm_direct_access_msrs {
{ .index = MSR_EFER, .always = false },
{ .index = MSR_IA32_CR_PAT, .always = false },
{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
+ { .index = MSR_TSC_AUX, .always = false },
{ .index = MSR_INVALID, .always = false },
};
@@ -172,7 +171,7 @@ static int vls = true;
module_param(vls, int, 0444);
/* enable/disable Virtual GIF */
-static int vgif = true;
+int vgif = true;
module_param(vgif, int, 0444);
/* enable/disable LBR virtualization */
@@ -189,6 +188,9 @@ module_param(tsc_scaling, int, 0444);
static bool avic;
module_param(avic, bool, 0444);
+static bool force_avic;
+module_param_unsafe(force_avic, bool, 0444);
+
bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);
@@ -790,6 +792,17 @@ static void init_msrpm_offsets(void)
}
}
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
+{
+ to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
+ to_vmcb->save.br_from = from_vmcb->save.br_from;
+ to_vmcb->save.br_to = from_vmcb->save.br_to;
+ to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
+ to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;
+
+ vmcb_mark_dirty(to_vmcb, VMCB_LBR);
+}
+
static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -799,6 +812,10 @@ static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);
+
+ /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}
static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
@@ -810,6 +827,67 @@ static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);
+
+ /*
+ * Move the LBR msrs back to the vmcb01 to avoid copying them
+ * on nested guest entries.
+ */
+ if (is_guest_mode(vcpu))
+ svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
+}
+
+static int svm_get_lbr_msr(struct vcpu_svm *svm, u32 index)
+{
+ /*
+ * If the LBR virtualization is disabled, the LBR msrs are always
+ * kept in the vmcb01 to avoid copying them on nested guest entries.
+ *
+ * If nested, and the LBR virtualization is enabled/disabled, the msrs
+ * are moved between the vmcb01 and vmcb02 as needed.
+ */
+ struct vmcb *vmcb =
+ (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) ?
+ svm->vmcb : svm->vmcb01.ptr;
+
+ switch (index) {
+ case MSR_IA32_DEBUGCTLMSR:
+ return vmcb->save.dbgctl;
+ case MSR_IA32_LASTBRANCHFROMIP:
+ return vmcb->save.br_from;
+ case MSR_IA32_LASTBRANCHTOIP:
+ return vmcb->save.br_to;
+ case MSR_IA32_LASTINTFROMIP:
+ return vmcb->save.last_excp_from;
+ case MSR_IA32_LASTINTTOIP:
+ return vmcb->save.last_excp_to;
+ default:
+ KVM_BUG(false, svm->vcpu.kvm,
+ "%s: Unknown MSR 0x%x", __func__, index);
+ return 0;
+ }
+}
+
+void svm_update_lbrv(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ bool enable_lbrv = svm_get_lbr_msr(svm, MSR_IA32_DEBUGCTLMSR) &
+ DEBUGCTLMSR_LBR;
+
+ bool current_enable_lbrv = !!(svm->vmcb->control.virt_ext &
+ LBR_CTL_ENABLE_MASK);
+
+ if (unlikely(is_guest_mode(vcpu) && svm->lbrv_enabled))
+ if (unlikely(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))
+ enable_lbrv = true;
+
+ if (enable_lbrv == current_enable_lbrv)
+ return;
+
+ if (enable_lbrv)
+ svm_enable_lbrv(vcpu);
+ else
+ svm_disable_lbrv(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -831,6 +909,9 @@ static void grow_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ return;
+
control->pause_filter_count = __grow_ple_window(old,
pause_filter_count,
pause_filter_count_grow,
@@ -849,6 +930,9 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
struct vmcb_control_area *control = &svm->vmcb->control;
int old = control->pause_filter_count;
+ if (kvm_pause_in_guest(vcpu->kvm) || !old)
+ return;
+
control->pause_filter_count =
__shrink_ple_window(old,
pause_filter_count,
@@ -960,6 +1044,8 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
+
+ svm->v_vmload_vmsave_enabled = false;
} else {
/*
* If hardware supports Virtual VMLOAD VMSAVE then enable it
@@ -979,8 +1065,9 @@ static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
static void init_vmcb(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb_control_area *control = &svm->vmcb->control;
- struct vmcb_save_area *save = &svm->vmcb->save;
+ struct vmcb *vmcb = svm->vmcb01.ptr;
+ struct vmcb_control_area *control = &vmcb->control;
+ struct vmcb_save_area *save = &vmcb->save;
svm_set_intercept(svm, INTERCEPT_CR0_READ);
svm_set_intercept(svm, INTERCEPT_CR3_READ);
@@ -1104,7 +1191,7 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
if (kvm_vcpu_apicv_active(vcpu))
- avic_init_vmcb(svm);
+ avic_init_vmcb(svm, vmcb);
if (vgif) {
svm_clr_intercept(svm, INTERCEPT_STGI);
@@ -1122,10 +1209,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
}
}
- svm_hv_init_vmcb(svm->vmcb);
+ svm_hv_init_vmcb(vmcb);
init_vmcb_after_set_cpuid(vcpu);
- vmcb_mark_all_dirty(svm->vmcb);
+ vmcb_mark_all_dirty(vmcb);
enable_gif(svm);
}
@@ -1380,7 +1467,7 @@ static void svm_set_vintr(struct vcpu_svm *svm)
/*
* The following fields are ignored when AVIC is enabled
*/
- WARN_ON(kvm_apicv_activated(svm->vcpu.kvm));
+ WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu));
svm_set_intercept(svm, INTERCEPT_VINTR);
@@ -2142,7 +2229,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* Likewise, clear the VINTR intercept, we will set it
* again while processing KVM_REQ_EVENT if needed.
*/
- if (vgif_enabled(svm))
+ if (vgif)
svm_clr_intercept(svm, INTERCEPT_STGI);
if (svm_is_intercept(svm, INTERCEPT_VINTR))
svm_clear_vintr(svm);
@@ -2160,7 +2247,7 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
* in use, we still rely on the VINTR intercept (rather than
* STGI) to detect an open interrupt window.
*/
- if (!vgif_enabled(svm))
+ if (!vgif)
svm_clear_vintr(svm);
}
}
@@ -2575,25 +2662,12 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_TSC_AUX:
msr_info->data = svm->tsc_aux;
break;
- /*
- * Nobody will change the following 5 values in the VMCB so we can
- * safely return them on rdmsr. They will always be 0 until LBRV is
- * implemented.
- */
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm->vmcb->save.dbgctl;
- break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm->vmcb->save.br_from;
- break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm->vmcb->save.br_to;
- break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm->vmcb->save.last_excp_from;
- break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm->vmcb->save.last_excp_to;
+ msr_info->data = svm_get_lbr_msr(svm, msr_info->index);
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -2839,12 +2913,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm->vmcb->save.dbgctl = data;
- vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
- if (data & (1ULL<<0))
- svm_enable_lbrv(vcpu);
+ if (svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK)
+ svm->vmcb->save.dbgctl = data;
else
- svm_disable_lbrv(vcpu);
+ svm->vmcb01.ptr->save.dbgctl = data;
+
+ svm_update_lbrv(vcpu);
+
break;
case MSR_VM_HSAVE_PA:
/*
@@ -2901,9 +2976,16 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
svm_clear_vintr(to_svm(vcpu));
/*
- * For AVIC, the only reason to end up here is ExtINTs.
+ * If not running nested, for AVIC, the only reason to end up here is ExtINTs.
* In this case AVIC was temporarily disabled for
* requesting the IRQ window and we have to re-enable it.
+ *
+ * If running nested, still remove the VM wide AVIC inhibit to
+ * support case in which the interrupt window was requested when the
+ * vCPU was not running nested.
+
+ * All vCPUs which run still run nested, will remain to have their
+ * AVIC still inhibited due to per-cpu AVIC inhibition.
*/
kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
@@ -2914,7 +2996,6 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu)
static int pause_interception(struct kvm_vcpu *vcpu)
{
bool in_kernel;
-
/*
* CPL is not made available for an SEV-ES guest, therefore
* vcpu->arch.preempted_in_kernel can never be true. Just
@@ -2922,8 +3003,7 @@ static int pause_interception(struct kvm_vcpu *vcpu)
*/
in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
- if (!kvm_pause_in_guest(vcpu->kvm))
- grow_ple_window(vcpu);
+ grow_ple_window(vcpu);
kvm_vcpu_on_spin(vcpu, in_kernel);
return kvm_skip_emulated_instruction(vcpu);
@@ -3496,14 +3576,20 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu)
* enabled, the STGI interception will not occur. Enable the irq
* window under the assumption that the hardware will set the GIF.
*/
- if (vgif_enabled(svm) || gif_set(svm)) {
+ if (vgif || gif_set(svm)) {
/*
* IRQ window is not needed when AVIC is enabled,
* unless we have pending ExtINT since it cannot be injected
- * via AVIC. In such case, we need to temporarily disable AVIC,
+ * via AVIC. In such case, KVM needs to temporarily disable AVIC,
* and fallback to injecting IRQ via V_IRQ.
+ *
+ * If running nested, AVIC is already locally inhibited
+ * on this vCPU, therefore there is no need to request
+ * the VM wide AVIC inhibition.
*/
- kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+ if (!is_guest_mode(vcpu))
+ kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
+
svm_set_vintr(svm);
}
}
@@ -3516,7 +3602,7 @@ static void svm_enable_nmi_window(struct kvm_vcpu *vcpu)
return; /* IRET will cause a vm exit */
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
return; /* STGI will cause a vm exit */
}
@@ -3865,7 +3951,7 @@ static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
hv_track_root_tdp(vcpu, root_hpa);
cr3 = vcpu->arch.cr3;
- } else if (vcpu->arch.mmu->shadow_root_level >= PT64_ROOT_4LEVEL) {
+ } else if (vcpu->arch.mmu->root_role.level >= PT64_ROOT_4LEVEL) {
cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu);
} else {
/* PCID in the guest should be impossible with a 32-bit MMU. */
@@ -3946,6 +4032,17 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
guest_cpuid_has(vcpu, X86_FEATURE_NRIPS);
svm->tsc_scaling_enabled = tsc_scaling && guest_cpuid_has(vcpu, X86_FEATURE_TSCRATEMSR);
+ svm->lbrv_enabled = lbrv && guest_cpuid_has(vcpu, X86_FEATURE_LBRV);
+
+ svm->v_vmload_vmsave_enabled = vls && guest_cpuid_has(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD);
+
+ svm->pause_filter_enabled = kvm_cpu_cap_has(X86_FEATURE_PAUSEFILTER) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PAUSEFILTER);
+
+ svm->pause_threshold_enabled = kvm_cpu_cap_has(X86_FEATURE_PFTHRESHOLD) &&
+ guest_cpuid_has(vcpu, X86_FEATURE_PFTHRESHOLD);
+
+ svm->vgif_enabled = vgif && guest_cpuid_has(vcpu, X86_FEATURE_VGIF);
svm_recalc_instruction_intercepts(vcpu, svm);
@@ -3963,13 +4060,6 @@ static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
*/
if (guest_cpuid_has(vcpu, X86_FEATURE_X2APIC))
kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_X2APIC);
-
- /*
- * Currently, AVIC does not work with nested virtualization.
- * So, we disable AVIC when cpuid for SVM is set in the L1 guest.
- */
- if (nested && guest_cpuid_has(vcpu, X86_FEATURE_SVM))
- kvm_set_apicv_inhibit(kvm, APICV_INHIBIT_REASON_NESTED);
}
init_vmcb_after_set_cpuid(vcpu);
}
@@ -4224,7 +4314,7 @@ static int svm_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
- ret = nested_svm_vmexit(svm);
+ ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW);
if (ret)
return ret;
@@ -4321,7 +4411,7 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
struct vcpu_svm *svm = to_svm(vcpu);
if (!gif_set(svm)) {
- if (vgif_enabled(svm))
+ if (vgif)
svm_set_intercept(svm, INTERCEPT_STGI);
/* STGI will cause a vm exit */
} else {
@@ -4605,7 +4695,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.sched_in = svm_sched_in,
- .pmu_ops = &amd_pmu_ops,
.nested_ops = &svm_nested_ops,
.deliver_interrupt = svm_deliver_interrupt,
@@ -4632,6 +4721,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.complete_emulated_msr = svm_complete_emulated_msr,
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
+ .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons,
};
/*
@@ -4695,6 +4785,20 @@ static __init void svm_set_cpu_caps(void)
if (tsc_scaling)
kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+ if (vls)
+ kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD);
+ if (lbrv)
+ kvm_cpu_cap_set(X86_FEATURE_LBRV);
+
+ if (boot_cpu_has(X86_FEATURE_PAUSEFILTER))
+ kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER);
+
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD);
+
+ if (vgif)
+ kvm_cpu_cap_set(X86_FEATURE_VGIF);
+
/* Nested VM can receive #VMEXIT instead of triggering #GP */
kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
}
@@ -4806,15 +4910,20 @@ static __init int svm_hardware_setup(void)
nrips = false;
}
- enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+ enable_apicv = avic = avic && npt_enabled && (boot_cpu_has(X86_FEATURE_AVIC) || force_avic);
if (enable_apicv) {
- pr_info("AVIC enabled\n");
+ if (!boot_cpu_has(X86_FEATURE_AVIC)) {
+ pr_warn("AVIC is not supported in CPUID but force enabled");
+ pr_warn("Your system might crash and burn");
+ } else
+ pr_info("AVIC enabled\n");
amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
} else {
svm_x86_ops.vcpu_blocking = NULL;
svm_x86_ops.vcpu_unblocking = NULL;
+ svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL;
}
if (vls) {
@@ -4879,6 +4988,7 @@ static struct kvm_x86_init_ops svm_init_ops __initdata = {
.check_processor_compatibility = svm_check_processor_compat,
.runtime_ops = &svm_x86_ops,
+ .pmu_ops = &amd_pmu_ops,
};
static int __init svm_init(void)
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f77a7d2d39dd..32220a1b0ea2 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -29,10 +29,11 @@
#define IOPM_SIZE PAGE_SIZE * 3
#define MSRPM_SIZE PAGE_SIZE * 2
-#define MAX_DIRECT_ACCESS_MSRS 20
+#define MAX_DIRECT_ACCESS_MSRS 21
#define MSRPM_OFFSETS 16
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
+extern int vgif;
extern bool intercept_smi;
/*
@@ -231,9 +232,14 @@ struct vcpu_svm {
unsigned int3_injected;
unsigned long int3_rip;
- /* cached guest cpuid flags for faster access */
+ /* optional nested SVM features that are enabled for this guest */
bool nrips_enabled : 1;
bool tsc_scaling_enabled : 1;
+ bool v_vmload_vmsave_enabled : 1;
+ bool lbrv_enabled : 1;
+ bool pause_filter_enabled : 1;
+ bool pause_threshold_enabled : 1;
+ bool vgif_enabled : 1;
u32 ldr_reg;
u32 dfr_reg;
@@ -452,44 +458,70 @@ static inline bool svm_is_intercept(struct vcpu_svm *svm, int bit)
return vmcb_is_intercept(&svm->vmcb->control, bit);
}
-static inline bool vgif_enabled(struct vcpu_svm *svm)
+static inline bool nested_vgif_enabled(struct vcpu_svm *svm)
{
- return !!(svm->vmcb->control.int_ctl & V_GIF_ENABLE_MASK);
+ return svm->vgif_enabled && (svm->nested.ctl.int_ctl & V_GIF_ENABLE_MASK);
+}
+
+static inline struct vmcb *get_vgif_vmcb(struct vcpu_svm *svm)
+{
+ if (!vgif)
+ return NULL;
+
+ if (is_guest_mode(&svm->vcpu) && !nested_vgif_enabled(svm))
+ return svm->nested.vmcb02.ptr;
+ else
+ return svm->vmcb01.ptr;
}
static inline void enable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl |= V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl |= V_GIF_MASK;
else
svm->vcpu.arch.hflags |= HF_GIF_MASK;
}
static inline void disable_gif(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- svm->vmcb->control.int_ctl &= ~V_GIF_MASK;
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ vmcb->control.int_ctl &= ~V_GIF_MASK;
else
svm->vcpu.arch.hflags &= ~HF_GIF_MASK;
}
static inline bool gif_set(struct vcpu_svm *svm)
{
- if (vgif_enabled(svm))
- return !!(svm->vmcb->control.int_ctl & V_GIF_MASK);
+ struct vmcb *vmcb = get_vgif_vmcb(svm);
+
+ if (vmcb)
+ return !!(vmcb->control.int_ctl & V_GIF_MASK);
else
return !!(svm->vcpu.arch.hflags & HF_GIF_MASK);
}
+static inline bool nested_npt_enabled(struct vcpu_svm *svm)
+{
+ return svm->nested.ctl.nested_ctl & SVM_NESTED_CTL_NP_ENABLE;
+}
+
/* svm.c */
#define MSR_INVALID 0xffffffffU
+#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
+
extern bool dump_invalid_vmcb;
u32 svm_msrpm_offset(u32 msr);
u32 *svm_vcpu_alloc_msrpm(void);
void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm);
void svm_vcpu_free_msrpm(u32 *msrpm);
+void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb);
+void svm_update_lbrv(struct kvm_vcpu *vcpu);
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -574,7 +606,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
-void avic_init_vmcb(struct vcpu_svm *svm);
+void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu);
int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu);
int avic_init_vcpu(struct vcpu_svm *svm);
@@ -592,6 +624,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
void avic_ring_doorbell(struct kvm_vcpu *vcpu);
+unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index e3a24b8f04be..de4762517569 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1459,6 +1459,26 @@ TRACE_EVENT(kvm_avic_ga_log,
__entry->vmid, __entry->vcpuid)
);
+TRACE_EVENT(kvm_avic_kick_vcpu_slowpath,
+ TP_PROTO(u32 icrh, u32 icrl, u32 index),
+ TP_ARGS(icrh, icrl, index),
+
+ TP_STRUCT__entry(
+ __field(u32, icrh)
+ __field(u32, icrl)
+ __field(u32, index)
+ ),
+
+ TP_fast_assign(
+ __entry->icrh = icrh;
+ __entry->icrl = icrl;
+ __entry->index = index;
+ ),
+
+ TP_printk("icrh:icrl=%#08x:%08x, index=%u",
+ __entry->icrh, __entry->icrl, __entry->index)
+);
+
TRACE_EVENT(kvm_hv_timer_state,
TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
TP_ARGS(vcpu_id, hv_timer_in_use),
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f18744f7ff82..a6688663da4d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -476,24 +476,23 @@ static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit
return 0;
}
-
-static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
+static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
WARN_ON(!is_guest_mode(vcpu));
if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
- !to_vmx(vcpu)->nested.nested_run_pending) {
+ !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
vmcs12->vm_exit_intr_error_code = fault->error_code;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
fault->address);
- } else {
- kvm_inject_page_fault(vcpu, fault);
+ return true;
}
+ return false;
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -2614,9 +2613,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3);
}
- if (!enable_ept)
- vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
-
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
vmcs12->guest_ia32_perf_global_ctrl))) {
@@ -3695,12 +3691,34 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
}
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
{
u32 idt_vectoring;
unsigned int nr;
- if (vcpu->arch.exception.injected) {
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -3733,6 +3751,8 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
idt_vectoring |= INTR_TYPE_EXT_INTR;
vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
}
}
@@ -4202,12 +4222,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
if (to_vmx(vcpu)->exit_reason.enclave_mode)
vmcs12->vm_exit_reason |= VMX_EXIT_REASONS_SGX_ENCLAVE_MODE;
vmcs12->exit_qualification = exit_qualification;
- vmcs12->vm_exit_intr_info = exit_intr_info;
-
- vmcs12->idt_vectoring_info_field = 0;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
@@ -4219,7 +4239,12 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* Transfer the event that L0 or L1 may wanted to inject into
* L2 to IDT_VECTORING_INFO_FIELD.
*/
- vmcs12_save_pending_event(vcpu, vmcs12);
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
* According to spec, there's no need to store the guest's
@@ -4518,9 +4543,6 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
- /* Similarly, triple faults in L2 should never escape. */
- WARN_ON_ONCE(kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu));
-
if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
/*
* KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
@@ -6804,6 +6826,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
+ .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bc3f8512bb64..9db662399487 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -723,7 +723,7 @@ static void intel_pmu_cleanup(struct kvm_vcpu *vcpu)
intel_pmu_release_guest_lbr_event(vcpu);
}
-struct kvm_pmu_ops intel_pmu_ops = {
+struct kvm_pmu_ops intel_pmu_ops __initdata = {
.pmc_perf_hw_id = intel_pmc_perf_hw_id,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 3834bb30ce54..37d0bfff269d 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -311,7 +311,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq,
continue;
}
- vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+ vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
vcpu_info.vector = irq.vector;
trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index e325c290a816..2b9d7a7e83f7 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -104,6 +104,11 @@ static inline bool is_breakpoint(u32 intr_info)
return is_exception_n(intr_info, BP_VECTOR);
}
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
static inline bool is_page_fault(u32 intr_info)
{
return is_exception_n(intr_info, PF_VECTOR);
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04d170c4b61e..cf8581978bce 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -2444,7 +2444,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
&_cpu_based_exec_control) < 0)
return -EIO;
#ifdef CONFIG_X86_64
- if ((_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
+ if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
_cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
~CPU_BASED_CR8_STORE_EXITING;
#endif
@@ -2948,7 +2948,7 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
- mmu->shadow_root_level));
+ mmu->root_role.level));
else
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
@@ -4380,7 +4380,7 @@ static void init_vmcs(struct vcpu_vmx *vmx)
if (cpu_has_secondary_exec_ctrls())
secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx));
- if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
+ if (enable_apicv && lapic_in_kernel(&vmx->vcpu)) {
vmcs_write64(EOI_EXIT_BITMAP0, 0);
vmcs_write64(EOI_EXIT_BITMAP1, 0);
vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -5405,9 +5405,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
error_code |= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
? PFERR_FETCH_MASK : 0;
/* ept page table entry is present? */
- error_code |= (exit_qualification &
- (EPT_VIOLATION_READABLE | EPT_VIOLATION_WRITABLE |
- EPT_VIOLATION_EXECUTABLE))
+ error_code |= (exit_qualification & EPT_VIOLATION_RWX_MASK)
? PFERR_PRESENT_MASK : 0;
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != 0 ?
@@ -7818,7 +7816,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
.pi_update_irte = vmx_pi_update_irte,
@@ -8072,6 +8069,7 @@ static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.handle_intel_pt_intr = NULL,
.runtime_ops = &vmx_x86_ops,
+ .pmu_ops = &intel_pmu_ops,
};
static void vmx_cleanup_l1d_flush(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 07d789b1d366..ba4faeb32437 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -748,6 +748,7 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
+/* Returns true if the page fault was immediately morphed into a VM-Exit. */
bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
@@ -766,8 +767,26 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
fault_mmu->root.hpa);
+ /*
+ * A workaround for KVM's bad exception handling. If KVM injected an
+ * exception into L2, and L2 encountered a #PF while vectoring the
+ * injected exception, manually check to see if L1 wants to intercept
+ * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
+ * In all other cases, defer the check to nested_ops->check_events(),
+ * which will correctly handle priority (this does not). Note, other
+ * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
+ * most problematic, e.g. when L0 and L1 are both intercepting #PF for
+ * shadow paging.
+ *
+ * TODO: Rewrite exception handling to track injected and pending
+ * (VM-Exit) exceptions separately.
+ */
+ if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
+ kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
+ return true;
+
fault_mmu->inject_page_fault(vcpu, fault);
- return fault->nested_page_fault;
+ return false;
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
@@ -961,11 +980,13 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
- vcpu->arch.pkru != vcpu->arch.host_pkru)
+ vcpu->arch.pkru != vcpu->arch.host_pkru &&
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE)))
write_pkru(vcpu->arch.pkru);
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
@@ -974,13 +995,15 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
if (vcpu->arch.guest_state_protected)
return;
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (static_cpu_has(X86_FEATURE_PKU) &&
- (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
- (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
+ kvm_read_cr4_bits(vcpu, X86_CR4_PKE))) {
vcpu->arch.pkru = rdpkru();
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
write_pkru(vcpu->arch.host_pkru);
}
+#endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
@@ -2249,14 +2272,13 @@ static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
- vcpu->arch.pv_time_enabled = false;
- if (!(system_time & 1))
- return;
-
- if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.pv_time, system_time & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = true;
+ if (system_time & 1) {
+ kvm_gfn_to_pfn_cache_init(vcpu->kvm, &vcpu->arch.pv_time, vcpu,
+ KVM_HOST_USES_PFN, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info));
+ } else {
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
+ }
return;
}
@@ -2961,63 +2983,55 @@ u64 get_kvmclock_ns(struct kvm *kvm)
return data.clock;
}
-static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
- struct gfn_to_hva_cache *cache,
- unsigned int offset)
+static void kvm_setup_guest_pvclock(struct kvm_vcpu *v,
+ struct gfn_to_pfn_cache *gpc,
+ unsigned int offset)
{
struct kvm_vcpu_arch *vcpu = &v->arch;
- struct pvclock_vcpu_time_info guest_hv_clock;
+ struct pvclock_vcpu_time_info *guest_hv_clock;
+ unsigned long flags;
- if (unlikely(kvm_read_guest_offset_cached(v->kvm, cache,
- &guest_hv_clock, offset, sizeof(guest_hv_clock))))
- return;
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ offset + sizeof(*guest_hv_clock)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ guest_hv_clock = (void *)(gpc->khva + offset);
- /* This VCPU is paused, but it's legal for a guest to read another
+ /*
+ * This VCPU is paused, but it's legal for a guest to read another
* VCPU's kvmclock, so we really have to follow the specification where
* it says that version is odd if data is being modified, and even after
* it is consistent.
- *
- * Version field updates must be kept separate. This is because
- * kvm_write_guest_cached might use a "rep movs" instruction, and
- * writes within a string instruction are weakly ordered. So there
- * are three writes overall.
- *
- * As a small optimization, only write the version field in the first
- * and third write. The vcpu->pv_time cache is still valid, because the
- * version field is the first in the struct.
*/
- BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
- if (guest_hv_clock.version & 1)
- ++guest_hv_clock.version; /* first time write, random junk */
-
- vcpu->hv_clock.version = guest_hv_clock.version + 1;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1;
smp_wmb();
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
- vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+ vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED);
if (vcpu->pvclock_set_guest_stopped_request) {
vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
vcpu->pvclock_set_guest_stopped_request = false;
}
- trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+ memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock));
+ smp_wmb();
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock));
+ guest_hv_clock->version = ++vcpu->hv_clock.version;
- smp_wmb();
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
- vcpu->hv_clock.version++;
- kvm_write_guest_offset_cached(v->kvm, cache,
- &vcpu->hv_clock, offset,
- sizeof(vcpu->hv_clock.version));
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
}
static int kvm_guest_time_update(struct kvm_vcpu *v)
@@ -3106,13 +3120,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
- if (vcpu->pv_time_enabled)
- kvm_setup_pvclock_page(v, &vcpu->pv_time, 0);
- if (vcpu->xen.vcpu_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_info_cache,
- offsetof(struct compat_vcpu_info, time));
- if (vcpu->xen.vcpu_time_info_set)
- kvm_setup_pvclock_page(v, &vcpu->xen.vcpu_time_info_cache, 0);
+ if (vcpu->pv_time.active)
+ kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0);
+ if (vcpu->xen.vcpu_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
+ offsetof(struct compat_vcpu_info, time));
+ if (vcpu->xen.vcpu_time_info_cache.active)
+ kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0);
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
return 0;
}
@@ -3300,7 +3314,7 @@ static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
- vcpu->arch.pv_time_enabled = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.pv_time);
vcpu->arch.time = 0;
}
@@ -4284,7 +4298,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR |
KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
KVM_XEN_HVM_CONFIG_SHARED_INFO |
- KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL;
+ KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
if (sched_info_on())
r |= KVM_XEN_HVM_CONFIG_RUNSTATE;
break;
@@ -4331,6 +4346,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = boot_cpu_has(X86_FEATURE_XSAVE);
break;
case KVM_CAP_TSC_CONTROL:
+ case KVM_CAP_VM_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
case KVM_CAP_X2APIC_API:
@@ -5102,7 +5118,7 @@ static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
*/
static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.pv_time_enabled)
+ if (!vcpu->arch.pv_time.active)
return -EINVAL;
vcpu->arch.pvclock_set_guest_stopped_request = true;
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
@@ -6186,7 +6202,7 @@ static int kvm_arch_suspend_notifier(struct kvm *kvm)
mutex_lock(&kvm->lock);
kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!vcpu->arch.pv_time_enabled)
+ if (!vcpu->arch.pv_time.active)
continue;
ret = kvm_set_guest_paused(vcpu);
@@ -6513,6 +6529,15 @@ set_pit2_out:
r = kvm_xen_hvm_set_attr(kvm, &xha);
break;
}
+ case KVM_XEN_HVM_EVTCHN_SEND: {
+ struct kvm_irq_routing_xen_evtchn uxe;
+
+ r = -EFAULT;
+ if (copy_from_user(&uxe, argp, sizeof(uxe)))
+ goto out;
+ r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
+ break;
+ }
#endif
case KVM_SET_CLOCK:
r = kvm_vm_ioctl_set_clock(kvm, argp);
@@ -6520,6 +6545,28 @@ set_pit2_out:
case KVM_GET_CLOCK:
r = kvm_vm_ioctl_get_clock(kvm, argp);
break;
+ case KVM_SET_TSC_KHZ: {
+ u32 user_tsc_khz;
+
+ r = -EINVAL;
+ user_tsc_khz = (u32)arg;
+
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
+ goto out;
+
+ if (user_tsc_khz == 0)
+ user_tsc_khz = tsc_khz;
+
+ WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz);
+ r = 0;
+
+ goto out;
+ }
+ case KVM_GET_TSC_KHZ: {
+ r = READ_ONCE(kvm->arch.default_tsc_khz);
+ goto out;
+ }
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
if (!kvm_x86_ops.mem_enc_ioctl)
@@ -7229,15 +7276,8 @@ static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
exception, &write_emultor);
}
-#define CMPXCHG_TYPE(t, ptr, old, new) \
- (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
-
-#ifdef CONFIG_X86_64
-# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
-#else
-# define CMPXCHG64(ptr, old, new) \
- (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
-#endif
+#define emulator_try_cmpxchg_user(t, ptr, old, new) \
+ (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t))
static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
@@ -7246,12 +7286,11 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
unsigned int bytes,
struct x86_exception *exception)
{
- struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u64 page_line_mask;
+ unsigned long hva;
gpa_t gpa;
- char *kaddr;
- bool exchanged;
+ int r;
/* guests cmpxchg8b have to be emulated atomically */
if (bytes > 8 || (bytes & (bytes - 1)))
@@ -7275,31 +7314,32 @@ static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
- if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
+ hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(addr))
goto emul_write;
- kaddr = map.hva + offset_in_page(gpa);
+ hva += offset_in_page(gpa);
switch (bytes) {
case 1:
- exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u8, hva, old, new);
break;
case 2:
- exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u16, hva, old, new);
break;
case 4:
- exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u32, hva, old, new);
break;
case 8:
- exchanged = CMPXCHG64(kaddr, old, new);
+ r = emulator_try_cmpxchg_user(u64, hva, old, new);
break;
default:
BUG();
}
- kvm_vcpu_unmap(vcpu, &map, true);
-
- if (!exchanged)
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
+ if (r)
return X86EMUL_CMPXCHG_FAILED;
kvm_page_track_write(vcpu, gpa, new, bytes);
@@ -8061,7 +8101,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
- if (!vcpu->arch.mmu->direct_map) {
+ if (!vcpu->arch.mmu->root_role.direct) {
/*
* Write permission should be allowed since only
* write access need to be emulated.
@@ -8094,7 +8134,7 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
kvm_release_pfn_clean(pfn);
/* The instructions are well-emulated on direct mmu. */
- if (vcpu->arch.mmu->direct_map) {
+ if (vcpu->arch.mmu->root_role.direct) {
unsigned int indirect_shadow_pages;
write_lock(&vcpu->kvm->mmu_lock);
@@ -8162,7 +8202,7 @@ static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
vcpu->arch.last_retry_eip = ctxt->eip;
vcpu->arch.last_retry_addr = cr2_or_gpa;
- if (!vcpu->arch.mmu->direct_map)
+ if (!vcpu->arch.mmu->root_role.direct)
gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -8442,7 +8482,7 @@ restart:
ctxt->exception.address = cr2_or_gpa;
/* With shadow page tables, cr2 contains a GVA or nGPA. */
- if (vcpu->arch.mmu->direct_map) {
+ if (vcpu->arch.mmu->root_role.direct) {
ctxt->gpa_available = true;
ctxt->gpa_val = cr2_or_gpa;
}
@@ -8789,22 +8829,22 @@ static int kvmclock_cpu_online(unsigned int cpu)
static void kvm_timer_init(void)
{
- max_tsc_khz = tsc_khz;
-
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
-#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy *policy;
- int cpu;
-
- cpu = get_cpu();
- policy = cpufreq_cpu_get(cpu);
- if (policy) {
- if (policy->cpuinfo.max_freq)
- max_tsc_khz = policy->cpuinfo.max_freq;
- cpufreq_cpu_put(policy);
+ max_tsc_khz = tsc_khz;
+
+ if (IS_ENABLED(CONFIG_CPU_FREQ)) {
+ struct cpufreq_policy *policy;
+ int cpu;
+
+ cpu = get_cpu();
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
+ put_cpu();
}
- put_cpu();
-#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
@@ -9089,6 +9129,14 @@ bool kvm_apicv_activated(struct kvm *kvm)
}
EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu)
+{
+ ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons);
+ ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu);
+
+ return (vm_reasons | vcpu_reasons) == 0;
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated);
static void set_or_clear_apicv_inhibit(unsigned long *inhibits,
enum kvm_apicv_inhibit reason, bool set)
@@ -9266,6 +9314,17 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
+ /*
+ * If the quirk is disabled, synthesize a #UD and let the guest pick up
+ * the pieces.
+ */
+ if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
+ ctxt->exception.error_code_valid = false;
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->have_exception = true;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+
static_call(kvm_x86_patch_hypercall)(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
@@ -9763,7 +9822,8 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
down_read(&vcpu->kvm->arch.apicv_update_lock);
- activate = kvm_apicv_activated(vcpu->kvm);
+ activate = kvm_vcpu_apicv_activated(vcpu);
+
if (vcpu->arch.apicv_active == activate)
goto out;
@@ -10166,7 +10226,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
* per-VM state, and responsing vCPUs must wait for the update
* to complete before servicing KVM_REQ_APICV_UPDATE.
*/
- WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu));
+ WARN_ON_ONCE(kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu));
exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu);
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
@@ -10364,6 +10424,9 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
break;
kvm_clear_request(KVM_REQ_UNBLOCK, vcpu);
+ if (kvm_xen_has_pending_events(vcpu))
+ kvm_xen_inject_pending_events(vcpu);
+
if (kvm_cpu_has_pending_timer(vcpu))
kvm_inject_pending_timer_irqs(vcpu);
@@ -11249,9 +11312,10 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
+ kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
vcpu_load(vcpu);
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
kvm_vcpu_reset(vcpu, false);
kvm_init_mmu(vcpu);
vcpu_put(vcpu);
@@ -11306,6 +11370,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
+ kvm_xen_destroy_vcpu(vcpu);
kvm_hv_vcpu_uninit(vcpu);
kvm_pmu_destroy(vcpu);
kfree(vcpu->arch.mce_banks);
@@ -11567,6 +11632,24 @@ void kvm_arch_hardware_disable(void)
drop_user_return_notifiers();
}
+static inline void kvm_ops_update(struct kvm_x86_init_ops *ops)
+{
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+#define __KVM_X86_OP(func) \
+ static_call_update(kvm_x86_##func, kvm_x86_ops.func);
+#define KVM_X86_OP(func) \
+ WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func)
+#define KVM_X86_OP_OPTIONAL __KVM_X86_OP
+#define KVM_X86_OP_OPTIONAL_RET0(func) \
+ static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \
+ (void *)__static_call_return0);
+#include <asm/kvm-x86-ops.h>
+#undef __KVM_X86_OP
+
+ kvm_pmu_ops_update(ops->pmu_ops);
+}
+
int kvm_arch_hardware_setup(void *opaque)
{
struct kvm_x86_init_ops *ops = opaque;
@@ -11581,8 +11664,7 @@ int kvm_arch_hardware_setup(void *opaque)
if (r != 0)
return r;
- memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
- kvm_ops_static_call_update();
+ kvm_ops_update(ops);
kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
@@ -11698,6 +11780,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
pvclock_update_vm_gtod_copy(kvm);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
+ kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz;
kvm->arch.guest_can_read_msr_platform_info = true;
kvm->arch.enable_pmu = enable_pmu;
@@ -11733,20 +11816,15 @@ static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
vcpu_put(vcpu);
}
-static void kvm_free_vcpus(struct kvm *kvm)
+static void kvm_unload_vcpu_mmus(struct kvm *kvm)
{
unsigned long i;
struct kvm_vcpu *vcpu;
- /*
- * Unpin any mmu pages first.
- */
kvm_for_each_vcpu(i, vcpu, kvm) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
}
-
- kvm_destroy_vcpus(kvm);
}
void kvm_arch_sync_events(struct kvm *kvm)
@@ -11852,11 +11930,12 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
__x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
mutex_unlock(&kvm->slots_lock);
}
+ kvm_unload_vcpu_mmus(kvm);
static_call_cond(kvm_x86_vm_destroy)(kvm);
kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
- kvm_free_vcpus(kvm);
+ kvm_destroy_vcpus(kvm);
kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
kvm_mmu_uninit_vm(kvm);
@@ -12179,6 +12258,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
return true;
+ if (kvm_xen_has_pending_events(vcpu))
+ return true;
+
return false;
}
@@ -12280,7 +12362,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
- if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) ||
+ if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) ||
work->wakeup_all)
return;
@@ -12288,7 +12370,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
if (unlikely(r))
return;
- if (!vcpu->arch.mmu->direct_map &&
+ if (!vcpu->arch.mmu->root_role.direct &&
work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
return;
@@ -12986,6 +13068,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index bf6cc25eee76..610beba35907 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -9,17 +9,25 @@
#include "x86.h"
#include "xen.h"
#include "hyperv.h"
+#include "lapic.h"
+#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>
#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
+#include <xen/interface/version.h>
#include <xen/interface/event_channel.h>
+#include <xen/interface/sched.h>
#include "trace.h"
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);
+
DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn)
@@ -102,6 +110,66 @@ out:
return ret;
}
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+ if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
+ struct kvm_xen_evtchn e;
+
+ e.vcpu_id = vcpu->vcpu_id;
+ e.vcpu_idx = vcpu->vcpu_idx;
+ e.port = vcpu->arch.xen.timer_virq;
+ e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+
+ kvm_xen_set_evtchn(&e, vcpu->kvm);
+
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ }
+}
+
+static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
+{
+ struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
+ arch.xen.timer);
+ if (atomic_read(&vcpu->arch.xen.timer_pending))
+ return HRTIMER_NORESTART;
+
+ atomic_inc(&vcpu->arch.xen.timer_pending);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return HRTIMER_NORESTART;
+}
+
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+{
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+ vcpu->arch.xen.timer_expires = guest_abs;
+
+ if (delta_ns <= 0) {
+ xen_timer_callback(&vcpu->arch.xen.timer);
+ } else {
+ ktime_t ktime_now = ktime_get();
+ hrtimer_start(&vcpu->arch.xen.timer,
+ ktime_add_ns(ktime_now, delta_ns),
+ HRTIMER_MODE_ABS_HARD);
+ }
+}
+
+static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_cancel(&vcpu->arch.xen.timer);
+ vcpu->arch.xen.timer_expires = 0;
+ atomic_set(&vcpu->arch.xen.timer_pending, 0);
+}
+
+static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
+{
+ hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
+ HRTIMER_MODE_ABS_HARD);
+ vcpu->arch.xen.timer.function = xen_timer_callback;
+}
+
static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
@@ -133,27 +201,36 @@ static void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
{
struct kvm_vcpu_xen *vx = &v->arch.xen;
- struct gfn_to_hva_cache *ghc = &vx->runstate_cache;
- struct kvm_memslots *slots = kvm_memslots(v->kvm);
- bool atomic = (state == RUNSTATE_runnable);
- uint64_t state_entry_time;
- int __user *user_state;
- uint64_t __user *user_times;
+ struct gfn_to_pfn_cache *gpc = &vx->runstate_cache;
+ uint64_t *user_times;
+ unsigned long flags;
+ size_t user_len;
+ int *user_state;
kvm_xen_update_runstate(v, state);
- if (!vx->runstate_set)
+ if (!vx->runstate_cache.active)
return;
- if (unlikely(slots->generation != ghc->generation || kvm_is_error_hva(ghc->hva)) &&
- kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len))
- return;
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_len = sizeof(struct vcpu_runstate_info);
+ else
+ user_len = sizeof(struct compat_vcpu_runstate_info);
- /* We made sure it fits in a single page */
- BUG_ON(!ghc->memslot);
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ user_len)) {
+ read_unlock_irqrestore(&gpc->lock, flags);
- if (atomic)
- pagefault_disable();
+ /* When invoked from kvm_sched_out() we cannot sleep */
+ if (state == RUNSTATE_runnable)
+ return;
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa, user_len))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
/*
* The only difference between 32-bit and 64-bit versions of the
@@ -167,38 +244,33 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
*/
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
- user_state = (int __user *)ghc->hva;
-
BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
-
- user_times = (uint64_t __user *)(ghc->hva +
- offsetof(struct compat_vcpu_runstate_info,
- state_entry_time));
#ifdef CONFIG_X86_64
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
offsetof(struct compat_vcpu_runstate_info, time) + 4);
-
- if (v->kvm->arch.xen.long_mode)
- user_times = (uint64_t __user *)(ghc->hva +
- offsetof(struct vcpu_runstate_info,
- state_entry_time));
#endif
+
+ user_state = gpc->khva;
+
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode)
+ user_times = gpc->khva + offsetof(struct vcpu_runstate_info,
+ state_entry_time);
+ else
+ user_times = gpc->khva + offsetof(struct compat_vcpu_runstate_info,
+ state_entry_time);
+
/*
* First write the updated state_entry_time at the appropriate
* location determined by 'offset'.
*/
- state_entry_time = vx->runstate_entry_time;
- state_entry_time |= XEN_RUNSTATE_UPDATE;
-
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
- sizeof(state_entry_time));
+ sizeof(user_times[0]));
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
- sizeof(state_entry_time));
+ sizeof(user_times[0]));
- if (__put_user(state_entry_time, user_times))
- goto out;
+ user_times[0] = vx->runstate_entry_time | XEN_RUNSTATE_UPDATE;
smp_wmb();
/*
@@ -212,8 +284,7 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
sizeof(vx->current_runstate));
- if (__put_user(vx->current_runstate, user_state))
- goto out;
+ *user_state = vx->current_runstate;
/*
* Write the actual runstate times immediately after the
@@ -228,42 +299,114 @@ void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, int state)
BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
sizeof(vx->runstate_times));
- if (__copy_to_user(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times)))
- goto out;
+ memcpy(user_times + 1, vx->runstate_times, sizeof(vx->runstate_times));
smp_wmb();
/*
* Finally, clear the XEN_RUNSTATE_UPDATE bit in the guest's
* runstate_entry_time field.
*/
- state_entry_time &= ~XEN_RUNSTATE_UPDATE;
- __put_user(state_entry_time, user_times);
+ user_times[0] &= ~XEN_RUNSTATE_UPDATE;
smp_wmb();
- out:
- mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
+ read_unlock_irqrestore(&gpc->lock, flags);
- if (atomic)
- pagefault_enable();
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
}
-int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+static void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
+{
+ struct kvm_lapic_irq irq = { };
+ int r;
+
+ irq.dest_id = v->vcpu_id;
+ irq.vector = v->arch.xen.upcall_vector;
+ irq.dest_mode = APIC_DEST_PHYSICAL;
+ irq.shorthand = APIC_DEST_NOSHORT;
+ irq.delivery_mode = APIC_DM_FIXED;
+ irq.level = 1;
+
+ /* The fast version will always work for physical unicast */
+ WARN_ON_ONCE(!kvm_irq_delivery_to_apic_fast(v->kvm, NULL, &irq, &r, NULL));
+}
+
+/*
+ * On event channel delivery, the vcpu_info may not have been accessible.
+ * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
+ * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
+ * Do so now that we can sleep in the context of the vCPU to bring the
+ * page in, and refresh the pfn cache for it.
+ */
+void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
{
unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
- bool atomic = in_atomic() || !task_is_running(current);
- int err;
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
+
+ if (!evtchn_pending_sel)
+ return;
+
+ /*
+ * Yes, this is an open-coded loop. But that's just what put_user()
+ * does anyway. Page it in and retry the instruction. We're just a
+ * little more honest about it.
+ */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info)))
+ return;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ }
+
+ /* Now gpc->khva is a valid kernel address for the vcpu_info */
+ if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
+ struct vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orq %0, %1\n"
+ "notq %0\n"
+ LOCK_PREFIX "andq %0, %2\n"
+ : "=r" (evtchn_pending_sel),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ } else {
+ u32 evtchn_pending_sel32 = evtchn_pending_sel;
+ struct compat_vcpu_info *vi = gpc->khva;
+
+ asm volatile(LOCK_PREFIX "orl %0, %1\n"
+ "notl %0\n"
+ LOCK_PREFIX "andl %0, %2\n"
+ : "=r" (evtchn_pending_sel32),
+ "+m" (vi->evtchn_pending_sel),
+ "+m" (v->arch.xen.evtchn_pending_sel)
+ : "0" (evtchn_pending_sel32));
+ WRITE_ONCE(vi->evtchn_upcall_pending, 1);
+ }
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (v->arch.xen.upcall_vector)
+ kvm_xen_inject_vcpu_vector(v);
+
+ mark_page_dirty_in_slot(v->kvm, gpc->memslot, gpc->gpa >> PAGE_SHIFT);
+}
+
+int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
+{
+ struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
+ unsigned long flags;
u8 rc = 0;
/*
* If the global upcall vector (HVMIRQ_callback_vector) is set and
* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
*/
- struct gfn_to_hva_cache *ghc = &v->arch.xen.vcpu_info_cache;
- struct kvm_memslots *slots = kvm_memslots(v->kvm);
- bool ghc_valid = slots->generation == ghc->generation &&
- !kvm_is_error_hva(ghc->hva) && ghc->memslot;
-
- unsigned int offset = offsetof(struct vcpu_info, evtchn_upcall_pending);
/* No need for compat handling here */
BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
@@ -273,101 +416,35 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
BUILD_BUG_ON(sizeof(rc) !=
sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
- /*
- * For efficiency, this mirrors the checks for using the valid
- * cache in kvm_read_guest_offset_cached(), but just uses
- * __get_user() instead. And falls back to the slow path.
- */
- if (!evtchn_pending_sel && ghc_valid) {
- /* Fast path */
- pagefault_disable();
- err = __get_user(rc, (u8 __user *)ghc->hva + offset);
- pagefault_enable();
- if (!err)
- return rc;
- }
-
- /* Slow path */
+ read_lock_irqsave(&gpc->lock, flags);
+ while (!kvm_gfn_to_pfn_cache_check(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
+ read_unlock_irqrestore(&gpc->lock, flags);
- /*
- * This function gets called from kvm_vcpu_block() after setting the
- * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
- * from a HLT. So we really mustn't sleep. If the page ended up absent
- * at that point, just return 1 in order to trigger an immediate wake,
- * and we'll end up getting called again from a context where we *can*
- * fault in the page and wait for it.
- */
- if (atomic)
- return 1;
+ /*
+ * This function gets called from kvm_vcpu_block() after setting the
+ * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
+ * from a HLT. So we really mustn't sleep. If the page ended up absent
+ * at that point, just return 1 in order to trigger an immediate wake,
+ * and we'll end up getting called again from a context where we *can*
+ * fault in the page and wait for it.
+ */
+ if (in_atomic() || !task_is_running(current))
+ return 1;
- if (!ghc_valid) {
- err = kvm_gfn_to_hva_cache_init(v->kvm, ghc, ghc->gpa, ghc->len);
- if (err || !ghc->memslot) {
+ if (kvm_gfn_to_pfn_cache_refresh(v->kvm, gpc, gpc->gpa,
+ sizeof(struct vcpu_info))) {
/*
* If this failed, userspace has screwed up the
* vcpu_info mapping. No interrupts for you.
*/
return 0;
}
+ read_lock_irqsave(&gpc->lock, flags);
}
- /*
- * Now we have a valid (protected by srcu) userspace HVA in
- * ghc->hva which points to the struct vcpu_info. If there
- * are any bits in the in-kernel evtchn_pending_sel then
- * we need to write those to the guest vcpu_info and set
- * its evtchn_upcall_pending flag. If there aren't any bits
- * to add, we only want to *check* evtchn_upcall_pending.
- */
- if (evtchn_pending_sel) {
- bool long_mode = v->kvm->arch.xen.long_mode;
-
- if (!user_access_begin((void __user *)ghc->hva, sizeof(struct vcpu_info)))
- return 0;
-
- if (IS_ENABLED(CONFIG_64BIT) && long_mode) {
- struct vcpu_info __user *vi = (void __user *)ghc->hva;
-
- /* Attempt to set the evtchn_pending_sel bits in the
- * guest, and if that succeeds then clear the same
- * bits in the in-kernel version. */
- asm volatile("1:\t" LOCK_PREFIX "orq %0, %1\n"
- "\tnotq %0\n"
- "\t" LOCK_PREFIX "andq %0, %2\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "=r" (evtchn_pending_sel),
- "+m" (vi->evtchn_pending_sel),
- "+m" (v->arch.xen.evtchn_pending_sel)
- : "0" (evtchn_pending_sel));
- } else {
- struct compat_vcpu_info __user *vi = (void __user *)ghc->hva;
- u32 evtchn_pending_sel32 = evtchn_pending_sel;
-
- /* Attempt to set the evtchn_pending_sel bits in the
- * guest, and if that succeeds then clear the same
- * bits in the in-kernel version. */
- asm volatile("1:\t" LOCK_PREFIX "orl %0, %1\n"
- "\tnotl %0\n"
- "\t" LOCK_PREFIX "andl %0, %2\n"
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- : "=r" (evtchn_pending_sel32),
- "+m" (vi->evtchn_pending_sel),
- "+m" (v->arch.xen.evtchn_pending_sel)
- : "0" (evtchn_pending_sel32));
- }
- rc = 1;
- unsafe_put_user(rc, (u8 __user *)ghc->hva + offset, err);
-
- err:
- user_access_end();
-
- mark_page_dirty_in_slot(v->kvm, ghc->memslot, ghc->gpa >> PAGE_SHIFT);
- } else {
- __get_user(rc, (u8 __user *)ghc->hva + offset);
- }
-
+ rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
+ read_unlock_irqrestore(&gpc->lock, flags);
return rc;
}
@@ -375,36 +452,51 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
int r = -ENOENT;
- mutex_lock(&kvm->lock);
switch (data->type) {
case KVM_XEN_ATTR_TYPE_LONG_MODE:
if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
r = -EINVAL;
} else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.long_mode = !!data->u.long_mode;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
case KVM_XEN_ATTR_TYPE_SHARED_INFO:
+ mutex_lock(&kvm->lock);
r = kvm_xen_shared_info_init(kvm, data->u.shared_info.gfn);
+ mutex_unlock(&kvm->lock);
break;
case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
if (data->u.vector && data->u.vector < 0x10)
r = -EINVAL;
else {
+ mutex_lock(&kvm->lock);
kvm->arch.xen.upcall_vector = data->u.vector;
+ mutex_unlock(&kvm->lock);
r = 0;
}
break;
+ case KVM_XEN_ATTR_TYPE_EVTCHN:
+ r = kvm_xen_setattr_evtchn(kvm, data);
+ break;
+
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ mutex_lock(&kvm->lock);
+ kvm->arch.xen.xen_version = data->u.xen_version;
+ mutex_unlock(&kvm->lock);
+ r = 0;
+ break;
+
default:
break;
}
- mutex_unlock(&kvm->lock);
return r;
}
@@ -433,6 +525,11 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
r = 0;
break;
+ case KVM_XEN_ATTR_TYPE_XEN_VERSION:
+ data->u.xen_version = kvm->arch.xen.xen_version;
+ r = 0;
+ break;
+
default:
break;
}
@@ -457,48 +554,34 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
offsetof(struct compat_vcpu_info, time));
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm, &vcpu->arch.xen.vcpu_info_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_info));
- if (!r) {
- vcpu->arch.xen.vcpu_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
+
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.vcpu_time_info_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct pvclock_vcpu_time_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.vcpu_time_info_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct pvclock_vcpu_time_info));
- if (!r) {
- vcpu->arch.xen.vcpu_time_info_set = true;
+ if (!r)
kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
@@ -507,24 +590,16 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
}
if (data->u.gpa == GPA_INVALID) {
- vcpu->arch.xen.runstate_set = false;
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
r = 0;
break;
}
- /* It must fit within a single page */
- if ((data->u.gpa & ~PAGE_MASK) + sizeof(struct vcpu_runstate_info) > PAGE_SIZE) {
- r = -EINVAL;
- break;
- }
-
- r = kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ r = kvm_gfn_to_pfn_cache_init(vcpu->kvm,
&vcpu->arch.xen.runstate_cache,
- data->u.gpa,
+ NULL, KVM_HOST_USES_PFN, data->u.gpa,
sizeof(struct vcpu_runstate_info));
- if (!r) {
- vcpu->arch.xen.runstate_set = true;
- }
break;
case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
@@ -622,6 +697,46 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = 0;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ if (data->u.vcpu_id >= KVM_MAX_VCPUS)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
+ r = 0;
+ }
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ if (data->u.timer.port) {
+ if (data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
+ r = -EINVAL;
+ break;
+ }
+ vcpu->arch.xen.timer_virq = data->u.timer.port;
+ kvm_xen_init_timer(vcpu);
+
+ /* Restart the timer if it's set */
+ if (data->u.timer.expires_ns)
+ kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+ data->u.timer.expires_ns -
+ get_kvmclock_ns(vcpu->kvm));
+ } else if (kvm_xen_timer_enabled(vcpu)) {
+ kvm_xen_stop_timer(vcpu);
+ vcpu->arch.xen.timer_virq = 0;
+ }
+
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ if (data->u.vector && data->u.vector < 0x10)
+ r = -EINVAL;
+ else {
+ vcpu->arch.xen.upcall_vector = data->u.vector;
+ r = 0;
+ }
+ break;
+
default:
break;
}
@@ -639,7 +754,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
switch (data->type) {
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
- if (vcpu->arch.xen.vcpu_info_set)
+ if (vcpu->arch.xen.vcpu_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -647,7 +762,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
break;
case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
- if (vcpu->arch.xen.vcpu_time_info_set)
+ if (vcpu->arch.xen.vcpu_time_info_cache.active)
data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
else
data->u.gpa = GPA_INVALID;
@@ -659,7 +774,7 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EOPNOTSUPP;
break;
}
- if (vcpu->arch.xen.runstate_set) {
+ if (vcpu->arch.xen.runstate_cache.active) {
data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
r = 0;
}
@@ -697,6 +812,23 @@ int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
r = -EINVAL;
break;
+ case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
+ data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
+ data->u.timer.port = vcpu->arch.xen.timer_virq;
+ data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
+ data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
+ r = 0;
+ break;
+
+ case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
+ data->u.vector = vcpu->arch.xen.upcall_vector;
+ r = 0;
+ break;
+
default:
break;
}
@@ -777,7 +909,11 @@ int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
- if (xhc->flags & ~KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL)
+ /* Only some feature flags need to be *enabled* by userspace */
+ u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
+ KVM_XEN_HVM_CONFIG_EVTCHN_SEND;
+
+ if (xhc->flags & ~permitted_flags)
return -EINVAL;
/*
@@ -802,18 +938,6 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
return 0;
}
-void kvm_xen_init_vm(struct kvm *kvm)
-{
-}
-
-void kvm_xen_destroy_vm(struct kvm *kvm)
-{
- kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
-
- if (kvm->arch.xen_hvm_config.msr)
- static_branch_slow_dec_deferred(&kvm_xen_enabled);
-}
-
static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
kvm_rax_write(vcpu, result);
@@ -830,10 +954,268 @@ static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}
+static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
+ evtchn_port_t *ports)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
+ unsigned long *pending_bits;
+ unsigned long flags;
+ bool ret = true;
+ int idx, i;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ idx = srcu_read_lock(&kvm->srcu);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
+ goto out_rcu;
+
+ ret = false;
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ } else {
+ struct compat_shared_info *shinfo = gpc->khva;
+ pending_bits = (unsigned long *)&shinfo->evtchn_pending;
+ }
+
+ for (i = 0; i < nr_ports; i++) {
+ if (test_bit(ports[i], pending_bits)) {
+ ret = true;
+ break;
+ }
+ }
+
+ out_rcu:
+ srcu_read_unlock(&kvm->srcu, idx);
+ read_unlock_irqrestore(&gpc->lock, flags);
+
+ return ret;
+}
+
+static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
+ u64 param, u64 *r)
+{
+ int idx, i;
+ struct sched_poll sched_poll;
+ evtchn_port_t port, *ports;
+ gpa_t gpa;
+
+ if (!longmode || !lapic_in_kernel(vcpu) ||
+ !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
+ return false;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &sched_poll,
+ sizeof(sched_poll))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ if (unlikely(sched_poll.nr_ports > 1)) {
+ /* Xen (unofficially) limits number of pollers to 128 */
+ if (sched_poll.nr_ports > 128) {
+ *r = -EINVAL;
+ return true;
+ }
+
+ ports = kmalloc_array(sched_poll.nr_ports,
+ sizeof(*ports), GFP_KERNEL);
+ if (!ports) {
+ *r = -ENOMEM;
+ return true;
+ }
+ } else
+ ports = &port;
+
+ for (i = 0; i < sched_poll.nr_ports; i++) {
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu,
+ (gva_t)(sched_poll.ports + i),
+ NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa,
+ &ports[i], sizeof(port))) {
+ *r = -EFAULT;
+ goto out;
+ }
+ }
+
+ if (sched_poll.nr_ports == 1)
+ vcpu->arch.xen.poll_evtchn = port;
+ else
+ vcpu->arch.xen.poll_evtchn = -1;
+
+ set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
+ vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
+
+ if (sched_poll.timeout)
+ mod_timer(&vcpu->arch.xen.poll_timer,
+ jiffies + nsecs_to_jiffies(sched_poll.timeout));
+
+ kvm_vcpu_halt(vcpu);
+
+ if (sched_poll.timeout)
+ del_timer(&vcpu->arch.xen.poll_timer);
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ kvm_clear_request(KVM_REQ_UNHALT, vcpu);
+ }
+
+ vcpu->arch.xen.poll_evtchn = 0;
+ *r = 0;
+out:
+ /* Really, this is only needed in case of timeout */
+ clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask);
+
+ if (unlikely(sched_poll.nr_ports > 1))
+ kfree(ports);
+ return true;
+}
+
+static void cancel_evtchn_poll(struct timer_list *t)
+{
+ struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
+
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+}
+
+static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
+ int cmd, u64 param, u64 *r)
+{
+ switch (cmd) {
+ case SCHEDOP_poll:
+ if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
+ return true;
+ fallthrough;
+ case SCHEDOP_yield:
+ kvm_vcpu_on_spin(vcpu, true);
+ *r = 0;
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+struct compat_vcpu_set_singleshot_timer {
+ uint64_t timeout_abs_ns;
+ uint32_t flags;
+} __attribute__((packed));
+
+static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
+ int vcpu_id, u64 param, u64 *r)
+{
+ struct vcpu_set_singleshot_timer oneshot;
+ s64 delta;
+ gpa_t gpa;
+ int idx;
+
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ switch (cmd) {
+ case VCPUOP_set_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ /*
+ * The only difference for 32-bit compat is the 4 bytes of
+ * padding after the interesting part of the structure. So
+ * for a faithful emulation of Xen we have to *try* to copy
+ * the padding and return -EFAULT if we can't. Otherwise we
+ * might as well just have copied the 12-byte 32-bit struct.
+ */
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
+ BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
+ offsetof(struct vcpu_set_singleshot_timer, flags));
+ BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
+ sizeof_field(struct vcpu_set_singleshot_timer, flags));
+
+ if (!gpa ||
+ kvm_vcpu_read_guest(vcpu, gpa, &oneshot, longmode ? sizeof(oneshot) :
+ sizeof(struct compat_vcpu_set_singleshot_timer))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
+ if ((oneshot.flags & VCPU_SSHOTTMR_future) && delta < 0) {
+ *r = -ETIME;
+ return true;
+ }
+
+ kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
+ *r = 0;
+ return true;
+
+ case VCPUOP_stop_singleshot_timer:
+ if (vcpu->arch.xen.vcpu_id != vcpu_id) {
+ *r = -EINVAL;
+ return true;
+ }
+ kvm_xen_stop_timer(vcpu);
+ *r = 0;
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
+ u64 *r)
+{
+ if (!kvm_xen_timer_enabled(vcpu))
+ return false;
+
+ if (timeout) {
+ uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+ int64_t delta = timeout - guest_now;
+
+ /* Xen has a 'Linux workaround' in do_set_timer_op() which
+ * checks for negative absolute timeout values (caused by
+ * integer overflow), and for values about 13 days in the
+ * future (2^50ns) which would be caused by jiffies
+ * overflow. For those cases, it sets the timeout 100ms in
+ * the future (not *too* soon, since if a guest really did
+ * set a long timeout on purpose we don't want to keep
+ * churning CPU time by waking it up).
+ */
+ if (unlikely((int64_t)timeout < 0 ||
+ (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
+ delta = 100 * NSEC_PER_MSEC;
+ timeout = guest_now + delta;
+ }
+
+ kvm_xen_start_timer(vcpu, timeout, delta);
+ } else {
+ kvm_xen_stop_timer(vcpu);
+ }
+
+ *r = 0;
+ return true;
+}
+
int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
{
bool longmode;
- u64 input, params[6];
+ u64 input, params[6], r = -ENOSYS;
+ bool handled = false;
input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
@@ -864,6 +1246,40 @@ int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
trace_kvm_xen_hypercall(input, params[0], params[1], params[2],
params[3], params[4], params[5]);
+ switch (input) {
+ case __HYPERVISOR_xen_version:
+ if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
+ r = vcpu->kvm->arch.xen.xen_version;
+ handled = true;
+ }
+ break;
+ case __HYPERVISOR_event_channel_op:
+ if (params[0] == EVTCHNOP_send)
+ handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
+ break;
+ case __HYPERVISOR_sched_op:
+ handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
+ params[1], &r);
+ break;
+ case __HYPERVISOR_vcpu_op:
+ handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
+ params[2], &r);
+ break;
+ case __HYPERVISOR_set_timer_op: {
+ u64 timeout = params[0];
+ /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */
+ if (!longmode)
+ timeout |= params[1] << 32;
+ handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (handled)
+ return kvm_xen_hypercall_set_result(vcpu, r);
+
vcpu->run->exit_reason = KVM_EXIT_XEN;
vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
vcpu->run->xen.u.hcall.longmode = longmode;
@@ -890,14 +1306,28 @@ static inline int max_evtchn_port(struct kvm *kvm)
return COMPAT_EVTCHN_2L_NR_CHANNELS;
}
+static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port)
+{
+ int poll_evtchn = vcpu->arch.xen.poll_evtchn;
+
+ if ((poll_evtchn == port || poll_evtchn == -1) &&
+ test_and_clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.xen.poll_mask)) {
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
+ kvm_vcpu_kick(vcpu);
+ }
+}
+
/*
- * This follows the kvm_set_irq() API, so it returns:
+ * The return value from this function is propagated to kvm_set_irq() API,
+ * so it returns:
* < 0 Interrupt was ignored (masked or not delivered for other reasons)
* = 0 Interrupt was coalesced (previous irq is still pending)
* > 0 Number of CPUs interrupt was delivered to
+ *
+ * It is also called directly from kvm_arch_set_irq_inatomic(), where the
+ * only check on its return value is a comparison with -EWOULDBLOCK'.
*/
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm)
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
struct kvm_vcpu *vcpu;
@@ -905,23 +1335,29 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
unsigned long flags;
int port_word_bit;
bool kick_vcpu = false;
- int idx;
- int rc;
+ int vcpu_idx, idx, rc;
- vcpu = kvm_get_vcpu_by_id(kvm, e->xen_evtchn.vcpu);
- if (!vcpu)
- return -1;
+ vcpu_idx = READ_ONCE(xe->vcpu_idx);
+ if (vcpu_idx >= 0)
+ vcpu = kvm_get_vcpu(kvm, vcpu_idx);
+ else {
+ vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
+ if (!vcpu)
+ return -EINVAL;
+ WRITE_ONCE(xe->vcpu_idx, kvm_vcpu_get_idx(vcpu));
+ }
- if (!vcpu->arch.xen.vcpu_info_set)
- return -1;
+ if (!vcpu->arch.xen.vcpu_info_cache.active)
+ return -EINVAL;
- if (e->xen_evtchn.port >= max_evtchn_port(kvm))
- return -1;
+ if (xe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
rc = -EWOULDBLOCK;
- read_lock_irqsave(&gpc->lock, flags);
idx = srcu_read_lock(&kvm->srcu);
+
+ read_lock_irqsave(&gpc->lock, flags);
if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, PAGE_SIZE))
goto out_rcu;
@@ -929,12 +1365,12 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
struct shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
- port_word_bit = e->xen_evtchn.port / 64;
+ port_word_bit = xe->port / 64;
} else {
struct compat_shared_info *shinfo = gpc->khva;
pending_bits = (unsigned long *)&shinfo->evtchn_pending;
mask_bits = (unsigned long *)&shinfo->evtchn_mask;
- port_word_bit = e->xen_evtchn.port / 32;
+ port_word_bit = xe->port / 32;
}
/*
@@ -944,39 +1380,68 @@ int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
* already set, then we kick the vCPU in question to write to the
* *real* evtchn_pending_sel in its own guest vcpu_info struct.
*/
- if (test_and_set_bit(e->xen_evtchn.port, pending_bits)) {
+ if (test_and_set_bit(xe->port, pending_bits)) {
rc = 0; /* It was already raised */
- } else if (test_bit(e->xen_evtchn.port, mask_bits)) {
- rc = -1; /* Masked */
+ } else if (test_bit(xe->port, mask_bits)) {
+ rc = -ENOTCONN; /* Masked */
+ kvm_xen_check_poller(vcpu, xe->port);
} else {
- rc = 1; /* Delivered. But was the vCPU waking already? */
- if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
- kick_vcpu = true;
+ rc = 1; /* Delivered to the bitmap in shared_info. */
+ /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */
+ read_unlock_irqrestore(&gpc->lock, flags);
+ gpc = &vcpu->arch.xen.vcpu_info_cache;
+
+ read_lock_irqsave(&gpc->lock, flags);
+ if (!kvm_gfn_to_pfn_cache_check(kvm, gpc, gpc->gpa, sizeof(struct vcpu_info))) {
+ /*
+ * Could not access the vcpu_info. Set the bit in-kernel
+ * and prod the vCPU to deliver it for itself.
+ */
+ if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
+ kick_vcpu = true;
+ goto out_rcu;
+ }
+
+ if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
+ struct vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ } else {
+ struct compat_vcpu_info *vcpu_info = gpc->khva;
+ if (!test_and_set_bit(port_word_bit,
+ (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
+ WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1);
+ kick_vcpu = true;
+ }
+ }
+
+ /* For the per-vCPU lapic vector, deliver it as MSI. */
+ if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
+ kvm_xen_inject_vcpu_vector(vcpu);
+ kick_vcpu = false;
+ }
}
out_rcu:
- srcu_read_unlock(&kvm->srcu, idx);
read_unlock_irqrestore(&gpc->lock, flags);
+ srcu_read_unlock(&kvm->srcu, idx);
if (kick_vcpu) {
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
kvm_vcpu_kick(vcpu);
}
return rc;
}
-/* This is the version called from kvm_set_irq() as the .set function */
-static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
- int irq_source_id, int level, bool line_status)
+static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm)
{
bool mm_borrowed = false;
int rc;
- if (!level)
- return -1;
-
- rc = kvm_xen_set_evtchn_fast(e, kvm);
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
return rc;
@@ -1020,7 +1485,7 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
int idx;
- rc = kvm_xen_set_evtchn_fast(e, kvm);
+ rc = kvm_xen_set_evtchn_fast(xe, kvm);
if (rc != -EWOULDBLOCK)
break;
@@ -1037,11 +1502,27 @@ static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm
return rc;
}
+/* This is the version called from kvm_set_irq() as the .set function */
+static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm,
+ int irq_source_id, int level, bool line_status)
+{
+ if (!level)
+ return -EINVAL;
+
+ return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
+}
+
+/*
+ * Set up an event channel interrupt from the KVM IRQ routing table.
+ * Used for e.g. PIRQ from passed through physical devices.
+ */
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
+ struct kvm_vcpu *vcpu;
+
if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
return -EINVAL;
@@ -1049,10 +1530,328 @@ int kvm_xen_setup_evtchn(struct kvm *kvm,
if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
return -EINVAL;
+ /*
+ * Xen gives us interesting mappings from vCPU index to APIC ID,
+ * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
+ * to find it. Do that once at setup time, instead of every time.
+ * But beware that on live update / live migration, the routing
+ * table might be reinstated before the vCPU threads have finished
+ * recreating their vCPUs.
+ */
+ vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
+ if (vcpu)
+ e->xen_evtchn.vcpu_idx = kvm_vcpu_get_idx(vcpu);
+ else
+ e->xen_evtchn.vcpu_idx = -1;
+
e->xen_evtchn.port = ue->u.xen_evtchn.port;
- e->xen_evtchn.vcpu = ue->u.xen_evtchn.vcpu;
+ e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
e->set = evtchn_set_fn;
return 0;
}
+
+/*
+ * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
+ */
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe)
+{
+ struct kvm_xen_evtchn e;
+ int ret;
+
+ if (!uxe->port || uxe->port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ e.port = uxe->port;
+ e.vcpu_id = uxe->vcpu;
+ e.vcpu_idx = -1;
+ e.priority = uxe->priority;
+
+ ret = kvm_xen_set_evtchn(&e, kvm);
+
+ /*
+ * None of that 'return 1 if it actually got delivered' nonsense.
+ * We don't care if it was masked (-ENOTCONN) either.
+ */
+ if (ret > 0 || ret == -ENOTCONN)
+ ret = 0;
+
+ return ret;
+}
+
+/*
+ * Support for *outbound* event channel events via the EVTCHNOP_send hypercall.
+ */
+struct evtchnfd {
+ u32 send_port;
+ u32 type;
+ union {
+ struct kvm_xen_evtchn port;
+ struct {
+ u32 port; /* zero */
+ struct eventfd_ctx *ctx;
+ } eventfd;
+ } deliver;
+};
+
+/*
+ * Update target vCPU or priority for a registered sending channel.
+ */
+static int kvm_xen_eventfd_update(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct evtchnfd *evtchnfd;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ /* For an UPDATE, nothing may change except the priority/vcpu */
+ if (evtchnfd->type != data->u.evtchn.type)
+ return -EINVAL;
+
+ /*
+ * Port cannot change, and if it's zero that was an eventfd
+ * which can't be changed either.
+ */
+ if (!evtchnfd->deliver.port.port ||
+ evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
+ return -EINVAL;
+
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ }
+ mutex_unlock(&kvm->lock);
+ return 0;
+}
+
+/*
+ * Configure the target (eventfd or local port delivery) for sending on
+ * a given event channel.
+ */
+static int kvm_xen_eventfd_assign(struct kvm *kvm,
+ struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+ struct eventfd_ctx *eventfd = NULL;
+ struct evtchnfd *evtchnfd = NULL;
+ int ret = -EINVAL;
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
+ if (!evtchnfd)
+ return -ENOMEM;
+
+ switch(data->u.evtchn.type) {
+ case EVTCHNSTAT_ipi:
+ /* IPI must map back to the same port# */
+ if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
+ goto out; /* -EINVAL */
+ break;
+
+ case EVTCHNSTAT_interdomain:
+ if (data->u.evtchn.deliver.port.port) {
+ if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
+ goto out; /* -EINVAL */
+ } else {
+ eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
+ if (IS_ERR(eventfd)) {
+ ret = PTR_ERR(eventfd);
+ goto out;
+ }
+ }
+ break;
+
+ case EVTCHNSTAT_virq:
+ case EVTCHNSTAT_closed:
+ case EVTCHNSTAT_unbound:
+ case EVTCHNSTAT_pirq:
+ default: /* Unknown event channel type */
+ goto out; /* -EINVAL */
+ }
+
+ evtchnfd->send_port = data->u.evtchn.send_port;
+ evtchnfd->type = data->u.evtchn.type;
+ if (eventfd) {
+ evtchnfd->deliver.eventfd.ctx = eventfd;
+ } else {
+ /* We only support 2 level event channels for now */
+ if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
+ goto out; /* -EINVAL; */
+
+ evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
+ evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
+ evtchnfd->deliver.port.vcpu_idx = -1;
+ evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
+ }
+
+ mutex_lock(&kvm->lock);
+ ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
+ GFP_KERNEL);
+ mutex_unlock(&kvm->lock);
+ if (ret >= 0)
+ return 0;
+
+ if (ret == -ENOSPC)
+ ret = -EEXIST;
+out:
+ if (eventfd)
+ eventfd_ctx_put(eventfd);
+ kfree(evtchnfd);
+ return ret;
+}
+
+static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
+{
+ struct evtchnfd *evtchnfd;
+
+ mutex_lock(&kvm->lock);
+ evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
+ mutex_unlock(&kvm->lock);
+
+ if (!evtchnfd)
+ return -ENOENT;
+
+ if (kvm)
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ return 0;
+}
+
+static int kvm_xen_eventfd_reset(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ mutex_lock(&kvm->lock);
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
+ synchronize_srcu(&kvm->srcu);
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
+static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
+{
+ u32 port = data->u.evtchn.send_port;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
+ return kvm_xen_eventfd_reset(kvm);
+
+ if (!port || port >= max_evtchn_port(kvm))
+ return -EINVAL;
+
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
+ return kvm_xen_eventfd_deassign(kvm, port);
+ if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
+ return kvm_xen_eventfd_update(kvm, data);
+ if (data->u.evtchn.flags)
+ return -EINVAL;
+
+ return kvm_xen_eventfd_assign(kvm, data);
+}
+
+static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
+{
+ struct evtchnfd *evtchnfd;
+ struct evtchn_send send;
+ gpa_t gpa;
+ int idx;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ gpa = kvm_mmu_gva_to_gpa_system(vcpu, param, NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+ if (!gpa || kvm_vcpu_read_guest(vcpu, gpa, &send, sizeof(send))) {
+ *r = -EFAULT;
+ return true;
+ }
+
+ /* The evtchn_ports idr is protected by vcpu->kvm->srcu */
+ evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
+ if (!evtchnfd)
+ return false;
+
+ if (evtchnfd->deliver.port.port) {
+ int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
+ if (ret < 0 && ret != -ENOTCONN)
+ return false;
+ } else {
+ eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
+ }
+
+ *r = 0;
+ return true;
+}
+
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
+ vcpu->arch.xen.poll_evtchn = 0;
+ timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0);
+}
+
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_timer_enabled(vcpu))
+ kvm_xen_stop_timer(vcpu);
+
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.runstate_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_info_cache);
+ kvm_gfn_to_pfn_cache_destroy(vcpu->kvm,
+ &vcpu->arch.xen.vcpu_time_info_cache);
+ del_timer_sync(&vcpu->arch.xen.poll_timer);
+}
+
+void kvm_xen_init_vm(struct kvm *kvm)
+{
+ idr_init(&kvm->arch.xen.evtchn_ports);
+}
+
+void kvm_xen_destroy_vm(struct kvm *kvm)
+{
+ struct evtchnfd *evtchnfd;
+ int i;
+
+ kvm_gfn_to_pfn_cache_destroy(kvm, &kvm->arch.xen.shinfo_cache);
+
+ idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
+ if (!evtchnfd->deliver.port.port)
+ eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
+ kfree(evtchnfd);
+ }
+ idr_destroy(&kvm->arch.xen.evtchn_ports);
+
+ if (kvm->arch.xen_hvm_config.msr)
+ static_branch_slow_dec_deferred(&kvm_xen_enabled);
+}
diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h
index adbcc9ed59db..ee5c4ae0755c 100644
--- a/arch/x86/kvm/xen.h
+++ b/arch/x86/kvm/xen.h
@@ -15,16 +15,19 @@
extern struct static_key_false_deferred kvm_xen_enabled;
int __kvm_xen_has_interrupt(struct kvm_vcpu *vcpu);
+void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu);
int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data);
int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
+int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *evt);
int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data);
int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc);
void kvm_xen_init_vm(struct kvm *kvm);
void kvm_xen_destroy_vm(struct kvm *kvm);
-
-int kvm_xen_set_evtchn_fast(struct kvm_kernel_irq_routing_entry *e,
+void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu);
+void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu);
+int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe,
struct kvm *kvm);
int kvm_xen_setup_evtchn(struct kvm *kvm,
struct kvm_kernel_irq_routing_entry *e,
@@ -46,11 +49,33 @@ static inline bool kvm_xen_hypercall_enabled(struct kvm *kvm)
static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
if (static_branch_unlikely(&kvm_xen_enabled.key) &&
- vcpu->arch.xen.vcpu_info_set && vcpu->kvm->arch.xen.upcall_vector)
+ vcpu->arch.xen.vcpu_info_cache.active &&
+ vcpu->kvm->arch.xen.upcall_vector)
return __kvm_xen_has_interrupt(vcpu);
return 0;
}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return static_branch_unlikely(&kvm_xen_enabled.key) &&
+ vcpu->arch.xen.evtchn_pending_sel;
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return !!vcpu->arch.xen.timer_virq;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ if (kvm_xen_hypercall_enabled(vcpu->kvm) && kvm_xen_timer_enabled(vcpu))
+ return atomic_read(&vcpu->arch.xen.timer_pending);
+
+ return 0;
+}
+
+void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu);
#else
static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
@@ -65,6 +90,14 @@ static inline void kvm_xen_destroy_vm(struct kvm *kvm)
{
}
+static inline void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
+{
+}
+
static inline bool kvm_xen_msr_enabled(struct kvm *kvm)
{
return false;
@@ -79,6 +112,29 @@ static inline int kvm_xen_has_interrupt(struct kvm_vcpu *vcpu)
{
return 0;
}
+
+static inline void kvm_xen_inject_pending_events(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_has_pending_events(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
+static inline int kvm_xen_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static inline void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline bool kvm_xen_timer_enabled(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
#endif
int kvm_xen_hypercall(struct kvm_vcpu *vcpu);