From 0924ab2cfa98b1ece26c033d696651fd62896c69 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 14 Dec 2011 19:25:13 +0100 Subject: KVM: x86: Prevent starting PIT timers in the absence of irqchip support User space may create the PIT and forgets about setting up the irqchips. In that case, firing PIT IRQs will crash the host: BUG: unable to handle kernel NULL pointer dereference at 0000000000000128 IP: [] kvm_set_irq+0x30/0x170 [kvm] ... Call Trace: [] pit_do_work+0x51/0xd0 [kvm] [] process_one_work+0x111/0x4d0 [] worker_thread+0x152/0x340 [] kthread+0x7e/0x90 [] kernel_thread_helper+0x4/0x10 Prevent this by checking the irqchip mode before starting a timer. We can't deny creating the PIT if the irqchips aren't set up yet as current user land expects this order to work. Signed-off-by: Jan Kiszka Signed-off-by: Marcelo Tosatti --- arch/x86/kvm/i8254.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 76e3f1cd036..405f2620392 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -338,11 +338,15 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) { + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; + if (!irqchip_in_kernel(kvm)) + return; + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("create pit timer, interval is %llu nsec\n", interval); @@ -394,13 +398,13 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(ps, val, 0); + create_pit_timer(kvm, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(ps, val, 1); + create_pit_timer(kvm, val, 1); } break; default: -- cgit v1.2.3 From 423873736b78f549fbfa2f715f2e4de7e6c5e1e9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:03 -0700 Subject: KVM: Remove ability to assign a device without iommu support This option has no users and it exposes a security hole that we can allow devices to be assigned without iommu protection. Make KVM_DEV_ASSIGN_ENABLE_IOMMU a mandatory option. Signed-off-by: Alex Williamson Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 3 +++ virt/kvm/assigned-dev.c | 18 +++++++++--------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 7945b0bd35e..ee2c96b3ba5 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1151,6 +1151,9 @@ following flags are specified: /* Depends on KVM_CAP_IOMMU */ #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) +The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure +isolation of the device. Usages not specifying this flag are deprecated. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index 3ad0925d23a..a251a28f79c 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -487,6 +487,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) + return -EINVAL; + mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); @@ -544,16 +547,14 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match); + if (!kvm->arch.iommu_domain) { + r = kvm_iommu_map_guest(kvm); if (r) goto out_list_del; } + r = kvm_assign_device(kvm, match); + if (r) + goto out_list_del; out: srcu_read_unlock(&kvm->srcu, idx); @@ -593,8 +594,7 @@ static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, goto out; } - if (match->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) - kvm_deassign_device(kvm, match); + kvm_deassign_device(kvm, match); kvm_free_assigned_device(kvm, match); -- cgit v1.2.3 From 3d27e23b17010c668db311140b17bbbb70c78fb9 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 20 Dec 2011 21:59:09 -0700 Subject: KVM: Device assignment permission checks Only allow KVM device assignment to attach to devices which: - Are not bridges - Have BAR resources (assume others are special devices) - The user has permissions to use Assigning a bridge is a configuration error, it's not supported, and typically doesn't result in the behavior the user is expecting anyway. Devices without BAR resources are typically chipset components that also don't have host drivers. We don't want users to hold such devices captive or cause system problems by fencing them off into an iommu domain. We determine "permission to use" by testing whether the user has access to the PCI sysfs resource files. By default a normal user will not have access to these files, so it provides a good indication that an administration agent has granted the user access to the device. [Yang Bai: add missing #include] [avi: fix comment style] Signed-off-by: Alex Williamson Signed-off-by: Yang Bai Signed-off-by: Marcelo Tosatti --- Documentation/virtual/kvm/api.txt | 4 +++ virt/kvm/assigned-dev.c | 75 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index ee2c96b3ba5..4df9af4f613 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1154,6 +1154,10 @@ following flags are specified: The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure isolation of the device. Usages not specifying this flag are deprecated. +Only PCI header type 0 devices with PCI BAR resources are supported by +device assignment. The user requesting this ioctl must have read/write +access to the PCI sysfs resource files associated with the device. + 4.49 KVM_DEASSIGN_PCI_DEVICE Capability: KVM_CAP_DEVICE_DEASSIGNMENT diff --git a/virt/kvm/assigned-dev.c b/virt/kvm/assigned-dev.c index a251a28f79c..758e3b36d4c 100644 --- a/virt/kvm/assigned-dev.c +++ b/virt/kvm/assigned-dev.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include "irq.h" static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, @@ -480,12 +482,73 @@ out: return r; } +/* + * We want to test whether the caller has been granted permissions to + * use this device. To be able to configure and control the device, + * the user needs access to PCI configuration space and BAR resources. + * These are accessed through PCI sysfs. PCI config space is often + * passed to the process calling this ioctl via file descriptor, so we + * can't rely on access to that file. We can check for permissions + * on each of the BAR resource files, which is a pretty clear + * indicator that the user has been granted access to the device. + */ +static int probe_sysfs_permissions(struct pci_dev *dev) +{ +#ifdef CONFIG_SYSFS + int i; + bool bar_found = false; + + for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { + char *kpath, *syspath; + struct path path; + struct inode *inode; + int r; + + if (!pci_resource_len(dev, i)) + continue; + + kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); + if (!kpath) + return -ENOMEM; + + /* Per sysfs-rules, sysfs is always at /sys */ + syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); + kfree(kpath); + if (!syspath) + return -ENOMEM; + + r = kern_path(syspath, LOOKUP_FOLLOW, &path); + kfree(syspath); + if (r) + return r; + + inode = path.dentry->d_inode; + + r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); + path_put(&path); + if (r) + return r; + + bar_found = true; + } + + /* If no resources, probably something special */ + if (!bar_found) + return -EPERM; + + return 0; +#else + return -EINVAL; /* No way to control the device without sysfs */ +#endif +} + static int kvm_vm_ioctl_assign_device(struct kvm *kvm, struct kvm_assigned_pci_dev *assigned_dev) { int r = 0, idx; struct kvm_assigned_dev_kernel *match; struct pci_dev *dev; + u8 header_type; if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) return -EINVAL; @@ -516,6 +579,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, r = -EINVAL; goto out_free; } + + /* Don't allow bridges to be assigned */ + pci_read_config_byte(dev, PCI_HEADER_TYPE, &header_type); + if ((header_type & PCI_HEADER_TYPE) != PCI_HEADER_TYPE_NORMAL) { + r = -EPERM; + goto out_put; + } + + r = probe_sysfs_permissions(dev); + if (r) + goto out_put; + if (pci_enable_device(dev)) { printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); r = -EBUSY; -- cgit v1.2.3 From 4d25a066b69fb749a39d0d4c610689dd765a0b0e Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Wed, 21 Dec 2011 12:28:29 +0100 Subject: KVM: Don't automatically expose the TSC deadline timer in cpuid Unlike all of the other cpuid bits, the TSC deadline timer bit is set unconditionally, regardless of what userspace wants. This is broken in several ways: - if userspace doesn't use KVM_CREATE_IRQCHIP, and doesn't emulate the TSC deadline timer feature, a guest that uses the feature will break - live migration to older host kernels that don't support the TSC deadline timer will cause the feature to be pulled from under the guest's feet; breaking it - guests that are broken wrt the feature will fail. Fix by not enabling the feature automatically; instead report it to userspace. Because the feature depends on KVM_CREATE_IRQCHIP, which we cannot guarantee will be called, we expose it via a KVM_CAP_TSC_DEADLINE_TIMER and not KVM_GET_SUPPORTED_CPUID. Fixes the Illumos guest kernel, which uses the TSC deadline timer feature. [avi: add the KVM_CAP + documentation] Reported-by: Alexey Zaytsev Tested-by: Alexey Zaytsev Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- Documentation/virtual/kvm/api.txt | 9 +++++++++ arch/x86/kvm/x86.c | 19 +++++++++---------- include/linux/kvm.h | 1 + 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 4df9af4f613..e2a4b528736 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -1100,6 +1100,15 @@ emulate them efficiently. The fields in each entry are defined as follows: eax, ebx, ecx, edx: the values returned by the cpuid instruction for this function/index combination +The TSC deadline timer feature (CPUID leaf 1, ecx[24]) is always returned +as false, since the feature depends on KVM_CREATE_IRQCHIP for local APIC +support. Instead it is reported via + + ioctl(KVM_CHECK_EXTENSION, KVM_CAP_TSC_DEADLINE_TIMER) + +if that returns true and you use KVM_CREATE_IRQCHIP, or if you emulate the +feature in userspace, then you can enable the feature for KVM_SET_CPUID2. + 4.47 KVM_PPC_GET_PVINFO Capability: KVM_CAP_PPC_GET_PVINFO diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c38efd7b792..4c938da2ba0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -602,7 +602,6 @@ static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; struct kvm_lapic *apic = vcpu->arch.apic; - u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -615,15 +614,12 @@ static void update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= bit(X86_FEATURE_OSXSAVE); } - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - best->function == 0x1) { - best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); - timer_mode_mask = 3 << 17; - } else - timer_mode_mask = 1 << 17; - - if (apic) - apic->lapic_timer.timer_mode_mask = timer_mode_mask; + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -2135,6 +2131,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_TSC_CONTROL: r = kvm_has_tsc_control; break; + case KVM_CAP_TSC_DEADLINE_TIMER: + r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); + break; default: r = 0; break; diff --git a/include/linux/kvm.h b/include/linux/kvm.h index c3892fc1d53..68e67e50d02 100644 --- a/include/linux/kvm.h +++ b/include/linux/kvm.h @@ -557,6 +557,7 @@ struct kvm_ppc_pvinfo { #define KVM_CAP_MAX_VCPUS 66 /* returns max vcpus per vm */ #define KVM_CAP_PPC_PAPR 68 #define KVM_CAP_S390_GMAP 71 +#define KVM_CAP_TSC_DEADLINE_TIMER 72 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 36cc66d638d3ffbc635b0d48b29c1128fdad38f4 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:08:52 +0000 Subject: KVM: PPC: move compute_tlbie_rb to book3s_64 common header compute_tlbie_rb is only used on ppc64 and cannot be compiled on ppc32. Signed-off-by: Andreas Schwab Signed-off-by: Alexander Graf --- arch/powerpc/include/asm/kvm_book3s.h | 33 -------------------------------- arch/powerpc/include/asm/kvm_book3s_64.h | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index d4df013ad77..69c7377d207 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -381,39 +381,6 @@ static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) } #endif -static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, - unsigned long pte_index) -{ - unsigned long rb, va_low; - - rb = (v & ~0x7fUL) << 16; /* AVA field */ - va_low = pte_index >> 3; - if (v & HPTE_V_SECONDARY) - va_low = ~va_low; - /* xor vsid from AVA */ - if (!(v & HPTE_V_1TB_SEG)) - va_low ^= v >> 12; - else - va_low ^= v >> 24; - va_low &= 0x7ff; - if (v & HPTE_V_LARGE) { - rb |= 1; /* L field */ - if (cpu_has_feature(CPU_FTR_ARCH_206) && - (r & 0xff000)) { - /* non-16MB large page, must be 64k */ - /* (masks depend on page size) */ - rb |= 0x1000; /* page encoding in LP field */ - rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ - rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ - } - } else { - /* 4kB page */ - rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ - } - rb |= (v >> 54) & 0x300; /* B field */ - return rb; -} - /* Magic register values loaded into r3 and r4 before the 'sc' assembly * instruction for the OSI hypercalls */ #define OSI_SC_MAGIC_R3 0x113724FA diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index e43fe42b987..d0ac94f98f9 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -29,4 +29,37 @@ static inline struct kvmppc_book3s_shadow_vcpu *to_svcpu(struct kvm_vcpu *vcpu) #define SPAPR_TCE_SHIFT 12 +static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r, + unsigned long pte_index) +{ + unsigned long rb, va_low; + + rb = (v & ~0x7fUL) << 16; /* AVA field */ + va_low = pte_index >> 3; + if (v & HPTE_V_SECONDARY) + va_low = ~va_low; + /* xor vsid from AVA */ + if (!(v & HPTE_V_1TB_SEG)) + va_low ^= v >> 12; + else + va_low ^= v >> 24; + va_low &= 0x7ff; + if (v & HPTE_V_LARGE) { + rb |= 1; /* L field */ + if (cpu_has_feature(CPU_FTR_ARCH_206) && + (r & 0xff000)) { + /* non-16MB large page, must be 64k */ + /* (masks depend on page size) */ + rb |= 0x1000; /* page encoding in LP field */ + rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */ + rb |= (va_low & 0xfe); /* AVAL field (P7 doesn't seem to care) */ + } + } else { + /* 4kB page */ + rb |= (va_low & 0x7ff) << 12; /* remaining 11b of VA */ + } + rb |= (v >> 54) & 0x300; /* B field */ + return rb; +} + #endif /* __ASM_KVM_BOOK3S_64_H__ */ -- cgit v1.2.3 From 96f38d72867bc54c312decaf8463f1e9607136da Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 8 Nov 2011 07:17:39 +0000 Subject: KVM: PPC: protect use of kvmppc_h_pr kvmppc_h_pr is only available if CONFIG_KVM_BOOK3S_64_PR. Signed-off-by: Andreas Schwab Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_pr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 3c791e1eb67..e2cfb9e1e20 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -658,10 +658,12 @@ program_interrupt: ulong cmd = kvmppc_get_gpr(vcpu, 3); int i; +#ifdef CONFIG_KVM_BOOK3S_64_PR if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { r = RESUME_GUEST; break; } +#endif run->papr_hcall.nr = cmd; for (i = 0; i < 9; ++i) { -- cgit v1.2.3 From 251da03897b383904901620835044e298061875f Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 10 Nov 2011 16:03:20 +0000 Subject: KVM: PPC: fix kvmppc_start_thread() for CONFIG_SMP=N Currently kvmppc_start_thread() tries to wake other SMT threads via xics_wake_cpu(). Unfortunately xics_wake_cpu only exists when CONFIG_SMP=Y so when compiling with CONFIG_SMP=N we get: arch/powerpc/kvm/built-in.o: In function `.kvmppc_start_thread': book3s_hv.c:(.text+0xa1e0): undefined reference to `.xics_wake_cpu' The following should be fine since kvmppc_start_thread() shouldn't called to start non-zero threads when SMP=N since threads_per_core=1. Signed-off-by: Michael Neuling Signed-off-by: Alexander Graf --- arch/powerpc/kvm/book3s_hv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 0cb137a9b03..336983da9e7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -538,7 +538,7 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu) tpaca->kvm_hstate.napping = 0; vcpu->cpu = vc->pcpu; smp_wmb(); -#ifdef CONFIG_PPC_ICP_NATIVE +#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) if (vcpu->arch.ptid) { tpaca->cpu_start = 0x80; wmb(); -- cgit v1.2.3 From fae9dbb4b462d2c908186a47464c7a5299ee27a9 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 20 Dec 2011 14:43:45 +0000 Subject: KVM: PPC: e500: include linux/export.h This is required for THIS_MODULE. We recently stopped acquiring it via some other header. Signed-off-by: Scott Wood Signed-off-by: Alexander Graf --- arch/powerpc/kvm/e500.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c index 26d20903f2b..8c0d45a6faf 100644 --- a/arch/powerpc/kvm/e500.c +++ b/arch/powerpc/kvm/e500.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include -- cgit v1.2.3