aboutsummaryrefslogtreecommitdiff
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig36
-rw-r--r--arch/x86/include/asm/hw_irq.h84
-rw-r--r--arch/x86/include/asm/io_apic.h35
-rw-r--r--arch/x86/include/asm/irq_vectors.h6
-rw-r--r--arch/x86/include/asm/kvm_host.h37
-rw-r--r--arch/x86/include/asm/pci.h3
-rw-r--r--arch/x86/include/asm/pci_x86.h2
-rw-r--r--arch/x86/include/asm/pgtable_types.h1
-rw-r--r--arch/x86/include/asm/spinlock.h8
-rw-r--r--arch/x86/include/asm/vmx.h3
-rw-r--r--arch/x86/include/asm/xen/page.h64
-rw-r--r--arch/x86/include/asm/xsave.h1
-rw-r--r--arch/x86/include/uapi/asm/ldt.h7
-rw-r--r--arch/x86/include/uapi/asm/vmx.h6
-rw-r--r--arch/x86/kernel/acpi/boot.c99
-rw-r--r--arch/x86/kernel/apic/Makefile4
-rw-r--r--arch/x86/kernel/apic/apic.c22
-rw-r--r--arch/x86/kernel/apic/htirq.c107
-rw-r--r--arch/x86/kernel/apic/io_apic.c1356
-rw-r--r--arch/x86/kernel/apic/msi.c286
-rw-r--r--arch/x86/kernel/apic/vector.c719
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_uncore.c22
-rw-r--r--arch/x86/kernel/crash.c1
-rw-r--r--arch/x86/kernel/entry_32.S4
-rw-r--r--arch/x86/kernel/entry_64.S4
-rw-r--r--arch/x86/kernel/irqinit.c35
-rw-r--r--arch/x86/kernel/kvm.c9
-rw-r--r--arch/x86/kernel/kvmclock.c20
-rw-r--r--arch/x86/kernel/machine_kexec_32.c1
-rw-r--r--arch/x86/kernel/machine_kexec_64.c1
-rw-r--r--arch/x86/kernel/reboot.c1
-rw-r--r--arch/x86/kernel/smpboot.c8
-rw-r--r--arch/x86/kernel/tls.c6
-rw-r--r--arch/x86/kernel/traps.c2
-rw-r--r--arch/x86/kernel/xsave.c1
-rw-r--r--arch/x86/kvm/Makefile7
-rw-r--r--arch/x86/kvm/assigned-dev.c1052
-rw-r--r--arch/x86/kvm/assigned-dev.h32
-rw-r--r--arch/x86/kvm/cpuid.c57
-rw-r--r--arch/x86/kvm/emulate.c408
-rw-r--r--arch/x86/kvm/ioapic.c675
-rw-r--r--arch/x86/kvm/ioapic.h119
-rw-r--r--arch/x86/kvm/iommu.c353
-rw-r--r--arch/x86/kvm/irq_comm.c332
-rw-r--r--arch/x86/kvm/lapic.c210
-rw-r--r--arch/x86/kvm/lapic.h14
-rw-r--r--arch/x86/kvm/mmu.c7
-rw-r--r--arch/x86/kvm/svm.c24
-rw-r--r--arch/x86/kvm/trace.h37
-rw-r--r--arch/x86/kvm/vmx.c608
-rw-r--r--arch/x86/kvm/x86.c226
-rw-r--r--arch/x86/kvm/x86.h3
-rw-r--r--arch/x86/lguest/boot.c2
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/gup.c2
-rw-r--r--arch/x86/mm/pageattr.c20
-rw-r--r--arch/x86/pci/intel_mid_pci.c10
-rw-r--r--arch/x86/pci/irq.c25
-rw-r--r--arch/x86/platform/uv/uv_irq.c6
-rw-r--r--arch/x86/xen/mmu.c40
-rw-r--r--arch/x86/xen/p2m.c1172
-rw-r--r--arch/x86/xen/setup.c441
-rw-r--r--arch/x86/xen/xen-ops.h6
63 files changed, 6011 insertions, 2880 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d69f1cd87fd9..ba397bde7948 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -249,10 +249,6 @@ config HAVE_INTEL_TXT
def_bool y
depends on INTEL_IOMMU && ACPI
-config X86_INTEL_MPX
- def_bool y
- depends on CPU_SUP_INTEL
-
config X86_32_SMP
def_bool y
depends on X86_32 && SMP
@@ -887,11 +883,11 @@ config X86_UP_IOAPIC
config X86_LOCAL_APIC
def_bool y
depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC || PCI_MSI
+ select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
config X86_IO_APIC
- def_bool y
- depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC || PCI_MSI
- select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
+ def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
+ depends on X86_LOCAL_APIC
select IRQ_DOMAIN
config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -1594,6 +1590,32 @@ config X86_SMAP
If unsure, say Y.
+config X86_INTEL_MPX
+ prompt "Intel MPX (Memory Protection Extensions)"
+ def_bool n
+ depends on CPU_SUP_INTEL
+ ---help---
+ MPX provides hardware features that can be used in
+ conjunction with compiler-instrumented code to check
+ memory references. It is designed to detect buffer
+ overflow or underflow bugs.
+
+ This option enables running applications which are
+ instrumented or otherwise use MPX. It does not use MPX
+ itself inside the kernel or to protect the kernel
+ against bad memory references.
+
+ Enabling this option will make the kernel larger:
+ ~8k of kernel text and 36 bytes of data on a 64-bit
+ defconfig. It adds a long to the 'mm_struct' which
+ will increase the kernel memory overhead of each
+ process and adds some branches to paths used during
+ exec() and munmap().
+
+ For details, see Documentation/x86/intel_mpx.txt
+
+ If unsure, say N.
+
config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 4615906d83df..9662290e0b20 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -94,30 +94,7 @@ extern void trace_call_function_single_interrupt(void);
#define trace_kvm_posted_intr_ipi kvm_posted_intr_ipi
#endif /* CONFIG_TRACING */
-/* IOAPIC */
-#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
-extern unsigned long io_apic_irqs;
-
-extern void setup_IO_APIC(void);
-extern void disable_IO_APIC(void);
-
-struct io_apic_irq_attr {
- int ioapic;
- int ioapic_pin;
- int trigger;
- int polarity;
-};
-
-static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
- int ioapic, int ioapic_pin,
- int trigger, int polarity)
-{
- irq_attr->ioapic = ioapic;
- irq_attr->ioapic_pin = ioapic_pin;
- irq_attr->trigger = trigger;
- irq_attr->polarity = polarity;
-}
-
+#ifdef CONFIG_IRQ_REMAP
/* Intel specific interrupt remapping information */
struct irq_2_iommu {
struct intel_iommu *iommu;
@@ -131,14 +108,12 @@ struct irq_2_irte {
u16 devid; /* Device ID for IRTE table */
u16 index; /* Index into IRTE table*/
};
+#endif /* CONFIG_IRQ_REMAP */
+
+#ifdef CONFIG_X86_LOCAL_APIC
+struct irq_data;
-/*
- * This is performance-critical, we want to do it O(1)
- *
- * Most irqs are mapped 1:1 with pins.
- */
struct irq_cfg {
- struct irq_pin_list *irq_2_pin;
cpumask_var_t domain;
cpumask_var_t old_domain;
u8 vector;
@@ -150,18 +125,39 @@ struct irq_cfg {
struct irq_2_irte irq_2_irte;
};
#endif
+ union {
+#ifdef CONFIG_X86_IO_APIC
+ struct {
+ struct list_head irq_2_pin;
+ };
+#endif
+ };
};
+extern struct irq_cfg *irq_cfg(unsigned int irq);
+extern struct irq_cfg *irqd_cfg(struct irq_data *irq_data);
+extern struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
+extern void lock_vector_lock(void);
+extern void unlock_vector_lock(void);
extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
+extern void clear_irq_vector(int irq, struct irq_cfg *cfg);
+extern void setup_vector_irq(int cpu);
+#ifdef CONFIG_SMP
extern void send_cleanup_vector(struct irq_cfg *);
+extern void irq_complete_move(struct irq_cfg *cfg);
+#else
+static inline void send_cleanup_vector(struct irq_cfg *c) { }
+static inline void irq_complete_move(struct irq_cfg *c) { }
+#endif
-struct irq_data;
-int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
- unsigned int *dest_id);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
-extern void setup_ioapic_dest(void);
-
-extern void enable_IO_APIC(void);
+extern int apic_retrigger_irq(struct irq_data *data);
+extern void apic_ack_edge(struct irq_data *data);
+extern int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id);
+#else /* CONFIG_X86_LOCAL_APIC */
+static inline void lock_vector_lock(void) {}
+static inline void unlock_vector_lock(void) {}
+#endif /* CONFIG_X86_LOCAL_APIC */
/* Statistics */
extern atomic_t irq_err_count;
@@ -185,7 +181,8 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
extern __visible void smp_invalidate_interrupt(struct pt_regs *);
#endif
-extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
+extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR
+ - FIRST_EXTERNAL_VECTOR])(void);
#ifdef CONFIG_TRACING
#define trace_interrupt interrupt
#endif
@@ -195,17 +192,6 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
typedef int vector_irq_t[NR_VECTORS];
DECLARE_PER_CPU(vector_irq_t, vector_irq);
-extern void setup_vector_irq(int cpu);
-
-#ifdef CONFIG_X86_IO_APIC
-extern void lock_vector_lock(void);
-extern void unlock_vector_lock(void);
-extern void __setup_vector_irq(int cpu);
-#else
-static inline void lock_vector_lock(void) {}
-static inline void unlock_vector_lock(void) {}
-static inline void __setup_vector_irq(int cpu) {}
-#endif
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 1733ab49ac5e..bf006cce9418 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -132,6 +132,10 @@ extern int noioapicquirk;
/* -1 if "noapic" boot option passed */
extern int noioapicreroute;
+extern unsigned long io_apic_irqs;
+
+#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1 << (x)) & io_apic_irqs))
+
/*
* If we use the IO-APIC for IRQ routing, disable automatic
* assignment of PCI IRQ's.
@@ -139,18 +143,15 @@ extern int noioapicreroute;
#define io_apic_assign_pci_irqs \
(mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
-struct io_apic_irq_attr;
struct irq_cfg;
extern void ioapic_insert_resources(void);
+extern int arch_early_ioapic_init(void);
extern int native_setup_ioapic_entry(int, struct IO_APIC_route_entry *,
unsigned int, int,
struct io_apic_irq_attr *);
extern void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg);
-extern void native_compose_msi_msg(struct pci_dev *pdev,
- unsigned int irq, unsigned int dest,
- struct msi_msg *msg, u8 hpet_id);
extern void native_eoi_ioapic_pin(int apic, int pin, int vector);
extern int save_ioapic_entries(void);
@@ -160,6 +161,13 @@ extern int restore_ioapic_entries(void);
extern void setup_ioapic_ids_from_mpc(void);
extern void setup_ioapic_ids_from_mpc_nocheck(void);
+struct io_apic_irq_attr {
+ int ioapic;
+ int ioapic_pin;
+ int trigger;
+ int polarity;
+};
+
enum ioapic_domain_type {
IOAPIC_DOMAIN_INVALID,
IOAPIC_DOMAIN_LEGACY,
@@ -188,8 +196,10 @@ extern int mp_find_ioapic_pin(int ioapic, u32 gsi);
extern u32 mp_pin_to_gsi(int ioapic, int pin);
extern int mp_map_gsi_to_irq(u32 gsi, unsigned int flags);
extern void mp_unmap_irq(int irq);
-extern void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
- struct ioapic_domain_cfg *cfg);
+extern int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg);
+extern int mp_unregister_ioapic(u32 gsi_base);
+extern int mp_ioapic_registered(u32 gsi_base);
extern int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
irq_hw_number_t hwirq);
extern void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq);
@@ -227,19 +237,25 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned
extern void io_apic_eoi(unsigned int apic, unsigned int vector);
-extern bool mp_should_keep_irq(struct device *dev);
-
+extern void setup_IO_APIC(void);
+extern void enable_IO_APIC(void);
+extern void disable_IO_APIC(void);
+extern void setup_ioapic_dest(void);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin);
+extern void print_IO_APICs(void);
#else /* !CONFIG_X86_IO_APIC */
+#define IO_APIC_IRQ(x) 0
#define io_apic_assign_pci_irqs 0
#define setup_ioapic_ids_from_mpc x86_init_noop
static inline void ioapic_insert_resources(void) { }
+static inline int arch_early_ioapic_init(void) { return 0; }
+static inline void print_IO_APICs(void) {}
#define gsi_top (NR_IRQS_LEGACY)
static inline int mp_find_ioapic(u32 gsi) { return 0; }
static inline u32 mp_pin_to_gsi(int ioapic, int pin) { return UINT_MAX; }
static inline int mp_map_gsi_to_irq(u32 gsi, unsigned int flags) { return gsi; }
static inline void mp_unmap_irq(int irq) { }
-static inline bool mp_should_keep_irq(struct device *dev) { return 1; }
static inline int save_ioapic_entries(void)
{
@@ -262,7 +278,6 @@ static inline void disable_ioapic_support(void) { }
#define native_io_apic_print_entries NULL
#define native_ioapic_set_affinity NULL
#define native_setup_ioapic_entry NULL
-#define native_compose_msi_msg NULL
#define native_eoi_ioapic_pin NULL
#endif
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 5702d7e3111d..666c89ec4bd7 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -126,6 +126,12 @@
#define NR_VECTORS 256
+#ifdef CONFIG_X86_LOCAL_APIC
+#define FIRST_SYSTEM_VECTOR LOCAL_TIMER_VECTOR
+#else
+#define FIRST_SYSTEM_VECTOR NR_VECTORS
+#endif
+
#define FPU_IRQ 13
#define FIRST_VM86_IRQ 3
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 6ed0c30d6a0c..d89c6b828c96 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -33,7 +33,7 @@
#define KVM_MAX_VCPUS 255
#define KVM_SOFT_MAX_VCPUS 160
-#define KVM_USER_MEM_SLOTS 125
+#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
#define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS)
@@ -51,6 +51,7 @@
| X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
#define CR3_L_MODE_RESERVED_BITS 0xFFFFFF0000000000ULL
+#define CR3_PCID_INVD (1UL << 63)
#define CR4_RESERVED_BITS \
(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
| X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
@@ -361,6 +362,7 @@ struct kvm_vcpu_arch {
int mp_state;
u64 ia32_misc_enable_msr;
bool tpr_access_reporting;
+ u64 ia32_xss;
/*
* Paging state of the vcpu
@@ -542,7 +544,7 @@ struct kvm_apic_map {
struct rcu_head rcu;
u8 ldr_bits;
/* fields bellow are used to decode ldr values in different modes */
- u32 cid_shift, cid_mask, lid_mask;
+ u32 cid_shift, cid_mask, lid_mask, broadcast;
struct kvm_lapic *phys_map[256];
/* first index is cluster id second is cpu id in a cluster */
struct kvm_lapic *logical_map[16][16];
@@ -602,6 +604,9 @@ struct kvm_arch {
struct kvm_xen_hvm_config xen_hvm_config;
+ /* reads protected by irq_srcu, writes by irq_lock */
+ struct hlist_head mask_notifier_list;
+
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
@@ -659,6 +664,16 @@ struct msr_data {
u64 data;
};
+struct kvm_lapic_irq {
+ u32 vector;
+ u32 delivery_mode;
+ u32 dest_mode;
+ u32 level;
+ u32 trig_mode;
+ u32 shorthand;
+ u32 dest_id;
+};
+
struct kvm_x86_ops {
int (*cpu_has_kvm_support)(void); /* __init */
int (*disabled_by_bios)(void); /* __init */
@@ -767,6 +782,7 @@ struct kvm_x86_ops {
enum x86_intercept_stage stage);
void (*handle_external_intr)(struct kvm_vcpu *vcpu);
bool (*mpx_supported)(void);
+ bool (*xsaves_supported)(void);
int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr);
@@ -818,6 +834,19 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
const void *val, int bytes);
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
+struct kvm_irq_mask_notifier {
+ void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked);
+ int irq;
+ struct hlist_node link;
+};
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask);
+
extern bool tdp_enabled;
u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu);
@@ -863,7 +892,7 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
-void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector);
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector);
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code);
@@ -895,6 +924,7 @@ int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
static inline int __kvm_irq_line_state(unsigned long *irq_state,
int irq_source_id, int level)
@@ -1066,6 +1096,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
void kvm_define_shared_msr(unsigned index, u32 msr);
int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 0892ea0e683f..4e370a5d8117 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -96,12 +96,15 @@ extern void pci_iommu_alloc(void);
#ifdef CONFIG_PCI_MSI
/* implemented in arch/x86/kernel/apic/io_apic. */
struct msi_desc;
+void native_compose_msi_msg(struct pci_dev *pdev, unsigned int irq,
+ unsigned int dest, struct msi_msg *msg, u8 hpet_id);
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
void native_teardown_msi_irq(unsigned int irq);
void native_restore_msi_irqs(struct pci_dev *dev);
int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
unsigned int irq_base, unsigned int irq_offset);
#else
+#define native_compose_msi_msg NULL
#define native_setup_msi_irqs NULL
#define native_teardown_msi_irq NULL
#endif
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index fa1195dae425..164e3f8d3c3d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock;
extern int (*pcibios_enable_irq)(struct pci_dev *dev);
extern void (*pcibios_disable_irq)(struct pci_dev *dev);
+extern bool mp_should_keep_irq(struct device *dev);
+
struct pci_raw_ops {
int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
int reg, int len, u32 *val);
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index af447f95e3be..25bcd4a89517 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -452,6 +452,7 @@ static inline void update_page_count(int level, unsigned long pages) { }
extern pte_t *lookup_address(unsigned long address, unsigned int *level);
extern pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address,
unsigned int *level);
+extern pmd_t *lookup_pmd_address(unsigned long address);
extern phys_addr_t slow_virt_to_phys(void *__address);
extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address,
unsigned numpages, unsigned long page_flags);
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index a4efe477ceab..625660f8a2fc 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -92,7 +92,7 @@ static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
unsigned count = SPIN_THRESHOLD;
do {
- if (ACCESS_ONCE(lock->tickets.head) == inc.tail)
+ if (READ_ONCE(lock->tickets.head) == inc.tail)
goto out;
cpu_relax();
} while (--count);
@@ -105,7 +105,7 @@ static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
{
arch_spinlock_t old, new;
- old.tickets = ACCESS_ONCE(lock->tickets);
+ old.tickets = READ_ONCE(lock->tickets);
if (old.tickets.head != (old.tickets.tail & ~TICKET_SLOWPATH_FLAG))
return 0;
@@ -162,14 +162,14 @@ static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
static inline int arch_spin_is_locked(arch_spinlock_t *lock)
{
- struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+ struct __raw_tickets tmp = READ_ONCE(lock->tickets);
return tmp.tail != tmp.head;
}
static inline int arch_spin_is_contended(arch_spinlock_t *lock)
{
- struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets);
+ struct __raw_tickets tmp = READ_ONCE(lock->tickets);
return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC;
}
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index bcbfade26d8d..45afaee9555c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -69,6 +69,7 @@
#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000
#define SECONDARY_EXEC_SHADOW_VMCS 0x00004000
+#define SECONDARY_EXEC_XSAVES 0x00100000
#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -159,6 +160,8 @@ enum vmcs_field {
EOI_EXIT_BITMAP3_HIGH = 0x00002023,
VMREAD_BITMAP = 0x00002026,
VMWRITE_BITMAP = 0x00002028,
+ XSS_EXIT_BITMAP = 0x0000202C,
+ XSS_EXIT_BITMAP_HIGH = 0x0000202D,
GUEST_PHYSICAL_ADDRESS = 0x00002400,
GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401,
VMCS_LINK_POINTER = 0x00002800,
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index f58ef6c0613b..5eea09915a15 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -41,10 +41,12 @@ typedef struct xpaddr {
extern unsigned long *machine_to_phys_mapping;
extern unsigned long machine_to_phys_nr;
+extern unsigned long *xen_p2m_addr;
+extern unsigned long xen_p2m_size;
+extern unsigned long xen_max_p2m_pfn;
extern unsigned long get_phys_to_machine(unsigned long pfn);
extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
-extern bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
extern unsigned long set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e);
@@ -52,17 +54,52 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,
extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
-extern int m2p_add_override(unsigned long mfn, struct page *page,
- struct gnttab_map_grant_ref *kmap_op);
extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
struct gnttab_map_grant_ref *kmap_ops,
struct page **pages, unsigned int count);
-extern int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op,
- unsigned long mfn);
-extern struct page *m2p_find_override(unsigned long mfn);
extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
+/*
+ * Helper functions to write or read unsigned long values to/from
+ * memory, when the access may fault.
+ */
+static inline int xen_safe_write_ulong(unsigned long *addr, unsigned long val)
+{
+ return __put_user(val, (unsigned long __user *)addr);
+}
+
+static inline int xen_safe_read_ulong(unsigned long *addr, unsigned long *val)
+{
+ return __get_user(*val, (unsigned long __user *)addr);
+}
+
+/*
+ * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine():
+ * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator
+ * bits (identity or foreign) are set.
+ * - __pfn_to_mfn() returns the found entry of the p2m table. A possibly set
+ * identity or foreign indicator will be still set. __pfn_to_mfn() is
+ * encapsulating get_phys_to_machine() which is called in special cases only.
+ * - get_phys_to_machine() is to be called by __pfn_to_mfn() only in special
+ * cases needing an extended handling.
+ */
+static inline unsigned long __pfn_to_mfn(unsigned long pfn)
+{
+ unsigned long mfn;
+
+ if (pfn < xen_p2m_size)
+ mfn = xen_p2m_addr[pfn];
+ else if (unlikely(pfn < xen_max_p2m_pfn))
+ return get_phys_to_machine(pfn);
+ else
+ return IDENTITY_FRAME(pfn);
+
+ if (unlikely(mfn == INVALID_P2M_ENTRY))
+ return get_phys_to_machine(pfn);
+
+ return mfn;
+}
+
static inline unsigned long pfn_to_mfn(unsigned long pfn)
{
unsigned long mfn;
@@ -70,7 +107,7 @@ static inline unsigned long pfn_to_mfn(unsigned long pfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return pfn;
- mfn = get_phys_to_machine(pfn);
+ mfn = __pfn_to_mfn(pfn);
if (mfn != INVALID_P2M_ENTRY)
mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
@@ -83,7 +120,7 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
if (xen_feature(XENFEAT_auto_translated_physmap))
return 1;
- return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
+ return __pfn_to_mfn(pfn) != INVALID_P2M_ENTRY;
}
static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
@@ -102,7 +139,7 @@ static inline unsigned long mfn_to_pfn_no_overrides(unsigned long mfn)
* In such cases it doesn't matter what we return (we return garbage),
* but we must handle the fault without crashing!
*/
- ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
+ ret = xen_safe_read_ulong(&machine_to_phys_mapping[mfn], &pfn);
if (ret < 0)
return ~0;
@@ -117,7 +154,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
return mfn;
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) != mfn) {
+ if (__pfn_to_mfn(pfn) != mfn) {
/*
* If this appears to be a foreign mfn (because the pfn
* doesn't map back to the mfn), then check the local override
@@ -133,8 +170,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
* entry doesn't map back to the mfn and m2p_override doesn't have a
* valid entry for it.
*/
- if (pfn == ~0 &&
- get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
+ if (pfn == ~0 && __pfn_to_mfn(mfn) == IDENTITY_FRAME(mfn))
pfn = mfn;
return pfn;
@@ -180,7 +216,7 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
return mfn;
pfn = mfn_to_pfn(mfn);
- if (get_phys_to_machine(pfn) != mfn)
+ if (__pfn_to_mfn(pfn) != mfn)
return -1; /* force !pfn_valid() */
return pfn;
}
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 7e7a79ada658..5fa9770035dc 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -16,6 +16,7 @@
#define XSTATE_Hi16_ZMM 0x80
#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
+#define XSTATE_AVX512 (XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM)
/* Bit 63 of XCR0 is reserved for future expansion */
#define XSTATE_EXTEND_MASK (~(XSTATE_FPSSE | (1ULL << 63)))
diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h
index 46727eb37bfe..6e1aaf73852a 100644
--- a/arch/x86/include/uapi/asm/ldt.h
+++ b/arch/x86/include/uapi/asm/ldt.h
@@ -28,6 +28,13 @@ struct user_desc {
unsigned int seg_not_present:1;
unsigned int useable:1;
#ifdef __x86_64__
+ /*
+ * Because this bit is not present in 32-bit user code, user
+ * programs can pass uninitialized values here. Therefore, in
+ * any context in which a user_desc comes from a 32-bit program,
+ * the kernel must act as though lm == 0, regardless of the
+ * actual value.
+ */
unsigned int lm:1;
#endif
};
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 990a2fe1588d..b813bf9da1e2 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -72,6 +72,8 @@
#define EXIT_REASON_XSETBV 55
#define EXIT_REASON_APIC_WRITE 56
#define EXIT_REASON_INVPCID 58
+#define EXIT_REASON_XSAVES 63
+#define EXIT_REASON_XRSTORS 64
#define VMX_EXIT_REASONS \
{ EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \
@@ -116,6 +118,8 @@
{ EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \
{ EXIT_REASON_INVD, "INVD" }, \
{ EXIT_REASON_INVVPID, "INVVPID" }, \
- { EXIT_REASON_INVPCID, "INVPCID" }
+ { EXIT_REASON_INVPCID, "INVPCID" }, \
+ { EXIT_REASON_XSAVES, "XSAVES" }, \
+ { EXIT_REASON_XRSTORS, "XRSTORS" }
#endif /* _UAPIVMX_H */
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index a142e77693e1..4433a4be8171 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -76,6 +76,19 @@ int acpi_fix_pin2_polarity __initdata;
static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
#endif
+/*
+ * Locks related to IOAPIC hotplug
+ * Hotplug side:
+ * ->device_hotplug_lock
+ * ->acpi_ioapic_lock
+ * ->ioapic_lock
+ * Interrupt mapping side:
+ * ->acpi_ioapic_lock
+ * ->ioapic_mutex
+ * ->ioapic_lock
+ */
+static DEFINE_MUTEX(acpi_ioapic_lock);
+
/* --------------------------------------------------------------------------
Boot-time Configuration
-------------------------------------------------------------------------- */
@@ -395,10 +408,6 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return gsi;
- /* Don't set up the ACPI SCI because it's already set up */
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return mp_map_gsi_to_irq(gsi, IOAPIC_MAP_ALLOC);
-
trigger = trigger == ACPI_EDGE_SENSITIVE ? 0 : 1;
polarity = polarity == ACPI_ACTIVE_HIGH ? 0 : 1;
node = dev ? dev_to_node(dev) : NUMA_NO_NODE;
@@ -411,7 +420,8 @@ static int mp_register_gsi(struct device *dev, u32 gsi, int trigger,
if (irq < 0)
return irq;
- if (enable_update_mptable)
+ /* Don't set up the ACPI SCI because it's already set up */
+ if (enable_update_mptable && acpi_gbl_FADT.sci_interrupt != gsi)
mp_config_acpi_gsi(dev, gsi, trigger, polarity);
return irq;
@@ -424,9 +434,6 @@ static void mp_unregister_gsi(u32 gsi)
if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
return;
- if (acpi_gbl_FADT.sci_interrupt == gsi)
- return;
-
irq = mp_map_gsi_to_irq(gsi, 0);
if (irq > 0)
mp_unmap_irq(irq);
@@ -609,8 +616,10 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp)
if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
*irqp = gsi;
} else {
+ mutex_lock(&acpi_ioapic_lock);
irq = mp_map_gsi_to_irq(gsi,
IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK);
+ mutex_unlock(&acpi_ioapic_lock);
if (irq < 0)
return -1;
*irqp = irq;
@@ -650,7 +659,9 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
int irq = gsi;
#ifdef CONFIG_X86_IO_APIC
+ mutex_lock(&acpi_ioapic_lock);
irq = mp_register_gsi(dev, gsi, trigger, polarity);
+ mutex_unlock(&acpi_ioapic_lock);
#endif
return irq;
@@ -659,7 +670,9 @@ static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
static void acpi_unregister_gsi_ioapic(u32 gsi)
{
#ifdef CONFIG_X86_IO_APIC
+ mutex_lock(&acpi_ioapic_lock);
mp_unregister_gsi(gsi);
+ mutex_unlock(&acpi_ioapic_lock);
#endif
}
@@ -690,6 +703,7 @@ void acpi_unregister_gsi(u32 gsi)
}
EXPORT_SYMBOL_GPL(acpi_unregister_gsi);
+#ifdef CONFIG_X86_LOCAL_APIC
static void __init acpi_set_irq_model_ioapic(void)
{
acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
@@ -697,6 +711,7 @@ static void __init acpi_set_irq_model_ioapic(void)
__acpi_unregister_gsi = acpi_unregister_gsi_ioapic;
acpi_ioapic = 1;
}
+#endif
/*
* ACPI based hotplug support for CPU
@@ -759,20 +774,74 @@ EXPORT_SYMBOL(acpi_unmap_lsapic);
int acpi_register_ioapic(acpi_handle handle, u64 phys_addr, u32 gsi_base)
{
- /* TBD */
- return -EINVAL;
-}
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ int ioapic_id;
+ u64 addr;
+ struct ioapic_domain_cfg cfg = {
+ .type = IOAPIC_DOMAIN_DYNAMIC,
+ .ops = &acpi_irqdomain_ops,
+ };
+
+ ioapic_id = acpi_get_ioapic_id(handle, gsi_base, &addr);
+ if (ioapic_id < 0) {
+ unsigned long long uid;
+ acpi_status status;
+ status = acpi_evaluate_integer(handle, METHOD_NAME__UID,
+ NULL, &uid);
+ if (ACPI_FAILURE(status)) {
+ acpi_handle_warn(handle, "failed to get IOAPIC ID.\n");
+ return -EINVAL;
+ }
+ ioapic_id = (int)uid;
+ }
+
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_register_ioapic(ioapic_id, phys_addr, gsi_base, &cfg);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
EXPORT_SYMBOL(acpi_register_ioapic);
int acpi_unregister_ioapic(acpi_handle handle, u32 gsi_base)
{
- /* TBD */
- return -EINVAL;
-}
+ int ret = -ENOSYS;
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_unregister_ioapic(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
EXPORT_SYMBOL(acpi_unregister_ioapic);
+/**
+ * acpi_ioapic_registered - Check whether IOAPIC assoicatied with @gsi_base
+ * has been registered
+ * @handle: ACPI handle of the IOAPIC deivce
+ * @gsi_base: GSI base associated with the IOAPIC
+ *
+ * Assume caller holds some type of lock to serialize acpi_ioapic_registered()
+ * with acpi_register_ioapic()/acpi_unregister_ioapic().
+ */
+int acpi_ioapic_registered(acpi_handle handle, u32 gsi_base)
+{
+ int ret = 0;
+
+#ifdef CONFIG_ACPI_HOTPLUG_IOAPIC
+ mutex_lock(&acpi_ioapic_lock);
+ ret = mp_ioapic_registered(gsi_base);
+ mutex_unlock(&acpi_ioapic_lock);
+#endif
+
+ return ret;
+}
+
static int __init acpi_parse_sbf(struct acpi_table_header *table)
{
struct acpi_table_boot *sb;
@@ -1185,7 +1254,9 @@ static void __init acpi_process_madt(void)
/*
* Parse MADT IO-APIC entries
*/
+ mutex_lock(&acpi_ioapic_lock);
error = acpi_parse_madt_ioapic_entries();
+ mutex_unlock(&acpi_ioapic_lock);
if (!error) {
acpi_set_irq_model_ioapic();
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index dcb5b15401ce..8bb12ddc5db8 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,10 +2,12 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o vector.o
obj-y += hw_nmi.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
+obj-$(CONFIG_PCI_MSI) += msi.o
+obj-$(CONFIG_HT_IRQ) += htirq.o
obj-$(CONFIG_SMP) += ipi.o
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ba6cc041edb1..29b5b18afa27 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -196,7 +196,7 @@ static int disable_apic_timer __initdata;
int local_apic_timer_c2_ok;
EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
-int first_system_vector = 0xfe;
+int first_system_vector = FIRST_SYSTEM_VECTOR;
/*
* Debug level, exported for io_apic.c
@@ -1930,7 +1930,7 @@ int __init APIC_init_uniprocessor(void)
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
*/
-static inline void __smp_spurious_interrupt(void)
+static inline void __smp_spurious_interrupt(u8 vector)
{
u32 v;
@@ -1939,30 +1939,32 @@ static inline void __smp_spurious_interrupt(void)
* if it is a vectored one. Just in case...
* Spurious interrupts should not be ACKed.
*/
- v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
- if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
+ v = apic_read(APIC_ISR + ((vector & ~0x1f) >> 1));
+ if (v & (1 << (vector & 0x1f)))
ack_APIC_irq();
inc_irq_stat(irq_spurious_count);
/* see sw-dev-man vol 3, chapter 7.4.13.5 */
- pr_info("spurious APIC interrupt on CPU#%d, "
- "should never happen.\n", smp_processor_id());
+ pr_info("spurious APIC interrupt through vector %02x on CPU#%d, "
+ "should never happen.\n", vector, smp_processor_id());
}
__visible void smp_spurious_interrupt(struct pt_regs *regs)
{
entering_irq();
- __smp_spurious_interrupt();
+ __smp_spurious_interrupt(~regs->orig_ax);
exiting_irq();
}
__visible void smp_trace_spurious_interrupt(struct pt_regs *regs)
{
+ u8 vector = ~regs->orig_ax;
+
entering_irq();
- trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR);
- __smp_spurious_interrupt();
- trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR);
+ trace_spurious_apic_entry(vector);
+ __smp_spurious_interrupt(vector);
+ trace_spurious_apic_exit(vector);
exiting_irq();
}
diff --git a/arch/x86/kernel/apic/htirq.c b/arch/x86/kernel/apic/htirq.c
new file mode 100644
index 000000000000..816f36e979ad
--- /dev/null
+++ b/arch/x86/kernel/apic/htirq.c
@@ -0,0 +1,107 @@
+/*
+ * Support Hypertransport IRQ
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/pci.h>
+#include <linux/htirq.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/hypertransport.h>
+
+/*
+ * Hypertransport interrupt support
+ */
+static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
+{
+ struct ht_irq_msg msg;
+
+ fetch_ht_irq_msg(irq, &msg);
+
+ msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
+ msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
+
+ msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
+ msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
+
+ write_ht_irq_msg(irq, &msg);
+}
+
+static int
+ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ unsigned int dest;
+ int ret;
+
+ ret = apic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
+
+ target_ht_irq(data->irq, dest, cfg->vector);
+ return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static struct irq_chip ht_irq_chip = {
+ .name = "PCI-HT",
+ .irq_mask = mask_ht_irq,
+ .irq_unmask = unmask_ht_irq,
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = ht_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
+{
+ struct irq_cfg *cfg;
+ struct ht_irq_msg msg;
+ unsigned dest;
+ int err;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (err)
+ return err;
+
+ err = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus(), &dest);
+ if (err)
+ return err;
+
+ msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
+
+ msg.address_lo =
+ HT_IRQ_LOW_BASE |
+ HT_IRQ_LOW_DEST_ID(dest) |
+ HT_IRQ_LOW_VECTOR(cfg->vector) |
+ ((apic->irq_dest_mode == 0) ?
+ HT_IRQ_LOW_DM_PHYSICAL :
+ HT_IRQ_LOW_DM_LOGICAL) |
+ HT_IRQ_LOW_RQEOI_EDGE |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ HT_IRQ_LOW_MT_FIXED :
+ HT_IRQ_LOW_MT_ARBITRATED) |
+ HT_IRQ_LOW_IRQ_MASKED;
+
+ write_ht_irq_msg(irq, &msg);
+
+ irq_set_chip_and_handler_name(irq, &ht_irq_chip,
+ handle_edge_irq, "edge");
+
+ dev_dbg(&dev->dev, "irq %d for HT\n", irq);
+
+ return 0;
+}
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 7ffe0a2b870f..3f5f60406ab1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -32,15 +32,11 @@
#include <linux/module.h>
#include <linux/syscore_ops.h>
#include <linux/irqdomain.h>
-#include <linux/msi.h>
-#include <linux/htirq.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
#include <linux/slab.h>
#include <linux/bootmem.h>
-#include <linux/dmar.h>
-#include <linux/hpet.h>
#include <asm/idle.h>
#include <asm/io.h>
@@ -52,17 +48,12 @@
#include <asm/dma.h>
#include <asm/timer.h>
#include <asm/i8259.h>
-#include <asm/msidef.h>
-#include <asm/hypertransport.h>
#include <asm/setup.h>
#include <asm/irq_remapping.h>
-#include <asm/hpet.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
-#define __apicdebuginit(type) static type __init
-
#define for_each_ioapic(idx) \
for ((idx) = 0; (idx) < nr_ioapics; (idx)++)
#define for_each_ioapic_reverse(idx) \
@@ -74,7 +65,7 @@
for_each_pin((idx), (pin))
#define for_each_irq_pin(entry, head) \
- for (entry = head; entry; entry = entry->next)
+ list_for_each_entry(entry, &head, list)
/*
* Is the SiS APIC rmw bug present ?
@@ -83,7 +74,6 @@
int sis_apic_bug = -1;
static DEFINE_RAW_SPINLOCK(ioapic_lock);
-static DEFINE_RAW_SPINLOCK(vector_lock);
static DEFINE_MUTEX(ioapic_mutex);
static unsigned int ioapic_dynirq_base;
static int ioapic_initialized;
@@ -112,6 +102,7 @@ static struct ioapic {
struct ioapic_domain_cfg irqdomain_cfg;
struct irq_domain *irqdomain;
struct mp_pin_info *pin_info;
+ struct resource *iomem_res;
} ioapics[MAX_IO_APICS];
#define mpc_ioapic_ver(ioapic_idx) ioapics[ioapic_idx].mp_config.apicver
@@ -205,8 +196,6 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
-static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node);
-
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
{
@@ -228,8 +217,8 @@ void mp_save_irq(struct mpc_intsrc *m)
}
struct irq_pin_list {
+ struct list_head list;
int apic, pin;
- struct irq_pin_list *next;
};
static struct irq_pin_list *alloc_irq_pin_list(int node)
@@ -237,7 +226,26 @@ static struct irq_pin_list *alloc_irq_pin_list(int node)
return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
}
-int __init arch_early_irq_init(void)
+static void alloc_ioapic_saved_registers(int idx)
+{
+ size_t size;
+
+ if (ioapics[idx].saved_registers)
+ return;
+
+ size = sizeof(struct IO_APIC_route_entry) * ioapics[idx].nr_registers;
+ ioapics[idx].saved_registers = kzalloc(size, GFP_KERNEL);
+ if (!ioapics[idx].saved_registers)
+ pr_err("IOAPIC %d: suspend/resume impossible!\n", idx);
+}
+
+static void free_ioapic_saved_registers(int idx)
+{
+ kfree(ioapics[idx].saved_registers);
+ ioapics[idx].saved_registers = NULL;
+}
+
+int __init arch_early_ioapic_init(void)
{
struct irq_cfg *cfg;
int i, node = cpu_to_node(0);
@@ -245,13 +253,8 @@ int __init arch_early_irq_init(void)
if (!nr_legacy_irqs())
io_apic_irqs = ~0UL;
- for_each_ioapic(i) {
- ioapics[i].saved_registers =
- kzalloc(sizeof(struct IO_APIC_route_entry) *
- ioapics[i].nr_registers, GFP_KERNEL);
- if (!ioapics[i].saved_registers)
- pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
- }
+ for_each_ioapic(i)
+ alloc_ioapic_saved_registers(i);
/*
* For legacy IRQ's, start with assigning irq0 to irq15 to
@@ -266,61 +269,6 @@ int __init arch_early_irq_init(void)
return 0;
}
-static inline struct irq_cfg *irq_cfg(unsigned int irq)
-{
- return irq_get_chip_data(irq);
-}
-
-static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
-{
- struct irq_cfg *cfg;
-
- cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
- if (!cfg)
- return NULL;
- if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
- goto out_cfg;
- if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
- goto out_domain;
- return cfg;
-out_domain:
- free_cpumask_var(cfg->domain);
-out_cfg:
- kfree(cfg);
- return NULL;
-}
-
-static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
-{
- if (!cfg)
- return;
- irq_set_chip_data(at, NULL);
- free_cpumask_var(cfg->domain);
- free_cpumask_var(cfg->old_domain);
- kfree(cfg);
-}
-
-static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
-{
- int res = irq_alloc_desc_at(at, node);
- struct irq_cfg *cfg;
-
- if (res < 0) {
- if (res != -EEXIST)
- return NULL;
- cfg = irq_cfg(at);
- if (cfg)
- return cfg;
- }
-
- cfg = alloc_irq_cfg(at, node);
- if (cfg)
- irq_set_chip_data(at, cfg);
- else
- irq_free_desc(at);
- return cfg;
-}
-
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -445,15 +393,12 @@ static void ioapic_mask_entry(int apic, int pin)
*/
static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
{
- struct irq_pin_list **last, *entry;
+ struct irq_pin_list *entry;
/* don't allow duplicates */
- last = &cfg->irq_2_pin;
- for_each_irq_pin(entry, cfg->irq_2_pin) {
+ for_each_irq_pin(entry, cfg->irq_2_pin)
if (entry->apic == apic && entry->pin == pin)
return 0;
- last = &entry->next;
- }
entry = alloc_irq_pin_list(node);
if (!entry) {
@@ -464,22 +409,19 @@ static int __add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pi
entry->apic = apic;
entry->pin = pin;
- *last = entry;
+ list_add_tail(&entry->list, &cfg->irq_2_pin);
return 0;
}
static void __remove_pin_from_irq(struct irq_cfg *cfg, int apic, int pin)
{
- struct irq_pin_list **last, *entry;
+ struct irq_pin_list *tmp, *entry;
- last = &cfg->irq_2_pin;
- for_each_irq_pin(entry, cfg->irq_2_pin)
+ list_for_each_entry_safe(entry, tmp, &cfg->irq_2_pin, list)
if (entry->apic == apic && entry->pin == pin) {
- *last = entry->next;
+ list_del(&entry->list);
kfree(entry);
return;
- } else {
- last = &entry->next;
}
}
@@ -559,7 +501,7 @@ static void mask_ioapic(struct irq_cfg *cfg)
static void mask_ioapic_irq(struct irq_data *data)
{
- mask_ioapic(data->chip_data);
+ mask_ioapic(irqd_cfg(data));
}
static void __unmask_ioapic(struct irq_cfg *cfg)
@@ -578,7 +520,7 @@ static void unmask_ioapic(struct irq_cfg *cfg)
static void unmask_ioapic_irq(struct irq_data *data)
{
- unmask_ioapic(data->chip_data);
+ unmask_ioapic(irqd_cfg(data));
}
/*
@@ -1164,8 +1106,7 @@ void mp_unmap_irq(int irq)
* Find a specific PCI IRQ entry.
* Not an __init, possibly needed by modules
*/
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
- struct io_apic_irq_attr *irq_attr)
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
int irq, i, best_ioapic = -1, best_idx = -1;
@@ -1219,195 +1160,11 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
return -1;
out:
- irq = pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
- IOAPIC_MAP_ALLOC);
- if (irq > 0)
- set_io_apic_irq_attr(irq_attr, best_ioapic,
- mp_irqs[best_idx].dstirq,
- irq_trigger(best_idx),
- irq_polarity(best_idx));
- return irq;
+ return pin_2_irq(best_idx, best_ioapic, mp_irqs[best_idx].dstirq,
+ IOAPIC_MAP_ALLOC);
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-void lock_vector_lock(void)
-{
- /* Used to the online set of cpus does not change
- * during assign_irq_vector.
- */
- raw_spin_lock(&vector_lock);
-}
-
-void unlock_vector_lock(void)
-{
- raw_spin_unlock(&vector_lock);
-}
-
-static int
-__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
-{
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
- static int current_offset = VECTOR_OFFSET_START % 16;
- int cpu, err;
- cpumask_var_t tmp_mask;
-
- if (cfg->move_in_progress)
- return -EBUSY;
-
- if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
- return -ENOMEM;
-
- /* Only try and allocate irqs on cpus that are present */
- err = -ENOSPC;
- cpumask_clear(cfg->old_domain);
- cpu = cpumask_first_and(mask, cpu_online_mask);
- while (cpu < nr_cpu_ids) {
- int new_cpu, vector, offset;
-
- apic->vector_allocation_domain(cpu, tmp_mask, mask);
-
- if (cpumask_subset(tmp_mask, cfg->domain)) {
- err = 0;
- if (cpumask_equal(tmp_mask, cfg->domain))
- break;
- /*
- * New cpumask using the vector is a proper subset of
- * the current in use mask. So cleanup the vector
- * allocation for the members that are not used anymore.
- */
- cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
- cpumask_and(cfg->domain, cfg->domain, tmp_mask);
- break;
- }
-
- vector = current_vector;
- offset = current_offset;
-next:
- vector += 16;
- if (vector >= first_system_vector) {
- offset = (offset + 1) % 16;
- vector = FIRST_EXTERNAL_VECTOR + offset;
- }
-
- if (unlikely(current_vector == vector)) {
- cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
- cpumask_andnot(tmp_mask, mask, cfg->old_domain);
- cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
- continue;
- }
-
- if (test_bit(vector, used_vectors))
- goto next;
-
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
- if (per_cpu(vector_irq, new_cpu)[vector] > VECTOR_UNDEFINED)
- goto next;
- }
- /* Found one! */
- current_vector = vector;
- current_offset = offset;
- if (cfg->vector) {
- cpumask_copy(cfg->old_domain, cfg->domain);
- cfg->move_in_progress =
- cpumask_intersects(cfg->old_domain, cpu_online_mask);
- }
- for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
- per_cpu(vector_irq, new_cpu)[vector] = irq;
- cfg->vector = vector;
- cpumask_copy(cfg->domain, tmp_mask);
- err = 0;
- break;
- }
- free_cpumask_var(tmp_mask);
- return err;
-}
-
-int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
-{
- int err;
- unsigned long flags;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
- return err;
-}
-
-static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
-{
- int cpu, vector;
-
- BUG_ON(!cfg->vector);
-
- vector = cfg->vector;
- for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
-
- cfg->vector = 0;
- cpumask_clear(cfg->domain);
-
- if (likely(!cfg->move_in_progress))
- return;
- for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- if (per_cpu(vector_irq, cpu)[vector] != irq)
- continue;
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
- break;
- }
- }
- cfg->move_in_progress = 0;
-}
-
-void __setup_vector_irq(int cpu)
-{
- /* Initialize vector_irq on a new cpu */
- int irq, vector;
- struct irq_cfg *cfg;
-
- /*
- * vector_lock will make sure that we don't run into irq vector
- * assignments that might be happening on another cpu in parallel,
- * while we setup our initial vector to irq mappings.
- */
- raw_spin_lock(&vector_lock);
- /* Mark the inuse vectors */
- for_each_active_irq(irq) {
- cfg = irq_cfg(irq);
- if (!cfg)
- continue;
-
- if (!cpumask_test_cpu(cpu, cfg->domain))
- continue;
- vector = cfg->vector;
- per_cpu(vector_irq, cpu)[vector] = irq;
- }
- /* Mark the free vectors */
- for (vector = 0; vector < NR_VECTORS; ++vector) {
- irq = per_cpu(vector_irq, cpu)[vector];
- if (irq <= VECTOR_UNDEFINED)
- continue;
-
- cfg = irq_cfg(irq);
- if (!cpumask_test_cpu(cpu, cfg->domain))
- per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
- }
- raw_spin_unlock(&vector_lock);
-}
-
static struct irq_chip ioapic_chip;
#ifdef CONFIG_X86_32
@@ -1496,7 +1253,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
&dest)) {
pr_warn("Failed to obtain apicid for ioapic %d, pin %d\n",
mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- __clear_irq_vector(irq, cfg);
+ clear_irq_vector(irq, cfg);
return;
}
@@ -1510,7 +1267,7 @@ static void setup_ioapic_irq(unsigned int irq, struct irq_cfg *cfg,
if (x86_io_apic_ops.setup_entry(irq, &entry, dest, cfg->vector, attr)) {
pr_warn("Failed to setup ioapic entry for ioapic %d, pin %d\n",
mpc_ioapic_id(attr->ioapic), attr->ioapic_pin);
- __clear_irq_vector(irq, cfg);
+ clear_irq_vector(irq, cfg);
return;
}
@@ -1641,7 +1398,7 @@ void ioapic_zap_locks(void)
raw_spin_lock_init(&ioapic_lock);
}
-__apicdebuginit(void) print_IO_APIC(int ioapic_idx)
+static void __init print_IO_APIC(int ioapic_idx)
{
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
@@ -1698,7 +1455,7 @@ __apicdebuginit(void) print_IO_APIC(int ioapic_idx)
x86_io_apic_ops.print_entries(ioapic_idx, reg_01.bits.entries);
}
-__apicdebuginit(void) print_IO_APICs(void)
+void __init print_IO_APICs(void)
{
int ioapic_idx;
struct irq_cfg *cfg;
@@ -1731,8 +1488,7 @@ __apicdebuginit(void) print_IO_APICs(void)
cfg = irq_cfg(irq);
if (!cfg)
continue;
- entry = cfg->irq_2_pin;
- if (!entry)
+ if (list_empty(&cfg->irq_2_pin))
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
for_each_irq_pin(entry, cfg->irq_2_pin)
@@ -1743,205 +1499,6 @@ __apicdebuginit(void) print_IO_APICs(void)
printk(KERN_INFO ".................................... done.\n");
}
-__apicdebuginit(void) print_APIC_field(int base)
-{
- int i;
-
- printk(KERN_DEBUG);
-
- for (i = 0; i < 8; i++)
- pr_cont("%08x", apic_read(base + i*0x10));
-
- pr_cont("\n");
-}
-
-__apicdebuginit(void) print_local_APIC(void *dummy)
-{
- unsigned int i, v, ver, maxlvt;
- u64 icr;
-
- printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
- smp_processor_id(), hard_smp_processor_id());
- v = apic_read(APIC_ID);
- printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
- v = apic_read(APIC_LVR);
- printk(KERN_INFO "... APIC VERSION: %08x\n", v);
- ver = GET_APIC_VERSION(v);
- maxlvt = lapic_get_maxlvt();
-
- v = apic_read(APIC_TASKPRI);
- printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (!APIC_XAPIC(ver)) {
- v = apic_read(APIC_ARBPRI);
- printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
- v & APIC_ARBPRI_MASK);
- }
- v = apic_read(APIC_PROCPRI);
- printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
- }
-
- /*
- * Remote read supported only in the 82489DX and local APIC for
- * Pentium processors.
- */
- if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
- v = apic_read(APIC_RRR);
- printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
- }
-
- v = apic_read(APIC_LDR);
- printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
- if (!x2apic_enabled()) {
- v = apic_read(APIC_DFR);
- printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
- }
- v = apic_read(APIC_SPIV);
- printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
-
- printk(KERN_DEBUG "... APIC ISR field:\n");
- print_APIC_field(APIC_ISR);
- printk(KERN_DEBUG "... APIC TMR field:\n");
- print_APIC_field(APIC_TMR);
- printk(KERN_DEBUG "... APIC IRR field:\n");
- print_APIC_field(APIC_IRR);
-
- if (APIC_INTEGRATED(ver)) { /* !82489DX */
- if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
- apic_write(APIC_ESR, 0);
-
- v = apic_read(APIC_ESR);
- printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
- }
-
- icr = apic_icr_read();
- printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
- printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
-
- v = apic_read(APIC_LVTT);
- printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
-
- if (maxlvt > 3) { /* PC is LVT#4. */
- v = apic_read(APIC_LVTPC);
- printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
- }
- v = apic_read(APIC_LVT0);
- printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
- v = apic_read(APIC_LVT1);
- printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
-
- if (maxlvt > 2) { /* ERR is LVT#3. */
- v = apic_read(APIC_LVTERR);
- printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
- }
-
- v = apic_read(APIC_TMICT);
- printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
- v = apic_read(APIC_TMCCT);
- printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
- v = apic_read(APIC_TDCR);
- printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
-
- if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
- v = apic_read(APIC_EFEAT);
- maxlvt = (v >> 16) & 0xff;
- printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
- v = apic_read(APIC_ECTRL);
- printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
- for (i = 0; i < maxlvt; i++) {
- v = apic_read(APIC_EILVTn(i));
- printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
- }
- }
- pr_cont("\n");
-}
-
-__apicdebuginit(void) print_local_APICs(int maxcpu)
-{
- int cpu;
-
- if (!maxcpu)
- return;
-
- preempt_disable();
- for_each_online_cpu(cpu) {
- if (cpu >= maxcpu)
- break;
- smp_call_function_single(cpu, print_local_APIC, NULL, 1);
- }
- preempt_enable();
-}
-
-__apicdebuginit(void) print_PIC(void)
-{
- unsigned int v;
- unsigned long flags;
-
- if (!nr_legacy_irqs())
- return;
-
- printk(KERN_DEBUG "\nprinting PIC contents\n");
-
- raw_spin_lock_irqsave(&i8259A_lock, flags);
-
- v = inb(0xa1) << 8 | inb(0x21);
- printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
-
- v = inb(0xa0) << 8 | inb(0x20);
- printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
-
- outb(0x0b,0xa0);
- outb(0x0b,0x20);
- v = inb(0xa0) << 8 | inb(0x20);
- outb(0x0a,0xa0);
- outb(0x0a,0x20);
-
- raw_spin_unlock_irqrestore(&i8259A_lock, flags);
-
- printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
-
- v = inb(0x4d1) << 8 | inb(0x4d0);
- printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
-}
-
-static int __initdata show_lapic = 1;
-static __init int setup_show_lapic(char *arg)
-{
- int num = -1;
-
- if (strcmp(arg, "all") == 0) {
- show_lapic = CONFIG_NR_CPUS;
- } else {
- get_option(&arg, &num);
- if (num >= 0)
- show_lapic = num;
- }
-
- return 1;
-}
-__setup("show_lapic=", setup_show_lapic);
-
-__apicdebuginit(int) print_ICs(void)
-{
- if (apic_verbosity == APIC_QUIET)
- return 0;
-
- print_PIC();
-
- /* don't print out if apic is not there */
- if (!cpu_has_apic && !apic_from_smp_config())
- return 0;
-
- print_local_APICs(show_lapic);
- print_IO_APICs();
-
- return 0;
-}
-
-late_initcall(print_ICs);
-
-
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
@@ -2244,26 +1801,12 @@ static unsigned int startup_ioapic_irq(struct irq_data *data)
if (legacy_pic->irq_pending(irq))
was_pending = 1;
}
- __unmask_ioapic(data->chip_data);
+ __unmask_ioapic(irqd_cfg(data));
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
-static int ioapic_retrigger_irq(struct irq_data *data)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned long flags;
- int cpu;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
- apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- return 1;
-}
-
/*
* Level and edge triggered IO-APIC interrupts need different handling,
* so we use two separate IRQ descriptors. Edge triggered IRQs can be
@@ -2273,113 +1816,6 @@ static int ioapic_retrigger_irq(struct irq_data *data)
* races.
*/
-#ifdef CONFIG_SMP
-void send_cleanup_vector(struct irq_cfg *cfg)
-{
- cpumask_var_t cleanup_mask;
-
- if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
- unsigned int i;
- for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
- apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
- } else {
- cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
- apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
- free_cpumask_var(cleanup_mask);
- }
- cfg->move_in_progress = 0;
-}
-
-asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
-{
- unsigned vector, me;
-
- ack_APIC_irq();
- irq_enter();
- exit_idle();
-
- me = smp_processor_id();
- for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
- int irq;
- unsigned int irr;
- struct irq_desc *desc;
- struct irq_cfg *cfg;
- irq = __this_cpu_read(vector_irq[vector]);
-
- if (irq <= VECTOR_UNDEFINED)
- continue;
-
- desc = irq_to_desc(irq);
- if (!desc)
- continue;
-
- cfg = irq_cfg(irq);
- if (!cfg)
- continue;
-
- raw_spin_lock(&desc->lock);
-
- /*
- * Check if the irq migration is in progress. If so, we
- * haven't received the cleanup request yet for this irq.
- */
- if (cfg->move_in_progress)
- goto unlock;
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- goto unlock;
-
- irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
- /*
- * Check if the vector that needs to be cleanedup is
- * registered at the cpu's IRR. If so, then this is not
- * the best time to clean it up. Lets clean it up in the
- * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
- * to myself.
- */
- if (irr & (1 << (vector % 32))) {
- apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
- goto unlock;
- }
- __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
-unlock:
- raw_spin_unlock(&desc->lock);
- }
-
- irq_exit();
-}
-
-static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
-{
- unsigned me;
-
- if (likely(!cfg->move_in_progress))
- return;
-
- me = smp_processor_id();
-
- if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
- send_cleanup_vector(cfg);
-}
-
-static void irq_complete_move(struct irq_cfg *cfg)
-{
- __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
-}
-
-void irq_force_complete_move(int irq)
-{
- struct irq_cfg *cfg = irq_cfg(irq);
-
- if (!cfg)
- return;
-
- __irq_complete_move(cfg, cfg->vector);
-}
-#else
-static inline void irq_complete_move(struct irq_cfg *cfg) { }
-#endif
-
static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
{
int apic, pin;
@@ -2400,41 +1836,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
}
}
-/*
- * Either sets data->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
- * leaves data->affinity untouched.
- */
-int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
- unsigned int *dest_id)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int irq = data->irq;
- int err;
-
- if (!config_enabled(CONFIG_SMP))
- return -EPERM;
-
- if (!cpumask_intersects(mask, cpu_online_mask))
- return -EINVAL;
-
- err = assign_irq_vector(irq, cfg, mask);
- if (err)
- return err;
-
- err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
- if (err) {
- if (assign_irq_vector(irq, cfg, data->affinity))
- pr_err("Failed to recover vector for irq %d\n", irq);
- return err;
- }
-
- cpumask_copy(data->affinity, mask);
-
- return 0;
-}
-
-
int native_ioapic_set_affinity(struct irq_data *data,
const struct cpumask *mask,
bool force)
@@ -2447,24 +1848,17 @@ int native_ioapic_set_affinity(struct irq_data *data,
return -EPERM;
raw_spin_lock_irqsave(&ioapic_lock, flags);
- ret = __ioapic_set_affinity(data, mask, &dest);
+ ret = apic_set_affinity(data, mask, &dest);
if (!ret) {
/* Only the high 8 bits are valid. */
dest = SET_APIC_LOGICAL_ID(dest);
- __target_IO_APIC_irq(irq, dest, data->chip_data);
+ __target_IO_APIC_irq(irq, dest, irqd_cfg(data));
ret = IRQ_SET_MASK_OK_NOCOPY;
}
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
return ret;
}
-static void ack_apic_edge(struct irq_data *data)
-{
- irq_complete_move(data->chip_data);
- irq_move_irq(data);
- ack_APIC_irq();
-}
-
atomic_t irq_mis_count;
#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -2547,9 +1941,9 @@ static inline void ioapic_irqd_unmask(struct irq_data *data,
}
#endif
-static void ack_apic_level(struct irq_data *data)
+static void ack_ioapic_level(struct irq_data *data)
{
- struct irq_cfg *cfg = data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
int i, irq = data->irq;
unsigned long v;
bool masked;
@@ -2619,10 +2013,10 @@ static struct irq_chip ioapic_chip __read_mostly = {
.irq_startup = startup_ioapic_irq,
.irq_mask = mask_ioapic_irq,
.irq_unmask = unmask_ioapic_irq,
- .irq_ack = ack_apic_edge,
- .irq_eoi = ack_apic_level,
+ .irq_ack = apic_ack_edge,
+ .irq_eoi = ack_ioapic_level,
.irq_set_affinity = native_ioapic_set_affinity,
- .irq_retrigger = ioapic_retrigger_irq,
+ .irq_retrigger = apic_retrigger_irq,
.flags = IRQCHIP_SKIP_SET_WAKE,
};
@@ -2965,6 +2359,16 @@ static int mp_irqdomain_create(int ioapic)
return 0;
}
+static void ioapic_destroy_irqdomain(int idx)
+{
+ if (ioapics[idx].irqdomain) {
+ irq_domain_remove(ioapics[idx].irqdomain);
+ ioapics[idx].irqdomain = NULL;
+ }
+ kfree(ioapics[idx].pin_info);
+ ioapics[idx].pin_info = NULL;
+}
+
void __init setup_IO_APIC(void)
{
int ioapic;
@@ -3044,399 +2448,6 @@ static int __init ioapic_init_ops(void)
device_initcall(ioapic_init_ops);
-/*
- * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
- */
-int arch_setup_hwirq(unsigned int irq, int node)
-{
- struct irq_cfg *cfg;
- unsigned long flags;
- int ret;
-
- cfg = alloc_irq_cfg(irq, node);
- if (!cfg)
- return -ENOMEM;
-
- raw_spin_lock_irqsave(&vector_lock, flags);
- ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
- raw_spin_unlock_irqrestore(&vector_lock, flags);
-
- if (!ret)
- irq_set_chip_data(irq, cfg);
- else
- free_irq_cfg(irq, cfg);
- return ret;
-}
-
-void arch_teardown_hwirq(unsigned int irq)
-{
- struct irq_cfg *cfg = irq_cfg(irq);
- unsigned long flags;
-
- free_remapped_irq(irq);
- raw_spin_lock_irqsave(&vector_lock, flags);
- __clear_irq_vector(irq, cfg);
- raw_spin_unlock_irqrestore(&vector_lock, flags);
- free_irq_cfg(irq, cfg);
-}
-
-/*
- * MSI message composition
- */
-void native_compose_msi_msg(struct pci_dev *pdev,
- unsigned int irq, unsigned int dest,
- struct msi_msg *msg, u8 hpet_id)
-{
- struct irq_cfg *cfg = irq_cfg(irq);
-
- msg->address_hi = MSI_ADDR_BASE_HI;
-
- if (x2apic_enabled())
- msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
-
- msg->address_lo =
- MSI_ADDR_BASE_LO |
- ((apic->irq_dest_mode == 0) ?
- MSI_ADDR_DEST_MODE_PHYSICAL:
- MSI_ADDR_DEST_MODE_LOGICAL) |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_ADDR_REDIRECTION_CPU:
- MSI_ADDR_REDIRECTION_LOWPRI) |
- MSI_ADDR_DEST_ID(dest);
-
- msg->data =
- MSI_DATA_TRIGGER_EDGE |
- MSI_DATA_LEVEL_ASSERT |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- MSI_DATA_DELIVERY_FIXED:
- MSI_DATA_DELIVERY_LOWPRI) |
- MSI_DATA_VECTOR(cfg->vector);
-}
-
-#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
- struct msi_msg *msg, u8 hpet_id)
-{
- struct irq_cfg *cfg;
- int err;
- unsigned dest;
-
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
-
- err = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus(), &dest);
- if (err)
- return err;
-
- x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
-
- return 0;
-}
-
-static int
-msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- struct msi_msg msg;
- unsigned int dest;
- int ret;
-
- ret = __ioapic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
-
- __get_cached_msi_msg(data->msi_desc, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- __pci_write_msi_msg(data->msi_desc, &msg);
-
- return IRQ_SET_MASK_OK_NOCOPY;
-}
-
-/*
- * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
- * which implement the MSI or MSI-X Capability Structure.
- */
-static struct irq_chip msi_chip = {
- .name = "PCI-MSI",
- .irq_unmask = pci_msi_unmask_irq,
- .irq_mask = pci_msi_mask_irq,
- .irq_ack = ack_apic_edge,
- .irq_set_affinity = msi_set_affinity,
- .irq_retrigger = ioapic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
- unsigned int irq_base, unsigned int irq_offset)
-{
- struct irq_chip *chip = &msi_chip;
- struct msi_msg msg;
- unsigned int irq = irq_base + irq_offset;
- int ret;
-
- ret = msi_compose_msg(dev, irq, &msg, -1);
- if (ret < 0)
- return ret;
-
- irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
-
- /*
- * MSI-X message is written per-IRQ, the offset is always 0.
- * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
- */
- if (!irq_offset)
- pci_write_msi_msg(irq, &msg);
-
- setup_remapped_irq(irq, irq_cfg(irq), chip);
-
- irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
-
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
-
- return 0;
-}
-
-int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
-{
- struct msi_desc *msidesc;
- unsigned int irq;
- int node, ret;
-
- /* Multiple MSI vectors only supported with interrupt remapping */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
-
- node = dev_to_node(&dev->dev);
-
- list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = irq_alloc_hwirq(node);
- if (!irq)
- return -ENOSPC;
-
- ret = setup_msi_irq(dev, msidesc, irq, 0);
- if (ret < 0) {
- irq_free_hwirq(irq);
- return ret;
- }
-
- }
- return 0;
-}
-
-void native_teardown_msi_irq(unsigned int irq)
-{
- irq_free_hwirq(irq);
-}
-
-#ifdef CONFIG_DMAR_TABLE
-static int
-dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
- bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest, irq = data->irq;
- struct msi_msg msg;
- int ret;
-
- ret = __ioapic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
-
- dmar_msi_read(irq, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
- msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
-
- dmar_msi_write(irq, &msg);
-
- return IRQ_SET_MASK_OK_NOCOPY;
-}
-
-static struct irq_chip dmar_msi_type = {
- .name = "DMAR_MSI",
- .irq_unmask = dmar_msi_unmask,
- .irq_mask = dmar_msi_mask,
- .irq_ack = ack_apic_edge,
- .irq_set_affinity = dmar_msi_set_affinity,
- .irq_retrigger = ioapic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_dmar_msi(unsigned int irq)
-{
- int ret;
- struct msi_msg msg;
-
- ret = msi_compose_msg(NULL, irq, &msg, -1);
- if (ret < 0)
- return ret;
- dmar_msi_write(irq, &msg);
- irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
- "edge");
- return 0;
-}
-#endif
-
-#ifdef CONFIG_HPET_TIMER
-
-static int hpet_msi_set_affinity(struct irq_data *data,
- const struct cpumask *mask, bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- struct msi_msg msg;
- unsigned int dest;
- int ret;
-
- ret = __ioapic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
-
- hpet_msi_read(data->handler_data, &msg);
-
- msg.data &= ~MSI_DATA_VECTOR_MASK;
- msg.data |= MSI_DATA_VECTOR(cfg->vector);
- msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
- msg.address_lo |= MSI_ADDR_DEST_ID(dest);
-
- hpet_msi_write(data->handler_data, &msg);
-
- return IRQ_SET_MASK_OK_NOCOPY;
-}
-
-static struct irq_chip hpet_msi_type = {
- .name = "HPET_MSI",
- .irq_unmask = hpet_msi_unmask,
- .irq_mask = hpet_msi_mask,
- .irq_ack = ack_apic_edge,
- .irq_set_affinity = hpet_msi_set_affinity,
- .irq_retrigger = ioapic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-int default_setup_hpet_msi(unsigned int irq, unsigned int id)
-{
- struct irq_chip *chip = &hpet_msi_type;
- struct msi_msg msg;
- int ret;
-
- ret = msi_compose_msg(NULL, irq, &msg, id);
- if (ret < 0)
- return ret;
-
- hpet_msi_write(irq_get_handler_data(irq), &msg);
- irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
- setup_remapped_irq(irq, irq_cfg(irq), chip);
-
- irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
- return 0;
-}
-#endif
-
-#endif /* CONFIG_PCI_MSI */
-/*
- * Hypertransport interrupt support
- */
-#ifdef CONFIG_HT_IRQ
-
-static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
-{
- struct ht_irq_msg msg;
- fetch_ht_irq_msg(irq, &msg);
-
- msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK);
- msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
-
- msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest);
- msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
-
- write_ht_irq_msg(irq, &msg);
-}
-
-static int
-ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
-{
- struct irq_cfg *cfg = data->chip_data;
- unsigned int dest;
- int ret;
-
- ret = __ioapic_set_affinity(data, mask, &dest);
- if (ret)
- return ret;
-
- target_ht_irq(data->irq, dest, cfg->vector);
- return IRQ_SET_MASK_OK_NOCOPY;
-}
-
-static struct irq_chip ht_irq_chip = {
- .name = "PCI-HT",
- .irq_mask = mask_ht_irq,
- .irq_unmask = unmask_ht_irq,
- .irq_ack = ack_apic_edge,
- .irq_set_affinity = ht_set_affinity,
- .irq_retrigger = ioapic_retrigger_irq,
- .flags = IRQCHIP_SKIP_SET_WAKE,
-};
-
-int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
-{
- struct irq_cfg *cfg;
- struct ht_irq_msg msg;
- unsigned dest;
- int err;
-
- if (disable_apic)
- return -ENXIO;
-
- cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
- if (err)
- return err;
-
- err = apic->cpu_mask_to_apicid_and(cfg->domain,
- apic->target_cpus(), &dest);
- if (err)
- return err;
-
- msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
-
- msg.address_lo =
- HT_IRQ_LOW_BASE |
- HT_IRQ_LOW_DEST_ID(dest) |
- HT_IRQ_LOW_VECTOR(cfg->vector) |
- ((apic->irq_dest_mode == 0) ?
- HT_IRQ_LOW_DM_PHYSICAL :
- HT_IRQ_LOW_DM_LOGICAL) |
- HT_IRQ_LOW_RQEOI_EDGE |
- ((apic->irq_delivery_mode != dest_LowestPrio) ?
- HT_IRQ_LOW_MT_FIXED :
- HT_IRQ_LOW_MT_ARBITRATED) |
- HT_IRQ_LOW_IRQ_MASKED;
-
- write_ht_irq_msg(irq, &msg);
-
- irq_set_chip_and_handler_name(irq, &ht_irq_chip,
- handle_edge_irq, "edge");
-
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
-
- return 0;
-}
-#endif /* CONFIG_HT_IRQ */
-
static int
io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
{
@@ -3451,7 +2462,7 @@ io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
return ret;
}
-static int __init io_apic_get_redir_entries(int ioapic)
+static int io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -3476,28 +2487,8 @@ unsigned int arch_dynirq_lower_bound(unsigned int from)
return ioapic_initialized ? ioapic_dynirq_base : gsi_top;
}
-int __init arch_probe_nr_irqs(void)
-{
- int nr;
-
- if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
- nr_irqs = NR_VECTORS * nr_cpu_ids;
-
- nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
-#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
- /*
- * for MSI and HT dyn irq
- */
- nr += gsi_top * 16;
-#endif
- if (nr < nr_irqs)
- nr_irqs = nr;
-
- return 0;
-}
-
#ifdef CONFIG_X86_32
-static int __init io_apic_get_unique_id(int ioapic, int apic_id)
+static int io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -3572,30 +2563,63 @@ static int __init io_apic_get_unique_id(int ioapic, int apic_id)
return apic_id;
}
-static u8 __init io_apic_unique_id(u8 id)
+static u8 io_apic_unique_id(int idx, u8 id)
{
if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
!APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
- return io_apic_get_unique_id(nr_ioapics, id);
+ return io_apic_get_unique_id(idx, id);
else
return id;
}
#else
-static u8 __init io_apic_unique_id(u8 id)
+static u8 io_apic_unique_id(int idx, u8 id)
{
- int i;
+ union IO_APIC_reg_00 reg_00;
DECLARE_BITMAP(used, 256);
+ unsigned long flags;
+ u8 new_id;
+ int i;
bitmap_zero(used, 256);
for_each_ioapic(i)
__set_bit(mpc_ioapic_id(i), used);
+
+ /* Hand out the requested id if available */
if (!test_bit(id, used))
return id;
- return find_first_zero_bit(used, 256);
+
+ /*
+ * Read the current id from the ioapic and keep it if
+ * available.
+ */
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ new_id = reg_00.bits.ID;
+ if (!test_bit(new_id, used)) {
+ apic_printk(APIC_VERBOSE, KERN_INFO
+ "IOAPIC[%d]: Using reg apic_id %d instead of %d\n",
+ idx, new_id, id);
+ return new_id;
+ }
+
+ /*
+ * Get the next free id and write it to the ioapic.
+ */
+ new_id = find_first_zero_bit(used, 256);
+ reg_00.bits.ID = new_id;
+ raw_spin_lock_irqsave(&ioapic_lock, flags);
+ io_apic_write(idx, 0, reg_00.raw);
+ reg_00.raw = io_apic_read(idx, 0);
+ raw_spin_unlock_irqrestore(&ioapic_lock, flags);
+ /* Sanity check */
+ BUG_ON(reg_00.bits.ID != new_id);
+
+ return new_id;
}
#endif
-static int __init io_apic_get_version(int ioapic)
+static int io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
@@ -3702,6 +2726,7 @@ static struct resource * __init ioapic_setup_resources(void)
snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
mem += IOAPIC_RESOURCE_NAME_SIZE;
num++;
+ ioapics[i].iomem_res = res;
}
ioapic_resources = res;
@@ -3799,21 +2824,7 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi)
return gsi - gsi_cfg->gsi_base;
}
-static __init int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- pr_warn("WARNING: Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
- MAX_IO_APICS, nr_ioapics);
- return 1;
- }
- if (!address) {
- pr_warn("WARNING: Bogus (zero) I/O APIC address found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
-static __init int bad_ioapic_register(int idx)
+static int bad_ioapic_register(int idx)
{
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
@@ -3832,32 +2843,61 @@ static __init int bad_ioapic_register(int idx)
return 0;
}
-void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
- struct ioapic_domain_cfg *cfg)
+static int find_free_ioapic_entry(void)
{
- int idx = 0;
- int entries;
+ int idx;
+
+ for (idx = 0; idx < MAX_IO_APICS; idx++)
+ if (ioapics[idx].nr_registers == 0)
+ return idx;
+
+ return MAX_IO_APICS;
+}
+
+/**
+ * mp_register_ioapic - Register an IOAPIC device
+ * @id: hardware IOAPIC ID
+ * @address: physical address of IOAPIC register area
+ * @gsi_base: base of GSI associated with the IOAPIC
+ * @cfg: configuration information for the IOAPIC
+ */
+int mp_register_ioapic(int id, u32 address, u32 gsi_base,
+ struct ioapic_domain_cfg *cfg)
+{
+ bool hotplug = !!ioapic_initialized;
struct mp_ioapic_gsi *gsi_cfg;
+ int idx, ioapic, entries;
+ u32 gsi_end;
- if (bad_ioapic(address))
- return;
+ if (!address) {
+ pr_warn("Bogus (zero) I/O APIC address found, skipping!\n");
+ return -EINVAL;
+ }
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].mp_config.apicaddr == address) {
+ pr_warn("address 0x%x conflicts with IOAPIC%d\n",
+ address, ioapic);
+ return -EEXIST;
+ }
- idx = nr_ioapics;
+ idx = find_free_ioapic_entry();
+ if (idx >= MAX_IO_APICS) {
+ pr_warn("Max # of I/O APICs (%d) exceeded (found %d), skipping\n",
+ MAX_IO_APICS, idx);
+ return -ENOSPC;
+ }
ioapics[idx].mp_config.type = MP_IOAPIC;
ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
ioapics[idx].mp_config.apicaddr = address;
- ioapics[idx].irqdomain = NULL;
- ioapics[idx].irqdomain_cfg = *cfg;
set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
-
if (bad_ioapic_register(idx)) {
clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
- return;
+ return -ENODEV;
}
- ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
+ ioapics[idx].mp_config.apicid = io_apic_unique_id(idx, id);
ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
/*
@@ -3865,24 +2905,112 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base,
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
entries = io_apic_get_redir_entries(idx);
+ gsi_end = gsi_base + entries - 1;
+ for_each_ioapic(ioapic) {
+ gsi_cfg = mp_ioapic_gsi_routing(ioapic);
+ if ((gsi_base >= gsi_cfg->gsi_base &&
+ gsi_base <= gsi_cfg->gsi_end) ||
+ (gsi_end >= gsi_cfg->gsi_base &&
+ gsi_end <= gsi_cfg->gsi_end)) {
+ pr_warn("GSI range [%u-%u] for new IOAPIC conflicts with GSI[%u-%u]\n",
+ gsi_base, gsi_end,
+ gsi_cfg->gsi_base, gsi_cfg->gsi_end);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOSPC;
+ }
+ }
gsi_cfg = mp_ioapic_gsi_routing(idx);
gsi_cfg->gsi_base = gsi_base;
- gsi_cfg->gsi_end = gsi_base + entries - 1;
+ gsi_cfg->gsi_end = gsi_end;
+
+ ioapics[idx].irqdomain = NULL;
+ ioapics[idx].irqdomain_cfg = *cfg;
/*
- * The number of IO-APIC IRQ registers (== #pins):
+ * If mp_register_ioapic() is called during early boot stage when
+ * walking ACPI/SFI/DT tables, it's too early to create irqdomain,
+ * we are still using bootmem allocator. So delay it to setup_IO_APIC().
*/
- ioapics[idx].nr_registers = entries;
+ if (hotplug) {
+ if (mp_irqdomain_create(idx)) {
+ clear_fixmap(FIX_IO_APIC_BASE_0 + idx);
+ return -ENOMEM;
+ }
+ alloc_ioapic_saved_registers(idx);
+ }
if (gsi_cfg->gsi_end >= gsi_top)
gsi_top = gsi_cfg->gsi_end + 1;
+ if (nr_ioapics <= idx)
+ nr_ioapics = idx + 1;
+
+ /* Set nr_registers to mark entry present */
+ ioapics[idx].nr_registers = entries;
pr_info("IOAPIC[%d]: apic_id %d, version %d, address 0x%x, GSI %d-%d\n",
idx, mpc_ioapic_id(idx),
mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
gsi_cfg->gsi_base, gsi_cfg->gsi_end);
- nr_ioapics++;
+ return 0;
+}
+
+int mp_unregister_ioapic(u32 gsi_base)
+{
+ int ioapic, pin;
+ int found = 0;
+ struct mp_pin_info *pin_info;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base) {
+ found = 1;
+ break;
+ }
+ if (!found) {
+ pr_warn("can't find IOAPIC for GSI %d\n", gsi_base);
+ return -ENODEV;
+ }
+
+ for_each_pin(ioapic, pin) {
+ pin_info = mp_pin_info(ioapic, pin);
+ if (pin_info->count) {
+ pr_warn("pin%d on IOAPIC%d is still in use.\n",
+ pin, ioapic);
+ return -EBUSY;
+ }
+ }
+
+ /* Mark entry not present */
+ ioapics[ioapic].nr_registers = 0;
+ ioapic_destroy_irqdomain(ioapic);
+ free_ioapic_saved_registers(ioapic);
+ if (ioapics[ioapic].iomem_res)
+ release_resource(ioapics[ioapic].iomem_res);
+ clear_fixmap(FIX_IO_APIC_BASE_0 + ioapic);
+ memset(&ioapics[ioapic], 0, sizeof(ioapics[ioapic]));
+
+ return 0;
+}
+
+int mp_ioapic_registered(u32 gsi_base)
+{
+ int ioapic;
+
+ for_each_ioapic(ioapic)
+ if (ioapics[ioapic].gsi_config.gsi_base == gsi_base)
+ return 1;
+
+ return 0;
+}
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+ int ioapic, int ioapic_pin,
+ int trigger, int polarity)
+{
+ irq_attr->ioapic = ioapic;
+ irq_attr->ioapic_pin = ioapic_pin;
+ irq_attr->trigger = trigger;
+ irq_attr->polarity = polarity;
}
int mp_irqdomain_map(struct irq_domain *domain, unsigned int virq,
@@ -3931,7 +3059,7 @@ void mp_irqdomain_unmap(struct irq_domain *domain, unsigned int virq)
ioapic_mask_entry(ioapic, pin);
__remove_pin_from_irq(cfg, ioapic, pin);
- WARN_ON(cfg->irq_2_pin != NULL);
+ WARN_ON(!list_empty(&cfg->irq_2_pin));
arch_teardown_hwirq(virq);
}
@@ -3964,18 +3092,6 @@ int mp_set_gsi_attr(u32 gsi, int trigger, int polarity, int node)
return ret;
}
-bool mp_should_keep_irq(struct device *dev)
-{
- if (dev->power.is_prepared)
- return true;
-#ifdef CONFIG_PM_RUNTIME
- if (dev->power.runtime_status == RPM_SUSPENDING)
- return true;
-#endif
-
- return false;
-}
-
/* Enable IOAPIC early just for system timer */
void __init pre_init_apic_IRQ0(void)
{
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
new file mode 100644
index 000000000000..d6ba2d660dc5
--- /dev/null
+++ b/arch/x86/kernel/apic/msi.c
@@ -0,0 +1,286 @@
+/*
+ * Support of MSI, HPET and DMAR interrupts.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/mm.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dmar.h>
+#include <linux/hpet.h>
+#include <linux/msi.h>
+#include <asm/msidef.h>
+#include <asm/hpet.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/irq_remapping.h>
+
+void native_compose_msi_msg(struct pci_dev *pdev,
+ unsigned int irq, unsigned int dest,
+ struct msi_msg *msg, u8 hpet_id)
+{
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ msg->address_hi = MSI_ADDR_BASE_HI;
+
+ if (x2apic_enabled())
+ msg->address_hi |= MSI_ADDR_EXT_DEST_ID(dest);
+
+ msg->address_lo =
+ MSI_ADDR_BASE_LO |
+ ((apic->irq_dest_mode == 0) ?
+ MSI_ADDR_DEST_MODE_PHYSICAL :
+ MSI_ADDR_DEST_MODE_LOGICAL) |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_ADDR_REDIRECTION_CPU :
+ MSI_ADDR_REDIRECTION_LOWPRI) |
+ MSI_ADDR_DEST_ID(dest);
+
+ msg->data =
+ MSI_DATA_TRIGGER_EDGE |
+ MSI_DATA_LEVEL_ASSERT |
+ ((apic->irq_delivery_mode != dest_LowestPrio) ?
+ MSI_DATA_DELIVERY_FIXED :
+ MSI_DATA_DELIVERY_LOWPRI) |
+ MSI_DATA_VECTOR(cfg->vector);
+}
+
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ struct msi_msg *msg, u8 hpet_id)
+{
+ struct irq_cfg *cfg;
+ int err;
+ unsigned dest;
+
+ if (disable_apic)
+ return -ENXIO;
+
+ cfg = irq_cfg(irq);
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (err)
+ return err;
+
+ err = apic->cpu_mask_to_apicid_and(cfg->domain,
+ apic->target_cpus(), &dest);
+ if (err)
+ return err;
+
+ x86_msi.compose_msi_msg(pdev, irq, dest, msg, hpet_id);
+
+ return 0;
+}
+
+static int
+msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct msi_msg msg;
+ unsigned int dest;
+ int ret;
+
+ ret = apic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
+
+ __get_cached_msi_msg(data->msi_desc, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ __pci_write_msi_msg(data->msi_desc, &msg);
+
+ return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+/*
+ * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
+ * which implement the MSI or MSI-X Capability Structure.
+ */
+static struct irq_chip msi_chip = {
+ .name = "PCI-MSI",
+ .irq_unmask = pci_msi_unmask_irq,
+ .irq_mask = pci_msi_mask_irq,
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = msi_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned int irq_base, unsigned int irq_offset)
+{
+ struct irq_chip *chip = &msi_chip;
+ struct msi_msg msg;
+ unsigned int irq = irq_base + irq_offset;
+ int ret;
+
+ ret = msi_compose_msg(dev, irq, &msg, -1);
+ if (ret < 0)
+ return ret;
+
+ irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
+
+ /*
+ * MSI-X message is written per-IRQ, the offset is always 0.
+ * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
+ */
+ if (!irq_offset)
+ pci_write_msi_msg(irq, &msg);
+
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
+
+ irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+
+ dev_dbg(&dev->dev, "irq %d for MSI/MSI-X\n", irq);
+
+ return 0;
+}
+
+int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+ struct msi_desc *msidesc;
+ unsigned int irq;
+ int node, ret;
+
+ /* Multiple MSI vectors only supported with interrupt remapping */
+ if (type == PCI_CAP_ID_MSI && nvec > 1)
+ return 1;
+
+ node = dev_to_node(&dev->dev);
+
+ list_for_each_entry(msidesc, &dev->msi_list, list) {
+ irq = irq_alloc_hwirq(node);
+ if (!irq)
+ return -ENOSPC;
+
+ ret = setup_msi_irq(dev, msidesc, irq, 0);
+ if (ret < 0) {
+ irq_free_hwirq(irq);
+ return ret;
+ }
+
+ }
+ return 0;
+}
+
+void native_teardown_msi_irq(unsigned int irq)
+{
+ irq_free_hwirq(irq);
+}
+
+#ifdef CONFIG_DMAR_TABLE
+static int
+dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ bool force)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ unsigned int dest, irq = data->irq;
+ struct msi_msg msg;
+ int ret;
+
+ ret = apic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
+
+ dmar_msi_read(irq, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+ msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
+
+ dmar_msi_write(irq, &msg);
+
+ return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static struct irq_chip dmar_msi_type = {
+ .name = "DMAR_MSI",
+ .irq_unmask = dmar_msi_unmask,
+ .irq_mask = dmar_msi_mask,
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = dmar_msi_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+int arch_setup_dmar_msi(unsigned int irq)
+{
+ int ret;
+ struct msi_msg msg;
+
+ ret = msi_compose_msg(NULL, irq, &msg, -1);
+ if (ret < 0)
+ return ret;
+ dmar_msi_write(irq, &msg);
+ irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
+ "edge");
+ return 0;
+}
+#endif
+
+/*
+ * MSI message composition
+ */
+#ifdef CONFIG_HPET_TIMER
+
+static int hpet_msi_set_affinity(struct irq_data *data,
+ const struct cpumask *mask, bool force)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ struct msi_msg msg;
+ unsigned int dest;
+ int ret;
+
+ ret = apic_set_affinity(data, mask, &dest);
+ if (ret)
+ return ret;
+
+ hpet_msi_read(data->handler_data, &msg);
+
+ msg.data &= ~MSI_DATA_VECTOR_MASK;
+ msg.data |= MSI_DATA_VECTOR(cfg->vector);
+ msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
+ msg.address_lo |= MSI_ADDR_DEST_ID(dest);
+
+ hpet_msi_write(data->handler_data, &msg);
+
+ return IRQ_SET_MASK_OK_NOCOPY;
+}
+
+static struct irq_chip hpet_msi_type = {
+ .name = "HPET_MSI",
+ .irq_unmask = hpet_msi_unmask,
+ .irq_mask = hpet_msi_mask,
+ .irq_ack = apic_ack_edge,
+ .irq_set_affinity = hpet_msi_set_affinity,
+ .irq_retrigger = apic_retrigger_irq,
+ .flags = IRQCHIP_SKIP_SET_WAKE,
+};
+
+int default_setup_hpet_msi(unsigned int irq, unsigned int id)
+{
+ struct irq_chip *chip = &hpet_msi_type;
+ struct msi_msg msg;
+ int ret;
+
+ ret = msi_compose_msg(NULL, irq, &msg, id);
+ if (ret < 0)
+ return ret;
+
+ hpet_msi_write(irq_get_handler_data(irq), &msg);
+ irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
+ setup_remapped_irq(irq, irq_cfg(irq), chip);
+
+ irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
+ return 0;
+}
+#endif
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
new file mode 100644
index 000000000000..6cedd7914581
--- /dev/null
+++ b/arch/x86/kernel/apic/vector.c
@@ -0,0 +1,719 @@
+/*
+ * Local APIC related interfaces to support IOAPIC, MSI, HT_IRQ etc.
+ *
+ * Copyright (C) 1997, 1998, 1999, 2000, 2009 Ingo Molnar, Hajnalka Szabo
+ * Moved from arch/x86/kernel/apic/io_apic.c.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/compiler.h>
+#include <linux/irqdomain.h>
+#include <linux/slab.h>
+#include <asm/hw_irq.h>
+#include <asm/apic.h>
+#include <asm/i8259.h>
+#include <asm/desc.h>
+#include <asm/irq_remapping.h>
+
+static DEFINE_RAW_SPINLOCK(vector_lock);
+
+void lock_vector_lock(void)
+{
+ /* Used to the online set of cpus does not change
+ * during assign_irq_vector.
+ */
+ raw_spin_lock(&vector_lock);
+}
+
+void unlock_vector_lock(void)
+{
+ raw_spin_unlock(&vector_lock);
+}
+
+struct irq_cfg *irq_cfg(unsigned int irq)
+{
+ return irq_get_chip_data(irq);
+}
+
+struct irq_cfg *irqd_cfg(struct irq_data *irq_data)
+{
+ return irq_data->chip_data;
+}
+
+static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
+{
+ struct irq_cfg *cfg;
+
+ cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
+ if (!cfg)
+ return NULL;
+ if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
+ goto out_cfg;
+ if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
+ goto out_domain;
+#ifdef CONFIG_X86_IO_APIC
+ INIT_LIST_HEAD(&cfg->irq_2_pin);
+#endif
+ return cfg;
+out_domain:
+ free_cpumask_var(cfg->domain);
+out_cfg:
+ kfree(cfg);
+ return NULL;
+}
+
+struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
+{
+ int res = irq_alloc_desc_at(at, node);
+ struct irq_cfg *cfg;
+
+ if (res < 0) {
+ if (res != -EEXIST)
+ return NULL;
+ cfg = irq_cfg(at);
+ if (cfg)
+ return cfg;
+ }
+
+ cfg = alloc_irq_cfg(at, node);
+ if (cfg)
+ irq_set_chip_data(at, cfg);
+ else
+ irq_free_desc(at);
+ return cfg;
+}
+
+static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
+{
+ if (!cfg)
+ return;
+ irq_set_chip_data(at, NULL);
+ free_cpumask_var(cfg->domain);
+ free_cpumask_var(cfg->old_domain);
+ kfree(cfg);
+}
+
+static int
+__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ /*
+ * NOTE! The local APIC isn't very good at handling
+ * multiple interrupts at the same interrupt level.
+ * As the interrupt level is determined by taking the
+ * vector number and shifting that right by 4, we
+ * want to spread these out a bit so that they don't
+ * all fall in the same interrupt level.
+ *
+ * Also, we've got to be careful not to trash gate
+ * 0x80, because int 0x80 is hm, kind of importantish. ;)
+ */
+ static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
+ static int current_offset = VECTOR_OFFSET_START % 16;
+ int cpu, err;
+ cpumask_var_t tmp_mask;
+
+ if (cfg->move_in_progress)
+ return -EBUSY;
+
+ if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* Only try and allocate irqs on cpus that are present */
+ err = -ENOSPC;
+ cpumask_clear(cfg->old_domain);
+ cpu = cpumask_first_and(mask, cpu_online_mask);
+ while (cpu < nr_cpu_ids) {
+ int new_cpu, vector, offset;
+
+ apic->vector_allocation_domain(cpu, tmp_mask, mask);
+
+ if (cpumask_subset(tmp_mask, cfg->domain)) {
+ err = 0;
+ if (cpumask_equal(tmp_mask, cfg->domain))
+ break;
+ /*
+ * New cpumask using the vector is a proper subset of
+ * the current in use mask. So cleanup the vector
+ * allocation for the members that are not used anymore.
+ */
+ cpumask_andnot(cfg->old_domain, cfg->domain, tmp_mask);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ cpumask_and(cfg->domain, cfg->domain, tmp_mask);
+ break;
+ }
+
+ vector = current_vector;
+ offset = current_offset;
+next:
+ vector += 16;
+ if (vector >= first_system_vector) {
+ offset = (offset + 1) % 16;
+ vector = FIRST_EXTERNAL_VECTOR + offset;
+ }
+
+ if (unlikely(current_vector == vector)) {
+ cpumask_or(cfg->old_domain, cfg->old_domain, tmp_mask);
+ cpumask_andnot(tmp_mask, mask, cfg->old_domain);
+ cpu = cpumask_first_and(tmp_mask, cpu_online_mask);
+ continue;
+ }
+
+ if (test_bit(vector, used_vectors))
+ goto next;
+
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+ if (per_cpu(vector_irq, new_cpu)[vector] >
+ VECTOR_UNDEFINED)
+ goto next;
+ }
+ /* Found one! */
+ current_vector = vector;
+ current_offset = offset;
+ if (cfg->vector) {
+ cpumask_copy(cfg->old_domain, cfg->domain);
+ cfg->move_in_progress =
+ cpumask_intersects(cfg->old_domain, cpu_online_mask);
+ }
+ for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+ per_cpu(vector_irq, new_cpu)[vector] = irq;
+ cfg->vector = vector;
+ cpumask_copy(cfg->domain, tmp_mask);
+ err = 0;
+ break;
+ }
+ free_cpumask_var(tmp_mask);
+
+ return err;
+}
+
+int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector(irq, cfg, mask);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+void clear_irq_vector(int irq, struct irq_cfg *cfg)
+{
+ int cpu, vector;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ BUG_ON(!cfg->vector);
+
+ vector = cfg->vector;
+ for_each_cpu_and(cpu, cfg->domain, cpu_online_mask)
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+
+ cfg->vector = 0;
+ cpumask_clear(cfg->domain);
+
+ if (likely(!cfg->move_in_progress)) {
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+ return;
+ }
+
+ for_each_cpu_and(cpu, cfg->old_domain, cpu_online_mask) {
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
+ vector++) {
+ if (per_cpu(vector_irq, cpu)[vector] != irq)
+ continue;
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ break;
+ }
+ }
+ cfg->move_in_progress = 0;
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+}
+
+int __init arch_probe_nr_irqs(void)
+{
+ int nr;
+
+ if (nr_irqs > (NR_VECTORS * nr_cpu_ids))
+ nr_irqs = NR_VECTORS * nr_cpu_ids;
+
+ nr = (gsi_top + nr_legacy_irqs()) + 8 * nr_cpu_ids;
+#if defined(CONFIG_PCI_MSI) || defined(CONFIG_HT_IRQ)
+ /*
+ * for MSI and HT dyn irq
+ */
+ if (gsi_top <= NR_IRQS_LEGACY)
+ nr += 8 * nr_cpu_ids;
+ else
+ nr += gsi_top * 16;
+#endif
+ if (nr < nr_irqs)
+ nr_irqs = nr;
+
+ return nr_legacy_irqs();
+}
+
+int __init arch_early_irq_init(void)
+{
+ return arch_early_ioapic_init();
+}
+
+static void __setup_vector_irq(int cpu)
+{
+ /* Initialize vector_irq on a new cpu */
+ int irq, vector;
+ struct irq_cfg *cfg;
+
+ /*
+ * vector_lock will make sure that we don't run into irq vector
+ * assignments that might be happening on another cpu in parallel,
+ * while we setup our initial vector to irq mappings.
+ */
+ raw_spin_lock(&vector_lock);
+ /* Mark the inuse vectors */
+ for_each_active_irq(irq) {
+ cfg = irq_cfg(irq);
+ if (!cfg)
+ continue;
+
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ continue;
+ vector = cfg->vector;
+ per_cpu(vector_irq, cpu)[vector] = irq;
+ }
+ /* Mark the free vectors */
+ for (vector = 0; vector < NR_VECTORS; ++vector) {
+ irq = per_cpu(vector_irq, cpu)[vector];
+ if (irq <= VECTOR_UNDEFINED)
+ continue;
+
+ cfg = irq_cfg(irq);
+ if (!cpumask_test_cpu(cpu, cfg->domain))
+ per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+ }
+ raw_spin_unlock(&vector_lock);
+}
+
+/*
+ * Setup the vector to irq mappings.
+ */
+void setup_vector_irq(int cpu)
+{
+ int irq;
+
+ /*
+ * On most of the platforms, legacy PIC delivers the interrupts on the
+ * boot cpu. But there are certain platforms where PIC interrupts are
+ * delivered to multiple cpu's. If the legacy IRQ is handled by the
+ * legacy PIC, for the new cpu that is coming online, setup the static
+ * legacy vector to irq mapping:
+ */
+ for (irq = 0; irq < nr_legacy_irqs(); irq++)
+ per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
+
+ __setup_vector_irq(cpu);
+}
+
+int apic_retrigger_irq(struct irq_data *data)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ unsigned long flags;
+ int cpu;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ cpu = cpumask_first_and(cfg->domain, cpu_online_mask);
+ apic->send_IPI_mask(cpumask_of(cpu), cfg->vector);
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ return 1;
+}
+
+void apic_ack_edge(struct irq_data *data)
+{
+ irq_complete_move(irqd_cfg(data));
+ irq_move_irq(data);
+ ack_APIC_irq();
+}
+
+/*
+ * Either sets data->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
+ * leaves data->affinity untouched.
+ */
+int apic_set_affinity(struct irq_data *data, const struct cpumask *mask,
+ unsigned int *dest_id)
+{
+ struct irq_cfg *cfg = irqd_cfg(data);
+ unsigned int irq = data->irq;
+ int err;
+
+ if (!config_enabled(CONFIG_SMP))
+ return -EPERM;
+
+ if (!cpumask_intersects(mask, cpu_online_mask))
+ return -EINVAL;
+
+ err = assign_irq_vector(irq, cfg, mask);
+ if (err)
+ return err;
+
+ err = apic->cpu_mask_to_apicid_and(mask, cfg->domain, dest_id);
+ if (err) {
+ if (assign_irq_vector(irq, cfg, data->affinity))
+ pr_err("Failed to recover vector for irq %d\n", irq);
+ return err;
+ }
+
+ cpumask_copy(data->affinity, mask);
+
+ return 0;
+}
+
+#ifdef CONFIG_SMP
+void send_cleanup_vector(struct irq_cfg *cfg)
+{
+ cpumask_var_t cleanup_mask;
+
+ if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+ unsigned int i;
+
+ for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+ apic->send_IPI_mask(cpumask_of(i),
+ IRQ_MOVE_CLEANUP_VECTOR);
+ } else {
+ cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+ apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+ free_cpumask_var(cleanup_mask);
+ }
+ cfg->move_in_progress = 0;
+}
+
+asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
+{
+ unsigned vector, me;
+
+ ack_APIC_irq();
+ irq_enter();
+ exit_idle();
+
+ me = smp_processor_id();
+ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
+ int irq;
+ unsigned int irr;
+ struct irq_desc *desc;
+ struct irq_cfg *cfg;
+
+ irq = __this_cpu_read(vector_irq[vector]);
+
+ if (irq <= VECTOR_UNDEFINED)
+ continue;
+
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ cfg = irq_cfg(irq);
+ if (!cfg)
+ continue;
+
+ raw_spin_lock(&desc->lock);
+
+ /*
+ * Check if the irq migration is in progress. If so, we
+ * haven't received the cleanup request yet for this irq.
+ */
+ if (cfg->move_in_progress)
+ goto unlock;
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ goto unlock;
+
+ irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
+ /*
+ * Check if the vector that needs to be cleanedup is
+ * registered at the cpu's IRR. If so, then this is not
+ * the best time to clean it up. Lets clean it up in the
+ * next attempt by sending another IRQ_MOVE_CLEANUP_VECTOR
+ * to myself.
+ */
+ if (irr & (1 << (vector % 32))) {
+ apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
+ goto unlock;
+ }
+ __this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+unlock:
+ raw_spin_unlock(&desc->lock);
+ }
+
+ irq_exit();
+}
+
+static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
+{
+ unsigned me;
+
+ if (likely(!cfg->move_in_progress))
+ return;
+
+ me = smp_processor_id();
+
+ if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
+ send_cleanup_vector(cfg);
+}
+
+void irq_complete_move(struct irq_cfg *cfg)
+{
+ __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
+}
+
+void irq_force_complete_move(int irq)
+{
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ if (!cfg)
+ return;
+
+ __irq_complete_move(cfg, cfg->vector);
+}
+#endif
+
+/*
+ * Dynamic irq allocate and deallocation. Should be replaced by irq domains!
+ */
+int arch_setup_hwirq(unsigned int irq, int node)
+{
+ struct irq_cfg *cfg;
+ unsigned long flags;
+ int ret;
+
+ cfg = alloc_irq_cfg(irq, node);
+ if (!cfg)
+ return -ENOMEM;
+
+ raw_spin_lock_irqsave(&vector_lock, flags);
+ ret = __assign_irq_vector(irq, cfg, apic->target_cpus());
+ raw_spin_unlock_irqrestore(&vector_lock, flags);
+
+ if (!ret)
+ irq_set_chip_data(irq, cfg);
+ else
+ free_irq_cfg(irq, cfg);
+ return ret;
+}
+
+void arch_teardown_hwirq(unsigned int irq)
+{
+ struct irq_cfg *cfg = irq_cfg(irq);
+
+ free_remapped_irq(irq);
+ clear_irq_vector(irq, cfg);
+ free_irq_cfg(irq, cfg);
+}
+
+static void __init print_APIC_field(int base)
+{
+ int i;
+
+ printk(KERN_DEBUG);
+
+ for (i = 0; i < 8; i++)
+ pr_cont("%08x", apic_read(base + i*0x10));
+
+ pr_cont("\n");
+}
+
+static void __init print_local_APIC(void *dummy)
+{
+ unsigned int i, v, ver, maxlvt;
+ u64 icr;
+
+ pr_debug("printing local APIC contents on CPU#%d/%d:\n",
+ smp_processor_id(), hard_smp_processor_id());
+ v = apic_read(APIC_ID);
+ pr_info("... APIC ID: %08x (%01x)\n", v, read_apic_id());
+ v = apic_read(APIC_LVR);
+ pr_info("... APIC VERSION: %08x\n", v);
+ ver = GET_APIC_VERSION(v);
+ maxlvt = lapic_get_maxlvt();
+
+ v = apic_read(APIC_TASKPRI);
+ pr_debug("... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ if (!APIC_XAPIC(ver)) {
+ v = apic_read(APIC_ARBPRI);
+ pr_debug("... APIC ARBPRI: %08x (%02x)\n",
+ v, v & APIC_ARBPRI_MASK);
+ }
+ v = apic_read(APIC_PROCPRI);
+ pr_debug("... APIC PROCPRI: %08x\n", v);
+ }
+
+ /*
+ * Remote read supported only in the 82489DX and local APIC for
+ * Pentium processors.
+ */
+ if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
+ v = apic_read(APIC_RRR);
+ pr_debug("... APIC RRR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_LDR);
+ pr_debug("... APIC LDR: %08x\n", v);
+ if (!x2apic_enabled()) {
+ v = apic_read(APIC_DFR);
+ pr_debug("... APIC DFR: %08x\n", v);
+ }
+ v = apic_read(APIC_SPIV);
+ pr_debug("... APIC SPIV: %08x\n", v);
+
+ pr_debug("... APIC ISR field:\n");
+ print_APIC_field(APIC_ISR);
+ pr_debug("... APIC TMR field:\n");
+ print_APIC_field(APIC_TMR);
+ pr_debug("... APIC IRR field:\n");
+ print_APIC_field(APIC_IRR);
+
+ /* !82489DX */
+ if (APIC_INTEGRATED(ver)) {
+ /* Due to the Pentium erratum 3AP. */
+ if (maxlvt > 3)
+ apic_write(APIC_ESR, 0);
+
+ v = apic_read(APIC_ESR);
+ pr_debug("... APIC ESR: %08x\n", v);
+ }
+
+ icr = apic_icr_read();
+ pr_debug("... APIC ICR: %08x\n", (u32)icr);
+ pr_debug("... APIC ICR2: %08x\n", (u32)(icr >> 32));
+
+ v = apic_read(APIC_LVTT);
+ pr_debug("... APIC LVTT: %08x\n", v);
+
+ if (maxlvt > 3) {
+ /* PC is LVT#4. */
+ v = apic_read(APIC_LVTPC);
+ pr_debug("... APIC LVTPC: %08x\n", v);
+ }
+ v = apic_read(APIC_LVT0);
+ pr_debug("... APIC LVT0: %08x\n", v);
+ v = apic_read(APIC_LVT1);
+ pr_debug("... APIC LVT1: %08x\n", v);
+
+ if (maxlvt > 2) {
+ /* ERR is LVT#3. */
+ v = apic_read(APIC_LVTERR);
+ pr_debug("... APIC LVTERR: %08x\n", v);
+ }
+
+ v = apic_read(APIC_TMICT);
+ pr_debug("... APIC TMICT: %08x\n", v);
+ v = apic_read(APIC_TMCCT);
+ pr_debug("... APIC TMCCT: %08x\n", v);
+ v = apic_read(APIC_TDCR);
+ pr_debug("... APIC TDCR: %08x\n", v);
+
+ if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+ v = apic_read(APIC_EFEAT);
+ maxlvt = (v >> 16) & 0xff;
+ pr_debug("... APIC EFEAT: %08x\n", v);
+ v = apic_read(APIC_ECTRL);
+ pr_debug("... APIC ECTRL: %08x\n", v);
+ for (i = 0; i < maxlvt; i++) {
+ v = apic_read(APIC_EILVTn(i));
+ pr_debug("... APIC EILVT%d: %08x\n", i, v);
+ }
+ }
+ pr_cont("\n");
+}
+
+static void __init print_local_APICs(int maxcpu)
+{
+ int cpu;
+
+ if (!maxcpu)
+ return;
+
+ preempt_disable();
+ for_each_online_cpu(cpu) {
+ if (cpu >= maxcpu)
+ break;
+ smp_call_function_single(cpu, print_local_APIC, NULL, 1);
+ }
+ preempt_enable();
+}
+
+static void __init print_PIC(void)
+{
+ unsigned int v;
+ unsigned long flags;
+
+ if (!nr_legacy_irqs())
+ return;
+
+ pr_debug("\nprinting PIC contents\n");
+
+ raw_spin_lock_irqsave(&i8259A_lock, flags);
+
+ v = inb(0xa1) << 8 | inb(0x21);
+ pr_debug("... PIC IMR: %04x\n", v);
+
+ v = inb(0xa0) << 8 | inb(0x20);
+ pr_debug("... PIC IRR: %04x\n", v);
+
+ outb(0x0b, 0xa0);
+ outb(0x0b, 0x20);
+ v = inb(0xa0) << 8 | inb(0x20);
+ outb(0x0a, 0xa0);
+ outb(0x0a, 0x20);
+
+ raw_spin_unlock_irqrestore(&i8259A_lock, flags);
+
+ pr_debug("... PIC ISR: %04x\n", v);
+
+ v = inb(0x4d1) << 8 | inb(0x4d0);
+ pr_debug("... PIC ELCR: %04x\n", v);
+}
+
+static int show_lapic __initdata = 1;
+static __init int setup_show_lapic(char *arg)
+{
+ int num = -1;
+
+ if (strcmp(arg, "all") == 0) {
+ show_lapic = CONFIG_NR_CPUS;
+ } else {
+ get_option(&arg, &num);
+ if (num >= 0)
+ show_lapic = num;
+ }
+
+ return 1;
+}
+__setup("show_lapic=", setup_show_lapic);
+
+static int __init print_ICs(void)
+{
+ if (apic_verbosity == APIC_QUIET)
+ return 0;
+
+ print_PIC();
+
+ /* don't print out if apic is not there */
+ if (!cpu_has_apic && !apic_from_smp_config())
+ return 0;
+
+ print_local_APICs(show_lapic);
+ print_IO_APICs();
+
+ return 0;
+}
+
+late_initcall(print_ICs);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
index 08f3fed2b0f2..10b8d3eaaf15 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -276,6 +276,17 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type,
return box;
}
+/*
+ * Using uncore_pmu_event_init pmu event_init callback
+ * as a detection point for uncore events.
+ */
+static int uncore_pmu_event_init(struct perf_event *event);
+
+static bool is_uncore_event(struct perf_event *event)
+{
+ return event->pmu->event_init == uncore_pmu_event_init;
+}
+
static int
uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp)
{
@@ -290,13 +301,18 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, b
return -EINVAL;
n = box->n_events;
- box->event_list[n] = leader;
- n++;
+
+ if (is_uncore_event(leader)) {
+ box->event_list[n] = leader;
+ n++;
+ }
+
if (!dogrp)
return n;
list_for_each_entry(event, &leader->sibling_list, group_entry) {
- if (event->state <= PERF_EVENT_STATE_OFF)
+ if (!is_uncore_event(event) ||
+ event->state <= PERF_EVENT_STATE_OFF)
continue;
if (n >= max_count)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index f5ab56d14287..aceb2f90c716 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -28,6 +28,7 @@
#include <asm/nmi.h>
#include <asm/hw_irq.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/hpet.h>
#include <linux/kdebug.h>
#include <asm/cpu.h>
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 1cf7c97ff175..000d4199b03e 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -732,10 +732,10 @@ ENTRY(interrupt)
ENTRY(irq_entries_start)
RING0_INT_FRAME
vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
.balign 32
.rept 7
- .if vector < NR_VECTORS
+ .if vector < FIRST_SYSTEM_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -4
.endif
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 90878aa38dbd..9ebaf63ba182 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -740,10 +740,10 @@ ENTRY(interrupt)
ENTRY(irq_entries_start)
INTR_FRAME
vector=FIRST_EXTERNAL_VECTOR
-.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
+.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
.balign 32
.rept 7
- .if vector < NR_VECTORS
+ .if vector < FIRST_SYSTEM_VECTOR
.if vector <> FIRST_EXTERNAL_VECTOR
CFI_ADJUST_CFA_OFFSET -8
.endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 4de73ee78361..70e181ea1eac 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -99,32 +99,9 @@ void __init init_IRQ(void)
x86_init.irqs.intr_init();
}
-/*
- * Setup the vector to irq mappings.
- */
-void setup_vector_irq(int cpu)
-{
-#ifndef CONFIG_X86_IO_APIC
- int irq;
-
- /*
- * On most of the platforms, legacy PIC delivers the interrupts on the
- * boot cpu. But there are certain platforms where PIC interrupts are
- * delivered to multiple cpu's. If the legacy IRQ is handled by the
- * legacy PIC, for the new cpu that is coming online, setup the static
- * legacy vector to irq mapping:
- */
- for (irq = 0; irq < nr_legacy_irqs(); irq++)
- per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
-#endif
-
- __setup_vector_irq(cpu);
-}
-
static void __init smp_intr_init(void)
{
#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
/*
* The reschedule interrupt is a CPU-to-CPU reschedule-helper
* IPI, driven by wakeup.
@@ -144,7 +121,6 @@ static void __init smp_intr_init(void)
/* IPI used for rebooting/stopping */
alloc_intr_gate(REBOOT_VECTOR, reboot_interrupt);
-#endif
#endif /* CONFIG_SMP */
}
@@ -159,7 +135,7 @@ static void __init apic_intr_init(void)
alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
#endif
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+#ifdef CONFIG_X86_LOCAL_APIC
/* self generated IPI for local APIC timer */
alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
@@ -197,10 +173,17 @@ void __init native_init_IRQ(void)
* 'special' SMP interrupts)
*/
i = FIRST_EXTERNAL_VECTOR;
- for_each_clear_bit_from(i, used_vectors, NR_VECTORS) {
+#ifndef CONFIG_X86_LOCAL_APIC
+#define first_system_vector NR_VECTORS
+#endif
+ for_each_clear_bit_from(i, used_vectors, first_system_vector) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
}
+#ifdef CONFIG_X86_LOCAL_APIC
+ for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
+ set_intr_gate(i, spurious_interrupt);
+#endif
if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs())
setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index f6945bef2cd1..94f643484300 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
pv_info.name = "KVM";
- pv_info.paravirt_enabled = 1;
+
+ /*
+ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM
+ * guest kernel works like a bare metal kernel with additional
+ * features, and paravirt_enabled is about features that are
+ * missing.
+ */
+ pv_info.paravirt_enabled = 0;
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d9156ceecdff..42caaef897c8 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -59,13 +59,12 @@ static void kvm_get_wallclock(struct timespec *now)
native_write_msr(msr_kvm_wall_clock, low, high);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
pvclock_read_wallclock(&wall_clock, vcpu_time, now);
- preempt_enable();
+ put_cpu();
}
static int kvm_set_wallclock(const struct timespec *now)
@@ -107,11 +106,10 @@ static unsigned long kvm_get_tsc_khz(void)
int cpu;
unsigned long tsc_khz;
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
src = &hv_clock[cpu].pvti;
tsc_khz = pvclock_tsc_khz(src);
- preempt_enable();
+ put_cpu();
return tsc_khz;
}
@@ -263,7 +261,6 @@ void __init kvmclock_init(void)
#endif
kvm_get_preset_lpj();
clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
- pv_info.paravirt_enabled = 1;
pv_info.name = "KVM";
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
@@ -284,23 +281,22 @@ int __init kvm_setup_vsyscall_timeinfo(void)
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
- preempt_disable();
- cpu = smp_processor_id();
+ cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
flags = pvclock_read_flags(vcpu_time);
if (!(flags & PVCLOCK_TSC_STABLE_BIT)) {
- preempt_enable();
+ put_cpu();
return 1;
}
if ((ret = pvclock_init_vsyscall(hv_clock, size))) {
- preempt_enable();
+ put_cpu();
return ret;
}
- preempt_enable();
+ put_cpu();
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 72e8e310258d..469b23d6acc2 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -20,6 +20,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/cpufeature.h>
#include <asm/desc.h>
#include <asm/cacheflush.h>
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 485981059a40..415480d3ea84 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -22,6 +22,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
+#include <asm/io_apic.h>
#include <asm/debugreg.h>
#include <asm/kexec-bzimage64.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 17962e667a91..bae6c609888e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -12,6 +12,7 @@
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
+#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
#include <asm/pgtable.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7a8f5845e8eb..6d7022c683e3 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1084,7 +1084,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned int i;
- preempt_disable();
smp_cpu_index_default();
/*
@@ -1102,22 +1101,19 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
}
set_cpu_sibling_map(0);
-
if (smp_sanity_check(max_cpus) < 0) {
pr_info("SMP disabled\n");
disable_smp();
- goto out;
+ return;
}
default_setup_apic_routing();
- preempt_disable();
if (read_apic_id() != boot_cpu_physical_apicid) {
panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
read_apic_id(), boot_cpu_physical_apicid);
/* Or can we switch back to PIC here? */
}
- preempt_enable();
connect_bsp_APIC();
@@ -1151,8 +1147,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
uv_system_init();
set_mtrr_aps_delayed_init();
-out:
- preempt_enable();
}
void arch_enable_nonboot_cpus_begin(void)
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index 3e551eee87b9..4e942f31b1a7 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -55,12 +55,6 @@ static bool tls_desc_okay(const struct user_desc *info)
if (info->seg_not_present)
return false;
-#ifdef CONFIG_X86_64
- /* The L bit makes no sense for data. */
- if (info->lm)
- return false;
-#endif
-
return true;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a9ae20579895..88900e288021 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -331,7 +331,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
break; /* Success, it was handled */
case 1: /* Bound violation. */
info = mpx_generate_siginfo(regs, xsave_buf);
- if (PTR_ERR(info)) {
+ if (IS_ERR(info)) {
/*
* We failed to decode the MPX instruction. Act as if
* the exception was not caused by MPX.
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 4c540c4719d8..0de1fae2bdf0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -738,3 +738,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate)
return (void *)xsave + xstate_comp_offsets[feature];
}
+EXPORT_SYMBOL_GPL(get_xsave_addr);
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 25d22b2d6509..08f790dfadc9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -7,14 +7,13 @@ CFLAGS_vmx.o := -I.
KVM := ../../../virt/kvm
-kvm-y += $(KVM)/kvm_main.o $(KVM)/ioapic.o \
- $(KVM)/coalesced_mmio.o $(KVM)/irq_comm.o \
+kvm-y += $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
$(KVM)/eventfd.o $(KVM)/irqchip.o $(KVM)/vfio.o
-kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += $(KVM)/assigned-dev.o $(KVM)/iommu.o
kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
- i8254.o cpuid.o pmu.o
+ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o
+kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o
kvm-intel-y += vmx.o
kvm-amd-y += svm.o
diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c
new file mode 100644
index 000000000000..6eb5c20ee373
--- /dev/null
+++ b/arch/x86/kvm/assigned-dev.c
@@ -0,0 +1,1052 @@
+/*
+ * Kernel-based Virtual Machine - device assignment support
+ *
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include "irq.h"
+#include "assigned-dev.h"
+
+struct kvm_assigned_dev_kernel {
+ struct kvm_irq_ack_notifier ack_notifier;
+ struct list_head list;
+ int assigned_dev_id;
+ int host_segnr;
+ int host_busnr;
+ int host_devfn;
+ unsigned int entries_nr;
+ int host_irq;
+ bool host_irq_disabled;
+ bool pci_2_3;
+ struct msix_entry *host_msix_entries;
+ int guest_irq;
+ struct msix_entry *guest_msix_entries;
+ unsigned long irq_requested_type;
+ int irq_source_id;
+ int flags;
+ struct pci_dev *dev;
+ struct kvm *kvm;
+ spinlock_t intx_lock;
+ spinlock_t intx_mask_lock;
+ char irq_name[32];
+ struct pci_saved_state *pci_saved_state;
+};
+
+static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head,
+ int assigned_dev_id)
+{
+ struct list_head *ptr;
+ struct kvm_assigned_dev_kernel *match;
+
+ list_for_each(ptr, head) {
+ match = list_entry(ptr, struct kvm_assigned_dev_kernel, list);
+ if (match->assigned_dev_id == assigned_dev_id)
+ return match;
+ }
+ return NULL;
+}
+
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+ *assigned_dev, int irq)
+{
+ int i, index;
+ struct msix_entry *host_msix_entries;
+
+ host_msix_entries = assigned_dev->host_msix_entries;
+
+ index = -1;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ if (irq == host_msix_entries[i].vector) {
+ index = i;
+ break;
+ }
+ if (index < 0)
+ printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+
+ return index;
+}
+
+static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret;
+
+ spin_lock(&assigned_dev->intx_lock);
+ if (pci_check_and_mask_intx(assigned_dev->dev)) {
+ assigned_dev->host_irq_disabled = true;
+ ret = IRQ_WAKE_THREAD;
+ } else
+ ret = IRQ_NONE;
+ spin_unlock(&assigned_dev->intx_lock);
+
+ return ret;
+}
+
+static void
+kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev,
+ int vector)
+{
+ if (unlikely(assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_GUEST_INTX)) {
+ spin_lock(&assigned_dev->intx_mask_lock);
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX))
+ kvm_set_irq(assigned_dev->kvm,
+ assigned_dev->irq_source_id, vector, 1,
+ false);
+ spin_unlock(&assigned_dev->intx_mask_lock);
+ } else
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ vector, 1, false);
+}
+
+static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ disable_irq_nosync(irq);
+ assigned_dev->host_irq_disabled = true;
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ }
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef __KVM_HAVE_MSI
+static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 1);
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+
+ kvm_assigned_dev_raise_guest_irq(assigned_dev,
+ assigned_dev->guest_irq);
+
+ return IRQ_HANDLED;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+ int ret = 0;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ ret = kvm_set_irq_inatomic(assigned_dev->kvm,
+ assigned_dev->irq_source_id,
+ vector, 1);
+ }
+
+ return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED;
+}
+
+static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
+{
+ struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
+ int index = find_index_from_host_irq(assigned_dev, irq);
+ u32 vector;
+
+ if (index >= 0) {
+ vector = assigned_dev->guest_msix_entries[index].vector;
+ kvm_assigned_dev_raise_guest_irq(assigned_dev, vector);
+ }
+
+ return IRQ_HANDLED;
+}
+#endif
+
+/* Ack the irq line for an assigned device */
+static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
+{
+ struct kvm_assigned_dev_kernel *dev =
+ container_of(kian, struct kvm_assigned_dev_kernel,
+ ack_notifier);
+
+ kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false);
+
+ spin_lock(&dev->intx_mask_lock);
+
+ if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) {
+ bool reassert = false;
+
+ spin_lock_irq(&dev->intx_lock);
+ /*
+ * The guest IRQ may be shared so this ack can come from an
+ * IRQ for another guest device.
+ */
+ if (dev->host_irq_disabled) {
+ if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3))
+ enable_irq(dev->host_irq);
+ else if (!pci_check_and_unmask_intx(dev->dev))
+ reassert = true;
+ dev->host_irq_disabled = reassert;
+ }
+ spin_unlock_irq(&dev->intx_lock);
+
+ if (reassert)
+ kvm_set_irq(dev->kvm, dev->irq_source_id,
+ dev->guest_irq, 1, false);
+ }
+
+ spin_unlock(&dev->intx_mask_lock);
+}
+
+static void deassign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ if (assigned_dev->ack_notifier.gsi != -1)
+ kvm_unregister_irq_ack_notifier(kvm,
+ &assigned_dev->ack_notifier);
+
+ kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+ assigned_dev->guest_irq, 0, false);
+
+ if (assigned_dev->irq_source_id != -1)
+ kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
+ assigned_dev->irq_source_id = -1;
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
+
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ /*
+ * We disable irq here to prevent further events.
+ *
+ * Notice this maybe result in nested disable if the interrupt type is
+ * INTx, but it's OK for we are going to free it.
+ *
+ * If this function is a part of VM destroy, please ensure that till
+ * now, the kvm state is still legal for probably we also have to wait
+ * on a currently running IRQ handler.
+ */
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
+ int i;
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ disable_irq(assigned_dev->host_msix_entries[i].vector);
+
+ for (i = 0; i < assigned_dev->entries_nr; i++)
+ free_irq(assigned_dev->host_msix_entries[i].vector,
+ assigned_dev);
+
+ assigned_dev->entries_nr = 0;
+ kfree(assigned_dev->host_msix_entries);
+ kfree(assigned_dev->guest_msix_entries);
+ pci_disable_msix(assigned_dev->dev);
+ } else {
+ /* Deal with MSI and INTx */
+ if ((assigned_dev->irq_requested_type &
+ KVM_DEV_IRQ_HOST_INTX) &&
+ (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ spin_lock_irq(&assigned_dev->intx_lock);
+ pci_intx(assigned_dev->dev, false);
+ spin_unlock_irq(&assigned_dev->intx_lock);
+ synchronize_irq(assigned_dev->host_irq);
+ } else
+ disable_irq(assigned_dev->host_irq);
+
+ free_irq(assigned_dev->host_irq, assigned_dev);
+
+ if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
+ pci_disable_msi(assigned_dev->dev);
+ }
+
+ assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev,
+ unsigned long irq_requested_type)
+{
+ unsigned long guest_irq_type, host_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return -EINVAL;
+ /* no irq assignment to deassign */
+ if (!assigned_dev->irq_requested_type)
+ return -ENXIO;
+
+ host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+ guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+ if (host_irq_type)
+ deassign_host_irq(kvm, assigned_dev);
+ if (guest_irq_type)
+ deassign_guest_irq(kvm, assigned_dev);
+
+ return 0;
+}
+
+static void kvm_free_assigned_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *assigned_dev)
+{
+ kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
+
+static void kvm_free_assigned_device(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel
+ *assigned_dev)
+{
+ kvm_free_assigned_irq(kvm, assigned_dev);
+
+ pci_reset_function(assigned_dev->dev);
+ if (pci_load_and_free_saved_state(assigned_dev->dev,
+ &assigned_dev->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&assigned_dev->dev->dev));
+ else
+ pci_restore_state(assigned_dev->dev);
+
+ pci_clear_dev_assigned(assigned_dev->dev);
+
+ pci_release_regions(assigned_dev->dev);
+ pci_disable_device(assigned_dev->dev);
+ pci_dev_put(assigned_dev->dev);
+
+ list_del(&assigned_dev->list);
+ kfree(assigned_dev);
+}
+
+void kvm_free_all_assigned_devices(struct kvm *kvm)
+{
+ struct list_head *ptr, *ptr2;
+ struct kvm_assigned_dev_kernel *assigned_dev;
+
+ list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) {
+ assigned_dev = list_entry(ptr,
+ struct kvm_assigned_dev_kernel,
+ list);
+
+ kvm_free_assigned_device(kvm, assigned_dev);
+ }
+}
+
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ irq_handler_t irq_handler;
+ unsigned long flags;
+
+ dev->host_irq = dev->dev->irq;
+
+ /*
+ * We can only share the IRQ line with other host devices if we are
+ * able to disable the IRQ source at device-level - independently of
+ * the guest driver. Otherwise host devices may suffer from unbounded
+ * IRQ latencies when the guest keeps the line asserted.
+ */
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ irq_handler = kvm_assigned_dev_intx;
+ flags = IRQF_SHARED;
+ } else {
+ irq_handler = NULL;
+ flags = IRQF_ONESHOT;
+ }
+ if (request_threaded_irq(dev->host_irq, irq_handler,
+ kvm_assigned_dev_thread_intx, flags,
+ dev->irq_name, dev))
+ return -EIO;
+
+ if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) {
+ spin_lock_irq(&dev->intx_lock);
+ pci_intx(dev->dev, true);
+ spin_unlock_irq(&dev->intx_lock);
+ }
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int r;
+
+ if (!dev->dev->msi_enabled) {
+ r = pci_enable_msi(dev->dev);
+ if (r)
+ return r;
+ }
+
+ dev->host_irq = dev->dev->irq;
+ if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi,
+ kvm_assigned_dev_thread_msi, 0,
+ dev->irq_name, dev)) {
+ pci_disable_msi(dev->dev);
+ return -EIO;
+ }
+
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev)
+{
+ int i, r = -EINVAL;
+
+ /* host_msix_entries and guest_msix_entries should have been
+ * initialized */
+ if (dev->entries_nr == 0)
+ return r;
+
+ r = pci_enable_msix_exact(dev->dev,
+ dev->host_msix_entries, dev->entries_nr);
+ if (r)
+ return r;
+
+ for (i = 0; i < dev->entries_nr; i++) {
+ r = request_threaded_irq(dev->host_msix_entries[i].vector,
+ kvm_assigned_dev_msix,
+ kvm_assigned_dev_thread_msix,
+ 0, dev->irq_name, dev);
+ if (r)
+ goto err;
+ }
+
+ return 0;
+err:
+ for (i -= 1; i >= 0; i--)
+ free_irq(dev->host_msix_entries[i].vector, dev);
+ pci_disable_msix(dev->dev);
+ return r;
+}
+
+#endif
+
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = irq->guest_irq;
+ return 0;
+}
+
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ return 0;
+}
+#endif
+
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq)
+{
+ dev->guest_irq = irq->guest_irq;
+ dev->ack_notifier.gsi = -1;
+ return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ __u32 host_irq_type)
+{
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+ return r;
+
+ snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s",
+ pci_name(dev->dev));
+
+ switch (host_irq_type) {
+ case KVM_DEV_IRQ_HOST_INTX:
+ r = assigned_device_enable_host_intx(kvm, dev);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_HOST_MSI:
+ r = assigned_device_enable_host_msi(kvm, dev);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_HOST_MSIX:
+ r = assigned_device_enable_host_msix(kvm, dev);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+ dev->host_irq_disabled = false;
+
+ if (!r)
+ dev->irq_requested_type |= host_irq_type;
+
+ return r;
+}
+
+static int assign_guest_irq(struct kvm *kvm,
+ struct kvm_assigned_dev_kernel *dev,
+ struct kvm_assigned_irq *irq,
+ unsigned long guest_irq_type)
+{
+ int id;
+ int r = -EEXIST;
+
+ if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+ return r;
+
+ id = kvm_request_irq_source_id(kvm);
+ if (id < 0)
+ return id;
+
+ dev->irq_source_id = id;
+
+ switch (guest_irq_type) {
+ case KVM_DEV_IRQ_GUEST_INTX:
+ r = assigned_device_enable_guest_intx(kvm, dev, irq);
+ break;
+#ifdef __KVM_HAVE_MSI
+ case KVM_DEV_IRQ_GUEST_MSI:
+ r = assigned_device_enable_guest_msi(kvm, dev, irq);
+ break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+ case KVM_DEV_IRQ_GUEST_MSIX:
+ r = assigned_device_enable_guest_msix(kvm, dev, irq);
+ break;
+#endif
+ default:
+ r = -EINVAL;
+ }
+
+ if (!r) {
+ dev->irq_requested_type |= guest_irq_type;
+ if (dev->ack_notifier.gsi != -1)
+ kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+ } else {
+ kvm_free_irq_source_id(kvm, dev->irq_source_id);
+ dev->irq_source_id = -1;
+ }
+
+ return r;
+}
+
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
+ struct kvm_assigned_irq *assigned_irq)
+{
+ int r = -EINVAL;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long host_irq_type, guest_irq_type;
+
+ if (!irqchip_in_kernel(kvm))
+ return r;
+
+ mutex_lock(&kvm->lock);
+ r = -ENODEV;
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+ guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
+
+ r = -EINVAL;
+ /* can only assign one type at a time */
+ if (hweight_long(host_irq_type) > 1)
+ goto out;
+ if (hweight_long(guest_irq_type) > 1)
+ goto out;
+ if (host_irq_type == 0 && guest_irq_type == 0)
+ goto out;
+
+ r = 0;
+ if (host_irq_type)
+ r = assign_host_irq(kvm, match, host_irq_type);
+ if (r)
+ goto out;
+
+ if (guest_irq_type)
+ r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+ struct kvm_assigned_irq
+ *assigned_irq)
+{
+ int r = -ENODEV;
+ struct kvm_assigned_dev_kernel *match;
+ unsigned long irq_type;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_irq->assigned_dev_id);
+ if (!match)
+ goto out;
+
+ irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK |
+ KVM_DEV_IRQ_GUEST_MASK);
+ r = kvm_deassign_irq(kvm, match, irq_type);
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+/*
+ * We want to test whether the caller has been granted permissions to
+ * use this device. To be able to configure and control the device,
+ * the user needs access to PCI configuration space and BAR resources.
+ * These are accessed through PCI sysfs. PCI config space is often
+ * passed to the process calling this ioctl via file descriptor, so we
+ * can't rely on access to that file. We can check for permissions
+ * on each of the BAR resource files, which is a pretty clear
+ * indicator that the user has been granted access to the device.
+ */
+static int probe_sysfs_permissions(struct pci_dev *dev)
+{
+#ifdef CONFIG_SYSFS
+ int i;
+ bool bar_found = false;
+
+ for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) {
+ char *kpath, *syspath;
+ struct path path;
+ struct inode *inode;
+ int r;
+
+ if (!pci_resource_len(dev, i))
+ continue;
+
+ kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL);
+ if (!kpath)
+ return -ENOMEM;
+
+ /* Per sysfs-rules, sysfs is always at /sys */
+ syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i);
+ kfree(kpath);
+ if (!syspath)
+ return -ENOMEM;
+
+ r = kern_path(syspath, LOOKUP_FOLLOW, &path);
+ kfree(syspath);
+ if (r)
+ return r;
+
+ inode = path.dentry->d_inode;
+
+ r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS);
+ path_put(&path);
+ if (r)
+ return r;
+
+ bar_found = true;
+ }
+
+ /* If no resources, probably something special */
+ if (!bar_found)
+ return -EPERM;
+
+ return 0;
+#else
+ return -EINVAL; /* No way to control the device without sysfs */
+#endif
+}
+
+static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0, idx;
+ struct kvm_assigned_dev_kernel *match;
+ struct pci_dev *dev;
+
+ if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ idx = srcu_read_lock(&kvm->srcu);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (match) {
+ /* device already assigned */
+ r = -EEXIST;
+ goto out;
+ }
+
+ match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL);
+ if (match == NULL) {
+ printk(KERN_INFO "%s: Couldn't allocate memory\n",
+ __func__);
+ r = -ENOMEM;
+ goto out;
+ }
+ dev = pci_get_domain_bus_and_slot(assigned_dev->segnr,
+ assigned_dev->busnr,
+ assigned_dev->devfn);
+ if (!dev) {
+ printk(KERN_INFO "%s: host device not found\n", __func__);
+ r = -EINVAL;
+ goto out_free;
+ }
+
+ /* Don't allow bridges to be assigned */
+ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) {
+ r = -EPERM;
+ goto out_put;
+ }
+
+ r = probe_sysfs_permissions(dev);
+ if (r)
+ goto out_put;
+
+ if (pci_enable_device(dev)) {
+ printk(KERN_INFO "%s: Could not enable PCI device\n", __func__);
+ r = -EBUSY;
+ goto out_put;
+ }
+ r = pci_request_regions(dev, "kvm_assigned_device");
+ if (r) {
+ printk(KERN_INFO "%s: Could not get access to device regions\n",
+ __func__);
+ goto out_disable;
+ }
+
+ pci_reset_function(dev);
+ pci_save_state(dev);
+ match->pci_saved_state = pci_store_saved_state(dev);
+ if (!match->pci_saved_state)
+ printk(KERN_DEBUG "%s: Couldn't store %s saved state\n",
+ __func__, dev_name(&dev->dev));
+
+ if (!pci_intx_mask_supported(dev))
+ assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3;
+
+ match->assigned_dev_id = assigned_dev->assigned_dev_id;
+ match->host_segnr = assigned_dev->segnr;
+ match->host_busnr = assigned_dev->busnr;
+ match->host_devfn = assigned_dev->devfn;
+ match->flags = assigned_dev->flags;
+ match->dev = dev;
+ spin_lock_init(&match->intx_lock);
+ spin_lock_init(&match->intx_mask_lock);
+ match->irq_source_id = -1;
+ match->kvm = kvm;
+ match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+
+ list_add(&match->list, &kvm->arch.assigned_dev_head);
+
+ if (!kvm->arch.iommu_domain) {
+ r = kvm_iommu_map_guest(kvm);
+ if (r)
+ goto out_list_del;
+ }
+ r = kvm_assign_device(kvm, match->dev);
+ if (r)
+ goto out_list_del;
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+out_list_del:
+ if (pci_load_and_free_saved_state(dev, &match->pci_saved_state))
+ printk(KERN_INFO "%s: Couldn't reload %s saved state\n",
+ __func__, dev_name(&dev->dev));
+ list_del(&match->list);
+ pci_release_regions(dev);
+out_disable:
+ pci_disable_device(dev);
+out_put:
+ pci_dev_put(dev);
+out_free:
+ kfree(match);
+ srcu_read_unlock(&kvm->srcu, idx);
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_deassign_device(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ printk(KERN_INFO "%s: device hasn't been assigned before, "
+ "so cannot be deassigned\n", __func__);
+ r = -EINVAL;
+ goto out;
+ }
+
+ kvm_deassign_device(kvm, match->dev);
+
+ kvm_free_assigned_device(kvm, match);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+ struct kvm_assigned_msix_nr *entry_nr)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry_nr->assigned_dev_id);
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ if (adev->entries_nr == 0) {
+ adev->entries_nr = entry_nr->entry_nr;
+ if (adev->entries_nr == 0 ||
+ adev->entries_nr > KVM_MAX_MSIX_PER_DEV) {
+ r = -EINVAL;
+ goto msix_nr_out;
+ }
+
+ adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+ entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->host_msix_entries) {
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ adev->guest_msix_entries =
+ kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr,
+ GFP_KERNEL);
+ if (!adev->guest_msix_entries) {
+ kfree(adev->host_msix_entries);
+ r = -ENOMEM;
+ goto msix_nr_out;
+ }
+ } else /* Not allowed set MSI-X number twice */
+ r = -EINVAL;
+msix_nr_out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+ struct kvm_assigned_msix_entry *entry)
+{
+ int r = 0, i;
+ struct kvm_assigned_dev_kernel *adev;
+
+ mutex_lock(&kvm->lock);
+
+ adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ entry->assigned_dev_id);
+
+ if (!adev) {
+ r = -EINVAL;
+ goto msix_entry_out;
+ }
+
+ for (i = 0; i < adev->entries_nr; i++)
+ if (adev->guest_msix_entries[i].vector == 0 ||
+ adev->guest_msix_entries[i].entry == entry->entry) {
+ adev->guest_msix_entries[i].entry = entry->entry;
+ adev->guest_msix_entries[i].vector = entry->gsi;
+ adev->host_msix_entries[i].entry = entry->entry;
+ break;
+ }
+ if (i == adev->entries_nr) {
+ r = -ENOSPC;
+ goto msix_entry_out;
+ }
+
+msix_entry_out:
+ mutex_unlock(&kvm->lock);
+
+ return r;
+}
+#endif
+
+static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
+ struct kvm_assigned_pci_dev *assigned_dev)
+{
+ int r = 0;
+ struct kvm_assigned_dev_kernel *match;
+
+ mutex_lock(&kvm->lock);
+
+ match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+ assigned_dev->assigned_dev_id);
+ if (!match) {
+ r = -ENODEV;
+ goto out;
+ }
+
+ spin_lock(&match->intx_mask_lock);
+
+ match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX;
+ match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX;
+
+ if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) {
+ if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) {
+ kvm_set_irq(match->kvm, match->irq_source_id,
+ match->guest_irq, 0, false);
+ /*
+ * Masking at hardware-level is performed on demand,
+ * i.e. when an IRQ actually arrives at the host.
+ */
+ } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) {
+ /*
+ * Unmask the IRQ line if required. Unmasking at
+ * device level will be performed by user space.
+ */
+ spin_lock_irq(&match->intx_lock);
+ if (match->host_irq_disabled) {
+ enable_irq(match->host_irq);
+ match->host_irq_disabled = false;
+ }
+ spin_unlock_irq(&match->intx_lock);
+ }
+ }
+
+ spin_unlock(&match->intx_mask_lock);
+
+out:
+ mutex_unlock(&kvm->lock);
+ return r;
+}
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ int r;
+
+ switch (ioctl) {
+ case KVM_ASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_IRQ: {
+ r = -EOPNOTSUPP;
+ break;
+ }
+ case KVM_ASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_DEV_IRQ: {
+ struct kvm_assigned_irq assigned_irq;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+ goto out;
+ r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_DEASSIGN_PCI_DEVICE: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev);
+ if (r)
+ goto out;
+ break;
+ }
+#ifdef __KVM_HAVE_MSIX
+ case KVM_ASSIGN_SET_MSIX_NR: {
+ struct kvm_assigned_msix_nr entry_nr;
+ r = -EFAULT;
+ if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+ if (r)
+ goto out;
+ break;
+ }
+ case KVM_ASSIGN_SET_MSIX_ENTRY: {
+ struct kvm_assigned_msix_entry entry;
+ r = -EFAULT;
+ if (copy_from_user(&entry, argp, sizeof entry))
+ goto out;
+ r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+ if (r)
+ goto out;
+ break;
+ }
+#endif
+ case KVM_ASSIGN_SET_INTX_MASK: {
+ struct kvm_assigned_pci_dev assigned_dev;
+
+ r = -EFAULT;
+ if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev))
+ goto out;
+ r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev);
+ break;
+ }
+ default:
+ r = -ENOTTY;
+ break;
+ }
+out:
+ return r;
+}
diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h
new file mode 100644
index 000000000000..a428c1a211b2
--- /dev/null
+++ b/arch/x86/kvm/assigned-dev.h
@@ -0,0 +1,32 @@
+#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H
+#define ARCH_X86_KVM_ASSIGNED_DEV_H
+
+#include <linux/kvm_host.h>
+
+#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
+int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev);
+int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev);
+
+int kvm_iommu_map_guest(struct kvm *kvm);
+int kvm_iommu_unmap_guest(struct kvm *kvm);
+
+long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg);
+
+void kvm_free_all_assigned_devices(struct kvm *kvm);
+#else
+static inline int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ return 0;
+}
+
+static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
+ unsigned long arg)
+{
+ return -ENOTTY;
+}
+
+static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {}
+#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */
+
+#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 976e3a57f9ea..8a80737ee6e6 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -23,7 +23,7 @@
#include "mmu.h"
#include "trace.h"
-static u32 xstate_required_size(u64 xstate_bv)
+static u32 xstate_required_size(u64 xstate_bv, bool compacted)
{
int feature_bit = 0;
u32 ret = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
@@ -31,9 +31,10 @@ static u32 xstate_required_size(u64 xstate_bv)
xstate_bv &= XSTATE_EXTEND_MASK;
while (xstate_bv) {
if (xstate_bv & 0x1) {
- u32 eax, ebx, ecx, edx;
+ u32 eax, ebx, ecx, edx, offset;
cpuid_count(0xD, feature_bit, &eax, &ebx, &ecx, &edx);
- ret = max(ret, eax + ebx);
+ offset = compacted ? ret : ebx;
+ ret = max(ret, offset + eax);
}
xstate_bv >>= 1;
@@ -53,6 +54,8 @@ u64 kvm_supported_xcr0(void)
return xcr0;
}
+#define F(x) bit(X86_FEATURE_##x)
+
int kvm_update_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_cpuid_entry2 *best;
@@ -64,13 +67,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
/* Update OSXSAVE bit */
if (cpu_has_xsave && best->function == 0x1) {
- best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+ best->ecx &= ~F(OSXSAVE);
if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
- best->ecx |= bit(X86_FEATURE_OSXSAVE);
+ best->ecx |= F(OSXSAVE);
}
if (apic) {
- if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER))
+ if (best->ecx & F(TSC_DEADLINE_TIMER))
apic->lapic_timer.timer_mode_mask = 3 << 17;
else
apic->lapic_timer.timer_mode_mask = 1 << 17;
@@ -85,9 +88,13 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
(best->eax | ((u64)best->edx << 32)) &
kvm_supported_xcr0();
vcpu->arch.guest_xstate_size = best->ebx =
- xstate_required_size(vcpu->arch.xcr0);
+ xstate_required_size(vcpu->arch.xcr0, false);
}
+ best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ if (best && (best->eax & (F(XSAVES) | F(XSAVEC))))
+ best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
+
/*
* The existing code assumes virtual address is 48-bit in the canonical
* address checks; exit if it is ever changed.
@@ -122,8 +129,8 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
break;
}
}
- if (entry && (entry->edx & bit(X86_FEATURE_NX)) && !is_efer_nx()) {
- entry->edx &= ~bit(X86_FEATURE_NX);
+ if (entry && (entry->edx & F(NX)) && !is_efer_nx()) {
+ entry->edx &= ~F(NX);
printk(KERN_INFO "kvm: guest NX capability removed\n");
}
}
@@ -227,8 +234,6 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
entry->flags = 0;
}
-#define F(x) bit(X86_FEATURE_##x)
-
static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry,
u32 func, u32 index, int *nent, int maxnent)
{
@@ -267,6 +272,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0;
unsigned f_mpx = kvm_x86_ops->mpx_supported() ? F(MPX) : 0;
+ unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0;
/* cpuid 1.edx */
const u32 kvm_supported_word0_x86_features =
@@ -317,7 +323,12 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
const u32 kvm_supported_word9_x86_features =
F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
- F(ADX) | F(SMAP);
+ F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
+ F(AVX512CD);
+
+ /* cpuid 0xD.1.eax */
+ const u32 kvm_supported_word10_x86_features =
+ F(XSAVEOPT) | F(XSAVEC) | F(XGETBV1) | f_xsaves;
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
@@ -453,16 +464,34 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
u64 supported = kvm_supported_xcr0();
entry->eax &= supported;
+ entry->ebx = xstate_required_size(supported, false);
+ entry->ecx = entry->ebx;
entry->edx &= supported >> 32;
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+ if (!supported)
+ break;
+
for (idx = 1, i = 1; idx < 64; ++idx) {
u64 mask = ((u64)1 << idx);
if (*nent >= maxnent)
goto out;
do_cpuid_1_ent(&entry[i], function, idx);
- if (entry[i].eax == 0 || !(supported & mask))
- continue;
+ if (idx == 1) {
+ entry[i].eax &= kvm_supported_word10_x86_features;
+ entry[i].ebx = 0;
+ if (entry[i].eax & (F(XSAVES)|F(XSAVEC)))
+ entry[i].ebx =
+ xstate_required_size(supported,
+ true);
+ } else {
+ if (entry[i].eax == 0 || !(supported & mask))
+ continue;
+ if (WARN_ON_ONCE(entry[i].ecx & 1))
+ continue;
+ }
+ entry[i].ecx = 0;
+ entry[i].edx = 0;
entry[i].flags |=
KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
++*nent;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 9f8a2faf5040..169b09d76ddd 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -123,6 +123,7 @@
#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
#define Escape (5<<15) /* Escape to coprocessor instruction */
+#define InstrDual (6<<15) /* Alternate instruction decoding of mod == 3 */
#define Sse (1<<18) /* SSE Vector instruction */
/* Generic ModRM decode. */
#define ModRM (1<<19)
@@ -166,6 +167,8 @@
#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */
#define NoBigReal ((u64)1 << 50) /* No big real mode */
#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */
+#define NearBranch ((u64)1 << 52) /* Near branches */
+#define No16 ((u64)1 << 53) /* No 16 bit operand */
#define DstXacc (DstAccLo | SrcAccHi | SrcWrite)
@@ -209,6 +212,7 @@ struct opcode {
const struct group_dual *gdual;
const struct gprefix *gprefix;
const struct escape *esc;
+ const struct instr_dual *idual;
void (*fastop)(struct fastop *fake);
} u;
int (*check_perm)(struct x86_emulate_ctxt *ctxt);
@@ -231,6 +235,11 @@ struct escape {
struct opcode high[64];
};
+struct instr_dual {
+ struct opcode mod012;
+ struct opcode mod3;
+};
+
/* EFLAGS bit definitions. */
#define EFLG_ID (1<<21)
#define EFLG_VIP (1<<20)
@@ -379,6 +388,15 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *));
ON64(FOP2E(op##q, rax, cl)) \
FOP_END
+/* 2 operand, src and dest are reversed */
+#define FASTOP2R(op, name) \
+ FOP_START(name) \
+ FOP2E(op##b, dl, al) \
+ FOP2E(op##w, dx, ax) \
+ FOP2E(op##l, edx, eax) \
+ ON64(FOP2E(op##q, rdx, rax)) \
+ FOP_END
+
#define FOP3E(op, dst, src, src2) \
FOP_ALIGN #op " %" #src2 ", %" #src ", %" #dst " \n\t" FOP_RET
@@ -477,9 +495,9 @@ address_mask(struct x86_emulate_ctxt *ctxt, unsigned long reg)
}
static inline unsigned long
-register_address(struct x86_emulate_ctxt *ctxt, unsigned long reg)
+register_address(struct x86_emulate_ctxt *ctxt, int reg)
{
- return address_mask(ctxt, reg);
+ return address_mask(ctxt, reg_read(ctxt, reg));
}
static void masked_increment(ulong *reg, ulong mask, int inc)
@@ -488,7 +506,7 @@ static void masked_increment(ulong *reg, ulong mask, int inc)
}
static inline void
-register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, int inc)
+register_address_increment(struct x86_emulate_ctxt *ctxt, int reg, int inc)
{
ulong mask;
@@ -496,7 +514,7 @@ register_address_increment(struct x86_emulate_ctxt *ctxt, unsigned long *reg, in
mask = ~0UL;
else
mask = ad_mask(ctxt);
- masked_increment(reg, mask, inc);
+ masked_increment(reg_rmw(ctxt, reg), mask, inc);
}
static void rsp_increment(struct x86_emulate_ctxt *ctxt, int inc)
@@ -564,40 +582,6 @@ static int emulate_nm(struct x86_emulate_ctxt *ctxt)
return emulate_exception(ctxt, NM_VECTOR, 0, false);
}
-static inline int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
- int cs_l)
-{
- switch (ctxt->op_bytes) {
- case 2:
- ctxt->_eip = (u16)dst;
- break;
- case 4:
- ctxt->_eip = (u32)dst;
- break;
-#ifdef CONFIG_X86_64
- case 8:
- if ((cs_l && is_noncanonical_address(dst)) ||
- (!cs_l && (dst >> 32) != 0))
- return emulate_gp(ctxt, 0);
- ctxt->_eip = dst;
- break;
-#endif
- default:
- WARN(1, "unsupported eip assignment size\n");
- }
- return X86EMUL_CONTINUE;
-}
-
-static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
-{
- return assign_eip_far(ctxt, dst, ctxt->mode == X86EMUL_MODE_PROT64);
-}
-
-static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
-{
- return assign_eip_near(ctxt, ctxt->_eip + rel);
-}
-
static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
{
u16 selector;
@@ -641,25 +625,24 @@ static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size)
return true;
}
-static int __linearize(struct x86_emulate_ctxt *ctxt,
- struct segmented_address addr,
- unsigned *max_size, unsigned size,
- bool write, bool fetch,
- ulong *linear)
+static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
+ struct segmented_address addr,
+ unsigned *max_size, unsigned size,
+ bool write, bool fetch,
+ enum x86emul_mode mode, ulong *linear)
{
struct desc_struct desc;
bool usable;
ulong la;
u32 lim;
u16 sel;
- unsigned cpl;
la = seg_base(ctxt, addr.seg) + addr.ea;
*max_size = 0;
- switch (ctxt->mode) {
+ switch (mode) {
case X86EMUL_MODE_PROT64:
- if (((signed long)la << 16) >> 16 != la)
- return emulate_gp(ctxt, 0);
+ if (is_noncanonical_address(la))
+ goto bad;
*max_size = min_t(u64, ~0u, (1ull << 48) - la);
if (size > *max_size)
@@ -678,46 +661,20 @@ static int __linearize(struct x86_emulate_ctxt *ctxt,
if (!fetch && (desc.type & 8) && !(desc.type & 2))
goto bad;
lim = desc_limit_scaled(&desc);
- if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch &&
- (ctxt->d & NoBigReal)) {
- /* la is between zero and 0xffff */
- if (la > 0xffff)
- goto bad;
- *max_size = 0x10000 - la;
- } else if ((desc.type & 8) || !(desc.type & 4)) {
- /* expand-up segment */
- if (addr.ea > lim)
- goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
- } else {
+ if (!(desc.type & 8) && (desc.type & 4)) {
/* expand-down segment */
if (addr.ea <= lim)
goto bad;
lim = desc.d ? 0xffffffff : 0xffff;
- if (addr.ea > lim)
- goto bad;
- *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
}
+ if (addr.ea > lim)
+ goto bad;
+ *max_size = min_t(u64, ~0u, (u64)lim + 1 - addr.ea);
if (size > *max_size)
goto bad;
- cpl = ctxt->ops->cpl(ctxt);
- if (!(desc.type & 8)) {
- /* data segment */
- if (cpl > desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && !(desc.type & 4)) {
- /* nonconforming code segment */
- if (cpl != desc.dpl)
- goto bad;
- } else if ((desc.type & 8) && (desc.type & 4)) {
- /* conforming code segment */
- if (cpl < desc.dpl)
- goto bad;
- }
+ la &= (u32)-1;
break;
}
- if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : ctxt->ad_bytes != 8)
- la &= (u32)-1;
if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
return emulate_gp(ctxt, 0);
*linear = la;
@@ -735,9 +692,55 @@ static int linearize(struct x86_emulate_ctxt *ctxt,
ulong *linear)
{
unsigned max_size;
- return __linearize(ctxt, addr, &max_size, size, write, false, linear);
+ return __linearize(ctxt, addr, &max_size, size, write, false,
+ ctxt->mode, linear);
+}
+
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
+ enum x86emul_mode mode)
+{
+ ulong linear;
+ int rc;
+ unsigned max_size;
+ struct segmented_address addr = { .seg = VCPU_SREG_CS,
+ .ea = dst };
+
+ if (ctxt->op_bytes != sizeof(unsigned long))
+ addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ if (rc == X86EMUL_CONTINUE)
+ ctxt->_eip = addr.ea;
+ return rc;
+}
+
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ return assign_eip(ctxt, dst, ctxt->mode);
}
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
+ const struct desc_struct *cs_desc)
+{
+ enum x86emul_mode mode = ctxt->mode;
+
+#ifdef CONFIG_X86_64
+ if (ctxt->mode >= X86EMUL_MODE_PROT32 && cs_desc->l) {
+ u64 efer = 0;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+ if (efer & EFER_LMA)
+ mode = X86EMUL_MODE_PROT64;
+ }
+#endif
+ if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
+ mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ return assign_eip(ctxt, dst, mode);
+}
+
+static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
+{
+ return assign_eip_near(ctxt, ctxt->_eip + rel);
+}
static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
struct segmented_address addr,
@@ -776,7 +779,8 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size)
* boundary check itself. Instead, we use max_size to check
* against op_size.
*/
- rc = __linearize(ctxt, addr, &max_size, 0, false, true, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 0, false, true, ctxt->mode,
+ &linear);
if (unlikely(rc != X86EMUL_CONTINUE))
return rc;
@@ -911,6 +915,8 @@ FASTOP2W(btc);
FASTOP2(xadd);
+FASTOP2R(cmp, cmp_r);
+
static u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
@@ -1221,6 +1227,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
if (index_reg != 4)
modrm_ea += reg_read(ctxt, index_reg) << scale;
} else if ((ctxt->modrm_rm & 7) == 5 && ctxt->modrm_mod == 0) {
+ modrm_ea += insn_fetch(s32, ctxt);
if (ctxt->mode == X86EMUL_MODE_PROT64)
ctxt->rip_relative = 1;
} else {
@@ -1229,10 +1236,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
adjust_modrm_seg(ctxt, base_reg);
}
switch (ctxt->modrm_mod) {
- case 0:
- if (ctxt->modrm_rm == 5)
- modrm_ea += insn_fetch(s32, ctxt);
- break;
case 1:
modrm_ea += insn_fetch(s8, ctxt);
break;
@@ -1284,7 +1287,8 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt)
else
sv = (s64)ctxt->src.val & (s64)mask;
- ctxt->dst.addr.mem.ea += (sv >> 3);
+ ctxt->dst.addr.mem.ea = address_mask(ctxt,
+ ctxt->dst.addr.mem.ea + (sv >> 3));
}
/* only subword offset */
@@ -1610,6 +1614,9 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
sizeof(base3), &ctxt->exception);
if (ret != X86EMUL_CONTINUE)
return ret;
+ if (is_noncanonical_address(get_desc_base(&seg_desc) |
+ ((u64)base3 << 32)))
+ return emulate_gp(ctxt, 0);
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
@@ -1807,6 +1814,10 @@ static int em_push_sreg(struct x86_emulate_ctxt *ctxt)
int seg = ctxt->src2.val;
ctxt->src.val = get_segment_selector(ctxt, seg);
+ if (ctxt->op_bytes == 4) {
+ rsp_increment(ctxt, -2);
+ ctxt->op_bytes = 2;
+ }
return em_push(ctxt);
}
@@ -1850,7 +1861,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
static int em_pushf(struct x86_emulate_ctxt *ctxt)
{
- ctxt->src.val = (unsigned long)ctxt->eflags;
+ ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
return em_push(ctxt);
}
@@ -2035,7 +2046,7 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE) {
WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
/* assigning eip failed; restore the old cs */
@@ -2045,31 +2056,22 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
return rc;
}
-static int em_grp45(struct x86_emulate_ctxt *ctxt)
+static int em_jmp_abs(struct x86_emulate_ctxt *ctxt)
{
- int rc = X86EMUL_CONTINUE;
+ return assign_eip_near(ctxt, ctxt->src.val);
+}
- switch (ctxt->modrm_reg) {
- case 2: /* call near abs */ {
- long int old_eip;
- old_eip = ctxt->_eip;
- rc = assign_eip_near(ctxt, ctxt->src.val);
- if (rc != X86EMUL_CONTINUE)
- break;
- ctxt->src.val = old_eip;
- rc = em_push(ctxt);
- break;
- }
- case 4: /* jmp abs */
- rc = assign_eip_near(ctxt, ctxt->src.val);
- break;
- case 5: /* jmp far */
- rc = em_jmp_far(ctxt);
- break;
- case 6: /* push */
- rc = em_push(ctxt);
- break;
- }
+static int em_call_near_abs(struct x86_emulate_ctxt *ctxt)
+{
+ int rc;
+ long int old_eip;
+
+ old_eip = ctxt->_eip;
+ rc = assign_eip_near(ctxt, ctxt->src.val);
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+ ctxt->src.val = old_eip;
+ rc = em_push(ctxt);
return rc;
}
@@ -2128,11 +2130,11 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
/* Outer-privilege level return is not implemented */
if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl)
return X86EMUL_UNHANDLEABLE;
- rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, 0, false,
+ rc = __load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS, cpl, false,
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, eip, new_desc.l);
+ rc = assign_eip_far(ctxt, eip, &new_desc);
if (rc != X86EMUL_CONTINUE) {
WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
@@ -2316,6 +2318,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
ctxt->eflags &= ~msr_data;
+ ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
#endif
} else {
/* legacy mode */
@@ -2349,11 +2352,9 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
&& !vendor_intel(ctxt))
return emulate_ud(ctxt);
- /* XXX sysenter/sysexit have not been tested in 64bit mode.
- * Therefore, we inject an #UD.
- */
+ /* sysenter/sysexit have not been tested in 64bit mode. */
if (ctxt->mode == X86EMUL_MODE_PROT64)
- return emulate_ud(ctxt);
+ return X86EMUL_UNHANDLEABLE;
setup_syscalls_segments(ctxt, &cs, &ss);
@@ -2425,6 +2426,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
if ((msr_data & 0xfffc) == 0x0)
return emulate_gp(ctxt, 0);
ss_sel = (u16)(msr_data + 24);
+ rcx = (u32)rcx;
+ rdx = (u32)rdx;
break;
case X86EMUL_MODE_PROT64:
cs_sel = (u16)(msr_data + 32);
@@ -2599,7 +2602,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
save_state_to_tss16(ctxt, &tss_seg);
@@ -2607,13 +2609,11 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
if (old_tss_sel != 0xffff) {
@@ -2624,7 +2624,6 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
sizeof tss_seg.prev_task_link,
&ctxt->exception);
if (ret != X86EMUL_CONTINUE)
- /* FIXME: need to provide precise fault address */
return ret;
}
@@ -2813,7 +2812,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
*
* 1. jmp/call/int to task gate: Check against DPL of the task gate
* 2. Exception/IRQ/iret: No check is performed
- * 3. jmp/call to TSS: Check against DPL of the TSS
+ * 3. jmp/call to TSS/task-gate: No check is performed since the
+ * hardware checks it before exiting.
*/
if (reason == TASK_SWITCH_GATE) {
if (idt_index != -1) {
@@ -2830,13 +2830,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
return emulate_gp(ctxt, (idt_index << 3) | 0x2);
}
- } else if (reason != TASK_SWITCH_IRET) {
- int dpl = next_tss_desc.dpl;
- if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl)
- return emulate_gp(ctxt, tss_selector);
}
-
desc_limit = desc_limit_scaled(&next_tss_desc);
if (!next_tss_desc.p ||
((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
@@ -2913,8 +2908,8 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
{
int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
- register_address_increment(ctxt, reg_rmw(ctxt, reg), df * op->bytes);
- op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, reg));
+ register_address_increment(ctxt, reg, df * op->bytes);
+ op->addr.mem.ea = register_address(ctxt, reg);
}
static int em_das(struct x86_emulate_ctxt *ctxt)
@@ -3025,7 +3020,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return X86EMUL_CONTINUE;
- rc = assign_eip_far(ctxt, ctxt->src.val, new_desc.l);
+ rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
if (rc != X86EMUL_CONTINUE)
goto fail;
@@ -3215,6 +3210,8 @@ static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt)
return emulate_ud(ctxt);
ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg);
+ if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM)
+ ctxt->dst.bytes = 2;
return X86EMUL_CONTINUE;
}
@@ -3317,7 +3314,7 @@ static int em_sidt(struct x86_emulate_ctxt *ctxt)
return emulate_store_desc_ptr(ctxt, ctxt->ops->get_idt);
}
-static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+static int em_lgdt_lidt(struct x86_emulate_ctxt *ctxt, bool lgdt)
{
struct desc_ptr desc_ptr;
int rc;
@@ -3329,12 +3326,23 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
ctxt->op_bytes);
if (rc != X86EMUL_CONTINUE)
return rc;
- ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ if (ctxt->mode == X86EMUL_MODE_PROT64 &&
+ is_noncanonical_address(desc_ptr.address))
+ return emulate_gp(ctxt, 0);
+ if (lgdt)
+ ctxt->ops->set_gdt(ctxt, &desc_ptr);
+ else
+ ctxt->ops->set_idt(ctxt, &desc_ptr);
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
return X86EMUL_CONTINUE;
}
+static int em_lgdt(struct x86_emulate_ctxt *ctxt)
+{
+ return em_lgdt_lidt(ctxt, true);
+}
+
static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
{
int rc;
@@ -3348,20 +3356,7 @@ static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
static int em_lidt(struct x86_emulate_ctxt *ctxt)
{
- struct desc_ptr desc_ptr;
- int rc;
-
- if (ctxt->mode == X86EMUL_MODE_PROT64)
- ctxt->op_bytes = 8;
- rc = read_descriptor(ctxt, ctxt->src.addr.mem,
- &desc_ptr.size, &desc_ptr.address,
- ctxt->op_bytes);
- if (rc != X86EMUL_CONTINUE)
- return rc;
- ctxt->ops->set_idt(ctxt, &desc_ptr);
- /* Disable writeback. */
- ctxt->dst.type = OP_NONE;
- return X86EMUL_CONTINUE;
+ return em_lgdt_lidt(ctxt, false);
}
static int em_smsw(struct x86_emulate_ctxt *ctxt)
@@ -3384,7 +3379,7 @@ static int em_loop(struct x86_emulate_ctxt *ctxt)
{
int rc = X86EMUL_CONTINUE;
- register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX), -1);
+ register_address_increment(ctxt, VCPU_REGS_RCX, -1);
if ((address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) != 0) &&
(ctxt->b == 0xe2 || test_cc(ctxt->b ^ 0x5, ctxt->eflags)))
rc = jmp_rel(ctxt, ctxt->src.val);
@@ -3554,7 +3549,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
if (efer & EFER_LMA)
- rsvd = CR3_L_MODE_RESERVED_BITS;
+ rsvd = CR3_L_MODE_RESERVED_BITS & ~CR3_PCID_INVD;
if (new_val & rsvd)
return emulate_gp(ctxt, 0);
@@ -3596,8 +3591,15 @@ static int check_dr_read(struct x86_emulate_ctxt *ctxt)
if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
return emulate_ud(ctxt);
- if (check_dr7_gd(ctxt))
+ if (check_dr7_gd(ctxt)) {
+ ulong dr6;
+
+ ctxt->ops->get_dr(ctxt, 6, &dr6);
+ dr6 &= ~15;
+ dr6 |= DR6_BD | DR6_RTM;
+ ctxt->ops->set_dr(ctxt, 6, dr6);
return emulate_db(ctxt);
+ }
return X86EMUL_CONTINUE;
}
@@ -3684,6 +3686,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
#define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) }
#define GD(_f, _g) { .flags = ((_f) | GroupDual | ModRM), .u.gdual = (_g) }
+#define ID(_f, _i) { .flags = ((_f) | InstrDual | ModRM), .u.idual = (_i) }
#define E(_f, _e) { .flags = ((_f) | Escape | ModRM), .u.esc = (_e) }
#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
#define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) }
@@ -3780,11 +3783,11 @@ static const struct opcode group4[] = {
static const struct opcode group5[] = {
F(DstMem | SrcNone | Lock, em_inc),
F(DstMem | SrcNone | Lock, em_dec),
- I(SrcMem | Stack, em_grp45),
+ I(SrcMem | NearBranch, em_call_near_abs),
I(SrcMemFAddr | ImplicitOps | Stack, em_call_far),
- I(SrcMem | Stack, em_grp45),
- I(SrcMemFAddr | ImplicitOps, em_grp45),
- I(SrcMem | Stack, em_grp45), D(Undefined),
+ I(SrcMem | NearBranch, em_jmp_abs),
+ I(SrcMemFAddr | ImplicitOps, em_jmp_far),
+ I(SrcMem | Stack, em_push), D(Undefined),
};
static const struct opcode group6[] = {
@@ -3845,8 +3848,12 @@ static const struct gprefix pfx_0f_6f_0f_7f = {
I(Mmx, em_mov), I(Sse | Aligned, em_mov), N, I(Sse | Unaligned, em_mov),
};
+static const struct instr_dual instr_dual_0f_2b = {
+ I(0, em_mov), N
+};
+
static const struct gprefix pfx_0f_2b = {
- I(0, em_mov), I(0, em_mov), N, N,
+ ID(0, &instr_dual_0f_2b), ID(0, &instr_dual_0f_2b), N, N,
};
static const struct gprefix pfx_0f_28_0f_29 = {
@@ -3920,6 +3927,10 @@ static const struct escape escape_dd = { {
N, N, N, N, N, N, N, N,
} };
+static const struct instr_dual instr_dual_0f_c3 = {
+ I(DstMem | SrcReg | ModRM | No16 | Mov, em_mov), N
+};
+
static const struct opcode opcode_table[256] = {
/* 0x00 - 0x07 */
F6ALU(Lock, em_add),
@@ -3964,7 +3975,7 @@ static const struct opcode opcode_table[256] = {
I2bvIP(DstDI | SrcDX | Mov | String | Unaligned, em_in, ins, check_perm_in), /* insb, insw/insd */
I2bvIP(SrcSI | DstDX | String, em_out, outs, check_perm_out), /* outsb, outsw/outsd */
/* 0x70 - 0x7F */
- X16(D(SrcImmByte)),
+ X16(D(SrcImmByte | NearBranch)),
/* 0x80 - 0x87 */
G(ByteOp | DstMem | SrcImm, group1),
G(DstMem | SrcImm, group1),
@@ -3991,20 +4002,20 @@ static const struct opcode opcode_table[256] = {
I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
I2bv(DstMem | SrcAcc | Mov | MemAbs | PageTable, em_mov),
I2bv(SrcSI | DstDI | Mov | String, em_mov),
- F2bv(SrcSI | DstDI | String | NoWrite, em_cmp),
+ F2bv(SrcSI | DstDI | String | NoWrite, em_cmp_r),
/* 0xA8 - 0xAF */
F2bv(DstAcc | SrcImm | NoWrite, em_test),
I2bv(SrcAcc | DstDI | Mov | String, em_mov),
I2bv(SrcSI | DstAcc | Mov | String, em_mov),
- F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp),
+ F2bv(SrcAcc | DstDI | String | NoWrite, em_cmp_r),
/* 0xB0 - 0xB7 */
X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
/* 0xB8 - 0xBF */
X8(I(DstReg | SrcImm64 | Mov, em_mov)),
/* 0xC0 - 0xC7 */
G(ByteOp | Src2ImmByte, group2), G(Src2ImmByte, group2),
- I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
- I(ImplicitOps | Stack, em_ret),
+ I(ImplicitOps | NearBranch | SrcImmU16, em_ret_near_imm),
+ I(ImplicitOps | NearBranch, em_ret),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg),
I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg),
G(ByteOp, group11), G(0, group11),
@@ -4024,13 +4035,14 @@ static const struct opcode opcode_table[256] = {
/* 0xD8 - 0xDF */
N, E(0, &escape_d9), N, E(0, &escape_db), N, E(0, &escape_dd), N, N,
/* 0xE0 - 0xE7 */
- X3(I(SrcImmByte, em_loop)),
- I(SrcImmByte, em_jcxz),
+ X3(I(SrcImmByte | NearBranch, em_loop)),
+ I(SrcImmByte | NearBranch, em_jcxz),
I2bvIP(SrcImmUByte | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstImmUByte, em_out, out, check_perm_out),
/* 0xE8 - 0xEF */
- I(SrcImm | Stack, em_call), D(SrcImm | ImplicitOps),
- I(SrcImmFAddr | No64, em_jmp_far), D(SrcImmByte | ImplicitOps),
+ I(SrcImm | NearBranch, em_call), D(SrcImm | ImplicitOps | NearBranch),
+ I(SrcImmFAddr | No64, em_jmp_far),
+ D(SrcImmByte | ImplicitOps | NearBranch),
I2bvIP(SrcDX | DstAcc, em_in, in, check_perm_in),
I2bvIP(SrcAcc | DstDX, em_out, out, check_perm_out),
/* 0xF0 - 0xF7 */
@@ -4090,7 +4102,7 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N,
N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
/* 0x80 - 0x8F */
- X16(D(SrcImm)),
+ X16(D(SrcImm | NearBranch)),
/* 0x90 - 0x9F */
X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
/* 0xA0 - 0xA7 */
@@ -4121,7 +4133,7 @@ static const struct opcode twobyte_table[256] = {
D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
/* 0xC0 - 0xC7 */
F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
- N, D(DstMem | SrcReg | ModRM | Mov),
+ N, ID(0, &instr_dual_0f_c3),
N, N, N, GD(0, &group9),
/* 0xC8 - 0xCF */
X8(I(DstReg, em_bswap)),
@@ -4134,12 +4146,20 @@ static const struct opcode twobyte_table[256] = {
N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
};
+static const struct instr_dual instr_dual_0f_38_f0 = {
+ I(DstReg | SrcMem | Mov, em_movbe), N
+};
+
+static const struct instr_dual instr_dual_0f_38_f1 = {
+ I(DstMem | SrcReg | Mov, em_movbe), N
+};
+
static const struct gprefix three_byte_0f_38_f0 = {
- I(DstReg | SrcMem | Mov, em_movbe), N, N, N
+ ID(0, &instr_dual_0f_38_f0), N, N, N
};
static const struct gprefix three_byte_0f_38_f1 = {
- I(DstMem | SrcReg | Mov, em_movbe), N, N, N
+ ID(0, &instr_dual_0f_38_f1), N, N, N
};
/*
@@ -4152,8 +4172,8 @@ static const struct opcode opcode_map_0f_38[256] = {
/* 0x80 - 0xef */
X16(N), X16(N), X16(N), X16(N), X16(N), X16(N), X16(N),
/* 0xf0 - 0xf1 */
- GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f0),
- GP(EmulateOnUD | ModRM | Prefix, &three_byte_0f_38_f1),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f0),
+ GP(EmulateOnUD | ModRM, &three_byte_0f_38_f1),
/* 0xf2 - 0xff */
N, N, X4(N), X8(N)
};
@@ -4275,7 +4295,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, reg_read(ctxt, VCPU_REGS_RDI));
+ register_address(ctxt, VCPU_REGS_RDI);
op->addr.mem.seg = VCPU_SREG_ES;
op->val = 0;
op->count = 1;
@@ -4329,7 +4349,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI));
+ register_address(ctxt, VCPU_REGS_RSI);
op->addr.mem.seg = ctxt->seg_override;
op->val = 0;
op->count = 1;
@@ -4338,7 +4358,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op,
op->type = OP_MEM;
op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes;
op->addr.mem.ea =
- register_address(ctxt,
+ address_mask(ctxt,
reg_read(ctxt, VCPU_REGS_RBX) +
(reg_read(ctxt, VCPU_REGS_RAX) & 0xff));
op->addr.mem.seg = ctxt->seg_override;
@@ -4510,8 +4530,7 @@ done_prefixes:
/* vex-prefix instructions are not implemented */
if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) &&
- (mode == X86EMUL_MODE_PROT64 ||
- (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) {
+ (mode == X86EMUL_MODE_PROT64 || (ctxt->modrm & 0xc0) == 0xc0)) {
ctxt->d = NotImpl;
}
@@ -4549,6 +4568,12 @@ done_prefixes:
else
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
break;
+ case InstrDual:
+ if ((ctxt->modrm >> 6) == 3)
+ opcode = opcode.u.idual->mod3;
+ else
+ opcode = opcode.u.idual->mod012;
+ break;
default:
return EMULATION_FAILED;
}
@@ -4567,7 +4592,8 @@ done_prefixes:
return EMULATION_FAILED;
if (unlikely(ctxt->d &
- (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) {
+ (NotImpl|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm|NearBranch|
+ No16))) {
/*
* These are copied unconditionally here, and checked unconditionally
* in x86_emulate_insn.
@@ -4578,8 +4604,12 @@ done_prefixes:
if (ctxt->d & NotImpl)
return EMULATION_FAILED;
- if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack))
- ctxt->op_bytes = 8;
+ if (mode == X86EMUL_MODE_PROT64) {
+ if (ctxt->op_bytes == 4 && (ctxt->d & Stack))
+ ctxt->op_bytes = 8;
+ else if (ctxt->d & NearBranch)
+ ctxt->op_bytes = 8;
+ }
if (ctxt->d & Op3264) {
if (mode == X86EMUL_MODE_PROT64)
@@ -4588,6 +4618,9 @@ done_prefixes:
ctxt->op_bytes = 4;
}
+ if ((ctxt->d & No16) && ctxt->op_bytes == 2)
+ ctxt->op_bytes = 4;
+
if (ctxt->d & Sse)
ctxt->op_bytes = 16;
else if (ctxt->d & Mmx)
@@ -4631,7 +4664,8 @@ done_prefixes:
rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask);
if (ctxt->rip_relative)
- ctxt->memopp->addr.mem.ea += ctxt->_eip;
+ ctxt->memopp->addr.mem.ea = address_mask(ctxt,
+ ctxt->memopp->addr.mem.ea + ctxt->_eip);
done:
return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK;
@@ -4775,6 +4809,12 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
+ /* Instruction can only be executed in protected mode */
+ if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
+ rc = emulate_ud(ctxt);
+ goto done;
+ }
+
/* Privileged instruction can be executed only in CPL=0 */
if ((ctxt->d & Priv) && ops->cpl(ctxt)) {
if (ctxt->d & PrivUD)
@@ -4784,12 +4824,6 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
goto done;
}
- /* Instruction can only be executed in protected mode */
- if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) {
- rc = emulate_ud(ctxt);
- goto done;
- }
-
/* Do instruction specific permission checks */
if (ctxt->d & CheckPerm) {
rc = ctxt->check_perm(ctxt);
@@ -4974,8 +5008,7 @@ writeback:
count = ctxt->src.count;
else
count = ctxt->dst.count;
- register_address_increment(ctxt, reg_rmw(ctxt, VCPU_REGS_RCX),
- -count);
+ register_address_increment(ctxt, VCPU_REGS_RCX, -count);
if (!string_insn_completed(ctxt)) {
/*
@@ -5053,11 +5086,6 @@ twobyte_insn:
ctxt->dst.val = (ctxt->src.bytes == 1) ? (s8) ctxt->src.val :
(s16) ctxt->src.val;
break;
- case 0xc3: /* movnti */
- ctxt->dst.bytes = ctxt->op_bytes;
- ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val :
- (u32) ctxt->src.val;
- break;
default:
goto cannot_emulate;
}
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
new file mode 100644
index 000000000000..b1947e0f3e10
--- /dev/null
+++ b/arch/x86/kvm/ioapic.c
@@ -0,0 +1,675 @@
+/*
+ * Copyright (C) 2001 MandrakeSoft S.A.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * MandrakeSoft S.A.
+ * 43, rue d'Aboukir
+ * 75002 Paris - France
+ * http://www.linux-mandrake.com/
+ * http://www.mandrakesoft.com/
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Yunhong Jiang <yunhong.jiang@intel.com>
+ * Yaozu (Eddie) Dong <eddie.dong@intel.com>
+ * Based on Xen 3.1 code.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/smp.h>
+#include <linux/hrtimer.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <asm/processor.h>
+#include <asm/page.h>
+#include <asm/current.h>
+#include <trace/events/kvm.h>
+
+#include "ioapic.h"
+#include "lapic.h"
+#include "irq.h"
+
+#if 0
+#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg)
+#else
+#define ioapic_debug(fmt, arg...)
+#endif
+static int ioapic_service(struct kvm_ioapic *vioapic, int irq,
+ bool line_status);
+
+static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
+ unsigned long addr,
+ unsigned long length)
+{
+ unsigned long result = 0;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ result = ((((IOAPIC_NUM_PINS - 1) & 0xff) << 16)
+ | (IOAPIC_VERSION_ID & 0xff));
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ case IOAPIC_REG_ARB_ID:
+ result = ((ioapic->id & 0xf) << 24);
+ break;
+
+ default:
+ {
+ u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
+ u64 redir_content;
+
+ if (redir_index < IOAPIC_NUM_PINS)
+ redir_content =
+ ioapic->redirtbl[redir_index].bits;
+ else
+ redir_content = ~0ULL;
+
+ result = (ioapic->ioregsel & 0x1) ?
+ (redir_content >> 32) & 0xffffffff :
+ redir_content & 0xffffffff;
+ break;
+ }
+ }
+
+ return result;
+}
+
+static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
+{
+ ioapic->rtc_status.pending_eoi = 0;
+ bitmap_zero(ioapic->rtc_status.dest_map, KVM_MAX_VCPUS);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
+
+static void rtc_status_pending_eoi_check_valid(struct kvm_ioapic *ioapic)
+{
+ if (WARN_ON(ioapic->rtc_status.pending_eoi < 0))
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ bool new_val, old_val;
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ union kvm_ioapic_redirect_entry *e;
+
+ e = &ioapic->redirtbl[RTC_GSI];
+ if (!kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
+ e->fields.dest_mode))
+ return;
+
+ new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector);
+ old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+
+ if (new_val == old_val)
+ return;
+
+ if (new_val) {
+ __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi++;
+ } else {
+ __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi--;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic)
+{
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ if (RTC_GSI >= IOAPIC_NUM_PINS)
+ return;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ kvm_for_each_vcpu(i, vcpu, ioapic->kvm)
+ __rtc_irq_eoi_tracking_restore_one(vcpu);
+}
+
+static void rtc_irq_eoi(struct kvm_ioapic *ioapic, struct kvm_vcpu *vcpu)
+{
+ if (test_and_clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map)) {
+ --ioapic->rtc_status.pending_eoi;
+ rtc_status_pending_eoi_check_valid(ioapic);
+ }
+}
+
+static bool rtc_irq_check_coalesced(struct kvm_ioapic *ioapic)
+{
+ if (ioapic->rtc_status.pending_eoi > 0)
+ return true; /* coalesced */
+
+ return false;
+}
+
+static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
+ int irq_level, bool line_status)
+{
+ union kvm_ioapic_redirect_entry entry;
+ u32 mask = 1 << irq;
+ u32 old_irr;
+ int edge, ret;
+
+ entry = ioapic->redirtbl[irq];
+ edge = (entry.fields.trig_mode == IOAPIC_EDGE_TRIG);
+
+ if (!irq_level) {
+ ioapic->irr &= ~mask;
+ ret = 1;
+ goto out;
+ }
+
+ /*
+ * Return 0 for coalesced interrupts; for edge-triggered interrupts,
+ * this only happens if a previous edge has not been delivered due
+ * do masking. For level interrupts, the remote_irr field tells
+ * us if the interrupt is waiting for an EOI.
+ *
+ * RTC is special: it is edge-triggered, but userspace likes to know
+ * if it has been already ack-ed via EOI because coalesced RTC
+ * interrupts lead to time drift in Windows guests. So we track
+ * EOI manually for the RTC interrupt.
+ */
+ if (irq == RTC_GSI && line_status &&
+ rtc_irq_check_coalesced(ioapic)) {
+ ret = 0;
+ goto out;
+ }
+
+ old_irr = ioapic->irr;
+ ioapic->irr |= mask;
+ if ((edge && old_irr == ioapic->irr) ||
+ (!edge && entry.fields.remote_irr)) {
+ ret = 0;
+ goto out;
+ }
+
+ ret = ioapic_service(ioapic, irq, line_status);
+
+out:
+ trace_kvm_ioapic_set_irq(entry.bits, irq, ret == 0);
+ return ret;
+}
+
+static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
+{
+ u32 idx;
+
+ rtc_irq_eoi_tracking_reset(ioapic);
+ for_each_set_bit(idx, &irr, IOAPIC_NUM_PINS)
+ ioapic_set_irq(ioapic, idx, 1, true);
+
+ kvm_rtc_eoi_tracking_restore_all(ioapic);
+}
+
+
+static void update_handled_vectors(struct kvm_ioapic *ioapic)
+{
+ DECLARE_BITMAP(handled_vectors, 256);
+ int i;
+
+ memset(handled_vectors, 0, sizeof(handled_vectors));
+ for (i = 0; i < IOAPIC_NUM_PINS; ++i)
+ __set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
+ memcpy(ioapic->handled_vectors, handled_vectors,
+ sizeof(handled_vectors));
+ smp_wmb();
+}
+
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+ u32 *tmr)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+ union kvm_ioapic_redirect_entry *e;
+ int index;
+
+ spin_lock(&ioapic->lock);
+ for (index = 0; index < IOAPIC_NUM_PINS; index++) {
+ e = &ioapic->redirtbl[index];
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG ||
+ kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
+ index == RTC_GSI) {
+ if (kvm_apic_match_dest(vcpu, NULL, 0,
+ e->fields.dest_id, e->fields.dest_mode)) {
+ __set_bit(e->fields.vector,
+ (unsigned long *)eoi_exit_bitmap);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
+ __set_bit(e->fields.vector,
+ (unsigned long *)tmr);
+ }
+ }
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ if (!ioapic)
+ return;
+ kvm_make_scan_ioapic_request(kvm);
+}
+
+static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
+{
+ unsigned index;
+ bool mask_before, mask_after;
+ union kvm_ioapic_redirect_entry *e;
+
+ switch (ioapic->ioregsel) {
+ case IOAPIC_REG_VERSION:
+ /* Writes are ignored. */
+ break;
+
+ case IOAPIC_REG_APIC_ID:
+ ioapic->id = (val >> 24) & 0xf;
+ break;
+
+ case IOAPIC_REG_ARB_ID:
+ break;
+
+ default:
+ index = (ioapic->ioregsel - 0x10) >> 1;
+
+ ioapic_debug("change redir index %x val %x\n", index, val);
+ if (index >= IOAPIC_NUM_PINS)
+ return;
+ e = &ioapic->redirtbl[index];
+ mask_before = e->fields.mask;
+ if (ioapic->ioregsel & 1) {
+ e->bits &= 0xffffffff;
+ e->bits |= (u64) val << 32;
+ } else {
+ e->bits &= ~0xffffffffULL;
+ e->bits |= (u32) val;
+ e->fields.remote_irr = 0;
+ }
+ update_handled_vectors(ioapic);
+ mask_after = e->fields.mask;
+ if (mask_before != mask_after)
+ kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
+ if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
+ && ioapic->irr & (1 << index))
+ ioapic_service(ioapic, index, false);
+ kvm_vcpu_request_scan_ioapic(ioapic->kvm);
+ break;
+ }
+}
+
+static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
+{
+ union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+ struct kvm_lapic_irq irqe;
+ int ret;
+
+ if (entry->fields.mask)
+ return -1;
+
+ ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+ "vector=%x trig_mode=%x\n",
+ entry->fields.dest_id, entry->fields.dest_mode,
+ entry->fields.delivery_mode, entry->fields.vector,
+ entry->fields.trig_mode);
+
+ irqe.dest_id = entry->fields.dest_id;
+ irqe.vector = entry->fields.vector;
+ irqe.dest_mode = entry->fields.dest_mode;
+ irqe.trig_mode = entry->fields.trig_mode;
+ irqe.delivery_mode = entry->fields.delivery_mode << 8;
+ irqe.level = 1;
+ irqe.shorthand = 0;
+
+ if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
+ ioapic->irr &= ~(1 << irq);
+
+ if (irq == RTC_GSI && line_status) {
+ /*
+ * pending_eoi cannot ever become negative (see
+ * rtc_status_pending_eoi_check_valid) and the caller
+ * ensures that it is only called if it is >= zero, namely
+ * if rtc_irq_check_coalesced returns false).
+ */
+ BUG_ON(ioapic->rtc_status.pending_eoi != 0);
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe,
+ ioapic->rtc_status.dest_map);
+ ioapic->rtc_status.pending_eoi = (ret < 0 ? 0 : ret);
+ } else
+ ret = kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe, NULL);
+
+ if (ret && irqe.trig_mode == IOAPIC_LEVEL_TRIG)
+ entry->fields.remote_irr = 1;
+
+ return ret;
+}
+
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status)
+{
+ int ret, irq_level;
+
+ BUG_ON(irq < 0 || irq >= IOAPIC_NUM_PINS);
+
+ spin_lock(&ioapic->lock);
+ irq_level = __kvm_irq_line_state(&ioapic->irq_states[irq],
+ irq_source_id, level);
+ ret = ioapic_set_irq(ioapic, irq, irq_level, line_status);
+
+ spin_unlock(&ioapic->lock);
+
+ return ret;
+}
+
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id)
+{
+ int i;
+
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < KVM_IOAPIC_NUM_PINS; i++)
+ __clear_bit(irq_source_id, &ioapic->irq_states[i]);
+ spin_unlock(&ioapic->lock);
+}
+
+static void kvm_ioapic_eoi_inject_work(struct work_struct *work)
+{
+ int i;
+ struct kvm_ioapic *ioapic = container_of(work, struct kvm_ioapic,
+ eoi_inject.work);
+ spin_lock(&ioapic->lock);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.trig_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ if (ioapic->irr & (1 << i) && !ent->fields.remote_irr)
+ ioapic_service(ioapic, i, false);
+ }
+ spin_unlock(&ioapic->lock);
+}
+
+#define IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT 10000
+
+static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
+ struct kvm_ioapic *ioapic, int vector, int trigger_mode)
+{
+ int i;
+
+ for (i = 0; i < IOAPIC_NUM_PINS; i++) {
+ union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
+
+ if (ent->fields.vector != vector)
+ continue;
+
+ if (i == RTC_GSI)
+ rtc_irq_eoi(ioapic, vcpu);
+ /*
+ * We are dropping lock while calling ack notifiers because ack
+ * notifier callbacks for assigned devices call into IOAPIC
+ * recursively. Since remote_irr is cleared only after call
+ * to notifiers if the same vector will be delivered while lock
+ * is dropped it will be put into irr and will be delivered
+ * after ack notifier returns.
+ */
+ spin_unlock(&ioapic->lock);
+ kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
+ spin_lock(&ioapic->lock);
+
+ if (trigger_mode != IOAPIC_LEVEL_TRIG)
+ continue;
+
+ ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
+ ent->fields.remote_irr = 0;
+ if (!ent->fields.mask && (ioapic->irr & (1 << i))) {
+ ++ioapic->irq_eoi[i];
+ if (ioapic->irq_eoi[i] == IOAPIC_SUCCESSIVE_IRQ_MAX_COUNT) {
+ /*
+ * Real hardware does not deliver the interrupt
+ * immediately during eoi broadcast, and this
+ * lets a buggy guest make slow progress
+ * even if it does not correctly handle a
+ * level-triggered interrupt. Emulate this
+ * behavior if we detect an interrupt storm.
+ */
+ schedule_delayed_work(&ioapic->eoi_inject, HZ / 100);
+ ioapic->irq_eoi[i] = 0;
+ trace_kvm_ioapic_delayed_eoi_inj(ent->bits);
+ } else {
+ ioapic_service(ioapic, i, false);
+ }
+ } else {
+ ioapic->irq_eoi[i] = 0;
+ }
+ }
+}
+
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ smp_rmb();
+ return test_bit(vector, ioapic->handled_vectors);
+}
+
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
+{
+ struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
+
+ spin_lock(&ioapic->lock);
+ __kvm_ioapic_update_eoi(vcpu, ioapic, vector, trigger_mode);
+ spin_unlock(&ioapic->lock);
+}
+
+static inline struct kvm_ioapic *to_ioapic(struct kvm_io_device *dev)
+{
+ return container_of(dev, struct kvm_ioapic, dev);
+}
+
+static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
+{
+ return ((addr >= ioapic->base_address &&
+ (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
+}
+
+static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
+ void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 result;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("addr %lx\n", (unsigned long)addr);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ result = ioapic->ioregsel;
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ result = ioapic_read_indirect(ioapic, addr, len);
+ break;
+
+ default:
+ result = 0;
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+
+ switch (len) {
+ case 8:
+ *(u64 *) val = result;
+ break;
+ case 1:
+ case 2:
+ case 4:
+ memcpy(val, (char *)&result, len);
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: wrong length %d\n", len);
+ }
+ return 0;
+}
+
+static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
+ const void *val)
+{
+ struct kvm_ioapic *ioapic = to_ioapic(this);
+ u32 data;
+ if (!ioapic_in_range(ioapic, addr))
+ return -EOPNOTSUPP;
+
+ ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n",
+ (void*)addr, len, val);
+ ASSERT(!(addr & 0xf)); /* check alignment */
+
+ switch (len) {
+ case 8:
+ case 4:
+ data = *(u32 *) val;
+ break;
+ case 2:
+ data = *(u16 *) val;
+ break;
+ case 1:
+ data = *(u8 *) val;
+ break;
+ default:
+ printk(KERN_WARNING "ioapic: Unsupported size %d\n", len);
+ return 0;
+ }
+
+ addr &= 0xff;
+ spin_lock(&ioapic->lock);
+ switch (addr) {
+ case IOAPIC_REG_SELECT:
+ ioapic->ioregsel = data & 0xFF; /* 8-bit register */
+ break;
+
+ case IOAPIC_REG_WINDOW:
+ ioapic_write_indirect(ioapic, data);
+ break;
+
+ default:
+ break;
+ }
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
+{
+ int i;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ for (i = 0; i < IOAPIC_NUM_PINS; i++)
+ ioapic->redirtbl[i].fields.mask = 1;
+ ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
+ ioapic->ioregsel = 0;
+ ioapic->irr = 0;
+ ioapic->id = 0;
+ memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
+ rtc_irq_eoi_tracking_reset(ioapic);
+ update_handled_vectors(ioapic);
+}
+
+static const struct kvm_io_device_ops ioapic_mmio_ops = {
+ .read = ioapic_mmio_read,
+ .write = ioapic_mmio_write,
+};
+
+int kvm_ioapic_init(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic;
+ int ret;
+
+ ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+ if (!ioapic)
+ return -ENOMEM;
+ spin_lock_init(&ioapic->lock);
+ INIT_DELAYED_WORK(&ioapic->eoi_inject, kvm_ioapic_eoi_inject_work);
+ kvm->arch.vioapic = ioapic;
+ kvm_ioapic_reset(ioapic);
+ kvm_iodevice_init(&ioapic->dev, &ioapic_mmio_ops);
+ ioapic->kvm = kvm;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, ioapic->base_address,
+ IOAPIC_MEM_LENGTH, &ioapic->dev);
+ mutex_unlock(&kvm->slots_lock);
+ if (ret < 0) {
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+
+ return ret;
+}
+
+void kvm_ioapic_destroy(struct kvm *kvm)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+
+ cancel_delayed_work_sync(&ioapic->eoi_inject);
+ if (ioapic) {
+ kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev);
+ kvm->arch.vioapic = NULL;
+ kfree(ioapic);
+ }
+}
+
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
+
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
+{
+ struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
+ if (!ioapic)
+ return -EINVAL;
+
+ spin_lock(&ioapic->lock);
+ memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
+ ioapic->irr = 0;
+ update_handled_vectors(ioapic);
+ kvm_vcpu_request_scan_ioapic(kvm);
+ kvm_ioapic_inject_all(ioapic, state->irr);
+ spin_unlock(&ioapic->lock);
+ return 0;
+}
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
new file mode 100644
index 000000000000..3c9195535ffc
--- /dev/null
+++ b/arch/x86/kvm/ioapic.h
@@ -0,0 +1,119 @@
+#ifndef __KVM_IO_APIC_H
+#define __KVM_IO_APIC_H
+
+#include <linux/kvm_host.h>
+
+#include "iodev.h"
+
+struct kvm;
+struct kvm_vcpu;
+
+#define IOAPIC_NUM_PINS KVM_IOAPIC_NUM_PINS
+#define IOAPIC_VERSION_ID 0x11 /* IOAPIC version */
+#define IOAPIC_EDGE_TRIG 0
+#define IOAPIC_LEVEL_TRIG 1
+
+#define IOAPIC_DEFAULT_BASE_ADDRESS 0xfec00000
+#define IOAPIC_MEM_LENGTH 0x100
+
+/* Direct registers. */
+#define IOAPIC_REG_SELECT 0x00
+#define IOAPIC_REG_WINDOW 0x10
+
+/* Indirect registers. */
+#define IOAPIC_REG_APIC_ID 0x00 /* x86 IOAPIC only */
+#define IOAPIC_REG_VERSION 0x01
+#define IOAPIC_REG_ARB_ID 0x02 /* x86 IOAPIC only */
+
+/*ioapic delivery mode*/
+#define IOAPIC_FIXED 0x0
+#define IOAPIC_LOWEST_PRIORITY 0x1
+#define IOAPIC_PMI 0x2
+#define IOAPIC_NMI 0x4
+#define IOAPIC_INIT 0x5
+#define IOAPIC_EXTINT 0x7
+
+#ifdef CONFIG_X86
+#define RTC_GSI 8
+#else
+#define RTC_GSI -1U
+#endif
+
+struct rtc_status {
+ int pending_eoi;
+ DECLARE_BITMAP(dest_map, KVM_MAX_VCPUS);
+};
+
+union kvm_ioapic_redirect_entry {
+ u64 bits;
+ struct {
+ u8 vector;
+ u8 delivery_mode:3;
+ u8 dest_mode:1;
+ u8 delivery_status:1;
+ u8 polarity:1;
+ u8 remote_irr:1;
+ u8 trig_mode:1;
+ u8 mask:1;
+ u8 reserve:7;
+ u8 reserved[4];
+ u8 dest_id;
+ } fields;
+};
+
+struct kvm_ioapic {
+ u64 base_address;
+ u32 ioregsel;
+ u32 id;
+ u32 irr;
+ u32 pad;
+ union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
+ unsigned long irq_states[IOAPIC_NUM_PINS];
+ struct kvm_io_device dev;
+ struct kvm *kvm;
+ void (*ack_notifier)(void *opaque, int irq);
+ spinlock_t lock;
+ DECLARE_BITMAP(handled_vectors, 256);
+ struct rtc_status rtc_status;
+ struct delayed_work eoi_inject;
+ u32 irq_eoi[IOAPIC_NUM_PINS];
+};
+
+#ifdef DEBUG
+#define ASSERT(x) \
+do { \
+ if (!(x)) { \
+ printk(KERN_EMERG "assertion failed %s: %d: %s\n", \
+ __FILE__, __LINE__, #x); \
+ BUG(); \
+ } \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
+{
+ return kvm->arch.vioapic;
+}
+
+void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+ int short_hand, unsigned int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
+void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
+ int trigger_mode);
+bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
+int kvm_ioapic_init(struct kvm *kvm);
+void kvm_ioapic_destroy(struct kvm *kvm);
+int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
+ int level, bool line_status);
+void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, unsigned long *dest_map);
+int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
+ u32 *tmr);
+
+#endif
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
new file mode 100644
index 000000000000..17b73eeac8a4
--- /dev/null
+++ b/arch/x86/kvm/iommu.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 2006, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ *
+ * Copyright (C) 2006-2008 Intel Corporation
+ * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
+ * Author: Allen M. Kay <allen.m.kay@intel.com>
+ * Author: Weidong Han <weidong.han@intel.com>
+ * Author: Ben-Ami Yassour <benami@il.ibm.com>
+ */
+
+#include <linux/list.h>
+#include <linux/kvm_host.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/stat.h>
+#include <linux/dmar.h>
+#include <linux/iommu.h>
+#include <linux/intel-iommu.h>
+#include "assigned-dev.h"
+
+static bool allow_unsafe_assigned_interrupts;
+module_param_named(allow_unsafe_assigned_interrupts,
+ allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(allow_unsafe_assigned_interrupts,
+ "Enable device assignment on platforms without interrupt remapping support.");
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm);
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages);
+
+static pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn,
+ unsigned long npages)
+{
+ gfn_t end_gfn;
+ pfn_t pfn;
+
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ end_gfn = gfn + npages;
+ gfn += 1;
+
+ if (is_error_noslot_pfn(pfn))
+ return pfn;
+
+ while (gfn < end_gfn)
+ gfn_to_pfn_memslot(slot, gfn++);
+
+ return pfn;
+}
+
+static void kvm_unpin_pages(struct kvm *kvm, pfn_t pfn, unsigned long npages)
+{
+ unsigned long i;
+
+ for (i = 0; i < npages; ++i)
+ kvm_release_pfn_clean(pfn + i);
+}
+
+int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ gfn_t gfn, end_gfn;
+ pfn_t pfn;
+ int r = 0;
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int flags;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ gfn = slot->base_gfn;
+ end_gfn = gfn + slot->npages;
+
+ flags = IOMMU_READ;
+ if (!(slot->flags & KVM_MEM_READONLY))
+ flags |= IOMMU_WRITE;
+ if (!kvm->arch.iommu_noncoherent)
+ flags |= IOMMU_CACHE;
+
+
+ while (gfn < end_gfn) {
+ unsigned long page_size;
+
+ /* Check if already mapped */
+ if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Get the page size we could use to map */
+ page_size = kvm_host_page_size(kvm, gfn);
+
+ /* Make sure the page_size does not exceed the memslot */
+ while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn)
+ page_size >>= 1;
+
+ /* Make sure gfn is aligned to the page size we want to map */
+ while ((gfn << PAGE_SHIFT) & (page_size - 1))
+ page_size >>= 1;
+
+ /* Make sure hva is aligned to the page size we want to map */
+ while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1))
+ page_size >>= 1;
+
+ /*
+ * Pin all pages we are about to map in memory. This is
+ * important because we unmap and unpin in 4kb steps later.
+ */
+ pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT);
+ if (is_error_noslot_pfn(pfn)) {
+ gfn += 1;
+ continue;
+ }
+
+ /* Map into IO address space */
+ r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn),
+ page_size, flags);
+ if (r) {
+ printk(KERN_ERR "kvm_iommu_map_address:"
+ "iommu failed to map pfn=%llx\n", pfn);
+ kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT);
+ goto unmap_pages;
+ }
+
+ gfn += page_size >> PAGE_SHIFT;
+
+
+ }
+
+ return 0;
+
+unmap_pages:
+ kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn);
+ return r;
+}
+
+static int kvm_iommu_map_memslots(struct kvm *kvm)
+{
+ int idx, r = 0;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_register_noncoherent_dma(kvm);
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ kvm_for_each_memslot(memslot, slots) {
+ r = kvm_iommu_map_pages(kvm, memslot);
+ if (r)
+ break;
+ }
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return r;
+}
+
+int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+ int r;
+ bool noncoherent;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ if (pdev == NULL)
+ return -ENODEV;
+
+ r = iommu_attach_device(domain, &pdev->dev);
+ if (r) {
+ dev_err(&pdev->dev, "kvm assign device failed ret %d", r);
+ return r;
+ }
+
+ noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY);
+
+ /* Check if need to update IOMMU page table for guest memory */
+ if (noncoherent != kvm->arch.iommu_noncoherent) {
+ kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_noncoherent = noncoherent;
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ goto out_unmap;
+ }
+
+ pci_set_dev_assigned(pdev);
+
+ dev_info(&pdev->dev, "kvm assign device\n");
+
+ return 0;
+out_unmap:
+ kvm_iommu_unmap_memslots(kvm);
+ return r;
+}
+
+int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ if (pdev == NULL)
+ return -ENODEV;
+
+ iommu_detach_device(domain, &pdev->dev);
+
+ pci_clear_dev_assigned(pdev);
+
+ dev_info(&pdev->dev, "kvm deassign device\n");
+
+ return 0;
+}
+
+int kvm_iommu_map_guest(struct kvm *kvm)
+{
+ int r;
+
+ if (!iommu_present(&pci_bus_type)) {
+ printk(KERN_ERR "%s: iommu not found\n", __func__);
+ return -ENODEV;
+ }
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type);
+ if (!kvm->arch.iommu_domain) {
+ r = -ENOMEM;
+ goto out_unlock;
+ }
+
+ if (!allow_unsafe_assigned_interrupts &&
+ !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) {
+ printk(KERN_WARNING "%s: No interrupt remapping support,"
+ " disallowing device assignment."
+ " Re-enble with \"allow_unsafe_assigned_interrupts=1\""
+ " module option.\n", __func__);
+ iommu_domain_free(kvm->arch.iommu_domain);
+ kvm->arch.iommu_domain = NULL;
+ r = -EPERM;
+ goto out_unlock;
+ }
+
+ r = kvm_iommu_map_memslots(kvm);
+ if (r)
+ kvm_iommu_unmap_memslots(kvm);
+
+out_unlock:
+ mutex_unlock(&kvm->slots_lock);
+ return r;
+}
+
+static void kvm_iommu_put_pages(struct kvm *kvm,
+ gfn_t base_gfn, unsigned long npages)
+{
+ struct iommu_domain *domain;
+ gfn_t end_gfn, gfn;
+ pfn_t pfn;
+ u64 phys;
+
+ domain = kvm->arch.iommu_domain;
+ end_gfn = base_gfn + npages;
+ gfn = base_gfn;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return;
+
+ while (gfn < end_gfn) {
+ unsigned long unmap_pages;
+ size_t size;
+
+ /* Get physical address */
+ phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn));
+
+ if (!phys) {
+ gfn++;
+ continue;
+ }
+
+ pfn = phys >> PAGE_SHIFT;
+
+ /* Unmap address from IO address space */
+ size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE);
+ unmap_pages = 1ULL << get_order(size);
+
+ /* Unpin all pages we just unmapped to not leak any memory */
+ kvm_unpin_pages(kvm, pfn, unmap_pages);
+
+ gfn += unmap_pages;
+ }
+}
+
+void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages);
+}
+
+static int kvm_iommu_unmap_memslots(struct kvm *kvm)
+{
+ int idx;
+ struct kvm_memslots *slots;
+ struct kvm_memory_slot *memslot;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ slots = kvm_memslots(kvm);
+
+ kvm_for_each_memslot(memslot, slots)
+ kvm_iommu_unmap_pages(kvm, memslot);
+
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ if (kvm->arch.iommu_noncoherent)
+ kvm_arch_unregister_noncoherent_dma(kvm);
+
+ return 0;
+}
+
+int kvm_iommu_unmap_guest(struct kvm *kvm)
+{
+ struct iommu_domain *domain = kvm->arch.iommu_domain;
+
+ /* check if iommu exists and in use */
+ if (!domain)
+ return 0;
+
+ mutex_lock(&kvm->slots_lock);
+ kvm_iommu_unmap_memslots(kvm);
+ kvm->arch.iommu_domain = NULL;
+ kvm->arch.iommu_noncoherent = false;
+ mutex_unlock(&kvm->slots_lock);
+
+ iommu_domain_free(domain);
+ return 0;
+}
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
new file mode 100644
index 000000000000..72298b3ac025
--- /dev/null
+++ b/arch/x86/kvm/irq_comm.c
@@ -0,0 +1,332 @@
+/*
+ * irq_comm.c: Common API for in kernel interrupt controller
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ * Authors:
+ * Yaozu (Eddie) Dong <Eddie.dong@intel.com>
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <trace/events/kvm.h>
+
+#include <asm/msidef.h>
+
+#include "irq.h"
+
+#include "ioapic.h"
+
+static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_pic *pic = pic_irqchip(kvm);
+ return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level);
+}
+
+static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level,
+ bool line_status)
+{
+ struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+ return kvm_ioapic_set_irq(ioapic, e->irqchip.pin, irq_source_id, level,
+ line_status);
+}
+
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
+{
+ return irq->delivery_mode == APIC_DM_LOWEST;
+}
+
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, unsigned long *dest_map)
+{
+ int i, r = -1;
+ struct kvm_vcpu *vcpu, *lowest = NULL;
+
+ if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+ kvm_is_dm_lowest_prio(irq)) {
+ printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+ irq->delivery_mode = APIC_DM_FIXED;
+ }
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, src, irq, &r, dest_map))
+ return r;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (!kvm_apic_present(vcpu))
+ continue;
+
+ if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+ irq->dest_id, irq->dest_mode))
+ continue;
+
+ if (!kvm_is_dm_lowest_prio(irq)) {
+ if (r < 0)
+ r = 0;
+ r += kvm_apic_set_irq(vcpu, irq, dest_map);
+ } else if (kvm_lapic_enabled(vcpu)) {
+ if (!lowest)
+ lowest = vcpu;
+ else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+ lowest = vcpu;
+ }
+ }
+
+ if (lowest)
+ r = kvm_apic_set_irq(lowest, irq, dest_map);
+
+ return r;
+}
+
+static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm_lapic_irq *irq)
+{
+ trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+
+ irq->dest_id = (e->msi.address_lo &
+ MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ irq->vector = (e->msi.data &
+ MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+ irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+ irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+ irq->delivery_mode = e->msi.data & 0x700;
+ irq->level = 1;
+ irq->shorthand = 0;
+ /* TODO Deal with RH bit of MSI message address */
+}
+
+int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm, int irq_source_id, int level, bool line_status)
+{
+ struct kvm_lapic_irq irq;
+
+ if (!level)
+ return -1;
+
+ kvm_set_msi_irq(e, &irq);
+
+ return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
+}
+
+
+static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
+ struct kvm *kvm)
+{
+ struct kvm_lapic_irq irq;
+ int r;
+
+ kvm_set_msi_irq(e, &irq);
+
+ if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
+ return r;
+ else
+ return -EWOULDBLOCK;
+}
+
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ * Other values - No need to retry.
+ */
+int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
+{
+ struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+ struct kvm_kernel_irq_routing_entry *e;
+ int ret = -EINVAL;
+ int idx;
+
+ trace_kvm_set_irq(irq, level, irq_source_id);
+
+ /*
+ * Injection into either PIC or IOAPIC might need to scan all CPUs,
+ * which would need to be retried from thread context; when same GSI
+ * is connected to both PIC and IOAPIC, we'd have to report a
+ * partial failure here.
+ * Since there's no easy way to do this, we only support injecting MSI
+ * which is limited to 1:1 GSI mapping.
+ */
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+ e = &entries[0];
+ if (likely(e->type == KVM_IRQ_ROUTING_MSI))
+ ret = kvm_set_msi_inatomic(e, kvm);
+ else
+ ret = -EWOULDBLOCK;
+ }
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
+
+int kvm_request_irq_source_id(struct kvm *kvm)
+{
+ unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
+ int irq_source_id;
+
+ mutex_lock(&kvm->irq_lock);
+ irq_source_id = find_first_zero_bit(bitmap, BITS_PER_LONG);
+
+ if (irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_WARNING "kvm: exhaust allocatable IRQ sources!\n");
+ irq_source_id = -EFAULT;
+ goto unlock;
+ }
+
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+ set_bit(irq_source_id, bitmap);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+
+ return irq_source_id;
+}
+
+void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
+{
+ ASSERT(irq_source_id != KVM_USERSPACE_IRQ_SOURCE_ID);
+ ASSERT(irq_source_id != KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID);
+
+ mutex_lock(&kvm->irq_lock);
+ if (irq_source_id < 0 ||
+ irq_source_id >= BITS_PER_LONG) {
+ printk(KERN_ERR "kvm: IRQ source ID out of range!\n");
+ goto unlock;
+ }
+ clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
+ if (!irqchip_in_kernel(kvm))
+ goto unlock;
+
+ kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
+ kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id);
+unlock:
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ kimn->irq = irq;
+ hlist_add_head_rcu(&kimn->link, &kvm->arch.mask_notifier_list);
+ mutex_unlock(&kvm->irq_lock);
+}
+
+void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
+ struct kvm_irq_mask_notifier *kimn)
+{
+ mutex_lock(&kvm->irq_lock);
+ hlist_del_rcu(&kimn->link);
+ mutex_unlock(&kvm->irq_lock);
+ synchronize_srcu(&kvm->irq_srcu);
+}
+
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+ bool mask)
+{
+ struct kvm_irq_mask_notifier *kimn;
+ int idx, gsi;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
+ if (gsi != -1)
+ hlist_for_each_entry_rcu(kimn, &kvm->arch.mask_notifier_list, link)
+ if (kimn->irq == gsi)
+ kimn->func(kimn, mask);
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+ const struct kvm_irq_routing_entry *ue)
+{
+ int r = -EINVAL;
+ int delta;
+ unsigned max_pin;
+
+ switch (ue->type) {
+ case KVM_IRQ_ROUTING_IRQCHIP:
+ delta = 0;
+ switch (ue->u.irqchip.irqchip) {
+ case KVM_IRQCHIP_PIC_MASTER:
+ e->set = kvm_set_pic_irq;
+ max_pin = PIC_NUM_PINS;
+ break;
+ case KVM_IRQCHIP_PIC_SLAVE:
+ e->set = kvm_set_pic_irq;
+ max_pin = PIC_NUM_PINS;
+ delta = 8;
+ break;
+ case KVM_IRQCHIP_IOAPIC:
+ max_pin = KVM_IOAPIC_NUM_PINS;
+ e->set = kvm_set_ioapic_irq;
+ break;
+ default:
+ goto out;
+ }
+ e->irqchip.irqchip = ue->u.irqchip.irqchip;
+ e->irqchip.pin = ue->u.irqchip.pin + delta;
+ if (e->irqchip.pin >= max_pin)
+ goto out;
+ break;
+ case KVM_IRQ_ROUTING_MSI:
+ e->set = kvm_set_msi;
+ e->msi.address_lo = ue->u.msi.address_lo;
+ e->msi.address_hi = ue->u.msi.address_hi;
+ e->msi.data = ue->u.msi.data;
+ break;
+ default:
+ goto out;
+ }
+
+ r = 0;
+out:
+ return r;
+}
+
+#define IOAPIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
+#define ROUTING_ENTRY1(irq) IOAPIC_ROUTING_ENTRY(irq)
+
+#define PIC_ROUTING_ENTRY(irq) \
+ { .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP, \
+ .u.irqchip = { .irqchip = SELECT_PIC(irq), .pin = (irq) % 8 } }
+#define ROUTING_ENTRY2(irq) \
+ IOAPIC_ROUTING_ENTRY(irq), PIC_ROUTING_ENTRY(irq)
+
+static const struct kvm_irq_routing_entry default_routing[] = {
+ ROUTING_ENTRY2(0), ROUTING_ENTRY2(1),
+ ROUTING_ENTRY2(2), ROUTING_ENTRY2(3),
+ ROUTING_ENTRY2(4), ROUTING_ENTRY2(5),
+ ROUTING_ENTRY2(6), ROUTING_ENTRY2(7),
+ ROUTING_ENTRY2(8), ROUTING_ENTRY2(9),
+ ROUTING_ENTRY2(10), ROUTING_ENTRY2(11),
+ ROUTING_ENTRY2(12), ROUTING_ENTRY2(13),
+ ROUTING_ENTRY2(14), ROUTING_ENTRY2(15),
+ ROUTING_ENTRY1(16), ROUTING_ENTRY1(17),
+ ROUTING_ENTRY1(18), ROUTING_ENTRY1(19),
+ ROUTING_ENTRY1(20), ROUTING_ENTRY1(21),
+ ROUTING_ENTRY1(22), ROUTING_ENTRY1(23),
+};
+
+int kvm_setup_default_irq_routing(struct kvm *kvm)
+{
+ return kvm_set_irq_routing(kvm, default_routing,
+ ARRAY_SIZE(default_routing), 0);
+}
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b8345dd41b25..4f0c0b954686 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -68,6 +68,9 @@
#define MAX_APIC_VECTOR 256
#define APIC_VECTORS_PER_REG 32
+#define APIC_BROADCAST 0xFF
+#define X2APIC_BROADCAST 0xFFFFFFFFul
+
#define VEC_POS(v) ((v) & (32 - 1))
#define REG_POS(v) (((v) >> 5) << 4)
@@ -129,8 +132,6 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
}
-#define KVM_X2APIC_CID_BITS 0
-
static void recalculate_apic_map(struct kvm *kvm)
{
struct kvm_apic_map *new, *old = NULL;
@@ -149,42 +150,56 @@ static void recalculate_apic_map(struct kvm *kvm)
new->cid_shift = 8;
new->cid_mask = 0;
new->lid_mask = 0xff;
+ new->broadcast = APIC_BROADCAST;
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_lapic *apic = vcpu->arch.apic;
- u16 cid, lid;
- u32 ldr;
if (!kvm_apic_present(vcpu))
continue;
+ if (apic_x2apic_mode(apic)) {
+ new->ldr_bits = 32;
+ new->cid_shift = 16;
+ new->cid_mask = new->lid_mask = 0xffff;
+ new->broadcast = X2APIC_BROADCAST;
+ } else if (kvm_apic_get_reg(apic, APIC_LDR)) {
+ if (kvm_apic_get_reg(apic, APIC_DFR) ==
+ APIC_DFR_CLUSTER) {
+ new->cid_shift = 4;
+ new->cid_mask = 0xf;
+ new->lid_mask = 0xf;
+ } else {
+ new->cid_shift = 8;
+ new->cid_mask = 0;
+ new->lid_mask = 0xff;
+ }
+ }
+
/*
* All APICs have to be configured in the same mode by an OS.
* We take advatage of this while building logical id loockup
- * table. After reset APICs are in xapic/flat mode, so if we
- * find apic with different setting we assume this is the mode
+ * table. After reset APICs are in software disabled mode, so if
+ * we find apic with different setting we assume this is the mode
* OS wants all apics to be in; build lookup table accordingly.
*/
- if (apic_x2apic_mode(apic)) {
- new->ldr_bits = 32;
- new->cid_shift = 16;
- new->cid_mask = (1 << KVM_X2APIC_CID_BITS) - 1;
- new->lid_mask = 0xffff;
- } else if (kvm_apic_sw_enabled(apic) &&
- !new->cid_mask /* flat mode */ &&
- kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_CLUSTER) {
- new->cid_shift = 4;
- new->cid_mask = 0xf;
- new->lid_mask = 0xf;
- }
+ if (kvm_apic_sw_enabled(apic))
+ break;
+ }
- new->phys_map[kvm_apic_id(apic)] = apic;
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvm_lapic *apic = vcpu->arch.apic;
+ u16 cid, lid;
+ u32 ldr, aid;
+ aid = kvm_apic_id(apic);
ldr = kvm_apic_get_reg(apic, APIC_LDR);
cid = apic_cluster_id(new, ldr);
lid = apic_logical_id(new, ldr);
- if (lid)
+ if (aid < ARRAY_SIZE(new->phys_map))
+ new->phys_map[aid] = apic;
+ if (lid && cid < ARRAY_SIZE(new->logical_map))
new->logical_map[cid][ffs(lid) - 1] = apic;
}
out:
@@ -201,11 +216,13 @@ out:
static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
{
- u32 prev = kvm_apic_get_reg(apic, APIC_SPIV);
+ bool enabled = val & APIC_SPIV_APIC_ENABLED;
apic_set_reg(apic, APIC_SPIV, val);
- if ((prev ^ val) & APIC_SPIV_APIC_ENABLED) {
- if (val & APIC_SPIV_APIC_ENABLED) {
+
+ if (enabled != apic->sw_enabled) {
+ apic->sw_enabled = enabled;
+ if (enabled) {
static_key_slow_dec_deferred(&apic_sw_disabled);
recalculate_apic_map(apic->vcpu->kvm);
} else
@@ -237,21 +254,17 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type)
static inline int apic_lvtt_oneshot(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_ONESHOT;
}
static inline int apic_lvtt_period(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_PERIODIC;
}
static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic)
{
- return ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) ==
- APIC_LVT_TIMER_TSCDEADLINE);
+ return apic->lapic_timer.timer_mode == APIC_LVT_TIMER_TSCDEADLINE;
}
static inline int apic_lvt_nmi_mode(u32 lvt_val)
@@ -326,8 +339,12 @@ EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
static inline void apic_set_irr(int vec, struct kvm_lapic *apic)
{
- apic->irr_pending = true;
apic_set_vector(vec, apic->regs + APIC_IRR);
+ /*
+ * irr_pending must be true if any interrupt is pending; set it after
+ * APIC_IRR to avoid race with apic_clear_irr
+ */
+ apic->irr_pending = true;
}
static inline int apic_search_irr(struct kvm_lapic *apic)
@@ -359,13 +376,15 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
vcpu = apic->vcpu;
- apic_clear_vector(vec, apic->regs + APIC_IRR);
- if (unlikely(kvm_apic_vid_enabled(vcpu->kvm)))
+ if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
/* try to update RVI */
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
kvm_make_request(KVM_REQ_EVENT, vcpu);
- else {
- vec = apic_search_irr(apic);
- apic->irr_pending = (vec != -1);
+ } else {
+ apic->irr_pending = false;
+ apic_clear_vector(vec, apic->regs + APIC_IRR);
+ if (apic_search_irr(apic) != -1)
+ apic->irr_pending = true;
}
}
@@ -558,16 +577,25 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
apic_update_ppr(apic);
}
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
+static int kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+{
+ return dest == (apic_x2apic_mode(apic) ?
+ X2APIC_BROADCAST : APIC_BROADCAST);
+}
+
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
{
- return dest == 0xff || kvm_apic_id(apic) == dest;
+ return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
}
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
{
int result = 0;
u32 logical_id;
+ if (kvm_apic_broadcast(apic, mda))
+ return 1;
+
if (apic_x2apic_mode(apic)) {
logical_id = kvm_apic_get_reg(apic, APIC_LDR);
return logical_id & mda;
@@ -595,7 +623,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
}
int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
- int short_hand, int dest, int dest_mode)
+ int short_hand, unsigned int dest, int dest_mode)
{
int result = 0;
struct kvm_lapic *target = vcpu->arch.apic;
@@ -657,15 +685,24 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
if (!map)
goto out;
+ if (irq->dest_id == map->broadcast)
+ goto out;
+
+ ret = true;
+
if (irq->dest_mode == 0) { /* physical mode */
- if (irq->delivery_mode == APIC_DM_LOWEST ||
- irq->dest_id == 0xff)
+ if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
goto out;
- dst = &map->phys_map[irq->dest_id & 0xff];
+
+ dst = &map->phys_map[irq->dest_id];
} else {
u32 mda = irq->dest_id << (32 - map->ldr_bits);
+ u16 cid = apic_cluster_id(map, mda);
+
+ if (cid >= ARRAY_SIZE(map->logical_map))
+ goto out;
- dst = map->logical_map[apic_cluster_id(map, mda)];
+ dst = map->logical_map[cid];
bitmap = apic_logical_id(map, mda);
@@ -691,8 +728,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
*r = 0;
*r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
-
- ret = true;
out:
rcu_read_unlock();
return ret;
@@ -1034,6 +1069,26 @@ static void update_divide_count(struct kvm_lapic *apic)
apic->divide_count);
}
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ wait_queue_head_t *q = &vcpu->wq;
+
+ /*
+ * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
+ * vcpu_enter_guest.
+ */
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ atomic_inc(&apic->lapic_timer.pending);
+ /* FIXME: this code should not know anything about vcpus */
+ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+}
+
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1096,9 +1151,10 @@ static void start_apic_timer(struct kvm_lapic *apic)
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
- }
- hrtimer_start(&apic->lapic_timer.timer,
- ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ hrtimer_start(&apic->lapic_timer.timer,
+ ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ } else
+ apic_timer_expired(apic);
local_irq_restore(flags);
}
@@ -1203,17 +1259,20 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
break;
- case APIC_LVTT:
- if ((kvm_apic_get_reg(apic, APIC_LVTT) &
- apic->lapic_timer.timer_mode_mask) !=
- (val & apic->lapic_timer.timer_mode_mask))
+ case APIC_LVTT: {
+ u32 timer_mode = val & apic->lapic_timer.timer_mode_mask;
+
+ if (apic->lapic_timer.timer_mode != timer_mode) {
+ apic->lapic_timer.timer_mode = timer_mode;
hrtimer_cancel(&apic->lapic_timer.timer);
+ }
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask);
apic_set_reg(apic, APIC_LVTT, val);
break;
+ }
case APIC_TMICT:
if (apic_lvtt_tscdeadline(apic))
@@ -1320,7 +1379,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
if (!(vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE))
static_key_slow_dec_deferred(&apic_hw_disabled);
- if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED))
+ if (!apic->sw_enabled)
static_key_slow_dec_deferred(&apic_sw_disabled);
if (apic->regs)
@@ -1355,9 +1414,6 @@ void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data)
return;
hrtimer_cancel(&apic->lapic_timer.timer);
- /* Inject here so clearing tscdeadline won't override new value */
- if (apic_has_pending_timer(vcpu))
- kvm_inject_apic_timer_irqs(vcpu);
apic->lapic_timer.tscdeadline = data;
start_apic_timer(apic);
}
@@ -1422,6 +1478,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
apic->base_address = apic->vcpu->arch.apic_base &
MSR_IA32_APICBASE_BASE;
+ if ((value & MSR_IA32_APICBASE_ENABLE) &&
+ apic->base_address != APIC_DEFAULT_PHYS_BASE)
+ pr_warn_once("APIC base relocation is unsupported by KVM");
+
/* with FSB delivery interrupt, we can restart APIC functionality */
apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is "
"0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address);
@@ -1447,6 +1507,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
for (i = 0; i < APIC_LVT_NUM; i++)
apic_set_reg(apic, APIC_LVTT + 0x10 * i, APIC_LVT_MASKED);
+ apic->lapic_timer.timer_mode = 0;
apic_set_reg(apic, APIC_LVT0,
SET_APIC_DELIVERY_MODE(0, APIC_MODE_EXTINT));
@@ -1538,23 +1599,8 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
{
struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic, lapic_timer);
- struct kvm_vcpu *vcpu = apic->vcpu;
- wait_queue_head_t *q = &vcpu->wq;
-
- /*
- * There is a race window between reading and incrementing, but we do
- * not care about potentially losing timer events in the !reinject
- * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
- * in vcpu_enter_guest.
- */
- if (!atomic_read(&ktimer->pending)) {
- atomic_inc(&ktimer->pending);
- /* FIXME: this code should not know anything about vcpus */
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- }
- if (waitqueue_active(q))
- wake_up_interruptible(q);
+ apic_timer_expired(apic);
if (lapic_is_periodic(apic)) {
hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
@@ -1693,6 +1739,9 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
1 : count_vectors(apic->regs + APIC_ISR);
apic->highest_isr_cache = -1;
+ if (kvm_x86_ops->hwapic_irr_update)
+ kvm_x86_ops->hwapic_irr_update(vcpu,
+ apic_find_highest_irr(apic));
kvm_x86_ops->hwapic_isr_update(vcpu->kvm, apic_find_highest_isr(apic));
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_rtc_eoi_tracking_restore_one(vcpu);
@@ -1837,8 +1886,11 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
return 1;
+ if (reg == APIC_ICR2)
+ return 1;
+
/* if this is ICR write vector before command */
- if (msr == 0x830)
+ if (reg == APIC_ICR)
apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
return apic_reg_write(apic, reg, (u32)data);
}
@@ -1851,9 +1903,15 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
return 1;
+ if (reg == APIC_DFR || reg == APIC_ICR2) {
+ apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n",
+ reg);
+ return 1;
+ }
+
if (apic_reg_read(apic, reg, 4, &low))
return 1;
- if (msr == 0x830)
+ if (reg == APIC_ICR)
apic_reg_read(apic, APIC_ICR2, 4, &high);
*data = (((u64)high) << 32) | low;
@@ -1908,7 +1966,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data)
void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
- unsigned int sipi_vector;
+ u8 sipi_vector;
unsigned long pe;
if (!kvm_vcpu_has_lapic(vcpu) || !apic->pending_events)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 6a11845fd8b9..c674fce53cf9 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -11,6 +11,7 @@
struct kvm_timer {
struct hrtimer timer;
s64 period; /* unit: ns */
+ u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
atomic_t pending; /* accumulated triggered timers */
@@ -22,6 +23,7 @@ struct kvm_lapic {
struct kvm_timer lapic_timer;
u32 divide_count;
struct kvm_vcpu *vcpu;
+ bool sw_enabled;
bool irr_pending;
/* Number of bits set in ISR. */
s16 isr_count;
@@ -55,8 +57,8 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu);
void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
-int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
-int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest);
+int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda);
int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
unsigned long *dest_map);
int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type);
@@ -119,11 +121,11 @@ static inline int kvm_apic_hw_enabled(struct kvm_lapic *apic)
extern struct static_key_deferred apic_sw_disabled;
-static inline int kvm_apic_sw_enabled(struct kvm_lapic *apic)
+static inline bool kvm_apic_sw_enabled(struct kvm_lapic *apic)
{
if (static_key_false(&apic_sw_disabled.key))
- return kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_APIC_ENABLED;
- return APIC_SPIV_APIC_ENABLED;
+ return apic->sw_enabled;
+ return true;
}
static inline bool kvm_apic_present(struct kvm_vcpu *vcpu)
@@ -152,8 +154,6 @@ static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
ldr >>= 32 - map->ldr_bits;
cid = (ldr >> map->cid_shift) & map->cid_mask;
- BUG_ON(cid >= ARRAY_SIZE(map->logical_map));
-
return cid;
}
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 978f402006ee..10fbed126b11 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -214,13 +214,12 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
#define MMIO_GEN_LOW_SHIFT 10
#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
-#define MMIO_MAX_GEN ((1 << MMIO_GEN_SHIFT) - 1)
static u64 generation_mmio_spte_mask(unsigned int gen)
{
u64 mask;
- WARN_ON(gen > MMIO_MAX_GEN);
+ WARN_ON(gen & ~MMIO_GEN_MASK);
mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
mask |= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
@@ -263,13 +262,13 @@ static bool is_mmio_spte(u64 spte)
static gfn_t get_mmio_spte_gfn(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) >> PAGE_SHIFT;
}
static unsigned get_mmio_spte_access(u64 spte)
{
- u64 mask = generation_mmio_spte_mask(MMIO_MAX_GEN) | shadow_mmio_mask;
+ u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
return (spte & ~mask) & ~PAGE_MASK;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 7527cefc5a43..41dd0387cccb 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1056,9 +1056,11 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
{
struct vcpu_svm *svm = to_svm(vcpu);
- WARN_ON(adjustment < 0);
- if (host)
- adjustment = svm_scale_tsc(vcpu, adjustment);
+ if (host) {
+ if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
+ WARN_ON(adjustment < 0);
+ adjustment = svm_scale_tsc(vcpu, (u64)adjustment);
+ }
svm->vmcb->control.tsc_offset += adjustment;
if (is_guest_mode(vcpu))
@@ -2999,7 +3001,6 @@ static int dr_interception(struct vcpu_svm *svm)
{
int reg, dr;
unsigned long val;
- int err;
if (svm->vcpu.guest_debug == 0) {
/*
@@ -3019,12 +3020,15 @@ static int dr_interception(struct vcpu_svm *svm)
dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
if (dr >= 16) { /* mov to DRn */
+ if (!kvm_require_dr(&svm->vcpu, dr - 16))
+ return 1;
val = kvm_register_read(&svm->vcpu, reg);
kvm_set_dr(&svm->vcpu, dr - 16, val);
} else {
- err = kvm_get_dr(&svm->vcpu, dr, &val);
- if (!err)
- kvm_register_write(&svm->vcpu, reg, val);
+ if (!kvm_require_dr(&svm->vcpu, dr))
+ return 1;
+ kvm_get_dr(&svm->vcpu, dr, &val);
+ kvm_register_write(&svm->vcpu, reg, val);
}
skip_emulated_instruction(&svm->vcpu);
@@ -4123,6 +4127,11 @@ static bool svm_mpx_supported(void)
return false;
}
+static bool svm_xsaves_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -4410,6 +4419,7 @@ static struct kvm_x86_ops svm_x86_ops = {
.rdtscp_supported = svm_rdtscp_supported,
.invpcid_supported = svm_invpcid_supported,
.mpx_supported = svm_mpx_supported,
+ .xsaves_supported = svm_xsaves_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 6b06ab8748dd..c2a34bb5ad93 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include <asm/svm.h>
#include <asm/clocksource.h>
+#include <asm/pvclock-abi.h>
#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm
@@ -877,6 +878,42 @@ TRACE_EVENT(kvm_ple_window,
#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \
trace_kvm_ple_window(false, vcpu_id, new, old)
+TRACE_EVENT(kvm_pvclock_update,
+ TP_PROTO(unsigned int vcpu_id, struct pvclock_vcpu_time_info *pvclock),
+ TP_ARGS(vcpu_id, pvclock),
+
+ TP_STRUCT__entry(
+ __field( unsigned int, vcpu_id )
+ __field( __u32, version )
+ __field( __u64, tsc_timestamp )
+ __field( __u64, system_time )
+ __field( __u32, tsc_to_system_mul )
+ __field( __s8, tsc_shift )
+ __field( __u8, flags )
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->version = pvclock->version;
+ __entry->tsc_timestamp = pvclock->tsc_timestamp;
+ __entry->system_time = pvclock->system_time;
+ __entry->tsc_to_system_mul = pvclock->tsc_to_system_mul;
+ __entry->tsc_shift = pvclock->tsc_shift;
+ __entry->flags = pvclock->flags;
+ ),
+
+ TP_printk("vcpu_id %u, pvclock { version %u, tsc_timestamp 0x%llx, "
+ "system_time 0x%llx, tsc_to_system_mul 0x%x, tsc_shift %d, "
+ "flags 0x%x }",
+ __entry->vcpu_id,
+ __entry->version,
+ __entry->tsc_timestamp,
+ __entry->system_time,
+ __entry->tsc_to_system_mul,
+ __entry->tsc_shift,
+ __entry->flags)
+);
+
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3e556c68351b..feb852b04598 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -99,13 +99,15 @@ module_param_named(enable_shadow_vmcs, enable_shadow_vmcs, bool, S_IRUGO);
static bool __read_mostly nested = 0;
module_param(nested, bool, S_IRUGO);
+static u64 __read_mostly host_xss;
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
#define KVM_CR4_GUEST_OWNED_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_TSD)
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
@@ -214,6 +216,7 @@ struct __packed vmcs12 {
u64 virtual_apic_page_addr;
u64 apic_access_addr;
u64 ept_pointer;
+ u64 xss_exit_bitmap;
u64 guest_physical_address;
u64 vmcs_link_pointer;
u64 guest_ia32_debugctl;
@@ -616,6 +619,7 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD64(VIRTUAL_APIC_PAGE_ADDR, virtual_apic_page_addr),
FIELD64(APIC_ACCESS_ADDR, apic_access_addr),
FIELD64(EPT_POINTER, ept_pointer),
+ FIELD64(XSS_EXIT_BITMAP, xss_exit_bitmap),
FIELD64(GUEST_PHYSICAL_ADDRESS, guest_physical_address),
FIELD64(VMCS_LINK_POINTER, vmcs_link_pointer),
FIELD64(GUEST_IA32_DEBUGCTL, guest_ia32_debugctl),
@@ -720,12 +724,15 @@ static const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-static const int max_vmcs_field = ARRAY_SIZE(vmcs_field_to_offset_table);
static inline short vmcs_field_to_offset(unsigned long field)
{
- if (field >= max_vmcs_field || vmcs_field_to_offset_table[field] == 0)
- return -1;
+ BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+
+ if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
+ vmcs_field_to_offset_table[field] == 0)
+ return -ENOENT;
+
return vmcs_field_to_offset_table[field];
}
@@ -758,6 +765,7 @@ static u64 construct_eptp(unsigned long root_hpa);
static void kvm_cpu_vmxon(u64 addr);
static void kvm_cpu_vmxoff(void);
static bool vmx_mpx_supported(void);
+static bool vmx_xsaves_supported(void);
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
static void vmx_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg);
@@ -1098,6 +1106,12 @@ static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
}
+static inline bool nested_cpu_has_xsaves(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES) &&
+ vmx_xsaves_supported();
+}
+
static inline bool is_exception(u32 intr_info)
{
return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1659,12 +1673,20 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
clear_atomic_switch_msr(vmx, MSR_EFER);
- /* On ept, can't emulate nx, and must switch nx atomically */
- if (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX)) {
+
+ /*
+ * On EPT, we can't emulate NX, so we must switch EFER atomically.
+ * On CPUs that support "load IA32_EFER", always switch EFER
+ * atomically, since it's faster than switching it manually.
+ */
+ if (cpu_has_load_ia32_efer ||
+ (enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
guest_efer = vmx->vcpu.arch.efer;
if (!(guest_efer & EFER_LMA))
guest_efer &= ~EFER_LME;
- add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
+ if (guest_efer != host_efer)
+ add_atomic_switch_msr(vmx, MSR_EFER,
+ guest_efer, host_efer);
return false;
}
@@ -2377,12 +2399,13 @@ static __init void nested_vmx_setup_ctls_msrs(void)
nested_vmx_secondary_ctls_low = 0;
nested_vmx_secondary_ctls_high &=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_WBINVD_EXITING;
+ SECONDARY_EXEC_WBINVD_EXITING |
+ SECONDARY_EXEC_XSAVES;
if (enable_ept) {
/* nested EPT: emulate EPT also to L1 */
- nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT;
+ nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT |
+ SECONDARY_EXEC_UNRESTRICTED_GUEST;
nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
@@ -2558,6 +2581,11 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_get_vmx_msr(vcpu, msr_index, pdata);
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported())
+ return 1;
+ data = vcpu->arch.ia32_xss;
+ break;
case MSR_TSC_AUX:
if (!to_vmx(vcpu)->rdtscp_enabled)
return 1;
@@ -2649,6 +2677,22 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
return 1; /* they are read-only */
+ case MSR_IA32_XSS:
+ if (!vmx_xsaves_supported())
+ return 1;
+ /*
+ * The only supported bit as of Skylake is bit 8, but
+ * it is not supported on KVM.
+ */
+ if (data != 0)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ if (vcpu->arch.ia32_xss != host_xss)
+ add_atomic_switch_msr(vmx, MSR_IA32_XSS,
+ vcpu->arch.ia32_xss, host_xss);
+ else
+ clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
+ break;
case MSR_TSC_AUX:
if (!vmx->rdtscp_enabled)
return 1;
@@ -2884,7 +2928,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_SHADOW_VMCS;
+ SECONDARY_EXEC_SHADOW_VMCS |
+ SECONDARY_EXEC_XSAVES;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -3007,6 +3052,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
}
}
+ if (cpu_has_xsaves)
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
return 0;
}
@@ -3110,76 +3158,6 @@ static __init int alloc_kvm_area(void)
return 0;
}
-static __init int hardware_setup(void)
-{
- if (setup_vmcs_config(&vmcs_config) < 0)
- return -EIO;
-
- if (boot_cpu_has(X86_FEATURE_NX))
- kvm_enable_efer_bits(EFER_NX);
-
- if (!cpu_has_vmx_vpid())
- enable_vpid = 0;
- if (!cpu_has_vmx_shadow_vmcs())
- enable_shadow_vmcs = 0;
- if (enable_shadow_vmcs)
- init_vmcs_shadow_fields();
-
- if (!cpu_has_vmx_ept() ||
- !cpu_has_vmx_ept_4levels()) {
- enable_ept = 0;
- enable_unrestricted_guest = 0;
- enable_ept_ad_bits = 0;
- }
-
- if (!cpu_has_vmx_ept_ad_bits())
- enable_ept_ad_bits = 0;
-
- if (!cpu_has_vmx_unrestricted_guest())
- enable_unrestricted_guest = 0;
-
- if (!cpu_has_vmx_flexpriority()) {
- flexpriority_enabled = 0;
-
- /*
- * set_apic_access_page_addr() is used to reload apic access
- * page upon invalidation. No need to do anything if the
- * processor does not have the APIC_ACCESS_ADDR VMCS field.
- */
- kvm_x86_ops->set_apic_access_page_addr = NULL;
- }
-
- if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
-
- if (!cpu_has_vmx_ple())
- ple_gap = 0;
-
- if (!cpu_has_vmx_apicv())
- enable_apicv = 0;
-
- if (enable_apicv)
- kvm_x86_ops->update_cr8_intercept = NULL;
- else {
- kvm_x86_ops->hwapic_irr_update = NULL;
- kvm_x86_ops->deliver_posted_interrupt = NULL;
- kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
- }
-
- if (nested)
- nested_vmx_setup_ctls_msrs();
-
- return alloc_kvm_area();
-}
-
-static __exit void hardware_unsetup(void)
-{
- free_kvm_area();
-}
-
static bool emulation_required(struct kvm_vcpu *vcpu)
{
return emulate_invalid_guest_state && !guest_state_valid(vcpu);
@@ -4396,6 +4374,7 @@ static void ept_set_mmio_spte_mask(void)
kvm_mmu_set_mmio_spte_mask((0x3ull << 62) | 0x6ull);
}
+#define VMX_XSS_EXIT_BITMAP 0
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -4505,6 +4484,9 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
set_cr4_guest_host_mask(vmx);
+ if (vmx_xsaves_supported())
+ vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
+
return 0;
}
@@ -5163,13 +5145,20 @@ static int handle_cr(struct kvm_vcpu *vcpu)
static int handle_dr(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
- int dr, reg;
+ int dr, dr7, reg;
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
+
+ /* First, if DR does not exist, trigger UD */
+ if (!kvm_require_dr(vcpu, dr))
+ return 1;
/* Do not handle if the CPL > 0, will trigger GP on re-entry */
if (!kvm_require_cpl(vcpu, 0))
return 1;
- dr = vmcs_readl(GUEST_DR7);
- if (dr & DR7_GD) {
+ dr7 = vmcs_readl(GUEST_DR7);
+ if (dr7 & DR7_GD) {
/*
* As the vm-exit takes precedence over the debug trap, we
* need to emulate the latter, either for the host or the
@@ -5177,17 +5166,14 @@ static int handle_dr(struct kvm_vcpu *vcpu)
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
- vcpu->run->debug.arch.dr7 = dr;
- vcpu->run->debug.arch.pc =
- vmcs_readl(GUEST_CS_BASE) +
- vmcs_readl(GUEST_RIP);
+ vcpu->run->debug.arch.dr7 = dr7;
+ vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
return 0;
} else {
- vcpu->arch.dr7 &= ~DR7_GD;
+ vcpu->arch.dr6 &= ~15;
vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
- vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
kvm_queue_exception(vcpu, DB_VECTOR);
return 1;
}
@@ -5209,8 +5195,6 @@ static int handle_dr(struct kvm_vcpu *vcpu)
return 1;
}
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
reg = DEBUG_REG_ACCESS_REG(exit_qualification);
if (exit_qualification & TYPE_MOV_FROM_DR) {
unsigned long val;
@@ -5391,6 +5375,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_xsaves(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
+static int handle_xrstors(struct kvm_vcpu *vcpu)
+{
+ skip_emulated_instruction(vcpu);
+ WARN(1, "this should never happen\n");
+ return 1;
+}
+
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
@@ -5492,7 +5490,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
}
/* clear all local breakpoint enable flags */
- vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x55);
+ vmcs_writel(GUEST_DR7, vmcs_readl(GUEST_DR7) & ~0x155);
/*
* TODO: What about debug traps on tss switch?
@@ -5539,11 +5537,11 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
trace_kvm_page_fault(gpa, exit_qualification);
/* It is a write fault? */
- error_code = exit_qualification & (1U << 1);
+ error_code = exit_qualification & PFERR_WRITE_MASK;
/* It is a fetch fault? */
- error_code |= (exit_qualification & (1U << 2)) << 2;
+ error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
/* ept page table is present? */
- error_code |= (exit_qualification >> 3) & 0x1;
+ error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
vcpu->arch.exit_qualification = exit_qualification;
@@ -5785,6 +5783,204 @@ static void update_ple_window_actual_max(void)
ple_window_grow, INT_MIN);
}
+static __init int hardware_setup(void)
+{
+ int r = -ENOMEM, i, msr;
+
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
+ kvm_define_shared_msr(i, vmx_msr_index[i]);
+
+ vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_a)
+ return r;
+
+ vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_io_bitmap_b)
+ goto out;
+
+ vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy)
+ goto out1;
+
+ vmx_msr_bitmap_legacy_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_legacy_x2apic)
+ goto out2;
+
+ vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode)
+ goto out3;
+
+ vmx_msr_bitmap_longmode_x2apic =
+ (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_msr_bitmap_longmode_x2apic)
+ goto out4;
+ vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmread_bitmap)
+ goto out5;
+
+ vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+ if (!vmx_vmwrite_bitmap)
+ goto out6;
+
+ memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
+ memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
+
+ /*
+ * Allow direct access to the PC debug port (it is often used for I/O
+ * delays, but the vmexits simply slow things down).
+ */
+ memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+ clear_bit(0x80, vmx_io_bitmap_a);
+
+ memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
+
+ memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+ memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
+
+ vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+ vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
+ vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
+
+ memcpy(vmx_msr_bitmap_legacy_x2apic,
+ vmx_msr_bitmap_legacy, PAGE_SIZE);
+ memcpy(vmx_msr_bitmap_longmode_x2apic,
+ vmx_msr_bitmap_longmode, PAGE_SIZE);
+
+ if (enable_apicv) {
+ for (msr = 0x800; msr <= 0x8ff; msr++)
+ vmx_disable_intercept_msr_read_x2apic(msr);
+
+ /* According SDM, in x2apic mode, the whole id reg is used.
+ * But in KVM, it only use the highest eight bits. Need to
+ * intercept it */
+ vmx_enable_intercept_msr_read_x2apic(0x802);
+ /* TMCCT */
+ vmx_enable_intercept_msr_read_x2apic(0x839);
+ /* TPR */
+ vmx_disable_intercept_msr_write_x2apic(0x808);
+ /* EOI */
+ vmx_disable_intercept_msr_write_x2apic(0x80b);
+ /* SELF-IPI */
+ vmx_disable_intercept_msr_write_x2apic(0x83f);
+ }
+
+ if (enable_ept) {
+ kvm_mmu_set_mask_ptes(0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
+ (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
+ 0ull, VMX_EPT_EXECUTABLE_MASK);
+ ept_set_mmio_spte_mask();
+ kvm_enable_tdp();
+ } else
+ kvm_disable_tdp();
+
+ update_ple_window_actual_max();
+
+ if (setup_vmcs_config(&vmcs_config) < 0) {
+ r = -EIO;
+ goto out7;
+ }
+
+ if (boot_cpu_has(X86_FEATURE_NX))
+ kvm_enable_efer_bits(EFER_NX);
+
+ if (!cpu_has_vmx_vpid())
+ enable_vpid = 0;
+ if (!cpu_has_vmx_shadow_vmcs())
+ enable_shadow_vmcs = 0;
+ if (enable_shadow_vmcs)
+ init_vmcs_shadow_fields();
+
+ if (!cpu_has_vmx_ept() ||
+ !cpu_has_vmx_ept_4levels()) {
+ enable_ept = 0;
+ enable_unrestricted_guest = 0;
+ enable_ept_ad_bits = 0;
+ }
+
+ if (!cpu_has_vmx_ept_ad_bits())
+ enable_ept_ad_bits = 0;
+
+ if (!cpu_has_vmx_unrestricted_guest())
+ enable_unrestricted_guest = 0;
+
+ if (!cpu_has_vmx_flexpriority()) {
+ flexpriority_enabled = 0;
+
+ /*
+ * set_apic_access_page_addr() is used to reload apic access
+ * page upon invalidation. No need to do anything if the
+ * processor does not have the APIC_ACCESS_ADDR VMCS field.
+ */
+ kvm_x86_ops->set_apic_access_page_addr = NULL;
+ }
+
+ if (!cpu_has_vmx_tpr_shadow())
+ kvm_x86_ops->update_cr8_intercept = NULL;
+
+ if (enable_ept && !cpu_has_vmx_ept_2m_page())
+ kvm_disable_largepages();
+
+ if (!cpu_has_vmx_ple())
+ ple_gap = 0;
+
+ if (!cpu_has_vmx_apicv())
+ enable_apicv = 0;
+
+ if (enable_apicv)
+ kvm_x86_ops->update_cr8_intercept = NULL;
+ else {
+ kvm_x86_ops->hwapic_irr_update = NULL;
+ kvm_x86_ops->deliver_posted_interrupt = NULL;
+ kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
+ }
+
+ if (nested)
+ nested_vmx_setup_ctls_msrs();
+
+ return alloc_kvm_area();
+
+out7:
+ free_page((unsigned long)vmx_vmwrite_bitmap);
+out6:
+ free_page((unsigned long)vmx_vmread_bitmap);
+out5:
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+out4:
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+out3:
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+out2:
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+out1:
+ free_page((unsigned long)vmx_io_bitmap_b);
+out:
+ free_page((unsigned long)vmx_io_bitmap_a);
+
+ return r;
+}
+
+static __exit void hardware_unsetup(void)
+{
+ free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
+ free_page((unsigned long)vmx_msr_bitmap_legacy);
+ free_page((unsigned long)vmx_msr_bitmap_longmode);
+ free_page((unsigned long)vmx_io_bitmap_b);
+ free_page((unsigned long)vmx_io_bitmap_a);
+ free_page((unsigned long)vmx_vmwrite_bitmap);
+ free_page((unsigned long)vmx_vmread_bitmap);
+
+ free_kvm_area();
+}
+
/*
* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
@@ -6361,58 +6557,60 @@ static inline int vmcs_field_readonly(unsigned long field)
* some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
* 64-bit fields are to be returned).
*/
-static inline bool vmcs12_read_any(struct kvm_vcpu *vcpu,
- unsigned long field, u64 *ret)
+static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 *ret)
{
short offset = vmcs_field_to_offset(field);
char *p;
if (offset < 0)
- return 0;
+ return offset;
p = ((char *)(get_vmcs12(vcpu))) + offset;
switch (vmcs_field_type(field)) {
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
*ret = *((natural_width *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U16:
*ret = *((u16 *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U32:
*ret = *((u32 *)p);
- return 1;
+ return 0;
case VMCS_FIELD_TYPE_U64:
*ret = *((u64 *)p);
- return 1;
+ return 0;
default:
- return 0; /* can never happen. */
+ WARN_ON(1);
+ return -ENOENT;
}
}
-static inline bool vmcs12_write_any(struct kvm_vcpu *vcpu,
- unsigned long field, u64 field_value){
+static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+ unsigned long field, u64 field_value){
short offset = vmcs_field_to_offset(field);
char *p = ((char *) get_vmcs12(vcpu)) + offset;
if (offset < 0)
- return false;
+ return offset;
switch (vmcs_field_type(field)) {
case VMCS_FIELD_TYPE_U16:
*(u16 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_U32:
*(u32 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_U64:
*(u64 *)p = field_value;
- return true;
+ return 0;
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
*(natural_width *)p = field_value;
- return true;
+ return 0;
default:
- return false; /* can never happen. */
+ WARN_ON(1);
+ return -ENOENT;
}
}
@@ -6445,6 +6643,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
field_value = vmcs_readl(field);
break;
+ default:
+ WARN_ON(1);
+ continue;
}
vmcs12_write_any(&vmx->vcpu, field, field_value);
}
@@ -6490,6 +6691,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
case VMCS_FIELD_TYPE_NATURAL_WIDTH:
vmcs_writel(field, (long)field_value);
break;
+ default:
+ WARN_ON(1);
+ break;
}
}
}
@@ -6528,7 +6732,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Read the field, zero-extended to a u64 field_value */
- if (!vmcs12_read_any(vcpu, field, &field_value)) {
+ if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
@@ -6598,7 +6802,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
return 1;
}
- if (!vmcs12_write_any(vcpu, field, field_value)) {
+ if (vmcs12_write_any(vcpu, field, field_value) < 0) {
nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
skip_emulated_instruction(vcpu);
return 1;
@@ -6802,6 +7006,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_MONITOR_INSTRUCTION] = handle_monitor,
[EXIT_REASON_INVEPT] = handle_invept,
[EXIT_REASON_INVVPID] = handle_invvpid,
+ [EXIT_REASON_XSAVES] = handle_xsaves,
+ [EXIT_REASON_XRSTORS] = handle_xrstors,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7089,6 +7295,14 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
case EXIT_REASON_XSETBV:
return 1;
+ case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
+ /*
+ * This should never happen, since it is not possible to
+ * set XSS to a non-zero value---neither in L1 nor in L2.
+ * If if it were, XSS would have to be checked against
+ * the XSS exit bitmap in vmcs12.
+ */
+ return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
default:
return 1;
}
@@ -7277,6 +7491,9 @@ static void vmx_set_rvi(int vector)
u16 status;
u8 old;
+ if (vector == -1)
+ vector = 0;
+
status = vmcs_read16(GUEST_INTR_STATUS);
old = (u8)status & 0xff;
if ((u8)vector != old) {
@@ -7288,22 +7505,23 @@ static void vmx_set_rvi(int vector)
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
+ if (!is_guest_mode(vcpu)) {
+ vmx_set_rvi(max_irr);
+ return;
+ }
+
if (max_irr == -1)
return;
/*
- * If a vmexit is needed, vmx_check_nested_events handles it.
+ * In guest mode. If a vmexit is needed, vmx_check_nested_events
+ * handles it.
*/
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ if (nested_exit_on_intr(vcpu))
return;
- if (!is_guest_mode(vcpu)) {
- vmx_set_rvi(max_irr);
- return;
- }
-
/*
- * Fall back to pre-APICv interrupt injection since L2
+ * Else, fall back to pre-APICv interrupt injection since L2
* is run without virtual interrupt delivery.
*/
if (!kvm_event_needs_reinjection(vcpu) &&
@@ -7400,6 +7618,12 @@ static bool vmx_mpx_supported(void)
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
}
+static bool vmx_xsaves_supported(void)
+{
+ return vmcs_config.cpu_based_2nd_exec_ctrl &
+ SECONDARY_EXEC_XSAVES;
+}
+
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -8135,6 +8359,8 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap);
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
@@ -8775,6 +9001,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
if (vmx_mpx_supported())
vmcs12->guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
+ if (nested_cpu_has_xsaves(vmcs12))
+ vmcs12->xss_exit_bitmap = vmcs_read64(XSS_EXIT_BITMAP);
/* update exit information fields: */
@@ -9176,6 +9404,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
.check_intercept = vmx_check_intercept,
.handle_external_intr = vmx_handle_external_intr,
.mpx_supported = vmx_mpx_supported,
+ .xsaves_supported = vmx_xsaves_supported,
.check_nested_events = vmx_check_nested_events,
@@ -9184,150 +9413,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
static int __init vmx_init(void)
{
- int r, i, msr;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
-
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
-
- vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_a)
- return -ENOMEM;
-
- r = -ENOMEM;
-
- vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_io_bitmap_b)
- goto out;
-
- vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy)
- goto out1;
-
- vmx_msr_bitmap_legacy_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_legacy_x2apic)
- goto out2;
-
- vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode)
- goto out3;
-
- vmx_msr_bitmap_longmode_x2apic =
- (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_msr_bitmap_longmode_x2apic)
- goto out4;
- vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmread_bitmap)
- goto out5;
-
- vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
- if (!vmx_vmwrite_bitmap)
- goto out6;
-
- memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);
- memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE);
-
- /*
- * Allow direct access to the PC debug port (it is often used for I/O
- * delays, but the vmexits simply slow things down).
- */
- memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
- clear_bit(0x80, vmx_io_bitmap_a);
-
- memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
-
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
-
- set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
-
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
- __alignof__(struct vcpu_vmx), THIS_MODULE);
+ int r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ __alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
- goto out7;
+ return r;
#ifdef CONFIG_KEXEC
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
#endif
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
- vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true);
-
- memcpy(vmx_msr_bitmap_legacy_x2apic,
- vmx_msr_bitmap_legacy, PAGE_SIZE);
- memcpy(vmx_msr_bitmap_longmode_x2apic,
- vmx_msr_bitmap_longmode, PAGE_SIZE);
-
- if (enable_apicv) {
- for (msr = 0x800; msr <= 0x8ff; msr++)
- vmx_disable_intercept_msr_read_x2apic(msr);
-
- /* According SDM, in x2apic mode, the whole id reg is used.
- * But in KVM, it only use the highest eight bits. Need to
- * intercept it */
- vmx_enable_intercept_msr_read_x2apic(0x802);
- /* TMCCT */
- vmx_enable_intercept_msr_read_x2apic(0x839);
- /* TPR */
- vmx_disable_intercept_msr_write_x2apic(0x808);
- /* EOI */
- vmx_disable_intercept_msr_write_x2apic(0x80b);
- /* SELF-IPI */
- vmx_disable_intercept_msr_write_x2apic(0x83f);
- }
-
- if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull,
- (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
- (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK);
- ept_set_mmio_spte_mask();
- kvm_enable_tdp();
- } else
- kvm_disable_tdp();
-
- update_ple_window_actual_max();
-
return 0;
-
-out7:
- free_page((unsigned long)vmx_vmwrite_bitmap);
-out6:
- free_page((unsigned long)vmx_vmread_bitmap);
-out5:
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
-out4:
- free_page((unsigned long)vmx_msr_bitmap_longmode);
-out3:
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
-out2:
- free_page((unsigned long)vmx_msr_bitmap_legacy);
-out1:
- free_page((unsigned long)vmx_io_bitmap_b);
-out:
- free_page((unsigned long)vmx_io_bitmap_a);
- return r;
}
static void __exit vmx_exit(void)
{
- free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic);
- free_page((unsigned long)vmx_msr_bitmap_legacy);
- free_page((unsigned long)vmx_msr_bitmap_longmode);
- free_page((unsigned long)vmx_io_bitmap_b);
- free_page((unsigned long)vmx_io_bitmap_a);
- free_page((unsigned long)vmx_vmwrite_bitmap);
- free_page((unsigned long)vmx_vmread_bitmap);
-
#ifdef CONFIG_KEXEC
RCU_INIT_POINTER(crash_vmclear_loaded_vmcss, NULL);
synchronize_rcu();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0033df32a745..c259814200bd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -27,6 +27,7 @@
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"
+#include "assigned-dev.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -353,6 +354,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
if (!vcpu->arch.exception.pending) {
queue:
+ if (has_error && !is_protmode(vcpu))
+ has_error = false;
vcpu->arch.exception.pending = true;
vcpu->arch.exception.has_error_code = has_error;
vcpu->arch.exception.nr = nr;
@@ -455,6 +458,16 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);
+bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
+{
+ if ((dr != 4 && dr != 5) || !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+ return true;
+
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+}
+EXPORT_SYMBOL_GPL(kvm_require_dr);
+
/*
* This function will be used to read from the physical memory of the currently
* running guest. The difference to kvm_read_guest_page is that this function
@@ -656,6 +669,12 @@ int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
if ((!(xcr0 & XSTATE_BNDREGS)) != (!(xcr0 & XSTATE_BNDCSR)))
return 1;
+ if (xcr0 & XSTATE_AVX512) {
+ if (!(xcr0 & XSTATE_YMM))
+ return 1;
+ if ((xcr0 & XSTATE_AVX512) != XSTATE_AVX512)
+ return 1;
+ }
kvm_put_guest_xcr0(vcpu);
vcpu->arch.xcr0 = xcr0;
@@ -732,6 +751,10 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
+#ifdef CONFIG_X86_64
+ cr3 &= ~CR3_PCID_INVD;
+#endif
+
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
kvm_mmu_sync_roots(vcpu);
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
@@ -811,8 +834,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
/* fall through */
case 6:
if (val & 0xffffffff00000000ULL)
@@ -821,8 +842,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
kvm_update_dr6(vcpu);
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1; /* #UD */
/* fall through */
default: /* 7 */
if (val & 0xffffffff00000000ULL)
@@ -837,27 +856,21 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
- int res;
-
- res = __kvm_set_dr(vcpu, dr, val);
- if (res > 0)
- kvm_queue_exception(vcpu, UD_VECTOR);
- else if (res < 0)
+ if (__kvm_set_dr(vcpu, dr, val)) {
kvm_inject_gp(vcpu, 0);
-
- return res;
+ return 1;
+ }
+ return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);
-static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
switch (dr) {
case 0 ... 3:
*val = vcpu->arch.db[dr];
break;
case 4:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
/* fall through */
case 6:
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
@@ -866,23 +879,11 @@ static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
*val = kvm_x86_ops->get_dr6(vcpu);
break;
case 5:
- if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
- return 1;
/* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
break;
}
-
- return 0;
-}
-
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
-{
- if (_kvm_get_dr(vcpu, dr, val)) {
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
- }
return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);
@@ -1237,21 +1238,22 @@ void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
{
#ifdef CONFIG_X86_64
bool vcpus_matched;
- bool do_request = false;
struct kvm_arch *ka = &vcpu->kvm->arch;
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
atomic_read(&vcpu->kvm->online_vcpus));
- if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC)
- if (!ka->use_master_clock)
- do_request = 1;
-
- if (!vcpus_matched && ka->use_master_clock)
- do_request = 1;
-
- if (do_request)
+ /*
+ * Once the masterclock is enabled, always perform request in
+ * order to update it.
+ *
+ * In order to enable masterclock, the host clocksource must be TSC
+ * and the vcpus need to have matched TSCs. When that happens,
+ * perform request to enable masterclock.
+ */
+ if (ka->use_master_clock ||
+ (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched))
kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
@@ -1637,16 +1639,16 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
vcpu->last_guest_tsc = tsc_timestamp;
+ if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+ &guest_hv_clock, sizeof(guest_hv_clock))))
+ return 0;
+
/*
* The interface expects us to write an even number signaling that the
* update is finished. Since the guest won't see the intermediate
* state, we just increase by 2 at the end.
*/
- vcpu->hv_clock.version += 2;
-
- if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
- &guest_hv_clock, sizeof(guest_hv_clock))))
- return 0;
+ vcpu->hv_clock.version = guest_hv_clock.version + 2;
/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
@@ -1662,6 +1664,8 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
vcpu->hv_clock.flags = pvclock_flags;
+ trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
&vcpu->hv_clock,
sizeof(vcpu->hv_clock));
@@ -2140,7 +2144,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC_ADJUST:
if (guest_cpuid_has_tsc_adjust(vcpu)) {
if (!msr_info->host_initiated) {
- u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
+ s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
@@ -3106,7 +3110,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
unsigned long val;
memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
- _kvm_get_dr(vcpu, 6, &val);
+ kvm_get_dr(vcpu, 6, &val);
dbgregs->dr6 = val;
dbgregs->dr7 = vcpu->arch.dr7;
dbgregs->flags = 0;
@@ -3128,15 +3132,89 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
return 0;
}
+#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
+
+static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
+{
+ struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+ u64 xstate_bv = xsave->xsave_hdr.xstate_bv;
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(dest, xsave, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV */
+ *(u64 *)(dest + XSAVE_HDR_OFFSET) = xstate_bv;
+
+ /*
+ * Copy each region from the possibly compacted offset to the
+ * non-compacted offset.
+ */
+ valid = xstate_bv & ~XSTATE_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *src = get_xsave_addr(xsave, feature);
+
+ if (src) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ memcpy(dest + offset, src, size);
+ }
+
+ valid -= feature;
+ }
+}
+
+static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
+{
+ struct xsave_struct *xsave = &vcpu->arch.guest_fpu.state->xsave;
+ u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
+ u64 valid;
+
+ /*
+ * Copy legacy XSAVE area, to avoid complications with CPUID
+ * leaves 0 and 1 in the loop below.
+ */
+ memcpy(xsave, src, XSAVE_HDR_OFFSET);
+
+ /* Set XSTATE_BV and possibly XCOMP_BV. */
+ xsave->xsave_hdr.xstate_bv = xstate_bv;
+ if (cpu_has_xsaves)
+ xsave->xsave_hdr.xcomp_bv = host_xcr0 | XSTATE_COMPACTION_ENABLED;
+
+ /*
+ * Copy each region from the non-compacted offset to the
+ * possibly compacted offset.
+ */
+ valid = xstate_bv & ~XSTATE_FPSSE;
+ while (valid) {
+ u64 feature = valid & -valid;
+ int index = fls64(feature) - 1;
+ void *dest = get_xsave_addr(xsave, feature);
+
+ if (dest) {
+ u32 size, offset, ecx, edx;
+ cpuid_count(XSTATE_CPUID, index,
+ &size, &offset, &ecx, &edx);
+ memcpy(dest, src + offset, size);
+ } else
+ WARN_ON_ONCE(1);
+
+ valid -= feature;
+ }
+}
+
static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
struct kvm_xsave *guest_xsave)
{
if (cpu_has_xsave) {
- memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state->xsave,
- vcpu->arch.guest_xstate_size);
- *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] &=
- vcpu->arch.guest_supported_xcr0 | XSTATE_FPSSE;
+ memset(guest_xsave, 0, sizeof(struct kvm_xsave));
+ fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
&vcpu->arch.guest_fpu.state->fxsave,
@@ -3160,8 +3238,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
*/
if (xstate_bv & ~kvm_supported_xcr0())
return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state->xsave,
- guest_xsave->region, vcpu->arch.guest_xstate_size);
+ load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
if (xstate_bv & ~XSTATE_FPSSE)
return -EINVAL;
@@ -4004,7 +4081,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
}
default:
- ;
+ r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg);
}
out:
return r;
@@ -4667,7 +4744,7 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
{
- return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
+ return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
}
int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
@@ -5211,21 +5288,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag
static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
- struct kvm_run *kvm_run = vcpu->run;
- unsigned long eip = vcpu->arch.emulate_ctxt.eip;
- u32 dr6 = 0;
-
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
- dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ struct kvm_run *kvm_run = vcpu->run;
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.guest_debug_dr7,
vcpu->arch.eff_db);
if (dr6 != 0) {
kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
- kvm_run->debug.arch.pc = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
-
+ kvm_run->debug.arch.pc = eip;
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
*r = EMULATE_USER_EXIT;
@@ -5235,7 +5308,8 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
!(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
- dr6 = kvm_vcpu_check_hw_bp(eip, 0,
+ unsigned long eip = kvm_get_linear_rip(vcpu);
+ u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
vcpu->arch.db);
@@ -5365,7 +5439,9 @@ restart:
kvm_rip_write(vcpu, ctxt->eip);
if (r == EMULATE_DONE)
kvm_vcpu_check_singlestep(vcpu, rflags, &r);
- __kvm_set_rflags(vcpu, ctxt->eflags);
+ if (!ctxt->have_exception ||
+ exception_type(ctxt->exception.vector) == EXCPT_TRAP)
+ __kvm_set_rflags(vcpu, ctxt->eflags);
/*
* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
@@ -5965,6 +6041,12 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
+ if (vcpu->arch.exception.nr == DB_VECTOR &&
+ (vcpu->arch.dr7 & DR7_GD)) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
+ }
+
kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
@@ -6873,6 +6955,9 @@ int fx_init(struct kvm_vcpu *vcpu)
return err;
fpu_finit(&vcpu->arch.guest_fpu);
+ if (cpu_has_xsaves)
+ vcpu->arch.guest_fpu.state->xsave.xsave_hdr.xcomp_bv =
+ host_xcr0 | XSTATE_COMPACTION_ENABLED;
/*
* Ensure guest xcr0 is valid for loading
@@ -7024,7 +7109,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
kvm_x86_ops->vcpu_reset(vcpu);
}
-void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, unsigned int vector)
+void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
{
struct kvm_segment cs;
@@ -7256,6 +7341,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (type)
return -EINVAL;
+ INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
@@ -7536,12 +7622,18 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
-bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
{
- unsigned long current_rip = kvm_rip_read(vcpu) +
- get_segment_base(vcpu, VCPU_SREG_CS);
+ if (is_64_bit_mode(vcpu))
+ return kvm_rip_read(vcpu);
+ return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
+ kvm_rip_read(vcpu));
+}
+EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
- return current_rip == linear_rip;
+bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
+{
+ return kvm_get_linear_rip(vcpu) == linear_rip;
}
EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 7cb9c45a5fe0..cc1d61af6140 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -162,7 +162,8 @@ int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
bool kvm_mtrr_valid(struct kvm_vcpu *vcpu, u32 msr, u64 data);
#define KVM_SUPPORTED_XCR0 (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \
- | XSTATE_BNDREGS | XSTATE_BNDCSR)
+ | XSTATE_BNDREGS | XSTATE_BNDCSR \
+ | XSTATE_AVX512)
extern u64 host_xcr0;
extern u64 kvm_supported_xcr0(void);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index aae94132bc24..c1c1544b8485 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -841,7 +841,7 @@ static void __init lguest_init_IRQ(void)
{
unsigned int i;
- for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+ for (i = FIRST_EXTERNAL_VECTOR; i < FIRST_SYSTEM_VECTOR; i++) {
/* Some systems map "vectors" to interrupts weirdly. Not us! */
__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
if (i != SYSCALL_VECTOR)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b74a7e130b03..38dcec403b46 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1247,7 +1247,7 @@ good_area:
}
/* User mode? Just return to handle the fatal exception */
- if (fault & FAULT_FLAG_USER)
+ if (flags & FAULT_FLAG_USER)
return;
/* Not returning to user mode? Handle exceptions or die: */
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 207d9aef662d..d7547824e763 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -15,7 +15,7 @@
static inline pte_t gup_get_pte(pte_t *ptep)
{
#ifndef CONFIG_X86_PAE
- return ACCESS_ONCE(*ptep);
+ return READ_ONCE(*ptep);
#else
/*
* With get_user_pages_fast, we walk down the pagetables without taking
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfaf2e0f5f8f..536ea2fb6e33 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -384,6 +384,26 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address,
}
/*
+ * Lookup the PMD entry for a virtual address. Return a pointer to the entry
+ * or NULL if not present.
+ */
+pmd_t *lookup_pmd_address(unsigned long address)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+
+ pgd = pgd_offset_k(address);
+ if (pgd_none(*pgd))
+ return NULL;
+
+ pud = pud_offset(pgd, address);
+ if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud))
+ return NULL;
+
+ return pmd_offset(pud, address);
+}
+
+/*
* This is necessary because __pa() does not work on some
* kinds of memory, like vmalloc() or the alloc_remap()
* areas on 32-bit NUMA systems. The percpu areas can
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index b9958c364075..44b9271580b5 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -210,6 +210,9 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
{
int polarity;
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
+
if (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_TANGIER)
polarity = 0; /* active high */
else
@@ -224,13 +227,18 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
if (mp_map_gsi_to_irq(dev->irq, IOAPIC_MAP_ALLOC) < 0)
return -EBUSY;
+ dev->irq_managed = 1;
+
return 0;
}
static void intel_mid_pci_irq_disable(struct pci_dev *dev)
{
- if (!mp_should_keep_irq(&dev->dev) && dev->irq > 0)
+ if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+ dev->irq > 0) {
mp_unmap_irq(dev->irq);
+ dev->irq_managed = 0;
+ }
}
struct pci_ops intel_mid_pci_ops = {
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index eb500c2592ad..5dc6ca5e1741 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1200,11 +1200,12 @@ static int pirq_enable_irq(struct pci_dev *dev)
#ifdef CONFIG_X86_IO_APIC
struct pci_dev *temp_dev;
int irq;
- struct io_apic_irq_attr irq_attr;
+
+ if (dev->irq_managed && dev->irq > 0)
+ return 0;
irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
- PCI_SLOT(dev->devfn),
- pin - 1, &irq_attr);
+ PCI_SLOT(dev->devfn), pin - 1);
/*
* Busses behind bridges are typically not listed in the MP-table.
* In this case we have to look up the IRQ based on the parent bus,
@@ -1218,7 +1219,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
pin = pci_swizzle_interrupt_pin(dev, pin);
irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
PCI_SLOT(bridge->devfn),
- pin - 1, &irq_attr);
+ pin - 1);
if (irq >= 0)
dev_warn(&dev->dev, "using bridge %s "
"INT %c to get IRQ %d\n",
@@ -1228,6 +1229,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
}
dev = temp_dev;
if (irq >= 0) {
+ dev->irq_managed = 1;
dev->irq = irq;
dev_info(&dev->dev, "PCI->APIC IRQ transform: "
"INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
@@ -1254,11 +1256,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
return 0;
}
+bool mp_should_keep_irq(struct device *dev)
+{
+ if (dev->power.is_prepared)
+ return true;
+#ifdef CONFIG_PM
+ if (dev->power.runtime_status == RPM_SUSPENDING)
+ return true;
+#endif
+
+ return false;
+}
+
static void pirq_disable_irq(struct pci_dev *dev)
{
if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
- dev->irq) {
+ dev->irq_managed && dev->irq) {
mp_unmap_irq(dev->irq);
dev->irq = 0;
+ dev->irq_managed = 0;
}
}
diff --git a/arch/x86/platform/uv/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index b233681af4de..0ce673645432 100644
--- a/arch/x86/platform/uv/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -131,7 +131,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
unsigned long mmr_offset, int limit)
{
const struct cpumask *eligible_cpu = cpumask_of(cpu);
- struct irq_cfg *cfg = irq_get_chip_data(irq);
+ struct irq_cfg *cfg = irq_cfg(irq);
unsigned long mmr_value;
struct uv_IO_APIC_route_entry *entry;
int mmr_pnode, err;
@@ -198,13 +198,13 @@ static int
uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
bool force)
{
- struct irq_cfg *cfg = data->chip_data;
+ struct irq_cfg *cfg = irqd_cfg(data);
unsigned int dest;
unsigned long mmr_value, mmr_offset;
struct uv_IO_APIC_route_entry *entry;
int mmr_pnode;
- if (__ioapic_set_affinity(data, mask, &dest))
+ if (apic_set_affinity(data, mask, &dest))
return -1;
mmr_value = 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 8c8298d78185..5c1f9ace7ae7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -387,7 +387,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
unsigned long mfn;
if (!xen_feature(XENFEAT_auto_translated_physmap))
- mfn = get_phys_to_machine(pfn);
+ mfn = __pfn_to_mfn(pfn);
else
mfn = pfn;
/*
@@ -1113,20 +1113,16 @@ static void __init xen_cleanhighmap(unsigned long vaddr,
* instead of somewhere later and be confusing. */
xen_mc_flush();
}
-static void __init xen_pagetable_p2m_copy(void)
+
+static void __init xen_pagetable_p2m_free(void)
{
unsigned long size;
unsigned long addr;
- unsigned long new_mfn_list;
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return;
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- new_mfn_list = xen_revector_p2m_tree();
/* No memory or already called. */
- if (!new_mfn_list || new_mfn_list == xen_start_info->mfn_list)
+ if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
return;
/* using __ka address and sticking INVALID_P2M_ENTRY! */
@@ -1144,8 +1140,6 @@ static void __init xen_pagetable_p2m_copy(void)
size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
memblock_free(__pa(xen_start_info->mfn_list), size);
- /* And revector! Bye bye old array */
- xen_start_info->mfn_list = new_mfn_list;
/* At this stage, cleanup_highmap has already cleaned __ka space
* from _brk_limit way up to the max_pfn_mapped (which is the end of
@@ -1169,17 +1163,35 @@ static void __init xen_pagetable_p2m_copy(void)
}
#endif
-static void __init xen_pagetable_init(void)
+static void __init xen_pagetable_p2m_setup(void)
{
- paging_init();
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return;
+
+ xen_vmalloc_p2m_tree();
+
#ifdef CONFIG_X86_64
- xen_pagetable_p2m_copy();
+ xen_pagetable_p2m_free();
#endif
+ /* And revector! Bye bye old array */
+ xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
+}
+
+static void __init xen_pagetable_init(void)
+{
+ paging_init();
+ xen_post_allocator_init();
+
+ xen_pagetable_p2m_setup();
+
/* Allocate and initialize top and mid mfn levels for p2m structure */
xen_build_mfn_list_list();
+ /* Remap memory freed due to conflicts with E820 map */
+ if (!xen_feature(XENFEAT_auto_translated_physmap))
+ xen_remap_memory();
+
xen_setup_shared_info();
- xen_post_allocator_init();
}
static void xen_write_cr2(unsigned long cr2)
{
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index b456b048eca9..edbc7a63fd73 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -3,21 +3,22 @@
* guests themselves, but it must also access and update the p2m array
* during suspend/resume when all the pages are reallocated.
*
- * The p2m table is logically a flat array, but we implement it as a
- * three-level tree to allow the address space to be sparse.
+ * The logical flat p2m table is mapped to a linear kernel memory area.
+ * For accesses by Xen a three-level tree linked via mfns only is set up to
+ * allow the address space to be sparse.
*
- * Xen
- * |
- * p2m_top p2m_top_mfn
- * / \ / \
- * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
- * / \ / \ / /
- * p2m p2m p2m p2m p2m p2m p2m ...
+ * Xen
+ * |
+ * p2m_top_mfn
+ * / \
+ * p2m_mid_mfn p2m_mid_mfn
+ * / /
+ * p2m p2m p2m ...
*
* The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
*
- * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
- * maximum representable pseudo-physical address space is:
+ * The p2m_top_mfn level is limited to 1 page, so the maximum representable
+ * pseudo-physical address space is:
* P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
*
* P2M_PER_PAGE depends on the architecture, as a mfn is always
@@ -30,6 +31,9 @@
* leaf entries, or for the top root, or middle one, for which there is a void
* entry, we assume it is "missing". So (for example)
* pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
+ * We have a dedicated page p2m_missing with all entries being
+ * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
+ * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
*
* We also have the possibility of setting 1-1 mappings on certain regions, so
* that:
@@ -39,122 +43,20 @@
* PCI BARs, or ACPI spaces), we can create mappings easily because we
* get the PFN value to match the MFN.
*
- * For this to work efficiently we have one new page p2m_identity and
- * allocate (via reserved_brk) any other pages we need to cover the sides
- * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
- * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
- * no other fancy value).
+ * For this to work efficiently we have one new page p2m_identity. All entries
+ * in p2m_identity are set to INVALID_P2M_ENTRY type (Xen toolstack only
+ * recognizes that and MFNs, no other fancy value).
*
* On lookup we spot that the entry points to p2m_identity and return the
* identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
* If the entry points to an allocated page, we just proceed as before and
- * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
+ * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
* appropriate functions (pfn_to_mfn).
*
* The reason for having the IDENTITY_FRAME_BIT instead of just returning the
* PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
* non-identity pfn. To protect ourselves against we elect to set (and get) the
* IDENTITY_FRAME_BIT on all identity mapped PFNs.
- *
- * This simplistic diagram is used to explain the more subtle piece of code.
- * There is also a digram of the P2M at the end that can help.
- * Imagine your E820 looking as so:
- *
- * 1GB 2GB 4GB
- * /-------------------+---------\/----\ /----------\ /---+-----\
- * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
- * \-------------------+---------/\----/ \----------/ \---+-----/
- * ^- 1029MB ^- 2001MB
- *
- * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
- * 2048MB = 524288 (0x80000)]
- *
- * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
- * is actually not present (would have to kick the balloon driver to put it in).
- *
- * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
- * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
- * of the PFN and the end PFN (263424 and 512256 respectively). The first step
- * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
- * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
- * aligned on 512^2*PAGE_SIZE (1GB) we reserve_brk new middle and leaf pages as
- * required to split any existing p2m_mid_missing middle pages.
- *
- * With the E820 example above, 263424 is not 1GB aligned so we allocate a
- * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
- * Each entry in the allocate page is "missing" (points to p2m_missing).
- *
- * Next stage is to determine if we need to do a more granular boundary check
- * on the 4MB (or 2MB depending on architecture) off the start and end pfn's.
- * We check if the start pfn and end pfn violate that boundary check, and if
- * so reserve_brk a (p2m[x][y]) leaf page. This way we have a much finer
- * granularity of setting which PFNs are missing and which ones are identity.
- * In our example 263424 and 512256 both fail the check so we reserve_brk two
- * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
- * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
- *
- * At this point we would at minimum reserve_brk one page, but could be up to
- * three. Each call to set_phys_range_identity has at maximum a three page
- * cost. If we were to query the P2M at this stage, all those entries from
- * start PFN through end PFN (so 1029MB -> 2001MB) would return
- * INVALID_P2M_ENTRY ("missing").
- *
- * The next step is to walk from the start pfn to the end pfn setting
- * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
- * If we find that the middle entry is pointing to p2m_missing we can swap it
- * over to p2m_identity - this way covering 4MB (or 2MB) PFN space (and
- * similarly swapping p2m_mid_missing for p2m_mid_identity for larger regions).
- * At this point we do not need to worry about boundary aligment (so no need to
- * reserve_brk a middle page, figure out which PFNs are "missing" and which
- * ones are identity), as that has been done earlier. If we find that the
- * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
- * that page (which covers 512 PFNs) and set the appropriate PFN with
- * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
- * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
- * IDENTITY_FRAME_BIT set.
- *
- * All other regions that are void (or not filled) either point to p2m_missing
- * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
- * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
- * contain the INVALID_P2M_ENTRY value and are considered "missing."
- *
- * Finally, the region beyond the end of of the E820 (4 GB in this example)
- * is set to be identity (in case there are MMIO regions placed here).
- *
- * This is what the p2m ends up looking (for the E820 above) with this
- * fabulous drawing:
- *
- * p2m /--------------\
- * /-----\ | &mfn_list[0],| /-----------------\
- * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
- * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
- * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
- * |-----| \ | [p2m_identity]+\\ | .... |
- * | 2 |--\ \-------------------->| ... | \\ \----------------/
- * |-----| \ \---------------/ \\
- * | 3 |-\ \ \\ p2m_identity [1]
- * |-----| \ \-------------------->/---------------\ /-----------------\
- * | .. |\ | | [p2m_identity]+-->| ~0, ~0, ~0, ... |
- * \-----/ | | | [p2m_identity]+-->| ..., ~0 |
- * | | | .... | \-----------------/
- * | | +-[x], ~0, ~0.. +\
- * | | \---------------/ \
- * | | \-> /---------------\
- * | V p2m_mid_missing p2m_missing | IDENTITY[@0] |
- * | /-----------------\ /------------\ | IDENTITY[@256]|
- * | | [p2m_missing] +---->| ~0, ~0, ...| | ~0, ~0, .... |
- * | | [p2m_missing] +---->| ..., ~0 | \---------------/
- * | | ... | \------------/
- * | \-----------------/
- * |
- * | p2m_mid_identity
- * | /-----------------\
- * \-->| [p2m_identity] +---->[1]
- * | [p2m_identity] +---->[1]
- * | ... |
- * \-----------------/
- *
- * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
*/
#include <linux/init.h>
@@ -164,9 +66,11 @@
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/bootmem.h>
+#include <linux/slab.h>
#include <asm/cache.h>
#include <asm/setup.h>
+#include <asm/uaccess.h>
#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
@@ -178,31 +82,26 @@
#include "multicalls.h"
#include "xen-ops.h"
+#define PMDS_PER_MID_PAGE (P2M_MID_PER_PAGE / PTRS_PER_PTE)
+
static void __init m2p_override_init(void);
+unsigned long *xen_p2m_addr __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_addr);
+unsigned long xen_p2m_size __read_mostly;
+EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
+EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
+
+static DEFINE_SPINLOCK(p2m_update_lock);
static unsigned long *p2m_mid_missing_mfn;
static unsigned long *p2m_top_mfn;
static unsigned long **p2m_top_mfn_p;
-
-/* Placeholders for holes in the address space */
-static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
-
-static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
-static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_identity, P2M_MID_PER_PAGE);
-
-RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
-
-/* For each I/O range remapped we may lose up to two leaf pages for the boundary
- * violations and three mid pages to cover up to 3GB. With
- * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
- * remapped region.
- */
-RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
+static unsigned long *p2m_missing;
+static unsigned long *p2m_identity;
+static pte_t *p2m_missing_pte;
+static pte_t *p2m_identity_pte;
static inline unsigned p2m_top_index(unsigned long pfn)
{
@@ -220,14 +119,6 @@ static inline unsigned p2m_index(unsigned long pfn)
return pfn % P2M_PER_PAGE;
}
-static void p2m_top_init(unsigned long ***top)
-{
- unsigned i;
-
- for (i = 0; i < P2M_TOP_PER_PAGE; i++)
- top[i] = p2m_mid_missing;
-}
-
static void p2m_top_mfn_init(unsigned long *top)
{
unsigned i;
@@ -244,28 +135,43 @@ static void p2m_top_mfn_p_init(unsigned long **top)
top[i] = p2m_mid_missing_mfn;
}
-static void p2m_mid_init(unsigned long **mid, unsigned long *leaf)
+static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
unsigned i;
for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = leaf;
+ mid[i] = virt_to_mfn(leaf);
}
-static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
+static void p2m_init(unsigned long *p2m)
{
unsigned i;
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- mid[i] = virt_to_mfn(leaf);
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = INVALID_P2M_ENTRY;
}
-static void p2m_init(unsigned long *p2m)
+static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
{
unsigned i;
- for (i = 0; i < P2M_MID_PER_PAGE; i++)
- p2m[i] = INVALID_P2M_ENTRY;
+ for (i = 0; i < P2M_PER_PAGE; i++)
+ p2m[i] = IDENTITY_FRAME(pfn + i);
+}
+
+static void * __ref alloc_p2m_page(void)
+{
+ if (unlikely(!slab_is_available()))
+ return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+
+ return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
+}
+
+/* Only to be called in case of a race for a page just allocated! */
+static void free_p2m_page(void *p)
+{
+ BUG_ON(!slab_is_available());
+ free_page((unsigned long)p);
}
/*
@@ -280,40 +186,46 @@ static void p2m_init(unsigned long *p2m)
*/
void __ref xen_build_mfn_list_list(void)
{
- unsigned long pfn;
+ unsigned long pfn, mfn;
+ pte_t *ptep;
+ unsigned int level, topidx, mididx;
+ unsigned long *mid_mfn_p;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
/* Pre-initialize p2m_top_mfn to be completely missing */
if (p2m_top_mfn == NULL) {
- p2m_mid_missing_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_mid_missing_mfn = alloc_p2m_page();
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
- p2m_top_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn_p = alloc_p2m_page();
p2m_top_mfn_p_init(p2m_top_mfn_p);
- p2m_top_mfn = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ p2m_top_mfn = alloc_p2m_page();
p2m_top_mfn_init(p2m_top_mfn);
} else {
/* Reinitialise, mfn's all change after migration */
p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
}
- for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned long **mid;
- unsigned long *mid_mfn_p;
+ for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
+ pfn += P2M_PER_PAGE) {
+ topidx = p2m_top_index(pfn);
+ mididx = p2m_mid_index(pfn);
- mid = p2m_top[topidx];
mid_mfn_p = p2m_top_mfn_p[topidx];
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
+ &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ mfn = pte_mfn(*ptep);
+ ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
/* Don't bother allocating any mfn mid levels if
* they're just missing, just update the stored mfn,
* since all could have changed over a migrate.
*/
- if (mid == p2m_mid_missing) {
+ if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
BUG_ON(mididx);
BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
@@ -322,19 +234,14 @@ void __ref xen_build_mfn_list_list(void)
}
if (mid_mfn_p == p2m_mid_missing_mfn) {
- /*
- * XXX boot-time only! We should never find
- * missing parts of the mfn tree after
- * runtime.
- */
- mid_mfn_p = alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE);
+ mid_mfn_p = alloc_p2m_page();
p2m_mid_mfn_init(mid_mfn_p, p2m_missing);
p2m_top_mfn_p[topidx] = mid_mfn_p;
}
p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
- mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
+ mid_mfn_p[mididx] = mfn;
}
}
@@ -353,171 +260,235 @@ void xen_setup_mfn_list_list(void)
/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
- unsigned long *mfn_list;
- unsigned long max_pfn;
unsigned long pfn;
if (xen_feature(XENFEAT_auto_translated_physmap))
return;
- mfn_list = (unsigned long *)xen_start_info->mfn_list;
- max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
- xen_max_p2m_pfn = max_pfn;
+ xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
+ xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);
- p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_missing);
- p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_init(p2m_identity);
+ for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
+ xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;
- p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_missing, p2m_missing);
- p2m_mid_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(p2m_mid_identity, p2m_identity);
+ xen_max_p2m_pfn = xen_p2m_size;
+}
- p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_top_init(p2m_top);
+#define P2M_TYPE_IDENTITY 0
+#define P2M_TYPE_MISSING 1
+#define P2M_TYPE_PFN 2
+#define P2M_TYPE_UNKNOWN 3
- /*
- * The domain builder gives us a pre-constructed p2m array in
- * mfn_list for all the pages initially given to us, so we just
- * need to graft that into our tree structure.
- */
- for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
+static int xen_p2m_elem_type(unsigned long pfn)
+{
+ unsigned long mfn;
- if (p2m_top[topidx] == p2m_mid_missing) {
- unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
- p2m_mid_init(mid, p2m_missing);
+ if (pfn >= xen_p2m_size)
+ return P2M_TYPE_IDENTITY;
- p2m_top[topidx] = mid;
- }
+ mfn = xen_p2m_addr[pfn];
- /*
- * As long as the mfn_list has enough entries to completely
- * fill a p2m page, pointing into the array is ok. But if
- * not the entries beyond the last pfn will be undefined.
- */
- if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
- unsigned long p2midx;
+ if (mfn == INVALID_P2M_ENTRY)
+ return P2M_TYPE_MISSING;
- p2midx = max_pfn % P2M_PER_PAGE;
- for ( ; p2midx < P2M_PER_PAGE; p2midx++)
- mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
- }
- p2m_top[topidx][mididx] = &mfn_list[pfn];
- }
+ if (mfn & IDENTITY_FRAME_BIT)
+ return P2M_TYPE_IDENTITY;
- m2p_override_init();
+ return P2M_TYPE_PFN;
}
-#ifdef CONFIG_X86_64
-unsigned long __init xen_revector_p2m_tree(void)
+
+static void __init xen_rebuild_p2m_list(unsigned long *p2m)
{
- unsigned long va_start;
- unsigned long va_end;
+ unsigned int i, chunk;
unsigned long pfn;
- unsigned long pfn_free = 0;
- unsigned long *mfn_list = NULL;
- unsigned long size;
-
- va_start = xen_start_info->mfn_list;
- /*We copy in increments of P2M_PER_PAGE * sizeof(unsigned long),
- * so make sure it is rounded up to that */
- size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
- va_end = va_start + size;
-
- /* If we were revectored already, don't do it again. */
- if (va_start <= __START_KERNEL_map && va_start >= __PAGE_OFFSET)
- return 0;
+ unsigned long *mfns;
+ pte_t *ptep;
+ pmd_t *pmdp;
+ int type;
- mfn_list = alloc_bootmem_align(size, PAGE_SIZE);
- if (!mfn_list) {
- pr_warn("Could not allocate space for a new P2M tree!\n");
- return xen_start_info->mfn_list;
- }
- /* Fill it out with INVALID_P2M_ENTRY value */
- memset(mfn_list, 0xFF, size);
+ p2m_missing = alloc_p2m_page();
+ p2m_init(p2m_missing);
+ p2m_identity = alloc_p2m_page();
+ p2m_init(p2m_identity);
- for (pfn = 0; pfn < ALIGN(MAX_DOMAIN_PAGES, P2M_PER_PAGE); pfn += P2M_PER_PAGE) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx;
- unsigned long *mid_p;
+ p2m_missing_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
+ p2m_identity_pte = alloc_p2m_page();
+ paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ set_pte(p2m_missing_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
+ set_pte(p2m_identity_pte + i,
+ pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
+ }
- if (!p2m_top[topidx])
+ for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
+ /*
+ * Try to map missing/identity PMDs or p2m-pages if possible.
+ * We have to respect the structure of the mfn_list_list
+ * which will be built just afterwards.
+ * Chunk size to test is one p2m page if we are in the middle
+ * of a mfn_list_list mid page and the complete mid page area
+ * if we are at index 0 of the mid page. Please note that a
+ * mid page might cover more than one PMD, e.g. on 32 bit PAE
+ * kernels.
+ */
+ chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
+ P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;
+
+ type = xen_p2m_elem_type(pfn);
+ i = 0;
+ if (type != P2M_TYPE_PFN)
+ for (i = 1; i < chunk; i++)
+ if (xen_p2m_elem_type(pfn + i) != type)
+ break;
+ if (i < chunk)
+ /* Reset to minimal chunk size. */
+ chunk = P2M_PER_PAGE;
+
+ if (type == P2M_TYPE_PFN || i < chunk) {
+ /* Use initial p2m page contents. */
+#ifdef CONFIG_X86_64
+ mfns = alloc_p2m_page();
+ copy_page(mfns, xen_p2m_addr + pfn);
+#else
+ mfns = xen_p2m_addr + pfn;
+#endif
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
continue;
+ }
- if (p2m_top[topidx] == p2m_mid_missing)
+ if (chunk == P2M_PER_PAGE) {
+ /* Map complete missing or identity p2m-page. */
+ mfns = (type == P2M_TYPE_MISSING) ?
+ p2m_missing : p2m_identity;
+ ptep = populate_extra_pte((unsigned long)(p2m + pfn));
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
continue;
+ }
- mididx = p2m_mid_index(pfn);
- mid_p = p2m_top[topidx][mididx];
- if (!mid_p)
- continue;
- if ((mid_p == p2m_missing) || (mid_p == p2m_identity))
- continue;
+ /* Complete missing or identity PMD(s) can be mapped. */
+ ptep = (type == P2M_TYPE_MISSING) ?
+ p2m_missing_pte : p2m_identity_pte;
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pmdp = populate_extra_pmd(
+ (unsigned long)(p2m + pfn + i * PTRS_PER_PTE));
+ set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
+ }
+ }
+}
- if ((unsigned long)mid_p == INVALID_P2M_ENTRY)
- continue;
+void __init xen_vmalloc_p2m_tree(void)
+{
+ static struct vm_struct vm;
- /* The old va. Rebase it on mfn_list */
- if (mid_p >= (unsigned long *)va_start && mid_p <= (unsigned long *)va_end) {
- unsigned long *new;
+ vm.flags = VM_ALLOC;
+ vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
+ PMD_SIZE * PMDS_PER_MID_PAGE);
+ vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
+ pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
- if (pfn_free > (size / sizeof(unsigned long))) {
- WARN(1, "Only allocated for %ld pages, but we want %ld!\n",
- size / sizeof(unsigned long), pfn_free);
- return 0;
- }
- new = &mfn_list[pfn_free];
+ xen_max_p2m_pfn = vm.size / sizeof(unsigned long);
- copy_page(new, mid_p);
- p2m_top[topidx][mididx] = &mfn_list[pfn_free];
+ xen_rebuild_p2m_list(vm.addr);
- pfn_free += P2M_PER_PAGE;
+ xen_p2m_addr = vm.addr;
+ xen_p2m_size = xen_max_p2m_pfn;
- }
- /* This should be the leafs allocated for identity from _brk. */
- }
- return (unsigned long)mfn_list;
+ xen_inv_extra_mem();
+ m2p_override_init();
}
-#else
-unsigned long __init xen_revector_p2m_tree(void)
-{
- return 0;
-}
-#endif
+
unsigned long get_phys_to_machine(unsigned long pfn)
{
- unsigned topidx, mididx, idx;
+ pte_t *ptep;
+ unsigned int level;
+
+ if (unlikely(pfn >= xen_p2m_size)) {
+ if (pfn < xen_max_p2m_pfn)
+ return xen_chk_extra_mem(pfn);
- if (unlikely(pfn >= MAX_P2M_PFN))
return IDENTITY_FRAME(pfn);
+ }
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
/*
* The INVALID_P2M_ENTRY is filled in both p2m_*identity
* and in p2m_*missing, so returning the INVALID_P2M_ENTRY
* would be wrong.
*/
- if (p2m_top[topidx][mididx] == p2m_identity)
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
return IDENTITY_FRAME(pfn);
- return p2m_top[topidx][mididx][idx];
+ return xen_p2m_addr[pfn];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
-static void *alloc_p2m_page(void)
+/*
+ * Allocate new pmd(s). It is checked whether the old pmd is still in place.
+ * If not, nothing is changed. This is okay as the only reason for allocating
+ * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by a individual
+ * pmd. In case of PAE/x86-32 there are multiple pmds to allocate!
+ */
+static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *ptep, pte_t *pte_pg)
{
- return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
-}
+ pte_t *ptechk;
+ pte_t *pteret = ptep;
+ pte_t *pte_newpg[PMDS_PER_MID_PAGE];
+ pmd_t *pmdp;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long vaddr;
+ int i;
-static void free_p2m_page(void *p)
-{
- free_page((unsigned long)p);
+ /* Do all allocations first to bail out in error case. */
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ pte_newpg[i] = alloc_p2m_page();
+ if (!pte_newpg[i]) {
+ for (i--; i >= 0; i--)
+ free_p2m_page(pte_newpg[i]);
+
+ return NULL;
+ }
+ }
+
+ vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);
+
+ for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
+ copy_page(pte_newpg[i], pte_pg);
+ paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);
+
+ pmdp = lookup_pmd_address(vaddr);
+ BUG_ON(!pmdp);
+
+ spin_lock_irqsave(&p2m_update_lock, flags);
+
+ ptechk = lookup_address(vaddr, &level);
+ if (ptechk == pte_pg) {
+ set_pmd(pmdp,
+ __pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
+ if (vaddr == (addr & ~(PMD_SIZE - 1)))
+ pteret = pte_offset_kernel(pmdp, addr);
+ pte_newpg[i] = NULL;
+ }
+
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
+
+ if (pte_newpg[i]) {
+ paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
+ free_p2m_page(pte_newpg[i]);
+ }
+
+ vaddr += PMD_SIZE;
+ }
+
+ return pteret;
}
/*
@@ -530,58 +501,62 @@ static void free_p2m_page(void *p)
static bool alloc_p2m(unsigned long pfn)
{
unsigned topidx, mididx;
- unsigned long ***top_p, **mid;
unsigned long *top_mfn_p, *mid_mfn;
- unsigned long *p2m_orig;
+ pte_t *ptep, *pte_pg;
+ unsigned int level;
+ unsigned long flags;
+ unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
+ unsigned long p2m_pfn;
topidx = p2m_top_index(pfn);
mididx = p2m_mid_index(pfn);
- top_p = &p2m_top[topidx];
- mid = ACCESS_ONCE(*top_p);
+ ptep = lookup_address(addr, &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
+ pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));
- if (mid == p2m_mid_missing) {
- /* Mid level is missing, allocate a new one */
- mid = alloc_p2m_page();
- if (!mid)
+ if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
+ /* PMD level is missing, allocate a new one */
+ ptep = alloc_p2m_pmd(addr, ptep, pte_pg);
+ if (!ptep)
return false;
-
- p2m_mid_init(mid, p2m_missing);
-
- if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
- free_p2m_page(mid);
}
- top_mfn_p = &p2m_top_mfn[topidx];
- mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
+ if (p2m_top_mfn) {
+ top_mfn_p = &p2m_top_mfn[topidx];
+ mid_mfn = ACCESS_ONCE(p2m_top_mfn_p[topidx]);
- BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
+ BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
- if (mid_mfn == p2m_mid_missing_mfn) {
- /* Separately check the mid mfn level */
- unsigned long missing_mfn;
- unsigned long mid_mfn_mfn;
- unsigned long old_mfn;
+ if (mid_mfn == p2m_mid_missing_mfn) {
+ /* Separately check the mid mfn level */
+ unsigned long missing_mfn;
+ unsigned long mid_mfn_mfn;
+ unsigned long old_mfn;
- mid_mfn = alloc_p2m_page();
- if (!mid_mfn)
- return false;
+ mid_mfn = alloc_p2m_page();
+ if (!mid_mfn)
+ return false;
- p2m_mid_mfn_init(mid_mfn, p2m_missing);
+ p2m_mid_mfn_init(mid_mfn, p2m_missing);
- missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
- mid_mfn_mfn = virt_to_mfn(mid_mfn);
- old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
- if (old_mfn != missing_mfn) {
- free_p2m_page(mid_mfn);
- mid_mfn = mfn_to_virt(old_mfn);
- } else {
- p2m_top_mfn_p[topidx] = mid_mfn;
+ missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
+ mid_mfn_mfn = virt_to_mfn(mid_mfn);
+ old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
+ if (old_mfn != missing_mfn) {
+ free_p2m_page(mid_mfn);
+ mid_mfn = mfn_to_virt(old_mfn);
+ } else {
+ p2m_top_mfn_p[topidx] = mid_mfn;
+ }
}
+ } else {
+ mid_mfn = NULL;
}
- p2m_orig = ACCESS_ONCE(p2m_top[topidx][mididx]);
- if (p2m_orig == p2m_identity || p2m_orig == p2m_missing) {
+ p2m_pfn = pte_pfn(ACCESS_ONCE(*ptep));
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
+ p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
/* p2m leaf page is missing */
unsigned long *p2m;
@@ -589,183 +564,36 @@ static bool alloc_p2m(unsigned long pfn)
if (!p2m)
return false;
- p2m_init(p2m);
-
- if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
- free_p2m_page(p2m);
+ if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
+ p2m_init(p2m);
else
- mid_mfn[mididx] = virt_to_mfn(p2m);
- }
-
- return true;
-}
-
-static bool __init early_alloc_p2m(unsigned long pfn, bool check_boundary)
-{
- unsigned topidx, mididx, idx;
- unsigned long *p2m;
-
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /* Pfff.. No boundary cross-over, lets get out. */
- if (!idx && check_boundary)
- return false;
-
- WARN(p2m_top[topidx][mididx] == p2m_identity,
- "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
- topidx, mididx);
-
- /*
- * Could be done by xen_build_dynamic_phys_to_machine..
- */
- if (p2m_top[topidx][mididx] != p2m_missing)
- return false;
-
- /* Boundary cross-over for the edges: */
- p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
-
- p2m_init(p2m);
+ p2m_init_identity(p2m, pfn);
- p2m_top[topidx][mididx] = p2m;
+ spin_lock_irqsave(&p2m_update_lock, flags);
- return true;
-}
-
-static bool __init early_alloc_p2m_middle(unsigned long pfn)
-{
- unsigned topidx = p2m_top_index(pfn);
- unsigned long **mid;
-
- mid = p2m_top[topidx];
- if (mid == p2m_mid_missing) {
- mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
-
- p2m_mid_init(mid, p2m_missing);
-
- p2m_top[topidx] = mid;
- }
- return true;
-}
-
-/*
- * Skim over the P2M tree looking at pages that are either filled with
- * INVALID_P2M_ENTRY or with 1:1 PFNs. If found, re-use that page and
- * replace the P2M leaf with a p2m_missing or p2m_identity.
- * Stick the old page in the new P2M tree location.
- */
-static bool __init early_can_reuse_p2m_middle(unsigned long set_pfn)
-{
- unsigned topidx;
- unsigned mididx;
- unsigned ident_pfns;
- unsigned inv_pfns;
- unsigned long *p2m;
- unsigned idx;
- unsigned long pfn;
-
- /* We only look when this entails a P2M middle layer */
- if (p2m_index(set_pfn))
- return false;
-
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {
- topidx = p2m_top_index(pfn);
-
- if (!p2m_top[topidx])
- continue;
-
- if (p2m_top[topidx] == p2m_mid_missing)
- continue;
-
- mididx = p2m_mid_index(pfn);
- p2m = p2m_top[topidx][mididx];
- if (!p2m)
- continue;
-
- if ((p2m == p2m_missing) || (p2m == p2m_identity))
- continue;
-
- if ((unsigned long)p2m == INVALID_P2M_ENTRY)
- continue;
-
- ident_pfns = 0;
- inv_pfns = 0;
- for (idx = 0; idx < P2M_PER_PAGE; idx++) {
- /* IDENTITY_PFNs are 1:1 */
- if (p2m[idx] == IDENTITY_FRAME(pfn + idx))
- ident_pfns++;
- else if (p2m[idx] == INVALID_P2M_ENTRY)
- inv_pfns++;
- else
- break;
+ if (pte_pfn(*ptep) == p2m_pfn) {
+ set_pte(ptep,
+ pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
+ if (mid_mfn)
+ mid_mfn[mididx] = virt_to_mfn(p2m);
+ p2m = NULL;
}
- if ((ident_pfns == P2M_PER_PAGE) || (inv_pfns == P2M_PER_PAGE))
- goto found;
- }
- return false;
-found:
- /* Found one, replace old with p2m_identity or p2m_missing */
- p2m_top[topidx][mididx] = (ident_pfns ? p2m_identity : p2m_missing);
-
- /* Reset where we want to stick the old page in. */
- topidx = p2m_top_index(set_pfn);
- mididx = p2m_mid_index(set_pfn);
-
- /* This shouldn't happen */
- if (WARN_ON(p2m_top[topidx] == p2m_mid_missing))
- early_alloc_p2m_middle(set_pfn);
-
- if (WARN_ON(p2m_top[topidx][mididx] != p2m_missing))
- return false;
-
- p2m_init(p2m);
- p2m_top[topidx][mididx] = p2m;
- return true;
-}
-bool __init early_set_phys_to_machine(unsigned long pfn, unsigned long mfn)
-{
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
- if (!early_alloc_p2m_middle(pfn))
- return false;
-
- if (early_can_reuse_p2m_middle(pfn))
- return __set_phys_to_machine(pfn, mfn);
-
- if (!early_alloc_p2m(pfn, false /* boundary crossover OK!*/))
- return false;
+ spin_unlock_irqrestore(&p2m_update_lock, flags);
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
+ if (p2m)
+ free_p2m_page(p2m);
}
return true;
}
-static void __init early_split_p2m(unsigned long pfn)
-{
- unsigned long mididx, idx;
-
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /*
- * Allocate new middle and leaf pages if this pfn lies in the
- * middle of one.
- */
- if (mididx || idx)
- early_alloc_p2m_middle(pfn);
- if (idx)
- early_alloc_p2m(pfn, false);
-}
-
unsigned long __init set_phys_range_identity(unsigned long pfn_s,
unsigned long pfn_e)
{
unsigned long pfn;
- if (unlikely(pfn_s >= MAX_P2M_PFN))
+ if (unlikely(pfn_s >= xen_p2m_size))
return 0;
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
@@ -774,101 +602,51 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
if (pfn_s > pfn_e)
return 0;
- if (pfn_e > MAX_P2M_PFN)
- pfn_e = MAX_P2M_PFN;
-
- early_split_p2m(pfn_s);
- early_split_p2m(pfn_e);
-
- for (pfn = pfn_s; pfn < pfn_e;) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
-
- if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
- break;
- pfn++;
-
- /*
- * If the PFN was set to a middle or leaf identity
- * page the remainder must also be identity, so skip
- * ahead to the next middle or leaf entry.
- */
- if (p2m_top[topidx] == p2m_mid_identity)
- pfn = ALIGN(pfn, P2M_MID_PER_PAGE * P2M_PER_PAGE);
- else if (p2m_top[topidx][mididx] == p2m_identity)
- pfn = ALIGN(pfn, P2M_PER_PAGE);
- }
+ if (pfn_e > xen_p2m_size)
+ pfn_e = xen_p2m_size;
- WARN((pfn - pfn_s) != (pfn_e - pfn_s),
- "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
- (pfn_e - pfn_s) - (pfn - pfn_s));
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);
return pfn - pfn_s;
}
-/* Try to install p2m mapping; fail if intermediate bits missing */
bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- unsigned topidx, mididx, idx;
+ pte_t *ptep;
+ unsigned int level;
/* don't track P2M changes in autotranslate guests */
if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
return true;
- if (unlikely(pfn >= MAX_P2M_PFN)) {
+ if (unlikely(pfn >= xen_p2m_size)) {
BUG_ON(mfn != INVALID_P2M_ENTRY);
return true;
}
- topidx = p2m_top_index(pfn);
- mididx = p2m_mid_index(pfn);
- idx = p2m_index(pfn);
-
- /* For sparse holes were the p2m leaf has real PFN along with
- * PCI holes, stick in the PFN as the MFN value.
- *
- * set_phys_range_identity() will have allocated new middle
- * and leaf pages as required so an existing p2m_mid_missing
- * or p2m_missing mean that whole range will be identity so
- * these can be switched to p2m_mid_identity or p2m_identity.
- */
- if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
- if (p2m_top[topidx] == p2m_mid_identity)
- return true;
-
- if (p2m_top[topidx] == p2m_mid_missing) {
- WARN_ON(cmpxchg(&p2m_top[topidx], p2m_mid_missing,
- p2m_mid_identity) != p2m_mid_missing);
- return true;
- }
-
- if (p2m_top[topidx][mididx] == p2m_identity)
- return true;
+ if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
+ return true;
- /* Swap over from MISSING to IDENTITY if needed. */
- if (p2m_top[topidx][mididx] == p2m_missing) {
- WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
- p2m_identity) != p2m_missing);
- return true;
- }
- }
+ ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
+ BUG_ON(!ptep || level != PG_LEVEL_4K);
- if (p2m_top[topidx][mididx] == p2m_missing)
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
return mfn == INVALID_P2M_ENTRY;
- p2m_top[topidx][mididx][idx] = mfn;
+ if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
+ return mfn == IDENTITY_FRAME(pfn);
- return true;
+ return false;
}
bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
- if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
+ if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
if (!alloc_p2m(pfn))
return false;
- if (!__set_phys_to_machine(pfn, mfn))
- return false;
+ return __set_phys_to_machine(pfn, mfn);
}
return true;
@@ -877,15 +655,16 @@ bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
#define M2P_OVERRIDE_HASH_SHIFT 10
#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
-static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
+static struct list_head *m2p_overrides;
static DEFINE_SPINLOCK(m2p_override_lock);
static void __init m2p_override_init(void)
{
unsigned i;
- m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
- sizeof(unsigned long));
+ m2p_overrides = alloc_bootmem_align(
+ sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
+ sizeof(unsigned long));
for (i = 0; i < M2P_OVERRIDE_HASH; i++)
INIT_LIST_HEAD(&m2p_overrides[i]);
@@ -896,68 +675,9 @@ static unsigned long mfn_hash(unsigned long mfn)
return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
}
-int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
- struct gnttab_map_grant_ref *kmap_ops,
- struct page **pages, unsigned int count)
-{
- int i, ret = 0;
- bool lazy = false;
- pte_t *pte;
-
- if (xen_feature(XENFEAT_auto_translated_physmap))
- return 0;
-
- if (kmap_ops &&
- !in_interrupt() &&
- paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
- arch_enter_lazy_mmu_mode();
- lazy = true;
- }
-
- for (i = 0; i < count; i++) {
- unsigned long mfn, pfn;
-
- /* Do not add to override if the map failed. */
- if (map_ops[i].status)
- continue;
-
- if (map_ops[i].flags & GNTMAP_contains_pte) {
- pte = (pte_t *) (mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
- (map_ops[i].host_addr & ~PAGE_MASK));
- mfn = pte_mfn(*pte);
- } else {
- mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
- }
- pfn = page_to_pfn(pages[i]);
-
- WARN_ON(PagePrivate(pages[i]));
- SetPagePrivate(pages[i]);
- set_page_private(pages[i], mfn);
- pages[i]->index = pfn_to_mfn(pfn);
-
- if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
- ret = -ENOMEM;
- goto out;
- }
-
- if (kmap_ops) {
- ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
- if (ret)
- goto out;
- }
- }
-
-out:
- if (lazy)
- arch_leave_lazy_mmu_mode();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
-
/* Add an MFN override for a particular page */
-int m2p_add_override(unsigned long mfn, struct page *page,
- struct gnttab_map_grant_ref *kmap_op)
+static int m2p_add_override(unsigned long mfn, struct page *page,
+ struct gnttab_map_grant_ref *kmap_op)
{
unsigned long flags;
unsigned long pfn;
@@ -970,7 +690,7 @@ int m2p_add_override(unsigned long mfn, struct page *page,
address = (unsigned long)__va(pfn << PAGE_SHIFT);
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_add_override: pfn %lx not mapped", pfn))
+ "m2p_add_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
@@ -1004,19 +724,19 @@ int m2p_add_override(unsigned long mfn, struct page *page,
* because mfn_to_pfn (that ends up being called by GUPF) will
* return the backend pfn rather than the frontend pfn. */
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) == mfn)
+ if (__pfn_to_mfn(pfn) == mfn)
set_phys_to_machine(pfn, FOREIGN_FRAME(mfn));
return 0;
}
-EXPORT_SYMBOL_GPL(m2p_add_override);
-int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
- struct gnttab_map_grant_ref *kmap_ops,
- struct page **pages, unsigned int count)
+int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
{
int i, ret = 0;
bool lazy = false;
+ pte_t *pte;
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
@@ -1029,35 +749,75 @@ int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
}
for (i = 0; i < count; i++) {
- unsigned long mfn = get_phys_to_machine(page_to_pfn(pages[i]));
- unsigned long pfn = page_to_pfn(pages[i]);
+ unsigned long mfn, pfn;
- if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
- ret = -EINVAL;
- goto out;
+ /* Do not add to override if the map failed. */
+ if (map_ops[i].status)
+ continue;
+
+ if (map_ops[i].flags & GNTMAP_contains_pte) {
+ pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
+ (map_ops[i].host_addr & ~PAGE_MASK));
+ mfn = pte_mfn(*pte);
+ } else {
+ mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
}
+ pfn = page_to_pfn(pages[i]);
- set_page_private(pages[i], INVALID_P2M_ENTRY);
- WARN_ON(!PagePrivate(pages[i]));
- ClearPagePrivate(pages[i]);
- set_phys_to_machine(pfn, pages[i]->index);
+ WARN_ON(PagePrivate(pages[i]));
+ SetPagePrivate(pages[i]);
+ set_page_private(pages[i], mfn);
+ pages[i]->index = pfn_to_mfn(pfn);
- if (kmap_ops)
- ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
- if (ret)
+ if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) {
+ ret = -ENOMEM;
goto out;
+ }
+
+ if (kmap_ops) {
+ ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]);
+ if (ret)
+ goto out;
+ }
}
out:
if (lazy)
arch_leave_lazy_mmu_mode();
+
return ret;
}
-EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
+EXPORT_SYMBOL_GPL(set_foreign_p2m_mapping);
-int m2p_remove_override(struct page *page,
- struct gnttab_map_grant_ref *kmap_op,
- unsigned long mfn)
+static struct page *m2p_find_override(unsigned long mfn)
+{
+ unsigned long flags;
+ struct list_head *bucket;
+ struct page *p, *ret;
+
+ if (unlikely(!m2p_overrides))
+ return NULL;
+
+ ret = NULL;
+ bucket = &m2p_overrides[mfn_hash(mfn)];
+
+ spin_lock_irqsave(&m2p_override_lock, flags);
+
+ list_for_each_entry(p, bucket, lru) {
+ if (page_private(p) == mfn) {
+ ret = p;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&m2p_override_lock, flags);
+
+ return ret;
+}
+
+static int m2p_remove_override(struct page *page,
+ struct gnttab_map_grant_ref *kmap_op,
+ unsigned long mfn)
{
unsigned long flags;
unsigned long pfn;
@@ -1072,7 +832,7 @@ int m2p_remove_override(struct page *page,
ptep = lookup_address(address, &level);
if (WARN(ptep == NULL || level != PG_LEVEL_4K,
- "m2p_remove_override: pfn %lx not mapped", pfn))
+ "m2p_remove_override: pfn %lx not mapped", pfn))
return -EINVAL;
}
@@ -1102,9 +862,8 @@ int m2p_remove_override(struct page *page,
* hypercall actually returned an error.
*/
if (kmap_op->handle == GNTST_general_error) {
- printk(KERN_WARNING "m2p_remove_override: "
- "pfn %lx mfn %lx, failed to modify kernel mappings",
- pfn, mfn);
+ pr_warn("m2p_remove_override: pfn %lx mfn %lx, failed to modify kernel mappings",
+ pfn, mfn);
put_balloon_scratch_page();
return -1;
}
@@ -1112,14 +871,14 @@ int m2p_remove_override(struct page *page,
xen_mc_batch();
mcs = __xen_mc_entry(
- sizeof(struct gnttab_unmap_and_replace));
+ sizeof(struct gnttab_unmap_and_replace));
unmap_op = mcs.args;
unmap_op->host_addr = kmap_op->host_addr;
unmap_op->new_addr = scratch_page_address;
unmap_op->handle = kmap_op->handle;
MULTI_grant_table_op(mcs.mc,
- GNTTABOP_unmap_and_replace, unmap_op, 1);
+ GNTTABOP_unmap_and_replace, unmap_op, 1);
mcs = __xen_mc_entry(0);
MULTI_update_va_mapping(mcs.mc, scratch_page_address,
@@ -1145,35 +904,56 @@ int m2p_remove_override(struct page *page,
* pfn again. */
mfn &= ~FOREIGN_FRAME_BIT;
pfn = mfn_to_pfn_no_overrides(mfn);
- if (get_phys_to_machine(pfn) == FOREIGN_FRAME(mfn) &&
+ if (__pfn_to_mfn(pfn) == FOREIGN_FRAME(mfn) &&
m2p_find_override(mfn) == NULL)
set_phys_to_machine(pfn, mfn);
return 0;
}
-EXPORT_SYMBOL_GPL(m2p_remove_override);
-struct page *m2p_find_override(unsigned long mfn)
+int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
+ struct gnttab_map_grant_ref *kmap_ops,
+ struct page **pages, unsigned int count)
{
- unsigned long flags;
- struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
- struct page *p, *ret;
+ int i, ret = 0;
+ bool lazy = false;
- ret = NULL;
+ if (xen_feature(XENFEAT_auto_translated_physmap))
+ return 0;
- spin_lock_irqsave(&m2p_override_lock, flags);
+ if (kmap_ops &&
+ !in_interrupt() &&
+ paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) {
+ arch_enter_lazy_mmu_mode();
+ lazy = true;
+ }
- list_for_each_entry(p, bucket, lru) {
- if (page_private(p) == mfn) {
- ret = p;
- break;
+ for (i = 0; i < count; i++) {
+ unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
+ unsigned long pfn = page_to_pfn(pages[i]);
+
+ if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT)) {
+ ret = -EINVAL;
+ goto out;
}
- }
- spin_unlock_irqrestore(&m2p_override_lock, flags);
+ set_page_private(pages[i], INVALID_P2M_ENTRY);
+ WARN_ON(!PagePrivate(pages[i]));
+ ClearPagePrivate(pages[i]);
+ set_phys_to_machine(pfn, pages[i]->index);
+
+ if (kmap_ops)
+ ret = m2p_remove_override(pages[i], &kmap_ops[i], mfn);
+ if (ret)
+ goto out;
+ }
+out:
+ if (lazy)
+ arch_leave_lazy_mmu_mode();
return ret;
}
+EXPORT_SYMBOL_GPL(clear_foreign_p2m_mapping);
unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
{
@@ -1192,79 +972,29 @@ EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
- static const char * const level_name[] = { "top", "middle",
- "entry", "abnormal", "error"};
-#define TYPE_IDENTITY 0
-#define TYPE_MISSING 1
-#define TYPE_PFN 2
-#define TYPE_UNKNOWN 3
static const char * const type_name[] = {
- [TYPE_IDENTITY] = "identity",
- [TYPE_MISSING] = "missing",
- [TYPE_PFN] = "pfn",
- [TYPE_UNKNOWN] = "abnormal"};
- unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
- unsigned int uninitialized_var(prev_level);
- unsigned int uninitialized_var(prev_type);
-
- if (!p2m_top)
- return 0;
-
- for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
- unsigned topidx = p2m_top_index(pfn);
- unsigned mididx = p2m_mid_index(pfn);
- unsigned idx = p2m_index(pfn);
- unsigned lvl, type;
-
- lvl = 4;
- type = TYPE_UNKNOWN;
- if (p2m_top[topidx] == p2m_mid_missing) {
- lvl = 0; type = TYPE_MISSING;
- } else if (p2m_top[topidx] == NULL) {
- lvl = 0; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx] == NULL) {
- lvl = 1; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx] == p2m_identity) {
- lvl = 1; type = TYPE_IDENTITY;
- } else if (p2m_top[topidx][mididx] == p2m_missing) {
- lvl = 1; type = TYPE_MISSING;
- } else if (p2m_top[topidx][mididx][idx] == 0) {
- lvl = 2; type = TYPE_UNKNOWN;
- } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
- lvl = 2; type = TYPE_IDENTITY;
- } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
- lvl = 2; type = TYPE_MISSING;
- } else if (p2m_top[topidx][mididx][idx] == pfn) {
- lvl = 2; type = TYPE_PFN;
- } else if (p2m_top[topidx][mididx][idx] != pfn) {
- lvl = 2; type = TYPE_PFN;
- }
- if (pfn == 0) {
- prev_level = lvl;
- prev_type = type;
- }
- if (pfn == MAX_DOMAIN_PAGES-1) {
- lvl = 3;
- type = TYPE_UNKNOWN;
- }
- if (prev_type != type) {
- seq_printf(m, " [0x%lx->0x%lx] %s\n",
- prev_pfn_type, pfn, type_name[prev_type]);
- prev_pfn_type = pfn;
+ [P2M_TYPE_IDENTITY] = "identity",
+ [P2M_TYPE_MISSING] = "missing",
+ [P2M_TYPE_PFN] = "pfn",
+ [P2M_TYPE_UNKNOWN] = "abnormal"};
+ unsigned long pfn, first_pfn;
+ int type, prev_type;
+
+ prev_type = xen_p2m_elem_type(0);
+ first_pfn = 0;
+
+ for (pfn = 0; pfn < xen_p2m_size; pfn++) {
+ type = xen_p2m_elem_type(pfn);
+ if (type != prev_type) {
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
prev_type = type;
- }
- if (prev_level != lvl) {
- seq_printf(m, " [0x%lx->0x%lx] level %s\n",
- prev_pfn_level, pfn, level_name[prev_level]);
- prev_pfn_level = pfn;
- prev_level = lvl;
+ first_pfn = pfn;
}
}
+ seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
+ type_name[prev_type]);
return 0;
-#undef TYPE_IDENTITY
-#undef TYPE_MISSING
-#undef TYPE_PFN
-#undef TYPE_UNKNOWN
}
static int p2m_dump_open(struct inode *inode, struct file *filp)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 29834b3fd87f..dfd77dec8e2b 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -30,6 +30,7 @@
#include "xen-ops.h"
#include "vdso.h"
#include "p2m.h"
+#include "mmu.h"
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@@ -47,8 +48,19 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
-/* Buffer used to remap identity mapped pages */
-unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+/*
+ * Buffer used to remap identity mapped pages. We only need the virtual space.
+ * The physical page behind this address is remapped as needed to different
+ * buffer pages.
+ */
+#define REMAP_SIZE (P2M_PER_PAGE - 3)
+static struct {
+ unsigned long next_area_mfn;
+ unsigned long target_pfn;
+ unsigned long size;
+ unsigned long mfns[REMAP_SIZE];
+} xen_remap_buf __initdata __aligned(PAGE_SIZE);
+static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY;
/*
* The maximum amount of extra memory compared to the base size. The
@@ -64,7 +76,6 @@ unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
static void __init xen_add_extra_mem(u64 start, u64 size)
{
- unsigned long pfn;
int i;
for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
@@ -84,75 +95,76 @@ static void __init xen_add_extra_mem(u64 start, u64 size)
printk(KERN_WARNING "Warning: not enough extra memory regions\n");
memblock_reserve(start, size);
+}
- xen_max_p2m_pfn = PFN_DOWN(start + size);
- for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
- unsigned long mfn = pfn_to_mfn(pfn);
-
- if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
- continue;
- WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
- pfn, mfn);
+static void __init xen_del_extra_mem(u64 start, u64 size)
+{
+ int i;
+ u64 start_r, size_r;
- __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ start_r = xen_extra_mem[i].start;
+ size_r = xen_extra_mem[i].size;
+
+ /* Start of region. */
+ if (start_r == start) {
+ BUG_ON(size > size_r);
+ xen_extra_mem[i].start += size;
+ xen_extra_mem[i].size -= size;
+ break;
+ }
+ /* End of region. */
+ if (start_r + size_r == start + size) {
+ BUG_ON(size > size_r);
+ xen_extra_mem[i].size -= size;
+ break;
+ }
+ /* Mid of region. */
+ if (start > start_r && start < start_r + size_r) {
+ BUG_ON(start + size > start_r + size_r);
+ xen_extra_mem[i].size = start - start_r;
+ /* Calling memblock_reserve() again is okay. */
+ xen_add_extra_mem(start + size, start_r + size_r -
+ (start + size));
+ break;
+ }
}
+ memblock_free(start, size);
}
-static unsigned long __init xen_do_chunk(unsigned long start,
- unsigned long end, bool release)
+/*
+ * Called during boot before the p2m list can take entries beyond the
+ * hypervisor supplied p2m list. Entries in extra mem are to be regarded as
+ * invalid.
+ */
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn)
{
- struct xen_memory_reservation reservation = {
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
- unsigned long len = 0;
- unsigned long pfn;
- int ret;
+ int i;
+ unsigned long addr = PFN_PHYS(pfn);
- for (pfn = start; pfn < end; pfn++) {
- unsigned long frame;
- unsigned long mfn = pfn_to_mfn(pfn);
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ if (addr >= xen_extra_mem[i].start &&
+ addr < xen_extra_mem[i].start + xen_extra_mem[i].size)
+ return INVALID_P2M_ENTRY;
+ }
- if (release) {
- /* Make sure pfn exists to start with */
- if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
- continue;
- frame = mfn;
- } else {
- if (mfn != INVALID_P2M_ENTRY)
- continue;
- frame = pfn;
- }
- set_xen_guest_handle(reservation.extent_start, &frame);
- reservation.nr_extents = 1;
+ return IDENTITY_FRAME(pfn);
+}
- ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
- &reservation);
- WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
- release ? "release" : "populate", pfn, ret);
+/*
+ * Mark all pfns of extra mem as invalid in p2m list.
+ */
+void __init xen_inv_extra_mem(void)
+{
+ unsigned long pfn, pfn_s, pfn_e;
+ int i;
- if (ret == 1) {
- if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
- if (release)
- break;
- set_xen_guest_handle(reservation.extent_start, &frame);
- reservation.nr_extents = 1;
- ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
- &reservation);
- break;
- }
- len++;
- } else
- break;
+ for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
+ pfn_s = PFN_DOWN(xen_extra_mem[i].start);
+ pfn_e = PFN_UP(xen_extra_mem[i].start + xen_extra_mem[i].size);
+ for (pfn = pfn_s; pfn < pfn_e; pfn++)
+ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
}
- if (len)
- printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
- release ? "Freeing" : "Populating",
- start, end, len,
- release ? "freed" : "added");
-
- return len;
}
/*
@@ -198,26 +210,62 @@ static unsigned long __init xen_find_pfn_range(
return done;
}
+static int __init xen_free_mfn(unsigned long mfn)
+{
+ struct xen_memory_reservation reservation = {
+ .address_bits = 0,
+ .extent_order = 0,
+ .domid = DOMID_SELF
+ };
+
+ set_xen_guest_handle(reservation.extent_start, &mfn);
+ reservation.nr_extents = 1;
+
+ return HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation);
+}
+
/*
- * This releases a chunk of memory and then does the identity map. It's used as
+ * This releases a chunk of memory and then does the identity map. It's used
* as a fallback if the remapping fails.
*/
static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
unsigned long *released)
{
+ unsigned long len = 0;
+ unsigned long pfn, end;
+ int ret;
+
WARN_ON(start_pfn > end_pfn);
+ end = min(end_pfn, nr_pages);
+ for (pfn = start_pfn; pfn < end; pfn++) {
+ unsigned long mfn = pfn_to_mfn(pfn);
+
+ /* Make sure pfn exists to start with */
+ if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
+ continue;
+
+ ret = xen_free_mfn(mfn);
+ WARN(ret != 1, "Failed to release pfn %lx err=%d\n", pfn, ret);
+
+ if (ret == 1) {
+ if (!__set_phys_to_machine(pfn, INVALID_P2M_ENTRY))
+ break;
+ len++;
+ } else
+ break;
+ }
+
/* Need to release pages first */
- *released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+ *released += len;
*identity += set_phys_range_identity(start_pfn, end_pfn);
}
/*
- * Helper function to update both the p2m and m2p tables.
+ * Helper function to update the p2m and m2p tables and kernel mapping.
*/
-static unsigned long __init xen_update_mem_tables(unsigned long pfn,
- unsigned long mfn)
+static void __init xen_update_mem_tables(unsigned long pfn, unsigned long mfn)
{
struct mmu_update update = {
.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
@@ -225,161 +273,88 @@ static unsigned long __init xen_update_mem_tables(unsigned long pfn,
};
/* Update p2m */
- if (!early_set_phys_to_machine(pfn, mfn)) {
+ if (!set_phys_to_machine(pfn, mfn)) {
WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
pfn, mfn);
- return false;
+ BUG();
}
/* Update m2p */
if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
mfn, pfn);
- return false;
+ BUG();
}
- return true;
+ /* Update kernel mapping, but not for highmem. */
+ if ((pfn << PAGE_SHIFT) >= __pa(high_memory))
+ return;
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)__va(pfn << PAGE_SHIFT),
+ mfn_pte(mfn, PAGE_KERNEL), 0)) {
+ WARN(1, "Failed to update kernel mapping for mfn=%ld pfn=%ld\n",
+ mfn, pfn);
+ BUG();
+ }
}
/*
* This function updates the p2m and m2p tables with an identity map from
- * start_pfn to start_pfn+size and remaps the underlying RAM of the original
- * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
- * to not exhaust the reserved brk space. Doing it in properly aligned blocks
- * ensures we only allocate the minimum required leaf pages in the p2m table. It
- * copies the existing mfns from the p2m table under the 1:1 map, overwrites
- * them with the identity map and then updates the p2m and m2p tables with the
- * remapped memory.
+ * start_pfn to start_pfn+size and prepares remapping the underlying RAM of the
+ * original allocation at remap_pfn. The information needed for remapping is
+ * saved in the memory itself to avoid the need for allocating buffers. The
+ * complete remap information is contained in a list of MFNs each containing
+ * up to REMAP_SIZE MFNs and the start target PFN for doing the remap.
+ * This enables us to preserve the original mfn sequence while doing the
+ * remapping at a time when the memory management is capable of allocating
+ * virtual and physical memory in arbitrary amounts, see 'xen_remap_memory' and
+ * its callers.
*/
-static unsigned long __init xen_do_set_identity_and_remap_chunk(
+static void __init xen_do_set_identity_and_remap_chunk(
unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, mfn;
unsigned long ident_pfn_iter, remap_pfn_iter;
- unsigned long ident_start_pfn_align, remap_start_pfn_align;
- unsigned long ident_end_pfn_align, remap_end_pfn_align;
- unsigned long ident_boundary_pfn, remap_boundary_pfn;
- unsigned long ident_cnt = 0;
- unsigned long remap_cnt = 0;
+ unsigned long ident_end_pfn = start_pfn + size;
unsigned long left = size;
- unsigned long mod;
- int i;
+ unsigned long ident_cnt = 0;
+ unsigned int i, chunk;
WARN_ON(size == 0);
BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
- /*
- * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
- * blocks. We need to keep track of both the existing pfn mapping and
- * the new pfn remapping.
- */
- mod = start_pfn % P2M_PER_PAGE;
- ident_start_pfn_align =
- mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
- mod = remap_pfn % P2M_PER_PAGE;
- remap_start_pfn_align =
- mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
- mod = (start_pfn + size) % P2M_PER_PAGE;
- ident_end_pfn_align = start_pfn + size - mod;
- mod = (remap_pfn + size) % P2M_PER_PAGE;
- remap_end_pfn_align = remap_pfn + size - mod;
-
- /* Iterate over each p2m leaf node in each range */
- for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
- ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
- ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
- /* Check we aren't past the end */
- BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
- BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
-
- /* Save p2m mappings */
- for (i = 0; i < P2M_PER_PAGE; i++)
- xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
-
- /* Set identity map which will free a p2m leaf */
- ident_cnt += set_phys_range_identity(ident_pfn_iter,
- ident_pfn_iter + P2M_PER_PAGE);
+ mfn_save = virt_to_mfn(buf);
-#ifdef DEBUG
- /* Helps verify a p2m leaf has been freed */
- for (i = 0; i < P2M_PER_PAGE; i++) {
- unsigned int pfn = ident_pfn_iter + i;
- BUG_ON(pfn_to_mfn(pfn) != pfn);
- }
-#endif
- /* Now remap memory */
- for (i = 0; i < P2M_PER_PAGE; i++) {
- unsigned long mfn = xen_remap_buf[i];
-
- /* This will use the p2m leaf freed above */
- if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
- WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
- remap_pfn_iter + i, mfn);
- return 0;
- }
-
- remap_cnt++;
- }
-
- left -= P2M_PER_PAGE;
- }
-
- /* Max boundary space possible */
- BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+ for (ident_pfn_iter = start_pfn, remap_pfn_iter = remap_pfn;
+ ident_pfn_iter < ident_end_pfn;
+ ident_pfn_iter += REMAP_SIZE, remap_pfn_iter += REMAP_SIZE) {
+ chunk = (left < REMAP_SIZE) ? left : REMAP_SIZE;
- /* Now handle the boundary conditions */
- ident_boundary_pfn = start_pfn;
- remap_boundary_pfn = remap_pfn;
- for (i = 0; i < left; i++) {
- unsigned long mfn;
+ /* Map first pfn to xen_remap_buf */
+ mfn = pfn_to_mfn(ident_pfn_iter);
+ set_pte_mfn(buf, mfn, PAGE_KERNEL);
- /* These two checks move from the start to end boundaries */
- if (ident_boundary_pfn == ident_start_pfn_align)
- ident_boundary_pfn = ident_pfn_iter;
- if (remap_boundary_pfn == remap_start_pfn_align)
- remap_boundary_pfn = remap_pfn_iter;
+ /* Save mapping information in page */
+ xen_remap_buf.next_area_mfn = xen_remap_mfn;
+ xen_remap_buf.target_pfn = remap_pfn_iter;
+ xen_remap_buf.size = chunk;
+ for (i = 0; i < chunk; i++)
+ xen_remap_buf.mfns[i] = pfn_to_mfn(ident_pfn_iter + i);
- /* Check we aren't past the end */
- BUG_ON(ident_boundary_pfn >= start_pfn + size);
- BUG_ON(remap_boundary_pfn >= remap_pfn + size);
-
- mfn = pfn_to_mfn(ident_boundary_pfn);
-
- if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
- WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
- remap_pfn_iter + i, mfn);
- return 0;
- }
- remap_cnt++;
+ /* Put remap buf into list. */
+ xen_remap_mfn = mfn;
- ident_boundary_pfn++;
- remap_boundary_pfn++;
- }
+ /* Set identity map */
+ ident_cnt += set_phys_range_identity(ident_pfn_iter,
+ ident_pfn_iter + chunk);
- /* Finish up the identity map */
- if (ident_start_pfn_align >= ident_end_pfn_align) {
- /*
- * In this case we have an identity range which does not span an
- * aligned block so everything needs to be identity mapped here.
- * If we didn't check this we might remap too many pages since
- * the align boundaries are not meaningful in this case.
- */
- ident_cnt += set_phys_range_identity(start_pfn,
- start_pfn + size);
- } else {
- /* Remapped above so check each end of the chunk */
- if (start_pfn < ident_start_pfn_align)
- ident_cnt += set_phys_range_identity(start_pfn,
- ident_start_pfn_align);
- if (start_pfn + size > ident_pfn_iter)
- ident_cnt += set_phys_range_identity(ident_pfn_iter,
- start_pfn + size);
+ left -= chunk;
}
- BUG_ON(ident_cnt != size);
- BUG_ON(remap_cnt != size);
-
- return size;
+ /* Restore old xen_remap_buf mapping */
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
}
/*
@@ -396,8 +371,7 @@ static unsigned long __init xen_do_set_identity_and_remap_chunk(
static unsigned long __init xen_set_identity_and_remap_chunk(
const struct e820entry *list, size_t map_size, unsigned long start_pfn,
unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
- unsigned long *identity, unsigned long *remapped,
- unsigned long *released)
+ unsigned long *identity, unsigned long *released)
{
unsigned long pfn;
unsigned long i = 0;
@@ -431,19 +405,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
if (size > remap_range_size)
size = remap_range_size;
- if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
- WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
- cur_pfn, size, remap_pfn);
- xen_set_identity_and_release_chunk(cur_pfn,
- cur_pfn + left, nr_pages, identity, released);
- break;
- }
+ xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn);
/* Update variables to reflect new mappings. */
i += size;
remap_pfn += size;
*identity += size;
- *remapped += size;
}
/*
@@ -458,13 +425,12 @@ static unsigned long __init xen_set_identity_and_remap_chunk(
return remap_pfn;
}
-static unsigned long __init xen_set_identity_and_remap(
+static void __init xen_set_identity_and_remap(
const struct e820entry *list, size_t map_size, unsigned long nr_pages,
unsigned long *released)
{
phys_addr_t start = 0;
unsigned long identity = 0;
- unsigned long remapped = 0;
unsigned long last_pfn = nr_pages;
const struct e820entry *entry;
unsigned long num_released = 0;
@@ -494,8 +460,7 @@ static unsigned long __init xen_set_identity_and_remap(
last_pfn = xen_set_identity_and_remap_chunk(
list, map_size, start_pfn,
end_pfn, nr_pages, last_pfn,
- &identity, &remapped,
- &num_released);
+ &identity, &num_released);
start = end;
}
}
@@ -503,12 +468,63 @@ static unsigned long __init xen_set_identity_and_remap(
*released = num_released;
pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
- pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
- last_pfn);
pr_info("Released %ld page(s)\n", num_released);
+}
+
+/*
+ * Remap the memory prepared in xen_do_set_identity_and_remap_chunk().
+ * The remap information (which mfn remap to which pfn) is contained in the
+ * to be remapped memory itself in a linked list anchored at xen_remap_mfn.
+ * This scheme allows to remap the different chunks in arbitrary order while
+ * the resulting mapping will be independant from the order.
+ */
+void __init xen_remap_memory(void)
+{
+ unsigned long buf = (unsigned long)&xen_remap_buf;
+ unsigned long mfn_save, mfn, pfn;
+ unsigned long remapped = 0;
+ unsigned int i;
+ unsigned long pfn_s = ~0UL;
+ unsigned long len = 0;
+
+ mfn_save = virt_to_mfn(buf);
+
+ while (xen_remap_mfn != INVALID_P2M_ENTRY) {
+ /* Map the remap information */
+ set_pte_mfn(buf, xen_remap_mfn, PAGE_KERNEL);
- return last_pfn;
+ BUG_ON(xen_remap_mfn != xen_remap_buf.mfns[0]);
+
+ pfn = xen_remap_buf.target_pfn;
+ for (i = 0; i < xen_remap_buf.size; i++) {
+ mfn = xen_remap_buf.mfns[i];
+ xen_update_mem_tables(pfn, mfn);
+ remapped++;
+ pfn++;
+ }
+ if (pfn_s == ~0UL || pfn == pfn_s) {
+ pfn_s = xen_remap_buf.target_pfn;
+ len += xen_remap_buf.size;
+ } else if (pfn_s + len == xen_remap_buf.target_pfn) {
+ len += xen_remap_buf.size;
+ } else {
+ xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+ pfn_s = xen_remap_buf.target_pfn;
+ len = xen_remap_buf.size;
+ }
+
+ mfn = xen_remap_mfn;
+ xen_remap_mfn = xen_remap_buf.next_area_mfn;
+ }
+
+ if (pfn_s != ~0UL && len)
+ xen_del_extra_mem(PFN_PHYS(pfn_s), PFN_PHYS(len));
+
+ set_pte_mfn(buf, mfn_save, PAGE_KERNEL);
+
+ pr_info("Remapped %ld page(s)\n", remapped);
}
+
static unsigned long __init xen_get_max_pages(void)
{
unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -569,7 +585,6 @@ char * __init xen_memory_setup(void)
int rc;
struct xen_memory_map memmap;
unsigned long max_pages;
- unsigned long last_pfn = 0;
unsigned long extra_pages = 0;
int i;
int op;
@@ -616,17 +631,14 @@ char * __init xen_memory_setup(void)
extra_pages += max_pages - max_pfn;
/*
- * Set identity map on non-RAM pages and remap the underlying RAM.
+ * Set identity map on non-RAM pages and prepare remapping the
+ * underlying RAM.
*/
- last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
- &xen_released_pages);
+ xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+ &xen_released_pages);
extra_pages += xen_released_pages;
- if (last_pfn > max_pfn) {
- max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
- mem_end = PFN_PHYS(max_pfn);
- }
/*
* Clamp the amount of extra memory to a EXTRA_MEM_RATIO
* factor the base size. On non-highmem systems, the base
@@ -653,6 +665,7 @@ char * __init xen_memory_setup(void)
size = min(size, (u64)extra_pages * PAGE_SIZE);
extra_pages -= size / PAGE_SIZE;
xen_add_extra_mem(addr, size);
+ xen_max_p2m_pfn = PFN_DOWN(addr + size);
} else
type = E820_UNUSABLE;
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 4ab9298c5e17..5686bd9d58cc 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -29,11 +29,13 @@ void xen_build_mfn_list_list(void);
void xen_setup_machphys_mapping(void);
void xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
void xen_reserve_top(void);
-extern unsigned long xen_max_p2m_pfn;
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+unsigned long __ref xen_chk_extra_mem(unsigned long pfn);
+void __init xen_inv_extra_mem(void);
+void __init xen_remap_memory(void);
char * __init xen_memory_setup(void);
char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
@@ -46,7 +48,7 @@ void xen_hvm_init_shared_info(void);
void xen_unplug_emulated_devices(void);
void __init xen_build_dynamic_phys_to_machine(void);
-unsigned long __init xen_revector_p2m_tree(void);
+void __init xen_vmalloc_p2m_tree(void);
void xen_init_irq_ops(void);
void xen_setup_timer(int cpu);