diff options
Diffstat (limited to 'arch/x86/kernel')
45 files changed, 523 insertions, 162 deletions
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index a142e77693e1..a3eadfdc3e04 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -604,18 +604,24 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irqp) { - int irq; + int rc, irq, trigger, polarity; if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) { *irqp = gsi; - } else { - irq = mp_map_gsi_to_irq(gsi, - IOAPIC_MAP_ALLOC | IOAPIC_MAP_CHECK); - if (irq < 0) - return -1; - *irqp = irq; + return 0; } - return 0; + + rc = acpi_get_override_irq(gsi, &trigger, &polarity); + if (rc == 0) { + trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE; + polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH; + irq = acpi_register_gsi(NULL, gsi, trigger, polarity); + if (irq >= 0) { + *irqp = irq; + return 0; + } + } + return -1; } EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 31368207837c..d1daead5fcdd 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -78,7 +78,7 @@ int x86_acpi_suspend_lowlevel(void) header->pmode_cr0 = read_cr0(); if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = __read_cr4(); header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); } if (!rdmsr_safe(MSR_IA32_MISC_ENABLE, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 4128b5fcb559..2aaee79fb129 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -40,7 +40,7 @@ static unsigned int get_apic_id(unsigned long x) unsigned int id; rdmsrl(MSR_FAM10H_NODE_ID, value); - id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U); return id; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index e27b49d7c922..80091ae54c2b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -66,3 +66,4 @@ targets += capflags.c $(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.sh FORCE $(call if_changed,mkcapflags) endif +clean-files += capflags.c diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cfa9b5b2c27a..7bc49c3b9684 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -19,6 +19,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/debugreg.h> #include <asm/sections.h> #include <asm/vsyscall.h> @@ -278,7 +279,7 @@ __setup("nosmep", setup_disable_smep); static __always_inline void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) - set_in_cr4(X86_CR4_SMEP); + cr4_set_bits(X86_CR4_SMEP); } static __init int setup_disable_smap(char *arg) @@ -298,9 +299,9 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_SMAP)) { #ifdef CONFIG_X86_SMAP - set_in_cr4(X86_CR4_SMAP); + cr4_set_bits(X86_CR4_SMAP); #else - clear_in_cr4(X86_CR4_SMAP); + cr4_clear_bits(X86_CR4_SMAP); #endif } } @@ -1304,6 +1305,12 @@ void cpu_init(void) wait_for_master_cpu(cpu); /* + * Initialize the CR4 shadow before doing anything that could + * try to read it. + */ + cr4_init_shadow(); + + /* * Load microcode on this cpu if a valid microcode is available. * This is early microcode loading procedure. */ @@ -1322,7 +1329,7 @@ void cpu_init(void) pr_debug("Initializing CPU#%d\n", cpu); - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); /* * Initialize the per-CPU GDT with the boot GDT, @@ -1403,7 +1410,7 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); switch_to_new_gdt(cpu); diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 09edd0b65fef..10b46906767f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -3,6 +3,8 @@ enum severity_level { MCE_NO_SEVERITY, + MCE_DEFERRED_SEVERITY, + MCE_UCNA_SEVERITY = MCE_DEFERRED_SEVERITY, MCE_KEEP_SEVERITY, MCE_SOME_SEVERITY, MCE_AO_SEVERITY, @@ -21,7 +23,7 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg); +int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index c370e1c4468b..8bb433043a7f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -31,6 +31,7 @@ enum context { IN_KERNEL = 1, IN_USER = 2 }; enum ser { SER_REQUIRED = 1, NO_SER = 2 }; +enum exception { EXCP_CONTEXT = 1, NO_EXCP = 2 }; static struct severity { u64 mask; @@ -40,6 +41,7 @@ static struct severity { unsigned char mcgres; unsigned char ser; unsigned char context; + unsigned char excp; unsigned char covered; char *msg; } severities[] = { @@ -48,6 +50,8 @@ static struct severity { #define USER .context = IN_USER #define SER .ser = SER_REQUIRED #define NOSER .ser = NO_SER +#define EXCP .excp = EXCP_CONTEXT +#define NOEXCP .excp = NO_EXCP #define BITCLR(x) .mask = x, .result = 0 #define BITSET(x) .mask = x, .result = x #define MCGMASK(x, y) .mcgmask = x, .mcgres = y @@ -62,7 +66,7 @@ static struct severity { ), MCESEV( NO, "Not enabled", - BITCLR(MCI_STATUS_EN) + EXCP, BITCLR(MCI_STATUS_EN) ), MCESEV( PANIC, "Processor context corrupt", @@ -71,16 +75,20 @@ static struct severity { /* When MCIP is not set something is very confused */ MCESEV( PANIC, "MCIP not set in MCA handler", - MCGMASK(MCG_STATUS_MCIP, 0) + EXCP, MCGMASK(MCG_STATUS_MCIP, 0) ), /* Neither return not error IP -- no chance to recover -> PANIC */ MCESEV( PANIC, "Neither restart nor error IP", - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + EXCP, MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) ), MCESEV( PANIC, "In kernel and no restart IP", - KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + EXCP, KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + DEFERRED, "Deferred error", + NOSER, MASK(MCI_STATUS_UC|MCI_STATUS_DEFERRED|MCI_STATUS_POISON, MCI_STATUS_DEFERRED) ), MCESEV( KEEP, "Corrected error", @@ -89,7 +97,7 @@ static struct severity { /* ignore OVER for UCNA */ MCESEV( - KEEP, "Uncorrected no action required", + UCNA, "Uncorrected no action required", SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) ), MCESEV( @@ -178,8 +186,9 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) { + enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); struct severity *s; @@ -194,6 +203,8 @@ int mce_severity(struct mce *m, int tolerant, char **msg) continue; if (s->context && ctx != s->context) continue; + if (s->excp && excp != s->excp) + continue; if (msg) *msg = s->msg; s->covered = 1; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 61a9668cebfd..bf44e45a2a76 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -660,6 +661,7 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, struct pt_regs *regs) { int i, ret = 0; + char *tmp; for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); @@ -668,8 +670,11 @@ static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, if (quirk_no_way_out) quirk_no_way_out(i, m, regs); } - if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + + if (mce_severity(m, mca_cfg.tolerant, &tmp, true) >= MCE_PANIC_SEVERITY) { + *msg = tmp; ret = 1; + } } return ret; } @@ -754,7 +759,7 @@ static void mce_reign(void) for_each_possible_cpu(cpu) { int severity = mce_severity(&per_cpu(mces_seen, cpu), mca_cfg.tolerant, - &nmsg); + &nmsg, true); if (severity > global_worst) { msg = nmsg; global_worst = severity; @@ -1095,13 +1100,14 @@ void do_machine_check(struct pt_regs *regs, long error_code) */ add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - severity = mce_severity(&m, cfg->tolerant, NULL); + severity = mce_severity(&m, cfg->tolerant, NULL, true); /* - * When machine check was for corrected handler don't touch, - * unless we're panicing. + * When machine check was for corrected/deferred handler don't + * touch, unless we're panicing. */ - if (severity == MCE_KEEP_SEVERITY && !no_way_out) + if ((severity == MCE_KEEP_SEVERITY || + severity == MCE_UCNA_SEVERITY) && !no_way_out) continue; __set_bit(i, toclear); if (severity == MCE_NO_SEVERITY) { @@ -1455,7 +1461,7 @@ static void __mcheck_cpu_init_generic(void) bitmap_fill(all_banks, MAX_NR_BANKS); machine_check_poll(MCP_UC | m_fl, &all_banks); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); rdmsrl(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index a3042989398c..30692ac88d1e 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -8,6 +8,7 @@ #include <linux/smp.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -59,7 +60,7 @@ void intel_p5_mcheck_init(struct cpuinfo_x86 *c) "Intel old style machine check architecture supported.\n"); /* Enable MCE: */ - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Intel old style machine check reporting enabled on CPU#%d.\n", smp_processor_id()); diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 7dc5564d0cdf..590cc753ba8f 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -7,6 +7,7 @@ #include <linux/types.h> #include <asm/processor.h> +#include <asm/tlbflush.h> #include <asm/mce.h> #include <asm/msr.h> @@ -31,7 +32,7 @@ void winchip_mcheck_init(struct cpuinfo_x86 *c) lo &= ~(1<<4); /* Enable MCE */ wrmsr(MSR_IDT_FCR1, lo, hi); - set_in_cr4(X86_CR4_MCE); + cr4_set_bits(X86_CR4_MCE); printk(KERN_INFO "Winchip machine check reporting enabled on CPU#0.\n"); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 8fffd845e22b..bfbbe6195e2d 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -376,7 +376,7 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) +enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -390,8 +390,8 @@ enum ucode_state load_microcode_amd(u8 family, const u8 *data, size_t size) #if defined(CONFIG_MICROCODE_AMD_EARLY) && defined(CONFIG_X86_32) /* save BSP's matching patch for early load */ - if (cpu_data(smp_processor_id()).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(smp_processor_id()); + if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { + struct ucode_patch *p = find_patch(cpu); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -444,7 +444,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(c->x86, fw->data, fw->size); + ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); diff --git a/arch/x86/kernel/cpu/microcode/amd_early.c b/arch/x86/kernel/cpu/microcode/amd_early.c index 06674473b0e6..737737edbd1e 100644 --- a/arch/x86/kernel/cpu/microcode/amd_early.c +++ b/arch/x86/kernel/cpu/microcode/amd_early.c @@ -389,7 +389,7 @@ int __init save_microcode_in_initrd_amd(void) eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - ret = load_microcode_amd(eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); if (ret != UCODE_OK) retval = -EINVAL; @@ -402,3 +402,21 @@ int __init save_microcode_in_initrd_amd(void) return retval; } + +void reload_ucode_amd(void) +{ + struct microcode_amd *mc; + u32 rev, eax; + + rdmsr(MSR_AMD64_PATCH_LEVEL, rev, eax); + + mc = (struct microcode_amd *)amd_ucode_patch; + + if (mc && rev < mc->hdr.patch_id) { + if (!__apply_microcode_amd(mc)) { + ucode_new_rev = mc->hdr.patch_id; + pr_info("microcode: reload patch_level=0x%08x\n", + ucode_new_rev); + } + } +} diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 08fe6e8a726e..36a83617eb21 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -465,16 +465,8 @@ static void mc_bp_resume(void) if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); -#ifdef CONFIG_X86_64 else if (!uci->mc) - /* - * We might resume and not have applied late microcode but still - * have a newer patch stashed from the early loader. We don't - * have it in uci->mc so we have to load it the same way we're - * applying patches early on the APs. - */ - load_ucode_ap(); -#endif + reload_early_microcode(); } static struct syscore_ops mc_syscore_ops = { @@ -559,8 +551,8 @@ static int __init microcode_init(void) struct cpuinfo_x86 *c = &cpu_data(0); int error; - if (dis_ucode_ldr) - return 0; + if (paravirt_enabled() || dis_ucode_ldr) + return -EINVAL; if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index 2c017f242a78..d45df4bd16ab 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -176,3 +176,24 @@ int __init save_microcode_in_initrd(void) return 0; } + +void reload_early_microcode(void) +{ + int vendor, x86; + + vendor = x86_vendor(); + x86 = x86_family(); + + switch (vendor) { + case X86_VENDOR_INTEL: + if (x86 >= 6) + reload_ucode_intel(); + break; + case X86_VENDOR_AMD: + if (x86 >= 0x10) + reload_ucode_amd(); + break; + default: + break; + } +} diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index b88343f7a3b3..5e109a31f62b 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -321,7 +321,7 @@ get_matching_model_microcode(int cpu, unsigned long start, unsigned int mc_saved_count = mc_saved_data->mc_saved_count; int i; - while (leftover) { + while (leftover && mc_saved_count < ARRAY_SIZE(mc_saved_tmp)) { mc_header = (struct microcode_header_intel *)ucode_ptr; mc_size = get_totalsize(mc_header); @@ -650,8 +650,7 @@ static inline void print_ucode(struct ucode_cpu_info *uci) } #endif -static int apply_microcode_early(struct mc_saved_data *mc_saved_data, - struct ucode_cpu_info *uci) +static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) { struct microcode_intel *mc_intel; unsigned int val[2]; @@ -680,7 +679,10 @@ static int apply_microcode_early(struct mc_saved_data *mc_saved_data, #endif uci->cpu_sig.rev = val[1]; - print_ucode(uci); + if (early) + print_ucode(uci); + else + print_ucode_info(uci, mc_intel->hdr.date); return 0; } @@ -715,12 +717,17 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, unsigned long initrd_end_early, struct ucode_cpu_info *uci) { + enum ucode_state ret; + collect_cpu_info_early(uci); scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, mc_saved_in_initrd, uci); - load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); - apply_microcode_early(mc_saved_data, uci); + + ret = load_microcode(mc_saved_data, mc_saved_in_initrd, + initrd_start_early, uci); + + if (ret == UCODE_OK) + apply_microcode_early(uci, true); } void __init @@ -749,7 +756,8 @@ load_ucode_intel_bsp(void) initrd_end_early = initrd_start_early + ramdisk_size; _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, &uci); + initrd_start_early, initrd_end_early, + &uci); #endif } @@ -783,5 +791,23 @@ void load_ucode_intel_ap(void) collect_cpu_info_early(&uci); load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, initrd_start_addr, &uci); - apply_microcode_early(mc_saved_data_p, &uci); + apply_microcode_early(&uci, true); +} + +void reload_ucode_intel(void) +{ + struct ucode_cpu_info uci; + enum ucode_state ret; + + if (!mc_saved_data.mc_saved_count) + return; + + collect_cpu_info_early(&uci); + + ret = generic_load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, false); } diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index a450373e8e91..939155ffdece 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -107,6 +107,7 @@ static struct clocksource hyperv_cs = { .rating = 400, /* use this when running on Hyperv*/ .read = read_hv_clock, .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; static void __init ms_hyperv_init_platform(void) diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c index 9e451b0876b5..f8c81ba0b465 100644 --- a/arch/x86/kernel/cpu/mtrr/cyrix.c +++ b/arch/x86/kernel/cpu/mtrr/cyrix.c @@ -138,8 +138,8 @@ static void prepare_set(void) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* @@ -171,7 +171,7 @@ static void post_set(void) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); } static void cyrix_set_arr(unsigned int reg, unsigned long base, diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0e25a1bc5ab5..7d74f7b3c6ba 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -678,8 +678,8 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_has_pge) { - cr4 = read_cr4(); - write_cr4(cr4 & ~X86_CR4_PGE); + cr4 = __read_cr4(); + __write_cr4(cr4 & ~X86_CR4_PGE); } /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ @@ -708,7 +708,7 @@ static void post_set(void) __releases(set_atomicity_lock) /* Restore value of CR4 */ if (cpu_has_pge) - write_cr4(cr4); + __write_cr4(cr4); raw_spin_unlock(&set_atomicity_lock); } diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 143e5f5dc855..6b5acd5f4a34 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,6 +31,7 @@ #include <asm/nmi.h> #include <asm/smp.h> #include <asm/alternative.h> +#include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/desc.h> #include <asm/ldt.h> @@ -1328,7 +1329,7 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_STARTING: if (x86_pmu.attr_rdpmc) - set_in_cr4(X86_CR4_PCE); + cr4_set_bits(X86_CR4_PCE); if (x86_pmu.cpu_starting) x86_pmu.cpu_starting(cpu); break; @@ -1834,9 +1835,9 @@ static void change_rdpmc(void *info) bool enable = !!(unsigned long)info; if (enable) - set_in_cr4(X86_CR4_PCE); + cr4_set_bits(X86_CR4_PCE); else - clear_in_cr4(X86_CR4_PCE); + cr4_clear_bits(X86_CR4_PCE); } static ssize_t set_attr_rdpmc(struct device *cdev, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 944bf019b74f..df11583a9041 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2431,6 +2431,7 @@ __init int intel_pmu_init(void) break; case 55: /* 22nm Atom "Silvermont" */ + case 76: /* 14nm Atom "Airmont" */ case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */ memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -2603,13 +2604,13 @@ __init int intel_pmu_init(void) * counter, so do not extend mask to generic counters */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != FIXED_EVENT_FLAGS - || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { - continue; + if (c->cmask == FIXED_EVENT_FLAGS + && c->idxmsk64 != INTEL_PMC_MSK_FIXED_REF_CYCLES) { + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; } - - c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; - c->weight += x86_pmu.num_counters; + c->idxmsk64 &= + ~(~0UL << (INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed)); + c->weight = hweight64(c->idxmsk64); } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index d64f275fe274..611d821eac1a 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -135,7 +135,7 @@ static inline u64 rapl_scale(u64 v) * or use ldexp(count, -32). * Watts = Joules/Time delta */ - return v << (32 - __this_cpu_read(rapl_pmu->hw_unit)); + return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit); } static u64 rapl_event_update(struct perf_event *event) @@ -664,6 +664,7 @@ static int __init rapl_pmu_init(void) break; case 60: /* Haswell */ case 69: /* Haswell-Celeron */ + case 61: /* Broadwell */ rapl_cntr_mask = RAPL_IDX_HSW; rapl_pmu_events_group.attrs = rapl_events_hsw_attr; break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 9762dbd9f3f7..e98f68cfea02 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -276,6 +276,17 @@ static struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, return box; } +/* + * Using uncore_pmu_event_init pmu event_init callback + * as a detection point for uncore events. + */ +static int uncore_pmu_event_init(struct perf_event *event); + +static bool is_uncore_event(struct perf_event *event) +{ + return event->pmu->event_init == uncore_pmu_event_init; +} + static int uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) { @@ -290,13 +301,18 @@ uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, b return -EINVAL; n = box->n_events; - box->event_list[n] = leader; - n++; + + if (is_uncore_event(leader)) { + box->event_list[n] = leader; + n++; + } + if (!dogrp) return n; list_for_each_entry(event, &leader->sibling_list, group_entry) { - if (event->state <= PERF_EVENT_STATE_OFF) + if (!is_uncore_event(event) || + event->state <= PERF_EVENT_STATE_OFF) continue; if (n >= max_count) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 18eb78bbdd10..863d9b02563e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -17,7 +17,7 @@ #define UNCORE_PCI_DEV_TYPE(data) ((data >> 8) & 0xff) #define UNCORE_PCI_DEV_IDX(data) (data & 0xff) #define UNCORE_EXTRA_PCI_DEV 0xff -#define UNCORE_EXTRA_PCI_DEV_MAX 2 +#define UNCORE_EXTRA_PCI_DEV_MAX 3 /* support up to 8 sockets */ #define UNCORE_SOCKET_MAX 8 diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c index f9ed429d6e4f..ab474faa262b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c @@ -887,6 +887,7 @@ void snbep_uncore_cpu_init(void) enum { SNBEP_PCI_QPI_PORT0_FILTER, SNBEP_PCI_QPI_PORT1_FILTER, + HSWEP_PCI_PCU_3, }; static int snbep_qpi_hw_config(struct intel_uncore_box *box, struct perf_event *event) @@ -2022,6 +2023,17 @@ void hswep_uncore_cpu_init(void) { if (hswep_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores) hswep_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores; + + /* Detect 6-8 core systems with only two SBOXes */ + if (uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3]) { + u32 capid4; + + pci_read_config_dword(uncore_extra_pci_dev[0][HSWEP_PCI_PCU_3], + 0x94, &capid4); + if (((capid4 >> 6) & 0x3) == 0) + hswep_uncore_sbox.num_boxes = 2; + } + uncore_msr_uncores = hswep_msr_uncores; } @@ -2279,6 +2291,11 @@ static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = { .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, SNBEP_PCI_QPI_PORT1_FILTER), }, + { /* PCU.3 (for Capability registers) */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2fc0), + .driver_data = UNCORE_PCI_DEV_DATA(UNCORE_EXTRA_PCI_DEV, + HSWEP_PCI_PCU_3), + }, { /* end: all zeroes */ } }; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 344b63f18d14..3dddb89ba320 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback) ENTRY(xen_do_upcall) 1: mov %esp, %eax call xen_evtchn_do_upcall +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp ret_from_intr CFI_ENDPROC ENDPROC(xen_hypervisor_callback) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index c0226ab54106..e36d9815ef56 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -334,11 +334,14 @@ ENTRY(ret_from_fork) testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? jz 1f - testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET - jnz int_ret_from_sys_call - - RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET - jmp ret_from_sys_call # go to the SYSRET fastpath + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use int_ret_from_sys_call to return, since it can safely handle + * all of the above. + */ + jmp int_ret_from_sys_call 1: subq $REST_SKIP, %rsp # leave space for volatiles @@ -1170,6 +1173,9 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) popq %rsp CFI_DEF_CFA_REGISTER rsp decl PER_CPU_VAR(irq_count) +#ifndef CONFIG_PREEMPT + call xen_maybe_preempt_hcall +#endif jmp error_exit CFI_ENDPROC END(xen_do_hypervisor_callback) diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index d6c1b9836995..2911ef3a9f1c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,6 +31,7 @@ static void __init i386_default_early_setup(void) asmlinkage __visible void __init i386_start_kernel(void) { + cr4_init_shadow(); sanitize_boot_params(&boot_params); /* Call the subarch specific early setup function */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index eda1a865641e..957fdb5eb2e3 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -155,6 +155,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); + cr4_init_shadow(); + /* Kill off the identity-map trampoline */ reset_early_page_tables(); @@ -162,7 +164,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) clear_bss(); for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) - set_intr_gate(i, early_idt_handlers[i]); + set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); copy_bootdata(__va(real_mode_data)); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..e7be5290fe1f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -61,9 +61,16 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif -/* Number of possible pages in the lowmem region */ -LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) - +/* + * Number of possible pages in the lowmem region. + * + * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a + * gas warning about overflowing shift count when gas has been compiled + * with only a host target support using a 32-bit type for internal + * representation. + */ +LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT) + /* Enough space to fit pagetables for the low memory linear map */ MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT @@ -477,21 +484,22 @@ is486: __INIT setup_once: /* - * Set up a idt with 256 entries pointing to ignore_int, - * interrupt gates. It doesn't actually load idt - that needs - * to be done on each CPU. Interrupts are enabled elsewhere, - * when we can be relatively sure everything is ok. + * Set up a idt with 256 interrupt gates that push zero if there + * is no error code and then jump to early_idt_handler_common. + * It doesn't actually load the idt - that needs to be done on + * each CPU. Interrupts are enabled elsewhere, when we can be + * relatively sure everything is ok. */ movl $idt_table,%edi - movl $early_idt_handlers,%eax + movl $early_idt_handler_array,%eax movl $NUM_EXCEPTION_VECTORS,%ecx 1: movl %eax,(%edi) movl %eax,4(%edi) /* interrupt gate, dpl=0, present */ movl $(0x8E000000 + __KERNEL_CS),2(%edi) - addl $9,%eax + addl $EARLY_IDT_HANDLER_SIZE,%eax addl $8,%edi loop 1b @@ -523,26 +531,28 @@ setup_once: andl $0,setup_once_ref /* Once is enough, thanks */ ret -ENTRY(early_idt_handlers) +ENTRY(early_idt_handler_array) # 36(%esp) %eflags # 32(%esp) %cs # 28(%esp) %eip # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handlers) +ENDPROC(early_idt_handler_array) - /* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%esp) # X86_TRAP_NMI @@ -602,7 +612,7 @@ ex_entry: is_nmi: addl $8,%esp /* drop vector number and error code */ iret -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) /* This is the default interrupt "handler" :-) */ ALIGN diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a65c42..a2dc0add72ed 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -321,26 +321,28 @@ bad_address: jmp bad_address __INIT - .globl early_idt_handlers -early_idt_handlers: +ENTRY(early_idt_handler_array) # 104(%rsp) %rflags # 96(%rsp) %cs # 88(%rsp) %rip # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .if (EXCEPTION_ERRCODE_MASK >> i) & 1 - ASM_NOP2 - .else + .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 pushq $0 # Dummy error code, to make stack frame uniform .endif pushq $i # 72(%rsp) Vector number - jmp early_idt_handler + jmp early_idt_handler_common i = i + 1 + .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr +ENDPROC(early_idt_handler_array) -/* This is global to keep gas from relaxing the jumps */ -ENTRY(early_idt_handler) +early_idt_handler_common: + /* + * The stack is the hardware frame, an error code or zero, and the + * vector number. + */ cld cmpl $2,(%rsp) # X86_TRAP_NMI @@ -412,7 +414,7 @@ ENTRY(early_idt_handler) is_nmi: addq $16,%rsp # drop vector number and error code INTERRUPT_RETURN -ENDPROC(early_idt_handler) +ENDPROC(early_idt_handler_common) __INITDATA diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index a9a4229f6161..8d6e954db2a7 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -13,6 +13,7 @@ #include <asm/sigcontext.h> #include <asm/processor.h> #include <asm/math_emu.h> +#include <asm/tlbflush.h> #include <asm/uaccess.h> #include <asm/ptrace.h> #include <asm/i387.h> @@ -155,6 +156,21 @@ static void init_thread_xstate(void) xstate_size = sizeof(struct i387_fxsave_struct); else xstate_size = sizeof(struct i387_fsave_struct); + + /* + * Quirk: we don't yet handle the XSAVES* instructions + * correctly, as we don't correctly convert between + * standard and compacted format when interfacing + * with user-space - so disable it for now. + * + * The difference is small: with recent CPUs the + * compacted format is only marginally smaller than + * the standard FPU state format. + * + * ( This is easy to backport while we are fixing + * XSAVES* support. ) + */ + setup_clear_cpu_cap(X86_FEATURE_XSAVES); } /* @@ -180,7 +196,7 @@ void fpu_init(void) if (cpu_has_xmm) cr4_mask |= X86_CR4_OSXMMEXCPT; if (cr4_mask) - set_in_cr4(cr4_mask); + cr4_set_bits(cr4_mask); cr0 = read_cr0(); cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 922d28581024..37907756fc41 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -127,7 +127,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " Machine check polls\n"); #endif #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) - seq_printf(p, "%*s: ", prec, "THR"); + seq_printf(p, "%*s: ", prec, "HYP"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count); seq_printf(p, " Hypervisor callback interrupts\n"); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 67e6d19ef1be..f2e281cf8c19 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -330,13 +330,16 @@ int __copy_instruction(u8 *dest, u8 *src) { struct insn insn; kprobe_opcode_t buf[MAX_INSN_SIZE]; + int length; kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); insn_get_length(&insn); + length = insn.length; + /* Another subsystem puts a breakpoint, failed to recover */ if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) return 0; - memcpy(dest, insn.kaddr, insn.length); + memcpy(dest, insn.kaddr, length); #ifdef CONFIG_X86_64 if (insn_rip_relative(&insn)) { @@ -366,7 +369,7 @@ int __copy_instruction(u8 *dest, u8 *src) *(s32 *) disp = (s32) newdisp; } #endif - return insn.length; + return length; } static int arch_copy_kprobe(struct kprobe *p) @@ -1018,6 +1021,15 @@ int setjmp_pre_handler(struct kprobe *p, struct pt_regs *regs) regs->flags &= ~X86_EFLAGS_IF; trace_hardirqs_off(); regs->ip = (unsigned long)(jp->entry); + + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer to get messed up. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); return 1; } NOKPROBE_SYMBOL(setjmp_pre_handler); @@ -1046,24 +1058,25 @@ int longjmp_break_handler(struct kprobe *p, struct pt_regs *regs) struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); u8 *addr = (u8 *) (regs->ip - 1); struct jprobe *jp = container_of(p, struct jprobe, kp); + void *saved_sp = kcb->jprobe_saved_sp; if ((addr > (u8 *) jprobe_return) && (addr < (u8 *) jprobe_return_end)) { - if (stack_addr(regs) != kcb->jprobe_saved_sp) { + if (stack_addr(regs) != saved_sp) { struct pt_regs *saved_regs = &kcb->jprobe_saved_regs; printk(KERN_ERR "current sp %p does not match saved sp %p\n", - stack_addr(regs), kcb->jprobe_saved_sp); + stack_addr(regs), saved_sp); printk(KERN_ERR "Saved registers for jprobe %p\n", jp); show_regs(saved_regs); printk(KERN_ERR "Current registers\n"); show_regs(regs); BUG(); } + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); *regs = kcb->jprobe_saved_regs; - memcpy((kprobe_opcode_t *)(kcb->jprobe_saved_sp), - kcb->jprobes_stack, - MIN_STACK_SIZE(kcb->jprobe_saved_sp)); + memcpy(saved_sp, kcb->jprobes_stack, MIN_STACK_SIZE(saved_sp)); preempt_enable_no_resched(); return 1; } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index f6945bef2cd1..94f643484300 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -283,7 +283,14 @@ NOKPROBE_SYMBOL(do_async_page_fault); static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d9156ceecdff..a2de9bc7ac0b 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -263,7 +263,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) diff --git a/arch/x86/kernel/pmc_atom.c b/arch/x86/kernel/pmc_atom.c index 0ee5025e0fa4..8bb9a611ca23 100644 --- a/arch/x86/kernel/pmc_atom.c +++ b/arch/x86/kernel/pmc_atom.c @@ -217,6 +217,8 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) if (!dir) return -ENOMEM; + pmc->dbgfs_dir = dir; + f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO, dir, pmc, &pmc_dev_state_ops); if (!f) { @@ -229,7 +231,7 @@ static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev) dev_err(&pdev->dev, "sleep_state register failed\n"); goto err; } - pmc->dbgfs_dir = dir; + return 0; err: pmc_dbgfs_unregister(pmc); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index e127ddaa2d5a..a388bb883128 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -24,10 +24,12 @@ #include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> +#include <asm/mwait.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/debugreg.h> #include <asm/nmi.h> +#include <asm/tlbflush.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -141,7 +143,7 @@ void flush_thread(void) static void hard_disable_TSC(void) { - write_cr4(read_cr4() | X86_CR4_TSD); + cr4_set_bits(X86_CR4_TSD); } void disable_TSC(void) @@ -158,7 +160,7 @@ void disable_TSC(void) static void hard_enable_TSC(void) { - write_cr4(read_cr4() & ~X86_CR4_TSD); + cr4_clear_bits(X86_CR4_TSD); } static void enable_TSC(void) @@ -398,6 +400,53 @@ static void amd_e400_idle(void) default_idle(); } +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default default C1 state. + * This invokes MWAIT with interrutps enabled and no flags, + * which is backwards compatible with the original MWAIT implementation. + */ + +static void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + smp_mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); + smp_mb(); /* quirk */ + } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } + __current_clr_polling(); +} + void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -411,6 +460,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; } else x86_idle = default_idle; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 8f3ebfe710d0..603c4f99cb5a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -101,7 +101,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4_safe(); + cr4 = __read_cr4_safe(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3ed4a68d4013..67fcc43577d2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,7 +93,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = read_cr4(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 17962e667a91..587be13be0be 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -182,6 +182,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + /* ASUS */ { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ab08aa2276fb..87b6496a32f1 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1176,7 +1176,7 @@ void __init setup_arch(char **cmdline_p) if (boot_cpu_data.cpuid_level >= 0) { /* A CPU has %cr4 if and only if it has CPUID */ - mmu_cr4_features = read_cr4(); + mmu_cr4_features = __read_cr4(); if (trampoline_cr4_features) *trampoline_cr4_features = mmu_cr4_features; } diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3a..7fc5e843f247 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,58 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + /* + * For historical reasons (i.e. no one ever documented how any + * of the segmentation APIs work), user programs can and do + * assume that a struct user_desc that's all zeros except for + * entry_number means "no segment at all". This never actually + * worked. In fact, up to Linux 3.19, a struct user_desc like + * this would create a 16-bit read-write segment with base and + * limit both equal to zero. + * + * That was close enough to "no segment at all" until we + * hardened this function to disallow 16-bit TLS segments. Fix + * it up by interpreting these zeroed segments the way that they + * were almost certainly intended to be interpreted. + * + * The correct way to ask for "no segment at all" is to specify + * a user_desc that satisfies LDT_empty. To keep everything + * working, we accept both. + * + * Note that there's a similar kludge in modify_ldt -- look at + * the distinction between modes 1 and 0x11. + */ + if (LDT_empty(info) || LDT_zero(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -40,7 +92,7 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info)) + if (LDT_empty(info) || LDT_zero(info)) desc->a = desc->b = 0; else fill_ldt(desc, info); @@ -66,6 +118,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +247,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +261,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index de801f22128a..07ab8e9733c5 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -387,7 +387,7 @@ NOKPROBE_SYMBOL(do_int3); * for scheduling or signal handling. The actual stack switch is done in * entry.S */ -asmlinkage __visible struct pt_regs *sync_regs(struct pt_regs *eregs) +asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs) { struct pt_regs *regs = eregs; /* Did already sync */ @@ -413,7 +413,7 @@ struct bad_iret_stack { struct pt_regs regs; }; -asmlinkage __visible +asmlinkage __visible notrace struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) { /* @@ -436,6 +436,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) BUG_ON(!user_mode_vm(&new_stack->regs)); return new_stack; } +NOKPROBE_SYMBOL(fixup_bad_iret); #endif /* diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b7e50bba3bbb..505449700e0c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -617,7 +617,7 @@ static unsigned long quick_pit_calibrate(void) goto success; } } - pr_err("Fast TSC calibration failed\n"); + pr_info("Fast TSC calibration failed\n"); return 0; success: diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 4c540c4719d8..cdc6cf903078 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,7 @@ #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/sigframe.h> +#include <asm/tlbflush.h> #include <asm/xcr.h> /* @@ -378,7 +379,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) * thread's fpu state, reconstruct fxstate from the fsave * header. Sanitize the copied state etc. */ - struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; + struct fpu *fpu = &tsk->thread.fpu; struct user_i387_ia32_struct env; int err = 0; @@ -392,14 +393,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ drop_fpu(tsk); - if (__copy_from_user(xsave, buf_fx, state_size) || + if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) || __copy_from_user(&env, buf, sizeof(env))) { + fpu_finit(fpu); err = -1; } else { sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); - set_used_math(); } + set_used_math(); if (use_eager_fpu()) { preempt_disable(); math_state_restore(); @@ -453,7 +455,7 @@ static void prepare_fx_sw_frame(void) */ static inline void xstate_enable(void) { - set_in_cr4(X86_CR4_OSXSAVE); + cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } @@ -738,3 +740,4 @@ void *get_xsave_addr(struct xsave_struct *xsave, int xstate) return (void *)xsave + xstate_comp_offsets[feature]; } +EXPORT_SYMBOL_GPL(get_xsave_addr); |