diff options
Diffstat (limited to 'arch')
111 files changed, 2103 insertions, 507 deletions
diff --git a/arch/alpha/include/asm/xchg.h b/arch/alpha/include/asm/xchg.h index 0ca9724597c1..7081e52291d0 100644 --- a/arch/alpha/include/asm/xchg.h +++ b/arch/alpha/include/asm/xchg.h @@ -11,6 +11,10 @@ * Atomic exchange. * Since it can be used to implement critical sections * it must clobber "memory" (also for interrupts in UP). + * + * The leading and the trailing memory barriers guarantee that these + * operations are fully ordered. + * */ static inline unsigned long @@ -18,6 +22,7 @@ ____xchg(_u8, volatile char *m, unsigned long val) { unsigned long ret, tmp, addr64; + smp_mb(); __asm__ __volatile__( " andnot %4,7,%3\n" " insbl %1,%4,%1\n" @@ -42,6 +47,7 @@ ____xchg(_u16, volatile short *m, unsigned long val) { unsigned long ret, tmp, addr64; + smp_mb(); __asm__ __volatile__( " andnot %4,7,%3\n" " inswl %1,%4,%1\n" @@ -66,6 +72,7 @@ ____xchg(_u32, volatile int *m, unsigned long val) { unsigned long dummy; + smp_mb(); __asm__ __volatile__( "1: ldl_l %0,%4\n" " bis $31,%3,%1\n" @@ -86,6 +93,7 @@ ____xchg(_u64, volatile long *m, unsigned long val) { unsigned long dummy; + smp_mb(); __asm__ __volatile__( "1: ldq_l %0,%4\n" " bis $31,%3,%1\n" @@ -127,10 +135,12 @@ ____xchg(, volatile void *ptr, unsigned long x, int size) * store NEW in MEM. Return the initial value in MEM. Success is * indicated by comparing RETURN with OLD. * - * The memory barrier should be placed in SMP only when we actually - * make the change. If we don't change anything (so if the returned - * prev is equal to old) then we aren't acquiring anything new and - * we don't need any memory barrier as far I can tell. + * The leading and the trailing memory barriers guarantee that these + * operations are fully ordered. + * + * The trailing memory barrier is placed in SMP unconditionally, in + * order to guarantee that dependency ordering is preserved when a + * dependency is headed by an unsuccessful operation. */ static inline unsigned long @@ -138,6 +148,7 @@ ____cmpxchg(_u8, volatile char *m, unsigned char old, unsigned char new) { unsigned long prev, tmp, cmp, addr64; + smp_mb(); __asm__ __volatile__( " andnot %5,7,%4\n" " insbl %1,%5,%1\n" @@ -149,8 +160,8 @@ ____cmpxchg(_u8, volatile char *m, unsigned char old, unsigned char new) " or %1,%2,%2\n" " stq_c %2,0(%4)\n" " beq %2,3f\n" - __ASM__MB "2:\n" + __ASM__MB ".subsection 2\n" "3: br 1b\n" ".previous" @@ -165,6 +176,7 @@ ____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new) { unsigned long prev, tmp, cmp, addr64; + smp_mb(); __asm__ __volatile__( " andnot %5,7,%4\n" " inswl %1,%5,%1\n" @@ -176,8 +188,8 @@ ____cmpxchg(_u16, volatile short *m, unsigned short old, unsigned short new) " or %1,%2,%2\n" " stq_c %2,0(%4)\n" " beq %2,3f\n" - __ASM__MB "2:\n" + __ASM__MB ".subsection 2\n" "3: br 1b\n" ".previous" @@ -192,6 +204,7 @@ ____cmpxchg(_u32, volatile int *m, int old, int new) { unsigned long prev, cmp; + smp_mb(); __asm__ __volatile__( "1: ldl_l %0,%5\n" " cmpeq %0,%3,%1\n" @@ -199,8 +212,8 @@ ____cmpxchg(_u32, volatile int *m, int old, int new) " mov %4,%1\n" " stl_c %1,%2\n" " beq %1,3f\n" - __ASM__MB "2:\n" + __ASM__MB ".subsection 2\n" "3: br 1b\n" ".previous" @@ -215,6 +228,7 @@ ____cmpxchg(_u64, volatile long *m, unsigned long old, unsigned long new) { unsigned long prev, cmp; + smp_mb(); __asm__ __volatile__( "1: ldq_l %0,%5\n" " cmpeq %0,%3,%1\n" @@ -222,8 +236,8 @@ ____cmpxchg(_u64, volatile long *m, unsigned long old, unsigned long new) " mov %4,%1\n" " stq_c %1,%2\n" " beq %1,3f\n" - __ASM__MB "2:\n" + __ASM__MB ".subsection 2\n" "3: br 1b\n" ".previous" diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 249e10190d20..b7b78cb09a37 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -495,7 +495,6 @@ config ARC_CURR_IN_REG config ARC_EMUL_UNALIGNED bool "Emulate unaligned memory access (userspace only)" - default N select SYSCTL_ARCH_UNALIGN_NO_WARN select SYSCTL_ARCH_UNALIGN_ALLOW depends on ISA_ARCOMPACT diff --git a/arch/arm/boot/dts/bcm283x.dtsi b/arch/arm/boot/dts/bcm283x.dtsi index 74dd21b7373c..c51b88ee3cec 100644 --- a/arch/arm/boot/dts/bcm283x.dtsi +++ b/arch/arm/boot/dts/bcm283x.dtsi @@ -146,8 +146,8 @@ i2s: i2s@7e203000 { compatible = "brcm,bcm2835-i2s"; - reg = <0x7e203000 0x20>, - <0x7e101098 0x02>; + reg = <0x7e203000 0x24>; + clocks = <&clocks BCM2835_CLOCK_PCM>; dmas = <&dma 2>, <&dma 3>; diff --git a/arch/arm/boot/dts/bcm958625hr.dts b/arch/arm/boot/dts/bcm958625hr.dts index a1658d0721b8..cf0de77f09c4 100644 --- a/arch/arm/boot/dts/bcm958625hr.dts +++ b/arch/arm/boot/dts/bcm958625hr.dts @@ -49,7 +49,7 @@ memory { device_type = "memory"; - reg = <0x60000000 0x80000000>; + reg = <0x60000000 0x20000000>; }; gpio-restart { diff --git a/arch/arm/boot/dts/imx7d-cl-som-imx7.dts b/arch/arm/boot/dts/imx7d-cl-som-imx7.dts index 58b09bf1ba2d..205130600853 100644 --- a/arch/arm/boot/dts/imx7d-cl-som-imx7.dts +++ b/arch/arm/boot/dts/imx7d-cl-som-imx7.dts @@ -213,37 +213,37 @@ &iomuxc { pinctrl_enet1: enet1grp { fsl,pins = < - MX7D_PAD_SD2_CD_B__ENET1_MDIO 0x3 - MX7D_PAD_SD2_WP__ENET1_MDC 0x3 - MX7D_PAD_ENET1_RGMII_TXC__ENET1_RGMII_TXC 0x1 - MX7D_PAD_ENET1_RGMII_TD0__ENET1_RGMII_TD0 0x1 - MX7D_PAD_ENET1_RGMII_TD1__ENET1_RGMII_TD1 0x1 - MX7D_PAD_ENET1_RGMII_TD2__ENET1_RGMII_TD2 0x1 - MX7D_PAD_ENET1_RGMII_TD3__ENET1_RGMII_TD3 0x1 - MX7D_PAD_ENET1_RGMII_TX_CTL__ENET1_RGMII_TX_CTL 0x1 - MX7D_PAD_ENET1_RGMII_RXC__ENET1_RGMII_RXC 0x1 - MX7D_PAD_ENET1_RGMII_RD0__ENET1_RGMII_RD0 0x1 - MX7D_PAD_ENET1_RGMII_RD1__ENET1_RGMII_RD1 0x1 - MX7D_PAD_ENET1_RGMII_RD2__ENET1_RGMII_RD2 0x1 - MX7D_PAD_ENET1_RGMII_RD3__ENET1_RGMII_RD3 0x1 - MX7D_PAD_ENET1_RGMII_RX_CTL__ENET1_RGMII_RX_CTL 0x1 + MX7D_PAD_SD2_CD_B__ENET1_MDIO 0x30 + MX7D_PAD_SD2_WP__ENET1_MDC 0x30 + MX7D_PAD_ENET1_RGMII_TXC__ENET1_RGMII_TXC 0x11 + MX7D_PAD_ENET1_RGMII_TD0__ENET1_RGMII_TD0 0x11 + MX7D_PAD_ENET1_RGMII_TD1__ENET1_RGMII_TD1 0x11 + MX7D_PAD_ENET1_RGMII_TD2__ENET1_RGMII_TD2 0x11 + MX7D_PAD_ENET1_RGMII_TD3__ENET1_RGMII_TD3 0x11 + MX7D_PAD_ENET1_RGMII_TX_CTL__ENET1_RGMII_TX_CTL 0x11 + MX7D_PAD_ENET1_RGMII_RXC__ENET1_RGMII_RXC 0x11 + MX7D_PAD_ENET1_RGMII_RD0__ENET1_RGMII_RD0 0x11 + MX7D_PAD_ENET1_RGMII_RD1__ENET1_RGMII_RD1 0x11 + MX7D_PAD_ENET1_RGMII_RD2__ENET1_RGMII_RD2 0x11 + MX7D_PAD_ENET1_RGMII_RD3__ENET1_RGMII_RD3 0x11 + MX7D_PAD_ENET1_RGMII_RX_CTL__ENET1_RGMII_RX_CTL 0x11 >; }; pinctrl_enet2: enet2grp { fsl,pins = < - MX7D_PAD_EPDC_GDSP__ENET2_RGMII_TXC 0x1 - MX7D_PAD_EPDC_SDCE2__ENET2_RGMII_TD0 0x1 - MX7D_PAD_EPDC_SDCE3__ENET2_RGMII_TD1 0x1 - MX7D_PAD_EPDC_GDCLK__ENET2_RGMII_TD2 0x1 - MX7D_PAD_EPDC_GDOE__ENET2_RGMII_TD3 0x1 - MX7D_PAD_EPDC_GDRL__ENET2_RGMII_TX_CTL 0x1 - MX7D_PAD_EPDC_SDCE1__ENET2_RGMII_RXC 0x1 - MX7D_PAD_EPDC_SDCLK__ENET2_RGMII_RD0 0x1 - MX7D_PAD_EPDC_SDLE__ENET2_RGMII_RD1 0x1 - MX7D_PAD_EPDC_SDOE__ENET2_RGMII_RD2 0x1 - MX7D_PAD_EPDC_SDSHR__ENET2_RGMII_RD3 0x1 - MX7D_PAD_EPDC_SDCE0__ENET2_RGMII_RX_CTL 0x1 + MX7D_PAD_EPDC_GDSP__ENET2_RGMII_TXC 0x11 + MX7D_PAD_EPDC_SDCE2__ENET2_RGMII_TD0 0x11 + MX7D_PAD_EPDC_SDCE3__ENET2_RGMII_TD1 0x11 + MX7D_PAD_EPDC_GDCLK__ENET2_RGMII_TD2 0x11 + MX7D_PAD_EPDC_GDOE__ENET2_RGMII_TD3 0x11 + MX7D_PAD_EPDC_GDRL__ENET2_RGMII_TX_CTL 0x11 + MX7D_PAD_EPDC_SDCE1__ENET2_RGMII_RXC 0x11 + MX7D_PAD_EPDC_SDCLK__ENET2_RGMII_RD0 0x11 + MX7D_PAD_EPDC_SDLE__ENET2_RGMII_RD1 0x11 + MX7D_PAD_EPDC_SDOE__ENET2_RGMII_RD2 0x11 + MX7D_PAD_EPDC_SDSHR__ENET2_RGMII_RD3 0x11 + MX7D_PAD_EPDC_SDCE0__ENET2_RGMII_RX_CTL 0x11 >; }; diff --git a/arch/arm/boot/dts/r8a7791-porter.dts b/arch/arm/boot/dts/r8a7791-porter.dts index 6761d11d3f9e..db0239c7e6c7 100644 --- a/arch/arm/boot/dts/r8a7791-porter.dts +++ b/arch/arm/boot/dts/r8a7791-porter.dts @@ -428,7 +428,7 @@ "dclkin.0", "dclkin.1"; ports { - port@1 { + port@0 { endpoint { remote-endpoint = <&adv7511_in>; }; diff --git a/arch/arm/boot/dts/socfpga.dtsi b/arch/arm/boot/dts/socfpga.dtsi index 9f48141270b8..f0702d8063d9 100644 --- a/arch/arm/boot/dts/socfpga.dtsi +++ b/arch/arm/boot/dts/socfpga.dtsi @@ -759,7 +759,7 @@ timer@fffec600 { compatible = "arm,cortex-a9-twd-timer"; reg = <0xfffec600 0x100>; - interrupts = <1 13 0xf04>; + interrupts = <1 13 0xf01>; clocks = <&mpu_periph_clk>; }; diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 12f99fd2e3b2..3aed4492c9a7 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -534,4 +534,14 @@ THUMB( orr \reg , \reg , #PSR_T_BIT ) #endif .endm +#ifdef CONFIG_KPROBES +#define _ASM_NOKPROBE(entry) \ + .pushsection "_kprobe_blacklist", "aw" ; \ + .balign 4 ; \ + .long entry; \ + .popsection +#else +#define _ASM_NOKPROBE(entry) +#endif + #endif /* __ASM_ASSEMBLER_H__ */ diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index d10e36235438..7f66b1b3aca1 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -223,6 +223,22 @@ static inline unsigned int kvm_get_vmid_bits(void) return 8; } +/* + * We are not in the kvm->srcu critical section most of the time, so we take + * the SRCU read lock here. Since we copy the data from the user page, we + * can immediately drop the lock again. + */ +static inline int kvm_read_guest_lock(struct kvm *kvm, + gpa_t gpa, void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_read_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + static inline void *kvm_get_hyp_vector(void) { return kvm_ksym_ref(__kvm_hyp_vector); diff --git a/arch/arm/include/asm/vdso.h b/arch/arm/include/asm/vdso.h index d0295f1dd1a3..ff65b6d96c7e 100644 --- a/arch/arm/include/asm/vdso.h +++ b/arch/arm/include/asm/vdso.h @@ -11,8 +11,6 @@ struct mm_struct; void arm_install_vdso(struct mm_struct *mm, unsigned long addr); -extern char vdso_start, vdso_end; - extern unsigned int vdso_total_pages; #else /* CONFIG_VDSO */ diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1b304897aa12..aa316a7562b1 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -19,6 +19,7 @@ #include <linux/uaccess.h> #include <linux/hardirq.h> #include <linux/kdebug.h> +#include <linux/kprobes.h> #include <linux/module.h> #include <linux/kexec.h> #include <linux/bug.h> @@ -415,7 +416,8 @@ void unregister_undef_hook(struct undef_hook *hook) raw_spin_unlock_irqrestore(&undef_lock, flags); } -static int call_undef_hook(struct pt_regs *regs, unsigned int instr) +static nokprobe_inline +int call_undef_hook(struct pt_regs *regs, unsigned int instr) { struct undef_hook *hook; unsigned long flags; @@ -488,6 +490,7 @@ die_sig: arm_notify_die("Oops - undefined instruction", regs, &info, 0, 6); } +NOKPROBE_SYMBOL(do_undefinstr) /* * Handle FIQ similarly to NMI on x86 systems. diff --git a/arch/arm/kernel/vdso.c b/arch/arm/kernel/vdso.c index 53cf86cf2d1a..890439737374 100644 --- a/arch/arm/kernel/vdso.c +++ b/arch/arm/kernel/vdso.c @@ -39,6 +39,8 @@ static struct page **vdso_text_pagelist; +extern char vdso_start[], vdso_end[]; + /* Total number of pages needed for the data and text portions of the VDSO. */ unsigned int vdso_total_pages __ro_after_init; @@ -179,13 +181,13 @@ static int __init vdso_init(void) unsigned int text_pages; int i; - if (memcmp(&vdso_start, "\177ELF", 4)) { + if (memcmp(vdso_start, "\177ELF", 4)) { pr_err("VDSO is not a valid ELF object!\n"); return -ENOEXEC; } - text_pages = (&vdso_end - &vdso_start) >> PAGE_SHIFT; - pr_debug("vdso: %i text pages at base %p\n", text_pages, &vdso_start); + text_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; + pr_debug("vdso: %i text pages at base %p\n", text_pages, vdso_start); /* Allocate the VDSO text pagelist */ vdso_text_pagelist = kcalloc(text_pages, sizeof(struct page *), @@ -200,7 +202,7 @@ static int __init vdso_init(void) for (i = 0; i < text_pages; i++) { struct page *page; - page = virt_to_page(&vdso_start + i * PAGE_SIZE); + page = virt_to_page(vdso_start + i * PAGE_SIZE); vdso_text_pagelist[i] = page; } @@ -211,7 +213,7 @@ static int __init vdso_init(void) cntvct_ok = cntvct_functional(); - patch_vdso(&vdso_start); + patch_vdso(vdso_start); return 0; } diff --git a/arch/arm/lib/getuser.S b/arch/arm/lib/getuser.S index df73914e81c8..746e7801dcdf 100644 --- a/arch/arm/lib/getuser.S +++ b/arch/arm/lib/getuser.S @@ -38,6 +38,7 @@ ENTRY(__get_user_1) mov r0, #0 ret lr ENDPROC(__get_user_1) +_ASM_NOKPROBE(__get_user_1) ENTRY(__get_user_2) check_uaccess r0, 2, r1, r2, __get_user_bad @@ -58,6 +59,7 @@ rb .req r0 mov r0, #0 ret lr ENDPROC(__get_user_2) +_ASM_NOKPROBE(__get_user_2) ENTRY(__get_user_4) check_uaccess r0, 4, r1, r2, __get_user_bad @@ -65,6 +67,7 @@ ENTRY(__get_user_4) mov r0, #0 ret lr ENDPROC(__get_user_4) +_ASM_NOKPROBE(__get_user_4) ENTRY(__get_user_8) check_uaccess r0, 8, r1, r2, __get_user_bad8 @@ -78,6 +81,7 @@ ENTRY(__get_user_8) mov r0, #0 ret lr ENDPROC(__get_user_8) +_ASM_NOKPROBE(__get_user_8) #ifdef __ARMEB__ ENTRY(__get_user_32t_8) @@ -91,6 +95,7 @@ ENTRY(__get_user_32t_8) mov r0, #0 ret lr ENDPROC(__get_user_32t_8) +_ASM_NOKPROBE(__get_user_32t_8) ENTRY(__get_user_64t_1) check_uaccess r0, 1, r1, r2, __get_user_bad8 @@ -98,6 +103,7 @@ ENTRY(__get_user_64t_1) mov r0, #0 ret lr ENDPROC(__get_user_64t_1) +_ASM_NOKPROBE(__get_user_64t_1) ENTRY(__get_user_64t_2) check_uaccess r0, 2, r1, r2, __get_user_bad8 @@ -114,6 +120,7 @@ rb .req r0 mov r0, #0 ret lr ENDPROC(__get_user_64t_2) +_ASM_NOKPROBE(__get_user_64t_2) ENTRY(__get_user_64t_4) check_uaccess r0, 4, r1, r2, __get_user_bad8 @@ -121,6 +128,7 @@ ENTRY(__get_user_64t_4) mov r0, #0 ret lr ENDPROC(__get_user_64t_4) +_ASM_NOKPROBE(__get_user_64t_4) #endif __get_user_bad8: @@ -131,6 +139,8 @@ __get_user_bad: ret lr ENDPROC(__get_user_bad) ENDPROC(__get_user_bad8) +_ASM_NOKPROBE(__get_user_bad) +_ASM_NOKPROBE(__get_user_bad8) .pushsection __ex_table, "a" .long 1b, __get_user_bad diff --git a/arch/arm/mach-omap1/clock.c b/arch/arm/mach-omap1/clock.c index 4f5fd4a084c0..034b89499bd7 100644 --- a/arch/arm/mach-omap1/clock.c +++ b/arch/arm/mach-omap1/clock.c @@ -1031,17 +1031,17 @@ static int clk_debugfs_register_one(struct clk *c) return -ENOMEM; c->dent = d; - d = debugfs_create_u8("usecount", S_IRUGO, c->dent, (u8 *)&c->usecount); + d = debugfs_create_u8("usecount", S_IRUGO, c->dent, &c->usecount); if (!d) { err = -ENOMEM; goto err_out; } - d = debugfs_create_u32("rate", S_IRUGO, c->dent, (u32 *)&c->rate); + d = debugfs_create_ulong("rate", S_IRUGO, c->dent, &c->rate); if (!d) { err = -ENOMEM; goto err_out; } - d = debugfs_create_x32("flags", S_IRUGO, c->dent, (u32 *)&c->flags); + d = debugfs_create_x8("flags", S_IRUGO, c->dent, &c->flags); if (!d) { err = -ENOMEM; goto err_out; diff --git a/arch/arm/mach-omap2/pm.c b/arch/arm/mach-omap2/pm.c index 678d2a31dcb8..3202015ecb83 100644 --- a/arch/arm/mach-omap2/pm.c +++ b/arch/arm/mach-omap2/pm.c @@ -225,7 +225,7 @@ static void omap_pm_end(void) cpu_idle_poll_ctrl(false); } -static void omap_pm_finish(void) +static void omap_pm_wake(void) { if (cpu_is_omap34xx()) omap_prcm_irq_complete(); @@ -235,7 +235,7 @@ static const struct platform_suspend_ops omap_pm_ops = { .begin = omap_pm_begin, .end = omap_pm_end, .enter = omap_pm_enter, - .finish = omap_pm_finish, + .wake = omap_pm_wake, .valid = suspend_valid_only_mem, }; diff --git a/arch/arm/mach-omap2/timer.c b/arch/arm/mach-omap2/timer.c index b2f2448bfa6d..a4cab2814655 100644 --- a/arch/arm/mach-omap2/timer.c +++ b/arch/arm/mach-omap2/timer.c @@ -136,12 +136,6 @@ static struct clock_event_device clockevent_gpt = { .tick_resume = omap2_gp_timer_shutdown, }; -static struct property device_disabled = { - .name = "status", - .length = sizeof("disabled"), - .value = "disabled", -}; - static const struct of_device_id omap_timer_match[] __initconst = { { .compatible = "ti,omap2420-timer", }, { .compatible = "ti,omap3430-timer", }, @@ -183,8 +177,17 @@ static struct device_node * __init omap_get_timer_dt(const struct of_device_id * of_get_property(np, "ti,timer-secure", NULL))) continue; - if (!of_device_is_compatible(np, "ti,omap-counter32k")) - of_add_property(np, &device_disabled); + if (!of_device_is_compatible(np, "ti,omap-counter32k")) { + struct property *prop; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + prop->name = "status"; + prop->value = "disabled"; + prop->length = strlen(prop->value); + of_add_property(np, prop); + } return np; } diff --git a/arch/arm/mach-orion5x/Kconfig b/arch/arm/mach-orion5x/Kconfig index 89bb0fc796bd..72905a442106 100644 --- a/arch/arm/mach-orion5x/Kconfig +++ b/arch/arm/mach-orion5x/Kconfig @@ -57,7 +57,6 @@ config MACH_KUROBOX_PRO config MACH_DNS323 bool "D-Link DNS-323" - select GENERIC_NET_UTILS select I2C_BOARDINFO if I2C help Say 'Y' here if you want your kernel to support the @@ -65,7 +64,6 @@ config MACH_DNS323 config MACH_TS209 bool "QNAP TS-109/TS-209" - select GENERIC_NET_UTILS help Say 'Y' here if you want your kernel to support the QNAP TS-109/TS-209 platform. @@ -107,7 +105,6 @@ config MACH_LINKSTATION_LS_HGL config MACH_TS409 bool "QNAP TS-409" - select GENERIC_NET_UTILS help Say 'Y' here if you want your kernel to support the QNAP TS-409 platform. diff --git a/arch/arm/mach-orion5x/dns323-setup.c b/arch/arm/mach-orion5x/dns323-setup.c index cd483bfb5ca8..d13344b2ddcd 100644 --- a/arch/arm/mach-orion5x/dns323-setup.c +++ b/arch/arm/mach-orion5x/dns323-setup.c @@ -173,10 +173,42 @@ static struct mv643xx_eth_platform_data dns323_eth_data = { .phy_addr = MV643XX_ETH_PHY_ADDR(8), }; +/* dns323_parse_hex_*() taken from tsx09-common.c; should a common copy of these + * functions be kept somewhere? + */ +static int __init dns323_parse_hex_nibble(char n) +{ + if (n >= '0' && n <= '9') + return n - '0'; + + if (n >= 'A' && n <= 'F') + return n - 'A' + 10; + + if (n >= 'a' && n <= 'f') + return n - 'a' + 10; + + return -1; +} + +static int __init dns323_parse_hex_byte(const char *b) +{ + int hi; + int lo; + + hi = dns323_parse_hex_nibble(b[0]); + lo = dns323_parse_hex_nibble(b[1]); + + if (hi < 0 || lo < 0) + return -1; + + return (hi << 4) | lo; +} + static int __init dns323_read_mac_addr(void) { u_int8_t addr[6]; - void __iomem *mac_page; + int i; + char *mac_page; /* MAC address is stored as a regular ol' string in /dev/mtdblock4 * (0x007d0000-0x00800000) starting at offset 196480 (0x2ff80). @@ -185,8 +217,23 @@ static int __init dns323_read_mac_addr(void) if (!mac_page) return -ENOMEM; - if (!mac_pton((__force const char *) mac_page, addr)) - goto error_fail; + /* Sanity check the string we're looking at */ + for (i = 0; i < 5; i++) { + if (*(mac_page + (i * 3) + 2) != ':') { + goto error_fail; + } + } + + for (i = 0; i < 6; i++) { + int byte; + + byte = dns323_parse_hex_byte(mac_page + (i * 3)); + if (byte < 0) { + goto error_fail; + } + + addr[i] = byte; + } iounmap(mac_page); printk("DNS-323: Found ethernet MAC address: %pM\n", addr); diff --git a/arch/arm/mach-orion5x/tsx09-common.c b/arch/arm/mach-orion5x/tsx09-common.c index 89774985d380..905d4f2dd0b8 100644 --- a/arch/arm/mach-orion5x/tsx09-common.c +++ b/arch/arm/mach-orion5x/tsx09-common.c @@ -53,12 +53,53 @@ struct mv643xx_eth_platform_data qnap_tsx09_eth_data = { .phy_addr = MV643XX_ETH_PHY_ADDR(8), }; +static int __init qnap_tsx09_parse_hex_nibble(char n) +{ + if (n >= '0' && n <= '9') + return n - '0'; + + if (n >= 'A' && n <= 'F') + return n - 'A' + 10; + + if (n >= 'a' && n <= 'f') + return n - 'a' + 10; + + return -1; +} + +static int __init qnap_tsx09_parse_hex_byte(const char *b) +{ + int hi; + int lo; + + hi = qnap_tsx09_parse_hex_nibble(b[0]); + lo = qnap_tsx09_parse_hex_nibble(b[1]); + + if (hi < 0 || lo < 0) + return -1; + + return (hi << 4) | lo; +} + static int __init qnap_tsx09_check_mac_addr(const char *addr_str) { u_int8_t addr[6]; + int i; - if (!mac_pton(addr_str, addr)) - return -1; + for (i = 0; i < 6; i++) { + int byte; + + /* + * Enforce "xx:xx:xx:xx:xx:xx\n" format. + */ + if (addr_str[(i * 3) + 2] != ((i < 5) ? ':' : '\n')) + return -1; + + byte = qnap_tsx09_parse_hex_byte(addr_str + (i * 3)); + if (byte < 0) + return -1; + addr[i] = byte; + } printk(KERN_INFO "tsx09: found ethernet mac address %pM\n", addr); @@ -77,12 +118,12 @@ void __init qnap_tsx09_find_mac_addr(u32 mem_base, u32 size) unsigned long addr; for (addr = mem_base; addr < (mem_base + size); addr += 1024) { - void __iomem *nor_page; + char *nor_page; int ret = 0; nor_page = ioremap(addr, 1024); if (nor_page != NULL) { - ret = qnap_tsx09_check_mac_addr((__force const char *)nor_page); + ret = qnap_tsx09_check_mac_addr(nor_page); iounmap(nor_page); } diff --git a/arch/arm/plat-omap/dmtimer.c b/arch/arm/plat-omap/dmtimer.c index 7a327bd32521..ebef8aacea83 100644 --- a/arch/arm/plat-omap/dmtimer.c +++ b/arch/arm/plat-omap/dmtimer.c @@ -890,11 +890,8 @@ static int omap_dm_timer_probe(struct platform_device *pdev) timer->irq = irq->start; timer->pdev = pdev; - /* Skip pm_runtime_enable for OMAP1 */ - if (!(timer->capability & OMAP_TIMER_NEEDS_RESET)) { - pm_runtime_enable(dev); - pm_runtime_irq_safe(dev); - } + pm_runtime_enable(dev); + pm_runtime_irq_safe(dev); if (!timer->reserved) { ret = pm_runtime_get_sync(dev); diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index bcdecc25461b..b2aa9b32bff2 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -165,13 +165,14 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) { unsigned long flags; struct kprobe *p = &op->kp; - struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + struct kprobe_ctlblk *kcb; /* Save skipped registers */ regs->ARM_pc = (unsigned long)op->kp.addr; regs->ARM_ORIG_r0 = ~0UL; local_irq_save(flags); + kcb = get_kprobe_ctlblk(); if (kprobe_running()) { kprobes_inc_nmissed_count(&op->kp); @@ -191,6 +192,7 @@ optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs) local_irq_restore(flags); } +NOKPROBE_SYMBOL(optimized_callback) int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig) { diff --git a/arch/arm64/boot/dts/qcom/msm8996.dtsi b/arch/arm64/boot/dts/qcom/msm8996.dtsi index 338f82a7fdc7..2c93de7fffe5 100644 --- a/arch/arm64/boot/dts/qcom/msm8996.dtsi +++ b/arch/arm64/boot/dts/qcom/msm8996.dtsi @@ -326,8 +326,8 @@ blsp2_spi5: spi@075ba000{ compatible = "qcom,spi-qup-v2.2.1"; reg = <0x075ba000 0x600>; - interrupts = <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>; - clocks = <&gcc GCC_BLSP2_QUP5_SPI_APPS_CLK>, + interrupts = <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&gcc GCC_BLSP2_QUP6_SPI_APPS_CLK>, <&gcc GCC_BLSP2_AHB_CLK>; clock-names = "core", "iface"; pinctrl-names = "default", "sleep"; diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 80bf33715ecb..eac73a640ea7 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -313,6 +313,22 @@ static inline unsigned int kvm_get_vmid_bits(void) return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +/* + * We are not in the kvm->srcu critical section most of the time, so we take + * the SRCU read lock here. Since we copy the data from the user page, we + * can immediately drop the lock again. + */ +static inline int kvm_read_guest_lock(struct kvm *kvm, + gpa_t gpa, void *data, unsigned long len) +{ + int srcu_idx = srcu_read_lock(&kvm->srcu); + int ret = kvm_read_guest(kvm, gpa, data, len); + + srcu_read_unlock(&kvm->srcu, srcu_idx); + + return ret; +} + #ifdef CONFIG_HARDEN_BRANCH_PREDICTOR #include <asm/mmu.h> diff --git a/arch/arm64/include/asm/spinlock.h b/arch/arm64/include/asm/spinlock.h index cae331d553f8..a9d2dd03c977 100644 --- a/arch/arm64/include/asm/spinlock.h +++ b/arch/arm64/include/asm/spinlock.h @@ -141,8 +141,8 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock) " cbnz %w1, 1f\n" " add %w1, %w0, %3\n" " casa %w0, %w1, %2\n" - " and %w1, %w1, #0xffff\n" - " eor %w1, %w1, %w0, lsr #16\n" + " sub %w1, %w1, %3\n" + " eor %w1, %w1, %w0\n" "1:") : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock) : "I" (1 << TICKET_SHIFT) diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h index 801a16dbbdf6..7d2a15a0f625 100644 --- a/arch/arm64/include/asm/stacktrace.h +++ b/arch/arm64/include/asm/stacktrace.h @@ -23,7 +23,7 @@ struct stackframe { unsigned long sp; unsigned long pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - unsigned int graph; + int graph; #endif }; diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 74107134cc30..2de62aa91303 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -160,7 +160,7 @@ static int enable_smccc_arch_workaround_1(void *data) case PSCI_CONDUIT_HVC: arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if (res.a0) + if ((int)res.a0 < 0) return 0; cb = call_hvc_arch_workaround_1; smccc_start = __smccc_workaround_1_hvc_start; @@ -170,7 +170,7 @@ static int enable_smccc_arch_workaround_1(void *data) case PSCI_CONDUIT_SMC: arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if (res.a0) + if ((int)res.a0 < 0) return 0; cb = call_smc_arch_workaround_1; smccc_start = __smccc_workaround_1_smc_start; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index 8a552a33c6ef..5201bebcec07 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -73,6 +73,11 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (tsk->ret_stack && (frame->pc == (unsigned long)return_to_handler)) { + if (WARN_ON_ONCE(frame->graph == -1)) + return -EINVAL; + if (frame->graph < -1) + frame->graph += FTRACE_NOTRACE_DEPTH; + /* * This is a case where function graph tracer has * modified a return address (LR) in a stack frame diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c index 59779699a1a4..5d9076e86200 100644 --- a/arch/arm64/kernel/time.c +++ b/arch/arm64/kernel/time.c @@ -53,7 +53,7 @@ unsigned long profile_pc(struct pt_regs *regs) frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - frame.graph = -1; /* no task info */ + frame.graph = current->curr_ret_stack; #endif do { int ret = unwind_frame(NULL, &frame); diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 5ed0ea92c5bf..f851c9d651f0 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, u64 virt_addr=simple_strtoull(buf, NULL, 16); int ret; - ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL); + ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG printk("Virtual address %lx is not existing.\n",virt_addr); diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index a0fc0c192427..3e8be0f54a44 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -135,7 +135,11 @@ static struct platform_device mcf_fec0 = { .id = 0, .num_resources = ARRAY_SIZE(mcf_fec0_resources), .resource = mcf_fec0_resources, - .dev.platform_data = FEC_PDATA, + .dev = { + .dma_mask = &mcf_fec0.dev.coherent_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), + .platform_data = FEC_PDATA, + } }; #ifdef MCFFEC_BASE1 @@ -167,7 +171,11 @@ static struct platform_device mcf_fec1 = { .id = 1, .num_resources = ARRAY_SIZE(mcf_fec1_resources), .resource = mcf_fec1_resources, - .dev.platform_data = FEC_PDATA, + .dev = { + .dma_mask = &mcf_fec1.dev.coherent_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), + .platform_data = FEC_PDATA, + } }; #endif /* MCFFEC_BASE1 */ #endif /* CONFIG_FEC */ diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index 6ed1ded87b8f..6420c83c29d1 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -2271,7 +2271,7 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, parent_irq = irq_of_parse_and_map(ciu_node, 0); if (!parent_irq) { - pr_err("ERROR: Couldn't acquire parent_irq for %s\n.", + pr_err("ERROR: Couldn't acquire parent_irq for %s\n", ciu_node->name); return -EINVAL; } @@ -2283,7 +2283,7 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, addr = of_get_address(ciu_node, 0, NULL, NULL); if (!addr) { - pr_err("ERROR: Couldn't acquire reg(0) %s\n.", ciu_node->name); + pr_err("ERROR: Couldn't acquire reg(0) %s\n", ciu_node->name); return -EINVAL; } host_data->raw_reg = (u64)phys_to_virt( @@ -2291,7 +2291,7 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, addr = of_get_address(ciu_node, 1, NULL, NULL); if (!addr) { - pr_err("ERROR: Couldn't acquire reg(1) %s\n.", ciu_node->name); + pr_err("ERROR: Couldn't acquire reg(1) %s\n", ciu_node->name); return -EINVAL; } host_data->en_reg = (u64)phys_to_virt( @@ -2299,7 +2299,7 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, r = of_property_read_u32(ciu_node, "cavium,max-bits", &val); if (r) { - pr_err("ERROR: Couldn't read cavium,max-bits from %s\n.", + pr_err("ERROR: Couldn't read cavium,max-bits from %s\n", ciu_node->name); return r; } @@ -2309,7 +2309,7 @@ static int __init octeon_irq_init_cib(struct device_node *ciu_node, &octeon_irq_domain_cib_ops, host_data); if (!cib_domain) { - pr_err("ERROR: Couldn't irq_domain_add_linear()\n."); + pr_err("ERROR: Couldn't irq_domain_add_linear()\n"); return -ENOMEM; } diff --git a/arch/mips/include/asm/mach-ath79/ar71xx_regs.h b/arch/mips/include/asm/mach-ath79/ar71xx_regs.h index aa3800c82332..d99ca862dae3 100644 --- a/arch/mips/include/asm/mach-ath79/ar71xx_regs.h +++ b/arch/mips/include/asm/mach-ath79/ar71xx_regs.h @@ -167,7 +167,7 @@ #define AR71XX_AHB_DIV_MASK 0x7 #define AR724X_PLL_REG_CPU_CONFIG 0x00 -#define AR724X_PLL_REG_PCIE_CONFIG 0x18 +#define AR724X_PLL_REG_PCIE_CONFIG 0x10 #define AR724X_PLL_FB_SHIFT 0 #define AR724X_PLL_FB_MASK 0x3ff diff --git a/arch/mips/include/asm/machine.h b/arch/mips/include/asm/machine.h index 6b444cd9526f..db930cdc715f 100644 --- a/arch/mips/include/asm/machine.h +++ b/arch/mips/include/asm/machine.h @@ -52,7 +52,7 @@ mips_machine_is_compatible(const struct mips_machine *mach, const void *fdt) if (!mach->matches) return NULL; - for (match = mach->matches; match->compatible; match++) { + for (match = mach->matches; match->compatible[0]; match++) { if (fdt_node_check_compatible(fdt, 0, match->compatible) == 0) return match; } diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 0c8ae2cc6380..8f7bf74d1c0b 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -483,7 +483,7 @@ static int fpr_get_msa(struct task_struct *target, /* * Copy the floating-point context to the supplied NT_PRFPREG buffer. * Choose the appropriate helper for general registers, and then copy - * the FCSR register separately. + * the FCSR and FIR registers separately. */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, @@ -491,6 +491,7 @@ static int fpr_get(struct task_struct *target, void *kbuf, void __user *ubuf) { const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + const int fir_pos = fcr31_pos + sizeof(u32); int err; if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) @@ -503,6 +504,12 @@ static int fpr_get(struct task_struct *target, err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fpu.fcr31, fcr31_pos, fcr31_pos + sizeof(u32)); + if (err) + return err; + + err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &boot_cpu_data.fpu_id, + fir_pos, fir_pos + sizeof(u32)); return err; } @@ -551,7 +558,8 @@ static int fpr_set_msa(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context. * Choose the appropriate helper for general registers, and then copy - * the FCSR register separately. + * the FCSR register separately. Ignore the incoming FIR register + * contents though, as the register is read-only. * * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', * which is supposed to have been guaranteed by the kernel before @@ -565,6 +573,7 @@ static int fpr_set(struct task_struct *target, const void *kbuf, const void __user *ubuf) { const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + const int fir_pos = fcr31_pos + sizeof(u32); u32 fcr31; int err; @@ -592,6 +601,11 @@ static int fpr_set(struct task_struct *target, ptrace_setfcr31(target, fcr31); } + if (count > 0) + err = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf, + fir_pos, + fir_pos + sizeof(u32)); + return err; } @@ -813,7 +827,7 @@ long arch_ptrace(struct task_struct *child, long request, fregs = get_fpu_regs(child); #ifdef CONFIG_32BIT - if (test_thread_flag(TIF_32BIT_FPREGS)) { + if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) { /* * The odd registers are actually the high * order bits of the values stored in the even @@ -902,7 +916,7 @@ long arch_ptrace(struct task_struct *child, long request, init_fp_ctx(child); #ifdef CONFIG_32BIT - if (test_thread_flag(TIF_32BIT_FPREGS)) { + if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) { /* * The odd registers are actually the high * order bits of the values stored in the even diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index 5fcbdcd7abd0..bc9afbabbe14 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -97,7 +97,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, break; } fregs = get_fpu_regs(child); - if (test_thread_flag(TIF_32BIT_FPREGS)) { + if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) { /* * The odd registers are actually the high * order bits of the values stored in the even @@ -204,7 +204,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, sizeof(child->thread.fpu)); child->thread.fpu.fcr31 = 0; } - if (test_thread_flag(TIF_32BIT_FPREGS)) { + if (test_tsk_thread_flag(child, TIF_32BIT_FPREGS)) { /* * The odd registers are actually the high * order bits of the values stored in the even diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 29ec9ab3fd55..a2c46f539e3e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -42,7 +42,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "cache", VCPU_STAT(cache_exits), KVM_STAT_VCPU }, { "signal", VCPU_STAT(signal_exits), KVM_STAT_VCPU }, { "interrupt", VCPU_STAT(int_exits), KVM_STAT_VCPU }, - { "cop_unsuable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU }, + { "cop_unusable", VCPU_STAT(cop_unusable_exits), KVM_STAT_VCPU }, { "tlbmod", VCPU_STAT(tlbmod_exits), KVM_STAT_VCPU }, { "tlbmiss_ld", VCPU_STAT(tlbmiss_ld_exits), KVM_STAT_VCPU }, { "tlbmiss_st", VCPU_STAT(tlbmiss_st_exits), KVM_STAT_VCPU }, diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 9d0107fbb169..43fa682e55da 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -851,9 +851,12 @@ static void r4k_dma_cache_wback_inv(unsigned long addr, unsigned long size) /* * Either no secondary cache or the available caches don't have the * subset property so we have to flush the primary caches - * explicitly + * explicitly. + * If we would need IPI to perform an INDEX-type operation, then + * we have to use the HIT-type alternative as IPI cannot be used + * here due to interrupts possibly being disabled. */ - if (size >= dcache_size) { + if (!r4k_op_needs_ipi(R4K_INDEX) && size >= dcache_size) { r4k_blast_dcache(); } else { R4600_HIT_CACHEOP_WAR_IMPL; @@ -890,7 +893,7 @@ static void r4k_dma_cache_inv(unsigned long addr, unsigned long size) return; } - if (size >= dcache_size) { + if (!r4k_op_needs_ipi(R4K_INDEX) && size >= dcache_size) { r4k_blast_dcache(); } else { R4600_HIT_CACHEOP_WAR_IMPL; diff --git a/arch/mips/txx9/rbtx4939/setup.c b/arch/mips/txx9/rbtx4939/setup.c index 8b937300fb7f..fd26fadc8617 100644 --- a/arch/mips/txx9/rbtx4939/setup.c +++ b/arch/mips/txx9/rbtx4939/setup.c @@ -186,7 +186,7 @@ static void __init rbtx4939_update_ioc_pen(void) #define RBTX4939_MAX_7SEGLEDS 8 -#if IS_ENABLED(CONFIG_LEDS_CLASS) +#if IS_BUILTIN(CONFIG_LEDS_CLASS) static u8 led_val[RBTX4939_MAX_7SEGLEDS]; struct rbtx4939_led_data { struct led_classdev cdev; @@ -261,7 +261,7 @@ static inline void rbtx4939_led_setup(void) static void __rbtx4939_7segled_putc(unsigned int pos, unsigned char val) { -#if IS_ENABLED(CONFIG_LEDS_CLASS) +#if IS_BUILTIN(CONFIG_LEDS_CLASS) unsigned long flags; local_irq_save(flags); /* bit7: reserved for LED class */ diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 9d47f2efa830..bb69f3955b59 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -92,7 +92,8 @@ $(addprefix $(obj)/,$(zlib-y)): \ libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c libfdtheader := fdt.h libfdt.h libfdt_internal.h -$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \ +$(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o \ + treeboot-akebono.o treeboot-currituck.o treeboot-iss4xx.o): \ $(addprefix $(obj)/,$(libfdtheader)) src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ diff --git a/arch/powerpc/include/asm/irq_work.h b/arch/powerpc/include/asm/irq_work.h index 744fd54de374..1bcc84903930 100644 --- a/arch/powerpc/include/asm/irq_work.h +++ b/arch/powerpc/include/asm/irq_work.h @@ -5,5 +5,6 @@ static inline bool arch_irq_work_has_interrupt(void) { return true; } +extern void arch_irq_work_raise(void); #endif /* _ASM_POWERPC_IRQ_WORK_H */ diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index f516ac508ae3..bf0f712ac0e0 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -228,14 +228,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) unsigned short maj; unsigned short min; - /* We only show online cpus: disable preempt (overzealous, I - * knew) to prevent cpu going down. */ - preempt_disable(); - if (!cpu_online(cpu_id)) { - preempt_enable(); - return 0; - } - #ifdef CONFIG_SMP pvr = per_cpu(cpu_pvr, cpu_id); #else @@ -340,9 +332,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) #ifdef CONFIG_SMP seq_printf(m, "\n"); #endif - - preempt_enable(); - /* If this is the last cpu, print the summary */ if (cpumask_next(cpu_id, cpu_online_mask) >= nr_cpu_ids) show_cpuinfo_summary(m); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 218cba2f5699..0a2b247dbc6b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3107,15 +3107,17 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) goto up_out; psize = vma_kernel_pagesize(vma); - porder = __ilog2(psize); up_read(¤t->mm->mmap_sem); /* We can handle 4k, 64k or 16M pages in the VRMA */ - err = -EINVAL; - if (!(psize == 0x1000 || psize == 0x10000 || - psize == 0x1000000)) - goto out_srcu; + if (psize >= 0x1000000) + psize = 0x1000000; + else if (psize >= 0x10000) + psize = 0x10000; + else + psize = 0x1000; + porder = __ilog2(psize); /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a51c188b81f3..6cff96e0d77b 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -551,7 +551,7 @@ static int numa_setup_cpu(unsigned long lcpu) nid = of_node_to_nid_single(cpu); out_present: - if (nid < 0 || !node_online(nid)) + if (nid < 0 || !node_possible(nid)) nid = first_online_node; map_cpu_to_node(lcpu, nid); @@ -904,6 +904,32 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) NODE_DATA(nid)->node_spanned_pages = spanned_pages; } +static void __init find_possible_nodes(void) +{ + struct device_node *rtas; + u32 numnodes, i; + + if (min_common_depth <= 0) + return; + + rtas = of_find_node_by_path("/rtas"); + if (!rtas) + return; + + if (of_property_read_u32_index(rtas, + "ibm,max-associativity-domains", + min_common_depth, &numnodes)) + goto out; + + for (i = 0; i < numnodes; i++) { + if (!node_possible(i)) + node_set(i, node_possible_map); + } + +out: + of_node_put(rtas); +} + void __init initmem_init(void) { int nid, cpu; @@ -917,12 +943,15 @@ void __init initmem_init(void) memblock_dump_all(); /* - * Reduce the possible NUMA nodes to the online NUMA nodes, - * since we do not support node hotplug. This ensures that we - * lower the maximum NUMA node ID to what is actually present. + * Modify the set of possible NUMA nodes to reflect information + * available about the set of online nodes, and the set of nodes + * that we expect to make use of for this platform's affinity + * calculations. */ nodes_and(node_possible_map, node_possible_map, node_online_map); + find_possible_nodes(); + for_each_online_node(nid) { unsigned long start_pfn, end_pfn; @@ -1274,6 +1303,40 @@ static long vphn_get_associativity(unsigned long cpu, return rc; } +static inline int find_and_online_cpu_nid(int cpu) +{ + __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; + int new_nid; + + /* Use associativity from first thread for all siblings */ + vphn_get_associativity(cpu, associativity); + new_nid = associativity_to_nid(associativity); + if (new_nid < 0 || !node_possible(new_nid)) + new_nid = first_online_node; + + if (NODE_DATA(new_nid) == NULL) { +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Need to ensure that NODE_DATA is initialized for a node from + * available memory (see memblock_alloc_try_nid). If unable to + * init the node, then default to nearest node that has memory + * installed. + */ + if (try_online_node(new_nid)) + new_nid = first_online_node; +#else + /* + * Default to using the nearest node that has memory installed. + * Otherwise, it would be necessary to patch the kernel MM code + * to deal with more memoryless-node error conditions. + */ + new_nid = first_online_node; +#endif + } + + return new_nid; +} + /* * Update the CPU maps and sysfs entries for a single CPU when its NUMA * characteristics change. This function doesn't perform any locking and is @@ -1339,7 +1402,6 @@ int arch_update_cpu_topology(void) { unsigned int cpu, sibling, changed = 0; struct topology_update_data *updates, *ud; - __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0}; cpumask_t updated_cpus; struct device *dev; int weight, new_nid, i = 0; @@ -1374,11 +1436,7 @@ int arch_update_cpu_topology(void) continue; } - /* Use associativity from first thread for all siblings */ - vphn_get_associativity(cpu, associativity); - new_nid = associativity_to_nid(associativity); - if (new_nid < 0 || !node_online(new_nid)) - new_nid = first_online_node; + new_nid = find_and_online_cpu_nid(cpu); if (new_nid == numa_cpu_lookup_table[cpu]) { cpumask_andnot(&cpu_associativity_changes_mask, diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 7e706f36e364..9c58194c7ea5 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -329,6 +329,9 @@ static int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); PPC_LWZ_OFFS(r_A, r_skb, offsetof(struct sk_buff, len)); break; + case BPF_LDX | BPF_W | BPF_ABS: /* A = *((u32 *)(seccomp_data + K)); */ + PPC_LWZ_OFFS(r_A, r_skb, K); + break; case BPF_LDX | BPF_W | BPF_LEN: /* X = skb->len; */ PPC_LWZ_OFFS(r_X, r_skb, offsetof(struct sk_buff, len)); break; diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index bf949623de90..771edffa2d40 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -448,6 +448,16 @@ static void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) /* invalid entry */ continue; + /* + * BHRB rolling buffer could very much contain the kernel + * addresses at this point. Check the privileges before + * exporting it to userspace (avoid exposure of regions + * where we could have speculative execution) + */ + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN) && + is_kernel_addr(addr)) + continue; + /* Branches are read most recent first (ie. mfbhrb 0 is * the most recent branch). * There are two types of valid entries: @@ -1188,6 +1198,7 @@ static void power_pmu_disable(struct pmu *pmu) */ write_mmcr0(cpuhw, val); mb(); + isync(); /* * Disable instruction sampling if it was enabled @@ -1196,12 +1207,26 @@ static void power_pmu_disable(struct pmu *pmu) mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mb(); + isync(); } cpuhw->disabled = 1; cpuhw->n_added = 0; ebb_switch_out(mmcr0); + +#ifdef CONFIG_PPC64 + /* + * These are readable by userspace, may contain kernel + * addresses and are not switched by context switch, so clear + * them now to avoid leaking anything to userspace in general + * including to another process. + */ + if (ppmu->flags & PPMU_ARCH_207S) { + mtspr(SPRN_SDAR, 0); + mtspr(SPRN_SIAR, 0); + } +#endif } local_irq_restore(flags); diff --git a/arch/powerpc/platforms/powernv/opal-nvram.c b/arch/powerpc/platforms/powernv/opal-nvram.c index 1bceb95f422d..5584247f5029 100644 --- a/arch/powerpc/platforms/powernv/opal-nvram.c +++ b/arch/powerpc/platforms/powernv/opal-nvram.c @@ -44,6 +44,10 @@ static ssize_t opal_nvram_read(char *buf, size_t count, loff_t *index) return count; } +/* + * This can be called in the panic path with interrupts off, so use + * mdelay in that case. + */ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) { s64 rc = OPAL_BUSY; @@ -58,10 +62,16 @@ static ssize_t opal_nvram_write(char *buf, size_t count, loff_t *index) while (rc == OPAL_BUSY || rc == OPAL_BUSY_EVENT) { rc = opal_write_nvram(__pa(buf), count, off); if (rc == OPAL_BUSY_EVENT) { - msleep(OPAL_BUSY_DELAY_MS); + if (in_interrupt() || irqs_disabled()) + mdelay(OPAL_BUSY_DELAY_MS); + else + msleep(OPAL_BUSY_DELAY_MS); opal_poll_events(NULL); } else if (rc == OPAL_BUSY) { - msleep(OPAL_BUSY_DELAY_MS); + if (in_interrupt() || irqs_disabled()) + mdelay(OPAL_BUSY_DELAY_MS); + else + msleep(OPAL_BUSY_DELAY_MS); } } diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index b9aac951a90f..f37567ed640c 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -626,7 +626,7 @@ static inline u32 mpic_physmask(u32 cpumask) int i; u32 mask = 0; - for (i = 0; i < min(32, NR_CPUS); ++i, cpumask >>= 1) + for (i = 0; i < min(32, NR_CPUS) && cpu_possible(i); ++i, cpumask >>= 1) mask |= (cpumask & 1) << get_hard_smp_processor_id(i); return mask; } diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 8013989cd2e5..096affb74446 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -12,6 +12,7 @@ */ #include <linux/linkage.h> +#include <asm/nospec-insn.h> #include <asm/vx-insn.h> /* Vector register range containing CRC-32 constants */ @@ -66,6 +67,8 @@ .previous + GEN_BR_THUNK %r14 + .text /* * The CRC-32 function(s) use these calling conventions: @@ -202,6 +205,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 - br %r14 + BR_EX %r14 .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 17f2504c2633..8dc98c1d7cb1 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -13,6 +13,7 @@ */ #include <linux/linkage.h> +#include <asm/nospec-insn.h> #include <asm/vx-insn.h> /* Vector register range containing CRC-32 constants */ @@ -75,6 +76,7 @@ .previous + GEN_BR_THUNK %r14 .text @@ -263,6 +265,6 @@ crc32_le_vgfm_generic: .Ldone: VLGVF %r2,%v2,2 - br %r14 + BR_EX %r14 .previous diff --git a/arch/s390/include/asm/alternative-asm.h b/arch/s390/include/asm/alternative-asm.h new file mode 100644 index 000000000000..955d620db23e --- /dev/null +++ b/arch/s390/include/asm/alternative-asm.h @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_ALTERNATIVE_ASM_H +#define _ASM_S390_ALTERNATIVE_ASM_H + +#ifdef __ASSEMBLY__ + +/* + * Check the length of an instruction sequence. The length may not be larger + * than 254 bytes and it has to be divisible by 2. + */ +.macro alt_len_check start,end + .if ( \end - \start ) > 254 + .error "cpu alternatives does not support instructions blocks > 254 bytes\n" + .endif + .if ( \end - \start ) % 2 + .error "cpu alternatives instructions length is odd\n" + .endif +.endm + +/* + * Issue one struct alt_instr descriptor entry (need to put it into + * the section .altinstructions, see below). This entry contains + * enough information for the alternatives patching code to patch an + * instruction. See apply_alternatives(). + */ +.macro alt_entry orig_start, orig_end, alt_start, alt_end, feature + .long \orig_start - . + .long \alt_start - . + .word \feature + .byte \orig_end - \orig_start + .byte \alt_end - \alt_start +.endm + +/* + * Fill up @bytes with nops. The macro emits 6-byte nop instructions + * for the bulk of the area, possibly followed by a 4-byte and/or + * a 2-byte nop if the size of the area is not divisible by 6. + */ +.macro alt_pad_fill bytes + .fill ( \bytes ) / 6, 6, 0xc0040000 + .fill ( \bytes ) % 6 / 4, 4, 0x47000000 + .fill ( \bytes ) % 6 % 4 / 2, 2, 0x0700 +.endm + +/* + * Fill up @bytes with nops. If the number of bytes is larger + * than 6, emit a jg instruction to branch over all nops, then + * fill an area of size (@bytes - 6) with nop instructions. + */ +.macro alt_pad bytes + .if ( \bytes > 0 ) + .if ( \bytes > 6 ) + jg . + \bytes + alt_pad_fill \bytes - 6 + .else + alt_pad_fill \bytes + .endif + .endif +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ +.macro ALTERNATIVE oldinstr, newinstr, feature + .pushsection .altinstr_replacement,"ax" +770: \newinstr +771: .popsection +772: \oldinstr +773: alt_len_check 770b, 771b + alt_len_check 772b, 773b + alt_pad ( ( 771b - 770b ) - ( 773b - 772b ) ) +774: .pushsection .altinstructions,"a" + alt_entry 772b, 774b, 770b, 771b, \feature + .popsection +.endm + +/* + * Define an alternative between two instructions. If @feature is + * present, early code in apply_alternatives() replaces @oldinstr with + * @newinstr. ".skip" directive takes care of proper instruction padding + * in case @newinstr is longer than @oldinstr. + */ +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 + .pushsection .altinstr_replacement,"ax" +770: \newinstr1 +771: \newinstr2 +772: .popsection +773: \oldinstr +774: alt_len_check 770b, 771b + alt_len_check 771b, 772b + alt_len_check 773b, 774b + .if ( 771b - 770b > 772b - 771b ) + alt_pad ( ( 771b - 770b ) - ( 774b - 773b ) ) + .else + alt_pad ( ( 772b - 771b ) - ( 774b - 773b ) ) + .endif +775: .pushsection .altinstructions,"a" + alt_entry 773b, 775b, 770b, 771b,\feature1 + alt_entry 773b, 775b, 771b, 772b,\feature2 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_ALTERNATIVE_ASM_H */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h new file mode 100644 index 000000000000..9a56e738d645 --- /dev/null +++ b/arch/s390/include/asm/nospec-insn.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_NOSPEC_ASM_H +#define _ASM_S390_NOSPEC_ASM_H + +#include <asm/alternative-asm.h> +#include <asm/asm-offsets.h> + +#ifdef __ASSEMBLY__ + +#ifdef CONFIG_EXPOLINE + +_LC_BR_R1 = __LC_BR_R1 + +/* + * The expoline macros are used to create thunks in the same format + * as gcc generates them. The 'comdat' section flag makes sure that + * the various thunks are merged into a single copy. + */ + .macro __THUNK_PROLOG_NAME name + .pushsection .text.\name,"axG",@progbits,\name,comdat + .globl \name + .hidden \name + .type \name,@function +\name: + .cfi_startproc + .endm + + .macro __THUNK_EPILOG + .cfi_endproc + .popsection + .endm + + .macro __THUNK_PROLOG_BR r1,r2 + __THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1 + .endm + + .macro __THUNK_PROLOG_BC d0,r1,r2 + __THUNK_PROLOG_NAME __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + .endm + + .macro __THUNK_BR r1,r2 + jg __s390x_indirect_jump_r\r2\()use_r\r1 + .endm + + .macro __THUNK_BC d0,r1,r2 + jg __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + .endm + + .macro __THUNK_BRASL r1,r2,r3 + brasl \r1,__s390x_indirect_jump_r\r3\()use_r\r2 + .endm + + .macro __DECODE_RR expand,reg,ruse + .set __decode_fail,1 + .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \reg,%r\r1 + .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \ruse,%r\r2 + \expand \r1,\r2 + .set __decode_fail,0 + .endif + .endr + .endif + .endr + .if __decode_fail == 1 + .error "__DECODE_RR failed" + .endif + .endm + + .macro __DECODE_RRR expand,rsave,rtarget,ruse + .set __decode_fail,1 + .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \rsave,%r\r1 + .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \rtarget,%r\r2 + .irp r3,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \ruse,%r\r3 + \expand \r1,\r2,\r3 + .set __decode_fail,0 + .endif + .endr + .endif + .endr + .endif + .endr + .if __decode_fail == 1 + .error "__DECODE_RRR failed" + .endif + .endm + + .macro __DECODE_DRR expand,disp,reg,ruse + .set __decode_fail,1 + .irp r1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \reg,%r\r1 + .irp r2,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + .ifc \ruse,%r\r2 + \expand \disp,\r1,\r2 + .set __decode_fail,0 + .endif + .endr + .endif + .endr + .if __decode_fail == 1 + .error "__DECODE_DRR failed" + .endif + .endm + + .macro __THUNK_EX_BR reg,ruse + # Be very careful when adding instructions to this macro! + # The ALTERNATIVE replacement code has a .+10 which targets + # the "br \reg" after the code has been patched. +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + exrl 0,555f + j . +#else + .ifc \reg,%r1 + ALTERNATIVE "ex %r0,_LC_BR_R1", ".insn ril,0xc60000000000,0,.+10", 35 + j . + .else + larl \ruse,555f + ex 0,0(\ruse) + j . + .endif +#endif +555: br \reg + .endm + + .macro __THUNK_EX_BC disp,reg,ruse +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + exrl 0,556f + j . +#else + larl \ruse,556f + ex 0,0(\ruse) + j . +#endif +556: b \disp(\reg) + .endm + + .macro GEN_BR_THUNK reg,ruse=%r1 + __DECODE_RR __THUNK_PROLOG_BR,\reg,\ruse + __THUNK_EX_BR \reg,\ruse + __THUNK_EPILOG + .endm + + .macro GEN_B_THUNK disp,reg,ruse=%r1 + __DECODE_DRR __THUNK_PROLOG_BC,\disp,\reg,\ruse + __THUNK_EX_BC \disp,\reg,\ruse + __THUNK_EPILOG + .endm + + .macro BR_EX reg,ruse=%r1 +557: __DECODE_RR __THUNK_BR,\reg,\ruse + .pushsection .s390_indirect_branches,"a",@progbits + .long 557b-. + .popsection + .endm + + .macro B_EX disp,reg,ruse=%r1 +558: __DECODE_DRR __THUNK_BC,\disp,\reg,\ruse + .pushsection .s390_indirect_branches,"a",@progbits + .long 558b-. + .popsection + .endm + + .macro BASR_EX rsave,rtarget,ruse=%r1 +559: __DECODE_RRR __THUNK_BRASL,\rsave,\rtarget,\ruse + .pushsection .s390_indirect_branches,"a",@progbits + .long 559b-. + .popsection + .endm + +#else + .macro GEN_BR_THUNK reg,ruse=%r1 + .endm + + .macro GEN_B_THUNK disp,reg,ruse=%r1 + .endm + + .macro BR_EX reg,ruse=%r1 + br \reg + .endm + + .macro B_EX disp,reg,ruse=%r1 + b \disp(\reg) + .endm + + .macro BASR_EX rsave,rtarget,ruse=%r1 + basr \rsave,\rtarget + .endm +#endif + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_NOSPEC_ASM_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 0501cac2ab95..5b139977a3a4 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -63,6 +63,7 @@ obj-y += nospec-branch.o extra-y += head.o head64.o vmlinux.lds +obj-$(CONFIG_SYSFS) += nospec-sysfs.o CFLAGS_REMOVE_nospec-branch.o += $(CC_FLAGS_EXPOLINE) obj-$(CONFIG_MODULES) += module.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f3df9e0a5dec..85c8ead29582 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -175,6 +175,7 @@ int main(void) OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_PASTE, lowcore, paste); + OFFSET(__LC_BR_R1, lowcore, br_r1_trampoline); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S index 326f717df587..61fca549a93b 100644 --- a/arch/s390/kernel/base.S +++ b/arch/s390/kernel/base.S @@ -8,18 +8,22 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> #include <asm/ptrace.h> #include <asm/sigp.h> + GEN_BR_THUNK %r9 + GEN_BR_THUNK %r14 + ENTRY(s390_base_mcck_handler) basr %r13,0 0: lg %r15,__LC_PANIC_STACK # load panic stack aghi %r15,-STACK_FRAME_OVERHEAD larl %r1,s390_base_mcck_handler_fn - lg %r1,0(%r1) - ltgr %r1,%r1 + lg %r9,0(%r1) + ltgr %r9,%r9 jz 1f - basr %r14,%r1 + BASR_EX %r14,%r9 1: la %r1,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) lpswe __LC_MCK_OLD_PSW @@ -36,10 +40,10 @@ ENTRY(s390_base_ext_handler) basr %r13,0 0: aghi %r15,-STACK_FRAME_OVERHEAD larl %r1,s390_base_ext_handler_fn - lg %r1,0(%r1) - ltgr %r1,%r1 + lg %r9,0(%r1) + ltgr %r9,%r9 jz 1f - basr %r14,%r1 + BASR_EX %r14,%r9 1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit lpswe __LC_EXT_OLD_PSW @@ -56,10 +60,10 @@ ENTRY(s390_base_pgm_handler) basr %r13,0 0: aghi %r15,-STACK_FRAME_OVERHEAD larl %r1,s390_base_pgm_handler_fn - lg %r1,0(%r1) - ltgr %r1,%r1 + lg %r9,0(%r1) + ltgr %r9,%r9 jz 1f - basr %r14,%r1 + BASR_EX %r14,%r9 lmg %r0,%r15,__LC_SAVE_AREA_SYNC lpswe __LC_PGM_OLD_PSW 1: lpswe disabled_wait_psw-0b(%r13) @@ -116,7 +120,7 @@ ENTRY(diag308_reset) larl %r4,.Lcontinue_psw # Restore PSW flags lpswe 0(%r4) .Lcontinue: - br %r14 + BR_EX %r14 .align 16 .Lrestart_psw: .long 0x00080000,0x80000000 + .Lrestart_part2 diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 1996afeb2e81..a4fd00064c80 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -24,6 +24,7 @@ #include <asm/setup.h> #include <asm/nmi.h> #include <asm/export.h> +#include <asm/nospec-insn.h> __PT_R0 = __PT_GPRS __PT_R1 = __PT_GPRS + 8 @@ -226,67 +227,9 @@ _PIF_WORK = (_PIF_PER_TRAP) .popsection .endm -#ifdef CONFIG_EXPOLINE - - .macro GEN_BR_THUNK name,reg,tmp - .section .text.\name,"axG",@progbits,\name,comdat - .globl \name - .hidden \name - .type \name,@function -\name: - .cfi_startproc -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - exrl 0,0f -#else - larl \tmp,0f - ex 0,0(\tmp) -#endif - j . -0: br \reg - .cfi_endproc - .endm - - GEN_BR_THUNK __s390x_indirect_jump_r1use_r9,%r9,%r1 - GEN_BR_THUNK __s390x_indirect_jump_r1use_r14,%r14,%r1 - GEN_BR_THUNK __s390x_indirect_jump_r11use_r14,%r14,%r11 - - .macro BASR_R14_R9 -0: brasl %r14,__s390x_indirect_jump_r1use_r9 - .pushsection .s390_indirect_branches,"a",@progbits - .long 0b-. - .popsection - .endm - - .macro BR_R1USE_R14 -0: jg __s390x_indirect_jump_r1use_r14 - .pushsection .s390_indirect_branches,"a",@progbits - .long 0b-. - .popsection - .endm - - .macro BR_R11USE_R14 -0: jg __s390x_indirect_jump_r11use_r14 - .pushsection .s390_indirect_branches,"a",@progbits - .long 0b-. - .popsection - .endm - -#else /* CONFIG_EXPOLINE */ - - .macro BASR_R14_R9 - basr %r14,%r9 - .endm - - .macro BR_R1USE_R14 - br %r14 - .endm - - .macro BR_R11USE_R14 - br %r14 - .endm - -#endif /* CONFIG_EXPOLINE */ - + GEN_BR_THUNK %r9 + GEN_BR_THUNK %r14 + GEN_BR_THUNK %r14,%r11 .section .kprobes.text, "ax" .Ldummy: @@ -303,7 +246,7 @@ _PIF_WORK = (_PIF_PER_TRAP) ENTRY(__bpon) .globl __bpon BPON - BR_R1USE_R14 + BR_EX %r14 /* * Scheduler resume function, called by switch_to @@ -333,7 +276,7 @@ ENTRY(__switch_to) TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPP jz 0f .insn s,0xb2800000,__LC_LPP # set program parameter -0: BR_R1USE_R14 +0: BR_EX %r14 .L__critical_start: @@ -399,7 +342,7 @@ sie_exit: xgr %r5,%r5 lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers lg %r2,__SF_EMPTY+16(%r15) # return exit reason code - BR_R1USE_R14 + BR_EX %r14 .Lsie_fault: lghi %r14,-EFAULT stg %r14,__SF_EMPTY+16(%r15) # set exit reason code @@ -458,7 +401,7 @@ ENTRY(system_call) lgf %r9,0(%r8,%r10) # get system call add. TSTMSK __TI_flags(%r12),_TIF_TRACE jnz .Lsysc_tracesys - BASR_R14_R9 # call sys_xxxx + BASR_EX %r14,%r9 # call sys_xxxx stg %r2,__PT_R2(%r11) # store return value .Lsysc_return: @@ -598,7 +541,7 @@ ENTRY(system_call) lmg %r3,%r7,__PT_R3(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) lg %r2,__PT_ORIG_GPR2(%r11) - BASR_R14_R9 # call sys_xxx + BASR_EX %r14,%r9 # call sys_xxx stg %r2,__PT_R2(%r11) # store return value .Lsysc_tracenogo: TSTMSK __TI_flags(%r12),_TIF_TRACE @@ -622,7 +565,7 @@ ENTRY(ret_from_fork) lmg %r9,%r10,__PT_R9(%r11) # load gprs ENTRY(kernel_thread_starter) la %r2,0(%r10) - BASR_R14_R9 + BASR_EX %r14,%r9 j .Lsysc_tracenogo /* @@ -698,7 +641,7 @@ ENTRY(pgm_check_handler) je .Lpgm_return lgf %r9,0(%r10,%r1) # load address of handler routine lgr %r2,%r11 # pass pointer to pt_regs - BASR_R14_R9 # branch to interrupt-handler + BASR_EX %r14,%r9 # branch to interrupt-handler .Lpgm_return: LOCKDEP_SYS_EXIT tm __PT_PSW+1(%r11),0x01 # returning to user ? @@ -976,7 +919,7 @@ ENTRY(psw_idle) stpt __TIMER_IDLE_ENTER(%r2) .Lpsw_idle_lpsw: lpswe __SF_EMPTY(%r15) - BR_R1USE_R14 + BR_EX %r14 .Lpsw_idle_end: /* @@ -1021,7 +964,7 @@ ENTRY(save_fpu_regs) .Lsave_fpu_regs_done: oi __LC_CPU_FLAGS+7,_CIF_FPU .Lsave_fpu_regs_exit: - BR_R1USE_R14 + BR_EX %r14 .Lsave_fpu_regs_end: #if IS_ENABLED(CONFIG_KVM) EXPORT_SYMBOL(save_fpu_regs) @@ -1071,7 +1014,7 @@ load_fpu_regs: .Lload_fpu_regs_done: ni __LC_CPU_FLAGS+7,255-_CIF_FPU .Lload_fpu_regs_exit: - BR_R1USE_R14 + BR_EX %r14 .Lload_fpu_regs_end: .L__critical_end: @@ -1244,7 +1187,7 @@ cleanup_critical: jl 0f clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end jl .Lcleanup_load_fpu_regs -0: BR_R11USE_R14 +0: BR_EX %r14 .align 8 .Lcleanup_table: @@ -1274,7 +1217,7 @@ cleanup_critical: ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit - BR_R11USE_R14 + BR_EX %r14 #endif .Lcleanup_system_call: @@ -1332,7 +1275,7 @@ cleanup_critical: stg %r15,56(%r11) # r15 stack pointer # set new psw address and exit larl %r9,.Lsysc_do_svc - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_system_call_insn: .quad system_call .quad .Lsysc_stmg @@ -1342,7 +1285,7 @@ cleanup_critical: .Lcleanup_sysc_tif: larl %r9,.Lsysc_tif - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_sysc_restore: # check if stpt has been executed @@ -1359,14 +1302,14 @@ cleanup_critical: mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) 1: lmg %r8,%r9,__LC_RETURN_PSW - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_sysc_restore_insn: .quad .Lsysc_exit_timer .quad .Lsysc_done - 4 .Lcleanup_io_tif: larl %r9,.Lio_tif - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_io_restore: # check if stpt has been executed @@ -1380,7 +1323,7 @@ cleanup_critical: mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) 1: lmg %r8,%r9,__LC_RETURN_PSW - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_io_restore_insn: .quad .Lio_exit_timer .quad .Lio_done - 4 @@ -1433,17 +1376,17 @@ cleanup_critical: # prepare return psw nihh %r8,0xfcfd # clear irq & wait state bits lg %r9,48(%r11) # return from psw_idle - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_idle_insn: .quad .Lpsw_idle_lpsw .Lcleanup_save_fpu_regs: larl %r9,save_fpu_regs - BR_R11USE_R14 + BR_EX %r14,%r11 .Lcleanup_load_fpu_regs: larl %r9,load_fpu_regs - BR_R11USE_R14 + BR_EX %r14,%r11 /* * Integer constants diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 285d6561076d..7ff976737bb1 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -173,10 +173,9 @@ void do_softirq_own_stack(void) new -= STACK_FRAME_OVERHEAD; ((struct stack_frame *) new)->back_chain = old; asm volatile(" la 15,0(%0)\n" - " basr 14,%2\n" + " brasl 14,__do_softirq\n" " la 15,0(%1)\n" - : : "a" (new), "a" (old), - "a" (__do_softirq) + : : "a" (new), "a" (old) : "0", "1", "2", "3", "4", "5", "14", "cc", "memory" ); } else { diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index 9a17e4475d27..be75e8e49e43 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -8,13 +8,17 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> #include <asm/ftrace.h> +#include <asm/nospec-insn.h> #include <asm/ptrace.h> #include <asm/export.h> + GEN_BR_THUNK %r1 + GEN_BR_THUNK %r14 + .section .kprobes.text, "ax" ENTRY(ftrace_stub) - br %r14 + BR_EX %r14 #define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -22,7 +26,7 @@ ENTRY(ftrace_stub) #define STACK_PTREGS_PSW (STACK_PTREGS + __PT_PSW) ENTRY(_mcount) - br %r14 + BR_EX %r14 EXPORT_SYMBOL(_mcount) @@ -52,7 +56,7 @@ ENTRY(ftrace_caller) #endif lgr %r3,%r14 la %r5,STACK_PTREGS(%r15) - basr %r14,%r1 + BASR_EX %r14,%r1 #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. @@ -67,7 +71,7 @@ ftrace_graph_caller_end: #endif lg %r1,(STACK_PTREGS_PSW+8)(%r15) lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) - br %r1 + BR_EX %r1 #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -80,6 +84,6 @@ ENTRY(return_to_handler) aghi %r15,STACK_FRAME_OVERHEAD lgr %r14,%r2 lmg %r2,%r5,32(%r15) - br %r14 + BR_EX %r14 #endif diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index 9f3b5b382743..d5eed651b5ab 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -44,24 +44,6 @@ static int __init nospec_report(void) } arch_initcall(nospec_report); -#ifdef CONFIG_SYSFS -ssize_t cpu_show_spectre_v1(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); -} - -ssize_t cpu_show_spectre_v2(struct device *dev, - struct device_attribute *attr, char *buf) -{ - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) - return sprintf(buf, "Mitigation: execute trampolines\n"); - if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) - return sprintf(buf, "Mitigation: limited branch prediction.\n"); - return sprintf(buf, "Vulnerable\n"); -} -#endif - #ifdef CONFIG_EXPOLINE int nospec_disable = IS_ENABLED(CONFIG_EXPOLINE_OFF); @@ -112,7 +94,6 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) s32 *epo; /* Second part of the instruction replace is always a nop */ - memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x00, 0x00 }, 4); for (epo = start; epo < end; epo++) { instr = (u8 *) epo + *epo; if (instr[0] == 0xc0 && (instr[1] & 0x0f) == 0x04) @@ -133,18 +114,34 @@ static void __init_or_module __nospec_revert(s32 *start, s32 *end) br = thunk + (*(int *)(thunk + 2)) * 2; else continue; - if (br[0] != 0x07 || (br[1] & 0xf0) != 0xf0) + /* Check for unconditional branch 0x07f? or 0x47f???? */ + if ((br[0] & 0xbf) != 0x07 || (br[1] & 0xf0) != 0xf0) continue; + + memcpy(insnbuf + 2, (char[]) { 0x47, 0x00, 0x07, 0x00 }, 4); switch (type) { case BRCL_EXPOLINE: - /* brcl to thunk, replace with br + nop */ insnbuf[0] = br[0]; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + if (br[0] == 0x47) { + /* brcl to b, replace with bc + nopr */ + insnbuf[2] = br[2]; + insnbuf[3] = br[3]; + } else { + /* brcl to br, replace with bcr + nop */ + } break; case BRASL_EXPOLINE: - /* brasl to thunk, replace with basr + nop */ - insnbuf[0] = 0x0d; insnbuf[1] = (instr[1] & 0xf0) | (br[1] & 0x0f); + if (br[0] == 0x47) { + /* brasl to b, replace with bas + nopr */ + insnbuf[0] = 0x4d; + insnbuf[2] = br[2]; + insnbuf[3] = br[3]; + } else { + /* brasl to br, replace with basr + nop */ + insnbuf[0] = 0x0d; + } break; } diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c new file mode 100644 index 000000000000..8affad5f18cb --- /dev/null +++ b/arch/s390/kernel/nospec-sysfs.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/device.h> +#include <linux/cpu.h> +#include <asm/facility.h> +#include <asm/nospec-branch.h> + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + return sprintf(buf, "Mitigation: execute trampolines\n"); + if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) + return sprintf(buf, "Mitigation: limited branch prediction\n"); + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index fcc634c1479a..96e4fcad57bf 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -739,6 +739,10 @@ static int __hw_perf_event_init(struct perf_event *event) */ rate = 0; if (attr->freq) { + if (!attr->sample_freq) { + err = -EINVAL; + goto out; + } rate = freq_to_sample_rate(&si, attr->sample_freq); rate = hw_limit_rate(&si, rate); attr->freq = 0; diff --git a/arch/s390/kernel/reipl.S b/arch/s390/kernel/reipl.S index 89ea8c213d82..70d635da782c 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -6,8 +6,11 @@ #include <linux/linkage.h> #include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> #include <asm/sigp.h> + GEN_BR_THUNK %r9 + # # Issue "store status" for the current CPU to its prefix page # and call passed function afterwards @@ -66,9 +69,9 @@ ENTRY(store_status) st %r4,0(%r1) st %r5,4(%r1) stg %r2,8(%r1) - lgr %r1,%r2 + lgr %r9,%r2 lgr %r2,%r3 - br %r1 + BR_EX %r9 .section .bss .align 8 diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 2d6b6e81f812..4e76aaf7bb38 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -12,6 +12,7 @@ #include <asm/ptrace.h> #include <asm/thread_info.h> #include <asm/asm-offsets.h> +#include <asm/nospec-insn.h> #include <asm/sigp.h> /* @@ -23,6 +24,8 @@ * (see below) in the resume process. * This function runs with disabled interrupts. */ + GEN_BR_THUNK %r14 + .section .text ENTRY(swsusp_arch_suspend) stmg %r6,%r15,__SF_GPRS(%r15) @@ -102,7 +105,7 @@ ENTRY(swsusp_arch_suspend) spx 0x318(%r1) lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 - br %r14 + BR_EX %r14 /* * Restore saved memory image to correct place and restore register context. @@ -200,7 +203,7 @@ pgm_check_entry: lghi %r1,0 sam31 sigp %r1,%r0,SIGP_SET_ARCHITECTURE - basr %r14,%r3 + brasl %r14,_sclp_print_early larl %r3,.Ldisabled_wait_31 lpsw 0(%r3) 4: @@ -266,7 +269,7 @@ restore_registers: /* Return 0 */ lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 - br %r14 + BR_EX %r14 .section .data..nosave,"aw",@progbits .align 8 diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index ced6c9b8f04d..51f842c0a175 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -549,7 +549,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) gpa = scb_o->itdba & ~0xffUL; if (gpa && (scb_s->ecb & 0x10U)) { - if (!(gpa & ~0x1fffU)) { + if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x0080U); goto unpin; } diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index be9fa65bfac4..e7672edc284a 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -6,6 +6,9 @@ #include <linux/linkage.h> #include <asm/export.h> +#include <asm/nospec-insn.h> + + GEN_BR_THUNK %r14 /* * memset implementation @@ -39,7 +42,7 @@ ENTRY(memset) .Lmemset_clear_rest: larl %r3,.Lmemset_xc ex %r4,0(%r3) - br %r14 + BR_EX %r14 .Lmemset_fill: stc %r3,0(%r2) cghi %r4,1 @@ -56,7 +59,7 @@ ENTRY(memset) .Lmemset_fill_rest: larl %r3,.Lmemset_mvc ex %r4,0(%r3) - br %r14 + BR_EX %r14 .Lmemset_xc: xc 0(1,%r1),0(%r1) .Lmemset_mvc: @@ -79,7 +82,7 @@ ENTRY(memcpy) .Lmemcpy_rest: larl %r5,.Lmemcpy_mvc ex %r4,0(%r5) - br %r14 + BR_EX %r14 .Lmemcpy_loop: mvc 0(256,%r1),0(%r3) la %r1,256(%r1) diff --git a/arch/s390/net/bpf_jit.S b/arch/s390/net/bpf_jit.S index a1c917d881ec..fa716f2a95a7 100644 --- a/arch/s390/net/bpf_jit.S +++ b/arch/s390/net/bpf_jit.S @@ -8,6 +8,7 @@ */ #include <linux/linkage.h> +#include <asm/nospec-insn.h> #include "bpf_jit.h" /* @@ -53,7 +54,7 @@ ENTRY(sk_load_##NAME##_pos); \ clg %r3,STK_OFF_HLEN(%r15); /* Offset + SIZE > hlen? */ \ jh sk_load_##NAME##_slow; \ LOAD %r14,-SIZE(%r3,%r12); /* Get data from skb */ \ - b OFF_OK(%r6); /* Return */ \ + B_EX OFF_OK,%r6; /* Return */ \ \ sk_load_##NAME##_slow:; \ lgr %r2,%r7; /* Arg1 = skb pointer */ \ @@ -63,11 +64,14 @@ sk_load_##NAME##_slow:; \ brasl %r14,skb_copy_bits; /* Get data from skb */ \ LOAD %r14,STK_OFF_TMP(%r15); /* Load from temp bufffer */ \ ltgr %r2,%r2; /* Set cc to (%r2 != 0) */ \ - br %r6; /* Return */ + BR_EX %r6; /* Return */ sk_load_common(word, 4, llgf) /* r14 = *(u32 *) (skb->data+offset) */ sk_load_common(half, 2, llgh) /* r14 = *(u16 *) (skb->data+offset) */ + GEN_BR_THUNK %r6 + GEN_B_THUNK OFF_OK,%r6 + /* * Load 1 byte from SKB (optimized version) */ @@ -79,7 +83,7 @@ ENTRY(sk_load_byte_pos) clg %r3,STK_OFF_HLEN(%r15) # Offset >= hlen? jnl sk_load_byte_slow llgc %r14,0(%r3,%r12) # Get byte from skb - b OFF_OK(%r6) # Return OK + B_EX OFF_OK,%r6 # Return OK sk_load_byte_slow: lgr %r2,%r7 # Arg1 = skb pointer @@ -89,7 +93,7 @@ sk_load_byte_slow: brasl %r14,skb_copy_bits # Get data from skb llgc %r14,STK_OFF_TMP(%r15) # Load result from temp buffer ltgr %r2,%r2 # Set cc to (%r2 != 0) - br %r6 # Return cc + BR_EX %r6 # Return cc #define sk_negative_common(NAME, SIZE, LOAD) \ sk_load_##NAME##_slow_neg:; \ @@ -103,7 +107,7 @@ sk_load_##NAME##_slow_neg:; \ jz bpf_error; \ LOAD %r14,0(%r2); /* Get data from pointer */ \ xr %r3,%r3; /* Set cc to zero */ \ - br %r6; /* Return cc */ + BR_EX %r6; /* Return cc */ sk_negative_common(word, 4, llgf) sk_negative_common(half, 2, llgh) @@ -112,4 +116,4 @@ sk_negative_common(byte, 1, llgc) bpf_error: # force a return 0 from jit handler ltgr %r15,%r15 # Set condition code - br %r6 + BR_EX %r6 diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e8dee623d545..e7ce2577f0c9 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -24,6 +24,8 @@ #include <linux/bpf.h> #include <asm/cacheflush.h> #include <asm/dis.h> +#include <asm/facility.h> +#include <asm/nospec-branch.h> #include "bpf_jit.h" int bpf_jit_enable __read_mostly; @@ -41,6 +43,8 @@ struct bpf_jit { int base_ip; /* Base address for literal pool */ int ret0_ip; /* Address of return 0 */ int exit_ip; /* Address of exit */ + int r1_thunk_ip; /* Address of expoline thunk for 'br %r1' */ + int r14_thunk_ip; /* Address of expoline thunk for 'br %r14' */ int tail_call_start; /* Tail call start offset */ int labels[1]; /* Labels for local jumps */ }; @@ -251,6 +255,19 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) REG_SET_SEEN(b2); \ }) +#define EMIT6_PCREL_RILB(op, b, target) \ +({ \ + int rel = (target - jit->prg) / 2; \ + _EMIT6(op | reg_high(b) << 16 | rel >> 16, rel & 0xffff); \ + REG_SET_SEEN(b); \ +}) + +#define EMIT6_PCREL_RIL(op, target) \ +({ \ + int rel = (target - jit->prg) / 2; \ + _EMIT6(op | rel >> 16, rel & 0xffff); \ +}) + #define _EMIT6_IMM(op, imm) \ ({ \ unsigned int __imm = (imm); \ @@ -470,8 +487,45 @@ static void bpf_jit_epilogue(struct bpf_jit *jit) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE); + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + jit->r14_thunk_ip = jit->prg; + /* Generate __s390_indirect_jump_r14 thunk */ + if (test_facility(35)) { + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + } else { + /* larl %r1,.+14 */ + EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14); + /* ex 0,0(%r1) */ + EMIT4_DISP(0x44000000, REG_0, REG_1, 0); + } + /* j . */ + EMIT4_PCREL(0xa7f40000, 0); + } /* br %r14 */ _EMIT2(0x07fe); + + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable && + (jit->seen & SEEN_FUNC)) { + jit->r1_thunk_ip = jit->prg; + /* Generate __s390_indirect_jump_r1 thunk */ + if (test_facility(35)) { + /* exrl %r0,.+10 */ + EMIT6_PCREL_RIL(0xc6000000, jit->prg + 10); + /* j . */ + EMIT4_PCREL(0xa7f40000, 0); + /* br %r1 */ + _EMIT2(0x07f1); + } else { + /* larl %r1,.+14 */ + EMIT6_PCREL_RILB(0xc0000000, REG_1, jit->prg + 14); + /* ex 0,S390_lowcore.br_r1_tampoline */ + EMIT4_DISP(0x44000000, REG_0, REG_0, + offsetof(struct lowcore, br_r1_trampoline)); + /* j . */ + EMIT4_PCREL(0xa7f40000, 0); + } + } } /* @@ -977,8 +1031,13 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* lg %w1,<d(imm)>(%l) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, EMIT_CONST_U64(func)); - /* basr %r14,%w1 */ - EMIT2(0x0d00, REG_14, REG_W1); + if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + /* brasl %r14,__s390_indirect_jump_r1 */ + EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); + } else { + /* basr %r14,%w1 */ + EMIT2(0x0d00, REG_14, REG_W1); + } /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); if (bpf_helper_changes_skb_data((void *)func)) { diff --git a/arch/sh/kernel/entry-common.S b/arch/sh/kernel/entry-common.S index c001f782c5f1..28cc61216b64 100644 --- a/arch/sh/kernel/entry-common.S +++ b/arch/sh/kernel/entry-common.S @@ -255,7 +255,7 @@ debug_trap: mov.l @r8, r8 jsr @r8 nop - bra __restore_all + bra ret_from_exception nop CFI_ENDPROC diff --git a/arch/sparc/include/asm/atomic_64.h b/arch/sparc/include/asm/atomic_64.h index 24827a3f733a..89d299ccdfa6 100644 --- a/arch/sparc/include/asm/atomic_64.h +++ b/arch/sparc/include/asm/atomic_64.h @@ -82,7 +82,11 @@ ATOMIC_OPS(xor) #define atomic64_add_negative(i, v) (atomic64_add_return(i, v) < 0) #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n))) -#define atomic_xchg(v, new) (xchg(&((v)->counter), new)) + +static inline int atomic_xchg(atomic_t *v, int new) +{ + return xchg(&v->counter, new); +} static inline int __atomic_add_unless(atomic_t *v, int a, int u) { diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index b6802b978140..81ad06a1672f 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -952,7 +952,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index c56a195c9071..b2722ed31053 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} + /* * This routine is only called when splitting a THP */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - - pmd_val(entry) &= ~_PAGE_VALID; + pmd_t old, entry; - set_pmd_at(vma->vm_mm, address, pmdp, entry); + entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID); + old = pmdp_establish(vma, address, pmdp, entry); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* @@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, if ((pmd_val(entry) & _PAGE_PMD_HUGE) && !is_huge_zero_page(pmd_page(entry))) (vma->vm_mm)->context.thp_pte_count--; + + return old; } void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cc69e37548db..c0ad1bb27fa2 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -330,7 +330,8 @@ __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, (void *)(unsigned long)pci->romimage, + pci->romsize); return status; free_struct: @@ -436,7 +437,8 @@ __setup_efi_pci64(efi_pci_io_protocol_64 *pci, struct pci_setup_rom **__rom) if (status != EFI_SUCCESS) goto free_struct; - memcpy(rom->romdata, pci->romimage, pci->romsize); + memcpy(rom->romdata, (void *)(unsigned long)pci->romimage, + pci->romsize); return status; free_struct: diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 02e547f9ca3f..655a65eaf105 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1155,16 +1155,13 @@ int x86_perf_event_set_period(struct perf_event *event) per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || - local64_read(&hwc->prev_count) != (u64)-left) { - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); - } + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); /* * Due to erratum on certan cpu we need diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 6f353a874178..815039327932 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2066,16 +2066,23 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 status; int handled; + int pmu_enabled; cpuc = this_cpu_ptr(&cpu_hw_events); /* + * Save the PMU state. + * It needs to be restored when leaving the handler. + */ + pmu_enabled = cpuc->enabled; + /* * No known reason to not always do late ACK, * but just in case do it opt-in. */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); intel_bts_disable_local(); + cpuc->enabled = 0; __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2173,7 +2180,8 @@ again: done: /* Only restore PMU state when it's active. See x86_pmu_disable(). */ - if (cpuc->enabled) + cpuc->enabled = pmu_enabled; + if (pmu_enabled) __intel_pmu_enable_all(0, true); intel_bts_enable_local(); @@ -3019,7 +3027,7 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * Therefore the effective (average) period matches the requested period, * despite coarser hardware granularity. */ -static unsigned bdw_limit_period(struct perf_event *event, unsigned left) +static u64 bdw_limit_period(struct perf_event *event, u64 left) { if ((event->hw.config & INTEL_ARCH_EVENT_MASK) == X86_CONFIG(.event=0xc0, .umask=0x01)) { diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 8e7a3f1df3a5..f26e26e4d84f 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -1110,6 +1110,7 @@ static void setup_pebs_sample_data(struct perf_event *event, if (pebs == NULL) return; + regs->flags &= ~PERF_EFLAGS_EXACT; sample_type = event->attr.sample_type; dsrc = sample_type & PERF_SAMPLE_DATA_SRC; @@ -1154,7 +1155,6 @@ static void setup_pebs_sample_data(struct perf_event *event, */ *regs = *iregs; regs->flags = pebs->flags; - set_linear_ip(regs, pebs->ip); if (sample_type & PERF_SAMPLE_REGS_INTR) { regs->ax = pebs->ax; @@ -1190,13 +1190,22 @@ static void setup_pebs_sample_data(struct perf_event *event, #endif } - if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs->ip = pebs->real_ip; - regs->flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) - regs->flags |= PERF_EFLAGS_EXACT; - else - regs->flags &= ~PERF_EFLAGS_EXACT; + if (event->attr.precise_ip > 1) { + /* Haswell and later have the eventing IP, so use it: */ + if (x86_pmu.intel_cap.pebs_format >= 2) { + set_linear_ip(regs, pebs->real_ip); + regs->flags |= PERF_EFLAGS_EXACT; + } else { + /* Otherwise use PEBS off-by-1 IP: */ + set_linear_ip(regs, pebs->ip); + + /* ... and try to fix it up using the LBR entries: */ + if (intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; + } + } else + set_linear_ip(regs, pebs->ip); + if ((sample_type & PERF_SAMPLE_ADDR) && x86_pmu.intel_cap.pebs_format >= 1) @@ -1263,17 +1272,84 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) return NULL; } +/* + * Special variant of intel_pmu_save_and_restart() for auto-reload. + */ +static int +intel_pmu_save_and_restart_reload(struct perf_event *event, int count) +{ + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; + u64 period = hwc->sample_period; + u64 prev_raw_count, new_raw_count; + s64 new, old; + + WARN_ON(!period); + + /* + * drain_pebs() only happens when the PMU is disabled. + */ + WARN_ON(this_cpu_read(cpu_hw_events.enabled)); + + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); + local64_set(&hwc->prev_count, new_raw_count); + + /* + * Since the counter increments a negative counter value and + * overflows on the sign switch, giving the interval: + * + * [-period, 0] + * + * the difference between two consequtive reads is: + * + * A) value2 - value1; + * when no overflows have happened in between, + * + * B) (0 - value1) + (value2 - (-period)); + * when one overflow happened in between, + * + * C) (0 - value1) + (n - 1) * (period) + (value2 - (-period)); + * when @n overflows happened in between. + * + * Here A) is the obvious difference, B) is the extension to the + * discrete interval, where the first term is to the top of the + * interval and the second term is from the bottom of the next + * interval and C) the extension to multiple intervals, where the + * middle term is the whole intervals covered. + * + * An equivalent of C, by reduction, is: + * + * value2 - value1 + n * period + */ + new = ((s64)(new_raw_count << shift) >> shift); + old = ((s64)(prev_raw_count << shift) >> shift); + local64_add(new - old + count * period, &event->count); + + perf_event_update_userpage(event); + + return 0; +} + static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *base, void *top, int bit, int count) { + struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; struct pt_regs regs; void *at = get_next_pebs_record_by_bit(base, top, bit); - if (!intel_pmu_save_and_restart(event) && - !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + /* + * Now, auto-reload is only enabled in fixed period mode. + * The reload value is always hwc->sample_period. + * May need to change it, if auto-reload is enabled in + * freq mode later. + */ + intel_pmu_save_and_restart_reload(event, count); + } else if (!intel_pmu_save_and_restart(event)) return; while (count > 1) { @@ -1325,8 +1401,11 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; n = top - at; - if (n <= 0) + if (n <= 0) { + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); return; + } __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } @@ -1349,8 +1428,22 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(base >= top)) + if (unlikely(base >= top)) { + /* + * The drain_pebs() could be called twice in a short period + * for auto-reload event in pmu::read(). There are no + * overflows have happened in between. + * It needs to call intel_pmu_save_and_restart_reload() to + * update the event->count for this case. + */ + for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, + x86_pmu.max_pebs_events) { + event = cpuc->events[bit]; + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); + } return; + } for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index bcbb1d2ae10b..f3563179290b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -548,7 +548,7 @@ struct x86_pmu { struct x86_pmu_quirk *quirks; int perfctr_second_write; bool late_ack; - unsigned (*limit_period)(struct perf_event *event, unsigned l); + u64 (*limit_period)(struct perf_event *event, u64 l); /* * sysfs attrs diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a2485311164b..c278f276c9b3 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -197,6 +197,9 @@ #define X86_FEATURE_RETPOLINE ( 7*32+12) /* "" Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* "" AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_MSR_SPEC_CTRL ( 7*32+16) /* "" MSR SPEC_CTRL is implemented */ +#define X86_FEATURE_SSBD ( 7*32+17) /* Speculative Store Bypass Disable */ + #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ @@ -204,6 +207,13 @@ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ #define X86_FEATURE_USE_IBRS_FW ( 7*32+22) /* "" Use IBRS during runtime firmware calls */ +#define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE ( 7*32+23) /* "" Disable Speculative Store Bypass. */ +#define X86_FEATURE_LS_CFG_SSBD ( 7*32+24) /* "" AMD SSBD implementation */ +#define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU is AMD family 0x17 (Zen) */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ @@ -261,9 +271,10 @@ /* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ #define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ #define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ -#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ -#define X86_FEATURE_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ -#define X86_FEATURE_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +#define X86_FEATURE_AMD_IBRS (13*32+14) /* Indirect Branch Restricted Speculation */ +#define X86_FEATURE_AMD_STIBP (13*32+15) /* Single Thread Indirect Branch Predictors */ +#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ /* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ #define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ @@ -299,6 +310,7 @@ #define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */ #define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */ + /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */ #define X86_FEATURE_AVX512_4VNNIW (18*32+ 2) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ @@ -306,6 +318,7 @@ #define X86_FEATURE_SPEC_CTRL (18*32+26) /* "" Speculation Control (IBRS + IBPB) */ #define X86_FEATURE_INTEL_STIBP (18*32+27) /* "" Single Thread Indirect Branch Predictors */ #define X86_FEATURE_ARCH_CAPABILITIES (18*32+29) /* IA32_ARCH_CAPABILITIES MSR (Intel) */ +#define X86_FEATURE_SPEC_CTRL_SSBD (18*32+31) /* "" Speculative Store Bypass Disable */ /* * BUG word(s) @@ -335,5 +348,6 @@ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ #define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ +#define X86_BUG_SPEC_STORE_BYPASS X86_BUG(17) /* CPU is affected by speculative store bypass attack */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index 39bcefc20de7..bb078786a323 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -68,6 +68,11 @@ struct legacy_pic { extern struct legacy_pic *legacy_pic; extern struct legacy_pic null_legacy_pic; +static inline bool has_legacy_pic(void) +{ + return legacy_pic != &null_legacy_pic; +} + static inline int nr_legacy_irqs(void) { return legacy_pic->nr_legacy_irqs; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 20cfeeb681c6..7598a6c26f76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -864,7 +864,7 @@ struct kvm_x86_ops { int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); - bool (*cpu_has_high_real_mode_segbase)(void); + bool (*has_emulated_msr)(int index); void (*cpuid_update)(struct kvm_vcpu *vcpu); int (*vm_init)(struct kvm *kvm); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 5a295bb97103..733650874b30 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -113,7 +113,7 @@ static inline int init_new_context(struct task_struct *tsk, #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { - /* pkey 0 is the default and always allocated */ + /* pkey 0 is the default and allocated implicitly */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index c768bc1550a1..1ec13e253174 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -40,6 +40,8 @@ #define MSR_IA32_SPEC_CTRL 0x00000048 /* Speculation Control */ #define SPEC_CTRL_IBRS (1 << 0) /* Indirect Branch Restricted Speculation */ #define SPEC_CTRL_STIBP (1 << 1) /* Single Thread Indirect Branch Predictors */ +#define SPEC_CTRL_SSBD_SHIFT 2 /* Speculative Store Bypass Disable bit */ +#define SPEC_CTRL_SSBD (1 << SPEC_CTRL_SSBD_SHIFT) /* Speculative Store Bypass Disable */ #define MSR_IA32_PRED_CMD 0x00000049 /* Prediction Command */ #define PRED_CMD_IBPB (1 << 0) /* Indirect Branch Prediction Barrier */ @@ -61,6 +63,11 @@ #define MSR_IA32_ARCH_CAPABILITIES 0x0000010a #define ARCH_CAP_RDCL_NO (1 << 0) /* Not susceptible to Meltdown */ #define ARCH_CAP_IBRS_ALL (1 << 1) /* Enhanced IBRS support */ +#define ARCH_CAP_SSB_NO (1 << 4) /* + * Not susceptible to Speculative Store Bypass + * attack, so no Speculative Store Bypass + * control required. + */ #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e @@ -135,6 +142,7 @@ /* DEBUGCTLMSR bits (others vary by model): */ #define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF_SHIFT 1 #define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ #define DEBUGCTLMSR_TR (1UL << 6) #define DEBUGCTLMSR_BTS (1UL << 7) @@ -315,6 +323,8 @@ #define MSR_AMD64_IBSOPDATA4 0xc001103d #define MSR_AMD64_IBS_REG_COUNT_MAX 8 /* includes MSR_AMD64_IBSBRTARGET */ +#define MSR_AMD64_VIRT_SPEC_CTRL 0xc001011f + /* Fam 17h MSRs */ #define MSR_F17H_IRPERF 0xc00000e9 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index f928ad9b143f..8b38df98548e 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -217,6 +217,14 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* The Speculative Store Bypass disable variants */ +enum ssb_mitigation { + SPEC_STORE_BYPASS_NONE, + SPEC_STORE_BYPASS_DISABLE, + SPEC_STORE_BYPASS_PRCTL, + SPEC_STORE_BYPASS_SECCOMP, +}; + extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; @@ -241,22 +249,27 @@ static inline void vmexit_fill_RSB(void) #endif } -#define alternative_msr_write(_msr, _val, _feature) \ - asm volatile(ALTERNATIVE("", \ - "movl %[msr], %%ecx\n\t" \ - "movl %[val], %%eax\n\t" \ - "movl $0, %%edx\n\t" \ - "wrmsr", \ - _feature) \ - : : [msr] "i" (_msr), [val] "i" (_val) \ - : "eax", "ecx", "edx", "memory") +static __always_inline +void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) +{ + asm volatile(ALTERNATIVE("", "wrmsr", %c[feature]) + : : "c" (msr), + "a" ((u32)val), + "d" ((u32)(val >> 32)), + [feature] "i" (feature) + : "memory"); +} static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, PRED_CMD_IBPB, - X86_FEATURE_USE_IBPB); + u64 val = PRED_CMD_IBPB; + + alternative_msr_write(MSR_IA32_PRED_CMD, val, X86_FEATURE_USE_IBPB); } +/* The Intel SPEC CTRL MSR base value cache */ +extern u64 x86_spec_ctrl_base; + /* * With retpoline, we must use IBRS to restrict branch prediction * before calling into firmware. @@ -265,14 +278,18 @@ static inline void indirect_branch_prediction_barrier(void) */ #define firmware_restrict_branch_speculation_start() \ do { \ + u64 val = x86_spec_ctrl_base | SPEC_CTRL_IBRS; \ + \ preempt_disable(); \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, SPEC_CTRL_IBRS, \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ } while (0) #define firmware_restrict_branch_speculation_end() \ do { \ - alternative_msr_write(MSR_IA32_SPEC_CTRL, 0, \ + u64 val = x86_spec_ctrl_base; \ + \ + alternative_msr_write(MSR_IA32_SPEC_CTRL, val, \ X86_FEATURE_USE_IBRS_FW); \ preempt_enable(); \ } while (0) diff --git a/arch/x86/include/asm/pkeys.h b/arch/x86/include/asm/pkeys.h index b3b09b98896d..c50d6dcf4a22 100644 --- a/arch/x86/include/asm/pkeys.h +++ b/arch/x86/include/asm/pkeys.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H +#define ARCH_DEFAULT_PKEY 0 + #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, @@ -14,7 +16,7 @@ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!boot_cpu_has(X86_FEATURE_OSPKE)) - return 0; + return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } @@ -48,13 +50,21 @@ bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned - * from pkey_alloc(). pkey 0 is special, and never - * returned from pkey_alloc(). + * from pkey_alloc() or pkey 0 which is allocated + * implicitly when the mm is created. */ - if (pkey <= 0) + if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; + /* + * The exec-only pkey is set in the allocation map, but + * is not available to any of the user interfaces like + * mprotect_pkey(). + */ + if (pkey == mm->context.execute_only_pkey) + return false; + return mm_pkey_allocation_map(mm) & (1U << pkey); } diff --git a/arch/x86/include/asm/spec-ctrl.h b/arch/x86/include/asm/spec-ctrl.h new file mode 100644 index 000000000000..ae7c2c5cd7f0 --- /dev/null +++ b/arch/x86/include/asm/spec-ctrl.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SPECCTRL_H_ +#define _ASM_X86_SPECCTRL_H_ + +#include <linux/thread_info.h> +#include <asm/nospec-branch.h> + +/* + * On VMENTER we must preserve whatever view of the SPEC_CTRL MSR + * the guest has, while on VMEXIT we restore the host view. This + * would be easier if SPEC_CTRL were architecturally maskable or + * shadowable for guests but this is not (currently) the case. + * Takes the guest view of SPEC_CTRL MSR as a parameter and also + * the guest's version of VIRT_SPEC_CTRL, if emulated. + */ +extern void x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool guest); + +/** + * x86_spec_ctrl_set_guest - Set speculation control registers for the guest + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_set_guest(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, true); +} + +/** + * x86_spec_ctrl_restore_host - Restore host speculation control registers + * @guest_spec_ctrl: The guest content of MSR_SPEC_CTRL + * @guest_virt_spec_ctrl: The guest controlled bits of MSR_VIRT_SPEC_CTRL + * (may get translated to MSR_AMD64_LS_CFG bits) + * + * Avoids writing to the MSR if the content/bits are the same + */ +static inline +void x86_spec_ctrl_restore_host(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl) +{ + x86_virt_spec_ctrl(guest_spec_ctrl, guest_virt_spec_ctrl, false); +} + +/* AMD specific Speculative Store Bypass MSR data */ +extern u64 x86_amd_ls_cfg_base; +extern u64 x86_amd_ls_cfg_ssbd_mask; + +static inline u64 ssbd_tif_to_spec_ctrl(u64 tifn) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (tifn & _TIF_SSBD) >> (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline unsigned long ssbd_spec_ctrl_to_tif(u64 spec_ctrl) +{ + BUILD_BUG_ON(TIF_SSBD < SPEC_CTRL_SSBD_SHIFT); + return (spec_ctrl & SPEC_CTRL_SSBD) << (TIF_SSBD - SPEC_CTRL_SSBD_SHIFT); +} + +static inline u64 ssbd_tif_to_amd_ls_cfg(u64 tifn) +{ + return (tifn & _TIF_SSBD) ? x86_amd_ls_cfg_ssbd_mask : 0ULL; +} + +#ifdef CONFIG_SMP +extern void speculative_store_bypass_ht_init(void); +#else +static inline void speculative_store_bypass_ht_init(void) { } +#endif + +extern void speculative_store_bypass_update(unsigned long tif); + +static inline void speculative_store_bypass_update_current(void) +{ + speculative_store_bypass_update(current_thread_info()->flags); +} + +#endif diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b65c6705d6b9..51b0c652d02f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -90,6 +90,7 @@ struct thread_info { #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +#define TIF_SSBD 5 /* Reduced data speculation */ #define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ @@ -112,8 +113,9 @@ struct thread_info { #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) -#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) +#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) +#define _TIF_SSBD (1 << TIF_SSBD) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) @@ -148,7 +150,7 @@ struct thread_info { /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ - (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) + (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP|_TIF_SSBD) #define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY) #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 99185a064978..686a58d793e5 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -111,6 +111,16 @@ static inline void cr4_clear_bits(unsigned long mask) } } +static inline void cr4_toggle_bits(unsigned long mask) +{ + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + cr4 ^= mask; + this_cpu_write(cpu_tlbstate.cr4, cr4); + __write_cr4(cr4); +} + /* Read the CR4 shadow. */ static inline unsigned long cr4_read_shadow(void) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index c6583efdbdaf..76cf21f887bd 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1403,7 +1403,7 @@ void setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!cpu && (pic_mode || !value)) { + if (!cpu && (pic_mode || !value || skip_ioapic_setup)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu); } else { diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c375bc672f82..4c2be99fa0fb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> +#include <asm/spec-ctrl.h> #include <asm/smp.h> #include <asm/pci-direct.h> #include <asm/delay.h> @@ -542,6 +543,26 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) rdmsrl(MSR_FAM10H_NODE_ID, value); nodes_per_socket = ((value >> 3) & 7) + 1; } + + if (c->x86 >= 0x15 && c->x86 <= 0x17) { + unsigned int bit; + + switch (c->x86) { + case 0x15: bit = 54; break; + case 0x16: bit = 33; break; + case 0x17: bit = 10; break; + default: return; + } + /* + * Try to cache the base value so further operations can + * avoid RMW. If that faults, do not enable SSBD. + */ + if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); + setup_force_cpu_cap(X86_FEATURE_SSBD); + x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; + } + } } static void early_init_amd(struct cpuinfo_x86 *c) @@ -728,6 +749,17 @@ static void init_amd_bd(struct cpuinfo_x86 *c) } } +static void init_amd_zn(struct cpuinfo_x86 *c) +{ + set_cpu_cap(c, X86_FEATURE_ZEN); + /* + * Fix erratum 1076: CPB feature bit not being set in CPUID. It affects + * all up to and including B1. + */ + if (c->x86_model <= 1 && c->x86_stepping <= 1) + set_cpu_cap(c, X86_FEATURE_CPB); +} + static void init_amd(struct cpuinfo_x86 *c) { u32 dummy; @@ -758,6 +790,7 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x10: init_amd_gh(c); break; case 0x12: init_amd_ln(c); break; case 0x15: init_amd_bd(c); break; + case 0x17: init_amd_zn(c); break; } /* Enable workaround for FXSAVE leak */ @@ -824,8 +857,9 @@ static void init_amd(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); - /* AMD CPUs don't reset SS attributes on SYSRET */ - set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ + if (!cpu_has(c, X86_FEATURE_XENPV)) + set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b8b0b6e78371..86af9b1b049d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -11,8 +11,10 @@ #include <linux/utsname.h> #include <linux/cpu.h> #include <linux/module.h> +#include <linux/nospec.h> +#include <linux/prctl.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> @@ -26,6 +28,27 @@ #include <asm/intel-family.h> static void __init spectre_v2_select_mitigation(void); +static void __init ssb_select_mitigation(void); + +/* + * Our boot-time value of the SPEC_CTRL MSR. We read it once so that any + * writes to SPEC_CTRL contain whatever reserved bits have been set. + */ +u64 __ro_after_init x86_spec_ctrl_base; +EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); + +/* + * The vendor and possibly platform specific bits which can be modified in + * x86_spec_ctrl_base. + */ +static u64 __ro_after_init x86_spec_ctrl_mask = SPEC_CTRL_IBRS; + +/* + * AMD specific MSR info for Speculative Store Bypass control. + * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). + */ +u64 __ro_after_init x86_amd_ls_cfg_base; +u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; void __init check_bugs(void) { @@ -36,9 +59,27 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* + * Read the SPEC_CTRL MSR to account for reserved bits which may + * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD + * init code as it is not enumerated and depends on the family. + */ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + /* Allow STIBP in MSR_SPEC_CTRL if supported */ + if (boot_cpu_has(X86_FEATURE_STIBP)) + x86_spec_ctrl_mask |= SPEC_CTRL_STIBP; + /* Select the proper spectre mitigation before patching alternatives */ spectre_v2_select_mitigation(); + /* + * Select proper mitigation for any exposure to the Speculative Store + * Bypass vulnerability. + */ + ssb_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -92,7 +133,76 @@ static const char *spectre_v2_strings[] = { #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt -static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = + SPECTRE_V2_NONE; + +void +x86_virt_spec_ctrl(u64 guest_spec_ctrl, u64 guest_virt_spec_ctrl, bool setguest) +{ + u64 msrval, guestval, hostval = x86_spec_ctrl_base; + struct thread_info *ti = current_thread_info(); + + /* Is MSR_SPEC_CTRL implemented ? */ + if (static_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) { + /* + * Restrict guest_spec_ctrl to supported values. Clear the + * modifiable bits in the host base value and or the + * modifiable bits from the guest value. + */ + guestval = hostval & ~x86_spec_ctrl_mask; + guestval |= guest_spec_ctrl & x86_spec_ctrl_mask; + + /* SSBD controlled in MSR_SPEC_CTRL */ + if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD)) + hostval |= ssbd_tif_to_spec_ctrl(ti->flags); + + if (hostval != guestval) { + msrval = setguest ? guestval : hostval; + wrmsrl(MSR_IA32_SPEC_CTRL, msrval); + } + } + + /* + * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update + * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. + */ + if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && + !static_cpu_has(X86_FEATURE_VIRT_SSBD)) + return; + + /* + * If the host has SSBD mitigation enabled, force it in the host's + * virtual MSR value. If its not permanently enabled, evaluate + * current's TIF_SSBD thread flag. + */ + if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) + hostval = SPEC_CTRL_SSBD; + else + hostval = ssbd_tif_to_spec_ctrl(ti->flags); + + /* Sanitize the guest value */ + guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; + + if (hostval != guestval) { + unsigned long tif; + + tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : + ssbd_spec_ctrl_to_tif(hostval); + + speculative_store_bypass_update(tif); + } +} +EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); + +static void x86_amd_ssb_disable(void) +{ + u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; + + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + wrmsrl(MSR_AMD64_LS_CFG, msrval); +} #ifdef RETPOLINE static bool spectre_v2_bad_module; @@ -311,32 +421,289 @@ retpoline_auto: } #undef pr_fmt +#define pr_fmt(fmt) "Speculative Store Bypass: " fmt + +static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; + +/* The kernel command line selection */ +enum ssb_mitigation_cmd { + SPEC_STORE_BYPASS_CMD_NONE, + SPEC_STORE_BYPASS_CMD_AUTO, + SPEC_STORE_BYPASS_CMD_ON, + SPEC_STORE_BYPASS_CMD_PRCTL, + SPEC_STORE_BYPASS_CMD_SECCOMP, +}; + +static const char *ssb_strings[] = { + [SPEC_STORE_BYPASS_NONE] = "Vulnerable", + [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", + [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", + [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", +}; + +static const struct { + const char *option; + enum ssb_mitigation_cmd cmd; +} ssb_mitigation_options[] = { + { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ + { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ + { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ + { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ + { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ +}; + +static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) +{ + enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; + char arg[20]; + int ret, i; + + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + return SPEC_STORE_BYPASS_CMD_NONE; + } else { + ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", + arg, sizeof(arg)); + if (ret < 0) + return SPEC_STORE_BYPASS_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { + if (!match_option(arg, ret, ssb_mitigation_options[i].option)) + continue; + + cmd = ssb_mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(ssb_mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", arg); + return SPEC_STORE_BYPASS_CMD_AUTO; + } + } + + return cmd; +} + +static enum ssb_mitigation __init __ssb_select_mitigation(void) +{ + enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; + enum ssb_mitigation_cmd cmd; + + if (!boot_cpu_has(X86_FEATURE_SSBD)) + return mode; + + cmd = ssb_parse_cmdline(); + if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && + (cmd == SPEC_STORE_BYPASS_CMD_NONE || + cmd == SPEC_STORE_BYPASS_CMD_AUTO)) + return mode; + + switch (cmd) { + case SPEC_STORE_BYPASS_CMD_AUTO: + case SPEC_STORE_BYPASS_CMD_SECCOMP: + /* + * Choose prctl+seccomp as the default mode if seccomp is + * enabled. + */ + if (IS_ENABLED(CONFIG_SECCOMP)) + mode = SPEC_STORE_BYPASS_SECCOMP; + else + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_ON: + mode = SPEC_STORE_BYPASS_DISABLE; + break; + case SPEC_STORE_BYPASS_CMD_PRCTL: + mode = SPEC_STORE_BYPASS_PRCTL; + break; + case SPEC_STORE_BYPASS_CMD_NONE: + break; + } + + /* + * We have three CPU feature flags that are in play here: + * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. + * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass + * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation + */ + if (mode == SPEC_STORE_BYPASS_DISABLE) { + setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); + /* + * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD uses + * a completely different MSR and bit dependent on family. + */ + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + x86_spec_ctrl_base |= SPEC_CTRL_SSBD; + x86_spec_ctrl_mask |= SPEC_CTRL_SSBD; + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + break; + case X86_VENDOR_AMD: + x86_amd_ssb_disable(); + break; + } + } + + return mode; +} + +static void ssb_select_mitigation(void) +{ + ssb_mode = __ssb_select_mitigation(); + + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +#undef pr_fmt +#define pr_fmt(fmt) "Speculation prctl: " fmt + +static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) +{ + bool update; + + if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && + ssb_mode != SPEC_STORE_BYPASS_SECCOMP) + return -ENXIO; + + switch (ctrl) { + case PR_SPEC_ENABLE: + /* If speculation is force disabled, enable is not allowed */ + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_clear_spec_ssb_disable(task); + update = test_and_clear_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_DISABLE: + task_set_spec_ssb_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + case PR_SPEC_FORCE_DISABLE: + task_set_spec_ssb_disable(task); + task_set_spec_ssb_force_disable(task); + update = !test_and_set_tsk_thread_flag(task, TIF_SSBD); + break; + default: + return -ERANGE; + } + + /* + * If being set on non-current task, delay setting the CPU + * mitigation until it is next scheduled. + */ + if (task == current && update) + speculative_store_bypass_update_current(); + + return 0; +} + +int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, + unsigned long ctrl) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_set(task, ctrl); + default: + return -ENODEV; + } +} + +#ifdef CONFIG_SECCOMP +void arch_seccomp_spec_mitigate(struct task_struct *task) +{ + if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) + ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); +} +#endif + +static int ssb_prctl_get(struct task_struct *task) +{ + switch (ssb_mode) { + case SPEC_STORE_BYPASS_DISABLE: + return PR_SPEC_DISABLE; + case SPEC_STORE_BYPASS_SECCOMP: + case SPEC_STORE_BYPASS_PRCTL: + if (task_spec_ssb_force_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_disable(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE; + return PR_SPEC_PRCTL | PR_SPEC_ENABLE; + default: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + return PR_SPEC_ENABLE; + return PR_SPEC_NOT_AFFECTED; + } +} + +int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) +{ + switch (which) { + case PR_SPEC_STORE_BYPASS: + return ssb_prctl_get(task); + default: + return -ENODEV; + } +} + +void x86_spec_ctrl_setup_ap(void) +{ + if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) + wrmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) + x86_amd_ssb_disable(); +} #ifdef CONFIG_SYSFS -ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) + +static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, + char *buf, unsigned int bug) { - if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + if (!boot_cpu_has_bug(bug)) return sprintf(buf, "Not affected\n"); - if (boot_cpu_has(X86_FEATURE_KAISER)) - return sprintf(buf, "Mitigation: PTI\n"); + + switch (bug) { + case X86_BUG_CPU_MELTDOWN: + if (boot_cpu_has(X86_FEATURE_KAISER)) + return sprintf(buf, "Mitigation: PTI\n"); + + break; + + case X86_BUG_SPECTRE_V1: + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + + case X86_BUG_SPECTRE_V2: + return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], + boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", + boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", + spectre_v2_module_string()); + + case X86_BUG_SPEC_STORE_BYPASS: + return sprintf(buf, "%s\n", ssb_strings[ssb_mode]); + + default: + break; + } + return sprintf(buf, "Vulnerable\n"); } +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); +} + ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) - return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Mitigation: __user pointer sanitization\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - return sprintf(buf, "Not affected\n"); + return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); +} - return sprintf(buf, "%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], - boot_cpu_has(X86_FEATURE_USE_IBPB) ? ", IBPB" : "", - boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", - spectre_v2_module_string()); +ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 301bbd1f2373..b0fd028b2eee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -725,17 +725,32 @@ static void init_speculation_control(struct cpuinfo_x86 *c) * and they also have a different bit for STIBP support. Also, * a hypervisor might have set the individual AMD bits even on * Intel CPUs, for finer-grained selection of what's available. - * - * We use the AMD bits in 0x8000_0008 EBX as the generic hardware - * features, which are visible in /proc/cpuinfo and used by the - * kernel. So set those accordingly from the Intel bits. */ if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { set_cpu_cap(c, X86_FEATURE_IBRS); set_cpu_cap(c, X86_FEATURE_IBPB); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); } + if (cpu_has(c, X86_FEATURE_INTEL_STIBP)) set_cpu_cap(c, X86_FEATURE_STIBP); + + if (cpu_has(c, X86_FEATURE_SPEC_CTRL_SSBD) || + cpu_has(c, X86_FEATURE_VIRT_SSBD)) + set_cpu_cap(c, X86_FEATURE_SSBD); + + if (cpu_has(c, X86_FEATURE_AMD_IBRS)) { + set_cpu_cap(c, X86_FEATURE_IBRS); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } + + if (cpu_has(c, X86_FEATURE_AMD_IBPB)) + set_cpu_cap(c, X86_FEATURE_IBPB); + + if (cpu_has(c, X86_FEATURE_AMD_STIBP)) { + set_cpu_cap(c, X86_FEATURE_STIBP); + set_cpu_cap(c, X86_FEATURE_MSR_SPEC_CTRL); + } } void get_cpu_cap(struct cpuinfo_x86 *c) @@ -879,21 +894,55 @@ static const __initconst struct x86_cpu_id cpu_no_meltdown[] = { {} }; -static bool __init cpu_vulnerable_to_meltdown(struct cpuinfo_x86 *c) +static const __initconst struct x86_cpu_id cpu_no_spec_store_bypass[] = { + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PINEVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_LINCROFT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_PENWELL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CLOVERVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_CEDARVIEW }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT1 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_AIRMONT }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_SILVERMONT2 }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_ATOM_MERRIFIELD }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_CORE_YONAH }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNL }, + { X86_VENDOR_INTEL, 6, INTEL_FAM6_XEON_PHI_KNM }, + { X86_VENDOR_CENTAUR, 5, }, + { X86_VENDOR_INTEL, 5, }, + { X86_VENDOR_NSC, 5, }, + { X86_VENDOR_AMD, 0x12, }, + { X86_VENDOR_AMD, 0x11, }, + { X86_VENDOR_AMD, 0x10, }, + { X86_VENDOR_AMD, 0xf, }, + { X86_VENDOR_ANY, 4, }, + {} +}; + +static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = 0; - if (x86_match_cpu(cpu_no_meltdown)) - return false; - if (cpu_has(c, X86_FEATURE_ARCH_CAPABILITIES)) rdmsrl(MSR_IA32_ARCH_CAPABILITIES, ia32_cap); + if (!x86_match_cpu(cpu_no_spec_store_bypass) && + !(ia32_cap & ARCH_CAP_SSB_NO)) + setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS); + + if (x86_match_cpu(cpu_no_speculation)) + return; + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + + if (x86_match_cpu(cpu_no_meltdown)) + return; + /* Rogue Data Cache Load? No! */ if (ia32_cap & ARCH_CAP_RDCL_NO) - return false; + return; - return true; + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); } /* @@ -942,12 +991,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - if (!x86_match_cpu(cpu_no_speculation)) { - if (cpu_vulnerable_to_meltdown(c)) - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); - setup_force_cpu_bug(X86_BUG_SPECTRE_V1); - setup_force_cpu_bug(X86_BUG_SPECTRE_V2); - } + cpu_set_bug_bits(c); fpu__init_system(c); @@ -1315,6 +1359,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) #endif mtrr_ap_init(); validate_apic_and_package_id(c); + x86_spec_ctrl_setup_ap(); } struct msr_range { diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 2584265d4745..3b19d82f7932 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -46,4 +46,7 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +extern void x86_spec_ctrl_setup_ap(void); + #endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 8fb1d6522f8e..93781e3f05b2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -153,7 +153,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) setup_clear_cpu_cap(X86_FEATURE_IBPB); setup_clear_cpu_cap(X86_FEATURE_STIBP); setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL); + setup_clear_cpu_cap(X86_FEATURE_MSR_SPEC_CTRL); setup_clear_cpu_cap(X86_FEATURE_INTEL_STIBP); + setup_clear_cpu_cap(X86_FEATURE_SSBD); + setup_clear_cpu_cap(X86_FEATURE_SPEC_CTRL_SSBD); } /* diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3fe45f84ced4..7a07b15b451c 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -11,6 +11,7 @@ #include <linux/of_address.h> #include <linux/of_platform.h> #include <linux/of_irq.h> +#include <linux/libfdt.h> #include <linux/slab.h> #include <linux/pci.h> #include <linux/of_pci.h> @@ -199,19 +200,22 @@ static struct of_ioapic_type of_ioapic_type[] = static int dt_irqdomain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *arg) { - struct of_phandle_args *irq_data = (void *)arg; + struct irq_fwspec *fwspec = (struct irq_fwspec *)arg; struct of_ioapic_type *it; struct irq_alloc_info tmp; + int type_index; - if (WARN_ON(irq_data->args_count < 2)) + if (WARN_ON(fwspec->param_count < 2)) return -EINVAL; - if (irq_data->args[1] >= ARRAY_SIZE(of_ioapic_type)) + + type_index = fwspec->param[1]; + if (type_index >= ARRAY_SIZE(of_ioapic_type)) return -EINVAL; - it = &of_ioapic_type[irq_data->args[1]]; + it = &of_ioapic_type[type_index]; ioapic_set_alloc_attr(&tmp, NUMA_NO_NODE, it->trigger, it->polarity); tmp.ioapic_id = mpc_ioapic_id(mp_irqdomain_ioapic_idx(domain)); - tmp.ioapic_pin = irq_data->args[0]; + tmp.ioapic_pin = fwspec->param[0]; return mp_irqdomain_alloc(domain, virq, nr_irqs, &tmp); } @@ -276,14 +280,15 @@ static void __init x86_flattree_get_config(void) map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK), (u64)128); - initial_boot_params = dt = early_memremap(initial_dtb, map_len); - size = of_get_flat_dt_size(); + dt = early_memremap(initial_dtb, map_len); + size = fdt_totalsize(dt); if (map_len < size) { early_memunmap(dt, map_len); - initial_boot_params = dt = early_memremap(initial_dtb, size); + dt = early_memremap(initial_dtb, size); map_len = size; } + early_init_dt_verify(dt); unflatten_and_copy_device_tree(); early_memunmap(dt, map_len); } diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..fd7e9937ddd6 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -71,12 +71,17 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { free_page((unsigned long)image->arch.pgd); + image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); + image->arch.pmd0 = NULL; free_page((unsigned long)image->arch.pmd1); + image->arch.pmd1 = NULL; #endif free_page((unsigned long)image->arch.pte0); + image->arch.pte0 = NULL; free_page((unsigned long)image->arch.pte1); + image->arch.pte1 = NULL; } static int machine_kexec_alloc_page_tables(struct kimage *image) @@ -93,7 +98,6 @@ static int machine_kexec_alloc_page_tables(struct kimage *image) !image->arch.pmd0 || !image->arch.pmd1 || #endif !image->arch.pte0 || !image->arch.pte1) { - machine_kexec_free_page_tables(image); return -ENOMEM; } return 0; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a5784a14f8d1..eae59cad0b07 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -37,8 +37,11 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { free_page((unsigned long)image->arch.pud); + image->arch.pud = NULL; free_page((unsigned long)image->arch.pmd); + image->arch.pmd = NULL; free_page((unsigned long)image->arch.pte); + image->arch.pte = NULL; } static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) @@ -79,7 +82,6 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); return 0; err: - free_transition_pgtable(image); return result; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a55b32007785..00a9047539d7 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -33,6 +33,7 @@ #include <asm/mce.h> #include <asm/vm86.h> #include <asm/switch_to.h> +#include <asm/spec-ctrl.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -134,11 +135,6 @@ void flush_thread(void) fpu__clear(&tsk->thread.fpu); } -static void hard_disable_TSC(void) -{ - cr4_set_bits(X86_CR4_TSD); -} - void disable_TSC(void) { preempt_disable(); @@ -147,15 +143,10 @@ void disable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_disable_TSC(); + cr4_set_bits(X86_CR4_TSD); preempt_enable(); } -static void hard_enable_TSC(void) -{ - cr4_clear_bits(X86_CR4_TSD); -} - static void enable_TSC(void) { preempt_disable(); @@ -164,7 +155,7 @@ static void enable_TSC(void) * Must flip the CPU state synchronously with * TIF_NOTSC in the current running context. */ - hard_enable_TSC(); + cr4_clear_bits(X86_CR4_TSD); preempt_enable(); } @@ -192,48 +183,199 @@ int set_tsc_mode(unsigned int val) return 0; } -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) +static inline void switch_to_bitmap(struct tss_struct *tss, + struct thread_struct *prev, + struct thread_struct *next, + unsigned long tifp, unsigned long tifn) { - struct thread_struct *prev, *next; - - prev = &prev_p->thread; - next = &next_p->thread; - - if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ - test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { - unsigned long debugctl = get_debugctlmsr(); - - debugctl &= ~DEBUGCTLMSR_BTF; - if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) - debugctl |= DEBUGCTLMSR_BTF; - - update_debugctlmsr(debugctl); - } - - if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ - test_tsk_thread_flag(next_p, TIF_NOTSC)) { - /* prev and next are different */ - if (test_tsk_thread_flag(next_p, TIF_NOTSC)) - hard_disable_TSC(); - else - hard_enable_TSC(); - } - - if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { + if (tifn & _TIF_IO_BITMAP) { /* * Copy the relevant range of the IO bitmap. * Normally this is 128 bytes or less: */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, max(prev->io_bitmap_max, next->io_bitmap_max)); - } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) { + } else if (tifp & _TIF_IO_BITMAP) { /* * Clear any possible leftover bits: */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } +} + +#ifdef CONFIG_SMP + +struct ssb_state { + struct ssb_state *shared_state; + raw_spinlock_t lock; + unsigned int disable_state; + unsigned long local_state; +}; + +#define LSTATE_SSB 0 + +static DEFINE_PER_CPU(struct ssb_state, ssb_state); + +void speculative_store_bypass_ht_init(void) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + st->local_state = 0; + + /* + * Shared state setup happens once on the first bringup + * of the CPU. It's not destroyed on CPU hotunplug. + */ + if (st->shared_state) + return; + + raw_spin_lock_init(&st->lock); + + /* + * Go over HT siblings and check whether one of them has set up the + * shared state pointer already. + */ + for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) { + if (cpu == this_cpu) + continue; + + if (!per_cpu(ssb_state, cpu).shared_state) + continue; + + /* Link it to the state of the sibling: */ + st->shared_state = per_cpu(ssb_state, cpu).shared_state; + return; + } + + /* + * First HT sibling to come up on the core. Link shared state of + * the first HT sibling to itself. The siblings on the same core + * which come up later will see the shared state pointer and link + * themself to the state of this CPU. + */ + st->shared_state = st; +} + +/* + * Logic is: First HT sibling enables SSBD for both siblings in the core + * and last sibling to disable it, disables it for the whole core. This how + * MSR_SPEC_CTRL works in "hardware": + * + * CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL + */ +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + struct ssb_state *st = this_cpu_ptr(&ssb_state); + u64 msr = x86_amd_ls_cfg_base; + + if (!static_cpu_has(X86_FEATURE_ZEN)) { + msr |= ssbd_tif_to_amd_ls_cfg(tifn); + wrmsrl(MSR_AMD64_LS_CFG, msr); + return; + } + + if (tifn & _TIF_SSBD) { + /* + * Since this can race with prctl(), block reentry on the + * same CPU. + */ + if (__test_and_set_bit(LSTATE_SSB, &st->local_state)) + return; + + msr |= x86_amd_ls_cfg_ssbd_mask; + + raw_spin_lock(&st->shared_state->lock); + /* First sibling enables SSBD: */ + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + st->shared_state->disable_state++; + raw_spin_unlock(&st->shared_state->lock); + } else { + if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state)) + return; + + raw_spin_lock(&st->shared_state->lock); + st->shared_state->disable_state--; + if (!st->shared_state->disable_state) + wrmsrl(MSR_AMD64_LS_CFG, msr); + raw_spin_unlock(&st->shared_state->lock); + } +} +#else +static __always_inline void amd_set_core_ssb_state(unsigned long tifn) +{ + u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); + + wrmsrl(MSR_AMD64_LS_CFG, msr); +} +#endif + +static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) +{ + /* + * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, + * so ssbd_tif_to_spec_ctrl() just works. + */ + wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); +} + +static __always_inline void intel_set_ssb_state(unsigned long tifn) +{ + u64 msr = x86_spec_ctrl_base | ssbd_tif_to_spec_ctrl(tifn); + + wrmsrl(MSR_IA32_SPEC_CTRL, msr); +} + +static __always_inline void __speculative_store_bypass_update(unsigned long tifn) +{ + if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) + amd_set_ssb_virt_state(tifn); + else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + amd_set_core_ssb_state(tifn); + else + intel_set_ssb_state(tifn); +} + +void speculative_store_bypass_update(unsigned long tif) +{ + preempt_disable(); + __speculative_store_bypass_update(tif); + preempt_enable(); +} + +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long tifp, tifn; + + prev = &prev_p->thread; + next = &next_p->thread; + + tifn = READ_ONCE(task_thread_info(next_p)->flags); + tifp = READ_ONCE(task_thread_info(prev_p)->flags); + switch_to_bitmap(tss, prev, next, tifp, tifn); + propagate_user_return_notify(prev_p, next_p); + + if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) && + arch_has_block_step()) { + unsigned long debugctl, msk; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~DEBUGCTLMSR_BTF; + msk = tifn & _TIF_BLOCKSTEP; + debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + } + + if ((tifp ^ tifn) & _TIF_NOTSC) + cr4_toggle_bits(X86_CR4_TSD); + + if ((tifp ^ tifn) & _TIF_SSBD) + __speculative_store_bypass_update(tifn); } /* diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 83929cc47a4b..10b22fc6ef5a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -75,6 +75,7 @@ #include <asm/i8259.h> #include <asm/realmode.h> #include <asm/misc.h> +#include <asm/spec-ctrl.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -229,6 +230,8 @@ static void notrace start_secondary(void *unused) */ check_tsc_sync_target(); + speculative_store_bypass_ht_init(); + /* * Lock vector_lock and initialize the vectors on this cpu * before setting the cpu online. We must set it online with @@ -1325,6 +1328,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) set_mtrr_aps_delayed_init(); smp_quirk_init_udelay(); + + speculative_store_bypass_ht_init(); } void arch_enable_nonboot_cpus_begin(void) @@ -1492,6 +1497,7 @@ static void remove_siblinginfo(int cpu) cpumask_clear(topology_core_cpumask(cpu)); c->phys_proc_id = 0; c->cpu_core_id = 0; + c->booted_cores = 0; cpumask_clear_cpu(cpu, cpu_sibling_setup_mask); recompute_smt_state(); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index da6a287a11e4..769c370011d6 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -24,6 +24,7 @@ #include <asm/geode.h> #include <asm/apic.h> #include <asm/intel-family.h> +#include <asm/i8259.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -456,6 +457,20 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) unsigned long tscmin, tscmax; int pitcnt; + if (!has_legacy_pic()) { + /* + * Relies on tsc_early_delay_calibrate() to have given us semi + * usable udelay(), wait for the same 50ms we would have with + * the PIT loop below. + */ + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + udelay(10 * USEC_PER_MSEC); + return ULONG_MAX; + } + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); @@ -580,6 +595,9 @@ static unsigned long quick_pit_calibrate(void) u64 tsc, delta; unsigned long d1, d2; + if (!has_legacy_pic()) + return 0; + /* Set the Gate high, disable speaker */ outb((inb(0x61) & ~0x02) | 0x01, 0x61); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 93f924de06cf..7e5119c1d15c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -357,7 +357,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 0x80000008.ebx */ const u32 kvm_cpuid_8000_0008_ebx_x86_features = - F(IBPB) | F(IBRS); + F(AMD_IBPB) | F(AMD_IBRS) | F(VIRT_SSBD); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = @@ -382,7 +382,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = - F(SPEC_CTRL) | F(ARCH_CAPABILITIES); + F(SPEC_CTRL) | F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -468,6 +468,11 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= ~F(PKU); entry->edx &= kvm_cpuid_7_0_edx_x86_features; cpuid_mask(&entry->edx, CPUID_7_EDX); + /* + * We emulate ARCH_CAPABILITIES in software even + * if the host doesn't support it. + */ + entry->edx |= F(ARCH_CAPABILITIES); } else { entry->ebx = 0; entry->ecx = 0; @@ -618,13 +623,20 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, g_phys_as = phys_as; entry->eax = g_phys_as | (virt_as << 8); entry->edx = 0; - /* IBRS and IBPB aren't necessarily present in hardware cpuid */ - if (boot_cpu_has(X86_FEATURE_IBPB)) - entry->ebx |= F(IBPB); - if (boot_cpu_has(X86_FEATURE_IBRS)) - entry->ebx |= F(IBRS); + /* + * IBRS, IBPB and VIRT_SSBD aren't necessarily present in + * hardware cpuid + */ + if (boot_cpu_has(X86_FEATURE_AMD_IBPB)) + entry->ebx |= F(AMD_IBPB); + if (boot_cpu_has(X86_FEATURE_AMD_IBRS)) + entry->ebx |= F(AMD_IBRS); + if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) + entry->ebx |= F(VIRT_SSBD); entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_8000_0008_EBX); + if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) + entry->ebx |= F(VIRT_SSBD); break; } case 0x80000019: diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d1beb7156704..c38369781239 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -165,21 +165,21 @@ static inline bool guest_cpuid_has_ibpb(struct kvm_vcpu *vcpu) struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best && (best->ebx & bit(X86_FEATURE_IBPB))) + if (best && (best->ebx & bit(X86_FEATURE_AMD_IBPB))) return true; best = kvm_find_cpuid_entry(vcpu, 7, 0); return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); } -static inline bool guest_cpuid_has_ibrs(struct kvm_vcpu *vcpu) +static inline bool guest_cpuid_has_spec_ctrl(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); - if (best && (best->ebx & bit(X86_FEATURE_IBRS))) + if (best && (best->ebx & bit(X86_FEATURE_AMD_IBRS))) return true; best = kvm_find_cpuid_entry(vcpu, 7, 0); - return best && (best->edx & bit(X86_FEATURE_SPEC_CTRL)); + return best && (best->edx & (bit(X86_FEATURE_SPEC_CTRL) | bit(X86_FEATURE_SSBD))); } static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) @@ -190,6 +190,15 @@ static inline bool guest_cpuid_has_arch_capabilities(struct kvm_vcpu *vcpu) return best && (best->edx & bit(X86_FEATURE_ARCH_CAPABILITIES)); } +static inline bool guest_cpuid_has_virt_ssbd(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0); + return best && (best->ebx & bit(X86_FEATURE_VIRT_SSBD)); +} + + /* * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3 diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 1015f4c3b9ed..09c6e49e459c 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -299,8 +299,16 @@ void kvm_apic_set_version(struct kvm_vcpu *vcpu) if (!lapic_in_kernel(vcpu)) return; + /* + * KVM emulates 82093AA datasheet (with in-kernel IOAPIC implementation) + * which doesn't have EOI register; Some buggy OSes (e.g. Windows with + * Hyper-V role) disable EOI broadcast in lapic not checking for IOAPIC + * version first and level-triggered interrupts never get EOIed in + * IOAPIC. + */ feat = kvm_find_cpuid_entry(apic->vcpu, 0x1, 0); - if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31)))) + if (feat && (feat->ecx & (1 << (X86_FEATURE_X2APIC & 31))) && + !ioapic_in_kernel(vcpu->kvm)) v |= APIC_LVR_DIRECTED_EOI; kvm_lapic_set_reg(apic, APIC_LVR, v); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index aaa93b4b0380..a27f9e442ffc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,7 +45,7 @@ #include <asm/kvm_para.h> #include <asm/irq_remapping.h> #include <asm/microcode.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include <asm/virtext.h> #include "trace.h" @@ -185,6 +185,12 @@ struct vcpu_svm { } host; u64 spec_ctrl; + /* + * Contains guest-controlled bits of VIRT_SPEC_CTRL, which will be + * translated into the appropriate L2_CFG bits on the host to + * perform speculative control. + */ + u64 virt_spec_ctrl; u32 *msrpm; @@ -1561,6 +1567,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) u32 eax = 1; svm->spec_ctrl = 0; + svm->virt_spec_ctrl = 0; if (!init_event) { svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE | @@ -3545,11 +3552,18 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has_ibrs(vcpu)) + !guest_cpuid_has_spec_ctrl(vcpu)) return 1; msr_info->data = svm->spec_ctrl; break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr_info->host_initiated && + !guest_cpuid_has_virt_ssbd(vcpu)) + return 1; + + msr_info->data = svm->virt_spec_ctrl; + break; case MSR_IA32_UCODE_REV: msr_info->data = 0x01000065; break; @@ -3643,7 +3657,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_SPEC_CTRL: if (!msr->host_initiated && - !guest_cpuid_has_ibrs(vcpu)) + !guest_cpuid_has_spec_ctrl(vcpu)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ @@ -3684,6 +3698,16 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1); break; + case MSR_AMD64_VIRT_SPEC_CTRL: + if (!msr->host_initiated && + !guest_cpuid_has_virt_ssbd(vcpu)) + return 1; + + if (data & ~SPEC_CTRL_SSBD) + return 1; + + svm->virt_spec_ctrl = data; + break; case MSR_STAR: svm->vmcb->save.star = data; break; @@ -4917,8 +4941,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, svm->spec_ctrl); + x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); asm volatile ( "push %%" _ASM_BP "; \n\t" @@ -5012,6 +5035,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + +#ifdef CONFIG_X86_64 + wrmsrl(MSR_GS_BASE, svm->host.gs_base); +#else + loadsegment(fs, svm->host.fs); +#ifndef CONFIG_X86_32_LAZY_GS + loadsegment(gs, svm->host.gs); +#endif +#endif + /* * We do not use IBRS in the kernel. If this vCPU has used the * SPEC_CTRL MSR it may have left it on; save the value and @@ -5030,20 +5065,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) svm->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (svm->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); - - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - -#ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, svm->host.gs_base); -#else - loadsegment(fs, svm->host.fs); -#ifndef CONFIG_X86_32_LAZY_GS - loadsegment(gs, svm->host.gs); -#endif -#endif + x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); reload_tss(vcpu); @@ -5145,7 +5167,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_high_real_mode_segbase(void) +static bool svm_has_emulated_msr(int index) { return true; } @@ -5462,7 +5484,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .hardware_enable = svm_hardware_enable, .hardware_disable = svm_hardware_disable, .cpu_has_accelerated_tpr = svm_cpu_has_accelerated_tpr, - .cpu_has_high_real_mode_segbase = svm_has_high_real_mode_segbase, + .has_emulated_msr = svm_has_emulated_msr, .vcpu_create = svm_create_vcpu, .vcpu_free = svm_free_vcpu, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b978aeccda78..2827a9622d97 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,7 +50,7 @@ #include <asm/apic.h> #include <asm/irq_remapping.h> #include <asm/microcode.h> -#include <asm/nospec-branch.h> +#include <asm/spec-ctrl.h> #include "trace.h" #include "pmu.h" @@ -2558,6 +2558,8 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, return; } + WARN_ON_ONCE(vmx->emulation_required); + if (kvm_exception_is_soft(nr)) { vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmx->vcpu.arch.event_exit_inst_len); @@ -3020,7 +3022,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has_ibrs(vcpu)) + !guest_cpuid_has_spec_ctrl(vcpu)) return 1; msr_info->data = to_vmx(vcpu)->spec_ctrl; @@ -3137,11 +3139,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has_ibrs(vcpu)) + !guest_cpuid_has_spec_ctrl(vcpu)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ - if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP)) + if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD)) return 1; vmx->spec_ctrl = data; @@ -6430,12 +6432,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) goto out; } - if (err != EMULATE_DONE) { - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; - vcpu->run->internal.ndata = 0; - return 0; - } + if (err != EMULATE_DONE) + goto emulation_error; + + if (vmx->emulation_required && !vmx->rmode.vm86_active && + vcpu->arch.exception.pending) + goto emulation_error; if (vcpu->arch.halt_request) { vcpu->arch.halt_request = 0; @@ -6451,6 +6453,12 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) out: return ret; + +emulation_error: + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; + return 0; } static int __grow_ple_window(int val) @@ -8691,9 +8699,21 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) } } -static bool vmx_has_high_real_mode_segbase(void) +static bool vmx_has_emulated_msr(int index) { - return enable_unrestricted_guest || emulate_invalid_guest_state; + switch (index) { + case MSR_IA32_SMBASE: + /* + * We cannot do SMM unless we can run the guest in big + * real mode. + */ + return enable_unrestricted_guest || emulate_invalid_guest_state; + case MSR_AMD64_VIRT_SPEC_CTRL: + /* This is AMD only. */ + return false; + default: + return true; + } } static bool vmx_mpx_supported(void) @@ -8916,10 +8936,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) * is no need to worry about the conditional branch over the wrmsr * being speculatively taken. */ - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl); + x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0); vmx->__launched = vmx->loaded_vmcs->launched; + asm( /* Store host registers */ "push %%" _ASM_DX "; push %%" _ASM_BP ";" @@ -9055,8 +9075,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); - if (vmx->spec_ctrl) - native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); /* Eliminate branch target predictions from guest mode */ vmexit_fill_RSB(); @@ -11347,7 +11366,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .hardware_enable = hardware_enable, .hardware_disable = hardware_disable, .cpu_has_accelerated_tpr = report_flexpriority, - .cpu_has_high_real_mode_segbase = vmx_has_high_real_mode_segbase, + .has_emulated_msr = vmx_has_emulated_msr, .vcpu_create = vmx_create_vcpu, .vcpu_free = vmx_free_vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0bf6ed591f16..719a4a5f7708 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1002,6 +1002,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_AMD64_VIRT_SPEC_CTRL, }; static unsigned num_emulated_msrs; @@ -2664,7 +2665,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) * fringe case that is not enabled except via specific settings * of the module parameters. */ - r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); + r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE); break; case KVM_CAP_COALESCED_MMIO: r = KVM_COALESCED_MMIO_PAGE_OFFSET; @@ -4130,13 +4131,14 @@ long kvm_arch_vm_ioctl(struct file *filp, mutex_unlock(&kvm->lock); break; case KVM_XEN_HVM_CONFIG: { + struct kvm_xen_hvm_config xhc; r = -EFAULT; - if (copy_from_user(&kvm->arch.xen_hvm_config, argp, - sizeof(struct kvm_xen_hvm_config))) + if (copy_from_user(&xhc, argp, sizeof(xhc))) goto out; r = -EINVAL; - if (kvm->arch.xen_hvm_config.flags) + if (xhc.flags) goto out; + memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc)); r = 0; break; } @@ -4226,14 +4228,8 @@ static void kvm_init_msr_list(void) num_msrs_to_save = j; for (i = j = 0; i < ARRAY_SIZE(emulated_msrs); i++) { - switch (emulated_msrs[i]) { - case MSR_IA32_SMBASE: - if (!kvm_x86_ops->cpu_has_high_real_mode_segbase()) - continue; - break; - default: - break; - } + if (!kvm_x86_ops->has_emulated_msr(emulated_msrs[i])) + continue; if (j < i) emulated_msrs[j] = emulated_msrs[i]; @@ -7270,6 +7266,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { struct msr_data apic_base_msr; int mmu_reset_needed = 0; + int cpuid_update_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; @@ -7301,8 +7298,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, vcpu->arch.cr0 = sregs->cr0; mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; + cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) & + (X86_CR4_OSXSAVE | X86_CR4_PKE)); kvm_x86_ops->set_cr4(vcpu, sregs->cr4); - if (sregs->cr4 & (X86_CR4_OSXSAVE | X86_CR4_PKE)) + if (cpuid_update_needed) kvm_update_cpuid(vcpu); idx = srcu_read_lock(&vcpu->kvm->srcu); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7df8e3a79dc0..d35d0e4bbf99 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1014,8 +1014,7 @@ void __init mem_init(void) after_bootmem = 1; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, - PAGE_SIZE, KCORE_OTHER); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_USER); mem_init_print_info(NULL); } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index c1085c7ee212..1dd1c2ad4e58 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -287,9 +287,11 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, /* * The .rodata section needs to be read-only. Using the pfn - * catches all aliases. + * catches all aliases. This also includes __ro_after_init, + * so do not enforce until kernel_set_to_readonly is true. */ - if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + if (kernel_set_to_readonly && + within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index b97ef29c940f..a3b63e5a527c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -1,5 +1,6 @@ #include <linux/mm.h> #include <linux/gfp.h> +#include <linux/hugetlb.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/tlb.h> @@ -577,6 +578,10 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) (mtrr != MTRR_TYPE_WRBACK)) return 0; + /* Bail out if we are we on a populated non-leaf entry: */ + if (pud_present(*pud) && !pud_huge(*pud)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pud, pfn_pte( @@ -605,6 +610,10 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) return 0; } + /* Bail out if we are we on a populated non-leaf entry: */ + if (pmd_present(*pmd) && !pmd_huge(*pmd)) + return 0; + prot = pgprot_4k_2_large(prot); set_pte((pte_t *)pmd, pfn_pte( diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index f88ce0e5efd9..0bbec041c003 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -95,26 +95,27 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey */ if (pkey != -1) return pkey; - /* - * Look for a protection-key-drive execute-only mapping - * which is now being given permissions that are not - * execute-only. Move it back to the default pkey. - */ - if (vma_is_pkey_exec_only(vma) && - (prot & (PROT_READ|PROT_WRITE))) { - return 0; - } + /* * The mapping is execute-only. Go try to get the * execute-only protection key. If we fail to do that, * fall through as if we do not have execute-only - * support. + * support in this mm. */ if (prot == PROT_EXEC) { pkey = execute_only_pkey(vma->vm_mm); if (pkey > 0) return pkey; + } else if (vma_is_pkey_exec_only(vma)) { + /* + * Protections are *not* PROT_EXEC, but the mapping + * is using the exec-only pkey. This mapping was + * PROT_EXEC and will no longer be. Move back to + * the default pkey. + */ + return ARCH_DEFAULT_PKEY; } + /* * This is a vanilla, non-pkey mprotect (or we failed to * setup execute-only), inherit the pkey from the VMA we diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..74b516cb39df 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -142,7 +142,7 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir) #endif } -int swsusp_arch_resume(void) +asmlinkage int swsusp_arch_resume(void) { int error; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..0cb1dd461529 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -149,7 +149,7 @@ static int relocate_restore_code(void) return 0; } -int swsusp_arch_resume(void) +asmlinkage int swsusp_arch_resume(void) { int error; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2bea87cc0ff2..081437b5f381 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1977,10 +1977,8 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) { - clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + if (xen_pv_domain()) set_cpu_cap(c, X86_FEATURE_XENPV); - } } static void xen_pin_vcpu(int cpu) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 418f1b8576cf..c92f75f7ae33 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1317,8 +1317,6 @@ void xen_flush_tlb_all(void) struct mmuext_op *op; struct multicall_space mcs; - trace_xen_mmu_flush_tlb_all(0); - preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); @@ -1336,8 +1334,6 @@ static void xen_flush_tlb(void) struct mmuext_op *op; struct multicall_space mcs; - trace_xen_mmu_flush_tlb(0); - preempt_disable(); mcs = xen_mc_entry(sizeof(*op)); |