summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt10
-rw-r--r--Makefile7
-rw-r--r--arch/powerpc/perf/core-book3s.c8
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/boot/compressed/misc.h1
-rw-r--r--arch/x86/entry/entry_64.S164
-rw-r--r--arch/x86/entry/entry_64_compat.S7
-rw-r--r--arch/x86/include/asm/cmdline.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h4
-rw-r--r--arch/x86/include/asm/desc.h2
-rw-r--r--arch/x86/include/asm/disabled-features.h4
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/hw_irq.h2
-rw-r--r--arch/x86/include/asm/kaiser.h141
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h2
-rw-r--r--arch/x86/include/asm/pgtable.h28
-rw-r--r--arch/x86/include/asm/pgtable_64.h25
-rw-r--r--arch/x86/include/asm/pgtable_types.h29
-rw-r--r--arch/x86/include/asm/processor.h2
-rw-r--r--arch/x86/include/asm/tlbflush.h171
-rw-r--r--arch/x86/include/uapi/asm/processor-flags.h3
-rw-r--r--arch/x86/kernel/cpu/bugs.c8
-rw-r--r--arch/x86/kernel/cpu/common.c66
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c57
-rw-r--r--arch/x86/kernel/espfix_64.c10
-rw-r--r--arch/x86/kernel/head_64.S35
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/ldt.c25
-rw-r--r--arch/x86/kernel/paravirt_patch_64.c2
-rw-r--r--arch/x86/kernel/process.c2
-rw-r--r--arch/x86/kernel/reboot.c4
-rw-r--r--arch/x86/kernel/setup.c7
-rw-r--r--arch/x86/kernel/smpboot.c9
-rw-r--r--arch/x86/kernel/tracepoint.c2
-rw-r--r--arch/x86/kernel/vm86_32.c2
-rw-r--r--arch/x86/kvm/emulate.c32
-rw-r--r--arch/x86/kvm/x86.c5
-rw-r--r--arch/x86/lib/cmdline.c105
-rw-r--r--arch/x86/mm/Makefile1
-rw-r--r--arch/x86/mm/init.c4
-rw-r--r--arch/x86/mm/init_64.c10
-rw-r--r--arch/x86/mm/kaiser.c455
-rw-r--r--arch/x86/mm/pageattr.c63
-rw-r--r--arch/x86/mm/pgtable.c16
-rw-r--r--arch/x86/mm/tlb.c112
-rw-r--r--arch/x86/xen/enlighten.c6
-rw-r--r--crypto/mcryptd.c23
-rw-r--r--drivers/acpi/apei/erst.c2
-rw-r--r--drivers/infiniband/hw/cxgb4/cq.c6
-rw-r--r--drivers/mfd/cros_ec_spi.c1
-rw-r--r--drivers/mfd/twl4030-audio.c9
-rw-r--r--drivers/mfd/twl6040.c12
-rw-r--r--drivers/net/ethernet/broadcom/tg3.c4
-rw-r--r--drivers/net/ethernet/marvell/mvmdio.c3
-rw-r--r--drivers/net/ethernet/marvell/mvneta.c4
-rw-r--r--drivers/net/phy/micrel.c1
-rw-r--r--drivers/net/usb/qmi_wwan.c1
-rw-r--r--drivers/parisc/lba_pci.c33
-rw-r--r--drivers/pci/pci-driver.c7
-rw-r--r--drivers/spi/spi-xilinx.c11
-rw-r--r--drivers/tty/n_tty.c4
-rw-r--r--drivers/usb/core/config.c2
-rw-r--r--drivers/usb/core/quirks.c6
-rw-r--r--drivers/usb/host/xhci-pci.c3
-rw-r--r--drivers/usb/serial/ftdi_sio.c1
-rw-r--r--drivers/usb/serial/ftdi_sio_ids.h6
-rw-r--r--drivers/usb/serial/option.c17
-rw-r--r--drivers/usb/serial/qcserial.c3
-rw-r--r--drivers/usb/usbip/stub_main.c5
-rw-r--r--drivers/usb/usbip/stub_rx.c7
-rw-r--r--drivers/usb/usbip/stub_tx.c4
-rw-r--r--drivers/usb/usbip/vhci_hcd.c10
-rw-r--r--drivers/usb/usbip/vhci_rx.c23
-rw-r--r--drivers/usb/usbip/vhci_tx.c3
-rw-r--r--include/asm-generic/vmlinux.lds.h7
-rw-r--r--include/crypto/mcryptd.h1
-rw-r--r--include/linux/ipv6.h3
-rw-r--r--include/linux/kaiser.h52
-rw-r--r--include/linux/mmzone.h3
-rw-r--r--include/linux/percpu-defs.h32
-rw-r--r--include/linux/vm_event_item.h2
-rw-r--r--include/net/ip.h2
-rw-r--r--init/main.c2
-rw-r--r--kernel/fork.c6
-rw-r--r--kernel/time/tick-sched.c19
-rw-r--r--kernel/trace/ring_buffer.c6
-rw-r--r--kernel/trace/trace.c13
-rw-r--r--mm/vmstat.c1
-rw-r--r--net/bridge/br_netlink.c11
-rw-r--r--net/core/net_namespace.c2
-rw-r--r--net/core/skbuff.c6
-rw-r--r--net/ipv4/devinet.c2
-rw-r--r--net/ipv4/fib_frontend.c9
-rw-r--r--net/ipv4/igmp.c44
-rw-r--r--net/ipv4/ip_tunnel.c4
-rw-r--r--net/ipv4/raw.c15
-rw-r--r--net/ipv4/tcp_ipv4.c2
-rw-r--r--net/ipv6/af_inet6.c1
-rw-r--r--net/ipv6/ip6_output.c12
-rw-r--r--net/ipv6/ipv6_sockglue.c1
-rw-r--r--net/ipv6/mcast.c25
-rw-r--r--net/ipv6/tcp_ipv6.c2
-rw-r--r--net/netlink/af_netlink.c3
-rw-r--r--net/sctp/socket.c4
-rw-r--r--security/Kconfig10
-rw-r--r--sound/core/rawmidi.c15
-rw-r--r--sound/hda/hdac_i915.c2
-rw-r--r--sound/pci/hda/patch_realtek.c5
-rw-r--r--sound/soc/codecs/twl4030.c4
-rw-r--r--sound/soc/fsl/fsl_ssi.c18
-rw-r--r--sound/usb/mixer.c27
-rw-r--r--tools/usb/usbip/src/utils.c9
113 files changed, 1830 insertions, 425 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 5d593ecadb90..5977c4d71356 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2523,6 +2523,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nojitter [IA-64] Disables jitter checking for ITC timers.
+ nopti [X86-64] Disable KAISER isolation of kernel from user.
+
no-kvmclock [X86,KVM] Disable paravirtualized KVM clock driver
no-kvmapf [X86,KVM] Disable paravirtualized asynchronous page
@@ -2555,6 +2557,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
nopat [X86] Disable PAT (page attribute table extension of
pagetables) support.
+ nopcid [X86-64] Disable the PCID cpu feature.
+
norandmaps Don't use address space randomization. Equivalent to
echo 0 > /proc/sys/kernel/randomize_va_space
@@ -3052,6 +3056,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
pt. [PARIDE]
See Documentation/blockdev/paride.txt.
+ pti= [X86_64]
+ Control KAISER user/kernel address space isolation:
+ on - enable
+ off - disable
+ auto - default setting
+
pty.legacy_count=
[KNL] Number of legacy pty's. Overwrites compiled-in
default number.
diff --git a/Makefile b/Makefile
index 99f9834c4ba6..db8df2a46f77 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 4
PATCHLEVEL = 4
-SUBLEVEL = 108
-EXTRAVERSION =
+SUBLEVEL = 110
+EXTRAVERSION = -rc1
NAME = Blurry Fish Butt
# *DOCUMENTATION*
@@ -782,6 +782,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
# disable invalid "can't wrap" optimizations for signed / pointers
KBUILD_CFLAGS += $(call cc-option,-fno-strict-overflow)
+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS += $(call cc-option,-fno-stack-check,)
+
# conserve stack if available
KBUILD_CFLAGS += $(call cc-option,-fconserve-stack)
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index d1e65ce545b3..b2ab164a8094 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -401,8 +401,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
int ret;
__u64 target;
- if (is_kernel_addr(addr))
- return branch_target((unsigned int *)addr);
+ if (is_kernel_addr(addr)) {
+ if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+ return 0;
+
+ return branch_target(&instr);
+ }
/* Userspace: need copy instruction here then translate it */
pagefault_disable();
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7e40905f6d4c..39d2dc66faa5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -42,7 +42,7 @@ config X86
select ARCH_USE_CMPXCHG_LOCKREF if X86_64
select ARCH_USE_QUEUED_RWLOCKS
select ARCH_USE_QUEUED_SPINLOCKS
- select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
+ select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select ARCH_WANT_FRAME_POINTERS
select ARCH_WANT_IPC_PARSE_VERSION if X86_32
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 3783dc3e10b3..4abb284a5b9c 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
*/
#undef CONFIG_PARAVIRT
#undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_PAGE_TABLE_ISOLATION
#undef CONFIG_KASAN
#include <linux/linkage.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cc0f2f5da19b..952b23b5d4e9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
#include <linux/err.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
* it is too small to ever cause noticeable irq latency.
*/
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* A hypervisor implementation might want to use a label
* after the swapgs, so that it can do the swapgs
@@ -207,9 +209,17 @@ entry_SYSCALL_64_fastpath:
testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
- RESTORE_C_REGS_EXCEPT_RCX_R11
movq RIP(%rsp), %rcx
movq EFLAGS(%rsp), %r11
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
/*
* 64-bit SYSRET restores rip from rcx,
@@ -347,10 +357,26 @@ GLOBAL(int_ret_from_sys_call)
syscall_return_via_sysret:
/* rcx and r11 are already restored (see code above) */
RESTORE_C_REGS_EXCEPT_RCX_R11
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
movq RSP(%rsp), %rsp
USERGS_SYSRET64
opportunistic_sysret_failed:
+ /*
+ * This opens a window where we have a user CR3, but are
+ * running in the kernel. This makes using the CS
+ * register useless for telling whether or not we need to
+ * switch CR3 in NMIs. Normal interrupts are OK because
+ * they are off here.
+ */
+ SWITCH_USER_CR3
SWAPGS
jmp restore_c_regs_and_iret
END(entry_SYSCALL_64)
@@ -509,6 +535,7 @@ END(irq_entries_start)
* tracking that we're in kernel mode.
*/
SWAPGS
+ SWITCH_KERNEL_CR3
/*
* We need to tell lockdep that IRQs are off. We can't do this until
@@ -568,6 +595,7 @@ GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
TRACE_IRQS_IRETQ
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
@@ -625,6 +653,7 @@ native_irq_return_ldt:
pushq %rax
pushq %rdi
SWAPGS
+ SWITCH_KERNEL_CR3
movq PER_CPU_VAR(espfix_waddr), %rdi
movq %rax, (0*8)(%rdi) /* RAX */
movq (2*8)(%rsp), %rax /* RIP */
@@ -640,6 +669,7 @@ native_irq_return_ldt:
andl $0xffff0000, %eax
popq %rdi
orq PER_CPU_VAR(espfix_stack), %rax
+ SWITCH_USER_CR3
SWAPGS
movq %rax, %rsp
popq %rax
@@ -995,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
*/
ENTRY(paranoid_entry)
cld
@@ -1008,7 +1042,26 @@ ENTRY(paranoid_entry)
js 1f /* negative -> in kernel */
SWAPGS
xorl %ebx, %ebx
-1: ret
+1:
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+ * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+ * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+ * unconditionally, but we need to find out whether the reverse
+ * should be done on return (conveyed to paranoid_exit in %ebx).
+ */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ testl $KAISER_SHADOW_PGD_OFFSET, %eax
+ jz 2f
+ orl $2, %ebx
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ movq %rax, %cr3
+2:
+#endif
+ ret
END(paranoid_entry)
/*
@@ -1021,19 +1074,26 @@ END(paranoid_entry)
* be complicated. Fortunately, we there's no good reason
* to try to handle preemption here.
*
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ * ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ * ebx=2: needs both swapgs and SWITCH_USER_CR3
+ * ebx=3: needs SWITCH_USER_CR3 but not swapgs
*/
ENTRY(paranoid_exit)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
- testl %ebx, %ebx /* swapgs needed? */
+ TRACE_IRQS_IRETQ_DEBUG
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+ testl $2, %ebx /* SWITCH_USER_CR3 needed? */
+ jz paranoid_exit_no_switch
+ SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+ testl $1, %ebx /* swapgs needed? */
jnz paranoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- jmp paranoid_exit_restore
paranoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1048,6 +1108,13 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ /*
+ * error_entry() always returns with a kernel gsbase and
+ * CR3. We must also have a kernel CR3/gsbase before
+ * calling TRACE_IRQS_*. Just unconditionally switch to
+ * the kernel CR3 here.
+ */
+ SWITCH_KERNEL_CR3
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1210,6 +1277,10 @@ ENTRY(nmi)
*/
SWAPGS_UNSAFE_STACK
+ /*
+ * percpu variables are mapped with user CR3, so no need
+ * to switch CR3 here.
+ */
cld
movq %rsp, %rdx
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1243,12 +1314,34 @@ ENTRY(nmi)
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
call do_nmi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Unconditionally restore CR3. I know we return to
+ * kernel code that needs user CR3, but do we ever return
+ * to "user mode" where we need the kernel CR3?
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts. Do not
+ * switch to user CR3: we might be going back to kernel code
+ * that had a user CR3 set.
*/
SWAPGS
jmp restore_c_regs_and_iret
@@ -1445,22 +1538,55 @@ end_repeat_nmi:
ALLOC_PT_GPREGS_ON_STACK
/*
- * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
- * as we should not be calling schedule in NMI context.
- * Even with normal interrupts enabled. An NMI should not be
- * setting NEED_RESCHED or anything that normal interrupts and
- * exceptions might do.
+ * Use the same approach as paranoid_entry to handle SWAPGS, but
+ * without CR3 handling since we do that differently in NMIs. No
+ * need to use paranoid_exit as we should not be calling schedule
+ * in NMI context. Even with normal interrupts enabled. An NMI
+ * should not be setting NEED_RESCHED or anything that normal
+ * interrupts and exceptions might do.
*/
- call paranoid_entry
-
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+ cld
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+1:
movq %rsp, %rdi
movq $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /* Unconditionally use kernel CR3 for do_nmi() */
+ /* %rax is saved above, so OK to clobber here */
+ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+ /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+ pushq %rax
+ /* mask off "user" bit of pgd address and 12 PCID bits: */
+ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+ movq %rax, %cr3
+2:
+#endif
+
+ /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
call do_nmi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ /*
+ * Unconditionally restore CR3. We might be returning to
+ * kernel code that needs user CR3, like just just before
+ * a sysret.
+ */
+ ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
nmi_swapgs:
+ /* We fixed up CR3 above, so no need to switch it here */
SWAPGS_UNSAFE_STACK
nmi_restore:
RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 15cfebaa7688..d03bf0e28b8b 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,8 @@
#include <asm/irqflags.h>
#include <asm/asm.h>
#include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
#include <linux/linkage.h>
#include <linux/err.h>
@@ -50,6 +52,7 @@ ENDPROC(native_usergs_sysret32)
ENTRY(entry_SYSENTER_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/*
@@ -161,6 +164,7 @@ ENDPROC(entry_SYSENTER_compat)
ENTRY(entry_SYSCALL_compat)
/* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+ SWITCH_KERNEL_CR3_NO_STACK
/* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
@@ -208,6 +212,7 @@ ENTRY(entry_SYSCALL_compat)
/* Opportunistic SYSRET */
sysret32_from_system_call:
TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ SWITCH_USER_CR3
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@@ -269,6 +274,7 @@ ENTRY(entry_INT80_compat)
PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS
+ SWITCH_KERNEL_CR3_NO_STACK
/*
* User tracing code (ptrace or signal handlers) might assume that
@@ -311,6 +317,7 @@ ENTRY(entry_INT80_compat)
/* Go back to user mode. */
TRACE_IRQS_ON
+ SWITCH_USER_CR3
SWAPGS
jmp restore_regs_and_iret
END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
#define _ASM_X86_CMDLINE_H
int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+ char *buffer, int bufsize);
#endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index f7ba9fbf12ee..f6605712ca90 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -187,6 +187,7 @@
#define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */
#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */
#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
#define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */
#define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */
#define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */
@@ -199,6 +200,9 @@
#define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
+
/* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73cf018..880db91d9457 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -43,7 +43,7 @@ struct gdt_page {
struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index f226df064660..8b17c2ad1048 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -21,11 +21,13 @@
# define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31))
# define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31))
# define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31))
+# define DISABLE_PCID 0
#else
# define DISABLE_VME 0
# define DISABLE_K6_MTRR 0
# define DISABLE_CYRIX_ARR 0
# define DISABLE_CENTAUR_MCR 0
+# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31))
#endif /* CONFIG_X86_64 */
/*
@@ -35,7 +37,7 @@
#define DISABLED_MASK1 0
#define DISABLED_MASK2 0
#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
-#define DISABLED_MASK4 0
+#define DISABLED_MASK4 (DISABLE_PCID)
#define DISABLED_MASK5 0
#define DISABLED_MASK6 0
#define DISABLED_MASK7 0
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 59405a248fc2..9b76cd331990 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,8 +22,8 @@ typedef struct {
#ifdef CONFIG_SMP
unsigned int irq_resched_count;
unsigned int irq_call_count;
- unsigned int irq_tlb_count;
#endif
+ unsigned int irq_tlb_count;
#ifdef CONFIG_X86_THERMAL_VECTOR
unsigned int irq_thermal_count;
#endif
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 59caa55fb9b5..ee52ff858699 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -187,7 +187,7 @@ extern char irq_entries_start[];
#define VECTOR_RETRIGGERED ((void *)~0UL)
typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
#endif /* !ASSEMBLY_ */
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..802bbbdfe143
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,141 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory. It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js 9f
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+8:
+.endm
+
+.macro SWITCH_USER_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+8:
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+ALTERNATIVE "jmp 8f", \
+ __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+ X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
+
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored. To change the address space, another register is
+ * needed. A register therefore has to be stored/restored.
+*/
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
+#else
+#define kaiser_enabled 0
+static inline void __init kaiser_check_boottime_disable(void) {}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
+/**
+ * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ * @flags: The mapping flags of the pages
+ *
+ * The mapping is done on a global scope, so no bigger
+ * synchronization has to be done. the pages have to be
+ * manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ * @addr: the start address of the range
+ * @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ * kaiser_init - Initialize the shadow mapping
+ *
+ * Most parts of the shadow mapping can be mapped upon boot
+ * time. Only per-process things like the thread stacks
+ * or a new LDT have to be mapped at runtime. These boot-
+ * time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 55234d5e7160..7680b76adafc 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -24,12 +24,6 @@ typedef struct {
atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */
} mm_context_t;
-#ifdef CONFIG_SMP
void leave_mm(int cpu);
-#else
-static inline void leave_mm(int cpu)
-{
-}
-#endif
#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 44fc93987869..9bfc5fd77015 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -98,10 +98,8 @@ static inline void load_mm_ldt(struct mm_struct *mm)
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
{
-#ifdef CONFIG_SMP
if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
-#endif
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b2e9df..84c62d950023 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
#ifndef __ASSEMBLY__
#include <asm/x86_init.h>
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
void ptdump_walk_pgd_level_checkwx(void);
@@ -653,7 +659,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
static inline int pgd_bad(pgd_t pgd)
{
- return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+ pgdval_t ignore_flags = _PAGE_USER;
+ /*
+ * We set NX on KAISER pgds that map userspace memory so
+ * that userspace can not meaningfully use the kernel
+ * page table by accident; it will fault on the first
+ * instruction it tries to run. See native_set_pgd().
+ */
+ if (kaiser_enabled)
+ ignore_flags |= _PAGE_NX;
+
+ return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}
static inline int pgd_none(pgd_t pgd)
@@ -855,7 +871,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
*/
static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
- memcpy(dst, src, count * sizeof(pgd_t));
+ memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (kaiser_enabled) {
+ /* Clone the shadow pgd part as well */
+ memcpy(native_get_shadow_pgd(dst),
+ native_get_shadow_pgd(src),
+ count * sizeof(pgd_t));
+ }
+#endif
}
#define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2ee781114d34..c810226e741a 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
native_set_pud(pud, native_make_pud(0));
}
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+ /* linux/mmdebug.h may not have been included at this point */
+ BUG_ON(!kaiser_enabled);
+#endif
+ return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+ BUILD_BUG_ON(1);
+ return NULL;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
- *pgdp = pgd;
+ *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
}
static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 79c91853e50e..8dba273da25a 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -89,7 +89,7 @@
#define _PAGE_NX (_AT(pteval_t, 0))
#endif
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
_PAGE_ACCESSED | _PAGE_DIRTY)
@@ -102,6 +102,33 @@
_PAGE_SOFT_DIRTY)
#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL))
+
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH (0)
+#define X86_CR3_PCID_USER_FLUSH (0)
+#define X86_CR3_PCID_KERN_NOFLUSH (0)
+#define X86_CR3_PCID_USER_NOFLUSH (0)
+#endif
+
/*
* The cache modes defined here are used to translate between pure SW usage
* and the HW defined cache mode bits and/or PAT entries.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2d5a50cb61a2..f3bdaed0188f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -305,7 +305,7 @@ struct tss_struct {
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
#ifdef CONFIG_X86_32
DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4dc534175b5e..a691b66cc40a 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -6,6 +6,7 @@
#include <asm/processor.h>
#include <asm/special_insns.h>
+#include <asm/smp.h>
static inline void __invpcid(unsigned long pcid, unsigned long addr,
unsigned long type)
@@ -64,10 +65,8 @@ static inline void invpcid_flush_all_nonglobals(void)
#endif
struct tlb_state {
-#ifdef CONFIG_SMP
struct mm_struct *active_mm;
int state;
-#endif
/*
* Access to this CR4 shadow and to H/W CR4 is protected by
@@ -132,6 +131,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
cr4_set_bits(mask);
}
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
static inline void __native_flush_tlb(void)
{
/*
@@ -140,6 +157,8 @@ static inline void __native_flush_tlb(void)
* back:
*/
preempt_disable();
+ if (kaiser_enabled)
+ kaiser_flush_tlb_on_return_to_user();
native_write_cr3(native_read_cr3());
preempt_enable();
}
@@ -149,20 +168,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
unsigned long cr4;
cr4 = this_cpu_read(cpu_tlbstate.cr4);
- /* clear PGE */
- native_write_cr4(cr4 & ~X86_CR4_PGE);
- /* write old PGE again and flush TLBs */
- native_write_cr4(cr4);
+ if (cr4 & X86_CR4_PGE) {
+ /* clear PGE and flush TLB of all entries */
+ native_write_cr4(cr4 & ~X86_CR4_PGE);
+ /* restore PGE as it was before */
+ native_write_cr4(cr4);
+ } else {
+ /* do it with cr3, letting kaiser flush user PCID */
+ __native_flush_tlb();
+ }
}
static inline void __native_flush_tlb_global(void)
{
unsigned long flags;
- if (static_cpu_has(X86_FEATURE_INVPCID)) {
+ if (this_cpu_has(X86_FEATURE_INVPCID)) {
/*
* Using INVPCID is considerably faster than a pair of writes
* to CR4 sandwiched inside an IRQ flag save/restore.
+ *
+ * Note, this works with CR4.PCIDE=0 or 1.
*/
invpcid_flush_all();
return;
@@ -174,23 +200,52 @@ static inline void __native_flush_tlb_global(void)
* be called from deep inside debugging code.)
*/
raw_local_irq_save(flags);
-
__native_flush_tlb_global_irq_disabled();
-
raw_local_irq_restore(flags);
}
static inline void __native_flush_tlb_single(unsigned long addr)
{
- asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ /*
+ * SIMICS #GP's if you run INVPCID with type 2/3
+ * and X86_CR4_PCIDE clear. Shame!
+ *
+ * The ASIDs used below are hard-coded. But, we must not
+ * call invpcid(type=1/2) before CR4.PCIDE=1. Just call
+ * invlpg in the case we are called early.
+ */
+
+ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+ if (kaiser_enabled)
+ kaiser_flush_tlb_on_return_to_user();
+ asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+ return;
+ }
+ /* Flush the address out of both PCIDs. */
+ /*
+ * An optimization here might be to determine addresses
+ * that are only kernel-mapped and only flush the kernel
+ * ASID. But, userspace flushes are probably much more
+ * important performance-wise.
+ *
+ * Make sure to do only a single invpcid when KAISER is
+ * disabled and we have only a single ASID.
+ */
+ if (kaiser_enabled)
+ invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+ invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
}
static inline void __flush_tlb_all(void)
{
- if (cpu_has_pge)
- __flush_tlb_global();
- else
- __flush_tlb();
+ __flush_tlb_global();
+ /*
+ * Note: if we somehow had PCID but not PGE, then this wouldn't work --
+ * we'd end up flushing kernel translations for the current ASID but
+ * we might fail to flush kernel translations for other cached ASIDs.
+ *
+ * To avoid this issue, we force PCID off if PGE is off.
+ */
}
static inline void __flush_tlb_one(unsigned long addr)
@@ -204,7 +259,6 @@ static inline void __flush_tlb_one(unsigned long addr)
/*
* TLB flushing:
*
- * - flush_tlb() flushes the current mm struct TLBs
* - flush_tlb_all() flushes all processes TLBs
* - flush_tlb_mm(mm) flushes the specified mm context TLB's
* - flush_tlb_page(vma, vmaddr) flushes one page
@@ -216,84 +270,6 @@ static inline void __flush_tlb_one(unsigned long addr)
* and page-granular flushes are available only on i486 and up.
*/
-#ifndef CONFIG_SMP
-
-/* "_up" is for UniProcessor.
- *
- * This is a helper for other header functions. *Not* intended to be called
- * directly. All global TLB flushes need to either call this, or to bump the
- * vm statistics themselves.
- */
-static inline void __flush_tlb_up(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb();
-}
-
-static inline void flush_tlb_all(void)
-{
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
- __flush_tlb_all();
-}
-
-static inline void flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void local_flush_tlb(void)
-{
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_one(addr);
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- if (vma->vm_mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void flush_tlb_mm_range(struct mm_struct *mm,
- unsigned long start, unsigned long end, unsigned long vmflag)
-{
- if (mm == current->active_mm)
- __flush_tlb_up();
-}
-
-static inline void native_flush_tlb_others(const struct cpumask *cpumask,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
-{
-}
-
-static inline void reset_lazy_tlbstate(void)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
- unsigned long end)
-{
- flush_tlb_all();
-}
-
-#else /* SMP */
-
-#include <asm/smp.h>
-
#define local_flush_tlb() __flush_tlb()
#define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL)
@@ -302,13 +278,14 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags)
extern void flush_tlb_all(void);
-extern void flush_tlb_current_task(void);
-extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long vmflag);
extern void flush_tlb_kernel_range(unsigned long start, unsigned long end);
-#define flush_tlb() flush_tlb_current_task()
+static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
+{
+ flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE);
+}
void native_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
@@ -323,8 +300,6 @@ static inline void reset_lazy_tlbstate(void)
this_cpu_write(cpu_tlbstate.active_mm, &init_mm);
}
-#endif /* SMP */
-
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, mm, start, end) \
native_flush_tlb_others(mask, mm, start, end)
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 79887abcb5e1..1361779f44fe 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
#define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT)
#define X86_CR3_PCD_BIT 4 /* Page Cache Disable */
#define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
/*
* Intel CPU features in CR4
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index bd17db15a2c1..0b6124315441 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,14 @@
void __init check_bugs(void)
{
+#ifdef CONFIG_X86_32
+ /*
+ * Regardless of whether PCID is enumerated, the SDM says
+ * that it can't be enabled in 32-bit mode.
+ */
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+#endif
+
identify_boot_cpu();
#ifndef CONFIG_SMP
pr_info("CPU: ");
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c84b62956e8d..cc154ac64f00 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -162,6 +162,24 @@ static int __init x86_mpx_setup(char *s)
}
__setup("nompx", x86_mpx_setup);
+#ifdef CONFIG_X86_64
+static int __init x86_pcid_setup(char *s)
+{
+ /* require an exact match without trailing characters */
+ if (strlen(s))
+ return 0;
+
+ /* do not emit a message if the feature is not present */
+ if (!boot_cpu_has(X86_FEATURE_PCID))
+ return 1;
+
+ setup_clear_cpu_cap(X86_FEATURE_PCID);
+ pr_info("nopcid: PCID feature disabled\n");
+ return 1;
+}
+__setup("nopcid", x86_pcid_setup);
+#endif
+
static int __init x86_noinvpcid_setup(char *s)
{
/* noinvpcid doesn't accept parameters */
@@ -303,6 +321,39 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static void setup_pcid(struct cpuinfo_x86 *c)
+{
+ if (cpu_has(c, X86_FEATURE_PCID)) {
+ if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
+ cr4_set_bits(X86_CR4_PCIDE);
+ /*
+ * INVPCID has two "groups" of types:
+ * 1/2: Invalidate an individual address
+ * 3/4: Invalidate all contexts
+ *
+ * 1/2 take a PCID, but 3/4 do not. So, 3/4
+ * ignore the PCID argument in the descriptor.
+ * But, we have to be careful not to call 1/2
+ * with an actual non-zero PCID in them before
+ * we do the above cr4_set_bits().
+ */
+ if (cpu_has(c, X86_FEATURE_INVPCID))
+ set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
+ } else {
+ /*
+ * flush_tlb_all(), as currently implemented, won't
+ * work if PCID is on but PGE is not. Since that
+ * combination doesn't exist on real hardware, there's
+ * no reason to try to fully support it, but it's
+ * polite to avoid corrupting data if we're on
+ * an improperly configured VM.
+ */
+ clear_cpu_cap(c, X86_FEATURE_PCID);
+ }
+ }
+ kaiser_setup_pcid();
+}
+
/*
* Some CPU features depend on higher CPUID levels, which may not always
* be available due to CPUID level capping or broken virtualization
@@ -934,6 +985,9 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smep(c);
setup_smap(c);
+ /* Set up PCID */
+ setup_pcid(c);
+
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
@@ -1189,7 +1243,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
[DEBUG_STACK - 1] = DEBUG_STKSZ
};
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
/* May not be marked __init: used by software suspend */
@@ -1352,6 +1406,14 @@ void cpu_init(void)
* try to read it.
*/
cr4_init_shadow();
+ if (!kaiser_enabled) {
+ /*
+ * secondary_startup_64() deferred setting PGE in cr4:
+ * probe_page_size_mask() sets it on the boot cpu,
+ * but it needs to be set on each secondary cpu.
+ */
+ cr4_set_bits(X86_CR4_PGE);
+ }
/*
* Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 1e7de3cefc9c..f01b3a12dce0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -2,11 +2,15 @@
#include <linux/types.h>
#include <linux/slab.h>
+#include <asm/kaiser.h>
#include <asm/perf_event.h>
#include <asm/insn.h>
#include "perf_event.h"
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
static DEFINE_PER_CPU(void *, insn_buffer);
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ unsigned int order = get_order(size);
+ struct page *page;
+ unsigned long addr;
+
+ page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+ if (!page)
+ return NULL;
+ addr = (unsigned long)page_address(page);
+ if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+ __free_pages(page, order);
+ addr = 0;
+ }
+ return (void *)addr;
+#else
+ return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+ if (!buffer)
+ return;
+ kaiser_remove_mapping((unsigned long)buffer, size);
+ free_pages((unsigned long)buffer, get_order(size));
+#else
+ kfree(buffer);
+#endif
+}
+
static int alloc_pebs_buffer(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
if (!x86_pmu.pebs)
return 0;
- buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+ buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
if (unlikely(!buffer))
return -ENOMEM;
@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
if (x86_pmu.intel_cap.pebs_format < 2) {
ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
if (!ibuffer) {
- kfree(buffer);
+ dsfree(buffer, x86_pmu.pebs_buffer_size);
return -ENOMEM;
}
per_cpu(insn_buffer, cpu) = ibuffer;
@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
kfree(per_cpu(insn_buffer, cpu));
per_cpu(insn_buffer, cpu) = NULL;
- kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+ x86_pmu.pebs_buffer_size);
ds->pebs_buffer_base = 0;
}
@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
if (!x86_pmu.bts)
return 0;
- buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+ buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
if (unlikely(!buffer)) {
WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
return -ENOMEM;
@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
if (!ds || !x86_pmu.bts)
return;
- kfree((void *)(unsigned long)ds->bts_buffer_base);
+ dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
ds->bts_buffer_base = 0;
}
static int alloc_ds_buffer(int cpu)
{
- int node = cpu_to_node(cpu);
- struct debug_store *ds;
-
- ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
- if (unlikely(!ds))
- return -ENOMEM;
+ struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
+ memset(ds, 0, sizeof(*ds));
per_cpu(cpu_hw_events, cpu).ds = ds;
return 0;
@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
return;
per_cpu(cpu_hw_events, cpu).ds = NULL;
- kfree(ds);
}
void release_ds_buffers(void)
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416e2a7f..b02cb2ec6726 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>
+#include <asm/kaiser.h>
/*
* Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
/* Install the espfix pud into the kernel page directory */
pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+ /*
+ * Just copy the top-level PGD that is mapping the espfix
+ * area to ensure it is mapped into the shadow user page
+ * tables.
+ */
+ if (kaiser_enabled) {
+ set_pgd(native_get_shadow_pgd(pgd_p),
+ __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+ }
/* Randomize the locations */
init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..4034e905741a 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -183,8 +183,8 @@ ENTRY(secondary_startup_64)
movq $(init_level4_pgt - __START_KERNEL_map), %rax
1:
- /* Enable PAE mode and PGE */
- movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+ /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+ movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx
movq %rcx, %cr4
/* Setup early boot stage 4 level pagetables. */
@@ -441,6 +441,27 @@ early_idt_ripmsg:
.balign PAGE_SIZE; \
GLOBAL(name)
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned. We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL 512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+ .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL 0
+#endif
+
/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
i = 0 ; \
@@ -450,9 +471,10 @@ GLOBAL(name)
.endr
__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
.fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(early_dynamic_pgts)
.fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -460,16 +482,18 @@ NEXT_PAGE(early_dynamic_pgts)
.data
#ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.fill 512,8,0
+ .fill KAISER_USER_PGD_FILL,8,0
#else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_PAGE_OFFSET*8, 0
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
.org init_level4_pgt + L4_START_KERNEL*8, 0
/* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_ident_pgt)
.quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -480,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt)
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
#endif
+ .fill KAISER_USER_PGD_FILL,8,0
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..f480b38a03c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
.flags = IRQF_NO_THREAD,
};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
};
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index d6279593bcdd..bc429365b72a 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
+#include <linux/kaiser.h>
#include <asm/ldt.h>
#include <asm/desc.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
set_ldt(pc->ldt->entries, pc->ldt->size);
}
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+ if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+ vfree(ldt->entries);
+ else
+ free_page((unsigned long)ldt->entries);
+ kfree(ldt);
+}
+
/* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
static struct ldt_struct *alloc_ldt_struct(int size)
{
struct ldt_struct *new_ldt;
int alloc_size;
+ int ret;
if (size > LDT_ENTRIES)
return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
return NULL;
}
+ ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+ __PAGE_KERNEL);
new_ldt->size = size;
+ if (ret) {
+ __free_ldt_struct(new_ldt);
+ return NULL;
+ }
return new_ldt;
}
@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
if (likely(!ldt))
return;
+ kaiser_remove_mapping((unsigned long)ldt->entries,
+ ldt->size * LDT_ENTRY_SIZE);
paravirt_free_ldt(ldt->entries, ldt->size);
- if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
- vfree(ldt->entries);
- else
- free_page((unsigned long)ldt->entries);
- kfree(ldt);
+ __free_ldt_struct(ldt);
}
/*
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 8aa05583bc42..0677bf8d3a42 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
@@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
PATCH_SITE(pv_mmu_ops, read_cr3);
PATCH_SITE(pv_mmu_ops, write_cr3);
PATCH_SITE(pv_cpu_ops, clts);
- PATCH_SITE(pv_mmu_ops, flush_tlb_single);
PATCH_SITE(pv_cpu_ops, wbinvd);
#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f7c21c22477..7c5c5dc90ffa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,7 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
.x86_tss = {
.sp0 = TOP_OF_INIT_STACK,
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f660d63f40fe..9a16932c7258 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -93,6 +93,10 @@ void __noreturn machine_real_restart(unsigned int type)
load_cr3(initial_page_table);
#else
write_cr3(real_mode_header->trampoline_pgd);
+
+ /* Exiting long mode will fail if CR4.PCIDE is set. */
+ if (static_cpu_has(X86_FEATURE_PCID))
+ cr4_clear_bits(X86_CR4_PCIDE);
#endif
/* Jump to the identity-mapped low memory code */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index e67b834279b2..bbaae4cf9e8e 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,7 @@
#include <asm/alternative.h>
#include <asm/prom.h>
#include <asm/microcode.h>
+#include <asm/kaiser.h>
/*
* max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p)
*/
init_hypervisor_platform();
+ /*
+ * This needs to happen right after XENPV is set on xen and
+ * kaiser_enabled is checked below in cleanup_highmap().
+ */
+ kaiser_check_boottime_disable();
+
x86_init.resources.probe_roms();
/* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index fbabe4fcc7fb..fe89f938e0f0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -104,14 +104,10 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
spin_lock_irqsave(&rtc_lock, flags);
CMOS_WRITE(0xa, 0xf);
spin_unlock_irqrestore(&rtc_lock, flags);
- local_flush_tlb();
- pr_debug("1.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) =
start_eip >> 4;
- pr_debug("2.\n");
*((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) =
start_eip & 0xf;
- pr_debug("3.\n");
}
static inline void smpboot_restore_warm_reset_vector(void)
@@ -119,11 +115,6 @@ static inline void smpboot_restore_warm_reset_vector(void)
unsigned long flags;
/*
- * Install writable page 0 entry to set BIOS data area.
- */
- local_flush_tlb();
-
- /*
* Paranoid: Set warm reset code and vector here back
* to default values.
*/
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..2bb5ee464df3 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
#include <linux/atomic.h>
atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
(unsigned long) trace_idt_table };
/* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
static int trace_irq_vector_refcount;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 524619351961..510e80da7de4 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -187,7 +187,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
- flush_tlb();
+ flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL);
}
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 684edebb4a0c..00045499f6c2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2383,9 +2383,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)
}
static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
- u64 cr0, u64 cr4)
+ u64 cr0, u64 cr3, u64 cr4)
{
int bad;
+ u64 pcid;
+
+ /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */
+ pcid = 0;
+ if (cr4 & X86_CR4_PCIDE) {
+ pcid = cr3 & 0xfff;
+ cr3 &= ~0xfff;
+ }
+
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
/*
* First enable PAE, long mode needs it before CR0.PG = 1 is set.
@@ -2404,6 +2416,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,
bad = ctxt->ops->set_cr(ctxt, 4, cr4);
if (bad)
return X86EMUL_UNHANDLEABLE;
+ if (pcid) {
+ bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid);
+ if (bad)
+ return X86EMUL_UNHANDLEABLE;
+ }
+
}
return X86EMUL_CONTINUE;
@@ -2414,11 +2432,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
struct desc_struct desc;
struct desc_ptr dt;
u16 selector;
- u32 val, cr0, cr4;
+ u32 val, cr0, cr3, cr4;
int i;
cr0 = GET_SMSTATE(u32, smbase, 0x7ffc);
- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8));
+ cr3 = GET_SMSTATE(u32, smbase, 0x7ff8);
ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;
ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0);
@@ -2460,14 +2478,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8));
- return rsm_enter_protected_mode(ctxt, cr0, cr4);
+ return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
}
static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
{
struct desc_struct desc;
struct desc_ptr dt;
- u64 val, cr0, cr4;
+ u64 val, cr0, cr3, cr4;
u32 base3;
u16 selector;
int i, r;
@@ -2484,7 +2502,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);
cr0 = GET_SMSTATE(u64, smbase, 0x7f58);
- ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50));
+ cr3 = GET_SMSTATE(u64, smbase, 0x7f50);
cr4 = GET_SMSTATE(u64, smbase, 0x7f48);
ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));
val = GET_SMSTATE(u64, smbase, 0x7ed0);
@@ -2512,7 +2530,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)
dt.address = GET_SMSTATE(u64, smbase, 0x7e68);
ctxt->ops->set_gdt(ctxt, &dt);
- r = rsm_enter_protected_mode(ctxt, cr0, cr4);
+ r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);
if (r != X86EMUL_CONTINUE)
return r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e5f44f33de89..ccf17dbfea09 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
return 1;
/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
- if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+ if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+ !is_long_mode(vcpu))
return 1;
}
@@ -6941,7 +6942,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
#endif
kvm_rip_write(vcpu, regs->rip);
- kvm_set_rflags(vcpu, regs->rflags);
+ kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 422db000d727..a744506856b1 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option)
return 0; /* Buffer overrun */
}
+
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+ const char *option, char *buffer, int bufsize)
+{
+ char c;
+ int pos = 0, len = -1;
+ const char *opptr = NULL;
+ char *bufptr = buffer;
+ enum {
+ st_wordstart = 0, /* Start of word/after whitespace */
+ st_wordcmp, /* Comparing this word */
+ st_wordskip, /* Miscompare, skip */
+ st_bufcpy, /* Copying this to buffer */
+ } state = st_wordstart;
+
+ if (!cmdline)
+ return -1; /* No command line */
+
+ /*
+ * This 'pos' check ensures we do not overrun
+ * a non-NULL-terminated 'cmdline'
+ */
+ while (pos++ < max_cmdline_size) {
+ c = *(char *)cmdline++;
+ if (!c)
+ break;
+
+ switch (state) {
+ case st_wordstart:
+ if (myisspace(c))
+ break;
+
+ state = st_wordcmp;
+ opptr = option;
+ /* fall through */
+
+ case st_wordcmp:
+ if ((c == '=') && !*opptr) {
+ /*
+ * We matched all the way to the end of the
+ * option we were looking for, prepare to
+ * copy the argument.
+ */
+ len = 0;
+ bufptr = buffer;
+ state = st_bufcpy;
+ break;
+ } else if (c == *opptr++) {
+ /*
+ * We are currently matching, so continue
+ * to the next character on the cmdline.
+ */
+ break;
+ }
+ state = st_wordskip;
+ /* fall through */
+
+ case st_wordskip:
+ if (myisspace(c))
+ state = st_wordstart;
+ break;
+
+ case st_bufcpy:
+ if (myisspace(c)) {
+ state = st_wordstart;
+ } else {
+ /*
+ * Increment len, but don't overrun the
+ * supplied buffer and leave room for the
+ * NULL terminator.
+ */
+ if (++len < bufsize)
+ *bufptr++ = c;
+ }
+ break;
+ }
+ }
+
+ if (bufsize)
+ *bufptr = '\0';
+
+ return len;
+}
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+ int bufsize)
+{
+ return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+ buffer, bufsize);
+}
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1ae7c141f778..61e6cead9c4a 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o
obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 3aebbd6c6f5f..2bd45ae91eb3 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void)
cr4_set_bits_and_update_boot(X86_CR4_PSE);
/* Enable PGE if available */
- if (cpu_has_pge) {
+ if (cpu_has_pge && !kaiser_enabled) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
} else
@@ -753,10 +753,8 @@ void __init zone_sizes_init(void)
}
DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
-#ifdef CONFIG_SMP
.active_mm = &init_mm,
.state = 0,
-#endif
.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
};
EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..d76ec9348cff 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -395,6 +395,16 @@ void __init cleanup_highmap(void)
continue;
if (vaddr < (unsigned long) _text || vaddr > end)
set_pmd(pmd, __pmd(0));
+ else if (kaiser_enabled) {
+ /*
+ * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+ * clear that now. This is not important, so long as
+ * CR4.PGE remains clear, but it removes an anomaly.
+ * Physical mapping setup below avoids _PAGE_GLOBAL
+ * by use of massage_pgprot() inside pfn_pte() etc.
+ */
+ set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+ }
}
}
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..bf6a915f60e7
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,455 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h> /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/cmdline.h>
+
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3. It would take
+ * another register. So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes. No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte(). We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+ pgd_t *pgd;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset_k(vaddr);
+ /*
+ * We made all the kernel PGDs present in kaiser_init().
+ * We expect them to stay that way.
+ */
+ BUG_ON(pgd_none(*pgd));
+ /*
+ * PGDs are either 512GB or 128TB on all x86_64
+ * configurations. We don't handle these.
+ */
+ BUG_ON(pgd_large(*pgd));
+
+ pud = pud_offset(pgd, vaddr);
+ if (pud_none(*pud)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pud_large(*pud))
+ return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+ pmd = pmd_offset(pud, vaddr);
+ if (pmd_none(*pmd)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ if (pmd_large(*pmd))
+ return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+ pte = pte_offset_kernel(pmd, vaddr);
+ if (pte_none(*pte)) {
+ WARN_ON_ONCE(1);
+ return -1;
+ }
+
+ return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+ pmd_t *pmd;
+ pud_t *pud;
+ pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+ gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+ if (pgd_none(*pgd)) {
+ WARN_ONCE(1, "All shadow pgds should have been populated");
+ return NULL;
+ }
+ BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+ pud = pud_offset(pgd, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pud_large(*pud)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pud_none(*pud)) {
+ unsigned long new_pmd_page = __get_free_page(gfp);
+ if (!new_pmd_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pud_none(*pud)) {
+ set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pmd_page), NR_KAISERTABLE);
+ } else
+ free_page(new_pmd_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ pmd = pmd_offset(pud, address);
+ /* The shadow page tables do not use large mappings: */
+ if (pmd_large(*pmd)) {
+ WARN_ON(1);
+ return NULL;
+ }
+ if (pmd_none(*pmd)) {
+ unsigned long new_pte_page = __get_free_page(gfp);
+ if (!new_pte_page)
+ return NULL;
+ spin_lock(&shadow_table_allocation_lock);
+ if (pmd_none(*pmd)) {
+ set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+ __inc_zone_page_state(virt_to_page((void *)
+ new_pte_page), NR_KAISERTABLE);
+ } else
+ free_page(new_pte_page);
+ spin_unlock(&shadow_table_allocation_lock);
+ }
+
+ return pte_offset_kernel(pmd, address);
+}
+
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+ unsigned long flags)
+{
+ int ret = 0;
+ pte_t *pte;
+ unsigned long start_addr = (unsigned long )__start_addr;
+ unsigned long address = start_addr & PAGE_MASK;
+ unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+ unsigned long target_address;
+
+ /*
+ * It is convenient for callers to pass in __PAGE_KERNEL etc,
+ * and there is no actual harm from setting _PAGE_GLOBAL, so
+ * long as CR4.PGE is not set. But it is nonetheless troubling
+ * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+ * requires that not to be #defined to 0): so mask it off here.
+ */
+ flags &= ~_PAGE_GLOBAL;
+
+ for (; address < end_addr; address += PAGE_SIZE) {
+ target_address = get_pa_from_mapping(address);
+ if (target_address == -1) {
+ ret = -EIO;
+ break;
+ }
+ pte = kaiser_pagetable_walk(address);
+ if (!pte) {
+ ret = -ENOMEM;
+ break;
+ }
+ if (pte_none(*pte)) {
+ set_pte(pte, __pte(flags | target_address));
+ } else {
+ pte_t tmp;
+ set_pte(&tmp, __pte(flags | target_address));
+ WARN_ON_ONCE(!pte_same(*pte, tmp));
+ }
+ }
+ return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+ unsigned long size = end - start;
+
+ return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated. This ensures that all processes that get
+ * forked have the same entries. This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+ pgd_t *pgd;
+ int i = 0;
+
+ pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+ for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+ pgd_t new_pgd;
+ pud_t *pud = pud_alloc_one(&init_mm,
+ PAGE_OFFSET + i * PGDIR_SIZE);
+ if (!pud) {
+ WARN_ON(1);
+ break;
+ }
+ inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+ new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+ /*
+ * Make sure not to stomp on some other pgd entry.
+ */
+ if (!pgd_none(pgd[i])) {
+ WARN_ON(1);
+ continue;
+ }
+ set_pgd(pgd + i, new_pgd);
+ }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do { \
+ int __ret = kaiser_add_user_map(start, size, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \
+ int __ret = kaiser_add_user_map_ptrs(start, end, flags); \
+ WARN_ON(__ret); \
+} while (0)
+
+void __init kaiser_check_boottime_disable(void)
+{
+ bool enable = true;
+ char arg[5];
+ int ret;
+
+ if (boot_cpu_has(X86_FEATURE_XENPV))
+ goto silent_disable;
+
+ ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+ if (ret > 0) {
+ if (!strncmp(arg, "on", 2))
+ goto enable;
+
+ if (!strncmp(arg, "off", 3))
+ goto disable;
+
+ if (!strncmp(arg, "auto", 4))
+ goto skip;
+ }
+
+ if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ goto disable;
+
+skip:
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ goto disable;
+
+enable:
+ if (enable)
+ setup_force_cpu_cap(X86_FEATURE_KAISER);
+
+ return;
+
+disable:
+ pr_info("disabled\n");
+
+silent_disable:
+ kaiser_enabled = 0;
+ setup_clear_cpu_cap(X86_FEATURE_KAISER);
+}
+
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die. But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it. If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+ int cpu;
+
+ if (!kaiser_enabled)
+ return;
+
+ kaiser_init_all_pgds();
+
+ for_each_possible_cpu(cpu) {
+ void *percpu_vaddr = __per_cpu_user_mapped_start +
+ per_cpu_offset(cpu);
+ unsigned long percpu_sz = __per_cpu_user_mapped_end -
+ __per_cpu_user_mapped_start;
+ kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+ __PAGE_KERNEL);
+ }
+
+ /*
+ * Map the entry/exit text section, which is needed at
+ * switches from user to and from kernel.
+ */
+ kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+ __PAGE_KERNEL_RX);
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+ kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+ __irqentry_text_end,
+ __PAGE_KERNEL_RX);
+#endif
+ kaiser_add_user_map_early((void *)idt_descr.address,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+ kaiser_add_user_map_early(&trace_idt_descr,
+ sizeof(trace_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&trace_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+#endif
+ kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+ __PAGE_KERNEL);
+ kaiser_add_user_map_early(&debug_idt_table,
+ sizeof(gate_desc) * NR_VECTORS,
+ __PAGE_KERNEL);
+
+ pr_info("enabled\n");
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+ if (!kaiser_enabled)
+ return 0;
+ return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+ extern void unmap_pud_range_nofree(pgd_t *pgd,
+ unsigned long start, unsigned long end);
+ unsigned long end = start + size;
+ unsigned long addr, next;
+ pgd_t *pgd;
+
+ if (!kaiser_enabled)
+ return;
+ pgd = native_get_shadow_pgd(pgd_offset_k(start));
+ for (addr = start; addr < end; pgd++, addr = next) {
+ next = pgd_addr_end(addr, end);
+ unmap_pud_range_nofree(pgd, addr, next);
+ }
+}
+
+/*
+ * Page table pages are page-aligned. The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+ return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+ if (!kaiser_enabled)
+ return pgd;
+ /*
+ * Do we need to also populate the shadow pgd? Check _PAGE_USER to
+ * skip cases like kexec and EFI which make temporary low mappings.
+ */
+ if (pgd.pgd & _PAGE_USER) {
+ if (is_userspace_pgd(pgdp)) {
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ /*
+ * Even if the entry is *mapping* userspace, ensure
+ * that userspace can not use it. This way, if we
+ * get out to userspace running on the kernel CR3,
+ * userspace will crash instead of running.
+ */
+ pgd.pgd |= _PAGE_NX;
+ }
+ } else if (!pgd.pgd) {
+ /*
+ * pgd_clear() cannot check _PAGE_USER, and is even used to
+ * clear corrupted pgd entries: so just rely on cases like
+ * kexec and EFI never to be using pgd_clear().
+ */
+ if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+ is_userspace_pgd(pgdp))
+ native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+ }
+ return pgd;
+}
+
+void kaiser_setup_pcid(void)
+{
+ unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+ if (this_cpu_has(X86_FEATURE_PCID))
+ user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+ /*
+ * These variables are used by the entry/exit
+ * code to change PCID and pgd and TLB flushing.
+ */
+ this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+ if (this_cpu_has(X86_FEATURE_PCID))
+ this_cpu_write(x86_cr3_pcid_user,
+ X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b599a780a5a9..79377e2a7bcd 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
#define CPA_FLUSHTLB 1
#define CPA_ARRAY 2
#define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
#ifdef CONFIG_PROC_FS
static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
return 0;
}
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PTE; i++)
if (!pte_none(pte[i]))
return false;
@@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte)
return true;
}
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
{
int i;
+ if (!(cpa->flags & CPA_FREE_PAGETABLES))
+ return false;
+
for (i = 0; i < PTRS_PER_PMD; i++)
if (!pmd_none(pmd[i]))
return false;
@@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud)
return true;
}
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+ unsigned long start,
+ unsigned long end)
{
pte_t *pte = pte_offset_kernel(pmd, start);
@@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
pte++;
}
- if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+ if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
pmd_clear(pmd);
return true;
}
return false;
}
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
unsigned long start, unsigned long end)
{
- if (unmap_pte_range(pmd, start, end))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (unmap_pte_range(cpa, pmd, start, end))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+ unsigned long start, unsigned long end)
{
pmd_t *pmd = pmd_offset(pud, start);
@@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- __unmap_pmd_range(pud, pmd, start, pre_end);
+ __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
start = pre_end;
pmd++;
@@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
if (pmd_large(*pmd))
pmd_clear(pmd);
else
- __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+ __unmap_pmd_range(cpa, pud, pmd,
+ start, start + PMD_SIZE);
start += PMD_SIZE;
pmd++;
@@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
* 4K leftovers?
*/
if (start < end)
- return __unmap_pmd_range(pud, pmd, start, end);
+ return __unmap_pmd_range(cpa, pud, pmd, start, end);
/*
* Try again to free the PMD page if haven't succeeded above.
*/
if (!pud_none(*pud))
- if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+ if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
pud_clear(pud);
}
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+ unsigned long start,
+ unsigned long end)
{
pud_t *pud = pud_offset(pgd, start);
@@ -840,7 +853,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
unsigned long pre_end = min_t(unsigned long, end, next_page);
- unmap_pmd_range(pud, start, pre_end);
+ unmap_pmd_range(cpa, pud, start, pre_end);
start = pre_end;
pud++;
@@ -854,7 +867,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
if (pud_large(*pud))
pud_clear(pud);
else
- unmap_pmd_range(pud, start, start + PUD_SIZE);
+ unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
start += PUD_SIZE;
pud++;
@@ -864,7 +877,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
* 2M leftovers?
*/
if (start < end)
- unmap_pmd_range(pud, start, end);
+ unmap_pmd_range(cpa, pud, start, end);
/*
* No need to try to free the PUD page because we'll free it in
@@ -872,6 +885,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
*/
}
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = CPA_FREE_PAGETABLES,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+ struct cpa_data cpa = {
+ .flags = 0,
+ };
+
+ __unmap_pud_range(&cpa, pgd, start, end);
+}
+
static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end)
{
pgd_t *pgd_entry = root + pgd_index(addr);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..dbc27a2b4ad5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
#include <asm/fixmap.h>
#include <asm/mtrr.h>
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
#ifdef CONFIG_HIGHPTE
#define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -340,14 +340,24 @@ static inline void _pgd_free(pgd_t *pgd)
kmem_cache_free(pgd_cache, pgd);
}
#else
+
+/*
+ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER kaiser_enabled
+
static inline pgd_t *_pgd_alloc(void)
{
- return (pgd_t *)__get_free_page(PGALLOC_GFP);
+ /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+ return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+ PGD_ALLOCATION_ORDER);
}
static inline void _pgd_free(pgd_t *pgd)
{
- free_page((unsigned long)pgd);
+ free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
}
#endif /* CONFIG_X86_PAE */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 45ba87466e6a..7cad01af6dcd 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,16 +6,17 @@
#include <linux/interrupt.h>
#include <linux/module.h>
#include <linux/cpu.h>
+#include <linux/debugfs.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>
/*
- * Smarter SMP flushing macros.
+ * TLB flushing, formerly SMP-only
* c/o Linus Torvalds.
*
* These mean you can really definitely utterly forget about
@@ -28,14 +29,42 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
-#ifdef CONFIG_SMP
-
struct flush_tlb_info {
struct mm_struct *flush_mm;
unsigned long flush_start;
unsigned long flush_end;
};
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+ unsigned long new_mm_cr3 = __pa(pgdir);
+
+ if (kaiser_enabled) {
+ /*
+ * We reuse the same PCID for different tasks, so we must
+ * flush all the entries for the PCID out when we change tasks.
+ * Flush KERN below, flush USER when returning to userspace in
+ * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+ *
+ * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+ * do it here, but can only be used if X86_FEATURE_INVPCID is
+ * available - and many machines support pcid without invpcid.
+ *
+ * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+ * would be needed in the write_cr3() below - if PCIDs enabled.
+ */
+ BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
+ kaiser_flush_tlb_on_return_to_user();
+ }
+
+ /*
+ * Caution: many callers of this function expect
+ * that load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask writes.
+ */
+ write_cr3(new_mm_cr3);
+}
+
/*
* We cannot call mmdrop() because we are in interrupt context,
* instead update mm->cpu_vm_mask.
@@ -47,7 +76,7 @@ void leave_mm(int cpu)
BUG();
if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
- load_cr3(swapper_pg_dir);
+ load_new_mm_cr3(swapper_pg_dir);
/*
* This gets called in the idle path where RCU
* functions differently. Tracing normally
@@ -59,8 +88,6 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
-#endif /* CONFIG_SMP */
-
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
@@ -77,10 +104,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
-#ifdef CONFIG_SMP
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
cpumask_set_cpu(cpu, mm_cpumask(next));
/*
@@ -111,7 +136,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* ordering guarantee we need.
*
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
@@ -137,9 +162,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
if (unlikely(prev->context.ldt != next->context.ldt))
load_mm_ldt(next);
#endif
- }
-#ifdef CONFIG_SMP
- else {
+ } else {
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
@@ -160,17 +183,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
* As above, load_cr3() is serializing and orders TLB
* fills with respect to the mm_cpumask write.
*/
- load_cr3(next->pgd);
+ load_new_mm_cr3(next->pgd);
trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
load_mm_cr4(next);
load_mm_ldt(next);
}
}
-#endif
}
-#ifdef CONFIG_SMP
-
/*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
@@ -272,23 +292,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
smp_call_function_many(cpumask, flush_tlb_func, &info, 1);
}
-void flush_tlb_current_task(void)
-{
- struct mm_struct *mm = current->mm;
-
- preempt_disable();
-
- count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
-
- /* This is an implicit full barrier that synchronizes with switch_mm. */
- local_flush_tlb();
-
- trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
- preempt_enable();
-}
-
/*
* See Documentation/x86/tlb.txt for details. We choose 33
* because it is large enough to cover the vast majority (at
@@ -309,6 +312,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
preempt_disable();
+
+ if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
+ base_pages_to_flush = (end - start) >> PAGE_SHIFT;
+ if (base_pages_to_flush > tlb_single_page_flush_ceiling)
+ base_pages_to_flush = TLB_FLUSH_ALL;
+
if (current->active_mm != mm) {
/* Synchronize with switch_mm. */
smp_mb();
@@ -325,15 +334,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
goto out;
}
- if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
- base_pages_to_flush = (end - start) >> PAGE_SHIFT;
-
/*
* Both branches below are implicit full barriers (MOV to CR or
* INVLPG) that synchronize with switch_mm.
*/
- if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
- base_pages_to_flush = TLB_FLUSH_ALL;
+ if (base_pages_to_flush == TLB_FLUSH_ALL) {
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
local_flush_tlb();
} else {
@@ -354,33 +359,6 @@ out:
preempt_enable();
}
-void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- preempt_disable();
-
- if (current->active_mm == mm) {
- if (current->mm) {
- /*
- * Implicit full barrier (INVLPG) that synchronizes
- * with switch_mm.
- */
- __flush_tlb_one(start);
- } else {
- leave_mm(smp_processor_id());
-
- /* Synchronize with switch_mm. */
- smp_mb();
- }
- }
-
- if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE);
-
- preempt_enable();
-}
-
static void do_flush_tlb_all(void *info)
{
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
@@ -465,5 +443,3 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
-
-#endif /* CONFIG_SMP */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index ffa41591bff9..cbef64b508e1 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -433,6 +433,12 @@ static void __init xen_init_cpuid_mask(void)
~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
+ /*
+ * Xen PV would need some work to support PCID: CR3 handling as well
+ * as xen_flush_tlb_others() would need updating.
+ */
+ cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */
+
if (!xen_initial_domain())
cpuid_leaf1_edx_mask &=
~((1 << X86_FEATURE_ACPI)); /* disable ACPI */
diff --git a/crypto/mcryptd.c b/crypto/mcryptd.c
index b4f3930266b1..f620fe09d20a 100644
--- a/crypto/mcryptd.c
+++ b/crypto/mcryptd.c
@@ -80,6 +80,7 @@ static int mcryptd_init_queue(struct mcryptd_queue *queue,
pr_debug("cpu_queue #%d %p\n", cpu, queue->cpu_queue);
crypto_init_queue(&cpu_queue->queue, max_cpu_qlen);
INIT_WORK(&cpu_queue->work, mcryptd_queue_worker);
+ spin_lock_init(&cpu_queue->q_lock);
}
return 0;
}
@@ -103,15 +104,16 @@ static int mcryptd_enqueue_request(struct mcryptd_queue *queue,
int cpu, err;
struct mcryptd_cpu_queue *cpu_queue;
- cpu = get_cpu();
- cpu_queue = this_cpu_ptr(queue->cpu_queue);
- rctx->tag.cpu = cpu;
+ cpu_queue = raw_cpu_ptr(queue->cpu_queue);
+ spin_lock(&cpu_queue->q_lock);
+ cpu = smp_processor_id();
+ rctx->tag.cpu = smp_processor_id();
err = crypto_enqueue_request(&cpu_queue->queue, request);
pr_debug("enqueue request: cpu %d cpu_queue %p request %p\n",
cpu, cpu_queue, request);
+ spin_unlock(&cpu_queue->q_lock);
queue_work_on(cpu, kcrypto_wq, &cpu_queue->work);
- put_cpu();
return err;
}
@@ -164,16 +166,11 @@ static void mcryptd_queue_worker(struct work_struct *work)
cpu_queue = container_of(work, struct mcryptd_cpu_queue, work);
i = 0;
while (i < MCRYPTD_BATCH || single_task_running()) {
- /*
- * preempt_disable/enable is used to prevent
- * being preempted by mcryptd_enqueue_request()
- */
- local_bh_disable();
- preempt_disable();
+
+ spin_lock_bh(&cpu_queue->q_lock);
backlog = crypto_get_backlog(&cpu_queue->queue);
req = crypto_dequeue_request(&cpu_queue->queue);
- preempt_enable();
- local_bh_enable();
+ spin_unlock_bh(&cpu_queue->q_lock);
if (!req) {
mcryptd_opportunistic_flush();
@@ -188,7 +185,7 @@ static void mcryptd_queue_worker(struct work_struct *work)
++i;
}
if (cpu_queue->queue.qlen)
- queue_work(kcrypto_wq, &cpu_queue->work);
+ queue_work_on(smp_processor_id(), kcrypto_wq, &cpu_queue->work);
}
void mcryptd_flusher(struct work_struct *__work)
diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 6682c5daf742..4c9be45ea328 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -1020,7 +1020,7 @@ skip:
/* The record may be cleared by others, try read next record */
if (len == -ENOENT)
goto skip;
- else if (len < sizeof(*rcd)) {
+ else if (len < 0 || len < sizeof(*rcd)) {
rc = -EIO;
goto out;
}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
index bc147582bed9..6d62b69c898e 100644
--- a/drivers/infiniband/hw/cxgb4/cq.c
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -579,10 +579,10 @@ static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
ret = -EAGAIN;
goto skip_cqe;
}
- if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+ if (unlikely(!CQE_STATUS(hw_cqe) &&
+ CQE_WRID_MSN(hw_cqe) != wq->rq.msn)) {
t4_set_wq_in_error(wq);
- hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
- goto proc_cqe;
+ hw_cqe->header |= cpu_to_be32(CQE_STATUS_V(T4_ERR_MSN));
}
goto proc_cqe;
}
diff --git a/drivers/mfd/cros_ec_spi.c b/drivers/mfd/cros_ec_spi.c
index 6a0f6ec67c6b..ee7847a1ca06 100644
--- a/drivers/mfd/cros_ec_spi.c
+++ b/drivers/mfd/cros_ec_spi.c
@@ -660,6 +660,7 @@ static int cros_ec_spi_probe(struct spi_device *spi)
sizeof(struct ec_response_get_protocol_info);
ec_dev->dout_size = sizeof(struct ec_host_request);
+ ec_spi->last_transfer_ns = ktime_get_ns();
err = cros_ec_register(ec_dev);
if (err) {
diff --git a/drivers/mfd/twl4030-audio.c b/drivers/mfd/twl4030-audio.c
index 0a1606480023..cc832d309599 100644
--- a/drivers/mfd/twl4030-audio.c
+++ b/drivers/mfd/twl4030-audio.c
@@ -159,13 +159,18 @@ unsigned int twl4030_audio_get_mclk(void)
EXPORT_SYMBOL_GPL(twl4030_audio_get_mclk);
static bool twl4030_audio_has_codec(struct twl4030_audio_data *pdata,
- struct device_node *node)
+ struct device_node *parent)
{
+ struct device_node *node;
+
if (pdata && pdata->codec)
return true;
- if (of_find_node_by_name(node, "codec"))
+ node = of_get_child_by_name(parent, "codec");
+ if (node) {
+ of_node_put(node);
return true;
+ }
return false;
}
diff --git a/drivers/mfd/twl6040.c b/drivers/mfd/twl6040.c
index 08a693cd38cc..72aab60ae846 100644
--- a/drivers/mfd/twl6040.c
+++ b/drivers/mfd/twl6040.c
@@ -97,12 +97,16 @@ static struct reg_sequence twl6040_patch[] = {
};
-static bool twl6040_has_vibra(struct device_node *node)
+static bool twl6040_has_vibra(struct device_node *parent)
{
-#ifdef CONFIG_OF
- if (of_find_node_by_name(node, "vibra"))
+ struct device_node *node;
+
+ node = of_get_child_by_name(parent, "vibra");
+ if (node) {
+ of_node_put(node);
return true;
-#endif
+ }
+
return false;
}
diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index 3613469dc5c6..ab53e0cfb4dc 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -14228,7 +14228,9 @@ static int tg3_change_mtu(struct net_device *dev, int new_mtu)
/* Reset PHY, otherwise the read DMA engine will be in a mode that
* breaks all requests to 256 bytes.
*/
- if (tg3_asic_rev(tp) == ASIC_REV_57766)
+ if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
+ tg3_asic_rev(tp) == ASIC_REV_5717 ||
+ tg3_asic_rev(tp) == ASIC_REV_5719)
reset_phy = true;
err = tg3_restart_hw(tp, reset_phy);
diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c
index fc2fb25343f4..c122b3b99cd8 100644
--- a/drivers/net/ethernet/marvell/mvmdio.c
+++ b/drivers/net/ethernet/marvell/mvmdio.c
@@ -241,7 +241,8 @@ static int orion_mdio_probe(struct platform_device *pdev)
dev->regs + MVMDIO_ERR_INT_MASK);
} else if (dev->err_interrupt == -EPROBE_DEFER) {
- return -EPROBE_DEFER;
+ ret = -EPROBE_DEFER;
+ goto out_mdio;
}
mutex_init(&dev->lock);
diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c
index 15056f06754a..7430dd44019e 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -914,6 +914,10 @@ static void mvneta_port_disable(struct mvneta_port *pp)
val &= ~MVNETA_GMAC0_PORT_ENABLE;
mvreg_write(pp, MVNETA_GMAC_CTRL_0, val);
+ pp->link = 0;
+ pp->duplex = -1;
+ pp->speed = 0;
+
udelay(200);
}
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index c8b85f1069ff..920391165f18 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -541,6 +541,7 @@ static int ksz9031_read_status(struct phy_device *phydev)
phydev->link = 0;
if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
phydev->drv->config_intr(phydev);
+ return genphy_config_aneg(phydev);
}
return 0;
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 2cbecbda1ae3..b0ea8dee5f06 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -737,6 +737,7 @@ static const struct usb_device_id products[] = {
{QMI_FIXED_INTF(0x1199, 0x9079, 10)}, /* Sierra Wireless EM74xx */
{QMI_FIXED_INTF(0x1199, 0x907b, 8)}, /* Sierra Wireless EM74xx */
{QMI_FIXED_INTF(0x1199, 0x907b, 10)}, /* Sierra Wireless EM74xx */
+ {QMI_FIXED_INTF(0x1199, 0x9091, 8)}, /* Sierra Wireless EM7565 */
{QMI_FIXED_INTF(0x1bbb, 0x011e, 4)}, /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
{QMI_FIXED_INTF(0x1bbb, 0x0203, 2)}, /* Alcatel L800MA */
{QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index d0c2759076a2..312cb5b74dec 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -1654,3 +1654,36 @@ void lba_set_iregs(struct parisc_device *lba, u32 ibase, u32 imask)
iounmap(base_addr);
}
+
+/*
+ * The design of the Diva management card in rp34x0 machines (rp3410, rp3440)
+ * seems rushed, so that many built-in components simply don't work.
+ * The following quirks disable the serial AUX port and the built-in ATI RV100
+ * Radeon 7000 graphics card which both don't have any external connectors and
+ * thus are useless, and even worse, e.g. the AUX port occupies ttyS0 and as
+ * such makes those machines the only PARISC machines on which we can't use
+ * ttyS0 as boot console.
+ */
+static void quirk_diva_ati_card(struct pci_dev *dev)
+{
+ if (dev->subsystem_vendor != PCI_VENDOR_ID_HP ||
+ dev->subsystem_device != 0x1292)
+ return;
+
+ dev_info(&dev->dev, "Hiding Diva built-in ATI card");
+ dev->device = 0;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RADEON_QY,
+ quirk_diva_ati_card);
+
+static void quirk_diva_aux_disable(struct pci_dev *dev)
+{
+ if (dev->subsystem_vendor != PCI_VENDOR_ID_HP ||
+ dev->subsystem_device != 0x1291)
+ return;
+
+ dev_info(&dev->dev, "Hiding Diva built-in AUX serial device");
+ dev->device = 0;
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_DIVA_AUX,
+ quirk_diva_aux_disable);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index fca925543fae..32bd8ab79d53 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -944,7 +944,12 @@ static int pci_pm_thaw_noirq(struct device *dev)
if (pci_has_legacy_pm_support(pci_dev))
return pci_legacy_resume_early(dev);
- pci_update_current_state(pci_dev, PCI_D0);
+ /*
+ * pci_restore_state() requires the device to be in D0 (because of MSI
+ * restoration among other things), so force it into D0 in case the
+ * driver's "freeze" callbacks put it into a low-power state directly.
+ */
+ pci_set_power_state(pci_dev, PCI_D0);
pci_restore_state(pci_dev);
if (drv && drv->pm && drv->pm->thaw_noirq)
diff --git a/drivers/spi/spi-xilinx.c b/drivers/spi/spi-xilinx.c
index 3009121173cd..3c6ea5c3ddd2 100644
--- a/drivers/spi/spi-xilinx.c
+++ b/drivers/spi/spi-xilinx.c
@@ -271,6 +271,7 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
while (remaining_words) {
int n_words, tx_words, rx_words;
u32 sr;
+ int stalled;
n_words = min(remaining_words, xspi->buffer_size);
@@ -299,7 +300,17 @@ static int xilinx_spi_txrx_bufs(struct spi_device *spi, struct spi_transfer *t)
/* Read out all the data from the Rx FIFO */
rx_words = n_words;
+ stalled = 10;
while (rx_words) {
+ if (rx_words == n_words && !(stalled--) &&
+ !(sr & XSPI_SR_TX_EMPTY_MASK) &&
+ (sr & XSPI_SR_RX_EMPTY_MASK)) {
+ dev_err(&spi->dev,
+ "Detected stall. Check C_SPI_MODE and C_SPI_MEMORY\n");
+ xspi_init_hw(xspi);
+ return -EIO;
+ }
+
if ((sr & XSPI_SR_TX_EMPTY_MASK) && (rx_words > 1)) {
xilinx_spi_rx(xspi);
rx_words--;
diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 84e71bd19082..41dda25da049 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -1801,7 +1801,7 @@ static void n_tty_set_termios(struct tty_struct *tty, struct ktermios *old)
{
struct n_tty_data *ldata = tty->disc_data;
- if (!old || (old->c_lflag ^ tty->termios.c_lflag) & ICANON) {
+ if (!old || (old->c_lflag ^ tty->termios.c_lflag) & (ICANON | EXTPROC)) {
bitmap_zero(ldata->read_flags, N_TTY_BUF_SIZE);
ldata->line_start = ldata->read_tail;
if (!L_ICANON(tty) || !read_cnt(ldata)) {
@@ -2493,7 +2493,7 @@ static int n_tty_ioctl(struct tty_struct *tty, struct file *file,
return put_user(tty_chars_in_buffer(tty), (int __user *) arg);
case TIOCINQ:
down_write(&tty->termios_rwsem);
- if (L_ICANON(tty))
+ if (L_ICANON(tty) && !L_EXTPROC(tty))
retval = inq_canon(ldata);
else
retval = read_cnt(ldata);
diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c
index f6fde903fcad..22dcccf2d286 100644
--- a/drivers/usb/core/config.c
+++ b/drivers/usb/core/config.c
@@ -973,7 +973,7 @@ int usb_get_bos_descriptor(struct usb_device *dev)
case USB_SSP_CAP_TYPE:
ssp_cap = (struct usb_ssp_cap_descriptor *)buffer;
ssac = (le32_to_cpu(ssp_cap->bmAttributes) &
- USB_SSP_SUBLINK_SPEED_ATTRIBS) + 1;
+ USB_SSP_SUBLINK_SPEED_ATTRIBS);
if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac))
dev->bos->ssp_cap = ssp_cap;
break;
diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c
index 50010282c010..c05c4f877750 100644
--- a/drivers/usb/core/quirks.c
+++ b/drivers/usb/core/quirks.c
@@ -57,10 +57,11 @@ static const struct usb_device_id usb_quirk_list[] = {
/* Microsoft LifeCam-VX700 v2.0 */
{ USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME },
- /* Logitech HD Pro Webcams C920, C920-C and C930e */
+ /* Logitech HD Pro Webcams C920, C920-C, C925e and C930e */
{ USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT },
{ USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT },
{ USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT },
+ { USB_DEVICE(0x046d, 0x085b), .driver_info = USB_QUIRK_DELAY_INIT },
/* Logitech ConferenceCam CC3000e */
{ USB_DEVICE(0x046d, 0x0847), .driver_info = USB_QUIRK_DELAY_INIT },
@@ -154,6 +155,9 @@ static const struct usb_device_id usb_quirk_list[] = {
/* Genesys Logic hub, internally used by KY-688 USB 3.1 Type-C Hub */
{ USB_DEVICE(0x05e3, 0x0612), .driver_info = USB_QUIRK_NO_LPM },
+ /* ELSA MicroLink 56K */
+ { USB_DEVICE(0x05cc, 0x2267), .driver_info = USB_QUIRK_RESET_RESUME },
+
/* Genesys Logic hub, internally used by Moshi USB to Ethernet Adapter */
{ USB_DEVICE(0x05e3, 0x0616), .driver_info = USB_QUIRK_NO_LPM },
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index e8f990642281..cbf3be66f89c 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -185,6 +185,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci)
xhci->quirks |= XHCI_BROKEN_STREAMS;
}
if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
+ pdev->device == 0x0014)
+ xhci->quirks |= XHCI_TRUST_TX_LENGTH;
+ if (pdev->vendor == PCI_VENDOR_ID_RENESAS &&
pdev->device == 0x0015)
xhci->quirks |= XHCI_RESET_ON_RESUME;
if (pdev->vendor == PCI_VENDOR_ID_VIA)
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 30344efc123f..64fe9dc25ed4 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1017,6 +1017,7 @@ static const struct usb_device_id id_table_combined[] = {
.driver_info = (kernel_ulong_t)&ftdi_jtag_quirk },
{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) },
{ USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) },
+ { USB_DEVICE(AIRBUS_DS_VID, AIRBUS_DS_P8GR) },
{ } /* Terminating entry */
};
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index f9d15bd62785..543d2801632b 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -914,6 +914,12 @@
#define ICPDAS_I7563U_PID 0x0105
/*
+ * Airbus Defence and Space
+ */
+#define AIRBUS_DS_VID 0x1e8e /* Vendor ID */
+#define AIRBUS_DS_P8GR 0x6001 /* Tetra P8GR */
+
+/*
* RT Systems programming cables for various ham radios
*/
#define RTSYSTEMS_VID 0x2100 /* Vendor ID */
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index ffa8ec917ff5..a818c43a02ec 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -236,6 +236,8 @@ static void option_instat_callback(struct urb *urb);
/* These Quectel products use Qualcomm's vendor ID */
#define QUECTEL_PRODUCT_UC20 0x9003
#define QUECTEL_PRODUCT_UC15 0x9090
+/* These Yuga products use Qualcomm's vendor ID */
+#define YUGA_PRODUCT_CLM920_NC5 0x9625
#define QUECTEL_VENDOR_ID 0x2c7c
/* These Quectel products use Quectel's vendor ID */
@@ -283,6 +285,7 @@ static void option_instat_callback(struct urb *urb);
#define TELIT_PRODUCT_LE922_USBCFG3 0x1043
#define TELIT_PRODUCT_LE922_USBCFG5 0x1045
#define TELIT_PRODUCT_ME910 0x1100
+#define TELIT_PRODUCT_ME910_DUAL_MODEM 0x1101
#define TELIT_PRODUCT_LE920 0x1200
#define TELIT_PRODUCT_LE910 0x1201
#define TELIT_PRODUCT_LE910_USBCFG4 0x1206
@@ -648,6 +651,11 @@ static const struct option_blacklist_info telit_me910_blacklist = {
.reserved = BIT(1) | BIT(3),
};
+static const struct option_blacklist_info telit_me910_dual_modem_blacklist = {
+ .sendsetup = BIT(0),
+ .reserved = BIT(3),
+};
+
static const struct option_blacklist_info telit_le910_blacklist = {
.sendsetup = BIT(0),
.reserved = BIT(1) | BIT(2),
@@ -677,6 +685,10 @@ static const struct option_blacklist_info cinterion_rmnet2_blacklist = {
.reserved = BIT(4) | BIT(5),
};
+static const struct option_blacklist_info yuga_clm920_nc5_blacklist = {
+ .reserved = BIT(1) | BIT(4),
+};
+
static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_COLT) },
{ USB_DEVICE(OPTION_VENDOR_ID, OPTION_PRODUCT_RICOLA) },
@@ -1181,6 +1193,9 @@ static const struct usb_device_id option_ids[] = {
{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC15)},
{ USB_DEVICE(QUALCOMM_VENDOR_ID, QUECTEL_PRODUCT_UC20),
.driver_info = (kernel_ulong_t)&net_intf4_blacklist },
+ /* Yuga products use Qualcomm vendor ID */
+ { USB_DEVICE(QUALCOMM_VENDOR_ID, YUGA_PRODUCT_CLM920_NC5),
+ .driver_info = (kernel_ulong_t)&yuga_clm920_nc5_blacklist },
/* Quectel products using Quectel vendor ID */
{ USB_DEVICE(QUECTEL_VENDOR_ID, QUECTEL_PRODUCT_EC21),
.driver_info = (kernel_ulong_t)&net_intf4_blacklist },
@@ -1247,6 +1262,8 @@ static const struct usb_device_id option_ids[] = {
.driver_info = (kernel_ulong_t)&telit_le922_blacklist_usbcfg0 },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910),
.driver_info = (kernel_ulong_t)&telit_me910_blacklist },
+ { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_ME910_DUAL_MODEM),
+ .driver_info = (kernel_ulong_t)&telit_me910_dual_modem_blacklist },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910),
.driver_info = (kernel_ulong_t)&telit_le910_blacklist },
{ USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_LE910_USBCFG4),
diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index 4516291df1b8..fb6dc16c754a 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -166,6 +166,8 @@ static const struct usb_device_id id_table[] = {
{DEVICE_SWI(0x1199, 0x9079)}, /* Sierra Wireless EM74xx */
{DEVICE_SWI(0x1199, 0x907a)}, /* Sierra Wireless EM74xx QDL */
{DEVICE_SWI(0x1199, 0x907b)}, /* Sierra Wireless EM74xx */
+ {DEVICE_SWI(0x1199, 0x9090)}, /* Sierra Wireless EM7565 QDL */
+ {DEVICE_SWI(0x1199, 0x9091)}, /* Sierra Wireless EM7565 */
{DEVICE_SWI(0x413c, 0x81a2)}, /* Dell Wireless 5806 Gobi(TM) 4G LTE Mobile Broadband Card */
{DEVICE_SWI(0x413c, 0x81a3)}, /* Dell Wireless 5570 HSPA+ (42Mbps) Mobile Broadband Card */
{DEVICE_SWI(0x413c, 0x81a4)}, /* Dell Wireless 5570e HSPA+ (42Mbps) Mobile Broadband Card */
@@ -346,6 +348,7 @@ static int qcprobe(struct usb_serial *serial, const struct usb_device_id *id)
break;
case 2:
dev_dbg(dev, "NMEA GPS interface found\n");
+ sendsetup = true;
break;
case 3:
dev_dbg(dev, "Modem port found\n");
diff --git a/drivers/usb/usbip/stub_main.c b/drivers/usb/usbip/stub_main.c
index af10f7b131a4..325b4c05acdd 100644
--- a/drivers/usb/usbip/stub_main.c
+++ b/drivers/usb/usbip/stub_main.c
@@ -252,11 +252,12 @@ void stub_device_cleanup_urbs(struct stub_device *sdev)
struct stub_priv *priv;
struct urb *urb;
- dev_dbg(&sdev->udev->dev, "free sdev %p\n", sdev);
+ dev_dbg(&sdev->udev->dev, "Stub device cleaning up urbs\n");
while ((priv = stub_priv_pop(sdev))) {
urb = priv->urb;
- dev_dbg(&sdev->udev->dev, "free urb %p\n", urb);
+ dev_dbg(&sdev->udev->dev, "free urb seqnum %lu\n",
+ priv->seqnum);
usb_kill_urb(urb);
kmem_cache_free(stub_priv_cache, priv);
diff --git a/drivers/usb/usbip/stub_rx.c b/drivers/usb/usbip/stub_rx.c
index 00e475c51a12..7de54a66044f 100644
--- a/drivers/usb/usbip/stub_rx.c
+++ b/drivers/usb/usbip/stub_rx.c
@@ -230,9 +230,6 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
if (priv->seqnum != pdu->u.cmd_unlink.seqnum)
continue;
- dev_info(&priv->urb->dev->dev, "unlink urb %p\n",
- priv->urb);
-
/*
* This matched urb is not completed yet (i.e., be in
* flight in usb hcd hardware/driver). Now we are
@@ -271,8 +268,8 @@ static int stub_recv_cmd_unlink(struct stub_device *sdev,
ret = usb_unlink_urb(priv->urb);
if (ret != -EINPROGRESS)
dev_err(&priv->urb->dev->dev,
- "failed to unlink a urb %p, ret %d\n",
- priv->urb, ret);
+ "failed to unlink a urb # %lu, ret %d\n",
+ priv->seqnum, ret);
return 0;
}
diff --git a/drivers/usb/usbip/stub_tx.c b/drivers/usb/usbip/stub_tx.c
index af858d52608a..f4dd30c56f36 100644
--- a/drivers/usb/usbip/stub_tx.c
+++ b/drivers/usb/usbip/stub_tx.c
@@ -201,8 +201,8 @@ static int stub_send_ret_submit(struct stub_device *sdev)
/* 1. setup usbip_header */
setup_ret_submit_pdu(&pdu_header, urb);
- usbip_dbg_stub_tx("setup txdata seqnum: %d urb: %p\n",
- pdu_header.base.seqnum, urb);
+ usbip_dbg_stub_tx("setup txdata seqnum: %d\n",
+ pdu_header.base.seqnum);
usbip_header_correct_endian(&pdu_header, 1);
iov[iovnum].iov_base = &pdu_header;
diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c
index 81b2b9f808b5..f9af04d7f02f 100644
--- a/drivers/usb/usbip/vhci_hcd.c
+++ b/drivers/usb/usbip/vhci_hcd.c
@@ -467,9 +467,6 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb,
int ret = 0;
struct vhci_device *vdev;
- usbip_dbg_vhci_hc("enter, usb_hcd %p urb %p mem_flags %d\n",
- hcd, urb, mem_flags);
-
/* patch to usb_sg_init() is in 2.5.60 */
BUG_ON(!urb->transfer_buffer && urb->transfer_buffer_length);
@@ -627,8 +624,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
struct vhci_priv *priv;
struct vhci_device *vdev;
- pr_info("dequeue a urb %p\n", urb);
-
spin_lock(&the_controller->lock);
priv = urb->hcpriv;
@@ -656,7 +651,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
/* tcp connection is closed */
spin_lock(&vdev->priv_lock);
- pr_info("device %p seems to be disconnected\n", vdev);
list_del(&priv->list);
kfree(priv);
urb->hcpriv = NULL;
@@ -668,8 +662,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
* vhci_rx will receive RET_UNLINK and give back the URB.
* Otherwise, we give back it here.
*/
- pr_info("gives back urb %p\n", urb);
-
usb_hcd_unlink_urb_from_ep(hcd, urb);
spin_unlock(&the_controller->lock);
@@ -698,8 +690,6 @@ static int vhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status)
unlink->unlink_seqnum = priv->seqnum;
- pr_info("device %p seems to be still connected\n", vdev);
-
/* send cmd_unlink and try to cancel the pending URB in the
* peer */
list_add_tail(&unlink->list, &vdev->unlink_tx);
diff --git a/drivers/usb/usbip/vhci_rx.c b/drivers/usb/usbip/vhci_rx.c
index 00e4a54308e4..bc4eb0855314 100644
--- a/drivers/usb/usbip/vhci_rx.c
+++ b/drivers/usb/usbip/vhci_rx.c
@@ -37,24 +37,23 @@ struct urb *pickup_urb_and_free_priv(struct vhci_device *vdev, __u32 seqnum)
urb = priv->urb;
status = urb->status;
- usbip_dbg_vhci_rx("find urb %p vurb %p seqnum %u\n",
- urb, priv, seqnum);
+ usbip_dbg_vhci_rx("find urb seqnum %u\n", seqnum);
switch (status) {
case -ENOENT:
/* fall through */
case -ECONNRESET:
- dev_info(&urb->dev->dev,
- "urb %p was unlinked %ssynchronuously.\n", urb,
- status == -ENOENT ? "" : "a");
+ dev_dbg(&urb->dev->dev,
+ "urb seq# %u was unlinked %ssynchronuously\n",
+ seqnum, status == -ENOENT ? "" : "a");
break;
case -EINPROGRESS:
/* no info output */
break;
default:
- dev_info(&urb->dev->dev,
- "urb %p may be in a error, status %d\n", urb,
- status);
+ dev_dbg(&urb->dev->dev,
+ "urb seq# %u may be in a error, status %d\n",
+ seqnum, status);
}
list_del(&priv->list);
@@ -78,8 +77,8 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
spin_unlock(&vdev->priv_lock);
if (!urb) {
- pr_err("cannot find a urb of seqnum %u\n", pdu->base.seqnum);
- pr_info("max seqnum %d\n",
+ pr_err("cannot find a urb of seqnum %u max seqnum %d\n",
+ pdu->base.seqnum,
atomic_read(&the_controller->seqnum));
usbip_event_add(ud, VDEV_EVENT_ERROR_TCP);
return;
@@ -102,7 +101,7 @@ static void vhci_recv_ret_submit(struct vhci_device *vdev,
if (usbip_dbg_flag_vhci_rx)
usbip_dump_urb(urb);
- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+ usbip_dbg_vhci_rx("now giveback urb %u\n", pdu->base.seqnum);
spin_lock(&the_controller->lock);
usb_hcd_unlink_urb_from_ep(vhci_to_hcd(the_controller), urb);
@@ -165,7 +164,7 @@ static void vhci_recv_ret_unlink(struct vhci_device *vdev,
pr_info("the urb (seqnum %d) was already given back\n",
pdu->base.seqnum);
} else {
- usbip_dbg_vhci_rx("now giveback urb %p\n", urb);
+ usbip_dbg_vhci_rx("now giveback urb %d\n", pdu->base.seqnum);
/* If unlink is successful, status is -ECONNRESET */
urb->status = pdu->u.ret_unlink.status;
diff --git a/drivers/usb/usbip/vhci_tx.c b/drivers/usb/usbip/vhci_tx.c
index 409fd99f3257..3c5796c8633a 100644
--- a/drivers/usb/usbip/vhci_tx.c
+++ b/drivers/usb/usbip/vhci_tx.c
@@ -82,7 +82,8 @@ static int vhci_send_cmd_submit(struct vhci_device *vdev)
memset(&msg, 0, sizeof(msg));
memset(&iov, 0, sizeof(iov));
- usbip_dbg_vhci_tx("setup txdata urb %p\n", urb);
+ usbip_dbg_vhci_tx("setup txdata urb seqnum %lu\n",
+ priv->seqnum);
/* 1. setup usbip_header */
setup_cmd_submit_pdu(&pdu_header, urb);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ef2e8c97e183..a461b6604fd9 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -725,7 +725,14 @@
*/
#define PERCPU_INPUT(cacheline) \
VMLINUX_SYMBOL(__per_cpu_start) = .; \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .; \
*(.data..percpu..first) \
+ . = ALIGN(cacheline); \
+ *(.data..percpu..user_mapped) \
+ *(.data..percpu..user_mapped..shared_aligned) \
+ . = ALIGN(PAGE_SIZE); \
+ *(.data..percpu..user_mapped..page_aligned) \
+ VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .; \
. = ALIGN(PAGE_SIZE); \
*(.data..percpu..page_aligned) \
. = ALIGN(cacheline); \
diff --git a/include/crypto/mcryptd.h b/include/crypto/mcryptd.h
index c23ee1f7ee80..c2ff077168d3 100644
--- a/include/crypto/mcryptd.h
+++ b/include/crypto/mcryptd.h
@@ -26,6 +26,7 @@ static inline struct mcryptd_ahash *__mcryptd_ahash_cast(
struct mcryptd_cpu_queue {
struct crypto_queue queue;
+ spinlock_t q_lock;
struct work_struct work;
};
diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 402753bccafa..7b8e3afcc291 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -215,7 +215,8 @@ struct ipv6_pinfo {
* 100: prefer care-of address
*/
dontfrag:1,
- autoflowlabel:1;
+ autoflowlabel:1,
+ autoflowlabel_set:1;
__u8 min_hopcount;
__u8 tclass;
__be32 rcv_flowinfo;
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..58c55b1589d0
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ /*
+ * Map that page of kernel stack on which we enter from user context.
+ */
+ return kaiser_add_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+ /*
+ * Note: may be called even when kaiser_map_thread_stack() failed.
+ */
+ kaiser_remove_mapping((unsigned long)stack +
+ THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+ unsigned long size, unsigned long flags)
+{
+ return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+ unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+ return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* _LINUX_KAISER_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ff88d6189411..b93b578cfa42 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -131,8 +131,9 @@ enum zone_stat_item {
NR_SLAB_RECLAIMABLE,
NR_SLAB_UNRECLAIMABLE,
NR_PAGETABLE, /* used for pagetables */
- NR_KERNEL_STACK,
/* Second 128 byte cacheline */
+ NR_KERNEL_STACK,
+ NR_KAISERTABLE,
NR_UNSTABLE_NFS, /* NFS unstable pages */
NR_BOUNCE,
NR_VMSCAN_WRITE,
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299ca068..8902f23bb770 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
#endif
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
#define DEFINE_PER_CPU(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "")
+#define DECLARE_PER_CPU_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
/*
* Declaration/definition used for per-CPU variables that must come first in
* the set of variables.
@@ -144,6 +156,14 @@
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+ ____cacheline_aligned_in_smp
+
#define DECLARE_PER_CPU_ALIGNED(type, name) \
DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
____cacheline_aligned
@@ -162,11 +182,21 @@
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
DEFINE_PER_CPU_SECTION(type, name, "..page_aligned") \
__aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Declaration/definition used for per-CPU variables that must be read mostly.
*/
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index e623d392db0c..8ef3a61fdc74 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -80,10 +80,8 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
#endif
#endif
#ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
NR_TLB_REMOTE_FLUSH, /* cpu tried to flush others' tlbs */
NR_TLB_REMOTE_FLUSH_RECEIVED,/* cpu received ipi for flush */
-#endif /* CONFIG_SMP */
NR_TLB_LOCAL_FLUSH_ALL,
NR_TLB_LOCAL_FLUSH_ONE,
#endif /* CONFIG_DEBUG_TLBFLUSH */
diff --git a/include/net/ip.h b/include/net/ip.h
index 7476bb10ff37..639398af273b 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -33,6 +33,8 @@
#include <net/flow.h>
#include <net/flow_dissector.h>
+#define IPV4_MIN_MTU 68 /* RFC 791 */
+
struct sock;
struct inet_skb_parm {
diff --git a/init/main.c b/init/main.c
index 9e64d7097f1a..49926d95442f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -81,6 +81,7 @@
#include <linux/integrity.h>
#include <linux/proc_ns.h>
#include <linux/io.h>
+#include <linux/kaiser.h>
#include <asm/io.h>
#include <asm/bugs.h>
@@ -492,6 +493,7 @@ static void __init mm_init(void)
pgtable_init();
vmalloc_init();
ioremap_huge_init();
+ kaiser_init();
}
asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 68cfda1c1800..ac00f14208b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -169,6 +170,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
static inline void free_thread_info(struct thread_info *ti)
{
+ kaiser_unmap_thread_stack(ti);
free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
}
# else
@@ -352,6 +354,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
goto free_ti;
tsk->stack = ti;
+
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto free_ti;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 22c57e191a23..e5d228f7224c 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -568,6 +568,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -584,8 +589,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqretry(&jiffies_lock, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside of that check whether the local timer softirq is
+ * pending. If so its a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immeditate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1275175b0946..d9cd6191760b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 61d0960559c8..8aef4e63ac57 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -5754,7 +5754,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -5805,14 +5805,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -6539,6 +6531,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -6562,7 +6555,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c344e3609c53..324b7e90b4c5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -736,6 +736,7 @@ const char * const vmstat_text[] = {
"nr_slab_unreclaimable",
"nr_page_table_pages",
"nr_kernel_stack",
+ "nr_overhead",
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index a1f697ec4fc2..0ce26a0f7913 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1067,19 +1067,20 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev,
struct net_bridge *br = netdev_priv(dev);
int err;
+ err = register_netdevice(dev);
+ if (err)
+ return err;
+
if (tb[IFLA_ADDRESS]) {
spin_lock_bh(&br->lock);
br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
spin_unlock_bh(&br->lock);
}
- err = register_netdevice(dev);
- if (err)
- return err;
-
err = br_changelink(dev, tb, data);
if (err)
- unregister_netdevice(dev);
+ br_dev_delete(dev, NULL);
+
return err;
}
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 2e9a1c2818c7..b5c351d2830b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -261,7 +261,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
spin_lock_irqsave(&net->nsid_lock, flags);
peer = idr_find(&net->netns_ids, id);
if (peer)
- get_net(peer);
+ peer = maybe_get_net(peer);
spin_unlock_irqrestore(&net->nsid_lock, flags);
rcu_read_unlock();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index d33609c2f276..86b619501350 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3676,7 +3676,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock *sk = skb->sk;
if (!skb_may_tx_timestamp(sk, false))
- return;
+ goto err;
/* Take a reference to prevent skb_orphan() from freeing the socket,
* but only if the socket refcount is not zero.
@@ -3685,7 +3685,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
*skb_hwtstamps(skb) = *hwtstamps;
__skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND);
sock_put(sk);
+ return;
}
+
+err:
+ kfree_skb(skb);
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 0212591b0077..63f99e9a821b 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1358,7 +1358,7 @@ skip:
static bool inetdev_valid_mtu(unsigned int mtu)
{
- return mtu >= 68;
+ return mtu >= IPV4_MIN_MTU;
}
static void inetdev_send_gratuitous_arp(struct net_device *dev,
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 0cb240c749bf..c9e68ff48a72 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1252,7 +1252,7 @@ fail:
static void ip_fib_net_exit(struct net *net)
{
- unsigned int i;
+ int i;
rtnl_lock();
#ifdef CONFIG_IP_MULTIPLE_TABLES
@@ -1260,7 +1260,12 @@ static void ip_fib_net_exit(struct net *net)
RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
#endif
- for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
+ /* Destroy the tables in reverse order to guarantee that the
+ * local table, ID 255, is destroyed before the main table, ID
+ * 254. This is necessary as the local table may contain
+ * references to data contained in the main table.
+ */
+ for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
struct hlist_head *head = &net->ipv4.fib_table_hash[i];
struct hlist_node *tmp;
struct fib_table *tb;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 3809d523d012..b60106d34346 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -89,6 +89,7 @@
#include <linux/rtnetlink.h>
#include <linux/times.h>
#include <linux/pkt_sched.h>
+#include <linux/byteorder/generic.h>
#include <net/net_namespace.h>
#include <net/arp.h>
@@ -327,6 +328,23 @@ igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted)
return scount;
}
+/* source address selection per RFC 3376 section 4.2.13 */
+static __be32 igmpv3_get_srcaddr(struct net_device *dev,
+ const struct flowi4 *fl4)
+{
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+ if (!in_dev)
+ return htonl(INADDR_ANY);
+
+ for_ifa(in_dev) {
+ if (inet_ifa_match(fl4->saddr, ifa))
+ return fl4->saddr;
+ } endfor_ifa(in_dev);
+
+ return htonl(INADDR_ANY);
+}
+
static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
{
struct sk_buff *skb;
@@ -374,7 +392,7 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
pip->frag_off = htons(IP_DF);
pip->ttl = 1;
pip->daddr = fl4.daddr;
- pip->saddr = fl4.saddr;
+ pip->saddr = igmpv3_get_srcaddr(dev, &fl4);
pip->protocol = IPPROTO_IGMP;
pip->tot_len = 0; /* filled in later */
ip_select_ident(net, skb, NULL);
@@ -410,16 +428,17 @@ static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
- int type, struct igmpv3_grec **ppgr)
+ int type, struct igmpv3_grec **ppgr, unsigned int mtu)
{
struct net_device *dev = pmc->interface->dev;
struct igmpv3_report *pih;
struct igmpv3_grec *pgr;
- if (!skb)
- skb = igmpv3_newpack(dev, dev->mtu);
- if (!skb)
- return NULL;
+ if (!skb) {
+ skb = igmpv3_newpack(dev, mtu);
+ if (!skb)
+ return NULL;
+ }
pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
@@ -441,12 +460,17 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
+ unsigned int mtu;
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
return skb;
+ mtu = READ_ONCE(dev->mtu);
+ if (mtu < IPV4_MIN_MTU)
+ return skb;
+
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
type == IGMPV3_MODE_IS_EXCLUDE;
truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
@@ -467,7 +491,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
}
}
first = 1;
@@ -494,12 +518,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
pgr->grec_nsrcs = htons(scount);
if (skb)
igmpv3_sendpack(skb);
- skb = igmpv3_newpack(dev, dev->mtu);
+ skb = igmpv3_newpack(dev, mtu);
first = 1;
scount = 0;
}
if (first) {
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
if (!skb)
@@ -533,7 +557,7 @@ empty_source:
igmpv3_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 3310ac75e3f3..c18245e05d26 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -400,8 +400,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
dev->needed_headroom = t_hlen + hlen;
mtu -= (dev->hard_header_len + t_hlen);
- if (mtu < 68)
- mtu = 68;
+ if (mtu < IPV4_MIN_MTU)
+ mtu = IPV4_MIN_MTU;
return mtu;
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8f2cd7d09720..4d3d4291c82f 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -500,11 +500,16 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int err;
struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
+ int hdrincl;
err = -EMSGSIZE;
if (len > 0xFFFF)
goto out;
+ /* hdrincl should be READ_ONCE(inet->hdrincl)
+ * but READ_ONCE() doesn't work with bit fields
+ */
+ hdrincl = inet->hdrincl;
/*
* Check the flags.
*/
@@ -579,7 +584,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* Linux does not mangle headers on raw sockets,
* so that IP options + IP_HDRINCL is non-sense.
*/
- if (inet->hdrincl)
+ if (hdrincl)
goto done;
if (ipc.opt->opt.srr) {
if (!daddr)
@@ -601,9 +606,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE,
- inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+ hdrincl ? IPPROTO_RAW : sk->sk_protocol,
inet_sk_flowi_flags(sk) |
- (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+ (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
if (!saddr && ipc.oif) {
@@ -612,7 +617,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto done;
}
- if (!inet->hdrincl) {
+ if (!hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -637,7 +642,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
goto do_confirm;
back_from_confirm:
- if (inet->hdrincl)
+ if (hdrincl)
err = raw_send_hdrinc(sk, &fl4, msg, len,
&rt, msg->msg_flags);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a5d790c13ef5..61c93a93f228 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -823,7 +823,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_time_stamp,
req->ts_recent,
0,
- tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
+ tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
AF_INET),
inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
ip_hdr(skb)->tos);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 83ec6639b04d..637a0e41b0aa 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -200,7 +200,6 @@ lookup_protocol:
np->mcast_hops = IPV6_DEFAULT_MCASTHOPS;
np->mc_loop = 1;
np->pmtudisc = IPV6_PMTUDISC_WANT;
- np->autoflowlabel = ip6_default_np_autolabel(sock_net(sk));
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
/* Init the ipv4 part of the socket since we can have sockets
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 71624cf26832..1b4f5f2d2929 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -148,6 +148,14 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
!(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
+static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
+{
+ if (!np->autoflowlabel_set)
+ return ip6_default_np_autolabel(net);
+ else
+ return np->autoflowlabel;
+}
+
/*
* xmit an sk_buff (used by TCP, SCTP and DCCP)
* Note : socket lock is not held for SYNACK packets, but might be modified
@@ -211,7 +219,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
hlimit = ip6_dst_hoplimit(dst);
ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel, fl6));
+ ip6_autoflowlabel(net, np), fl6));
hdr->payload_len = htons(seg_len);
hdr->nexthdr = proto;
@@ -1675,7 +1683,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_flow_hdr(hdr, v6_cork->tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
- np->autoflowlabel, fl6));
+ ip6_autoflowlabel(net, np), fl6));
hdr->hop_limit = v6_cork->hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a4a30d2ca66f..435e26210587 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -872,6 +872,7 @@ pref_skip_coa:
break;
case IPV6_AUTOFLOWLABEL:
np->autoflowlabel = valbool;
+ np->autoflowlabel_set = 1;
retv = 0;
break;
}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index d64ee7e83664..06640685ff43 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1668,16 +1668,16 @@ static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel)
}
static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
- int type, struct mld2_grec **ppgr)
+ int type, struct mld2_grec **ppgr, unsigned int mtu)
{
- struct net_device *dev = pmc->idev->dev;
struct mld2_report *pmr;
struct mld2_grec *pgr;
- if (!skb)
- skb = mld_newpack(pmc->idev, dev->mtu);
- if (!skb)
- return NULL;
+ if (!skb) {
+ skb = mld_newpack(pmc->idev, mtu);
+ if (!skb)
+ return NULL;
+ }
pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec));
pgr->grec_type = type;
pgr->grec_auxwords = 0;
@@ -1700,10 +1700,15 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
struct mld2_grec *pgr = NULL;
struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
int scount, stotal, first, isquery, truncate;
+ unsigned int mtu;
if (pmc->mca_flags & MAF_NOREPORT)
return skb;
+ mtu = READ_ONCE(dev->mtu);
+ if (mtu < IPV6_MIN_MTU)
+ return skb;
+
isquery = type == MLD2_MODE_IS_INCLUDE ||
type == MLD2_MODE_IS_EXCLUDE;
truncate = type == MLD2_MODE_IS_EXCLUDE ||
@@ -1724,7 +1729,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
if (skb)
mld_sendpack(skb);
- skb = mld_newpack(idev, dev->mtu);
+ skb = mld_newpack(idev, mtu);
}
}
first = 1;
@@ -1751,12 +1756,12 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc,
pgr->grec_nsrcs = htons(scount);
if (skb)
mld_sendpack(skb);
- skb = mld_newpack(idev, dev->mtu);
+ skb = mld_newpack(idev, mtu);
first = 1;
scount = 0;
}
if (first) {
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
first = 0;
}
if (!skb)
@@ -1790,7 +1795,7 @@ empty_source:
mld_sendpack(skb);
skb = NULL; /* add_grhead will get a new one */
}
- skb = add_grhead(skb, pmc, type, &pgr);
+ skb = add_grhead(skb, pmc, type, &pgr, mtu);
}
}
if (pgr)
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 59c908ff251a..74cbcc4b399c 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -949,7 +949,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
tcp_rsk(req)->rcv_nxt,
req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
tcp_time_stamp, req->ts_recent, sk->sk_bound_dev_if,
- tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
+ tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
0, 0);
}
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 5fabe68e20dd..48e1608414e6 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -261,6 +261,9 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
struct sock *sk = skb->sk;
int ret = -ENOMEM;
+ if (!net_eq(dev_net(dev), sock_net(sk)))
+ return 0;
+
dev_hold(dev);
if (is_vmalloc_addr(skb->head))
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 61189c576963..a870d27ca778 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4153,7 +4153,7 @@ static int sctp_init_sock(struct sock *sk)
SCTP_DBG_OBJCNT_INC(sock);
local_bh_disable();
- percpu_counter_inc(&sctp_sockets_allocated);
+ sk_sockets_allocated_inc(sk);
sock_prot_inuse_add(net, sk->sk_prot, 1);
/* Nothing can fail after this block, otherwise
@@ -4197,7 +4197,7 @@ static void sctp_destroy_sock(struct sock *sk)
}
sctp_endpoint_free(sp->ep);
local_bh_disable();
- percpu_counter_dec(&sctp_sockets_allocated);
+ sk_sockets_allocated_dec(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable();
}
diff --git a/security/Kconfig b/security/Kconfig
index e45237897b43..a3ebb6ee5bd5 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -31,6 +31,16 @@ config SECURITY
If you are unsure how to answer this question, answer N.
+config PAGE_TABLE_ISOLATION
+ bool "Remove the kernel mapping in user mode"
+ default y
+ depends on X86_64 && SMP
+ help
+ This enforces a strict kernel and user space isolation, in order
+ to close hardware side channels on kernel address information.
+
+ If you are unsure how to answer this question, answer Y.
+
config SECURITYFS
bool "Enable the securityfs filesystem"
help
diff --git a/sound/core/rawmidi.c b/sound/core/rawmidi.c
index b450a27588c8..16f8124b1150 100644
--- a/sound/core/rawmidi.c
+++ b/sound/core/rawmidi.c
@@ -579,15 +579,14 @@ static int snd_rawmidi_info_user(struct snd_rawmidi_substream *substream,
return 0;
}
-int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info)
+static int __snd_rawmidi_info_select(struct snd_card *card,
+ struct snd_rawmidi_info *info)
{
struct snd_rawmidi *rmidi;
struct snd_rawmidi_str *pstr;
struct snd_rawmidi_substream *substream;
- mutex_lock(&register_mutex);
rmidi = snd_rawmidi_search(card, info->device);
- mutex_unlock(&register_mutex);
if (!rmidi)
return -ENXIO;
if (info->stream < 0 || info->stream > 1)
@@ -603,6 +602,16 @@ int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info
}
return -ENXIO;
}
+
+int snd_rawmidi_info_select(struct snd_card *card, struct snd_rawmidi_info *info)
+{
+ int ret;
+
+ mutex_lock(&register_mutex);
+ ret = __snd_rawmidi_info_select(card, info);
+ mutex_unlock(&register_mutex);
+ return ret;
+}
EXPORT_SYMBOL(snd_rawmidi_info_select);
static int snd_rawmidi_info_select_user(struct snd_card *card,
diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c
index cce9ae5ec93b..bd7bcf428bcf 100644
--- a/sound/hda/hdac_i915.c
+++ b/sound/hda/hdac_i915.c
@@ -183,7 +183,7 @@ static int hdac_component_master_match(struct device *dev, void *data)
*/
int snd_hdac_i915_register_notifier(const struct i915_audio_component_audio_ops *aops)
{
- if (WARN_ON(!hdac_acomp))
+ if (!hdac_acomp)
return -ENODEV;
hdac_acomp->audio_ops = aops;
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 2159b18f76bf..5875a08d555e 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -5954,6 +5954,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = {
{0x1b, 0x01011020},
{0x21, 0x02211010}),
SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
+ {0x12, 0x90a60130},
+ {0x14, 0x90170110},
+ {0x1b, 0x01011020},
+ {0x21, 0x0221101f}),
+ SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE,
{0x12, 0x90a60160},
{0x14, 0x90170120},
{0x21, 0x02211030}),
diff --git a/sound/soc/codecs/twl4030.c b/sound/soc/codecs/twl4030.c
index a5a4e9f75c57..a06395507225 100644
--- a/sound/soc/codecs/twl4030.c
+++ b/sound/soc/codecs/twl4030.c
@@ -232,7 +232,7 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
struct twl4030_codec_data *pdata = dev_get_platdata(codec->dev);
struct device_node *twl4030_codec_node = NULL;
- twl4030_codec_node = of_find_node_by_name(codec->dev->parent->of_node,
+ twl4030_codec_node = of_get_child_by_name(codec->dev->parent->of_node,
"codec");
if (!pdata && twl4030_codec_node) {
@@ -241,9 +241,11 @@ static struct twl4030_codec_data *twl4030_get_pdata(struct snd_soc_codec *codec)
GFP_KERNEL);
if (!pdata) {
dev_err(codec->dev, "Can not allocate memory\n");
+ of_node_put(twl4030_codec_node);
return NULL;
}
twl4030_setup_pdata_of(pdata, twl4030_codec_node);
+ of_node_put(twl4030_codec_node);
}
return pdata;
diff --git a/sound/soc/fsl/fsl_ssi.c b/sound/soc/fsl/fsl_ssi.c
index 95d2392303eb..7ca67613e0d4 100644
--- a/sound/soc/fsl/fsl_ssi.c
+++ b/sound/soc/fsl/fsl_ssi.c
@@ -1408,12 +1408,6 @@ static int fsl_ssi_probe(struct platform_device *pdev)
sizeof(fsl_ssi_ac97_dai));
fsl_ac97_data = ssi_private;
-
- ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
- if (ret) {
- dev_err(&pdev->dev, "could not set AC'97 ops\n");
- return ret;
- }
} else {
/* Initialize this copy of the CPU DAI driver structure */
memcpy(&ssi_private->cpu_dai_drv, &fsl_ssi_dai_template,
@@ -1473,6 +1467,14 @@ static int fsl_ssi_probe(struct platform_device *pdev)
return ret;
}
+ if (fsl_ssi_is_ac97(ssi_private)) {
+ ret = snd_soc_set_ac97_ops_of_reset(&fsl_ssi_ac97_ops, pdev);
+ if (ret) {
+ dev_err(&pdev->dev, "could not set AC'97 ops\n");
+ goto error_ac97_ops;
+ }
+ }
+
ret = devm_snd_soc_register_component(&pdev->dev, &fsl_ssi_component,
&ssi_private->cpu_dai_drv, 1);
if (ret) {
@@ -1556,6 +1558,10 @@ error_sound_card:
fsl_ssi_debugfs_remove(&ssi_private->dbg_stats);
error_asoc_register:
+ if (fsl_ssi_is_ac97(ssi_private))
+ snd_soc_set_ac97_ops(NULL);
+
+error_ac97_ops:
if (ssi_private->soc->imx)
fsl_ssi_imx_clean(pdev, ssi_private);
diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c
index 0ed9ae030ce1..c9ae29068c7c 100644
--- a/sound/usb/mixer.c
+++ b/sound/usb/mixer.c
@@ -2101,20 +2101,25 @@ static int parse_audio_selector_unit(struct mixer_build *state, int unitid,
kctl->private_value = (unsigned long)namelist;
kctl->private_free = usb_mixer_selector_elem_free;
- nameid = uac_selector_unit_iSelector(desc);
+ /* check the static mapping table at first */
len = check_mapped_name(map, kctl->id.name, sizeof(kctl->id.name));
- if (len)
- ;
- else if (nameid)
- len = snd_usb_copy_string_desc(state, nameid, kctl->id.name,
- sizeof(kctl->id.name));
- else
- len = get_term_name(state, &state->oterm,
- kctl->id.name, sizeof(kctl->id.name), 0);
-
if (!len) {
- strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name));
+ /* no mapping ? */
+ /* if iSelector is given, use it */
+ nameid = uac_selector_unit_iSelector(desc);
+ if (nameid)
+ len = snd_usb_copy_string_desc(state, nameid,
+ kctl->id.name,
+ sizeof(kctl->id.name));
+ /* ... or pick up the terminal name at next */
+ if (!len)
+ len = get_term_name(state, &state->oterm,
+ kctl->id.name, sizeof(kctl->id.name), 0);
+ /* ... or use the fixed string "USB" as the last resort */
+ if (!len)
+ strlcpy(kctl->id.name, "USB", sizeof(kctl->id.name));
+ /* and add the proper suffix */
if (desc->bDescriptorSubtype == UAC2_CLOCK_SELECTOR)
append_ctl_name(kctl, " Clock Source");
else if ((state->oterm.type & 0xff00) == 0x0100)
diff --git a/tools/usb/usbip/src/utils.c b/tools/usb/usbip/src/utils.c
index 2b3d6d235015..3d7b42e77299 100644
--- a/tools/usb/usbip/src/utils.c
+++ b/tools/usb/usbip/src/utils.c
@@ -30,6 +30,7 @@ int modify_match_busid(char *busid, int add)
char command[SYSFS_BUS_ID_SIZE + 4];
char match_busid_attr_path[SYSFS_PATH_MAX];
int rc;
+ int cmd_size;
snprintf(match_busid_attr_path, sizeof(match_busid_attr_path),
"%s/%s/%s/%s/%s/%s", SYSFS_MNT_PATH, SYSFS_BUS_NAME,
@@ -37,12 +38,14 @@ int modify_match_busid(char *busid, int add)
attr_name);
if (add)
- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s", busid);
+ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "add %s",
+ busid);
else
- snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s", busid);
+ cmd_size = snprintf(command, SYSFS_BUS_ID_SIZE + 4, "del %s",
+ busid);
rc = write_sysfs_attribute(match_busid_attr_path, command,
- sizeof(command));
+ cmd_size);
if (rc < 0) {
dbg("failed to write match_busid: %s", strerror(errno));
return -1;