aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile3
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/bpf/Makefile1
-rw-r--r--kernel/bpf/core.c534
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/cgroup.c461
-rw-r--r--kernel/cgroup_freezer.c2
-rw-r--r--kernel/cpu.c33
-rw-r--r--kernel/cpuset.c500
-rw-r--r--kernel/delayacct.c52
-rw-r--r--kernel/events/core.c8
-rw-r--r--kernel/events/uprobes.c15
-rw-r--r--kernel/exit.c50
-rw-r--r--kernel/fork.c129
-rw-r--r--kernel/futex.c402
-rw-r--r--kernel/gcov/fs.c3
-rw-r--r--kernel/irq/generic-chip.c5
-rw-r--r--kernel/irq/irqdomain.c2
-rw-r--r--kernel/irq_work.c110
-rw-r--r--kernel/kallsyms.c2
-rw-r--r--kernel/kexec.c1291
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/locking/lockdep.c2
-rw-r--r--kernel/locking/mcs_spinlock.c8
-rw-r--r--kernel/locking/mcs_spinlock.h4
-rw-r--r--kernel/locking/mutex.c39
-rw-r--r--kernel/locking/qrwlock.c9
-rw-r--r--kernel/locking/rtmutex-debug.c5
-rw-r--r--kernel/locking/rtmutex-debug.h7
-rw-r--r--kernel/locking/rtmutex.c562
-rw-r--r--kernel/locking/rtmutex.h7
-rw-r--r--kernel/locking/rtmutex_common.h22
-rw-r--r--kernel/locking/rwsem-xadd.c4
-rw-r--r--kernel/module.c18
-rw-r--r--kernel/nsproxy.c15
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/params.c1
-rw-r--r--kernel/power/Kconfig3
-rw-r--r--kernel/power/hibernate.c6
-rw-r--r--kernel/power/main.c25
-rw-r--r--kernel/power/power.h7
-rw-r--r--kernel/power/snapshot.c494
-rw-r--r--kernel/power/suspend.c154
-rw-r--r--kernel/power/suspend_test.c12
-rw-r--r--kernel/printk/printk.c157
-rw-r--r--kernel/ptrace.c8
-rw-r--r--kernel/rcu/rcu.h8
-rw-r--r--kernel/rcu/srcu.c4
-rw-r--r--kernel/rcu/tree.c59
-rw-r--r--kernel/rcu/tree.h36
-rw-r--r--kernel/rcu/tree_plugin.h302
-rw-r--r--kernel/rcu/update.c3
-rw-r--r--kernel/resource.c101
-rw-r--r--kernel/sched/core.c121
-rw-r--r--kernel/sched/cpuacct.c2
-rw-r--r--kernel/sched/deadline.c18
-rw-r--r--kernel/sched/fair.c244
-rw-r--r--kernel/sched/idle.c8
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c30
-rw-r--r--kernel/sched/sched.h38
-rw-r--r--kernel/sched/wait.c30
-rw-r--r--kernel/seccomp.c430
-rw-r--r--kernel/signal.c50
-rw-r--r--kernel/smp.c11
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sys_ni.c5
-rw-r--r--kernel/sysctl.c9
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/system_keyring.c1
-rw-r--r--kernel/test_kprobes.c87
-rw-r--r--kernel/time/Kconfig5
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/clocksource.c12
-rw-r--r--kernel/time/tick-sched.c20
-rw-r--r--kernel/time/timekeeping.c604
-rw-r--r--kernel/time/timekeeping_internal.h15
-rw-r--r--kernel/torture.c2
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c445
-rw-r--r--kernel/trace/ring_buffer.c57
-rw-r--r--kernel/trace/trace.c107
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_event_perf.c12
-rw-r--r--kernel/trace/trace_events.c60
-rw-r--r--kernel/trace/trace_events_filter.c73
-rw-r--r--kernel/trace/trace_functions_graph.c43
-rw-r--r--kernel/trace/trace_output.c282
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_seq.c428
-rw-r--r--kernel/trace/trace_uprobe.c3
-rw-r--r--kernel/user_namespace.c6
-rw-r--r--kernel/utsname.c6
-rw-r--r--kernel/watchdog.c11
-rw-r--r--kernel/workqueue.c206
98 files changed, 6389 insertions, 2813 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 973a40cf8068..dc5c77544fd6 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,6 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
+obj-$(CONFIG_NET) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
@@ -104,7 +105,7 @@ targets += config_data.gz
$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
$(call if_changed,gzip)
- filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;")
+ filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
diff --git a/kernel/audit.c b/kernel/audit.c
index 3ef2e0e797e8..ba2ff5a5c600 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1677,7 +1677,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
audit_log_format(ab, " %s=", prefix);
CAP_FOR_EACH_U32(i) {
audit_log_format(ab, "%08x",
- cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]);
+ cap->cap[CAP_LAST_U32 - i]);
}
}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..c447cd9848d1 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
if (unlikely(!entry))
return NULL;
- fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL);
+ fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
if (unlikely(!fields)) {
kfree(entry);
return NULL;
@@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
int __init audit_register_class(int class, unsigned *list)
{
- __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+ __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
if (!p)
return -ENOMEM;
while (*list != ~0U) {
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
-#include <linux/page_cgroup.h>
#include <linux/log2.h>
#include <linux/spinlock_types.h>
@@ -18,7 +17,6 @@ void foo(void)
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
- DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
#ifdef CONFIG_SMP
DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
new file mode 100644
index 000000000000..6a71145e2769
--- /dev/null
+++ b/kernel/bpf/Makefile
@@ -0,0 +1 @@
+obj-y := core.o
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
new file mode 100644
index 000000000000..7f0dbcbb34af
--- /dev/null
+++ b/kernel/bpf/core.c
@@ -0,0 +1,534 @@
+/*
+ * Linux Socket Filter - Kernel level socket filtering
+ *
+ * Based on the design of the Berkeley Packet Filter. The new
+ * internal format has been designed by PLUMgrid:
+ *
+ * Copyright (c) 2011 - 2014 PLUMgrid, http://plumgrid.com
+ *
+ * Authors:
+ *
+ * Jay Schulist <jschlst@samba.org>
+ * Alexei Starovoitov <ast@plumgrid.com>
+ * Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in bpf_check_classic()
+ */
+#include <linux/filter.h>
+#include <linux/skbuff.h>
+#include <asm/unaligned.h>
+
+/* Registers */
+#define BPF_R0 regs[BPF_REG_0]
+#define BPF_R1 regs[BPF_REG_1]
+#define BPF_R2 regs[BPF_REG_2]
+#define BPF_R3 regs[BPF_REG_3]
+#define BPF_R4 regs[BPF_REG_4]
+#define BPF_R5 regs[BPF_REG_5]
+#define BPF_R6 regs[BPF_REG_6]
+#define BPF_R7 regs[BPF_REG_7]
+#define BPF_R8 regs[BPF_REG_8]
+#define BPF_R9 regs[BPF_REG_9]
+#define BPF_R10 regs[BPF_REG_10]
+
+/* Named registers */
+#define DST regs[insn->dst_reg]
+#define SRC regs[insn->src_reg]
+#define FP regs[BPF_REG_FP]
+#define ARG1 regs[BPF_REG_ARG1]
+#define CTX regs[BPF_REG_CTX]
+#define IMM insn->imm
+
+/* No hurry in this branch
+ *
+ * Exported for the bpf jit load helper.
+ */
+void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, unsigned int size)
+{
+ u8 *ptr = NULL;
+
+ if (k >= SKF_NET_OFF)
+ ptr = skb_network_header(skb) + k - SKF_NET_OFF;
+ else if (k >= SKF_LL_OFF)
+ ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
+ if (ptr >= skb->head && ptr + size <= skb_tail_pointer(skb))
+ return ptr;
+
+ return NULL;
+}
+
+/* Base function for offset calculation. Needs to go into .text section,
+ * therefore keeping it non-static as well; will also be used by JITs
+ * anyway later on, so do not let the compiler omit it.
+ */
+noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+/**
+ * __bpf_prog_run - run eBPF program on a given context
+ * @ctx: is the data we are operating on
+ * @insn: is the array of eBPF instructions
+ *
+ * Decode and execute eBPF instructions.
+ */
+static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+{
+ u64 stack[MAX_BPF_STACK / sizeof(u64)];
+ u64 regs[MAX_BPF_REG], tmp;
+ static const void *jumptable[256] = {
+ [0 ... 255] = &&default_label,
+ /* Now overwrite non-defaults ... */
+ /* 32 bit ALU operations */
+ [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X,
+ [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K,
+ [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X,
+ [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K,
+ [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X,
+ [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K,
+ [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X,
+ [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K,
+ [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X,
+ [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K,
+ [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X,
+ [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K,
+ [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X,
+ [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K,
+ [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X,
+ [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K,
+ [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X,
+ [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K,
+ [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X,
+ [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K,
+ [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X,
+ [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K,
+ [BPF_ALU | BPF_NEG] = &&ALU_NEG,
+ [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE,
+ [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE,
+ /* 64 bit ALU operations */
+ [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X,
+ [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K,
+ [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X,
+ [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K,
+ [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X,
+ [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K,
+ [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X,
+ [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K,
+ [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X,
+ [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K,
+ [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X,
+ [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K,
+ [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X,
+ [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K,
+ [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X,
+ [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K,
+ [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X,
+ [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K,
+ [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X,
+ [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K,
+ [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X,
+ [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K,
+ [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X,
+ [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K,
+ [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
+ /* Call instruction */
+ [BPF_JMP | BPF_CALL] = &&JMP_CALL,
+ /* Jumps */
+ [BPF_JMP | BPF_JA] = &&JMP_JA,
+ [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
+ [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K,
+ [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X,
+ [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K,
+ [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X,
+ [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K,
+ [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X,
+ [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K,
+ [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X,
+ [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K,
+ [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X,
+ [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K,
+ [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X,
+ [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K,
+ /* Program return */
+ [BPF_JMP | BPF_EXIT] = &&JMP_EXIT,
+ /* Store instructions */
+ [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B,
+ [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H,
+ [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W,
+ [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW,
+ [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W,
+ [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW,
+ [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B,
+ [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H,
+ [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W,
+ [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW,
+ /* Load instructions */
+ [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B,
+ [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H,
+ [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W,
+ [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW,
+ [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W,
+ [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H,
+ [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B,
+ [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
+ [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
+ [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ };
+ void *ptr;
+ int off;
+
+#define CONT ({ insn++; goto select_insn; })
+#define CONT_JMP ({ insn++; goto select_insn; })
+
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
+ ARG1 = (u64) (unsigned long) ctx;
+
+ /* Registers used in classic BPF programs need to be reset first. */
+ regs[BPF_REG_A] = 0;
+ regs[BPF_REG_X] = 0;
+
+select_insn:
+ goto *jumptable[insn->code];
+
+ /* ALU */
+#define ALU(OPCODE, OP) \
+ ALU64_##OPCODE##_X: \
+ DST = DST OP SRC; \
+ CONT; \
+ ALU_##OPCODE##_X: \
+ DST = (u32) DST OP (u32) SRC; \
+ CONT; \
+ ALU64_##OPCODE##_K: \
+ DST = DST OP IMM; \
+ CONT; \
+ ALU_##OPCODE##_K: \
+ DST = (u32) DST OP (u32) IMM; \
+ CONT;
+
+ ALU(ADD, +)
+ ALU(SUB, -)
+ ALU(AND, &)
+ ALU(OR, |)
+ ALU(LSH, <<)
+ ALU(RSH, >>)
+ ALU(XOR, ^)
+ ALU(MUL, *)
+#undef ALU
+ ALU_NEG:
+ DST = (u32) -DST;
+ CONT;
+ ALU64_NEG:
+ DST = -DST;
+ CONT;
+ ALU_MOV_X:
+ DST = (u32) SRC;
+ CONT;
+ ALU_MOV_K:
+ DST = (u32) IMM;
+ CONT;
+ ALU64_MOV_X:
+ DST = SRC;
+ CONT;
+ ALU64_MOV_K:
+ DST = IMM;
+ CONT;
+ ALU64_ARSH_X:
+ (*(s64 *) &DST) >>= SRC;
+ CONT;
+ ALU64_ARSH_K:
+ (*(s64 *) &DST) >>= IMM;
+ CONT;
+ ALU64_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = DST;
+ DST = do_div(tmp, SRC);
+ CONT;
+ ALU_MOD_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) SRC);
+ CONT;
+ ALU64_MOD_K:
+ tmp = DST;
+ DST = do_div(tmp, IMM);
+ CONT;
+ ALU_MOD_K:
+ tmp = (u32) DST;
+ DST = do_div(tmp, (u32) IMM);
+ CONT;
+ ALU64_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ do_div(DST, SRC);
+ CONT;
+ ALU_DIV_X:
+ if (unlikely(SRC == 0))
+ return 0;
+ tmp = (u32) DST;
+ do_div(tmp, (u32) SRC);
+ DST = (u32) tmp;
+ CONT;
+ ALU64_DIV_K:
+ do_div(DST, IMM);
+ CONT;
+ ALU_DIV_K:
+ tmp = (u32) DST;
+ do_div(tmp, (u32) IMM);
+ DST = (u32) tmp;
+ CONT;
+ ALU_END_TO_BE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_be16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_be32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_be64(DST);
+ break;
+ }
+ CONT;
+ ALU_END_TO_LE:
+ switch (IMM) {
+ case 16:
+ DST = (__force u16) cpu_to_le16(DST);
+ break;
+ case 32:
+ DST = (__force u32) cpu_to_le32(DST);
+ break;
+ case 64:
+ DST = (__force u64) cpu_to_le64(DST);
+ break;
+ }
+ CONT;
+
+ /* CALL */
+ JMP_CALL:
+ /* Function call scratches BPF_R1-BPF_R5 registers,
+ * preserves BPF_R6-BPF_R9, and stores return value
+ * into BPF_R0.
+ */
+ BPF_R0 = (__bpf_call_base + insn->imm)(BPF_R1, BPF_R2, BPF_R3,
+ BPF_R4, BPF_R5);
+ CONT;
+
+ /* JMP */
+ JMP_JA:
+ insn += insn->off;
+ CONT;
+ JMP_JEQ_X:
+ if (DST == SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JEQ_K:
+ if (DST == IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_X:
+ if (DST != SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JNE_K:
+ if (DST != IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_X:
+ if (DST > SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGT_K:
+ if (DST > IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_X:
+ if (DST >= SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JGE_K:
+ if (DST >= IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_X:
+ if (((s64) DST) > ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGT_K:
+ if (((s64) DST) > ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_X:
+ if (((s64) DST) >= ((s64) SRC)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSGE_K:
+ if (((s64) DST) >= ((s64) IMM)) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_X:
+ if (DST & SRC) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_JSET_K:
+ if (DST & IMM) {
+ insn += insn->off;
+ CONT_JMP;
+ }
+ CONT;
+ JMP_EXIT:
+ return BPF_R0;
+
+ /* STX and ST and LDX*/
+#define LDST(SIZEOP, SIZE) \
+ STX_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = SRC; \
+ CONT; \
+ ST_MEM_##SIZEOP: \
+ *(SIZE *)(unsigned long) (DST + insn->off) = IMM; \
+ CONT; \
+ LDX_MEM_##SIZEOP: \
+ DST = *(SIZE *)(unsigned long) (SRC + insn->off); \
+ CONT;
+
+ LDST(B, u8)
+ LDST(H, u16)
+ LDST(W, u32)
+ LDST(DW, u64)
+#undef LDST
+ STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */
+ atomic_add((u32) SRC, (atomic_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */
+ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long)
+ (DST + insn->off));
+ CONT;
+ LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */
+ off = IMM;
+load_word:
+ /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are
+ * only appearing in the programs where ctx ==
+ * skb. All programs keep 'ctx' in regs[BPF_REG_CTX]
+ * == BPF_R6, bpf_convert_filter() saves it in BPF_R6,
+ * internal BPF verifier will check that BPF_R6 ==
+ * ctx.
+ *
+ * BPF_ABS and BPF_IND are wrappers of function calls,
+ * so they scratch BPF_R1-BPF_R5 registers, preserve
+ * BPF_R6-BPF_R9, and store return value into BPF_R0.
+ *
+ * Implicit input:
+ * ctx == skb == BPF_R6 == CTX
+ *
+ * Explicit input:
+ * SRC == any register
+ * IMM == 32-bit immediate
+ *
+ * Output:
+ * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness
+ */
+
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be32(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */
+ off = IMM;
+load_half:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = get_unaligned_be16(ptr);
+ CONT;
+ }
+
+ return 0;
+ LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */
+ off = IMM;
+load_byte:
+ ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp);
+ if (likely(ptr != NULL)) {
+ BPF_R0 = *(u8 *)ptr;
+ CONT;
+ }
+
+ return 0;
+ LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_word;
+ LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */
+ off = IMM + SRC;
+ goto load_half;
+ LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */
+ off = IMM + SRC;
+ goto load_byte;
+
+ default_label:
+ /* If we ever reach this, we have a bug somewhere. */
+ WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
+ return 0;
+}
+
+void __weak bpf_int_jit_compile(struct bpf_prog *prog)
+{
+}
+
+/**
+ * bpf_prog_select_runtime - select execution runtime for BPF program
+ * @fp: bpf_prog populated with internal BPF program
+ *
+ * try to JIT internal BPF program, if JIT is not available select interpreter
+ * BPF program will be executed via BPF_PROG_RUN() macro
+ */
+void bpf_prog_select_runtime(struct bpf_prog *fp)
+{
+ fp->bpf_func = (void *) __bpf_prog_run;
+
+ /* Probe if internal BPF can be JITed */
+ bpf_int_jit_compile(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
+
+/* free internal BPF program */
+void bpf_prog_free(struct bpf_prog *fp)
+{
+ bpf_jit_free(fp);
+}
+EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/capability.c b/kernel/capability.c
index a5cf13c018ce..989f5bfc57dc 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -258,6 +258,10 @@ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
i++;
}
+ effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+ inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK;
+
new = prepare_creds();
if (!new)
return -ENOMEM;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 70776aec2562..7dc8788cfd52 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -149,12 +149,14 @@ struct cgroup_root cgrp_dfl_root;
*/
static bool cgrp_dfl_root_visible;
+/*
+ * Set by the boot param of the same name and makes subsystems with NULL
+ * ->dfl_files to use ->legacy_files on the default hierarchy.
+ */
+static bool cgroup_legacy_files_on_dfl;
+
/* some controllers are not supported in the default hierarchy */
-static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0
-#ifdef CONFIG_CGROUP_DEBUG
- | (1 << debug_cgrp_id)
-#endif
- ;
+static unsigned int cgrp_dfl_root_inhibit_ss_mask;
/* The list of hierarchy roots */
@@ -180,13 +182,15 @@ static u64 css_serial_nr_next = 1;
*/
static int need_forkexit_callback __read_mostly;
-static struct cftype cgroup_base_files[];
+static struct cftype cgroup_dfl_base_files[];
+static struct cftype cgroup_legacy_base_files[];
static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss);
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible);
static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
@@ -1037,6 +1041,58 @@ static void cgroup_put(struct cgroup *cgrp)
}
/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * On the default hierarchy, a subsystem may request other subsystems to be
+ * enabled together through its ->depends_on mask. In such cases, more
+ * subsystems than specified in "cgroup.subtree_control" may be enabled.
+ *
+ * This function determines which subsystems need to be enabled given the
+ * current @cgrp->subtree_control and records it in
+ * @cgrp->child_subsys_mask. The resulting mask is always a superset of
+ * @cgrp->subtree_control and follows the usual hierarchy rules.
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+ struct cgroup *parent = cgroup_parent(cgrp);
+ unsigned int cur_ss_mask = cgrp->subtree_control;
+ struct cgroup_subsys *ss;
+ int ssid;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->child_subsys_mask = cur_ss_mask;
+ return;
+ }
+
+ while (true) {
+ unsigned int new_ss_mask = cur_ss_mask;
+
+ for_each_subsys(ss, ssid)
+ if (cur_ss_mask & (1 << ssid))
+ new_ss_mask |= ss->depends_on;
+
+ /*
+ * Mask out subsystems which aren't available. This can
+ * happen only if some depended-upon subsystems were bound
+ * to non-default hierarchies.
+ */
+ if (parent)
+ new_ss_mask &= parent->child_subsys_mask;
+ else
+ new_ss_mask &= cgrp->root->subsys_mask;
+
+ if (new_ss_mask == cur_ss_mask)
+ break;
+ cur_ss_mask = new_ss_mask;
+ }
+
+ cgrp->child_subsys_mask = cur_ss_mask;
+}
+
+/**
* cgroup_kn_unlock - unlocking helper for cgroup kernfs methods
* @kn: the kernfs_node being serviced
*
@@ -1208,12 +1264,15 @@ static int rebind_subsystems(struct cgroup_root *dst_root, unsigned int ss_mask)
up_write(&css_set_rwsem);
src_root->subsys_mask &= ~(1 << ssid);
- src_root->cgrp.child_subsys_mask &= ~(1 << ssid);
+ src_root->cgrp.subtree_control &= ~(1 << ssid);
+ cgroup_refresh_child_subsys_mask(&src_root->cgrp);
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
- if (dst_root != &cgrp_dfl_root)
- dst_root->cgrp.child_subsys_mask |= 1 << ssid;
+ if (dst_root != &cgrp_dfl_root) {
+ dst_root->cgrp.subtree_control |= 1 << ssid;
+ cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+ }
if (ss->bind)
ss->bind(css);
@@ -1233,8 +1292,6 @@ static int cgroup_show_options(struct seq_file *seq,
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
seq_printf(seq, ",%s", ss->name);
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR)
- seq_puts(seq, ",sane_behavior");
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1268,6 +1325,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
bool all_ss = false, one_ss = false;
unsigned int mask = -1U;
struct cgroup_subsys *ss;
+ int nr_opts = 0;
int i;
#ifdef CONFIG_CPUSETS
@@ -1277,6 +1335,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
memset(opts, 0, sizeof(*opts));
while ((token = strsep(&o, ",")) != NULL) {
+ nr_opts++;
+
if (!*token)
return -EINVAL;
if (!strcmp(token, "none")) {
@@ -1361,37 +1421,33 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
return -ENOENT;
}
- /* Consistency checks */
-
if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) {
pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n");
-
- if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) ||
- opts->cpuset_clone_children || opts->release_agent ||
- opts->name) {
- pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n");
+ if (nr_opts != 1) {
+ pr_err("sane_behavior: no other mount options allowed\n");
return -EINVAL;
}
- } else {
- /*
- * If the 'all' option was specified select all the
- * subsystems, otherwise if 'none', 'name=' and a subsystem
- * name options were not specified, let's default to 'all'
- */
- if (all_ss || (!one_ss && !opts->none && !opts->name))
- for_each_subsys(ss, i)
- if (!ss->disabled)
- opts->subsys_mask |= (1 << i);
-
- /*
- * We either have to specify by name or by subsystems. (So
- * all empty hierarchies must have a name).
- */
- if (!opts->subsys_mask && !opts->name)
- return -EINVAL;
+ return 0;
}
/*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise if 'none', 'name=' and a subsystem name options were
+ * not specified, let's default to 'all'
+ */
+ if (all_ss || (!one_ss && !opts->none && !opts->name))
+ for_each_subsys(ss, i)
+ if (!ss->disabled)
+ opts->subsys_mask |= (1 << i);
+
+ /*
+ * We either have to specify by name or by subsystems. (So all
+ * empty hierarchies must have a name).
+ */
+ if (!opts->subsys_mask && !opts->name)
+ return -EINVAL;
+
+ /*
* Option noprefix was introduced just for backward compatibility
* with the old cpuset, so we allow noprefix only if mounting just
* the cpuset subsystem.
@@ -1399,7 +1455,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask))
return -EINVAL;
-
/* Can't specify "none" and some subsystems */
if (opts->subsys_mask && opts->none)
return -EINVAL;
@@ -1414,8 +1469,8 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
struct cgroup_sb_opts opts;
unsigned int added_mask, removed_mask;
- if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: remount is not allowed\n");
+ if (root == &cgrp_dfl_root) {
+ pr_err("remount is not allowed\n");
return -EINVAL;
}
@@ -1434,11 +1489,10 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
removed_mask = root->subsys_mask & ~opts.subsys_mask;
/* Don't allow flags or name to change at remount */
- if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) ||
+ if ((opts.flags ^ root->flags) ||
(opts.name && strcmp(opts.name, root->name))) {
pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n",
- opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "",
- root->flags & CGRP_ROOT_OPTION_MASK, root->name);
+ opts.flags, opts.name ?: "", root->flags, root->name);
ret = -EINVAL;
goto out_unlock;
}
@@ -1563,6 +1617,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
{
LIST_HEAD(tmp_links);
struct cgroup *root_cgrp = &root->cgrp;
+ struct cftype *base_files;
struct css_set *cset;
int i, ret;
@@ -1600,7 +1655,12 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
}
root_cgrp->kn = root->kf_root->kn;
- ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true);
+ if (root == &cgrp_dfl_root)
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(root_cgrp, base_files, true);
if (ret)
goto destroy_root;
@@ -1638,7 +1698,7 @@ destroy_root:
exit_root_id:
cgroup_exit_root_id(root);
cancel_ref:
- percpu_ref_cancel_init(&root_cgrp->self.refcnt);
+ percpu_ref_exit(&root_cgrp->self.refcnt);
out:
free_cgrp_cset_links(&tmp_links);
return ret;
@@ -1672,7 +1732,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
/* look for a matching existing root */
- if (!opts.subsys_mask && !opts.none && !opts.name) {
+ if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) {
cgrp_dfl_root_visible = true;
root = &cgrp_dfl_root;
cgroup_get(&root->cgrp);
@@ -1730,15 +1790,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
goto out_unlock;
}
- if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) {
- if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) {
- pr_err("sane_behavior: new mount options should match the existing superblock\n");
- ret = -EINVAL;
- goto out_unlock;
- } else {
- pr_warn("new mount options do not match the existing superblock, will be ignored\n");
- }
- }
+ if (root->flags ^ opts.flags)
+ pr_warn("new mount options do not match the existing superblock, will be ignored\n");
/*
* We want to reuse @root whose lifetime is governed by its
@@ -2457,9 +2510,7 @@ static int cgroup_release_agent_show(struct seq_file *seq, void *v)
static int cgroup_sane_behavior_show(struct seq_file *seq, void *v)
{
- struct cgroup *cgrp = seq_css(seq)->cgroup;
-
- seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp));
+ seq_puts(seq, "0\n");
return 0;
}
@@ -2496,7 +2547,7 @@ static int cgroup_controllers_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control);
return 0;
}
@@ -2505,7 +2556,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- cgroup_print_ss_mask(seq, cgrp->child_subsys_mask);
+ cgroup_print_ss_mask(seq, cgrp->subtree_control);
return 0;
}
@@ -2611,6 +2662,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
loff_t off)
{
unsigned int enable = 0, disable = 0;
+ unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
struct cgroup *cgrp, *child;
struct cgroup_subsys *ss;
char *tok;
@@ -2650,11 +2702,26 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
for_each_subsys(ss, ssid) {
if (enable & (1 << ssid)) {
- if (cgrp->child_subsys_mask & (1 << ssid)) {
+ if (cgrp->subtree_control & (1 << ssid)) {
enable &= ~(1 << ssid);
continue;
}
+ /* unavailable or not enabled on the parent? */
+ if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
+ (cgroup_parent(cgrp) &&
+ !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
+
+ /*
+ * @ss is already enabled through dependency and
+ * we'll just make it visible. Skip draining.
+ */
+ if (cgrp->child_subsys_mask & (1 << ssid))
+ continue;
+
/*
* Because css offlining is asynchronous, userland
* might try to re-enable the same controller while
@@ -2677,23 +2744,15 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
return restart_syscall();
}
-
- /* unavailable or not enabled on the parent? */
- if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) ||
- (cgroup_parent(cgrp) &&
- !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) {
- ret = -ENOENT;
- goto out_unlock;
- }
} else if (disable & (1 << ssid)) {
- if (!(cgrp->child_subsys_mask & (1 << ssid))) {
+ if (!(cgrp->subtree_control & (1 << ssid))) {
disable &= ~(1 << ssid);
continue;
}
/* a child has it enabled? */
cgroup_for_each_live_child(child, cgrp) {
- if (child->child_subsys_mask & (1 << ssid)) {
+ if (child->subtree_control & (1 << ssid)) {
ret = -EBUSY;
goto out_unlock;
}
@@ -2707,7 +2766,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Except for the root, child_subsys_mask must be zero for a cgroup
+ * Except for the root, subtree_control must be zero for a cgroup
* with tasks so that child cgroups don't compete against tasks.
*/
if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) {
@@ -2716,36 +2775,75 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
}
/*
- * Create csses for enables and update child_subsys_mask. This
- * changes cgroup_e_css() results which in turn makes the
- * subsequent cgroup_update_dfl_csses() associate all tasks in the
- * subtree to the updated csses.
+ * Update subsys masks and calculate what needs to be done. More
+ * subsystems than specified may need to be enabled or disabled
+ * depending on subsystem dependencies.
+ */
+ cgrp->subtree_control |= enable;
+ cgrp->subtree_control &= ~disable;
+
+ old_ctrl = cgrp->child_subsys_mask;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ new_ctrl = cgrp->child_subsys_mask;
+
+ css_enable = ~old_ctrl & new_ctrl;
+ css_disable = old_ctrl & ~new_ctrl;
+ enable |= css_enable;
+ disable |= css_disable;
+
+ /*
+ * Create new csses or make the existing ones visible. A css is
+ * created invisible if it's being implicitly enabled through
+ * dependency. An invisible css is made visible when the userland
+ * explicitly enables it.
*/
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
continue;
cgroup_for_each_live_child(child, cgrp) {
- ret = create_css(child, ss);
+ if (css_enable & (1 << ssid))
+ ret = create_css(child, ss,
+ cgrp->subtree_control & (1 << ssid));
+ else
+ ret = cgroup_populate_dir(child, 1 << ssid);
if (ret)
goto err_undo_css;
}
}
- cgrp->child_subsys_mask |= enable;
- cgrp->child_subsys_mask &= ~disable;
-
+ /*
+ * At this point, cgroup_e_css() results reflect the new csses
+ * making the following cgroup_update_dfl_csses() properly update
+ * css associations of all tasks in the subtree.
+ */
ret = cgroup_update_dfl_csses(cgrp);
if (ret)
goto err_undo_css;
- /* all tasks are now migrated away from the old csses, kill them */
+ /*
+ * All tasks are migrated out of disabled csses. Kill or hide
+ * them. A css is hidden when the userland requests it to be
+ * disabled while other subsystems are still depending on it. The
+ * css must not actively control resources and be in the vanilla
+ * state if it's made visible again later. Controllers which may
+ * be depended upon should provide ->css_reset() for this purpose.
+ */
for_each_subsys(ss, ssid) {
if (!(disable & (1 << ssid)))
continue;
- cgroup_for_each_live_child(child, cgrp)
- kill_css(cgroup_css(child, ss));
+ cgroup_for_each_live_child(child, cgrp) {
+ struct cgroup_subsys_state *css = cgroup_css(child, ss);
+
+ if (css_disable & (1 << ssid)) {
+ kill_css(css);
+ } else {
+ cgroup_clear_dir(child, 1 << ssid);
+ if (ss->css_reset)
+ ss->css_reset(css);
+ }
+ }
}
kernfs_activate(cgrp->kn);
@@ -2755,8 +2853,9 @@ out_unlock:
return ret ?: nbytes;
err_undo_css:
- cgrp->child_subsys_mask &= ~enable;
- cgrp->child_subsys_mask |= disable;
+ cgrp->subtree_control &= ~enable;
+ cgrp->subtree_control |= disable;
+ cgroup_refresh_child_subsys_mask(cgrp);
for_each_subsys(ss, ssid) {
if (!(enable & (1 << ssid)))
@@ -2764,8 +2863,14 @@ err_undo_css:
cgroup_for_each_live_child(child, cgrp) {
struct cgroup_subsys_state *css = cgroup_css(child, ss);
- if (css)
+
+ if (!css)
+ continue;
+
+ if (css_enable & (1 << ssid))
kill_css(css);
+ else
+ cgroup_clear_dir(child, 1 << ssid);
}
}
goto out_unlock;
@@ -2878,9 +2983,9 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
/*
* This isn't a proper migration and its usefulness is very
- * limited. Disallow if sane_behavior.
+ * limited. Disallow on the default hierarchy.
*/
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return -EPERM;
/*
@@ -2964,9 +3069,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
for (cft = cfts; cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
- if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
+ if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
- if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp))
+ if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp))
continue;
@@ -3024,6 +3129,9 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
kfree(cft->kf_ops);
cft->kf_ops = NULL;
cft->ss = NULL;
+
+ /* revert flags set by cgroup core while adding @cfts */
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
}
}
@@ -3109,7 +3217,7 @@ int cgroup_rm_cftypes(struct cftype *cfts)
* function currently returns 0 as long as @cfts registration is successful
* even if some file creation attempts on existing cgroups fail.
*/
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
int ret;
@@ -3135,6 +3243,40 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
}
/**
+ * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the default hierarchy.
+ */
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_ONLY_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
+ * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Similar to cgroup_add_cftypes() but the added files are only used for
+ * the legacy hierarchies.
+ */
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
+{
+ struct cftype *cft;
+
+ for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+ cft->flags |= __CFTYPE_NOT_ON_DFL;
+ return cgroup_add_cftypes(ss, cfts);
+}
+
+/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
*
@@ -3699,8 +3841,9 @@ after:
*
* All this extra complexity was caused by the original implementation
* committing to an entirely unnecessary property. In the long term, we
- * want to do away with it. Explicitly scramble sort order if
- * sane_behavior so that no such expectation exists in the new interface.
+ * want to do away with it. Explicitly scramble sort order if on the
+ * default hierarchy so that no such expectation exists in the new
+ * interface.
*
* Scrambling is done by swapping every two consecutive bits, which is
* non-identity one-to-one mapping which disturbs sort order sufficiently.
@@ -3715,7 +3858,7 @@ static pid_t pid_fry(pid_t pid)
static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid)
{
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
return pid_fry(pid);
else
return pid;
@@ -3818,7 +3961,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
css_task_iter_end(&it);
length = n;
/* now sort & (if procs) strip out duplicates */
- if (cgroup_sane_behavior(cgrp))
+ if (cgroup_on_dfl(cgrp))
sort(array, length, sizeof(pid_t), fried_cmppid, NULL);
else
sort(array, length, sizeof(pid_t), cmppid, NULL);
@@ -4040,7 +4183,8 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
return 0;
}
-static struct cftype cgroup_base_files[] = {
+/* cgroup core interface files for the default hierarchy */
+static struct cftype cgroup_dfl_base_files[] = {
{
.name = "cgroup.procs",
.seq_start = cgroup_pidlist_start,
@@ -4052,46 +4196,52 @@ static struct cftype cgroup_base_files[] = {
.mode = S_IRUGO | S_IWUSR,
},
{
- .name = "cgroup.clone_children",
- .flags = CFTYPE_INSANE,
- .read_u64 = cgroup_clone_children_read,
- .write_u64 = cgroup_clone_children_write,
- },
- {
- .name = "cgroup.sane_behavior",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .seq_show = cgroup_sane_behavior_show,
- },
- {
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_root_controllers_show,
},
{
.name = "cgroup.controllers",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_controllers_show,
},
{
.name = "cgroup.subtree_control",
- .flags = CFTYPE_ONLY_ON_DFL,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
{
.name = "cgroup.populated",
- .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT,
+ .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = cgroup_populated_show,
},
+ { } /* terminate */
+};
- /*
- * Historical crazy stuff. These don't have "cgroup." prefix and
- * don't exist if sane_behavior. If you're depending on these, be
- * prepared to be burned.
- */
+/* cgroup core interface files for the legacy hierarchies */
+static struct cftype cgroup_legacy_base_files[] = {
+ {
+ .name = "cgroup.procs",
+ .seq_start = cgroup_pidlist_start,
+ .seq_next = cgroup_pidlist_next,
+ .seq_stop = cgroup_pidlist_stop,
+ .seq_show = cgroup_pidlist_show,
+ .private = CGROUP_FILE_PROCS,
+ .write = cgroup_procs_write,
+ .mode = S_IRUGO | S_IWUSR,
+ },
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
+ {
+ .name = "cgroup.sane_behavior",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = cgroup_sane_behavior_show,
+ },
{
.name = "tasks",
- .flags = CFTYPE_INSANE, /* use "procs" instead */
.seq_start = cgroup_pidlist_start,
.seq_next = cgroup_pidlist_next,
.seq_stop = cgroup_pidlist_stop,
@@ -4102,13 +4252,12 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "notify_on_release",
- .flags = CFTYPE_INSANE,
.read_u64 = cgroup_read_notify_on_release,
.write_u64 = cgroup_write_notify_on_release,
},
{
.name = "release_agent",
- .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT,
+ .flags = CFTYPE_ONLY_ON_ROOT,
.seq_show = cgroup_release_agent_show,
.write = cgroup_release_agent_write,
.max_write_len = PATH_MAX - 1,
@@ -4175,6 +4324,8 @@ static void css_free_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
struct cgroup *cgrp = css->cgroup;
+ percpu_ref_exit(&css->refcnt);
+
if (css->ss) {
/* css free path */
if (css->parent)
@@ -4314,12 +4465,14 @@ static void offline_css(struct cgroup_subsys_state *css)
* create_css - create a cgroup_subsys_state
* @cgrp: the cgroup new css will be associated with
* @ss: the subsys of new css
+ * @visible: whether to create control knobs for the new css or not
*
* Create a new css associated with @cgrp - @ss pair. On success, the new
- * css is online and installed in @cgrp with all interface files created.
- * Returns 0 on success, -errno on failure.
+ * css is online and installed in @cgrp with all interface files created if
+ * @visible. Returns 0 on success, -errno on failure.
*/
-static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
+static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
+ bool visible)
{
struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss);
@@ -4343,9 +4496,11 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss)
goto err_free_percpu_ref;
css->id = err;
- err = cgroup_populate_dir(cgrp, 1 << ss->id);
- if (err)
- goto err_free_id;
+ if (visible) {
+ err = cgroup_populate_dir(cgrp, 1 << ss->id);
+ if (err)
+ goto err_free_id;
+ }
/* @css is ready to be brought online now, make it visible */
list_add_tail_rcu(&css->sibling, &parent_css->children);
@@ -4372,7 +4527,7 @@ err_list_del:
err_free_id:
cgroup_idr_remove(&ss->css_idr, css->id);
err_free_percpu_ref:
- percpu_ref_cancel_init(&css->refcnt);
+ percpu_ref_exit(&css->refcnt);
err_free_css:
call_rcu(&css->rcu_head, css_free_rcu_fn);
return err;
@@ -4385,6 +4540,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
struct cgroup_root *root;
struct cgroup_subsys *ss;
struct kernfs_node *kn;
+ struct cftype *base_files;
int ssid, ret;
parent = cgroup_kn_lock_live(parent_kn);
@@ -4455,14 +4611,20 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
if (ret)
goto out_destroy;
- ret = cgroup_addrm_files(cgrp, cgroup_base_files, true);
+ if (cgroup_on_dfl(cgrp))
+ base_files = cgroup_dfl_base_files;
+ else
+ base_files = cgroup_legacy_base_files;
+
+ ret = cgroup_addrm_files(cgrp, base_files, true);
if (ret)
goto out_destroy;
/* let's create and online css's */
for_each_subsys(ss, ssid) {
if (parent->child_subsys_mask & (1 << ssid)) {
- ret = create_css(cgrp, ss);
+ ret = create_css(cgrp, ss,
+ parent->subtree_control & (1 << ssid));
if (ret)
goto out_destroy;
}
@@ -4470,10 +4632,12 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
/*
* On the default hierarchy, a child doesn't automatically inherit
- * child_subsys_mask from the parent. Each is configured manually.
+ * subtree_control from the parent. Each is configured manually.
*/
- if (!cgroup_on_dfl(cgrp))
- cgrp->child_subsys_mask = parent->child_subsys_mask;
+ if (!cgroup_on_dfl(cgrp)) {
+ cgrp->subtree_control = parent->subtree_control;
+ cgroup_refresh_child_subsys_mask(cgrp);
+ }
kernfs_activate(kn);
@@ -4483,7 +4647,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
out_free_id:
cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
out_cancel_ref:
- percpu_ref_cancel_init(&cgrp->self.refcnt);
+ percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
kfree(cgrp);
out_unlock:
@@ -4736,8 +4900,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
*/
int __init cgroup_init_early(void)
{
- static struct cgroup_sb_opts __initdata opts =
- { .flags = CGRP_ROOT_SANE_BEHAVIOR };
+ static struct cgroup_sb_opts __initdata opts;
struct cgroup_subsys *ss;
int i;
@@ -4775,7 +4938,8 @@ int __init cgroup_init(void)
unsigned long key;
int ssid, err;
- BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
mutex_lock(&cgroup_mutex);
@@ -4807,9 +4971,22 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (!ss->disabled) {
- cgrp_dfl_root.subsys_mask |= 1 << ss->id;
- WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes));
+ if (ss->disabled)
+ continue;
+
+ cgrp_dfl_root.subsys_mask |= 1 << ss->id;
+
+ if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
+ ss->dfl_cftypes = ss->legacy_cftypes;
+
+ if (!ss->dfl_cftypes)
+ cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
+
+ if (ss->dfl_cftypes == ss->legacy_cftypes) {
+ WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes));
+ } else {
+ WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes));
+ WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes));
}
}
@@ -5205,6 +5382,14 @@ static int __init cgroup_disable(char *str)
}
__setup("cgroup_disable=", cgroup_disable);
+static int __init cgroup_set_legacy_files_on_dfl(char *str)
+{
+ printk("cgroup: using legacy files on the default hierarchy\n");
+ cgroup_legacy_files_on_dfl = true;
+ return 0;
+}
+__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
+
/**
* css_tryget_online_from_dir - get corresponding css from a cgroup dentry
* @dentry: directory dentry of interest
@@ -5399,6 +5584,6 @@ static struct cftype debug_files[] = {
struct cgroup_subsys debug_cgrp_subsys = {
.css_alloc = debug_css_alloc,
.css_free = debug_css_free,
- .base_cftypes = debug_files,
+ .legacy_cftypes = debug_files,
};
#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index a79e40f9d700..92b98cc0ee76 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -480,5 +480,5 @@ struct cgroup_subsys freezer_cgrp_subsys = {
.css_free = freezer_css_free,
.attach = freezer_attach,
.fork = freezer_fork,
- .base_cftypes = files,
+ .legacy_cftypes = files,
};
diff --git a/kernel/cpu.c b/kernel/cpu.c
index a343bde710b1..81e2a388a0f6 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -274,21 +274,28 @@ void clear_tasks_mm_cpumask(int cpu)
rcu_read_unlock();
}
-static inline void check_for_tasks(int cpu)
+static inline void check_for_tasks(int dead_cpu)
{
- struct task_struct *p;
- cputime_t utime, stime;
+ struct task_struct *g, *p;
- write_lock_irq(&tasklist_lock);
- for_each_process(p) {
- task_cputime(p, &utime, &stime);
- if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
- (utime || stime))
- pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n",
- p->comm, task_pid_nr(p), cpu,
- p->state, p->flags);
- }
- write_unlock_irq(&tasklist_lock);
+ read_lock_irq(&tasklist_lock);
+ do_each_thread(g, p) {
+ if (!p->on_rq)
+ continue;
+ /*
+ * We do the check with unlocked task_rq(p)->lock.
+ * Order the reading to do not warn about a task,
+ * which was running on this cpu in the past, and
+ * it's just been woken on another cpu.
+ */
+ rmb();
+ if (task_cpu(p) != dead_cpu)
+ continue;
+
+ pr_warn("Task %s (pid=%d) is on cpu %d (state=%ld, flags=%x)\n",
+ p->comm, task_pid_nr(p), dead_cpu, p->state, p->flags);
+ } while_each_thread(g, p);
+ read_unlock_irq(&tasklist_lock);
}
struct take_cpu_down_param {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 116a4164720a..22874d7cf2c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -76,8 +76,34 @@ struct cpuset {
struct cgroup_subsys_state css;
unsigned long flags; /* "unsigned long" so bitops work */
- cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */
- nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */
+
+ /*
+ * On default hierarchy:
+ *
+ * The user-configured masks can only be changed by writing to
+ * cpuset.cpus and cpuset.mems, and won't be limited by the
+ * parent masks.
+ *
+ * The effective masks is the real masks that apply to the tasks
+ * in the cpuset. They may be changed if the configured masks are
+ * changed or hotplug happens.
+ *
+ * effective_mask == configured_mask & parent's effective_mask,
+ * and if it ends up empty, it will inherit the parent's mask.
+ *
+ *
+ * On legacy hierachy:
+ *
+ * The user-configured masks are always the same with effective masks.
+ */
+
+ /* user-configured CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t cpus_allowed;
+ nodemask_t mems_allowed;
+
+ /* effective CPUs and Memory Nodes allow to tasks */
+ cpumask_var_t effective_cpus;
+ nodemask_t effective_mems;
/*
* This is old Memory Nodes tasks took on.
@@ -307,9 +333,9 @@ static struct file_system_type cpuset_fs_type = {
*/
static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
+ while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask))
cs = parent_cs(cs);
- cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
+ cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
}
/*
@@ -325,9 +351,9 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
- while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY]))
+ while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
- nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]);
+ nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]);
}
/*
@@ -376,13 +402,20 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
if (!trial)
return NULL;
- if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) {
- kfree(trial);
- return NULL;
- }
- cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
+ cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
+ cpumask_copy(trial->effective_cpus, cs->effective_cpus);
return trial;
+
+free_cpus:
+ free_cpumask_var(trial->cpus_allowed);
+free_cs:
+ kfree(trial);
+ return NULL;
}
/**
@@ -391,6 +424,7 @@ static struct cpuset *alloc_trial_cpuset(struct cpuset *cs)
*/
static void free_trial_cpuset(struct cpuset *trial)
{
+ free_cpumask_var(trial->effective_cpus);
free_cpumask_var(trial->cpus_allowed);
kfree(trial);
}
@@ -436,9 +470,9 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
par = parent_cs(cur);
- /* We must be a subset of our parent cpuset */
+ /* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!is_cpuset_subset(trial, par))
+ if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -480,11 +514,11 @@ out:
#ifdef CONFIG_SMP
/*
* Helper routine for generate_sched_domains().
- * Do cpusets a, b have overlapping cpus_allowed masks?
+ * Do cpusets a, b have overlapping effective cpus_allowed masks?
*/
static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
{
- return cpumask_intersects(a->cpus_allowed, b->cpus_allowed);
+ return cpumask_intersects(a->effective_cpus, b->effective_cpus);
}
static void
@@ -601,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
*dattr = SD_ATTR_INIT;
update_domain_attr_tree(dattr, &top_cpuset);
}
- cpumask_copy(doms[0], top_cpuset.cpus_allowed);
+ cpumask_copy(doms[0], top_cpuset.effective_cpus);
goto done;
}
@@ -705,7 +739,7 @@ restart:
struct cpuset *b = csa[j];
if (apn == b->pn) {
- cpumask_or(dp, dp, b->cpus_allowed);
+ cpumask_or(dp, dp, b->effective_cpus);
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -757,7 +791,7 @@ static void rebuild_sched_domains_locked(void)
* passing doms with offlined cpu to partition_sched_domains().
* Anyways, hotplug work item will rebuild sched domains.
*/
- if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask))
+ if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
goto out;
/* Generate domain masks and attrs */
@@ -781,45 +815,6 @@ void rebuild_sched_domains(void)
mutex_unlock(&cpuset_mutex);
}
-/*
- * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus
- * @cs: the cpuset in interest
- *
- * A cpuset's effective cpumask is the cpumask of the nearest ancestor
- * with non-empty cpus. We use effective cpumask whenever:
- * - we update tasks' cpus_allowed. (they take on the ancestor's cpumask
- * if the cpuset they reside in has no cpus)
- * - we want to retrieve task_cs(tsk)'s cpus_allowed.
- *
- * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an
- * exception. See comments there.
- */
-static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs)
-{
- while (cpumask_empty(cs->cpus_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
-/*
- * effective_nodemask_cpuset - return nearest ancestor with non-empty mems
- * @cs: the cpuset in interest
- *
- * A cpuset's effective nodemask is the nodemask of the nearest ancestor
- * with non-empty memss. We use effective nodemask whenever:
- * - we update tasks' mems_allowed. (they take on the ancestor's nodemask
- * if the cpuset they reside in has no mems)
- * - we want to retrieve task_cs(tsk)'s mems_allowed.
- *
- * Called with cpuset_mutex held.
- */
-static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
-{
- while (nodes_empty(cs->mems_allowed))
- cs = parent_cs(cs);
- return cs;
-}
-
/**
* update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
@@ -830,53 +825,80 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs)
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
css_task_iter_start(&cs->css, &it);
while ((task = css_task_iter_next(&it)))
- set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed);
+ set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
}
/*
- * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy.
- * @root_cs: the root cpuset of the hierarchy
- * @update_root: update root cpuset or not?
+ * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_cpus: temp variable for calculating new effective_cpus
+ *
+ * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * and all its descendants need to be updated.
*
- * This will update cpumasks of tasks in @root_cs and all other empty cpusets
- * which take on cpumask of @root_cs.
+ * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root)
+static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
+ bool need_rebuild_sched_domains = false;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some CPUs.
+ */
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent->effective_cpus);
+
+ /* Skip the whole subtree if the cpumask remains the same. */
+ if (cpumask_equal(new_cpus, cp->effective_cpus)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cp->effective_cpus, new_cpus);
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
+
update_tasks_cpumask(cp);
+ /*
+ * If the effective cpumask of any non-empty cpuset is changed,
+ * we need to rebuild sched domains.
+ */
+ if (!cpumask_empty(cp->cpus_allowed) &&
+ is_sched_load_balance(cp))
+ need_rebuild_sched_domains = true;
+
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
+
+ if (need_rebuild_sched_domains)
+ rebuild_sched_domains_locked();
}
/**
@@ -889,7 +911,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
- int is_load_balanced;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -908,7 +929,8 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
+ if (!cpumask_subset(trialcs->cpus_allowed,
+ top_cpuset.cpus_allowed))
return -EINVAL;
}
@@ -920,16 +942,12 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
return retval;
- is_load_balanced = is_sched_load_balance(trialcs);
-
mutex_lock(&callback_mutex);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
mutex_unlock(&callback_mutex);
- update_tasks_cpumask_hier(cs, true);
-
- if (is_load_balanced)
- rebuild_sched_domains_locked();
+ /* use trialcs->cpus_allowed as a temp variable */
+ update_cpumasks_hier(cs, trialcs->cpus_allowed);
return 0;
}
@@ -951,15 +969,13 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct task_struct *tsk = current;
- struct cpuset *mems_cs;
tsk->mems_allowed = *to;
do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &tsk->mems_allowed);
+ guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
rcu_read_unlock();
}
@@ -1028,13 +1044,12 @@ static void *cpuset_being_rebound;
static void update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems; /* protected by cpuset_mutex */
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
- guarantee_online_mems(mems_cs, &newmems);
+ guarantee_online_mems(cs, &newmems);
/*
* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
@@ -1077,36 +1092,52 @@ static void update_tasks_nodemask(struct cpuset *cs)
}
/*
- * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy.
- * @cs: the root cpuset of the hierarchy
- * @update_root: update the root cpuset or not?
+ * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree
+ * @cs: the cpuset to consider
+ * @new_mems: a temp variable for calculating new effective_mems
*
- * This will update nodemasks of tasks in @root_cs and all other empty cpusets
- * which take on nodemask of @root_cs.
+ * When configured nodemask is changed, the effective nodemasks of this cpuset
+ * and all its descendants need to be updated.
+ *
+ * On legacy hiearchy, effective_mems will be the same with mems_allowed.
*
* Called with cpuset_mutex held
*/
-static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root)
+static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- if (cp == root_cs) {
- if (!update_root)
- continue;
- } else {
- /* skip the whole subtree if @cp have some CPU */
- if (!nodes_empty(cp->mems_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
+ cpuset_for_each_descendant_pre(cp, pos_css, cs) {
+ struct cpuset *parent = parent_cs(cp);
+
+ nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
+
+ /*
+ * If it becomes empty, inherit the effective mask of the
+ * parent, which is guaranteed to have some MEMs.
+ */
+ if (nodes_empty(*new_mems))
+ *new_mems = parent->effective_mems;
+
+ /* Skip the whole subtree if the nodemask remains the same. */
+ if (nodes_equal(*new_mems, cp->effective_mems)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
}
+
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
+ mutex_lock(&callback_mutex);
+ cp->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+ !nodes_equal(cp->mems_allowed, cp->effective_mems));
+
update_tasks_nodemask(cp);
rcu_read_lock();
@@ -1156,8 +1187,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
goto done;
if (!nodes_subset(trialcs->mems_allowed,
- node_states[N_MEMORY])) {
- retval = -EINVAL;
+ top_cpuset.mems_allowed)) {
+ retval = -EINVAL;
goto done;
}
}
@@ -1174,7 +1205,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
cs->mems_allowed = trialcs->mems_allowed;
mutex_unlock(&callback_mutex);
- update_tasks_nodemask_hier(cs, true);
+ /* use trialcs->mems_allowed as a temp variable */
+ update_nodemasks_hier(cs, &cs->mems_allowed);
done:
return retval;
}
@@ -1389,12 +1421,9 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
mutex_lock(&cpuset_mutex);
- /*
- * We allow to move tasks into an empty cpuset if sane_behavior
- * flag is set.
- */
+ /* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_sane_behavior(css->cgroup) &&
+ if (!cgroup_on_dfl(css->cgroup) &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1452,8 +1481,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
struct task_struct *leader = cgroup_taskset_first(tset);
struct cpuset *cs = css_cs(css);
struct cpuset *oldcs = cpuset_attach_old_cs;
- struct cpuset *cpus_cs = effective_cpumask_cpuset(cs);
- struct cpuset *mems_cs = effective_nodemask_cpuset(cs);
mutex_lock(&cpuset_mutex);
@@ -1461,9 +1488,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
- guarantee_online_cpus(cpus_cs, cpus_attach);
+ guarantee_online_cpus(cs, cpus_attach);
- guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to);
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, tset) {
/*
@@ -1480,11 +1507,9 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* Change mm, possibly for multiple threads in a threadgroup. This is
* expensive and may sleep.
*/
- cpuset_attach_nodemask_to = cs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->effective_mems;
mm = get_task_mm(leader);
if (mm) {
- struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs);
-
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
/*
@@ -1495,7 +1520,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
* mm from.
*/
if (is_memory_migrate(cs)) {
- cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed,
+ cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
}
mmput(mm);
@@ -1516,6 +1541,8 @@ typedef enum {
FILE_MEMORY_MIGRATE,
FILE_CPULIST,
FILE_MEMLIST,
+ FILE_EFFECTIVE_CPULIST,
+ FILE_EFFECTIVE_MEMLIST,
FILE_CPU_EXCLUSIVE,
FILE_MEM_EXCLUSIVE,
FILE_MEM_HARDWALL,
@@ -1694,6 +1721,12 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
case FILE_MEMLIST:
s += nodelist_scnprintf(s, count, cs->mems_allowed);
break;
+ case FILE_EFFECTIVE_CPULIST:
+ s += cpulist_scnprintf(s, count, cs->effective_cpus);
+ break;
+ case FILE_EFFECTIVE_MEMLIST:
+ s += nodelist_scnprintf(s, count, cs->effective_mems);
+ break;
default:
ret = -EINVAL;
goto out_unlock;
@@ -1779,6 +1812,18 @@ static struct cftype files[] = {
},
{
+ .name = "effective_cpus",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_CPULIST,
+ },
+
+ {
+ .name = "effective_mems",
+ .seq_show = cpuset_common_seq_show,
+ .private = FILE_EFFECTIVE_MEMLIST,
+ },
+
+ {
.name = "cpu_exclusive",
.read_u64 = cpuset_read_u64,
.write_u64 = cpuset_write_u64,
@@ -1869,18 +1914,26 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs)
return ERR_PTR(-ENOMEM);
- if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
- kfree(cs);
- return ERR_PTR(-ENOMEM);
- }
+ if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
+ goto free_cs;
+ if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
+ goto free_cpus;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed);
+ cpumask_clear(cs->effective_cpus);
+ nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
return &cs->css;
+
+free_cpus:
+ free_cpumask_var(cs->cpus_allowed);
+free_cs:
+ kfree(cs);
+ return ERR_PTR(-ENOMEM);
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
@@ -1903,6 +1956,13 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
+ mutex_lock(&callback_mutex);
+ if (cgroup_on_dfl(cs->css.cgroup)) {
+ cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+ cs->effective_mems = parent->effective_mems;
+ }
+ mutex_unlock(&callback_mutex);
+
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
goto out_unlock;
@@ -1962,20 +2022,40 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
+ free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
kfree(cs);
}
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+ mutex_lock(&cpuset_mutex);
+ mutex_lock(&callback_mutex);
+
+ if (cgroup_on_dfl(root_css->cgroup)) {
+ cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+ top_cpuset.mems_allowed = node_possible_map;
+ } else {
+ cpumask_copy(top_cpuset.cpus_allowed,
+ top_cpuset.effective_cpus);
+ top_cpuset.mems_allowed = top_cpuset.effective_mems;
+ }
+
+ mutex_unlock(&callback_mutex);
+ mutex_unlock(&cpuset_mutex);
+}
+
struct cgroup_subsys cpuset_cgrp_subsys = {
- .css_alloc = cpuset_css_alloc,
- .css_online = cpuset_css_online,
- .css_offline = cpuset_css_offline,
- .css_free = cpuset_css_free,
- .can_attach = cpuset_can_attach,
- .cancel_attach = cpuset_cancel_attach,
- .attach = cpuset_attach,
- .base_cftypes = files,
- .early_init = 1,
+ .css_alloc = cpuset_css_alloc,
+ .css_online = cpuset_css_online,
+ .css_offline = cpuset_css_offline,
+ .css_free = cpuset_css_free,
+ .can_attach = cpuset_can_attach,
+ .cancel_attach = cpuset_cancel_attach,
+ .attach = cpuset_attach,
+ .bind = cpuset_bind,
+ .legacy_cftypes = files,
+ .early_init = 1,
};
/**
@@ -1990,9 +2070,13 @@ int __init cpuset_init(void)
if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
+ cpumask_setall(top_cpuset.effective_cpus);
+ nodes_setall(top_cpuset.effective_mems);
fmeter_init(&top_cpuset.fmeter);
set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
@@ -2035,6 +2119,66 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
}
}
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ bool is_empty;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->cpus_allowed, new_cpus);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->mems_allowed = *new_mems;
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ /*
+ * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+ * as the tasks will be migratecd to an ancestor.
+ */
+ if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+ update_tasks_cpumask(cs);
+ if (mems_updated && !nodes_empty(cs->mems_allowed))
+ update_tasks_nodemask(cs);
+
+ is_empty = cpumask_empty(cs->cpus_allowed) ||
+ nodes_empty(cs->mems_allowed);
+
+ mutex_unlock(&cpuset_mutex);
+
+ /*
+ * Move tasks to the nearest ancestor with execution resources,
+ * This is full cgroup operation which will also call back into
+ * cpuset. Should be done outside any lock.
+ */
+ if (is_empty)
+ remove_tasks_in_empty_cpuset(cs);
+
+ mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+ struct cpumask *new_cpus, nodemask_t *new_mems,
+ bool cpus_updated, bool mems_updated)
+{
+ if (cpumask_empty(new_cpus))
+ cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+ if (nodes_empty(*new_mems))
+ *new_mems = parent_cs(cs)->effective_mems;
+
+ mutex_lock(&callback_mutex);
+ cpumask_copy(cs->effective_cpus, new_cpus);
+ cs->effective_mems = *new_mems;
+ mutex_unlock(&callback_mutex);
+
+ if (cpus_updated)
+ update_tasks_cpumask(cs);
+ if (mems_updated)
+ update_tasks_nodemask(cs);
+}
+
/**
* cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
* @cs: cpuset in interest
@@ -2045,11 +2189,10 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
*/
static void cpuset_hotplug_update_tasks(struct cpuset *cs)
{
- static cpumask_t off_cpus;
- static nodemask_t off_mems;
- bool is_empty;
- bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+ static cpumask_t new_cpus;
+ static nodemask_t new_mems;
+ bool cpus_updated;
+ bool mems_updated;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
@@ -2064,51 +2207,20 @@ retry:
goto retry;
}
- cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
- nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
-
- mutex_lock(&callback_mutex);
- cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' cpumask
- * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
- * call update_tasks_cpumask() if the cpuset becomes empty, as
- * the tasks in it will be migrated to an ancestor.
- */
- if ((sane && cpumask_empty(cs->cpus_allowed)) ||
- (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
- update_tasks_cpumask(cs);
+ cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
- mutex_lock(&callback_mutex);
- nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
- mutex_unlock(&callback_mutex);
-
- /*
- * If sane_behavior flag is set, we need to update tasks' nodemask
- * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
- * call update_tasks_nodemask() if the cpuset becomes empty, as
- * the tasks in it will be migratd to an ancestor.
- */
- if ((sane && nodes_empty(cs->mems_allowed)) ||
- (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
- update_tasks_nodemask(cs);
+ cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+ mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
+ if (cgroup_on_dfl(cs->css.cgroup))
+ hotplug_update_tasks(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
+ else
+ hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+ cpus_updated, mems_updated);
mutex_unlock(&cpuset_mutex);
-
- /*
- * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
- *
- * Otherwise move tasks to the nearest ancestor with execution
- * resources. This is full cgroup operation which will
- * also call back into cpuset. Should be done outside any lock.
- */
- if (!sane && is_empty)
- remove_tasks_in_empty_cpuset(cs);
}
/**
@@ -2132,6 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
+ bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
mutex_lock(&cpuset_mutex);
@@ -2139,13 +2252,15 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
- cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
- mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+ cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+ mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
- cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ if (!on_dfl)
+ cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+ cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
@@ -2153,7 +2268,9 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
mutex_lock(&callback_mutex);
- top_cpuset.mems_allowed = new_mems;
+ if (!on_dfl)
+ top_cpuset.mems_allowed = new_mems;
+ top_cpuset.effective_mems = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset);
}
@@ -2228,6 +2345,9 @@ void __init cpuset_init_smp(void)
top_cpuset.mems_allowed = node_states[N_MEMORY];
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
+ cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+ top_cpuset.effective_mems = node_states[N_MEMORY];
+
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
}
@@ -2244,23 +2364,17 @@ void __init cpuset_init_smp(void)
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
- struct cpuset *cpus_cs;
-
mutex_lock(&callback_mutex);
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- guarantee_online_cpus(cpus_cs, pmask);
+ guarantee_online_cpus(task_cs(tsk), pmask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
}
void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
- struct cpuset *cpus_cs;
-
rcu_read_lock();
- cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
- do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
rcu_read_unlock();
/*
@@ -2299,13 +2413,11 @@ void cpuset_init_current_mems_allowed(void)
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
- struct cpuset *mems_cs;
nodemask_t mask;
mutex_lock(&callback_mutex);
rcu_read_lock();
- mems_cs = effective_nodemask_cpuset(task_cs(tsk));
- guarantee_online_mems(mems_cs, &mask);
+ guarantee_online_mems(task_cs(tsk), &mask);
rcu_read_unlock();
mutex_unlock(&callback_mutex);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index de699f42f9bc..ef90b04d783f 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -46,32 +46,25 @@ void __delayacct_tsk_init(struct task_struct *tsk)
}
/*
- * Finish delay accounting for a statistic using
- * its timestamps (@start, @end), accumalator (@total) and @count
+ * Finish delay accounting for a statistic using its timestamps (@start),
+ * accumalator (@total) and @count
*/
-
-static void delayacct_end(struct timespec *start, struct timespec *end,
- u64 *total, u32 *count)
+static void delayacct_end(u64 *start, u64 *total, u32 *count)
{
- struct timespec ts;
- s64 ns;
+ s64 ns = ktime_get_ns() - *start;
unsigned long flags;
- ktime_get_ts(end);
- ts = timespec_sub(*end, *start);
- ns = timespec_to_ns(&ts);
- if (ns < 0)
- return;
-
- spin_lock_irqsave(&current->delays->lock, flags);
- *total += ns;
- (*count)++;
- spin_unlock_irqrestore(&current->delays->lock, flags);
+ if (ns > 0) {
+ spin_lock_irqsave(&current->delays->lock, flags);
+ *total += ns;
+ (*count)++;
+ spin_unlock_irqrestore(&current->delays->lock, flags);
+ }
}
void __delayacct_blkio_start(void)
{
- ktime_get_ts(&current->delays->blkio_start);
+ current->delays->blkio_start = ktime_get_ns();
}
void __delayacct_blkio_end(void)
@@ -79,35 +72,29 @@ void __delayacct_blkio_end(void)
if (current->delays->flags & DELAYACCT_PF_SWAPIN)
/* Swapin block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->swapin_delay,
&current->delays->swapin_count);
else /* Other block I/O */
delayacct_end(&current->delays->blkio_start,
- &current->delays->blkio_end,
&current->delays->blkio_delay,
&current->delays->blkio_count);
}
int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
{
- s64 tmp;
- unsigned long t1;
- unsigned long long t2, t3;
- unsigned long flags;
- struct timespec ts;
cputime_t utime, stime, stimescaled, utimescaled;
+ unsigned long long t2, t3;
+ unsigned long flags, t1;
+ s64 tmp;
- tmp = (s64)d->cpu_run_real_total;
task_cputime(tsk, &utime, &stime);
- cputime_to_timespec(utime + stime, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_run_real_total;
+ tmp += cputime_to_nsecs(utime + stime);
d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
- tmp = (s64)d->cpu_scaled_run_real_total;
task_cputime_scaled(tsk, &utimescaled, &stimescaled);
- cputime_to_timespec(utimescaled + stimescaled, &ts);
- tmp += timespec_to_ns(&ts);
+ tmp = (s64)d->cpu_scaled_run_real_total;
+ tmp += cputime_to_nsecs(utimescaled + stimescaled);
d->cpu_scaled_run_real_total =
(tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp;
@@ -159,13 +146,12 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
void __delayacct_freepages_start(void)
{
- ktime_get_ts(&current->delays->freepages_start);
+ current->delays->freepages_start = ktime_get_ns();
}
void __delayacct_freepages_end(void)
{
delayacct_end(&current->delays->freepages_start,
- &current->delays->freepages_end,
&current->delays->freepages_delay,
&current->delays->freepages_count);
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6b17ac1b0c2a..1cf24b3e42ec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5266,6 +5266,12 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
goto got_name;
} else {
+ if (vma->vm_ops && vma->vm_ops->name) {
+ name = (char *) vma->vm_ops->name(vma);
+ if (name)
+ goto cpy_name;
+ }
+
name = (char *)arch_vma_name(vma);
if (name)
goto cpy_name;
@@ -7804,7 +7810,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
/*
* Initialize the perf_event context in task_struct
*/
-int perf_event_init_context(struct task_struct *child, int ctxn)
+static int perf_event_init_context(struct task_struct *child, int ctxn)
{
struct perf_event_context *child_ctx, *parent_ctx;
struct perf_event_context *cloned_ctx;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
/* For mmu_notifiers */
const unsigned long mmun_start = addr;
const unsigned long mmun_end = addr + PAGE_SIZE;
+ struct mem_cgroup *memcg;
+
+ err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+ if (err)
+ return err;
/* For try_to_free_swap() and munlock_vma_page() below */
lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr);
+ mem_cgroup_commit_charge(kpage, memcg, false);
+ lru_cache_add_active_or_unevictable(kpage, vma);
if (!PageAnon(page)) {
dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
+ mem_cgroup_cancel_charge(kpage, memcg);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
@@ -315,18 +323,11 @@ retry:
if (!new_page)
goto put_old;
- if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
- goto put_new;
-
__SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
ret = __replace_page(vma, vaddr, old_page, new_page);
- if (ret)
- mem_cgroup_uncharge_page(new_page);
-
-put_new:
page_cache_release(new_page);
put_old:
put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..32c58f7433a3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
-static void exit_mm(struct task_struct * tsk);
+static void exit_mm(struct task_struct *tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
@@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk)
spin_unlock(&sighand->siglock);
__cleanup_sighand(sighand);
- clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
+ clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
if (group_dead) {
flush_sigqueue(&sig->shared_pending);
tty_kref_put(tty);
@@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
}
-void release_task(struct task_struct * p)
+void release_task(struct task_struct *p)
{
struct task_struct *leader;
int zap_leader;
@@ -192,7 +192,8 @@ repeat:
*/
zap_leader = 0;
leader = p->group_leader;
- if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
+ if (leader != p && thread_group_empty(leader)
+ && leader->exit_state == EXIT_ZOMBIE) {
/*
* If we were the last child thread and the leader has
* exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
*
* "I ask you, have you ever known what it is to be an orphan?"
*/
-static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task)
+static int will_become_orphaned_pgrp(struct pid *pgrp,
+ struct task_struct *ignored_task)
{
struct task_struct *p;
@@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
struct task_struct *ignored_task = tsk;
if (!parent)
- /* exit: our father is in a different pgrp than
- * we are and we were the only connection outside.
- */
+ /* exit: our father is in a different pgrp than
+ * we are and we were the only connection outside.
+ */
parent = tsk->real_parent;
else
/* reparent: our child is in a different pgrp than
@@ -405,7 +407,7 @@ assign_new_owner:
* Turn us into a lazy TLB process if we
* aren't already..
*/
-static void exit_mm(struct task_struct * tsk)
+static void exit_mm(struct task_struct *tsk)
{
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
@@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk)
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
+
up_read(&mm->mmap_sem);
self.task = tsk;
@@ -455,6 +458,7 @@ static void exit_mm(struct task_struct * tsk)
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
+ clear_thread_flag(TIF_MEMDIE);
}
/*
@@ -565,6 +569,7 @@ static void forget_original_parent(struct task_struct *father)
list_for_each_entry_safe(p, n, &father->children, sibling) {
struct task_struct *t = p;
+
do {
t->real_parent = reaper;
if (t->parent == father) {
@@ -598,7 +603,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/*
* This does two things:
*
- * A. Make init inherit all the child processes
+ * A. Make init inherit all the child processes
* B. Check to see if any process groups have become orphaned
* as a result of our exiting, and if they have any stopped
* jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +653,8 @@ static void check_stack_usage(void)
spin_lock(&low_water_lock);
if (free < lowest_to_date) {
- printk(KERN_WARNING "%s (%d) used greatest stack depth: "
- "%lu bytes left\n",
- current->comm, task_pid_nr(current), free);
+ pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
+ current->comm, task_pid_nr(current), free);
lowest_to_date = free;
}
spin_unlock(&low_water_lock);
@@ -691,8 +695,7 @@ void do_exit(long code)
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
- printk(KERN_ALERT
- "Fixing recursive fault but reboot is needed!\n");
+ pr_alert("Fixing recursive fault but reboot is needed!\n");
/*
* We can do this unlocked here. The futex code uses
* this flag just to verify whether the pi state
@@ -716,9 +719,9 @@ void do_exit(long code)
raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic()))
- printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
- current->comm, task_pid_nr(current),
- preempt_count());
+ pr_info("note: %s[%d] exited with preempt_count %d\n",
+ current->comm, task_pid_nr(current),
+ preempt_count());
acct_update_integrals(tsk);
/* sync mm's RSS info before statistics gathering */
@@ -836,7 +839,6 @@ void do_exit(long code)
for (;;)
cpu_relax(); /* For when BUG is null */
}
-
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +848,6 @@ void complete_and_exit(struct completion *comp, long code)
do_exit(code);
}
-
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +870,7 @@ do_group_exit(int exit_code)
exit_code = sig->group_exit_code;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
+
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
/* Another thread got here before we took the lock. */
@@ -1033,9 +1035,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* as other threads in the parent group can be right
* here reaping other children at the same time.
*
- * We use thread_group_cputime_adjusted() to get times for the thread
- * group, which consolidates times for all threads in the
- * group including the group leader.
+ * We use thread_group_cputime_adjusted() to get times for
+ * the thread group, which consolidates times for all threads
+ * in the group including the group leader.
*/
thread_group_cputime_adjusted(p, &tgutime, &tgstime);
spin_lock_irq(&p->real_parent->sighand->siglock);
@@ -1417,6 +1419,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->children, sibling) {
int ret = wait_consider_task(wo, 0, p);
+
if (ret)
return ret;
}
@@ -1430,6 +1433,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
int ret = wait_consider_task(wo, 1, p);
+
if (ret)
return ret;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 627b7f80afb0..1380d8ace334 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -315,6 +315,15 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
goto free_ti;
tsk->stack = ti;
+#ifdef CONFIG_SECCOMP
+ /*
+ * We must handle setting up seccomp filters once we're under
+ * the sighand lock in case orig has changed between now and
+ * then. Until then, filter must be NULL to avoid messing up
+ * the usage counts on the error path calling free_task.
+ */
+ tsk->seccomp.filter = NULL;
+#endif
setup_thread_stack(tsk, orig);
clear_user_return_notifier(tsk);
@@ -365,12 +374,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
- mm->locked_vm = 0;
- mm->mmap = NULL;
- mm->vmacache_seqnum = 0;
- mm->map_count = 0;
- cpumask_clear(mm_cpumask(mm));
- mm->mm_rb = RB_ROOT;
+ mm->total_vm = oldmm->total_vm;
+ mm->shared_vm = oldmm->shared_vm;
+ mm->exec_vm = oldmm->exec_vm;
+ mm->stack_vm = oldmm->stack_vm;
+
rb_link = &mm->mm_rb.rb_node;
rb_parent = NULL;
pprev = &mm->mmap;
@@ -421,7 +429,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
atomic_dec(&inode->i_writecount);
mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
- mapping->i_mmap_writable++;
+ atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -527,19 +535,37 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+#ifdef CONFIG_MEMCG
+ mm->owner = p;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
{
+ mm->mmap = NULL;
+ mm->mm_rb = RB_ROOT;
+ mm->vmacache_seqnum = 0;
atomic_set(&mm->mm_users, 1);
atomic_set(&mm->mm_count, 1);
init_rwsem(&mm->mmap_sem);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
atomic_long_set(&mm->nr_ptes, 0);
+ mm->map_count = 0;
+ mm->locked_vm = 0;
+ mm->pinned_vm = 0;
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
+ mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
+ mm->pmd_huge_pte = NULL;
+#endif
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -549,11 +575,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->def_flags = 0;
}
- if (likely(!mm_alloc_pgd(mm))) {
- mmu_notifier_mm_init(mm);
- return mm;
- }
+ if (mm_alloc_pgd(mm))
+ goto fail_nopgd;
+
+ if (init_new_context(p, mm))
+ goto fail_nocontext;
+ return mm;
+
+fail_nocontext:
+ mm_free_pgd(mm);
+fail_nopgd:
free_mm(mm);
return NULL;
}
@@ -587,7 +619,6 @@ struct mm_struct *mm_alloc(void)
return NULL;
memset(mm, 0, sizeof(*mm));
- mm_init_cpumask(mm);
return mm_init(mm, current);
}
@@ -819,17 +850,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
goto fail_nomem;
memcpy(mm, oldmm, sizeof(*mm));
- mm_init_cpumask(mm);
-#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- mm->pmd_huge_pte = NULL;
-#endif
if (!mm_init(mm, tsk))
goto fail_nomem;
- if (init_new_context(tsk, mm))
- goto fail_nocontext;
-
dup_mm_exe_file(oldmm, mm);
err = dup_mmap(mm, oldmm);
@@ -851,15 +875,6 @@ free_pt:
fail_nomem:
return NULL;
-
-fail_nocontext:
- /*
- * If init_new_context() failed, we cannot use mmput() to free the mm
- * because it calls destroy_context()
- */
- mm_free_pgd(mm);
- free_mm(mm);
- return NULL;
}
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1081,6 +1096,39 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
return 0;
}
+static void copy_seccomp(struct task_struct *p)
+{
+#ifdef CONFIG_SECCOMP
+ /*
+ * Must be called with sighand->lock held, which is common to
+ * all threads in the group. Holding cred_guard_mutex is not
+ * needed because this new task is not yet running and cannot
+ * be racing exec.
+ */
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ /* Ref-count the new filter user, and assign it. */
+ get_seccomp_filter(current);
+ p->seccomp = current->seccomp;
+
+ /*
+ * Explicitly enable no_new_privs here in case it got set
+ * between the task_struct being duplicated and holding the
+ * sighand lock. The seccomp state and nnp must be in sync.
+ */
+ if (task_no_new_privs(current))
+ task_set_no_new_privs(p);
+
+ /*
+ * If the parent gained a seccomp mode after copying thread
+ * flags and between before we held the sighand lock, we have
+ * to manually enable the seccomp thread flag here.
+ */
+ if (p->seccomp.mode != SECCOMP_MODE_DISABLED)
+ set_tsk_thread_flag(p, TIF_SECCOMP);
+#endif
+}
+
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
{
current->clear_child_tid = tidptr;
@@ -1095,17 +1143,9 @@ static void rt_mutex_init_task(struct task_struct *p)
p->pi_waiters = RB_ROOT;
p->pi_waiters_leftmost = NULL;
p->pi_blocked_on = NULL;
- p->pi_top_task = NULL;
#endif
}
-#ifdef CONFIG_MEMCG
-void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
-{
- mm->owner = p;
-}
-#endif /* CONFIG_MEMCG */
-
/*
* Initialize POSIX timer handling for a single task.
*/
@@ -1196,7 +1236,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto fork_out;
ftrace_graph_init_task(p);
- get_seccomp_filter(p);
rt_mutex_init_task(p);
@@ -1306,10 +1345,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
#endif
-#ifdef CONFIG_MEMCG
- p->memcg_batch.do_batch = 0;
- p->memcg_batch.memcg = NULL;
-#endif
#ifdef CONFIG_BCACHE
p->sequential_io = 0;
p->sequential_io_avg = 0;
@@ -1327,6 +1362,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (retval)
goto bad_fork_cleanup_policy;
/* copy all the process information */
+ shm_init_task(p);
retval = copy_semundo(clone_flags, p);
if (retval)
goto bad_fork_cleanup_audit;
@@ -1436,6 +1472,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
spin_lock(&current->sighand->siglock);
/*
+ * Copy seccomp details explicitly here, in case they were changed
+ * before holding sighand lock.
+ */
+ copy_seccomp(p);
+
+ /*
* Process group and session signals need to be delivered to just the
* parent before the fork or both the parent and the child after the
* fork. Restart if a signal comes in before we add the new process to
@@ -1872,6 +1914,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
*/
exit_sem(current);
}
+ if (unshare_flags & CLONE_NEWIPC) {
+ /* Orphan segments in old ns (see sem above). */
+ exit_shm(current);
+ shm_init_task(current);
+ }
if (new_nsproxy)
switch_task_namespaces(current, new_nsproxy);
diff --git a/kernel/futex.c b/kernel/futex.c
index b632b5f3f094..d3a9d946d0b7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -792,94 +792,91 @@ void exit_pi_state_list(struct task_struct *curr)
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
*/
-static int
-lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+
+/*
+ * Validate that the existing waiter has a pi_state and sanity check
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
{
- struct futex_pi_state *pi_state = NULL;
- struct futex_q *this, *next;
- struct task_struct *p;
pid_t pid = uval & FUTEX_TID_MASK;
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (match_futex(&this->key, key)) {
- /*
- * Sanity check the waiter before increasing
- * the refcount and attaching to it.
- */
- pi_state = this->pi_state;
- /*
- * Userspace might have messed up non-PI and
- * PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
- WARN_ON(!atomic_read(&pi_state->refcount));
+ WARN_ON(!atomic_read(&pi_state->refcount));
+ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+ /*
+ * exit_pi_state_list sets owner to NULL and wakes the
+ * topmost waiter. The task which acquires the
+ * pi_state->rt_mutex will fixup owner.
+ */
+ if (!pi_state->owner) {
/*
- * Handle the owner died case:
+ * No pi state owner, but the user space TID
+ * is not 0. Inconsistent state. [5]
*/
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and
- * wakes the topmost waiter. The task which
- * acquires the pi_state->rt_mutex will fixup
- * owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user
- * space TID is not 0. Inconsistent
- * state. [5]
- */
- if (pid)
- return -EINVAL;
- /*
- * Take a ref on the state and
- * return. [4]
- */
- goto out_state;
- }
-
- /*
- * If TID is 0, then either the dying owner
- * has not yet executed exit_pi_state_list()
- * or some waiter acquired the rtmutex in the
- * pi state, but did not yet fixup the TID in
- * user space.
- *
- * Take a ref on the state and return. [6]
- */
- if (!pid)
- goto out_state;
- } else {
- /*
- * If the owner died bit is not set,
- * then the pi_state must have an
- * owner. [7]
- */
- if (!pi_state->owner)
- return -EINVAL;
- }
-
+ if (pid)
+ return -EINVAL;
/*
- * Bail out if user space manipulated the
- * futex value. If pi state exists then the
- * owner TID must be the same as the user
- * space TID. [9/10]
+ * Take a ref on the state and return success. [4]
*/
- if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-
- out_state:
- atomic_inc(&pi_state->refcount);
- *ps = pi_state;
- return 0;
+ goto out_state;
}
+
+ /*
+ * If TID is 0, then either the dying owner has not
+ * yet executed exit_pi_state_list() or some waiter
+ * acquired the rtmutex in the pi state, but did not
+ * yet fixup the TID in user space.
+ *
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+ goto out_state;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+ return -EINVAL;
}
/*
+ * Bail out if user space manipulated the futex value. If pi
+ * state exists then the owner TID must be the same as the
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+ return -EINVAL;
+out_state:
+ atomic_inc(&pi_state->refcount);
+ *ps = pi_state;
+ return 0;
+}
+
+/*
+ * Lookup the task for the TID provided from user space and attach to
+ * it after doing proper sanity checks.
+ */
+static int attach_to_pi_owner(u32 uval, union futex_key *key,
+ struct futex_pi_state **ps)
+{
+ pid_t pid = uval & FUTEX_TID_MASK;
+ struct futex_pi_state *pi_state;
+ struct task_struct *p;
+
+ /*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
*/
@@ -920,7 +917,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
pi_state = alloc_pi_state();
/*
- * Initialize the pi_mutex in locked state and make 'p'
+ * Initialize the pi_mutex in locked state and make @p
* the owner of it:
*/
rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
@@ -940,6 +937,36 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
return 0;
}
+static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+{
+ struct futex_q *match = futex_top_waiter(hb, key);
+
+ /*
+ * If there is a waiter on that futex, validate it and
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+ * @uval and attach to it.
+ */
+ return attach_to_pi_owner(uval, key, ps);
+}
+
+static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
+{
+ u32 uninitialized_var(curval);
+
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+ /*If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+}
+
/**
* futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
* @uaddr: the pi futex user address
@@ -963,113 +990,69 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct futex_pi_state **ps,
struct task_struct *task, int set_waiters)
{
- int lock_taken, ret, force_take = 0;
- u32 uval, newval, curval, vpid = task_pid_vnr(task);
-
-retry:
- ret = lock_taken = 0;
+ u32 uval, newval, vpid = task_pid_vnr(task);
+ struct futex_q *match;
+ int ret;
/*
- * To avoid races, we attempt to take the lock here again
- * (by doing a 0 -> TID atomic cmpxchg), while holding all
- * the locks. It will most likely not succeed.
+ * Read the user space value first so we can validate a few
+ * things before proceeding further.
*/
- newval = vpid;
- if (set_waiters)
- newval |= FUTEX_WAITERS;
-
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
+ if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
+ if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
- * Surprise - we got the lock, but we do not trust user space at all.
- */
- if (unlikely(!curval)) {
- /*
- * We verify whether there is kernel state for this
- * futex. If not, we can safely assume, that the 0 ->
- * TID transition is correct. If state exists, we do
- * not bother to fixup the user space state as it was
- * corrupted already.
- */
- return futex_top_waiter(hb, key) ? -EINVAL : 1;
- }
-
- uval = curval;
-
- /*
- * Set the FUTEX_WAITERS flag, so the owner will know it has someone
- * to wake at the next unlock.
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
*/
- newval = curval | FUTEX_WAITERS;
+ match = futex_top_waiter(hb, key);
+ if (match)
+ return attach_to_pi_state(uval, match->pi_state, ps);
/*
- * Should we force take the futex? See below.
+ * No waiter and user TID is 0. We are here because the
+ * waiters or the owner died bit is set or called from
+ * requeue_cmp_pi or for whatever reason something took the
+ * syscall.
*/
- if (unlikely(force_take)) {
+ if (!(uval & FUTEX_TID_MASK)) {
/*
- * Keep the OWNER_DIED and the WAITERS bit and set the
- * new TID value.
+ * We take over the futex. No other waiters and the user space
+ * TID is 0. We preserve the owner died bit.
*/
- newval = (curval & ~FUTEX_TID_MASK) | vpid;
- force_take = 0;
- lock_taken = 1;
- }
+ newval = uval & FUTEX_OWNER_DIED;
+ newval |= vpid;
- if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
- return -EFAULT;
- if (unlikely(curval != uval))
- goto retry;
+ /* The futex requeue_pi code can enforce the waiters bit */
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
+
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ /* If the take over worked, return 1 */
+ return ret < 0 ? ret : 1;
+ }
/*
- * We took the lock due to forced take over.
+ * First waiter. Set the waiters bit before attaching ourself to
+ * the owner. If owner tries to unlock, it will be forced into
+ * the kernel and blocked on hb->lock.
*/
- if (unlikely(lock_taken))
- return 1;
-
+ newval = uval | FUTEX_WAITERS;
+ ret = lock_pi_update_atomic(uaddr, uval, newval);
+ if (ret)
+ return ret;
/*
- * We dont have the lock. Look up the PI state (or create it if
- * we are the first waiter):
+ * If the update of the user space value succeeded, we try to
+ * attach to the owner. If that fails, no harm done, we only
+ * set the FUTEX_WAITERS bit in the user space variable.
*/
- ret = lookup_pi_state(uval, hb, key, ps);
-
- if (unlikely(ret)) {
- switch (ret) {
- case -ESRCH:
- /*
- * We failed to find an owner for this
- * futex. So we have no pi_state to block
- * on. This can happen in two cases:
- *
- * 1) The owner died
- * 2) A stale FUTEX_WAITERS bit
- *
- * Re-read the futex value.
- */
- if (get_futex_value_locked(&curval, uaddr))
- return -EFAULT;
-
- /*
- * If the owner died or we have a stale
- * WAITERS bit the owner TID in the user space
- * futex is 0.
- */
- if (!(curval & FUTEX_TID_MASK)) {
- force_take = 1;
- goto retry;
- }
- default:
- break;
- }
- }
-
- return ret;
+ return attach_to_pi_owner(uval, key, ps);
}
/**
@@ -1186,22 +1169,6 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
return 0;
}
-static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
-{
- u32 uninitialized_var(oldval);
-
- /*
- * There is no waiter, so we unlock the futex. The owner died
- * bit has not to be preserved here. We are the owner:
- */
- if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
- return -EFAULT;
- if (oldval != uval)
- return -EAGAIN;
-
- return 0;
-}
-
/*
* Express the locking dependencies for lockdep:
*/
@@ -1659,7 +1626,12 @@ retry_private:
goto retry;
goto out;
case -EAGAIN:
- /* The owner was exiting, try again. */
+ /*
+ * Two reasons for this:
+ * - Owner is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
+ */
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1718,7 +1690,7 @@ retry_private:
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
- this->task, 1);
+ this->task);
if (ret == 1) {
/* We got the lock. */
requeue_pi_wake_futex(this, &key2, hb2);
@@ -2316,8 +2288,10 @@ retry_private:
goto uaddr_faulted;
case -EAGAIN:
/*
- * Task is exiting and we just wait for the
- * exit to complete.
+ * Two reasons for this:
+ * - Task is exiting and we just wait for the
+ * exit to complete.
+ * - The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
@@ -2337,9 +2311,9 @@ retry_private:
/*
* Block on the PI mutex:
*/
- if (!trylock)
- ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
- else {
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
@@ -2401,10 +2375,10 @@ uaddr_faulted:
*/
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
- struct futex_hash_bucket *hb;
- struct futex_q *this, *next;
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
- u32 uval, vpid = task_pid_vnr(current);
+ struct futex_hash_bucket *hb;
+ struct futex_q *match;
int ret;
retry:
@@ -2417,57 +2391,47 @@ retry:
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
- if (unlikely(ret != 0))
- goto out;
+ if (ret)
+ return ret;
hb = hash_futex(&key);
spin_lock(&hb->lock);
/*
- * To avoid races, try to do the TID -> 0 atomic transition
- * again. If it succeeds then we can return without waking
- * anyone else up. We only try this if neither the waiters nor
- * the owner died bit are set.
- */
- if (!(uval & ~FUTEX_TID_MASK) &&
- cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
- goto pi_faulted;
- /*
- * Rare case: we managed to release the lock atomically,
- * no need to wake anyone else up:
- */
- if (unlikely(uval == vpid))
- goto out_unlock;
-
- /*
- * Ok, other tasks may need to be woken up - check waiters
- * and do the wakeup if necessary:
+ * Check waiters first. We do not trust user space values at
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
*/
- plist_for_each_entry_safe(this, next, &hb->chain, list) {
- if (!match_futex (&this->key, &key))
- continue;
- ret = wake_futex_pi(uaddr, uval, this);
+ match = futex_top_waiter(hb, &key);
+ if (match) {
+ ret = wake_futex_pi(uaddr, uval, match);
/*
- * The atomic access to the futex value
- * generated a pagefault, so retry the
- * user-access and the wakeup:
+ * The atomic access to the futex value generated a
+ * pagefault, so retry the user-access and the wakeup:
*/
if (ret == -EFAULT)
goto pi_faulted;
goto out_unlock;
}
+
/*
- * No waiters - kernel unlocks the futex:
+ * We have no kernel internal state, i.e. no waiters in the
+ * kernel. Waiters which are about to queue themselves are stuck
+ * on hb->lock. So we can safely ignore them. We do neither
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
goto pi_faulted;
+ /*
+ * If uval has changed, let user space handle it.
+ */
+ ret = (curval == uval) ? 0 : -EAGAIN;
+
out_unlock:
spin_unlock(&hb->lock);
put_futex_key(&key);
-
-out:
return ret;
pi_faulted:
@@ -2669,7 +2633,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+ ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
debug_rt_mutex_free_waiter(&rt_waiter);
spin_lock(q.lock_ptr);
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
err_remove:
pr_err("init failed\n");
- if (root_node.dentry)
- debugfs_remove(root_node.dentry);
+ debugfs_remove(root_node.dentry);
return rc;
}
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 452d6f2ba21d..cf80e7b0ddab 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -341,8 +341,8 @@ static struct lock_class_key irq_nested_lock_class;
/*
* irq_map_generic_chip - Map a generic chip for an irq domain
*/
-static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
- irq_hw_number_t hw_irq)
+int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw_irq)
{
struct irq_data *data = irq_get_irq_data(virq);
struct irq_domain_chip_generic *dgc = d->gc;
@@ -394,6 +394,7 @@ static int irq_map_generic_chip(struct irq_domain *d, unsigned int virq,
irq_modify_status(virq, dgc->irq_flags_to_clear, dgc->irq_flags_to_set);
return 0;
}
+EXPORT_SYMBOL_GPL(irq_map_generic_chip);
struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index eb5e10e32e05..6534ff6ce02e 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -231,7 +231,7 @@ void irq_set_default_host(struct irq_domain *domain)
}
EXPORT_SYMBOL_GPL(irq_set_default_host);
-static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
+void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq)
{
struct irq_data *irq_data = irq_get_irq_data(irq);
irq_hw_number_t hwirq;
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index a82170e2fa78..e6bcbe756663 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -16,11 +16,12 @@
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
+#include <linux/smp.h>
#include <asm/processor.h>
-static DEFINE_PER_CPU(struct llist_head, irq_work_list);
-static DEFINE_PER_CPU(int, irq_work_raised);
+static DEFINE_PER_CPU(struct llist_head, raised_list);
+static DEFINE_PER_CPU(struct llist_head, lazy_list);
/*
* Claim the entry so that no one else will poke at it.
@@ -55,12 +56,34 @@ void __weak arch_irq_work_raise(void)
*/
}
+#ifdef CONFIG_SMP
/*
- * Enqueue the irq_work @entry unless it's already pending
+ * Enqueue the irq_work @work on @cpu unless it's already pending
* somewhere.
*
* Can be re-enqueued while the callback is still in progress.
*/
+bool irq_work_queue_on(struct irq_work *work, int cpu)
+{
+ /* All work should have been flushed before going offline */
+ WARN_ON_ONCE(cpu_is_offline(cpu));
+
+ /* Arch remote IPI send/receive backend aren't NMI safe */
+ WARN_ON_ONCE(in_nmi());
+
+ /* Only queue if not already pending */
+ if (!irq_work_claim(work))
+ return false;
+
+ if (llist_add(&work->llnode, &per_cpu(raised_list, cpu)))
+ arch_send_call_function_single_ipi(cpu);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(irq_work_queue_on);
+#endif
+
+/* Enqueue the irq work @work on the current CPU */
bool irq_work_queue(struct irq_work *work)
{
/* Only queue if not already pending */
@@ -70,15 +93,13 @@ bool irq_work_queue(struct irq_work *work)
/* Queue the entry and raise the IPI if needed. */
preempt_disable();
- llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
-
- /*
- * If the work is not "lazy" or the tick is stopped, raise the irq
- * work interrupt (if supported by the arch), otherwise, just wait
- * for the next tick.
- */
- if (!(work->flags & IRQ_WORK_LAZY) || tick_nohz_tick_stopped()) {
- if (!this_cpu_cmpxchg(irq_work_raised, 0, 1))
+ /* If the work is "lazy", handle it from next tick if any */
+ if (work->flags & IRQ_WORK_LAZY) {
+ if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) &&
+ tick_nohz_tick_stopped())
+ arch_irq_work_raise();
+ } else {
+ if (llist_add(&work->llnode, &__get_cpu_var(raised_list)))
arch_irq_work_raise();
}
@@ -90,10 +111,11 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
bool irq_work_needs_cpu(void)
{
- struct llist_head *this_list;
+ struct llist_head *raised, *lazy;
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
+ raised = &__get_cpu_var(raised_list);
+ lazy = &__get_cpu_var(lazy_list);
+ if (llist_empty(raised) && llist_empty(lazy))
return false;
/* All work should have been flushed before going offline */
@@ -102,28 +124,18 @@ bool irq_work_needs_cpu(void)
return true;
}
-static void __irq_work_run(void)
+static void irq_work_run_list(struct llist_head *list)
{
unsigned long flags;
struct irq_work *work;
- struct llist_head *this_list;
struct llist_node *llnode;
+ BUG_ON(!irqs_disabled());
- /*
- * Reset the "raised" state right before we check the list because
- * an NMI may enqueue after we find the list empty from the runner.
- */
- __this_cpu_write(irq_work_raised, 0);
- barrier();
-
- this_list = &__get_cpu_var(irq_work_list);
- if (llist_empty(this_list))
+ if (llist_empty(list))
return;
- BUG_ON(!irqs_disabled());
-
- llnode = llist_del_all(this_list);
+ llnode = llist_del_all(list);
while (llnode != NULL) {
work = llist_entry(llnode, struct irq_work, llnode);
@@ -149,13 +161,13 @@ static void __irq_work_run(void)
}
/*
- * Run the irq_work entries on this cpu. Requires to be ran from hardirq
- * context with local IRQs disabled.
+ * hotplug calls this through:
+ * hotplug_cfd() -> flush_smp_call_function_queue()
*/
void irq_work_run(void)
{
- BUG_ON(!in_irq());
- __irq_work_run();
+ irq_work_run_list(&__get_cpu_var(raised_list));
+ irq_work_run_list(&__get_cpu_var(lazy_list));
}
EXPORT_SYMBOL_GPL(irq_work_run);
@@ -171,35 +183,3 @@ void irq_work_sync(struct irq_work *work)
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int irq_work_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
-{
- long cpu = (long)hcpu;
-
- switch (action) {
- case CPU_DYING:
- /* Called from stop_machine */
- if (WARN_ON_ONCE(cpu != smp_processor_id()))
- break;
- __irq_work_run();
- break;
- default:
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cpu_notify;
-
-static __init int irq_work_init_cpu_notifier(void)
-{
- cpu_notify.notifier_call = irq_work_cpu_notify;
- cpu_notify.priority = 0;
- register_cpu_notifier(&cpu_notify);
- return 0;
-}
-device_initcall(irq_work_init_cpu_notifier);
-
-#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..ae5167087845 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
- return sprintf(buffer, "0x%lx", address);
+ return sprintf(buffer, "0x%lx", address - symbol_offset);
if (name != buffer)
strcpy(buffer, name);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..0b49a0a58102 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
* Version 2. See the file COPYING for more details.
*/
+#define pr_fmt(fmt) "kexec: " fmt
+
#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
@@ -40,6 +42,9 @@
#include <asm/io.h>
#include <asm/sections.h>
+#include <crypto/hash.h>
+#include <crypto/sha.h>
+
/* Per cpu memory for storing cpu states in case of system crash. */
note_buf_t __percpu *crash_notes;
@@ -52,6 +57,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
/* Flag to indicate we are going to kexec a new kernel */
bool kexec_in_progress = false;
+/*
+ * Declare these symbols weak so that if architecture provides a purgatory,
+ * these will be overridden.
+ */
+char __weak kexec_purgatory[0];
+size_t __weak kexec_purgatory_size = 0;
+
+static int kexec_calculate_store_digests(struct kimage *image);
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -125,45 +139,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
gfp_t gfp_mask,
unsigned long dest);
-static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_user_segment_list(struct kimage *image,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
+ int ret;
size_t segment_bytes;
- struct kimage *image;
- unsigned long i;
- int result;
-
- /* Allocate a controlling structure */
- result = -ENOMEM;
- image = kzalloc(sizeof(*image), GFP_KERNEL);
- if (!image)
- goto out;
-
- image->head = 0;
- image->entry = &image->head;
- image->last_entry = &image->head;
- image->control_page = ~0; /* By default this does not apply */
- image->start = entry;
- image->type = KEXEC_TYPE_DEFAULT;
-
- /* Initialize the list of control pages */
- INIT_LIST_HEAD(&image->control_pages);
-
- /* Initialize the list of destination pages */
- INIT_LIST_HEAD(&image->dest_pages);
-
- /* Initialize the list of unusable pages */
- INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
image->nr_segments = nr_segments;
segment_bytes = nr_segments * sizeof(*segments);
- result = copy_from_user(image->segment, segments, segment_bytes);
- if (result) {
- result = -EFAULT;
- goto out;
- }
+ ret = copy_from_user(image->segment, segments, segment_bytes);
+ if (ret)
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static int sanity_check_segment_list(struct kimage *image)
+{
+ int result, i;
+ unsigned long nr_segments = image->nr_segments;
/*
* Verify we have good destination addresses. The caller is
@@ -185,9 +181,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
- goto out;
+ return result;
if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
- goto out;
+ return result;
}
/* Verify our destination addresses do not overlap.
@@ -208,7 +204,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
pend = pstart + image->segment[j].memsz;
/* Do the segments overlap ? */
if ((mend > pstart) && (mstart < pend))
- goto out;
+ return result;
}
}
@@ -220,130 +216,401 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
result = -EINVAL;
for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
- goto out;
+ return result;
}
- result = 0;
-out:
- if (result == 0)
- *rimage = image;
- else
- kfree(image);
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
- return result;
+ if (image->type == KEXEC_TYPE_CRASH) {
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) ||
+ (mend > crashk_res.end))
+ return result;
+ }
+ }
+ return 0;
+}
+
+static struct kimage *do_kimage_alloc_init(void)
+{
+ struct kimage *image;
+
+ /* Allocate a controlling structure */
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ return NULL;
+
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unusable pages */
+ INIT_LIST_HEAD(&image->unusable_pages);
+
+ return image;
}
static void kimage_free_page_list(struct list_head *list);
-static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
{
- int result;
+ int ret;
struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_ON_CRASH;
+
+ if (kexec_on_panic) {
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end))
+ return -EADDRNOTAVAIL;
+ }
/* Allocate and initialize a controlling structure */
- image = NULL;
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
- goto out;
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->start = entry;
+
+ ret = copy_user_segment_list(image, nr_segments, segments);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_image;
+
+ /* Enable the special crash kernel control page allocation policy. */
+ if (kexec_on_panic) {
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
/*
* Find a location for the control code buffer, and add it
* the vector of segments so that it's pages will also be
* counted as destination pages.
*/
- result = -ENOMEM;
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_image;
}
- image->swap_page = kimage_alloc_control_pages(image, 0);
- if (!image->swap_page) {
- pr_err("Could not allocate swap buffer\n");
- goto out_free;
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err("Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
kimage_free_page_list(&image->control_pages);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
-static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments,
- struct kexec_segment __user *segments)
+static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
{
- int result;
- struct kimage *image;
- unsigned long i;
+ struct fd f = fdget(fd);
+ int ret;
+ struct kstat stat;
+ loff_t pos;
+ ssize_t bytes = 0;
- image = NULL;
- /* Verify we have a valid entry point */
- if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
- result = -EADDRNOTAVAIL;
+ if (!f.file)
+ return -EBADF;
+
+ ret = vfs_getattr(&f.file->f_path, &stat);
+ if (ret)
+ goto out;
+
+ if (stat.size > INT_MAX) {
+ ret = -EFBIG;
goto out;
}
- /* Allocate and initialize a controlling structure */
- result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result)
+ /* Don't hand 0 to vmalloc, it whines. */
+ if (stat.size == 0) {
+ ret = -EINVAL;
goto out;
+ }
- /* Enable the special crash kernel control page
- * allocation policy.
- */
- image->control_page = crashk_res.start;
- image->type = KEXEC_TYPE_CRASH;
+ *buf = vmalloc(stat.size);
+ if (!*buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
- /*
- * Verify we have good destination addresses. Normally
- * the caller is responsible for making certain we don't
- * attempt to load the new image into invalid or reserved
- * areas of RAM. But crash kernels are preloaded into a
- * reserved area of ram. We must ensure the addresses
- * are in the reserved area otherwise preloading the
- * kernel could corrupt things.
- */
- result = -EADDRNOTAVAIL;
- for (i = 0; i < nr_segments; i++) {
- unsigned long mstart, mend;
+ pos = 0;
+ while (pos < stat.size) {
+ bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
+ stat.size - pos);
+ if (bytes < 0) {
+ vfree(*buf);
+ ret = bytes;
+ goto out;
+ }
- mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz - 1;
- /* Ensure we are within the crash kernel limits */
- if ((mstart < crashk_res.start) || (mend > crashk_res.end))
- goto out_free;
+ if (bytes == 0)
+ break;
+ pos += bytes;
}
+ if (pos != stat.size) {
+ ret = -EBADF;
+ vfree(*buf);
+ goto out;
+ }
+
+ *buf_len = pos;
+out:
+ fdput(f);
+ return ret;
+}
+
+/* Architectures can provide this probe function */
+int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -ENOEXEC;
+}
+
+void * __weak arch_kexec_kernel_image_load(struct kimage *image)
+{
+ return ERR_PTR(-ENOEXEC);
+}
+
+void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
+{
+}
+
+int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
+ unsigned long buf_len)
+{
+ return -EKEYREJECTED;
+}
+
+/* Apply relocations of type RELA */
+int __weak
+arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("RELA relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/* Apply relocations of type REL */
+int __weak
+arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
+ unsigned int relsec)
+{
+ pr_err("REL relocation unsupported.\n");
+ return -ENOEXEC;
+}
+
+/*
+ * Free up memory used by kernel, initrd, and comand line. This is temporary
+ * memory allocation which is not needed any more after these buffers have
+ * been loaded into separate segments and have been copied elsewhere.
+ */
+static void kimage_file_post_load_cleanup(struct kimage *image)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ vfree(image->kernel_buf);
+ image->kernel_buf = NULL;
+
+ vfree(image->initrd_buf);
+ image->initrd_buf = NULL;
+
+ kfree(image->cmdline_buf);
+ image->cmdline_buf = NULL;
+
+ vfree(pi->purgatory_buf);
+ pi->purgatory_buf = NULL;
+
+ vfree(pi->sechdrs);
+ pi->sechdrs = NULL;
+
+ /* See if architecture has anything to cleanup post load */
+ arch_kimage_file_post_load_cleanup(image);
+
/*
- * Find a location for the control code buffer, and add
- * the vector of segments so that it's pages will also be
- * counted as destination pages.
+ * Above call should have called into bootloader to free up
+ * any data stored in kimage->image_loader_data. It should
+ * be ok now to free it up.
*/
- result = -ENOMEM;
+ kfree(image->image_loader_data);
+ image->image_loader_data = NULL;
+}
+
+/*
+ * In file mode list of segments is prepared by kernel. Copy relevant
+ * data from user space, do error checking, prepare segment list
+ */
+static int
+kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
+ const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned flags)
+{
+ int ret = 0;
+ void *ldata;
+
+ ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
+ &image->kernel_buf_len);
+ if (ret)
+ return ret;
+
+ /* Call arch image probe handlers */
+ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
+ image->kernel_buf_len);
+
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_KEXEC_VERIFY_SIG
+ ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
+ image->kernel_buf_len);
+ if (ret) {
+ pr_debug("kernel signature verification failed.\n");
+ goto out;
+ }
+ pr_debug("kernel signature verification successful.\n");
+#endif
+ /* It is possible that there no initramfs is being loaded */
+ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
+ ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
+ &image->initrd_buf_len);
+ if (ret)
+ goto out;
+ }
+
+ if (cmdline_len) {
+ image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
+ if (!image->cmdline_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
+ cmdline_len);
+ if (ret) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ image->cmdline_buf_len = cmdline_len;
+
+ /* command line should be a string with last byte null */
+ if (image->cmdline_buf[cmdline_len - 1] != '\0') {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ /* Call arch image load handlers */
+ ldata = arch_kexec_kernel_image_load(image);
+
+ if (IS_ERR(ldata)) {
+ ret = PTR_ERR(ldata);
+ goto out;
+ }
+
+ image->image_loader_data = ldata;
+out:
+ /* In case of error, free up all allocated memory in this function */
+ if (ret)
+ kimage_file_post_load_cleanup(image);
+ return ret;
+}
+
+static int
+kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
+ int initrd_fd, const char __user *cmdline_ptr,
+ unsigned long cmdline_len, unsigned long flags)
+{
+ int ret;
+ struct kimage *image;
+ bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
+
+ image = do_kimage_alloc_init();
+ if (!image)
+ return -ENOMEM;
+
+ image->file_mode = 1;
+
+ if (kexec_on_panic) {
+ /* Enable special crash kernel control page alloc policy. */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+ }
+
+ ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
+ cmdline_ptr, cmdline_len, flags);
+ if (ret)
+ goto out_free_image;
+
+ ret = sanity_check_segment_list(image);
+ if (ret)
+ goto out_free_post_load_bufs;
+
+ ret = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
pr_err("Could not allocate control_code_buffer\n");
- goto out_free;
+ goto out_free_post_load_bufs;
+ }
+
+ if (!kexec_on_panic) {
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ pr_err(KERN_ERR "Could not allocate swap buffer\n");
+ goto out_free_control_pages;
+ }
}
*rimage = image;
return 0;
-
-out_free:
+out_free_control_pages:
+ kimage_free_page_list(&image->control_pages);
+out_free_post_load_bufs:
+ kimage_file_post_load_cleanup(image);
+out_free_image:
kfree(image);
-out:
- return result;
+ return ret;
}
static int kimage_is_destination_range(struct kimage *image,
@@ -609,7 +876,7 @@ static void kimage_free_extra_pages(struct kimage *image)
kimage_free_page_list(&image->dest_pages);
/* Walk through and free any unusable pages I have cached */
- kimage_free_page_list(&image->unuseable_pages);
+ kimage_free_page_list(&image->unusable_pages);
}
static void kimage_terminate(struct kimage *image)
@@ -663,6 +930,14 @@ static void kimage_free(struct kimage *image)
/* Free the kexec control pages... */
kimage_free_page_list(&image->control_pages);
+
+ /*
+ * Free up any temporary buffers allocated. This might hit if
+ * error occurred much later after buffer allocation.
+ */
+ if (image->file_mode)
+ kimage_file_post_load_cleanup(image);
+
kfree(image);
}
@@ -732,7 +1007,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
/* If the page cannot be used file it away */
if (page_to_pfn(page) >
(KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
- list_add(&page->lru, &image->unuseable_pages);
+ list_add(&page->lru, &image->unusable_pages);
continue;
}
addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1066,14 @@ static int kimage_load_normal_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -826,7 +1105,11 @@ static int kimage_load_normal_segment(struct kimage *image,
PAGE_SIZE - (maddr & ~PAGE_MASK));
uchunk = min(ubytes, mchunk);
- result = copy_from_user(ptr, buf, uchunk);
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
result = -EFAULT;
@@ -834,7 +1117,10 @@ static int kimage_load_normal_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -851,10 +1137,14 @@ static int kimage_load_crash_segment(struct kimage *image,
unsigned long maddr;
size_t ubytes, mbytes;
int result;
- unsigned char __user *buf;
+ unsigned char __user *buf = NULL;
+ unsigned char *kbuf = NULL;
result = 0;
- buf = segment->buf;
+ if (image->file_mode)
+ kbuf = segment->kbuf;
+ else
+ buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
@@ -877,7 +1167,12 @@ static int kimage_load_crash_segment(struct kimage *image,
/* Zero the trailing part of the page */
memset(ptr + uchunk, 0, mchunk - uchunk);
}
- result = copy_from_user(ptr, buf, uchunk);
+
+ /* For file based kexec, source pages are in kernel memory */
+ if (image->file_mode)
+ memcpy(ptr, kbuf, uchunk);
+ else
+ result = copy_from_user(ptr, buf, uchunk);
kexec_flush_icache_page(page);
kunmap(page);
if (result) {
@@ -886,7 +1181,10 @@ static int kimage_load_crash_segment(struct kimage *image,
}
ubytes -= uchunk;
maddr += mchunk;
- buf += mchunk;
+ if (image->file_mode)
+ kbuf += mchunk;
+ else
+ buf += mchunk;
mbytes -= mchunk;
}
out:
@@ -986,16 +1284,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
/* Loading another kernel to reboot into */
if ((flags & KEXEC_ON_CRASH) == 0)
- result = kimage_normal_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
/* Loading another kernel to switch to if this one crashes */
else if (flags & KEXEC_ON_CRASH) {
/* Free any current crash dump kernel before
* we corrupt it.
*/
kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_crash_alloc(&image, entry,
- nr_segments, segments);
+ result = kimage_alloc_init(&image, entry, nr_segments,
+ segments, flags);
crash_map_reserved_pages();
}
if (result)
@@ -1077,6 +1375,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
}
#endif
+SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
+ unsigned long, cmdline_len, const char __user *, cmdline_ptr,
+ unsigned long, flags)
+{
+ int ret = 0, i;
+ struct kimage **dest_image, *image;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
+ return -EPERM;
+
+ /* Make sure we have a legal set of flags */
+ if (flags != (flags & KEXEC_FILE_FLAGS))
+ return -EINVAL;
+
+ image = NULL;
+
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_FILE_ON_CRASH)
+ dest_image = &kexec_crash_image;
+
+ if (flags & KEXEC_FILE_UNLOAD)
+ goto exchange;
+
+ /*
+ * In case of crash, new kernel gets loaded in reserved region. It is
+ * same memory where old crash kernel might be loaded. Free any
+ * current crash dump kernel before we corrupt it.
+ */
+ if (flags & KEXEC_FILE_ON_CRASH)
+ kimage_free(xchg(&kexec_crash_image, NULL));
+
+ ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
+ cmdline_len, flags);
+ if (ret)
+ goto out;
+
+ ret = machine_kexec_prepare(image);
+ if (ret)
+ goto out;
+
+ ret = kexec_calculate_store_digests(image);
+ if (ret)
+ goto out;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
+ i, ksegment->buf, ksegment->bufsz, ksegment->mem,
+ ksegment->memsz);
+
+ ret = kimage_load_segment(image, &image->segment[i]);
+ if (ret)
+ goto out;
+ }
+
+ kimage_terminate(image);
+
+ /*
+ * Free up any temporary buffers allocated which are not needed
+ * after image has been loaded
+ */
+ kimage_file_post_load_cleanup(image);
+exchange:
+ image = xchg(dest_image, image);
+out:
+ mutex_unlock(&kexec_mutex);
+ kimage_free(image);
+ return ret;
+}
+
void crash_kexec(struct pt_regs *regs)
{
/* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1632,6 +2006,683 @@ static int __init crash_save_vmcoreinfo_init(void)
subsys_initcall(crash_save_vmcoreinfo_init);
+static int __kexec_add_segment(struct kimage *image, char *buf,
+ unsigned long bufsz, unsigned long mem,
+ unsigned long memsz)
+{
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[image->nr_segments];
+ ksegment->kbuf = buf;
+ ksegment->bufsz = bufsz;
+ ksegment->mem = mem;
+ ksegment->memsz = memsz;
+ image->nr_segments++;
+
+ return 0;
+}
+
+static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_end = min(end, kbuf->buf_max);
+ temp_start = temp_end - kbuf->memsz;
+
+ do {
+ /* align down start */
+ temp_start = temp_start & (~(kbuf->buf_align - 1));
+
+ if (temp_start < start || temp_start < kbuf->buf_min)
+ return 0;
+
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start - PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
+ struct kexec_buf *kbuf)
+{
+ struct kimage *image = kbuf->image;
+ unsigned long temp_start, temp_end;
+
+ temp_start = max(start, kbuf->buf_min);
+
+ do {
+ temp_start = ALIGN(temp_start, kbuf->buf_align);
+ temp_end = temp_start + kbuf->memsz - 1;
+
+ if (temp_end > end || temp_end > kbuf->buf_max)
+ return 0;
+ /*
+ * Make sure this does not conflict with any of existing
+ * segments
+ */
+ if (kimage_is_destination_range(image, temp_start, temp_end)) {
+ temp_start = temp_start + PAGE_SIZE;
+ continue;
+ }
+
+ /* We found a suitable memory range */
+ break;
+ } while (1);
+
+ /* If we are here, we found a suitable memory range */
+ __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start,
+ kbuf->memsz);
+
+ /* Success, stop navigating through remaining System RAM ranges */
+ return 1;
+}
+
+static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
+{
+ struct kexec_buf *kbuf = (struct kexec_buf *)arg;
+ unsigned long sz = end - start + 1;
+
+ /* Returning 0 will take to next memory range */
+ if (sz < kbuf->memsz)
+ return 0;
+
+ if (end < kbuf->buf_min || start > kbuf->buf_max)
+ return 0;
+
+ /*
+ * Allocate memory top down with-in ram range. Otherwise bottom up
+ * allocation.
+ */
+ if (kbuf->top_down)
+ return locate_mem_hole_top_down(start, end, kbuf);
+ return locate_mem_hole_bottom_up(start, end, kbuf);
+}
+
+/*
+ * Helper function for placing a buffer in a kexec segment. This assumes
+ * that kexec_mutex is held.
+ */
+int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
+ unsigned long memsz, unsigned long buf_align,
+ unsigned long buf_min, unsigned long buf_max,
+ bool top_down, unsigned long *load_addr)
+{
+
+ struct kexec_segment *ksegment;
+ struct kexec_buf buf, *kbuf;
+ int ret;
+
+ /* Currently adding segment this way is allowed only in file mode */
+ if (!image->file_mode)
+ return -EINVAL;
+
+ if (image->nr_segments >= KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ /*
+ * Make sure we are not trying to add buffer after allocating
+ * control pages. All segments need to be placed first before
+ * any control pages are allocated. As control page allocation
+ * logic goes through list of segments to make sure there are
+ * no destination overlaps.
+ */
+ if (!list_empty(&image->control_pages)) {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
+ memset(&buf, 0, sizeof(struct kexec_buf));
+ kbuf = &buf;
+ kbuf->image = image;
+ kbuf->buffer = buffer;
+ kbuf->bufsz = bufsz;
+
+ kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
+ kbuf->buf_align = max(buf_align, PAGE_SIZE);
+ kbuf->buf_min = buf_min;
+ kbuf->buf_max = buf_max;
+ kbuf->top_down = top_down;
+
+ /* Walk the RAM ranges and allocate a suitable range for the buffer */
+ if (image->type == KEXEC_TYPE_CRASH)
+ ret = walk_iomem_res("Crash kernel",
+ IORESOURCE_MEM | IORESOURCE_BUSY,
+ crashk_res.start, crashk_res.end, kbuf,
+ locate_mem_hole_callback);
+ else
+ ret = walk_system_ram_res(0, -1, kbuf,
+ locate_mem_hole_callback);
+ if (ret != 1) {
+ /* A suitable memory range could not be found for buffer */
+ return -EADDRNOTAVAIL;
+ }
+
+ /* Found a suitable memory range */
+ ksegment = &image->segment[image->nr_segments - 1];
+ *load_addr = ksegment->mem;
+ return 0;
+}
+
+/* Calculate and store the digest of segments */
+static int kexec_calculate_store_digests(struct kimage *image)
+{
+ struct crypto_shash *tfm;
+ struct shash_desc *desc;
+ int ret = 0, i, j, zero_buf_sz, sha_region_sz;
+ size_t desc_size, nullsz;
+ char *digest;
+ void *zero_buf;
+ struct kexec_sha_region *sha_regions;
+ struct purgatory_info *pi = &image->purgatory_info;
+
+ zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
+ zero_buf_sz = PAGE_SIZE;
+
+ tfm = crypto_alloc_shash("sha256", 0, 0);
+ if (IS_ERR(tfm)) {
+ ret = PTR_ERR(tfm);
+ goto out;
+ }
+
+ desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+ desc = kzalloc(desc_size, GFP_KERNEL);
+ if (!desc) {
+ ret = -ENOMEM;
+ goto out_free_tfm;
+ }
+
+ sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
+ sha_regions = vzalloc(sha_region_sz);
+ if (!sha_regions)
+ goto out_free_desc;
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+
+ ret = crypto_shash_init(desc);
+ if (ret < 0)
+ goto out_free_sha_regions;
+
+ digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
+ if (!digest) {
+ ret = -ENOMEM;
+ goto out_free_sha_regions;
+ }
+
+ for (j = i = 0; i < image->nr_segments; i++) {
+ struct kexec_segment *ksegment;
+
+ ksegment = &image->segment[i];
+ /*
+ * Skip purgatory as it will be modified once we put digest
+ * info in purgatory.
+ */
+ if (ksegment->kbuf == pi->purgatory_buf)
+ continue;
+
+ ret = crypto_shash_update(desc, ksegment->kbuf,
+ ksegment->bufsz);
+ if (ret)
+ break;
+
+ /*
+ * Assume rest of the buffer is filled with zero and
+ * update digest accordingly.
+ */
+ nullsz = ksegment->memsz - ksegment->bufsz;
+ while (nullsz) {
+ unsigned long bytes = nullsz;
+
+ if (bytes > zero_buf_sz)
+ bytes = zero_buf_sz;
+ ret = crypto_shash_update(desc, zero_buf, bytes);
+ if (ret)
+ break;
+ nullsz -= bytes;
+ }
+
+ if (ret)
+ break;
+
+ sha_regions[j].start = ksegment->mem;
+ sha_regions[j].len = ksegment->memsz;
+ j++;
+ }
+
+ if (!ret) {
+ ret = crypto_shash_final(desc, digest);
+ if (ret)
+ goto out_free_digest;
+ ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
+ sha_regions, sha_region_sz, 0);
+ if (ret)
+ goto out_free_digest;
+
+ ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
+ digest, SHA256_DIGEST_SIZE, 0);
+ if (ret)
+ goto out_free_digest;
+ }
+
+out_free_digest:
+ kfree(digest);
+out_free_sha_regions:
+ vfree(sha_regions);
+out_free_desc:
+ kfree(desc);
+out_free_tfm:
+ kfree(tfm);
+out:
+ return ret;
+}
+
+/* Actually load purgatory. Lot of code taken from kexec-tools */
+static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
+ unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
+ unsigned char *buf_addr, *src;
+ int i, ret = 0, entry_sidx = -1;
+ const Elf_Shdr *sechdrs_c;
+ Elf_Shdr *sechdrs = NULL;
+ void *purgatory_buf = NULL;
+
+ /*
+ * sechdrs_c points to section headers in purgatory and are read
+ * only. No modifications allowed.
+ */
+ sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
+
+ /*
+ * We can not modify sechdrs_c[] and its fields. It is read only.
+ * Copy it over to a local copy where one can store some temporary
+ * data and free it at the end. We need to modify ->sh_addr and
+ * ->sh_offset fields to keep track of permanent and temporary
+ * locations of sections.
+ */
+ sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+ if (!sechdrs)
+ return -ENOMEM;
+
+ memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
+
+ /*
+ * We seem to have multiple copies of sections. First copy is which
+ * is embedded in kernel in read only section. Some of these sections
+ * will be copied to a temporary buffer and relocated. And these
+ * sections will finally be copied to their final destination at
+ * segment load time.
+ *
+ * Use ->sh_offset to reflect section address in memory. It will
+ * point to original read only copy if section is not allocatable.
+ * Otherwise it will point to temporary copy which will be relocated.
+ *
+ * Use ->sh_addr to contain final address of the section where it
+ * will go during execution time.
+ */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type == SHT_NOBITS)
+ continue;
+
+ sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
+ sechdrs[i].sh_offset;
+ }
+
+ /*
+ * Identify entry point section and make entry relative to section
+ * start.
+ */
+ entry = pi->ehdr->e_entry;
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
+ continue;
+
+ /* Make entry section relative */
+ if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
+ ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
+ pi->ehdr->e_entry)) {
+ entry_sidx = i;
+ entry -= sechdrs[i].sh_addr;
+ break;
+ }
+ }
+
+ /* Determine how much memory is needed to load relocatable object. */
+ buf_align = 1;
+ bss_align = 1;
+ buf_sz = 0;
+ bss_sz = 0;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ if (buf_align < align)
+ buf_align = align;
+ buf_sz = ALIGN(buf_sz, align);
+ buf_sz += sechdrs[i].sh_size;
+ } else {
+ /* bss section */
+ if (bss_align < align)
+ bss_align = align;
+ bss_sz = ALIGN(bss_sz, align);
+ bss_sz += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Determine the bss padding required to align bss properly */
+ bss_pad = 0;
+ if (buf_sz & (bss_align - 1))
+ bss_pad = bss_align - (buf_sz & (bss_align - 1));
+
+ memsz = buf_sz + bss_pad + bss_sz;
+
+ /* Allocate buffer for purgatory */
+ purgatory_buf = vzalloc(buf_sz);
+ if (!purgatory_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (buf_align < bss_align)
+ buf_align = bss_align;
+
+ /* Add buffer to segment list */
+ ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
+ buf_align, min, max, top_down,
+ &pi->purgatory_load_addr);
+ if (ret)
+ goto out;
+
+ /* Load SHF_ALLOC sections */
+ buf_addr = purgatory_buf;
+ load_addr = curr_load_addr = pi->purgatory_load_addr;
+ bss_addr = load_addr + buf_sz + bss_pad;
+
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+ continue;
+
+ align = sechdrs[i].sh_addralign;
+ if (sechdrs[i].sh_type != SHT_NOBITS) {
+ curr_load_addr = ALIGN(curr_load_addr, align);
+ offset = curr_load_addr - load_addr;
+ /* We already modifed ->sh_offset to keep src addr */
+ src = (char *) sechdrs[i].sh_offset;
+ memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
+
+ /* Store load address and source address of section */
+ sechdrs[i].sh_addr = curr_load_addr;
+
+ /*
+ * This section got copied to temporary buffer. Update
+ * ->sh_offset accordingly.
+ */
+ sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
+
+ /* Advance to the next address */
+ curr_load_addr += sechdrs[i].sh_size;
+ } else {
+ bss_addr = ALIGN(bss_addr, align);
+ sechdrs[i].sh_addr = bss_addr;
+ bss_addr += sechdrs[i].sh_size;
+ }
+ }
+
+ /* Update entry point based on load address of text section */
+ if (entry_sidx >= 0)
+ entry += sechdrs[entry_sidx].sh_addr;
+
+ /* Make kernel jump to purgatory after shutdown */
+ image->start = entry;
+
+ /* Used later to get/set symbol values */
+ pi->sechdrs = sechdrs;
+
+ /*
+ * Used later to identify which section is purgatory and skip it
+ * from checksumming.
+ */
+ pi->purgatory_buf = purgatory_buf;
+ return ret;
+out:
+ vfree(sechdrs);
+ vfree(purgatory_buf);
+ return ret;
+}
+
+static int kexec_apply_relocations(struct kimage *image)
+{
+ int i, ret;
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Shdr *sechdrs = pi->sechdrs;
+
+ /* Apply relocations */
+ for (i = 0; i < pi->ehdr->e_shnum; i++) {
+ Elf_Shdr *section, *symtab;
+
+ if (sechdrs[i].sh_type != SHT_RELA &&
+ sechdrs[i].sh_type != SHT_REL)
+ continue;
+
+ /*
+ * For section of type SHT_RELA/SHT_REL,
+ * ->sh_link contains section header index of associated
+ * symbol table. And ->sh_info contains section header
+ * index of section to which relocations apply.
+ */
+ if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
+ sechdrs[i].sh_link >= pi->ehdr->e_shnum)
+ return -ENOEXEC;
+
+ section = &sechdrs[sechdrs[i].sh_info];
+ symtab = &sechdrs[sechdrs[i].sh_link];
+
+ if (!(section->sh_flags & SHF_ALLOC))
+ continue;
+
+ /*
+ * symtab->sh_link contain section header index of associated
+ * string table.
+ */
+ if (symtab->sh_link >= pi->ehdr->e_shnum)
+ /* Invalid section number? */
+ continue;
+
+ /*
+ * Respective archicture needs to provide support for applying
+ * relocations of type SHT_RELA/SHT_REL.
+ */
+ if (sechdrs[i].sh_type == SHT_RELA)
+ ret = arch_kexec_apply_relocations_add(pi->ehdr,
+ sechdrs, i);
+ else if (sechdrs[i].sh_type == SHT_REL)
+ ret = arch_kexec_apply_relocations(pi->ehdr,
+ sechdrs, i);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+/* Load relocatable purgatory object and relocate it appropriately */
+int kexec_load_purgatory(struct kimage *image, unsigned long min,
+ unsigned long max, int top_down,
+ unsigned long *load_addr)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ int ret;
+
+ if (kexec_purgatory_size <= 0)
+ return -EINVAL;
+
+ if (kexec_purgatory_size < sizeof(Elf_Ehdr))
+ return -ENOEXEC;
+
+ pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
+
+ if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
+ || pi->ehdr->e_type != ET_REL
+ || !elf_check_arch(pi->ehdr)
+ || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
+ return -ENOEXEC;
+
+ if (pi->ehdr->e_shoff >= kexec_purgatory_size
+ || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
+ kexec_purgatory_size - pi->ehdr->e_shoff))
+ return -ENOEXEC;
+
+ ret = __kexec_load_purgatory(image, min, max, top_down);
+ if (ret)
+ return ret;
+
+ ret = kexec_apply_relocations(image);
+ if (ret)
+ goto out;
+
+ *load_addr = pi->purgatory_load_addr;
+ return 0;
+out:
+ vfree(pi->sechdrs);
+ vfree(pi->purgatory_buf);
+ return ret;
+}
+
+static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
+ const char *name)
+{
+ Elf_Sym *syms;
+ Elf_Shdr *sechdrs;
+ Elf_Ehdr *ehdr;
+ int i, k;
+ const char *strtab;
+
+ if (!pi->sechdrs || !pi->ehdr)
+ return NULL;
+
+ sechdrs = pi->sechdrs;
+ ehdr = pi->ehdr;
+
+ for (i = 0; i < ehdr->e_shnum; i++) {
+ if (sechdrs[i].sh_type != SHT_SYMTAB)
+ continue;
+
+ if (sechdrs[i].sh_link >= ehdr->e_shnum)
+ /* Invalid strtab section number */
+ continue;
+ strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
+ syms = (Elf_Sym *)sechdrs[i].sh_offset;
+
+ /* Go through symbols for a match */
+ for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
+ if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
+ continue;
+
+ if (strcmp(strtab + syms[k].st_name, name) != 0)
+ continue;
+
+ if (syms[k].st_shndx == SHN_UNDEF ||
+ syms[k].st_shndx >= ehdr->e_shnum) {
+ pr_debug("Symbol: %s has bad section index %d.\n",
+ name, syms[k].st_shndx);
+ return NULL;
+ }
+
+ /* Found the symbol we are looking for */
+ return &syms[k];
+ }
+ }
+
+ return NULL;
+}
+
+void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
+{
+ struct purgatory_info *pi = &image->purgatory_info;
+ Elf_Sym *sym;
+ Elf_Shdr *sechdr;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return ERR_PTR(-EINVAL);
+
+ sechdr = &pi->sechdrs[sym->st_shndx];
+
+ /*
+ * Returns the address where symbol will finally be loaded after
+ * kexec_load_segment()
+ */
+ return (void *)(sechdr->sh_addr + sym->st_value);
+}
+
+/*
+ * Get or set value of a symbol. If "get_value" is true, symbol value is
+ * returned in buf otherwise symbol value is set based on value in buf.
+ */
+int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
+ void *buf, unsigned int size, bool get_value)
+{
+ Elf_Sym *sym;
+ Elf_Shdr *sechdrs;
+ struct purgatory_info *pi = &image->purgatory_info;
+ char *sym_buf;
+
+ sym = kexec_purgatory_find_symbol(pi, name);
+ if (!sym)
+ return -EINVAL;
+
+ if (sym->st_size != size) {
+ pr_err("symbol %s size mismatch: expected %lu actual %u\n",
+ name, (unsigned long)sym->st_size, size);
+ return -EINVAL;
+ }
+
+ sechdrs = pi->sechdrs;
+
+ if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
+ pr_err("symbol %s is in a bss section. Cannot %s\n", name,
+ get_value ? "get" : "set");
+ return -EINVAL;
+ }
+
+ sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
+ sym->st_value;
+
+ if (get_value)
+ memcpy((void *)buf, sym_buf, size);
+ else
+ memcpy((void *)sym_buf, buf, size);
+
+ return 0;
+}
+
/*
* Move into place and start executing a preloaded standalone
* executable. If nothing was preloaded return an error.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c2390f41307b..ef483220e855 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -591,7 +591,7 @@ static void insert_kthread_work(struct kthread_worker *worker,
list_add_tail(&work->node, pos);
work->worker = worker;
- if (likely(worker->task))
+ if (!worker->current_work && likely(worker->task))
wake_up_process(worker->task);
}
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index d24e4339b46d..88d0d4420ad2 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -384,7 +384,9 @@ static void print_lockdep_off(const char *bug_msg)
{
printk(KERN_DEBUG "%s\n", bug_msg);
printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+#ifdef CONFIG_LOCK_STAT
printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+#endif
}
static int save_trace(struct stack_trace *trace)
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index be9ee1559fca..9887a905a762 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -1,6 +1,4 @@
-
#include <linux/percpu.h>
-#include <linux/mutex.h>
#include <linux/sched.h>
#include "mcs_spinlock.h"
@@ -79,7 +77,7 @@ osq_wait_next(struct optimistic_spin_queue *lock,
break;
}
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
return next;
@@ -120,7 +118,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
if (need_resched())
goto unqueue;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
return true;
@@ -146,7 +144,7 @@ unqueue:
if (smp_load_acquire(&node->locked))
return true;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
/*
* Or we race against a concurrent unqueue()'s step-B, in which
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 74356dc0ce29..23e89c5930e9 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -27,7 +27,7 @@ struct mcs_spinlock {
#define arch_mcs_spin_lock_contended(l) \
do { \
while (!(smp_load_acquire(l))) \
- arch_mutex_cpu_relax(); \
+ cpu_relax_lowlatency(); \
} while (0)
#endif
@@ -104,7 +104,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
return;
/* Wait until the next pointer is set */
while (!(next = ACCESS_ONCE(node->next)))
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
/* Pass lock to next waiter. */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index acca2c1a3c5e..ae712b25e492 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -46,12 +46,6 @@
# include <asm/mutex.h>
#endif
-/*
- * A negative mutex count indicates that waiters are sleeping waiting for the
- * mutex.
- */
-#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
-
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -152,7 +146,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
if (need_resched())
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
rcu_read_unlock();
@@ -388,12 +382,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
/*
* Optimistic spinning.
*
- * We try to spin for acquisition when we find that there are no
- * pending waiters and the lock owner is currently running on a
- * (different) CPU.
- *
- * The rationale is that if the lock owner is running, it is likely to
- * release the lock soon.
+ * We try to spin for acquisition when we find that the lock owner
+ * is currently running on a (different) CPU and while we don't
+ * need to reschedule. The rationale is that if the lock owner is
+ * running, it is likely to release the lock soon.
*
* Since this needs the lock owner, and this mutex implementation
* doesn't track the owner atomically in the lock field, we need to
@@ -440,7 +432,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
if (owner && !mutex_spin_on_owner(lock, owner))
break;
- if ((atomic_read(&lock->count) == 1) &&
+ /* Try to acquire the mutex if it is unlocked. */
+ if (!mutex_is_locked(lock) &&
(atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
if (use_ww_ctx) {
@@ -471,7 +464,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
osq_unlock(&lock->osq);
slowpath:
@@ -485,8 +478,11 @@ slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
- /* once more, can we acquire the lock? */
- if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1))
+ /*
+ * Once more, try to acquire the lock. Only try-lock the mutex if
+ * it is unlocked to reduce unnecessary xchg() operations.
+ */
+ if (!mutex_is_locked(lock) && (atomic_xchg(&lock->count, 0) == 1))
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
@@ -506,9 +502,10 @@ slowpath:
* it's unlocked. Later on, if we sleep, this is the
* operation that gives us the lock. We xchg it to -1, so
* that when we release the lock, we properly wake up the
- * other waiters:
+ * other waiters. We only attempt the xchg if the count is
+ * non-negative in order to avoid unnecessary xchg operations:
*/
- if (MUTEX_SHOW_NO_WAITER(lock) &&
+ if (atomic_read(&lock->count) >= 0 &&
(atomic_xchg(&lock->count, -1) == 1))
break;
@@ -823,6 +820,10 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
unsigned long flags;
int prev;
+ /* No need to trylock if the mutex is locked. */
+ if (mutex_is_locked(lock))
+ return 0;
+
spin_lock_mutex(&lock->wait_lock, flags);
prev = atomic_xchg(&lock->count, -1);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fb5b8ac411a5..f956ede7f90d 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,7 +20,6 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
-#include <linux/mutex.h>
#include <asm/qrwlock.h>
/**
@@ -35,7 +34,7 @@ static __always_inline void
rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
cnts = smp_load_acquire((u32 *)&lock->cnts);
}
}
@@ -75,7 +74,7 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
* to make sure that the write lock isn't taken.
*/
while (atomic_read(&lock->cnts) & _QW_WMASK)
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
@@ -114,7 +113,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
cnts | _QW_WAITING) == cnts))
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
/* When no more readers, set the locked flag */
@@ -125,7 +124,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
_QW_LOCKED) == _QW_WAITING))
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
unlock:
arch_spin_unlock(&lock->lock);
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 49b2ed3dced8..62b6cee8ea7f 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -66,12 +66,13 @@ void rt_mutex_debug_task_free(struct task_struct *task)
* the deadlock. We print when we return. act_waiter can be NULL in
* case of a remove waiter operation.
*/
-void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
+void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *act_waiter,
struct rt_mutex *lock)
{
struct task_struct *task;
- if (!debug_locks || detect || !act_waiter)
+ if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter)
return;
task = rt_mutex_owner(act_waiter->lock);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index ab29b6a22669..d0519c3432b6 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -20,14 +20,15 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
struct task_struct *powner);
extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
-extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
+extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk,
+ struct rt_mutex_waiter *waiter,
struct rt_mutex *lock);
extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
# define debug_rt_mutex_reset_waiter(w) \
do { (w)->deadlock_lock = NULL; } while (0)
-static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
- int detect)
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk walk)
{
return (waiter != NULL);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fc605941b9b8..a0ea2a141b3b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -308,6 +308,32 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
}
/*
+ * Deadlock detection is conditional:
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=n, deadlock detection is only conducted
+ * if the detect argument is == RT_MUTEX_FULL_CHAINWALK.
+ *
+ * If CONFIG_DEBUG_RT_MUTEXES=y, deadlock detection is always
+ * conducted independent of the detect argument.
+ *
+ * If the waiter argument is NULL this indicates the deboost path and
+ * deadlock detection is disabled independent of the detect argument
+ * and the config settings.
+ */
+static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter,
+ enum rtmutex_chainwalk chwalk)
+{
+ /*
+ * This is just a wrapper function for the following call,
+ * because debug_rt_mutex_detect_deadlock() smells like a magic
+ * debug feature and I wanted to keep the cond function in the
+ * main source file along with the comments instead of having
+ * two of the same in the headers.
+ */
+ return debug_rt_mutex_detect_deadlock(waiter, chwalk);
+}
+
+/*
* Max number of times we'll walk the boosting chain:
*/
int max_lock_depth = 1024;
@@ -337,21 +363,65 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
* @top_task: the current top waiter
*
* Returns 0 or -EDEADLK.
+ *
+ * Chain walk basics and protection scope
+ *
+ * [R] refcount on task
+ * [P] task->pi_lock held
+ * [L] rtmutex->wait_lock held
+ *
+ * Step Description Protected by
+ * function arguments:
+ * @task [R]
+ * @orig_lock if != NULL @top_task is blocked on it
+ * @next_lock Unprotected. Cannot be
+ * dereferenced. Only used for
+ * comparison.
+ * @orig_waiter if != NULL @top_task is blocked on it
+ * @top_task current, or in case of proxy
+ * locking protected by calling
+ * code
+ * again:
+ * loop_sanity_check();
+ * retry:
+ * [1] lock(task->pi_lock); [R] acquire [P]
+ * [2] waiter = task->pi_blocked_on; [P]
+ * [3] check_exit_conditions_1(); [P]
+ * [4] lock = waiter->lock; [P]
+ * [5] if (!try_lock(lock->wait_lock)) { [P] try to acquire [L]
+ * unlock(task->pi_lock); release [P]
+ * goto retry;
+ * }
+ * [6] check_exit_conditions_2(); [P] + [L]
+ * [7] requeue_lock_waiter(lock, waiter); [P] + [L]
+ * [8] unlock(task->pi_lock); release [P]
+ * put_task_struct(task); release [R]
+ * [9] check_exit_conditions_3(); [L]
+ * [10] task = owner(lock); [L]
+ * get_task_struct(task); [L] acquire [R]
+ * lock(task->pi_lock); [L] acquire [P]
+ * [11] requeue_pi_waiter(tsk, waiters(lock));[P] + [L]
+ * [12] check_exit_conditions_4(); [P] + [L]
+ * [13] unlock(task->pi_lock); release [P]
+ * unlock(lock->wait_lock); release [L]
+ * goto again;
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
- int deadlock_detect,
+ enum rtmutex_chainwalk chwalk,
struct rt_mutex *orig_lock,
struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
- struct rt_mutex *lock;
struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
- int detect_deadlock, ret = 0, depth = 0;
+ struct rt_mutex_waiter *prerequeue_top_waiter;
+ int ret = 0, depth = 0;
+ struct rt_mutex *lock;
+ bool detect_deadlock;
unsigned long flags;
+ bool requeue = true;
- detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
- deadlock_detect);
+ detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
/*
* The (de)boosting is a step by step approach with a lot of
@@ -360,6 +430,9 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* carefully whether things change under us.
*/
again:
+ /*
+ * We limit the lock chain length for each invocation.
+ */
if (++depth > max_lock_depth) {
static int prev_max;
@@ -377,13 +450,28 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
return -EDEADLK;
}
+
+ /*
+ * We are fully preemptible here and only hold the refcount on
+ * @task. So everything can have changed under us since the
+ * caller or our own code below (goto retry/again) dropped all
+ * locks.
+ */
retry:
/*
- * Task can not go away as we did a get_task() before !
+ * [1] Task cannot go away as we did a get_task() before !
*/
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /*
+ * [2] Get the waiter on which @task is blocked on.
+ */
waiter = task->pi_blocked_on;
+
+ /*
+ * [3] check_exit_conditions_1() protected by task->pi_lock.
+ */
+
/*
* Check whether the end of the boosting chain has been
* reached or the state of the chain has changed while we
@@ -421,20 +509,41 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
* If deadlock detection is off, we stop here if we
- * are not the top pi waiter of the task.
+ * are not the top pi waiter of the task. If deadlock
+ * detection is enabled we continue, but stop the
+ * requeueing in the chain walk.
*/
- if (!detect_deadlock && top_waiter != task_top_pi_waiter(task))
- goto out_unlock_pi;
+ if (top_waiter != task_top_pi_waiter(task)) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
}
/*
- * When deadlock detection is off then we check, if further
- * priority adjustment is necessary.
+ * If the waiter priority is the same as the task priority
+ * then there is no further priority adjustment necessary. If
+ * deadlock detection is off, we stop the chain walk. If its
+ * enabled we continue, but stop the requeueing in the chain
+ * walk.
*/
- if (!detect_deadlock && waiter->prio == task->prio)
- goto out_unlock_pi;
+ if (waiter->prio == task->prio) {
+ if (!detect_deadlock)
+ goto out_unlock_pi;
+ else
+ requeue = false;
+ }
+ /*
+ * [4] Get the next lock
+ */
lock = waiter->lock;
+ /*
+ * [5] We need to trylock here as we are holding task->pi_lock,
+ * which is the reverse lock order versus the other rtmutex
+ * operations.
+ */
if (!raw_spin_trylock(&lock->wait_lock)) {
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
cpu_relax();
@@ -442,79 +551,180 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
+ * [6] check_exit_conditions_2() protected by task->pi_lock and
+ * lock->wait_lock.
+ *
* Deadlock detection. If the lock is the same as the original
* lock which caused us to walk the lock chain or if the
* current lock is owned by the task which initiated the chain
* walk, we detected a deadlock.
*/
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
- debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
+ debug_rt_mutex_deadlock(chwalk, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
ret = -EDEADLK;
goto out_unlock_pi;
}
- top_waiter = rt_mutex_top_waiter(lock);
+ /*
+ * If we just follow the lock chain for deadlock detection, no
+ * need to do all the requeue operations. To avoid a truckload
+ * of conditionals around the various places below, just do the
+ * minimum chain walk checks.
+ */
+ if (!requeue) {
+ /*
+ * No requeue[7] here. Just release @task [8]
+ */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ * If there is no owner of the lock, end of chain.
+ */
+ if (!rt_mutex_owner(lock)) {
+ raw_spin_unlock(&lock->wait_lock);
+ return 0;
+ }
+
+ /* [10] Grab the next task, i.e. owner of @lock */
+ task = rt_mutex_owner(lock);
+ get_task_struct(task);
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /*
+ * No requeue [11] here. We just do deadlock detection.
+ *
+ * [12] Store whether owner is blocked
+ * itself. Decision is made after dropping the locks
+ */
+ next_lock = task_blocked_on_lock(task);
+ /*
+ * Get the top waiter for the next iteration
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [13] Drop locks */
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&lock->wait_lock);
+
+ /* If owner is not blocked, end of chain. */
+ if (!next_lock)
+ goto out_put_task;
+ goto again;
+ }
- /* Requeue the waiter */
+ /*
+ * Store the current top waiter before doing the requeue
+ * operation on @lock. We need it for the boost/deboost
+ * decision below.
+ */
+ prerequeue_top_waiter = rt_mutex_top_waiter(lock);
+
+ /* [7] Requeue the waiter in the lock waiter list. */
rt_mutex_dequeue(lock, waiter);
waiter->prio = task->prio;
rt_mutex_enqueue(lock, waiter);
- /* Release the task */
+ /* [8] Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ put_task_struct(task);
+
+ /*
+ * [9] check_exit_conditions_3 protected by lock->wait_lock.
+ *
+ * We must abort the chain walk if there is no lock owner even
+ * in the dead lock detection case, as we have nothing to
+ * follow here. This is the end of the chain we are walking.
+ */
if (!rt_mutex_owner(lock)) {
/*
- * If the requeue above changed the top waiter, then we need
- * to wake the new top waiter up to try to get the lock.
+ * If the requeue [7] above changed the top waiter,
+ * then we need to wake the new top waiter up to try
+ * to get the lock.
*/
-
- if (top_waiter != rt_mutex_top_waiter(lock))
+ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
raw_spin_unlock(&lock->wait_lock);
- goto out_put_task;
+ return 0;
}
- put_task_struct(task);
- /* Grab the next task */
+ /* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
raw_spin_lock_irqsave(&task->pi_lock, flags);
+ /* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
- /* Boost the owner */
- rt_mutex_dequeue_pi(task, top_waiter);
+ /*
+ * The waiter became the new top (highest priority)
+ * waiter on the lock. Replace the previous top waiter
+ * in the owner tasks pi waiters list with this waiter
+ * and adjust the priority of the owner.
+ */
+ rt_mutex_dequeue_pi(task, prerequeue_top_waiter);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
- } else if (top_waiter == waiter) {
- /* Deboost the owner */
+ } else if (prerequeue_top_waiter == waiter) {
+ /*
+ * The waiter was the top waiter on the lock, but is
+ * no longer the top prority waiter. Replace waiter in
+ * the owner tasks pi waiters list with the new top
+ * (highest priority) waiter and adjust the priority
+ * of the owner.
+ * The new top waiter is stored in @waiter so that
+ * @waiter == @top_waiter evaluates to true below and
+ * we continue to deboost the rest of the chain.
+ */
rt_mutex_dequeue_pi(task, waiter);
waiter = rt_mutex_top_waiter(lock);
rt_mutex_enqueue_pi(task, waiter);
__rt_mutex_adjust_prio(task);
+ } else {
+ /*
+ * Nothing changed. No need to do any priority
+ * adjustment.
+ */
}
/*
+ * [12] check_exit_conditions_4() protected by task->pi_lock
+ * and lock->wait_lock. The actual decisions are made after we
+ * dropped the locks.
+ *
* Check whether the task which owns the current lock is pi
* blocked itself. If yes we store a pointer to the lock for
* the lock chain change detection above. After we dropped
* task->pi_lock next_lock cannot be dereferenced anymore.
*/
next_lock = task_blocked_on_lock(task);
+ /*
+ * Store the top waiter of @lock for the end of chain walk
+ * decision below.
+ */
+ top_waiter = rt_mutex_top_waiter(lock);
+ /* [13] Drop the locks */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
-
- top_waiter = rt_mutex_top_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
/*
+ * Make the actual exit decisions [12], based on the stored
+ * values.
+ *
* We reached the end of the lock chain. Stop right here. No
* point to go back just to figure that out.
*/
if (!next_lock)
goto out_put_task;
+ /*
+ * If the current waiter is not the top waiter on the lock,
+ * then we can stop the chain walk here if we are not in full
+ * deadlock detection mode.
+ */
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -533,76 +743,119 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*
* Must be called with lock->wait_lock held.
*
- * @lock: the lock to be acquired.
- * @task: the task which wants to acquire the lock
- * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
+ * @lock: The lock to be acquired.
+ * @task: The task which wants to acquire the lock
+ * @waiter: The waiter that is queued to the lock's wait list if the
+ * callsite called task_blocked_on_lock(), otherwise NULL
*/
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
- struct rt_mutex_waiter *waiter)
+ struct rt_mutex_waiter *waiter)
{
+ unsigned long flags;
+
/*
- * We have to be careful here if the atomic speedups are
- * enabled, such that, when
- * - no other waiter is on the lock
- * - the lock has been released since we did the cmpxchg
- * the lock can be released or taken while we are doing the
- * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
+ * Before testing whether we can acquire @lock, we set the
+ * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
+ * other tasks which try to modify @lock into the slow path
+ * and they serialize on @lock->wait_lock.
+ *
+ * The RT_MUTEX_HAS_WAITERS bit can have a transitional state
+ * as explained at the top of this file if and only if:
*
- * The atomic acquire/release aware variant of
- * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
- * the WAITERS bit, the atomic release / acquire can not
- * happen anymore and lock->wait_lock protects us from the
- * non-atomic case.
+ * - There is a lock owner. The caller must fixup the
+ * transient state if it does a trylock or leaves the lock
+ * function due to a signal or timeout.
*
- * Note, that this might set lock->owner =
- * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
- * any more. This is fixed up when we take the ownership.
- * This is the transitional state explained at the top of this file.
+ * - @task acquires the lock and there are no other
+ * waiters. This is undone in rt_mutex_set_owner(@task) at
+ * the end of this function.
*/
mark_rt_mutex_waiters(lock);
+ /*
+ * If @lock has an owner, give up.
+ */
if (rt_mutex_owner(lock))
return 0;
/*
- * It will get the lock because of one of these conditions:
- * 1) there is no waiter
- * 2) higher priority than waiters
- * 3) it is top waiter
+ * If @waiter != NULL, @task has already enqueued the waiter
+ * into @lock waiter list. If @waiter == NULL then this is a
+ * trylock attempt.
*/
- if (rt_mutex_has_waiters(lock)) {
- if (task->prio >= rt_mutex_top_waiter(lock)->prio) {
- if (!waiter || waiter != rt_mutex_top_waiter(lock))
- return 0;
- }
- }
-
- if (waiter || rt_mutex_has_waiters(lock)) {
- unsigned long flags;
- struct rt_mutex_waiter *top;
-
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ if (waiter) {
+ /*
+ * If waiter is not the highest priority waiter of
+ * @lock, give up.
+ */
+ if (waiter != rt_mutex_top_waiter(lock))
+ return 0;
- /* remove the queued waiter. */
- if (waiter) {
- rt_mutex_dequeue(lock, waiter);
- task->pi_blocked_on = NULL;
- }
+ /*
+ * We can acquire the lock. Remove the waiter from the
+ * lock waiters list.
+ */
+ rt_mutex_dequeue(lock, waiter);
+ } else {
/*
- * We have to enqueue the top waiter(if it exists) into
- * task->pi_waiters list.
+ * If the lock has waiters already we check whether @task is
+ * eligible to take over the lock.
+ *
+ * If there are no other waiters, @task can acquire
+ * the lock. @task->pi_blocked_on is NULL, so it does
+ * not need to be dequeued.
*/
if (rt_mutex_has_waiters(lock)) {
- top = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(task, top);
+ /*
+ * If @task->prio is greater than or equal to
+ * the top waiter priority (kernel view),
+ * @task lost.
+ */
+ if (task->prio >= rt_mutex_top_waiter(lock)->prio)
+ return 0;
+
+ /*
+ * The current top waiter stays enqueued. We
+ * don't have to change anything in the lock
+ * waiters order.
+ */
+ } else {
+ /*
+ * No waiters. Take the lock without the
+ * pi_lock dance.@task->pi_blocked_on is NULL
+ * and we have no waiters to enqueue in @task
+ * pi waiters list.
+ */
+ goto takeit;
}
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
}
+ /*
+ * Clear @task->pi_blocked_on. Requires protection by
+ * @task->pi_lock. Redundant operation for the @waiter == NULL
+ * case, but conditionals are more expensive than a redundant
+ * store.
+ */
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+ task->pi_blocked_on = NULL;
+ /*
+ * Finish the lock acquisition. @task is the new owner. If
+ * other waiters exist we have to insert the highest priority
+ * waiter into @task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+
+takeit:
/* We got the lock. */
debug_rt_mutex_lock(lock);
+ /*
+ * This either preserves the RT_MUTEX_HAS_WAITERS bit if there
+ * are still waiters or clears it.
+ */
rt_mutex_set_owner(lock, task);
rt_mutex_deadlock_account_lock(lock, task);
@@ -620,7 +873,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
@@ -666,7 +919,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
+ } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
}
@@ -691,7 +944,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+ res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -753,9 +1006,9 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
static void remove_waiter(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter)
{
- int first = (waiter == rt_mutex_top_waiter(lock));
+ bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
- struct rt_mutex *next_lock = NULL;
+ struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&current->pi_lock, flags);
@@ -763,29 +1016,31 @@ static void remove_waiter(struct rt_mutex *lock,
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- if (!owner)
+ /*
+ * Only update priority if the waiter was the highest priority
+ * waiter of the lock and there is an owner to update.
+ */
+ if (!owner || !is_top_waiter)
return;
- if (first) {
-
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
- rt_mutex_dequeue_pi(owner, waiter);
+ rt_mutex_dequeue_pi(owner, waiter);
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
+ if (rt_mutex_has_waiters(lock))
+ rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock));
- next = rt_mutex_top_waiter(lock);
- rt_mutex_enqueue_pi(owner, next);
- }
- __rt_mutex_adjust_prio(owner);
+ __rt_mutex_adjust_prio(owner);
- /* Store the lock on which owner is blocked or NULL */
- next_lock = task_blocked_on_lock(owner);
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Don't walk the chain, if the owner task is not blocked
+ * itself.
+ */
if (!next_lock)
return;
@@ -794,7 +1049,8 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
+ rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
+ next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -824,7 +1080,8 @@ void rt_mutex_adjust_pi(struct task_struct *task)
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
- rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
+ rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
+ next_lock, NULL, task);
}
/**
@@ -902,7 +1159,7 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
static int __sched
rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+ enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
int ret = 0;
@@ -928,7 +1185,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
timeout->task = NULL;
}
- ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
if (likely(!ret))
ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
@@ -937,7 +1194,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(ret)) {
remove_waiter(lock, &waiter);
- rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+ rt_mutex_handle_deadlock(ret, chwalk, &waiter);
}
/*
@@ -960,22 +1217,31 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
/*
* Slow path try-lock function:
*/
-static inline int
-rt_mutex_slowtrylock(struct rt_mutex *lock)
+static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
- int ret = 0;
+ int ret;
+
+ /*
+ * If the lock already has an owner we fail to get the lock.
+ * This can be done without taking the @lock->wait_lock as
+ * it is only being read, and this is a trylock anyway.
+ */
+ if (rt_mutex_owner(lock))
+ return 0;
+ /*
+ * The mutex has currently no owner. Lock the wait lock and
+ * try to acquire the lock.
+ */
raw_spin_lock(&lock->wait_lock);
- if (likely(rt_mutex_owner(lock) != current)) {
+ ret = try_to_take_rt_mutex(lock, current, NULL);
- ret = try_to_take_rt_mutex(lock, current, NULL);
- /*
- * try_to_take_rt_mutex() sets the lock waiters
- * bit unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
- }
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -1053,30 +1319,31 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
- int detect_deadlock,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, NULL, detect_deadlock);
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
- struct hrtimer_sleeper *timeout, int detect_deadlock,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk,
int (*slowfn)(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- int detect_deadlock))
+ enum rtmutex_chainwalk chwalk))
{
- if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+ likely(rt_mutex_cmpxchg(lock, NULL, current))) {
rt_mutex_deadlock_account_lock(lock, current);
return 0;
} else
- return slowfn(lock, state, timeout, detect_deadlock);
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
@@ -1109,54 +1376,61 @@ void __sched rt_mutex_lock(struct rt_mutex *lock)
{
might_sleep();
- rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
+ rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
/**
* rt_mutex_lock_interruptible - lock a rt_mutex interruptible
*
- * @lock: the rt_mutex to be locked
- * @detect_deadlock: deadlock detection on/off
+ * @lock: the rt_mutex to be locked
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
+ * 0 on success
+ * -EINTR when interrupted by a signal
*/
-int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
- int detect_deadlock)
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
might_sleep();
- return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
- detect_deadlock, rt_mutex_slowlock);
+ return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+/*
+ * Futex variant with full deadlock detection.
+ */
+int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+{
+ might_sleep();
+
+ return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+ RT_MUTEX_FULL_CHAINWALK,
+ rt_mutex_slowlock);
+}
+
/**
* rt_mutex_timed_lock - lock a rt_mutex interruptible
* the timeout structure is provided
* by the caller
*
- * @lock: the rt_mutex to be locked
+ * @lock: the rt_mutex to be locked
* @timeout: timeout structure or NULL (no timeout)
- * @detect_deadlock: deadlock detection on/off
*
* Returns:
- * 0 on success
- * -EINTR when interrupted by a signal
+ * 0 on success
+ * -EINTR when interrupted by a signal
* -ETIMEDOUT when the timeout expired
- * -EDEADLK when the lock would deadlock (when deadlock detection is on)
*/
int
-rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
- int detect_deadlock)
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout)
{
might_sleep();
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- detect_deadlock, rt_mutex_slowlock);
+ RT_MUTEX_MIN_CHAINWALK,
+ rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
@@ -1262,7 +1536,6 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
* @lock: the rt_mutex to take
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
* Returns:
* 0 - task blocked on lock
@@ -1273,7 +1546,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task, int detect_deadlock)
+ struct task_struct *task)
{
int ret;
@@ -1285,7 +1558,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
}
/* We enforce deadlock detection for futexes */
- ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+ RT_MUTEX_FULL_CHAINWALK);
if (ret && !rt_mutex_owner(lock)) {
/*
@@ -1331,22 +1605,20 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
* rt_mutex_finish_proxy_lock() - Complete lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
- * been started.
+ * been started.
* @waiter: the pre-initialized rt_mutex_waiter
- * @detect_deadlock: perform deadlock detection (1) or not (0)
*
* Complete the lock acquisition started our behalf by another thread.
*
* Returns:
* 0 - success
- * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ * <0 - error, one of -EINTR, -ETIMEDOUT
*
* Special API call for PI-futex requeue support
*/
int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ struct rt_mutex_waiter *waiter)
{
int ret;
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index f6a1f3c133b1..c4060584c407 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -22,10 +22,15 @@
#define debug_rt_mutex_init(m, n) do { } while (0)
#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
-#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
{
WARN(1, "rtmutex deadlock detected\n");
}
+
+static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w,
+ enum rtmutex_chainwalk walk)
+{
+ return walk == RT_MUTEX_FULL_CHAINWALK;
+}
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7431a9c86f35..855212501407 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -102,6 +102,21 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
}
/*
+ * Constants for rt mutex functions which have a selectable deadlock
+ * detection.
+ *
+ * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are
+ * no further PI adjustments to be made.
+ *
+ * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full
+ * walk of the lock chain.
+ */
+enum rtmutex_chainwalk {
+ RT_MUTEX_MIN_CHAINWALK,
+ RT_MUTEX_FULL_CHAINWALK,
+};
+
+/*
* PI-futex support (proxy locking functions, etc.):
*/
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
@@ -111,12 +126,11 @@ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
- struct task_struct *task,
- int detect_deadlock);
+ struct task_struct *task);
extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock);
+ struct rt_mutex_waiter *waiter);
+extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
#ifdef CONFIG_DEBUG_RT_MUTEXES
# include "rtmutex-debug.h"
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a2391ac135c8..d6203faf2eb1 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -329,7 +329,7 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
if (need_resched())
break;
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
rcu_read_unlock();
@@ -381,7 +381,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
* memory barriers as we'll eventually observe the right
* values at the cost of a few extra spins.
*/
- arch_mutex_cpu_relax();
+ cpu_relax_lowlatency();
}
osq_unlock(&sem->osq);
done:
diff --git a/kernel/module.c b/kernel/module.c
index 81e727cf6df9..6f69463f0066 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -60,7 +60,6 @@
#include <linux/jump_label.h>
#include <linux/pfn.h>
#include <linux/bsearch.h>
-#include <linux/fips.h>
#include <uapi/linux/module.h>
#include "module-internal.h"
@@ -2448,9 +2447,6 @@ static int module_sig_check(struct load_info *info)
}
/* Not having a signature is only an error if we're strict. */
- if (err < 0 && fips_enabled)
- panic("Module verification failed with error %d in FIPS mode\n",
- err);
if (err == -ENOKEY && !sig_enforce)
err = 0;
@@ -3385,6 +3381,8 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
*/
static inline int is_arm_mapping_symbol(const char *str)
{
+ if (str[0] == '.' && str[1] == 'L')
+ return true;
return str[0] == '$' && strchr("atd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
@@ -3448,8 +3446,7 @@ const char *module_address_lookup(unsigned long addr,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
if (modname)
*modname = mod->name;
ret = get_ksymbol(mod, addr, size, offset);
@@ -3473,8 +3470,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3499,8 +3495,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_init(addr, mod) ||
- within_module_core(addr, mod)) {
+ if (within_module(addr, mod)) {
const char *sym;
sym = get_ksymbol(mod, addr, size, offset);
@@ -3764,8 +3759,7 @@ struct module *__module_address(unsigned long addr)
list_for_each_entry_rcu(mod, &modules, list) {
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (within_module_core(addr, mod)
- || within_module_init(addr, mod))
+ if (within_module(addr, mod))
return mod;
}
return NULL;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
might_sleep();
+ task_lock(p);
ns = p->nsproxy;
+ p->nsproxy = new;
+ task_unlock(p);
- rcu_assign_pointer(p->nsproxy, new);
-
- if (ns && atomic_dec_and_test(&ns->count)) {
- /*
- * wait for others to get what they want from this nsproxy.
- *
- * cannot release this nsproxy via the call_rcu() since
- * put_mnt_ns() will want to sleep
- */
- synchronize_rcu();
+ if (ns && atomic_dec_and_test(&ns->count))
free_nsproxy(ns);
- }
}
void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..d09dc5c32c67 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
{ TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
{ TAINT_OOT_MODULE, 'O', ' ' },
{ TAINT_UNSIGNED_MODULE, 'E', ' ' },
+ { TAINT_SOFTLOCKUP, 'L', ' ' },
};
/**
diff --git a/kernel/params.c b/kernel/params.c
index 1e52ca233fd9..34f527023794 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -256,6 +256,7 @@ STANDARD_PARAM_DEF(int, int, "%i", kstrtoint);
STANDARD_PARAM_DEF(uint, unsigned int, "%u", kstrtouint);
STANDARD_PARAM_DEF(long, long, "%li", kstrtol);
STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", kstrtoul);
+STANDARD_PARAM_DEF(ullong, unsigned long long, "%llu", kstrtoull);
int param_set_charp(const char *val, const struct kernel_param *kp)
{
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9a83d780facd..e4e4121fa327 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -253,9 +253,6 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config ARCH_HAS_OPP
- bool
-
config PM_OPP
bool
---help---
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index fcc2611d3f14..a9dfa79b6bab 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -371,7 +371,6 @@ int hibernation_snapshot(int platform_mode)
}
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend(PMSG_FREEZE);
@@ -397,7 +396,6 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
dpm_complete(msg);
@@ -500,7 +498,6 @@ int hibernation_restore(int platform_mode)
pm_prepare_console();
suspend_console();
- ftrace_stop();
pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
@@ -508,7 +505,6 @@ int hibernation_restore(int platform_mode)
dpm_resume_end(PMSG_RECOVER);
}
pm_restore_gfp_mask();
- ftrace_start();
resume_console();
pm_restore_console();
return error;
@@ -535,7 +531,6 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- ftrace_stop();
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -579,7 +574,6 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- ftrace_start();
resume_console();
Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 8e90f330f139..9a59d042ea84 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -296,8 +296,8 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
suspend_state_t i;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (pm_states[i].state)
- s += sprintf(s,"%s ", pm_states[i].label);
+ if (pm_states[i])
+ s += sprintf(s,"%s ", pm_states[i]);
#endif
if (hibernation_available())
@@ -311,8 +311,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
- suspend_state_t state = PM_SUSPEND_MIN;
- struct pm_sleep_state *s;
+ suspend_state_t state;
#endif
char *p;
int len;
@@ -325,10 +324,12 @@ static suspend_state_t decode_state(const char *buf, size_t n)
return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
- if (s->state && len == strlen(s->label)
- && !strncmp(buf, s->label, len))
- return s->state;
+ for (state = PM_SUSPEND_MIN; state < PM_SUSPEND_MAX; state++) {
+ const char *label = pm_states[state];
+
+ if (label && len == strlen(label) && !strncmp(buf, label, len))
+ return state;
+ }
#endif
return PM_SUSPEND_ON;
@@ -446,8 +447,8 @@ static ssize_t autosleep_show(struct kobject *kobj,
#ifdef CONFIG_SUSPEND
if (state < PM_SUSPEND_MAX)
- return sprintf(buf, "%s\n", pm_states[state].state ?
- pm_states[state].label : "error");
+ return sprintf(buf, "%s\n", pm_states[state] ?
+ pm_states[state] : "error");
#endif
#ifdef CONFIG_HIBERNATION
return sprintf(buf, "disk\n");
@@ -615,7 +616,6 @@ static struct attribute_group attr_group = {
.attrs = g,
};
-#ifdef CONFIG_PM_RUNTIME
struct workqueue_struct *pm_wq;
EXPORT_SYMBOL_GPL(pm_wq);
@@ -625,9 +625,6 @@ static int __init pm_start_workqueue(void)
return pm_wq ? 0 : -ENOMEM;
}
-#else
-static inline int pm_start_workqueue(void) { return 0; }
-#endif
static int __init pm_init(void)
{
diff --git a/kernel/power/power.h b/kernel/power/power.h
index c60f13b5270a..5d49dcac2537 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -178,13 +178,8 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
unsigned int, char *);
#ifdef CONFIG_SUSPEND
-struct pm_sleep_state {
- const char *label;
- suspend_state_t state;
-};
-
/* kernel/power/suspend.c */
-extern struct pm_sleep_state pm_states[];
+extern const char *pm_states[];
extern int suspend_devices_and_enter(suspend_state_t state);
#else /* !CONFIG_SUSPEND */
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 1ea328aafdc9..4fc5c32422b3 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -248,33 +248,61 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
* information is stored (in the form of a block of bitmap)
* It also contains the pfns that correspond to the start and end of
* the represented memory area.
+ *
+ * The memory bitmap is organized as a radix tree to guarantee fast random
+ * access to the bits. There is one radix tree for each zone (as returned
+ * from create_mem_extents).
+ *
+ * One radix tree is represented by one struct mem_zone_bm_rtree. There are
+ * two linked lists for the nodes of the tree, one for the inner nodes and
+ * one for the leave nodes. The linked leave nodes are used for fast linear
+ * access of the memory bitmap.
+ *
+ * The struct rtree_node represents one node of the radix tree.
*/
#define BM_END_OF_MAP (~0UL)
#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE)
+#define BM_BLOCK_SHIFT (PAGE_SHIFT + 3)
+#define BM_BLOCK_MASK ((1UL << BM_BLOCK_SHIFT) - 1)
-struct bm_block {
- struct list_head hook; /* hook into a list of bitmap blocks */
- unsigned long start_pfn; /* pfn represented by the first bit */
- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
- unsigned long *data; /* bitmap representing pages */
+/*
+ * struct rtree_node is a wrapper struct to link the nodes
+ * of the rtree together for easy linear iteration over
+ * bits and easy freeing
+ */
+struct rtree_node {
+ struct list_head list;
+ unsigned long *data;
};
-static inline unsigned long bm_block_bits(struct bm_block *bb)
-{
- return bb->end_pfn - bb->start_pfn;
-}
+/*
+ * struct mem_zone_bm_rtree represents a bitmap used for one
+ * populated memory zone.
+ */
+struct mem_zone_bm_rtree {
+ struct list_head list; /* Link Zones together */
+ struct list_head nodes; /* Radix Tree inner nodes */
+ struct list_head leaves; /* Radix Tree leaves */
+ unsigned long start_pfn; /* Zone start page frame */
+ unsigned long end_pfn; /* Zone end page frame + 1 */
+ struct rtree_node *rtree; /* Radix Tree Root */
+ int levels; /* Number of Radix Tree Levels */
+ unsigned int blocks; /* Number of Bitmap Blocks */
+};
/* strcut bm_position is used for browsing memory bitmaps */
struct bm_position {
- struct bm_block *block;
- int bit;
+ struct mem_zone_bm_rtree *zone;
+ struct rtree_node *node;
+ unsigned long node_pfn;
+ int node_bit;
};
struct memory_bitmap {
- struct list_head blocks; /* list of bitmap blocks */
+ struct list_head zones;
struct linked_page *p_list; /* list of pages used to store zone
* bitmap objects and bitmap block
* objects
@@ -284,38 +312,178 @@ struct memory_bitmap {
/* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+#define BM_ENTRIES_PER_LEVEL (PAGE_SIZE / sizeof(unsigned long))
+#if BITS_PER_LONG == 32
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 2)
+#else
+#define BM_RTREE_LEVEL_SHIFT (PAGE_SHIFT - 3)
+#endif
+#define BM_RTREE_LEVEL_MASK ((1UL << BM_RTREE_LEVEL_SHIFT) - 1)
+
+/*
+ * alloc_rtree_node - Allocate a new node and add it to the radix tree.
+ *
+ * This function is used to allocate inner nodes as well as the
+ * leave nodes of the radix tree. It also adds the node to the
+ * corresponding linked list passed in by the *list parameter.
+ */
+static struct rtree_node *alloc_rtree_node(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ struct list_head *list)
{
- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
- bm->cur.bit = 0;
-}
+ struct rtree_node *node;
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+ node = chain_alloc(ca, sizeof(struct rtree_node));
+ if (!node)
+ return NULL;
-/**
- * create_bm_block_list - create a list of block bitmap objects
- * @pages - number of pages to track
- * @list - list to put the allocated blocks into
- * @ca - chain allocator to be used for allocating memory
+ node->data = get_image_page(gfp_mask, safe_needed);
+ if (!node->data)
+ return NULL;
+
+ list_add_tail(&node->list, list);
+
+ return node;
+}
+
+/*
+ * add_rtree_block - Add a new leave node to the radix tree
+ *
+ * The leave nodes need to be allocated in order to keep the leaves
+ * linked list in order. This is guaranteed by the zone->blocks
+ * counter.
*/
-static int create_bm_block_list(unsigned long pages,
- struct list_head *list,
- struct chain_allocator *ca)
+static int add_rtree_block(struct mem_zone_bm_rtree *zone, gfp_t gfp_mask,
+ int safe_needed, struct chain_allocator *ca)
{
- unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+ struct rtree_node *node, *block, **dst;
+ unsigned int levels_needed, block_nr;
+ int i;
- while (nr_blocks-- > 0) {
- struct bm_block *bb;
+ block_nr = zone->blocks;
+ levels_needed = 0;
- bb = chain_alloc(ca, sizeof(struct bm_block));
- if (!bb)
+ /* How many levels do we need for this block nr? */
+ while (block_nr) {
+ levels_needed += 1;
+ block_nr >>= BM_RTREE_LEVEL_SHIFT;
+ }
+
+ /* Make sure the rtree has enough levels */
+ for (i = zone->levels; i < levels_needed; i++) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
return -ENOMEM;
- list_add(&bb->hook, list);
+
+ node->data[0] = (unsigned long)zone->rtree;
+ zone->rtree = node;
+ zone->levels += 1;
+ }
+
+ /* Allocate new block */
+ block = alloc_rtree_node(gfp_mask, safe_needed, ca, &zone->leaves);
+ if (!block)
+ return -ENOMEM;
+
+ /* Now walk the rtree to insert the block */
+ node = zone->rtree;
+ dst = &zone->rtree;
+ block_nr = zone->blocks;
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ if (!node) {
+ node = alloc_rtree_node(gfp_mask, safe_needed, ca,
+ &zone->nodes);
+ if (!node)
+ return -ENOMEM;
+ *dst = node;
+ }
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ dst = (struct rtree_node **)&((*dst)->data[index]);
+ node = *dst;
}
+ zone->blocks += 1;
+ *dst = block;
+
return 0;
}
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free);
+
+/*
+ * create_zone_bm_rtree - create a radix tree for one zone
+ *
+ * Allocated the mem_zone_bm_rtree structure and initializes it.
+ * This function also allocated and builds the radix tree for the
+ * zone.
+ */
+static struct mem_zone_bm_rtree *
+create_zone_bm_rtree(gfp_t gfp_mask, int safe_needed,
+ struct chain_allocator *ca,
+ unsigned long start, unsigned long end)
+{
+ struct mem_zone_bm_rtree *zone;
+ unsigned int i, nr_blocks;
+ unsigned long pages;
+
+ pages = end - start;
+ zone = chain_alloc(ca, sizeof(struct mem_zone_bm_rtree));
+ if (!zone)
+ return NULL;
+
+ INIT_LIST_HEAD(&zone->nodes);
+ INIT_LIST_HEAD(&zone->leaves);
+ zone->start_pfn = start;
+ zone->end_pfn = end;
+ nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK);
+
+ for (i = 0; i < nr_blocks; i++) {
+ if (add_rtree_block(zone, gfp_mask, safe_needed, ca)) {
+ free_zone_bm_rtree(zone, PG_UNSAFE_CLEAR);
+ return NULL;
+ }
+ }
+
+ return zone;
+}
+
+/*
+ * free_zone_bm_rtree - Free the memory of the radix tree
+ *
+ * Free all node pages of the radix tree. The mem_zone_bm_rtree
+ * structure itself is not freed here nor are the rtree_node
+ * structs.
+ */
+static void free_zone_bm_rtree(struct mem_zone_bm_rtree *zone,
+ int clear_nosave_free)
+{
+ struct rtree_node *node;
+
+ list_for_each_entry(node, &zone->nodes, list)
+ free_image_page(node->data, clear_nosave_free);
+
+ list_for_each_entry(node, &zone->leaves, list)
+ free_image_page(node->data, clear_nosave_free);
+}
+
+static void memory_bm_position_reset(struct memory_bitmap *bm)
+{
+ bm->cur.zone = list_entry(bm->zones.next, struct mem_zone_bm_rtree,
+ list);
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+}
+
+static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+
struct mem_extent {
struct list_head hook;
unsigned long start;
@@ -407,40 +575,22 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
int error;
chain_init(&ca, gfp_mask, safe_needed);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
error = create_mem_extents(&mem_extents, gfp_mask);
if (error)
return error;
list_for_each_entry(ext, &mem_extents, hook) {
- struct bm_block *bb;
- unsigned long pfn = ext->start;
- unsigned long pages = ext->end - ext->start;
-
- bb = list_entry(bm->blocks.prev, struct bm_block, hook);
+ struct mem_zone_bm_rtree *zone;
- error = create_bm_block_list(pages, bm->blocks.prev, &ca);
- if (error)
+ zone = create_zone_bm_rtree(gfp_mask, safe_needed, &ca,
+ ext->start, ext->end);
+ if (!zone) {
+ error = -ENOMEM;
goto Error;
-
- list_for_each_entry_continue(bb, &bm->blocks, hook) {
- bb->data = get_image_page(gfp_mask, safe_needed);
- if (!bb->data) {
- error = -ENOMEM;
- goto Error;
- }
-
- bb->start_pfn = pfn;
- if (pages >= BM_BITS_PER_BLOCK) {
- pfn += BM_BITS_PER_BLOCK;
- pages -= BM_BITS_PER_BLOCK;
- } else {
- /* This is executed only once in the loop */
- pfn += pages;
- }
- bb->end_pfn = pfn;
}
+ list_add_tail(&zone->list, &bm->zones);
}
bm->p_list = ca.chain;
@@ -460,51 +610,83 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
*/
static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *zone;
- list_for_each_entry(bb, &bm->blocks, hook)
- if (bb->data)
- free_image_page(bb->data, clear_nosave_free);
+ list_for_each_entry(zone, &bm->zones, list)
+ free_zone_bm_rtree(zone, clear_nosave_free);
free_list_of_pages(bm->p_list, clear_nosave_free);
- INIT_LIST_HEAD(&bm->blocks);
+ INIT_LIST_HEAD(&bm->zones);
}
/**
- * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds
- * to given pfn. The cur_zone_bm member of @bm and the cur_block member
- * of @bm->cur_zone_bm are updated.
+ * memory_bm_find_bit - Find the bit for pfn in the memory
+ * bitmap
+ *
+ * Find the bit in the bitmap @bm that corresponds to given pfn.
+ * The cur.zone, cur.block and cur.node_pfn member of @bm are
+ * updated.
+ * It walks the radix tree to find the page which contains the bit for
+ * pfn and returns the bit position in **addr and *bit_nr.
*/
static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
- void **addr, unsigned int *bit_nr)
+ void **addr, unsigned int *bit_nr)
{
- struct bm_block *bb;
+ struct mem_zone_bm_rtree *curr, *zone;
+ struct rtree_node *node;
+ int i, block_nr;
+ zone = bm->cur.zone;
+
+ if (pfn >= zone->start_pfn && pfn < zone->end_pfn)
+ goto zone_found;
+
+ zone = NULL;
+
+ /* Find the right zone */
+ list_for_each_entry(curr, &bm->zones, list) {
+ if (pfn >= curr->start_pfn && pfn < curr->end_pfn) {
+ zone = curr;
+ break;
+ }
+ }
+
+ if (!zone)
+ return -EFAULT;
+
+zone_found:
/*
- * Check if the pfn corresponds to the current bitmap block and find
- * the block where it fits if this is not the case.
+ * We have a zone. Now walk the radix tree to find the leave
+ * node for our pfn.
*/
- bb = bm->cur.block;
- if (pfn < bb->start_pfn)
- list_for_each_entry_continue_reverse(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn)
- break;
- if (pfn >= bb->end_pfn)
- list_for_each_entry_continue(bb, &bm->blocks, hook)
- if (pfn >= bb->start_pfn && pfn < bb->end_pfn)
- break;
+ node = bm->cur.node;
+ if (((pfn - zone->start_pfn) & ~BM_BLOCK_MASK) == bm->cur.node_pfn)
+ goto node_found;
- if (&bb->hook == &bm->blocks)
- return -EFAULT;
+ node = zone->rtree;
+ block_nr = (pfn - zone->start_pfn) >> BM_BLOCK_SHIFT;
+
+ for (i = zone->levels; i > 0; i--) {
+ int index;
+
+ index = block_nr >> ((i - 1) * BM_RTREE_LEVEL_SHIFT);
+ index &= BM_RTREE_LEVEL_MASK;
+ BUG_ON(node->data[index] == 0);
+ node = (struct rtree_node *)node->data[index];
+ }
+
+node_found:
+ /* Update last position */
+ bm->cur.zone = zone;
+ bm->cur.node = node;
+ bm->cur.node_pfn = (pfn - zone->start_pfn) & ~BM_BLOCK_MASK;
+
+ /* Set return values */
+ *addr = node->data;
+ *bit_nr = (pfn - zone->start_pfn) & BM_BLOCK_MASK;
- /* The block has been found */
- bm->cur.block = bb;
- pfn -= bb->start_pfn;
- bm->cur.bit = pfn + 1;
- *bit_nr = pfn;
- *addr = bb->data;
return 0;
}
@@ -528,6 +710,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
error = memory_bm_find_bit(bm, pfn, &addr, &bit);
if (!error)
set_bit(bit, addr);
+
return error;
}
@@ -542,6 +725,14 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
+static void memory_bm_clear_current(struct memory_bitmap *bm)
+{
+ int bit;
+
+ bit = max(bm->cur.node_bit - 1, 0);
+ clear_bit(bit, bm->cur.node->data);
+}
+
static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
@@ -561,38 +752,70 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
return !memory_bm_find_bit(bm, pfn, &addr, &bit);
}
-/**
- * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
- * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
- * returned.
+/*
+ * rtree_next_node - Jumps to the next leave node
+ *
+ * Sets the position to the beginning of the next node in the
+ * memory bitmap. This is either the next node in the current
+ * zone's radix tree or the first node in the radix tree of the
+ * next zone.
*
- * It is required to run memory_bm_position_reset() before the first call to
- * this function.
+ * Returns true if there is a next node, false otherwise.
*/
+static bool rtree_next_node(struct memory_bitmap *bm)
+{
+ bm->cur.node = list_entry(bm->cur.node->list.next,
+ struct rtree_node, list);
+ if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+ bm->cur.node_pfn += BM_BITS_PER_BLOCK;
+ bm->cur.node_bit = 0;
+ touch_softlockup_watchdog();
+ return true;
+ }
+
+ /* No more nodes, goto next zone */
+ bm->cur.zone = list_entry(bm->cur.zone->list.next,
+ struct mem_zone_bm_rtree, list);
+ if (&bm->cur.zone->list != &bm->zones) {
+ bm->cur.node = list_entry(bm->cur.zone->leaves.next,
+ struct rtree_node, list);
+ bm->cur.node_pfn = 0;
+ bm->cur.node_bit = 0;
+ return true;
+ }
+ /* No more zones */
+ return false;
+}
+
+/**
+ * memory_bm_rtree_next_pfn - Find the next set bit in the bitmap @bm
+ *
+ * Starting from the last returned position this function searches
+ * for the next set bit in the memory bitmap and returns its
+ * number. If no more bit is set BM_END_OF_MAP is returned.
+ *
+ * It is required to run memory_bm_position_reset() before the
+ * first call to this function.
+ */
static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
- struct bm_block *bb;
+ unsigned long bits, pfn, pages;
int bit;
- bb = bm->cur.block;
do {
- bit = bm->cur.bit;
- bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
- if (bit < bm_block_bits(bb))
- goto Return_pfn;
-
- bb = list_entry(bb->hook.next, struct bm_block, hook);
- bm->cur.block = bb;
- bm->cur.bit = 0;
- } while (&bb->hook != &bm->blocks);
+ pages = bm->cur.zone->end_pfn - bm->cur.zone->start_pfn;
+ bits = min(pages - bm->cur.node_pfn, BM_BITS_PER_BLOCK);
+ bit = find_next_bit(bm->cur.node->data, bits,
+ bm->cur.node_bit);
+ if (bit < bits) {
+ pfn = bm->cur.zone->start_pfn + bm->cur.node_pfn + bit;
+ bm->cur.node_bit = bit + 1;
+ return pfn;
+ }
+ } while (rtree_next_node(bm));
- memory_bm_position_reset(bm);
return BM_END_OF_MAP;
-
- Return_pfn:
- bm->cur.bit = bit + 1;
- return bb->start_pfn + bit;
}
/**
@@ -816,12 +1039,17 @@ void free_basic_memory_bitmaps(void)
unsigned int snapshot_additional_pages(struct zone *zone)
{
- unsigned int res;
+ unsigned int rtree, nodes;
+
+ rtree = nodes = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
+ rtree += DIV_ROUND_UP(rtree * sizeof(struct rtree_node),
+ LINKED_PAGE_DATA_SIZE);
+ while (nodes > 1) {
+ nodes = DIV_ROUND_UP(nodes, BM_ENTRIES_PER_LEVEL);
+ rtree += nodes;
+ }
- res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
- res += DIV_ROUND_UP(res * sizeof(struct bm_block),
- LINKED_PAGE_DATA_SIZE);
- return 2 * res;
+ return 2 * rtree;
}
#ifdef CONFIG_HIGHMEM
@@ -1094,23 +1322,35 @@ static struct memory_bitmap copy_bm;
void swsusp_free(void)
{
- struct zone *zone;
- unsigned long pfn, max_zone_pfn;
+ unsigned long fb_pfn, fr_pfn;
- for_each_populated_zone(zone) {
- max_zone_pfn = zone_end_pfn(zone);
- for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
- if (pfn_valid(pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (swsusp_page_is_forbidden(page) &&
- swsusp_page_is_free(page)) {
- swsusp_unset_page_forbidden(page);
- swsusp_unset_page_free(page);
- __free_page(page);
- }
- }
+ memory_bm_position_reset(forbidden_pages_map);
+ memory_bm_position_reset(free_pages_map);
+
+loop:
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+
+ /*
+ * Find the next bit set in both bitmaps. This is guaranteed to
+ * terminate when fb_pfn == fr_pfn == BM_END_OF_MAP.
+ */
+ do {
+ if (fb_pfn < fr_pfn)
+ fb_pfn = memory_bm_next_pfn(forbidden_pages_map);
+ if (fr_pfn < fb_pfn)
+ fr_pfn = memory_bm_next_pfn(free_pages_map);
+ } while (fb_pfn != fr_pfn);
+
+ if (fr_pfn != BM_END_OF_MAP && pfn_valid(fr_pfn)) {
+ struct page *page = pfn_to_page(fr_pfn);
+
+ memory_bm_clear_current(forbidden_pages_map);
+ memory_bm_clear_current(free_pages_map);
+ __free_page(page);
+ goto loop;
}
+
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index ed35a4790afe..6dadb25cb0d8 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,20 +31,11 @@
#include "power.h"
-struct pm_sleep_state pm_states[PM_SUSPEND_MAX] = {
- [PM_SUSPEND_FREEZE] = { .label = "freeze", .state = PM_SUSPEND_FREEZE },
- [PM_SUSPEND_STANDBY] = { .label = "standby", },
- [PM_SUSPEND_MEM] = { .label = "mem", },
-};
+static const char *pm_labels[] = { "mem", "standby", "freeze", };
+const char *pm_states[PM_SUSPEND_MAX];
static const struct platform_suspend_ops *suspend_ops;
static const struct platform_freeze_ops *freeze_ops;
-
-static bool need_suspend_ops(suspend_state_t state)
-{
- return state > PM_SUSPEND_FREEZE;
-}
-
static DECLARE_WAIT_QUEUE_HEAD(suspend_freeze_wait_head);
static bool suspend_freeze_wake;
@@ -97,10 +88,7 @@ static bool relative_states;
static int __init sleep_states_setup(char *str)
{
relative_states = !strncmp(str, "1", 1);
- if (relative_states) {
- pm_states[PM_SUSPEND_MEM].state = PM_SUSPEND_FREEZE;
- pm_states[PM_SUSPEND_FREEZE].state = 0;
- }
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[relative_states ? 0 : 2];
return 1;
}
@@ -113,20 +101,20 @@ __setup("relative_sleep_states=", sleep_states_setup);
void suspend_set_ops(const struct platform_suspend_ops *ops)
{
suspend_state_t i;
- int j = PM_SUSPEND_MAX - 1;
+ int j = 0;
lock_system_sleep();
suspend_ops = ops;
for (i = PM_SUSPEND_MEM; i >= PM_SUSPEND_STANDBY; i--)
- if (valid_state(i))
- pm_states[j--].state = i;
- else if (!relative_states)
- pm_states[j--].state = 0;
+ if (valid_state(i)) {
+ pm_states[i] = pm_labels[j++];
+ } else if (!relative_states) {
+ pm_states[i] = NULL;
+ j++;
+ }
- pm_states[j--].state = PM_SUSPEND_FREEZE;
- while (j >= PM_SUSPEND_MIN)
- pm_states[j--].state = 0;
+ pm_states[PM_SUSPEND_FREEZE] = pm_labels[j];
unlock_system_sleep();
}
@@ -145,6 +133,65 @@ int suspend_valid_only_mem(suspend_state_t state)
}
EXPORT_SYMBOL_GPL(suspend_valid_only_mem);
+static bool sleep_state_supported(suspend_state_t state)
+{
+ return state == PM_SUSPEND_FREEZE || (suspend_ops && suspend_ops->enter);
+}
+
+static int platform_suspend_prepare(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare ?
+ suspend_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_late(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
+ suspend_ops->prepare_late() : 0;
+}
+
+static void platform_suspend_wake(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
+ suspend_ops->wake();
+}
+
+static void platform_suspend_finish(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
+ suspend_ops->finish();
+}
+
+static int platform_suspend_begin(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin)
+ return freeze_ops->begin();
+ else if (suspend_ops->begin)
+ return suspend_ops->begin(state);
+ else
+ return 0;
+}
+
+static void platform_suspend_end(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
+ freeze_ops->end();
+ else if (suspend_ops->end)
+ suspend_ops->end();
+}
+
+static void platform_suspend_recover(suspend_state_t state)
+{
+ if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
+ suspend_ops->recover();
+}
+
+static bool platform_suspend_again(suspend_state_t state)
+{
+ return state != PM_SUSPEND_FREEZE && suspend_ops->suspend_again ?
+ suspend_ops->suspend_again() : false;
+}
+
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
@@ -168,7 +215,7 @@ static int suspend_prepare(suspend_state_t state)
{
int error;
- if (need_suspend_ops(state) && (!suspend_ops || !suspend_ops->enter))
+ if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
@@ -214,23 +261,18 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
{
int error;
- if (need_suspend_ops(state) && suspend_ops->prepare) {
- error = suspend_ops->prepare();
- if (error)
- goto Platform_finish;
- }
+ error = platform_suspend_prepare(state);
+ if (error)
+ goto Platform_finish;
error = dpm_suspend_end(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platform_finish;
}
-
- if (need_suspend_ops(state) && suspend_ops->prepare_late) {
- error = suspend_ops->prepare_late();
- if (error)
- goto Platform_wake;
- }
+ error = platform_suspend_prepare_late(state);
+ if (error)
+ goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
@@ -248,7 +290,6 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
goto Platform_wake;
}
- ftrace_stop();
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
@@ -275,18 +316,13 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
Enable_cpus:
enable_nonboot_cpus();
- ftrace_start();
Platform_wake:
- if (need_suspend_ops(state) && suspend_ops->wake)
- suspend_ops->wake();
-
+ platform_suspend_wake(state);
dpm_resume_start(PMSG_RESUME);
Platform_finish:
- if (need_suspend_ops(state) && suspend_ops->finish)
- suspend_ops->finish();
-
+ platform_suspend_finish(state);
return error;
}
@@ -299,18 +335,13 @@ int suspend_devices_and_enter(suspend_state_t state)
int error;
bool wakeup = false;
- if (need_suspend_ops(state) && !suspend_ops)
+ if (!sleep_state_supported(state))
return -ENOSYS;
- if (need_suspend_ops(state) && suspend_ops->begin) {
- error = suspend_ops->begin(state);
- if (error)
- goto Close;
- } else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->begin) {
- error = freeze_ops->begin();
- if (error)
- goto Close;
- }
+ error = platform_suspend_begin(state);
+ if (error)
+ goto Close;
+
suspend_console();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
@@ -324,25 +355,20 @@ int suspend_devices_and_enter(suspend_state_t state)
do {
error = suspend_enter(state, &wakeup);
- } while (!error && !wakeup && need_suspend_ops(state)
- && suspend_ops->suspend_again && suspend_ops->suspend_again());
+ } while (!error && !wakeup && platform_suspend_again(state));
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
resume_console();
- Close:
- if (need_suspend_ops(state) && suspend_ops->end)
- suspend_ops->end();
- else if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
- freeze_ops->end();
+ Close:
+ platform_suspend_end(state);
return error;
Recover_platform:
- if (need_suspend_ops(state) && suspend_ops->recover)
- suspend_ops->recover();
+ platform_suspend_recover(state);
goto Resume_devices;
}
@@ -395,7 +421,7 @@ static int enter_state(suspend_state_t state)
printk("done.\n");
trace_suspend_resume(TPS("sync_filesystems"), 0, false);
- pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare(state);
if (error)
goto Unlock;
@@ -404,7 +430,7 @@ static int enter_state(suspend_state_t state)
goto Finish;
trace_suspend_resume(TPS("suspend_enter"), state, false);
- pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
pm_restore_gfp_mask();
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 269b097e78ea..2f524928b6aa 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -92,13 +92,13 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
}
if (state == PM_SUSPEND_MEM) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
- printk(info_test, pm_states[state].label);
+ printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
if (status < 0)
@@ -141,8 +141,8 @@ static int __init setup_test_suspend(char *value)
/* "=mem" ==> "mem" */
value++;
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
- if (!strcmp(pm_states[i].label, value)) {
- test_state = pm_states[i].state;
+ if (!strcmp(pm_states[i], value)) {
+ test_state = i;
return 0;
}
@@ -162,8 +162,8 @@ static int __init test_suspend(void)
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
- if (!pm_states[test_state].state) {
- printk(warn_bad_state, pm_states[test_state].label);
+ if (!pm_states[test_state]) {
+ printk(warn_bad_state, pm_states[test_state]);
goto done;
}
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..de1a6bb6861d 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
#include <linux/poll.h>
#include <linux/irq_work.h>
#include <linux/utsname.h>
+#include <linux/ctype.h>
#include <asm/uaccess.h>
@@ -56,7 +57,7 @@
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
- DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
+ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
* This is used for debugging the mess that is the VT code by
* keeping track if we have the console semaphore held. It's
* definitely not the perfect debug tool (we don't know if _WE_
- * hold it are racing, but it helps tracking those weird code
- * path in the console code where we end up in places I want
- * locked without the console sempahore held
+ * hold it and are racing, but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console sempahore held).
*/
static int console_locked, console_suspended;
@@ -146,8 +147,8 @@ static int console_may_schedule;
* the overall length of the record.
*
* The heads to the first and last entry in the buffer, as well as the
- * sequence numbers of these both entries are maintained when messages
- * are stored..
+ * sequence numbers of these entries are maintained when messages are
+ * stored.
*
* If the heads indicate available messages, the length in the header
* tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
static u32 clear_idx;
#define PREFIX_MAX 32
-#define LOG_LINE_MAX 1024 - PREFIX_MAX
+#define LOG_LINE_MAX (1024 - PREFIX_MAX)
/* record buffer */
#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -266,6 +267,7 @@ static u32 clear_idx;
#define LOG_ALIGN __alignof__(struct printk_log)
#endif
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
static char *log_buf = __log_buf;
static u32 log_buf_len = __LOG_BUF_LEN;
@@ -344,7 +346,7 @@ static int log_make_free_space(u32 msg_size)
while (log_first_seq < log_next_seq) {
if (logbuf_has_space(msg_size, false))
return 0;
- /* drop old messages until we have enough continuous space */
+ /* drop old messages until we have enough contiguous space */
log_first_idx = log_next(log_first_idx);
log_first_seq++;
}
@@ -453,11 +455,7 @@ static int log_store(int facility, int level,
return msg->text_len;
}
-#ifdef CONFIG_SECURITY_DMESG_RESTRICT
-int dmesg_restrict = 1;
-#else
-int dmesg_restrict;
-#endif
+int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
static int syslog_action_restricted(int type)
{
@@ -828,34 +826,74 @@ void log_buf_kexec_setup(void)
/* requested log_buf_len from kernel cmdline */
static unsigned long __initdata new_log_buf_len;
-/* save requested log_buf_len since it's too early to process it */
-static int __init log_buf_len_setup(char *str)
+/* we practice scaling the ring buffer by powers of 2 */
+static void __init log_buf_len_update(unsigned size)
{
- unsigned size = memparse(str, &str);
-
if (size)
size = roundup_pow_of_two(size);
if (size > log_buf_len)
new_log_buf_len = size;
+}
+
+/* save requested log_buf_len since it's too early to process it */
+static int __init log_buf_len_setup(char *str)
+{
+ unsigned size = memparse(str, &str);
+
+ log_buf_len_update(size);
return 0;
}
early_param("log_buf_len", log_buf_len_setup);
+static void __init log_buf_add_cpu(void)
+{
+ unsigned int cpu_extra;
+
+ /*
+ * archs should set up cpu_possible_bits properly with
+ * set_cpu_possible() after setup_arch() but just in
+ * case lets ensure this is valid.
+ */
+ if (num_possible_cpus() == 1)
+ return;
+
+ cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
+
+ /* by default this will only continue through for large > 64 CPUs */
+ if (cpu_extra <= __LOG_BUF_LEN / 2)
+ return;
+
+ pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
+ __LOG_CPU_MAX_BUF_LEN);
+ pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
+ cpu_extra);
+ pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
+
+ log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
+}
+
void __init setup_log_buf(int early)
{
unsigned long flags;
char *new_log_buf;
int free;
+ if (log_buf != __log_buf)
+ return;
+
+ if (!early && !new_log_buf_len)
+ log_buf_add_cpu();
+
if (!new_log_buf_len)
return;
if (early) {
new_log_buf =
- memblock_virt_alloc(new_log_buf_len, PAGE_SIZE);
+ memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
} else {
- new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0);
+ new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
+ LOG_ALIGN);
}
if (unlikely(!new_log_buf)) {
@@ -872,7 +910,7 @@ void __init setup_log_buf(int early)
memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
raw_spin_unlock_irqrestore(&logbuf_lock, flags);
- pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("log_buf_len: %d bytes\n", log_buf_len);
pr_info("early log buf free: %d(%d%%)\n",
free, (free * 100) / __LOG_BUF_LEN);
}
@@ -881,7 +919,7 @@ static bool __read_mostly ignore_loglevel;
static int __init ignore_loglevel_setup(char *str)
{
- ignore_loglevel = 1;
+ ignore_loglevel = true;
pr_info("debug: ignoring loglevel setting.\n");
return 0;
@@ -947,11 +985,7 @@ static inline void boot_delay_msec(int level)
}
#endif
-#if defined(CONFIG_PRINTK_TIME)
-static bool printk_time = 1;
-#else
-static bool printk_time;
-#endif
+static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1344,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
* for pending data, not the size; return the count of
* records, not the length.
*/
- error = log_next_idx - syslog_idx;
+ error = log_next_seq - syslog_seq;
} else {
u64 seq = syslog_seq;
u32 idx = syslog_idx;
@@ -1416,10 +1450,9 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
- * Console drivers may assume that per-cpu resources have
- * been allocated. So unless they're explicitly marked as
- * being able to cope (CON_ANYTIME) don't call them until
- * this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have been allocated. So
+ * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
+ * call them until this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1432,8 +1465,10 @@ static inline int can_use_console(unsigned int cpu)
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*/
-static int console_trylock_for_printk(unsigned int cpu)
+static int console_trylock_for_printk(void)
{
+ unsigned int cpu = smp_processor_id();
+
if (!console_trylock())
return 0;
/*
@@ -1476,7 +1511,7 @@ static struct cont {
struct task_struct *owner; /* task of first print*/
u64 ts_nsec; /* time of first print */
u8 level; /* log level of first message */
- u8 facility; /* log level of first message */
+ u8 facility; /* log facility of first message */
enum log_flags flags; /* prefix, newline flags */
bool flushed:1; /* buffer sealed and committed */
} cont;
@@ -1608,7 +1643,8 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
- goto out_restore_irqs;
+ local_irq_restore(flags);
+ return 0;
}
zap_locks();
}
@@ -1716,21 +1752,30 @@ asmlinkage int vprintk_emit(int facility, int level,
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
+ lockdep_on();
+ local_irq_restore(flags);
/* If called from the scheduler, we can not call up(). */
if (!in_sched) {
+ lockdep_off();
+ /*
+ * Disable preemption to avoid being preempted while holding
+ * console_sem which would prevent anyone from printing to
+ * console
+ */
+ preempt_disable();
+
/*
* Try to acquire and then immediately release the console
* semaphore. The release will print out buffers and wake up
* /dev/kmsg and syslog() users.
*/
- if (console_trylock_for_printk(this_cpu))
+ if (console_trylock_for_printk())
console_unlock();
+ preempt_enable();
+ lockdep_on();
}
- lockdep_on();
-out_restore_irqs:
- local_irq_restore(flags);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1847,7 @@ EXPORT_SYMBOL(printk);
#define LOG_LINE_MAX 0
#define PREFIX_MAX 0
-#define LOG_LINE_MAX 0
+
static u64 syslog_seq;
static u32 syslog_idx;
static u64 console_seq;
@@ -1881,11 +1926,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
return 0;
}
/*
- * Set up a list of consoles. Called from init/main.c
+ * Set up a console. Called via do_early_param() in init/main.c
+ * for each "console=" parameter in the boot command line.
*/
static int __init console_setup(char *str)
{
- char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */
+ char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
char *s, *options, *brl_options = NULL;
int idx;
@@ -1902,7 +1948,8 @@ static int __init console_setup(char *str)
strncpy(buf, str, sizeof(buf) - 1);
}
buf[sizeof(buf) - 1] = 0;
- if ((options = strchr(str, ',')) != NULL)
+ options = strchr(str, ',');
+ if (options)
*(options++) = 0;
#ifdef __sparc__
if (!strcmp(str, "ttya"))
@@ -1911,7 +1958,7 @@ static int __init console_setup(char *str)
strcpy(buf, "ttyS1");
#endif
for (s = buf; *s; s++)
- if ((*s >= '0' && *s <= '9') || *s == ',')
+ if (isdigit(*s) || *s == ',')
break;
idx = simple_strtoul(s, NULL, 10);
*s = 0;
@@ -1950,7 +1997,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
i++, c++)
if (strcmp(c->name, name) == 0 && c->index == idx) {
strlcpy(c->name, name_new, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
c->options = options;
c->index = idx_new;
return i;
@@ -1959,12 +2005,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
return -1;
}
-bool console_suspend_enabled = 1;
+bool console_suspend_enabled = true;
EXPORT_SYMBOL(console_suspend_enabled);
static int __init console_suspend_disable(char *str)
{
- console_suspend_enabled = 0;
+ console_suspend_enabled = false;
return 1;
}
__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2091,8 @@ EXPORT_SYMBOL(console_lock);
/**
* console_trylock - try to lock the console system for exclusive use.
*
- * Tried to acquire a lock which guarantees that the caller has
- * exclusive access to the console system and the console_drivers list.
+ * Try to acquire a lock which guarantees that the caller has exclusive
+ * access to the console system and the console_drivers list.
*
* returns 1 on success, and 0 on failure to acquire the lock.
*/
@@ -2618,14 +2664,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
bool printk_timed_ratelimit(unsigned long *caller_jiffies,
unsigned int interval_msecs)
{
- if (*caller_jiffies == 0
- || !time_in_range(jiffies, *caller_jiffies,
- *caller_jiffies
- + msecs_to_jiffies(interval_msecs))) {
- *caller_jiffies = jiffies;
- return true;
- }
- return false;
+ unsigned long elapsed = jiffies - *caller_jiffies;
+
+ if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
+ return false;
+
+ *caller_jiffies = jiffies;
+ return true;
}
EXPORT_SYMBOL(printk_timed_ratelimit);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index adf98622cb32..54e75226c2c4 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,12 +28,6 @@
#include <linux/compat.h>
-static int ptrace_trapping_sleep_fn(void *flags)
-{
- schedule();
- return 0;
-}
-
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
@@ -371,7 +365,7 @@ unlock_creds:
out:
if (!retval) {
wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT,
- ptrace_trapping_sleep_fn, TASK_UNINTERRUPTIBLE);
+ TASK_UNINTERRUPTIBLE);
proc_ptrace_connector(task, PTRACE_ATTACH);
}
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index bfda2726ca45..ff1a6de62f17 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -99,6 +99,10 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
void kfree(const void *);
+/*
+ * Reclaim the specified callback, either by invoking it (non-lazy case)
+ * or freeing it directly (lazy case). Return true if lazy, false otherwise.
+ */
static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
{
unsigned long offset = (unsigned long)head->func;
@@ -108,12 +112,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head)
RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
kfree((void *)head - offset);
rcu_lock_release(&rcu_callback_map);
- return 1;
+ return true;
} else {
RCU_TRACE(trace_rcu_invoke_callback(rn, head));
head->func(head);
rcu_lock_release(&rcu_callback_map);
- return 0;
+ return false;
}
}
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index c639556f3fa0..e037f3eb2f7b 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -298,9 +298,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
idx = ACCESS_ONCE(sp->completed) & 0x1;
preempt_disable();
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->c[idx]);
smp_mb(); /* B */ /* Avoid leaking the critical section. */
- ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
+ __this_cpu_inc(sp->per_cpu_ref->seq[idx]);
preempt_enable();
return idx;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 625d0b0cd75a..1b70cb6fbe3c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1013,10 +1013,7 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
}
/*
- * Dump stacks of all tasks running on stalled CPUs. This is a fallback
- * for architectures that do not implement trigger_all_cpu_backtrace().
- * The NMI-triggered stack traces are more accurate because they are
- * printed by the target CPU.
+ * Dump stacks of all tasks running on stalled CPUs.
*/
static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
{
@@ -1094,7 +1091,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
(long)rsp->gpnum, (long)rsp->completed, totqlen);
if (ndetected == 0)
pr_err("INFO: Stall ended before state dump start\n");
- else if (!trigger_all_cpu_backtrace())
+ else
rcu_dump_cpu_stacks(rsp);
/* Complain about tasks blocking the grace period. */
@@ -1125,8 +1122,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
jiffies - rsp->gp_start,
(long)rsp->gpnum, (long)rsp->completed, totqlen);
- if (!trigger_all_cpu_backtrace())
- dump_stack();
+ rcu_dump_cpu_stacks(rsp);
raw_spin_lock_irqsave(&rnp->lock, flags);
if (ULONG_CMP_GE(jiffies, ACCESS_ONCE(rsp->jiffies_stall)))
@@ -1305,10 +1301,16 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
* believe that a grace period is in progress, then we must wait
* for the one following, which is in "c". Because our request
* will be noticed at the end of the current grace period, we don't
- * need to explicitly start one.
+ * need to explicitly start one. We only do the lockless check
+ * of rnp_root's fields if the current rcu_node structure thinks
+ * there is no grace period in flight, and because we hold rnp->lock,
+ * the only possible change is when rnp_root's two fields are
+ * equal, in which case rnp_root->gpnum might be concurrently
+ * incremented. But that is OK, as it will just result in our
+ * doing some extra useless work.
*/
if (rnp->gpnum != rnp->completed ||
- ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) {
+ ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) {
rnp->need_future_gp[c & 0x1]++;
trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf"));
goto out;
@@ -1645,11 +1647,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
-#ifdef CONFIG_PROVE_RCU_DELAY
- if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 &&
- system_state == SYSTEM_RUNNING)
- udelay(200);
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
cond_resched();
}
@@ -2347,7 +2344,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
}
smp_mb(); /* List handling before counting for rcu_barrier(). */
rdp->qlen_lazy -= count_lazy;
- ACCESS_ONCE(rdp->qlen) -= count;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen - count;
rdp->n_cbs_invoked += count;
/* Reinstate batch limit if we have worked down the excess. */
@@ -2485,14 +2482,14 @@ static void force_quiescent_state(struct rcu_state *rsp)
struct rcu_node *rnp_old = NULL;
/* Funnel through hierarchy to reduce memory contention. */
- rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode;
+ rnp = __this_cpu_read(rsp->rda->mynode);
for (; rnp != NULL; rnp = rnp->parent) {
ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) ||
!raw_spin_trylock(&rnp->fqslock);
if (rnp_old != NULL)
raw_spin_unlock(&rnp_old->fqslock);
if (ret) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
return;
}
rnp_old = rnp;
@@ -2504,7 +2501,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
smp_mb__after_unlock_lock();
raw_spin_unlock(&rnp_old->fqslock);
if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
- ACCESS_ONCE(rsp->n_force_qs_lh)++;
+ rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
return; /* Someone beat us to it. */
}
@@ -2662,7 +2659,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
unsigned long flags;
struct rcu_data *rdp;
- WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */
+ WARN_ON_ONCE((unsigned long)head & 0x1); /* Misaligned rcu_head! */
if (debug_rcu_head_queue(head)) {
/* Probable double call_rcu(), so leak the callback. */
ACCESS_ONCE(head->func) = rcu_leak_callback;
@@ -2693,7 +2690,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
local_irq_restore(flags);
return;
}
- ACCESS_ONCE(rdp->qlen)++;
+ ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
if (lazy)
rdp->qlen_lazy++;
else
@@ -3257,7 +3254,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
* ACCESS_ONCE() to prevent the compiler from speculating
* the increment to precede the early-exit check.
*/
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
@@ -3307,7 +3304,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Increment ->n_barrier_done to prevent duplicate work. */
smp_mb(); /* Keep increment after above mechanism. */
- ACCESS_ONCE(rsp->n_barrier_done)++;
+ ACCESS_ONCE(rsp->n_barrier_done) = rsp->n_barrier_done + 1;
WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
smp_mb(); /* Keep increment before caller's subsequent code. */
@@ -3564,14 +3561,16 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
static void __init rcu_init_one(struct rcu_state *rsp,
struct rcu_data __percpu *rda)
{
- static char *buf[] = { "rcu_node_0",
- "rcu_node_1",
- "rcu_node_2",
- "rcu_node_3" }; /* Match MAX_RCU_LVLS */
- static char *fqs[] = { "rcu_node_fqs_0",
- "rcu_node_fqs_1",
- "rcu_node_fqs_2",
- "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const buf[] = {
+ "rcu_node_0",
+ "rcu_node_1",
+ "rcu_node_2",
+ "rcu_node_3" }; /* Match MAX_RCU_LVLS */
+ static const char * const fqs[] = {
+ "rcu_node_fqs_0",
+ "rcu_node_fqs_1",
+ "rcu_node_fqs_2",
+ "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
static u8 fl_mask = 0x1;
int cpustride = 1;
int i;
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 0f69a79c5b7d..71e64c718f75 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -172,6 +172,14 @@ struct rcu_node {
/* queued on this rcu_node structure that */
/* are blocking the current grace period, */
/* there can be no such task. */
+ struct completion boost_completion;
+ /* Used to ensure that the rt_mutex used */
+ /* to carry out the boosting is fully */
+ /* released with no future boostee accesses */
+ /* before that rt_mutex is re-initialized. */
+ struct rt_mutex boost_mtx;
+ /* Used only for the priority-boosting */
+ /* side effect, not as a lock. */
unsigned long boost_time;
/* When to start boosting (jiffies). */
struct task_struct *boost_kthread_task;
@@ -334,11 +342,29 @@ struct rcu_data {
struct rcu_head **nocb_tail;
atomic_long_t nocb_q_count; /* # CBs waiting for kthread */
atomic_long_t nocb_q_count_lazy; /* (approximate). */
+ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
+ struct rcu_head **nocb_follower_tail;
+ atomic_long_t nocb_follower_count; /* # CBs ready to invoke. */
+ atomic_long_t nocb_follower_count_lazy; /* (approximate). */
int nocb_p_count; /* # CBs being invoked by kthread */
int nocb_p_count_lazy; /* (approximate). */
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+
+ /* The following fields are used by the leader, hence own cacheline. */
+ struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
+ /* CBs waiting for GP. */
+ struct rcu_head **nocb_gp_tail;
+ long nocb_gp_count;
+ long nocb_gp_count_lazy;
+ bool nocb_leader_wake; /* Is the nocb leader thread awake? */
+ struct rcu_data *nocb_next_follower;
+ /* Next follower in wakeup chain. */
+
+ /* The following fields are used by the follower, hence new cachline. */
+ struct rcu_data *nocb_leader ____cacheline_internodealigned_in_smp;
+ /* Leader CPU takes GP-end wakeups. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
/* 8) RCU CPU stall data. */
@@ -587,8 +613,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
/* Sum up queue lengths for tracing. */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
{
- *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count;
- *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy;
+ *ql = atomic_long_read(&rdp->nocb_q_count) +
+ rdp->nocb_p_count +
+ atomic_long_read(&rdp->nocb_follower_count) +
+ rdp->nocb_p_count + rdp->nocb_gp_count;
+ *qll = atomic_long_read(&rdp->nocb_q_count_lazy) +
+ rdp->nocb_p_count_lazy +
+ atomic_long_read(&rdp->nocb_follower_count_lazy) +
+ rdp->nocb_p_count_lazy + rdp->nocb_gp_count_lazy;
}
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 02ac0fb186b8..00dc411e9676 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -33,6 +33,7 @@
#define RCU_KTHREAD_PRIO 1
#ifdef CONFIG_RCU_BOOST
+#include "../locking/rtmutex_common.h"
#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
#else
#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
@@ -336,7 +337,7 @@ void rcu_read_unlock_special(struct task_struct *t)
unsigned long flags;
struct list_head *np;
#ifdef CONFIG_RCU_BOOST
- struct rt_mutex *rbmp = NULL;
+ bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
struct rcu_node *rnp;
int special;
@@ -398,11 +399,8 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
if (&t->rcu_node_entry == rnp->boost_tasks)
rnp->boost_tasks = np;
- /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
- if (t->rcu_boost_mutex) {
- rbmp = t->rcu_boost_mutex;
- t->rcu_boost_mutex = NULL;
- }
+ /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
+ drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -427,8 +425,10 @@ void rcu_read_unlock_special(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
/* Unboost if we were boosted. */
- if (rbmp)
- rt_mutex_unlock(rbmp);
+ if (drop_boost_mutex) {
+ rt_mutex_unlock(&rnp->boost_mtx);
+ complete(&rnp->boost_completion);
+ }
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -988,6 +988,7 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
/* Because preemptible RCU does not exist, no quieting of tasks. */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -1149,7 +1150,6 @@ static void rcu_wake_cond(struct task_struct *t, int status)
static int rcu_boost(struct rcu_node *rnp)
{
unsigned long flags;
- struct rt_mutex mtx;
struct task_struct *t;
struct list_head *tb;
@@ -1200,11 +1200,15 @@ static int rcu_boost(struct rcu_node *rnp)
* section.
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
- rt_mutex_init_proxy_locked(&mtx, t);
- t->rcu_boost_mutex = &mtx;
+ rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
+ init_completion(&rnp->boost_completion);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
- rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+ /* Lock only for side effect: boosts task t's priority. */
+ rt_mutex_lock(&rnp->boost_mtx);
+ rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
+
+ /* Wait for boostee to be done w/boost_mtx before reinitializing. */
+ wait_for_completion(&rnp->boost_completion);
return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
ACCESS_ONCE(rnp->boost_tasks) != NULL;
@@ -1256,6 +1260,7 @@ static int rcu_boost_kthread(void *arg)
* about it going away.
*/
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
struct task_struct *t;
@@ -1491,6 +1496,7 @@ static void rcu_prepare_kthreads(int cpu)
#else /* #ifdef CONFIG_RCU_BOOST */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+ __releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -2060,6 +2066,22 @@ bool rcu_is_nocb_cpu(int cpu)
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
/*
+ * Kick the leader kthread for this NOCB group.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ struct rcu_data *rdp_leader = rdp->nocb_leader;
+
+ if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
+ return;
+ if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) {
+ /* Prior xchg orders against prior callback enqueue. */
+ ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true;
+ wake_up(&rdp_leader->nocb_wq);
+ }
+}
+
+/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2093,7 +2115,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
len = atomic_long_read(&rdp->nocb_q_count);
if (old_rhpp == &rdp->nocb_head) {
if (!irqs_disabled_flags(flags)) {
- wake_up(&rdp->nocb_wq); /* ... if queue was empty ... */
+ /* ... if queue was empty ... */
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
@@ -2103,7 +2126,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
- wake_up_process(t); /* ... or if many callbacks queued. */
+ /* ... or if many callbacks queued. */
+ wake_nocb_leader(rdp, true);
rdp->qlen_last_fqs_check = LONG_MAX / 2;
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
} else {
@@ -2213,13 +2237,150 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
}
/*
+ * Leaders come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_leader_wait(struct rcu_data *my_rdp)
+{
+ bool firsttime = true;
+ bool gotcbs;
+ struct rcu_data *rdp;
+ struct rcu_head **tail;
+
+wait_again:
+
+ /* Wait for callbacks to appear. */
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ wait_event_interruptible(my_rdp->nocb_wq,
+ ACCESS_ONCE(my_rdp->nocb_leader_wake));
+ /* Memory barrier handled by smp_mb() calls below and repoll. */
+ } else if (firsttime) {
+ firsttime = false; /* Don't drown trace log with "Poll"! */
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ }
+
+ /*
+ * Each pass through the following loop checks a follower for CBs.
+ * We are our own first follower. Any CBs found are moved to
+ * nocb_gp_head, where they await a grace period.
+ */
+ gotcbs = false;
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs here, try next follower. */
+
+ /* Move callbacks to wait-for-GP list, which is empty. */
+ ACCESS_ONCE(rdp->nocb_head) = NULL;
+ rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
+ rdp->nocb_gp_count = atomic_long_xchg(&rdp->nocb_q_count, 0);
+ rdp->nocb_gp_count_lazy =
+ atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
+ gotcbs = true;
+ }
+
+ /*
+ * If there were no callbacks, sleep a bit, rescan after a
+ * memory barrier, and go retry.
+ */
+ if (unlikely(!gotcbs)) {
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+
+ /* Rescan in case we were a victim of memory ordering. */
+ my_rdp->nocb_leader_wake = false;
+ smp_mb(); /* Ensure _wake false before scan. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
+ if (ACCESS_ONCE(rdp->nocb_head)) {
+ /* Found CB, so short-circuit next wait. */
+ my_rdp->nocb_leader_wake = true;
+ break;
+ }
+ goto wait_again;
+ }
+
+ /* Wait for one grace period. */
+ rcu_nocb_wait_gp(my_rdp);
+
+ /*
+ * We left ->nocb_leader_wake set to reduce cache thrashing.
+ * We clear it now, but recheck for new callbacks while
+ * traversing our follower list.
+ */
+ my_rdp->nocb_leader_wake = false;
+ smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */
+
+ /* Each pass through the following loop wakes a follower, if needed. */
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
+ if (ACCESS_ONCE(rdp->nocb_head))
+ my_rdp->nocb_leader_wake = true; /* No need to wait. */
+ if (!rdp->nocb_gp_head)
+ continue; /* No CBs, so no need to wake follower. */
+
+ /* Append callbacks to follower's "done" list. */
+ tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ *tail = rdp->nocb_gp_head;
+ atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
+ atomic_long_add(rdp->nocb_gp_count_lazy,
+ &rdp->nocb_follower_count_lazy);
+ if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
+ /*
+ * List was empty, wake up the follower.
+ * Memory barriers supplied by atomic_long_add().
+ */
+ wake_up(&rdp->nocb_wq);
+ }
+ }
+
+ /* If we (the leader) don't have CBs, go wait some more. */
+ if (!my_rdp->nocb_follower_head)
+ goto wait_again;
+}
+
+/*
+ * Followers come here to wait for additional callbacks to show up.
+ * This function does not return until callbacks appear.
+ */
+static void nocb_follower_wait(struct rcu_data *rdp)
+{
+ bool firsttime = true;
+
+ for (;;) {
+ if (!rcu_nocb_poll) {
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "FollowerSleep");
+ wait_event_interruptible(rdp->nocb_wq,
+ ACCESS_ONCE(rdp->nocb_follower_head));
+ } else if (firsttime) {
+ /* Don't drown trace log with "Poll"! */
+ firsttime = false;
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
+ }
+ if (smp_load_acquire(&rdp->nocb_follower_head)) {
+ /* ^^^ Ensure CB invocation follows _head test. */
+ return;
+ }
+ if (!rcu_nocb_poll)
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
+ "WokeEmpty");
+ flush_signals(current);
+ schedule_timeout_interruptible(1);
+ }
+}
+
+/*
* Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes
- * callbacks queued by the corresponding no-CBs CPU.
+ * callbacks queued by the corresponding no-CBs CPU, however, there is
+ * an optional leader-follower relationship so that the grace-period
+ * kthreads don't have to do quite so many wakeups.
*/
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
- bool firsttime = 1;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2227,41 +2388,22 @@ static int rcu_nocb_kthread(void *arg)
/* Each pass through this loop invokes one batch of callbacks */
for (;;) {
- /* If not polling, wait for next batch of callbacks. */
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Sleep"));
- wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
- /* Memory barrier provide by xchg() below. */
- } else if (firsttime) {
- firsttime = 0;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("Poll"));
- }
- list = ACCESS_ONCE(rdp->nocb_head);
- if (!list) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeEmpty"));
- schedule_timeout_interruptible(1);
- flush_signals(current);
- continue;
- }
- firsttime = 1;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WokeNonEmpty"));
-
- /*
- * Extract queued callbacks, update counts, and wait
- * for a grace period to elapse.
- */
- ACCESS_ONCE(rdp->nocb_head) = NULL;
- tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
- c = atomic_long_xchg(&rdp->nocb_q_count, 0);
- cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0);
- ACCESS_ONCE(rdp->nocb_p_count) += c;
- ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl;
- rcu_nocb_wait_gp(rdp);
+ /* Wait for callbacks. */
+ if (rdp->nocb_leader == rdp)
+ nocb_leader_wait(rdp);
+ else
+ nocb_follower_wait(rdp);
+
+ /* Pull the ready-to-invoke callbacks onto local list. */
+ list = ACCESS_ONCE(rdp->nocb_follower_head);
+ BUG_ON(!list);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
+ ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
+ tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ c = atomic_long_xchg(&rdp->nocb_follower_count, 0);
+ cl = atomic_long_xchg(&rdp->nocb_follower_count_lazy, 0);
+ rdp->nocb_p_count += c;
+ rdp->nocb_p_count_lazy += cl;
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name, cl, c, -1);
@@ -2305,7 +2447,7 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
if (!rcu_nocb_need_deferred_wakeup(rdp))
return;
ACCESS_ONCE(rdp->nocb_defer_wakeup) = false;
- wake_up(&rdp->nocb_wq);
+ wake_nocb_leader(rdp, false);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty"));
}
@@ -2314,19 +2456,57 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
-/* Create a kthread for each RCU flavor for each no-CBs CPU. */
+/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_leader_stride = -1;
+module_param(rcu_nocb_leader_stride, int, 0444);
+
+/*
+ * Create a kthread for each RCU flavor for each no-CBs CPU.
+ * Also initialize leader-follower relationships.
+ */
static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
{
int cpu;
+ int ls = rcu_nocb_leader_stride;
+ int nl = 0; /* Next leader. */
struct rcu_data *rdp;
+ struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
struct task_struct *t;
if (rcu_nocb_mask == NULL)
return;
+#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
+ if (ls == -1) {
+ ls = int_sqrt(nr_cpu_ids);
+ rcu_nocb_leader_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure and
+ * spawns one rcu_nocb_kthread().
+ */
for_each_cpu(cpu, rcu_nocb_mask) {
rdp = per_cpu_ptr(rsp->rda, cpu);
+ if (rdp->cpu >= nl) {
+ /* New leader, set up for followers & next leader. */
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_leader = rdp;
+ rdp_leader = rdp;
+ } else {
+ /* Another follower, link to previous leader. */
+ rdp->nocb_leader = rdp_leader;
+ rdp_prev->nocb_next_follower = rdp;
+ }
+ rdp_prev = rdp;
+
+ /* Spawn the kthread for this CPU. */
t = kthread_run(rcu_nocb_kthread, rdp,
"rcuo%c/%d", rsp->abbr, cpu);
BUG_ON(IS_ERR(t));
@@ -2843,12 +3023,16 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
*/
static void rcu_bind_gp_kthread(void)
{
-#ifdef CONFIG_NO_HZ_FULL
- int cpu = ACCESS_ONCE(tick_do_timer_cpu);
+ int __maybe_unused cpu;
- if (cpu < 0 || cpu >= nr_cpu_ids)
+ if (!tick_nohz_full_enabled())
return;
- if (raw_smp_processor_id() != cpu)
+#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
+ cpu = tick_do_timer_cpu;
+ if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
set_cpus_allowed_ptr(current, cpumask_of(cpu));
-#endif /* #ifdef CONFIG_NO_HZ_FULL */
+#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
+ if (!is_housekeeping_cpu(raw_smp_processor_id()))
+ housekeeping_affine(current);
+#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index bc7883570530..4056d7992a6c 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -90,9 +90,6 @@ void __rcu_read_unlock(void)
} else {
barrier(); /* critical section before exit code. */
t->rcu_read_lock_nesting = INT_MIN;
-#ifdef CONFIG_PROVE_RCU_DELAY
- udelay(10); /* Make preemption more probable. */
-#endif /* #ifdef CONFIG_PROVE_RCU_DELAY */
barrier(); /* assign before ->rcu_read_unlock_special load */
if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
rcu_read_unlock_special(t);
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..da14b8d09296 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
static struct resource *bootmem_resource_free;
static DEFINE_SPINLOCK(bootmem_resource_lock);
-static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+static struct resource *next_resource(struct resource *p, bool sibling_only)
{
- struct resource *p = v;
- (*pos)++;
+ /* Caller wants to traverse through siblings only */
+ if (sibling_only)
+ return p->sibling;
+
if (p->child)
return p->child;
while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
return p->sibling;
}
+static void *r_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ struct resource *p = v;
+ (*pos)++;
+ return (void *)next_resource(p, false);
+}
+
#ifdef CONFIG_PROC_FS
enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
-#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
- * Finds the lowest memory reosurce exists within [res->start.res->end)
+ * Finds the lowest iomem reosurce exists with-in [res->start.res->end)
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
+ * This walks through whole tree and not just first level children
+ * until and unless first_level_children_only is true.
*/
-static int find_next_system_ram(struct resource *res, char *name)
+static int find_next_iomem_res(struct resource *res, char *name,
+ bool first_level_children_only)
{
resource_size_t start, end;
struct resource *p;
+ bool sibling_only = false;
BUG_ON(!res);
@@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name)
BUG_ON(start >= end);
read_lock(&resource_lock);
- for (p = iomem_resource.child; p ; p = p->sibling) {
- /* system ram is just marked as IORESOURCE_MEM */
+
+ if (first_level_children_only) {
+ p = iomem_resource.child;
+ sibling_only = true;
+ } else
+ p = &iomem_resource;
+
+ while ((p = next_resource(p, sibling_only))) {
if (p->flags != res->flags)
continue;
if (name && strcmp(p->name, name))
@@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name)
if ((p->end >= start) && (p->start < end))
break;
}
+
read_unlock(&resource_lock);
if (!p)
return -1;
@@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name)
}
/*
+ * Walks through iomem resources and calls func() with matching resource
+ * ranges. This walks through whole tree and not just first level children.
+ * All the memory ranges which overlap start,end and also match flags and
+ * name are valid candidates.
+ *
+ * @name: name of resource
+ * @flags: resource flags
+ * @start: start addr
+ * @end: end addr
+ */
+int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
+ void *arg, int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = flags;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, name, false))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+/*
+ * This function calls callback against all memory range of "System RAM"
+ * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
+ * Now, this function is only for "System RAM". This function deals with
+ * full ranges and not pfn. If resources are not pfn aligned, dealing
+ * with pfn can truncate ranges.
+ */
+int walk_system_ram_res(u64 start, u64 end, void *arg,
+ int (*func)(u64, u64, void *))
+{
+ struct resource res;
+ u64 orig_end;
+ int ret = -1;
+
+ res.start = start;
+ res.end = end;
+ res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+ orig_end = res.end;
+ while ((res.start < res.end) &&
+ (!find_next_iomem_res(&res, "System RAM", true))) {
+ ret = (*func)(res.start, res.end, arg);
+ if (ret)
+ break;
+ res.start = res.end + 1;
+ res.end = orig_end;
+ }
+ return ret;
+}
+
+#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
+
+/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM".
@@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
- (find_next_system_ram(&res, "System RAM") >= 0)) {
+ (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bc1638b33449..1211575a2208 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -139,6 +139,8 @@ void update_rq_clock(struct rq *rq)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+ if (delta < 0)
+ return;
rq->clock += delta;
update_rq_clock_task(rq, delta);
}
@@ -243,6 +245,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int i;
+ struct inode *inode;
if (cnt > 63)
cnt = 63;
@@ -253,7 +256,11 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
cmp = strstrip(buf);
+ /* Ensure the static_key remains in a consistent state */
+ inode = file_inode(filp);
+ mutex_lock(&inode->i_mutex);
i = sched_feat_set(cmp);
+ mutex_unlock(&inode->i_mutex);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
@@ -587,30 +594,31 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
/*
- * resched_task - mark a task 'to be rescheduled now'.
+ * resched_curr - mark rq's current task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_task(struct task_struct *p)
+void resched_curr(struct rq *rq)
{
+ struct task_struct *curr = rq->curr;
int cpu;
- lockdep_assert_held(&task_rq(p)->lock);
+ lockdep_assert_held(&rq->lock);
- if (test_tsk_need_resched(p))
+ if (test_tsk_need_resched(curr))
return;
- cpu = task_cpu(p);
+ cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(p);
+ set_tsk_need_resched(curr);
set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(p))
+ if (set_nr_and_not_polling(curr))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -623,7 +631,7 @@ void resched_cpu(int cpu)
if (!raw_spin_trylock_irqsave(&rq->lock, flags))
return;
- resched_task(cpu_curr(cpu));
+ resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -684,10 +692,16 @@ static void wake_up_idle_cpu(int cpu)
static bool wake_up_full_nohz_cpu(int cpu)
{
+ /*
+ * We just need the target to call irq_exit() and re-evaluate
+ * the next tick. The nohz full kick at least implies that.
+ * If needed we can still optimize that later with an
+ * empty IRQ.
+ */
if (tick_nohz_full_cpu(cpu)) {
if (cpu != smp_processor_id() ||
tick_nohz_tick_stopped())
- smp_send_reschedule(cpu);
+ tick_nohz_full_kick_cpu(cpu);
return true;
}
@@ -730,18 +744,15 @@ static inline bool got_nohz_idle_kick(void)
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
{
- struct rq *rq;
-
- rq = this_rq();
-
- /* Make sure rq->nr_running update is visible after the IPI */
- smp_rmb();
-
- /* More than one running task need preemption */
- if (rq->nr_running > 1)
- return false;
+ /*
+ * More than one running task need preemption.
+ * nr_running update is assumed to be visible
+ * after IPI is sent from wakers.
+ */
+ if (this_rq()->nr_running > 1)
+ return false;
- return true;
+ return true;
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -1022,7 +1033,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
if (class == rq->curr->sched_class)
break;
if (class == p->sched_class) {
- resched_task(rq->curr);
+ resched_curr(rq);
break;
}
}
@@ -1568,9 +1579,7 @@ void scheduler_ipi(void)
*/
preempt_fold_need_resched();
- if (llist_empty(&this_rq()->wake_list)
- && !tick_nohz_full_cpu(smp_processor_id())
- && !got_nohz_idle_kick())
+ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
return;
/*
@@ -1587,7 +1596,6 @@ void scheduler_ipi(void)
* somewhat pessimize the simple resched case.
*/
irq_enter();
- tick_nohz_full_check();
sched_ttwu_pending();
/*
@@ -2431,7 +2439,12 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
u64 ns = 0;
- if (task_current(rq, p)) {
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && p->on_rq) {
update_rq_clock(rq);
ns = rq_clock_task(rq) - p->se.exec_start;
if ((s64)ns < 0)
@@ -2474,8 +2487,10 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we race with it leaving cpu, we'll take a lock. So we're correct.
* If we race with it entering cpu, unaccounted time is 0. This is
* indistinguishable from the read occurring a few cycles earlier.
+ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+ * been accounted, so we're correct here as well.
*/
- if (!p->on_cpu)
+ if (!p->on_cpu || !p->on_rq)
return p->se.sum_exec_runtime;
#endif
@@ -2971,7 +2986,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
}
trace_sched_pi_setprio(p, prio);
- p->pi_top_task = rt_mutex_get_top_task(p);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->on_rq;
@@ -2991,8 +3005,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
* running task
*/
if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
- dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+ struct task_struct *pi_task = rt_mutex_get_top_task(p);
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
@@ -3064,7 +3079,7 @@ void set_user_nice(struct task_struct *p, long nice)
* lowered its priority, then reschedule its CPU:
*/
if (delta < 0 || (delta > 0 && task_running(rq, p)))
- resched_task(rq->curr);
+ resched_curr(rq);
}
out_unlock:
task_rq_unlock(rq, p, &flags);
@@ -3203,12 +3218,18 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_yielded = 0;
}
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY -1
+
static void __setscheduler_params(struct task_struct *p,
const struct sched_attr *attr)
{
int policy = attr->sched_policy;
- if (policy == -1) /* setparam */
+ if (policy == SETPARAM_POLICY)
policy = p->policy;
p->policy = policy;
@@ -3557,10 +3578,8 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
.sched_nice = PRIO_TO_NICE(p->static_prio),
};
- /*
- * Fixup the legacy SCHED_RESET_ON_FORK hack
- */
- if (policy & SCHED_RESET_ON_FORK) {
+ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
+ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
policy &= ~SCHED_RESET_ON_FORK;
attr.sched_policy = policy;
@@ -3730,7 +3749,7 @@ SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
*/
SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
{
- return do_sched_setscheduler(pid, -1, param);
+ return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
}
/**
@@ -4285,7 +4304,7 @@ again:
* fairness.
*/
if (preempt && rq != p_rq)
- resched_task(p_rq->curr);
+ resched_curr(p_rq);
}
out_unlock:
@@ -6465,6 +6484,20 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
sched_domain_level_max = max(sched_domain_level_max, sd->level);
child->parent = sd;
sd->child = child;
+
+ if (!cpumask_subset(sched_domain_span(child),
+ sched_domain_span(sd))) {
+ pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+ pr_err(" the %s domain not a subset of the %s domain\n",
+ child->name, sd->name);
+#endif
+ /* Fixup, ensure @sd has at least @child cpus. */
+ cpumask_or(sched_domain_span(sd),
+ sched_domain_span(sd),
+ sched_domain_span(child));
+ }
+
}
set_domain_attribute(sd, attr);
@@ -7092,7 +7125,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
__setscheduler(rq, p, &attr);
if (on_rq) {
enqueue_task(rq, p, 0);
- resched_task(rq->curr);
+ resched_curr(rq);
}
check_class_changed(rq, p, prev_class, old_prio);
@@ -7803,6 +7836,11 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
if (period > max_cfs_quota_period)
return -EINVAL;
+ /*
+ * Prevent race between setting of cfs_rq->runtime_enabled and
+ * unthrottle_offline_cfs_rqs().
+ */
+ get_online_cpus();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -7828,7 +7866,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
}
raw_spin_unlock_irq(&cfs_b->lock);
- for_each_possible_cpu(i) {
+ for_each_online_cpu(i) {
struct cfs_rq *cfs_rq = tg->cfs_rq[i];
struct rq *rq = cfs_rq->rq;
@@ -7844,6 +7882,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
+ put_online_cpus();
return ret;
}
@@ -8083,7 +8122,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
- .base_cftypes = cpu_files,
+ .legacy_cftypes = cpu_files,
.early_init = 1,
};
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 9cf350c94ec4..dd7cbb55bbf2 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -278,6 +278,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
struct cgroup_subsys cpuacct_cgrp_subsys = {
.css_alloc = cpuacct_css_alloc,
.css_free = cpuacct_css_free,
- .base_cftypes = files,
+ .legacy_cftypes = files,
.early_init = 1,
};
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fc4f98b1258f..255ce138b652 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -306,7 +306,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
* the overrunning entity can't interfere with other entity in the system and
* can't make them miss their deadlines. Reasons why this kind of overruns
* could happen are, typically, a entity voluntarily trying to overcome its
- * runtime, or it just underestimated it during sched_setscheduler_ex().
+ * runtime, or it just underestimated it during sched_setattr().
*/
static void replenish_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se)
@@ -535,7 +535,7 @@ again:
if (task_has_dl_policy(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
- resched_task(rq->curr);
+ resched_curr(rq);
#ifdef CONFIG_SMP
/*
* Queueing this task back might have overloaded rq,
@@ -634,7 +634,7 @@ static void update_curr_dl(struct rq *rq)
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
- resched_task(curr);
+ resched_curr(rq);
}
/*
@@ -964,7 +964,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
return;
- resched_task(rq->curr);
+ resched_curr(rq);
}
static int pull_dl_task(struct rq *this_rq);
@@ -979,7 +979,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
int flags)
{
if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -1333,7 +1333,7 @@ retry:
if (dl_task(rq->curr) &&
dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1373,7 +1373,7 @@ retry:
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
- resched_task(later_rq->curr);
+ resched_curr(later_rq);
double_unlock_balance(rq, later_rq);
@@ -1632,14 +1632,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/*
* Again, we don't know if p has a earlier
* or later deadline, so let's blindly set a
* (maybe not needed) rescheduling point.
*/
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else
switched_to_dl(rq, p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fea7d3335e1f..bfa3c86d0d68 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1062,7 +1062,6 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
if (!cpus)
return;
- ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
ns->task_capacity =
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
@@ -1096,18 +1095,30 @@ static void task_numa_assign(struct task_numa_env *env,
env->best_cpu = env->dst_cpu;
}
-static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
- long src_load, long dst_load,
+static bool load_too_imbalanced(long src_load, long dst_load,
struct task_numa_env *env)
{
long imb, old_imb;
+ long orig_src_load, orig_dst_load;
+ long src_capacity, dst_capacity;
+
+ /*
+ * The load is corrected for the CPU capacity available on each node.
+ *
+ * src_load dst_load
+ * ------------ vs ---------
+ * src_capacity dst_capacity
+ */
+ src_capacity = env->src_stats.compute_capacity;
+ dst_capacity = env->dst_stats.compute_capacity;
/* We care about the slope of the imbalance, not the direction. */
if (dst_load < src_load)
swap(dst_load, src_load);
/* Is the difference below the threshold? */
- imb = dst_load * 100 - src_load * env->imbalance_pct;
+ imb = dst_load * src_capacity * 100 -
+ src_load * dst_capacity * env->imbalance_pct;
if (imb <= 0)
return false;
@@ -1115,10 +1126,14 @@ static bool load_too_imbalanced(long orig_src_load, long orig_dst_load,
* The imbalance is above the allowed threshold.
* Compare it with the old imbalance.
*/
+ orig_src_load = env->src_stats.load;
+ orig_dst_load = env->dst_stats.load;
+
if (orig_dst_load < orig_src_load)
swap(orig_dst_load, orig_src_load);
- old_imb = orig_dst_load * 100 - orig_src_load * env->imbalance_pct;
+ old_imb = orig_dst_load * src_capacity * 100 -
+ orig_src_load * dst_capacity * env->imbalance_pct;
/* Would this change make things worse? */
return (imb > old_imb);
@@ -1136,10 +1151,10 @@ static void task_numa_compare(struct task_numa_env *env,
struct rq *src_rq = cpu_rq(env->src_cpu);
struct rq *dst_rq = cpu_rq(env->dst_cpu);
struct task_struct *cur;
- long orig_src_load, src_load;
- long orig_dst_load, dst_load;
+ long src_load, dst_load;
long load;
- long imp = (groupimp > 0) ? groupimp : taskimp;
+ long imp = env->p->numa_group ? groupimp : taskimp;
+ long moveimp = imp;
rcu_read_lock();
cur = ACCESS_ONCE(dst_rq->curr);
@@ -1177,11 +1192,6 @@ static void task_numa_compare(struct task_numa_env *env,
* itself (not part of a group), use the task weight
* instead.
*/
- if (env->p->numa_group)
- imp = groupimp;
- else
- imp = taskimp;
-
if (cur->numa_group)
imp += group_weight(cur, env->src_nid) -
group_weight(cur, env->dst_nid);
@@ -1191,7 +1201,7 @@ static void task_numa_compare(struct task_numa_env *env,
}
}
- if (imp < env->best_imp)
+ if (imp <= env->best_imp && moveimp <= env->best_imp)
goto unlock;
if (!cur) {
@@ -1204,20 +1214,34 @@ static void task_numa_compare(struct task_numa_env *env,
}
/* Balance doesn't matter much if we're running a task per cpu */
- if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+ if (imp > env->best_imp && src_rq->nr_running == 1 &&
+ dst_rq->nr_running == 1)
goto assign;
/*
* In the overloaded case, try and keep the load balanced.
*/
balance:
- orig_dst_load = env->dst_stats.load;
- orig_src_load = env->src_stats.load;
-
- /* XXX missing capacity terms */
load = task_h_load(env->p);
- dst_load = orig_dst_load + load;
- src_load = orig_src_load - load;
+ dst_load = env->dst_stats.load + load;
+ src_load = env->src_stats.load - load;
+
+ if (moveimp > imp && moveimp > env->best_imp) {
+ /*
+ * If the improvement from just moving env->p direction is
+ * better than swapping tasks around, check if a move is
+ * possible. Store a slightly smaller score than moveimp,
+ * so an actually idle CPU will win.
+ */
+ if (!load_too_imbalanced(src_load, dst_load, env)) {
+ imp = moveimp - 1;
+ cur = NULL;
+ goto assign;
+ }
+ }
+
+ if (imp <= env->best_imp)
+ goto unlock;
if (cur) {
load = task_h_load(cur);
@@ -1225,8 +1249,7 @@ balance:
src_load += load;
}
- if (load_too_imbalanced(orig_src_load, orig_dst_load,
- src_load, dst_load, env))
+ if (load_too_imbalanced(src_load, dst_load, env))
goto unlock;
assign:
@@ -1302,9 +1325,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has free capacity, try to use it. */
- if (env.dst_stats.has_free_capacity)
- task_numa_find_cpu(&env, taskimp, groupimp);
+ /* Try to find a spot on the preferred nid. */
+ task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
if (env.best_cpu == -1) {
@@ -1324,10 +1346,6 @@ static int task_numa_migrate(struct task_struct *p)
}
}
- /* No better CPU than the current one was found. */
- if (env.best_cpu == -1)
- return -EAGAIN;
-
/*
* If the task is part of a workload that spans multiple NUMA nodes,
* and is migrating into one of the workload's active nodes, remember
@@ -1336,8 +1354,19 @@ static int task_numa_migrate(struct task_struct *p)
* A task that migrated to a second choice node will be better off
* trying for a better one later. Do not set the preferred node here.
*/
- if (p->numa_group && node_isset(env.dst_nid, p->numa_group->active_nodes))
- sched_setnuma(p, env.dst_nid);
+ if (p->numa_group) {
+ if (env.best_cpu == -1)
+ nid = env.src_nid;
+ else
+ nid = env.dst_nid;
+
+ if (node_isset(nid, p->numa_group->active_nodes))
+ sched_setnuma(p, env.dst_nid);
+ }
+
+ /* No better CPU than the current one was found. */
+ if (env.best_cpu == -1)
+ return -EAGAIN;
/*
* Reset the scan period if the task is being rescheduled on an
@@ -1415,12 +1444,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
/*
* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
* increments. The more local the fault statistics are, the higher the scan
- * period will be for the next scan window. If local/remote ratio is below
- * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
- * scan period will decrease
+ * period will be for the next scan window. If local/(local+remote) ratio is
+ * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
+ * the scan period will decrease. Aim for 70% local accesses.
*/
#define NUMA_PERIOD_SLOTS 10
-#define NUMA_PERIOD_THRESHOLD 3
+#define NUMA_PERIOD_THRESHOLD 7
/*
* Increase the scan period (slow down scanning) if the majority of
@@ -1595,30 +1624,17 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
- /*
- * If the preferred task and group nids are different,
- * iterate over the nodes again to find the best place.
- */
- if (max_nid != max_group_nid) {
- unsigned long weight, max_weight = 0;
-
- for_each_online_node(nid) {
- weight = task_weight(p, nid) + group_weight(p, nid);
- if (weight > max_weight) {
- max_weight = weight;
- max_nid = nid;
- }
- }
- }
-
spin_unlock_irq(group_lock);
+ max_nid = max_group_nid;
}
- /* Preferred node as the node with the most faults */
- if (max_faults && max_nid != p->numa_preferred_nid) {
- /* Update the preferred nid and migrate task if possible */
- sched_setnuma(p, max_nid);
- numa_migrate_preferred(p);
+ if (max_faults) {
+ /* Set the new preferred node */
+ if (max_nid != p->numa_preferred_nid)
+ sched_setnuma(p, max_nid);
+
+ if (task_node(p) != p->numa_preferred_nid)
+ numa_migrate_preferred(p);
}
}
@@ -2899,7 +2915,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
/*
* The current task ran long enough, ensure it doesn't get
* re-elected due to buddy favours.
@@ -2923,7 +2939,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (delta > ideal_runtime)
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static void
@@ -3063,7 +3079,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
return;
}
/*
@@ -3254,7 +3270,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_task(rq_of(cfs_rq)->curr);
+ resched_curr(rq_of(cfs_rq));
}
static __always_inline
@@ -3360,7 +3376,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 1;
cfs_rq->throttled_clock = rq_clock(rq);
raw_spin_lock(&cfs_b->lock);
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ /*
+ * Add to the _head_ of the list, so that an already-started
+ * distribute_cfs_runtime will not see us
+ */
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
if (!cfs_b->timer_active)
__start_cfs_bandwidth(cfs_b, false);
raw_spin_unlock(&cfs_b->lock);
@@ -3410,14 +3430,15 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* determine whether we need to wake up potentially idle cpu */
if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_task(rq->curr);
+ resched_curr(rq);
}
static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
u64 remaining, u64 expires)
{
struct cfs_rq *cfs_rq;
- u64 runtime = remaining;
+ u64 runtime;
+ u64 starting_runtime = remaining;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
@@ -3448,7 +3469,7 @@ next:
}
rcu_read_unlock();
- return remaining;
+ return starting_runtime - remaining;
}
/*
@@ -3494,22 +3515,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
/* account preceding periods in which throttling occurred */
cfs_b->nr_throttled += overrun;
- /*
- * There are throttled entities so we must first use the new bandwidth
- * to unthrottle them before making it generally available. This
- * ensures that all existing debts will be paid before a new cfs_rq is
- * allowed to run.
- */
- runtime = cfs_b->runtime;
runtime_expires = cfs_b->runtime_expires;
- cfs_b->runtime = 0;
/*
- * This check is repeated as we are holding onto the new bandwidth
- * while we unthrottle. This can potentially race with an unthrottled
- * group trying to acquire new bandwidth from the global pool.
+ * This check is repeated as we are holding onto the new bandwidth while
+ * we unthrottle. This can potentially race with an unthrottled group
+ * trying to acquire new bandwidth from the global pool. This can result
+ * in us over-using our runtime if it is all used during this loop, but
+ * only by limited amounts in that extreme case.
*/
- while (throttled && runtime > 0) {
+ while (throttled && cfs_b->runtime > 0) {
+ runtime = cfs_b->runtime;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
@@ -3517,10 +3533,10 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
raw_spin_lock(&cfs_b->lock);
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
+
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
}
- /* return (any) remaining runtime */
- cfs_b->runtime = runtime;
/*
* While we are ensured activity in the period following an
* unthrottle, this also covers the case in which the new bandwidth is
@@ -3631,10 +3647,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
return;
}
- if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
+ if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
runtime = cfs_b->runtime;
- cfs_b->runtime = 0;
- }
+
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
@@ -3645,7 +3660,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime = runtime;
+ cfs_b->runtime -= min(runtime, cfs_b->runtime);
raw_spin_unlock(&cfs_b->lock);
}
@@ -3775,6 +3790,19 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
hrtimer_cancel(&cfs_b->slack_timer);
}
+static void __maybe_unused update_runtime_enabled(struct rq *rq)
+{
+ struct cfs_rq *cfs_rq;
+
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
+ struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
+
+ raw_spin_lock(&cfs_b->lock);
+ cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
+ raw_spin_unlock(&cfs_b->lock);
+ }
+}
+
static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
{
struct cfs_rq *cfs_rq;
@@ -3788,6 +3816,12 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* there's some valid quota amount
*/
cfs_rq->runtime_remaining = 1;
+ /*
+ * Offline rq is schedulable till cpu is completely disabled
+ * in take_cpu_down(), so we prevent new cfs throttling here.
+ */
+ cfs_rq->runtime_enabled = 0;
+
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -3831,6 +3865,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
return NULL;
}
static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
+static inline void update_runtime_enabled(struct rq *rq) {}
static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
#endif /* CONFIG_CFS_BANDWIDTH */
@@ -3854,7 +3889,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
if (delta < 0) {
if (rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
return;
}
@@ -4723,7 +4758,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
preempt:
- resched_task(curr);
+ resched_curr(rq);
/*
* Only set the backward buddy when the current task is still
* on the rq. This can happen when a wakeup gets interleaved
@@ -5094,8 +5129,7 @@ static void move_task(struct task_struct *p, struct lb_env *env)
/*
* Is this task likely cache-hot:
*/
-static int
-task_hot(struct task_struct *p, u64 now)
+static int task_hot(struct task_struct *p, struct lb_env *env)
{
s64 delta;
@@ -5108,7 +5142,7 @@ task_hot(struct task_struct *p, u64 now)
/*
* Buddy candidates are cache hot:
*/
- if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+ if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
(&p->se == cfs_rq_of(&p->se)->next ||
&p->se == cfs_rq_of(&p->se)->last))
return 1;
@@ -5118,7 +5152,7 @@ task_hot(struct task_struct *p, u64 now)
if (sysctl_sched_migration_cost == 0)
return 0;
- delta = now - p->se.exec_start;
+ delta = rq_clock_task(env->src_rq) - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
}
@@ -5272,7 +5306,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq));
+ tsk_cache_hot = task_hot(p, env);
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
@@ -5864,10 +5898,12 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
+ * @overload: Indicate more than one runnable task for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs)
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
{
unsigned long load;
int i;
@@ -5885,6 +5921,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
+
+ if (rq->nr_running > 1)
+ *overload = true;
+
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5995,6 +6035,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+ bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -6015,7 +6056,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
if (local_group)
goto next_group;
@@ -6049,6 +6091,13 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ if (env->dst_rq->rd->overload != overload)
+ env->dst_rq->rd->overload = overload;
+ }
+
}
/**
@@ -6767,7 +6816,8 @@ static int idle_balance(struct rq *this_rq)
*/
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
@@ -7325,6 +7375,8 @@ void trigger_load_balance(struct rq *rq)
static void rq_online_fair(struct rq *rq)
{
update_sysctl();
+
+ update_runtime_enabled(rq);
}
static void rq_offline_fair(struct rq *rq)
@@ -7398,7 +7450,7 @@ static void task_fork_fair(struct task_struct *p)
* 'current' within the tree based on its new key value.
*/
swap(curr->vruntime, se->vruntime);
- resched_task(rq->curr);
+ resched_curr(rq);
}
se->vruntime -= cfs_rq->min_vruntime;
@@ -7423,7 +7475,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
*/
if (rq->curr == p) {
if (p->prio > oldprio)
- resched_task(rq->curr);
+ resched_curr(rq);
} else
check_preempt_curr(rq, p, 0);
}
@@ -7486,7 +7538,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* if we can still preempt the current task.
*/
if (rq->curr == p)
- resched_task(rq->curr);
+ resched_curr(rq);
else
check_preempt_curr(rq, p, 0);
}
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index cf009fb0bc25..11e7bc434f43 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -79,7 +79,7 @@ static void cpuidle_idle_call(void)
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- bool broadcast;
+ unsigned int broadcast;
/*
* Check if the idle task must be rescheduled. If it is the
@@ -135,7 +135,7 @@ use_default:
goto exit_idle;
}
- broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
+ broadcast = drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP;
/*
* Tell the time framework to switch to a broadcast timer
@@ -147,8 +147,6 @@ use_default:
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
goto use_default;
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
/*
* Enter the idle state previously returned by the governor decision.
* This function will block until an interrupt occurs and will take
@@ -156,8 +154,6 @@ use_default:
*/
entered_state = cpuidle_enter(drv, dev, next_state);
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
-
if (broadcast)
clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 879f2b75266a..67ad4e7f506a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -20,7 +20,7 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
*/
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
- resched_task(rq->idle);
+ resched_curr(rq);
}
static struct task_struct *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a49083192c64..5f6edca4fafd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -463,9 +463,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
- int cpu = cpu_of(rq_of_rt_rq(rt_rq));
+ int cpu = cpu_of(rq);
rt_se = rt_rq->tg->rt_se[cpu];
@@ -476,7 +477,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
enqueue_rt_entity(rt_se, false);
if (rt_rq->highest_prio.curr < curr->prio)
- resched_task(curr);
+ resched_curr(rq);
}
}
@@ -566,7 +567,7 @@ static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
return;
enqueue_top_rt_rq(rt_rq);
- resched_task(rq->curr);
+ resched_curr(rq);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -740,6 +741,9 @@ balanced:
rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
+
+ /* Make rt_rq available for pick_next_task() */
+ sched_rt_rq_enqueue(rt_rq);
}
}
@@ -948,7 +952,7 @@ static void update_curr_rt(struct rq *rq)
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
if (sched_rt_runtime_exceeded(rt_rq))
- resched_task(curr);
+ resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
}
@@ -1363,7 +1367,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
* to try and push current away:
*/
requeue_task_rt(rq, p, 1);
- resched_task(rq->curr);
+ resched_curr(rq);
}
#endif /* CONFIG_SMP */
@@ -1374,7 +1378,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
if (p->prio < rq->curr->prio) {
- resched_task(rq->curr);
+ resched_curr(rq);
return;
}
@@ -1690,7 +1694,7 @@ retry:
* just reschedule current.
*/
if (unlikely(next_task->prio < rq->curr->prio)) {
- resched_task(rq->curr);
+ resched_curr(rq);
return 0;
}
@@ -1737,7 +1741,7 @@ retry:
activate_task(lowest_rq, next_task, 0);
ret = 1;
- resched_task(lowest_rq->curr);
+ resched_curr(lowest_rq);
double_unlock_balance(rq, lowest_rq);
@@ -1936,7 +1940,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
return;
if (pull_rt_task(rq))
- resched_task(rq->curr);
+ resched_curr(rq);
}
void __init init_sched_rt_class(void)
@@ -1974,7 +1978,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
check_resched = 0;
#endif /* CONFIG_SMP */
if (check_resched && p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
@@ -2003,11 +2007,11 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* Only reschedule if p is still on the same runqueue.
*/
if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
- resched_task(p);
+ resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
if (oldprio < p->prio)
- resched_task(p);
+ resched_curr(rq);
#endif /* CONFIG_SMP */
} else {
/*
@@ -2016,7 +2020,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* then reschedule.
*/
if (p->prio < rq->curr->prio)
- resched_task(rq->curr);
+ resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 31cc02ebc54e..579712f4e9d5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
+ /* Indicate more than one runnable task for any CPU */
+ bool overload;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -884,20 +887,10 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
-static __always_inline bool static_branch__true(struct static_key *key)
-{
- return static_key_true(key); /* Not out of line branch. */
-}
-
-static __always_inline bool static_branch__false(struct static_key *key)
-{
- return static_key_false(key); /* Out of line branch. */
-}
-
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
- return static_branch__##enabled(key); \
+ return static_key_##enabled(key); \
}
#include "features.h"
@@ -1196,7 +1189,7 @@ extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void init_sched_dl_class(void);
-extern void resched_task(struct task_struct *p);
+extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
@@ -1218,15 +1211,26 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
rq->nr_running = prev_nr + count;
-#ifdef CONFIG_NO_HZ_FULL
if (prev_nr < 2 && rq->nr_running >= 2) {
+#ifdef CONFIG_SMP
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+#endif
+
+#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(rq->cpu)) {
- /* Order rq->nr_running write against the IPI */
- smp_wmb();
- smp_send_reschedule(rq->cpu);
+ /*
+ * Tick is needed if more than one task runs on a CPU.
+ * Send the target an IPI to kick it out of nohz mode.
+ *
+ * We assume that IPI implies full memory barrier and the
+ * new value of rq->nr_running is visible on reception
+ * from the target.
+ */
+ tick_nohz_full_kick_cpu(rq->cpu);
}
- }
#endif
+ }
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 0ffa20ae657b..15cab1a4f84e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -319,14 +319,14 @@ EXPORT_SYMBOL(wake_bit_function);
*/
int __sched
__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
int ret = 0;
do {
prepare_to_wait(wq, &q->wait, mode);
if (test_bit(q->key.bit_nr, q->key.flags))
- ret = (*action)(q->key.flags);
+ ret = (*action)(&q->key);
} while (test_bit(q->key.bit_nr, q->key.flags) && !ret);
finish_wait(wq, &q->wait);
return ret;
@@ -334,7 +334,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit);
int __sched out_of_line_wait_on_bit(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit);
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
do {
int ret;
@@ -353,7 +353,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
prepare_to_wait_exclusive(wq, &q->wait, mode);
if (!test_bit(q->key.bit_nr, q->key.flags))
continue;
- ret = action(q->key.flags);
+ ret = action(&q->key);
if (!ret)
continue;
abort_exclusive_wait(wq, &q->wait, mode, &q->key);
@@ -365,7 +365,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
EXPORT_SYMBOL(__wait_on_bit_lock);
int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
- int (*action)(void *), unsigned mode)
+ wait_bit_action_f *action, unsigned mode)
{
wait_queue_head_t *wq = bit_waitqueue(word, bit);
DEFINE_WAIT_BIT(wait, word, bit);
@@ -502,3 +502,21 @@ void wake_up_atomic_t(atomic_t *p)
__wake_up_bit(atomic_t_waitqueue(p), p, WAIT_ATOMIC_T_BIT_NR);
}
EXPORT_SYMBOL(wake_up_atomic_t);
+
+__sched int bit_wait(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait);
+
+__sched int bit_wait_io(struct wait_bit_key *word)
+{
+ if (signal_pending_state(current->state, current))
+ return 1;
+ io_schedule();
+ return 0;
+}
+EXPORT_SYMBOL(bit_wait_io);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 301bbc24739c..25b0043f4755 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -18,15 +18,17 @@
#include <linux/compat.h>
#include <linux/sched.h>
#include <linux/seccomp.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
/* #define SECCOMP_DEBUG 1 */
#ifdef CONFIG_SECCOMP_FILTER
#include <asm/syscall.h>
#include <linux/filter.h>
+#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
-#include <linux/slab.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>
@@ -54,7 +56,7 @@
struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
- struct sk_filter *prog;
+ struct bpf_prog *prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -87,7 +89,7 @@ static void populate_seccomp_data(struct seccomp_data *sd)
* @filter: filter to verify
* @flen: length of filter
*
- * Takes a previously checked filter (by sk_chk_filter) and
+ * Takes a previously checked filter (by bpf_check_classic) and
* redirects all filter code that loads struct sk_buff data
* and related data through seccomp_bpf_load. It also
* enforces length and alignment checking of those loads.
@@ -172,51 +174,184 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(int syscall)
{
- struct seccomp_filter *f;
+ struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
struct seccomp_data sd;
u32 ret = SECCOMP_RET_ALLOW;
/* Ensure unexpected behavior doesn't result in failing open. */
- if (WARN_ON(current->seccomp.filter == NULL))
+ if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ smp_read_barrier_depends();
+
populate_seccomp_data(&sd);
/*
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
- for (f = current->seccomp.filter; f; f = f->prev) {
- u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
+ for (; f; f = f->prev) {
+ u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd);
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
return ret;
}
+#endif /* CONFIG_SECCOMP_FILTER */
+
+static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
+{
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
+ return false;
+
+ return true;
+}
+
+static inline void seccomp_assign_mode(struct task_struct *task,
+ unsigned long seccomp_mode)
+{
+ BUG_ON(!spin_is_locked(&task->sighand->siglock));
+
+ task->seccomp.mode = seccomp_mode;
+ /*
+ * Make sure TIF_SECCOMP cannot be set before the mode (and
+ * filter) is set.
+ */
+ smp_mb__before_atomic();
+ set_tsk_thread_flag(task, TIF_SECCOMP);
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/* Returns 1 if the parent is an ancestor of the child. */
+static int is_ancestor(struct seccomp_filter *parent,
+ struct seccomp_filter *child)
+{
+ /* NULL is the root ancestor. */
+ if (parent == NULL)
+ return 1;
+ for (; child; child = child->prev)
+ if (child == parent)
+ return 1;
+ return 0;
+}
/**
- * seccomp_attach_filter: Attaches a seccomp filter to current.
+ * seccomp_can_sync_threads: checks if all threads can be synchronized
+ *
+ * Expects sighand and cred_guard_mutex locks to be held.
+ *
+ * Returns 0 on success, -ve on error, or the pid of a thread which was
+ * either not in the correct seccomp mode or it did not have an ancestral
+ * seccomp filter.
+ */
+static inline pid_t seccomp_can_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ /* Validate all threads being eligible for synchronization. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ pid_t failed;
+
+ /* Skip current, since it is initiating the sync. */
+ if (thread == caller)
+ continue;
+
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
+ (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
+ is_ancestor(thread->seccomp.filter,
+ caller->seccomp.filter)))
+ continue;
+
+ /* Return the first thread that cannot be synchronized. */
+ failed = task_pid_vnr(thread);
+ /* If the pid cannot be resolved, then return -ESRCH */
+ if (unlikely(WARN_ON(failed == 0)))
+ failed = -ESRCH;
+ return failed;
+ }
+
+ return 0;
+}
+
+/**
+ * seccomp_sync_threads: sets all threads to use current's filter
+ *
+ * Expects sighand and cred_guard_mutex locks to be held, and for
+ * seccomp_can_sync_threads() to have returned success already
+ * without dropping the locks.
+ *
+ */
+static inline void seccomp_sync_threads(void)
+{
+ struct task_struct *thread, *caller;
+
+ BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ /* Synchronize all threads. */
+ caller = current;
+ for_each_thread(caller, thread) {
+ /* Skip current, since it needs no changes. */
+ if (thread == caller)
+ continue;
+
+ /* Get a task reference for the new leaf node. */
+ get_seccomp_filter(caller);
+ /*
+ * Drop the task reference to the shared ancestor since
+ * current's path will hold a reference. (This also
+ * allows a put before the assignment.)
+ */
+ put_seccomp_filter(thread);
+ smp_store_release(&thread->seccomp.filter,
+ caller->seccomp.filter);
+ /*
+ * Opt the other thread into seccomp if needed.
+ * As threads are considered to be trust-realm
+ * equivalent (see ptrace_may_access), it is safe to
+ * allow one thread to transition the other.
+ */
+ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) {
+ /*
+ * Don't let an unprivileged task work around
+ * the no_new_privs restriction by creating
+ * a thread that sets it up, enters seccomp,
+ * then dies.
+ */
+ if (task_no_new_privs(caller))
+ task_set_no_new_privs(thread);
+
+ seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
+ }
+ }
+}
+
+/**
+ * seccomp_prepare_filter: Prepares a seccomp filter for use.
* @fprog: BPF program to install
*
- * Returns 0 on success or an errno on failure.
+ * Returns filter on success or an ERR_PTR on failure.
*/
-static long seccomp_attach_filter(struct sock_fprog *fprog)
+static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
struct seccomp_filter *filter;
- unsigned long fp_size = fprog->len * sizeof(struct sock_filter);
- unsigned long total_insns = fprog->len;
+ unsigned long fp_size;
struct sock_filter *fp;
int new_len;
long ret;
if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
- return -EINVAL;
-
- for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
- if (total_insns > MAX_INSNS_PER_PATH)
- return -ENOMEM;
+ return ERR_PTR(-EINVAL);
+ BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));
+ fp_size = fprog->len * sizeof(struct sock_filter);
/*
* Installing a seccomp filter requires that the task has
@@ -224,14 +359,14 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
* This avoids scenarios where unprivileged tasks can affect the
* behavior of privileged children.
*/
- if (!current->no_new_privs &&
+ if (!task_no_new_privs(current) &&
security_capable_noaudit(current_cred(), current_user_ns(),
CAP_SYS_ADMIN) != 0)
- return -EACCES;
+ return ERR_PTR(-EACCES);
fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN);
if (!fp)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
/* Copy the instructions from fprog. */
ret = -EFAULT;
@@ -239,7 +374,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
goto free_prog;
/* Check and rewrite the fprog via the skb checker */
- ret = sk_chk_filter(fp, fprog->len);
+ ret = bpf_check_classic(fp, fprog->len);
if (ret)
goto free_prog;
@@ -248,8 +383,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (ret)
goto free_prog;
- /* Convert 'sock_filter' insns to 'sock_filter_int' insns */
- ret = sk_convert_filter(fp, fprog->len, NULL, &new_len);
+ /* Convert 'sock_filter' insns to 'bpf_insn' insns */
+ ret = bpf_convert_filter(fp, fprog->len, NULL, &new_len);
if (ret)
goto free_prog;
@@ -260,12 +395,12 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
if (!filter)
goto free_prog;
- filter->prog = kzalloc(sk_filter_size(new_len),
+ filter->prog = kzalloc(bpf_prog_size(new_len),
GFP_KERNEL|__GFP_NOWARN);
if (!filter->prog)
goto free_filter;
- ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
if (ret)
goto free_filter_prog;
kfree(fp);
@@ -273,15 +408,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
atomic_set(&filter->usage, 1);
filter->prog->len = new_len;
- sk_filter_select_runtime(filter->prog);
+ bpf_prog_select_runtime(filter->prog);
- /*
- * If there is an existing filter, make it the prev and don't drop its
- * task reference.
- */
- filter->prev = current->seccomp.filter;
- current->seccomp.filter = filter;
- return 0;
+ return filter;
free_filter_prog:
kfree(filter->prog);
@@ -289,19 +418,20 @@ free_filter:
kfree(filter);
free_prog:
kfree(fp);
- return ret;
+ return ERR_PTR(ret);
}
/**
- * seccomp_attach_user_filter - attaches a user-supplied sock_fprog
+ * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
* @user_filter: pointer to the user data containing a sock_fprog.
*
* Returns 0 on success and non-zero otherwise.
*/
-static long seccomp_attach_user_filter(char __user *user_filter)
+static struct seccomp_filter *
+seccomp_prepare_user_filter(const char __user *user_filter)
{
struct sock_fprog fprog;
- long ret = -EFAULT;
+ struct seccomp_filter *filter = ERR_PTR(-EFAULT);
#ifdef CONFIG_COMPAT
if (is_compat_task()) {
@@ -314,9 +444,56 @@ static long seccomp_attach_user_filter(char __user *user_filter)
#endif
if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
goto out;
- ret = seccomp_attach_filter(&fprog);
+ filter = seccomp_prepare_filter(&fprog);
out:
- return ret;
+ return filter;
+}
+
+/**
+ * seccomp_attach_filter: validate and attach filter
+ * @flags: flags to change filter behavior
+ * @filter: seccomp filter to add to the current process
+ *
+ * Caller must be holding current->sighand->siglock lock.
+ *
+ * Returns 0 on success, -ve on error.
+ */
+static long seccomp_attach_filter(unsigned int flags,
+ struct seccomp_filter *filter)
+{
+ unsigned long total_insns;
+ struct seccomp_filter *walker;
+
+ BUG_ON(!spin_is_locked(&current->sighand->siglock));
+
+ /* Validate resulting filter length. */
+ total_insns = filter->prog->len;
+ for (walker = current->seccomp.filter; walker; walker = walker->prev)
+ total_insns += walker->prog->len + 4; /* 4 instr penalty */
+ if (total_insns > MAX_INSNS_PER_PATH)
+ return -ENOMEM;
+
+ /* If thread sync has been requested, check that it is possible. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
+ int ret;
+
+ ret = seccomp_can_sync_threads();
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If there is an existing filter, make it the prev and don't drop its
+ * task reference.
+ */
+ filter->prev = current->seccomp.filter;
+ current->seccomp.filter = filter;
+
+ /* Now that the new filter is in place, synchronize to all threads. */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ seccomp_sync_threads();
+
+ return 0;
}
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
@@ -329,6 +506,14 @@ void get_seccomp_filter(struct task_struct *tsk)
atomic_inc(&orig->usage);
}
+static inline void seccomp_filter_free(struct seccomp_filter *filter)
+{
+ if (filter) {
+ bpf_prog_free(filter->prog);
+ kfree(filter);
+ }
+}
+
/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
@@ -337,8 +522,7 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
- sk_filter_free(freeme->prog);
- kfree(freeme);
+ seccomp_filter_free(freeme);
}
}
@@ -382,12 +566,17 @@ static int mode1_syscalls_32[] = {
int __secure_computing(int this_syscall)
{
- int mode = current->seccomp.mode;
int exit_sig = 0;
int *syscall;
u32 ret;
- switch (mode) {
+ /*
+ * Make sure that any changes to mode from another thread have
+ * been seen after TIF_SECCOMP was seen.
+ */
+ rmb();
+
+ switch (current->seccomp.mode) {
case SECCOMP_MODE_STRICT:
syscall = mode1_syscalls;
#ifdef CONFIG_COMPAT
@@ -473,47 +662,152 @@ long prctl_get_seccomp(void)
}
/**
- * prctl_set_seccomp: configures current->seccomp.mode
- * @seccomp_mode: requested mode to use
- * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ * seccomp_set_mode_strict: internal function for setting strict seccomp
*
- * This function may be called repeatedly with a @seccomp_mode of
- * SECCOMP_MODE_FILTER to install additional filters. Every filter
- * successfully installed will be evaluated (in reverse order) for each system
- * call the task makes.
+ * Once current->seccomp.mode is non-zero, it may not be changed.
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+static long seccomp_set_mode_strict(void)
+{
+ const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
+ long ret = -EINVAL;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+#ifdef TIF_NOTSC
+ disable_TSC();
+#endif
+ seccomp_assign_mode(current, seccomp_mode);
+ ret = 0;
+
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+
+ return ret;
+}
+
+#ifdef CONFIG_SECCOMP_FILTER
+/**
+ * seccomp_set_mode_filter: internal function for setting seccomp filter
+ * @flags: flags to change filter behavior
+ * @filter: struct sock_fprog containing filter
+ *
+ * This function may be called repeatedly to install additional filters.
+ * Every filter successfully installed will be evaluated (in reverse order)
+ * for each system call the task makes.
*
* Once current->seccomp.mode is non-zero, it may not be changed.
*
* Returns 0 on success or -EINVAL on failure.
*/
-long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+static long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
{
+ const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
+ struct seccomp_filter *prepared = NULL;
long ret = -EINVAL;
- if (current->seccomp.mode &&
- current->seccomp.mode != seccomp_mode)
+ /* Validate flags. */
+ if (flags & ~SECCOMP_FILTER_FLAG_MASK)
+ return -EINVAL;
+
+ /* Prepare the new filter before holding any locks. */
+ prepared = seccomp_prepare_user_filter(filter);
+ if (IS_ERR(prepared))
+ return PTR_ERR(prepared);
+
+ /*
+ * Make sure we cannot change seccomp or nnp state via TSYNC
+ * while another thread is in the middle of calling exec.
+ */
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
+ mutex_lock_killable(&current->signal->cred_guard_mutex))
+ goto out_free;
+
+ spin_lock_irq(&current->sighand->siglock);
+
+ if (!seccomp_may_assign_mode(seccomp_mode))
+ goto out;
+
+ ret = seccomp_attach_filter(flags, prepared);
+ if (ret)
goto out;
+ /* Do not free the successfully attached filter. */
+ prepared = NULL;
+
+ seccomp_assign_mode(current, seccomp_mode);
+out:
+ spin_unlock_irq(&current->sighand->siglock);
+ if (flags & SECCOMP_FILTER_FLAG_TSYNC)
+ mutex_unlock(&current->signal->cred_guard_mutex);
+out_free:
+ seccomp_filter_free(prepared);
+ return ret;
+}
+#else
+static inline long seccomp_set_mode_filter(unsigned int flags,
+ const char __user *filter)
+{
+ return -EINVAL;
+}
+#endif
+
+/* Common entry point for both prctl and syscall. */
+static long do_seccomp(unsigned int op, unsigned int flags,
+ const char __user *uargs)
+{
+ switch (op) {
+ case SECCOMP_SET_MODE_STRICT:
+ if (flags != 0 || uargs != NULL)
+ return -EINVAL;
+ return seccomp_set_mode_strict();
+ case SECCOMP_SET_MODE_FILTER:
+ return seccomp_set_mode_filter(flags, uargs);
+ default:
+ return -EINVAL;
+ }
+}
+
+SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
+ const char __user *, uargs)
+{
+ return do_seccomp(op, flags, uargs);
+}
+
+/**
+ * prctl_set_seccomp: configures current->seccomp.mode
+ * @seccomp_mode: requested mode to use
+ * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
+ *
+ * Returns 0 on success or -EINVAL on failure.
+ */
+long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
+{
+ unsigned int op;
+ char __user *uargs;
switch (seccomp_mode) {
case SECCOMP_MODE_STRICT:
- ret = 0;
-#ifdef TIF_NOTSC
- disable_TSC();
-#endif
+ op = SECCOMP_SET_MODE_STRICT;
+ /*
+ * Setting strict mode through prctl always ignored filter,
+ * so make sure it is always NULL here to pass the internal
+ * check in do_seccomp().
+ */
+ uargs = NULL;
break;
-#ifdef CONFIG_SECCOMP_FILTER
case SECCOMP_MODE_FILTER:
- ret = seccomp_attach_user_filter(filter);
- if (ret)
- goto out;
+ op = SECCOMP_SET_MODE_FILTER;
+ uargs = filter;
break;
-#endif
default:
- goto out;
+ return -EINVAL;
}
- current->seccomp.mode = seccomp_mode;
- set_thread_flag(TIF_SECCOMP);
-out:
- return ret;
+ /* prctl interface doesn't have flags, so they are always zero. */
+ return do_seccomp(op, 0, uargs);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index a4077e90f19f..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1263,6 +1263,10 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
struct sighand_struct *sighand;
for (;;) {
+ /*
+ * Disable interrupts early to avoid deadlocks.
+ * See rcu_read_unlock() comment header for details.
+ */
local_irq_save(*flags);
rcu_read_lock();
sighand = rcu_dereference(tsk->sighand);
@@ -2166,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
return signr;
}
-int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
- struct pt_regs *regs, void *cookie)
+int get_signal(struct ksignal *ksig)
{
struct sighand_struct *sighand = current->sighand;
struct signal_struct *signal = current->signal;
@@ -2237,13 +2240,13 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, info);
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
if (unlikely(current->ptrace) && signr != SIGKILL) {
- signr = ptrace_signal(signr, info);
+ signr = ptrace_signal(signr, &ksig->info);
if (!signr)
continue;
}
@@ -2251,13 +2254,13 @@ relock:
ka = &sighand->action[signr-1];
/* Trace actually delivered signals. */
- trace_signal_deliver(signr, info, ka);
+ trace_signal_deliver(signr, &ksig->info, ka);
if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
continue;
if (ka->sa.sa_handler != SIG_DFL) {
/* Run the handler. */
- *return_ka = *ka;
+ ksig->ka = *ka;
if (ka->sa.sa_flags & SA_ONESHOT)
ka->sa.sa_handler = SIG_DFL;
@@ -2307,7 +2310,7 @@ relock:
spin_lock_irq(&sighand->siglock);
}
- if (likely(do_signal_stop(info->si_signo))) {
+ if (likely(do_signal_stop(ksig->info.si_signo))) {
/* It released the siglock. */
goto relock;
}
@@ -2328,7 +2331,7 @@ relock:
if (sig_kernel_coredump(signr)) {
if (print_fatal_signals)
- print_fatal_signal(info->si_signo);
+ print_fatal_signal(ksig->info.si_signo);
proc_coredump_connector(current);
/*
* If it was able to dump core, this kills all
@@ -2338,34 +2341,32 @@ relock:
* first and our do_group_exit call below will use
* that value and ignore the one we pass it.
*/
- do_coredump(info);
+ do_coredump(&ksig->info);
}
/*
* Death signals, no core dump.
*/
- do_group_exit(info->si_signo);
+ do_group_exit(ksig->info.si_signo);
/* NOTREACHED */
}
spin_unlock_irq(&sighand->siglock);
- return signr;
+
+ ksig->sig = signr;
+ return ksig->sig > 0;
}
/**
* signal_delivered -
- * @sig: number of signal being delivered
- * @info: siginfo_t of signal being delivered
- * @ka: sigaction setting that chose the handler
- * @regs: user register state
+ * @ksig: kernel signal struct
* @stepping: nonzero if debugger single-step or block-step in use
*
* This function should be called when a signal has successfully been
- * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask
+ * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
* is always blocked, and the signal itself is blocked unless %SA_NODEFER
- * is set in @ka->sa.sa_flags. Tracing is notified.
+ * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
*/
-void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
- struct pt_regs *regs, int stepping)
+static void signal_delivered(struct ksignal *ksig, int stepping)
{
sigset_t blocked;
@@ -2375,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
simply clear the restore sigmask flag. */
clear_restore_sigmask();
- sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
- if (!(ka->sa.sa_flags & SA_NODEFER))
- sigaddset(&blocked, sig);
+ sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
+ if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
+ sigaddset(&blocked, ksig->sig);
set_current_blocked(&blocked);
- tracehook_signal_handler(sig, info, ka, regs, stepping);
+ tracehook_signal_handler(stepping);
}
void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2387,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
if (failed)
force_sigsegv(ksig->sig, current);
else
- signal_delivered(ksig->sig, &ksig->info, &ksig->ka,
- signal_pt_regs(), stepping);
+ signal_delivered(ksig, stepping);
}
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 80c33f8de14f..aff8aa14f547 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -3,6 +3,7 @@
*
* (C) Jens Axboe <jens.axboe@oracle.com> 2008
*/
+#include <linux/irq_work.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/kernel.h>
@@ -251,6 +252,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
csd->func(csd->info);
csd_unlock(csd);
}
+
+ /*
+ * Handle irq works queued remotely by irq_work_queue_on().
+ * Smp functions above are typically synchronous so they
+ * better run first since some other CPUs may be busy waiting
+ * for them.
+ */
+ irq_work_run();
}
/*
@@ -661,7 +670,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
if (cond_func(cpu, info)) {
ret = smp_call_function_single(cpu, func,
info, wait);
- WARN_ON_ONCE(!ret);
+ WARN_ON_ONCE(ret);
}
preempt_enable();
}
diff --git a/kernel/sys.c b/kernel/sys.c
index 66a751ebf9d9..ce8129192a26 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1990,12 +1990,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (arg2 != 1 || arg3 || arg4 || arg5)
return -EINVAL;
- current->no_new_privs = 1;
+ task_set_no_new_privs(current);
break;
case PR_GET_NO_NEW_PRIVS:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- return current->no_new_privs ? 1 : 0;
+ return task_no_new_privs(current) ? 1 : 0;
case PR_GET_THP_DISABLE:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 36441b51b5df..391d4ddb6f4b 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
cond_syscall(sys_kexec_load);
cond_syscall(compat_sys_kexec_load);
+cond_syscall(sys_kexec_file_load);
cond_syscall(sys_init_module);
cond_syscall(sys_finit_module);
cond_syscall(sys_delete_module);
@@ -197,6 +198,7 @@ cond_syscall(compat_sys_timerfd_settime);
cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
+cond_syscall(sys_memfd_create);
/* performance counters: */
cond_syscall(sys_perf_event_open);
@@ -213,3 +215,6 @@ cond_syscall(compat_sys_open_by_handle_at);
/* compare kernel pointers */
cond_syscall(sys_kcmp);
+
+/* operate on Secure Computing state */
+cond_syscall(sys_seccomp);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..75875a741b5e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#ifdef CONFIG_NUMA
{
@@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = &hugetlb_mempolicy_sysctl_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
@@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = hugetlb_overcommit_handler,
- .extra1 = (void *)&hugetlb_zero,
- .extra2 = (void *)&hugetlb_infinity,
+ .extra1 = &zero,
},
#endif
{
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 653cbbd9e7ad..e4ba9a5a5ccb 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -522,6 +522,7 @@ static const struct bin_table bin_net_ipv6_conf_var_table[] = {
{ CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" },
{ CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" },
{ CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" },
+ { CTL_INT, NET_IPV6_ACCEPT_RA_FROM_LOCAL, "accept_ra_from_local" },
{}
};
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
index 52ebc70263f4..875f64e8935b 100644
--- a/kernel/system_keyring.c
+++ b/kernel/system_keyring.c
@@ -89,6 +89,7 @@ static __init int load_system_certificate_list(void)
pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
PTR_ERR(key));
} else {
+ set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
pr_notice("Loaded X.509 cert '%s'\n",
key_ref_to_ptr(key)->description);
key_ref_put(key);
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
* the GNU General Public License for more details.
*/
+#define pr_fmt(fmt) "Kprobe smoke test: " fmt
+
#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler\n");
+ pr_err("incorrect value in post_handler\n");
}
posth_val = preh_val + div_factor;
}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
ret = register_kprobe(&kp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobe returned %d\n", ret);
+ pr_err("register_kprobe returned %d\n", ret);
return ret;
}
@@ -68,14 +68,12 @@ static int test_kprobe(void)
unregister_kprobe(&kp);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
{
if (preh_val != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in post_handler2\n");
+ pr_err("incorrect value in post_handler2\n");
}
posth_val = preh_val + div_factor;
}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
kp.flags = 0;
ret = register_kprobes(kps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kprobes returned %d\n", ret);
+ pr_err("register_kprobes returned %d\n", ret);
return ret;
}
@@ -130,14 +126,12 @@ static int test_kprobes(void)
ret = target(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler not called\n");
+ pr_err("kprobe pre_handler not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler not called\n");
+ pr_err("kprobe post_handler not called\n");
handler_errors++;
}
@@ -146,14 +140,12 @@ static int test_kprobes(void)
ret = target2(rand1);
if (preh_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe pre_handler2 not called\n");
+ pr_err("kprobe pre_handler2 not called\n");
handler_errors++;
}
if (posth_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kprobe post_handler2 not called\n");
+ pr_err("kprobe post_handler2 not called\n");
handler_errors++;
}
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
{
if (value != rand1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in jprobe handler\n");
+ pr_err("incorrect value in jprobe handler\n");
}
jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
ret = register_jprobe(&jp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobe returned %d\n", ret);
+ pr_err("register_jprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_jprobe(&jp);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
@@ -217,24 +206,21 @@ static int test_jprobes(void)
jp.kp.flags = 0;
ret = register_jprobes(jps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_jprobes returned %d\n", ret);
+ pr_err("register_jprobes returned %d\n", ret);
return ret;
}
jph_val = 0;
ret = target(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler not called\n");
+ pr_err("jprobe handler not called\n");
handler_errors++;
}
jph_val = 0;
ret = target2(rand1);
if (jph_val == 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "jprobe handler2 not called\n");
+ pr_err("jprobe handler2 not called\n");
handler_errors++;
}
unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor)) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler\n");
+ pr_err("incorrect value in kretprobe handler\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
ret = register_kretprobe(&rp);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
ret = target(rand1);
unregister_kretprobe(&rp);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
if (ret != (rand1 / div_factor) + 1) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "incorrect value in kretprobe handler2\n");
+ pr_err("incorrect value in kretprobe handler2\n");
}
if (krph_val == 0) {
handler_errors++;
- printk(KERN_ERR "Kprobe smoke test failed: "
- "call to kretprobe entry handler failed\n");
+ pr_err("call to kretprobe entry handler failed\n");
}
krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
rp.kp.flags = 0;
ret = register_kretprobes(rps, 2);
if (ret < 0) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "register_kretprobe returned %d\n", ret);
+ pr_err("register_kretprobe returned %d\n", ret);
return ret;
}
krph_val = 0;
ret = target(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler not called\n");
+ pr_err("kretprobe handler not called\n");
handler_errors++;
}
krph_val = 0;
ret = target2(rand1);
if (krph_val != rand1) {
- printk(KERN_ERR "Kprobe smoke test failed: "
- "kretprobe handler2 not called\n");
+ pr_err("kretprobe handler2 not called\n");
handler_errors++;
}
unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
rand1 = prandom_u32();
} while (rand1 <= div_factor);
- printk(KERN_INFO "Kprobe smoke test started\n");
+ pr_info("started\n");
num_tests++;
ret = test_kprobe();
if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
#endif /* CONFIG_KRETPROBES */
if (errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
- "%d tests failed\n", errors, num_tests);
+ pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
else if (handler_errors)
- printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
- "running handlers\n", handler_errors);
+ pr_err("BUG: %d error(s) running handlers\n", handler_errors);
else
- printk(KERN_INFO "Kprobe smoke test passed successfully\n");
+ pr_info("passed successfully\n");
return 0;
}
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index feccfd888732..d626dc98e8df 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -12,6 +12,11 @@ config CLOCKSOURCE_WATCHDOG
config ARCH_CLOCKSOURCE_DATA
bool
+# Clocksources require validation of the clocksource against the last
+# cycle update - x86/TSC misfeature
+config CLOCKSOURCE_VALIDATE_LAST_CYCLE
+ bool
+
# Timekeeping vsyscall support
config GENERIC_TIME_VSYSCALL
bool
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fe75444ae7ec..4aec4a457431 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -71,7 +71,7 @@ struct rtc_device *alarmtimer_get_rtcdev(void)
return ret;
}
-
+EXPORT_SYMBOL_GPL(alarmtimer_get_rtcdev);
static int alarmtimer_rtc_add_device(struct device *dev,
struct class_interface *class_intf)
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ba3e502c955a..2e949cc9c9f1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -32,6 +32,7 @@
#include <linux/kthread.h>
#include "tick-internal.h"
+#include "timekeeping_internal.h"
void timecounter_init(struct timecounter *tc,
const struct cyclecounter *cc,
@@ -249,7 +250,7 @@ void clocksource_mark_unstable(struct clocksource *cs)
static void clocksource_watchdog(unsigned long data)
{
struct clocksource *cs;
- cycle_t csnow, wdnow;
+ cycle_t csnow, wdnow, delta;
int64_t wd_nsec, cs_nsec;
int next_cpu, reset_pending;
@@ -282,11 +283,12 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
- wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
- watchdog->mult, watchdog->shift);
+ delta = clocksource_delta(wdnow, cs->wd_last, watchdog->mask);
+ wd_nsec = clocksource_cyc2ns(delta, watchdog->mult,
+ watchdog->shift);
- cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
- cs->mask, cs->mult, cs->shift);
+ delta = clocksource_delta(csnow, cs->cs_last, cs->mask);
+ cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift);
cs->cs_last = csnow;
cs->wd_last = wdnow;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 6558b7ac112d..99aa6ee3908f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -154,6 +154,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
+cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static bool can_stop_full_tick(void)
@@ -224,13 +225,15 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
};
/*
- * Kick the current CPU if it's full dynticks in order to force it to
+ * Kick the CPU if it's full dynticks in order to force it to
* re-evaluate its dependency on the tick and restart it if necessary.
*/
-void tick_nohz_full_kick(void)
+void tick_nohz_full_kick_cpu(int cpu)
{
- if (tick_nohz_full_cpu(smp_processor_id()))
- irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+
+ irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
static void nohz_full_kick_ipi(void *info)
@@ -281,6 +284,7 @@ static int __init tick_nohz_full_setup(char *str)
int cpu;
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
+ alloc_bootmem_cpumask_var(&housekeeping_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
return 1;
@@ -291,6 +295,8 @@ static int __init tick_nohz_full_setup(char *str)
pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
tick_nohz_full_running = true;
return 1;
@@ -332,9 +338,15 @@ static int tick_nohz_init_all(void)
pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ return err;
+ }
err = 0;
cpumask_setall(tick_nohz_full_mask);
cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
+ cpumask_clear(housekeeping_mask);
+ cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
tick_nohz_full_running = true;
#endif
return err;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f7378eaebe67..f36b02838a47 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -44,6 +44,22 @@ static struct {
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
+/**
+ * struct tk_fast - NMI safe timekeeper
+ * @seq: Sequence counter for protecting updates. The lowest bit
+ * is the index for the tk_read_base array
+ * @base: tk_read_base array. Access is indexed by the lowest bit of
+ * @seq.
+ *
+ * See @update_fast_timekeeper() below.
+ */
+struct tk_fast {
+ seqcount_t seq;
+ struct tk_read_base base[2];
+};
+
+static struct tk_fast tk_fast_mono ____cacheline_aligned;
+
/* flag for if timekeeping is suspended */
int __read_mostly timekeeping_suspended;
@@ -52,8 +68,8 @@ bool __read_mostly persistent_clock_exist = false;
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
- while (tk->xtime_nsec >= ((u64)NSEC_PER_SEC << tk->shift)) {
- tk->xtime_nsec -= (u64)NSEC_PER_SEC << tk->shift;
+ while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
+ tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
tk->xtime_sec++;
}
}
@@ -63,20 +79,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
struct timespec64 ts;
ts.tv_sec = tk->xtime_sec;
- ts.tv_nsec = (long)(tk->xtime_nsec >> tk->shift);
+ ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
return ts;
}
static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec = ts->tv_sec;
- tk->xtime_nsec = (u64)ts->tv_nsec << tk->shift;
+ tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
}
static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
{
tk->xtime_sec += ts->tv_sec;
- tk->xtime_nsec += (u64)ts->tv_nsec << tk->shift;
+ tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
tk_normalize_xtime(tk);
}
@@ -97,13 +113,9 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec64 wtm)
tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
-static void tk_set_sleep_time(struct timekeeper *tk, struct timespec64 t)
+static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
{
- /* Verify consistency before modifying */
- WARN_ON_ONCE(tk->offs_boot.tv64 != timespec64_to_ktime(tk->total_sleep_time).tv64);
-
- tk->total_sleep_time = t;
- tk->offs_boot = timespec64_to_ktime(t);
+ tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
/**
@@ -123,9 +135,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
- old_clock = tk->clock;
- tk->clock = clock;
- tk->cycle_last = clock->cycle_last = clock->read(clock);
+ old_clock = tk->tkr.clock;
+ tk->tkr.clock = clock;
+ tk->tkr.read = clock->read;
+ tk->tkr.mask = clock->mask;
+ tk->tkr.cycle_last = tk->tkr.read(clock);
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
@@ -149,21 +163,23 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
if (shift_change < 0)
- tk->xtime_nsec >>= -shift_change;
+ tk->tkr.xtime_nsec >>= -shift_change;
else
- tk->xtime_nsec <<= shift_change;
+ tk->tkr.xtime_nsec <<= shift_change;
}
- tk->shift = clock->shift;
+ tk->tkr.shift = clock->shift;
tk->ntp_error = 0;
tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
+ tk->ntp_tick = ntpinterval << tk->ntp_error_shift;
/*
* The timekeeper keeps its own mult values for the currently
* active clocksource. These value will be adjusted via NTP
* to counteract clock drifting.
*/
- tk->mult = clock->mult;
+ tk->tkr.mult = clock->mult;
+ tk->ntp_err_mult = 0;
}
/* Timekeeper helper functions. */
@@ -175,21 +191,19 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
-static inline s64 timekeeping_get_ns(struct timekeeper *tk)
+static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ cycle_t cycle_now, delta;
s64 nsec;
/* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ cycle_now = tkr->read(tkr->clock);
/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
- nsec = cycle_delta * tk->mult + tk->xtime_nsec;
- nsec >>= tk->shift;
+ nsec = delta * tkr->mult + tkr->xtime_nsec;
+ nsec >>= tkr->shift;
/* If arch requires, add in get_arch_timeoffset() */
return nsec + arch_gettimeoffset();
@@ -197,32 +211,138 @@ static inline s64 timekeeping_get_ns(struct timekeeper *tk)
static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ struct clocksource *clock = tk->tkr.clock;
+ cycle_t cycle_now, delta;
s64 nsec;
/* read clocksource: */
- clock = tk->clock;
- cycle_now = clock->read(clock);
+ cycle_now = tk->tkr.read(clock);
/* calculate the delta since the last update_wall_time: */
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
/* convert delta to nanoseconds. */
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
/* If arch requires, add in get_arch_timeoffset() */
return nsec + arch_gettimeoffset();
}
+/**
+ * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
+ * @tk: The timekeeper from which we take the update
+ * @tkf: The fast timekeeper to update
+ * @tbase: The time base for the fast timekeeper (mono/raw)
+ *
+ * We want to use this from any context including NMI and tracing /
+ * instrumenting the timekeeping code itself.
+ *
+ * So we handle this differently than the other timekeeping accessor
+ * functions which retry when the sequence count has changed. The
+ * update side does:
+ *
+ * smp_wmb(); <- Ensure that the last base[1] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[0], tk);
+ * smp_wmb(); <- Ensure that the base[0] update is visible
+ * tkf->seq++;
+ * smp_wmb(); <- Ensure that the seqcount update is visible
+ * update(tkf->base[1], tk);
+ *
+ * The reader side does:
+ *
+ * do {
+ * seq = tkf->seq;
+ * smp_rmb();
+ * idx = seq & 0x01;
+ * now = now(tkf->base[idx]);
+ * smp_rmb();
+ * } while (seq != tkf->seq)
+ *
+ * As long as we update base[0] readers are forced off to
+ * base[1]. Once base[0] is updated readers are redirected to base[0]
+ * and the base[1] update takes place.
+ *
+ * So if a NMI hits the update of base[0] then it will use base[1]
+ * which is still consistent. In the worst case this can result is a
+ * slightly wrong timestamp (a few nanoseconds). See
+ * @ktime_get_mono_fast_ns.
+ */
+static void update_fast_timekeeper(struct timekeeper *tk)
+{
+ struct tk_read_base *base = tk_fast_mono.base;
+
+ /* Force readers off to base[1] */
+ raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+ /* Update base[0] */
+ memcpy(base, &tk->tkr, sizeof(*base));
+
+ /* Force readers back to base[0] */
+ raw_write_seqcount_latch(&tk_fast_mono.seq);
+
+ /* Update base[1] */
+ memcpy(base + 1, base, sizeof(*base));
+}
+
+/**
+ * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic
+ *
+ * This timestamp is not guaranteed to be monotonic across an update.
+ * The timestamp is calculated by:
+ *
+ * now = base_mono + clock_delta * slope
+ *
+ * So if the update lowers the slope, readers who are forced to the
+ * not yet updated second array are still using the old steeper slope.
+ *
+ * tmono
+ * ^
+ * | o n
+ * | o n
+ * | u
+ * | o
+ * |o
+ * |12345678---> reader order
+ *
+ * o = old slope
+ * u = update
+ * n = new slope
+ *
+ * So reader 6 will observe time going backwards versus reader 5.
+ *
+ * While other CPUs are likely to be able observe that, the only way
+ * for a CPU local observation is when an NMI hits in the middle of
+ * the update. Timestamps taken from that NMI context might be ahead
+ * of the following timestamps. Callers need to be aware of that and
+ * deal with it.
+ */
+u64 notrace ktime_get_mono_fast_ns(void)
+{
+ struct tk_read_base *tkr;
+ unsigned int seq;
+ u64 now;
+
+ do {
+ seq = raw_read_seqcount(&tk_fast_mono.seq);
+ tkr = tk_fast_mono.base + (seq & 0x01);
+ now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+
+ } while (read_seqcount_retry(&tk_fast_mono.seq, seq));
+ return now;
+}
+EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
+
#ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD
static inline void update_vsyscall(struct timekeeper *tk)
{
struct timespec xt;
- xt = tk_xtime(tk);
- update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->clock, tk->mult);
+ xt = timespec64_to_timespec(tk_xtime(tk));
+ update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult,
+ tk->tkr.cycle_last);
}
static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -239,11 +359,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
* users are removed, this can be killed.
*/
- remainder = tk->xtime_nsec & ((1ULL << tk->shift) - 1);
- tk->xtime_nsec -= remainder;
- tk->xtime_nsec += 1ULL << tk->shift;
+ remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
+ tk->tkr.xtime_nsec -= remainder;
+ tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
- tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
+ tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
@@ -308,7 +428,10 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
nsec = (s64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
nsec *= NSEC_PER_SEC;
nsec += tk->wall_to_monotonic.tv_nsec;
- tk->base_mono = ns_to_ktime(nsec);
+ tk->tkr.base_mono = ns_to_ktime(nsec);
+
+ /* Update the monotonic raw base */
+ tk->base_raw = timespec64_to_ktime(tk->raw_time);
}
/* must hold timekeeper_lock */
@@ -326,6 +449,8 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
if (action & TK_MIRROR)
memcpy(&shadow_timekeeper, &tk_core.timekeeper,
sizeof(tk_core.timekeeper));
+
+ update_fast_timekeeper(tk);
}
/**
@@ -337,23 +462,22 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- cycle_t cycle_now, cycle_delta;
- struct clocksource *clock;
+ struct clocksource *clock = tk->tkr.clock;
+ cycle_t cycle_now, delta;
s64 nsec;
- clock = tk->clock;
- cycle_now = clock->read(clock);
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
- tk->cycle_last = clock->cycle_last = cycle_now;
+ cycle_now = tk->tkr.read(clock);
+ delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
+ tk->tkr.cycle_last = cycle_now;
- tk->xtime_nsec += cycle_delta * tk->mult;
+ tk->tkr.xtime_nsec += delta * tk->tkr.mult;
/* If arch requires, add in get_arch_timeoffset() */
- tk->xtime_nsec += (u64)arch_gettimeoffset() << tk->shift;
+ tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
+ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
timespec64_add_ns(&tk->raw_time, nsec);
}
@@ -374,7 +498,7 @@ int __getnstimeofday64(struct timespec64 *ts)
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsecs = timekeeping_get_ns(tk);
+ nsecs = timekeeping_get_ns(&tk->tkr);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -414,8 +538,8 @@ ktime_t ktime_get(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_mono;
- nsecs = timekeeping_get_ns(tk);
+ base = tk->tkr.base_mono;
+ nsecs = timekeeping_get_ns(&tk->tkr);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -440,8 +564,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = ktime_add(tk->base_mono, *offset);
- nsecs = timekeeping_get_ns(tk);
+ base = ktime_add(tk->tkr.base_mono, *offset);
+ nsecs = timekeeping_get_ns(&tk->tkr);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -471,6 +595,27 @@ ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs)
EXPORT_SYMBOL_GPL(ktime_mono_to_any);
/**
+ * ktime_get_raw - Returns the raw monotonic time in ktime_t format
+ */
+ktime_t ktime_get_raw(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ ktime_t base;
+ s64 nsecs;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ base = tk->base_raw;
+ nsecs = timekeeping_get_ns_raw(tk);
+
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return ktime_add_ns(base, nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_raw);
+
+/**
* ktime_get_ts64 - get the monotonic clock in timespec64 format
* @ts: pointer to timespec variable
*
@@ -490,7 +635,7 @@ void ktime_get_ts64(struct timespec64 *ts)
do {
seq = read_seqcount_begin(&tk_core.seq);
ts->tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
+ nsec = timekeeping_get_ns(&tk->tkr);
tomono = tk->wall_to_monotonic;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -501,37 +646,6 @@ void ktime_get_ts64(struct timespec64 *ts)
}
EXPORT_SYMBOL_GPL(ktime_get_ts64);
-
-/**
- * timekeeping_clocktai - Returns the TAI time of day in a timespec
- * @ts: pointer to the timespec to be set
- *
- * Returns the time of day in a timespec.
- */
-void timekeeping_clocktai(struct timespec *ts)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
- unsigned long seq;
- u64 nsecs;
-
- WARN_ON(timekeeping_suspended);
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
-
- ts64.tv_sec = tk->xtime_sec + tk->tai_offset;
- nsecs = timekeeping_get_ns(tk);
-
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- ts64.tv_nsec = 0;
- timespec64_add_ns(&ts64, nsecs);
- *ts = timespec64_to_timespec(ts64);
-
-}
-EXPORT_SYMBOL(timekeeping_clocktai);
-
#ifdef CONFIG_NTP_PPS
/**
@@ -559,7 +673,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
ts_real->tv_nsec = 0;
nsecs_raw = timekeeping_get_ns_raw(tk);
- nsecs_real = timekeeping_get_ns(tk);
+ nsecs_real = timekeeping_get_ns(&tk->tkr);
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -743,7 +857,7 @@ static int change_clocksource(void *data)
*/
if (try_module_get(new->owner)) {
if (!new->enable || new->enable(new) == 0) {
- old = tk->clock;
+ old = tk->tkr.clock;
tk_setup_internals(tk, new);
if (old->disable)
old->disable(old);
@@ -771,11 +885,11 @@ int timekeeping_notify(struct clocksource *clock)
{
struct timekeeper *tk = &tk_core.timekeeper;
- if (tk->clock == clock)
+ if (tk->tkr.clock == clock)
return 0;
stop_machine(change_clocksource, clock, NULL);
tick_clock_notify();
- return tk->clock == clock ? 0 : -1;
+ return tk->tkr.clock == clock ? 0 : -1;
}
/**
@@ -815,7 +929,7 @@ int timekeeping_valid_for_hres(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+ ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -834,7 +948,7 @@ u64 timekeeping_max_deferment(void)
do {
seq = read_seqcount_begin(&tk_core.seq);
- ret = tk->clock->max_idle_ns;
+ ret = tk->tkr.clock->max_idle_ns;
} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -913,16 +1027,13 @@ void __init timekeeping_init(void)
tk_set_xtime(tk, &now);
tk->raw_time.tv_sec = 0;
tk->raw_time.tv_nsec = 0;
+ tk->base_raw.tv64 = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
set_normalized_timespec64(&tmp, -boot.tv_sec, -boot.tv_nsec);
tk_set_wall_to_mono(tk, tmp);
- tmp.tv_sec = 0;
- tmp.tv_nsec = 0;
- tk_set_sleep_time(tk, tmp);
-
timekeeping_update(tk, TK_MIRROR);
write_seqcount_end(&tk_core.seq);
@@ -950,7 +1061,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
tk_xtime_add(tk, delta);
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, *delta));
- tk_set_sleep_time(tk, timespec64_add(tk->total_sleep_time, *delta));
+ tk_update_sleep_time(tk, timespec64_to_ktime(*delta));
tk_debug_account_sleep_time(delta);
}
@@ -1004,7 +1115,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
static void timekeeping_resume(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct clocksource *clock = tk->clock;
+ struct clocksource *clock = tk->tkr.clock;
unsigned long flags;
struct timespec64 ts_new, ts_delta;
struct timespec tmp;
@@ -1032,15 +1143,16 @@ static void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = clock->read(clock);
+ cycle_now = tk->tkr.read(clock);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
- cycle_now > clock->cycle_last) {
+ cycle_now > tk->tkr.cycle_last) {
u64 num, max = ULLONG_MAX;
u32 mult = clock->mult;
u32 shift = clock->shift;
s64 nsec = 0;
- cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+ cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
+ tk->tkr.mask);
/*
* "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1066,7 +1178,7 @@ static void timekeeping_resume(void)
__timekeeping_inject_sleeptime(tk, &ts_delta);
/* Re-base the last cycle value */
- tk->cycle_last = clock->cycle_last = cycle_now;
+ tk->tkr.cycle_last = cycle_now;
tk->ntp_error = 0;
timekeeping_suspended = 0;
timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1147,125 +1259,34 @@ static int __init timekeeping_init_ops(void)
register_syscore_ops(&timekeeping_syscore_ops);
return 0;
}
-
device_initcall(timekeeping_init_ops);
/*
- * If the error is already larger, we look ahead even further
- * to compensate for late or lost adjustments.
- */
-static __always_inline int timekeeping_bigadjust(struct timekeeper *tk,
- s64 error, s64 *interval,
- s64 *offset)
-{
- s64 tick_error, i;
- u32 look_ahead, adj;
- s32 error2, mult;
-
- /*
- * Use the current error value to determine how much to look ahead.
- * The larger the error the slower we adjust for it to avoid problems
- * with losing too many ticks, otherwise we would overadjust and
- * produce an even larger error. The smaller the adjustment the
- * faster we try to adjust for it, as lost ticks can do less harm
- * here. This is tuned so that an error of about 1 msec is adjusted
- * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
- */
- error2 = tk->ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
- error2 = abs(error2);
- for (look_ahead = 0; error2 > 0; look_ahead++)
- error2 >>= 2;
-
- /*
- * Now calculate the error in (1 << look_ahead) ticks, but first
- * remove the single look ahead already included in the error.
- */
- tick_error = ntp_tick_length() >> (tk->ntp_error_shift + 1);
- tick_error -= tk->xtime_interval >> 1;
- error = ((error - tick_error) >> look_ahead) + tick_error;
-
- /* Finally calculate the adjustment shift value. */
- i = *interval;
- mult = 1;
- if (error < 0) {
- error = -error;
- *interval = -*interval;
- *offset = -*offset;
- mult = -1;
- }
- for (adj = 0; error > i; adj++)
- error >>= 1;
-
- *interval <<= adj;
- *offset <<= adj;
- return mult << adj;
-}
-
-/*
- * Adjust the multiplier to reduce the error value,
- * this is optimized for the most common adjustments of -1,0,1,
- * for other values we can do a bit more work.
+ * Apply a multiplier adjustment to the timekeeper
*/
-static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
+static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
+ s64 offset,
+ bool negative,
+ int adj_scale)
{
- s64 error, interval = tk->cycle_interval;
- int adj;
+ s64 interval = tk->cycle_interval;
+ s32 mult_adj = 1;
- /*
- * The point of this is to check if the error is greater than half
- * an interval.
- *
- * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
- *
- * Note we subtract one in the shift, so that error is really error*2.
- * This "saves" dividing(shifting) interval twice, but keeps the
- * (error > interval) comparison as still measuring if error is
- * larger than half an interval.
- *
- * Note: It does not "save" on aggravation when reading the code.
- */
- error = tk->ntp_error >> (tk->ntp_error_shift - 1);
- if (error > interval) {
- /*
- * We now divide error by 4(via shift), which checks if
- * the error is greater than twice the interval.
- * If it is greater, we need a bigadjust, if its smaller,
- * we can adjust by 1.
- */
- error >>= 2;
- if (likely(error <= interval))
- adj = 1;
- else
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- } else {
- if (error < -interval) {
- /* See comment above, this is just switched for the negative */
- error >>= 2;
- if (likely(error >= -interval)) {
- adj = -1;
- interval = -interval;
- offset = -offset;
- } else {
- adj = timekeeping_bigadjust(tk, error, &interval, &offset);
- }
- } else {
- goto out_adjust;
- }
+ if (negative) {
+ mult_adj = -mult_adj;
+ interval = -interval;
+ offset = -offset;
}
+ mult_adj <<= adj_scale;
+ interval <<= adj_scale;
+ offset <<= adj_scale;
- if (unlikely(tk->clock->maxadj &&
- (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) {
- printk_deferred_once(KERN_WARNING
- "Adjusting %s more than 11%% (%ld vs %ld)\n",
- tk->clock->name, (long)tk->mult + adj,
- (long)tk->clock->mult + tk->clock->maxadj);
- }
/*
* So the following can be confusing.
*
- * To keep things simple, lets assume adj == 1 for now.
+ * To keep things simple, lets assume mult_adj == 1 for now.
*
- * When adj != 1, remember that the interval and offset values
+ * When mult_adj != 1, remember that the interval and offset values
* have been appropriately scaled so the math is the same.
*
* The basic idea here is that we're increasing the multiplier
@@ -1309,12 +1330,78 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
*
* XXX - TODO: Doc ntp_error calculation.
*/
- tk->mult += adj;
+ tk->tkr.mult += mult_adj;
tk->xtime_interval += interval;
- tk->xtime_nsec -= offset;
+ tk->tkr.xtime_nsec -= offset;
tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
+}
+
+/*
+ * Calculate the multiplier adjustment needed to match the frequency
+ * specified by NTP
+ */
+static __always_inline void timekeeping_freqadjust(struct timekeeper *tk,
+ s64 offset)
+{
+ s64 interval = tk->cycle_interval;
+ s64 xinterval = tk->xtime_interval;
+ s64 tick_error;
+ bool negative;
+ u32 adj;
+
+ /* Remove any current error adj from freq calculation */
+ if (tk->ntp_err_mult)
+ xinterval -= tk->cycle_interval;
+
+ tk->ntp_tick = ntp_tick_length();
+
+ /* Calculate current error per tick */
+ tick_error = ntp_tick_length() >> tk->ntp_error_shift;
+ tick_error -= (xinterval + tk->xtime_remainder);
+
+ /* Don't worry about correcting it if its small */
+ if (likely((tick_error >= 0) && (tick_error <= interval)))
+ return;
+
+ /* preserve the direction of correction */
+ negative = (tick_error < 0);
+
+ /* Sort out the magnitude of the correction */
+ tick_error = abs(tick_error);
+ for (adj = 0; tick_error > interval; adj++)
+ tick_error >>= 1;
+
+ /* scale the corrections */
+ timekeeping_apply_adjustment(tk, offset, negative, adj);
+}
+
+/*
+ * Adjust the timekeeper's multiplier to the correct frequency
+ * and also to reduce the accumulated error value.
+ */
+static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
+{
+ /* Correct for the current frequency error */
+ timekeeping_freqadjust(tk, offset);
+
+ /* Next make a small adjustment to fix any cumulative error */
+ if (!tk->ntp_err_mult && (tk->ntp_error > 0)) {
+ tk->ntp_err_mult = 1;
+ timekeeping_apply_adjustment(tk, offset, 0, 0);
+ } else if (tk->ntp_err_mult && (tk->ntp_error <= 0)) {
+ /* Undo any existing error adjustment */
+ timekeeping_apply_adjustment(tk, offset, 1, 0);
+ tk->ntp_err_mult = 0;
+ }
+
+ if (unlikely(tk->tkr.clock->maxadj &&
+ (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
+ tk->tkr.clock->name, (long)tk->tkr.mult,
+ (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+ }
-out_adjust:
/*
* It may be possible that when we entered this function, xtime_nsec
* was very small. Further, if we're slightly speeding the clocksource
@@ -1329,12 +1416,11 @@ out_adjust:
* We'll correct this error next time through this function, when
* xtime_nsec is not as small.
*/
- if (unlikely((s64)tk->xtime_nsec < 0)) {
- s64 neg = -(s64)tk->xtime_nsec;
- tk->xtime_nsec = 0;
+ if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
+ s64 neg = -(s64)tk->tkr.xtime_nsec;
+ tk->tkr.xtime_nsec = 0;
tk->ntp_error += neg << tk->ntp_error_shift;
}
-
}
/**
@@ -1347,13 +1433,13 @@ out_adjust:
*/
static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
- u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+ u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
unsigned int clock_set = 0;
- while (tk->xtime_nsec >= nsecps) {
+ while (tk->tkr.xtime_nsec >= nsecps) {
int leap;
- tk->xtime_nsec -= nsecps;
+ tk->tkr.xtime_nsec -= nsecps;
tk->xtime_sec++;
/* Figure out if its a leap sec and apply if needed */
@@ -1398,9 +1484,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
/* Accumulate one shifted interval */
offset -= interval;
- tk->cycle_last += interval;
+ tk->tkr.cycle_last += interval;
- tk->xtime_nsec += tk->xtime_interval << shift;
+ tk->tkr.xtime_nsec += tk->xtime_interval << shift;
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
@@ -1414,7 +1500,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
tk->raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
- tk->ntp_error += ntp_tick_length() << shift;
+ tk->ntp_error += tk->ntp_tick << shift;
tk->ntp_error -= (tk->xtime_interval + tk->xtime_remainder) <<
(tk->ntp_error_shift + shift);
@@ -1427,7 +1513,6 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
*/
void update_wall_time(void)
{
- struct clocksource *clock;
struct timekeeper *real_tk = &tk_core.timekeeper;
struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
@@ -1441,12 +1526,11 @@ void update_wall_time(void)
if (unlikely(timekeeping_suspended))
goto out;
- clock = real_tk->clock;
-
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = (clock->read(clock) - clock->cycle_last) & clock->mask;
+ offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
+ tk->tkr.cycle_last, tk->tkr.mask);
#endif
/* Check if there's really nothing to do */
@@ -1489,8 +1573,6 @@ void update_wall_time(void)
clock_set |= accumulate_nsecs_to_secs(tk);
write_seqcount_begin(&tk_core.seq);
- /* Update clock->cycle_last with the new value */
- clock->cycle_last = tk->cycle_last;
/*
* Update the real timekeeper.
*
@@ -1525,65 +1607,11 @@ out:
void getboottime(struct timespec *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec boottime = {
- .tv_sec = tk->wall_to_monotonic.tv_sec +
- tk->total_sleep_time.tv_sec,
- .tv_nsec = tk->wall_to_monotonic.tv_nsec +
- tk->total_sleep_time.tv_nsec
- };
-
- set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
-}
-EXPORT_SYMBOL_GPL(getboottime);
-
-/**
- * get_monotonic_boottime - Returns monotonic time since boot
- * @ts: pointer to the timespec to be set
- *
- * Returns the monotonic time since boot in a timespec.
- *
- * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
- * includes the time spent in suspend.
- */
-void get_monotonic_boottime(struct timespec *ts)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 tomono, sleep, ret;
- s64 nsec;
- unsigned int seq;
-
- WARN_ON(timekeeping_suspended);
-
- do {
- seq = read_seqcount_begin(&tk_core.seq);
- ret.tv_sec = tk->xtime_sec;
- nsec = timekeeping_get_ns(tk);
- tomono = tk->wall_to_monotonic;
- sleep = tk->total_sleep_time;
-
- } while (read_seqcount_retry(&tk_core.seq, seq));
-
- ret.tv_sec += tomono.tv_sec + sleep.tv_sec;
- ret.tv_nsec = 0;
- timespec64_add_ns(&ret, nsec + tomono.tv_nsec + sleep.tv_nsec);
- *ts = timespec64_to_timespec(ret);
-}
-EXPORT_SYMBOL_GPL(get_monotonic_boottime);
-
-/**
- * monotonic_to_bootbased - Convert the monotonic time to boot based.
- * @ts: pointer to the timespec to be converted
- */
-void monotonic_to_bootbased(struct timespec *ts)
-{
- struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
+ ktime_t t = ktime_sub(tk->offs_real, tk->offs_boot);
- ts64 = timespec_to_timespec64(*ts);
- ts64 = timespec64_add(ts64, tk->total_sleep_time);
- *ts = timespec64_to_timespec(ts64);
+ *ts = ktime_to_timespec(t);
}
-EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
+EXPORT_SYMBOL_GPL(getboottime);
unsigned long get_seconds(void)
{
@@ -1663,8 +1691,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_mono;
- nsecs = tk->xtime_nsec >> tk->shift;
+ base = tk->tkr.base_mono;
+ nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
@@ -1695,8 +1723,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
do {
seq = read_seqcount_begin(&tk_core.seq);
- base = tk->base_mono;
- nsecs = timekeeping_get_ns(tk);
+ base = tk->tkr.base_mono;
+ nsecs = timekeeping_get_ns(&tk->tkr);
*offs_real = tk->offs_real;
*offs_boot = tk->offs_boot;
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
index e3d28ad236f9..4ea005a7f9da 100644
--- a/kernel/time/timekeeping_internal.h
+++ b/kernel/time/timekeeping_internal.h
@@ -3,6 +3,7 @@
/*
* timekeeping debug functions
*/
+#include <linux/clocksource.h>
#include <linux/time.h>
#ifdef CONFIG_DEBUG_FS
@@ -11,4 +12,18 @@ extern void tk_debug_account_sleep_time(struct timespec64 *t);
#define tk_debug_account_sleep_time(x)
#endif
+#ifdef CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ cycle_t ret = (now - last) & mask;
+
+ return (s64) ret > 0 ? ret : 0;
+}
+#else
+static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
+{
+ return (now - last) & mask;
+}
+#endif
+
#endif /* _TIMEKEEPING_INTERNAL_H */
diff --git a/kernel/torture.c b/kernel/torture.c
index 40bb511cca48..d600af21f022 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -708,7 +708,7 @@ int _torture_create_kthread(int (*fn)(void *arg), void *arg, char *s, char *m,
int ret = 0;
VERBOSE_TOROUT_STRING(m);
- *tp = kthread_run(fn, arg, s);
+ *tp = kthread_run(fn, arg, "%s", s);
if (IS_ERR(*tp)) {
ret = PTR_ERR(*tp);
VERBOSE_TOROUT_ERRSTRING(f);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d4409356f40d..a5da09c899dd 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -29,11 +29,6 @@ config HAVE_FUNCTION_GRAPH_FP_TEST
help
See Documentation/trace/ftrace-design.txt
-config HAVE_FUNCTION_TRACE_MCOUNT_TEST
- bool
- help
- See Documentation/trace/ftrace-design.txt
-
config HAVE_DYNAMIC_FTRACE
bool
help
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2611613f14f1..67d6369ddf83 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_seq.o
obj-$(CONFIG_TRACING) += trace_stat.o
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ac9d1dad630b..1654b12c891a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -80,9 +80,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = {
int ftrace_enabled __read_mostly;
static int last_ftrace_enabled;
-/* Quick disabling of function tracer. */
-int function_trace_stop __read_mostly;
-
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
/* What to set function_trace_op to */
@@ -1042,6 +1039,8 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
#ifdef CONFIG_DYNAMIC_FTRACE
+static struct ftrace_ops *removed_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1304,25 +1303,15 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
- int ret;
int i;
/*
- * Remove the current set, update the hash and add
- * them back.
- */
- ftrace_hash_rec_disable(ops, enable);
-
- /*
* If the new source is empty, just free dst and assign it
* the empty_hash.
*/
if (!src->count) {
- free_ftrace_hash_rcu(*dst);
- rcu_assign_pointer(*dst, EMPTY_HASH);
- /* still need to update the function records */
- ret = 0;
- goto out;
+ new_hash = EMPTY_HASH;
+ goto update;
}
/*
@@ -1335,10 +1324,9 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
if (bits > FTRACE_HASH_MAX_BITS)
bits = FTRACE_HASH_MAX_BITS;
- ret = -ENOMEM;
new_hash = alloc_ftrace_hash(bits);
if (!new_hash)
- goto out;
+ return -ENOMEM;
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
@@ -1349,20 +1337,20 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
}
}
+update:
+ /*
+ * Remove the current set, update the hash and add
+ * them back.
+ */
+ ftrace_hash_rec_disable(ops, enable);
+
old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
free_ftrace_hash_rcu(old_hash);
- ret = 0;
- out:
- /*
- * Enable regardless of ret:
- * On success, we enable the new hash.
- * On failure, we re-enable the original hash.
- */
ftrace_hash_rec_enable(ops, enable);
- return ret;
+ return 0;
}
/*
@@ -1492,6 +1480,53 @@ int ftrace_text_reserved(const void *start, const void *end)
return (int)!!ret;
}
+/* Test if ops registered to this rec needs regs */
+static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ bool keep_regs = false;
+
+ for (ops = ftrace_ops_list;
+ ops != &ftrace_list_end; ops = ops->next) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(ops, rec->ip, rec)) {
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ keep_regs = true;
+ break;
+ }
+ }
+ }
+
+ return keep_regs;
+}
+
+static void ftrace_remove_tramp(struct ftrace_ops *ops,
+ struct dyn_ftrace *rec)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
+ if (!entry)
+ return;
+
+ /*
+ * The tramp_hash entry will be removed at time
+ * of update.
+ */
+ ops->nr_trampolines--;
+ rec->flags &= ~FTRACE_FL_TRAMP;
+}
+
+static void ftrace_clear_tramps(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (op->nr_trampolines)
+ ftrace_remove_tramp(op, rec);
+ } while_for_each_ftrace_op(op);
+}
+
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1572,8 +1607,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (inc) {
rec->flags++;
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX))
return;
+
+ /*
+ * If there's only a single callback registered to a
+ * function, and the ops has a trampoline registered
+ * for it, then we can call it directly.
+ */
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
+ rec->flags |= FTRACE_FL_TRAMP;
+ ops->nr_trampolines++;
+ } else {
+ /*
+ * If we are adding another function callback
+ * to this function, and the previous had a
+ * trampoline used, then we need to go back to
+ * the default trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /* remove trampolines from any ops for this rec */
+ ftrace_clear_tramps(rec);
+ }
+
/*
* If any ops wants regs saved for this function
* then all ops will get saved regs.
@@ -1581,9 +1638,30 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
rec->flags |= FTRACE_FL_REGS;
} else {
- if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
+ if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0))
return;
rec->flags--;
+
+ if (ops->trampoline && !ftrace_rec_count(rec))
+ ftrace_remove_tramp(ops, rec);
+
+ /*
+ * If the rec had REGS enabled and the ops that is
+ * being removed had REGS set, then see if there is
+ * still any ops for this record that wants regs.
+ * If not, we can stop recording them.
+ */
+ if (ftrace_rec_count(rec) > 0 &&
+ rec->flags & FTRACE_FL_REGS &&
+ ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
+ if (!test_rec_ops_needs_regs(rec))
+ rec->flags &= ~FTRACE_FL_REGS;
+ }
+
+ /*
+ * flags will be cleared in ftrace_check_record()
+ * if rec count is zero.
+ */
}
count++;
/* Shortcut, if we handled all records, we are done. */
@@ -1668,17 +1746,23 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* If we are disabling calls, then disable all records that
* are enabled.
*/
- if (enable && (rec->flags & ~FTRACE_FL_MASK))
+ if (enable && ftrace_rec_count(rec))
flag = FTRACE_FL_ENABLED;
/*
- * If enabling and the REGS flag does not match the REGS_EN, then
- * do not ignore this record. Set flags to fail the compare against
- * ENABLED.
+ * If enabling and the REGS flag does not match the REGS_EN, or
+ * the TRAMP flag doesn't match the TRAMP_EN, then do not ignore
+ * this record. Set flags to fail the compare against ENABLED.
*/
- if (flag &&
- (!(rec->flags & FTRACE_FL_REGS) != !(rec->flags & FTRACE_FL_REGS_EN)))
- flag |= FTRACE_FL_REGS;
+ if (flag) {
+ if (!(rec->flags & FTRACE_FL_REGS) !=
+ !(rec->flags & FTRACE_FL_REGS_EN))
+ flag |= FTRACE_FL_REGS;
+
+ if (!(rec->flags & FTRACE_FL_TRAMP) !=
+ !(rec->flags & FTRACE_FL_TRAMP_EN))
+ flag |= FTRACE_FL_TRAMP;
+ }
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1696,6 +1780,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
else
rec->flags &= ~FTRACE_FL_REGS_EN;
}
+ if (flag & FTRACE_FL_TRAMP) {
+ if (rec->flags & FTRACE_FL_TRAMP)
+ rec->flags |= FTRACE_FL_TRAMP_EN;
+ else
+ rec->flags &= ~FTRACE_FL_TRAMP_EN;
+ }
}
/*
@@ -1704,7 +1794,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
* Otherwise,
* return UPDATE_MODIFY_CALL to tell the caller to convert
* from the save regs, to a non-save regs function or
- * vice versa.
+ * vice versa, or from a trampoline call.
*/
if (flag & FTRACE_FL_ENABLED)
return FTRACE_UPDATE_MAKE_CALL;
@@ -1714,7 +1804,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update)
if (update) {
/* If there's no more users, clear all flags */
- if (!(rec->flags & ~FTRACE_FL_MASK))
+ if (!ftrace_rec_count(rec))
rec->flags = 0;
else
/* Just disable the record (keep REGS state) */
@@ -1751,6 +1841,43 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
return ftrace_check_record(rec, enable, 0);
}
+static struct ftrace_ops *
+ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ /* Removed ops need to be tested first */
+ if (removed_ops && removed_ops->tramp_hash) {
+ if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
+ return removed_ops;
+ }
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!op->tramp_hash)
+ continue;
+
+ if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
+ return op;
+
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
+ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ /* pass rec in as regs to have non-NULL val */
+ if (ftrace_ops_test(op, rec->ip, rec))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
/**
* ftrace_get_addr_new - Get the call address to set to
* @rec: The ftrace record descriptor
@@ -1763,6 +1890,20 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
*/
unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP) {
+ ops = ftrace_find_tramp_ops_new(rec);
+ if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -1781,6 +1922,20 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
*/
unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec)
{
+ struct ftrace_ops *ops;
+
+ /* Trampolines take precedence over regs */
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (FTRACE_WARN_ON(!ops)) {
+ pr_warning("Bad trampoline accounting at: %p (%pS)\n",
+ (void *)rec->ip, (void *)rec->ip);
+ /* Ftrace is shutting down, return anything */
+ return (unsigned long)FTRACE_ADDR;
+ }
+ return ops->trampoline;
+ }
+
if (rec->flags & FTRACE_FL_REGS_EN)
return (unsigned long)FTRACE_REGS_ADDR;
else
@@ -2023,6 +2178,89 @@ void __weak arch_ftrace_update_code(int command)
ftrace_run_stop_machine(command);
}
+static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
+{
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int size, bits;
+ int ret;
+
+ size = ops->nr_trampolines;
+ bits = 0;
+ /*
+ * Make the hash size about 1/2 the # found
+ */
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ ops->tramp_hash = alloc_ftrace_hash(bits);
+ /*
+ * TODO: a failed allocation is going to screw up
+ * the accounting of what needs to be modified
+ * and not. For now, we kill ftrace if we fail
+ * to allocate here. But there are ways around this,
+ * but that will take a little more work.
+ */
+ if (!ops->tramp_hash)
+ return -ENOMEM;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (ftrace_rec_count(rec) == 1 &&
+ ftrace_ops_test(ops, rec->ip, rec)) {
+
+ /*
+ * If another ops adds to a rec, the rec will
+ * lose its trampoline and never get it back
+ * until all ops are off of it.
+ */
+ if (!(rec->flags & FTRACE_FL_TRAMP))
+ continue;
+
+ /* This record had better have a trampoline */
+ if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
+ return -1;
+
+ ret = add_hash_entry(ops->tramp_hash, rec->ip);
+ if (ret < 0)
+ return ret;
+ }
+ } while_for_each_ftrace_rec();
+
+ /* The number of recs in the hash must match nr_trampolines */
+ FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
+
+ return 0;
+}
+
+static int ftrace_save_tramp_hashes(void)
+{
+ struct ftrace_ops *op;
+ int ret;
+
+ /*
+ * Now that any trampoline is being used, we need to save the
+ * hashes for the ops that have them. This allows the mapping
+ * back from the record to the ops that has the trampoline to
+ * know what code is being replaced. Modifying code must always
+ * verify what it is changing.
+ */
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ /* The tramp_hash is recreated each time. */
+ free_ftrace_hash(op->tramp_hash);
+ op->tramp_hash = NULL;
+
+ if (op->nr_trampolines) {
+ ret = ftrace_save_ops_tramp_hash(op);
+ if (ret)
+ return ret;
+ }
+
+ } while_for_each_ftrace_op(op);
+
+ return 0;
+}
+
static void ftrace_run_update_code(int command)
{
int ret;
@@ -2031,11 +2269,6 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
if (ret)
return;
- /*
- * Do not call function tracer while we update the code.
- * We are in stop machine.
- */
- function_trace_stop++;
/*
* By default we use stop_machine() to modify the code.
@@ -2045,15 +2278,15 @@ static void ftrace_run_update_code(int command)
*/
arch_ftrace_update_code(command);
- function_trace_stop--;
-
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+
+ ret = ftrace_save_tramp_hashes();
+ FTRACE_WARN_ON(ret);
}
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
-static int global_start_up;
static void control_ops_free(struct ftrace_ops *ops)
{
@@ -2117,8 +2350,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
ftrace_hash_rec_disable(ops, 1);
- if (!global_start_up)
- ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
command |= FTRACE_UPDATE_CALLS;
@@ -2139,8 +2371,16 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
return 0;
}
+ /*
+ * If the ops uses a trampoline, then it needs to be
+ * tested first on update.
+ */
+ removed_ops = ops;
+
ftrace_run_update_code(command);
+ removed_ops = NULL;
+
/*
* Dynamic ops may be freed, we must make sure that all
* callers are done before leaving this function.
@@ -2398,7 +2638,8 @@ ftrace_allocate_pages(unsigned long num_to_init)
return start_pg;
free_pages:
- while (start_pg) {
+ pg = start_pg;
+ while (pg) {
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
start_pg = pg->next;
@@ -2595,8 +2836,10 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER &&
- ftrace_hash_empty(ops->filter_hash)) {
+ if ((iter->flags & FTRACE_ITER_FILTER &&
+ ftrace_hash_empty(ops->filter_hash)) ||
+ (iter->flags & FTRACE_ITER_NOTRACE &&
+ ftrace_hash_empty(ops->notrace_hash))) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2641,7 +2884,10 @@ static int t_show(struct seq_file *m, void *v)
return t_hash_show(m, iter);
if (iter->flags & FTRACE_ITER_PRINTALL) {
- seq_printf(m, "#### all functions enabled ####\n");
+ if (iter->flags & FTRACE_ITER_NOTRACE)
+ seq_printf(m, "#### no functions disabled ####\n");
+ else
+ seq_printf(m, "#### all functions enabled ####\n");
return 0;
}
@@ -2651,10 +2897,22 @@ static int t_show(struct seq_file *m, void *v)
return 0;
seq_printf(m, "%ps", (void *)rec->ip);
- if (iter->flags & FTRACE_ITER_ENABLED)
+ if (iter->flags & FTRACE_ITER_ENABLED) {
seq_printf(m, " (%ld)%s",
- rec->flags & ~FTRACE_FL_MASK,
- rec->flags & FTRACE_FL_REGS ? " R" : "");
+ ftrace_rec_count(rec),
+ rec->flags & FTRACE_FL_REGS ? " R" : " ");
+ if (rec->flags & FTRACE_FL_TRAMP_EN) {
+ struct ftrace_ops *ops;
+
+ ops = ftrace_find_tramp_ops_curr(rec);
+ if (ops && ops->trampoline)
+ seq_printf(m, "\ttramp: %pS",
+ (void *)ops->trampoline);
+ else
+ seq_printf(m, "\ttramp: ERROR!");
+ }
+ }
+
seq_printf(m, "\n");
return 0;
@@ -2702,13 +2960,6 @@ ftrace_enabled_open(struct inode *inode, struct file *file)
return iter ? 0 : -ENOMEM;
}
-static void ftrace_filter_reset(struct ftrace_hash *hash)
-{
- mutex_lock(&ftrace_lock);
- ftrace_hash_clear(hash);
- mutex_unlock(&ftrace_lock);
-}
-
/**
* ftrace_regex_open - initialize function tracer filter files
* @ops: The ftrace_ops that hold the hash filters
@@ -2758,7 +3009,13 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
hash = ops->filter_hash;
if (file->f_mode & FMODE_WRITE) {
- iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
+ const int size_bits = FTRACE_HASH_DEFAULT_BITS;
+
+ if (file->f_flags & O_TRUNC)
+ iter->hash = alloc_ftrace_hash(size_bits);
+ else
+ iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+
if (!iter->hash) {
trace_parser_put(&iter->parser);
kfree(iter);
@@ -2767,10 +3024,6 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
}
}
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC))
- ftrace_filter_reset(iter->hash);
-
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
@@ -3471,14 +3724,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
else
orig_hash = &ops->notrace_hash;
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (reset)
+ hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+ else
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+
if (!hash) {
ret = -ENOMEM;
goto out_regex_unlock;
}
- if (reset)
- ftrace_filter_reset(hash);
if (buf && !ftrace_match_records(hash, buf, len)) {
ret = -EINVAL;
goto out_regex_unlock;
@@ -3630,6 +3885,7 @@ __setup("ftrace_filter=", set_ftrace_filter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
static int __init set_graph_function(char *str)
@@ -3639,16 +3895,29 @@ static int __init set_graph_function(char *str)
}
__setup("ftrace_graph_filter=", set_graph_function);
-static void __init set_ftrace_early_graph(char *buf)
+static int __init set_graph_notrace_function(char *str)
+{
+ strlcpy(ftrace_graph_notrace_buf, str, FTRACE_FILTER_SIZE);
+ return 1;
+}
+__setup("ftrace_graph_notrace=", set_graph_notrace_function);
+
+static void __init set_ftrace_early_graph(char *buf, int enable)
{
int ret;
char *func;
+ unsigned long *table = ftrace_graph_funcs;
+ int *count = &ftrace_graph_count;
+
+ if (!enable) {
+ table = ftrace_graph_notrace_funcs;
+ count = &ftrace_graph_notrace_count;
+ }
while (buf) {
func = strsep(&buf, ",");
/* we allow only one expression at a time */
- ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count,
- FTRACE_GRAPH_MAX_FUNCS, func);
+ ret = ftrace_set_func(table, count, FTRACE_GRAPH_MAX_FUNCS, func);
if (ret)
printk(KERN_DEBUG "ftrace: function %s not "
"traceable\n", func);
@@ -3677,7 +3946,9 @@ static void __init set_ftrace_early_filters(void)
ftrace_set_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
- set_ftrace_early_graph(ftrace_graph_buf);
+ set_ftrace_early_graph(ftrace_graph_buf, 1);
+ if (ftrace_graph_notrace_buf[0])
+ set_ftrace_early_graph(ftrace_graph_notrace_buf, 0);
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
}
@@ -3819,7 +4090,12 @@ static int g_show(struct seq_file *m, void *v)
return 0;
if (ptr == (unsigned long *)1) {
- seq_printf(m, "#### all functions enabled ####\n");
+ struct ftrace_graph_data *fgd = m->private;
+
+ if (fgd->table == ftrace_graph_funcs)
+ seq_printf(m, "#### all functions enabled ####\n");
+ else
+ seq_printf(m, "#### no functions disabled ####\n");
return 0;
}
@@ -4447,9 +4723,6 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- if (function_trace_stop)
- return;
-
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
@@ -4461,9 +4734,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
if (ftrace_ops_test(op, ip, regs)) {
- if (WARN_ON(!op->func)) {
- function_trace_stop = 1;
- printk("op=%p %pS\n", op, op);
+ if (FTRACE_WARN_ON(!op->func)) {
+ pr_warn("op=%p %pS\n", op, op);
goto out;
}
op->func(ip, parent_ip, op, regs);
@@ -5084,6 +5356,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
/* Function graph doesn't use the .func field of global_ops */
global_ops.flags |= FTRACE_OPS_FL_STUB;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ /* Optimize function graph calling (if implemented by arch) */
+ if (FTRACE_GRAPH_TRAMP_ADDR != 0)
+ global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
+#endif
+
ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
@@ -5104,6 +5382,10 @@ void unregister_ftrace_graph(void)
__ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
global_ops.flags &= ~FTRACE_OPS_FL_STUB;
+#ifdef CONFIG_DYNAMIC_FTRACE
+ if (FTRACE_GRAPH_TRAMP_ADDR != 0)
+ global_ops.trampoline = 0;
+#endif
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
@@ -5183,9 +5465,4 @@ void ftrace_graph_exit_task(struct task_struct *t)
kfree(ret_stack);
}
-
-void ftrace_graph_stop(void)
-{
- ftrace_stop();
-}
#endif
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ff7027199a9a..afb04b9b818a 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1689,22 +1689,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
- /* The update must run on the CPU that is being updated. */
- preempt_disable();
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu)) {
rb_update_pages(cpu_buffer);
cpu_buffer->nr_pages_to_update = 0;
} else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- preempt_disable();
}
- preempt_enable();
}
/* wait for all the updates to complete */
@@ -1742,22 +1734,14 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- preempt_disable();
- /* The update must run on the CPU that is being updated. */
- if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ /* Can't run something on an offline CPU. */
+ if (!cpu_online(cpu_id))
rb_update_pages(cpu_buffer);
else {
- /*
- * Can not disable preemption for schedule_work_on()
- * on PREEMPT_RT.
- */
- preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- preempt_disable();
}
- preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
@@ -1984,7 +1968,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
/**
* rb_update_event - update event type and data
- * @event: the even to update
+ * @event: the event to update
* @type: the type of event
* @length: the size of the event field in the ring buffer
*
@@ -3357,21 +3341,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
/* Iterator usage is expected to have record disabled */
- if (list_empty(&cpu_buffer->reader_page->list)) {
- iter->head_page = rb_set_head_page(cpu_buffer);
- if (unlikely(!iter->head_page))
- return;
- iter->head = iter->head_page->read;
- } else {
- iter->head_page = cpu_buffer->reader_page;
- iter->head = cpu_buffer->reader_page->read;
- }
+ iter->head_page = cpu_buffer->reader_page;
+ iter->head = cpu_buffer->reader_page->read;
+
+ iter->cache_reader_page = iter->head_page;
+ iter->cache_read = iter->head;
+
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
else
iter->read_stamp = iter->head_page->page->time_stamp;
- iter->cache_reader_page = cpu_buffer->reader_page;
- iter->cache_read = cpu_buffer->read;
}
/**
@@ -3764,18 +3743,20 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a time extend is encountered.
- * Since the time extend is always attached to a data event,
- * we should never loop more than once.
- * (We never hit the following condition more than twice).
+ * We repeat when a time extend is encountered or we hit
+ * the end of the page. Since the time extend is always attached
+ * to a data event, we should never loop more than three times.
+ * Once for going to next page, once on time extend, and
+ * finally once to get the event.
+ * (We never hit the following condition more than thrice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
return NULL;
- if (iter->head >= local_read(&iter->head_page->page->commit)) {
+ if (iter->head >= rb_page_size(iter->head_page)) {
rb_inc_iter(iter);
goto again;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 291397e66669..8a528392b1f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -820,11 +820,12 @@ static struct {
const char *name;
int in_ns; /* is this clock in nanoseconds? */
} trace_clocks[] = {
- { trace_clock_local, "local", 1 },
- { trace_clock_global, "global", 1 },
- { trace_clock_counter, "counter", 0 },
- { trace_clock_jiffies, "uptime", 0 },
- { trace_clock, "perf", 1 },
+ { trace_clock_local, "local", 1 },
+ { trace_clock_global, "global", 1 },
+ { trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 0 },
+ { trace_clock, "perf", 1 },
+ { ktime_get_mono_fast_ns, "mono", 1 },
ARCH_TRACE_CLOCKS
};
@@ -937,30 +938,6 @@ out:
return ret;
}
-ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
-{
- int len;
- int ret;
-
- if (!cnt)
- return 0;
-
- if (s->len <= s->readpos)
- return -EBUSY;
-
- len = s->len - s->readpos;
- if (cnt > len)
- cnt = len;
- ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
- if (ret == cnt)
- return -EFAULT;
-
- cnt -= ret;
-
- s->readpos += cnt;
- return cnt;
-}
-
static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
{
int len;
@@ -3699,6 +3676,7 @@ static const char readme_msg[] =
#endif
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
" set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " set_graph_notrace\t- Do not trace the nested calls of a function (function_graph)\n"
" max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
#endif
#ifdef CONFIG_TRACER_SNAPSHOT
@@ -4238,10 +4216,9 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
}
static ssize_t
-tracing_max_lat_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_read(unsigned long *ptr, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
char buf[64];
int r;
@@ -4253,10 +4230,9 @@ tracing_max_lat_read(struct file *filp, char __user *ubuf,
}
static ssize_t
-tracing_max_lat_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
+tracing_nsecs_write(unsigned long *ptr, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
{
- unsigned long *ptr = filp->private_data;
unsigned long val;
int ret;
@@ -4269,6 +4245,52 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
return cnt;
}
+static ssize_t
+tracing_thresh_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(&tracing_thresh, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_thresh_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+ ret = tracing_nsecs_write(&tracing_thresh, ubuf, cnt, ppos);
+ if (ret < 0)
+ goto out;
+
+ if (tr->current_trace->update_thresh) {
+ ret = tr->current_trace->update_thresh(tr);
+ if (ret < 0)
+ goto out;
+ }
+
+ ret = cnt;
+out:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static ssize_t
+tracing_max_lat_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_read(filp->private_data, ubuf, cnt, ppos);
+}
+
+static ssize_t
+tracing_max_lat_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos);
+}
+
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
struct trace_array *tr = inode->i_private;
@@ -5170,6 +5192,13 @@ static int snapshot_raw_open(struct inode *inode, struct file *filp)
#endif /* CONFIG_TRACER_SNAPSHOT */
+static const struct file_operations tracing_thresh_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_thresh_read,
+ .write = tracing_thresh_write,
+ .llseek = generic_file_llseek,
+};
+
static const struct file_operations tracing_max_lat_fops = {
.open = tracing_open_generic,
.read = tracing_max_lat_read,
@@ -6107,10 +6136,8 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
if (!topts)
return;
- for (cnt = 0; topts[cnt].opt; cnt++) {
- if (topts[cnt].entry)
- debugfs_remove(topts[cnt].entry);
- }
+ for (cnt = 0; topts[cnt].opt; cnt++)
+ debugfs_remove(topts[cnt].entry);
kfree(topts);
}
@@ -6533,7 +6560,7 @@ static __init int tracer_init_debugfs(void)
init_tracer_debugfs(&global_trace, d_tracer);
trace_create_file("tracing_thresh", 0644, d_tracer,
- &tracing_thresh, &tracing_max_lat_fops);
+ &global_trace, &tracing_thresh_fops);
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9258f5a815db..385391fb1d3b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -339,6 +339,7 @@ struct tracer_flags {
* @reset: called when one switches to another tracer
* @start: called when tracing is unpaused (echo 1 > tracing_enabled)
* @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @update_thresh: called when tracing_thresh is updated
* @open: called when the trace file is opened
* @pipe_open: called when the trace_pipe file is opened
* @close: called when the trace file is released
@@ -357,6 +358,7 @@ struct tracer {
void (*reset)(struct trace_array *tr);
void (*start)(struct trace_array *tr);
void (*stop)(struct trace_array *tr);
+ int (*update_thresh)(struct trace_array *tr);
void (*open)(struct trace_iterator *iter);
void (*pipe_open)(struct trace_iterator *iter);
void (*close)(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 5d12bb407b44..4b9c114ee9de 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -30,6 +30,18 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
return ret;
}
+ /*
+ * We checked and allowed to create parent,
+ * allow children without checking.
+ */
+ if (p_event->parent)
+ return 0;
+
+ /*
+ * It's ok to check current process (owner) permissions in here,
+ * because code below is called only via perf_event_open syscall.
+ */
+
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event)) {
if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2de53628689f..ef06ce7e9cf8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,6 +8,8 @@
*
*/
+#define pr_fmt(fmt) fmt
+
#include <linux/workqueue.h>
#include <linux/spinlock.h>
#include <linux/kthread.h>
@@ -1491,7 +1493,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
dir->entry = debugfs_create_dir(name, parent);
if (!dir->entry) {
- pr_warning("Failed to create system directory %s\n", name);
+ pr_warn("Failed to create system directory %s\n", name);
__put_system(system);
goto out_free;
}
@@ -1507,7 +1509,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs '%s/filter' entry\n", name);
+ pr_warn("Could not create debugfs '%s/filter' entry\n", name);
}
trace_create_file("enable", 0644, dir->entry, dir,
@@ -1522,8 +1524,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name,
out_fail:
/* Only print this message if failed on memory allocation */
if (!dir || !system)
- pr_warning("No memory to create event subsystem %s\n",
- name);
+ pr_warn("No memory to create event subsystem %s\n", name);
return NULL;
}
@@ -1551,8 +1552,7 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
name = ftrace_event_name(call);
file->dir = debugfs_create_dir(name, d_events);
if (!file->dir) {
- pr_warning("Could not create debugfs '%s' directory\n",
- name);
+ pr_warn("Could not create debugfs '%s' directory\n", name);
return -1;
}
@@ -1575,8 +1575,8 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
if (list_empty(head)) {
ret = call->class->define_fields(call);
if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", name);
+ pr_warn("Could not initialize trace point events/%s\n",
+ name);
return -1;
}
}
@@ -1621,7 +1621,6 @@ static void event_remove(struct ftrace_event_call *call)
if (file->event_call != call)
continue;
ftrace_event_enable_disable(file, 0);
- destroy_preds(file);
/*
* The do_for_each_event_file() is
* a double loop. After finding the call for this
@@ -1649,8 +1648,7 @@ static int event_init(struct ftrace_event_call *call)
if (call->class->raw_init) {
ret = call->class->raw_init(call);
if (ret < 0 && ret != -ENOSYS)
- pr_warn("Could not initialize trace events/%s\n",
- name);
+ pr_warn("Could not initialize trace events/%s\n", name);
}
return ret;
@@ -1749,7 +1747,8 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
- destroy_call_preds(call);
+ free_event_filter(call->filter);
+ call->filter = NULL;
}
static int probe_remove_event_call(struct ftrace_event_call *call)
@@ -1895,8 +1894,8 @@ __trace_add_event_dirs(struct trace_array *tr)
list_for_each_entry(call, &ftrace_events, list) {
ret = __trace_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2208,8 +2207,8 @@ __trace_early_add_event_dirs(struct trace_array *tr)
list_for_each_entry(file, &tr->events, list) {
ret = event_create_dir(tr->event_dir, file);
if (ret < 0)
- pr_warning("Could not create directory for event %s\n",
- ftrace_event_name(file->event_call));
+ pr_warn("Could not create directory for event %s\n",
+ ftrace_event_name(file->event_call));
}
}
@@ -2232,8 +2231,8 @@ __trace_early_add_events(struct trace_array *tr)
ret = __trace_early_add_new_event(call, tr);
if (ret < 0)
- pr_warning("Could not create early event %s\n",
- ftrace_event_name(call));
+ pr_warn("Could not create early event %s\n",
+ ftrace_event_name(call));
}
}
@@ -2280,13 +2279,13 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
entry = debugfs_create_file("set_event", 0644, parent,
tr, &ftrace_set_event_fops);
if (!entry) {
- pr_warning("Could not create debugfs 'set_event' entry\n");
+ pr_warn("Could not create debugfs 'set_event' entry\n");
return -ENOMEM;
}
d_events = debugfs_create_dir("events", parent);
if (!d_events) {
- pr_warning("Could not create debugfs 'events' directory\n");
+ pr_warn("Could not create debugfs 'events' directory\n");
return -ENOMEM;
}
@@ -2462,11 +2461,10 @@ static __init int event_trace_init(void)
entry = debugfs_create_file("available_events", 0444, d_tracer,
tr, &ftrace_avail_fops);
if (!entry)
- pr_warning("Could not create debugfs "
- "'available_events' entry\n");
+ pr_warn("Could not create debugfs 'available_events' entry\n");
if (trace_define_common_fields())
- pr_warning("tracing: Failed to allocate common fields");
+ pr_warn("tracing: Failed to allocate common fields");
ret = early_event_add_tracer(d_tracer, tr);
if (ret)
@@ -2475,7 +2473,7 @@ static __init int event_trace_init(void)
#ifdef CONFIG_MODULES
ret = register_module_notifier(&trace_module_nb);
if (ret)
- pr_warning("Failed to register trace events module notifier\n");
+ pr_warn("Failed to register trace events module notifier\n");
#endif
return 0;
}
@@ -2579,7 +2577,7 @@ static __init void event_trace_self_tests(void)
* it and the self test should not be on.
*/
if (file->flags & FTRACE_EVENT_FL_ENABLED) {
- pr_warning("Enabled event during self test!\n");
+ pr_warn("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
@@ -2607,8 +2605,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling system %s\n",
- system->name);
+ pr_warn("error enabling system %s\n",
+ system->name);
continue;
}
@@ -2616,8 +2614,8 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling system %s\n",
- system->name);
+ pr_warn("error disabling system %s\n",
+ system->name);
continue;
}
@@ -2631,7 +2629,7 @@ static __init void event_trace_self_tests(void)
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error enabling all events\n");
+ pr_warn("error enabling all events\n");
return;
}
@@ -2640,7 +2638,7 @@ static __init void event_trace_self_tests(void)
/* reset sysname */
ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
- pr_warning("error disabling all events\n");
+ pr_warn("error disabling all events\n");
return;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8a8631926a07..7a8c1528e141 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -774,17 +774,12 @@ static void __free_preds(struct event_filter *filter)
filter->n_preds = 0;
}
-static void call_filter_disable(struct ftrace_event_call *call)
-{
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
-}
-
static void filter_disable(struct ftrace_event_file *file)
{
struct ftrace_event_call *call = file->event_call;
if (call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- call_filter_disable(call);
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
else
file->flags &= ~FTRACE_EVENT_FL_FILTERED;
}
@@ -804,32 +799,6 @@ void free_event_filter(struct event_filter *filter)
__free_filter(filter);
}
-void destroy_call_preds(struct ftrace_event_call *call)
-{
- __free_filter(call->filter);
- call->filter = NULL;
-}
-
-static void destroy_file_preds(struct ftrace_event_file *file)
-{
- __free_filter(file->filter);
- file->filter = NULL;
-}
-
-/*
- * Called when destroying the ftrace_event_file.
- * The file is being freed, so we do not need to worry about
- * the file being currently used. This is for module code removing
- * the tracepoints from within it.
- */
-void destroy_preds(struct ftrace_event_file *file)
-{
- if (file->event_call->flags & TRACE_EVENT_FL_USE_CALL_FILTER)
- destroy_call_preds(file->event_call);
- else
- destroy_file_preds(file);
-}
-
static struct event_filter *__alloc_filter(void)
{
struct event_filter *filter;
@@ -873,17 +842,14 @@ static inline void __remove_filter(struct ftrace_event_file *file)
remove_filter_string(file->filter);
}
-static void filter_free_subsystem_preds(struct event_subsystem *system,
+static void filter_free_subsystem_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
-
__remove_filter(file);
}
}
@@ -901,15 +867,13 @@ static inline void __free_subsystem_filter(struct ftrace_event_file *file)
}
}
-static void filter_free_subsystem_filters(struct event_subsystem *system,
+static void filter_free_subsystem_filters(struct ftrace_subsystem_dir *dir,
struct trace_array *tr)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
__free_subsystem_filter(file);
}
@@ -1582,7 +1546,6 @@ static int fold_pred_tree(struct event_filter *filter,
static int replace_preds(struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
- char *filter_string,
bool dry_run)
{
char *operand1 = NULL, *operand2 = NULL;
@@ -1755,13 +1718,12 @@ struct filter_list {
struct event_filter *filter;
};
-static int replace_system_preds(struct event_subsystem *system,
+static int replace_system_preds(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
struct filter_parse_state *ps,
char *filter_string)
{
struct ftrace_event_file *file;
- struct ftrace_event_call *call;
struct filter_list *filter_item;
struct filter_list *tmp;
LIST_HEAD(filter_list);
@@ -1769,15 +1731,14 @@ static int replace_system_preds(struct event_subsystem *system,
int err;
list_for_each_entry(file, &tr->events, list) {
- call = file->event_call;
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
/*
* Try to see if the filter can be applied
* (filter arg is ignored on dry_run)
*/
- err = replace_preds(call, NULL, ps, filter_string, true);
+ err = replace_preds(file->event_call, NULL, ps, true);
if (err)
event_set_no_set_filter_flag(file);
else
@@ -1787,9 +1748,7 @@ static int replace_system_preds(struct event_subsystem *system,
list_for_each_entry(file, &tr->events, list) {
struct event_filter *filter;
- call = file->event_call;
-
- if (strcmp(call->class->system, system->name) != 0)
+ if (file->system != dir)
continue;
if (event_no_set_filter_flag(file))
@@ -1811,7 +1770,7 @@ static int replace_system_preds(struct event_subsystem *system,
if (err)
goto fail_mem;
- err = replace_preds(call, filter, ps, filter_string, false);
+ err = replace_preds(file->event_call, filter, ps, false);
if (err) {
filter_disable(file);
parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
@@ -1933,7 +1892,7 @@ static int create_filter(struct ftrace_event_call *call,
err = create_filter_start(filter_str, set_str, &ps, &filter);
if (!err) {
- err = replace_preds(call, filter, ps, filter_str, false);
+ err = replace_preds(call, filter, ps, false);
if (err && set_str)
append_filter_err(ps, filter);
}
@@ -1959,7 +1918,7 @@ int create_event_filter(struct ftrace_event_call *call,
* Identical to create_filter() except that it creates a subsystem filter
* and always remembers @filter_str.
*/
-static int create_system_filter(struct event_subsystem *system,
+static int create_system_filter(struct ftrace_subsystem_dir *dir,
struct trace_array *tr,
char *filter_str, struct event_filter **filterp)
{
@@ -1969,7 +1928,7 @@ static int create_system_filter(struct event_subsystem *system,
err = create_filter_start(filter_str, true, &ps, &filter);
if (!err) {
- err = replace_system_preds(system, tr, ps, filter_str);
+ err = replace_system_preds(dir, tr, ps, filter_str);
if (!err) {
/* System filters just show a default message */
kfree(filter->filter_string);
@@ -2053,18 +2012,18 @@ int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
}
if (!strcmp(strstrip(filter_string), "0")) {
- filter_free_subsystem_preds(system, tr);
+ filter_free_subsystem_preds(dir, tr);
remove_filter_string(system->filter);
filter = system->filter;
system->filter = NULL;
/* Ensure all filters are no longer used */
synchronize_sched();
- filter_free_subsystem_filters(system, tr);
+ filter_free_subsystem_filters(dir, tr);
__free_filter(filter);
goto out_unlock;
}
- err = create_system_filter(system, tr, filter_string, &filter);
+ err = create_system_filter(dir, tr, filter_string, &filter);
if (filter) {
/*
* No event actually uses the system filter
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4de3e57f723c..f0a0c982cde3 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -15,6 +15,33 @@
#include "trace.h"
#include "trace_output.h"
+static bool kill_ftrace_graph;
+
+/**
+ * ftrace_graph_is_dead - returns true if ftrace_graph_stop() was called
+ *
+ * ftrace_graph_stop() is called when a severe error is detected in
+ * the function graph tracing. This function is called by the critical
+ * paths of function graph to keep those paths from doing any more harm.
+ */
+bool ftrace_graph_is_dead(void)
+{
+ return kill_ftrace_graph;
+}
+
+/**
+ * ftrace_graph_stop - set to permanently disable function graph tracincg
+ *
+ * In case of an error int function graph tracing, this is called
+ * to try to keep function graph tracing from causing any more harm.
+ * Usually this is pretty severe and this is called to try to at least
+ * get a warning out to the user.
+ */
+void ftrace_graph_stop(void)
+{
+ kill_ftrace_graph = true;
+}
+
/* When set, irq functions will be ignored */
static int ftrace_graph_skip_irqs;
@@ -92,6 +119,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth,
unsigned long long calltime;
int index;
+ if (unlikely(ftrace_graph_is_dead()))
+ return -EBUSY;
+
if (!current->ret_stack)
return -EBUSY;
@@ -323,7 +353,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
return ret;
}
-int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
+static int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
{
if (tracing_thresh)
return 1;
@@ -412,7 +442,7 @@ void set_graph_array(struct trace_array *tr)
smp_mb();
}
-void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
+static void trace_graph_thresh_return(struct ftrace_graph_ret *trace)
{
if (tracing_thresh &&
(trace->rettime - trace->calltime < tracing_thresh))
@@ -445,6 +475,12 @@ static void graph_trace_reset(struct trace_array *tr)
unregister_ftrace_graph();
}
+static int graph_trace_update_thresh(struct trace_array *tr)
+{
+ graph_trace_reset(tr);
+ return graph_trace_init(tr);
+}
+
static int max_bytes_for_cpu;
static enum print_line_t
@@ -1399,7 +1435,7 @@ static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
seq_printf(s, " | | | |\n");
}
-void print_graph_headers(struct seq_file *s)
+static void print_graph_headers(struct seq_file *s)
{
print_graph_headers_flags(s, tracer_flags.val);
}
@@ -1495,6 +1531,7 @@ static struct trace_event graph_trace_ret_event = {
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
+ .update_thresh = graph_trace_update_thresh,
.open = graph_trace_open,
.pipe_open = graph_trace_open,
.close = graph_trace_close,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f3dad80c20b2..c6977d5a9b12 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -20,23 +20,6 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
static int next_event_type = __TRACE_LAST_TYPE + 1;
-int trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
- int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
- int ret;
-
- ret = seq_write(m, s->buffer, len);
-
- /*
- * Only reset this buffer if we successfully wrote to the
- * seq_file buffer.
- */
- if (!ret)
- trace_seq_init(s);
-
- return ret;
-}
-
enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -85,257 +68,6 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
return TRACE_TYPE_HANDLED;
}
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- va_list ap;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- va_start(ap, fmt);
- ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
- va_end(ap);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_printf);
-
-/**
- * trace_seq_bitmask - put a list of longs as a bitmask print output
- * @s: trace sequence descriptor
- * @maskp: points to an array of unsigned longs that represent a bitmask
- * @nmaskbits: The number of bits that are valid in @maskp
- *
- * It returns 0 if the trace oversizes the buffer's free
- * space, 1 otherwise.
- *
- * Writes a ASCII representation of a bitmask string into @s.
- */
-int
-trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
- int nmaskbits)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
- s->len += ret;
-
- return 1;
-}
-EXPORT_SYMBOL_GPL(trace_seq_bitmask);
-
-/**
- * trace_seq_vprintf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = vsnprintf(s->buffer + s->len, len, fmt, args);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-EXPORT_SYMBOL_GPL(trace_seq_vprintf);
-
-int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
-{
- int len = (PAGE_SIZE - 1) - s->len;
- int ret;
-
- if (s->full || !len)
- return 0;
-
- ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
-
- /* If we can't write it all, don't bother writing anything */
- if (ret >= len) {
- s->full = 1;
- return 0;
- }
-
- s->len += ret;
-
- return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-int trace_seq_puts(struct trace_seq *s, const char *str)
-{
- int len = strlen(str);
-
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, str, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- s->buffer[s->len++] = c;
-
- return 1;
-}
-EXPORT_SYMBOL(trace_seq_putc);
-
-int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
-{
- if (s->full)
- return 0;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return 0;
- }
-
- memcpy(s->buffer + s->len, mem, len);
- s->len += len;
-
- return len;
-}
-
-int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
-{
- unsigned char hex[HEX_CHARS];
- const unsigned char *data = mem;
- int i, j;
-
- if (s->full)
- return 0;
-
-#ifdef __BIG_ENDIAN
- for (i = 0, j = 0; i < len; i++) {
-#else
- for (i = len-1, j = 0; i >= 0; i--) {
-#endif
- hex[j++] = hex_asc_hi(data[i]);
- hex[j++] = hex_asc_lo(data[i]);
- }
- hex[j++] = ' ';
-
- return trace_seq_putmem(s, hex, j);
-}
-
-void *trace_seq_reserve(struct trace_seq *s, size_t len)
-{
- void *ret;
-
- if (s->full)
- return NULL;
-
- if (len > ((PAGE_SIZE - 1) - s->len)) {
- s->full = 1;
- return NULL;
- }
-
- ret = s->buffer + s->len;
- s->len += len;
-
- return ret;
-}
-
-int trace_seq_path(struct trace_seq *s, const struct path *path)
-{
- unsigned char *p;
-
- if (s->full)
- return 0;
-
- if (s->len >= (PAGE_SIZE - 1)) {
- s->full = 1;
- return 0;
- }
-
- p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
- if (!IS_ERR(p)) {
- p = mangle_path(s->buffer + s->len, p, "\n");
- if (p) {
- s->len = p - s->buffer;
- return 1;
- }
- } else {
- s->buffer[s->len++] = '?';
- return 1;
- }
-
- s->full = 1;
- return 0;
-}
-
const char *
ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
unsigned long flags,
@@ -343,7 +75,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
{
unsigned long mask;
const char *str;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
int i, first = 1;
for (i = 0; flag_array[i].name && flags; i++) {
@@ -379,7 +111,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
const struct trace_print_flags *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -390,7 +122,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%lx", val);
trace_seq_putc(p, 0);
@@ -405,7 +137,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
const struct trace_print_flags_u64 *symbol_array)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; symbol_array[i].name; i++) {
@@ -416,7 +148,7 @@ ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
break;
}
- if (ret == (const char *)(p->buffer + p->len))
+ if (ret == (const char *)(trace_seq_buffer_ptr(p)))
trace_seq_printf(p, "0x%llx", val);
trace_seq_putc(p, 0);
@@ -430,7 +162,7 @@ const char *
ftrace_print_bitmask_seq(struct trace_seq *p, void *bitmask_ptr,
unsigned int bitmask_size)
{
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
trace_seq_bitmask(p, bitmask_ptr, bitmask_size * 8);
trace_seq_putc(p, 0);
@@ -443,7 +175,7 @@ const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
int i;
- const char *ret = p->buffer + p->len;
+ const char *ret = trace_seq_buffer_ptr(p);
for (i = 0; i < buf_len; i++)
trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 127a9d8c8357..80b25b585a70 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -35,9 +35,6 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
extern int __unregister_ftrace_event(struct trace_event *event);
extern struct rw_semaphore trace_event_sem;
-#define MAX_MEMHEX_BYTES 8
-#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
-
#define SEQ_PUT_FIELD_RET(s, x) \
do { \
if (!trace_seq_putmem(s, &(x), sizeof(x))) \
@@ -46,7 +43,6 @@ do { \
#define SEQ_PUT_HEX_FIELD_RET(s, x) \
do { \
- BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
return TRACE_TYPE_PARTIAL_LINE; \
} while (0)
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
new file mode 100644
index 000000000000..1f24ed99dca2
--- /dev/null
+++ b/kernel/trace/trace_seq.c
@@ -0,0 +1,428 @@
+/*
+ * trace_seq.c
+ *
+ * Copyright (C) 2008-2014 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * The trace_seq is a handy tool that allows you to pass a descriptor around
+ * to a buffer that other functions can write to. It is similar to the
+ * seq_file functionality but has some differences.
+ *
+ * To use it, the trace_seq must be initialized with trace_seq_init().
+ * This will set up the counters within the descriptor. You can call
+ * trace_seq_init() more than once to reset the trace_seq to start
+ * from scratch.
+ *
+ * The buffer size is currently PAGE_SIZE, although it may become dynamic
+ * in the future.
+ *
+ * A write to the buffer will either succed or fail. That is, unlike
+ * sprintf() there will not be a partial write (well it may write into
+ * the buffer but it wont update the pointers). This allows users to
+ * try to write something into the trace_seq buffer and if it fails
+ * they can flush it and try again.
+ *
+ */
+#include <linux/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/trace_seq.h>
+
+/* How much buffer is left on the trace_seq? */
+#define TRACE_SEQ_BUF_LEFT(s) ((PAGE_SIZE - 1) - (s)->len)
+
+/* How much buffer is written? */
+#define TRACE_SEQ_BUF_USED(s) min((s)->len, (unsigned int)(PAGE_SIZE - 1))
+
+/**
+ * trace_print_seq - move the contents of trace_seq into a seq_file
+ * @m: the seq_file descriptor that is the destination
+ * @s: the trace_seq descriptor that is the source.
+ *
+ * Returns 0 on success and non zero on error. If it succeeds to
+ * write to the seq_file it will reset the trace_seq, otherwise
+ * it does not modify the trace_seq to let the caller try again.
+ */
+int trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+ unsigned int len = TRACE_SEQ_BUF_USED(s);
+ int ret;
+
+ ret = seq_write(m, s->buffer, len);
+
+ /*
+ * Only reset this buffer if we successfully wrote to the
+ * seq_file buffer. This lets the caller try again or
+ * do something else with the contents.
+ */
+ if (!ret)
+ trace_seq_init(s);
+
+ return ret;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf() is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ va_list ap;
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_bitmask - write a bitmask array in its ASCII representation
+ * @s: trace sequence descriptor
+ * @maskp: points to an array of unsigned longs that represent a bitmask
+ * @nmaskbits: The number of bits that are valid in @maskp
+ *
+ * Writes a ASCII representation of a bitmask string into @s.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_bitmask(struct trace_seq *s, const unsigned long *maskp,
+ int nmaskbits)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bitmap_scnprintf(s->buffer, len, maskp, nmaskbits);
+ s->len += ret;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bitmask);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
+/**
+ * trace_seq_bprintf - Write the printf string from binary arguments
+ * @s: trace sequence descriptor
+ * @fmt: The format string for the @binary arguments
+ * @binary: The binary arguments for @fmt.
+ *
+ * When recording in a fast path, a printf may be recorded with just
+ * saving the format and the arguments as they were passed to the
+ * function, instead of wasting cycles converting the arguments into
+ * ASCII characters. Instead, the arguments are saved in a 32 bit
+ * word array that is defined by the format string constraints.
+ *
+ * This function will take the format and the binary array and finish
+ * the conversion into the ASCII string within the buffer.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+ unsigned int len = TRACE_SEQ_BUF_LEFT(s);
+ int ret;
+
+ if (s->full || !len)
+ return 0;
+
+ ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_bprintf);
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ unsigned int len = strlen(str);
+
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_puts);
+
+/**
+ * trace_seq_putc - trace sequence printing of simple character
+ * @s: trace sequence descriptor
+ * @c: simple character to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple charater
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putc);
+
+/**
+ * trace_seq_putmem - write raw data into the trace_seq buffer
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to copy into the buffer
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * There may be cases where raw memory needs to be written into the
+ * buffer and a strcpy() would not work. Using this function allows
+ * for such cases.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem(struct trace_seq *s, const void *mem, unsigned int len)
+{
+ if (s->full)
+ return 0;
+
+ if (len > TRACE_SEQ_BUF_LEFT(s)) {
+ s->full = 1;
+ return 0;
+ }
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem);
+
+#define MAX_MEMHEX_BYTES 8U
+#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
+
+/**
+ * trace_seq_putmem_hex - write raw memory into the buffer in ASCII hex
+ * @s: trace sequence descriptor
+ * @mem: The raw memory to write its hex ASCII representation of
+ * @len: The length of the raw memory to copy (in bytes)
+ *
+ * This is similar to trace_seq_putmem() except instead of just copying the
+ * raw memory into the buffer it writes its ASCII representation of it
+ * in hex characters.
+ *
+ * Returns how much it wrote to the buffer.
+ */
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+ unsigned int len)
+{
+ unsigned char hex[HEX_CHARS];
+ const unsigned char *data = mem;
+ unsigned int start_len;
+ int i, j;
+ int cnt = 0;
+
+ if (s->full)
+ return 0;
+
+ while (len) {
+ start_len = min(len, HEX_CHARS - 1);
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < start_len; i++) {
+#else
+ for (i = start_len-1, j = 0; i >= 0; i--) {
+#endif
+ hex[j++] = hex_asc_hi(data[i]);
+ hex[j++] = hex_asc_lo(data[i]);
+ }
+ if (WARN_ON_ONCE(j == 0 || j/2 > len))
+ break;
+
+ /* j increments twice per loop */
+ len -= j / 2;
+ hex[j++] = ' ';
+
+ cnt += trace_seq_putmem(s, hex, j);
+ }
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_putmem_hex);
+
+/**
+ * trace_seq_path - copy a path into the sequence buffer
+ * @s: trace sequence descriptor
+ * @path: path to write into the sequence buffer.
+ *
+ * Write a path name into the sequence buffer.
+ *
+ * Returns 1 if we successfully written all the contents to
+ * the buffer.
+ * Returns 0 if we the length to write is bigger than the
+ * reserved buffer space. In this case, nothing gets written.
+ */
+int trace_seq_path(struct trace_seq *s, const struct path *path)
+{
+ unsigned char *p;
+
+ if (s->full)
+ return 0;
+
+ if (TRACE_SEQ_BUF_LEFT(s) < 1) {
+ s->full = 1;
+ return 0;
+ }
+
+ p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+ if (!IS_ERR(p)) {
+ p = mangle_path(s->buffer + s->len, p, "\n");
+ if (p) {
+ s->len = p - s->buffer;
+ return 1;
+ }
+ } else {
+ s->buffer[s->len++] = '?';
+ return 1;
+ }
+
+ s->full = 1;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(trace_seq_path);
+
+/**
+ * trace_seq_to_user - copy the squence buffer to user space
+ * @s: trace sequence descriptor
+ * @ubuf: The userspace memory location to copy to
+ * @cnt: The amount to copy
+ *
+ * Copies the sequence buffer into the userspace memory pointed to
+ * by @ubuf. It starts from the last read position (@s->readpos)
+ * and writes up to @cnt characters or till it reaches the end of
+ * the content in the buffer (@s->len), which ever comes first.
+ *
+ * On success, it returns a positive number of the number of bytes
+ * it copied.
+ *
+ * On failure it returns -EBUSY if all of the content in the
+ * sequence has been already read, which includes nothing in the
+ * sequenc (@s->len == @s->readpos).
+ *
+ * Returns -EFAULT if the copy to userspace fails.
+ */
+int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt)
+{
+ int len;
+ int ret;
+
+ if (!cnt)
+ return 0;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret == cnt)
+ return -EFAULT;
+
+ cnt -= ret;
+
+ s->readpos += cnt;
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(trace_seq_to_user);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 3c9b97e6b1f4..33ff6a24b802 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -265,7 +265,6 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
if (is_ret)
tu->consumer.ret_handler = uretprobe_dispatcher;
init_trace_uprobe_filter(&tu->filter);
- tu->tp.call.flags |= TRACE_EVENT_FL_USE_CALL_FILTER;
return tu;
error:
@@ -1292,7 +1291,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
kfree(call->print_fmt);
return -ENODEV;
}
- call->flags = 0;
+
call->class->reg = trace_uprobe_register;
call->data = tu;
ret = trace_add_event_call(call);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
return;
}
-struct seq_operations proc_uid_seq_operations = {
+const struct seq_operations proc_uid_seq_operations = {
.start = uid_m_start,
.stop = m_stop,
.next = m_next,
.show = uid_m_show,
};
-struct seq_operations proc_gid_seq_operations = {
+const struct seq_operations proc_gid_seq_operations = {
.start = gid_m_start,
.stop = m_stop,
.next = m_next,
.show = gid_m_show,
};
-struct seq_operations proc_projid_seq_operations = {
+const struct seq_operations proc_projid_seq_operations = {
.start = projid_m_start,
.stop = m_stop,
.next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
struct uts_namespace *ns = NULL;
struct nsproxy *nsproxy;
- rcu_read_lock();
- nsproxy = task_nsproxy(task);
+ task_lock(task);
+ nsproxy = task->nsproxy;
if (nsproxy) {
ns = nsproxy->uts_ns;
get_uts_ns(ns);
}
- rcu_read_unlock();
+ task_unlock(task);
return ns;
}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..a8d6914030fe 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
if (hardlockup_panic)
- panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
else
- WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
+ this_cpu);
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
}
}
- printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
print_modules();
@@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
smp_mb__after_atomic();
}
+ add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
if (PTR_ERR(event) == -EOPNOTSUPP)
pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
cpu);
else
pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 35974ac69600..5dbe22aa3efd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -265,7 +265,6 @@ struct workqueue_struct {
static struct kmem_cache *pwq_cache;
-static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */
@@ -758,13 +757,6 @@ static bool too_many_workers(struct worker_pool *pool)
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
- /*
- * nr_idle and idle_list may disagree if idle rebinding is in
- * progress. Never return %true if idle_list is empty.
- */
- if (list_empty(&pool->idle_list))
- return false;
-
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
@@ -850,7 +842,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
pool = worker->pool;
/* this can only happen on the local cpu */
- if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu))
return NULL;
/*
@@ -874,35 +866,22 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
- * @wakeup: wakeup an idle worker if necessary
*
- * Set @flags in @worker->flags and adjust nr_running accordingly. If
- * nr_running becomes zero and @wakeup is %true, an idle worker is
- * woken up.
+ * Set @flags in @worker->flags and adjust nr_running accordingly.
*
* CONTEXT:
* spin_lock_irq(pool->lock)
*/
-static inline void worker_set_flags(struct worker *worker, unsigned int flags,
- bool wakeup)
+static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
WARN_ON_ONCE(worker->task != current);
- /*
- * If transitioning into NOT_RUNNING, adjust nr_running and
- * wake up an idle worker as necessary if requested by
- * @wakeup.
- */
+ /* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
!(worker->flags & WORKER_NOT_RUNNING)) {
- if (wakeup) {
- if (atomic_dec_and_test(&pool->nr_running) &&
- !list_empty(&pool->worklist))
- wake_up_worker(pool);
- } else
- atomic_dec(&pool->nr_running);
+ atomic_dec(&pool->nr_running);
}
worker->flags |= flags;
@@ -1232,7 +1211,7 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork,
pwq_activate_delayed_work(work);
list_del_init(&work->entry);
- pwq_dec_nr_in_flight(get_work_pwq(work), get_work_color(work));
+ pwq_dec_nr_in_flight(pwq, get_work_color(work));
/* work->data points to pwq iff queued, point to pool */
set_work_pool_and_keep_pending(work, pool->id);
@@ -1560,7 +1539,7 @@ static void worker_enter_idle(struct worker *worker)
(worker->hentry.next || worker->hentry.pprev)))
return;
- /* can't use worker_set_flags(), also called from start_worker() */
+ /* can't use worker_set_flags(), also called from create_worker() */
worker->flags |= WORKER_IDLE;
pool->nr_idle++;
worker->last_active = jiffies;
@@ -1602,11 +1581,11 @@ static void worker_leave_idle(struct worker *worker)
list_del_init(&worker->entry);
}
-static struct worker *alloc_worker(void)
+static struct worker *alloc_worker(int node)
{
struct worker *worker;
- worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+ worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, node);
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
@@ -1670,6 +1649,9 @@ static void worker_detach_from_pool(struct worker *worker,
detach_completion = pool->detach_completion;
mutex_unlock(&pool->attach_mutex);
+ /* clear leftover flags without pool->lock after it is detached */
+ worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
+
if (detach_completion)
complete(detach_completion);
}
@@ -1678,8 +1660,7 @@ static void worker_detach_from_pool(struct worker *worker,
* create_worker - create a new workqueue worker
* @pool: pool the new worker will belong to
*
- * Create a new worker which is attached to @pool. The new worker must be
- * started by start_worker().
+ * Create and start a new worker which is attached to @pool.
*
* CONTEXT:
* Might sleep. Does GFP_KERNEL allocations.
@@ -1698,7 +1679,7 @@ static struct worker *create_worker(struct worker_pool *pool)
if (id < 0)
goto fail;
- worker = alloc_worker();
+ worker = alloc_worker(pool->node);
if (!worker)
goto fail;
@@ -1724,6 +1705,13 @@ static struct worker *create_worker(struct worker_pool *pool)
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
+ /* start the newly created worker */
+ spin_lock_irq(&pool->lock);
+ worker->pool->nr_workers++;
+ worker_enter_idle(worker);
+ wake_up_process(worker->task);
+ spin_unlock_irq(&pool->lock);
+
return worker;
fail:
@@ -1734,44 +1722,6 @@ fail:
}
/**
- * start_worker - start a newly created worker
- * @worker: worker to start
- *
- * Make the pool aware of @worker and start it.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void start_worker(struct worker *worker)
-{
- worker->pool->nr_workers++;
- worker_enter_idle(worker);
- wake_up_process(worker->task);
-}
-
-/**
- * create_and_start_worker - create and start a worker for a pool
- * @pool: the target pool
- *
- * Grab the managership of @pool and create and start a new worker for it.
- *
- * Return: 0 on success. A negative error code otherwise.
- */
-static int create_and_start_worker(struct worker_pool *pool)
-{
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
- }
-
- return worker ? 0 : -ENOMEM;
-}
-
-/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
@@ -1909,23 +1859,10 @@ restart:
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT);
while (true) {
- struct worker *worker;
-
- worker = create_worker(pool);
- if (worker) {
- del_timer_sync(&pool->mayday_timer);
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- if (WARN_ON_ONCE(need_to_create_worker(pool)))
- goto restart;
- return true;
- }
-
- if (!need_to_create_worker(pool))
+ if (create_worker(pool) || !need_to_create_worker(pool))
break;
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule_timeout(CREATE_COOLDOWN);
+ schedule_timeout_interruptible(CREATE_COOLDOWN);
if (!need_to_create_worker(pool))
break;
@@ -1933,6 +1870,11 @@ restart:
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
+ /*
+ * This is necessary even after a new worker was just successfully
+ * created as @pool->lock was dropped and the new worker might have
+ * already become busy.
+ */
if (need_to_create_worker(pool))
goto restart;
return true;
@@ -2020,13 +1962,8 @@ __acquires(&pool->lock)
lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
- /*
- * Ensure we're on the correct CPU. DISASSOCIATED test is
- * necessary to avoid spurious warnings from rescuers servicing the
- * unbound or a disassociated pool.
- */
- WARN_ON_ONCE(!(worker->flags & WORKER_UNBOUND) &&
- !(pool->flags & POOL_DISASSOCIATED) &&
+ /* ensure we're on the correct CPU */
+ WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
raw_smp_processor_id() != pool->cpu);
/*
@@ -2052,17 +1989,22 @@ __acquires(&pool->lock)
list_del_init(&work->entry);
/*
- * CPU intensive works don't participate in concurrency
- * management. They're the scheduler's responsibility.
+ * CPU intensive works don't participate in concurrency management.
+ * They're the scheduler's responsibility. This takes @worker out
+ * of concurrency management and the next code block will chain
+ * execution of the pending work items.
*/
if (unlikely(cpu_intensive))
- worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+ worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/*
- * Unbound pool isn't concurrency managed and work items should be
- * executed ASAP. Wake up another worker if necessary.
+ * Wake up another worker if necessary. The condition is always
+ * false for normal per-cpu workers since nr_running would always
+ * be >= 1 at this point. This is used to chain execution of the
+ * pending work items for WORKER_NOT_RUNNING workers such as the
+ * UNBOUND and CPU_INTENSIVE ones.
*/
- if ((worker->flags & WORKER_UNBOUND) && need_more_worker(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
/*
@@ -2218,7 +2160,7 @@ recheck:
}
} while (keep_working(pool));
- worker_set_flags(worker, WORKER_PREP, false);
+ worker_set_flags(worker, WORKER_PREP);
sleep:
/*
* pool->lock is held and there's no work to process and no need to
@@ -2311,29 +2253,27 @@ repeat:
move_linked_works(work, scheduled, &n);
process_scheduled_works(rescuer);
- spin_unlock_irq(&pool->lock);
-
- worker_detach_from_pool(rescuer, pool);
-
- spin_lock_irq(&pool->lock);
/*
* Put the reference grabbed by send_mayday(). @pool won't
- * go away while we're holding its lock.
+ * go away while we're still attached to it.
*/
put_pwq(pwq);
/*
- * Leave this pool. If keep_working() is %true, notify a
+ * Leave this pool. If need_more_worker() is %true, notify a
* regular worker; otherwise, we end up with 0 concurrency
* and stalling the execution.
*/
- if (keep_working(pool))
+ if (need_more_worker(pool))
wake_up_worker(pool);
rescuer->pool = NULL;
- spin_unlock(&pool->lock);
- spin_lock(&wq_mayday_lock);
+ spin_unlock_irq(&pool->lock);
+
+ worker_detach_from_pool(rescuer, pool);
+
+ spin_lock_irq(&wq_mayday_lock);
}
spin_unlock_irq(&wq_mayday_lock);
@@ -3458,7 +3398,7 @@ static void put_unbound_pool(struct worker_pool *pool)
return;
/* sanity checks */
- if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ if (WARN_ON(!(pool->cpu < 0)) ||
WARN_ON(!list_empty(&pool->worklist)))
return;
@@ -3524,7 +3464,7 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
if (wqattrs_equal(pool->attrs, attrs)) {
pool->refcnt++;
- goto out_unlock;
+ return pool;
}
}
@@ -3557,12 +3497,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
goto fail;
/* create and start the initial worker */
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
goto fail;
/* install */
hash_add(unbound_pool_hash, &pool->hash_node, hash);
-out_unlock:
+
return pool;
fail:
if (pool)
@@ -3591,11 +3531,6 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
return;
- /*
- * Unlink @pwq. Synchronization against wq->mutex isn't strictly
- * necessary on release but do it anyway. It's easier to verify
- * and consistent with the linking path.
- */
mutex_lock(&wq->mutex);
list_del_rcu(&pwq->pwqs_node);
is_last = list_empty(&wq->pwqs);
@@ -3692,10 +3627,7 @@ static void link_pwq(struct pool_workqueue *pwq)
if (!list_empty(&pwq->pwqs_node))
return;
- /*
- * Set the matching work_color. This is synchronized with
- * wq->mutex to avoid confusing flush_workqueue().
- */
+ /* set the matching work_color */
pwq->work_color = wq->work_color;
/* sync max_active to the current setting */
@@ -3832,7 +3764,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
return -EINVAL;
- pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ pwq_tbl = kzalloc(nr_node_ids * sizeof(pwq_tbl[0]), GFP_KERNEL);
new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
if (!pwq_tbl || !new_attrs || !tmp_attrs)
@@ -4080,7 +4012,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
- tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
+ tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]);
wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
@@ -4122,7 +4054,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- rescuer = alloc_worker();
+ rescuer = alloc_worker(NUMA_NO_NODE);
if (!rescuer)
goto err_destroy;
@@ -4470,8 +4402,6 @@ static void wq_unbind_fn(struct work_struct *work)
struct worker *worker;
for_each_cpu_worker_pool(pool, cpu) {
- WARN_ON_ONCE(cpu != smp_processor_id());
-
mutex_lock(&pool->attach_mutex);
spin_lock_irq(&pool->lock);
@@ -4543,6 +4473,7 @@ static void rebind_workers(struct worker_pool *pool)
pool->attrs->cpumask) < 0);
spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
for_each_pool_worker(worker, pool) {
unsigned int worker_flags = worker->flags;
@@ -4632,7 +4563,7 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
- if (create_and_start_worker(pool) < 0)
+ if (!create_worker(pool))
return NOTIFY_BAD;
}
break;
@@ -4644,15 +4575,10 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb,
for_each_pool(pool, pi) {
mutex_lock(&pool->attach_mutex);
- if (pool->cpu == cpu) {
- spin_lock_irq(&pool->lock);
- pool->flags &= ~POOL_DISASSOCIATED;
- spin_unlock_irq(&pool->lock);
-
+ if (pool->cpu == cpu)
rebind_workers(pool);
- } else if (pool->cpu < 0) {
+ else if (pool->cpu < 0)
restore_unbound_workers_cpumask(pool, cpu);
- }
mutex_unlock(&pool->attach_mutex);
}
@@ -4856,10 +4782,6 @@ static void __init wq_numa_init(void)
cpumask_var_t *tbl;
int node, cpu;
- /* determine NUMA pwq table len - highest node id + 1 */
- for_each_node(node)
- wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
-
if (num_possible_nodes() <= 1)
return;
@@ -4876,7 +4798,7 @@ static void __init wq_numa_init(void)
* available. Build one from cpu_to_node() which should have been
* fully initialized by now.
*/
- tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ tbl = kzalloc(nr_node_ids * sizeof(tbl[0]), GFP_KERNEL);
BUG_ON(!tbl);
for_each_node(node)
@@ -4936,7 +4858,7 @@ static int __init init_workqueues(void)
for_each_cpu_worker_pool(pool, cpu) {
pool->flags &= ~POOL_DISASSOCIATED;
- BUG_ON(create_and_start_worker(pool) < 0);
+ BUG_ON(!create_worker(pool));
}
}