aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c13
-rw-r--r--kernel/audit_tree.c1
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/core.c9
-rw-r--r--kernel/bpf/verifier.c3
-rw-r--r--kernel/context_tracking.c40
-rw-r--r--kernel/cpu.c21
-rw-r--r--kernel/cpuset.c23
-rw-r--r--kernel/events/core.c83
-rw-r--r--kernel/events/hw_breakpoint.c7
-rw-r--r--kernel/events/uprobes.c1
-rw-r--r--kernel/exit.c5
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/freezer.c9
-rw-r--r--kernel/futex.c36
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/irq/generic-chip.c36
-rw-r--r--kernel/kmod.c76
-rw-r--r--kernel/locking/mutex.c8
-rw-r--r--kernel/module.c30
-rw-r--r--kernel/panic.c1
-rw-r--r--kernel/power/hibernate.c8
-rw-r--r--kernel/power/process.c57
-rw-r--r--kernel/power/qos.c27
-rw-r--r--kernel/power/suspend.c4
-rw-r--r--kernel/rcu/Makefile2
-rw-r--r--kernel/rcu/rcu.h2
-rw-r--r--kernel/rcu/rcutorture.c1
-rw-r--r--kernel/rcu/tiny.c6
-rw-r--r--kernel/rcu/tree.c112
-rw-r--r--kernel/rcu/tree.h23
-rw-r--r--kernel/rcu/tree_plugin.h144
-rw-r--r--kernel/rcu/update.c89
-rw-r--r--kernel/sched/completion.c5
-rw-r--r--kernel/sched/core.c361
-rw-r--r--kernel/sched/cpudeadline.h3
-rw-r--r--kernel/sched/cpupri.h3
-rw-r--r--kernel/sched/deadline.c142
-rw-r--r--kernel/sched/debug.c11
-rw-r--r--kernel/sched/fair.c389
-rw-r--r--kernel/sched/idle_task.c5
-rw-r--r--kernel/sched/rt.c19
-rw-r--r--kernel/sched/sched.h45
-rw-r--r--kernel/sched/stop_task.c5
-rw-r--r--kernel/sched/wait.c66
-rw-r--r--kernel/signal.c42
-rw-r--r--kernel/smpboot.c15
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/sysctl.c3
-rw-r--r--kernel/time/Makefile2
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/posix-cpu-timers.c2
-rw-r--r--kernel/time/posix-timers.c1
-rw-r--r--kernel/time/test_udelay.c (renamed from kernel/time/udelay_test.c)0
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/time.c20
-rw-r--r--kernel/time/timekeeping.c59
-rw-r--r--kernel/time/timer.c3
-rw-r--r--kernel/trace/ftrace.c54
-rw-r--r--kernel/trace/ring_buffer.c81
-rw-r--r--kernel/trace/trace.c33
-rw-r--r--kernel/trace/trace_syscalls.c8
63 files changed, 1594 insertions, 681 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index dc5c77544fd6..17ea6d4a9a24 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_IRQ_WORK) += irq_work.o
obj-$(CONFIG_CPU_PM) += cpu_pm.o
-obj-$(CONFIG_NET) += bpf/
+obj-$(CONFIG_BPF) += bpf/
obj-$(CONFIG_PERF_EVENTS) += events/
diff --git a/kernel/audit.c b/kernel/audit.c
index 80983df92cd4..1f37f15117e5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -499,7 +499,6 @@ static int kauditd_thread(void *dummy)
set_freezable();
while (!kthread_should_stop()) {
struct sk_buff *skb;
- DECLARE_WAITQUEUE(wait, current);
flush_hold_queue();
@@ -514,16 +513,8 @@ static int kauditd_thread(void *dummy)
audit_printk_skb(skb);
continue;
}
- set_current_state(TASK_INTERRUPTIBLE);
- add_wait_queue(&kauditd_wait, &wait);
- if (!skb_queue_len(&audit_skb_queue)) {
- try_to_freeze();
- schedule();
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&kauditd_wait, &wait);
+ wait_event_freezable(kauditd_wait, skb_queue_len(&audit_skb_queue));
}
return 0;
}
@@ -739,7 +730,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
audit_log_task_info(ab, current);
- audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
+ audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
audit_feature_names[which], !!old_feature, !!new_feature,
!!old_lock, !!new_lock, res);
audit_log_end(ab);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index e242e3a9864a..80f29e015570 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
chunk->owners[i].index = i;
}
fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
+ chunk->mark.mask = FS_IN_IGNORED;
return chunk;
}
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 45427239f375..0daf7f6ae7df 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1,5 +1,5 @@
-obj-y := core.o syscall.o verifier.o
-
+obj-y := core.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
ifdef CONFIG_TEST_BPF
-obj-y += test_stub.o
+obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index f0c30c59b317..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -655,3 +655,12 @@ void bpf_prog_free(struct bpf_prog *fp)
schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
+
+/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
+ * skb_copy_bits(), so provide a weak definition of it for NET-less config.
+ */
+int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
+ int len)
+{
+ return -EFAULT;
+}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 801f5f3b9307..9f81818f2941 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1409,7 +1409,8 @@ static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
if (memcmp(&old->regs[i], &cur->regs[i],
sizeof(old->regs[0])) != 0) {
if (old->regs[i].type == NOT_INIT ||
- old->regs[i].type == UNKNOWN_VALUE)
+ (old->regs[i].type == UNKNOWN_VALUE &&
+ cur->regs[i].type != NOT_INIT))
continue;
return false;
}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
}
NOKPROBE_SYMBOL(context_tracking_user_enter);
-#ifdef CONFIG_PREEMPT
-/**
- * preempt_schedule_context - preempt_schedule called by tracing
- *
- * The tracing infrastructure uses preempt_enable_notrace to prevent
- * recursion and tracing preempt enabling caused by the tracing
- * infrastructure itself. But as tracing can happen in areas coming
- * from userspace or just about to enter userspace, a preempt enable
- * can occur before user_exit() is called. This will cause the scheduler
- * to be called when the system is still in usermode.
- *
- * To prevent this, the preempt_enable_notrace will use this function
- * instead of preempt_schedule() to exit user context if needed before
- * calling the scheduler.
- */
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
-{
- enum ctx_state prev_ctx;
-
- if (likely(!preemptible()))
- return;
-
- /*
- * Need to disable preemption in case user_exit() is traced
- * and the tracer calls preempt_enable_notrace() causing
- * an infinite recursion.
- */
- preempt_disable_notrace();
- prev_ctx = exception_enter();
- preempt_enable_no_resched_notrace();
-
- preempt_schedule();
-
- preempt_disable_notrace();
- exception_exit(prev_ctx);
- preempt_enable_notrace();
-}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_PREEMPT */
-
/**
* context_tracking_user_exit - Inform the context tracking that the CPU is
* exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 356450f09c1f..5d220234b3ca 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
* an ongoing cpu hotplug operation.
*/
int refcount;
+ /* And allows lockless put_online_cpus(). */
+ atomic_t puts_pending;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -84,6 +86,16 @@ static struct {
#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
+static void apply_puts_pending(int max)
+{
+ int delta;
+
+ if (atomic_read(&cpu_hotplug.puts_pending) >= max) {
+ delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
+ cpu_hotplug.refcount -= delta;
+ }
+}
+
void get_online_cpus(void)
{
might_sleep();
@@ -91,6 +103,7 @@ void get_online_cpus(void)
return;
cpuhp_lock_acquire_read();
mutex_lock(&cpu_hotplug.lock);
+ apply_puts_pending(65536);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
}
@@ -103,6 +116,7 @@ bool try_get_online_cpus(void)
if (!mutex_trylock(&cpu_hotplug.lock))
return false;
cpuhp_lock_acquire_tryread();
+ apply_puts_pending(65536);
cpu_hotplug.refcount++;
mutex_unlock(&cpu_hotplug.lock);
return true;
@@ -113,7 +127,11 @@ void put_online_cpus(void)
{
if (cpu_hotplug.active_writer == current)
return;
- mutex_lock(&cpu_hotplug.lock);
+ if (!mutex_trylock(&cpu_hotplug.lock)) {
+ atomic_inc(&cpu_hotplug.puts_pending);
+ cpuhp_lock_release();
+ return;
+ }
if (WARN_ON(!cpu_hotplug.refcount))
cpu_hotplug.refcount++; /* try to fix things up */
@@ -155,6 +173,7 @@ void cpu_hotplug_begin(void)
cpuhp_lock_acquire();
for (;;) {
mutex_lock(&cpu_hotplug.lock);
+ apply_puts_pending(1);
if (likely(!cpu_hotplug.refcount))
break;
__set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1f107c74087b..723cfc9d0ad7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -506,6 +506,16 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
goto out;
}
+ /*
+ * We can't shrink if we won't have enough room for SCHED_DEADLINE
+ * tasks.
+ */
+ ret = -EBUSY;
+ if (is_cpu_exclusive(cur) &&
+ !cpuset_cpumask_can_shrink(cur->cpus_allowed,
+ trial->cpus_allowed))
+ goto out;
+
ret = 0;
out:
rcu_read_unlock();
@@ -1429,17 +1439,8 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
goto out_unlock;
cgroup_taskset_for_each(task, tset) {
- /*
- * Kthreads which disallow setaffinity shouldn't be moved
- * to a new cpuset; we don't want to change their cpu
- * affinity and isolating such threads by their set of
- * allowed nodes is unnecessary. Thus, cpusets are not
- * applicable for such threads. This prevents checking for
- * success of set_cpus_allowed_ptr() on all attached tasks
- * before cpus_allowed may be changed.
- */
- ret = -EINVAL;
- if (task->flags & PF_NO_SETAFFINITY)
+ ret = task_can_attach(task, cs->cpus_allowed);
+ if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1425d07018de..3e19d3ebc29c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1562,8 +1562,10 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group
if (!task) {
/*
- * Per cpu events are removed via an smp call and
- * the removal is always successful.
+ * Per cpu events are removed via an smp call. The removal can
+ * fail if the CPU is currently offline, but in that case we
+ * already called __perf_remove_from_context from
+ * perf_event_exit_cpu.
*/
cpu_function_call(event->cpu, __perf_remove_from_context, &re);
return;
@@ -4458,7 +4460,7 @@ perf_output_sample_regs(struct perf_output_handle *handle,
}
}
-static void perf_sample_regs_user(struct perf_regs_user *regs_user,
+static void perf_sample_regs_user(struct perf_regs *regs_user,
struct pt_regs *regs)
{
if (!user_mode(regs)) {
@@ -4469,11 +4471,22 @@ static void perf_sample_regs_user(struct perf_regs_user *regs_user,
}
if (regs) {
- regs_user->regs = regs;
regs_user->abi = perf_reg_abi(current);
+ regs_user->regs = regs;
+ } else {
+ regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
+ regs_user->regs = NULL;
}
}
+static void perf_sample_regs_intr(struct perf_regs *regs_intr,
+ struct pt_regs *regs)
+{
+ regs_intr->regs = regs;
+ regs_intr->abi = perf_reg_abi(current);
+}
+
+
/*
* Get remaining task size from user stack pointer.
*
@@ -4855,6 +4868,23 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_TRANSACTION)
perf_output_put(handle, data->txn);
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ u64 abi = data->regs_intr.abi;
+ /*
+ * If there are no regs to dump, notice it through
+ * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
+ */
+ perf_output_put(handle, abi);
+
+ if (abi) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ perf_output_sample_regs(handle,
+ data->regs_intr.regs,
+ mask);
+ }
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -4920,12 +4950,13 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+ if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
+ perf_sample_regs_user(&data->regs_user, regs);
+
if (sample_type & PERF_SAMPLE_REGS_USER) {
/* regs dump ABI info */
int size = sizeof(u64);
- perf_sample_regs_user(&data->regs_user, regs);
-
if (data->regs_user.regs) {
u64 mask = event->attr.sample_regs_user;
size += hweight64(mask) * sizeof(u64);
@@ -4941,15 +4972,11 @@ void perf_prepare_sample(struct perf_event_header *header,
* in case new sample type is added, because we could eat
* up the rest of the sample size.
*/
- struct perf_regs_user *uregs = &data->regs_user;
u16 stack_size = event->attr.sample_stack_user;
u16 size = sizeof(u64);
- if (!uregs->abi)
- perf_sample_regs_user(uregs, regs);
-
stack_size = perf_sample_ustack_size(stack_size, header->size,
- uregs->regs);
+ data->regs_user.regs);
/*
* If there is something to dump, add space for the dump
@@ -4962,6 +4989,21 @@ void perf_prepare_sample(struct perf_event_header *header,
data->stack_user_size = stack_size;
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_REGS_INTR) {
+ /* regs dump ABI info */
+ int size = sizeof(u64);
+
+ perf_sample_regs_intr(&data->regs_intr, regs);
+
+ if (data->regs_intr.regs) {
+ u64 mask = event->attr.sample_regs_intr;
+
+ size += hweight64(mask) * sizeof(u64);
+ }
+
+ header->size += size;
+ }
}
static void perf_event_output(struct perf_event *event,
@@ -6071,11 +6113,6 @@ static int perf_swevent_init(struct perf_event *event)
return 0;
}
-static int perf_swevent_event_idx(struct perf_event *event)
-{
- return 0;
-}
-
static struct pmu perf_swevent = {
.task_ctx_nr = perf_sw_context,
@@ -6085,8 +6122,6 @@ static struct pmu perf_swevent = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
#ifdef CONFIG_EVENT_TRACING
@@ -6204,8 +6239,6 @@ static struct pmu perf_tracepoint = {
.start = perf_swevent_start,
.stop = perf_swevent_stop,
.read = perf_swevent_read,
-
- .event_idx = perf_swevent_event_idx,
};
static inline void perf_tp_register(void)
@@ -6431,8 +6464,6 @@ static struct pmu perf_cpu_clock = {
.start = cpu_clock_event_start,
.stop = cpu_clock_event_stop,
.read = cpu_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
/*
@@ -6511,8 +6542,6 @@ static struct pmu perf_task_clock = {
.start = task_clock_event_start,
.stop = task_clock_event_stop,
.read = task_clock_event_read,
-
- .event_idx = perf_swevent_event_idx,
};
static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6542,7 +6571,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
static int perf_event_idx_default(struct perf_event *event)
{
- return event->hw.idx + 1;
+ return 0;
}
/*
@@ -7162,6 +7191,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
ret = -EINVAL;
}
+ if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
+ ret = perf_reg_validate(attr->sample_regs_intr);
out:
return ret;
@@ -8130,7 +8161,7 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = false };
+ struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
perf_pmu_rotate_stop(ctx->pmu);
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
bp->hw.state = PERF_HES_STOPPED;
}
-static int hw_breakpoint_event_idx(struct perf_event *bp)
-{
- return 0;
-}
-
static struct pmu perf_breakpoint = {
.task_ctx_nr = perf_sw_context, /* could eventually get its own */
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
.start = hw_breakpoint_start,
.stop = hw_breakpoint_stop,
.read = hw_breakpoint_pmu_read,
-
- .event_idx = hw_breakpoint_event_idx,
};
int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 1d0af8a2c646..ed8f2cde34c5 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1640,7 +1640,6 @@ bool uprobe_deny_signal(void)
if (__fatal_signal_pending(t) || arch_uprobe_xol_was_trapped(t)) {
utask->state = UTASK_SSTEP_TRAPPED;
set_tsk_thread_flag(t, TIF_UPROBE);
- set_tsk_thread_flag(t, TIF_NOTIFY_RESUME);
}
}
diff --git a/kernel/exit.c b/kernel/exit.c
index 5d30019ff953..232c4bc8bcc9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -997,6 +997,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
+
if ((exit_code & 0x7f) == 0) {
why = CLD_EXITED;
status = exit_code >> 8;
@@ -1079,6 +1081,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
* thread can reap it because we its state == DEAD/TRACE.
*/
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
retval = wo->wo_rusage
? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
@@ -1210,6 +1213,7 @@ unlock_sig:
pid = task_pid_vnr(p);
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (unlikely(wo->wo_flags & WNOWAIT))
return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
@@ -1272,6 +1276,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
pid = task_pid_vnr(p);
get_task_struct(p);
read_unlock(&tasklist_lock);
+ sched_annotate_sleep();
if (!wo->wo_info) {
retval = wo->wo_rusage
diff --git a/kernel/fork.c b/kernel/fork.c
index 9b7d746d6d62..9ca84189cfc2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1022,11 +1022,14 @@ void __cleanup_sighand(struct sighand_struct *sighand)
{
if (atomic_dec_and_test(&sighand->count)) {
signalfd_cleanup(sighand);
+ /*
+ * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it
+ * without an RCU grace period, see __lock_task_sighand().
+ */
kmem_cache_free(sighand_cachep, sighand);
}
}
-
/*
* Initialize POSIX timer handling for a thread group.
*/
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
return false;
+ if (test_thread_flag(TIF_MEMDIE))
+ return false;
+
if (pm_nosig_freezing || cgroup_freezing(p))
return true;
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
{
unsigned long flags;
- /*
- * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
- * be visible to @p as waking up implies wmb. Waking up inside
- * freezer_lock also prevents wakeups from leaking outside
- * refrigerator.
- */
spin_lock_irqsave(&freezer_lock, flags);
if (frozen(p))
wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index f3a3a071283c..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
*
* Where (A) orders the waiters increment and the futex value read through
* atomic operations (see hb_waiters_inc) and where (B) orders the write
- * to futex and the waiters read -- this is done by the barriers in
- * get_futex_key_refs(), through either ihold or atomic_inc, depending on the
- * futex type.
+ * to futex and the waiters read -- this is done by the barriers for both
+ * shared and private futexes in get_futex_key_refs().
*
* This yields the following case (where X:=waiters, Y:=futex):
*
@@ -344,13 +343,20 @@ static void get_futex_key_refs(union futex_key *key)
futex_get_mm(key); /* implies MB (B) */
break;
default:
+ /*
+ * Private futexes do not hold reference on an inode or
+ * mm, therefore the only purpose of calling get_futex_key_refs
+ * is because we need the barrier for the lockless waiter check.
+ */
smp_mb(); /* explicit MB (B) */
}
}
/*
* Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
+ * The hash bucket spinlock must not be held. This is
+ * a no-op for private futexes, see comment in the get
+ * counterpart.
*/
static void drop_futex_key_refs(union futex_key *key)
{
@@ -641,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+/*
+ * Must be called with the hb lock held.
+ */
static void free_pi_state(struct futex_pi_state *pi_state)
{
+ if (!pi_state)
+ return;
+
if (!atomic_dec_and_test(&pi_state->refcount))
return;
@@ -1521,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
}
retry:
- if (pi_state != NULL) {
- /*
- * We will have to lookup the pi_state again, so free this one
- * to keep the accounting correct.
- */
- free_pi_state(pi_state);
- pi_state = NULL;
- }
-
ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
@@ -1619,6 +1622,8 @@ retry_private:
case 0:
break;
case -EFAULT:
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1634,6 +1639,8 @@ retry_private:
* exit to complete.
* - The user space value changed.
*/
+ free_pi_state(pi_state);
+ pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1710,6 +1717,7 @@ retry_private:
}
out_unlock:
+ free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -1727,8 +1735,6 @@ out_put_keys:
out_put_key1:
put_futex_key(&key1);
out:
- if (pi_state != NULL)
- free_pi_state(pi_state);
return ret ? ret : task_count;
}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index cf66c5c8458e..3b7408759bdf 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index cf80e7b0ddab..61024e8abdef 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.disable);
+ irq_reg_writel(gc, mask, ct->regs.disable);
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
@@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache |= mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit);
@@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d)
irq_gc_lock(gc);
*ct->mask_cache &= ~mask;
- irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask);
+ irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit);
@@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.enable);
+ irq_reg_writel(gc, mask, ct->regs.enable);
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
@@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit);
@@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d)
u32 mask = ~d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.mask);
- irq_reg_writel(mask, gc->reg_base + ct->regs.ack);
+ irq_reg_writel(gc, mask, ct->regs.mask);
+ irq_reg_writel(gc, mask, ct->regs.ack);
irq_gc_unlock(gc);
}
@@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d)
u32 mask = d->mask;
irq_gc_lock(gc);
- irq_reg_writel(mask, gc->reg_base + ct->regs.eoi);
+ irq_reg_writel(gc, mask, ct->regs.eoi);
irq_gc_unlock(gc);
}
@@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on)
return 0;
}
+static u32 irq_readl_be(void __iomem *addr)
+{
+ return ioread32be(addr);
+}
+
+static void irq_writel_be(u32 val, void __iomem *addr)
+{
+ iowrite32be(val, addr);
+}
+
static void
irq_init_generic_chip(struct irq_chip_generic *gc, const char *name,
int num_ct, unsigned int irq_base,
@@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags)
}
ct[i].mask_cache = mskptr;
if (flags & IRQ_GC_INIT_MASK_CACHE)
- *mskptr = irq_reg_readl(gc->reg_base + mskreg);
+ *mskptr = irq_reg_readl(gc, mskreg);
}
}
@@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip,
dgc->gc[i] = gc = tmp;
irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip,
NULL, handler);
+
gc->domain = d;
+ if (gcflags & IRQ_GC_BE_IO) {
+ gc->reg_readl = &irq_readl_be;
+ gc->reg_writel = &irq_writel_be;
+ }
+
raw_spin_lock_irqsave(&gc_lock, flags);
list_add_tail(&gc->list, &gc_list);
raw_spin_unlock_irqrestore(&gc_lock, flags);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..80f7a6d00519 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...)
EXPORT_SYMBOL(__request_module);
#endif /* CONFIG_MODULES */
+static void call_usermodehelper_freeinfo(struct subprocess_info *info)
+{
+ if (info->cleanup)
+ (*info->cleanup)(info);
+ kfree(info);
+}
+
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away
+ * or the caller used UMH_NO_WAIT.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
/*
* This is the task which runs the usermode application
*/
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ int wait = sub_info->wait & ~UMH_KILLABLE;
struct cred *new;
int retval;
@@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data)
retval = -ENOMEM;
new = prepare_kernel_cred(current);
if (!new)
- goto fail;
+ goto out;
spin_lock(&umh_sysctl_lock);
new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data)
retval = sub_info->init(sub_info, new);
if (retval) {
abort_creds(new);
- goto fail;
+ goto out;
}
}
@@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data)
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
+out:
+ sub_info->retval = retval;
+ /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */
+ if (wait != UMH_WAIT_PROC)
+ umh_complete(sub_info);
if (!retval)
return 0;
-
- /* Exec failed? */
-fail:
- sub_info->retval = retval;
do_exit(0);
}
@@ -258,26 +281,6 @@ static int call_helper(void *data)
return ____call_usermodehelper(data);
}
-static void call_usermodehelper_freeinfo(struct subprocess_info *info)
-{
- if (info->cleanup)
- (*info->cleanup)(info);
- kfree(info);
-}
-
-static void umh_complete(struct subprocess_info *sub_info)
-{
- struct completion *comp = xchg(&sub_info->complete, NULL);
- /*
- * See call_usermodehelper_exec(). If xchg() returns NULL
- * we own sub_info, the UMH_KILLABLE caller has gone away.
- */
- if (comp)
- complete(comp);
- else
- call_usermodehelper_freeinfo(sub_info);
-}
-
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work)
kmod_thread_locker = NULL;
}
- switch (wait) {
- case UMH_NO_WAIT:
- call_usermodehelper_freeinfo(sub_info);
- break;
-
- case UMH_WAIT_PROC:
- if (pid > 0)
- break;
- /* FALLTHROUGH */
- case UMH_WAIT_EXEC:
- if (pid < 0)
- sub_info->retval = pid;
+ if (pid < 0) {
+ sub_info->retval = pid;
umh_complete(sub_info);
}
}
@@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
goto out;
}
- sub_info->complete = &done;
+ /*
+ * Set the completion pointer only if there is a waiter.
+ * This makes it possible to use umh_complete to free
+ * the data structure in case of UMH_NO_WAIT.
+ */
+ sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
sub_info->wait = wait;
queue_work(khelper_wq, &sub_info->work);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index dadbf88c22c4..454195194d4a 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -378,8 +378,14 @@ done:
* reschedule now, before we try-lock the mutex. This avoids getting
* scheduled out right after we obtained the mutex.
*/
- if (need_resched())
+ if (need_resched()) {
+ /*
+ * We _should_ have TASK_RUNNING here, but just in case
+ * we do not, make it so, otherwise we might get stuck.
+ */
+ __set_current_state(TASK_RUNNING);
schedule_preempt_disabled();
+ }
return false;
}
diff --git a/kernel/module.c b/kernel/module.c
index 88cec1ddb1e3..e52a8739361a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3097,6 +3097,32 @@ static int may_init_module(void)
}
/*
+ * Can't use wait_event_interruptible() because our condition
+ * 'finished_loading()' contains a blocking primitive itself (mutex_lock).
+ */
+static int wait_finished_loading(struct module *mod)
+{
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ int ret = 0;
+
+ add_wait_queue(&module_wq, &wait);
+ for (;;) {
+ if (finished_loading(mod->name))
+ break;
+
+ if (signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ break;
+ }
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+ }
+ remove_wait_queue(&module_wq, &wait);
+
+ return ret;
+}
+
+/*
* We try to place it in the list now to make sure it's unique before
* we dedicate too many resources. In particular, temporary percpu
* memory exhaustion.
@@ -3116,8 +3142,8 @@ again:
|| old->state == MODULE_STATE_UNFORMED) {
/* Wait in case it fails to load. */
mutex_unlock(&module_mutex);
- err = wait_event_interruptible(module_wq,
- finished_loading(mod->name));
+
+ err = wait_finished_loading(mod);
if (err)
goto out_unlocked;
goto again;
diff --git a/kernel/panic.c b/kernel/panic.c
index d09dc5c32c67..cf80672b7924 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -244,6 +244,7 @@ static const struct tnt tnts[] = {
* 'I' - Working around severe firmware bug.
* 'O' - Out-of-tree module has been loaded.
* 'E' - Unsigned module has been loaded.
+ * 'L' - A soft lockup has previously occurred.
*
* The string is overwritten by the next call to print_tainted().
*/
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..1f35a3478f3c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode)
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
- dpm_resume_end(PMSG_RECOVER);
+ /*
+ * The above should either succeed and jump to the new kernel,
+ * or return with an error. Otherwise things are just
+ * undefined, so let's be paranoid.
+ */
+ BUG_ON(!error);
}
+ dpm_resume_end(PMSG_RECOVER);
pm_restore_gfp_mask();
resume_console();
pm_restore_console();
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7b323221b9ee..5a6ec8678b9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
while (true) {
todo = 0;
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p == current || !freeze_task(p))
continue;
if (!freezer_should_skip(p))
todo++;
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
if (!user_only) {
@@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only)
if (!wakeup) {
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
&& freezing(p) && !frozen(p))
sched_show_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
}
} else {
@@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only)
return todo ? -EBUSY : 0;
}
+static bool __check_frozen_processes(void)
+{
+ struct task_struct *g, *p;
+
+ for_each_process_thread(g, p)
+ if (p != current && !freezer_should_skip(p) && !frozen(p))
+ return false;
+
+ return true;
+}
+
+/*
+ * Returns true if all freezable tasks (except for current) are frozen already
+ */
+static bool check_frozen_processes(void)
+{
+ bool ret;
+
+ read_lock(&tasklist_lock);
+ ret = __check_frozen_processes();
+ read_unlock(&tasklist_lock);
+ return ret;
+}
+
/**
* freeze_processes - Signal user space processes to enter the refrigerator.
* The current thread will not be frozen. The same process that calls
@@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only)
int freeze_processes(void)
{
int error;
+ int oom_kills_saved;
error = __usermodehelper_disable(UMH_FREEZING);
if (error)
@@ -132,11 +157,25 @@ int freeze_processes(void)
pm_wakeup_clear();
printk("Freezing user space processes ... ");
pm_freezing = true;
+ oom_kills_saved = oom_kills_count();
error = try_to_freeze_tasks(true);
if (!error) {
- printk("done.");
__usermodehelper_set_disable_depth(UMH_DISABLED);
oom_killer_disable();
+
+ /*
+ * There might have been an OOM kill while we were
+ * freezing tasks and the killed task might be still
+ * on the way out so we have to double check for race.
+ */
+ if (oom_kills_count() != oom_kills_saved &&
+ !check_frozen_processes()) {
+ __usermodehelper_set_disable_depth(UMH_ENABLED);
+ printk("OOM in progress.");
+ error = -EBUSY;
+ } else {
+ printk("done.");
+ }
}
printk("\n");
BUG_ON(in_atomic());
@@ -191,11 +230,11 @@ void thaw_processes(void)
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -218,10 +257,10 @@ void thaw_kernel_threads(void)
thaw_workqueues();
read_lock(&tasklist_lock);
- do_each_thread(g, p) {
+ for_each_process_thread(g, p) {
if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
__thaw_task(p);
- } while_each_thread(g, p);
+ }
read_unlock(&tasklist_lock);
schedule();
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..5f4c006c4b1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
};
+static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
+static struct pm_qos_constraints memory_bw_constraints = {
+ .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
+ .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
+ .type = PM_QOS_SUM,
+ .notifiers = &memory_bandwidth_notifier,
+};
+static struct pm_qos_object memory_bandwidth_pm_qos = {
+ .constraints = &memory_bw_constraints,
+ .name = "memory_bandwidth",
+};
+
+
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
- &network_throughput_pm_qos
+ &network_throughput_pm_qos,
+ &memory_bandwidth_pm_qos,
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = {
/* unlocked internal variant */
static inline int pm_qos_get_value(struct pm_qos_constraints *c)
{
+ struct plist_node *node;
+ int total_value = 0;
+
if (plist_head_empty(&c->list))
return c->no_constraint_value;
@@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
case PM_QOS_MAX:
return plist_last(&c->list)->prio;
+ case PM_QOS_SUM:
+ plist_for_each(node, &c->list)
+ total_value += node->prio;
+
+ return total_value;
+
default:
/* runtime check for not using enum */
BUG();
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 4ca9a33ff620..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,7 +146,7 @@ static int platform_suspend_prepare(suspend_state_t state)
static int platform_suspend_prepare_late(suspend_state_t state)
{
- return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ?
+ return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
freeze_ops->prepare() : 0;
}
@@ -164,7 +164,7 @@ static void platform_resume_noirq(suspend_state_t state)
static void platform_resume_early(suspend_state_t state)
{
- if (state == PM_SUSPEND_FREEZE && freeze_ops->restore)
+ if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
freeze_ops->restore();
}
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
index 807ccfbf69b3..e6fae503d1bc 100644
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,6 +1,6 @@
obj-y += update.o srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_TREE_RCU) += tree.o
-obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
+obj-$(CONFIG_PREEMPT_RCU) += tree.o
obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index ff1a6de62f17..07bb02eda844 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -135,4 +135,6 @@ int rcu_jiffies_till_stall_check(void);
*/
#define TPS(x) tracepoint_string(x)
+void rcu_early_boot_tests(void);
+
#endif /* __LINUX_RCU_H */
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 240fa9094f83..4d559baf06e0 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -812,6 +812,7 @@ rcu_torture_cbflood(void *arg)
cur_ops->cb_barrier();
stutter_wait("rcu_torture_cbflood");
} while (!torture_must_stop());
+ vfree(rhp);
torture_kthread_stopping("rcu_torture_cbflood");
return 0;
}
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c0623fc47125..0db5649f8817 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -247,7 +247,7 @@ void rcu_bh_qs(void)
* be called from hardirq context. It is normally called from the
* scheduling-clock interrupt.
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
RCU_TRACE(check_cpu_stalls());
if (user || rcu_is_cpu_rrupt_from_idle())
@@ -380,7 +380,9 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
}
EXPORT_SYMBOL_GPL(call_rcu_bh);
-void rcu_init(void)
+void __init rcu_init(void)
{
open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
+
+ rcu_early_boot_tests();
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 133e47223095..7680fc275036 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -105,7 +105,7 @@ struct rcu_state sname##_state = { \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
}; \
-DEFINE_PER_CPU(struct rcu_data, sname##_data)
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -152,19 +152,6 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
*/
static int rcu_scheduler_fully_active __read_mostly;
-#ifdef CONFIG_RCU_BOOST
-
-/*
- * Control variables for per-CPU and per-rcu_node kthreads. These
- * handle all flavors of RCU.
- */
-static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
-DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
-DEFINE_PER_CPU(char, rcu_cpu_has_work);
-
-#endif /* #ifdef CONFIG_RCU_BOOST */
-
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -286,11 +273,11 @@ static void rcu_momentary_dyntick_idle(void)
* and requires special handling for preemptible RCU.
* The caller must have disabled preemption.
*/
-void rcu_note_context_switch(int cpu)
+void rcu_note_context_switch(void)
{
trace_rcu_utilization(TPS("Start context switch"));
rcu_sched_qs();
- rcu_preempt_note_context_switch(cpu);
+ rcu_preempt_note_context_switch();
if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
rcu_momentary_dyntick_idle();
trace_rcu_utilization(TPS("End context switch"));
@@ -325,7 +312,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
unsigned long *maxj),
bool *isidle, unsigned long *maxj);
static void force_quiescent_state(struct rcu_state *rsp);
-static int rcu_pending(int cpu);
+static int rcu_pending(void);
/*
* Return the number of RCU-sched batches processed thus far for debug & stats.
@@ -510,11 +497,11 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
* we really have entered idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
- bool user)
+static void rcu_eqs_enter_common(long long oldval, bool user)
{
struct rcu_state *rsp;
struct rcu_data *rdp;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
@@ -531,7 +518,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
rdp = this_cpu_ptr(rsp->rda);
do_nocb_deferred_wakeup(rdp);
}
- rcu_prepare_for_idle(smp_processor_id());
+ rcu_prepare_for_idle();
/* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
smp_mb__before_atomic(); /* See above. */
atomic_inc(&rdtp->dynticks);
@@ -565,7 +552,7 @@ static void rcu_eqs_enter(bool user)
WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) {
rdtp->dynticks_nesting = 0;
- rcu_eqs_enter_common(rdtp, oldval, user);
+ rcu_eqs_enter_common(oldval, user);
} else {
rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE;
}
@@ -589,7 +576,7 @@ void rcu_idle_enter(void)
local_irq_save(flags);
rcu_eqs_enter(false);
- rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_enter(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -639,8 +626,8 @@ void rcu_irq_exit(void)
if (rdtp->dynticks_nesting)
trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_enter_common(rdtp, oldval, true);
- rcu_sysidle_enter(rdtp, 1);
+ rcu_eqs_enter_common(oldval, true);
+ rcu_sysidle_enter(1);
local_irq_restore(flags);
}
@@ -651,16 +638,17 @@ void rcu_irq_exit(void)
* we really have exited idle, and must do the appropriate accounting.
* The caller must have disabled interrupts.
*/
-static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
- int user)
+static void rcu_eqs_exit_common(long long oldval, int user)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
rcu_dynticks_task_exit();
smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
atomic_inc(&rdtp->dynticks);
/* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
smp_mb__after_atomic(); /* See above. */
WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
- rcu_cleanup_after_idle(smp_processor_id());
+ rcu_cleanup_after_idle();
trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
if (!user && !is_idle_task(current)) {
struct task_struct *idle __maybe_unused =
@@ -691,7 +679,7 @@ static void rcu_eqs_exit(bool user)
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
- rcu_eqs_exit_common(rdtp, oldval, user);
+ rcu_eqs_exit_common(oldval, user);
}
}
@@ -712,7 +700,7 @@ void rcu_idle_exit(void)
local_irq_save(flags);
rcu_eqs_exit(false);
- rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
+ rcu_sysidle_exit(0);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -763,8 +751,8 @@ void rcu_irq_enter(void)
if (oldval)
trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting);
else
- rcu_eqs_exit_common(rdtp, oldval, true);
- rcu_sysidle_exit(rdtp, 1);
+ rcu_eqs_exit_common(oldval, true);
+ rcu_sysidle_exit(1);
local_irq_restore(flags);
}
@@ -2387,7 +2375,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
* invoked from the scheduling-clock interrupt. If rcu_pending returns
* false, there is no point in invoking rcu_check_callbacks().
*/
-void rcu_check_callbacks(int cpu, int user)
+void rcu_check_callbacks(int user)
{
trace_rcu_utilization(TPS("Start scheduler-tick"));
increment_cpu_stall_ticks();
@@ -2419,8 +2407,8 @@ void rcu_check_callbacks(int cpu, int user)
rcu_bh_qs();
}
- rcu_preempt_check_callbacks(cpu);
- if (rcu_pending(cpu))
+ rcu_preempt_check_callbacks();
+ if (rcu_pending())
invoke_rcu_core();
if (user)
rcu_note_voluntary_context_switch(current);
@@ -2963,6 +2951,9 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
*/
void synchronize_sched_expedited(void)
{
+ cpumask_var_t cm;
+ bool cma = false;
+ int cpu;
long firstsnap, s, snap;
int trycount = 0;
struct rcu_state *rsp = &rcu_sched_state;
@@ -2997,11 +2988,26 @@ void synchronize_sched_expedited(void)
}
WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
+ /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */
+ cma = zalloc_cpumask_var(&cm, GFP_KERNEL);
+ if (cma) {
+ cpumask_copy(cm, cpu_online_mask);
+ cpumask_clear_cpu(raw_smp_processor_id(), cm);
+ for_each_cpu(cpu, cm) {
+ struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+
+ if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
+ cpumask_clear_cpu(cpu, cm);
+ }
+ if (cpumask_weight(cm) == 0)
+ goto all_cpus_idle;
+ }
+
/*
* Each pass through the following loop attempts to force a
* context switch on each CPU.
*/
- while (try_stop_cpus(cpu_online_mask,
+ while (try_stop_cpus(cma ? cm : cpu_online_mask,
synchronize_sched_expedited_cpu_stop,
NULL) == -EAGAIN) {
put_online_cpus();
@@ -3013,6 +3019,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone1);
+ free_cpumask_var(cm);
return;
}
@@ -3022,6 +3029,7 @@ void synchronize_sched_expedited(void)
} else {
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
@@ -3031,6 +3039,7 @@ void synchronize_sched_expedited(void)
/* ensure test happens before caller kfree */
smp_mb__before_atomic(); /* ^^^ */
atomic_long_inc(&rsp->expedited_workdone2);
+ free_cpumask_var(cm);
return;
}
@@ -3045,6 +3054,7 @@ void synchronize_sched_expedited(void)
/* CPU hotplug operation in flight, use normal GP. */
wait_rcu_gp(call_rcu_sched);
atomic_long_inc(&rsp->expedited_normal);
+ free_cpumask_var(cm);
return;
}
snap = atomic_long_read(&rsp->expedited_start);
@@ -3052,6 +3062,9 @@ void synchronize_sched_expedited(void)
}
atomic_long_inc(&rsp->expedited_stoppedcpus);
+all_cpus_idle:
+ free_cpumask_var(cm);
+
/*
* Everyone up to our most recent fetch is covered by our grace
* period. Update the counter, but only if our work is still
@@ -3143,12 +3156,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
-static int rcu_pending(int cpu)
+static int rcu_pending(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
- if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+ if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
return 1;
return 0;
}
@@ -3158,7 +3171,7 @@ static int rcu_pending(int cpu)
* non-NULL, store an indication of whether all callbacks are lazy.
* (If there are no callbacks, all of them are deemed to be lazy.)
*/
-static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
+static int __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy)
{
bool al = true;
bool hc = false;
@@ -3166,7 +3179,7 @@ static int __maybe_unused rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
struct rcu_state *rsp;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!rdp->nxtlist)
continue;
hc = true;
@@ -3299,11 +3312,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
continue;
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
- rsp->n_barrier_done);
- atomic_inc(&rsp->barrier_cpu_count);
- __call_rcu(&rdp->barrier_head, rcu_barrier_callback,
- rsp, cpu, 0);
+ if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
+ _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ rsp->n_barrier_done);
+ } else {
+ _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ rsp->n_barrier_done);
+ atomic_inc(&rsp->barrier_cpu_count);
+ __call_rcu(&rdp->barrier_head,
+ rcu_barrier_callback, rsp, cpu, 0);
+ }
} else if (ACCESS_ONCE(rdp->qlen)) {
_rcu_barrier_trace(rsp, "OnlineQ", cpu,
rsp->n_barrier_done);
@@ -3480,8 +3498,10 @@ static int rcu_cpu_notify(struct notifier_block *self,
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
- for_each_rcu_flavor(rsp)
+ for_each_rcu_flavor(rsp) {
rcu_cleanup_dead_cpu(cpu, rsp);
+ do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu));
+ }
break;
default:
break;
@@ -3761,6 +3781,8 @@ void __init rcu_init(void)
pm_notifier(rcu_pm_notify, 0);
for_each_online_cpu(cpu)
rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
+
+ rcu_early_boot_tests();
}
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index d03764652d91..8e7b1843896e 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -139,7 +139,7 @@ struct rcu_node {
unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
- /* complete (only for TREE_PREEMPT_RCU). */
+ /* complete (only for PREEMPT_RCU). */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -530,10 +530,10 @@ DECLARE_PER_CPU(struct rcu_data, rcu_sched_data);
extern struct rcu_state rcu_bh_state;
DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
extern struct rcu_state rcu_preempt_state;
DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
@@ -547,7 +547,7 @@ DECLARE_PER_CPU(char, rcu_cpu_has_work);
/* Forward declarations for rcutree_plugin.h */
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
+static void rcu_preempt_note_context_switch(void);
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
@@ -561,12 +561,12 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_check_callbacks(void);
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
+#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU)
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
bool wake);
-#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
+#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PREEMPT_RCU) */
static void __init __rcu_init_preempt(void);
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
@@ -579,14 +579,15 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
#endif /* #ifdef CONFIG_RCU_BOOST */
static void __init rcu_spawn_boost_kthreads(void);
static void rcu_prepare_kthreads(int cpu);
-static void rcu_cleanup_after_idle(int cpu);
-static void rcu_prepare_for_idle(int cpu);
+static void rcu_cleanup_after_idle(void);
+static void rcu_prepare_for_idle(void);
static void rcu_idle_count_callbacks_posted(void);
static void print_cpu_stall_info_begin(void);
static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
static void print_cpu_stall_info_end(void);
static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -605,8 +606,8 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
static bool init_nocb_callback_list(struct rcu_data *rdp);
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq);
+static void rcu_sysidle_enter(int irq);
+static void rcu_sysidle_exit(int irq);
static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
unsigned long *maxj);
static bool is_sysidle_rcu_state(struct rcu_state *rsp);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 387dd4599344..3ec85cb5d544 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -30,14 +30,24 @@
#include <linux/smpboot.h>
#include "../time/tick-internal.h"
-#define RCU_KTHREAD_PRIO 1
-
#ifdef CONFIG_RCU_BOOST
+
#include "../locking/rtmutex_common.h"
-#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
-#else
-#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
-#endif
+
+/* rcuc/rcub kthread realtime priority */
+static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
+module_param(kthread_prio, int, 0644);
+
+/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+
+#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
@@ -72,9 +82,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
pr_info("\tRCU torture testing starts during boot.\n");
#endif
-#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
- pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n");
-#endif
#if defined(CONFIG_RCU_CPU_STALL_INFO)
pr_info("\tAdditional per-CPU info printed with stalls.\n");
#endif
@@ -85,9 +92,12 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
if (nr_cpu_ids != NR_CPUS)
pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
+#ifdef CONFIG_RCU_BOOST
+ pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
+#endif
}
-#ifdef CONFIG_TREE_PREEMPT_RCU
+#ifdef CONFIG_PREEMPT_RCU
RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;
@@ -156,7 +166,7 @@ static void rcu_preempt_qs(void)
*
* Caller must disable preemption.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
struct task_struct *t = current;
unsigned long flags;
@@ -167,7 +177,7 @@ static void rcu_preempt_note_context_switch(int cpu)
!t->rcu_read_unlock_special.b.blocked) {
/* Possibly blocking in an RCU read-side critical section. */
- rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
+ rdp = this_cpu_ptr(rcu_preempt_state.rda);
rnp = rdp->mynode;
raw_spin_lock_irqsave(&rnp->lock, flags);
smp_mb__after_unlock_lock();
@@ -415,8 +425,6 @@ void rcu_read_unlock_special(struct task_struct *t)
}
}
-#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
-
/*
* Dump detailed information for all tasks blocking the current RCU
* grace period on the specified rcu_node structure.
@@ -451,14 +459,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
rcu_print_detail_task_stall_rnp(rnp);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
-static void rcu_print_detail_task_stall(struct rcu_state *rsp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
-
#ifdef CONFIG_RCU_CPU_STALL_INFO
static void rcu_print_task_stall_begin(struct rcu_node *rnp)
@@ -621,7 +621,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
*
* Caller must disable hard irqs.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
struct task_struct *t = current;
@@ -630,8 +630,8 @@ static void rcu_preempt_check_callbacks(int cpu)
return;
}
if (t->rcu_read_lock_nesting > 0 &&
- per_cpu(rcu_preempt_data, cpu).qs_pending &&
- !per_cpu(rcu_preempt_data, cpu).passed_quiesce)
+ __this_cpu_read(rcu_preempt_data.qs_pending) &&
+ !__this_cpu_read(rcu_preempt_data.passed_quiesce))
t->rcu_read_unlock_special.b.need_qs = true;
}
@@ -919,7 +919,7 @@ void exit_rcu(void)
__rcu_read_unlock();
}
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *rcu_state_p = &rcu_sched_state;
@@ -945,7 +945,7 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
* Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
-static void rcu_preempt_note_context_switch(int cpu)
+static void rcu_preempt_note_context_switch(void)
{
}
@@ -1017,7 +1017,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
* Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
-static void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(void)
{
}
@@ -1070,7 +1070,7 @@ void exit_rcu(void)
{
}
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
@@ -1326,7 +1326,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
smp_mb__after_unlock_lock();
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
- sp.sched_priority = RCU_BOOST_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
return 0;
@@ -1343,7 +1343,7 @@ static void rcu_cpu_kthread_setup(unsigned int cpu)
{
struct sched_param sp;
- sp.sched_priority = RCU_KTHREAD_PRIO;
+ sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}
@@ -1512,10 +1512,10 @@ static void rcu_prepare_kthreads(int cpu)
* any flavor of RCU.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
+int rcu_needs_cpu(unsigned long *delta_jiffies)
{
*delta_jiffies = ULONG_MAX;
- return rcu_cpu_has_callbacks(cpu, NULL);
+ return rcu_cpu_has_callbacks(NULL);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1523,7 +1523,7 @@ int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies)
* Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
* after it.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
}
@@ -1531,7 +1531,7 @@ static void rcu_cleanup_after_idle(int cpu)
* Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
* is nothing.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
}
@@ -1624,15 +1624,15 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
* The caller must have disabled interrupts.
*/
#ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(int cpu, unsigned long *dj)
+int rcu_needs_cpu(unsigned long *dj)
{
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* Snapshot to detect later posting of non-lazy callback. */
rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
/* If no callbacks, RCU doesn't need the CPU. */
- if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) {
+ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
*dj = ULONG_MAX;
return 0;
}
@@ -1666,12 +1666,12 @@ int rcu_needs_cpu(int cpu, unsigned long *dj)
*
* The caller must have disabled interrupts.
*/
-static void rcu_prepare_for_idle(int cpu)
+static void rcu_prepare_for_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
bool needwake;
struct rcu_data *rdp;
- struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
struct rcu_node *rnp;
struct rcu_state *rsp;
int tne;
@@ -1679,7 +1679,7 @@ static void rcu_prepare_for_idle(int cpu)
/* Handle nohz enablement switches conservatively. */
tne = ACCESS_ONCE(tick_nohz_active);
if (tne != rdtp->tick_nohz_enabled_snap) {
- if (rcu_cpu_has_callbacks(cpu, NULL))
+ if (rcu_cpu_has_callbacks(NULL))
invoke_rcu_core(); /* force nohz to see update. */
rdtp->tick_nohz_enabled_snap = tne;
return;
@@ -1688,7 +1688,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
/* If this is a no-CBs CPU, no callbacks, just return. */
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
/*
@@ -1712,7 +1712,7 @@ static void rcu_prepare_for_idle(int cpu)
return;
rdtp->last_accelerate = jiffies;
for_each_rcu_flavor(rsp) {
- rdp = per_cpu_ptr(rsp->rda, cpu);
+ rdp = this_cpu_ptr(rsp->rda);
if (!*rdp->nxttail[RCU_DONE_TAIL])
continue;
rnp = rdp->mynode;
@@ -1731,10 +1731,10 @@ static void rcu_prepare_for_idle(int cpu)
* any grace periods that elapsed while the CPU was idle, and if any
* callbacks are now ready to invoke, initiate invocation.
*/
-static void rcu_cleanup_after_idle(int cpu)
+static void rcu_cleanup_after_idle(void)
{
#ifndef CONFIG_RCU_NOCB_CPU_ALL
- if (rcu_is_nocb_cpu(cpu))
+ if (rcu_is_nocb_cpu(smp_processor_id()))
return;
if (rcu_try_advance_all_cbs())
invoke_rcu_core();
@@ -2050,6 +2050,33 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
}
/*
+ * Does the specified CPU need an RCU callback for the specified flavor
+ * of rcu_barrier()?
+ */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_head *rhp;
+
+ /* No-CBs CPUs might have callbacks on any of three lists. */
+ rhp = ACCESS_ONCE(rdp->nocb_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_gp_head);
+ if (!rhp)
+ rhp = ACCESS_ONCE(rdp->nocb_follower_head);
+
+ /* Having no rcuo kthread but CBs after scheduler starts is bad! */
+ if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
+ /* RCU callback enqueued before CPU first came online??? */
+ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
+ cpu, rhp->func);
+ WARN_ON_ONCE(1);
+ }
+
+ return !!rhp;
+}
+
+/*
* Enqueue the specified string of rcu_head structures onto the specified
* CPU's no-CBs lists. The CPU is specified by rdp, the head of the
* string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2546,9 +2573,13 @@ static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
rdp->nocb_leader = rdp_spawn;
if (rdp_last && rdp != rdp_spawn)
rdp_last->nocb_next_follower = rdp;
- rdp_last = rdp;
- rdp = rdp->nocb_next_follower;
- rdp_last->nocb_next_follower = NULL;
+ if (rdp == rdp_spawn) {
+ rdp = rdp->nocb_next_follower;
+ } else {
+ rdp_last = rdp;
+ rdp = rdp->nocb_next_follower;
+ rdp_last->nocb_next_follower = NULL;
+ }
} while (rdp);
rdp_spawn->nocb_next_follower = rdp_old_leader;
}
@@ -2642,6 +2673,12 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
+{
+ WARN_ON_ONCE(1); /* Should be dead code. */
+ return false;
+}
+
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
{
}
@@ -2728,9 +2765,10 @@ static int full_sysidle_state; /* Current system-idle state. */
* to detect full-system idle states, not RCU quiescent states and grace
* periods. The caller must have disabled interrupts.
*/
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
unsigned long j;
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
@@ -2799,8 +2837,10 @@ void rcu_sysidle_force_exit(void)
* usermode execution does -not- count as idle here! The caller must
* have disabled interrupts.
*/
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
+ struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+
/* If there are no nohz_full= CPUs, no need to track this. */
if (!tick_nohz_full_enabled())
return;
@@ -3094,11 +3134,11 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_enter(int irq)
{
}
-static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
+static void rcu_sysidle_exit(int irq)
{
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 3ef8ba58694e..e0d31a345ee6 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -306,7 +306,7 @@ struct debug_obj_descr rcuhead_debug_descr = {
EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
-#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
+#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp,
unsigned long secs,
unsigned long c_old, unsigned long c)
@@ -531,7 +531,8 @@ static int __noreturn rcu_tasks_kthread(void *arg)
struct rcu_head *next;
LIST_HEAD(rcu_tasks_holdouts);
- /* FIXME: Add housekeeping affinity. */
+ /* Run on housekeeping CPUs by default. Sysadm can move if desired. */
+ housekeeping_affine(current);
/*
* Each pass through the following loop makes one check for
@@ -690,3 +691,87 @@ static void rcu_spawn_tasks_kthread(void)
}
#endif /* #ifdef CONFIG_TASKS_RCU */
+
+#ifdef CONFIG_PROVE_RCU
+
+/*
+ * Early boot self test parameters, one for each flavor
+ */
+static bool rcu_self_test;
+static bool rcu_self_test_bh;
+static bool rcu_self_test_sched;
+
+module_param(rcu_self_test, bool, 0444);
+module_param(rcu_self_test_bh, bool, 0444);
+module_param(rcu_self_test_sched, bool, 0444);
+
+static int rcu_self_test_counter;
+
+static void test_callback(struct rcu_head *r)
+{
+ rcu_self_test_counter++;
+ pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
+}
+
+static void early_boot_test_call_rcu(void)
+{
+ static struct rcu_head head;
+
+ call_rcu(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_bh(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_bh(&head, test_callback);
+}
+
+static void early_boot_test_call_rcu_sched(void)
+{
+ static struct rcu_head head;
+
+ call_rcu_sched(&head, test_callback);
+}
+
+void rcu_early_boot_tests(void)
+{
+ pr_info("Running RCU self tests\n");
+
+ if (rcu_self_test)
+ early_boot_test_call_rcu();
+ if (rcu_self_test_bh)
+ early_boot_test_call_rcu_bh();
+ if (rcu_self_test_sched)
+ early_boot_test_call_rcu_sched();
+}
+
+static int rcu_verify_early_boot_tests(void)
+{
+ int ret = 0;
+ int early_boot_test_counter = 0;
+
+ if (rcu_self_test) {
+ early_boot_test_counter++;
+ rcu_barrier();
+ }
+ if (rcu_self_test_bh) {
+ early_boot_test_counter++;
+ rcu_barrier_bh();
+ }
+ if (rcu_self_test_sched) {
+ early_boot_test_counter++;
+ rcu_barrier_sched();
+ }
+
+ if (rcu_self_test_counter != early_boot_test_counter) {
+ WARN_ON(1);
+ ret = -1;
+ }
+
+ return ret;
+}
+late_initcall(rcu_verify_early_boot_tests);
+#else
+void rcu_early_boot_tests(void) {}
+#endif /* CONFIG_PROVE_RCU */
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
+ * for IO (which traditionally means blkio only).
*/
void __sched wait_for_completion_io(struct completion *x)
{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possble rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !p->on_rq);
#ifdef CONFIG_LOCKDEP
/*
@@ -1407,7 +1411,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!is_idle_task(rq->curr))
- return;
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
/* Else cpu is not in idle, do nothing here */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+
+out:
+ rcu_read_unlock();
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
+ p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
- INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
}
#endif
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* We must be sure that accepting a new task (or allowing changing the
* parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
/**
* finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
+ *
+ * The context switch have flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
*/
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
+ struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
tick_nohz_task_switch(current);
+ return rq;
}
#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ struct rq *rq;
- /*
- * FIXME: do we need to worry about rq being invalidated by the
- * task_switch?
- */
+ /* finish_task_switch() drops rq->lock and enables preemtion */
+ preempt_disable();
+ rq = finish_task_switch(prev);
post_schedule(rq);
+ preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
*/
-static inline void
+static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
-
barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+
+ return finish_task_switch(prev);
}
/*
@@ -2475,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- * thread, breaking clock_gettime().
- */
- if (task_current(rq, p) && task_on_rq_queued(p)) {
- update_rq_clock(rq);
- ns = rq_clock_task(rq) - p->se.exec_start;
- if ((s64)ns < 0)
- ns = 0;
- }
-
- return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns = 0;
-
- rq = task_rq_lock(p, &flags);
- ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2521,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 ns = 0;
+ u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
@@ -2540,7 +2489,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#endif
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);
return ns;
@@ -2802,7 +2760,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
prev = rq->curr;
schedule_debug(prev);
@@ -2855,15 +2813,8 @@ need_resched:
rq->curr = next;
++*switch_count;
- context_switch(rq, prev, next); /* unlocks the rq */
- /*
- * The context switch have flipped the stack from under us
- * and restored the local variables which were saved when
- * this task called schedule() in the past. prev == current
- * is still correct, but it can be moved to another cpu/rq.
- */
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
} else
raw_spin_unlock_irq(&rq->lock);
@@ -2903,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
* or we have been woken up remotely but the IPI has not yet arrived,
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
*/
- user_exit();
+ enum ctx_state prev_state = exception_enter();
schedule();
- user_enter();
+ exception_exit(prev_state);
}
#endif
@@ -2951,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#endif /* CONFIG_PREEMPT */
/*
@@ -4637,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
#ifdef CONFIG_SMP
/*
* move_queued_task - move a queued task to new rq.
@@ -6087,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
@@ -6259,7 +6332,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
}
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
{
int i;
@@ -6274,6 +6347,56 @@ static bool find_numa_distance(int distance)
return false;
}
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
@@ -6327,6 +6450,10 @@ static void sched_init_numa(void)
if (!sched_debug())
break;
}
+
+ if (!level)
+ return;
+
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
@@ -6406,6 +6533,9 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
}
static void sched_domains_numa_masks_set(int cpu)
@@ -7158,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state,
+ * since we will exit with TASK_RUNNING make sure we enter with it,
+ * otherwise we will destroy state.
+ */
+ if (WARN_ONCE(current->state != TASK_RUNNING,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change))
+ __set_current_state(TASK_RUNNING);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7189,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
#endif
dump_stack();
}
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -7403,8 +7552,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgrp_id,
- lockdep_is_held(&tsk->sighand->siglock)),
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU
+ * which is pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
@@ -7833,6 +7986,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -8205,6 +8363,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
}
/*
- * We need to take care of a possible races here. In fact, the
- * task might have changed its scheduling policy to something
- * different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setattr()).
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different than SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all this cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
*/
- if (!dl_task(p) || dl_se->dl_new)
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
@@ -532,7 +540,7 @@ again:
dl_se->dl_yielded = 0;
if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (task_has_dl_policy(rq->curr))
+ if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
@@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -625,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
__dequeue_task_dl(rq, curr, 0);
if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -847,8 +850,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceedes its
+ * runtime while doing so. No point in replenishing
+ * it, as it's going to return back to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -914,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (sd_flag != SD_BALANCE_WAKE)
goto out;
rq = cpu_rq(cpu);
@@ -999,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
hrtick_start(rq, p->dl.runtime);
}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+}
#endif
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1052,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
/* Running task will never be pushed. */
dequeue_pushable_dl_task(rq, p);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
-#endif
set_post_schedule(rq);
@@ -1074,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
start_hrtick_dl(rq, p);
-#endif
}
static void task_fork_dl(struct task_struct *p)
@@ -1314,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ int ret = 0;
if (!rq->dl.overloaded)
return 0;
@@ -1359,7 +1374,6 @@ retry:
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
*/
- dequeue_pushable_dl_task(rq, next_task);
goto out;
}
@@ -1375,6 +1389,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ ret = 1;
resched_curr(later_rq);
@@ -1383,7 +1398,7 @@ retry:
out:
put_task_struct(next_task);
- return 1;
+ return ret;
}
static void push_dl_tasks(struct rq *rq)
@@ -1489,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
- dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
}
@@ -1498,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq;
+ struct root_domain *src_rd;
int weight;
BUG_ON(!dl_task(p));
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
/*
* Update only if the task is actually running (i.e.,
* it is on the rq AND it is not throttled).
@@ -1518,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
- rq = task_rq(p);
-
/*
* The process used to be able to migrate OR it can now migrate
*/
@@ -1567,22 +1603,48 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * A caller of us must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ cancel_dl_timer(rq, p);
__dl_clear_params(p);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
@@ -1603,12 +1665,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
- if (check_resched && task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
}
}
@@ -1678,4 +1745,15 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults_memory)
- nr_faults = p->numa_faults_memory[2*node + i];
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -828,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -867,7 +873,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;
struct rcu_head rcu;
nodemask_t active_nodes;
@@ -895,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -914,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
/*
@@ -930,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
@@ -942,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1083,6 +1174,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
@@ -1162,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end try selecting ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
/*
* "imp" is the fault differential for the source task between the
@@ -1185,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1200,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
@@ -1303,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
@@ -1331,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1408,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1520,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
@@ -1557,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1584,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
@@ -1605,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
@@ -1639,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}
if (max_faults) {
@@ -1682,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1691,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1750,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;
@@ -1776,27 +1996,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}
@@ -1819,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
return;
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;
- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1876,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
@@ -4446,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else {
+ } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -4524,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
if (sd_flag & SD_BALANCE_WAKE)
want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
@@ -5166,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5205,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -6149,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
+ sds->local_stat.group_has_free_capacity) {
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ sgs->group_type = group_classify(sg, sgs);
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -7938,6 +8143,8 @@ const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .update_curr = update_curr_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
return 0;
}
+static void update_curr_idle(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (p->nr_cpus_allowed == 1)
- goto out;
-
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1351,16 +1348,22 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->nr_cpus_allowed == 1)
+ /*
+ * Current can't be migrated, useless to reschedule,
+ * let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
-
/*
* There appears to be other cpus that can accept
* current and none to run 'p', so lets reschedule
@@ -2128,6 +2131,8 @@ const struct sched_class rt_sched_class = {
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
};
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
u64 bw, total_bw;
};
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
extern struct mutex sched_domains_mutex;
#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serliazed by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1135,6 +1177,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
+ void (*update_curr) (struct rq *rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
@@ -1502,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 67426e529f59..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
return 0;
}
+static void update_curr_stop(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/kthread.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
}
EXPORT_SYMBOL(autoremove_wake_function);
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_func);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply write barrier and the users expects write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f0876f9f6dd..19e35135fc60 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1275,7 +1275,17 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
local_irq_restore(*flags);
break;
}
-
+ /*
+ * This sighand can be already freed and even reused, but
+ * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+ * initializes ->siglock: this slab can't go away, it has
+ * the same object type, ->siglock can't be reinitialized.
+ *
+ * We need to ensure that tsk->sighand is still the same
+ * after we take the lock, we can race with de_thread() or
+ * __exit_signal(). In the latter case the next iteration
+ * must see ->sighand == NULL.
+ */
spin_lock(&sighand->siglock);
if (likely(sighand == tsk->sighand)) {
rcu_read_unlock();
@@ -1331,23 +1341,21 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
int error = -ESRCH;
struct task_struct *p;
- rcu_read_lock();
-retry:
- p = pid_task(pid, PIDTYPE_PID);
- if (p) {
- error = group_send_sig_info(sig, info, p);
- if (unlikely(error == -ESRCH))
- /*
- * The task was unhashed in between, try again.
- * If it is dead, pid_task() will return NULL,
- * if we race with de_thread() it will find the
- * new leader.
- */
- goto retry;
- }
- rcu_read_unlock();
+ for (;;) {
+ rcu_read_lock();
+ p = pid_task(pid, PIDTYPE_PID);
+ if (p)
+ error = group_send_sig_info(sig, info, p);
+ rcu_read_unlock();
+ if (likely(!p || error != -ESRCH))
+ return error;
- return error;
+ /*
+ * The task was unhashed in between, try again. If it
+ * is dead, pid_task() will return NULL, if we race with
+ * de_thread() it will find the new leader.
+ */
+ }
}
int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index eb89e1807408..f032fb5284e3 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -110,7 +110,7 @@ static int smpboot_thread_fn(void *data)
set_current_state(TASK_INTERRUPTIBLE);
preempt_disable();
if (kthread_should_stop()) {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->cleanup)
ht->cleanup(td->cpu, cpu_online(td->cpu));
@@ -136,26 +136,27 @@ static int smpboot_thread_fn(void *data)
/* Check for state change setup */
switch (td->status) {
case HP_THREAD_NONE:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->setup)
ht->setup(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
+
case HP_THREAD_PARKED:
+ __set_current_state(TASK_RUNNING);
preempt_enable();
if (ht->unpark)
ht->unpark(td->cpu);
td->status = HP_THREAD_ACTIVE;
- preempt_disable();
- break;
+ continue;
}
if (!ht->thread_should_run(td->cpu)) {
- preempt_enable();
+ preempt_enable_no_resched();
schedule();
} else {
- set_current_state(TASK_RUNNING);
+ __set_current_state(TASK_RUNNING);
preempt_enable();
ht->thread_fn(td->cpu);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 0699add19164..501baa9ac1be 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -656,7 +656,7 @@ static void run_ksoftirqd(unsigned int cpu)
* in the task stack here.
*/
__do_softirq();
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
local_irq_enable();
cond_resched();
return;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4aada6d9fe74..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_numa_balancing_scan_size,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
},
{
.procname = "numa_balancing",
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 7347426fa68d..f622cf28628a 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,7 +13,7 @@ obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o
obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o
obj-$(CONFIG_TIMER_STATS) += timer_stats.o
obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o
-obj-$(CONFIG_TEST_UDELAY) += udelay_test.o
+obj-$(CONFIG_TEST_UDELAY) += test_udelay.o
$(obj)/time.o: $(obj)/timeconst.h
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
* Also omit the add if it would overflow the u64 boundary.
*/
if ((~0ULL - clc > rnd) &&
- (!ismax || evt->mult <= (1U << evt->shift)))
+ (!ismax || evt->mult <= (1ULL << evt->shift)))
clc += rnd;
do_div(clc, evt->mult);
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 492b986195d5..a16b67859e2a 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -553,7 +553,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
*sample = cputime_to_expires(cputime.utime);
break;
case CPUCLOCK_SCHED:
- *sample = cputime.sum_exec_runtime + task_delta_exec(p);
+ *sample = cputime.sum_exec_runtime;
break;
}
return 0;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
goto out;
}
} else {
+ memset(&event.sigev_value, 0, sizeof(event.sigev_value));
event.sigev_notify = SIGEV_SIGNAL;
event.sigev_signo = SIGALRM;
event.sigev_value.sival_int = new_timer->it_id;
diff --git a/kernel/time/udelay_test.c b/kernel/time/test_udelay.c
index e622ba365a13..e622ba365a13 100644
--- a/kernel/time/udelay_test.c
+++ b/kernel/time/test_udelay.c
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7b5741fc4110..1f4356037a7d 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -585,7 +585,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
last_jiffies = jiffies;
} while (read_seqretry(&jiffies_lock, seq));
- if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
+ if (rcu_needs_cpu(&rcu_delta_jiffies) ||
arch_needs_cpu() || irq_work_needs_cpu()) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index a9ae20fb0b11..65015ff2f07c 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -304,7 +304,9 @@ struct timespec timespec_trunc(struct timespec t, unsigned gran)
}
EXPORT_SYMBOL(timespec_trunc);
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+/*
+ * mktime64 - Converts date to seconds.
+ * Converts Gregorian date to seconds since 1970-01-01 00:00:00.
* Assumes input in normal date format, i.e. 1980-12-31 23:59:59
* => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
*
@@ -314,15 +316,10 @@ EXPORT_SYMBOL(timespec_trunc);
* -year/100+year/400 terms, and add 10.]
*
* This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines where long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
*/
-unsigned long
-mktime(const unsigned int year0, const unsigned int mon0,
- const unsigned int day, const unsigned int hour,
- const unsigned int min, const unsigned int sec)
+time64_t mktime64(const unsigned int year0, const unsigned int mon0,
+ const unsigned int day, const unsigned int hour,
+ const unsigned int min, const unsigned int sec)
{
unsigned int mon = mon0, year = year0;
@@ -332,15 +329,14 @@ mktime(const unsigned int year0, const unsigned int mon0,
year -= 1;
}
- return ((((unsigned long)
+ return ((((time64_t)
(year/4 - year/100 + year/400 + 367*mon/12 + day) +
year*365 - 719499
)*24 + hour /* now have hours */
)*60 + min /* now have minutes */
)*60 + sec; /* finally seconds */
}
-
-EXPORT_SYMBOL(mktime);
+EXPORT_SYMBOL(mktime64);
/**
* set_normalized_timespec - set timespec sec and nsec parts and normalize
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index ec1791fae965..2dc0646258ae 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -519,9 +519,9 @@ EXPORT_SYMBOL(__getnstimeofday64);
/**
* getnstimeofday64 - Returns the time of day in a timespec64.
- * @ts: pointer to the timespec to be set
+ * @ts: pointer to the timespec64 to be set
*
- * Returns the time of day in a timespec (WARN if suspended).
+ * Returns the time of day in a timespec64 (WARN if suspended).
*/
void getnstimeofday64(struct timespec64 *ts)
{
@@ -623,7 +623,7 @@ EXPORT_SYMBOL_GPL(ktime_get_raw);
*
* The function calculates the monotonic clock from the realtime
* clock and the wall_to_monotonic offset and stores the result
- * in normalized timespec format in the variable pointed to by @ts.
+ * in normalized timespec64 format in the variable pointed to by @ts.
*/
void ktime_get_ts64(struct timespec64 *ts)
{
@@ -703,18 +703,18 @@ void do_gettimeofday(struct timeval *tv)
EXPORT_SYMBOL(do_gettimeofday);
/**
- * do_settimeofday - Sets the time of day
- * @tv: pointer to the timespec variable containing the new time
+ * do_settimeofday64 - Sets the time of day.
+ * @ts: pointer to the timespec64 variable containing the new time
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(const struct timespec *tv)
+int do_settimeofday64(const struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts_delta, xt, tmp;
+ struct timespec64 ts_delta, xt;
unsigned long flags;
- if (!timespec_valid_strict(tv))
+ if (!timespec64_valid_strict(ts))
return -EINVAL;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -723,13 +723,12 @@ int do_settimeofday(const struct timespec *tv)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = tv->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = tv->tv_nsec - xt.tv_nsec;
+ ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
+ ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
- tmp = timespec_to_timespec64(*tv);
- tk_set_xtime(tk, &tmp);
+ tk_set_xtime(tk, ts);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -741,7 +740,7 @@ int do_settimeofday(const struct timespec *tv)
return 0;
}
-EXPORT_SYMBOL(do_settimeofday);
+EXPORT_SYMBOL(do_settimeofday64);
/**
* timekeeping_inject_offset - Adds or subtracts from the current time.
@@ -895,12 +894,12 @@ int timekeeping_notify(struct clocksource *clock)
}
/**
- * getrawmonotonic - Returns the raw monotonic time in a timespec
- * @ts: pointer to the timespec to be set
+ * getrawmonotonic64 - Returns the raw monotonic time in a timespec
+ * @ts: pointer to the timespec64 to be set
*
* Returns the raw monotonic time (completely un-modified by ntp)
*/
-void getrawmonotonic(struct timespec *ts)
+void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 ts64;
@@ -915,9 +914,10 @@ void getrawmonotonic(struct timespec *ts)
} while (read_seqcount_retry(&tk_core.seq, seq));
timespec64_add_ns(&ts64, nsecs);
- *ts = timespec64_to_timespec(ts64);
+ *ts = ts64;
}
-EXPORT_SYMBOL(getrawmonotonic);
+EXPORT_SYMBOL(getrawmonotonic64);
+
/**
* timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
@@ -1068,8 +1068,8 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
}
/**
- * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
- * @delta: pointer to a timespec delta value
+ * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values
+ * @delta: pointer to a timespec64 delta value
*
* This hook is for architectures that cannot support read_persistent_clock
* because their RTC/persistent clock is only accessible when irqs are enabled.
@@ -1077,10 +1077,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk,
* This function should only be called by rtc_resume(), and allows
* a suspend offset to be injected into the timekeeping values.
*/
-void timekeeping_inject_sleeptime(struct timespec *delta)
+void timekeeping_inject_sleeptime64(struct timespec64 *delta)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 tmp;
unsigned long flags;
/*
@@ -1095,8 +1094,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta)
timekeeping_forward_now(tk);
- tmp = timespec_to_timespec64(*delta);
- __timekeeping_inject_sleeptime(tk, &tmp);
+ __timekeeping_inject_sleeptime(tk, delta);
timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1332,6 +1330,12 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
*
* XXX - TODO: Doc ntp_error calculation.
*/
+ if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+ /* NTP adjustment caused clocksource mult overflow */
+ WARN_ON_ONCE(1);
+ return;
+ }
+
tk->tkr.mult += mult_adj;
tk->xtime_interval += interval;
tk->tkr.xtime_nsec -= offset;
@@ -1397,7 +1401,8 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
}
if (unlikely(tk->tkr.clock->maxadj &&
- (tk->tkr.mult > tk->tkr.clock->mult + tk->tkr.clock->maxadj))) {
+ (abs(tk->tkr.mult - tk->tkr.clock->mult)
+ > tk->tkr.clock->maxadj))) {
printk_once(KERN_WARNING
"Adjusting %s more than 11%% (%ld vs %ld)\n",
tk->tkr.clock->name, (long)tk->tkr.mult,
@@ -1646,7 +1651,7 @@ struct timespec current_kernel_time(void)
}
EXPORT_SYMBOL(current_kernel_time);
-struct timespec get_monotonic_coarse(void)
+struct timespec64 get_monotonic_coarse64(void)
{
struct timekeeper *tk = &tk_core.timekeeper;
struct timespec64 now, mono;
@@ -1662,7 +1667,7 @@ struct timespec get_monotonic_coarse(void)
set_normalized_timespec64(&now, now.tv_sec + mono.tv_sec,
now.tv_nsec + mono.tv_nsec);
- return timespec64_to_timespec(now);
+ return now;
}
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3260ffdb368f..2d3f5c504939 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1377,12 +1377,11 @@ unsigned long get_next_timer_interrupt(unsigned long now)
void update_process_times(int user_tick)
{
struct task_struct *p = current;
- int cpu = smp_processor_id();
/* Note: this timer irq context must be accounted for as well. */
account_process_tick(p, user_tick);
run_local_timers();
- rcu_check_callbacks(cpu, user_tick);
+ rcu_check_callbacks(user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
irq_work_tick();
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fb186b9ddf51..31c90fec4158 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1925,8 +1925,16 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
* when we are adding another op to the rec or removing the
* current one. Thus, if the op is being added, we can
* ignore it because it hasn't attached itself to the rec
- * yet. That means we just need to find the op that has a
- * trampoline and is not beeing added.
+ * yet.
+ *
+ * If an ops is being modified (hooking to different functions)
+ * then we don't care about the new functions that are being
+ * added, just the old ones (that are probably being removed).
+ *
+ * If we are adding an ops to a function that already is using
+ * a trampoline, it needs to be removed (trampolines are only
+ * for single ops connected), then an ops that is not being
+ * modified also needs to be checked.
*/
do_for_each_ftrace_op(op, ftrace_ops_list) {
@@ -1940,17 +1948,23 @@ ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
if (op->flags & FTRACE_OPS_FL_ADDING)
continue;
+
/*
- * If the ops is not being added and has a trampoline,
- * then it must be the one that we want!
+ * If the ops is being modified and is in the old
+ * hash, then it is probably being removed from this
+ * function.
*/
- if (hash_contains_ip(ip, op->func_hash))
- return op;
-
- /* If the ops is being modified, it may be in the old hash. */
if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
hash_contains_ip(ip, &op->old_hash))
return op;
+ /*
+ * If the ops is not being added or modified, and it's
+ * in its normal filter hash, then this must be the one
+ * we want!
+ */
+ if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, op->func_hash))
+ return op;
} while_for_each_ftrace_op(op);
@@ -2293,10 +2307,13 @@ static void ftrace_run_update_code(int command)
FTRACE_WARN_ON(ret);
}
-static void ftrace_run_modify_code(struct ftrace_ops *ops, int command)
+static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
+ struct ftrace_hash *old_hash)
{
ops->flags |= FTRACE_OPS_FL_MODIFYING;
+ ops->old_hash.filter_hash = old_hash;
ftrace_run_update_code(command);
+ ops->old_hash.filter_hash = NULL;
ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
}
@@ -3340,7 +3357,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
static int ftrace_probe_registered;
-static void __enable_ftrace_function_probe(void)
+static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
{
int ret;
int i;
@@ -3348,7 +3365,8 @@ static void __enable_ftrace_function_probe(void)
if (ftrace_probe_registered) {
/* still need to update the function call sites */
if (ftrace_enabled)
- ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ old_hash);
return;
}
@@ -3477,13 +3495,14 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
} while_for_each_ftrace_rec();
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+ __enable_ftrace_function_probe(old_hash);
+
if (!ret)
free_ftrace_hash_rcu(old_hash);
else
count = ret;
- __enable_ftrace_function_probe();
-
out_unlock:
mutex_unlock(&ftrace_lock);
out:
@@ -3764,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
return add_hash_entry(hash, ip);
}
-static void ftrace_ops_update_code(struct ftrace_ops *ops)
+static void ftrace_ops_update_code(struct ftrace_ops *ops,
+ struct ftrace_hash *old_hash)
{
if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
- ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
}
static int
@@ -3813,7 +3833,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
old_hash = *orig_hash;
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
if (!ret) {
- ftrace_ops_update_code(ops);
+ ftrace_ops_update_code(ops, old_hash);
free_ftrace_hash_rcu(old_hash);
}
mutex_unlock(&ftrace_lock);
@@ -4058,7 +4078,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
if (!ret) {
- ftrace_ops_update_code(iter->ops);
+ ftrace_ops_update_code(iter->ops, old_hash);
free_ftrace_hash_rcu(old_hash);
}
mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2d75c94ae87d..a56e07c8d15b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
* ring_buffer_wait - wait for input to the ring buffer
* @buffer: buffer to wait on
* @cpu: the cpu buffer to wait on
+ * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
*
* If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
{
- struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
DEFINE_WAIT(wait);
struct rb_irq_work *work;
+ int ret = 0;
/*
* Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
}
- prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+ while (true) {
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
- /*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
- */
- work->waiters_pending = true;
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ work->waiters_pending = true;
+
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
+ break;
+
+ if (cpu != RING_BUFFER_ALL_CPUS &&
+ !ring_buffer_empty_cpu(buffer, cpu)) {
+ unsigned long flags;
+ bool pagebusy;
+
+ if (!full)
+ break;
+
+ raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+ raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+ if (!pagebusy)
+ break;
+ }
- if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
- (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
schedule();
+ }
finish_wait(&work->waiters, &wait);
- return 0;
+
+ return ret;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8a528392b1f4..92f4a6cee172 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static int wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter, bool full)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
return 0;
- return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ full);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp)
mutex_unlock(&iter->mutex);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, false);
mutex_lock(&iter->mutex);
if (ret)
return ret;
-
- if (signal_pending(current))
- return -EINTR;
}
return 1;
@@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
goto out_unlock;
}
mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, false);
mutex_lock(&trace_types_lock);
if (ret) {
size = ret;
goto out_unlock;
}
- if (signal_pending(current)) {
- size = -EINTR;
- goto out_unlock;
- }
goto again;
}
size = 0;
@@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
};
struct buffer_ref *ref;
int entries, size, i;
- ssize_t ret;
+ ssize_t ret = 0;
mutex_lock(&trace_types_lock);
@@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
int r;
ref = kzalloc(sizeof(*ref), GFP_KERNEL);
- if (!ref)
+ if (!ref) {
+ ret = -ENOMEM;
break;
+ }
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
+ ret = -ENOMEM;
kfree(ref);
break;
}
@@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
/* did we read anything? */
if (!spd.nr_pages) {
+ if (ret)
+ goto out;
+
if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
ret = -EAGAIN;
goto out;
}
mutex_unlock(&trace_types_lock);
- ret = wait_on_pipe(iter);
+ ret = wait_on_pipe(iter, true);
mutex_lock(&trace_types_lock);
if (ret)
goto out;
- if (signal_pending(current)) {
- ret = -EINTR;
- goto out;
- }
+
goto again;
}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4dc8b79c5f75..29228c4d5696 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
/* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
return;
@@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
int size;
syscall_nr = trace_get_syscall_nr(current, regs);
- if (syscall_nr < 0)
+ if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
return;
if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
return;