diff options
Diffstat (limited to 'kernel')
34 files changed, 1278 insertions, 903 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 080a2dfb5800..bf4495fcd25d 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -99,7 +99,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (err) goto free_smap; - err = get_callchain_buffers(); + err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) goto free_smap; diff --git a/kernel/cpu.c b/kernel/cpu.c index d948e44c471e..7b61887f7ccd 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1201,6 +1201,8 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown = takedown_cpu, .cant_stop = true, }, +#else + [CPUHP_BRINGUP_CPU] = { }, #endif }; diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 179ef4640964..e9fdb5203de5 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -104,7 +104,7 @@ fail: return -ENOMEM; } -int get_callchain_buffers(void) +int get_callchain_buffers(int event_max_stack) { int err = 0; int count; @@ -121,6 +121,15 @@ int get_callchain_buffers(void) /* If the allocation failed, give up */ if (!callchain_cpus_entries) err = -ENOMEM; + /* + * If requesting per event more than the global cap, + * return a different error to help userspace figure + * this out. + * + * And also do it here so that we have &callchain_mutex held. + */ + if (event_max_stack > sysctl_perf_event_max_stack) + err = -EOVERFLOW; goto exit; } @@ -174,11 +183,12 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) bool user = !event->attr.exclude_callchain_user; /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; if (!kernel && !user) return NULL; - return get_perf_callchain(regs, 0, kernel, user, sysctl_perf_event_max_stack, crosstask, true); + return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); } struct perf_callchain_entry * diff --git a/kernel/events/core.c b/kernel/events/core.c index 43d43a2d5811..79dae188a987 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -335,6 +335,7 @@ static atomic_t perf_sched_count; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); +static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -396,6 +397,13 @@ int perf_proc_update_handler(struct ctl_table *table, int write, if (ret || !write) return ret; + /* + * If throttling is disabled don't allow the write: + */ + if (sysctl_perf_cpu_time_max_percent == 100 || + sysctl_perf_cpu_time_max_percent == 0) + return -EINVAL; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -3686,6 +3694,39 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); +static void detach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_del_rcu(&event->sb_list); + raw_spin_unlock(&pel->lock); +} + +static bool is_sb_event(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + + if (event->parent) + return false; + + if (event->attach_state & PERF_ATTACH_TASK) + return false; + + if (attr->mmap || attr->mmap_data || attr->mmap2 || + attr->comm || attr->comm_exec || + attr->task || + attr->context_switch) + return true; + return false; +} + +static void unaccount_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + detach_sb_event(event); +} + static void unaccount_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -3749,6 +3790,8 @@ static void unaccount_event(struct perf_event *event) } unaccount_event_cpu(event, event->cpu); + + unaccount_pmu_sb_event(event); } static void perf_sched_delayed(struct work_struct *work) @@ -5875,11 +5918,11 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); +typedef void (perf_iterate_f)(struct perf_event *event, void *data); static void -perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_output_cb output, +perf_iterate_ctx(struct perf_event_context *ctx, + perf_iterate_f output, void *data, bool all) { struct perf_event *event; @@ -5896,52 +5939,55 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } } -static void -perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, - struct perf_event_context *task_ctx) +static void perf_iterate_sb_cpu(perf_iterate_f output, void *data) { - rcu_read_lock(); - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data, false); - preempt_enable(); - rcu_read_unlock(); + struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events); + struct perf_event *event; + + list_for_each_entry_rcu(event, &pel->list, sb_list) { + if (event->state < PERF_EVENT_STATE_INACTIVE) + continue; + if (!event_filter_match(event)) + continue; + output(event, data); + } } +/* + * Iterate all events that need to receive side-band events. + * + * For new callers; ensure that account_pmu_sb_event() includes + * your event, otherwise it might not get delivered. + */ static void -perf_event_aux(perf_event_aux_output_cb output, void *data, +perf_iterate_sb(perf_iterate_f output, void *data, struct perf_event_context *task_ctx) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; - struct pmu *pmu; int ctxn; + rcu_read_lock(); + preempt_disable(); + /* - * If we have task_ctx != NULL we only notify - * the task context itself. The task_ctx is set - * only for EXIT events before releasing task + * If we have task_ctx != NULL we only notify the task context itself. + * The task_ctx is set only for EXIT events before releasing task * context. */ if (task_ctx) { - perf_event_aux_task_ctx(output, data, task_ctx); - return; + perf_iterate_ctx(task_ctx, output, data, false); + goto done; } - rcu_read_lock(); - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); - if (cpuctx->unique_pmu != pmu) - goto next; - perf_event_aux_ctx(&cpuctx->ctx, output, data, false); - ctxn = pmu->task_ctx_nr; - if (ctxn < 0) - goto next; + perf_iterate_sb_cpu(output, data); + + for_each_task_context_nr(ctxn) { ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, output, data, false); -next: - put_cpu_ptr(pmu->pmu_cpu_context); + perf_iterate_ctx(ctx, output, data, false); } +done: + preempt_enable(); rcu_read_unlock(); } @@ -5990,7 +6036,7 @@ void perf_event_exec(void) perf_event_enable_on_exec(ctxn); - perf_event_aux_ctx(ctx, perf_event_addr_filters_exec, NULL, + perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL, true); } rcu_read_unlock(); @@ -6034,9 +6080,9 @@ static int __perf_pmu_output_stop(void *info) }; rcu_read_lock(); - perf_event_aux_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); + perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false); if (cpuctx->task_ctx) - perf_event_aux_ctx(cpuctx->task_ctx, __perf_event_output_stop, + perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop, &ro, false); rcu_read_unlock(); @@ -6165,7 +6211,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_output, + perf_iterate_sb(perf_event_task_output, &task_event, task_ctx); } @@ -6244,7 +6290,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_output, + perf_iterate_sb(perf_event_comm_output, comm_event, NULL); } @@ -6475,7 +6521,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_output, + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -6558,7 +6604,7 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma) if (!ctx) continue; - perf_event_aux_ctx(ctx, __perf_addr_filters_adjust, vma, true); + perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true); } rcu_read_unlock(); } @@ -6745,7 +6791,7 @@ static void perf_event_switch(struct task_struct *task, }, }; - perf_event_aux(perf_event_switch_output, + perf_iterate_sb(perf_event_switch_output, &switch_event, NULL); } @@ -8667,6 +8713,28 @@ unlock: return pmu; } +static void attach_sb_event(struct perf_event *event) +{ + struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu); + + raw_spin_lock(&pel->lock); + list_add_rcu(&event->sb_list, &pel->list); + raw_spin_unlock(&pel->lock); +} + +/* + * We keep a list of all !task (and therefore per-cpu) events + * that need to receive side-band records. + * + * This avoids having to scan all the various PMU per-cpu contexts + * looking for them. + */ +static void account_pmu_sb_event(struct perf_event *event) +{ + if (is_sb_event(event)) + attach_sb_event(event); +} + static void account_event_cpu(struct perf_event *event, int cpu) { if (event->parent) @@ -8747,6 +8815,8 @@ static void account_event(struct perf_event *event) enabled: account_event_cpu(event, event->cpu); + + account_pmu_sb_event(event); } /* @@ -8895,7 +8965,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); + err = get_callchain_buffers(attr->sample_max_stack); if (err) goto err_addr_filters; } @@ -9217,6 +9287,9 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + if (!attr.sample_max_stack) + attr.sample_max_stack = sysctl_perf_event_max_stack; + /* * In cgroup mode, the pid argument is used to pass the fd * opened to the cgroup directory in cgroupfs. The cpu argument @@ -9290,7 +9363,7 @@ SYSCALL_DEFINE5(perf_event_open, if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { - err = -ENOTSUPP; + err = -EOPNOTSUPP; goto err_alloc; } } @@ -10252,6 +10325,9 @@ static void __init perf_event_init_all_cpus(void) swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu)); + + INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu)); + raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu)); } } diff --git a/kernel/exit.c b/kernel/exit.c index 2fb4d44c51b1..84ae830234f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -776,10 +776,14 @@ void do_exit(long code) exit_signals(tsk); /* sets PF_EXITING */ /* - * tsk->flags are checked in the futex code to protect against - * an exiting task cleaning up the robust pi futexes. + * Ensure that all new tsk->pi_lock acquisitions must observe + * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). */ smp_mb(); + /* + * Ensure that we must observe the pi_state in exit_mm() -> + * mm_release() -> exit_pi_state_list(). + */ raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) { diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 4b353e0be121..0dbea887d625 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -452,7 +452,7 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val, return notifier_from_errno(ret); } -struct notifier_block jump_label_module_nb = { +static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 81f1a7107c0e..589d763a49b3 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -46,6 +46,7 @@ #include <linux/gfp.h> #include <linux/kmemcheck.h> #include <linux/random.h> +#include <linux/jhash.h> #include <asm/sections.h> @@ -309,10 +310,14 @@ static struct hlist_head chainhash_table[CHAINHASH_SIZE]; * It's a 64-bit hash, because it's important for the keys to be * unique. */ -#define iterate_chain_key(key1, key2) \ - (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ - ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ - (key2)) +static inline u64 iterate_chain_key(u64 key, u32 idx) +{ + u32 k0 = key, k1 = key >> 32; + + __jhash_mix(idx, k0, k1); /* Macro that modifies arguments! */ + + return k0 | (u64)k1 << 32; +} void lockdep_off(void) { diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index d06ae3bb46c5..57a871ae3c81 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -29,12 +29,12 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #define spin_lock_mutex(lock, flags) \ diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index a68bae5e852a..6cd6b8e9efd7 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -17,14 +17,20 @@ __list_del((waiter)->list.prev, (waiter)->list.next) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * The mutex owner can get read and written to locklessly. + * We should use WRITE_ONCE when writing the owner value to + * avoid store tearing, otherwise, a thread could potentially + * read a partially written and incomplete owner value. + */ static inline void mutex_set_owner(struct mutex *lock) { - lock->owner = current; + WRITE_ONCE(lock->owner, current); } static inline void mutex_clear_owner(struct mutex *lock) { - lock->owner = NULL; + WRITE_ONCE(lock->owner, NULL); } #else static inline void mutex_set_owner(struct mutex *lock) diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index fec082338668..19248ddf37ce 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -93,7 +93,7 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts) * that accesses can't leak upwards out of our subsequent critical * section in the case that the lock is currently held for write. */ - cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS; + cnts = atomic_fetch_add_acquire(_QR_BIAS, &lock->cnts); rspin_until_writer_unlock(lock, cnts); /* diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 5fc8c311b8fe..b2caec7315af 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -90,7 +90,7 @@ static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[MAX_NODES]); * therefore increment the cpu number by one. */ -static inline u32 encode_tail(int cpu, int idx) +static inline __pure u32 encode_tail(int cpu, int idx) { u32 tail; @@ -103,7 +103,7 @@ static inline u32 encode_tail(int cpu, int idx) return tail; } -static inline struct mcs_spinlock *decode_tail(u32 tail) +static inline __pure struct mcs_spinlock *decode_tail(u32 tail) { int cpu = (tail >> _Q_TAIL_CPU_OFFSET) - 1; int idx = (tail & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET; @@ -268,6 +268,63 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, #endif /* + * Various notes on spin_is_locked() and spin_unlock_wait(), which are + * 'interesting' functions: + * + * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE + * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64, + * PPC). Also qspinlock has a similar issue per construction, the setting of + * the locked byte can be unordered acquiring the lock proper. + * + * This gets to be 'interesting' in the following cases, where the /should/s + * end up false because of this issue. + * + * + * CASE 1: + * + * So the spin_is_locked() correctness issue comes from something like: + * + * CPU0 CPU1 + * + * global_lock(); local_lock(i) + * spin_lock(&G) spin_lock(&L[i]) + * for (i) if (!spin_is_locked(&G)) { + * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep(); + * return; + * } + * // deal with fail + * + * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such + * that there is exclusion between the two critical sections. + * + * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from + * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i]) + * /should/ be constrained by the ACQUIRE from spin_lock(&G). + * + * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB. + * + * + * CASE 2: + * + * For spin_unlock_wait() there is a second correctness issue, namely: + * + * CPU0 CPU1 + * + * flag = set; + * smp_mb(); spin_lock(&l) + * spin_unlock_wait(&l); if (!flag) + * // add to lockless list + * spin_unlock(&l); + * // iterate lockless list + * + * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0 + * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE + * semantics etc..) + * + * Where flag /should/ be ordered against the locked store of l. + */ + +/* * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before * issuing an _unordered_ store to set _Q_LOCKED_VAL. * @@ -322,7 +379,7 @@ void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); done: - smp_rmb(); /* CTRL + RMB -> ACQUIRE */ + smp_acquire__after_ctrl_dep(); } EXPORT_SYMBOL(queued_spin_unlock_wait); #endif @@ -418,7 +475,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); + smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -455,6 +512,8 @@ queue: * pending stuff. * * p,*,* -> n,*,* + * + * RELEASE, such that the stores to @node must be complete. */ old = xchg_tail(lock, tail); next = NULL; @@ -465,6 +524,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* + * The above xchg_tail() is also a load of @lock which generates, + * through decode_tail(), a pointer. + * + * The address dependency matches the RELEASE of xchg_tail() + * such that the access to @prev must happen after. + */ + smp_read_barrier_depends(); + WRITE_ONCE(prev->next, node); pv_wait_node(node, prev); @@ -494,7 +562,7 @@ queue: * * The PV pv_wait_head_or_lock function, if active, will acquire * the lock and return a non-zero value. So we have to skip the - * smp_cond_acquire() call. As the next PV queue head hasn't been + * smp_cond_load_acquire() call. As the next PV queue head hasn't been * designated yet, there is no way for the locked value to become * _Q_SLOW_VAL. So both the set_locked() and the * atomic_cmpxchg_relaxed() calls will be safe. @@ -505,7 +573,7 @@ queue: if ((val = pv_wait_head_or_lock(lock, node))) goto locked; - smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); + val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK)); locked: /* @@ -525,9 +593,9 @@ locked: break; } /* - * The smp_cond_acquire() call above has provided the necessary - * acquire semantics required for locking. At most two - * iterations of this loop may be ran. + * The smp_cond_load_acquire() call above has provided the + * necessary acquire semantics required for locking. At most + * two iterations of this loop may be ran. */ old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) @@ -551,7 +619,7 @@ release: /* * release the node */ - this_cpu_dec(mcs_nodes[0].count); + __this_cpu_dec(mcs_nodes[0].count); } EXPORT_SYMBOL(queued_spin_lock_slowpath); diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index 21ede57f68b3..37649e69056c 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -112,12 +112,12 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock) #else /* _Q_PENDING_BITS == 8 */ static __always_inline void set_pending(struct qspinlock *lock) { - atomic_set_mask(_Q_PENDING_VAL, &lock->val); + atomic_or(_Q_PENDING_VAL, &lock->val); } static __always_inline void clear_pending(struct qspinlock *lock) { - atomic_clear_mask(_Q_PENDING_VAL, &lock->val); + atomic_andnot(_Q_PENDING_VAL, &lock->val); } static __always_inline int trylock_clear_pending(struct qspinlock *lock) diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 3e746607abe5..1ec0f48962b3 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1478,7 +1478,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { - if (WARN_ON(in_irq() || in_nmi() || in_serving_softirq())) + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) return 0; return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 09e30c6225e5..447e08de1fab 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -80,7 +80,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name, debug_check_no_locks_freed((void *)sem, sizeof(*sem)); lockdep_init_map(&sem->dep_map, name, key, 0); #endif - sem->count = RWSEM_UNLOCKED_VALUE; + atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE); raw_spin_lock_init(&sem->wait_lock); INIT_LIST_HEAD(&sem->wait_list); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER @@ -114,12 +114,16 @@ enum rwsem_wake_type { * - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed) * - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so) * - there must be someone on the queue - * - the spinlock must be held by the caller + * - the wait_lock must be held by the caller + * - tasks are marked for wakeup, the caller must later invoke wake_up_q() + * to actually wakeup the blocked task(s) and drop the reference count, + * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed - * - writers are only woken if downgrading is false + * - writers are only marked woken if downgrading is false */ static struct rw_semaphore * -__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) +__rwsem_mark_wake(struct rw_semaphore *sem, + enum rwsem_wake_type wake_type, struct wake_q_head *wake_q) { struct rwsem_waiter *waiter; struct task_struct *tsk; @@ -128,13 +132,16 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list); if (waiter->type == RWSEM_WAITING_FOR_WRITE) { - if (wake_type == RWSEM_WAKE_ANY) - /* Wake writer at the front of the queue, but do not - * grant it the lock yet as we want other writers - * to be able to steal it. Readers, on the other hand, - * will block as they will notice the queued writer. + if (wake_type == RWSEM_WAKE_ANY) { + /* + * Mark writer at the front of the queue for wakeup. + * Until the task is actually later awoken later by + * the caller, other writers are able to steal it. + * Readers, on the other hand, will block as they + * will notice the queued writer. */ - wake_up_process(waiter->task); + wake_q_add(wake_q, waiter->task); + } goto out; } @@ -146,15 +153,27 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) if (wake_type != RWSEM_WAKE_READ_OWNED) { adjustment = RWSEM_ACTIVE_READ_BIAS; try_reader_grant: - oldcount = rwsem_atomic_update(adjustment, sem) - adjustment; + oldcount = atomic_long_fetch_add(adjustment, &sem->count); + if (unlikely(oldcount < RWSEM_WAITING_BIAS)) { - /* A writer stole the lock. Undo our reader grant. */ - if (rwsem_atomic_update(-adjustment, sem) & - RWSEM_ACTIVE_MASK) + /* + * If the count is still less than RWSEM_WAITING_BIAS + * after removing the adjustment, it is assumed that + * a writer has stolen the lock. We have to undo our + * reader grant. + */ + if (atomic_long_add_return(-adjustment, &sem->count) < + RWSEM_WAITING_BIAS) goto out; /* Last active locker left. Retry waking readers. */ goto try_reader_grant; } + /* + * It is not really necessary to set it to reader-owned here, + * but it gives the spinners an early indication that the + * readers now have the lock. + */ + rwsem_set_reader_owned(sem); } /* Grant an infinite number of read locks to the readers at the front @@ -179,7 +198,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) adjustment -= RWSEM_WAITING_BIAS; if (adjustment) - rwsem_atomic_add(adjustment, sem); + atomic_long_add(adjustment, &sem->count); next = sem->wait_list.next; loop = woken; @@ -187,17 +206,15 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + + wake_q_add(wake_q, tsk); /* - * Make sure we do not wakeup the next reader before - * setting the nil condition to grant the next reader; - * otherwise we could miss the wakeup on the other - * side and end up sleeping again. See the pairing - * in rwsem_down_read_failed(). + * Ensure that the last operation is setting the reader + * waiter to nil such that rwsem_down_read_failed() cannot + * race with do_exit() by always holding a reference count + * to the task to wakeup. */ - smp_mb(); - waiter->task = NULL; - wake_up_process(tsk); - put_task_struct(tsk); + smp_store_release(&waiter->task, NULL); } while (--loop); sem->wait_list.next = next; @@ -216,11 +233,11 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; + WAKE_Q(wake_q); /* set up my own style of waitqueue */ waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; - get_task_struct(tsk); raw_spin_lock_irq(&sem->wait_lock); if (list_empty(&sem->wait_list)) @@ -228,7 +245,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - count = rwsem_atomic_update(adjustment, sem); + count = atomic_long_add_return(adjustment, &sem->count); /* If there are no active locks, wake the front queued process(es). * @@ -238,9 +255,10 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) if (count == RWSEM_WAITING_BIAS || (count > RWSEM_WAITING_BIAS && adjustment != -RWSEM_ACTIVE_READ_BIAS)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); /* wait to be given the lock */ while (true) { @@ -255,17 +273,29 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) } EXPORT_SYMBOL(rwsem_down_read_failed); +/* + * This function must be called with the sem->wait_lock held to prevent + * race conditions between checking the rwsem wait list and setting the + * sem->count accordingly. + */ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) { /* - * Try acquiring the write lock. Check count first in order - * to reduce unnecessary expensive cmpxchg() operations. + * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS. */ - if (count == RWSEM_WAITING_BIAS && - cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, - RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { - if (!list_is_singular(&sem->wait_list)) - rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + if (count != RWSEM_WAITING_BIAS) + return false; + + /* + * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there + * are other tasks on the wait list, we need to add on WAITING_BIAS. + */ + count = list_is_singular(&sem->wait_list) ? + RWSEM_ACTIVE_WRITE_BIAS : + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS; + + if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count) + == RWSEM_WAITING_BIAS) { rwsem_set_owner(sem); return true; } @@ -279,13 +309,13 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = READ_ONCE(sem->count); + long old, count = atomic_long_read(&sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) return false; - old = cmpxchg_acquire(&sem->count, count, + old = atomic_long_cmpxchg_acquire(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); if (old == count) { rwsem_set_owner(sem); @@ -306,16 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!owner) { - long count = READ_ONCE(sem->count); + if (!rwsem_owner_is_writer(owner)) { /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath with the lock being active, then there is a possibility - * reader(s) may have the lock. To be safe, bail spinning in these - * situations. + * Don't spin if the rwsem is readers owned. */ - if (count & RWSEM_ACTIVE_MASK) - ret = false; + ret = !rwsem_owner_is_reader(owner); goto done; } @@ -325,10 +350,15 @@ done: return ret; } -static noinline -bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) +/* + * Return true only if we can still spin on the owner field of the rwsem. + */ +static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { - long count; + struct task_struct *owner = READ_ONCE(sem->owner); + + if (!rwsem_owner_is_writer(owner)) + goto out; rcu_read_lock(); while (sem->owner == owner) { @@ -349,22 +379,16 @@ bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) cpu_relax_lowlatency(); } rcu_read_unlock(); - - if (READ_ONCE(sem->owner)) - return true; /* new owner, continue spinning */ - +out: /* - * When the owner is not set, the lock could be free or - * held by readers. Check the counter to verify the - * state. + * If there is a new owner or the owner is not set, we continue + * spinning. */ - count = READ_ONCE(sem->count); - return (count == 0 || count == RWSEM_WAITING_BIAS); + return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { - struct task_struct *owner; bool taken = false; preempt_disable(); @@ -376,12 +400,17 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) if (!osq_lock(&sem->osq)) goto done; - while (true) { - owner = READ_ONCE(sem->owner); - if (owner && !rwsem_spin_on_owner(sem, owner)) - break; - - /* wait_lock will be acquired if write_lock is obtained */ + /* + * Optimistically spin on the owner field and attempt to acquire the + * lock whenever the owner changes. Spinning will be stopped when: + * 1) the owning writer isn't running; or + * 2) readers own the lock as we can't determine if they are + * actively running or not. + */ + while (rwsem_spin_on_owner(sem)) { + /* + * Try to acquire the lock + */ if (rwsem_try_write_lock_unqueued(sem)) { taken = true; break; @@ -393,7 +422,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * we're an RT task that will live-lock because we won't let * the owner complete. */ - if (!owner && (need_resched() || rt_task(current))) + if (!sem->owner && (need_resched() || rt_task(current))) break; /* @@ -440,9 +469,10 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; + WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ - count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem); + count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); /* do optimistic spinning and steal lock if possible */ if (rwsem_optimistic_spin(sem)) @@ -465,18 +495,29 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = READ_ONCE(sem->count); + count = atomic_long_read(&sem->count); /* * If there were already threads queued before us and there are * no active writers, the lock must be read owned; so we try to * wake any read locks that were queued ahead of us. */ - if (count > RWSEM_WAITING_BIAS) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS); + if (count > RWSEM_WAITING_BIAS) { + WAKE_Q(wake_q); + + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); + /* + * The wakeup is normally called _after_ the wait_lock + * is released, but given that we are proactively waking + * readers we can deal with the wake_q overhead as it is + * similar to releasing and taking the wait_lock again + * for attempting rwsem_try_write_lock(). + */ + wake_up_q(&wake_q); + } } else - count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count); /* wait until we successfully acquire the lock */ set_current_state(state); @@ -492,7 +533,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) schedule(); set_current_state(state); - } while ((count = sem->count) & RWSEM_ACTIVE_MASK); + } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK); raw_spin_lock_irq(&sem->wait_lock); } @@ -507,10 +548,11 @@ out_nolock: raw_spin_lock_irq(&sem->wait_lock); list_del(&waiter.list); if (list_empty(&sem->wait_list)) - rwsem_atomic_update(-RWSEM_WAITING_BIAS, sem); + atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count); else - __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); + wake_up_q(&wake_q); return ERR_PTR(-EINTR); } @@ -537,6 +579,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -573,9 +616,10 @@ locked: /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } @@ -590,14 +634,16 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; + WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); /* do nothing if list empty */ if (!list_empty(&sem->wait_list)) - sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED); + sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q); raw_spin_unlock_irqrestore(&sem->wait_lock, flags); + wake_up_q(&wake_q); return sem; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 2e853ad93a3a..45ba475d4be3 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -22,6 +22,7 @@ void __sched down_read(struct rw_semaphore *sem) rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read); @@ -33,8 +34,10 @@ int down_read_trylock(struct rw_semaphore *sem) { int ret = __down_read_trylock(sem); - if (ret == 1) + if (ret == 1) { rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); + rwsem_set_reader_owned(sem); + } return ret; } @@ -124,7 +127,7 @@ void downgrade_write(struct rw_semaphore *sem) * lockdep: a downgraded write will live on as a write * dependency. */ - rwsem_clear_owner(sem); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } @@ -138,6 +141,7 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); LOCK_CONTENDED(sem, __down_read_trylock, __down_read); + rwsem_set_reader_owned(sem); } EXPORT_SYMBOL(down_read_nested); diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index 870ed9a5b426..a699f4048ba1 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,14 +1,58 @@ +/* + * The owner field of the rw_semaphore structure will be set to + * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * the owner field when it unlocks. A reader, on the other hand, will + * not touch the owner field when it unlocks. + * + * In essence, the owner field now has the following 3 states: + * 1) 0 + * - lock is free or the owner hasn't set the field yet + * 2) RWSEM_READER_OWNED + * - lock is currently or previously owned by readers (lock is free + * or not set by owner yet) + * 3) Other non-zero value + * - a writer owns the lock + */ +#define RWSEM_READER_OWNED ((struct task_struct *)1UL) + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER +/* + * All writes to owner are protected by WRITE_ONCE() to make sure that + * store tearing can't happen as optimistic spinners may read and use + * the owner value concurrently without lock. Read from owner, however, + * may not need READ_ONCE() as long as the pointer value is only used + * for comparison and isn't being dereferenced. + */ static inline void rwsem_set_owner(struct rw_semaphore *sem) { - sem->owner = current; + WRITE_ONCE(sem->owner, current); } static inline void rwsem_clear_owner(struct rw_semaphore *sem) { - sem->owner = NULL; + WRITE_ONCE(sem->owner, NULL); +} + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ + /* + * We check the owner value first to make sure that we will only + * do a write to the rwsem cacheline when it is really necessary + * to minimize cacheline contention. + */ + if (sem->owner != RWSEM_READER_OWNED) + WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); +} + +static inline bool rwsem_owner_is_writer(struct task_struct *owner) +{ + return owner && owner != RWSEM_READER_OWNED; } +static inline bool rwsem_owner_is_reader(struct task_struct *owner) +{ + return owner == RWSEM_READER_OWNED; +} #else static inline void rwsem_set_owner(struct rw_semaphore *sem) { @@ -17,4 +61,8 @@ static inline void rwsem_set_owner(struct rw_semaphore *sem) static inline void rwsem_clear_owner(struct rw_semaphore *sem) { } + +static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) +{ +} #endif diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 3cee0d8393ed..d38ab08a3fe7 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -58,7 +58,7 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.vnet.ibm.com>"); #define VERBOSE_PERFOUT_ERRSTRING(s) \ do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) -torture_param(bool, gp_exp, true, "Use expedited GP wait primitives"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); torture_param(int, nreaders, -1, "Number of RCU reader threads"); torture_param(int, nwriters, -1, "Number of RCU updater threads"); @@ -96,12 +96,7 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -#if defined(MODULE) || defined(CONFIG_RCU_PERF_TEST_RUNNABLE) -#define RCUPERF_RUNNABLE_INIT 1 -#else -#define RCUPERF_RUNNABLE_INIT 0 -#endif -static int perf_runnable = RCUPERF_RUNNABLE_INIT; +static int perf_runnable = IS_ENABLED(MODULE); module_param(perf_runnable, int, 0444); MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); @@ -363,8 +358,6 @@ rcu_perf_writer(void *arg) u64 *wdpp = writer_durations[me]; VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp); - WARN_ON(rcu_gp_is_normal() && gp_exp); WARN_ON(!wdpp); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); sp.sched_priority = 1; @@ -631,12 +624,24 @@ rcu_perf_init(void) firsterr = -ENOMEM; goto unwind; } + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } + if (rcu_gp_is_normal() && gp_exp) { + VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + firsterr = -EINVAL; + goto unwind; + } for (i = 0; i < nrealwriters; i++) { writer_durations[i] = kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), GFP_KERNEL); - if (!writer_durations[i]) + if (!writer_durations[i]) { + firsterr = -ENOMEM; goto unwind; + } firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, writer_tasks[i]); if (firsterr) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 084a28a732eb..971e2b138063 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -182,12 +182,7 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) -#define RCUTORTURE_RUNNABLE_INIT 1 -#else -#define RCUTORTURE_RUNNABLE_INIT 0 -#endif -static int torture_runnable = RCUTORTURE_RUNNABLE_INIT; +static int torture_runnable = IS_ENABLED(MODULE); module_param(torture_runnable, int, 0444); MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); @@ -1476,7 +1471,7 @@ static int rcu_torture_barrier_cbs(void *arg) break; /* * The above smp_load_acquire() ensures barrier_phase load - * is ordered before the folloiwng ->call(). + * is ordered before the following ->call(). */ local_irq_disable(); /* Just to test no-irq call_rcu(). */ cur_ops->call(&rcu, rcu_torture_barrier_cbf); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index c7f1bc4f817c..f433959e9322 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -125,12 +125,14 @@ int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; /* Number of rcu_nodes at specified level. */ static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ +/* panic() on RCU Stall sysctl. */ +int sysctl_panic_on_rcu_stall __read_mostly; /* * The rcu_scheduler_active variable transitions from zero to one just * before the first task is spawned. So when this variable is zero, RCU * can assume that there is but one task, allowing RCU to (for example) - * optimize synchronize_sched() to a simple barrier(). When this variable + * optimize synchronize_rcu() to a simple barrier(). When this variable * is one, RCU must actually do all the hard work required to detect real * grace periods. This variable is also used to suppress boot-time false * positives from lockdep-RCU error checking. @@ -159,6 +161,7 @@ static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, bool wake); +static void sync_sched_exp_online_cleanup(int cpu); /* rcuc/rcub kthread realtime priority */ #ifdef CONFIG_RCU_KTHREAD_PRIO @@ -1284,9 +1287,9 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) rcu_for_each_leaf_node(rsp, rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) - dump_cpu_task(rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) + dump_cpu_task(cpu); } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } @@ -1311,6 +1314,12 @@ static void rcu_stall_kick_kthreads(struct rcu_state *rsp) } } +static inline void panic_on_rcu_stall(void) +{ + if (sysctl_panic_on_rcu_stall) + panic("RCU Stall\n"); +} + static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) { int cpu; @@ -1351,10 +1360,9 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { - for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) - if (rnp->qsmask & (1UL << cpu)) { - print_cpu_stall_info(rsp, - rnp->grplo + cpu); + for_each_leaf_node_possible_cpu(rnp, cpu) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + print_cpu_stall_info(rsp, cpu); ndetected++; } } @@ -1390,6 +1398,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rcu_check_gp_kthread_starvation(rsp); + panic_on_rcu_stall(); + force_quiescent_state(rsp); /* Kick them all. */ } @@ -1430,6 +1440,8 @@ static void print_cpu_stall(struct rcu_state *rsp) jiffies + 3 * rcu_jiffies_till_stall_check() + 3); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + panic_on_rcu_stall(); + /* * Attempt to revive the RCU machinery by forcing a context switch. * @@ -1989,8 +2001,7 @@ static bool rcu_gp_init(struct rcu_state *rsp) * of the tree within the rsp->node[] array. Note that other CPUs * will access only the leaves of the hierarchy, thus seeing that no * grace period is in progress, at least until the corresponding - * leaf node has been initialized. In addition, we have excluded - * CPU-hotplug operations. + * leaf node has been initialized. * * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. @@ -2872,7 +2883,6 @@ static void force_qs_rnp(struct rcu_state *rsp, unsigned long *maxj), bool *isidle, unsigned long *maxj) { - unsigned long bit; int cpu; unsigned long flags; unsigned long mask; @@ -2907,9 +2917,8 @@ static void force_qs_rnp(struct rcu_state *rsp, continue; } } - cpu = rnp->grplo; - bit = 1; - for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long bit = leaf_node_cpu_bit(rnp, cpu); if ((rnp->qsmask & bit) != 0) { if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; @@ -3448,549 +3457,6 @@ static bool rcu_seq_done(unsigned long *sp, unsigned long s) return ULONG_CMP_GE(READ_ONCE(*sp), s); } -/* Wrapper functions for expedited grace periods. */ -static void rcu_exp_gp_seq_start(struct rcu_state *rsp) -{ - rcu_seq_start(&rsp->expedited_sequence); -} -static void rcu_exp_gp_seq_end(struct rcu_state *rsp) -{ - rcu_seq_end(&rsp->expedited_sequence); - smp_mb(); /* Ensure that consecutive grace periods serialize. */ -} -static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) -{ - unsigned long s; - - smp_mb(); /* Caller's modifications seen first by other CPUs. */ - s = rcu_seq_snap(&rsp->expedited_sequence); - trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); - return s; -} -static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) -{ - return rcu_seq_done(&rsp->expedited_sequence, s); -} - -/* - * Reset the ->expmaskinit values in the rcu_node tree to reflect any - * recent CPU-online activity. Note that these masks are not cleared - * when CPUs go offline, so they reflect the union of all CPUs that have - * ever been online. This means that this function normally takes its - * no-work-to-do fastpath. - */ -static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) -{ - bool done; - unsigned long flags; - unsigned long mask; - unsigned long oldmask; - int ncpus = READ_ONCE(rsp->ncpus); - struct rcu_node *rnp; - struct rcu_node *rnp_up; - - /* If no new CPUs onlined since last time, nothing to do. */ - if (likely(ncpus == rsp->ncpus_snap)) - return; - rsp->ncpus_snap = ncpus; - - /* - * Each pass through the following loop propagates newly onlined - * CPUs for the current rcu_node structure up the rcu_node tree. - */ - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - continue; /* No new CPUs, nothing to do. */ - } - - /* Update this node's mask, track old value for propagation. */ - oldmask = rnp->expmaskinit; - rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* If was already nonzero, nothing to propagate. */ - if (oldmask) - continue; - - /* Propagate the new CPU up the tree. */ - mask = rnp->grpmask; - rnp_up = rnp->parent; - done = false; - while (rnp_up) { - raw_spin_lock_irqsave_rcu_node(rnp_up, flags); - if (rnp_up->expmaskinit) - done = true; - rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); - if (done) - break; - mask = rnp_up->grpmask; - rnp_up = rnp_up->parent; - } - } -} - -/* - * Reset the ->expmask values in the rcu_node tree in preparation for - * a new expedited grace period. - */ -static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) -{ - unsigned long flags; - struct rcu_node *rnp; - - sync_exp_reset_tree_hotplug(rsp); - rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - WARN_ON_ONCE(rnp->expmask); - rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } -} - -/* - * Return non-zero if there is no RCU expedited grace period in progress - * for the specified rcu_node structure, in other words, if all CPUs and - * tasks covered by the specified rcu_node structure have done their bit - * for the current expedited grace period. Works only for preemptible - * RCU -- other RCU implementation use other means. - * - * Caller must hold the rcu_state's exp_mutex. - */ -static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) -{ - return rnp->exp_tasks == NULL && - READ_ONCE(rnp->expmask) == 0; -} - -/* - * Report the exit from RCU read-side critical section for the last task - * that queued itself during or before the current expedited preemptible-RCU - * grace period. This event is reported either to the rcu_node structure on - * which the task was queued or to one of that rcu_node structure's ancestors, - * recursively up the tree. (Calm down, calm down, we do the recursion - * iteratively!) - * - * Caller must hold the rcu_state's exp_mutex and the specified rcu_node - * structure's ->lock. - */ -static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, - bool wake, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - - for (;;) { - if (!sync_rcu_preempt_exp_done(rnp)) { - if (!rnp->expmask) - rcu_initiate_boost(rnp, flags); - else - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - break; - } - if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - if (wake) { - smp_mb(); /* EGP done before wake_up(). */ - swake_up(&rsp->expedited_wq); - } - break; - } - mask = rnp->grpmask; - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ - rnp = rnp->parent; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ - WARN_ON_ONCE(!(rnp->expmask & mask)); - rnp->expmask &= ~mask; - } -} - -/* - * Report expedited quiescent state for specified node. This is a - * lock-acquisition wrapper function for __rcu_report_exp_rnp(). - * - * Caller must hold the rcu_state's exp_mutex. - */ -static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, - struct rcu_node *rnp, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - __rcu_report_exp_rnp(rsp, rnp, wake, flags); -} - -/* - * Report expedited quiescent state for multiple CPUs, all covered by the - * specified leaf rcu_node structure. Caller must hold the rcu_state's - * exp_mutex. - */ -static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, - unsigned long mask, bool wake) -{ - unsigned long flags; - - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - return; - } - rnp->expmask &= ~mask; - __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ -} - -/* - * Report expedited quiescent state for specified rcu_data (CPU). - */ -static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, - bool wake) -{ - rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); -} - -/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ -static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, - unsigned long s) -{ - if (rcu_exp_gp_seq_done(rsp, s)) { - trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); - /* Ensure test happens before caller kfree(). */ - smp_mb__before_atomic(); /* ^^^ */ - atomic_long_inc(stat); - return true; - } - return false; -} - -/* - * Funnel-lock acquisition for expedited grace periods. Returns true - * if some other task completed an expedited grace period that this task - * can piggy-back on, and with no mutex held. Otherwise, returns false - * with the mutex held, indicating that the caller must actually do the - * expedited grace period. - */ -static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - struct rcu_node *rnp = rdp->mynode; - struct rcu_node *rnp_root = rcu_get_root(rsp); - - /* Low-contention fastpath. */ - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && - (rnp == rnp_root || - ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && - !mutex_is_locked(&rsp->exp_mutex) && - mutex_trylock(&rsp->exp_mutex)) - goto fastpath; - - /* - * Each pass through the following loop works its way up - * the rcu_node tree, returning if others have done the work or - * otherwise falls through to acquire rsp->exp_mutex. The mapping - * from CPU to rcu_node structure can be inexact, as it is just - * promoting locality and is not strictly needed for correctness. - */ - for (; rnp != NULL; rnp = rnp->parent) { - if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) - return true; - - /* Work not done, either wait here or go up. */ - spin_lock(&rnp->exp_lock); - if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { - - /* Someone else doing GP, so wait for them. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, - rnp->grplo, rnp->grphi, - TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone2, s)); - return true; - } - rnp->exp_seq_rq = s; /* Followers can wait on us. */ - spin_unlock(&rnp->exp_lock); - trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, - rnp->grphi, TPS("nxtlvl")); - } - mutex_lock(&rsp->exp_mutex); -fastpath: - if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { - mutex_unlock(&rsp->exp_mutex); - return true; - } - rcu_exp_gp_seq_start(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); - return false; -} - -/* Invoked on each online non-idle CPU for expedited quiescent state. */ -static void sync_sched_exp_handler(void *data) -{ - struct rcu_data *rdp; - struct rcu_node *rnp; - struct rcu_state *rsp = data; - - rdp = this_cpu_ptr(rsp->rda); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - if (rcu_is_cpu_rrupt_from_idle()) { - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), true); - return; - } - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); - resched_cpu(smp_processor_id()); -} - -/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ -static void sync_sched_exp_online_cleanup(int cpu) -{ - struct rcu_data *rdp; - int ret; - struct rcu_node *rnp; - struct rcu_state *rsp = &rcu_sched_state; - - rdp = per_cpu_ptr(rsp->rda, cpu); - rnp = rdp->mynode; - if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) - return; - ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); - WARN_ON_ONCE(ret); -} - -/* - * Select the nodes that the upcoming expedited grace period needs - * to wait for. - */ -static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, - smp_call_func_t func) -{ - int cpu; - unsigned long flags; - unsigned long mask; - unsigned long mask_ofl_test; - unsigned long mask_ofl_ipi; - int ret; - struct rcu_node *rnp; - - sync_exp_reset_tree(rsp); - rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave_rcu_node(rnp, flags); - - /* Each pass checks a CPU for identity, offline, and idle. */ - mask_ofl_test = 0; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - if (raw_smp_processor_id() == cpu || - !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) - mask_ofl_test |= rdp->grpmask; - } - mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; - - /* - * Need to wait for any blocked tasks as well. Note that - * additional blocking tasks will also block the expedited - * GP until such time as the ->expmask bits are cleared. - */ - if (rcu_preempt_has_tasks(rnp)) - rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - - /* IPI the remaining CPUs for expedited quiescent state. */ - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(mask_ofl_ipi & mask)) - continue; -retry_ipi: - ret = smp_call_function_single(cpu, func, rsp, 0); - if (!ret) { - mask_ofl_ipi &= ~mask; - continue; - } - /* Failed, raced with offline. */ - raw_spin_lock_irqsave_rcu_node(rnp, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave_rcu_node(rnp, flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); - } - /* Report quiescent states for those that went offline. */ - mask_ofl_test |= mask_ofl_ipi; - if (mask_ofl_test) - rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); - } -} - -static void synchronize_sched_expedited_wait(struct rcu_state *rsp) -{ - int cpu; - unsigned long jiffies_stall; - unsigned long jiffies_start; - unsigned long mask; - int ndetected; - struct rcu_node *rnp; - struct rcu_node *rnp_root = rcu_get_root(rsp); - int ret; - - jiffies_stall = rcu_jiffies_till_stall_check(); - jiffies_start = jiffies; - - for (;;) { - ret = swait_event_timeout( - rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root), - jiffies_stall); - if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) - return; - if (ret < 0) { - /* Hit a signal, disable CPU stall warnings. */ - swait_event(rsp->expedited_wq, - sync_rcu_preempt_exp_done(rnp_root)); - return; - } - pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", - rsp->name); - ndetected = 0; - rcu_for_each_leaf_node(rsp, rnp) { - ndetected += rcu_print_task_exp_stall(rnp); - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - struct rcu_data *rdp; - - if (!(rnp->expmask & mask)) - continue; - ndetected++; - rdp = per_cpu_ptr(rsp->rda, cpu); - pr_cont(" %d-%c%c%c", cpu, - "O."[!!cpu_online(cpu)], - "o."[!!(rdp->grpmask & rnp->expmaskinit)], - "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); - } - mask <<= 1; - } - pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", - jiffies - jiffies_start, rsp->expedited_sequence, - rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); - if (ndetected) { - pr_err("blocking rcu_node structures:"); - rcu_for_each_node_breadth_first(rsp, rnp) { - if (rnp == rnp_root) - continue; /* printed unconditionally */ - if (sync_rcu_preempt_exp_done(rnp)) - continue; - pr_cont(" l=%u:%d-%d:%#lx/%c", - rnp->level, rnp->grplo, rnp->grphi, - rnp->expmask, - ".T"[!!rnp->exp_tasks]); - } - pr_cont("\n"); - } - rcu_for_each_leaf_node(rsp, rnp) { - mask = 1; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { - if (!(rnp->expmask & mask)) - continue; - dump_cpu_task(cpu); - } - } - jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; - } -} - -/* - * Wait for the current expedited grace period to complete, and then - * wake up everyone who piggybacked on the just-completed expedited - * grace period. Also update all the ->exp_seq_rq counters as needed - * in order to avoid counter-wrap problems. - */ -static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) -{ - struct rcu_node *rnp; - - synchronize_sched_expedited_wait(rsp); - rcu_exp_gp_seq_end(rsp); - trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); - - /* - * Switch over to wakeup mode, allowing the next GP, but -only- the - * next GP, to proceed. - */ - mutex_lock(&rsp->exp_wake_mutex); - mutex_unlock(&rsp->exp_mutex); - - rcu_for_each_node_breadth_first(rsp, rnp) { - if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { - spin_lock(&rnp->exp_lock); - /* Recheck, avoid hang in case someone just arrived. */ - if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) - rnp->exp_seq_rq = s; - spin_unlock(&rnp->exp_lock); - } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); - } - trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); - mutex_unlock(&rsp->exp_wake_mutex); -} - -/** - * synchronize_sched_expedited - Brute-force RCU-sched grace period - * - * Wait for an RCU-sched grace period to elapse, but use a "big hammer" - * approach to force the grace period to end quickly. This consumes - * significant time on all CPUs and is unfriendly to real-time workloads, - * so is thus not recommended for any sort of common-case code. In fact, - * if you are using synchronize_sched_expedited() in a loop, please - * restructure your code to batch your updates, and then use a single - * synchronize_sched() instead. - * - * This implementation can be thought of as an application of sequence - * locking to expedited grace periods, but using the sequence counter to - * determine when someone else has already done the work instead of for - * retrying readers. - */ -void synchronize_sched_expedited(void) -{ - unsigned long s; - struct rcu_state *rsp = &rcu_sched_state; - - /* If only one CPU, this is automatically a grace period. */ - if (rcu_blocking_is_gp()) - return; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_sched); - return; - } - - /* Take a snapshot of the sequence number. */ - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); - - /* Wait and clean up, including waking everyone. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_sched_expedited); - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -4281,7 +3747,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave_rcu_node(rnp, flags); - rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); @@ -4364,9 +3830,6 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ @@ -4751,4 +4214,5 @@ void __init rcu_init(void) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } +#include "tree_exp.h" #include "tree_plugin.h" diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e3959f5e6ddf..f714f873bf9d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -254,6 +254,13 @@ struct rcu_node { } ____cacheline_internodealigned_in_smp; /* + * Bitmasks in an rcu_node cover the interval [grplo, grphi] of CPU IDs, and + * are indexed relative to this interval rather than the global CPU ID space. + * This generates the bit for a CPU in node-local masks. + */ +#define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) + +/* * Do a full breadth-first scan of the rcu_node structures for the * specified rcu_state structure. */ @@ -281,6 +288,14 @@ struct rcu_node { (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) /* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +/* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. */ diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h new file mode 100644 index 000000000000..6d86ab6ec2c9 --- /dev/null +++ b/kernel/rcu/tree_exp.h @@ -0,0 +1,655 @@ +/* + * RCU expedited grace periods + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2016 + * + * Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com> + */ + +/* Wrapper functions for expedited grace periods. */ +static void rcu_exp_gp_seq_start(struct rcu_state *rsp) +{ + rcu_seq_start(&rsp->expedited_sequence); +} +static void rcu_exp_gp_seq_end(struct rcu_state *rsp) +{ + rcu_seq_end(&rsp->expedited_sequence); + smp_mb(); /* Ensure that consecutive grace periods serialize. */ +} +static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) +{ + unsigned long s; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + s = rcu_seq_snap(&rsp->expedited_sequence); + trace_rcu_exp_grace_period(rsp->name, s, TPS("snap")); + return s; +} +static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) +{ + return rcu_seq_done(&rsp->expedited_sequence, s); +} + +/* + * Reset the ->expmaskinit values in the rcu_node tree to reflect any + * recent CPU-online activity. Note that these masks are not cleared + * when CPUs go offline, so they reflect the union of all CPUs that have + * ever been online. This means that this function normally takes its + * no-work-to-do fastpath. + */ +static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) +{ + bool done; + unsigned long flags; + unsigned long mask; + unsigned long oldmask; + int ncpus = READ_ONCE(rsp->ncpus); + struct rcu_node *rnp; + struct rcu_node *rnp_up; + + /* If no new CPUs onlined since last time, nothing to do. */ + if (likely(ncpus == rsp->ncpus_snap)) + return; + rsp->ncpus_snap = ncpus; + + /* + * Each pass through the following loop propagates newly onlined + * CPUs for the current rcu_node structure up the rcu_node tree. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (rnp->expmaskinit == rnp->expmaskinitnext) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + continue; /* No new CPUs, nothing to do. */ + } + + /* Update this node's mask, track old value for propagation. */ + oldmask = rnp->expmaskinit; + rnp->expmaskinit = rnp->expmaskinitnext; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* If was already nonzero, nothing to propagate. */ + if (oldmask) + continue; + + /* Propagate the new CPU up the tree. */ + mask = rnp->grpmask; + rnp_up = rnp->parent; + done = false; + while (rnp_up) { + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); + if (rnp_up->expmaskinit) + done = true; + rnp_up->expmaskinit |= mask; + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); + if (done) + break; + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + } + } +} + +/* + * Reset the ->expmask values in the rcu_node tree in preparation for + * a new expedited grace period. + */ +static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp; + + sync_exp_reset_tree_hotplug(rsp); + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + WARN_ON_ONCE(rnp->expmask); + rnp->expmask = rnp->expmaskinit; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementation use other means. + * + * Caller must hold the rcu_state's exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return rnp->exp_tasks == NULL && + READ_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold the rcu_state's exp_mutex and the specified rcu_node + * structure's ->lock. + */ +static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, + bool wake, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + if (!rnp->expmask) + rcu_initiate_boost(rnp, flags); + else + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ + swake_up(&rsp->expedited_wq); + } + break; + } + mask = rnp->grpmask; + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ + WARN_ON_ONCE(!(rnp->expmask & mask)); + rnp->expmask &= ~mask; + } +} + +/* + * Report expedited quiescent state for specified node. This is a + * lock-acquisition wrapper function for __rcu_report_exp_rnp(). + * + * Caller must hold the rcu_state's exp_mutex. + */ +static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + __rcu_report_exp_rnp(rsp, rnp, wake, flags); +} + +/* + * Report expedited quiescent state for multiple CPUs, all covered by the + * specified leaf rcu_node structure. Caller must hold the rcu_state's + * exp_mutex. + */ +static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, + unsigned long mask, bool wake) +{ + unsigned long flags; + + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (!(rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + return; + } + rnp->expmask &= ~mask; + __rcu_report_exp_rnp(rsp, rnp, wake, flags); /* Releases rnp->lock. */ +} + +/* + * Report expedited quiescent state for specified rcu_data (CPU). + */ +static void rcu_report_exp_rdp(struct rcu_state *rsp, struct rcu_data *rdp, + bool wake) +{ + rcu_report_exp_cpu_mult(rsp, rdp->mynode, rdp->grpmask, wake); +} + +/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */ +static bool sync_exp_work_done(struct rcu_state *rsp, atomic_long_t *stat, + unsigned long s) +{ + if (rcu_exp_gp_seq_done(rsp, s)) { + trace_rcu_exp_grace_period(rsp->name, s, TPS("done")); + /* Ensure test happens before caller kfree(). */ + smp_mb__before_atomic(); /* ^^^ */ + atomic_long_inc(stat); + return true; + } + return false; +} + +/* + * Funnel-lock acquisition for expedited grace periods. Returns true + * if some other task completed an expedited grace period that this task + * can piggy-back on, and with no mutex held. Otherwise, returns false + * with the mutex held, indicating that the caller must actually do the + * expedited grace period. + */ +static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); + struct rcu_node *rnp = rdp->mynode; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + /* Low-contention fastpath. */ + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s) && + (rnp == rnp_root || + ULONG_CMP_LT(READ_ONCE(rnp_root->exp_seq_rq), s)) && + mutex_trylock(&rsp->exp_mutex)) + goto fastpath; + + /* + * Each pass through the following loop works its way up + * the rcu_node tree, returning if others have done the work or + * otherwise falls through to acquire rsp->exp_mutex. The mapping + * from CPU to rcu_node structure can be inexact, as it is just + * promoting locality and is not strictly needed for correctness. + */ + for (; rnp != NULL; rnp = rnp->parent) { + if (sync_exp_work_done(rsp, &rdp->exp_workdone1, s)) + return true; + + /* Work not done, either wait here or go up. */ + spin_lock(&rnp->exp_lock); + if (ULONG_CMP_GE(rnp->exp_seq_rq, s)) { + + /* Someone else doing GP, so wait for them. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, + rnp->grplo, rnp->grphi, + TPS("wait")); + wait_event(rnp->exp_wq[(s >> 1) & 0x3], + sync_exp_work_done(rsp, + &rdp->exp_workdone2, s)); + return true; + } + rnp->exp_seq_rq = s; /* Followers can wait on us. */ + spin_unlock(&rnp->exp_lock); + trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, + rnp->grphi, TPS("nxtlvl")); + } + mutex_lock(&rsp->exp_mutex); +fastpath: + if (sync_exp_work_done(rsp, &rdp->exp_workdone3, s)) { + mutex_unlock(&rsp->exp_mutex); + return true; + } + rcu_exp_gp_seq_start(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("start")); + return false; +} + +/* Invoked on each online non-idle CPU for expedited quiescent state. */ +static void sync_sched_exp_handler(void *data) +{ + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp = data; + + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || + __this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + if (rcu_is_cpu_rrupt_from_idle()) { + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); + return; + } + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + resched_cpu(smp_processor_id()); +} + +/* Send IPI for expedited cleanup if needed at end of CPU-hotplug operation. */ +static void sync_sched_exp_online_cleanup(int cpu) +{ + struct rcu_data *rdp; + int ret; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_sched_state; + + rdp = per_cpu_ptr(rsp->rda, cpu); + rnp = rdp->mynode; + if (!(READ_ONCE(rnp->expmask) & rdp->grpmask)) + return; + ret = smp_call_function_single(cpu, sync_sched_exp_handler, rsp, 0); + WARN_ON_ONCE(ret); +} + +/* + * Select the nodes that the upcoming expedited grace period needs + * to wait for. + */ +static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, + smp_call_func_t func) +{ + int cpu; + unsigned long flags; + unsigned long mask_ofl_test; + unsigned long mask_ofl_ipi; + int ret; + struct rcu_node *rnp; + + sync_exp_reset_tree(rsp); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave_rcu_node(rnp, flags); + + /* Each pass checks a CPU for identity, offline, and idle. */ + mask_ofl_test = 0; + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + if (raw_smp_processor_id() == cpu || + !(atomic_add_return(0, &rdtp->dynticks) & 0x1)) + mask_ofl_test |= rdp->grpmask; + } + mask_ofl_ipi = rnp->expmask & ~mask_ofl_test; + + /* + * Need to wait for any blocked tasks as well. Note that + * additional blocking tasks will also block the expedited + * GP until such time as the ->expmask bits are cleared. + */ + if (rcu_preempt_has_tasks(rnp)) + rnp->exp_tasks = rnp->blkd_tasks.next; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + + /* IPI the remaining CPUs for expedited quiescent state. */ + for_each_leaf_node_possible_cpu(rnp, cpu) { + unsigned long mask = leaf_node_cpu_bit(rnp, cpu); + if (!(mask_ofl_ipi & mask)) + continue; +retry_ipi: + ret = smp_call_function_single(cpu, func, rsp, 0); + if (!ret) { + mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); + } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + } + /* Report quiescent states for those that went offline. */ + mask_ofl_test |= mask_ofl_ipi; + if (mask_ofl_test) + rcu_report_exp_cpu_mult(rsp, rnp, mask_ofl_test, false); + } +} + +static void synchronize_sched_expedited_wait(struct rcu_state *rsp) +{ + int cpu; + unsigned long jiffies_stall; + unsigned long jiffies_start; + unsigned long mask; + int ndetected; + struct rcu_node *rnp; + struct rcu_node *rnp_root = rcu_get_root(rsp); + int ret; + + jiffies_stall = rcu_jiffies_till_stall_check(); + jiffies_start = jiffies; + + for (;;) { + ret = swait_event_timeout( + rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root), + jiffies_stall); + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) + return; + if (ret < 0) { + /* Hit a signal, disable CPU stall warnings. */ + swait_event(rsp->expedited_wq, + sync_rcu_preempt_exp_done(rnp_root)); + return; + } + pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", + rsp->name); + ndetected = 0; + rcu_for_each_leaf_node(rsp, rnp) { + ndetected += rcu_print_task_exp_stall(rnp); + for_each_leaf_node_possible_cpu(rnp, cpu) { + struct rcu_data *rdp; + + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + ndetected++; + rdp = per_cpu_ptr(rsp->rda, cpu); + pr_cont(" %d-%c%c%c", cpu, + "O."[!!cpu_online(cpu)], + "o."[!!(rdp->grpmask & rnp->expmaskinit)], + "N."[!!(rdp->grpmask & rnp->expmaskinitnext)]); + } + } + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } + rcu_for_each_leaf_node(rsp, rnp) { + for_each_leaf_node_possible_cpu(rnp, cpu) { + mask = leaf_node_cpu_bit(rnp, cpu); + if (!(rnp->expmask & mask)) + continue; + dump_cpu_task(cpu); + } + } + jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3; + } +} + +/* + * Wait for the current expedited grace period to complete, and then + * wake up everyone who piggybacked on the just-completed expedited + * grace period. Also update all the ->exp_seq_rq counters as needed + * in order to avoid counter-wrap problems. + */ +static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) +{ + struct rcu_node *rnp; + + synchronize_sched_expedited_wait(rsp); + rcu_exp_gp_seq_end(rsp); + trace_rcu_exp_grace_period(rsp->name, s, TPS("end")); + + /* + * Switch over to wakeup mode, allowing the next GP, but -only- the + * next GP, to proceed. + */ + mutex_lock(&rsp->exp_wake_mutex); + mutex_unlock(&rsp->exp_mutex); + + rcu_for_each_node_breadth_first(rsp, rnp) { + if (ULONG_CMP_LT(READ_ONCE(rnp->exp_seq_rq), s)) { + spin_lock(&rnp->exp_lock); + /* Recheck, avoid hang in case someone just arrived. */ + if (ULONG_CMP_LT(rnp->exp_seq_rq, s)) + rnp->exp_seq_rq = s; + spin_unlock(&rnp->exp_lock); + } + wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + } + trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); + mutex_unlock(&rsp->exp_wake_mutex); +} + +/** + * synchronize_sched_expedited - Brute-force RCU-sched grace period + * + * Wait for an RCU-sched grace period to elapse, but use a "big hammer" + * approach to force the grace period to end quickly. This consumes + * significant time on all CPUs and is unfriendly to real-time workloads, + * so is thus not recommended for any sort of common-case code. In fact, + * if you are using synchronize_sched_expedited() in a loop, please + * restructure your code to batch your updates, and then use a single + * synchronize_sched() instead. + * + * This implementation can be thought of as an application of sequence + * locking to expedited grace periods, but using the sequence counter to + * determine when someone else has already done the work instead of for + * retrying readers. + */ +void synchronize_sched_expedited(void) +{ + unsigned long s; + struct rcu_state *rsp = &rcu_sched_state; + + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + + /* Take a snapshot of the sequence number. */ + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_sched_exp_handler); + + /* Wait and clean up, including waking everyone. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#ifdef CONFIG_PREEMPT_RCU + +/* + * Remote handler for smp_call_function_single(). If there is an + * RCU read-side critical section in effect, request that the + * next rcu_read_unlock() record the quiescent state up the + * ->expmask fields in the rcu_node tree. Otherwise, immediately + * report the quiescent state. + */ +static void sync_rcu_exp_handler(void *info) +{ + struct rcu_data *rdp; + struct rcu_state *rsp = info; + struct task_struct *t = current; + + /* + * Within an RCU read-side critical section, request that the next + * rcu_read_unlock() report. Unless this RCU read-side critical + * section has already blocked, in which case it is already set + * up for the expedited grace period to wait on it. + */ + if (t->rcu_read_lock_nesting > 0 && + !t->rcu_read_unlock_special.b.blocked) { + t->rcu_read_unlock_special.b.exp_need_qs = true; + return; + } + + /* + * We are either exiting an RCU read-side critical section (negative + * values of t->rcu_read_lock_nesting) or are not in one at all + * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU + * read-side critical section that blocked before this expedited + * grace period started. Either way, we can immediately report + * the quiescent state. + */ + rdp = this_cpu_ptr(rsp->rda); + rcu_report_exp_rdp(rsp, rdp, true); +} + +/** + * synchronize_rcu_expedited - Brute-force RCU grace period + * + * Wait for an RCU-preempt grace period, but expedite it. The basic + * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler + * checks whether the CPU is in an RCU-preempt critical section, and + * if so, it sets a flag that causes the outermost rcu_read_unlock() + * to report the quiescent state. On the other hand, if the CPU is + * not in an RCU read-side critical section, the IPI handler reports + * the quiescent state immediately. + * + * Although this is a greate improvement over previous expedited + * implementations, it is still unfriendly to real-time workloads, so is + * thus not recommended for any sort of common-case code. In fact, if + * you are using synchronize_rcu_expedited() in a loop, please restructure + * your code to batch your updates, and then Use a single synchronize_rcu() + * instead. + */ +void synchronize_rcu_expedited(void) +{ + struct rcu_state *rsp = rcu_state_p; + unsigned long s; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + + s = rcu_exp_gp_seq_snap(rsp); + if (exp_funnel_lock(rsp, s)) + return; /* Someone else did our work for us. */ + + /* Initialize the rcu_node tree in preparation for the wait. */ + sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); + + /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ + rcu_exp_wait_wake(rsp, s); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#else /* #ifdef CONFIG_PREEMPT_RCU */ + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * But because preemptible RCU does not exist, map to rcu-sched. + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ff1cd4e1188d..0082fce402a0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -79,8 +79,6 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); - if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) - pr_info("\tRCU torture testing starts during boot.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -681,84 +679,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -/* - * Remote handler for smp_call_function_single(). If there is an - * RCU read-side critical section in effect, request that the - * next rcu_read_unlock() record the quiescent state up the - * ->expmask fields in the rcu_node tree. Otherwise, immediately - * report the quiescent state. - */ -static void sync_rcu_exp_handler(void *info) -{ - struct rcu_data *rdp; - struct rcu_state *rsp = info; - struct task_struct *t = current; - - /* - * Within an RCU read-side critical section, request that the next - * rcu_read_unlock() report. Unless this RCU read-side critical - * section has already blocked, in which case it is already set - * up for the expedited grace period to wait on it. - */ - if (t->rcu_read_lock_nesting > 0 && - !t->rcu_read_unlock_special.b.blocked) { - t->rcu_read_unlock_special.b.exp_need_qs = true; - return; - } - - /* - * We are either exiting an RCU read-side critical section (negative - * values of t->rcu_read_lock_nesting) or are not in one at all - * (zero value of t->rcu_read_lock_nesting). Or we are in an RCU - * read-side critical section that blocked before this expedited - * grace period started. Either way, we can immediately report - * the quiescent state. - */ - rdp = this_cpu_ptr(rsp->rda); - rcu_report_exp_rdp(rsp, rdp, true); -} - -/** - * synchronize_rcu_expedited - Brute-force RCU grace period - * - * Wait for an RCU-preempt grace period, but expedite it. The basic - * idea is to IPI all non-idle non-nohz online CPUs. The IPI handler - * checks whether the CPU is in an RCU-preempt critical section, and - * if so, it sets a flag that causes the outermost rcu_read_unlock() - * to report the quiescent state. On the other hand, if the CPU is - * not in an RCU read-side critical section, the IPI handler reports - * the quiescent state immediately. - * - * Although this is a greate improvement over previous expedited - * implementations, it is still unfriendly to real-time workloads, so is - * thus not recommended for any sort of common-case code. In fact, if - * you are using synchronize_rcu_expedited() in a loop, please restructure - * your code to batch your updates, and then Use a single synchronize_rcu() - * instead. - */ -void synchronize_rcu_expedited(void) -{ - struct rcu_state *rsp = rcu_state_p; - unsigned long s; - - /* If expedited grace periods are prohibited, fall back to normal. */ - if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu); - return; - } - - s = rcu_exp_gp_seq_snap(rsp); - if (exp_funnel_lock(rsp, s)) - return; /* Someone else did our work for us. */ - - /* Initialize the rcu_node tree in preparation for the wait. */ - sync_rcu_exp_select_cpus(rsp, sync_rcu_exp_handler); - - /* Wait for ->blkd_tasks lists to drain, then wake everyone up. */ - rcu_exp_wait_wake(rsp, s); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - /** * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. * @@ -883,16 +803,6 @@ static void rcu_preempt_check_callbacks(void) } /* - * Wait for an rcu-preempt grace period, but make it happen quickly. - * But because preemptible RCU does not exist, map to rcu-sched. - */ -void synchronize_rcu_expedited(void) -{ - synchronize_sched_expedited(); -} -EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); - -/* * Because preemptible RCU does not exist, rcu_barrier() is just * another name for rcu_barrier_sched(). */ @@ -1254,8 +1164,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) return; if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) return; - for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) - if ((mask & 0x1) && cpu != outgoingcpu) + for_each_leaf_node_possible_cpu(rnp, cpu) + if ((mask & leaf_node_cpu_bit(rnp, cpu)) && + cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); if (cpumask_weight(cm) == 0) cpumask_setall(cm); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 3e888cd5a594..f0d8322bc3ec 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -528,6 +528,7 @@ static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10; module_param(rcu_task_stall_timeout, int, 0644); static void rcu_spawn_tasks_kthread(void); +static struct task_struct *rcu_tasks_kthread_ptr; /* * Post an RCU-tasks callback. First call must be from process context @@ -537,6 +538,7 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) { unsigned long flags; bool needwake; + bool havetask = READ_ONCE(rcu_tasks_kthread_ptr); rhp->next = NULL; rhp->func = func; @@ -545,7 +547,9 @@ void call_rcu_tasks(struct rcu_head *rhp, rcu_callback_t func) *rcu_tasks_cbs_tail = rhp; rcu_tasks_cbs_tail = &rhp->next; raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags); - if (needwake) { + /* We can't create the thread unless interrupts are enabled. */ + if ((needwake && havetask) || + (!havetask && !irqs_disabled_flags(flags))) { rcu_spawn_tasks_kthread(); wake_up(&rcu_tasks_cbs_wq); } @@ -790,7 +794,6 @@ static int __noreturn rcu_tasks_kthread(void *arg) static void rcu_spawn_tasks_kthread(void) { static DEFINE_MUTEX(rcu_tasks_kthread_mutex); - static struct task_struct *rcu_tasks_kthread_ptr; struct task_struct *t; if (READ_ONCE(rcu_tasks_kthread_ptr)) { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4e9617a7e7d9..5c883fe8e440 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1937,7 +1937,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1948,7 +1948,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1974,7 +1974,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -2047,7 +2047,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -5405,13 +5405,15 @@ void idle_task_exit(void) /* * Since this CPU is going 'away' for a while, fold any nr_active delta * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. + * nr_active count is stable. We need to take the teardown thread which + * is calling this into account, so we hand in adjust = 1 to the load + * calculation. * * Also see the comment "Global load-average calculations". */ static void calc_load_migrate(struct rq *rq) { - long delta = calc_load_fold_active(rq); + long delta = calc_load_fold_active(rq, 1); if (delta) atomic_long_add(delta, &calc_load_tasks); } diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index b0b93fd33af9..a2d6eb71f06b 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -78,11 +78,11 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) loads[2] = (avenrun[2] + offset) << shift; } -long calc_load_fold_active(struct rq *this_rq) +long calc_load_fold_active(struct rq *this_rq, long adjust) { long nr_active, delta = 0; - nr_active = this_rq->nr_running; + nr_active = this_rq->nr_running - adjust; nr_active += (long)this_rq->nr_uninterruptible; if (nr_active != this_rq->calc_load_active) { @@ -188,7 +188,7 @@ void calc_load_enter_idle(void) * We're going into NOHZ mode, if there's any pending delta, fold it * into the pending idle delta. */ - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -389,7 +389,7 @@ void calc_global_load_tick(struct rq *this_rq) if (time_before(jiffies, this_rq->calc_load_update)) return; - delta = calc_load_fold_active(this_rq); + delta = calc_load_fold_active(this_rq, 0); if (delta) atomic_long_add(delta, &calc_load_tasks); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f44da95c70cd..c64fc5114004 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -28,7 +28,7 @@ extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; extern void calc_global_load_tick(struct rq *this_rq); -extern long calc_load_fold_active(struct rq *this_rq); +extern long calc_load_fold_active(struct rq *this_rq, long adjust); #ifdef CONFIG_SMP extern void cpu_load_update_active(struct rq *this_rq); @@ -1114,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif diff --git a/kernel/smp.c b/kernel/smp.c index 74165443c240..36552beed397 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -107,7 +107,7 @@ void __init call_function_init(void) */ static __always_inline void csd_lock_wait(struct call_single_data *csd) { - smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } static __always_inline void csd_lock(struct call_single_data *csd) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 87b2fc38398b..35f0dcb1cb4f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1205,6 +1205,17 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif +#if defined(CONFIG_TREE_RCU) || defined(CONFIG_PREEMPT_RCU) + { + .procname = "panic_on_rcu_stall", + .data = &sysctl_panic_on_rcu_stall, + .maxlen = sizeof(sysctl_panic_on_rcu_stall), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } }; diff --git a/kernel/task_work.c b/kernel/task_work.c index 53fa971d000d..6ab4842b00e8 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -108,7 +108,6 @@ void task_work_run(void) * fail, but it can play with *work and other entries. */ raw_spin_unlock_wait(&task->pi_lock); - smp_mb(); do { next = work->next; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1cafba860b08..39008d78927a 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -777,6 +777,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 479d25cd3d4f..255e225393ac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2186,6 +2186,7 @@ struct timespec64 get_monotonic_coarse64(void) return now; } +EXPORT_SYMBOL(get_monotonic_coarse64); /* * Must hold jiffies_lock diff --git a/kernel/torture.c b/kernel/torture.c index fa0bdeee17ac..75961b3decfe 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -82,6 +82,104 @@ static int min_online = -1; static int max_online; /* + * Attempt to take a CPU offline. Return false if the CPU is already + * offline or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, + unsigned long *sum_offl, int *min_offl, int *max_offl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (!cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_offl_attempts)++; + ret = cpu_down(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offline %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: offlined %d\n", + torture_type, cpu); + (*n_offl_successes)++; + delta = jiffies - starttime; + sum_offl += delta; + if (*min_offl < 0) { + *min_offl = delta; + *max_offl = delta; + } + if (*min_offl > delta) + *min_offl = delta; + if (*max_offl < delta) + *max_offl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_offline); + +/* + * Attempt to bring a CPU online. Return false if the CPU is already + * online or if it is not subject to CPU-hotplug operations. The + * caller can detect other failures by looking at the statistics. + */ +bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, + unsigned long *sum_onl, int *min_onl, int *max_onl) +{ + unsigned long delta; + int ret; + unsigned long starttime; + + if (cpu_online(cpu) || !cpu_is_hotpluggable(cpu)) + return false; + + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlining %d\n", + torture_type, cpu); + starttime = jiffies; + (*n_onl_attempts)++; + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "torture_onoff task: onlined %d\n", + torture_type, cpu); + (*n_onl_successes)++; + delta = jiffies - starttime; + *sum_onl += delta; + if (*min_onl < 0) { + *min_onl = delta; + *max_onl = delta; + } + if (*min_onl > delta) + *min_onl = delta; + if (*max_onl < delta) + *max_onl = delta; + } + + return true; +} +EXPORT_SYMBOL_GPL(torture_online); + +/* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. */ @@ -89,16 +187,19 @@ static int torture_onoff(void *arg) { int cpu; - unsigned long delta; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; - unsigned long starttime; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); + + if (maxcpu == 0) { + VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); + goto stop; + } + if (onoff_holdoff > 0) { VERBOSE_TOROUT_STRING("torture_onoff begin holdoff"); schedule_timeout_interruptible(onoff_holdoff); @@ -106,69 +207,16 @@ torture_onoff(void *arg) } while (!torture_must_stop()) { cpu = (torture_random(&rand) >> 4) % (maxcpu + 1); - if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_offline_attempts++; - ret = cpu_down(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offline %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: offlined %d\n", - torture_type, cpu); - n_offline_successes++; - delta = jiffies - starttime; - sum_offline += delta; - if (min_offline < 0) { - min_offline = delta; - max_offline = delta; - } - if (min_offline > delta) - min_offline = delta; - if (max_offline < delta) - max_offline = delta; - } - } else if (cpu_is_hotpluggable(cpu)) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlining %d\n", - torture_type, cpu); - starttime = jiffies; - n_online_attempts++; - ret = cpu_up(cpu); - if (ret) { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: online %d failed: errno %d\n", - torture_type, cpu, ret); - } else { - if (verbose) - pr_alert("%s" TORTURE_FLAG - "torture_onoff task: onlined %d\n", - torture_type, cpu); - n_online_successes++; - delta = jiffies - starttime; - sum_online += delta; - if (min_online < 0) { - min_online = delta; - max_online = delta; - } - if (min_online > delta) - min_online = delta; - if (max_online < delta) - max_online = delta; - } - } + if (!torture_offline(cpu, + &n_offline_attempts, &n_offline_successes, + &sum_offline, &min_offline, &max_offline)) + torture_online(cpu, + &n_online_attempts, &n_online_successes, + &sum_online, &min_online, &max_online); schedule_timeout_interruptible(onoff_interval); } + +stop: torture_kthread_stopping("torture_onoff"); return 0; } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index e1c0e996b5ae..97e7b793df35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4600,15 +4600,11 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) return; - /* is @cpu the only online CPU? */ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); - if (cpumask_weight(&cpumask) != 1) - return; /* as we're called from CPU_ONLINE, the following shouldn't fail */ for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, - pool->attrs->cpumask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0); } /* |