diff options
Diffstat (limited to 'kernel')
51 files changed, 1714 insertions, 299 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index e420a0c41b5f..cc3416f0deda 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -72,6 +72,8 @@ #include <linux/fs_struct.h> #include <linux/compat.h> #include <linux/ctype.h> +#include <linux/string.h> +#include <uapi/linux/limits.h> #include "audit.h" @@ -1861,8 +1863,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } list_for_each_entry_reverse(n, &context->names_list, list) { - /* does the name pointer match? */ - if (!n->name || n->name->name != name->name) + if (!n->name || strcmp(n->name->name, name->name)) continue; /* match the correct record type */ @@ -1877,12 +1878,48 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, } out_alloc: - /* unable to find the name from a previous getname(). Allocate a new - * anonymous entry. - */ - n = audit_alloc_name(context, AUDIT_TYPE_NORMAL); + /* unable to find an entry with both a matching name and type */ + n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN); if (!n) return; + /* unfortunately, while we may have a path name to record with the + * inode, we can't always rely on the string lasting until the end of + * the syscall so we need to create our own copy, it may fail due to + * memory allocation issues, but we do our best */ + if (name) { + /* we can't use getname_kernel() due to size limits */ + size_t len = strlen(name->name) + 1; + struct filename *new = __getname(); + + if (unlikely(!new)) + goto out; + + if (len <= (PATH_MAX - sizeof(*new))) { + new->name = (char *)(new) + sizeof(*new); + new->separate = false; + } else if (len <= PATH_MAX) { + /* this looks odd, but is due to final_putname() */ + struct filename *new2; + + new2 = kmalloc(sizeof(*new2), GFP_KERNEL); + if (unlikely(!new2)) { + __putname(new); + goto out; + } + new2->name = (char *)new; + new2->separate = true; + new = new2; + } else { + /* we should never get here, but let's be safe */ + __putname(new); + goto out; + } + strlcpy((char *)new->name, name->name, len); + new->uptr = NULL; + new->aname = n; + n->name = new; + n->name_put = true; + } out: if (parent) { n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 055ae6ac0280..eac45b688c08 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1025,6 +1025,16 @@ static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && @@ -1727,7 +1737,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) if (IS_ERR(map)) { verbose("fd %d is not pointing to valid bpf_map\n", insn->imm); - fdput(f); return PTR_ERR(map); } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32345c255da..0979e0a5dd11 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4489,7 +4489,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); if (err < 0) - goto err_free_percpu_ref; + goto err_free_css; css->id = err; if (visible) { @@ -4521,9 +4521,6 @@ err_list_del: list_del_rcu(&css->sibling); cgroup_clear_dir(css->cgroup, 1 << css->ss->id); err_free_id: - cgroup_idr_remove(&ss->css_idr, css->id); -err_free_percpu_ref: - percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); return err; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 672310e1597e..71b52dd957de 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1204,7 +1204,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, mutex_unlock(&callback_mutex); /* use trialcs->mems_allowed as a temp variable */ - update_nodemasks_hier(cs, &cs->mems_allowed); + update_nodemasks_hier(cs, &trialcs->mems_allowed); done: return retval; } diff --git a/kernel/events/core.c b/kernel/events/core.c index 8c681d1f480f..e0b00cc9cda2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3188,7 +3188,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; @@ -3729,28 +3729,21 @@ static void perf_event_for_each(struct perf_event *event, mutex_unlock(&ctx->mutex); } -static int perf_event_period(struct perf_event *event, u64 __user *arg) -{ - struct perf_event_context *ctx = event->ctx; - int ret = 0, active; +struct period_event { + struct perf_event *event; u64 value; +}; - if (!is_sampling_event(event)) - return -EINVAL; - - if (copy_from_user(&value, arg, sizeof(value))) - return -EFAULT; - - if (!value) - return -EINVAL; +static int __perf_event_period(void *info) +{ + struct period_event *pe = info; + struct perf_event *event = pe->event; + struct perf_event_context *ctx = event->ctx; + u64 value = pe->value; + bool active; - raw_spin_lock_irq(&ctx->lock); + raw_spin_lock(&ctx->lock); if (event->attr.freq) { - if (value > sysctl_perf_event_sample_rate) { - ret = -EINVAL; - goto unlock; - } - event->attr.sample_freq = value; } else { event->attr.sample_period = value; @@ -3769,11 +3762,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } + raw_spin_unlock(&ctx->lock); -unlock: + return 0; +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct period_event pe = { .event = event, }; + struct perf_event_context *ctx = event->ctx; + struct task_struct *task; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + if (event->attr.freq && value > sysctl_perf_event_sample_rate) + return -EINVAL; + + task = ctx->task; + pe.value = value; + + if (!task) { + cpu_function_call(event->cpu, __perf_event_period, &pe); + return 0; + } + +retry: + if (!task_function_call(task, __perf_event_period, &pe)) + return 0; + + raw_spin_lock_irq(&ctx->lock); + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + task = ctx->task; + goto retry; + } + + __perf_event_period(&pe); raw_spin_unlock_irq(&ctx->lock); - return ret; + return 0; } static const struct file_operations perf_fops; @@ -4057,20 +4092,20 @@ static void ring_buffer_attach(struct perf_event *event, WARN_ON_ONCE(event->rcu_pending); old_rb = event->rb; - event->rcu_batches = get_state_synchronize_rcu(); - event->rcu_pending = 1; - spin_lock_irqsave(&old_rb->event_lock, flags); list_del_rcu(&event->rb_entry); spin_unlock_irqrestore(&old_rb->event_lock, flags); - } - if (event->rcu_pending && rb) { - cond_synchronize_rcu(event->rcu_batches); - event->rcu_pending = 0; + event->rcu_batches = get_state_synchronize_rcu(); + event->rcu_pending = 1; } if (rb) { + if (event->rcu_pending) { + cond_synchronize_rcu(event->rcu_batches); + event->rcu_pending = 0; + } + spin_lock_irqsave(&rb->event_lock, flags); list_add_rcu(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); @@ -4398,12 +4433,20 @@ static const struct file_operations perf_fops = { * to user-space before waking everybody up. */ +static inline struct fasync_struct **perf_event_fasync(struct perf_event *event) +{ + /* only the parent has fasync state */ + if (event->parent) + event = event->parent; + return &event->fasync; +} + void perf_event_wakeup(struct perf_event *event) { ring_buffer_wakeup(event); if (event->pending_kill) { - kill_fasync(&event->fasync, SIGIO, event->pending_kill); + kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill); event->pending_kill = 0; } } @@ -5638,7 +5681,7 @@ static int __perf_event_overflow(struct perf_event *event, else perf_event_output(event, data, regs); - if (event->fasync && event->pending_kill) { + if (*perf_event_fasync(event) && event->pending_kill) { event->pending_wakeup = 1; irq_work_queue(&event->pending); } diff --git a/kernel/exit.c b/kernel/exit.c index 285ae8468de4..a9bda5938e7f 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -930,17 +930,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p) task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1313,7 +1324,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, if (unlikely(exit_state == EXIT_DEAD)) return 0; - ret = eligible_child(wo, p); + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 669e2ca7bf71..3a8ecfb38831 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1834,13 +1834,21 @@ static int check_unshare_flags(unsigned long unshare_flags) CLONE_NEWUSER|CLONE_NEWPID)) return -EINVAL; /* - * Not implemented, but pretend it works if there is nothing to - * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND - * needs to unshare vm. + * Not implemented, but pretend it works if there is nothing + * to unshare. Note that unsharing the address space or the + * signal handlers also need to unshare the signal queues (aka + * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { - /* FIXME: get_task_mm() increments ->mm_users */ - if (atomic_read(¤t->mm->mm_users) > 1) + if (!thread_group_empty(current)) + return -EINVAL; + } + if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { + if (atomic_read(¤t->sighand->count) > 1) + return -EINVAL; + } + if (unshare_flags & CLONE_VM) { + if (!current_is_single_threaded()) return -EINVAL; } @@ -1909,16 +1917,16 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* - * If unsharing a thread from a thread group, must also unshare vm. - */ - if (unshare_flags & CLONE_THREAD) - unshare_flags |= CLONE_VM; - /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* + * If unsharing a signal handlers, must also unshare the signal queues. + */ + if (unshare_flags & CLONE_SIGHAND) + unshare_flags |= CLONE_THREAD; + /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) diff --git a/kernel/futex.c b/kernel/futex.c index 647ff4b3a150..53ead0af1ba7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1159,10 +1159,20 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; - else if (curval != uval) - ret = -EINVAL; + } else if (curval != uval) { + /* + * If a unconditional UNLOCK_PI operation (user space did not + * try the TID->0 transition) raced with a waiter setting the + * FUTEX_WAITERS flag between get_user() and locking the hash + * bucket lock, retry the operation. + */ + if ((FUTEX_TID_MASK & curval) == uval) + ret = -EAGAIN; + else + ret = -EINVAL; + } if (ret) { raw_spin_unlock(&pi_state->pi_mutex.wait_lock); return ret; @@ -1373,8 +1383,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, if (likely(&hb1->chain != &hb2->chain)) { plist_del(&q->list, &hb1->chain); hb_waiters_dec(hb1); - plist_add(&q->list, &hb2->chain); hb_waiters_inc(hb2); + plist_add(&q->list, &hb2->chain); q->lock_ptr = &hb2->lock; } get_futex_key_refs(key2); @@ -2431,6 +2441,15 @@ retry: */ if (ret == -EFAULT) goto pi_faulted; + /* + * A unconditional UNLOCK_PI op raced against a waiter + * setting the FUTEX_WAITERS bit. Try again. + */ + if (ret == -EAGAIN) { + spin_unlock(&hb->lock); + put_futex_key(&key); + goto retry; + } goto out_unlock; } @@ -2677,6 +2696,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, spin_lock(&hb2->lock); BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + free_pi_state(q.pi_state); spin_unlock(&hb2->lock); } } else { @@ -2804,7 +2828,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 225086b2652e..9a76e3beda54 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -55,6 +55,21 @@ config GENERIC_IRQ_CHIP config IRQ_DOMAIN bool +# Support for hierarchical irq domains +config IRQ_DOMAIN_HIERARCHY + bool + select IRQ_DOMAIN + +# Generic MSI interrupt support +config GENERIC_MSI_IRQ + bool + +# Generic MSI hierarchical interrupt domain support +config GENERIC_MSI_IRQ_DOMAIN + bool + select IRQ_DOMAIN_HIERARCHY + select GENERIC_MSI_IRQ + config HANDLE_DOMAIN_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index fff17381f0af..d12123526e2b 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_IRQ_DOMAIN) += irqdomain.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_PM_SLEEP) += pm.o +obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index e5202f00cabc..6f1c7a566b95 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/irqdomain.h> #include <trace/events/irq.h> @@ -178,6 +179,7 @@ int irq_startup(struct irq_desc *desc, bool resend) irq_state_clr_disabled(desc); desc->depth = 0; + irq_domain_activate_irq(&desc->irq_data); if (desc->irq_data.chip->irq_startup) { ret = desc->irq_data.chip->irq_startup(&desc->irq_data); irq_state_clr_masked(desc); @@ -199,6 +201,7 @@ void irq_shutdown(struct irq_desc *desc) desc->irq_data.chip->irq_disable(&desc->irq_data); else desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_domain_deactivate_irq(&desc->irq_data); irq_state_set_masked(desc); } @@ -728,7 +731,30 @@ __irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, if (!handle) { handle = handle_bad_irq; } else { - if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + struct irq_data *irq_data = &desc->irq_data; +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + /* + * With hierarchical domains we might run into a + * situation where the outermost chip is not yet set + * up, but the inner chips are there. Instead of + * bailing we install the handler, but obviously we + * cannot enable/startup the interrupt at this point. + */ + while (irq_data) { + if (irq_data->chip != &no_irq_chip) + break; + /* + * Bail out if the outer chip is not set up + * and the interrrupt supposed to be started + * right away. + */ + if (WARN_ON(is_chained)) + goto out; + /* Try the parent */ + irq_data = irq_data->parent_data; + } +#endif + if (WARN_ON(!irq_data || irq_data->chip == &no_irq_chip)) goto out; } @@ -847,3 +873,105 @@ void irq_cpu_offline(void) raw_spin_unlock_irqrestore(&desc->lock, flags); } } + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_chip_ack_parent - Acknowledge the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_ack_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_ack(data); +} + +/** + * irq_chip_mask_parent - Mask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_mask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_mask(data); +} + +/** + * irq_chip_unmask_parent - Unmask the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_unmask_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_unmask(data); +} + +/** + * irq_chip_eoi_parent - Invoke EOI on the parent interrupt + * @data: Pointer to interrupt specific data + */ +void irq_chip_eoi_parent(struct irq_data *data) +{ + data = data->parent_data; + data->chip->irq_eoi(data); +} + +/** + * irq_chip_set_affinity_parent - Set affinity on the parent interrupt + * @data: Pointer to interrupt specific data + * @dest: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Conditinal, as the underlying parent chip might not implement it. + */ +int irq_chip_set_affinity_parent(struct irq_data *data, + const struct cpumask *dest, bool force) +{ + data = data->parent_data; + if (data->chip->irq_set_affinity) + return data->chip->irq_set_affinity(data, dest, force); + + return -ENOSYS; +} + +/** + * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware + * @data: Pointer to interrupt specific data + * + * Iterate through the domain hierarchy of the interrupt and check + * whether a hw retrigger function exists. If yes, invoke it. + */ +int irq_chip_retrigger_hierarchy(struct irq_data *data) +{ + for (data = data->parent_data; data; data = data->parent_data) + if (data->chip && data->chip->irq_retrigger) + return data->chip->irq_retrigger(data); + + return -ENOSYS; +} +#endif + +/** + * irq_chip_compose_msi_msg - Componse msi message for a irq chip + * @data: Pointer to interrupt specific data + * @msg: Pointer to the MSI message + * + * For hierarchical domains we find the first chip in the hierarchy + * which implements the irq_compose_msi_msg callback. For non + * hierarchical we use the top level chip. + */ +int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) +{ + struct irq_data *pos = NULL; + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY + for (; data; data = data->parent_data) +#endif + if (data->chip && data->chip->irq_compose_msi_msg) + pos = data; + if (!pos) + return -ENOSYS; + + pos->chip->irq_compose_msi_msg(pos, msg); + + return 0; +} diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d5d0f7345c54..74d90a754268 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -104,7 +104,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, return -ENOMEM; rc = request_any_context_irq(irq, handler, irqflags, devname, dev_id); - if (rc) { + if (rc < 0) { devres_free(dr); return rc; } @@ -113,7 +113,7 @@ int devm_request_any_context_irq(struct device *dev, unsigned int irq, dr->dev_id = dev_id; devres_add(dev, dr); - return 0; + return rc; } EXPORT_SYMBOL(devm_request_any_context_irq); diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index cf80e7b0ddab..61024e8abdef 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -39,7 +39,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.disable); + irq_reg_writel(gc, mask, ct->regs.disable); *ct->mask_cache &= ~mask; irq_gc_unlock(gc); } @@ -59,7 +59,7 @@ void irq_gc_mask_set_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache |= mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_set_bit); @@ -79,7 +79,7 @@ void irq_gc_mask_clr_bit(struct irq_data *d) irq_gc_lock(gc); *ct->mask_cache &= ~mask; - irq_reg_writel(*ct->mask_cache, gc->reg_base + ct->regs.mask); + irq_reg_writel(gc, *ct->mask_cache, ct->regs.mask); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_mask_clr_bit); @@ -98,7 +98,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.enable); + irq_reg_writel(gc, mask, ct->regs.enable); *ct->mask_cache |= mask; irq_gc_unlock(gc); } @@ -114,7 +114,7 @@ void irq_gc_ack_set_bit(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } EXPORT_SYMBOL_GPL(irq_gc_ack_set_bit); @@ -130,7 +130,7 @@ void irq_gc_ack_clr_bit(struct irq_data *d) u32 mask = ~d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -145,8 +145,8 @@ void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.mask); - irq_reg_writel(mask, gc->reg_base + ct->regs.ack); + irq_reg_writel(gc, mask, ct->regs.mask); + irq_reg_writel(gc, mask, ct->regs.ack); irq_gc_unlock(gc); } @@ -161,7 +161,7 @@ void irq_gc_eoi(struct irq_data *d) u32 mask = d->mask; irq_gc_lock(gc); - irq_reg_writel(mask, gc->reg_base + ct->regs.eoi); + irq_reg_writel(gc, mask, ct->regs.eoi); irq_gc_unlock(gc); } @@ -191,6 +191,16 @@ int irq_gc_set_wake(struct irq_data *d, unsigned int on) return 0; } +static u32 irq_readl_be(void __iomem *addr) +{ + return ioread32be(addr); +} + +static void irq_writel_be(u32 val, void __iomem *addr) +{ + iowrite32be(val, addr); +} + static void irq_init_generic_chip(struct irq_chip_generic *gc, const char *name, int num_ct, unsigned int irq_base, @@ -245,7 +255,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } ct[i].mask_cache = mskptr; if (flags & IRQ_GC_INIT_MASK_CACHE) - *mskptr = irq_reg_readl(gc->reg_base + mskreg); + *mskptr = irq_reg_readl(gc, mskreg); } } @@ -300,7 +310,13 @@ int irq_alloc_domain_generic_chips(struct irq_domain *d, int irqs_per_chip, dgc->gc[i] = gc = tmp; irq_init_generic_chip(gc, name, num_ct, i * irqs_per_chip, NULL, handler); + gc->domain = d; + if (gcflags & IRQ_GC_BE_IO) { + gc->reg_readl = &irq_readl_be; + gc->reg_writel = &irq_writel_be; + } + raw_spin_lock_irqsave(&gc_lock, flags); list_add_tail(&gc->list, &gc_list); raw_spin_unlock_irqrestore(&gc_lock, flags); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6534ff6ce02e..021f823794e7 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,6 +23,10 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; +static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, + irq_hw_number_t hwirq, int node); +static void irq_domain_check_hierarchy(struct irq_domain *domain); + /** * __irq_domain_add() - Allocate a new irq_domain data structure * @of_node: optional device-tree node of the interrupt controller @@ -30,7 +34,7 @@ static struct irq_domain *irq_default_domain; * @hwirq_max: Maximum number of interrupts supported by controller * @direct_max: Maximum value of direct maps; Use ~0 for no limit; 0 for no * direct mapping - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates and initialize and irq_domain structure. @@ -56,6 +60,7 @@ struct irq_domain *__irq_domain_add(struct device_node *of_node, int size, domain->hwirq_max = hwirq_max; domain->revmap_size = size; domain->revmap_direct_max_irq = direct_max; + irq_domain_check_hierarchy(domain); mutex_lock(&irq_domain_mutex); list_add(&domain->link, &irq_domain_list); @@ -109,7 +114,7 @@ EXPORT_SYMBOL_GPL(irq_domain_remove); * @first_irq: first number of irq block assigned to the domain, * pass zero to assign irqs on-the-fly. If first_irq is non-zero, then * pre-map all of the irqs in the domain to virqs starting at first_irq. - * @ops: map/unmap domain callbacks + * @ops: domain callbacks * @host_data: Controller private data pointer * * Allocates an irq_domain, and optionally if first_irq is positive then also @@ -174,20 +179,20 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, domain = __irq_domain_add(of_node, first_hwirq + size, first_hwirq + size, 0, ops, host_data); - if (!domain) - return NULL; - - irq_domain_associate_many(domain, first_irq, first_hwirq, size); + if (domain) + irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } EXPORT_SYMBOL_GPL(irq_domain_add_legacy); /** - * irq_find_host() - Locates a domain for a given device node + * irq_find_matching_host() - Locates a domain for a given device node * @node: device-tree node of the interrupt controller + * @bus_token: domain-specific data */ -struct irq_domain *irq_find_host(struct device_node *node) +struct irq_domain *irq_find_matching_host(struct device_node *node, + enum irq_domain_bus_token bus_token) { struct irq_domain *h, *found = NULL; int rc; @@ -196,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node) * it might potentially be set to match all interrupts in * the absence of a device node. This isn't a problem so far * yet though... + * + * bus_token == DOMAIN_BUS_ANY matches any domain, any other + * values must generate an exact match for the domain to be + * selected. */ mutex_lock(&irq_domain_mutex); list_for_each_entry(h, &irq_domain_list, link) { if (h->ops->match) - rc = h->ops->match(h, node); + rc = h->ops->match(h, node, bus_token); else - rc = (h->of_node != NULL) && (h->of_node == node); + rc = ((h->of_node != NULL) && (h->of_node == node) && + ((bus_token == DOMAIN_BUS_ANY) || + (h->bus_token == bus_token))); if (rc) { found = h; @@ -212,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node) mutex_unlock(&irq_domain_mutex); return found; } -EXPORT_SYMBOL_GPL(irq_find_host); +EXPORT_SYMBOL_GPL(irq_find_matching_host); /** * irq_set_default_host() - Set a "default" irq domain @@ -388,7 +399,6 @@ EXPORT_SYMBOL_GPL(irq_create_direct_mapping); unsigned int irq_create_mapping(struct irq_domain *domain, irq_hw_number_t hwirq) { - unsigned int hint; int virq; pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq); @@ -410,12 +420,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - hint = hwirq % nr_irqs; - if (hint == 0) - hint++; - virq = irq_alloc_desc_from(hint, of_node_to_nid(domain->of_node)); - if (virq <= 0) - virq = irq_alloc_desc_from(1, of_node_to_nid(domain->of_node)); + virq = irq_domain_alloc_descs(-1, 1, hwirq, + of_node_to_nid(domain->of_node)); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -471,7 +477,7 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) struct irq_domain *domain; irq_hw_number_t hwirq; unsigned int type = IRQ_TYPE_NONE; - unsigned int virq; + int virq; domain = irq_data->np ? irq_find_host(irq_data->np) : irq_default_domain; if (!domain) { @@ -489,10 +495,24 @@ unsigned int irq_create_of_mapping(struct of_phandle_args *irq_data) return 0; } - /* Create mapping */ - virq = irq_create_mapping(domain, hwirq); - if (!virq) - return virq; + if (irq_domain_is_hierarchy(domain)) { + /* + * If we've already configured this interrupt, + * don't do it again, or hell will break loose. + */ + virq = irq_find_mapping(domain, hwirq); + if (virq) + return virq; + + virq = irq_domain_alloc_irqs(domain, 1, NUMA_NO_NODE, irq_data); + if (virq <= 0) + return 0; + } else { + /* Create mapping */ + virq = irq_create_mapping(domain, hwirq); + if (!virq) + return virq; + } /* Set type if specified and different than the current one */ if (type != IRQ_TYPE_NONE && @@ -540,8 +560,8 @@ unsigned int irq_find_mapping(struct irq_domain *domain, return 0; if (hwirq < domain->revmap_direct_max_irq) { - data = irq_get_irq_data(hwirq); - if (data && (data->domain == domain) && (data->hwirq == hwirq)) + data = irq_domain_get_irq_data(domain, hwirq); + if (data && data->hwirq == hwirq) return hwirq; } @@ -709,3 +729,518 @@ const struct irq_domain_ops irq_domain_simple_ops = { .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); + +static int irq_domain_alloc_descs(int virq, unsigned int cnt, + irq_hw_number_t hwirq, int node) +{ + unsigned int hint; + + if (virq >= 0) { + virq = irq_alloc_descs(virq, virq, cnt, node); + } else { + hint = hwirq % nr_irqs; + if (hint == 0) + hint++; + virq = irq_alloc_descs_from(hint, cnt, node); + if (virq <= 0 && hint > 1) + virq = irq_alloc_descs_from(1, cnt, node); + } + + return virq; +} + +#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY +/** + * irq_domain_add_hierarchy - Add a irqdomain into the hierarchy + * @parent: Parent irq domain to associate with the new domain + * @flags: Irq domain flags associated to the domain + * @size: Size of the domain. See below + * @node: Optional device-tree node of the interrupt controller + * @ops: Pointer to the interrupt domain callbacks + * @host_data: Controller private data pointer + * + * If @size is 0 a tree domain is created, otherwise a linear domain. + * + * If successful the parent is associated to the new domain and the + * domain flags are set. + * Returns pointer to IRQ domain, or NULL on failure. + */ +struct irq_domain *irq_domain_add_hierarchy(struct irq_domain *parent, + unsigned int flags, + unsigned int size, + struct device_node *node, + const struct irq_domain_ops *ops, + void *host_data) +{ + struct irq_domain *domain; + + if (size) + domain = irq_domain_add_linear(node, size, ops, host_data); + else + domain = irq_domain_add_tree(node, ops, host_data); + if (domain) { + domain->parent = parent; + domain->flags |= flags; + } + + return domain; +} + +static void irq_domain_insert_irq(int virq) +{ + struct irq_data *data; + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = virq; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_insert(&domain->revmap_tree, hwirq, data); + mutex_unlock(&revmap_trees_mutex); + } + + /* If not already assigned, give the domain the chip's name */ + if (!domain->name && data->chip) + domain->name = data->chip->name; + } + + irq_clear_status_flags(virq, IRQ_NOREQUEST); +} + +static void irq_domain_remove_irq(int virq) +{ + struct irq_data *data; + + irq_set_status_flags(virq, IRQ_NOREQUEST); + irq_set_chip_and_handler(virq, NULL, NULL); + synchronize_irq(virq); + smp_mb(); + + for (data = irq_get_irq_data(virq); data; data = data->parent_data) { + struct irq_domain *domain = data->domain; + irq_hw_number_t hwirq = data->hwirq; + + if (hwirq < domain->revmap_size) { + domain->linear_revmap[hwirq] = 0; + } else { + mutex_lock(&revmap_trees_mutex); + radix_tree_delete(&domain->revmap_tree, hwirq); + mutex_unlock(&revmap_trees_mutex); + } + } +} + +static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain, + struct irq_data *child) +{ + struct irq_data *irq_data; + + irq_data = kzalloc_node(sizeof(*irq_data), GFP_KERNEL, child->node); + if (irq_data) { + child->parent_data = irq_data; + irq_data->irq = child->irq; + irq_data->node = child->node; + irq_data->domain = domain; + } + + return irq_data; +} + +static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data, *tmp; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + tmp = irq_data->parent_data; + irq_data->parent_data = NULL; + irq_data->domain = NULL; + + while (tmp) { + irq_data = tmp; + tmp = tmp->parent_data; + kfree(irq_data); + } + } +} + +static int irq_domain_alloc_irq_data(struct irq_domain *domain, + unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *irq_data; + struct irq_domain *parent; + int i; + + /* The outermost irq_data is embedded in struct irq_desc */ + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_get_irq_data(virq + i); + irq_data->domain = domain; + + for (parent = domain->parent; parent; parent = parent->parent) { + irq_data = irq_domain_insert_irq_data(parent, irq_data); + if (!irq_data) { + irq_domain_free_irq_data(virq, i + 1); + return -ENOMEM; + } + } + } + + return 0; +} + +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data; + + for (irq_data = irq_get_irq_data(virq); irq_data; + irq_data = irq_data->parent_data) + if (irq_data->domain == domain) + return irq_data; + + return NULL; +} + +/** + * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hwirq number + * @chip: The associated interrupt chip + * @chip_data: The associated chip data + */ +int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data) +{ + struct irq_data *irq_data = irq_domain_get_irq_data(domain, virq); + + if (!irq_data) + return -ENOENT; + + irq_data->hwirq = hwirq; + irq_data->chip = chip ? chip : &no_irq_chip; + irq_data->chip_data = chip_data; + + return 0; +} + +/** + * irq_domain_set_info - Set the complete data for a @virq in @domain + * @domain: Interrupt domain to match + * @virq: IRQ number + * @hwirq: The hardware interrupt number + * @chip: The associated interrupt chip + * @chip_data: The associated interrupt chip data + * @handler: The interrupt flow handler + * @handler_data: The interrupt flow handler data + * @handler_name: The interrupt handler name + */ +void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, + irq_hw_number_t hwirq, struct irq_chip *chip, + void *chip_data, irq_flow_handler_t handler, + void *handler_data, const char *handler_name) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, chip, chip_data); + __irq_set_handler(virq, handler, 0, handler_name); + irq_set_handler_data(virq, handler_data); +} + +/** + * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data + * @irq_data: The pointer to irq_data + */ +void irq_domain_reset_irq_data(struct irq_data *irq_data) +{ + irq_data->hwirq = 0; + irq_data->chip = &no_irq_chip; + irq_data->chip_data = NULL; +} + +/** + * irq_domain_free_irqs_common - Clear irq_data and free the parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_common(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct irq_data *irq_data; + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_data = irq_domain_get_irq_data(domain, virq + i); + if (irq_data) + irq_domain_reset_irq_data(irq_data); + } + irq_domain_free_irqs_parent(domain, virq, nr_irqs); +} + +/** + * irq_domain_free_irqs_top - Clear handler and handler data, clear irqdata and free parent + * @domain: Interrupt domain to match + * @virq: IRQ number to start with + * @nr_irqs: The number of irqs to free + */ +void irq_domain_free_irqs_top(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + int i; + + for (i = 0; i < nr_irqs; i++) { + irq_set_handler_data(virq + i, NULL); + irq_set_handler(virq + i, NULL); + } + irq_domain_free_irqs_common(domain, virq, nr_irqs); +} + +static bool irq_domain_is_auto_recursive(struct irq_domain *domain) +{ + return domain->flags & IRQ_DOMAIN_FLAG_AUTO_RECURSIVE; +} + +static void irq_domain_free_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs) +{ + domain->ops->free(domain, irq_base, nr_irqs); + if (irq_domain_is_auto_recursive(domain)) { + BUG_ON(!domain->parent); + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); + } +} + +static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) +{ + int ret = 0; + struct irq_domain *parent = domain->parent; + bool recursive = irq_domain_is_auto_recursive(domain); + + BUG_ON(recursive && !parent); + if (recursive) + ret = irq_domain_alloc_irqs_recursive(parent, irq_base, + nr_irqs, arg); + if (ret >= 0) + ret = domain->ops->alloc(domain, irq_base, nr_irqs, arg); + if (ret < 0 && recursive) + irq_domain_free_irqs_recursive(parent, irq_base, nr_irqs); + + return ret; +} + +/** + * __irq_domain_alloc_irqs - Allocate IRQs from domain + * @domain: domain to allocate from + * @irq_base: allocate specified IRQ nubmer if irq_base >= 0 + * @nr_irqs: number of IRQs to allocate + * @node: NUMA node id for memory allocation + * @arg: domain specific argument + * @realloc: IRQ descriptors have already been allocated if true + * + * Allocate IRQ numbers and initialized all data structures to support + * hierarchy IRQ domains. + * Parameter @realloc is mainly to support legacy IRQs. + * Returns error code or allocated IRQ number + * + * The whole process to setup an IRQ has been split into two steps. + * The first step, __irq_domain_alloc_irqs(), is to allocate IRQ + * descriptor and required hardware resources. The second step, + * irq_domain_activate_irq(), is to program hardwares with preallocated + * resources. In this way, it's easier to rollback when failing to + * allocate resources. + */ +int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base, + unsigned int nr_irqs, int node, void *arg, + bool realloc) +{ + int i, ret, virq; + + if (domain == NULL) { + domain = irq_default_domain; + if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n")) + return -EINVAL; + } + + if (!domain->ops->alloc) { + pr_debug("domain->ops->alloc() is NULL\n"); + return -ENOSYS; + } + + if (realloc && irq_base >= 0) { + virq = irq_base; + } else { + virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node); + if (virq < 0) { + pr_debug("cannot allocate IRQ(base %d, count %d)\n", + irq_base, nr_irqs); + return virq; + } + } + + if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) { + pr_debug("cannot allocate memory for IRQ%d\n", virq); + ret = -ENOMEM; + goto out_free_desc; + } + + mutex_lock(&irq_domain_mutex); + ret = irq_domain_alloc_irqs_recursive(domain, virq, nr_irqs, arg); + if (ret < 0) { + mutex_unlock(&irq_domain_mutex); + goto out_free_irq_data; + } + for (i = 0; i < nr_irqs; i++) + irq_domain_insert_irq(virq + i); + mutex_unlock(&irq_domain_mutex); + + return virq; + +out_free_irq_data: + irq_domain_free_irq_data(virq, nr_irqs); +out_free_desc: + irq_free_descs(virq, nr_irqs); + return ret; +} + +/** + * irq_domain_free_irqs - Free IRQ number and associated data structures + * @virq: base IRQ number + * @nr_irqs: number of IRQs to free + */ +void irq_domain_free_irqs(unsigned int virq, unsigned int nr_irqs) +{ + struct irq_data *data = irq_get_irq_data(virq); + int i; + + if (WARN(!data || !data->domain || !data->domain->ops->free, + "NULL pointer, cannot free irq\n")) + return; + + mutex_lock(&irq_domain_mutex); + for (i = 0; i < nr_irqs; i++) + irq_domain_remove_irq(virq + i); + irq_domain_free_irqs_recursive(data->domain, virq, nr_irqs); + mutex_unlock(&irq_domain_mutex); + + irq_domain_free_irq_data(virq, nr_irqs); + irq_free_descs(virq, nr_irqs); +} + +/** + * irq_domain_alloc_irqs_parent - Allocate interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to allocate + * @arg: Allocation data (arch/domain specific) + * + * Check whether the domain has been setup recursive. If not allocate + * through the parent domain. + */ +int irq_domain_alloc_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs, + void *arg) +{ + /* irq_domain_alloc_irqs_recursive() has called parent's alloc() */ + if (irq_domain_is_auto_recursive(domain)) + return 0; + + domain = domain->parent; + if (domain) + return irq_domain_alloc_irqs_recursive(domain, irq_base, + nr_irqs, arg); + return -ENOSYS; +} + +/** + * irq_domain_free_irqs_parent - Free interrupts from parent domain + * @irq_base: Base IRQ number + * @nr_irqs: Number of IRQs to free + * + * Check whether the domain has been setup recursive. If not free + * through the parent domain. + */ +void irq_domain_free_irqs_parent(struct irq_domain *domain, + unsigned int irq_base, unsigned int nr_irqs) +{ + /* irq_domain_free_irqs_recursive() will call parent's free */ + if (!irq_domain_is_auto_recursive(domain) && domain->parent) + irq_domain_free_irqs_recursive(domain->parent, irq_base, + nr_irqs); +} + +/** + * irq_domain_activate_irq - Call domain_ops->activate recursively to activate + * interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * This is the second step to call domain_ops->activate to program interrupt + * controllers, so the interrupt could actually get delivered. + */ +void irq_domain_activate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (irq_data->parent_data) + irq_domain_activate_irq(irq_data->parent_data); + if (domain->ops->activate) + domain->ops->activate(domain, irq_data); + } +} + +/** + * irq_domain_deactivate_irq - Call domain_ops->deactivate recursively to + * deactivate interrupt + * @irq_data: outermost irq_data associated with interrupt + * + * It calls domain_ops->deactivate to program interrupt controllers to disable + * interrupt delivery. + */ +void irq_domain_deactivate_irq(struct irq_data *irq_data) +{ + if (irq_data && irq_data->domain) { + struct irq_domain *domain = irq_data->domain; + + if (domain->ops->deactivate) + domain->ops->deactivate(domain, irq_data); + if (irq_data->parent_data) + irq_domain_deactivate_irq(irq_data->parent_data); + } +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ + /* Hierarchy irq_domains must implement callback alloc() */ + if (domain->ops->alloc) + domain->flags |= IRQ_DOMAIN_FLAG_HIERARCHY; +} +#else /* CONFIG_IRQ_DOMAIN_HIERARCHY */ +/** + * irq_domain_get_irq_data - Get irq_data associated with @virq and @domain + * @domain: domain to match + * @virq: IRQ number to get irq_data + */ +struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, + unsigned int virq) +{ + struct irq_data *irq_data = irq_get_irq_data(virq); + + return (irq_data && irq_data->domain == domain) ? irq_data : NULL; +} + +static void irq_domain_check_hierarchy(struct irq_domain *domain) +{ +} +#endif /* CONFIG_IRQ_DOMAIN_HIERARCHY */ diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 382cbe57abf3..48dcd69c7379 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -241,6 +241,7 @@ int irq_do_set_affinity(struct irq_data *data, const struct cpumask *mask, ret = chip->irq_set_affinity(data, mask, force); switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: cpumask_copy(data->affinity, mask); case IRQ_SET_MASK_OK_NOCOPY: irq_set_thread_affinity(desc); @@ -675,6 +676,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, switch (ret) { case IRQ_SET_MASK_OK: + case IRQ_SET_MASK_OK_DONE: irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); irqd_set(&desc->irq_data, flags); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c new file mode 100644 index 000000000000..3e18163f336f --- /dev/null +++ b/kernel/irq/msi.c @@ -0,0 +1,330 @@ +/* + * linux/kernel/irq/msi.c + * + * Copyright (C) 2014 Intel Corp. + * Author: Jiang Liu <jiang.liu@linux.intel.com> + * + * This file is licensed under GPLv2. + * + * This file contains common code to support Message Signalled Interrupt for + * PCI compatible and non PCI compatible devices. + */ +#include <linux/types.h> +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/irqdomain.h> +#include <linux/msi.h> + +/* Temparory solution for building, will be removed later */ +#include <linux/pci.h> + +void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) +{ + *msg = entry->msg; +} + +void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) +{ + struct msi_desc *entry = irq_get_msi_desc(irq); + + __get_cached_msi_msg(entry, msg); +} +EXPORT_SYMBOL_GPL(get_cached_msi_msg); + +#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN +static inline void irq_chip_write_msi_msg(struct irq_data *data, + struct msi_msg *msg) +{ + data->chip->irq_write_msi_msg(data, msg); +} + +/** + * msi_domain_set_affinity - Generic affinity setter function for MSI domains + * @irq_data: The irq data associated to the interrupt + * @mask: The affinity mask to set + * @force: Flag to enforce setting (disable online checks) + * + * Intended to be used by MSI interrupt controllers which are + * implemented with hierarchical domains. + */ +int msi_domain_set_affinity(struct irq_data *irq_data, + const struct cpumask *mask, bool force) +{ + struct irq_data *parent = irq_data->parent_data; + struct msi_msg msg; + int ret; + + ret = parent->chip->irq_set_affinity(parent, mask, force); + if (ret >= 0 && ret != IRQ_SET_MASK_OK_DONE) { + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); + } + + return ret; +} + +static void msi_domain_activate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + BUG_ON(irq_chip_compose_msi_msg(irq_data, &msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static void msi_domain_deactivate(struct irq_domain *domain, + struct irq_data *irq_data) +{ + struct msi_msg msg; + + memset(&msg, 0, sizeof(msg)); + irq_chip_write_msi_msg(irq_data, &msg); +} + +static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs, void *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + irq_hw_number_t hwirq = ops->get_hwirq(info, arg); + int i, ret; + + if (irq_find_mapping(domain, hwirq) > 0) + return -EEXIST; + + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + + for (i = 0; i < nr_irqs; i++) { + ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); + if (ret < 0) { + if (ops->msi_free) { + for (i--; i > 0; i--) + ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); + return ret; + } + } + + return 0; +} + +static void msi_domain_free(struct irq_domain *domain, unsigned int virq, + unsigned int nr_irqs) +{ + struct msi_domain_info *info = domain->host_data; + int i; + + if (info->ops->msi_free) { + for (i = 0; i < nr_irqs; i++) + info->ops->msi_free(domain, info, virq + i); + } + irq_domain_free_irqs_top(domain, virq, nr_irqs); +} + +static struct irq_domain_ops msi_domain_ops = { + .alloc = msi_domain_alloc, + .free = msi_domain_free, + .activate = msi_domain_activate, + .deactivate = msi_domain_deactivate, +}; + +#ifdef GENERIC_MSI_DOMAIN_OPS +static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info, + msi_alloc_info_t *arg) +{ + return arg->hwirq; +} + +static int msi_domain_ops_prepare(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + memset(arg, 0, sizeof(*arg)); + return 0; +} + +static void msi_domain_ops_set_desc(msi_alloc_info_t *arg, + struct msi_desc *desc) +{ + arg->desc = desc; +} +#else +#define msi_domain_ops_get_hwirq NULL +#define msi_domain_ops_prepare NULL +#define msi_domain_ops_set_desc NULL +#endif /* !GENERIC_MSI_DOMAIN_OPS */ + +static int msi_domain_ops_init(struct irq_domain *domain, + struct msi_domain_info *info, + unsigned int virq, irq_hw_number_t hwirq, + msi_alloc_info_t *arg) +{ + irq_domain_set_hwirq_and_chip(domain, virq, hwirq, info->chip, + info->chip_data); + if (info->handler && info->handler_name) { + __irq_set_handler(virq, info->handler, 0, info->handler_name); + if (info->handler_data) + irq_set_handler_data(virq, info->handler_data); + } + return 0; +} + +static int msi_domain_ops_check(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + return 0; +} + +static struct msi_domain_ops msi_domain_ops_default = { + .get_hwirq = msi_domain_ops_get_hwirq, + .msi_init = msi_domain_ops_init, + .msi_check = msi_domain_ops_check, + .msi_prepare = msi_domain_ops_prepare, + .set_desc = msi_domain_ops_set_desc, +}; + +static void msi_domain_update_dom_ops(struct msi_domain_info *info) +{ + struct msi_domain_ops *ops = info->ops; + + if (ops == NULL) { + info->ops = &msi_domain_ops_default; + return; + } + + if (ops->get_hwirq == NULL) + ops->get_hwirq = msi_domain_ops_default.get_hwirq; + if (ops->msi_init == NULL) + ops->msi_init = msi_domain_ops_default.msi_init; + if (ops->msi_check == NULL) + ops->msi_check = msi_domain_ops_default.msi_check; + if (ops->msi_prepare == NULL) + ops->msi_prepare = msi_domain_ops_default.msi_prepare; + if (ops->set_desc == NULL) + ops->set_desc = msi_domain_ops_default.set_desc; +} + +static void msi_domain_update_chip_ops(struct msi_domain_info *info) +{ + struct irq_chip *chip = info->chip; + + BUG_ON(!chip); + if (!chip->irq_mask) + chip->irq_mask = pci_msi_mask_irq; + if (!chip->irq_unmask) + chip->irq_unmask = pci_msi_unmask_irq; + if (!chip->irq_set_affinity) + chip->irq_set_affinity = msi_domain_set_affinity; +} + +/** + * msi_create_irq_domain - Create a MSI interrupt domain + * @of_node: Optional device-tree node of the interrupt controller + * @info: MSI domain info + * @parent: Parent irq domain + */ +struct irq_domain *msi_create_irq_domain(struct device_node *node, + struct msi_domain_info *info, + struct irq_domain *parent) +{ + if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS) + msi_domain_update_dom_ops(info); + if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS) + msi_domain_update_chip_ops(info); + + return irq_domain_add_hierarchy(parent, 0, 0, node, &msi_domain_ops, + info); +} + +/** + * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain + * @domain: The domain to allocate from + * @dev: Pointer to device struct of the device for which the interrupts + * are allocated + * @nvec: The number of interrupts to allocate + * + * Returns 0 on success or an error code. + */ +int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, + int nvec) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + msi_alloc_info_t arg; + struct msi_desc *desc; + int i, ret, virq = -1; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, &arg); + if (ret) + return ret; + + for_each_msi_entry(desc, dev) { + ops->set_desc(&arg, desc); + if (info->flags & MSI_FLAG_IDENTITY_MAP) + virq = (int)ops->get_hwirq(info, &arg); + else + virq = -1; + + virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + dev_to_node(dev), &arg, false); + if (virq < 0) { + ret = -ENOSPC; + if (ops->handle_error) + ret = ops->handle_error(domain, desc, ret); + if (ops->msi_finish) + ops->msi_finish(&arg, ret); + return ret; + } + + for (i = 0; i < desc->nvec_used; i++) + irq_set_msi_desc_off(virq, i, desc); + } + + if (ops->msi_finish) + ops->msi_finish(&arg, 0); + + for_each_msi_entry(desc, dev) { + if (desc->nvec_used == 1) + dev_dbg(dev, "irq %d for MSI\n", virq); + else + dev_dbg(dev, "irq [%d-%d] for MSI\n", + virq, virq + desc->nvec_used - 1); + } + + return 0; +} + +/** + * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev + * @domain: The domain to managing the interrupts + * @dev: Pointer to device struct of the device for which the interrupts + * are free + */ +void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) +{ + struct msi_desc *desc; + + for_each_msi_entry(desc, dev) { + irq_domain_free_irqs(desc->irq, desc->nvec_used); + desc->irq = 0; + } +} + +/** + * msi_get_domain_info - Get the MSI interrupt domain info for @domain + * @domain: The interrupt domain to retrieve data from + * + * Returns the pointer to the msi_domain_info stored in + * @domain->host_data. + */ +struct msi_domain_info *msi_get_domain_info(struct irq_domain *domain) +{ + return (struct msi_domain_info *)domain->host_data; +} + +#endif /* CONFIG_GENERIC_MSI_IRQ_DOMAIN */ diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 9dc9bfd8a678..9791f93dd5f2 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> +#include <linux/mutex.h> #include "internals.h" @@ -326,18 +327,29 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) void register_irq_proc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_MUTEX(register_lock); char name [MAX_NAMELEN]; - if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip)) return; + /* + * irq directories are registered only when a handler is + * added, not when the descriptor is created, so multiple + * tasks might try to register at the same time. + */ + mutex_lock(®ister_lock); + + if (desc->dir) + goto out_unlock; + memset(name, 0, MAX_NAMELEN); sprintf(name, "%d", irq); /* create /proc/irq/1234 */ desc->dir = proc_mkdir(name, root_irq_dir); if (!desc->dir) - return; + goto out_unlock; #ifdef CONFIG_SMP /* create /proc/irq/<irq>/smp_affinity */ @@ -358,6 +370,9 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc) proc_create_data("spurious", 0444, desc->dir, &irq_spurious_proc_fops, (void *)(long)irq); + +out_unlock: + mutex_unlock(®ister_lock); } void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 9065107f083e..7a5237a1bce5 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -75,13 +75,21 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq) !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { #ifdef CONFIG_HARDIRQS_SW_RESEND /* - * If the interrupt has a parent irq and runs - * in the thread context of the parent irq, - * retrigger the parent. + * If the interrupt is running in the thread + * context of the parent irq we need to be + * careful, because we cannot trigger it + * directly. */ - if (desc->parent_irq && - irq_settings_is_nested_thread(desc)) + if (irq_settings_is_nested_thread(desc)) { + /* + * If the parent_irq is valid, we + * retrigger the parent, otherwise we + * do nothing. + */ + if (!desc->parent_irq) + return; irq = desc->parent_irq; + } /* Set it pending and activate the softirq: */ set_bit(irq, irqs_resend); tasklet_schedule(&resend_tasklet); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/kexec.c b/kernel/kexec.c index 2abf9f6e9a61..04eae03efe1e 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -707,7 +707,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image, do { unsigned long pfn, epfn, addr, eaddr; - pages = kimage_alloc_pages(GFP_KERNEL, order); + pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_pfn(pages); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index dadbf88c22c4..e3fe1dda72ca 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -471,9 +471,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) if (!hold_ctx) return 0; - if (unlikely(ctx == hold_ctx)) - return -EALREADY; - if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { #ifdef CONFIG_DEBUG_MUTEXES @@ -499,6 +496,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, unsigned long flags; int ret; + if (use_ww_ctx) { + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } + preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); diff --git a/kernel/module.c b/kernel/module.c index c353707bbbd5..1df11b175a24 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -179,6 +179,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -914,11 +917,15 @@ void symbol_put_addr(void *addr) if (core_kernel_text(a)) return; - /* module_text_address is safe here: we're supposed to have reference - * to module from symbol_get, so it can't go away. */ + /* + * Even though we hold a reference on the module; we still need to + * disable preemption in order to safely traverse the data structure. + */ + preempt_disable(); modaddr = __module_text_address(a); BUG_ON(!modaddr); module_put(modaddr); + preempt_enable(); } EXPORT_SYMBOL_GPL(symbol_put_addr); @@ -2321,8 +2328,20 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_size = ALIGN(mod->init_size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_size; + mod->init_size += sizeof(struct mod_kallsyms); + mod->init_size = debug_align(mod->init_size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2331,28 +2350,33 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->module_init + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->module_core + info->symoffs; + mod->core_kallsyms.strtab = s = mod->module_core + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -3072,9 +3096,8 @@ static int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif unset_module_init_ro_nx(mod); module_free(mod, mod->module_init); @@ -3397,6 +3420,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3404,6 +3432,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3413,32 +3442,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3531,19 +3560,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3552,11 +3583,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3593,11 +3625,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, int ret; list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/panic.c b/kernel/panic.c index 8ff8e53b0c0c..5179cc6872b9 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,7 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> +#include <linux/console.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -145,6 +146,17 @@ void panic(const char *fmt, ...) bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. Locks debug + * should be disabled to avoid reporting bad unlock balance when + * panic() is not being callled from OOPS. + */ + debug_locks_off(); + console_flush_on_panic(); + if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index bbef57f5bdfd..c2ae4618a159 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -191,7 +191,7 @@ config DPM_WATCHDOG config DPM_WATCHDOG_TIMEOUT int "Watchdog timeout in seconds" range 1 120 - default 12 + default 60 depends on DPM_WATCHDOG config PM_TRACE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4a27612b44c3..df1198461db3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -487,11 +487,11 @@ static int check_syslog_permissions(int type, bool from_file) * already done the capabilities checks at open time. */ if (from_file && type != SYSLOG_ACTION_OPEN) - return 0; + goto ok; if (syslog_action_restricted(type)) { if (capable(CAP_SYSLOG)) - return 0; + goto ok; /* * For historical reasons, accept CAP_SYS_ADMIN too, with * a warning. @@ -501,10 +501,11 @@ static int check_syslog_permissions(int type, bool from_file) "CAP_SYS_ADMIN but no CAP_SYSLOG " "(deprecated).\n", current->comm, task_pid_nr(current)); - return 0; + goto ok; } return -EPERM; } +ok: return security_syslog(type); } @@ -1291,10 +1292,6 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (error) goto out; - error = security_syslog(type); - if (error) - return error; - switch (type) { case SYSLOG_ACTION_CLOSE: /* Close log */ break; @@ -2250,13 +2247,24 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2323,6 +2331,8 @@ skip: start_critical_timings(); local_irq_restore(flags); #endif + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2390,6 +2400,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 09d76da725af..893070571f16 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -224,6 +224,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -233,18 +241,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -311,7 +334,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 1723df96c689..afc35f0aea3e 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -282,6 +282,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); + if (rcp->donetail == &rcp->rcucblist) { + /* No callbacks ready, so just leave. */ + local_irq_restore(flags); + return; + } RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; diff --git a/kernel/resource.c b/kernel/resource.c index 0bcebffc4e77..e3011e1a5c8d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1073,9 +1073,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8ad9dcc8270e..16a3401bc209 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2319,11 +2319,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) * If a task dies, then it sets TASK_DEAD in tsk->state and calls * schedule one last time. The schedule call will never return, and * the scheduled task must drop that reference. - * The test for TASK_DEAD must occur while the runqueue locks are - * still held, otherwise prev could be scheduled on another cpu, die - * there before we look at prev->state, and then the reference would - * be dropped twice. - * Manfred Spraul <manfred@colorfullife.com> + * + * We must observe prev->state before clearing prev->on_cpu (in + * finish_lock_switch), otherwise a concurrent wakeup can get prev + * running on another CPU and we could rave with its RUNNING -> DEAD + * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); @@ -2471,13 +2471,20 @@ unsigned long nr_running(void) /* * Check if only the current task is running on the cpu. + * + * Caution: this function does not check that the caller has disabled + * preemption, thus the result might have a time-of-check-to-time-of-use + * race. The caller is responsible to use it correctly, for example: + * + * - from a non-preemptable section (of course) + * + * - from a thread that is bound to a single CPU + * + * - in a loop with very short iterations (e.g. a polling loop) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->nr_running == 1) - return true; - else - return false; + return raw_rq()->nr_running == 1; } EXPORT_SYMBOL(single_task_running); @@ -4816,14 +4823,16 @@ void show_state_filter(unsigned long state_filter) /* * reset the NMI-timeout, listing all files on a slow * console might take a lot of time: + * Also, reset softlockup watchdogs on all CPUs, because + * another CPU might be blocked waiting for us to process + * an IPI. */ touch_nmi_watchdog(); + touch_all_softlockup_watchdogs(); if (!state_filter || (p->state & state_filter)) sched_show_task(p); } - touch_all_softlockup_watchdogs(); - #ifdef CONFIG_SCHED_DEBUG sysrq_sched_debug_show(); #endif @@ -5597,6 +5606,14 @@ static int sched_cpu_active(struct notifier_block *nfb, case CPU_STARTING: set_cpu_rq_start_time(); return NOTIFY_OK; + case CPU_ONLINE: + /* + * At this point a starting CPU has marked itself as online via + * set_cpu_online(). But it might not yet have marked itself + * as active, which is essential from here on. + * + * Thus, fall-through and help the starting CPU along. + */ case CPU_DOWN_FAILED: set_cpu_active((long)hcpu, true); return NOTIFY_OK; @@ -6721,7 +6738,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2da134c9bae7..2ee44eb30f2b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -259,21 +259,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b51aad9b0464..2c04650654f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1979,8 +1979,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; + } /* * Shared library pages mapped by multiple processes are not @@ -4842,18 +4844,21 @@ again: * entity, update_curr() will update its vruntime, otherwise * forget we've ever seen it. */ - if (curr && curr->on_rq) - update_curr(cfs_rq); - else - curr = NULL; + if (curr) { + if (curr->on_rq) + update_curr(cfs_rq); + else + curr = NULL; - /* - * This call to check_cfs_rq_runtime() will do the throttle and - * dequeue its entity in the parent(s). Therefore the 'simple' - * nr_running test will indeed be correct. - */ - if (unlikely(check_cfs_rq_runtime(cfs_rq))) - goto simple; + /* + * This call to check_cfs_rq_runtime() will do the + * throttle and dequeue its entity in the parent(s). + * Therefore the 'simple' nr_running test will indeed + * be correct. + */ + if (unlikely(check_cfs_rq_runtime(cfs_rq))) + goto simple; + } se = pick_next_entity(cfs_rq, curr); cfs_rq = group_cfs_rq(se); diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c index 8ecd552fe4f2..09dfe51f89b5 100644 --- a/kernel/sched/proc.c +++ b/kernel/sched/proc.c @@ -97,10 +97,13 @@ long calc_load_fold_active(struct rq *this_rq) static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active) { - load *= exp; - load += active * (FIXED_1 - exp); - load += 1UL << (FSHIFT - 1); - return load >> FSHIFT; + unsigned long newload; + + newload = load * exp + active * (FIXED_1 - exp); + if (active >= load) + newload += FIXED_1-1; + + return newload / FIXED_1; } #ifdef CONFIG_NO_HZ_COMMON diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 9e0deba1566e..c41de96c4563 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -994,9 +994,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * After ->on_cpu is cleared, the task can be moved to a different CPU. * We must ensure this doesn't happen until the switch is completely * finished. + * + * Pairs with the control dependency and rmb in try_to_wake_up(). */ - smp_wmb(); - prev->on_cpu = 0; + smp_store_release(&prev->on_cpu, 0); #endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 4ef9687ac115..a39025b02fd5 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -317,24 +317,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } diff --git a/kernel/signal.c b/kernel/signal.c index 237aec98fa7e..6cb4167321d0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2846,7 +2846,8 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif break; @@ -3113,7 +3114,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3159,7 +3160,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info; + siginfo_t info = {}; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cd0e835ecb85..07da431802e4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1656,6 +1656,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 9a4f750a2963..b99a55863de1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1320,7 +1320,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ce033c7aa2e8..9cff0ab82b63 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - int result = 0; + unsigned int result = 0; if (!clk) - return -ENODEV; + return POLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 79450c994e07..d7b794998144 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -785,6 +785,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); + return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); unlock_task_sighand(p, &flags); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 57dbbb7d50c0..481b70fdd7eb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -202,8 +202,7 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) /* calculate the delta since the last update_wall_time: */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); - nsec = delta * tkr->mult + tkr->xtime_nsec; - nsec >>= tkr->shift; + nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift; /* If arch requires, add in get_arch_timeoffset() */ return nsec + arch_gettimeoffset(); @@ -1369,7 +1368,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs(tick_error); + tick_error = abs64(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a29ab1a17023..8f102be0f20b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -983,13 +983,27 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); + struct tvec_base *new_base = per_cpu(tvec_bases, cpu); + struct tvec_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 0fc5cfedcc8c..d4588b08e07a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -466,7 +466,8 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; - unsigned int nr_pages; + unsigned long nr_pages; + unsigned int current_context; struct list_head *pages; struct buffer_page *head_page; /* read from head */ struct buffer_page *tail_page; /* write to tail */ @@ -486,7 +487,7 @@ struct ring_buffer_per_cpu { u64 write_stamp; u64 read_stamp; /* ring buffer pages to update, > 0 to add, < 0 to remove */ - int nr_pages_to_update; + long nr_pages_to_update; struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; @@ -1165,10 +1166,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) { - int i; struct buffer_page *bpage, *tmp; + long i; for (i = 0; i < nr_pages; i++) { struct page *page; @@ -1205,7 +1206,7 @@ free_pages: } static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, - unsigned nr_pages) + unsigned long nr_pages) { LIST_HEAD(pages); @@ -1230,7 +1231,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, } static struct ring_buffer_per_cpu * -rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) +rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; struct buffer_page *bpage; @@ -1330,8 +1331,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key) { struct ring_buffer *buffer; + long nr_pages; int bsize; - int cpu, nr_pages; + int cpu; /* keep it in its own cache line */ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), @@ -1457,12 +1459,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage) } static int -rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages) +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages) { struct list_head *tail_page, *to_remove, *next_page; struct buffer_page *to_remove_page, *tmp_iter_page; struct buffer_page *last_page, *first_page; - unsigned int nr_removed; + unsigned long nr_removed; unsigned long head_bit; int page_entries; @@ -1679,7 +1681,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, int cpu_id) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned nr_pages; + unsigned long nr_pages; int cpu, err = 0; /* @@ -1693,14 +1695,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, !cpumask_test_cpu(cpu_id, buffer->cpumask)) return size; - size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); - size *= BUF_PAGE_SIZE; + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); /* we need a minimum of two pages */ - if (size < BUF_PAGE_SIZE * 2) - size = BUF_PAGE_SIZE * 2; + if (nr_pages < 2) + nr_pages = 2; - nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size = nr_pages * BUF_PAGE_SIZE; /* * Don't succeed if resizing is disabled, as a reader might be @@ -2680,11 +2681,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, * just so happens that it is the same bit corresponding to * the current context. */ -static DEFINE_PER_CPU(unsigned int, current_context); -static __always_inline int trace_recursive_lock(void) +static __always_inline int +trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = __this_cpu_read(current_context); + unsigned int val = cpu_buffer->current_context; int bit; if (in_interrupt()) { @@ -2701,23 +2702,21 @@ static __always_inline int trace_recursive_lock(void) return 1; val |= (1 << bit); - __this_cpu_write(current_context, val); + cpu_buffer->current_context = val; return 0; } -static __always_inline void trace_recursive_unlock(void) +static __always_inline void +trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - unsigned int val = __this_cpu_read(current_context); - - val &= val & (val - 1); - __this_cpu_write(current_context, val); + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } #else -#define trace_recursive_lock() (0) -#define trace_recursive_unlock() do { } while (0) +#define trace_recursive_lock(cpu_buffer) (0) +#define trace_recursive_unlock(cpu_buffer) do { } while (0) #endif @@ -2749,35 +2748,34 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ preempt_disable_notrace(); - if (atomic_read(&buffer->record_disabled)) - goto out_nocheck; - - if (trace_recursive_lock()) - goto out_nocheck; + if (unlikely(atomic_read(&buffer->record_disabled))) + goto out; cpu = raw_smp_processor_id(); - if (!cpumask_test_cpu(cpu, buffer->cpumask)) + if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask))) goto out; cpu_buffer = buffer->buffers[cpu]; - if (atomic_read(&cpu_buffer->record_disabled)) + if (unlikely(atomic_read(&cpu_buffer->record_disabled))) goto out; - if (length > BUF_MAX_DATA_SIZE) + if (unlikely(length > BUF_MAX_DATA_SIZE)) + goto out; + + if (unlikely(trace_recursive_lock(cpu_buffer))) goto out; event = rb_reserve_next_event(buffer, cpu_buffer, length); if (!event) - goto out; + goto out_unlock; return event; + out_unlock: + trace_recursive_unlock(cpu_buffer); out: - trace_recursive_unlock(); - - out_nocheck: preempt_enable_notrace(); return NULL; } @@ -2867,7 +2865,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_wakeups(buffer, cpu_buffer); - trace_recursive_unlock(); + trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); @@ -2978,7 +2976,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, out: rb_end_commit(cpu_buffer); - trace_recursive_unlock(); + trace_recursive_unlock(cpu_buffer); preempt_enable_notrace(); @@ -4655,8 +4653,9 @@ static int rb_cpu_notify(struct notifier_block *self, struct ring_buffer *buffer = container_of(self, struct ring_buffer, cpu_notify); long cpu = (long)hcpu; - int cpu_i, nr_pages_same; - unsigned int nr_pages; + long nr_pages_same; + int cpu_i; + unsigned long nr_pages; switch (action) { case CPU_UP_PREPARE: diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 3f9e328c30b5..728f99b4533d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -452,7 +452,7 @@ static int __init ring_buffer_benchmark_init(void) if (producer_fifo >= 0) { struct sched_param param = { - .sched_priority = consumer_fifo + .sched_priority = producer_fifo }; sched_setscheduler(producer, SCHED_FIFO, ¶m); } else diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7e8e95698307..bee851402458 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4689,7 +4689,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 33ec7b68c3a6..e759a36e7870 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -444,6 +444,7 @@ enum { TRACE_CONTROL_BIT, + TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 697fb9bac8f0..60850b4fcb04 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -37,9 +37,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) struct trace_branch *entry; struct ring_buffer *buffer; unsigned long flags; - int cpu, pc; + int pc; const char *p; + if (current->trace_recursion & TRACE_BRANCH_BIT) + return; + /* * I would love to save just the ftrace_likely_data pointer, but * this code can also be used by modules. Ugly things can happen @@ -50,10 +53,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) if (unlikely(!tr)) return; - local_irq_save(flags); - cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->trace_buffer.data, cpu); - if (atomic_inc_return(&data->disabled) != 1) + raw_local_irq_save(flags); + current->trace_recursion |= TRACE_BRANCH_BIT; + data = this_cpu_ptr(tr->trace_buffer.data); + if (atomic_read(&data->disabled)) goto out; pc = preempt_count(); @@ -82,8 +85,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&data->disabled); - local_irq_restore(flags); + current->trace_recursion &= ~TRACE_BRANCH_BIT; + raw_local_irq_restore(flags); } static inline diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 53e9ae056872..b6dd7f3bfc03 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -646,7 +646,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } @@ -1585,8 +1586,13 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) trace_create_file("filter", 0644, file->dir, file, &ftrace_event_filter_fops); - trace_create_file("trigger", 0644, file->dir, file, - &event_trigger_fops); + /* + * Only event directories that can be enabled should have + * triggers. + */ + if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) + trace_create_file("trigger", 0644, file->dir, file, + &event_trigger_fops); trace_create_file("format", 0444, file->dir, call, &ftrace_event_format_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7a8c1528e141..357b1ddf088b 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1050,6 +1050,9 @@ static void parse_init(struct filter_parse_state *ps, static char infix_next(struct filter_parse_state *ps) { + if (!ps->infix.cnt) + return 0; + ps->infix.cnt--; return ps->infix.string[ps->infix.tail++]; @@ -1065,6 +1068,9 @@ static char infix_peek(struct filter_parse_state *ps) static void infix_advance(struct filter_parse_state *ps) { + if (!ps->infix.cnt) + return; + ps->infix.cnt--; ps->infix.tail++; } @@ -1363,19 +1369,26 @@ static int check_preds(struct filter_parse_state *ps) { int n_normal_preds = 0, n_logical_preds = 0; struct postfix_elt *elt; + int cnt = 0; list_for_each_entry(elt, &ps->postfix, list) { - if (elt->op == OP_NONE) + if (elt->op == OP_NONE) { + cnt++; continue; + } + cnt--; if (elt->op == OP_AND || elt->op == OP_OR) { n_logical_preds++; continue; } n_normal_preds++; + /* all ops should have operands */ + if (cnt < 0) + break; } - if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + if (cnt != 1 || !n_normal_preds || n_logical_preds >= n_normal_preds) { parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 2900817ba65c..9008103db583 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -38,6 +38,10 @@ struct trace_bprintk_fmt { static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) { struct trace_bprintk_fmt *pos; + + if (!fmt) + return ERR_PTR(-EINVAL); + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { if (!strcmp(pos->fmt, fmt)) return pos; @@ -59,7 +63,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end) for (iter = start; iter < end; iter++) { struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); if (tb_fmt) { - *iter = tb_fmt->fmt; + if (!IS_ERR(tb_fmt)) + *iter = tb_fmt->fmt; continue; } @@ -291,6 +296,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 416a3f67e7fd..6122f386dd5d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -657,6 +657,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, */ smp_wmb(); set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + /* + * The following mb guarantees that previous clear of a PENDING bit + * will not be reordered with any speculative LOADS or STORES from + * work->current_func, which is executed afterwards. This possible + * reordering can lead to a missed execution on attempt to qeueue + * the same @work. E.g. consider this case: + * + * CPU#0 CPU#1 + * ---------------------------- -------------------------------- + * + * 1 STORE event_indicated + * 2 queue_work_on() { + * 3 test_and_set_bit(PENDING) + * 4 } set_..._and_clear_pending() { + * 5 set_work_data() # clear bit + * 6 smp_mb() + * 7 work->current_func() { + * 8 LOAD event_indicated + * } + * + * Without an explicit full barrier speculative LOAD on line 8 can + * be executed before CPU#0 does STORE on line 1. If that happens, + * CPU#0 observes the PENDING bit is still set and new execution of + * a @work is not queued in a hope, that CPU#1 will eventually + * finish the queued @work. Meanwhile CPU#1 does not see + * event_indicated is set, because speculative LOAD was executed + * before actual STORE. + */ + smp_mb(); } static void clear_work_data(struct work_struct *work) @@ -4552,6 +4581,17 @@ static void rebind_workers(struct worker_pool *pool) pool->attrs->cpumask) < 0); spin_lock_irq(&pool->lock); + + /* + * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED + * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is + * being reworked and this can go away in time. + */ + if (!(pool->flags & POOL_DISASSOCIATED)) { + spin_unlock_irq(&pool->lock); + return; + } + pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { |