Diffstat (limited to 'kernel')
-rw-r--r--  kernel/audit.c | 37
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/auditfilter.c | 2
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/cgroup.c | 68
-rw-r--r--  kernel/cpu/idle.c | 9
-rw-r--r--  kernel/cpuset.c | 16
-rw-r--r--  kernel/events/core.c | 40
-rw-r--r--  kernel/events/ring_buffer.c | 31
-rw-r--r--  kernel/events/uprobes.c | 4
-rw-r--r--  kernel/fork.c | 14
-rw-r--r--  kernel/freezer.c | 6
-rw-r--r--  kernel/futex.c | 19
-rw-r--r--  kernel/hrtimer.c | 28
-rw-r--r--  kernel/irq/irqdesc.c | 30
-rw-r--r--  kernel/irq/manage.c | 9
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/kexec.c | 4
-rw-r--r--  kernel/module.c | 34
-rw-r--r--  kernel/pid.c | 1
-rw-r--r--  kernel/power/Kconfig | 20
-rw-r--r--  kernel/power/autosleep.c | 3
-rw-r--r--  kernel/power/qos.c | 13
-rw-r--r--  kernel/power/snapshot.c | 6
-rw-r--r--  kernel/printk.c | 2
-rw-r--r--  kernel/ptrace.c | 3
-rw-r--r--  kernel/sched/auto_group.c | 3
-rw-r--r--  kernel/sched/core.c | 50
-rw-r--r--  kernel/sched/cputime.c | 19
-rw-r--r--  kernel/sched/debug.c | 17
-rw-r--r--  kernel/sched/fair.c | 1841
-rw-r--r--  kernel/sched/rt.c | 14
-rw-r--r--  kernel/sched/sched.h | 16
-rw-r--r--  kernel/smp.c | 12
-rw-r--r--  kernel/softirq.c | 15
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 101
-rw-r--r--  kernel/time/jiffies.c | 6
-rw-r--r--  kernel/time/ntp.c | 9
-rw-r--r--  kernel/time/tick-broadcast.c | 127
-rw-r--r--  kernel/time/tick-common.c | 109
-rw-r--r--  kernel/time/tick-internal.h | 7
-rw-r--r--  kernel/time/tick-sched.c | 9
-rw-r--r--  kernel/time/timekeeping.c | 37
-rw-r--r--  kernel/time/timer_list.c | 41
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/ftrace.c | 326
-rw-r--r--  kernel/trace/ring_buffer.c | 7
-rw-r--r--  kernel/trace/trace.c | 456
-rw-r--r--  kernel/trace/trace.h | 5
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 321
-rw-r--r--  kernel/trace/trace_events_filter.c | 17
-rw-r--r--  kernel/trace/trace_export.c | 7
-rw-r--r--  kernel/trace/trace_irqsoff.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 42
-rw-r--r--  kernel/trace/trace_syscalls.c | 21
-rw-r--r--  kernel/trace/trace_uprobe.c | 55
-rw-r--r--  kernel/tracepoint.c | 7
-rw-r--r--  kernel/user_namespace.c | 17
-rw-r--r--  kernel/workqueue.c | 78
61 files changed, 3554 insertions, 670 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index 91e53d04b6a..6def25f1b35 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -103,7 +103,8 @@ static int audit_rate_limit;
/* Number of outstanding audit_buffers allowed. */
static int audit_backlog_limit = 64;
-static int audit_backlog_wait_time = 60 * HZ;
+#define AUDIT_BACKLOG_WAIT_TIME (60 * HZ)
+static int audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
static int audit_backlog_wait_overflow = 0;
/* The identity of the user shutting down the audit system. */
@@ -613,7 +614,7 @@ static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type)
int rc = 0;
uid_t uid = from_kuid(&init_user_ns, current_uid());
- if (!audit_enabled) {
+ if (!audit_enabled && msg_type != AUDIT_USER_AVC) {
*ab = NULL;
return rc;
}
@@ -659,6 +660,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
switch (msg_type) {
case AUDIT_GET:
+ status_set.mask = 0;
status_set.enabled = audit_enabled;
status_set.failure = audit_failure;
status_set.pid = audit_pid;
@@ -670,7 +672,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
&status_set, sizeof(status_set));
break;
case AUDIT_SET:
- if (nlh->nlmsg_len < sizeof(struct audit_status))
+ if (nlmsg_len(nlh) < sizeof(struct audit_status))
return -EINVAL;
status_get = (struct audit_status *)data;
if (status_get->mask & AUDIT_STATUS_ENABLED) {
@@ -832,7 +834,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
- memcpy(&s, data, min(sizeof(s), (size_t)nlh->nlmsg_len));
+ memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))
return -EINVAL;
@@ -1117,9 +1119,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
sleep_time = timeout_start + audit_backlog_wait_time -
jiffies;
- if ((long)sleep_time > 0)
+ if ((long)sleep_time > 0) {
wait_for_auditd(sleep_time);
- continue;
+ continue;
+ }
}
if (audit_rate_check() && printk_ratelimit())
printk(KERN_WARNING
@@ -1133,6 +1136,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
return NULL;
}
+ audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME;
+
ab = audit_buffer_alloc(ctx, gfp_mask, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
@@ -1535,6 +1540,26 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
}
}
+ /* log the audit_names record type */
+ audit_log_format(ab, " nametype=");
+ switch(n->type) {
+ case AUDIT_TYPE_NORMAL:
+ audit_log_format(ab, "NORMAL");
+ break;
+ case AUDIT_TYPE_PARENT:
+ audit_log_format(ab, "PARENT");
+ break;
+ case AUDIT_TYPE_CHILD_DELETE:
+ audit_log_format(ab, "DELETE");
+ break;
+ case AUDIT_TYPE_CHILD_CREATE:
+ audit_log_format(ab, "CREATE");
+ break;
+ default:
+ audit_log_format(ab, "UNKNOWN");
+ break;
+ }
+
audit_log_fcaps(ab, n);
audit_log_end(ab);
}
diff --git a/kernel/audit.h b/kernel/audit.h
index 1c95131ef76..123c9b7c397 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -85,6 +85,7 @@ struct audit_names {
struct filename *name;
int name_len; /* number of chars to log */
+ bool hidden; /* don't log this record */
bool name_put; /* call __putname()? */
unsigned long ino;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 6bd4a90d199..4d230eafd5e 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -423,7 +423,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
f->lsm_rule = NULL;
/* Support legacy tests for a valid loginuid */
- if ((f->type == AUDIT_LOGINUID) && (f->val == 4294967295)) {
+ if ((f->type == AUDIT_LOGINUID) && (f->val == ~0U)) {
f->type = AUDIT_LOGINUID_SET;
f->val = 0;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 3c8a601324a..9845cb32b60 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1399,8 +1399,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
}
i = 0;
- list_for_each_entry(n, &context->names_list, list)
+ list_for_each_entry(n, &context->names_list, list) {
+ if (n->hidden)
+ continue;
audit_log_name(context, n, NULL, i++, &call_panic);
+ }
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
@@ -1769,14 +1772,15 @@ void audit_putname(struct filename *name)
* __audit_inode - store the inode and device from a lookup
* @name: name being audited
* @dentry: dentry being audited
- * @parent: does this dentry represent the parent?
+ * @flags: attributes for this particular entry
*/
void __audit_inode(struct filename *name, const struct dentry *dentry,
- unsigned int parent)
+ unsigned int flags)
{
struct audit_context *context = current->audit_context;
const struct inode *inode = dentry->d_inode;
struct audit_names *n;
+ bool parent = flags & AUDIT_INODE_PARENT;
if (!context->in_syscall)
return;
@@ -1831,6 +1835,8 @@ out:
if (parent) {
n->name_len = n->name ? parent_len(n->name->name) : AUDIT_NAME_FULL;
n->type = AUDIT_TYPE_PARENT;
+ if (flags & AUDIT_INODE_HIDDEN)
+ n->hidden = true;
} else {
n->name_len = AUDIT_NAME_FULL;
n->type = AUDIT_TYPE_NORMAL;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb97..d0def7fc284 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -92,6 +92,14 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -873,7 +881,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
{
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
- schedule_work(&cgrp->free_work);
+ queue_work(cgroup_destroy_wq, &cgrp->free_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1995,7 +2003,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
- continue;
+ goto next;
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
@@ -2003,7 +2011,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
ent.cgrp = task_cgroup_from_root(tsk, root);
/* nothing to do if this task is already in the cgroup */
if (ent.cgrp == cgrp)
- continue;
+ goto next;
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
@@ -2011,7 +2019,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
-
+ next:
if (!threadgroup)
break;
} while_each_thread(leader, tsk);
@@ -2769,13 +2777,17 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
{
LIST_HEAD(pending);
struct cgroup *cgrp, *n;
+ struct super_block *sb = ss->root->sb;
/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
- if (cfts && ss->root != &rootnode) {
+ if (cfts && ss->root != &rootnode &&
+ atomic_inc_not_zero(&sb->s_active)) {
list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
dget(cgrp->dentry);
list_add_tail(&cgrp->cft_q_node, &pending);
}
+ } else {
+ sb = NULL;
}
mutex_unlock(&cgroup_mutex);
@@ -2798,6 +2810,9 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
dput(cgrp->dentry);
}
+ if (sb)
+ deactivate_super(sb);
+
mutex_unlock(&cgroup_cft_mutex);
}
@@ -3727,6 +3742,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
}
/*
+ * When dput() is called asynchronously, if umount has been done and
+ * then deactivate_super() in cgroup_free_fn() kills the superblock,
+ * there's a small window that vfs will see the root dentry with non-zero
+ * refcnt and trigger BUG().
+ *
+ * That's why we hold a reference before dput() and drop it right after.
+ */
+static void cgroup_dput(struct cgroup *cgrp)
+{
+ struct super_block *sb = cgrp->root->sb;
+
+ atomic_inc(&sb->s_active);
+ dput(cgrp->dentry);
+ deactivate_super(sb);
+}
+
+/*
* Unregister event and free resources.
*
* Gets called from workqueue.
@@ -3746,7 +3778,7 @@ static void cgroup_event_remove(struct work_struct *work)
eventfd_ctx_put(event->eventfd);
kfree(event);
- dput(cgrp->dentry);
+ cgroup_dput(cgrp);
}
/*
@@ -4031,12 +4063,8 @@ static void css_dput_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
- struct dentry *dentry = css->cgroup->dentry;
- struct super_block *sb = dentry->d_sb;
- atomic_inc(&sb->s_active);
- dput(dentry);
- deactivate_super(sb);
+ cgroup_dput(css->cgroup);
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4666,6 +4694,22 @@ out:
return err;
}
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4976,7 +5020,7 @@ void __css_put(struct cgroup_subsys_state *css)
v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
if (v == 0)
- schedule_work(&css->dput_work);
+ queue_work(cgroup_destroy_wq, &css->dput_work);
}
EXPORT_SYMBOL_GPL(__css_put);
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb..c261409500e 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
rcu_idle_enter();
trace_cpu_idle_rcuidle(0, smp_processor_id());
local_irq_enable();
- while (!need_resched())
+ while (!tif_need_resched())
cpu_relax();
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
cpu_idle_poll();
} else {
- current_clr_polling();
- if (!need_resched()) {
+ if (!current_clr_polling_and_test()) {
stop_critical_timings();
rcu_idle_enter();
arch_cpu_idle();
@@ -103,7 +102,7 @@ static void cpu_idle_loop(void)
} else {
local_irq_enable();
}
- current_set_polling();
+ __current_set_polling();
}
arch_cpu_idle_exit();
}
@@ -129,7 +128,7 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
- current_set_polling();
+ __current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64b3f791bbe..d9dd521ddd6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -984,8 +984,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
need_loop = task_has_mempolicy(tsk) ||
!nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop)
+ if (need_loop) {
+ local_irq_disable();
write_seqcount_begin(&tsk->mems_allowed_seq);
+ }
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
@@ -993,8 +995,10 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
tsk->mems_allowed = *newmems;
- if (need_loop)
+ if (need_loop) {
write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
+ }
task_unlock(tsk);
}
@@ -1502,11 +1506,13 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
+ int retval = 0;
mutex_lock(&cpuset_mutex);
- if (!is_cpuset_online(cs))
+ if (!is_cpuset_online(cs)) {
+ retval = -ENODEV;
goto out_unlock;
+ }
switch (type) {
case FILE_CPU_EXCLUSIVE:
@@ -2416,9 +2422,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
task_lock(current);
cs = nearest_hardwall_ancestor(task_cs(current));
+ allowed = node_isset(node, cs->mems_allowed);
task_unlock(current);
- allowed = node_isset(node, cs->mems_allowed);
mutex_unlock(&callback_mutex);
return allowed;
}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d535..f8eb2b154bd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -761,8 +761,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -778,6 +788,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -787,6 +799,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -1761,7 +1774,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -7228,7 +7250,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
@@ -7399,14 +7421,14 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
static void __perf_event_exit_context(void *__info)
{
struct perf_event_context *ctx = __info;
- struct perf_event *event, *tmp;
+ struct perf_event *event;
perf_pmu_rotate_stop(ctx->pmu);
- list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_remove_from_context(event);
- list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &ctx->event_list, event_entry)
__perf_remove_from_context(event);
+ rcu_read_unlock();
}
static void perf_event_exit_cpu_context(int cpu)
@@ -7430,11 +7452,11 @@ static void perf_event_exit_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+ perf_event_exit_cpu_context(cpu);
+
mutex_lock(&swhash->hlist_mutex);
swevent_hlist_release(swhash);
mutex_unlock(&swhash->hlist_mutex);
-
- perf_event_exit_cpu_context(cpu);
}
#else
static inline void perf_event_exit_cpu(int cpu) { }
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b..9c2ddfbf452 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -87,10 +87,31 @@ again:
goto out;
/*
- * Publish the known good head. Rely on the full barrier implied
- * by atomic_dec_and_test() order the rb->head read and this
- * write.
+ * Since the mmap() consumer (userspace) can run on a different CPU:
+ *
+ * kernel user
+ *
+ * READ ->data_tail READ ->data_head
+ * smp_mb() (A) smp_rmb() (C)
+ * WRITE $data READ $data
+ * smp_wmb() (B) smp_mb() (D)
+ * STORE ->data_head WRITE ->data_tail
+ *
+ * Where A pairs with D, and B pairs with C.
+ *
+ * I don't think A needs to be a full barrier because we won't in fact
+ * write data until we see the store from userspace. So we simply don't
+ * issue the data WRITE until we observe it. Be conservative for now.
+ *
+ * OTOH, D needs to be a full barrier since it separates the data READ
+ * from the tail WRITE.
+ *
+ * For B a WMB is sufficient since it separates two WRITEs, and for C
+ * an RMB is sufficient since it separates two READs.
+ *
+ * See perf_output_begin().
*/
+ smp_wmb();
rb->user_page->data_head = head;
/*
@@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle,
* Userspace could choose to issue a mb() before updating the
* tail pointer. So that all reads will be completed before the
* write is issued.
+ *
+ * See perf_output_put_handle().
*/
tail = ACCESS_ONCE(rb->user_page->data_tail);
- smp_rmb();
+ smp_mb();
offset = head = local_read(&rb->head);
head += size;
if (unlikely(!perf_output_space(rb, tail, offset, head)))
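
The barrier-pairing comment added above describes both halves of the kernel/user protocol, but only the kernel half lives in this file. As a rough, hypothetical sketch of the userspace consumer side (not part of this patch): __sync_synchronize() stands in for the (C) and (D) barriers, the struct layout is a simplified stand-in for the real perf mmap control page, and events are treated as unit-sized for brevity.

#include <stdint.h>

struct mmap_page_view {
	volatile uint64_t data_head;	/* written by the kernel */
	volatile uint64_t data_tail;	/* written by userspace  */
};

static void drain_events(struct mmap_page_view *pg,
			 void (*read_event)(uint64_t offset))
{
	uint64_t head = pg->data_head;		/* READ ->data_head       */
	__sync_synchronize();			/* (C) order head vs data */

	for (uint64_t off = pg->data_tail; off != head; off++)
		read_event(off);		/* READ $data             */

	__sync_synchronize();			/* (D) order data vs tail */
	pg->data_tail = head;			/* WRITE ->data_tail      */
}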
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f3569747d62..ad8e1bdca70 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1682,12 +1682,10 @@ static bool handle_trampoline(struct pt_regs *regs)
tmp = ri;
ri = ri->next;
kfree(tmp);
+ utask->depth--;
if (!chained)
break;
-
- utask->depth--;
-
BUG_ON(!ri);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 987b28a1f01..ff7be9dac4c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -544,6 +544,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ clear_tlb_flush_pending(mm);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -1171,10 +1172,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
return ERR_PTR(-EINVAL);
/*
- * If the new process will be in a different pid namespace
- * don't allow the creation of threads.
+ * If the new process will be in a different pid namespace don't
+ * allow it to share a thread group or signal handlers with the
+ * forking task.
*/
- if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) &&
+ if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) &&
(task_active_pid_ns(current) != current->nsproxy->pid_ns))
return ERR_PTR(-EINVAL);
@@ -1675,6 +1677,12 @@ SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
int __user *, parent_tidptr,
int __user *, child_tidptr,
int, tls_val)
+#elif defined(CONFIG_CLONE_BACKWARDS3)
+SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
+ int, stack_size,
+ int __user *, parent_tidptr,
+ int __user *, child_tidptr,
+ int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
int __user *, parent_tidptr,
diff --git a/kernel/freezer.c b/kernel/freezer.c
index c38893b0efb..78758512b1e 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -19,6 +19,12 @@ EXPORT_SYMBOL(system_freezing_cnt);
bool pm_freezing;
bool pm_nosig_freezing;
+/*
+ * Temporary export for the deadlock workaround in ata_scsi_hotplug().
+ * Remove once the hack becomes unnecessary.
+ */
+EXPORT_SYMBOL_GPL(pm_freezing);
+
/* protects freezing and frozen transitions */
static DEFINE_SPINLOCK(freezer_lock);
diff --git a/kernel/futex.c b/kernel/futex.c
index b26dcfc02c9..3bc18bf48d0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -61,12 +61,15 @@
#include <linux/nsproxy.h>
#include <linux/ptrace.h>
#include <linux/sched/rt.h>
+#include <linux/hugetlb.h>
#include <asm/futex.h>
#include "rtmutex_common.h"
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
int __read_mostly futex_cmpxchg_enabled;
+#endif
#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
@@ -286,7 +289,7 @@ again:
put_page(page);
/* serialize against __split_huge_page_splitting() */
local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+ if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
page_head = compound_head(page);
/*
* page_head is valid pointer but we must pin
@@ -365,7 +368,7 @@ again:
} else {
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = page_head->mapping->host;
- key->shared.pgoff = page_head->index;
+ key->shared.pgoff = basepage_index(page);
}
get_futex_key_refs(key);
@@ -2728,10 +2731,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int __init futex_init(void)
+static void __init futex_detect_cmpxchg(void)
{
+#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
u32 curval;
- int i;
/*
* This will fail and we want it. Some arch implementations do
@@ -2745,6 +2748,14 @@ static int __init futex_init(void)
*/
if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
+#endif
+}
+
+static int __init futex_init(void)
+{
+ int i;
+
+ futex_detect_cmpxchg();
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
plist_head_init(&futex_queues[i].chain);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index fd4b13b131f..2288fbdada1 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -721,17 +721,20 @@ static int hrtimer_switch_to_hres(void)
return 1;
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
/*
- * Called from timekeeping code to reprogramm the hrtimer interrupt
- * device. If called from the timer interrupt context we defer it to
- * softirq context.
+ * Called from timekeeping and resume code to reprogramm the hrtimer
+ * interrupt device on all cpus.
*/
void clock_was_set_delayed(void)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
- cpu_base->clock_was_set = 1;
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ schedule_work(&hrtimer_work);
}
#else
@@ -780,8 +783,10 @@ void hrtimers_resume(void)
WARN_ONCE(!irqs_disabled(),
KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+ /* Retrigger on the local CPU */
retrigger_next_event(NULL);
- timerfd_clock_was_set();
+ /* And schedule a retrigger for all others */
+ clock_was_set_delayed();
}
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1432,13 +1437,6 @@ void hrtimer_peek_ahead_timers(void)
static void run_hrtimer_softirq(struct softirq_action *h)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
- if (cpu_base->clock_was_set) {
- cpu_base->clock_was_set = 0;
- clock_was_set();
- }
-
hrtimer_peek_ahead_timers();
}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 192a302d6cf..3fcb6faa5fa 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -23,10 +23,35 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
+static int __init irq_affinity_setup(char *str)
+{
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+ cpulist_parse(str, irq_default_affinity);
+ /*
+ * Set at least the boot cpu. We don't want to end up with
+ * bugreports caused by random comandline masks
+ */
+ cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
+ return 1;
+}
+__setup("irqaffinity=", irq_affinity_setup);
+
+extern struct cpumask hmp_slow_cpu_mask;
+
static void __init init_irq_default_affinity(void)
{
- alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
- cpumask_setall(irq_default_affinity);
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!irq_default_affinity)
+ zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#endif
+#ifdef CONFIG_SCHED_HMP
+ if (!cpumask_empty(&hmp_slow_cpu_mask)) {
+ cpumask_copy(irq_default_affinity, &hmp_slow_cpu_mask);
+ return;
+ }
+#endif
+ if (cpumask_empty(irq_default_affinity))
+ cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)
@@ -274,6 +299,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
{
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
+EXPORT_SYMBOL(irq_to_desc);
static void free_desc(unsigned int irq)
{
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fa17855ca65..9bd5c8a6c8e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -555,9 +555,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return 0;
if (irq_settings_can_request(desc)) {
- if (desc->action)
- if (irqflags & desc->action->flags & IRQF_SHARED)
- canrequest =1;
+ if (!desc->action ||
+ irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest = 1;
}
irq_put_desc_unlock(desc, flags);
return canrequest;
@@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
static void wake_threads_waitq(struct irq_desc *desc)
{
- if (atomic_dec_and_test(&desc->threads_active) &&
- waitqueue_active(&desc->wait_for_threads))
+ if (atomic_dec_and_test(&desc->threads_active))
wake_up(&desc->wait_for_threads);
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cb228bf2176..abcd6ca86cb 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -50,7 +50,7 @@ static void resume_irqs(bool want_early)
bool is_early = desc->action &&
desc->action->flags & IRQF_EARLY_RESUME;
- if (is_early != want_early)
+ if (!is_early && want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f7b55ba74..1f8d9382dba 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -47,6 +47,9 @@ u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
size_t vmcoreinfo_size;
size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+/* Flag to indicate we are going to kexec a new kernel */
+bool kexec_in_progress = false;
+
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
.name = "Crash kernel",
@@ -1678,6 +1681,7 @@ int kernel_kexec(void)
} else
#endif
{
+ kexec_in_progress = true;
kernel_restart_prepare(NULL);
printk(KERN_EMERG "Starting new kernel\n");
machine_shutdown();
diff --git a/kernel/module.c b/kernel/module.c
index cab4bce49c2..fa53db8aade 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2927,7 +2927,6 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
{
/* Module within temporary copy. */
struct module *mod;
- Elf_Shdr *pcpusec;
int err;
mod = setup_load_info(info, flags);
@@ -2942,17 +2941,10 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
err = module_frob_arch_sections(info->hdr, info->sechdrs,
info->secstrings, mod);
if (err < 0)
- goto out;
+ return ERR_PTR(err);
- pcpusec = &info->sechdrs[info->index.pcpu];
- if (pcpusec->sh_size) {
- /* We have a special allocation for this section. */
- err = percpu_modalloc(mod,
- pcpusec->sh_size, pcpusec->sh_addralign);
- if (err)
- goto out;
- pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC;
- }
+ /* We will do a special allocation for per-cpu sections later. */
+ info->sechdrs[info->index.pcpu].sh_flags &= ~(unsigned long)SHF_ALLOC;
/* Determine total sizes, and put offsets in sh_entsize. For now
this is done generically; there doesn't appear to be any
@@ -2963,17 +2955,22 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
/* Allocate and move to the final place */
err = move_module(mod, info);
if (err)
- goto free_percpu;
+ return ERR_PTR(err);
/* Module has been copied to its final place now: return it. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
kmemleak_load_module(mod, info);
return mod;
+}
-free_percpu:
- percpu_modfree(mod);
-out:
- return ERR_PTR(err);
+static int alloc_module_percpu(struct module *mod, struct load_info *info)
+{
+ Elf_Shdr *pcpusec = &info->sechdrs[info->index.pcpu];
+ if (!pcpusec->sh_size)
+ return 0;
+
+ /* We have a special allocation for this section. */
+ return percpu_modalloc(mod, pcpusec->sh_size, pcpusec->sh_addralign);
}
/* mod is no longer valid after this! */
@@ -3237,6 +3234,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
}
#endif
+ /* To avoid stressing percpu allocator, do this once we're unique. */
+ err = alloc_module_percpu(mod, info);
+ if (err)
+ goto unlink_mod;
+
/* Now module is in final location, initialize linked lists, etc. */
err = module_unload_init(mod);
if (err)
diff --git a/kernel/pid.c b/kernel/pid.c
index 0db3e791a06..0eb6d8e8b1d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -264,6 +264,7 @@ void free_pid(struct pid *pid)
struct pid_namespace *ns = upid->ns;
hlist_del_rcu(&upid->pid_chain);
switch(--ns->nr_hashed) {
+ case 2:
case 1:
/* When all that is left in the pid namespace
* is the reaper wake up the reaper. The reaper
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5dfdc9ea180..46455961a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -263,6 +263,26 @@ config PM_GENERIC_DOMAINS
bool
depends on PM
+config WQ_POWER_EFFICIENT_DEFAULT
+ bool "Enable workqueue power-efficient mode by default"
+ depends on PM
+ default n
+ help
+ Per-cpu workqueues are generally preferred because they show
+ better performance thanks to cache locality; unfortunately,
+ per-cpu workqueues tend to be more power hungry than unbound
+ workqueues.
+
+ Enabling workqueue.power_efficient kernel parameter makes the
+ per-cpu workqueues which were observed to contribute
+ significantly to power consumption unbound, leading to measurably
+ lower power usage at the cost of small performance overhead.
+
+ This config option determines whether workqueue.power_efficient
+ is enabled by default.
+
+ If in doubt, say N.
+
config PM_GENERIC_DOMAINS_SLEEP
def_bool y
depends on PM_SLEEP && PM_GENERIC_DOMAINS
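
The help text above refers to the workqueue.power_efficient kernel parameter; as an illustrative usage note (not part of the patch), the Kconfig option only sets the default and can still be overridden either way on the kernel command line, for example:

workqueue.power_efficient=1
workqueue.power_efficient=0

with 1 making the marked per-cpu workqueues run unbound and 0 keeping them per-cpu even when this option is set to y.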
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
index c6422ffeda9..9012ecf7b81 100644
--- a/kernel/power/autosleep.c
+++ b/kernel/power/autosleep.c
@@ -32,7 +32,8 @@ static void try_to_suspend(struct work_struct *work)
mutex_lock(&autosleep_lock);
- if (!pm_save_wakeup_count(initial_count)) {
+ if (!pm_save_wakeup_count(initial_count) ||
+ system_state != SYSTEM_RUNNING) {
mutex_unlock(&autosleep_lock);
goto out;
}
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 587dddeebf1..25cf89bc659 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -293,6 +293,15 @@ int pm_qos_request_active(struct pm_qos_request *req)
}
EXPORT_SYMBOL_GPL(pm_qos_request_active);
+static void __pm_qos_update_request(struct pm_qos_request *req,
+ s32 new_value)
+{
+ if (new_value != req->node.prio)
+ pm_qos_update_target(
+ pm_qos_array[req->pm_qos_class]->constraints,
+ &req->node, PM_QOS_UPDATE_REQ, new_value);
+}
+
/**
* pm_qos_work_fn - the timeout handler of pm_qos_update_request_timeout
* @work: work struct for the delayed work (timeout)
@@ -305,7 +314,7 @@ static void pm_qos_work_fn(struct work_struct *work)
struct pm_qos_request,
work);
- pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+ __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
}
/**
@@ -365,6 +374,8 @@ void pm_qos_update_request(struct pm_qos_request *req,
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_UPDATE_REQ, new_value);
+
+ __pm_qos_update_request(req, new_value);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 0de28576807..91c04f16e79 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1398,7 +1398,11 @@ int hibernate_preallocate_memory(void)
* highmem and non-highmem zones separately.
*/
pages_highmem = preallocate_image_highmem(highmem / 2);
- alloc = (count - max_size) - pages_highmem;
+ alloc = count - max_size;
+ if (alloc > pages_highmem)
+ alloc -= pages_highmem;
+ else
+ alloc = 0;
pages = preallocate_image_memory(alloc, avail_normal);
if (pages < alloc) {
/* We have exhausted non-highmem pages, try highmem. */
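
A worked example of the underflow the hunk above avoids (hypothetical numbers, not from the patch): the page counts here are unsigned, so with count - max_size = 100 and pages_highmem = 150 the old expression (count - max_size) - pages_highmem would wrap around to a huge value and preallocate_image_memory() would be asked for far more pages than exist; with the clamp, alloc becomes 0 and no non-highmem pages are requested at this step.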
diff --git a/kernel/printk.c b/kernel/printk.c
index 8212c1aef12..d37d45c90ae 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1369,9 +1369,9 @@ static int console_trylock_for_printk(unsigned int cpu)
}
}
logbuf_cpu = UINT_MAX;
+ raw_spin_unlock(&logbuf_lock);
if (wake)
up(&console_sem);
- raw_spin_unlock(&logbuf_lock);
return retval;
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 335a7ae697f..afadcf7b4a2 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -257,7 +257,8 @@ ok:
if (task->mm)
dumpable = get_dumpable(task->mm);
rcu_read_lock();
- if (!dumpable && !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
+ if (dumpable != SUID_DUMP_USER &&
+ !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
rcu_read_unlock();
return -EPERM;
}
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 64de5f8b0c9..4a073539c58 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void)
if (IS_ERR(tg))
goto out_free;
- sched_online_group(tg, &root_task_group);
-
kref_init(&ag->kref);
init_rwsem(&ag->lock);
ag->id = atomic_inc_return(&autogroup_seq_nr);
@@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void)
#endif
tg->autogroup = ag;
+ sched_online_group(tg, &root_task_group);
return ag;
out_free:
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e8b335016c5..277e3557d0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1487,7 +1487,13 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
- smp_wmb();
+ /*
+ * If we are going to wake up a thread waiting for CONDITION we
+ * need to ensure that CONDITION=1 done by the caller can not be
+ * reordered with p->state check below. This pairs with mb() in
+ * set_current_state() the waiting thread does.
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
@@ -1617,6 +1623,20 @@ static void __sched_fork(struct task_struct *p)
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
p->se.avg.runnable_avg_period = 0;
p->se.avg.runnable_avg_sum = 0;
+#ifdef CONFIG_SCHED_HMP
+ /* keep LOAD_AVG_MAX in sync with fair.c if load avg series is changed */
+#define LOAD_AVG_MAX 47742
+ if (p->mm) {
+ p->se.avg.hmp_last_up_migration = 0;
+ p->se.avg.hmp_last_down_migration = 0;
+ p->se.avg.load_avg_ratio = 1023;
+ p->se.avg.load_avg_contrib =
+ (1023 * scale_load_down(p->se.load.weight));
+ p->se.avg.runnable_avg_period = LOAD_AVG_MAX;
+ p->se.avg.runnable_avg_sum = LOAD_AVG_MAX;
+ p->se.avg.usage_avg_sum = LOAD_AVG_MAX;
+ }
+#endif
#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2966,6 +2986,12 @@ need_resched:
if (sched_feat(HRTICK))
hrtick_clear(rq);
+ /*
+ * Make sure that signal_pending_state()->signal_pending() below
+ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+ * done by the caller to avoid the race with signal_wake_up().
+ */
+ smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
@@ -3813,6 +3839,8 @@ static struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
+extern struct cpumask hmp_slow_cpu_mask;
+
/* Actually do priority change: must hold rq lock. */
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
@@ -3822,8 +3850,17 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_HMP
+ if (!cpumask_empty(&hmp_slow_cpu_mask))
+ if (cpumask_equal(&p->cpus_allowed, cpu_all_mask)) {
+ p->nr_cpus_allowed =
+ cpumask_weight(&hmp_slow_cpu_mask);
+ do_set_cpus_allowed(p, &hmp_slow_cpu_mask);
+ }
+#endif
+ }
else
p->sched_class = &fair_sched_class;
set_load_weight(p);
@@ -7800,7 +7837,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
runtime_enabled = quota != RUNTIME_INF;
runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
- account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
+ /*
+ * If we need to toggle cfs_bandwidth_used, off->on must occur
+ * before making related changes, and on->off must occur afterwards
+ */
+ if (runtime_enabled && !runtime_was_enabled)
+ cfs_bandwidth_usage_inc();
raw_spin_lock_irq(&cfs_b->lock);
cfs_b->period = ns_to_ktime(period);
cfs_b->quota = quota;
@@ -7826,6 +7868,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
unthrottle_cfs_rq(cfs_rq);
raw_spin_unlock_irq(&rq->lock);
}
+ if (runtime_was_enabled && !runtime_enabled)
+ cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
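
Both smp_mb__before_spinlock() comments above refer to the classic waiter/waker pattern. A minimal kernel-style sketch of that pattern, illustrative only and not part of the patch, with CONDITION as a placeholder for whatever the waiter sleeps on:

/* waiter */
set_current_state(TASK_INTERRUPTIBLE);	/* implies a full memory barrier */
if (!CONDITION)
	schedule();
__set_current_state(TASK_RUNNING);

/* waker */
CONDITION = 1;		/* must not be reordered past the p->state check ... */
wake_up_process(p);	/* ... performed inside try_to_wake_up()              */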
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index b5ccba22603..1101d92635c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -558,7 +558,7 @@ static void cputime_adjust(struct task_cputime *curr,
struct cputime *prev,
cputime_t *ut, cputime_t *st)
{
- cputime_t rtime, stime, utime, total;
+ cputime_t rtime, stime, utime;
if (vtime_accounting_enabled()) {
*ut = curr->utime;
@@ -566,9 +566,6 @@ static void cputime_adjust(struct task_cputime *curr,
return;
}
- stime = curr->stime;
- total = stime + curr->utime;
-
/*
* Tick based cputime accounting depend on random scheduling
* timeslices of a task to be interrupted or not by the timer.
@@ -589,13 +586,19 @@ static void cputime_adjust(struct task_cputime *curr,
if (prev->stime + prev->utime >= rtime)
goto out;
- if (total) {
+ stime = curr->stime;
+ utime = curr->utime;
+
+ if (utime == 0) {
+ stime = rtime;
+ } else if (stime == 0) {
+ utime = rtime;
+ } else {
+ cputime_t total = stime + utime;
+
stime = scale_stime((__force u64)stime,
(__force u64)rtime, (__force u64)total);
utime = rtime - stime;
- } else {
- stime = rtime;
- utime = 0;
}
/*
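
A worked example of the rescaling above (hypothetical numbers, not from the patch): with curr->stime = 2 and curr->utime = 6 ticks sampled but rtime = 12 of actual run time, total = 8, so stime = scale_stime(2, 12, 8) = 3 and utime = 12 - 3 = 9. The new utime == 0 and stime == 0 special cases simply hand all of rtime to the non-zero side without going through the division.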
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a67352..c2665cd2959 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -94,6 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#ifdef CONFIG_SMP
P(se->avg.runnable_avg_sum);
P(se->avg.runnable_avg_period);
+ P(se->avg.usage_avg_sum);
P(se->avg.load_avg_contrib);
P(se->avg.decay_count);
#endif
@@ -223,6 +224,16 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->tg_runnable_contrib);
SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
atomic_read(&cfs_rq->tg->runnable_avg));
+ SEQ_printf(m, " .%-30s: %d\n", "tg->usage_avg",
+ atomic_read(&cfs_rq->tg->usage_avg));
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+ SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
+ cfs_rq->tg->cfs_bandwidth.timer_active);
+ SEQ_printf(m, " .%-30s: %d\n", "throttled",
+ cfs_rq->throttled);
+ SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
+ cfs_rq->throttle_count);
#endif
print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -566,6 +577,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ P(se.avg.runnable_avg_sum);
+ P(se.avg.runnable_avg_period);
+ P(se.avg.load_avg_contrib);
+ P(se.avg.decay_count);
+#endif
P(policy);
P(prio);
#undef PN
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614465c..c6d617b2c50 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -31,9 +31,18 @@
#include <linux/task_work.h>
#include <trace/events/sched.h>
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq header to add a notifier so that cpu frequency
+ * scaling can track the current CPU frequency
+ */
+#include <linux/cpufreq.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
#include "sched.h"
+
/*
* Targeted preemption latency for CPU-bound tasks:
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -936,6 +945,13 @@ void task_numa_work(struct callback_head *work)
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
+ /*
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
+ */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -1201,8 +1217,91 @@ static u32 __compute_runnable_contrib(u64 n)
return contrib + runnable_avg_yN_sum[n];
}
-/*
- * We can represent the historical contribution to runnable average as the
+#ifdef CONFIG_SCHED_HMP
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value;
+ int (*to_sysfs)(int);
+ int (*from_sysfs)(int);
+ ssize_t (*to_sysfs_text)(char *buf, int buf_size);
+};
+
+#define HMP_DATA_SYSFS_MAX 8
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled;
+#endif
+ int multiplier; /* used to scale the time delta */
+ struct attribute_group attr_group;
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data;
+
+static u64 hmp_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch however we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale;
+ u32 min;
+ u32 max;
+ u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_SCHED_HMP */
+
+/* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
* history into segments of approximately 1ms (1024us); label the segment that
* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
@@ -1231,13 +1330,24 @@ static u32 __compute_runnable_contrib(u64 n)
*/
static __always_inline int __update_entity_runnable_avg(u64 now,
struct sched_avg *sa,
- int runnable)
+ int runnable,
+ int running,
+ int cpu)
{
u64 delta, periods;
u32 runnable_contrib;
int delta_w, decayed = 0;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ u64 scaled_delta;
+ u32 scaled_runnable_contrib;
+ int scaled_delta_w;
+ u32 curr_scale = 1024;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
delta = now - sa->last_runnable_update;
+#ifdef CONFIG_SCHED_HMP
+ delta = hmp_variable_scale_convert(delta);
+#endif
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
@@ -1256,6 +1366,12 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
return 0;
sa->last_runnable_update = now;
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ curr_scale = freq_scale[cpu].curr_scale;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
/* delta_w is the amount already accumulated against our next period */
delta_w = sa->runnable_avg_period % 1024;
if (delta + delta_w >= 1024) {
@@ -1268,8 +1384,20 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
* period and accrue it.
*/
delta_w = 1024 - delta_w;
+ /* scale runnable time if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta_w = (delta_w * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta_w;
+ if (running)
+ sa->usage_avg_sum += scaled_delta_w;
+#else
if (runnable)
sa->runnable_avg_sum += delta_w;
+ if (running)
+ sa->usage_avg_sum += delta_w;
+#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta_w;
delta -= delta_w;
@@ -1277,22 +1405,49 @@ static __always_inline int __update_entity_runnable_avg(u64 now,
/* Figure out how many additional periods this update spans */
periods = delta / 1024;
delta %= 1024;
-
+ /* decay the load we have accumulated so far */
sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
periods + 1);
sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
periods + 1);
-
+ sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
+ /* add the contribution from this period */
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
runnable_contrib = __compute_runnable_contrib(periods);
+ /* Apply load scaling if necessary.
+ * Note that multiplying the whole series is same as
+ * multiplying all terms
+ */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_runnable_contrib = (runnable_contrib * curr_scale)
+ >> SCHED_FREQSCALE_SHIFT;
+ if (runnable)
+ sa->runnable_avg_sum += scaled_runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += scaled_runnable_contrib;
+#else
if (runnable)
sa->runnable_avg_sum += runnable_contrib;
+ if (running)
+ sa->usage_avg_sum += runnable_contrib;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += runnable_contrib;
}
/* Remainder of delta accrued against u_0` */
+ /* scale if necessary */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
+ if (runnable)
+ sa->runnable_avg_sum += scaled_delta;
+ if (running)
+ sa->usage_avg_sum += scaled_delta;
+#else
if (runnable)
sa->runnable_avg_sum += delta;
+ if (running)
+ sa->usage_avg_sum += delta;
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
sa->runnable_avg_period += delta;
return decayed;
@@ -1305,12 +1460,9 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
u64 decays = atomic64_read(&cfs_rq->decay_counter);
decays -= se->avg.decay_count;
- if (!decays)
- return 0;
-
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
+ if (decays)
+ se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
se->avg.decay_count = 0;
-
return decays;
}
@@ -1338,16 +1490,28 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa,
struct cfs_rq *cfs_rq)
{
struct task_group *tg = cfs_rq->tg;
- long contrib;
+ long contrib, usage_contrib;
/* The fraction of a cpu used by this cfs_rq */
contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
sa->runnable_avg_period + 1);
contrib -= cfs_rq->tg_runnable_contrib;
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
+ usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
+ sa->runnable_avg_period + 1);
+ usage_contrib -= cfs_rq->tg_usage_contrib;
+
+ /*
+ * contrib/usage at this point represent deltas, only update if they
+ * are substantive.
+ */
+ if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
+ (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
atomic_add(contrib, &tg->runnable_avg);
cfs_rq->tg_runnable_contrib += contrib;
+
+ atomic_add(usage_contrib, &tg->usage_avg);
+ cfs_rq->tg_usage_contrib += usage_contrib;
}
}
@@ -1408,12 +1572,18 @@ static inline void __update_task_entity_contrib(struct sched_entity *se)
contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
contrib /= (se->avg.runnable_avg_period + 1);
se->avg.load_avg_contrib = scale_load(contrib);
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg_contrib);
+ contrib = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
+ contrib /= (se->avg.runnable_avg_period + 1);
+ se->avg.load_avg_ratio = scale_load(contrib);
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.load_avg_ratio);
}
/* Compute the current contribution to load_avg by se, return any delta */
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
+static long __update_entity_load_avg_contrib(struct sched_entity *se, long *ratio)
{
long old_contrib = se->avg.load_avg_contrib;
+ long old_ratio = se->avg.load_avg_ratio;
if (entity_is_task(se)) {
__update_task_entity_contrib(se);
@@ -1422,6 +1592,8 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se)
__update_group_entity_contrib(se);
}
+ if (ratio)
+ *ratio = se->avg.load_avg_ratio - old_ratio;
return se->avg.load_avg_contrib - old_contrib;
}
@@ -1441,9 +1613,13 @@ static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- long contrib_delta;
+ long contrib_delta, ratio_delta;
u64 now;
+ int cpu = -1; /* not used in normal case */
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = cfs_rq->rq->cpu;
+#endif
/*
* For a group entity we need to use their owned cfs_rq_clock_task() in
* case they are the parent of a throttled hierarchy.
@@ -1453,18 +1629,21 @@ static inline void update_entity_load_avg(struct sched_entity *se,
else
now = cfs_rq_clock_task(group_cfs_rq(se));
- if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
+ if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
+ cfs_rq->curr == se, cpu))
return;
- contrib_delta = __update_entity_load_avg_contrib(se);
+ contrib_delta = __update_entity_load_avg_contrib(se, &ratio_delta);
if (!update_cfs_rq)
return;
- if (se->on_rq)
+ if (se->on_rq) {
cfs_rq->runnable_load_avg += contrib_delta;
- else
+ rq_of(cfs_rq)->avg.load_avg_ratio += ratio_delta;
+ } else {
subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
+ }
}
/*
@@ -1497,8 +1676,17 @@ static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
- __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
+ int cpu = -1; /* not used in normal case */
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ cpu = rq->cpu;
+#endif
+ __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
+ runnable, cpu);
__update_tg_runnable_avg(&rq->avg, &rq->cfs);
+ trace_sched_rq_runnable_ratio(cpu_of(rq), rq->avg.load_avg_ratio);
+ trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
+ trace_sched_rq_nr_running(cpu_of(rq), rq->nr_running, rq->nr_iowait.counter);
}
/* Add the load generated by se into cfs_rq's child load-average */
@@ -1540,6 +1728,8 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
}
cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
+ rq_of(cfs_rq)->avg.load_avg_ratio += se->avg.load_avg_ratio;
+
/* we force update consideration on load-balancer moves */
update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}
@@ -1558,6 +1748,8 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
update_cfs_rq_blocked_load(cfs_rq, !sleep);
cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
+ rq_of(cfs_rq)->avg.load_avg_ratio -= se->avg.load_avg_ratio;
+
if (sleep) {
cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
@@ -1886,6 +2078,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
+ update_entity_load_avg(se, 1);
}
update_stats_curr_start(cfs_rq, se);
@@ -1984,6 +2177,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
@@ -2021,13 +2215,14 @@ static inline bool cfs_bandwidth_used(void)
return static_key_false(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- static_key_slow_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+ static_key_slow_dec(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
@@ -2035,7 +2230,8 @@ static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
#endif /* HAVE_JUMP_LABEL */
/*
@@ -2287,6 +2483,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
}
@@ -2398,6 +2596,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (idle)
goto out_unlock;
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
+
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
@@ -2458,7 +2663,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2534,10 +2745,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
@@ -2662,11 +2875,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
@@ -3314,6 +3527,708 @@ done:
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed ordered by compute capacity with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+static const int hmp_max_tasks = 5;
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int dc, cpu;
+
+ pr_debug("Initializing HMP scheduler:\n");
+
+ /* Initialize hmp_domains using platform code */
+ arch_get_hmp_domains(&hmp_domains);
+ if (list_empty(&hmp_domains)) {
+ pr_debug("HMP domain list is empty!\n");
+ return 0;
+ }
+
+ /* Print hmp_domains */
+ dc = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ cpulist_scnprintf(buf, 64, &domain->possible_cpus);
+ pr_debug(" HMP domain %d: %s\n", dc, buf);
+
+ for_each_cpu_mask(cpu, domain->possible_cpus) {
+ per_cpu(hmp_cpu_domain, cpu) = domain;
+ }
+ dc++;
+ }
+
+ return 1;
+}
+
+static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
+{
+ struct hmp_domain *domain;
+ struct list_head *pos;
+
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ if (cpumask_test_cpu(cpu, &domain->possible_cpus))
+ return domain;
+ }
+ return NULL;
+}
+
+static void hmp_online_cpu(int cpu)
+{
+ struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+ if (domain)
+ cpumask_set_cpu(cpu, &domain->cpus);
+}
+
+static void hmp_offline_cpu(int cpu)
+{
+ struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+ if (domain)
+ cpumask_clear_cpu(cpu, &domain->cpus);
+}
+/*
+ * Needed to determine heaviest tasks etc.
+ */
+static inline unsigned int hmp_cpu_is_fastest(int cpu);
+static inline unsigned int hmp_cpu_is_slowest(int cpu);
+static inline struct hmp_domain *hmp_slower_domain(int cpu);
+static inline struct hmp_domain *hmp_faster_domain(int cpu);
+
+/* must hold the runqueue lock for the queue se is currently on */
+static struct sched_entity *hmp_get_heaviest_task(
+ struct sched_entity *se, int migrate_up)
+{
+ int num_tasks = hmp_max_tasks;
+ struct sched_entity *max_se = se;
+ unsigned long int max_ratio = se->avg.load_avg_ratio;
+ const struct cpumask *hmp_target_mask = NULL;
+
+ if (migrate_up) {
+ struct hmp_domain *hmp;
+ if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+ return max_se;
+
+ hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+ hmp_target_mask = &hmp->cpus;
+ }
+ /* The currently running task is not on the runqueue */
+ se = __pick_first_entity(cfs_rq_of(se));
+
+ while (num_tasks && se) {
+ if (entity_is_task(se) &&
+ (se->avg.load_avg_ratio > max_ratio &&
+ hmp_target_mask &&
+ cpumask_intersects(hmp_target_mask,
+ tsk_cpus_allowed(task_of(se))))) {
+ max_se = se;
+ max_ratio = se->avg.load_avg_ratio;
+ }
+ se = __pick_next_entity(se);
+ num_tasks--;
+ }
+ return max_se;
+}
+
+static struct sched_entity *hmp_get_lightest_task(
+ struct sched_entity *se, int migrate_down)
+{
+ int num_tasks = hmp_max_tasks;
+ struct sched_entity *min_se = se;
+ unsigned long int min_ratio = se->avg.load_avg_ratio;
+ const struct cpumask *hmp_target_mask = NULL;
+
+ if (migrate_down) {
+ struct hmp_domain *hmp;
+ if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
+ return min_se;
+ hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
+ hmp_target_mask = &hmp->cpus;
+ }
+ /* The currently running task is not on the runqueue */
+ se = __pick_first_entity(cfs_rq_of(se));
+
+ while (num_tasks && se) {
+ if (entity_is_task(se) &&
+ (se->avg.load_avg_ratio < min_ratio &&
+ hmp_target_mask &&
+ cpumask_intersects(hmp_target_mask,
+ tsk_cpus_allowed(task_of(se))))) {
+ min_se = se;
+ min_ratio = se->avg.load_avg_ratio;
+ }
+ se = __pick_next_entity(se);
+ num_tasks--;
+ }
+ return min_se;
+}
+
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ *
+ * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ *
+ * Small Task Packing:
+ * We can choose to fill the littlest CPUs in an HMP system rather than
+ * the typical spreading mechanic. This behavior is controllable using
+ * two variables.
+ * hmp_packing_enabled: runtime control over pack/spread
+ * hmp_full_threshold: Consider a CPU with this much unweighted load full
+ */
+unsigned int hmp_up_threshold = 700;
+unsigned int hmp_down_threshold = 512;
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
+#endif
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+#ifndef CONFIG_ARCH_VEXPRESS_TC2
+unsigned int hmp_packing_enabled = 1;
+unsigned int hmp_full_threshold = (NICE_0_LOAD * 9) / 8;
+#else
+/* TC2 has a sharp consumption curve at around 800MHz, so
+ * we aim to spread the load around that frequency. */
+unsigned int hmp_packing_enabled;
+unsigned int hmp_full_threshold = 650; /* 80% of the 800MHz freq * NICE_0_LOAD */
+#endif
+#endif
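All of the tunables above are expressed on the 0..1023 load_avg_ratio scale, and the hmp_next_{up,down}_threshold values are compared against (now - last) >> 10, i.e. roughly microsecond units, so the default of 4096 is about 4 ms of settle time. A minimal userspace sketch of that arithmetic, using the defaults shown above; the function and variable names here are illustrative, not kernel symbols:

#include <stdio.h>
#include <stdint.h>

/* defaults copied from the patch; load ratios live in [0..1023] */
static const unsigned int up_threshold = 700;
static const unsigned int next_up_threshold = 4096;

/* mirrors the "(now - last) >> 10 < threshold" settle check */
static int up_migration_allowed(uint64_t now_ns, uint64_t last_ns,
				unsigned int load_avg_ratio)
{
	if (load_avg_ratio < up_threshold)
		return 0;
	/* >> 10 turns nanoseconds into ~1us units, so 4096 ~= 4.2 ms */
	if (((now_ns - last_ns) >> 10) < next_up_threshold)
		return 0;
	return 1;
}

int main(void)
{
	printf("settle delay ~= %.2f ms\n",
	       (double)(next_up_threshold << 10) / 1e6);
	printf("load 800, 2 ms since last up-migration: %d\n",
	       up_migration_allowed(2000000, 0, 800));
	printf("load 800, 5 ms since last up-migration: %d\n",
	       up_migration_allowed(5000000, 0, 800));
	return 0;
}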
+
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+ int *min_cpu, struct cpumask *affinity);
+
+static inline struct hmp_domain *hmp_smallest_domain(void)
+{
+ return list_entry(hmp_domains.prev, struct hmp_domain, hmp_domains);
+}
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_is_last(pos, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects a cpu in previous (faster) hmp_domain
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ int lowest_cpu = NR_CPUS;
+ __always_unused int lowest_ratio;
+ struct hmp_domain *hmp;
+
+ if (hmp_cpu_is_fastest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_faster_domain(cpu);
+
+ lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ tsk_cpus_allowed(tsk));
+
+ return lowest_cpu;
+}
+
+/*
+ * Selects a cpu in next (slower) hmp_domain
+ * Note that cpumask_any_and() returns the first cpu in the cpumask
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ int lowest_cpu = NR_CPUS;
+ struct hmp_domain *hmp;
+ __always_unused int lowest_ratio;
+
+ if (hmp_cpu_is_slowest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_slower_domain(cpu);
+
+ lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ tsk_cpus_allowed(tsk));
+
+ return lowest_cpu;
+}
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/*
+ * Select the 'best' candidate little CPU to wake up on.
+ * Implements a packing strategy which examines CPUs in
+ * logical CPU order, and selects the first which will
+ * have at least 10% capacity available, according to
+ * both tracked load of the runqueue and the task.
+ */
+static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
+ int cpu) {
+ int tmp_cpu;
+ unsigned long estimated_load;
+ struct hmp_domain *hmp;
+ struct sched_avg *avg;
+ struct cpumask allowed_hmp_cpus;
+
+ if (!hmp_packing_enabled ||
+ tsk->se.avg.load_avg_ratio > ((NICE_0_LOAD * 90)/100))
+ return hmp_select_slower_cpu(tsk, cpu);
+
+ if (hmp_cpu_is_slowest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_slower_domain(cpu);
+
+ /* respect affinity */
+ cpumask_and(&allowed_hmp_cpus, &hmp->cpus,
+ tsk_cpus_allowed(tsk));
+
+ for_each_cpu_mask(tmp_cpu, allowed_hmp_cpus) {
+ avg = &cpu_rq(tmp_cpu)->avg;
+ /* estimate new rq load if we add this task */
+ estimated_load = avg->load_avg_ratio +
+ tsk->se.avg.load_avg_ratio;
+ if (estimated_load <= hmp_full_threshold) {
+ cpu = tmp_cpu;
+ break;
+ }
+ }
+ /* if no match was found, the task uses the initial value */
+ return cpu;
+}
+#endif
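hmp_best_little_cpu() above is a first-fit scan: a task heavier than 90% of NICE_0_LOAD is not packed at all and falls back to the normal least-loaded pick in the slower domain, otherwise the first allowed little CPU whose estimated load (runqueue ratio plus the task's ratio) stays at or below hmp_full_threshold is chosen, and the original CPU is kept when nothing fits. A simplified, self-contained sketch of that decision (both fallbacks collapsed into one fallback_cpu, affinity ignored, loads made up):

#include <stdio.h>

#define NICE_0_LOAD 1024

static int best_little_cpu(const unsigned long rq_ratio[], int ncpu,
			   unsigned long task_ratio, unsigned long full_thresh,
			   int fallback_cpu)
{
	int cpu;

	/* heavy tasks (> 90% of NICE_0_LOAD) are not packed at all */
	if (task_ratio > (NICE_0_LOAD * 90) / 100)
		return fallback_cpu;

	for (cpu = 0; cpu < ncpu; cpu++) {
		/* first little CPU that still fits the task under the limit */
		if (rq_ratio[cpu] + task_ratio <= full_thresh)
			return cpu;
	}
	return fallback_cpu;	/* nothing fits: keep the previous CPU */
}

int main(void)
{
	unsigned long load[4] = { 900, 300, 150, 0 };	/* per little-CPU load */
	unsigned long thresh = (NICE_0_LOAD * 9) / 8;	/* default 1152 */

	printf("light task (ratio  80) -> cpu %d\n",
	       best_little_cpu(load, 4, 80, thresh, -1));
	printf("heavy task (ratio 950) -> cpu %d\n",
	       best_little_cpu(load, 4, 950, thresh, -1));
	return 0;
}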
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+ /* hack - always use clock from first online CPU */
+ u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ se->avg.hmp_last_up_migration = now;
+ se->avg.hmp_last_down_migration = 0;
+ cpu_rq(cpu)->avg.hmp_last_up_migration = now;
+ cpu_rq(cpu)->avg.hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+ /* hack - always use clock from first online CPU */
+ u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ se->avg.hmp_last_down_migration = now;
+ se->avg.hmp_last_up_migration = 0;
+ cpu_rq(cpu)->avg.hmp_last_down_migration = now;
+ cpu_rq(cpu)->avg.hmp_last_up_migration = 0;
+}
+
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * These functions allow changing how quickly the load_avg_ratio grows;
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow changing the up and down thresholds of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, a
+ * previously idle task with a load of 0 will reach the threshold after
+ * 64ms of busy-looping.
+ *
+ * Changing load_avg_period_ms has the same effect as changing the
+ * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could overflow for a weight > 2^12 even though the load_avg_contrib
+ * should still be a 32-bit result. This does not happen when the delta
+ * time is instead multiplied by 1/22 and load_avg_period_ms = 706.
+ */
+
+/*
+ * Scaling the delta time ends up increasing or decreasing the
+ * growing speed of the per-entity load_avg_ratio.
+ * The scale factor hmp_data.multiplier is a fixed-point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static inline u64 hmp_variable_scale_convert(u64 delta)
+{
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ u64 high = delta >> 32ULL;
+ u64 low = delta & 0xffffffffULL;
+ low *= hmp_data.multiplier;
+ high *= hmp_data.multiplier;
+ return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+#else
+ return delta;
+#endif
+}
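hmp_variable_scale_convert() is a 64-bit fixed-point multiply done in two 32-bit halves so that delta * multiplier cannot overflow: the result equals (delta * hmp_data.multiplier) >> HMP_VARIABLE_SCALE_SHIFT exactly, and with the default period the multiplier is exactly 1 << HMP_VARIABLE_SCALE_SHIFT (see hmp_period_tofrom_sysfs() below), i.e. a 1.0 scale. A standalone sketch of the same arithmetic; the shift value here is an assumption for illustration only:

#include <stdio.h>
#include <stdint.h>

#define SCALE_SHIFT 16	/* stands in for HMP_VARIABLE_SCALE_SHIFT */

/* same high/low split as hmp_variable_scale_convert(): exact, no overflow */
static uint64_t scale_convert(uint64_t delta, uint64_t mult)
{
	uint64_t high = delta >> 32;
	uint64_t low  = delta & 0xffffffffULL;

	return (low * mult >> SCALE_SHIFT) +
	       (high * mult << (32 - SCALE_SHIFT));
}

int main(void)
{
	/* multiplier = (LOAD_AVG_PERIOD << SHIFT) / load_avg_period_ms;
	 * the default period gives exactly 1 << SHIFT, i.e. a no-op */
	uint64_t mult_default = 1ULL << SCALE_SHIFT;
	uint64_t mult_double  = 2ULL << SCALE_SHIFT;	/* period halved */
	uint64_t delta_ns = 123456789ULL;

	printf("1.0 scale: %llu ns\n",
	       (unsigned long long)scale_convert(delta_ns, mult_default));
	printf("2.0 scale: %llu ns\n",
	       (unsigned long long)scale_convert(delta_ns, mult_double));
	return 0;
}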
+
+static ssize_t hmp_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ int temp;
+
+ if (hmp_attr->to_sysfs_text != NULL)
+ return hmp_attr->to_sysfs_text(buf, PAGE_SIZE);
+
+ temp = *(hmp_attr->value);
+ if (hmp_attr->to_sysfs != NULL)
+ temp = hmp_attr->to_sysfs(temp);
+
+ return (ssize_t)sprintf(buf, "%d\n", temp);
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int temp;
+ ssize_t ret = count;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ char *str = vmalloc(count + 1);
+ if (str == NULL)
+ return -ENOMEM;
+ memcpy(str, buf, count);
+ str[count] = 0;
+ if (sscanf(str, "%d", &temp) < 1)
+ ret = -EINVAL;
+ else {
+ if (hmp_attr->from_sysfs != NULL)
+ temp = hmp_attr->from_sysfs(temp);
+ if (temp < 0)
+ ret = -EINVAL;
+ else
+ *(hmp_attr->value) = temp;
+ }
+ vfree(str);
+ return ret;
+}
+
+static ssize_t hmp_print_domains(char *outbuf, int outbufsize)
+{
+ char buf[64];
+ const char nospace[] = "%s", space[] = " %s";
+ const char *fmt = nospace;
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int outpos = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ if (cpumask_scnprintf(buf, 64, &domain->possible_cpus)) {
+ outpos += sprintf(outbuf+outpos, fmt, buf);
+ fmt = space;
+ }
+ }
+ strcat(outbuf, "\n");
+ return outpos+1;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+static int hmp_period_tofrom_sysfs(int value)
+{
+ return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+#endif
+/* max value for threshold is 1024 */
+static int hmp_theshold_from_sysfs(int value)
+{
+ if (value > 1024)
+ return -1;
+ return value;
+}
+#if defined(CONFIG_SCHED_HMP_LITTLE_PACKING) || \
+ defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)
+/* toggle control is only 0,1 off/on */
+static int hmp_toggle_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+ return value;
+}
+#endif
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/* packing value must be non-negative */
+static int hmp_packing_from_sysfs(int value)
+{
+ if (value < 0)
+ return -1;
+ return value;
+}
+#endif
+static void hmp_attr_add(
+ const char *name,
+ int *value,
+ int (*to_sysfs)(int),
+ int (*from_sysfs)(int),
+ ssize_t (*to_sysfs_text)(char *, int),
+ umode_t mode)
+{
+ int i = 0;
+ while (hmp_data.attributes[i] != NULL) {
+ i++;
+ if (i >= HMP_DATA_SYSFS_MAX)
+ return;
+ }
+ if (mode)
+ hmp_data.attr[i].attr.mode = mode;
+ else
+ hmp_data.attr[i].attr.mode = 0644;
+ hmp_data.attr[i].show = hmp_show;
+ hmp_data.attr[i].store = hmp_store;
+ hmp_data.attr[i].attr.name = name;
+ hmp_data.attr[i].value = value;
+ hmp_data.attr[i].to_sysfs = to_sysfs;
+ hmp_data.attr[i].from_sysfs = from_sysfs;
+ hmp_data.attr[i].to_sysfs_text = to_sysfs_text;
+ hmp_data.attributes[i] = &hmp_data.attr[i].attr;
+ hmp_data.attributes[i + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+ int ret;
+ memset(&hmp_data, 0, sizeof(hmp_data));
+ hmp_attr_add("hmp_domains",
+ NULL,
+ NULL,
+ NULL,
+ hmp_print_domains,
+ 0444);
+ hmp_attr_add("up_threshold",
+ &hmp_up_threshold,
+ NULL,
+ hmp_theshold_from_sysfs,
+ NULL,
+ 0);
+ hmp_attr_add("down_threshold",
+ &hmp_down_threshold,
+ NULL,
+ hmp_theshold_from_sysfs,
+ NULL,
+ 0);
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ /* by default load_avg_period_ms == LOAD_AVG_PERIOD
+ * meaning no change
+ */
+ hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_PERIOD);
+ hmp_attr_add("load_avg_period_ms",
+ &hmp_data.multiplier,
+ hmp_period_tofrom_sysfs,
+ hmp_period_tofrom_sysfs,
+ NULL,
+ 0);
+#endif
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* default frequency-invariant scaling ON */
+ hmp_data.freqinvar_load_scale_enabled = 1;
+ hmp_attr_add("frequency_invariant_load_scale",
+ &hmp_data.freqinvar_load_scale_enabled,
+ NULL,
+ hmp_toggle_from_sysfs,
+ NULL,
+ 0);
+#endif
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+ hmp_attr_add("packing_enable",
+ &hmp_packing_enabled,
+ NULL,
+ hmp_toggle_from_sysfs,
+ NULL,
+ 0);
+ hmp_attr_add("packing_limit",
+ &hmp_full_threshold,
+ NULL,
+ hmp_packing_from_sysfs,
+ NULL,
+ 0);
+#endif
+ hmp_data.attr_group.name = "hmp";
+ hmp_data.attr_group.attrs = hmp_data.attributes;
+ ret = sysfs_create_group(kernel_kobj,
+ &hmp_data.attr_group);
+ return ret;
+}
+late_initcall(hmp_attr_init);
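Since the attribute group is registered with the name "hmp" directly under kernel_kobj, the knobs end up in /sys/kernel/hmp/. A small read-only userspace check; the paths follow from the code above, while which files actually exist depends on the CONFIG_SCHED_HMP options selected:

#include <stdio.h>

static void show(const char *path)
{
	char buf[128];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-38s %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/sys/kernel/hmp/hmp_domains");
	show("/sys/kernel/hmp/up_threshold");
	show("/sys/kernel/hmp/down_threshold");
	show("/sys/kernel/hmp/load_avg_period_ms");	/* CONFIG_HMP_VARIABLE_SCALE only */
	return 0;
}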
+/*
+ * return the load of the lowest-loaded CPU in a given HMP domain
+ * min_cpu optionally points to an int to receive the CPU.
+ * affinity optionally points to a cpumask containing the
+ * CPUs to be considered. note:
+ * + min_cpu = NR_CPUS only if no CPUs are in the set of
+ * affinity && hmp_domain cpus
+ * + min_cpu will always otherwise equal one of the CPUs in
+ * the hmp domain
+ * + when more than one CPU has the same load, the one which
+ * is least-recently-disturbed by an HMP migration will be
+ * selected
+ * + if all CPUs are equally loaded or idle and the times are
+ * all the same, the first in the set will be used
+ * + if affinity is not set, cpu_online_mask is used
+ */
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+ int *min_cpu, struct cpumask *affinity)
+{
+ int cpu;
+ int min_cpu_runnable_temp = NR_CPUS;
+ u64 min_target_last_migration = ULLONG_MAX;
+ u64 curr_last_migration;
+ unsigned long min_runnable_load = INT_MAX;
+ unsigned long contrib;
+ struct sched_avg *avg;
+ struct cpumask temp_cpumask;
+ /*
+ * only look at the allowed CPUs if an affinity mask is
+ * specified, otherwise look at all online CPUs in the
+ * right HMP domain
+ */
+ cpumask_and(&temp_cpumask, &hmpd->cpus, affinity ? affinity : cpu_online_mask);
+
+ for_each_cpu_mask(cpu, temp_cpumask) {
+ avg = &cpu_rq(cpu)->avg;
+ /* used for both up and down migration */
+ curr_last_migration = avg->hmp_last_up_migration ?
+ avg->hmp_last_up_migration : avg->hmp_last_down_migration;
+
+ contrib = avg->load_avg_ratio;
+ /*
+ * Consider a runqueue completely busy if there is any load
+ * on it. Definitely not the best for overall fairness, but
+ * does well in typical Android use cases.
+ */
+ if (contrib)
+ contrib = 1023;
+
+ if ((contrib < min_runnable_load) ||
+ (contrib == min_runnable_load &&
+ curr_last_migration < min_target_last_migration)) {
+ /*
+ * If the load is the same, target the CPU with
+ * the longest time since a migration.
+ * This spreads migration load between
+ * members of a domain more evenly when the
+ * domain is fully loaded.
+ */
+ min_runnable_load = contrib;
+ min_cpu_runnable_temp = cpu;
+ min_target_last_migration = curr_last_migration;
+ }
+ }
+
+ if (min_cpu)
+ *min_cpu = min_cpu_runnable_temp;
+
+ return min_runnable_load;
+}
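The selection rule above flattens any non-zero runqueue load to 1023, so in practice hmp_domain_min_load() only distinguishes "idle" from "busy" and then breaks ties using the oldest hmp_last_*_migration stamp. A simplified userspace model of that loop; affinity handling and the NR_CPUS sentinel are left out and the inputs are made up:

#include <stdio.h>
#include <stdint.h>

struct cpu_state {
	unsigned long load_avg_ratio;
	uint64_t last_migration;	/* ns; 0 means never disturbed */
};

static int domain_min_load(const struct cpu_state *cs, int ncpu,
			   unsigned long *min_load)
{
	unsigned long best_load = ~0UL;
	uint64_t best_stamp = UINT64_MAX;
	int cpu, best_cpu = -1;

	for (cpu = 0; cpu < ncpu; cpu++) {
		/* any load at all counts as a fully busy runqueue */
		unsigned long contrib = cs[cpu].load_avg_ratio ? 1023 : 0;

		if (contrib < best_load ||
		    (contrib == best_load &&
		     cs[cpu].last_migration < best_stamp)) {
			best_load = contrib;
			best_stamp = cs[cpu].last_migration;
			best_cpu = cpu;
		}
	}
	if (min_load)
		*min_load = best_load;
	return best_cpu;
}

int main(void)
{
	struct cpu_state cs[3] = {
		{ 200, 1000 },	/* busy, disturbed at t=1000 */
		{ 150,  500 },	/* busy, disturbed longest ago */
		{ 300, 2000 },	/* busy, disturbed most recently */
	};
	unsigned long load;
	int cpu = domain_min_load(cs, 3, &load);

	/* all CPUs are "busy" (1023), so the oldest stamp wins: cpu 1 */
	printf("picked cpu %d, load %lu\n", cpu, load);
	return 0;
}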
+
+/*
+ * Calculate the task starvation
+ * This is the ratio of actually running time vs. runnable time.
+ * If the two are equal the task is getting the cpu time it needs or
+ * it is alone on the cpu and the cpu is fully utilized.
+ */
+static inline unsigned int hmp_task_starvation(struct sched_entity *se)
+{
+ u32 starvation;
+
+ starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
+ starvation /= (se->avg.runnable_avg_sum + 1);
+
+ return scale_load(starvation);
+}
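hmp_task_starvation() reduces to running time as a fraction of runnable time on a 0..1024 scale (scale_load()/scale_load_down() are typically no-ops on the 32-bit builds HMP targets), and hmp_offload_down() below only proceeds when the value is at or below 768, i.e. the task spends at least 25% of its runnable time waiting. A small numeric sketch with made-up PELT sums:

#include <stdio.h>
#include <stdint.h>

#define NICE_0_LOAD 1024

/* starvation = usage / runnable, scaled so 1024 means the task got all
 * of the CPU time it was runnable for */
static unsigned int task_starvation(uint32_t usage_avg_sum,
				    uint32_t runnable_avg_sum)
{
	return usage_avg_sum * NICE_0_LOAD / (runnable_avg_sum + 1);
}

static void check(uint32_t usage, uint32_t runnable)
{
	unsigned int s = task_starvation(usage, runnable);

	printf("usage %5u / runnable %5u -> %4u/1024: %s\n",
	       usage, runnable, s,
	       s > 768 ? "getting enough CPU, skip offload"
		       : "starving, offload candidate");
}

int main(void)
{
	check(35000, 40000);	/* runs ~87% of its runnable time */
	check(20000, 40000);	/* runs ~50% of its runnable time */
	return 0;
}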
+
+static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
+{
+ int min_usage;
+ int dest_cpu = NR_CPUS;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return NR_CPUS;
+
+ /* Is there an idle CPU in the current domain? */
+ min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
+ if (min_usage == 0) {
+ trace_sched_hmp_offload_abort(cpu, min_usage, "load");
+ return NR_CPUS;
+ }
+
+ /* Is the task alone on the cpu? */
+ if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
+ trace_sched_hmp_offload_abort(cpu,
+ cpu_rq(cpu)->cfs.h_nr_running, "nr_running");
+ return NR_CPUS;
+ }
+
+ /* Is the task actually starving? */
+ /* A task waiting for >= 25% of its runnable time is considered starving */
+ if (hmp_task_starvation(se) > 768) {
+ trace_sched_hmp_offload_abort(cpu, hmp_task_starvation(se),
+ "starvation");
+ return NR_CPUS;
+ }
+
+ /* Does the slower domain have any idle CPUs? */
+ min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
+ tsk_cpus_allowed(task_of(se)));
+
+ if (min_usage == 0) {
+ trace_sched_hmp_offload_succeed(cpu, dest_cpu);
+ return dest_cpu;
+ } else
+ trace_sched_hmp_offload_abort(cpu, min_usage, "slowdomain");
+ return NR_CPUS;
+}
+#endif /* CONFIG_SCHED_HMP */
+
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -3338,6 +4253,19 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
if (p->nr_cpus_allowed == 1)
return prev_cpu;
+#ifdef CONFIG_SCHED_HMP
+ /* always put non-kernel forking tasks on a big domain */
+ if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu);
+ if (new_cpu != NR_CPUS) {
+ hmp_next_up_delay(&p->se, new_cpu);
+ return new_cpu;
+ }
+ /* failed to perform HMP fork balance, use normal balance */
+ new_cpu = cpu;
+ }
+#endif
+
if (sd_flag & SD_BALANCE_WAKE) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
want_affine = 1;
@@ -3412,6 +4340,31 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
unlock:
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ prev_cpu = task_cpu(p);
+
+ if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ return new_cpu;
+ }
+ if (hmp_down_migration(prev_cpu, &p->se)) {
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+ new_cpu = hmp_best_little_cpu(p, prev_cpu);
+#else
+ new_cpu = hmp_select_slower_cpu(p, prev_cpu);
+#endif
+ if (new_cpu != prev_cpu) {
+ hmp_next_down_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ return new_cpu;
+ }
+ }
+ /* Make sure that the task stays in its previous hmp domain */
+ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
+ return prev_cpu;
+#endif
+
return new_cpu;
}
@@ -3421,6 +4374,16 @@ unlock:
* load-balance).
*/
#ifdef CONFIG_FAIR_GROUP_SCHED
+
+#ifdef CONFIG_NO_HZ_COMMON
+static int nohz_test_cpu(int cpu);
+#else
+static inline int nohz_test_cpu(int cpu)
+{
+ return 0;
+}
+#endif
+
/*
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -3440,6 +4403,25 @@ migrate_task_rq_fair(struct task_struct *p, int next_cpu)
* be negative here since on-rq tasks have decay-count == 0.
*/
if (se->avg.decay_count) {
+ /*
+ * If we migrate a sleeping task away from a CPU
+ * which has the tick stopped, then both the clock_task
+ * and decay_counter will be out of date for that CPU
+ * and we will not decay load correctly.
+ */
+ if (!se->on_rq && nohz_test_cpu(task_cpu(p))) {
+ struct rq *rq = cpu_rq(task_cpu(p));
+ unsigned long flags;
+ /*
+ * Current CPU cannot be holding rq->lock in this
+ * circumstance, but another might be. We must hold
+ * rq->lock before we go poking around in its clocks
+ */
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ update_rq_clock(rq);
+ update_cfs_rq_blocked_load(cfs_rq, 0);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
se->avg.decay_count = -__synchronize_entity_decay(se);
atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
}
@@ -3945,7 +4927,6 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
-
tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
if (!tsk_cache_hot ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
@@ -5230,7 +6211,9 @@ out_one_pinned:
out:
return ld_moved;
}
-
+#ifdef CONFIG_SCHED_HMP
+static unsigned int hmp_idle_pull(int this_cpu);
+#endif
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -5275,7 +6258,10 @@ void idle_balance(int this_cpu, struct rq *this_rq)
}
}
rcu_read_unlock();
-
+#ifdef CONFIG_SCHED_HMP
+ if (!pulled_task)
+ pulled_task = hmp_idle_pull(this_cpu);
+#endif
raw_spin_lock(&this_rq->lock);
if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
@@ -5368,12 +6354,65 @@ static struct {
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
+/*
+ * nohz_test_cpu() is used when load tracking is enabled. The
+ * FAIR_GROUP_SCHED dependency below may be removed when load
+ * tracking guards are removed.
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int nohz_test_cpu(int cpu)
+{
+ return cpumask_test_cpu(cpu, nohz.idle_cpus_mask);
+}
+#endif
+
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+/*
+ * Decide if the tasks on the busy CPUs in the
+ * littlest domain would benefit from an idle balance
+ */
+static int hmp_packing_ilb_needed(int cpu)
+{
+ struct hmp_domain *hmp;
+ /* always allow ilb on non-slowest domain */
+ if (!hmp_cpu_is_slowest(cpu))
+ return 1;
+
+ /* if disabled, use normal ILB behaviour */
+ if (!hmp_packing_enabled)
+ return 1;
+
+ hmp = hmp_cpu_domain(cpu);
+ for_each_cpu_and(cpu, &hmp->cpus, nohz.idle_cpus_mask) {
+ /* only idle balance if a CPU is loaded over threshold */
+ if (cpu_rq(cpu)->avg.load_avg_ratio > hmp_full_threshold)
+ return 1;
+ }
+ return 0;
+}
+#endif
+
static inline int find_new_ilb(int call_cpu)
{
int ilb = cpumask_first(nohz.idle_cpus_mask);
+#ifdef CONFIG_SCHED_HMP
+ int ilb_needed = 1;
+
+ /* restrict nohz balancing to occur in the same hmp domain */
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+ if (ilb < nr_cpu_ids)
+ ilb_needed = hmp_packing_ilb_needed(ilb);
+#endif
+
+ if (ilb_needed && ilb < nr_cpu_ids && idle_cpu(ilb))
+ return ilb;
+#else
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
+#endif
return nr_cpu_ids;
}
@@ -5650,6 +6689,18 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
if (time_before(now, nohz.next_balance))
return 0;
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Bail out if there are no nohz CPUs in our
+ * HMP domain, since we will move tasks between
+ * domains through wakeup and force balancing
+ * as necessary based upon task load.
+ */
+ if (cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
+ return 0;
+#endif
+
if (rq->nr_running >= 2)
goto need_kick;
@@ -5682,6 +6733,558 @@ need_kick:
static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
#endif
+#ifdef CONFIG_SCHED_HMP
+/* Check if task should migrate to a faster cpu */
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ int temp_target_cpu;
+ u64 now;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return 0;
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if (p->prio >= hmp_up_prio)
+ return 0;
+#endif
+ if (se->avg.load_avg_ratio < hmp_up_threshold)
+ return 0;
+
+ /* Let the task load settle before doing another up migration */
+ /* hack - always use clock from first online CPU */
+ now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ if (((now - se->avg.hmp_last_up_migration) >> 10)
+ < hmp_next_up_threshold)
+ return 0;
+
+ /* hmp_domain_min_load only returns 0 for an
+ * idle CPU or 1023 for any partly-busy one.
+ * Be explicit about requirement for an idle CPU.
+ */
+ if (hmp_domain_min_load(hmp_faster_domain(cpu), &temp_target_cpu,
+ tsk_cpus_allowed(p)) == 0 && temp_target_cpu != NR_CPUS) {
+ if (target_cpu)
+ *target_cpu = temp_target_cpu;
+ return 1;
+ }
+ return 0;
+}
+
+/* Check if task should migrate to a slower cpu */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu)) {
+#ifdef CONFIG_SCHED_HMP_LITTLE_PACKING
+ if (hmp_packing_enabled)
+ return 1;
+ else
+#endif
+ return 0;
+ }
+
+#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
+ /* Filter by task priority */
+ if ((p->prio >= hmp_up_prio) &&
+ cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))) {
+ return 1;
+ }
+#endif
+
+ /* Let the task load settle before doing another down migration */
+ /* hack - always use clock from first online CPU */
+ now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ if (((now - se->avg.hmp_last_down_migration) >> 10)
+ < hmp_next_down_threshold)
+ return 0;
+
+ if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))
+ && se->avg.load_avg_ratio < hmp_down_threshold) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ put_task_struct(p);
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+/*
+ * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_idle_pull uses this to push a currently running task
+ * off a runqueue to a faster CPU.
+ * Locking is slightly different than usual.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_idle_pull_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance))
+ goto out_unlock;
+
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+ rcu_read_unlock();
+ double_unlock_balance(busiest_rq, target_rq);
+out_unlock:
+ put_task_struct(p);
+ busiest_rq->active_balance = 0;
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ return 0;
+}
+
+/*
+ * Move a task in a runnable state to another CPU.
+ *
+ * Modelled on 'active_load_balance_stop_cpu' with slight
+ * modifications to locking and the pre-transfer checks. Note
+ * that rq->lock must be held before calling.
+ */
+static void hmp_migrate_runnable_task(struct rq *rq)
+{
+ struct sched_domain *sd;
+ int src_cpu = cpu_of(rq);
+ struct rq *src_rq = rq;
+ int dst_cpu = rq->push_cpu;
+ struct rq *dst_rq = cpu_rq(dst_cpu);
+ struct task_struct *p = rq->migrate_task;
+ /*
+ * One last check to make sure nobody else is playing
+ * with the source rq.
+ */
+ if (src_rq->active_balance)
+ goto out;
+
+ if (src_rq->nr_running <= 1)
+ goto out;
+
+ if (task_rq(p) != src_rq)
+ goto out;
+ /*
+ * Not sure if this applies here but one can never
+ * be too cautious
+ */
+ BUG_ON(src_rq == dst_rq);
+
+ double_lock_balance(src_rq, dst_rq);
+
+ rcu_read_lock();
+ for_each_domain(dst_cpu, sd) {
+ if (cpumask_test_cpu(src_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = src_cpu,
+ .src_rq = src_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+ if (move_specific_task(&env, p))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+ }
+
+ rcu_read_unlock();
+ double_unlock_balance(src_rq, dst_rq);
+out:
+ put_task_struct(p);
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int cpu, target_cpu;
+ struct sched_entity *curr, *orig;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force, got_target;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return;
+ for_each_online_cpu(cpu) {
+ force = 0;
+ got_target = 0;
+ target = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ if (!entity_is_task(curr)) {
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = group_cfs_rq(curr);
+ while (cfs_rq) {
+ curr = cfs_rq->curr;
+ cfs_rq = group_cfs_rq(curr);
+ }
+ }
+ orig = curr;
+ curr = hmp_get_heaviest_task(curr, 1);
+ p = task_of(curr);
+ if (hmp_up_migration(cpu, &target_cpu, curr)) {
+ if (!target->active_balance) {
+ get_task_struct(p);
+ target->push_cpu = target_cpu;
+ target->migrate_task = p;
+ got_target = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ }
+ }
+ if (!got_target && !target->active_balance) {
+ /*
+ * For now we just check the currently running task.
+ * Selecting the lightest task for offloading will
+ * require extensive bookkeeping.
+ */
+ curr = hmp_get_lightest_task(orig, 1);
+ p = task_of(curr);
+ target->push_cpu = hmp_offload_down(cpu, curr);
+ if (target->push_cpu < NR_CPUS) {
+ get_task_struct(p);
+ target->migrate_task = p;
+ got_target = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_OFFLOAD);
+ hmp_next_down_delay(&p->se, target->push_cpu);
+ }
+ }
+ /*
+ * We have a target with no active_balance. If the task
+ * is not currently running, move it; otherwise let the
+ * CPU stopper take care of it.
+ */
+ if (got_target && !target->active_balance) {
+ if (!task_running(target, p)) {
+ trace_sched_hmp_migrate_force_running(p, 0);
+ hmp_migrate_runnable_task(target);
+ } else {
+ target->active_balance = 1;
+ force = 1;
+ }
+ }
+
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+ spin_unlock(&hmp_force_migration);
+}
+/*
+ * hmp_idle_pull looks at little domain runqueues to see
+ * if a task should be pulled.
+ *
+ * Reuses the hmp_force_migration spinlock.
+ */
+static unsigned int hmp_idle_pull(int this_cpu)
+{
+ int cpu;
+ struct sched_entity *curr, *orig;
+ struct hmp_domain *hmp_domain = NULL;
+ struct rq *target = NULL, *rq;
+ unsigned long flags, ratio = 0;
+ unsigned int force = 0;
+ struct task_struct *p = NULL;
+
+ if (!hmp_cpu_is_slowest(this_cpu))
+ hmp_domain = hmp_slower_domain(this_cpu);
+ if (!hmp_domain)
+ return 0;
+
+ if (!spin_trylock(&hmp_force_migration))
+ return 0;
+
+ /* first select a task */
+ for_each_cpu(cpu, &hmp_domain->cpus) {
+ rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ curr = rq->cfs.curr;
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ continue;
+ }
+ if (!entity_is_task(curr)) {
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = group_cfs_rq(curr);
+ while (cfs_rq) {
+ curr = cfs_rq->curr;
+ if (!entity_is_task(curr))
+ cfs_rq = group_cfs_rq(curr);
+ else
+ cfs_rq = NULL;
+ }
+ }
+ orig = curr;
+ curr = hmp_get_heaviest_task(curr, 1);
+ if (curr->avg.load_avg_ratio > hmp_up_threshold &&
+ curr->avg.load_avg_ratio > ratio) {
+ p = task_of(curr);
+ target = rq;
+ ratio = curr->avg.load_avg_ratio;
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ if (!p)
+ goto done;
+
+ /* now we have a candidate */
+ raw_spin_lock_irqsave(&target->lock, flags);
+ if (!target->active_balance && task_rq(p) == target) {
+ get_task_struct(p);
+ target->push_cpu = this_cpu;
+ target->migrate_task = p;
+ trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_IDLE_PULL);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ /*
+ * if the task isn't running, move it right away.
+ * Otherwise set up the active_balance mechanism and let
+ * the CPU stopper do its job.
+ */
+ if (!task_running(target, p)) {
+ trace_sched_hmp_migrate_idle_running(p, 0);
+ hmp_migrate_runnable_task(target);
+ } else {
+ target->active_balance = 1;
+ force = 1;
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+
+ if (force) {
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_idle_pull_cpu_stop,
+ target, &target->active_balance_work);
+ }
+done:
+ spin_unlock(&hmp_force_migration);
+ return force;
+}
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -5693,6 +7296,8 @@ static void run_rebalance_domains(struct softirq_action *h)
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+ hmp_force_up_migration(this_cpu);
+
rebalance_domains(this_cpu, idle);
/*
@@ -5725,11 +7330,17 @@ void trigger_load_balance(struct rq *rq, int cpu)
static void rq_online_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_online_cpu(rq->cpu);
+#endif
update_sysctl();
}
static void rq_offline_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_offline_cpu(rq->cpu);
+#endif
update_sysctl();
/* Ensure any throttled groups are reachable by pick_next_task */
@@ -5777,11 +7388,15 @@ static void task_fork_fair(struct task_struct *p)
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
@@ -5831,15 +7446,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
@@ -6060,7 +7675,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}
@@ -6192,6 +7808,139 @@ __init void init_sched_fair_class(void)
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
+
+#ifdef CONFIG_SCHED_HMP
+ hmp_cpu_mask_setup();
+#endif
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ u32 result = curr / max;
+ return result;
+}
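cpufreq_calc_scale() relies on the policy callback below having stored max as policy->max >> SCHED_FREQSCALE_SHIFT, so the plain division already yields a 0..1024 fixed-point scale (1024 at the policy maximum). A quick numeric check, assuming SCHED_FREQSCALE_SHIFT is 10 (the definition lives outside this hunk):

#include <stdio.h>

#define SCHED_FREQSCALE_SHIFT 10	/* assumed: 1024 == 1.0 */

static unsigned int calc_scale(unsigned int stored_max, unsigned int curr_khz)
{
	return curr_khz / stored_max;	/* same form as cpufreq_calc_scale() */
}

int main(void)
{
	unsigned int policy_max = 1800000;			/* kHz */
	unsigned int stored_max = policy_max >> SCHED_FREQSCALE_SHIFT;

	printf("scale at  900 MHz: %u/1024\n", calc_scale(stored_max,  900000));
	printf("scale at 1800 MHz: %u/1024\n", calc_scale(stored_max, 1800000));
	return 0;
}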
+
+/* Called when the CPU frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_INCOMPATIBLE)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int ret = 0;
+
+ /* init safe defaults since there are no policies at registration */
+ for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+ /* safe defaults */
+ freq_scale[ret].max = 1024;
+ freq_scale[ret].min = 1024;
+ freq_scale[ret].curr_scale = 1024;
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (ret != -EINVAL)
+ ret = cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+
+ return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 127a2c4cf4a..15334e6de83 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -964,6 +964,13 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -973,6 +980,13 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /*
+ * Change rq's cpupri only if rt_rq is the top queue.
+ */
+ if (&rq->rt != rt_rq)
+ return;
+#endif
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d615..989c5aec3a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -142,7 +142,7 @@ struct task_group {
atomic_t load_weight;
atomic64_t load_avg;
- atomic_t runnable_avg;
+ atomic_t runnable_avg, usage_avg;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -279,7 +279,7 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
/* These always depend on CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_FAIR_GROUP_SCHED
- u32 tg_runnable_contrib;
+ u32 tg_runnable_contrib, tg_usage_contrib;
u64 tg_load_contrib;
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -464,6 +464,9 @@ struct rq {
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+#ifdef CONFIG_SCHED_HMP
+ struct task_struct *migrate_task;
+#endif
/* cpu of this runqueue: */
int cpu;
int online;
@@ -642,6 +645,12 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#ifdef CONFIG_SCHED_HMP
+static LIST_HEAD(hmp_domains);
+DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu)))
+#endif /* CONFIG_SCHED_HMP */
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1318,7 +1327,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
-extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
+extern void cfs_bandwidth_usage_inc(void);
+extern void cfs_bandwidth_usage_dec(void);
#ifdef CONFIG_NO_HZ_COMMON
enum rq_nohz_flag_bits {
diff --git a/kernel/smp.c b/kernel/smp.c
index 4dba0f7b72a..23ccc67dcbb 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,6 +12,8 @@
#include <linux/gfp.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/smp.h>
#include "smpboot.h"
@@ -159,8 +161,10 @@ void generic_exec_single(int cpu, struct call_single_data *csd, int wait)
* locking and barrier primitives. Generic code isn't really
* equipped to do the right thing...
*/
- if (ipi)
+ if (ipi) {
+ trace_smp_call_func_send(csd->func, cpu);
arch_send_call_function_single_ipi(cpu);
+ }
if (wait)
csd_lock_wait(csd);
@@ -197,8 +201,9 @@ void generic_smp_call_function_single_interrupt(void)
* so save them away before making the call:
*/
csd_flags = csd->flags;
-
+ trace_smp_call_func_entry(csd->func);
csd->func(csd->info);
+ trace_smp_call_func_exit(csd->func);
/*
* Unlocked CSDs are valid through generic_exec_single():
@@ -228,6 +233,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int this_cpu;
int err = 0;
+ trace_smp_call_func_send(func, cpu);
/*
* prevent preemption and reschedule on another processor,
* as well as CPU removal
@@ -245,7 +251,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
if (cpu == this_cpu) {
local_irq_save(flags);
+ trace_smp_call_func_entry(func);
func(info);
+ trace_smp_call_func_exit(func);
local_irq_restore(flags);
} else {
if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3d6833f125d..787b3a03242 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -330,10 +330,19 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (!force_irqthreads)
- __do_softirq();
- else
+ if (!force_irqthreads) {
+ /*
+ * We can safely execute softirq on the current stack if
+ * it is the irq stack, because it should be near empty
+ * at this stage. But we have no way to know if the arch
+ * calls irq_exit() on the irq stack. So call softirq
+ * in its own stack to prevent from any overrun on top
+ * of a potentially deep task stack.
+ */
+ do_softirq();
+ } else {
wakeup_softirqd();
+ }
}
static inline void tick_irq_exit(void)
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index f11d83b1294..a8f5084dcde 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -445,7 +445,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
return hrtimer_get_res(baseid, tp);
}
@@ -462,7 +462,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
if (!alarmtimer_get_rtcdev())
- return -ENOTSUPP;
+ return -EINVAL;
*tp = ktime_to_timespec(base->gettime());
return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index c6d6400ee13..9df0e3b19f0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -15,7 +15,6 @@
#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/module.h>
-#include <linux/notifier.h>
#include <linux/smp.h>
#include "tick-internal.h"
@@ -23,36 +22,67 @@
/* The registered clock event devices */
static LIST_HEAD(clockevent_devices);
static LIST_HEAD(clockevents_released);
-
-/* Notification for clock events */
-static RAW_NOTIFIER_HEAD(clockevents_chain);
-
/* Protection for the above */
static DEFINE_RAW_SPINLOCK(clockevents_lock);
-/**
- * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
- * @latch: value to convert
- * @evt: pointer to clock event device descriptor
- *
- * Math helper, returns latch value converted to nanoseconds (bound checked)
- */
-u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
+ bool ismax)
{
u64 clc = (u64) latch << evt->shift;
+ u64 rnd;
if (unlikely(!evt->mult)) {
evt->mult = 1;
WARN_ON(1);
}
+ rnd = (u64) evt->mult - 1;
+
+ /*
+ * Upper bound sanity check. If the backwards conversion is
+ * not equal latch, we know that the above shift overflowed.
+ */
+ if ((clc >> evt->shift) != (u64)latch)
+ clc = ~0ULL;
+
+ /*
+ * Scaled math oddities:
+ *
+ * For mult <= (1 << shift) we can safely add mult - 1 to
+ * prevent integer rounding loss. So the backwards conversion
+ * from nsec to device ticks will be correct.
+ *
+ * For mult > (1 << shift), i.e. device frequency is > 1GHz we
+ * need to be careful. Adding mult - 1 will result in a value
+ * which when converted back to device ticks can be larger
+ * than latch by up to (mult - 1) >> shift. For the min_delta
+ * calculation we still want to apply this in order to stay
+ * above the minimum device ticks limit. For the upper limit
+ * we would end up with a latch value larger than the upper
+ * limit of the device, so we omit the add to stay below the
+ * device upper boundary.
+ *
+ * Also omit the add if it would overflow the u64 boundary.
+ */
+ if ((~0ULL - clc > rnd) &&
+ (!ismax || evt->mult <= (1U << evt->shift)))
+ clc += rnd;
do_div(clc, evt->mult);
- if (clc < 1000)
- clc = 1000;
- if (clc > KTIME_MAX)
- clc = KTIME_MAX;
- return clc;
+ /* Deltas less than 1usec are pointless noise */
+ return clc > 1000 ? clc : 1000;
+}
+
+/**
+ * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+ return cev_delta2ns(latch, evt, false);
}
EXPORT_SYMBOL_GPL(clockevent_delta2ns);
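The round-up term matters most for the min_delta conversion: without it, a small tick count converted to nanoseconds and back can land below the device minimum. A stand-alone sketch with made-up mult/shift values (a hypothetical ~700 MHz device; the 1000 ns clamp and the overflow check are left out for brevity):

#include <inttypes.h>
#include <stdio.h>

/* Illustrative only: ticks = (ns * MULT) >> SHIFT, about 0.7 ticks/ns. */
#define SHIFT	32
#define MULT	3006477107ULL	/* ~0.7 * 2^32 */

static uint64_t ticks_to_ns(uint64_t latch, int roundup)
{
	uint64_t clc = latch << SHIFT;

	if (roundup)
		clc += MULT - 1;	/* compensate the truncating division */
	return clc / MULT;
}

static uint64_t ns_to_ticks(uint64_t ns)
{
	return (ns * MULT) >> SHIFT;
}

int main(void)
{
	uint64_t latch = 3;	/* pretend min_delta_ticks is 3 */
	uint64_t plain = ticks_to_ns(latch, 0);
	uint64_t round = ticks_to_ns(latch, 1);

	printf("plain:    %" PRIu64 " ns -> %" PRIu64 " ticks\n",
	       plain, ns_to_ticks(plain));
	printf("round-up: %" PRIu64 " ns -> %" PRIu64 " ticks\n",
	       round, ns_to_ticks(round));
	return 0;
}

With these numbers the plain conversion gives 4 ns, which maps back to only 2 ticks, while the rounded conversion gives 5 ns and stays at or above the 3-tick minimum; for the max_delta case the same round-up could instead push the result past the device limit, which is why cev_delta2ns() skips it when ismax is set and mult exceeds 1 << shift.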
@@ -232,30 +262,6 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
return (rc && force) ? clockevents_program_min_delta(dev) : rc;
}
-/**
- * clockevents_register_notifier - register a clock events change listener
- */
-int clockevents_register_notifier(struct notifier_block *nb)
-{
- unsigned long flags;
- int ret;
-
- raw_spin_lock_irqsave(&clockevents_lock, flags);
- ret = raw_notifier_chain_register(&clockevents_chain, nb);
- raw_spin_unlock_irqrestore(&clockevents_lock, flags);
-
- return ret;
-}
-
-/*
- * Notify about a clock event change. Called with clockevents_lock
- * held.
- */
-static void clockevents_do_notify(unsigned long reason, void *dev)
-{
- raw_notifier_call_chain(&clockevents_chain, reason, dev);
-}
-
/*
* Called after a notify add to make devices available which were
* released from the notifier call.
@@ -269,7 +275,7 @@ static void clockevents_notify_released(void)
struct clock_event_device, list);
list_del(&dev->list);
list_add(&dev->list, &clockevent_devices);
- clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ tick_check_new_device(dev);
}
}
@@ -290,7 +296,7 @@ void clockevents_register_device(struct clock_event_device *dev)
raw_spin_lock_irqsave(&clockevents_lock, flags);
list_add(&dev->list, &clockevent_devices);
- clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+ tick_check_new_device(dev);
clockevents_notify_released();
raw_spin_unlock_irqrestore(&clockevents_lock, flags);
@@ -317,8 +323,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
sec = 600;
clockevents_calc_mult_shift(dev, freq, sec);
- dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
- dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
+ dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
+ dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
}
/**
@@ -386,6 +392,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
* released list and do a notify add later.
*/
if (old) {
+ module_put(old->owner);
clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
list_del(&old->list);
list_add(&old->list, &clockevents_released);
@@ -433,7 +440,7 @@ void clockevents_notify(unsigned long reason, void *arg)
int cpu;
raw_spin_lock_irqsave(&clockevents_lock, flags);
- clockevents_do_notify(reason, arg);
+ tick_notify(reason, arg);
switch (reason) {
case CLOCK_EVT_NOTIFY_CPU_DEAD:
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 7a925ba456f..a6a5bf53e86 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -51,7 +51,13 @@
* HZ shrinks, so values greater than 8 overflow 32bits when
* HZ=100.
*/
+#if HZ < 34
+#define JIFFIES_SHIFT 6
+#elif HZ < 67
+#define JIFFIES_SHIFT 7
+#else
#define JIFFIES_SHIFT 8
+#endif
static cycle_t jiffies_read(struct clocksource *cs)
{
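The new #if ladder keeps the jiffies clocksource multiplier within 32 bits for low HZ values. A quick user-space check, approximating NSEC_PER_JIFFY as NSEC_PER_SEC / HZ and the multiplier as NSEC_PER_JIFFY << JIFFIES_SHIFT (the in-kernel definitions carry an extra rounding factor):

#include <stdint.h>
#include <stdio.h>

/* mult ~= (NSEC_PER_SEC / HZ) << JIFFIES_SHIFT and must fit in a u32. */
int main(void)
{
	static const struct { unsigned int hz, shift; } c[] = {
		{   24, 8 }, {   24, 6 },
		{   48, 8 }, {   48, 7 },
		{  100, 8 }, { 1000, 8 },
	};

	for (unsigned int i = 0; i < sizeof(c) / sizeof(c[0]); i++) {
		uint64_t mult = (1000000000ULL / c[i].hz) << c[i].shift;

		printf("HZ=%-4u shift=%u mult=%11llu  %s\n",
		       c[i].hz, c[i].shift, (unsigned long long)mult,
		       mult <= 0xffffffffULL ? "fits in u32" : "overflows u32");
	}
	return 0;
}

The overflowing rows are exactly the configurations the ladder steers to a smaller shift.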
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 8f5b3b98577..af8d1d4f3d5 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
* called as close as possible to 500 ms before the new second starts.
* This code is run on a timer. If the clock is set, that timer
* may not expire at the correct time. Thus, we adjust...
+ * We want the clock to be within a couple of ticks from the target.
*/
if (!ntp_synced()) {
/*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
}
getnstimeofday(&now);
- if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) {
+ if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
struct timespec adjust = now;
fail = -ENODEV;
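To put the relaxed tolerance in numbers, here is a throwaway calculation of the acceptance window around the half-second mark, approximating tick_nsec as NSEC_PER_SEC / HZ (the real value includes the NTP adjustment):

#include <stdio.h>

/* Old window: +/- tick_nsec / 2, new window: +/- tick_nsec * 5. */
int main(void)
{
	static const unsigned int hz[] = { 100, 250, 1000 };

	for (unsigned int i = 0; i < sizeof(hz) / sizeof(hz[0]); i++) {
		unsigned long tick_nsec = 1000000000UL / hz[i];

		printf("HZ=%-4u old: +/- %8lu ns   new: +/- %9lu ns\n",
		       hz[i], tick_nsec / 2, tick_nsec * 5);
	}
	return 0;
}

On an HZ=100 kernel, for example, the window grows from +/- 5 ms to +/- 50 ms, so a slightly delayed work item can still hit it.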
@@ -516,13 +517,13 @@ static void sync_cmos_clock(struct work_struct *work)
schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
}
-static void notify_cmos_timer(void)
+void ntp_notify_cmos_timer(void)
{
schedule_delayed_work(&sync_cmos_work, 0);
}
#else
-static inline void notify_cmos_timer(void) { }
+void ntp_notify_cmos_timer(void) { }
#endif
@@ -687,8 +688,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai)
if (!(time_status & STA_NANO))
txc->time.tv_usec /= NSEC_PER_USEC;
- notify_cmos_timer();
-
return result;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 20d6fba7065..19ee339a1d0 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -19,6 +19,7 @@
#include <linux/profile.h>
#include <linux/sched.h>
#include <linux/smp.h>
+#include <linux/module.h>
#include "tick-internal.h"
@@ -29,6 +30,7 @@
static struct tick_device tick_broadcast_device;
static cpumask_var_t tick_broadcast_mask;
+static cpumask_var_t tick_broadcast_on;
static cpumask_var_t tmpmask;
static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
static int tick_broadcast_force;
@@ -64,17 +66,34 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
/*
* Check, if the device can be utilized as broadcast device:
*/
-int tick_check_broadcast_device(struct clock_event_device *dev)
+static bool tick_check_broadcast_device(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (newdev->features & CLOCK_EVT_FEAT_C3STOP))
+ return false;
+
+ if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT &&
+ !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+
+ return !curdev || newdev->rating > curdev->rating;
+}
+
+/*
+ * Conditionally install/replace broadcast device
+ */
+void tick_install_broadcast_device(struct clock_event_device *dev)
{
struct clock_event_device *cur = tick_broadcast_device.evtdev;
- if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
- (tick_broadcast_device.evtdev &&
- tick_broadcast_device.evtdev->rating >= dev->rating) ||
- (dev->features & CLOCK_EVT_FEAT_C3STOP))
- return 0;
+ if (!tick_check_broadcast_device(cur, dev))
+ return;
- clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+ if (!try_module_get(dev->owner))
+ return;
+
+ clockevents_exchange_device(cur, dev);
if (cur)
cur->event_handler = clockevents_handle_noop;
tick_broadcast_device.evtdev = dev;
@@ -90,7 +109,6 @@ int tick_check_broadcast_device(struct clock_event_device *dev)
*/
if (dev->features & CLOCK_EVT_FEAT_ONESHOT)
tick_clock_notify();
- return 1;
}
/*
@@ -123,8 +141,9 @@ static void tick_device_setup_broadcast_func(struct clock_event_device *dev)
*/
int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
unsigned long flags;
- int ret = 0;
+ int ret;
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
@@ -138,20 +157,59 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
dev->event_handler = tick_handle_periodic;
tick_device_setup_broadcast_func(dev);
cpumask_set_cpu(cpu, tick_broadcast_mask);
- tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+ tick_broadcast_start_periodic(bc);
ret = 1;
} else {
/*
- * When the new device is not affected by the stop
- * feature and the cpu is marked in the broadcast mask
- * then clear the broadcast bit.
+ * Clear the broadcast bit for this cpu if the
+ * device is not power state affected.
*/
- if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
- int cpu = smp_processor_id();
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
cpumask_clear_cpu(cpu, tick_broadcast_mask);
- tick_broadcast_clear_oneshot(cpu);
- } else {
+ else
tick_device_setup_broadcast_func(dev);
+
+ /*
+ * Clear the broadcast bit if the CPU is not in
+ * periodic broadcast on state.
+ */
+ if (!cpumask_test_cpu(cpu, tick_broadcast_on))
+ cpumask_clear_cpu(cpu, tick_broadcast_mask);
+
+ switch (tick_broadcast_device.mode) {
+ case TICKDEV_MODE_ONESHOT:
+ /*
+ * If the system is in oneshot mode we can
+ * unconditionally clear the oneshot mask bit,
+ * because the CPU is running and therefore
+ * not in an idle state which causes the power
+ * state affected device to stop. Let the
+ * caller initialize the device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
+ ret = 0;
+ break;
+
+ case TICKDEV_MODE_PERIODIC:
+ /*
+ * If the system is in periodic mode, check
+ * whether the broadcast device can be
+ * switched off now.
+ */
+ if (cpumask_empty(tick_broadcast_mask) && bc)
+ clockevents_shutdown(bc);
+ /*
+ * If we kept the cpu in the broadcast mask,
+ * tell the caller to leave the per cpu device
+ * in shutdown state. The periodic interrupt
+ * is delivered by the broadcast device.
+ */
+ ret = cpumask_test_cpu(cpu, tick_broadcast_mask);
+ break;
+ default:
+ /* Nothing to do */
+ ret = 0;
+ break;
}
}
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -281,6 +339,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
switch (*reason) {
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+ cpumask_set_cpu(cpu, tick_broadcast_on);
if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
@@ -290,8 +349,12 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
tick_broadcast_force = 1;
break;
case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
- if (!tick_broadcast_force &&
- cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
+ if (tick_broadcast_force)
+ break;
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
+ if (!tick_device_is_functional(dev))
+ break;
+ if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_mask)) {
if (tick_broadcast_device.mode ==
TICKDEV_MODE_PERIODIC)
tick_setup_periodic(dev, 0);
@@ -349,6 +412,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
bc = tick_broadcast_device.evtdev;
cpumask_clear_cpu(cpu, tick_broadcast_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_on);
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
if (bc && cpumask_empty(tick_broadcast_mask))
@@ -475,7 +539,15 @@ void tick_check_oneshot_broadcast(int cpu)
if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) {
struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ /*
+ * We might be in the middle of switching over from
+ * periodic to oneshot. If the CPU has not yet
+ * switched over, leave the device alone.
+ */
+ if (td->mode == TICKDEV_MODE_ONESHOT) {
+ clockevents_set_mode(td->evtdev,
+ CLOCK_EVT_MODE_ONESHOT);
+ }
}
}
@@ -522,6 +594,13 @@ again:
cpumask_clear(tick_broadcast_force_mask);
/*
+ * Sanity check. Catch the case where we try to broadcast to
+ * offline cpus.
+ */
+ if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask)))
+ cpumask_and(tmpmask, tmpmask, cpu_online_mask);
+
+ /*
* Wakeup the cpus which have an expired event.
*/
tick_do_broadcast(tmpmask);
@@ -673,6 +752,7 @@ out:
static void tick_broadcast_clear_oneshot(int cpu)
{
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
}
static void tick_broadcast_init_next_event(struct cpumask *mask,
@@ -761,10 +841,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
/*
- * Clear the broadcast mask flag for the dead cpu, but do not
- * stop the broadcast device!
+ * Clear the broadcast masks for the dead cpu, but do not stop
+ * the broadcast device!
*/
cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_pending_mask);
+ cpumask_clear_cpu(cpu, tick_broadcast_force_mask);
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -792,6 +874,7 @@ bool tick_broadcast_oneshot_available(void)
void __init tick_broadcast_init(void)
{
zalloc_cpumask_var(&tick_broadcast_mask, GFP_NOWAIT);
+ zalloc_cpumask_var(&tick_broadcast_on, GFP_NOWAIT);
zalloc_cpumask_var(&tmpmask, GFP_NOWAIT);
#ifdef CONFIG_TICK_ONESHOT
zalloc_cpumask_var(&tick_broadcast_oneshot_mask, GFP_NOWAIT);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 5d3fb100bc0..086216c433f 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,6 +18,7 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/module.h>
#include <asm/irq_regs.h>
@@ -194,7 +195,8 @@ static void tick_setup_device(struct tick_device *td,
* When global broadcasting is active, check if the current
* device is registered as a placeholder for broadcast mode.
* This allows us to handle this x86 misfeature in a generic
- * way.
+ * way. This function also returns !=0 when we keep the
+ * current active broadcast state for this CPU.
*/
if (tick_device_uses_broadcast(newdev, cpu))
return;
@@ -205,14 +207,50 @@ static void tick_setup_device(struct tick_device *td,
tick_setup_oneshot(newdev, handler, next_event);
}
+static bool tick_check_percpu(struct clock_event_device *curdev,
+ struct clock_event_device *newdev, int cpu)
+{
+ if (!cpumask_test_cpu(cpu, newdev->cpumask))
+ return false;
+ if (cpumask_equal(newdev->cpumask, cpumask_of(cpu)))
+ return true;
+ /* Check if irq affinity can be set */
+ if (newdev->irq >= 0 && !irq_can_set_affinity(newdev->irq))
+ return false;
+ /* Prefer an existing cpu local device */
+ if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
+ return false;
+ return true;
+}
+
+static bool tick_check_preferred(struct clock_event_device *curdev,
+ struct clock_event_device *newdev)
+{
+ /* Prefer oneshot capable device */
+ if (!(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) {
+ if (curdev && (curdev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return false;
+ if (tick_oneshot_mode_active())
+ return false;
+ }
+
+ /*
+ * Use the higher rated one, but prefer a CPU local device with a lower
+ * rating than a non-CPU local device
+ */
+ return !curdev ||
+ newdev->rating > curdev->rating ||
+ !cpumask_equal(curdev->cpumask, newdev->cpumask);
+}
+
/*
* Check, if the new registered device should be used.
*/
-static int tick_check_new_device(struct clock_event_device *newdev)
+void tick_check_new_device(struct clock_event_device *newdev)
{
struct clock_event_device *curdev;
struct tick_device *td;
- int cpu, ret = NOTIFY_OK;
+ int cpu;
unsigned long flags;
raw_spin_lock_irqsave(&tick_device_lock, flags);
@@ -225,40 +263,15 @@ static int tick_check_new_device(struct clock_event_device *newdev)
curdev = td->evtdev;
/* cpu local device ? */
- if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) {
-
- /*
- * If the cpu affinity of the device interrupt can not
- * be set, ignore it.
- */
- if (!irq_can_set_affinity(newdev->irq))
- goto out_bc;
+ if (!tick_check_percpu(curdev, newdev, cpu))
+ goto out_bc;
- /*
- * If we have a cpu local device already, do not replace it
- * by a non cpu local device
- */
- if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu)))
- goto out_bc;
- }
+ /* Preference decision */
+ if (!tick_check_preferred(curdev, newdev))
+ goto out_bc;
- /*
- * If we have an active device, then check the rating and the oneshot
- * feature.
- */
- if (curdev) {
- /*
- * Prefer one shot capable devices !
- */
- if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) &&
- !(newdev->features & CLOCK_EVT_FEAT_ONESHOT))
- goto out_bc;
- /*
- * Check the rating
- */
- if (curdev->rating >= newdev->rating)
- goto out_bc;
- }
+ if (!try_module_get(newdev->owner))
+ return;
/*
* Replace the eventually existing device by the new
@@ -275,18 +288,14 @@ static int tick_check_new_device(struct clock_event_device *newdev)
tick_oneshot_notify();
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
- return NOTIFY_STOP;
+ return;
out_bc:
/*
* Can the new device be used as a broadcast device ?
*/
- if (tick_check_broadcast_device(newdev))
- ret = NOTIFY_STOP;
-
+ tick_install_broadcast_device(newdev);
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
-
- return ret;
}
/*
@@ -360,17 +369,10 @@ static void tick_resume(void)
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
}
-/*
- * Notification about clock event devices
- */
-static int tick_notify(struct notifier_block *nb, unsigned long reason,
- void *dev)
+void tick_notify(unsigned long reason, void *dev)
{
switch (reason) {
- case CLOCK_EVT_NOTIFY_ADD:
- return tick_check_new_device(dev);
-
case CLOCK_EVT_NOTIFY_BROADCAST_ON:
case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
@@ -404,21 +406,12 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
default:
break;
}
-
- return NOTIFY_OK;
}
-static struct notifier_block tick_notifier = {
- .notifier_call = tick_notify,
-};
-
/**
* tick_init - initialize the tick control
- *
- * Register the notifier with the clockevents framework
*/
void __init tick_init(void)
{
- clockevents_register_notifier(&tick_notifier);
tick_broadcast_init();
}
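The split into tick_check_percpu() and tick_check_preferred() makes the selection rules above easier to follow. A stand-alone toy model of those two checks (simplified: the cpumask test, the oneshot-mode-active case, and the cpumask-equality clause are omitted, and the device names are only illustrative) shows one property both the old and the new code enforce, namely that a global device never displaces an existing per-CPU one, whatever its rating:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the per-cpu and preference checks; illustration only. */
struct toydev {
	const char *name;
	int rating;
	bool percpu;		/* cpumask == cpumask_of(cpu) */
	bool oneshot;		/* CLOCK_EVT_FEAT_ONESHOT */
	bool irq_affinity;	/* irq affinity can be set */
};

static bool check_percpu(const struct toydev *curdev, const struct toydev *newdev)
{
	if (newdev->percpu)
		return true;
	if (!newdev->irq_affinity)
		return false;
	/* Never replace an existing cpu-local device by a global one. */
	return !(curdev && curdev->percpu);
}

static bool check_preferred(const struct toydev *curdev, const struct toydev *newdev)
{
	/* Prefer oneshot capable devices. */
	if (!newdev->oneshot && curdev && curdev->oneshot)
		return false;
	return !curdev || newdev->rating > curdev->rating;
}

int main(void)
{
	struct toydev lapic = { "lapic", 100, true,  true, true };
	struct toydev hpet  = { "hpet",  110, false, true, true };

	printf("replace %s by %s? %s\n", lapic.name, hpet.name,
	       check_percpu(&lapic, &hpet) && check_preferred(&lapic, &hpet) ?
	       "yes" : "no");
	return 0;
}

The answer is "no": the higher rated but non-local device falls through to the out_bc path and is offered to tick_install_broadcast_device() instead.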
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f0299eae460..60742fe6f63 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -18,6 +18,8 @@ extern int tick_do_timer_cpu __read_mostly;
extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
extern void tick_handle_periodic(struct clock_event_device *dev);
+extern void tick_notify(unsigned long reason, void *dev);
+extern void tick_check_new_device(struct clock_event_device *dev);
extern void clockevents_shutdown(struct clock_event_device *dev);
@@ -90,7 +92,7 @@ static inline bool tick_broadcast_oneshot_available(void) { return false; }
*/
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
-extern int tick_check_broadcast_device(struct clock_event_device *dev);
+extern void tick_install_broadcast_device(struct clock_event_device *dev);
extern int tick_is_broadcast_device(struct clock_event_device *dev);
extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
extern void tick_shutdown_broadcast(unsigned int *cpup);
@@ -102,9 +104,8 @@ tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
#else /* !BROADCAST */
-static inline int tick_check_broadcast_device(struct clock_event_device *dev)
+static inline void tick_install_broadcast_device(struct clock_event_device *dev)
{
- return 0;
}
static inline int tick_is_broadcast_device(struct clock_event_device *dev)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0cf1c145318..4251374578b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -832,13 +832,10 @@ void tick_nohz_irq_exit(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
- if (ts->inidle) {
- /* Cancel the timer because CPU already waken up from the C-states*/
- menu_hrtimer_cancel();
+ if (ts->inidle)
__tick_nohz_idle_enter(ts);
- } else {
+ else
tick_nohz_full_stop_tick(ts);
- }
}
/**
@@ -936,8 +933,6 @@ void tick_nohz_idle_exit(void)
ts->inidle = 0;
- /* Cancel the timer because CPU already waken up from the C-states*/
- menu_hrtimer_cancel();
if (ts->idle_active || ts->tick_stopped)
now = ktime_get();
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index baeeb5c87cf..76fefb1613b 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -72,7 +72,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm)
tk->wall_to_monotonic = wtm;
set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec);
tk->offs_real = timespec_to_ktime(tmp);
- tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0));
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0));
}
static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t)
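The switch from ktime_sub() to ktime_add() follows from the offset definitions: offs_real is set just above to -wall_to_monotonic, i.e. the value added to the monotonic clock to obtain CLOCK_REALTIME. A short derivation (a sketch, using the usual TAI = UTC + tai_offset relation):

/*
 * CLOCK_REALTIME = CLOCK_MONOTONIC + offs_real
 * CLOCK_TAI      = CLOCK_REALTIME  + tai_offset
 *                = CLOCK_MONOTONIC + (offs_real + tai_offset)
 *
 * => offs_tai = offs_real + tai_offset, an addition rather than a
 *    subtraction, which is what this hunk and the next one apply.
 */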
@@ -590,7 +590,7 @@ s32 timekeeping_get_tai_offset(void)
static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset)
{
tk->tai_offset = tai_offset;
- tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0));
+ tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0));
}
/**
@@ -605,6 +605,7 @@ void timekeeping_set_tai_offset(s32 tai_offset)
raw_spin_lock_irqsave(&timekeeper_lock, flags);
write_seqcount_begin(&timekeeper_seq);
__timekeeping_set_tai_offset(tk, tai_offset);
+ timekeeping_update(tk, false, true);
write_seqcount_end(&timekeeper_seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
clock_was_set();
@@ -1007,6 +1008,8 @@ static int timekeeping_suspend(void)
timekeeping_suspend_time =
timespec_add(timekeeping_suspend_time, delta_delta);
}
+
+ timekeeping_update(tk, false, true);
write_seqcount_end(&timekeeper_seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
@@ -1236,9 +1239,10 @@ out_adjust:
* It also calls into the NTP code to handle leapsecond processing.
*
*/
-static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
+static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
{
u64 nsecps = (u64)NSEC_PER_SEC << tk->shift;
+ unsigned int clock_set = 0;
while (tk->xtime_nsec >= nsecps) {
int leap;
@@ -1260,9 +1264,10 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
__timekeeping_set_tai_offset(tk, tk->tai_offset - leap);
- clock_was_set_delayed();
+ clock_set = 1;
}
}
+ return clock_set;
}
/**
@@ -1275,7 +1280,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk)
* Returns the unconsumed cycles.
*/
static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
- u32 shift)
+ u32 shift,
+ unsigned int *clock_set)
{
cycle_t interval = tk->cycle_interval << shift;
u64 raw_nsecs;
@@ -1289,7 +1295,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
tk->cycle_last += interval;
tk->xtime_nsec += tk->xtime_interval << shift;
- accumulate_nsecs_to_secs(tk);
+ *clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
raw_nsecs = (u64)tk->raw_interval << shift;
@@ -1328,7 +1334,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
tk->xtime_nsec -= remainder;
tk->xtime_nsec += 1ULL << tk->shift;
tk->ntp_error += remainder << tk->ntp_error_shift;
-
+ tk->ntp_error -= (1ULL << tk->shift) << tk->ntp_error_shift;
}
#else
#define old_vsyscall_fixup(tk)
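The new ntp_error line can be checked by tracking what the fixup does to the clock; a sketch in shifted nanoseconds, assuming the usual convention that a positive ntp_error means the clock is behind the NTP reference:

/*
 *   xtime_nsec -= remainder;               clock loses remainder
 *   xtime_nsec += 1 << tk->shift;          clock gains one shifted ns
 *   net: the clock is pushed ahead by (1 << tk->shift) - remainder
 *
 *   ntp_error += remainder << tk->ntp_error_shift;
 *   ntp_error -= (1 << tk->shift) << tk->ntp_error_shift;
 *   net: ntp_error drops by the same amount, scaled by ntp_error_shift
 *
 * so the error accumulator now records exactly the nudge that was
 * applied to xtime_nsec, and the adjustment code can steer it back out.
 */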
@@ -1347,6 +1353,7 @@ static void update_wall_time(void)
struct timekeeper *tk = &shadow_timekeeper;
cycle_t offset;
int shift = 0, maxshift;
+ unsigned int clock_set = 0;
unsigned long flags;
raw_spin_lock_irqsave(&timekeeper_lock, flags);
@@ -1381,7 +1388,8 @@ static void update_wall_time(void)
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
while (offset >= tk->cycle_interval) {
- offset = logarithmic_accumulation(tk, offset, shift);
+ offset = logarithmic_accumulation(tk, offset, shift,
+ &clock_set);
if (offset < tk->cycle_interval<<shift)
shift--;
}
@@ -1399,7 +1407,7 @@ static void update_wall_time(void)
* Finally, make sure that after the rounding
* xtime_nsec isn't larger than NSEC_PER_SEC
*/
- accumulate_nsecs_to_secs(tk);
+ clock_set |= accumulate_nsecs_to_secs(tk);
write_seqcount_begin(&timekeeper_seq);
/* Update clock->cycle_last with the new value */
@@ -1419,6 +1427,10 @@ static void update_wall_time(void)
write_seqcount_end(&timekeeper_seq);
out:
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (clock_set)
+ /* have to call outside the timekeeper_seq */
+ clock_was_set_delayed();
+
}
/**
@@ -1677,11 +1689,16 @@ int do_adjtimex(struct timex *txc)
if (tai != orig_tai) {
__timekeeping_set_tai_offset(tk, tai);
- clock_was_set_delayed();
+ timekeeping_update(tk, false, true);
}
write_seqcount_end(&timekeeper_seq);
raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
+ if (tai != orig_tai)
+ clock_was_set();
+
+ ntp_notify_cmos_timer();
+
return ret;
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 3bdf2832301..61ed862cdd3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -265,10 +265,9 @@ static inline void timer_list_header(struct seq_file *m, u64 now)
static int timer_list_show(struct seq_file *m, void *v)
{
struct timer_list_iter *iter = v;
- u64 now = ktime_to_ns(ktime_get());
if (iter->cpu == -1 && !iter->second_pass)
- timer_list_header(m, now);
+ timer_list_header(m, iter->now);
else if (!iter->second_pass)
print_cpu(m, iter->cpu, iter->now);
#ifdef CONFIG_GENERIC_CLOCKEVENTS
@@ -298,33 +297,41 @@ void sysrq_timer_list_show(void)
return;
}
-static void *timer_list_start(struct seq_file *file, loff_t *offset)
+static void *move_iter(struct timer_list_iter *iter, loff_t offset)
{
- struct timer_list_iter *iter = file->private;
-
- if (!*offset) {
- iter->cpu = -1;
- iter->now = ktime_to_ns(ktime_get());
- } else if (iter->cpu >= nr_cpu_ids) {
+ for (; offset; offset--) {
+ iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
+ if (iter->cpu >= nr_cpu_ids) {
#ifdef CONFIG_GENERIC_CLOCKEVENTS
- if (!iter->second_pass) {
- iter->cpu = -1;
- iter->second_pass = true;
- } else
- return NULL;
+ if (!iter->second_pass) {
+ iter->cpu = -1;
+ iter->second_pass = true;
+ } else
+ return NULL;
#else
- return NULL;
+ return NULL;
#endif
+ }
}
return iter;
}
+static void *timer_list_start(struct seq_file *file, loff_t *offset)
+{
+ struct timer_list_iter *iter = file->private;
+
+ if (!*offset)
+ iter->now = ktime_to_ns(ktime_get());
+ iter->cpu = -1;
+ iter->second_pass = false;
+ return move_iter(iter, *offset);
+}
+
static void *timer_list_next(struct seq_file *file, void *v, loff_t *offset)
{
struct timer_list_iter *iter = file->private;
- iter->cpu = cpumask_next(iter->cpu, cpu_online_mask);
++*offset;
- return timer_list_start(file, offset);
+ return move_iter(iter, 1);
}
static void timer_list_stop(struct seq_file *seq, void *v)
diff --git a/kernel/timer.c b/kernel/timer.c
index 15ffdb3f194..15bc1b41021 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -149,9 +149,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
- if (j <= jiffies) /* rounding ate our timeout entirely; */
- return original;
- return j;
+ /*
+ * Make sure j is still in the future. Otherwise return the
+ * unmodified value.
+ */
+ return time_is_after_jiffies(j) ? j : original;
}
/**
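time_is_after_jiffies() also keeps the comparison correct across a jiffies wrap, which the plain unsigned "j <= jiffies" test it replaces did not. The trick is comparison via signed subtraction; a stand-alone demonstration on a 32-bit counter (kernel jiffies are an unsigned long, but the principle is the same):

#include <stdint.h>
#include <stdio.h>

/* "a is after b" on a wrapping 32-bit counter, the idea behind
 * time_after() and time_is_after_jiffies(): compare via signed delta. */
static int time_after32(uint32_t a, uint32_t b)
{
	return (int32_t)(b - a) < 0;
}

int main(void)
{
	uint32_t jiffies = 0xfffffff0u;		/* counter about to wrap */
	uint32_t timeout = jiffies + 0x20u;	/* wraps around to 0x10 */

	printf("naive timeout > jiffies: %d\n", timeout > jiffies);	/* 0 */
	printf("wrap-safe time_after32: %d\n",
	       time_after32(timeout, jiffies));				/* 1 */
	return 0;
}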
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6c508ff33c6..4b93b841225 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly;
/* Current function tracing op */
struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
+/* What to set function_trace_op to */
+static struct ftrace_ops *set_function_trace_op;
/* List for set_ftrace_pid's pids. */
LIST_HEAD(ftrace_pids);
@@ -278,6 +280,29 @@ static void update_global_ops(void)
global_ops.func = func;
}
+static void ftrace_sync(struct work_struct *work)
+{
+ /*
+ * This function is just a stub to implement a hard force
+ * of synchronize_sched(). This requires synchronizing
+ * tasks even in userspace and idle.
+ *
+ * Yes, function tracing is rude.
+ */
+}
+
+static void ftrace_sync_ipi(void *data)
+{
+ /* Probably not needed, but do it anyway */
+ smp_rmb();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void update_function_graph_func(void);
+#else
+static inline void update_function_graph_func(void) { }
+#endif
+
static void update_ftrace_function(void)
{
ftrace_func_t func;
@@ -296,16 +321,61 @@ static void update_ftrace_function(void)
!FTRACE_FORCE_LIST_FUNC)) {
/* Set the ftrace_ops that the arch callback uses */
if (ftrace_ops_list == &global_ops)
- function_trace_op = ftrace_global_list;
+ set_function_trace_op = ftrace_global_list;
else
- function_trace_op = ftrace_ops_list;
+ set_function_trace_op = ftrace_ops_list;
func = ftrace_ops_list->func;
} else {
/* Just use the default ftrace_ops */
- function_trace_op = &ftrace_list_end;
+ set_function_trace_op = &ftrace_list_end;
func = ftrace_ops_list_func;
}
+ /* If there's no change, then do nothing more here */
+ if (ftrace_trace_function == func)
+ return;
+
+ update_function_graph_func();
+
+ /*
+ * If we are using the list function, it doesn't care
+ * about the function_trace_ops.
+ */
+ if (func == ftrace_ops_list_func) {
+ ftrace_trace_function = func;
+ /*
+ * Don't even bother setting function_trace_ops,
+ * it would be racy to do so anyway.
+ */
+ return;
+ }
+
+#ifndef CONFIG_DYNAMIC_FTRACE
+ /*
+ * For static tracing, we need to be a bit more careful.
+	 * The function change takes effect immediately. Thus,
+	 * we need to coordinate the setting of the function_trace_ops
+ * with the setting of the ftrace_trace_function.
+ *
+ * Set the function to the list ops, which will call the
+ * function we want, albeit indirectly, but it handles the
+ * ftrace_ops and doesn't depend on function_trace_op.
+ */
+ ftrace_trace_function = ftrace_ops_list_func;
+ /*
+ * Make sure all CPUs see this. Yes this is slow, but static
+ * tracing is slow and nasty to have enabled.
+ */
+ schedule_on_each_cpu(ftrace_sync);
+ /* Now all cpus are using the list ops. */
+ function_trace_op = set_function_trace_op;
+ /* Make sure the function_trace_op is visible on all CPUs */
+ smp_wmb();
+ /* Nasty way to force a rmb on all cpus */
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
+ /* OK, we are all set to update the ftrace_trace_function now! */
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
ftrace_trace_function = func;
}
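The #ifndef CONFIG_DYNAMIC_FTRACE block above is a heavyweight instance of a generic two-step publication: make the payload (function_trace_op) visible on every CPU before switching the entry point (ftrace_trace_function) that consumes it. A user-space sketch of that pattern with C11 atomics, as an analogy only; the kernel cannot add ordering to the tracer fast path, so it routes callers through the list function first and then forces visibility with schedule_on_each_cpu() and an IPI-driven rmb:

#include <stdatomic.h>
#include <stdio.h>

/* Publish the data before the pointer readers use to reach it. */
struct payload { int op; };

static struct payload slot = { .op = 42 };
static struct payload *_Atomic cur_op;			/* plays function_trace_op */
static void (*_Atomic cur_fn)(struct payload *);	/* plays ftrace_trace_function */

static void tracer(struct payload *p)
{
	printf("traced with op %d\n", p->op);
}

static void publish(void)
{
	/* 1. make the payload globally visible ... */
	atomic_store_explicit(&cur_op, &slot, memory_order_release);
	/* 2. ... only then switch callers to the function that uses it */
	atomic_store_explicit(&cur_fn, tracer, memory_order_release);
}

static void hot_path(void)
{
	void (*fn)(struct payload *) =
		atomic_load_explicit(&cur_fn, memory_order_acquire);

	/* A reader that sees the new fn is guaranteed to see the payload. */
	if (fn)
		fn(atomic_load_explicit(&cur_op, memory_order_acquire));
}

int main(void)
{
	hot_path();	/* nothing published yet, does nothing */
	publish();
	hot_path();	/* prints: traced with op 42 */
	return 0;
}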
@@ -367,9 +437,6 @@ static int remove_ftrace_list_ops(struct ftrace_ops **list,
static int __register_ftrace_function(struct ftrace_ops *ops)
{
- if (unlikely(ftrace_disabled))
- return -ENODEV;
-
if (FTRACE_WARN_ON(ops == &global_ops))
return -EINVAL;
@@ -417,9 +484,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- if (ftrace_disabled)
- return -ENODEV;
-
if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
return -EBUSY;
@@ -434,16 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
} else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
ret = remove_ftrace_list_ops(&ftrace_control_list,
&control_ops, ops);
- if (!ret) {
- /*
- * The ftrace_ops is now removed from the list,
- * so there'll be no new users. We must ensure
- * all current users are done before we free
- * the control data.
- */
- synchronize_sched();
- control_ops_free(ops);
- }
} else
ret = remove_ftrace_ops(&ftrace_ops_list, ops);
@@ -453,13 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
if (ftrace_enabled)
update_ftrace_function();
- /*
- * Dynamic ops may be freed, we must make sure that all
- * callers are done before leaving this function.
- */
- if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
- synchronize_sched();
-
return 0;
}
@@ -756,7 +803,7 @@ static int ftrace_profile_init(void)
int cpu;
int ret = 0;
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
ret = ftrace_profile_init_cpu(cpu);
if (ret)
break;
@@ -1416,12 +1463,22 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
* the hashes are freed with call_rcu_sched().
*/
static int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
struct ftrace_hash *filter_hash;
struct ftrace_hash *notrace_hash;
int ret;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
+ /*
+	 * There's a small race when adding ops: the ftrace handler
+	 * that wants regs may be called without them. We cannot
+	 * allow that handler to be called if regs is NULL.
+ */
+ if (regs == NULL && (ops->flags & FTRACE_OPS_FL_SAVE_REGS))
+ return 0;
+#endif
+
filter_hash = rcu_dereference_raw_notrace(ops->filter_hash);
notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash);
@@ -1948,8 +2005,14 @@ void ftrace_modify_all_code(int command)
else if (command & FTRACE_DISABLE_CALLS)
ftrace_replace_code(0);
- if (command & FTRACE_UPDATE_TRACE_FUNC)
+ if (command & FTRACE_UPDATE_TRACE_FUNC) {
+ function_trace_op = set_function_trace_op;
+ smp_wmb();
+ /* If irqs are disabled, we are in stop machine */
+ if (!irqs_disabled())
+ smp_call_function(ftrace_sync_ipi, NULL, 1);
ftrace_update_ftrace_func(ftrace_trace_function);
+ }
if (command & FTRACE_START_FUNC_RET)
ftrace_enable_ftrace_graph_caller();
@@ -2038,10 +2101,15 @@ static void ftrace_startup_enable(int command)
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
bool hash_enable = true;
+ int ret;
if (unlikely(ftrace_disabled))
return -ENODEV;
+ ret = __register_ftrace_function(ops);
+ if (ret)
+ return ret;
+
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
@@ -2063,12 +2131,17 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
return 0;
}
-static void ftrace_shutdown(struct ftrace_ops *ops, int command)
+static int ftrace_shutdown(struct ftrace_ops *ops, int command)
{
bool hash_disable = true;
+ int ret;
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
+
+ ret = __unregister_ftrace_function(ops);
+ if (ret)
+ return ret;
ftrace_start_up--;
/*
@@ -2102,10 +2175,42 @@ static void ftrace_shutdown(struct ftrace_ops *ops, int command)
command |= FTRACE_UPDATE_TRACE_FUNC;
}
- if (!command || !ftrace_enabled)
- return;
+ if (!command || !ftrace_enabled) {
+ /*
+ * If these are control ops, they still need their
+		 * per_cpu field freed. Since function tracing is
+ * not currently active, we can just free them
+ * without synchronizing all CPUs.
+ */
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ return 0;
+ }
ftrace_run_update_code(command);
+
+ /*
+ * Dynamic ops may be freed, we must make sure that all
+ * callers are done before leaving this function.
+ * The same goes for freeing the per_cpu data of the control
+ * ops.
+ *
+ * Again, normal synchronize_sched() is not good enough.
+ * We need to do a hard force of sched synchronization.
+ * This is because we use preempt_disable() to do RCU, but
+ * the function tracers can be called where RCU is not watching
+ * (like before user_exit()). We can not rely on the RCU
+ * infrastructure to do the synchronization, thus we must do it
+ * ourselves.
+ */
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+ schedule_on_each_cpu(ftrace_sync);
+
+ if (ops->flags & FTRACE_OPS_FL_CONTROL)
+ control_ops_free(ops);
+ }
+
+ return 0;
}
static void ftrace_startup_sysctl(void)
@@ -2134,12 +2239,57 @@ static cycle_t ftrace_update_time;
static unsigned long ftrace_update_cnt;
unsigned long ftrace_update_tot_cnt;
-static int ops_traces_mod(struct ftrace_ops *ops)
+static inline int ops_traces_mod(struct ftrace_ops *ops)
{
- struct ftrace_hash *hash;
+ /*
+	 * An empty filter_hash defaults to tracing the module, but a
+	 * non-empty notrace hash requires testing individual functions.
+ */
+ return ftrace_hash_empty(ops->filter_hash) &&
+ ftrace_hash_empty(ops->notrace_hash);
+}
- hash = ops->filter_hash;
- return ftrace_hash_empty(hash);
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static inline bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ /* If ops isn't enabled, ignore it */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return 0;
+
+ /* If ops traces all mods, we already accounted for it */
+ if (ops_traces_mod(ops))
+ return 0;
+
+ /* The function must be in the filter */
+ if (!ftrace_hash_empty(ops->filter_hash) &&
+ !ftrace_lookup_ip(ops->filter_hash, rec->ip))
+ return 0;
+
+ /* If in notrace hash, we ignore it too */
+ if (ftrace_lookup_ip(ops->notrace_hash, rec->ip))
+ return 0;
+
+ return 1;
+}
+
+static int referenced_filters(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *ops;
+ int cnt = 0;
+
+ for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
+ if (ops_references_rec(ops, rec))
+ cnt++;
+ }
+
+ return cnt;
}
static int ftrace_update_code(struct module *mod)
@@ -2148,6 +2298,7 @@ static int ftrace_update_code(struct module *mod)
struct dyn_ftrace *p;
cycle_t start, stop;
unsigned long ref = 0;
+ bool test = false;
int i;
/*
@@ -2161,9 +2312,12 @@ static int ftrace_update_code(struct module *mod)
for (ops = ftrace_ops_list;
ops != &ftrace_list_end; ops = ops->next) {
- if (ops->flags & FTRACE_OPS_FL_ENABLED &&
- ops_traces_mod(ops))
- ref++;
+ if (ops->flags & FTRACE_OPS_FL_ENABLED) {
+ if (ops_traces_mod(ops))
+ ref++;
+ else
+ test = true;
+ }
}
}
@@ -2173,12 +2327,16 @@ static int ftrace_update_code(struct module *mod)
for (pg = ftrace_new_pgs; pg; pg = pg->next) {
for (i = 0; i < pg->index; i++) {
+ int cnt = ref;
+
/* If something went wrong, bail without enabling anything */
if (unlikely(ftrace_disabled))
return -1;
p = &pg->records[i];
- p->flags = ref;
+ if (test)
+ cnt += referenced_filters(p);
+ p->flags = cnt;
/*
* Do the initial record conversion from mcount jump
@@ -2198,7 +2356,7 @@ static int ftrace_update_code(struct module *mod)
* conversion puts the module to the correct state, thus
* passing the ftrace_make_call check.
*/
- if (ftrace_start_up && ref) {
+ if (ftrace_start_up && cnt) {
int failed = __ftrace_replace_code(p, 1);
if (failed)
ftrace_bug(failed, p->ip);
@@ -2957,16 +3115,13 @@ static void __enable_ftrace_function_probe(void)
if (i == FTRACE_FUNC_HASHSIZE)
return;
- ret = __register_ftrace_function(&trace_probe_ops);
- if (!ret)
- ret = ftrace_startup(&trace_probe_ops, 0);
+ ret = ftrace_startup(&trace_probe_ops, 0);
ftrace_probe_registered = 1;
}
static void __disable_ftrace_function_probe(void)
{
- int ret;
int i;
if (!ftrace_probe_registered)
@@ -2979,9 +3134,7 @@ static void __disable_ftrace_function_probe(void)
}
/* no more funcs left */
- ret = __unregister_ftrace_function(&trace_probe_ops);
- if (!ret)
- ftrace_shutdown(&trace_probe_ops, 0);
+ ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
}
@@ -4178,17 +4331,20 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(ops, command) \
- ({ \
- (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
- 0; \
+# define ftrace_startup(ops, command) \
+ ({ \
+ int ___ret = __register_ftrace_function(ops); \
+ if (!___ret) \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ ___ret; \
})
-# define ftrace_shutdown(ops, command) do { } while (0)
+# define ftrace_shutdown(ops, command) __unregister_ftrace_function(ops)
+
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
static inline int
-ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
return 1;
}
@@ -4211,7 +4367,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
do_for_each_ftrace_op(op, ftrace_control_list) {
if (!(op->flags & FTRACE_OPS_FL_STUB) &&
!ftrace_function_local_disabled(op) &&
- ftrace_ops_test(op, ip))
+ ftrace_ops_test(op, ip, regs))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
trace_recursion_clear(TRACE_CONTROL_BIT);
@@ -4244,7 +4400,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
*/
preempt_disable_notrace();
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (ftrace_ops_test(op, ip))
+ if (ftrace_ops_test(op, ip, regs))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
preempt_enable_notrace();
@@ -4583,9 +4739,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_lock);
- ret = __register_ftrace_function(ops);
- if (!ret)
- ret = ftrace_startup(ops, 0);
+ ret = ftrace_startup(ops, 0);
mutex_unlock(&ftrace_lock);
@@ -4604,9 +4758,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
int ret;
mutex_lock(&ftrace_lock);
- ret = __unregister_ftrace_function(ops);
- if (!ret)
- ftrace_shutdown(ops, 0);
+ ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
@@ -4666,6 +4818,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
trace_func_graph_ret_t ftrace_graph_return =
(trace_func_graph_ret_t)ftrace_stub;
trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -4800,6 +4953,37 @@ ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
return NOTIFY_DONE;
}
+/* Just a placeholder for function graph */
+static struct ftrace_ops fgraph_ops __read_mostly = {
+ .func = ftrace_stub,
+ .flags = FTRACE_OPS_FL_STUB | FTRACE_OPS_FL_GLOBAL |
+ FTRACE_OPS_FL_RECURSION_SAFE,
+};
+
+static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
+{
+ if (!ftrace_ops_test(&global_ops, trace->func, NULL))
+ return 0;
+ return __ftrace_graph_entry(trace);
+}
+
+/*
+ * The function graph tracer should only trace the functions defined
+ * by set_ftrace_filter and set_ftrace_notrace. If another function
+ * tracer ops is registered, the graph tracer must test each
+ * function against the global ops instead of tracing any function
+ * that any ftrace_ops has registered.
+ */
+static void update_function_graph_func(void)
+{
+ if (ftrace_ops_list == &ftrace_list_end ||
+ (ftrace_ops_list == &global_ops &&
+ global_ops.next == &ftrace_list_end))
+ ftrace_graph_entry = __ftrace_graph_entry;
+ else
+ ftrace_graph_entry = ftrace_graph_entry_test;
+}
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -4824,9 +5008,18 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
}
ftrace_graph_return = retfunc;
- ftrace_graph_entry = entryfunc;
- ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
+ /*
+ * Update the indirect function to the entryfunc, and the
+ * function that gets called to the entry_test first. Then
+ * call the update fgraph entry function to determine if
+ * the entryfunc should be called directly or not.
+ */
+ __ftrace_graph_entry = entryfunc;
+ ftrace_graph_entry = ftrace_graph_entry_test;
+ update_function_graph_func();
+
+ ret = ftrace_startup(&fgraph_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -4843,7 +5036,8 @@ void unregister_ftrace_graph(void)
ftrace_graph_active--;
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
+ __ftrace_graph_entry = ftrace_graph_entry_stub;
+ ftrace_shutdown(&fgraph_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e444ff88f0a..fd12cc56371 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2396,6 +2396,13 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
write &= RB_WRITE_MASK;
tail = write - length;
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ delta = 0;
+
/* See if we shot pass the end of this buffer page */
if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e71a8be4a6e..6dbdf277c8f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -193,6 +193,37 @@ static struct trace_array global_trace;
LIST_HEAD(ftrace_trace_arrays);
+int trace_array_get(struct trace_array *this_tr)
+{
+ struct trace_array *tr;
+ int ret = -ENODEV;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr == this_tr) {
+ tr->ref++;
+ ret = 0;
+ break;
+ }
+ }
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static void __trace_array_put(struct trace_array *this_tr)
+{
+ WARN_ON(!this_tr->ref);
+ this_tr->ref--;
+}
+
+void trace_array_put(struct trace_array *this_tr)
+{
+ mutex_lock(&trace_types_lock);
+ __trace_array_put(this_tr);
+ mutex_unlock(&trace_types_lock);
+}
+
int filter_current_check_discard(struct ring_buffer *buffer,
struct ftrace_event_call *call, void *rec,
struct ring_buffer_event *event)
@@ -201,23 +232,43 @@ int filter_current_check_discard(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(filter_current_check_discard);
-cycle_t ftrace_now(int cpu)
+cycle_t buffer_ftrace_now(struct trace_buffer *buf, int cpu)
{
u64 ts;
/* Early boot up does not have a buffer yet */
- if (!global_trace.trace_buffer.buffer)
+ if (!buf->buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
- ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
+ ts = ring_buffer_time_stamp(buf->buffer, cpu);
+ ring_buffer_normalize_time_stamp(buf->buffer, cpu, &ts);
return ts;
}
+cycle_t ftrace_now(int cpu)
+{
+ return buffer_ftrace_now(&global_trace.trace_buffer, cpu);
+}
+
+/**
+ * tracing_is_enabled - Show if global_trace has been disabled
+ *
+ * Shows if the global trace has been enabled or not. It uses the
+ * mirror flag "buffer_disabled" to be used in fast paths such as for
+ * the irqsoff tracer. But it may be inaccurate due to races. If you
+ * need to know the accurate state, use tracing_is_on() which is a little
+ * slower, but accurate.
+ */
int tracing_is_enabled(void)
{
- return tracing_is_on();
+ /*
+ * For quick access (irqsoff uses this in fast path), just
+ * return the mirror variable of the state of the ring buffer.
+ * It's a little racy, but we don't really care.
+ */
+ smp_rmb();
+ return !global_trace.buffer_disabled;
}
/*
@@ -240,7 +291,7 @@ static struct tracer *trace_types __read_mostly;
/*
* trace_types_lock is used to protect the trace_types list.
*/
-static DEFINE_MUTEX(trace_types_lock);
+DEFINE_MUTEX(trace_types_lock);
/*
* serialize the access of the ring buffer
@@ -330,6 +381,23 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
+void tracer_tracing_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_on(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+	 * yet, or by some tracers (like irqsoff) that just want to
+	 * know if the ring buffer has been disabled, but can handle
+	 * races where it gets disabled while we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 0;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
+
/**
* tracing_on - enable tracing buffers
*
@@ -338,15 +406,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
*/
void tracing_on(void)
{
- if (global_trace.trace_buffer.buffer)
- ring_buffer_record_on(global_trace.trace_buffer.buffer);
- /*
- * This flag is only looked at when buffers haven't been
- * allocated yet. We don't really care about the race
- * between setting this flag and actually turning
- * on the buffer.
- */
- global_trace.buffer_disabled = 0;
+ tracer_tracing_on(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_on);
@@ -364,6 +424,9 @@ int __trace_puts(unsigned long ip, const char *str, int size)
unsigned long irq_flags;
int alloc;
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
alloc = sizeof(*entry) + size + 2; /* possible \n added */
local_save_flags(irq_flags);
@@ -404,6 +467,9 @@ int __trace_bputs(unsigned long ip, const char *str)
unsigned long irq_flags;
int size = sizeof(struct bputs_entry);
+ if (unlikely(tracing_selftest_running || tracing_disabled))
+ return 0;
+
local_save_flags(irq_flags);
buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
@@ -540,6 +606,23 @@ void tracing_snapshot_alloc(void)
EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
#endif /* CONFIG_TRACER_SNAPSHOT */
+void tracer_tracing_off(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ ring_buffer_record_off(tr->trace_buffer.buffer);
+ /*
+ * This flag is looked at when buffers haven't been allocated
+	 * yet, or by some tracers (like irqsoff) that just want to
+	 * know if the ring buffer has been disabled, but can handle
+	 * races where it gets disabled while we still do a record.
+ * As the check is in the fast path of the tracers, it is more
+ * important to be fast than accurate.
+ */
+ tr->buffer_disabled = 1;
+ /* Make the flag seen by readers */
+ smp_wmb();
+}
+
/**
* tracing_off - turn off tracing buffers
*
@@ -550,26 +633,29 @@ EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
*/
void tracing_off(void)
{
- if (global_trace.trace_buffer.buffer)
- ring_buffer_record_off(global_trace.trace_buffer.buffer);
- /*
- * This flag is only looked at when buffers haven't been
- * allocated yet. We don't really care about the race
- * between setting this flag and actually turning
- * on the buffer.
- */
- global_trace.buffer_disabled = 1;
+ tracer_tracing_off(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_off);
/**
+ * tracer_tracing_is_on - show real state of ring buffer enabled
+ * @tr : the trace array to know if ring buffer is enabled
+ *
+ * Shows real state of the ring buffer if it is enabled or not.
+ */
+int tracer_tracing_is_on(struct trace_array *tr)
+{
+ if (tr->trace_buffer.buffer)
+ return ring_buffer_record_is_on(tr->trace_buffer.buffer);
+ return !tr->buffer_disabled;
+}
+
+/**
* tracing_is_on - show state of ring buffers enabled
*/
int tracing_is_on(void)
{
- if (global_trace.trace_buffer.buffer)
- return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
- return !global_trace.buffer_disabled;
+ return tracer_tracing_is_on(&global_trace);
}
EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -746,9 +832,12 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
if (isspace(ch)) {
parser->buffer[parser->idx] = 0;
parser->cont = false;
- } else {
+ } else if (parser->idx < parser->size - 1) {
parser->cont = true;
parser->buffer[parser->idx++] = ch;
+ } else {
+ ret = -EINVAL;
+ goto out;
}
*ppos += read;
@@ -1119,7 +1208,7 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
/* Make sure all commits have finished */
synchronize_sched();
- buf->time_start = ftrace_now(buf->cpu);
+ buf->time_start = buffer_ftrace_now(buf, buf->cpu);
for_each_online_cpu(cpu)
ring_buffer_reset_cpu(buffer, cpu);
@@ -1127,23 +1216,17 @@ void tracing_reset_online_cpus(struct trace_buffer *buf)
ring_buffer_record_enable(buffer);
}
-void tracing_reset_current(int cpu)
-{
- tracing_reset(&global_trace.trace_buffer, cpu);
-}
-
+/* Must have trace_types_lock held */
void tracing_reset_all_online_cpus(void)
{
struct trace_array *tr;
- mutex_lock(&trace_types_lock);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
tracing_reset_online_cpus(&tr->max_buffer);
#endif
}
- mutex_unlock(&trace_types_lock);
}
#define SAVED_CMDLINES 128
@@ -2760,6 +2843,17 @@ static int s_show(struct seq_file *m, void *v)
return 0;
}
+/*
+ * Should be used after trace_array_get(); trace_types_lock
+ * ensures that i_cdev was already initialized.
+ */
+static inline int tracing_get_cpu(struct inode *inode)
+{
+ if (inode->i_cdev) /* See trace_create_cpu_file() */
+ return (long)inode->i_cdev - 1;
+ return RING_BUFFER_ALL_CPUS;
+}
+
static const struct seq_operations tracer_seq_ops = {
.start = s_start,
.next = s_next,
@@ -2770,8 +2864,7 @@ static const struct seq_operations tracer_seq_ops = {
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
- struct trace_cpu *tc = inode->i_private;
- struct trace_array *tr = tc->tr;
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int cpu;
@@ -2812,8 +2905,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
iter->trace_buffer = &tr->trace_buffer;
iter->snapshot = snapshot;
iter->pos = -1;
+ iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
- iter->cpu_file = tc->cpu;
/* Notify the tracer early; before we stop tracing. */
if (iter->trace && iter->trace->open)
@@ -2850,8 +2943,6 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
tracing_iter_reset(iter, cpu);
}
- tr->ref++;
-
mutex_unlock(&trace_types_lock);
return iter;
@@ -2874,24 +2965,41 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
return 0;
}
+/*
+ * Open and update trace_array ref count.
+ * Must have the current trace_array passed to it.
+ */
+int tracing_open_generic_tr(struct inode *inode, struct file *filp)
+{
+ struct trace_array *tr = inode->i_private;
+
+ if (tracing_disabled)
+ return -ENODEV;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ filp->private_data = inode->i_private;
+
+ return 0;
+}
+
static int tracing_release(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
struct seq_file *m = file->private_data;
struct trace_iterator *iter;
- struct trace_array *tr;
int cpu;
- if (!(file->f_mode & FMODE_READ))
+ if (!(file->f_mode & FMODE_READ)) {
+ trace_array_put(tr);
return 0;
+ }
+ /* Writes do not use seq_file */
iter = m->private;
- tr = iter->tr;
-
mutex_lock(&trace_types_lock);
- WARN_ON(!tr->ref);
- tr->ref--;
-
for_each_tracing_cpu(cpu) {
if (iter->buffer_iter[cpu])
ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2903,6 +3011,9 @@ static int tracing_release(struct inode *inode, struct file *file)
if (!iter->snapshot)
/* reenable tracing if it was previously enabled */
tracing_start_tr(tr);
+
+ __trace_array_put(tr);
+
mutex_unlock(&trace_types_lock);
mutex_destroy(&iter->mutex);
@@ -2910,24 +3021,44 @@ static int tracing_release(struct inode *inode, struct file *file)
kfree(iter->trace);
kfree(iter->buffer_iter);
seq_release_private(inode, file);
+
return 0;
}
+static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+ return 0;
+}
+
+static int tracing_single_release_tr(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return single_release(inode, file);
+}
+
static int tracing_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
/* If this file was open for write, then erase contents */
- if ((file->f_mode & FMODE_WRITE) &&
- (file->f_flags & O_TRUNC)) {
- struct trace_cpu *tc = inode->i_private;
- struct trace_array *tr = tc->tr;
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ int cpu = tracing_get_cpu(inode);
- if (tc->cpu == RING_BUFFER_ALL_CPUS)
+ if (cpu == RING_BUFFER_ALL_CPUS)
tracing_reset_online_cpus(&tr->trace_buffer);
else
- tracing_reset(&tr->trace_buffer, tc->cpu);
+ tracing_reset(&tr->trace_buffer, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -2937,6 +3068,10 @@ static int tracing_open(struct inode *inode, struct file *file)
else if (trace_flags & TRACE_ITER_LATENCY_FMT)
iter->iter_flags |= TRACE_FILE_LAT_FMT;
}
+
+ if (ret < 0)
+ trace_array_put(tr);
+
return ret;
}
@@ -3293,17 +3428,27 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
static int tracing_trace_options_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
if (tracing_disabled)
return -ENODEV;
- return single_open(file, tracing_trace_options_show, inode->i_private);
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
+ ret = single_open(file, tracing_trace_options_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
}
static const struct file_operations tracing_iter_fops = {
.open = tracing_trace_options_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_tr,
.write = tracing_trace_options_write,
};
@@ -3783,20 +3928,23 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
- struct trace_cpu *tc = inode->i_private;
- struct trace_array *tr = tc->tr;
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
int ret = 0;
if (tracing_disabled)
return -ENODEV;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
mutex_lock(&trace_types_lock);
/* create a buffer to store the information to pass to userspace */
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
ret = -ENOMEM;
+ __trace_array_put(tr);
goto out;
}
@@ -3826,9 +3974,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (trace_clocks[tr->clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- iter->cpu_file = tc->cpu;
- iter->tr = tc->tr;
- iter->trace_buffer = &tc->tr->trace_buffer;
+ iter->tr = tr;
+ iter->trace_buffer = &tr->trace_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -3843,6 +3991,7 @@ out:
fail:
kfree(iter->trace);
kfree(iter);
+ __trace_array_put(tr);
mutex_unlock(&trace_types_lock);
return ret;
}
@@ -3850,6 +3999,7 @@ fail:
static int tracing_release_pipe(struct inode *inode, struct file *file)
{
struct trace_iterator *iter = file->private_data;
+ struct trace_array *tr = inode->i_private;
mutex_lock(&trace_types_lock);
@@ -3863,6 +4013,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
kfree(iter->trace);
kfree(iter);
+ trace_array_put(tr);
+
return 0;
}
@@ -3939,7 +4091,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_enabled() && iter->pos)
+ if (!tracing_is_on() && iter->pos)
break;
}
@@ -4000,6 +4152,7 @@ waitagain:
memset(&iter->seq, 0,
sizeof(struct trace_iterator) -
offsetof(struct trace_iterator, seq));
+ cpumask_clear(iter->started);
iter->pos = -1;
trace_event_read_lock();
@@ -4200,15 +4353,16 @@ static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_cpu *tc = filp->private_data;
- struct trace_array *tr = tc->tr;
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
+ int cpu = tracing_get_cpu(inode);
char buf[64];
int r = 0;
ssize_t ret;
mutex_lock(&trace_types_lock);
- if (tc->cpu == RING_BUFFER_ALL_CPUS) {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
int cpu, buf_size_same;
unsigned long size;
@@ -4235,7 +4389,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
} else
r = sprintf(buf, "X\n");
} else
- r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10);
mutex_unlock(&trace_types_lock);
@@ -4247,7 +4401,8 @@ static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct trace_cpu *tc = filp->private_data;
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
unsigned long val;
int ret;
@@ -4261,8 +4416,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;
-
- ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
+ ret = tracing_resize_ring_buffer(tr, val, tracing_get_cpu(inode));
if (ret < 0)
return ret;
@@ -4316,10 +4470,12 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
/* disable tracing ? */
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
- tracing_off();
+ tracer_tracing_off(tr);
/* resize the ring buffer to 0 */
tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
+ trace_array_put(tr);
+
return 0;
}
@@ -4328,6 +4484,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
unsigned long addr = (unsigned long)ubuf;
+ struct trace_array *tr = filp->private_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
struct print_entry *entry;
@@ -4387,7 +4544,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* possible \n added */
- buffer = global_trace.trace_buffer.buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
if (!event) {
@@ -4478,12 +4635,12 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
* New clock may not be consistent with the previous clock.
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
- tracing_reset_online_cpus(&global_trace.trace_buffer);
+ tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
- tracing_reset_online_cpus(&global_trace.max_buffer);
+ tracing_reset_online_cpus(&tr->max_buffer);
#endif
mutex_unlock(&trace_types_lock);
@@ -4495,10 +4652,20 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
static int tracing_clock_open(struct inode *inode, struct file *file)
{
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
if (tracing_disabled)
return -ENODEV;
- return single_open(file, tracing_clock_show, inode->i_private);
+ if (trace_array_get(tr))
+ return -ENODEV;
+
+ ret = single_open(file, tracing_clock_show, inode->i_private);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
}
struct ftrace_buffer_info {
@@ -4510,31 +4677,40 @@ struct ftrace_buffer_info {
#ifdef CONFIG_TRACER_SNAPSHOT
static int tracing_snapshot_open(struct inode *inode, struct file *file)
{
- struct trace_cpu *tc = inode->i_private;
+ struct trace_array *tr = inode->i_private;
struct trace_iterator *iter;
struct seq_file *m;
int ret = 0;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
if (file->f_mode & FMODE_READ) {
iter = __tracing_open(inode, file, true);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
} else {
/* Writes still need the seq_file to hold the private data */
+ ret = -ENOMEM;
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (!m)
- return -ENOMEM;
+ goto out;
iter = kzalloc(sizeof(*iter), GFP_KERNEL);
if (!iter) {
kfree(m);
- return -ENOMEM;
+ goto out;
}
- iter->tr = tc->tr;
- iter->trace_buffer = &tc->tr->max_buffer;
- iter->cpu_file = tc->cpu;
+ ret = 0;
+
+ iter->tr = tr;
+ iter->trace_buffer = &tr->max_buffer;
+ iter->cpu_file = tracing_get_cpu(inode);
m->private = iter;
file->private_data = m;
}
+out:
+ if (ret < 0)
+ trace_array_put(tr);
return ret;
}
@@ -4616,9 +4792,12 @@ out:
static int tracing_snapshot_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
+ int ret;
+
+ ret = tracing_release(inode, file);
if (file->f_mode & FMODE_READ)
- return tracing_release(inode, file);
+ return ret;
/* If write only, the seq_file is just a stub */
if (m)
@@ -4684,34 +4863,38 @@ static const struct file_operations tracing_pipe_fops = {
};
static const struct file_operations tracing_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_entries_read,
.write = tracing_entries_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_total_entries_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_total_entries_read,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations tracing_free_buffer_fops = {
+ .open = tracing_open_generic_tr,
.write = tracing_free_buffer_write,
.release = tracing_free_buffer_release,
};
static const struct file_operations tracing_mark_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.write = tracing_mark_write,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
static const struct file_operations trace_clock_fops = {
.open = tracing_clock_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = tracing_single_release_tr,
.write = tracing_clock_write,
};
@@ -4736,23 +4919,26 @@ static const struct file_operations snapshot_raw_fops = {
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
- struct trace_cpu *tc = inode->i_private;
- struct trace_array *tr = tc->tr;
+ struct trace_array *tr = inode->i_private;
struct ftrace_buffer_info *info;
+ int ret;
if (tracing_disabled)
return -ENODEV;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
+ if (!info) {
+ trace_array_put(tr);
return -ENOMEM;
+ }
mutex_lock(&trace_types_lock);
- tr->ref++;
-
info->iter.tr = tr;
- info->iter.cpu_file = tc->cpu;
+ info->iter.cpu_file = tracing_get_cpu(inode);
info->iter.trace = tr->current_trace;
info->iter.trace_buffer = &tr->trace_buffer;
info->spare = NULL;
@@ -4763,7 +4949,11 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
mutex_unlock(&trace_types_lock);
- return nonseekable_open(inode, filp);
+ ret = nonseekable_open(inode, filp);
+ if (ret < 0)
+ trace_array_put(tr);
+
+ return ret;
}
static unsigned int
@@ -4863,8 +5053,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
mutex_lock(&trace_types_lock);
- WARN_ON(!iter->tr->ref);
- iter->tr->ref--;
+ __trace_array_put(iter->tr);
if (info->spare)
ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
@@ -5066,14 +5255,14 @@ static ssize_t
tracing_stats_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
- struct trace_cpu *tc = filp->private_data;
- struct trace_array *tr = tc->tr;
+ struct inode *inode = file_inode(filp);
+ struct trace_array *tr = inode->i_private;
struct trace_buffer *trace_buf = &tr->trace_buffer;
+ int cpu = tracing_get_cpu(inode);
struct trace_seq *s;
unsigned long cnt;
unsigned long long t;
unsigned long usec_rem;
- int cpu = tc->cpu;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
@@ -5126,9 +5315,10 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
}
static const struct file_operations tracing_stats_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = tracing_stats_read,
.llseek = generic_file_llseek,
+ .release = tracing_release_generic_tr,
};
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -5317,10 +5507,20 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
return tr->percpu_dir;
}
+static struct dentry *
+trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent,
+ void *data, long cpu, const struct file_operations *fops)
+{
+ struct dentry *ret = trace_create_file(name, mode, parent, data, fops);
+
+ if (ret) /* See tracing_get_cpu() */
+ ret->d_inode->i_cdev = (void *)(cpu + 1);
+ return ret;
+}
+
static void
tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
{
- struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
char cpu_dir[30]; /* 30 characters should be more than enough */
@@ -5336,28 +5536,28 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
}
/* per cpu trace_pipe */
- trace_create_file("trace_pipe", 0444, d_cpu,
- (void *)&data->trace_cpu, &tracing_pipe_fops);
+ trace_create_cpu_file("trace_pipe", 0444, d_cpu,
+ tr, cpu, &tracing_pipe_fops);
/* per cpu trace */
- trace_create_file("trace", 0644, d_cpu,
- (void *)&data->trace_cpu, &tracing_fops);
+ trace_create_cpu_file("trace", 0644, d_cpu,
+ tr, cpu, &tracing_fops);
- trace_create_file("trace_pipe_raw", 0444, d_cpu,
- (void *)&data->trace_cpu, &tracing_buffers_fops);
+ trace_create_cpu_file("trace_pipe_raw", 0444, d_cpu,
+ tr, cpu, &tracing_buffers_fops);
- trace_create_file("stats", 0444, d_cpu,
- (void *)&data->trace_cpu, &tracing_stats_fops);
+ trace_create_cpu_file("stats", 0444, d_cpu,
+ tr, cpu, &tracing_stats_fops);
- trace_create_file("buffer_size_kb", 0444, d_cpu,
- (void *)&data->trace_cpu, &tracing_entries_fops);
+ trace_create_cpu_file("buffer_size_kb", 0444, d_cpu,
+ tr, cpu, &tracing_entries_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", 0644, d_cpu,
- (void *)&data->trace_cpu, &snapshot_fops);
+ trace_create_cpu_file("snapshot", 0644, d_cpu,
+ tr, cpu, &snapshot_fops);
- trace_create_file("snapshot_raw", 0444, d_cpu,
- (void *)&data->trace_cpu, &snapshot_raw_fops);
+ trace_create_cpu_file("snapshot_raw", 0444, d_cpu,
+ tr, cpu, &snapshot_raw_fops);
#endif
}
@@ -5612,15 +5812,10 @@ rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer *buffer = tr->trace_buffer.buffer;
char buf[64];
int r;
- if (buffer)
- r = ring_buffer_record_is_on(buffer);
- else
- r = 0;
-
+ r = tracer_tracing_is_on(tr);
r = sprintf(buf, "%d\n", r);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -5642,11 +5837,11 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
if (buffer) {
mutex_lock(&trace_types_lock);
if (val) {
- ring_buffer_record_on(buffer);
+ tracer_tracing_on(tr);
if (tr->current_trace->start)
tr->current_trace->start(tr);
} else {
- ring_buffer_record_off(buffer);
+ tracer_tracing_off(tr);
if (tr->current_trace->stop)
tr->current_trace->stop(tr);
}
@@ -5659,9 +5854,10 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
}
static const struct file_operations rb_simple_fops = {
- .open = tracing_open_generic,
+ .open = tracing_open_generic_tr,
.read = rb_simple_read,
.write = rb_simple_write,
+ .release = tracing_release_generic_tr,
.llseek = default_llseek,
};
@@ -5688,6 +5884,8 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+ buf->tr = tr;
+
buf->buffer = ring_buffer_alloc(size, rb_flags);
if (!buf->buffer)
return -ENOMEM;
@@ -5775,8 +5973,10 @@ static int new_instance_create(const char *name)
goto out_free_tr;
ret = event_trace_add_tracer(tr->dir, tr);
- if (ret)
+ if (ret) {
+ debugfs_remove_recursive(tr->dir);
goto out_free_tr;
+ }
init_tracer_debugfs(tr, tr->dir);
@@ -5922,13 +6122,13 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
tr, &tracing_iter_fops);
trace_create_file("trace", 0644, d_tracer,
- (void *)&tr->trace_cpu, &tracing_fops);
+ tr, &tracing_fops);
trace_create_file("trace_pipe", 0444, d_tracer,
- (void *)&tr->trace_cpu, &tracing_pipe_fops);
+ tr, &tracing_pipe_fops);
trace_create_file("buffer_size_kb", 0644, d_tracer,
- (void *)&tr->trace_cpu, &tracing_entries_fops);
+ tr, &tracing_entries_fops);
trace_create_file("buffer_total_size_kb", 0444, d_tracer,
tr, &tracing_total_entries_fops);
@@ -5943,11 +6143,11 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
&trace_clock_fops);
trace_create_file("tracing_on", 0644, d_tracer,
- tr, &rb_simple_fops);
+ tr, &rb_simple_fops);
#ifdef CONFIG_TRACER_SNAPSHOT
trace_create_file("snapshot", 0644, d_tracer,
- (void *)&tr->trace_cpu, &snapshot_fops);
+ tr, &snapshot_fops);
#endif
for_each_tracing_cpu(cpu)
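
Editor's note on the trace.c hunks above: the per-cpu `struct trace_cpu` cookies are gone; the debugfs inodes now carry the `trace_array` in `inode->i_private`, and `trace_create_cpu_file()` tucks the CPU number into `d_inode->i_cdev` as `cpu + 1`, so a NULL cookie can keep meaning "all CPUs". A minimal userspace sketch of that encode/decode convention, assuming nothing beyond standard C (the helper names are illustrative, not the kernel's):

#include <stdio.h>

#define RING_BUFFER_ALL_CPUS (-1)

/* Encode as cpu + 1 so that a NULL cookie still means "all CPUs". */
static void *encode_cpu(long cpu)
{
	return (void *)(cpu + 1);
}

/* Matching decode, the shape tracing_get_cpu() is expected to have. */
static long decode_cpu(const void *cookie)
{
	return cookie ? (long)cookie - 1 : RING_BUFFER_ALL_CPUS;
}

int main(void)
{
	printf("cpu 3  -> %ld\n", decode_cpu(encode_cpu(3)));	/* 3 */
	printf("no cpu -> %ld\n", decode_cpu(NULL));		/* -1 */
	return 0;
}
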
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 20572ed88c5..51b44483eb7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -224,6 +224,11 @@ enum {
extern struct list_head ftrace_trace_arrays;
+extern struct mutex trace_types_lock;
+
+extern int trace_array_get(struct trace_array *tr);
+extern void trace_array_put(struct trace_array *tr);
+
/*
* The global tracer (top) should be the first trace array added,
* but we check the flag anyway.
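
Editor's note on the trace.h hunk: exporting trace_array_get()/trace_array_put() (and trace_types_lock) lets every debugfs ->open that pins a trace_array be paired with a ->release that drops the pin. A hedged kernel-style sketch of that pairing, for illustration only; the real file_operations in trace.c use tracing_open_generic_tr() and tracing_release_generic_tr():

/* Sketch only: an open that succeeds after trace_array_get() must be
 * paired with a release that puts the reference, so instance teardown
 * cannot race with a file that is still open. */
static int example_open(struct inode *inode, struct file *filp)
{
	struct trace_array *tr = inode->i_private;

	if (trace_array_get(tr) < 0)
		return -ENODEV;		/* instance is going away */

	filp->private_data = tr;
	return 0;
}

static int example_release(struct inode *inode, struct file *filp)
{
	trace_array_put(inode->i_private);
	return 0;
}
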
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 84b1e045fab..8354dc81ae6 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
{
/* The ftrace function trace is allowed only for root. */
if (ftrace_event_is_function(tp_event) &&
- perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
return -EPERM;
/* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 27963e2bf4b..001b349af93 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -27,12 +27,6 @@
DEFINE_MUTEX(event_mutex);
-DEFINE_MUTEX(event_storage_mutex);
-EXPORT_SYMBOL_GPL(event_storage_mutex);
-
-char event_storage[EVENT_STORAGE_SIZE];
-EXPORT_SYMBOL_GPL(event_storage);
-
LIST_HEAD(ftrace_events);
static LIST_HEAD(ftrace_common_fields);
@@ -41,6 +35,23 @@ static LIST_HEAD(ftrace_common_fields);
static struct kmem_cache *field_cachep;
static struct kmem_cache *file_cachep;
+#define SYSTEM_FL_FREE_NAME (1 << 31)
+
+static inline int system_refcount(struct event_subsystem *system)
+{
+ return system->ref_count & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_inc(struct event_subsystem *system)
+{
+ return (system->ref_count++) & ~SYSTEM_FL_FREE_NAME;
+}
+
+static int system_refcount_dec(struct event_subsystem *system)
+{
+ return (--system->ref_count) & ~SYSTEM_FL_FREE_NAME;
+}
+
/* Double loops, do not use break, only goto's work */
#define do_for_each_event_file(tr, file) \
list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
@@ -97,7 +108,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
field = kmem_cache_alloc(field_cachep, GFP_TRACE);
if (!field)
- goto err;
+ return -ENOMEM;
field->name = name;
field->type = type;
@@ -114,11 +125,6 @@ static int __trace_define_field(struct list_head *head, const char *type,
list_add(&field->link, head);
return 0;
-
-err:
- kmem_cache_free(field_cachep, field);
-
- return -ENOMEM;
}
int trace_define_field(struct ftrace_event_call *call, const char *type,
@@ -349,8 +355,8 @@ static void __put_system(struct event_subsystem *system)
{
struct event_filter *filter = system->filter;
- WARN_ON_ONCE(system->ref_count == 0);
- if (--system->ref_count)
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ if (system_refcount_dec(system))
return;
list_del(&system->list);
@@ -359,13 +365,15 @@ static void __put_system(struct event_subsystem *system)
kfree(filter->filter_string);
kfree(filter);
}
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
kfree(system);
}
static void __get_system(struct event_subsystem *system)
{
- WARN_ON_ONCE(system->ref_count == 0);
- system->ref_count++;
+ WARN_ON_ONCE(system_refcount(system) == 0);
+ system_refcount_inc(system);
}
static void __get_system_dir(struct ftrace_subsystem_dir *dir)
@@ -379,7 +387,7 @@ static void __put_system_dir(struct ftrace_subsystem_dir *dir)
{
WARN_ON_ONCE(dir->ref_count == 0);
/* If the subsystem is about to be freed, the dir must be too */
- WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
+ WARN_ON_ONCE(system_refcount(dir->subsystem) == 1 && dir->ref_count != 1);
__put_system(dir->subsystem);
if (!--dir->ref_count)
@@ -393,17 +401,55 @@ static void put_system(struct ftrace_subsystem_dir *dir)
mutex_unlock(&event_mutex);
}
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+ if (!dir)
+ return;
+
+ if (!--dir->nr_events) {
+ debugfs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
+
+static void *event_file_data(struct file *filp)
+{
+ return ACCESS_ONCE(file_inode(filp)->i_private);
+}
+
+static void remove_event_file_dir(struct ftrace_event_file *file)
+{
+ struct dentry *dir = file->dir;
+ struct dentry *child;
+
+ if (dir) {
+ spin_lock(&dir->d_lock); /* probably unneeded */
+ list_for_each_entry(child, &dir->d_subdirs, d_u.d_child) {
+ if (child->d_inode) /* probably unneeded */
+ child->d_inode->i_private = NULL;
+ }
+ spin_unlock(&dir->d_lock);
+
+ debugfs_remove_recursive(dir);
+ }
+
+ list_del(&file->list);
+ remove_subsystem(file->system);
+ kmem_cache_free(file_cachep, file);
+}
+
/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
-static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
- const char *sub, const char *event, int set)
+static int
+__ftrace_set_clr_event_nolock(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
{
struct ftrace_event_file *file;
struct ftrace_event_call *call;
int ret = -EINVAL;
- mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
@@ -429,6 +475,17 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
ret = 0;
}
+
+ return ret;
+}
+
+static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+ ret = __ftrace_set_clr_event_nolock(tr, match, sub, event, set);
mutex_unlock(&event_mutex);
return ret;
@@ -623,13 +680,23 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
+ unsigned long flags;
char *buf;
- if (file->flags & FTRACE_EVENT_FL_ENABLED) {
- if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
+ mutex_lock(&event_mutex);
+ file = event_file_data(filp);
+ if (likely(file))
+ flags = file->flags;
+ mutex_unlock(&event_mutex);
+
+ if (!file)
+ return -ENODEV;
+
+ if (flags & FTRACE_EVENT_FL_ENABLED) {
+ if (flags & FTRACE_EVENT_FL_SOFT_DISABLED)
buf = "0*\n";
- else if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ else if (flags & FTRACE_EVENT_FL_SOFT_MODE)
buf = "1*\n";
else
buf = "1\n";
@@ -643,13 +710,10 @@ static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_file *file = filp->private_data;
+ struct ftrace_event_file *file;
unsigned long val;
int ret;
- if (!file)
- return -EINVAL;
-
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -661,8 +725,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
switch (val) {
case 0:
case 1:
+ ret = -ENODEV;
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(file, val);
+ file = event_file_data(filp);
+ if (likely(file))
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -769,7 +836,7 @@ enum {
static void *f_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
struct list_head *common_head = &ftrace_common_fields;
struct list_head *head = trace_get_fields(call);
@@ -813,6 +880,11 @@ static void *f_start(struct seq_file *m, loff_t *pos)
loff_t l = 0;
void *p;
+ /* ->stop() is called even if ->start() fails */
+ mutex_lock(&event_mutex);
+ if (!event_file_data(m->private))
+ return ERR_PTR(-ENODEV);
+
/* Start by showing the header */
if (!*pos)
return (void *)FORMAT_HEADER;
@@ -827,7 +899,7 @@ static void *f_start(struct seq_file *m, loff_t *pos)
static int f_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = m->private;
+ struct ftrace_event_call *call = event_file_data(m->private);
struct ftrace_event_field *field;
const char *array_descriptor;
@@ -878,6 +950,7 @@ static int f_show(struct seq_file *m, void *v)
static void f_stop(struct seq_file *m, void *p)
{
+ mutex_unlock(&event_mutex);
}
static const struct seq_operations trace_format_seq_ops = {
@@ -889,7 +962,6 @@ static const struct seq_operations trace_format_seq_ops = {
static int trace_format_open(struct inode *inode, struct file *file)
{
- struct ftrace_event_call *call = inode->i_private;
struct seq_file *m;
int ret;
@@ -898,7 +970,7 @@ static int trace_format_open(struct inode *inode, struct file *file)
return ret;
m = file->private_data;
- m->private = call;
+ m->private = file;
return 0;
}
@@ -906,19 +978,22 @@ static int trace_format_open(struct inode *inode, struct file *file)
static ssize_t
event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ int id = (long)event_file_data(filp);
struct trace_seq *s;
int r;
if (*ppos)
return 0;
+ if (unlikely(!id))
+ return -ENODEV;
+
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
trace_seq_init(s);
- trace_seq_printf(s, "%d\n", call->event.type);
+ trace_seq_printf(s, "%d\n", id);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, s->len);
@@ -930,21 +1005,28 @@ static ssize_t
event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
struct trace_seq *s;
- int r;
+ int r = -ENODEV;
if (*ppos)
return 0;
s = kmalloc(sizeof(*s), GFP_KERNEL);
+
if (!s)
return -ENOMEM;
trace_seq_init(s);
- print_event_filter(call, s);
- r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (call)
+ print_event_filter(call, s);
+ mutex_unlock(&event_mutex);
+
+ if (call)
+ r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
kfree(s);
@@ -955,9 +1037,9 @@ static ssize_t
event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_call *call;
char *buf;
- int err;
+ int err = -ENODEV;
if (cnt >= PAGE_SIZE)
return -EINVAL;
@@ -972,7 +1054,12 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_event_filter(call, buf);
+ mutex_lock(&event_mutex);
+ call = event_file_data(filp);
+ if (call)
+ err = apply_event_filter(call, buf);
+ mutex_unlock(&event_mutex);
+
free_page((unsigned long) buf);
if (err < 0)
return err;
@@ -992,6 +1079,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
int ret;
/* Make sure the system still exists */
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
list_for_each_entry(tr, &ftrace_trace_arrays, list) {
list_for_each_entry(dir, &tr->systems, list) {
@@ -1007,6 +1095,7 @@ static int subsystem_open(struct inode *inode, struct file *filp)
}
exit_loop:
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
if (!system)
return -ENODEV;
@@ -1014,9 +1103,17 @@ static int subsystem_open(struct inode *inode, struct file *filp)
/* Some versions of gcc think dir can be uninitialized here */
WARN_ON(!dir);
+ /* Still need to increment the ref count of the system */
+ if (trace_array_get(tr) < 0) {
+ put_system(dir);
+ return -ENODEV;
+ }
+
ret = tracing_open_generic(inode, filp);
- if (ret < 0)
+ if (ret < 0) {
+ trace_array_put(tr);
put_system(dir);
+ }
return ret;
}
@@ -1027,16 +1124,23 @@ static int system_tr_open(struct inode *inode, struct file *filp)
struct trace_array *tr = inode->i_private;
int ret;
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
+
/* Make a temporary dir that has no system but points to tr */
dir = kzalloc(sizeof(*dir), GFP_KERNEL);
- if (!dir)
+ if (!dir) {
+ trace_array_put(tr);
return -ENOMEM;
+ }
dir->tr = tr;
ret = tracing_open_generic(inode, filp);
- if (ret < 0)
+ if (ret < 0) {
+ trace_array_put(tr);
kfree(dir);
+ }
filp->private_data = dir;
@@ -1047,6 +1151,8 @@ static int subsystem_release(struct inode *inode, struct file *file)
{
struct ftrace_subsystem_dir *dir = file->private_data;
+ trace_array_put(dir->tr);
+
/*
* If dir->subsystem is NULL, then this is a temporary
* descriptor that was made for a trace_array to enable
@@ -1143,6 +1249,7 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
static int ftrace_event_avail_open(struct inode *inode, struct file *file);
static int ftrace_event_set_open(struct inode *inode, struct file *file);
+static int ftrace_event_release(struct inode *inode, struct file *file);
static const struct seq_operations show_event_seq_ops = {
.start = t_start,
@@ -1170,7 +1277,7 @@ static const struct file_operations ftrace_set_event_fops = {
.read = seq_read,
.write = ftrace_event_write,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = ftrace_event_release,
};
static const struct file_operations ftrace_enable_fops = {
@@ -1188,7 +1295,6 @@ static const struct file_operations ftrace_event_format_fops = {
};
static const struct file_operations ftrace_event_id_fops = {
- .open = tracing_open_generic,
.read = event_id_read,
.llseek = default_llseek,
};
@@ -1247,6 +1353,15 @@ ftrace_event_open(struct inode *inode, struct file *file,
return ret;
}
+static int ftrace_event_release(struct inode *inode, struct file *file)
+{
+ struct trace_array *tr = inode->i_private;
+
+ trace_array_put(tr);
+
+ return seq_release(inode, file);
+}
+
static int
ftrace_event_avail_open(struct inode *inode, struct file *file)
{
@@ -1260,12 +1375,19 @@ ftrace_event_set_open(struct inode *inode, struct file *file)
{
const struct seq_operations *seq_ops = &show_set_event_seq_ops;
struct trace_array *tr = inode->i_private;
+ int ret;
+
+ if (trace_array_get(tr) < 0)
+ return -ENODEV;
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
ftrace_clear_events(tr);
- return ftrace_event_open(inode, file, seq_ops);
+ ret = ftrace_event_open(inode, file, seq_ops);
+ if (ret < 0)
+ trace_array_put(tr);
+ return ret;
}
static struct event_subsystem *
@@ -1279,7 +1401,15 @@ create_new_subsystem(const char *name)
return NULL;
system->ref_count = 1;
- system->name = name;
+
+ /* Only allocate if dynamic (kprobes and modules) */
+ if (!core_kernel_data((unsigned long)name)) {
+ system->ref_count |= SYSTEM_FL_FREE_NAME;
+ system->name = kstrdup(name, GFP_KERNEL);
+ if (!system->name)
+ goto out_free;
+ } else
+ system->name = name;
system->filter = NULL;
@@ -1292,6 +1422,8 @@ create_new_subsystem(const char *name)
return system;
out_free:
+ if (system->ref_count & SYSTEM_FL_FREE_NAME)
+ kfree(system->name);
kfree(system);
return NULL;
}
@@ -1410,8 +1542,8 @@ event_create_dir(struct dentry *parent,
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, file->dir, call,
- id);
+ trace_create_file("id", 0444, file->dir,
+ (void *)(long)call->event.type, id);
#endif
/*
@@ -1436,33 +1568,16 @@ event_create_dir(struct dentry *parent,
return 0;
}
-static void remove_subsystem(struct ftrace_subsystem_dir *dir)
-{
- if (!dir)
- return;
-
- if (!--dir->nr_events) {
- debugfs_remove_recursive(dir->entry);
- list_del(&dir->list);
- __put_system_dir(dir);
- }
-}
-
static void remove_event_from_tracers(struct ftrace_event_call *call)
{
struct ftrace_event_file *file;
struct trace_array *tr;
do_for_each_event_file_safe(tr, file) {
-
if (file->event_call != call)
continue;
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
-
+ remove_event_file_dir(file);
/*
* The do_for_each_event_file_safe() is
* a double loop. After finding the call for this
@@ -1591,6 +1706,7 @@ static void __add_event_to_tracers(struct ftrace_event_call *call,
int trace_add_event_call(struct ftrace_event_call *call)
{
int ret;
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
ret = __register_event(call, NULL);
@@ -1598,11 +1714,13 @@ int trace_add_event_call(struct ftrace_event_call *call)
__add_event_to_tracers(call, NULL);
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
return ret;
}
/*
- * Must be called under locking both of event_mutex and trace_event_sem.
+ * Must be called under locking of trace_types_lock, event_mutex and
+ * trace_event_sem.
*/
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
@@ -1611,14 +1729,47 @@ static void __trace_remove_event_call(struct ftrace_event_call *call)
destroy_preds(call);
}
+static int probe_remove_event_call(struct ftrace_event_call *call)
+{
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+#ifdef CONFIG_PERF_EVENTS
+ if (call->perf_refcount)
+ return -EBUSY;
+#endif
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ /*
+ * We can't rely on ftrace_event_enable_disable(enable => 0)
+ * we are going to do, FTRACE_EVENT_FL_SOFT_MODE can suppress
+ * TRACE_REG_UNREGISTER.
+ */
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return -EBUSY;
+ break;
+ } while_for_each_event_file();
+
+ __trace_remove_event_call(call);
+
+ return 0;
+}
+
/* Remove an event_call */
-void trace_remove_event_call(struct ftrace_event_call *call)
+int trace_remove_event_call(struct ftrace_event_call *call)
{
+ int ret;
+
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
down_write(&trace_event_sem);
- __trace_remove_event_call(call);
+ ret = probe_remove_event_call(call);
up_write(&trace_event_sem);
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
}
#define for_each_event(event, start, end) \
@@ -1703,6 +1854,16 @@ static void trace_module_add_events(struct module *mod)
struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call **call, **start, **end;
+ if (!mod->num_trace_events)
+ return;
+
+ /* Don't add infrastructure for mods without tracepoints */
+ if (trace_module_has_bad_taint(mod)) {
+ pr_err("%s: module has bad taint, not creating trace events\n",
+ mod->name);
+ return;
+ }
+
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
@@ -1762,6 +1923,7 @@ static int trace_module_notify(struct notifier_block *self,
{
struct module *mod = data;
+ mutex_lock(&trace_types_lock);
mutex_lock(&event_mutex);
switch (val) {
case MODULE_STATE_COMING:
@@ -1772,6 +1934,7 @@ static int trace_module_notify(struct notifier_block *self,
break;
}
mutex_unlock(&event_mutex);
+ mutex_unlock(&trace_types_lock);
return 0;
}
@@ -2188,12 +2351,8 @@ __trace_remove_event_dirs(struct trace_array *tr)
{
struct ftrace_event_file *file, *next;
- list_for_each_entry_safe(file, next, &tr->events, list) {
- list_del(&file->list);
- debugfs_remove_recursive(file->dir);
- remove_subsystem(file->system);
- kmem_cache_free(file_cachep, file);
- }
+ list_for_each_entry_safe(file, next, &tr->events, list)
+ remove_event_file_dir(file);
}
static void
@@ -2329,11 +2488,11 @@ early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
int event_trace_del_tracer(struct trace_array *tr)
{
- /* Disable any running events */
- __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
-
mutex_lock(&event_mutex);
+ /* Disable any running events */
+ __ftrace_set_clr_event_nolock(tr, NULL, NULL, NULL, 0);
+
down_write(&trace_event_sem);
__trace_remove_event_dirs(tr);
debugfs_remove_recursive(tr->event_dir);
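
Editor's note on the trace_events.c hunks: SYSTEM_FL_FREE_NAME folds a "the name was kstrdup()ed, free it on the last put" flag into the top bit of `event_subsystem::ref_count`, which is why all reads of the count now go through system_refcount*(). A small self-contained C sketch of the same trick (the names and main() are illustrative, not kernel code):

#include <stdio.h>

#define FL_FREE_NAME	(1u << 31)	/* high bit doubles as a flag */

static unsigned int count_of(unsigned int rc)
{
	return rc & ~FL_FREE_NAME;	/* mask the flag out of the count */
}

int main(void)
{
	unsigned int rc = 1u | FL_FREE_NAME;	/* one user, dynamic name */

	rc++;					/* "get": flag bit is preserved */
	printf("count=%u, free_name=%u\n", count_of(rc), !!(rc & FL_FREE_NAME));

	rc--;					/* two "put"s ... */
	rc--;
	if (count_of(rc) == 0 && (rc & FL_FREE_NAME))
		printf("last put: would free the name as well\n");
	return 0;
}
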
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e1b653f7e1c..0a1edc694d6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -631,17 +631,15 @@ static void append_filter_err(struct filter_parse_state *ps,
free_page((unsigned long) buf);
}
+/* caller must hold event_mutex */
void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter;
+ struct event_filter *filter = call->filter;
- mutex_lock(&event_mutex);
- filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
trace_seq_printf(s, "none\n");
- mutex_unlock(&event_mutex);
}
void print_subsystem_event_filter(struct event_subsystem *system,
@@ -1835,23 +1833,22 @@ static int create_system_filter(struct event_subsystem *system,
return err;
}
+/* caller must hold event_mutex */
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
struct event_filter *filter;
- int err = 0;
-
- mutex_lock(&event_mutex);
+ int err;
if (!strcmp(strstrip(filter_string), "0")) {
filter_disable(call);
filter = call->filter;
if (!filter)
- goto out_unlock;
+ return 0;
RCU_INIT_POINTER(call->filter, NULL);
/* Make sure the filter is not being used */
synchronize_sched();
__free_filter(filter);
- goto out_unlock;
+ return 0;
}
err = create_filter(call, filter_string, true, &filter);
@@ -1878,8 +1875,6 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
__free_filter(tmp);
}
}
-out_unlock:
- mutex_unlock(&event_mutex);
return err;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index d21a7467008..d7d0b50b1b7 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef __array
#define __array(type, item, len) \
do { \
+ char *type_str = #type"["__stringify(len)"]"; \
BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
- mutex_lock(&event_storage_mutex); \
- snprintf(event_storage, sizeof(event_storage), \
- "%s[%d]", #type, len); \
- ret = trace_define_field(event_call, event_storage, #item, \
+ ret = trace_define_field(event_call, type_str, #item, \
offsetof(typeof(field), item), \
sizeof(field.item), \
is_signed_type(type), filter_type); \
- mutex_unlock(&event_storage_mutex); \
if (ret) \
return ret; \
} while (0);
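
Editor's note on the trace_export.c hunk: dropping event_storage and event_storage_mutex works because the "type[len]" string can be assembled by the preprocessor instead of being snprintf()ed into a shared buffer under a lock. A tiny standalone illustration of that stringify pattern (macro names here are local to the sketch):

#include <stdio.h>

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)

/* Expands to the string literal "type[len]" at compile time. */
#define ARRAY_TYPE_STR(type, len)	#type "[" __stringify(len) "]"

int main(void)
{
	puts(ARRAY_TYPE_STR(char, 16));			/* char[16] */
	puts(ARRAY_TYPE_STR(unsigned long, 4));		/* unsigned long[4] */
	return 0;
}
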
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index b19d065a28c..2aefbee93a6 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -373,7 +373,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
struct trace_array_cpu *data;
unsigned long flags;
- if (likely(!tracer_enabled))
+ if (!tracer_enabled || !tracing_is_enabled())
return;
cpu = raw_smp_processor_id();
@@ -416,7 +416,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
else
return;
- if (!tracer_enabled)
+ if (!tracer_enabled || !tracing_is_enabled())
return;
data = per_cpu_ptr(tr->trace_buffer.data, cpu);
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 9f46e98ba8f..64abc8ca928 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -90,7 +90,7 @@ static __kprobes bool trace_probe_is_on_module(struct trace_probe *tp)
}
static int register_probe_event(struct trace_probe *tp);
-static void unregister_probe_event(struct trace_probe *tp);
+static int unregister_probe_event(struct trace_probe *tp);
static DEFINE_MUTEX(probe_lock);
static LIST_HEAD(probe_list);
@@ -281,6 +281,8 @@ trace_probe_file_index(struct trace_probe *tp, struct ftrace_event_file *file)
static int
disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
{
+ struct ftrace_event_file **old = NULL;
+ int wait = 0;
int ret = 0;
mutex_lock(&probe_enable_lock);
@@ -314,10 +316,7 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
}
rcu_assign_pointer(tp->files, new);
-
- /* Make sure the probe is done with old files */
- synchronize_sched();
- kfree(old);
+ wait = 1;
} else
tp->flags &= ~TP_FLAG_PROFILE;
@@ -326,11 +325,25 @@ disable_trace_probe(struct trace_probe *tp, struct ftrace_event_file *file)
disable_kretprobe(&tp->rp);
else
disable_kprobe(&tp->rp.kp);
+ wait = 1;
}
out_unlock:
mutex_unlock(&probe_enable_lock);
+ if (wait) {
+ /*
+ * Synchronize with kprobe_trace_func/kretprobe_trace_func
+ * to ensure disabled (all running handlers are finished).
+ * This is not only for kfree(), but also the caller,
+ * trace_remove_event_call() supposes it for releasing
+ * event_call related objects, which will be accessed in
+ * the kprobe_trace_func/kretprobe_trace_func.
+ */
+ synchronize_sched();
+ kfree(old); /* Ignored if link == NULL */
+ }
+
return ret;
}
@@ -398,9 +411,12 @@ static int unregister_trace_probe(struct trace_probe *tp)
if (trace_probe_is_enabled(tp))
return -EBUSY;
+ /* Will fail if probe is being used by ftrace or perf */
+ if (unregister_probe_event(tp))
+ return -EBUSY;
+
__unregister_trace_probe(tp);
list_del(&tp->list);
- unregister_probe_event(tp);
return 0;
}
@@ -679,7 +695,9 @@ static int release_all_trace_probes(void)
/* TODO: Use batch unregistration */
while (!list_empty(&probe_list)) {
tp = list_entry(probe_list.next, struct trace_probe, list);
- unregister_trace_probe(tp);
+ ret = unregister_trace_probe(tp);
+ if (ret)
+ goto end;
free_trace_probe(tp);
}
@@ -1312,11 +1330,15 @@ static int register_probe_event(struct trace_probe *tp)
return ret;
}
-static void unregister_probe_event(struct trace_probe *tp)
+static int unregister_probe_event(struct trace_probe *tp)
{
+ int ret;
+
/* tp->event is unregistered in trace_remove_event_call() */
- trace_remove_event_call(&tp->call);
- kfree(tp->call.print_fmt);
+ ret = trace_remove_event_call(&tp->call);
+ if (!ret)
+ kfree(tp->call.print_fmt);
+ return ret;
}
/* Make a debugfs interface for controlling probe points */
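
Editor's note on the trace_kprobe.c disable path: it now only records that something must be waited for and freed, and performs the synchronize_sched() plus kfree() after probe_enable_lock is dropped, keeping the RCU grace period out of the critical section. A hedged kernel-style sketch of that shape (the struct names and the unlink helper are hypothetical):

/* Illustrative only: decide what to retire under the lock, but wait for
 * in-flight readers and free it after the lock is released. */
static void retire_entry(struct my_table *t, struct my_entry *e)
{
	struct my_entry *old;

	mutex_lock(&t->lock);
	old = my_table_unlink(t, e);	/* hypothetical helper, may return NULL */
	mutex_unlock(&t->lock);

	if (old) {
		synchronize_sched();	/* all running handlers have finished */
		kfree(old);
	}
}
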
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 8f2ac73c7a5..322e1646107 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -306,6 +306,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
+ unsigned long irq_flags;
+ int pc;
int syscall_nr;
int size;
@@ -321,9 +323,12 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
- sys_data->enter_event->event.type, size, 0, 0);
+ sys_data->enter_event->event.type, size, irq_flags, pc);
if (!event)
return;
@@ -333,7 +338,8 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
if (!filter_current_check_discard(buffer, sys_data->enter_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
@@ -343,6 +349,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
struct ring_buffer *buffer;
+ unsigned long irq_flags;
+ int pc;
int syscall_nr;
syscall_nr = trace_get_syscall_nr(current, regs);
@@ -355,9 +363,13 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer,
- sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry),
+ irq_flags, pc);
if (!event)
return;
@@ -367,7 +379,8 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
if (!filter_current_check_discard(buffer, sys_data->exit_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
static int reg_event_syscall_enter(struct ftrace_event_file *file,
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index 32494fb0ee6..6fd72b76852 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -70,7 +70,7 @@ struct trace_uprobe {
(sizeof(struct probe_arg) * (n)))
static int register_uprobe_event(struct trace_uprobe *tu);
-static void unregister_uprobe_event(struct trace_uprobe *tu);
+static int unregister_uprobe_event(struct trace_uprobe *tu);
static DEFINE_MUTEX(uprobe_lock);
static LIST_HEAD(uprobe_list);
@@ -164,11 +164,17 @@ static struct trace_uprobe *find_probe_event(const char *event, const char *grou
}
/* Unregister a trace_uprobe and probe_event: call with locking uprobe_lock */
-static void unregister_trace_uprobe(struct trace_uprobe *tu)
+static int unregister_trace_uprobe(struct trace_uprobe *tu)
{
+ int ret;
+
+ ret = unregister_uprobe_event(tu);
+ if (ret)
+ return ret;
+
list_del(&tu->list);
- unregister_uprobe_event(tu);
free_trace_uprobe(tu);
+ return 0;
}
/* Register a trace_uprobe and probe_event */
@@ -181,9 +187,12 @@ static int register_trace_uprobe(struct trace_uprobe *tu)
/* register as an event */
old_tp = find_probe_event(tu->call.name, tu->call.class->system);
- if (old_tp)
+ if (old_tp) {
/* delete old event */
- unregister_trace_uprobe(old_tp);
+ ret = unregister_trace_uprobe(old_tp);
+ if (ret)
+ goto end;
+ }
ret = register_uprobe_event(tu);
if (ret) {
@@ -256,6 +265,8 @@ static int create_trace_uprobe(int argc, char **argv)
group = UPROBE_EVENT_SYSTEM;
if (is_delete) {
+ int ret;
+
if (!event) {
pr_info("Delete command needs an event name.\n");
return -EINVAL;
@@ -269,9 +280,9 @@ static int create_trace_uprobe(int argc, char **argv)
return -ENOENT;
}
/* delete an event */
- unregister_trace_uprobe(tu);
+ ret = unregister_trace_uprobe(tu);
mutex_unlock(&uprobe_lock);
- return 0;
+ return ret;
}
if (argc < 2) {
@@ -283,8 +294,10 @@ static int create_trace_uprobe(int argc, char **argv)
return -EINVAL;
}
arg = strchr(argv[1], ':');
- if (!arg)
+ if (!arg) {
+ ret = -EINVAL;
goto fail_address_parse;
+ }
*arg++ = '\0';
filename = argv[1];
@@ -406,16 +419,20 @@ fail_address_parse:
return ret;
}
-static void cleanup_all_probes(void)
+static int cleanup_all_probes(void)
{
struct trace_uprobe *tu;
+ int ret = 0;
mutex_lock(&uprobe_lock);
while (!list_empty(&uprobe_list)) {
tu = list_entry(uprobe_list.next, struct trace_uprobe, list);
- unregister_trace_uprobe(tu);
+ ret = unregister_trace_uprobe(tu);
+ if (ret)
+ break;
}
mutex_unlock(&uprobe_lock);
+ return ret;
}
/* Probes listing interfaces */
@@ -460,8 +477,13 @@ static const struct seq_operations probes_seq_op = {
static int probes_open(struct inode *inode, struct file *file)
{
- if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC))
- cleanup_all_probes();
+ int ret;
+
+ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
+ ret = cleanup_all_probes();
+ if (ret)
+ return ret;
+ }
return seq_open(file, &probes_seq_op);
}
@@ -968,12 +990,17 @@ static int register_uprobe_event(struct trace_uprobe *tu)
return ret;
}
-static void unregister_uprobe_event(struct trace_uprobe *tu)
+static int unregister_uprobe_event(struct trace_uprobe *tu)
{
+ int ret;
+
/* tu->event is unregistered in trace_remove_event_call() */
- trace_remove_event_call(&tu->call);
+ ret = trace_remove_event_call(&tu->call);
+ if (ret)
+ return ret;
kfree(tu->call.print_fmt);
tu->call.print_fmt = NULL;
+ return 0;
}
/* Make a trace interface for controling probe points */
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 29f26540e9c..031cc5655a5 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter)
EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
#ifdef CONFIG_MODULES
+bool trace_module_has_bad_taint(struct module *mod)
+{
+ return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP));
+}
+
static int tracepoint_module_coming(struct module *mod)
{
struct tp_module *tp_mod, *iter;
@@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod)
* module headers (for forced load), to make sure we don't cause a crash.
* Staging and out-of-tree GPL modules are fine.
*/
- if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)))
+ if (trace_module_has_bad_taint(mod))
return 0;
mutex_lock(&tracepoints_mutex);
tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
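
Editor's note on the tracepoint.c hunk: trace_module_has_bad_taint() simply asks whether a module carries any taint bit other than out-of-tree (TAINT_OOT_MODULE) or staging (TAINT_CRAP), and both tracepoint_module_coming() and the new trace_events.c check reuse it. A standalone sketch of the mask test with placeholder bit positions (the real values live in the kernel headers):

#include <stdbool.h>
#include <stdio.h>

/* Bit positions below are placeholders for this sketch only. */
enum { TAINT_FORCED_MODULE = 1, TAINT_CRAP = 10, TAINT_OOT_MODULE = 12 };

static bool has_bad_taint(unsigned long taints)
{
	/* Anything besides out-of-tree or staging taint is "bad". */
	return taints & ~((1UL << TAINT_OOT_MODULE) | (1UL << TAINT_CRAP));
}

int main(void)
{
	printf("oot only   -> %d\n", has_bad_taint(1UL << TAINT_OOT_MODULE));	/* 0 */
	printf("forced too -> %d\n", has_bad_taint((1UL << TAINT_OOT_MODULE) |
						   (1UL << TAINT_FORCED_MODULE)));	/* 1 */
	return 0;
}
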
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index d8c30db06c5..9064b919a40 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -62,6 +62,9 @@ int create_user_ns(struct cred *new)
kgid_t group = new->egid;
int ret;
+ if (parent_ns->level > 32)
+ return -EUSERS;
+
/*
* Verify that we can not violate the policy of which files
* may be accessed that is specified by the root directory,
@@ -92,6 +95,7 @@ int create_user_ns(struct cred *new)
atomic_set(&ns->count, 1);
/* Leave the new->user_ns reference with the new user namespace. */
ns->parent = parent_ns;
+ ns->level = parent_ns->level + 1;
ns->owner = owner;
ns->group = group;
@@ -105,16 +109,21 @@ int create_user_ns(struct cred *new)
int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
{
struct cred *cred;
+ int err = -ENOMEM;
if (!(unshare_flags & CLONE_NEWUSER))
return 0;
cred = prepare_creds();
- if (!cred)
- return -ENOMEM;
+ if (cred) {
+ err = create_user_ns(cred);
+ if (err)
+ put_cred(cred);
+ else
+ *new_cred = cred;
+ }
- *new_cred = cred;
- return create_user_ns(cred);
+ return err;
}
void free_user_ns(struct user_namespace *ns)
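
Editor's note on the user_namespace.c hunks: the new ns->level field makes nesting depth explicit; each child is parent->level + 1 and creation is refused with -EUSERS once the parent is already deeper than 32. A compact userspace model of that rule (the types and helper are not meant to mirror the kernel; EUSERS comes from <errno.h> on Linux):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct ns { struct ns *parent; int level; };

static int create_ns(struct ns *parent, struct ns **out)
{
	struct ns *child;

	if (parent && parent->level > 32)
		return -EUSERS;			/* too deeply nested */

	child = calloc(1, sizeof(*child));
	if (!child)
		return -ENOMEM;
	child->parent = parent;
	child->level = parent ? parent->level + 1 : 0;
	*out = child;
	return 0;
}

int main(void)
{
	struct ns *ns = NULL;
	int ret = 0, tries;

	for (tries = 0; tries < 40 && !ret; tries++)
		ret = create_ns(ns, &ns);

	printf("deepest level %d, final ret %d\n", ns ? ns->level : -1, ret);
	return 0;
}
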
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee8e29a2320..6188aafe259 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -272,6 +272,15 @@ static cpumask_var_t *wq_numa_possible_cpumask;
static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+/* see the comment above the definition of WQ_POWER_EFFICIENT */
+#ifdef CONFIG_WQ_POWER_EFFICIENT_DEFAULT
+static bool wq_power_efficient = true;
+#else
+static bool wq_power_efficient;
+#endif
+
+module_param_named(power_efficient, wq_power_efficient, bool, 0444);
+
static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
@@ -295,6 +304,9 @@ static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
/* I: attributes used when instantiating standard unbound pools on demand */
static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+/* I: attributes used when instantiating ordered pools on demand */
+static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -305,6 +317,10 @@ struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
+struct workqueue_struct *system_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_power_efficient_wq);
+struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
+EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
static int worker_thread(void *__worker);
static void copy_workqueue_attrs(struct workqueue_attrs *to,
@@ -1820,6 +1836,12 @@ static void destroy_worker(struct worker *worker)
if (worker->flags & WORKER_IDLE)
pool->nr_idle--;
+ /*
+ * Once WORKER_DIE is set, the kworker may destroy itself at any
+ * point. Pin to ensure the task stays until we're done with it.
+ */
+ get_task_struct(worker->task);
+
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
@@ -1828,6 +1850,7 @@ static void destroy_worker(struct worker *worker)
spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
+ put_task_struct(worker->task);
kfree(worker);
spin_lock_irq(&pool->lock);
@@ -2188,6 +2211,15 @@ __acquires(&pool->lock)
dump_stack();
}
+ /*
+ * The following prevents a kworker from hogging CPU on !PREEMPT
+ * kernels, where a requeueing work item waiting for something to
+ * happen could deadlock with stop_machine as such work item could
+ * indefinitely requeue itself while all other CPUs are trapped in
+ * stop_machine.
+ */
+ cond_resched();
+
spin_lock_irq(&pool->lock);
/* clear cpu intensive status */
@@ -3398,6 +3430,12 @@ static void copy_workqueue_attrs(struct workqueue_attrs *to,
{
to->nice = from->nice;
cpumask_copy(to->cpumask, from->cpumask);
+ /*
+ * Unlike hash and equality test, this function doesn't ignore
+ * ->no_numa as it is used for both pool and wq attrs. Instead,
+ * get_unbound_pool() explicitly clears ->no_numa after copying.
+ */
+ to->no_numa = from->no_numa;
}
/* hash value of the content of @attr */
@@ -3565,6 +3603,12 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
copy_workqueue_attrs(pool->attrs, attrs);
+ /*
+ * no_numa isn't a worker_pool attribute, always clear it. See
+ * 'struct workqueue_attrs' comments for detail.
+ */
+ pool->attrs->no_numa = false;
+
/* if cpumask is contained inside a NUMA node, we belong to that node */
if (wq_numa_enabled) {
for_each_node(node) {
@@ -4038,7 +4082,7 @@ out_unlock:
static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
bool highpri = wq->flags & WQ_HIGHPRI;
- int cpu;
+ int cpu, ret;
if (!(wq->flags & WQ_UNBOUND)) {
wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
@@ -4058,6 +4102,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
mutex_unlock(&wq->mutex);
}
return 0;
+ } else if (wq->flags & __WQ_ORDERED) {
+ ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ /* there should only be single pwq for ordering guarantee */
+ WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
+ wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
+ "ordering guarantee broken for workqueue %s\n", wq->name);
+ return ret;
} else {
return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
@@ -4086,6 +4137,10 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /* see the comment above the definition of WQ_POWER_EFFICIENT */
+ if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
+ flags |= WQ_UNBOUND;
+
/* allocate wq and format name */
if (flags & WQ_UNBOUND)
tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
@@ -4969,13 +5024,23 @@ static int __init init_workqueues(void)
}
}
- /* create default unbound wq attrs */
+ /* create default unbound and ordered wq attrs */
for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
struct workqueue_attrs *attrs;
BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
attrs->nice = std_nice[i];
unbound_std_wq_attrs[i] = attrs;
+
+ /*
+ * An ordered wq should have only one pwq as ordering is
+ * guaranteed by max_active which is enforced by pwqs.
+ * Turn off NUMA so that dfl_pwq is used for all nodes.
+ */
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ attrs->no_numa = true;
+ ordered_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);
@@ -4985,8 +5050,15 @@ static int __init init_workqueues(void)
WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
WQ_FREEZABLE, 0);
+ system_power_efficient_wq = alloc_workqueue("events_power_efficient",
+ WQ_POWER_EFFICIENT, 0);
+ system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
+ WQ_FREEZABLE | WQ_POWER_EFFICIENT,
+ 0);
BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
- !system_unbound_wq || !system_freezable_wq);
+ !system_unbound_wq || !system_freezable_wq ||
+ !system_power_efficient_wq ||
+ !system_freezable_power_efficient_wq);
return 0;
}
early_initcall(init_workqueues);
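
Editor's note on the workqueue.c hunks: a queue flagged WQ_POWER_EFFICIENT is silently promoted to WQ_UNBOUND when CONFIG_WQ_POWER_EFFICIENT_DEFAULT (or the workqueue.power_efficient boot parameter) is set, and two new system workqueues expose the same trade-off. A hedged kernel-style usage sketch for a driver that can tolerate losing per-cpu locality (the queue name and work item are illustrative):

/* Sketch only: opt a driver's deferred work into the power-efficient
 * behaviour.  When power efficiency is disabled this is an ordinary
 * per-cpu workqueue; when enabled it becomes unbound so the scheduler
 * can steer the work to an already-awake CPU. */
static struct workqueue_struct *mydrv_wq;

static int mydrv_init_wq(struct work_struct *first_work)
{
	mydrv_wq = alloc_workqueue("mydrv_events", WQ_POWER_EFFICIENT, 0);
	if (!mydrv_wq)
		return -ENOMEM;

	queue_work(mydrv_wq, first_work);	/* or use system_power_efficient_wq */
	return 0;
}
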