diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup.c | 59 | ||||
-rw-r--r-- | kernel/cpu.c | 24 | ||||
-rw-r--r-- | kernel/debug/kdb/kdb_io.c | 12 | ||||
-rw-r--r-- | kernel/fork.c | 3 | ||||
-rw-r--r-- | kernel/panic.c | 8 | ||||
-rw-r--r-- | kernel/power/Kconfig | 9 | ||||
-rw-r--r-- | kernel/power/Makefile | 2 | ||||
-rw-r--r-- | kernel/power/process.c | 22 | ||||
-rw-r--r-- | kernel/power/suspend.c | 35 | ||||
-rw-r--r-- | kernel/power/wakeup_reason.c | 225 | ||||
-rw-r--r-- | kernel/printk/printk.c | 8 | ||||
-rw-r--r-- | kernel/sched/core.c | 15 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1 | ||||
-rw-r--r-- | kernel/sys.c | 173 | ||||
-rw-r--r-- | kernel/sysctl.c | 31 | ||||
-rw-r--r-- | kernel/trace/Kconfig | 3 | ||||
-rw-r--r-- | kernel/trace/Makefile | 1 | ||||
-rw-r--r-- | kernel/trace/gpu-traces.c | 23 | ||||
-rw-r--r-- | kernel/trace/trace.c | 98 | ||||
-rw-r--r-- | kernel/trace/trace.h | 4 | ||||
-rw-r--r-- | kernel/trace/trace_functions_graph.c | 43 | ||||
-rw-r--r-- | kernel/trace/trace_output.c | 184 | ||||
-rw-r--r-- | kernel/watchdog.c | 123 |
23 files changed, 1050 insertions, 56 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 470f6536b9e8..a57b7c86871c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2663,6 +2663,45 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, return ret; } +int subsys_cgroup_allow_attach(struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct cgroup_subsys_state *css; + + if (capable(CAP_SYS_NICE)) + return 0; + + cgroup_taskset_for_each(task, css, tset) { + tcred = __task_cred(task); + + if (current != task && !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) + return -EACCES; + } + + return 0; +} + +static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct cgroup_subsys_state *css; + int i; + int ret; + + for_each_css(css, i, cgrp) { + if (css->ss->allow_attach) { + ret = css->ss->allow_attach(tset); + if (ret) + return ret; + } else { + return -EACCES; + } + } + + return 0; +} + static int cgroup_procs_write_permission(struct task_struct *task, struct cgroup *dst_cgrp, struct kernfs_open_file *of) @@ -2677,8 +2716,24 @@ static int cgroup_procs_write_permission(struct task_struct *task, */ if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) - ret = -EACCES; + !uid_eq(cred->euid, tcred->suid)) { + /* + * if the default permission check fails, give each + * cgroup a chance to extend the permission check + */ + struct cgroup_taskset tset = { + .src_csets = LIST_HEAD_INIT(tset.src_csets), + .dst_csets = LIST_HEAD_INIT(tset.dst_csets), + .csets = &tset.src_csets, + }; + struct css_set *cset; + cset = task_css_set(task); + list_add(&cset->mg_node, &tset.src_csets); + ret = cgroup_allow_attach(dst_cgrp, &tset); + list_del(&tset.src_csets); + if (ret) + ret = -EACCES; + } if (!ret && cgroup_on_dfl(dst_cgrp)) { struct super_block *sb = of->file->f_path.dentry->d_sb; diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..37731292f8a1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -24,6 +24,8 @@ #include <linux/irq.h> #include <trace/events/power.h> +#include <trace/events/sched.h> + #include "smpboot.h" #ifdef CONFIG_SMP @@ -425,6 +427,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) out_release: cpu_hotplug_done(); + trace_sched_cpu_hotplug(cpu, err, 0); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); return err; @@ -530,6 +533,7 @@ out_notify: __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); out: cpu_hotplug_done(); + trace_sched_cpu_hotplug(cpu, ret, 1); return ret; } @@ -827,3 +831,23 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_online_bits), src); } + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void idle_notifier_call_chain(unsigned long val) +{ + atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(idle_notifier_call_chain); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index fc1ef736253c..0b891286a150 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int i; int diag, dtab_count; int key; - + static int last_crlf; diag = kdbgetintenv("DTABCOUNT", &dtab_count); if (diag) @@ -237,6 +237,9 @@ poll_again: return buffer; if (key != 9) tab = 0; + if (key != 10 && key != 13) + last_crlf = 0; + switch (key) { case 8: /* backspace */ if (cp > buffer) { @@ -254,7 +257,12 @@ poll_again: *cp = tmp; } break; - case 13: /* enter */ + case 10: /* new line */ + case 13: /* carriage return */ + /* handle \n after \r */ + if (last_crlf && last_crlf != key) + break; + last_crlf = key; *lastchar++ = '\n'; *lastchar++ = '\0'; if (!KDB_STATE(KGDB_TRANS)) { diff --git a/kernel/fork.c b/kernel/fork.c index 1155eac61687..7ec6e9939b2c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -800,7 +800,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + !ptrace_may_access(task, mode) && + !capable(CAP_SYS_RESOURCE)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/kernel/panic.c b/kernel/panic.c index 4b150bc0c6c1..f07bfc9fe613 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -28,6 +28,9 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +/* Machine specific panic information string */ +char *mach_panic_string; + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask; static int pause_on_oops; @@ -413,6 +416,11 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); + + if (mach_panic_string) + printk(KERN_WARNING "Board Information: %s\n", + mach_panic_string); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..84c480946fb2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -28,6 +28,15 @@ config SUSPEND_SKIP_SYNC of suspend, or they are content with invoking sync() from user-space before invoking suspend. Say Y if that's your case. +config WAKELOCK + bool "Android's method of preventing suspend" + default y + ---help--- + This allows applications to prevent the CPU from suspending while + they need it. + + Say Y if you are running an android userspace. + config HIBERNATE_CALLBACKS bool diff --git a/kernel/power/Makefile b/kernel/power/Makefile index cb880a14cc39..22eb9ed879ad 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -12,3 +12,5 @@ obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o + +obj-$(CONFIG_SUSPEND) += wakeup_reason.o diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..e7f1f736a5b6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -18,6 +18,7 @@ #include <linux/workqueue.h> #include <linux/kmod.h> #include <trace/events/power.h> +#include <linux/wakeup_reason.h> /* * Timeout for stopping processes @@ -35,6 +36,9 @@ static int try_to_freeze_tasks(bool user_only) unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; +#ifdef CONFIG_PM_SLEEP + char suspend_abort[MAX_SUSPEND_ABORT_LEN]; +#endif do_gettimeofday(&start); @@ -64,6 +68,11 @@ static int try_to_freeze_tasks(bool user_only) break; if (pm_wakeup_pending()) { +#ifdef CONFIG_PM_SLEEP + pm_get_active_wakeup_sources(suspend_abort, + MAX_SUSPEND_ABORT_LEN); + log_suspend_abort_reason(suspend_abort); +#endif wakeup = true; break; } @@ -83,15 +92,17 @@ static int try_to_freeze_tasks(bool user_only) do_div(elapsed_msecs64, NSEC_PER_MSEC); elapsed_msecs = elapsed_msecs64; - if (todo) { + if (wakeup) { pr_cont("\n"); - pr_err("Freezing of tasks %s after %d.%03d seconds " - "(%d tasks refusing to freeze, wq_busy=%d):\n", - wakeup ? "aborted" : "failed", + pr_err("Freezing of tasks aborted after %d.%03d seconds", + elapsed_msecs / 1000, elapsed_msecs % 1000); + } else if (todo) { + pr_cont("\n"); + pr_err("Freezing of tasks failed after %d.%03d seconds" + " (%d tasks refusing to freeze, wq_busy=%d):\n", elapsed_msecs / 1000, elapsed_msecs % 1000, todo - wq_busy, wq_busy); - if (!wakeup) { read_lock(&tasklist_lock); for_each_process_thread(g, p) { if (p != current && !freezer_should_skip(p) @@ -99,7 +110,6 @@ static int try_to_freeze_tasks(bool user_only) sched_show_task(p); } read_unlock(&tasklist_lock); - } } else { pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000, elapsed_msecs % 1000); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..024411816ccf 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -26,9 +26,11 @@ #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/ftrace.h> +#include <linux/rtc.h> #include <trace/events/power.h> #include <linux/compiler.h> #include <linux/moduleparam.h> +#include <linux/wakeup_reason.h> #include "power.h" @@ -312,7 +314,8 @@ void __weak arch_suspend_enable_irqs(void) */ static int suspend_enter(suspend_state_t state, bool *wakeup) { - int error; + char suspend_abort[MAX_SUSPEND_ABORT_LEN]; + int error, last_dev; error = platform_suspend_prepare(state); if (error) @@ -320,7 +323,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; printk(KERN_ERR "PM: late suspend of devices failed\n"); + log_suspend_abort_reason("%s device failed to power down", + suspend_stats.failed_devs[last_dev]); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +336,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { + last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1; + last_dev %= REC_FAILED_NUM; printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + log_suspend_abort_reason("noirq suspend of %s device failed", + suspend_stats.failed_devs[last_dev]); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -353,8 +364,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) } error = disable_nonboot_cpus(); - if (error || suspend_test(TEST_CPUS)) + if (error || suspend_test(TEST_CPUS)) { + log_suspend_abort_reason("Disabling non-boot cpus failed"); goto Enable_cpus; + } arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -370,6 +383,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) state, false); events_check_enabled = false; } else if (*wakeup) { + pm_get_active_wakeup_sources(suspend_abort, + MAX_SUSPEND_ABORT_LEN); + log_suspend_abort_reason(suspend_abort); error = -EBUSY; } syscore_resume(); @@ -417,6 +433,7 @@ int suspend_devices_and_enter(suspend_state_t state) error = dpm_suspend_start(PMSG_SUSPEND); if (error) { pr_err("PM: Some devices failed to suspend, or early wake event detected\n"); + log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected"); goto Recover_platform; } suspend_test_finish("suspend devices"); @@ -518,6 +535,18 @@ static int enter_state(suspend_state_t state) return error; } +static void pm_suspend_marker(char *annotation) +{ + struct timespec ts; + struct rtc_time tm; + + getnstimeofday(&ts); + rtc_time_to_tm(ts.tv_sec, &tm); + pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n", + annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec); +} + /** * pm_suspend - Externally visible function for suspending the system. * @state: System sleep state to enter. @@ -532,6 +561,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pm_suspend_marker("entry"); error = enter_state(state); if (error) { suspend_stats.fail++; @@ -539,6 +569,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } + pm_suspend_marker("exit"); return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c new file mode 100644 index 000000000000..252611fad2fe --- /dev/null +++ b/kernel/power/wakeup_reason.c @@ -0,0 +1,225 @@ +/* + * kernel/power/wakeup_reason.c + * + * Logs the reasons which caused the kernel to resume from + * the suspend mode. + * + * Copyright (C) 2014 Google, Inc. + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/wakeup_reason.h> +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/io.h> +#include <linux/kobject.h> +#include <linux/sysfs.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/notifier.h> +#include <linux/suspend.h> + + +#define MAX_WAKEUP_REASON_IRQS 32 +static int irq_list[MAX_WAKEUP_REASON_IRQS]; +static int irqcount; +static bool suspend_abort; +static char abort_reason[MAX_SUSPEND_ABORT_LEN]; +static struct kobject *wakeup_reason; +static DEFINE_SPINLOCK(resume_reason_lock); + +static ktime_t last_monotime; /* monotonic time before last suspend */ +static ktime_t curr_monotime; /* monotonic time after last suspend */ +static ktime_t last_stime; /* monotonic boottime offset before last suspend */ +static ktime_t curr_stime; /* monotonic boottime offset after last suspend */ + +static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int irq_no, buf_offset = 0; + struct irq_desc *desc; + spin_lock(&resume_reason_lock); + if (suspend_abort) { + buf_offset = sprintf(buf, "Abort: %s", abort_reason); + } else { + for (irq_no = 0; irq_no < irqcount; irq_no++) { + desc = irq_to_desc(irq_list[irq_no]); + if (desc && desc->action && desc->action->name) + buf_offset += sprintf(buf + buf_offset, "%d %s\n", + irq_list[irq_no], desc->action->name); + else + buf_offset += sprintf(buf + buf_offset, "%d\n", + irq_list[irq_no]); + } + } + spin_unlock(&resume_reason_lock); + return buf_offset; +} + +static ssize_t last_suspend_time_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct timespec sleep_time; + struct timespec total_time; + struct timespec suspend_resume_time; + + /* + * total_time is calculated from monotonic bootoffsets because + * unlike CLOCK_MONOTONIC it include the time spent in suspend state. + */ + total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime)); + + /* + * suspend_resume_time is calculated as monotonic (CLOCK_MONOTONIC) + * time interval before entering suspend and post suspend. + */ + suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime)); + + /* sleep_time = total_time - suspend_resume_time */ + sleep_time = timespec_sub(total_time, suspend_resume_time); + + /* Export suspend_resume_time and sleep_time in pair here. */ + return sprintf(buf, "%lu.%09lu %lu.%09lu\n", + suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec, + sleep_time.tv_sec, sleep_time.tv_nsec); +} + +static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason); +static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time); + +static struct attribute *attrs[] = { + &resume_reason.attr, + &suspend_time.attr, + NULL, +}; +static struct attribute_group attr_group = { + .attrs = attrs, +}; + +/* + * logs all the wake up reasons to the kernel + * stores the irqs to expose them to the userspace via sysfs + */ +void log_wakeup_reason(int irq) +{ + struct irq_desc *desc; + desc = irq_to_desc(irq); + if (desc && desc->action && desc->action->name) + printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq, + desc->action->name); + else + printk(KERN_INFO "Resume caused by IRQ %d\n", irq); + + spin_lock(&resume_reason_lock); + if (irqcount == MAX_WAKEUP_REASON_IRQS) { + spin_unlock(&resume_reason_lock); + printk(KERN_WARNING "Resume caused by more than %d IRQs\n", + MAX_WAKEUP_REASON_IRQS); + return; + } + + irq_list[irqcount++] = irq; + spin_unlock(&resume_reason_lock); +} + +int check_wakeup_reason(int irq) +{ + int irq_no; + int ret = false; + + spin_lock(&resume_reason_lock); + for (irq_no = 0; irq_no < irqcount; irq_no++) + if (irq_list[irq_no] == irq) { + ret = true; + break; + } + spin_unlock(&resume_reason_lock); + return ret; +} + +void log_suspend_abort_reason(const char *fmt, ...) +{ + va_list args; + + spin_lock(&resume_reason_lock); + + //Suspend abort reason has already been logged. + if (suspend_abort) { + spin_unlock(&resume_reason_lock); + return; + } + + suspend_abort = true; + va_start(args, fmt); + vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args); + va_end(args); + spin_unlock(&resume_reason_lock); +} + +/* Detects a suspend and clears all the previous wake up reasons*/ +static int wakeup_reason_pm_event(struct notifier_block *notifier, + unsigned long pm_event, void *unused) +{ + switch (pm_event) { + case PM_SUSPEND_PREPARE: + spin_lock(&resume_reason_lock); + irqcount = 0; + suspend_abort = false; + spin_unlock(&resume_reason_lock); + /* monotonic time since boot */ + last_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + last_stime = ktime_get_boottime(); + break; + case PM_POST_SUSPEND: + /* monotonic time since boot */ + curr_monotime = ktime_get(); + /* monotonic time since boot including the time spent in suspend */ + curr_stime = ktime_get_boottime(); + break; + default: + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block wakeup_reason_pm_notifier_block = { + .notifier_call = wakeup_reason_pm_event, +}; + +/* Initializes the sysfs parameter + * registers the pm_event notifier + */ +int __init wakeup_reason_init(void) +{ + int retval; + + retval = register_pm_notifier(&wakeup_reason_pm_notifier_block); + if (retval) + printk(KERN_WARNING "[%s] failed to register PM notifier %d\n", + __func__, retval); + + wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj); + if (!wakeup_reason) { + printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n", + __func__); + return 1; + } + retval = sysfs_create_group(wakeup_reason, &attr_group); + if (retval) { + kobject_put(wakeup_reason); + printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n", + __func__, retval); + } + return 0; +} + +late_initcall(wakeup_reason_init); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ce8826f1053..28fb44dccbad 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -55,6 +55,10 @@ #include "console_cmdline.h" #include "braille.h" +#ifdef CONFIG_EARLY_PRINTK_DIRECT +extern void printascii(char *); +#endif + int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ @@ -1754,6 +1758,10 @@ asmlinkage int vprintk_emit(int facility, int level, } } +#ifdef CONFIG_EARLY_PRINTK_DIRECT + printascii(text); +#endif + if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 732e993b564b..61b0914cc7aa 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7542,6 +7542,14 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == preempt_offset); } +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + void __might_sleep(const char *file, int line, int preempt_offset) { /* @@ -7566,8 +7574,10 @@ void ___might_sleep(const char *file, int line, int preempt_offset) rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && - !is_idle_task(current)) || - system_state != SYSTEM_RUNNING || oops_in_progress) + !is_idle_task(current)) || oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; @@ -8599,6 +8609,7 @@ struct cgroup_subsys cpu_cgrp_subsys = { .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .allow_attach = subsys_cgroup_allow_attach, .legacy_cftypes = cpu_files, .early_init = 1, }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cfdc0e61066c..ba24bfe4ac51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2931,6 +2931,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) } trace_sched_stat_blocked(tsk, delta); + trace_sched_blocked_reason(tsk); /* * Blocking time is in units of nanosecs, so shift by diff --git a/kernel/sys.c b/kernel/sys.c index 6af9212ab5aa..11333311cf1c 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -41,6 +41,8 @@ #include <linux/syscore_ops.h> #include <linux/version.h> #include <linux/ctype.h> +#include <linux/mm.h> +#include <linux/mempolicy.h> #include <linux/compat.h> #include <linux/syscalls.h> @@ -2072,10 +2074,158 @@ static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) } #endif +#ifdef CONFIG_MMU +static int prctl_update_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + const char __user *name_addr) +{ + struct mm_struct *mm = vma->vm_mm; + int error = 0; + pgoff_t pgoff; + + if (name_addr == vma_get_anon_name(vma)) { + *prev = vma; + goto out; + } + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma, + vma->vm_file, pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, name_addr); + if (*prev) { + vma = *prev; + goto success; + } + + *prev = vma; + + if (start != vma->vm_start) { + error = split_vma(mm, vma, start, 1); + if (error) + goto out; + } + + if (end != vma->vm_end) { + error = split_vma(mm, vma, end, 0); + if (error) + goto out; + } + +success: + if (!vma->vm_file) + vma->anon_name = name_addr; + +out: + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +static int prctl_set_vma_anon_name(unsigned long start, unsigned long end, + unsigned long arg) +{ + unsigned long tmp; + struct vm_area_struct *vma, *prev; + int unmapped_error = 0; + int error = -EINVAL; + + /* + * If the interval [start,end) covers some unmapped address + * ranges, just ignore them, but return -ENOMEM at the end. + * - this matches the handling in madvise. + */ + vma = find_vma_prev(current->mm, start, &prev); + if (vma && start > vma->vm_start) + prev = vma; + + for (;;) { + /* Still start < end. */ + error = -ENOMEM; + if (!vma) + return error; + + /* Here start < (end|vma->vm_end). */ + if (start < vma->vm_start) { + unmapped_error = -ENOMEM; + start = vma->vm_start; + if (start >= end) + return error; + } + + /* Here vma->vm_start <= start < (end|vma->vm_end) */ + tmp = vma->vm_end; + if (end < tmp) + tmp = end; + + /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ + error = prctl_update_vma_anon_name(vma, &prev, start, tmp, + (const char __user *)arg); + if (error) + return error; + start = tmp; + if (prev && start < prev->vm_end) + start = prev->vm_end; + error = unmapped_error; + if (start >= end) + return error; + if (prev) + vma = prev->vm_next; + else /* madvise_remove dropped mmap_sem */ + vma = find_vma(current->mm, start); + } +} + +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long len_in, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + int error; + unsigned long len; + unsigned long end; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + down_write(&mm->mmap_sem); + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + error = prctl_set_vma_anon_name(start, end, arg); + break; + default: + error = -EINVAL; + } + + up_write(&mm->mmap_sem); + + return error; +} +#else /* CONFIG_MMU */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long len_in, unsigned long arg) +{ + return -EINVAL; +} +#endif + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { struct task_struct *me = current; + struct task_struct *tsk; unsigned char comm[sizeof(me->comm)]; long error; @@ -2218,6 +2368,26 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_TID_ADDRESS: error = prctl_get_tid_address(me, (int __user **)arg2); break; + case PR_SET_TIMERSLACK_PID: + if (task_pid_vnr(current) != (pid_t)arg3 && + !capable(CAP_SYS_NICE)) + return -EPERM; + rcu_read_lock(); + tsk = find_task_by_vpid((pid_t)arg3); + if (tsk == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + get_task_struct(tsk); + rcu_read_unlock(); + if (arg2 <= 0) + tsk->timer_slack_ns = + tsk->default_timer_slack_ns; + else + tsk->timer_slack_ns = arg2; + put_task_struct(tsk); + error = 0; + break; case PR_SET_CHILD_SUBREAPER: me->signal->is_child_subreaper = !!arg2; break; @@ -2266,6 +2436,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..35bfe0c1360b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -104,6 +104,7 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; #endif extern int pid_max; +extern int extra_free_kbytes; extern int pid_max_min, pid_max_max; extern int percpu_pagelist_fraction; extern int compat_log; @@ -1393,6 +1394,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "extra_free_kbytes", + .data = &extra_free_kbytes, + .maxlen = sizeof(extra_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = &zero, + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), @@ -1568,6 +1577,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index e45db6b0d878..5f5b66a2f156 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,6 +77,9 @@ config EVENT_TRACING select CONTEXT_SWITCH_TRACER bool +config GPU_TRACEPOINTS + bool + config CONTEXT_SWITCH_TRACER bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 9b1044e936a6..ba04bf0c2653 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -64,6 +64,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o +obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c new file mode 100644 index 000000000000..a4b3f00faee3 --- /dev/null +++ b/kernel/trace/gpu-traces.c @@ -0,0 +1,23 @@ +/* + * GPU tracepoints + * + * Copyright (C) 2013 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/module.h> + +#define CREATE_TRACE_POINTS +#include <trace/events/gpu.h> + +EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch); +EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..5f375d4c05fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1352,6 +1352,7 @@ void tracing_reset_all_online_cpus(void) #define SAVED_CMDLINES_DEFAULT 128 #define NO_CMDLINE_MAP UINT_MAX +static unsigned saved_tgids[SAVED_CMDLINES_DEFAULT]; static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; struct saved_cmdlines_buffer { unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; @@ -1590,7 +1591,7 @@ static int trace_save_cmdline(struct task_struct *tsk) } set_cmdline(idx, tsk->comm); - + saved_tgids[idx] = tsk->tgid; arch_spin_unlock(&trace_cmdline_lock); return 1; @@ -1633,6 +1634,25 @@ void trace_find_cmdline(int pid, char comm[]) preempt_enable(); } +int trace_find_tgid(int pid) +{ + unsigned map; + int tgid; + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + map = savedcmd->map_pid_to_cmdline[pid]; + if (map != NO_CMDLINE_MAP) + tgid = saved_tgids[map]; + else + tgid = -1; + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); + + return tgid; +} + void tracing_record_cmdline(struct task_struct *tsk) { if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on()) @@ -2583,6 +2603,13 @@ static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) "# | | | | |\n"); } +static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | | |\n"); +} + static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { print_event_info(buf, m); @@ -2595,6 +2622,18 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file "# | | | |||| | |\n"); } +static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m) +{ + print_event_info(buf, m); + seq_puts(m, "# _-----=> irqs-off\n"); + seq_puts(m, "# / _----=> need-resched\n"); + seq_puts(m, "# | / _---=> hardirq/softirq\n"); + seq_puts(m, "# || / _--=> preempt-depth\n"); + seq_puts(m, "# ||| / delay\n"); + seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |||| | |\n"); +} + void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { @@ -2907,9 +2946,15 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->trace_buffer, m); + if (trace_flags & TRACE_ITER_TGID) + print_func_help_header_irq_tgid(iter->trace_buffer, m); + else + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->trace_buffer, m); + if (trace_flags & TRACE_ITER_TGID) + print_func_help_header_tgid(iter->trace_buffer, m); + else + print_func_help_header(iter->trace_buffer, m); } } } @@ -4161,6 +4206,50 @@ static void trace_insert_enum_map(struct module *mod, } static ssize_t +tracing_saved_tgids_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES_DEFAULT*(16+1+16), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES_DEFAULT; i++) { + int tgid; + int r; + + pid = savedcmd->map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + tgid = trace_find_tgid(pid); + r = sprintf(buf, "%d %d\n", pid, tgid); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + + return len; +} + +static const struct file_operations tracing_saved_tgids_fops = { + .open = tracing_open_generic, + .read = tracing_saved_tgids_read, + .llseek = generic_file_llseek, +}; + +static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { @@ -6784,6 +6873,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) trace_create_file("trace_marker", 0220, d_tracer, tr, &tracing_mark_fops); + trace_create_file("saved_tgids", 0444, d_tracer, + tr, &tracing_saved_tgids_fops); + trace_create_file("trace_clock", 0644, d_tracer, tr, &trace_clock_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d07686f..e1265f95457f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -656,6 +656,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags, extern cycle_t ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); +extern int trace_find_tgid(int pid); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; @@ -970,7 +971,8 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ - BRANCH_FLAGS + BRANCH_FLAGS \ + C(TGID, "print-tgid"), /* * By defining C, we can make TRACE_FLAGS a list of bit names diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index a663cbb84107..4641bdb40f8f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -64,6 +64,9 @@ struct fgraph_data { #define TRACE_GRAPH_INDENT 2 +/* Flag options */ +#define TRACE_GRAPH_PRINT_FLAT 0x80 + static unsigned int max_depth; static struct tracer_opt trace_opts[] = { @@ -87,6 +90,8 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) }, /* Include time within nested functions */ { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) }, + /* Use standard trace formatting rather than hierarchical */ + { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) }, { } /* Empty entry */ }; @@ -1165,6 +1170,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) int cpu = iter->cpu; int ret; + if (flags & TRACE_GRAPH_PRINT_FLAT) + return TRACE_TYPE_UNHANDLED; + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; return TRACE_TYPE_HANDLED; @@ -1222,13 +1230,6 @@ print_graph_function(struct trace_iterator *iter) return print_graph_function_flags(iter, tracer_flags.val); } -static enum print_line_t -print_graph_function_event(struct trace_iterator *iter, int flags, - struct trace_event *event) -{ - return print_graph_function(iter); -} - static void print_lat_header(struct seq_file *s, u32 flags) { static const char spaces[] = " " /* 16 spaces */ @@ -1297,6 +1298,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags) struct trace_iterator *iter = s->private; struct trace_array *tr = iter->tr; + if (flags & TRACE_GRAPH_PRINT_FLAT) { + trace_default_header(s); + return; + } + if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO)) return; @@ -1378,19 +1384,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) return 0; } -static struct trace_event_functions graph_functions = { - .trace = print_graph_function_event, -}; - -static struct trace_event graph_trace_entry_event = { - .type = TRACE_GRAPH_ENT, - .funcs = &graph_functions, -}; - -static struct trace_event graph_trace_ret_event = { - .type = TRACE_GRAPH_RET, - .funcs = &graph_functions -}; static struct tracer graph_trace __tracer_data = { .name = "function_graph", @@ -1467,16 +1460,6 @@ static __init int init_graph_trace(void) { max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); - if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - - if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); - return 1; - } - return register_tracer(&graph_trace); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..3bc4b6de0f4d 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -526,11 +526,21 @@ int trace_print_context(struct trace_iterator *iter) unsigned long long t; unsigned long secs, usec_rem; char comm[TASK_COMM_LEN]; + int tgid; trace_find_cmdline(entry->pid, comm); - trace_seq_printf(s, "%16s-%-5d [%03d] ", - comm, entry->pid, iter->cpu); + trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); + + if (tr->trace_flags & TRACE_ITER_TGID) { + tgid = trace_find_tgid(entry->pid); + if (tgid < 0) + trace_seq_puts(s, "(-----) "); + else + trace_seq_printf(s, "(%5d) ", tgid); + } + + trace_seq_printf(s, "[%03d] ", iter->cpu); if (tr->trace_flags & TRACE_ITER_IRQ_INFO) trace_print_lat_fmt(s, entry); @@ -845,6 +855,174 @@ static struct trace_event trace_fn_event = { .funcs = &trace_fn_funcs, }; +/* TRACE_GRAPH_ENT */ +static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct ftrace_graph_ent_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_puts(s, "graph_ent: func="); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (!seq_print_ip_sym(s, field->graph_ent.func, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + trace_seq_puts(s, "\n"); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %d\n", + field->graph_ent.func, + field->graph_ent.depth); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->graph_ent.func); + SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ent_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->graph_ent.func); + SEQ_PUT_FIELD(s, field->graph_ent.depth); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_graph_ent_funcs = { + .trace = trace_graph_ent_trace, + .raw = trace_graph_ent_raw, + .hex = trace_graph_ent_hex, + .binary = trace_graph_ent_bin, +}; + +static struct trace_event trace_graph_ent_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &trace_graph_ent_funcs, +}; + +/* TRACE_GRAPH_RET */ +static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct ftrace_graph_ret_entry *field; + + trace_assign_type(field, entry); + + trace_seq_puts(s, "graph_ret: func="); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + if (!seq_print_ip_sym(s, field->ret.func, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + trace_seq_puts(s, "\n"); + if (trace_seq_has_overflowed(s)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + + trace_assign_type(field, iter->ent); + + trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n", + field->ret.func, + field->ret.calltime, + field->ret.rettime, + field->ret.overrun, + field->ret.depth); + if (trace_seq_has_overflowed(&iter->seq)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD(s, field->ret.func); + SEQ_PUT_HEX_FIELD(s, field->ret.calltime); + SEQ_PUT_HEX_FIELD(s, field->ret.rettime); + SEQ_PUT_HEX_FIELD(s, field->ret.overrun); + SEQ_PUT_HEX_FIELD(s, field->ret.depth); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_graph_ret_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD(s, field->ret.func); + SEQ_PUT_FIELD(s, field->ret.calltime); + SEQ_PUT_FIELD(s, field->ret.rettime); + SEQ_PUT_FIELD(s, field->ret.overrun); + SEQ_PUT_FIELD(s, field->ret.depth); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_graph_ret_funcs = { + .trace = trace_graph_ret_trace, + .raw = trace_graph_ret_raw, + .hex = trace_graph_ret_hex, + .binary = trace_graph_ret_bin, +}; + +static struct trace_event trace_graph_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &trace_graph_ret_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -1222,6 +1400,8 @@ static struct trace_event trace_print_event = { static struct trace_event *events[] __initdata = { &trace_fn_event, + &trace_graph_ent_event, + &trace_graph_ret_event, &trace_ctx_event, &trace_wake_event, &trace_stack_event, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf75f74..25955235ecdd 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -103,6 +103,11 @@ static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved); static DEFINE_PER_CPU(bool, hard_watchdog_warn); static DEFINE_PER_CPU(bool, watchdog_nmi_touch); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static cpumask_t __read_mostly watchdog_cpus; +#endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif static unsigned long soft_lockup_nmi_warn; @@ -271,7 +276,7 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_touch_ts, 0); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI /* watchdog detector functions */ static bool is_hardlockup(void) { @@ -285,6 +290,76 @@ static bool is_hardlockup(void) } #endif +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static unsigned int watchdog_next_cpu(unsigned int cpu) +{ + cpumask_t cpus = watchdog_cpus; + unsigned int next_cpu; + + next_cpu = cpumask_next(cpu, &cpus); + if (next_cpu >= nr_cpu_ids) + next_cpu = cpumask_first(&cpus); + + if (next_cpu == cpu) + return nr_cpu_ids; + + return next_cpu; +} + +static int is_hardlockup_other_cpu(unsigned int cpu) +{ + unsigned long hrint = per_cpu(hrtimer_interrupts, cpu); + + if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) + return 1; + + per_cpu(hrtimer_interrupts_saved, cpu) = hrint; + return 0; +} + +static void watchdog_check_hardlockup_other_cpu(void) +{ + unsigned int next_cpu; + + /* + * Test for hardlockups every 3 samples. The sample period is + * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over + * watchdog_thresh (over by 20%). + */ + if (__this_cpu_read(hrtimer_interrupts) % 3 != 0) + return; + + /* check for a hardlockup on the next cpu */ + next_cpu = watchdog_next_cpu(smp_processor_id()); + if (next_cpu >= nr_cpu_ids) + return; + + smp_rmb(); + + if (per_cpu(watchdog_nmi_touch, next_cpu) == true) { + per_cpu(watchdog_nmi_touch, next_cpu) = false; + return; + } + + if (is_hardlockup_other_cpu(next_cpu)) { + /* only warn once */ + if (per_cpu(hard_watchdog_warn, next_cpu) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu); + + per_cpu(hard_watchdog_warn, next_cpu) = true; + } else { + per_cpu(hard_watchdog_warn, next_cpu) = false; + } +} +#else +static inline void watchdog_check_hardlockup_other_cpu(void) { return; } +#endif + static int is_softlockup(unsigned long touch_ts) { unsigned long now = get_timestamp(); @@ -297,7 +372,7 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -360,7 +435,7 @@ static void watchdog_overflow_callback(struct perf_event *event, __this_cpu_write(hard_watchdog_warn, false); return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */ static void watchdog_interrupt_count(void) { @@ -384,6 +459,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) /* kick the hardlockup detector */ watchdog_interrupt_count(); + /* test for hardlockups on the next cpu */ + watchdog_check_hardlockup_other_cpu(); + /* kick the softlockup detector */ wake_up_process(__this_cpu_read(softlockup_watchdog)); @@ -561,7 +639,7 @@ static void watchdog(unsigned int cpu) watchdog_nmi_disable(cpu); } -#ifdef CONFIG_HARDLOCKUP_DETECTOR +#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI /* * People like the simple clean cpu node info on boot. * Reduce the watchdog noise by only printing messages @@ -660,9 +738,44 @@ static void watchdog_nmi_disable(unsigned int cpu) } #else +#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU +static int watchdog_nmi_enable(unsigned int cpu) +{ + /* + * The new cpu will be marked online before the first hrtimer interrupt + * runs on it. If another cpu tests for a hardlockup on the new cpu + * before it has run its first hrtimer, it will get a false positive. + * Touch the watchdog on the new cpu to delay the first check for at + * least 3 sampling periods to guarantee one hrtimer has run on the new + * cpu. + */ + per_cpu(watchdog_nmi_touch, cpu) = true; + smp_wmb(); + cpumask_set_cpu(cpu, &watchdog_cpus); + return 0; +} + +static void watchdog_nmi_disable(unsigned int cpu) +{ + unsigned int next_cpu = watchdog_next_cpu(cpu); + + /* + * Offlining this cpu will cause the cpu before this one to start + * checking the one after this one. If this cpu just finished checking + * the next cpu and updating hrtimer_interrupts_saved, and then the + * previous cpu checks it within one sample period, it will trigger a + * false positive. Touch the watchdog on the next cpu to prevent it. + */ + if (next_cpu < nr_cpu_ids) + per_cpu(watchdog_nmi_touch, next_cpu) = true; + smp_wmb(); + cpumask_clear_cpu(cpu, &watchdog_cpus); +} +#else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -#endif /* CONFIG_HARDLOCKUP_DETECTOR */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */ +#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */ static struct smp_hotplug_thread watchdog_threads = { .store = &softlockup_watchdog, |