aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks4
-rw-r--r--kernel/Kconfig.preempt1
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/compat.c68
-rw-r--r--kernel/cpuset.c31
-rw-r--r--kernel/debug/debug_core.c34
-rw-r--r--kernel/debug/gdbstub.c10
-rw-r--r--kernel/debug/kdb/kdb_bp.c7
-rw-r--r--kernel/debug/kdb/kdb_bt.c1
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_keyboard.c95
-rw-r--r--kernel/debug/kdb/kdb_main.c3
-rw-r--r--kernel/debug/kdb/kdb_private.h7
-rw-r--r--kernel/dma.c1
-rw-r--r--kernel/events/core.c11
-rw-r--r--kernel/exit.c42
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/futex.c38
-rw-r--r--kernel/futex_compat.c38
-rw-r--r--kernel/irq/Kconfig15
-rw-r--r--kernel/irq/handle.c16
-rw-r--r--kernel/irq/irqdomain.c8
-rw-r--r--kernel/irq/manage.c19
-rw-r--r--kernel/irq/migration.c10
-rw-r--r--kernel/kexec.c7
-rw-r--r--kernel/kmod.c84
-rw-r--r--kernel/module.c37
-rw-r--r--kernel/params.c40
-rw-r--r--kernel/pid_namespace.c41
-rw-r--r--kernel/ptrace.c66
-rw-r--r--kernel/rwsem.c1
-rw-r--r--kernel/sched/core.c72
-rw-r--r--kernel/sched/fair.c16
-rw-r--r--kernel/sched/rt.c2
-rw-r--r--kernel/sched/sched.h3
-rw-r--r--kernel/signal.c19
-rw-r--r--kernel/smp.c90
-rw-r--r--kernel/spinlock.c2
-rw-r--r--kernel/sys.c17
-rw-r--r--kernel/sysctl.c513
-rw-r--r--kernel/sysctl_check.c160
-rw-r--r--kernel/time.c6
-rw-r--r--kernel/time/alarmtimer.c8
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/ntp.c134
-rw-r--r--kernel/time/timekeeping.c53
-rw-r--r--kernel/trace/Kconfig2
-rw-r--r--kernel/trace/ftrace.c3
-rw-r--r--kernel/trace/ring_buffer.c157
-rw-r--r--kernel/trace/trace.c113
-rw-r--r--kernel/trace/trace.h3
-rw-r--r--kernel/trace/trace_entries.h16
-rw-r--r--kernel/trace/trace_export.c2
-rw-r--r--kernel/watchdog.c27
55 files changed, 985 insertions, 1178 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 5068e2a4e75f..2251882daf53 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -124,8 +124,8 @@ config INLINE_SPIN_LOCK_IRQSAVE
def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \
ARCH_INLINE_SPIN_LOCK_IRQSAVE
-config INLINE_SPIN_UNLOCK
- def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK)
+config UNINLINE_SPIN_UNLOCK
+ bool
config INLINE_SPIN_UNLOCK_BH
def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 24e7cb0ba26a..3f9c97419f02 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -36,6 +36,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
select PREEMPT_COUNT
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
diff --git a/kernel/Makefile b/kernel/Makefile
index 2d9de86b7e76..cb41b9547c9f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -27,7 +27,6 @@ obj-y += power/
obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
-obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f4ea4b6f3cf1..ed64ccac67c9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1883,7 +1883,7 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval;
+ int retval = 0;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
struct cgroupfs_root *root = cgrp->root;
diff --git a/kernel/compat.c b/kernel/compat.c
index f346cedfe24d..74ff8498809a 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -31,11 +31,10 @@
#include <asm/uaccess.h>
/*
- * Note that the native side is already converted to a timespec, because
- * that's what we want anyway.
+ * Get/set struct timeval with struct timespec on the native side
*/
-static int compat_get_timeval(struct timespec *o,
- struct compat_timeval __user *i)
+static int compat_get_timeval_convert(struct timespec *o,
+ struct compat_timeval __user *i)
{
long usec;
@@ -46,8 +45,8 @@ static int compat_get_timeval(struct timespec *o,
return 0;
}
-static int compat_put_timeval(struct compat_timeval __user *o,
- struct timeval *i)
+static int compat_put_timeval_convert(struct compat_timeval __user *o,
+ struct timeval *i)
{
return (put_user(i->tv_sec, &o->tv_sec) ||
put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
@@ -117,7 +116,7 @@ asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
if (tv) {
struct timeval ktv;
do_gettimeofday(&ktv);
- if (compat_put_timeval(tv, &ktv))
+ if (compat_put_timeval_convert(tv, &ktv))
return -EFAULT;
}
if (tz) {
@@ -135,7 +134,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
struct timezone ktz;
if (tv) {
- if (compat_get_timeval(&kts, tv))
+ if (compat_get_timeval_convert(&kts, tv))
return -EFAULT;
}
if (tz) {
@@ -146,12 +145,29 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
}
+int get_compat_timeval(struct timeval *tv, const struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_READ, ctv, sizeof(*ctv)) ||
+ __get_user(tv->tv_sec, &ctv->tv_sec) ||
+ __get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_timeval);
+
+int put_compat_timeval(const struct timeval *tv, struct compat_timeval __user *ctv)
+{
+ return (!access_ok(VERIFY_WRITE, ctv, sizeof(*ctv)) ||
+ __put_user(tv->tv_sec, &ctv->tv_sec) ||
+ __put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_timeval);
+
int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
{
return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+EXPORT_SYMBOL_GPL(get_compat_timespec);
int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts)
{
@@ -161,6 +177,42 @@ int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user
}
EXPORT_SYMBOL_GPL(put_compat_timespec);
+int compat_get_timeval(struct timeval *tv, const void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0;
+ else
+ return get_compat_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_get_timeval);
+
+int compat_put_timeval(const struct timeval *tv, void __user *utv)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0;
+ else
+ return put_compat_timeval(tv, utv);
+}
+EXPORT_SYMBOL_GPL(compat_put_timeval);
+
+int compat_get_timespec(struct timespec *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0;
+ else
+ return get_compat_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec);
+
+int compat_put_timespec(const struct timespec *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0;
+ else
+ return put_compat_timespec(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec);
+
static long compat_nanosleep_restart(struct restart_block *restart)
{
struct compat_timespec __user *rmtp;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1010cc61931f..14f7070b4ba2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -270,11 +270,11 @@ static struct file_system_type cpuset_fs_type = {
* are online. If none are online, walk up the cpuset hierarchy
* until we find one that does have some online cpus. If we get
* all the way to the top and still haven't found any online cpus,
- * return cpu_online_map. Or if passed a NULL cs from an exit'ing
- * task, return cpu_online_map.
+ * return cpu_online_mask. Or if passed a NULL cs from an exit'ing
+ * task, return cpu_online_mask.
*
* One way or another, we guarantee to return some non-empty subset
- * of cpu_online_map.
+ * of cpu_online_mask.
*
* Call with callback_mutex held.
*/
@@ -867,7 +867,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
int retval;
int is_load_balanced;
- /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
+ /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
return -EACCES;
@@ -2149,7 +2149,7 @@ void __init cpuset_init_smp(void)
*
* Description: Returns the cpumask_var_t cpus_allowed of the cpuset
* attached to the specified @tsk. Guaranteed to return some non-empty
- * subset of cpu_online_map, even if this means going outside the
+ * subset of cpu_online_mask, even if this means going outside the
* tasks cpuset.
**/
@@ -2162,10 +2162,9 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
mutex_unlock(&callback_mutex);
}
-int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
const struct cpuset *cs;
- int cpu;
rcu_read_lock();
cs = task_cs(tsk);
@@ -2186,22 +2185,10 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
* set any mask even if it is not right from task_cs() pov,
* the pending set_cpus_allowed_ptr() will fix things.
+ *
+ * select_fallback_rq() will fix things ups and set cpu_possible_mask
+ * if required.
*/
-
- cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
- if (cpu >= nr_cpu_ids) {
- /*
- * Either tsk->cpus_allowed is wrong (see above) or it
- * is actually empty. The latter case is only possible
- * if we are racing with remove_tasks_in_empty_cpuset().
- * Like above we can temporary set any mask and rely on
- * set_cpus_allowed_ptr() as synchronization point.
- */
- do_set_cpus_allowed(tsk, cpu_possible_mask);
- cpu = cpumask_any(cpu_active_mask);
- }
-
- return cpu;
}
void cpuset_init_current_mems_allowed(void)
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0d7c08784efb..1dc53bae56e1 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -41,6 +41,7 @@
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/sysrq.h>
+#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/kgdb.h>
#include <linux/kdb.h>
@@ -52,7 +53,6 @@
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
#include <linux/atomic.h>
-#include <asm/system.h>
#include "debug_core.h"
@@ -75,6 +75,8 @@ static int exception_level;
struct kgdb_io *dbg_io_ops;
static DEFINE_SPINLOCK(kgdb_registration_lock);
+/* Action for the reboot notifiter, a global allow kdb to change it */
+static int kgdbreboot;
/* kgdb console driver is loaded */
static int kgdb_con_registered;
/* determine if kgdb console output should be used */
@@ -96,6 +98,7 @@ static int __init opt_kgdb_con(char *str)
early_param("kgdbcon", opt_kgdb_con);
module_param(kgdb_use_con, int, 0644);
+module_param(kgdbreboot, int, 0644);
/*
* Holds information about breakpoints in a kernel. These breakpoints are
@@ -784,6 +787,33 @@ void __init dbg_late_init(void)
kdb_init(KDB_INIT_FULL);
}
+static int
+dbg_notify_reboot(struct notifier_block *this, unsigned long code, void *x)
+{
+ /*
+ * Take the following action on reboot notify depending on value:
+ * 1 == Enter debugger
+ * 0 == [the default] detatch debug client
+ * -1 == Do nothing... and use this until the board resets
+ */
+ switch (kgdbreboot) {
+ case 1:
+ kgdb_breakpoint();
+ case -1:
+ goto done;
+ }
+ if (!dbg_kdb_mode)
+ gdbstub_exit(code);
+done:
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block dbg_reboot_notifier = {
+ .notifier_call = dbg_notify_reboot,
+ .next = NULL,
+ .priority = INT_MAX,
+};
+
static void kgdb_register_callbacks(void)
{
if (!kgdb_io_module_registered) {
@@ -791,6 +821,7 @@ static void kgdb_register_callbacks(void)
kgdb_arch_init();
if (!dbg_is_early)
kgdb_arch_late();
+ register_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_register(&panic_notifier_list,
&kgdb_panic_event_nb);
#ifdef CONFIG_MAGIC_SYSRQ
@@ -812,6 +843,7 @@ static void kgdb_unregister_callbacks(void)
*/
if (kgdb_io_module_registered) {
kgdb_io_module_registered = 0;
+ unregister_reboot_notifier(&dbg_reboot_notifier);
atomic_notifier_chain_unregister(&panic_notifier_list,
&kgdb_panic_event_nb);
kgdb_arch_exit();
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index c22d8c28ad84..ce615e064482 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1111,6 +1111,13 @@ void gdbstub_exit(int status)
unsigned char checksum, ch, buffer[3];
int loop;
+ if (!kgdb_connected)
+ return;
+ kgdb_connected = 0;
+
+ if (!dbg_io_ops || dbg_kdb_mode)
+ return;
+
buffer[0] = 'W';
buffer[1] = hex_asc_hi(status);
buffer[2] = hex_asc_lo(status);
@@ -1129,5 +1136,6 @@ void gdbstub_exit(int status)
dbg_io_ops->write_char(hex_asc_lo(checksum));
/* make sure the output is flushed, lest the bootloader clobber it */
- dbg_io_ops->flush();
+ if (dbg_io_ops->flush)
+ dbg_io_ops->flush();
}
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 20059ef4459a..8418c2f8ec5d 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,6 +153,13 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
+#ifdef CONFIG_DEBUG_RODATA
+ if (!bp->bp_type) {
+ kdb_printf("Software breakpoints are unavailable.\n"
+ " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " OR use hw breaks: help bph\n");
+ }
+#endif
return 1;
}
return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7179eac7b41c..07c9bbb94a0b 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -15,7 +15,6 @@
#include <linux/sched.h>
#include <linux/kdb.h>
#include <linux/nmi.h>
-#include <asm/system.h>
#include "kdb_private.h"
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 4802eb5840e1..9b5f17da1c56 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -689,7 +689,7 @@ kdb_printit:
if (!dbg_kdb_mode && kgdb_connected) {
gdbstub_msg_write(kdb_buffer, retlen);
} else {
- if (!dbg_io_ops->is_console) {
+ if (dbg_io_ops && !dbg_io_ops->is_console) {
len = strlen(kdb_buffer);
cp = kdb_buffer;
while (len--) {
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
index 4bca634975c0..118527aa60ea 100644
--- a/kernel/debug/kdb/kdb_keyboard.c
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -25,6 +25,7 @@
#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
static int kbd_exists;
+static int kbd_last_ret;
/*
* Check if the keyboard controller has a keypress for us.
@@ -90,8 +91,11 @@ int kdb_get_kbd_char(void)
return -1;
}
- if ((scancode & 0x80) != 0)
+ if ((scancode & 0x80) != 0) {
+ if (scancode == 0x9c)
+ kbd_last_ret = 0;
return -1;
+ }
scancode &= 0x7f;
@@ -178,35 +182,82 @@ int kdb_get_kbd_char(void)
return -1; /* ignore unprintables */
}
- if ((scancode & 0x7f) == 0x1c) {
- /*
- * enter key. All done. Absorb the release scancode.
- */
+ if (scancode == 0x1c) {
+ kbd_last_ret = 1;
+ return 13;
+ }
+
+ return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
+
+/*
+ * Best effort cleanup of ENTER break codes on leaving KDB. Called on
+ * exiting KDB, when we know we processed an ENTER or KP ENTER scan
+ * code.
+ */
+void kdb_kbd_cleanup_state(void)
+{
+ int scancode, scanstatus;
+
+ /*
+ * Nothing to clean up, since either
+ * ENTER was never pressed, or has already
+ * gotten cleaned up.
+ */
+ if (!kbd_last_ret)
+ return;
+
+ kbd_last_ret = 0;
+ /*
+ * Enter key. Need to absorb the break code here, lest it gets
+ * leaked out if we exit KDB as the result of processing 'g'.
+ *
+ * This has several interesting implications:
+ * + Need to handle KP ENTER, which has break code 0xe0 0x9c.
+ * + Need to handle repeat ENTER and repeat KP ENTER. Repeats
+ * only get a break code at the end of the repeated
+ * sequence. This means we can't propagate the repeated key
+ * press, and must swallow it away.
+ * + Need to handle possible PS/2 mouse input.
+ * + Need to handle mashed keys.
+ */
+
+ while (1) {
while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
- ;
+ cpu_relax();
/*
- * Fetch the scancode
+ * Fetch the scancode.
*/
scancode = inb(KBD_DATA_REG);
scanstatus = inb(KBD_STATUS_REG);
- while (scanstatus & KBD_STAT_MOUSE_OBF) {
- scancode = inb(KBD_DATA_REG);
- scanstatus = inb(KBD_STATUS_REG);
- }
+ /*
+ * Skip mouse input.
+ */
+ if (scanstatus & KBD_STAT_MOUSE_OBF)
+ continue;
- if (scancode != 0x9c) {
- /*
- * Wasn't an enter-release, why not?
- */
- kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
- scancode, scanstatus);
- }
+ /*
+ * If we see 0xe0, this is either a break code for KP
+ * ENTER, or a repeat make for KP ENTER. Either way,
+ * since the second byte is equivalent to an ENTER,
+ * skip the 0xe0 and try again.
+ *
+ * If we see 0x1c, this must be a repeat ENTER or KP
+ * ENTER (and we swallowed 0xe0 before). Try again.
+ *
+ * We can also see make and break codes for other keys
+ * mashed before or after pressing ENTER. Thus, if we
+ * see anything other than 0x9c, we have to try again.
+ *
+ * Note, if you held some key as ENTER was depressed,
+ * that break code would get leaked out.
+ */
+ if (scancode != 0x9c)
+ continue;
- return 13;
+ return;
}
-
- return keychar & 0xff;
}
-EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index e2ae7349437f..67b847dfa2bb 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1400,6 +1400,9 @@ int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
if (KDB_STATE(DOING_SS))
KDB_STATE_CLEAR(SSBPT);
+ /* Clean up any keyboard devices before leaving */
+ kdb_kbd_cleanup_state();
+
return result;
}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index e381d105b40b..47c4e56e513b 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -246,6 +246,13 @@ extern void debug_kusage(void);
extern void kdb_set_current_task(struct task_struct *);
extern struct task_struct *kdb_current_task;
+
+#ifdef CONFIG_KDB_KEYBOARD
+extern void kdb_kbd_cleanup_state(void);
+#else /* ! CONFIG_KDB_KEYBOARD */
+#define kdb_kbd_cleanup_state()
+#endif /* ! CONFIG_KDB_KEYBOARD */
+
#ifdef CONFIG_MODULES
extern struct list_head *kdb_modules;
#endif /* CONFIG_MODULES */
diff --git a/kernel/dma.c b/kernel/dma.c
index 68a2306522c8..6c6262f86c17 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -18,7 +18,6 @@
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <asm/dma.h>
-#include <asm/system.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b50357914fb..a6a9ec4cd8f5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
*running = ctx_time - event->tstamp_running;
}
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
{
}
@@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
userpg->time_running = running +
atomic64_read(&event->child_total_time_running);
- perf_update_user_clock(userpg, now);
+ arch_perf_update_userpage(userpg, now);
barrier();
++userpg->lock;
@@ -7116,6 +7116,13 @@ void __init perf_event_init(void)
/* do not patch jump label more than once per second */
jump_label_rate_limit(&perf_sched_events, HZ);
+
+ /*
+ * Build time assertion that we keep the data_head at the intended
+ * location. IOW, validation we got the __reserved[] size right.
+ */
+ BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
+ != 1024);
}
static int __init perf_event_sysfs_init(void)
diff --git a/kernel/exit.c b/kernel/exit.c
index 16b07bfac224..d8bd3b425fa7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -474,7 +474,7 @@ static void close_files(struct files_struct * files)
i = j * __NFDBITS;
if (i >= fdt->max_fds)
break;
- set = fdt->open_fds->fds_bits[j++];
+ set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
struct file * file = xchg(&fdt->fd[i], NULL);
@@ -687,11 +687,11 @@ static void exit_mm(struct task_struct * tsk)
}
/*
- * When we die, we re-parent all our children.
- * Try to give them to another thread in our thread
- * group, and if no such member exists, give it to
- * the child reaper process (ie "init") in our pid
- * space.
+ * When we die, we re-parent all our children, and try to:
+ * 1. give them to another thread in our thread group, if such a member exists
+ * 2. give it to the first ancestor process which prctl'd itself as a
+ * child_subreaper for its children (like a service manager)
+ * 3. give it to the init process (PID 1) in our pid namespace
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
__releases(&tasklist_lock)
@@ -711,8 +711,11 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
if (unlikely(pid_ns->child_reaper == father)) {
write_unlock_irq(&tasklist_lock);
- if (unlikely(pid_ns == &init_pid_ns))
- panic("Attempted to kill init!");
+ if (unlikely(pid_ns == &init_pid_ns)) {
+ panic("Attempted to kill init! exitcode=0x%08x\n",
+ father->signal->group_exit_code ?:
+ father->exit_code);
+ }
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
@@ -722,6 +725,29 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
* forget_original_parent() must move them somewhere.
*/
pid_ns->child_reaper = init_pid_ns.child_reaper;
+ } else if (father->signal->has_child_subreaper) {
+ struct task_struct *reaper;
+
+ /*
+ * Find the first ancestor marked as child_subreaper.
+ * Note that the code below checks same_thread_group(reaper,
+ * pid_ns->child_reaper). This is what we need to DTRT in a
+ * PID namespace. However we still need the check above, see
+ * http://marc.info/?l=linux-kernel&m=131385460420380
+ */
+ for (reaper = father->real_parent;
+ reaper != &init_task;
+ reaper = reaper->real_parent) {
+ if (same_thread_group(reaper, pid_ns->child_reaper))
+ break;
+ if (!reaper->signal->is_child_subreaper)
+ continue;
+ thread = reaper;
+ do {
+ if (!(thread->flags & PF_EXITING))
+ return reaper;
+ } while_each_thread(reaper, thread);
+ }
}
return pid_ns->child_reaper;
diff --git a/kernel/fork.c b/kernel/fork.c
index 37674ec55cde..b9372a0bff18 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1051,6 +1051,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+ sig->has_child_subreaper = current->signal->has_child_subreaper ||
+ current->signal->is_child_subreaper;
+
mutex_init(&sig->cred_guard_mutex);
return 0;
diff --git a/kernel/futex.c b/kernel/futex.c
index 72efa1e4359a..e2b0fb9a0b3b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -59,6 +59,7 @@
#include <linux/magic.h>
#include <linux/pid.h>
#include <linux/nsproxy.h>
+#include <linux/ptrace.h>
#include <asm/futex.h>
@@ -2443,40 +2444,31 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
{
struct robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 5f9e689dc8f0..83e368b005fc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -10,6 +10,7 @@
#include <linux/compat.h>
#include <linux/nsproxy.h>
#include <linux/futex.h>
+#include <linux/ptrace.h>
#include <asm/uaccess.h>
@@ -136,40 +137,31 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
{
struct compat_robust_list_head __user *head;
unsigned long ret;
- const struct cred *cred = current_cred(), *pcred;
+ struct task_struct *p;
if (!futex_cmpxchg_enabled)
return -ENOSYS;
+ WARN_ONCE(1, "deprecated: get_robust_list will be deleted in 2013.\n");
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
if (!pid)
- head = current->compat_robust_list;
+ p = current;
else {
- struct task_struct *p;
-
- ret = -ESRCH;
- rcu_read_lock();
p = find_task_by_vpid(pid);
if (!p)
goto err_unlock;
- ret = -EPERM;
- pcred = __task_cred(p);
- /* If victim is in different user_ns, then uids are not
- comparable, so we must have CAP_SYS_PTRACE */
- if (cred->user->user_ns != pcred->user->user_ns) {
- if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
- goto ok;
- }
- /* If victim is in same user_ns, then uids are comparable */
- if (cred->euid != pcred->euid &&
- cred->euid != pcred->uid &&
- !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
- goto err_unlock;
-ok:
- head = p->compat_robust_list;
- rcu_read_unlock();
}
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(ptr_to_compat(head), head_ptr);
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 5a38bf4de641..cf1a4a68ce44 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -13,7 +13,7 @@ config GENERIC_HARDIRQS
# Options selectable by the architecture code
# Make sparse irq Kconfig switch below available
-config HAVE_SPARSE_IRQ
+config MAY_HAVE_SPARSE_IRQ
bool
# Enable the generic irq autoprobe mechanism
@@ -56,13 +56,22 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+config IRQ_DOMAIN_DEBUG
+ bool "Expose hardware/virtual IRQ mapping via debugfs"
+ depends on IRQ_DOMAIN && DEBUG_FS
+ help
+ This option will show the mapping relationship between hardware irq
+ numbers and Linux irq numbers. The mapping is exposed via debugfs
+ in the file "virq_mapping".
+
+ If you don't know what this means you don't need it.
+
# Support forced irq threading
config IRQ_FORCED_THREADING
bool
config SPARSE_IRQ
- bool "Support sparse irq numbering"
- depends on HAVE_SPARSE_IRQ
+ bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ
---help---
Sparse irq numbering is useful for distro kernels that want
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 6ff84e6a954c..bdb180325551 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -54,14 +54,18 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
{
/*
- * Wake up the handler thread for this action. In case the
- * thread crashed and was killed we just pretend that we
- * handled the interrupt. The hardirq handler has disabled the
- * device interrupt, so no irq storm is lurking. If the
+ * In case the thread crashed and was killed we just pretend that
+ * we handled the interrupt. The hardirq handler has disabled the
+ * device interrupt, so no irq storm is lurking.
+ */
+ if (action->thread->flags & PF_EXITING)
+ return;
+
+ /*
+ * Wake up the handler thread for this action. If the
* RUNTHREAD bit is already set, nothing to do.
*/
- if ((action->thread->flags & PF_EXITING) ||
- test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ if (test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
return;
/*
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index af48e59bc2ff..3601f3fbf67c 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -632,7 +632,7 @@ unsigned int irq_linear_revmap(struct irq_domain *domain,
return revmap[hwirq];
}
-#ifdef CONFIG_VIRQ_DEBUG
+#ifdef CONFIG_IRQ_DOMAIN_DEBUG
static int virq_debug_show(struct seq_file *m, void *private)
{
unsigned long flags;
@@ -668,7 +668,7 @@ static int virq_debug_show(struct seq_file *m, void *private)
data = irq_desc_get_chip_data(desc);
seq_printf(m, "0x%16p ", data);
- if (desc->irq_data.domain->of_node)
+ if (desc->irq_data.domain && desc->irq_data.domain->of_node)
p = desc->irq_data.domain->of_node->full_name;
else
p = none;
@@ -695,14 +695,14 @@ static const struct file_operations virq_debug_fops = {
static int __init irq_debugfs_init(void)
{
- if (debugfs_create_file("virq_mapping", S_IRUGO, powerpc_debugfs_root,
+ if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL,
NULL, &virq_debug_fops) == NULL)
return -ENOMEM;
return 0;
}
__initcall(irq_debugfs_init);
-#endif /* CONFIG_VIRQ_DEBUG */
+#endif /* CONFIG_IRQ_DOMAIN_DEBUG */
int irq_domain_simple_map(struct irq_domain *d, unsigned int irq,
irq_hw_number_t hwirq)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b0ccd1ac2d6a..89a3ea82569b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -282,7 +282,7 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
struct cpumask *set = irq_default_affinity;
- int ret;
+ int ret, node = desc->irq_data.node;
/* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
@@ -301,6 +301,13 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
cpumask_and(mask, cpu_online_mask, set);
+ if (node != NUMA_NO_NODE) {
+ const struct cpumask *nodemask = cpumask_of_node(node);
+
+ /* make sure at least one of the cpus in nodemask is online */
+ if (cpumask_intersects(mask, nodemask))
+ cpumask_and(mask, mask, nodemask);
+ }
ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
switch (ret) {
case IRQ_SET_MASK_OK:
@@ -645,7 +652,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
* is marked MASKED.
*/
static void irq_finalize_oneshot(struct irq_desc *desc,
- struct irqaction *action, bool force)
+ struct irqaction *action)
{
if (!(desc->istate & IRQS_ONESHOT))
return;
@@ -679,7 +686,7 @@ again:
* we would clear the threads_oneshot bit of this thread which
* was just set.
*/
- if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ if (test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
goto out_unlock;
desc->threads_oneshot &= ~action->thread_mask;
@@ -739,7 +746,7 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
local_bh_disable();
ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action, false);
+ irq_finalize_oneshot(desc, action);
local_bh_enable();
return ret;
}
@@ -755,7 +762,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc,
irqreturn_t ret;
ret = action->thread_fn(action->irq, action->dev_id);
- irq_finalize_oneshot(desc, action, false);
+ irq_finalize_oneshot(desc, action);
return ret;
}
@@ -844,7 +851,7 @@ void exit_irq_thread(void)
wake_threads_waitq(desc);
/* Prevent a stale desc->threads_oneshot */
- irq_finalize_oneshot(desc, action, true);
+ irq_finalize_oneshot(desc, action);
}
static void irq_setup_forced_threading(struct irqaction *new)
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 47420908fba0..c3c89751b327 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -43,12 +43,16 @@ void irq_move_masked_irq(struct irq_data *idata)
* masking the irqs.
*/
if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
- < nr_cpu_ids))
- if (!chip->irq_set_affinity(&desc->irq_data,
- desc->pending_mask, false)) {
+ < nr_cpu_ids)) {
+ int ret = chip->irq_set_affinity(&desc->irq_data,
+ desc->pending_mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
irq_set_thread_affinity(desc);
}
+ }
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a6a675cb9818..4e2e472f6aeb 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -37,7 +37,6 @@
#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
-#include <asm/system.h>
#include <asm/sections.h>
/* Per cpu memory for storing cpu states in case of system crash. */
@@ -1359,6 +1358,10 @@ static int __init parse_crashkernel_simple(char *cmdline,
if (*cur == '@')
*crash_base = memparse(cur+1, &cur);
+ else if (*cur != ' ' && *cur != '\0') {
+ pr_warning("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -1462,7 +1465,9 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(init_uts_ns);
VMCOREINFO_SYMBOL(node_online_map);
+#ifdef CONFIG_MMU
VMCOREINFO_SYMBOL(swapper_pg_dir);
+#endif
VMCOREINFO_SYMBOL(_stext);
VMCOREINFO_SYMBOL(vmlist);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a0a88543934e..957a7aab8ebc 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -60,6 +60,43 @@ static DECLARE_RWSEM(umhelper_sem);
*/
char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
+static void free_modprobe_argv(struct subprocess_info *info)
+{
+ kfree(info->argv[3]); /* check call_modprobe() */
+ kfree(info->argv);
+}
+
+static int call_modprobe(char *module_name, int wait)
+{
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+
+ char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+ if (!argv)
+ goto out;
+
+ module_name = kstrdup(module_name, GFP_KERNEL);
+ if (!module_name)
+ goto free_argv;
+
+ argv[0] = modprobe_path;
+ argv[1] = "-q";
+ argv[2] = "--";
+ argv[3] = module_name; /* check free_modprobe_argv() */
+ argv[4] = NULL;
+
+ return call_usermodehelper_fns(modprobe_path, argv, envp,
+ wait | UMH_KILLABLE, NULL, free_modprobe_argv, NULL);
+free_argv:
+ kfree(argv);
+out:
+ return -ENOMEM;
+}
+
/**
* __request_module - try to load a kernel module
* @wait: wait (or not) for the operation to complete
@@ -81,11 +118,6 @@ int __request_module(bool wait, const char *fmt, ...)
char module_name[MODULE_NAME_LEN];
unsigned int max_modprobes;
int ret;
- char *argv[] = { modprobe_path, "-q", "--", module_name, NULL };
- static char *envp[] = { "HOME=/",
- "TERM=linux",
- "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL };
static atomic_t kmod_concurrent = ATOMIC_INIT(0);
#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
static int kmod_loop_msg;
@@ -128,9 +160,7 @@ int __request_module(bool wait, const char *fmt, ...)
trace_module_request(module_name, wait, _RET_IP_);
- ret = call_usermodehelper_fns(modprobe_path, argv, envp,
- wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
- NULL, NULL, NULL);
+ ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
atomic_dec(&kmod_concurrent);
return ret;
@@ -188,7 +218,7 @@ static int ____call_usermodehelper(void *data)
/* Exec failed? */
fail:
sub_info->retval = retval;
- do_exit(0);
+ return 0;
}
void call_usermodehelper_freeinfo(struct subprocess_info *info)
@@ -199,6 +229,19 @@ void call_usermodehelper_freeinfo(struct subprocess_info *info)
}
EXPORT_SYMBOL(call_usermodehelper_freeinfo);
+static void umh_complete(struct subprocess_info *sub_info)
+{
+ struct completion *comp = xchg(&sub_info->complete, NULL);
+ /*
+ * See call_usermodehelper_exec(). If xchg() returns NULL
+ * we own sub_info, the UMH_KILLABLE caller has gone away.
+ */
+ if (comp)
+ complete(comp);
+ else
+ call_usermodehelper_freeinfo(sub_info);
+}
+
/* Keventd can't block, but this (a child) can. */
static int wait_for_helper(void *data)
{
@@ -235,7 +278,7 @@ static int wait_for_helper(void *data)
sub_info->retval = ret;
}
- complete(sub_info->complete);
+ umh_complete(sub_info);
return 0;
}
@@ -244,7 +287,7 @@ static void __call_usermodehelper(struct work_struct *work)
{
struct subprocess_info *sub_info =
container_of(work, struct subprocess_info, work);
- enum umh_wait wait = sub_info->wait;
+ int wait = sub_info->wait & ~UMH_KILLABLE;
pid_t pid;
/* CLONE_VFORK: wait until the usermode helper has execve'd
@@ -269,7 +312,7 @@ static void __call_usermodehelper(struct work_struct *work)
case UMH_WAIT_EXEC:
if (pid < 0)
sub_info->retval = pid;
- complete(sub_info->complete);
+ umh_complete(sub_info);
}
}
@@ -435,8 +478,7 @@ EXPORT_SYMBOL(call_usermodehelper_setfns);
* asynchronously if wait is not set, and runs as a child of keventd.
* (ie. it runs with full root capabilities).
*/
-int call_usermodehelper_exec(struct subprocess_info *sub_info,
- enum umh_wait wait)
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
{
DECLARE_COMPLETION_ONSTACK(done);
int retval = 0;
@@ -456,9 +498,21 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
queue_work(khelper_wq, &sub_info->work);
if (wait == UMH_NO_WAIT) /* task has freed sub_info */
goto unlock;
+
+ if (wait & UMH_KILLABLE) {
+ retval = wait_for_completion_killable(&done);
+ if (!retval)
+ goto wait_done;
+
+ /* umh_complete() will see NULL and free sub_info */
+ if (xchg(&sub_info->complete, NULL))
+ goto unlock;
+ /* fallthrough, umh_complete() was already called */
+ }
+
wait_for_completion(&done);
+wait_done:
retval = sub_info->retval;
-
out:
call_usermodehelper_freeinfo(sub_info);
unlock:
diff --git a/kernel/module.c b/kernel/module.c
index 2c932760fd33..78ac6ec1e425 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -105,6 +105,7 @@ struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
/* Block module loading/unloading? */
int modules_disabled = 0;
+core_param(nomodule, modules_disabled, bint, 0);
/* Waiting for a module to finish initializing? */
static DECLARE_WAIT_QUEUE_HEAD(module_wq);
@@ -903,6 +904,36 @@ static ssize_t show_refcnt(struct module_attribute *mattr,
static struct module_attribute modinfo_refcnt =
__ATTR(refcnt, 0444, show_refcnt, NULL);
+void __module_get(struct module *module)
+{
+ if (module) {
+ preempt_disable();
+ __this_cpu_inc(module->refptr->incs);
+ trace_module_get(module, _RET_IP_);
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(__module_get);
+
+bool try_module_get(struct module *module)
+{
+ bool ret = true;
+
+ if (module) {
+ preempt_disable();
+
+ if (likely(module_is_live(module))) {
+ __this_cpu_inc(module->refptr->incs);
+ trace_module_get(module, _RET_IP_);
+ } else
+ ret = false;
+
+ preempt_enable();
+ }
+ return ret;
+}
+EXPORT_SYMBOL(try_module_get);
+
void module_put(struct module *module)
{
if (module) {
@@ -2380,8 +2411,7 @@ static int copy_and_check(struct load_info *info,
return -ENOEXEC;
/* Suck in entire file: we'll want most of it. */
- /* vmalloc barfs on "unusual" numbers. Check here */
- if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
+ if ((hdr = vmalloc(len)) == NULL)
return -ENOMEM;
if (copy_from_user(hdr, umod, len) != 0) {
@@ -2922,7 +2952,8 @@ static struct module *load_module(void __user *umod,
mutex_unlock(&module_mutex);
/* Module is ready to execute: parsing args may do that. */
- err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
+ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+ -32768, 32767, NULL);
if (err < 0)
goto unlink;
diff --git a/kernel/params.c b/kernel/params.c
index 4bc965d8a1fe..f37d82631347 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
-#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
@@ -88,6 +87,8 @@ static int parse_one(char *param,
char *val,
const struct kernel_param *params,
unsigned num_params,
+ s16 min_level,
+ s16 max_level,
int (*handle_unknown)(char *param, char *val))
{
unsigned int i;
@@ -96,6 +97,9 @@ static int parse_one(char *param,
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
+ if (params[i].level < min_level
+ || params[i].level > max_level)
+ return 0;
/* No one handled NULL, so do it here. */
if (!val && params[i].ops->set != param_set_bool
&& params[i].ops->set != param_set_bint)
@@ -175,6 +179,8 @@ int parse_args(const char *name,
char *args,
const struct kernel_param *params,
unsigned num,
+ s16 min_level,
+ s16 max_level,
int (*unknown)(char *param, char *val))
{
char *param, *val;
@@ -190,7 +196,8 @@ int parse_args(const char *name,
args = next_arg(args, &param, &val);
irq_was_disabled = irqs_disabled();
- ret = parse_one(param, val, params, num, unknown);
+ ret = parse_one(param, val, params, num,
+ min_level, max_level, unknown);
if (irq_was_disabled && !irqs_disabled()) {
printk(KERN_WARNING "parse_args(): option '%s' enabled "
"irq's!\n", param);
@@ -298,35 +305,18 @@ EXPORT_SYMBOL(param_ops_charp);
/* Actually could be a bool or an int, for historical reasons. */
int param_set_bool(const char *val, const struct kernel_param *kp)
{
- bool v;
- int ret;
-
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
- ret = strtobool(val, &v);
- if (ret)
- return ret;
-
- if (kp->flags & KPARAM_ISBOOL)
- *(bool *)kp->arg = v;
- else
- *(int *)kp->arg = v;
- return 0;
+ return strtobool(val, kp->arg);
}
EXPORT_SYMBOL(param_set_bool);
int param_get_bool(char *buffer, const struct kernel_param *kp)
{
- bool val;
- if (kp->flags & KPARAM_ISBOOL)
- val = *(bool *)kp->arg;
- else
- val = *(int *)kp->arg;
-
/* Y and N chosen as being relatively non-coder friendly */
- return sprintf(buffer, "%c", val ? 'Y' : 'N');
+ return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
}
EXPORT_SYMBOL(param_get_bool);
@@ -344,7 +334,6 @@ int param_set_invbool(const char *val, const struct kernel_param *kp)
struct kernel_param dummy;
dummy.arg = &boolval;
- dummy.flags = KPARAM_ISBOOL;
ret = param_set_bool(val, &dummy);
if (ret == 0)
*(bool *)kp->arg = !boolval;
@@ -373,7 +362,6 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
/* Match bool exactly, by re-using it. */
boolkp = *kp;
boolkp.arg = &v;
- boolkp.flags |= KPARAM_ISBOOL;
ret = param_set_bool(val, &boolkp);
if (ret == 0)
@@ -394,7 +382,7 @@ static int param_array(const char *name,
unsigned int min, unsigned int max,
void *elem, int elemsize,
int (*set)(const char *, const struct kernel_param *kp),
- u16 flags,
+ s16 level,
unsigned int *num)
{
int ret;
@@ -404,7 +392,7 @@ static int param_array(const char *name,
/* Get the name right for errors. */
kp.name = name;
kp.arg = elem;
- kp.flags = flags;
+ kp.level = level;
*num = 0;
/* We expect a comma-separated list of values. */
@@ -445,7 +433,7 @@ static int param_array_set(const char *val, const struct kernel_param *kp)
unsigned int temp_num;
return param_array(kp->name, val, 1, arr->max, arr->elem,
- arr->elemsize, arr->ops->set, kp->flags,
+ arr->elemsize, arr->ops->set, kp->level,
arr->num ?: &temp_num);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a8968396046d..57bc1fd35b3c 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -15,6 +15,7 @@
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
+#include <linux/reboot.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -168,13 +169,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
while (nr > 0) {
rcu_read_lock();
- /*
- * Any nested-container's init processes won't ignore the
- * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
- */
task = pid_task(find_vpid(nr), PIDTYPE_PID);
- if (task)
- send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
+ if (task && !__fatal_signal_pending(task))
+ send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
rcu_read_unlock();
@@ -187,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
+ if (pid_ns->reboot)
+ current->signal->group_exit_code = pid_ns->reboot;
+
acct_exit_ns(pid_ns);
return;
}
@@ -221,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = {
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
+int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
+{
+ if (pid_ns == &init_pid_ns)
+ return 0;
+
+ switch (cmd) {
+ case LINUX_REBOOT_CMD_RESTART2:
+ case LINUX_REBOOT_CMD_RESTART:
+ pid_ns->reboot = SIGHUP;
+ break;
+
+ case LINUX_REBOOT_CMD_POWER_OFF:
+ case LINUX_REBOOT_CMD_HALT:
+ pid_ns->reboot = SIGINT;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ read_lock(&tasklist_lock);
+ force_sig(SIGKILL, pid_ns->child_reaper);
+ read_unlock(&tasklist_lock);
+
+ do_exit(0);
+
+ /* Not reached */
+ return 0;
+}
+
static __init int pid_namespaces_init(void)
{
pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 00ab2ca5ed11..ee8d49b9c309 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -231,26 +231,22 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
}
static int ptrace_attach(struct task_struct *task, long request,
+ unsigned long addr,
unsigned long flags)
{
bool seize = (request == PTRACE_SEIZE);
int retval;
- /*
- * SEIZE will enable new ptrace behaviors which will be implemented
- * gradually. SEIZE_DEVEL is used to prevent applications
- * expecting full SEIZE behaviors trapping on kernel commits which
- * are still in the process of implementing them.
- *
- * Only test programs for new ptrace behaviors being implemented
- * should set SEIZE_DEVEL. If unset, SEIZE will fail with -EIO.
- *
- * Once SEIZE behaviors are completely implemented, this flag and
- * the following test will be removed.
- */
retval = -EIO;
- if (seize && !(flags & PTRACE_SEIZE_DEVEL))
- goto out;
+ if (seize) {
+ if (addr != 0)
+ goto out;
+ if (flags & ~(unsigned long)PTRACE_O_MASK)
+ goto out;
+ flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
+ } else {
+ flags = PT_PTRACED;
+ }
audit_ptrace(task);
@@ -262,7 +258,7 @@ static int ptrace_attach(struct task_struct *task, long request,
/*
* Protect exec's credential calculations against our interference;
- * interference; SUID, SGID and LSM creds get determined differently
+ * SUID, SGID and LSM creds get determined differently
* under ptrace.
*/
retval = -ERESTARTNOINTR;
@@ -282,11 +278,11 @@ static int ptrace_attach(struct task_struct *task, long request,
if (task->ptrace)
goto unlock_tasklist;
- task->ptrace = PT_PTRACED;
if (seize)
- task->ptrace |= PT_SEIZED;
+ flags |= PT_SEIZED;
if (ns_capable(task_user_ns(task), CAP_SYS_PTRACE))
- task->ptrace |= PT_PTRACE_CAP;
+ flags |= PT_PTRACE_CAP;
+ task->ptrace = flags;
__ptrace_link(task, current);
@@ -528,30 +524,18 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
- child->ptrace &= ~PT_TRACE_MASK;
+ unsigned flags;
- if (data & PTRACE_O_TRACESYSGOOD)
- child->ptrace |= PT_TRACESYSGOOD;
-
- if (data & PTRACE_O_TRACEFORK)
- child->ptrace |= PT_TRACE_FORK;
-
- if (data & PTRACE_O_TRACEVFORK)
- child->ptrace |= PT_TRACE_VFORK;
-
- if (data & PTRACE_O_TRACECLONE)
- child->ptrace |= PT_TRACE_CLONE;
-
- if (data & PTRACE_O_TRACEEXEC)
- child->ptrace |= PT_TRACE_EXEC;
-
- if (data & PTRACE_O_TRACEVFORKDONE)
- child->ptrace |= PT_TRACE_VFORK_DONE;
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
- if (data & PTRACE_O_TRACEEXIT)
- child->ptrace |= PT_TRACE_EXIT;
+ /* Avoid intermediate state when all opts are cleared */
+ flags = child->ptrace;
+ flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
+ flags |= (data << PT_OPT_FLAG_SHIFT);
+ child->ptrace = flags;
- return (data & ~PTRACE_O_MASK) ? -EINVAL : 0;
+ return 0;
}
static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info)
@@ -891,7 +875,7 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
- ret = ptrace_attach(child, request, data);
+ ret = ptrace_attach(child, request, addr, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
@@ -1034,7 +1018,7 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
}
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
- ret = ptrace_attach(child, request, data);
+ ret = ptrace_attach(child, request, addr, data);
/*
* Some architectures need to do book-keeping after
* a ptrace attach.
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index b152f74f02de..6850f53e02d8 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -10,7 +10,6 @@
#include <linux/export.h>
#include <linux/rwsem.h>
-#include <asm/system.h>
#include <linux/atomic.h>
/*
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 503d6426126d..4603b9d8f30a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -73,6 +73,7 @@
#include <linux/init_task.h>
#include <linux/binfmts.h>
+#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
@@ -1264,29 +1265,59 @@ EXPORT_SYMBOL_GPL(kick_process);
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
- int dest_cpu;
const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+ enum { cpuset, possible, fail } state = cpuset;
+ int dest_cpu;
/* Look for allowed, online CPU in same node. */
- for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+ for_each_cpu(dest_cpu, nodemask) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
+ }
+
+ for (;;) {
+ /* Any allowed, online CPU? */
+ for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
+ if (!cpu_online(dest_cpu))
+ continue;
+ if (!cpu_active(dest_cpu))
+ continue;
+ goto out;
+ }
- /* Any allowed, online CPU? */
- dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
- if (dest_cpu < nr_cpu_ids)
- return dest_cpu;
+ switch (state) {
+ case cpuset:
+ /* No more Mr. Nice Guy. */
+ cpuset_cpus_allowed_fallback(p);
+ state = possible;
+ break;
- /* No more Mr. Nice Guy. */
- dest_cpu = cpuset_cpus_allowed_fallback(p);
- /*
- * Don't tell them about moving exiting tasks or
- * kernel threads (both mm NULL), since they never
- * leave kernel.
- */
- if (p->mm && printk_ratelimit()) {
- printk_sched("process %d (%s) no longer affine to cpu%d\n",
- task_pid_nr(p), p->comm, cpu);
+ case possible:
+ do_set_cpus_allowed(p, cpu_possible_mask);
+ state = fail;
+ break;
+
+ case fail:
+ BUG();
+ break;
+ }
+ }
+
+out:
+ if (state != cpuset) {
+ /*
+ * Don't tell them about moving exiting tasks or
+ * kernel threads (both mm NULL), since they never
+ * leave kernel.
+ */
+ if (p->mm && printk_ratelimit()) {
+ printk_sched("process %d (%s) no longer affine to cpu%d\n",
+ task_pid_nr(p), p->comm, cpu);
+ }
}
return dest_cpu;
@@ -1933,6 +1964,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
local_irq_enable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
+ finish_arch_post_lock_switch();
fire_sched_in_preempt_notifiers(current);
if (mm)
@@ -3070,8 +3102,6 @@ EXPORT_SYMBOL(sub_preempt_count);
*/
static noinline void __schedule_bug(struct task_struct *prev)
{
- struct pt_regs *regs = get_irq_regs();
-
if (oops_in_progress)
return;
@@ -3082,11 +3112,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
print_modules();
if (irqs_disabled())
print_irqtrace_events(prev);
-
- if (regs)
- show_regs(regs);
- else
- dump_stack();
+ dump_stack();
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 94340c7544a9..0d97ebdc58f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -416,8 +416,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec);
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
/**************************************************************
* Scheduling class tree data structure manipulation methods:
@@ -1162,7 +1162,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
__clear_buddies_skip(se);
}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
@@ -1546,8 +1546,8 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
resched_task(rq_of(cfs_rq)->curr);
}
-static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec)
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
{
if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
return;
@@ -2073,11 +2073,11 @@ void unthrottle_offline_cfs_rqs(struct rq *rq)
}
#else /* CONFIG_CFS_BANDWIDTH */
-static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
- unsigned long delta_exec) {}
+static __always_inline
+void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec) {}
static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
-static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
{
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b60dad720173..44af55e6d5d0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1428,7 +1428,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
next_idx:
if (idx >= MAX_RT_PRIO)
continue;
- if (next && next->prio < idx)
+ if (next && next->prio <= idx)
continue;
list_for_each_entry(rt_se, array->queue + idx, run_list) {
struct task_struct *p;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 42b1f304b044..fb3acba4d52e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -681,6 +681,9 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch() do { } while (0)
+#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
diff --git a/kernel/signal.c b/kernel/signal.c
index e76001ccf5cd..17afcaf582d0 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -36,6 +36,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/siginfo.h>
+#include <asm/cacheflush.h>
#include "audit.h" /* audit_signal_info() */
/*
@@ -58,21 +59,20 @@ static int sig_handler_ignored(void __user *handler, int sig)
(handler == SIG_DFL && sig_kernel_ignore(sig));
}
-static int sig_task_ignored(struct task_struct *t, int sig,
- int from_ancestor_ns)
+static int sig_task_ignored(struct task_struct *t, int sig, bool force)
{
void __user *handler;
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !from_ancestor_ns)
+ handler == SIG_DFL && !force)
return 1;
return sig_handler_ignored(handler, sig);
}
-static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
+static int sig_ignored(struct task_struct *t, int sig, bool force)
{
/*
* Blocked signals are never ignored, since the
@@ -82,7 +82,7 @@ static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, from_ancestor_ns))
+ if (!sig_task_ignored(t, sig, force))
return 0;
/*
@@ -855,7 +855,7 @@ static void ptrace_trap_notify(struct task_struct *t)
* Returns true if the signal should be actually delivered, otherwise
* it should be dropped.
*/
-static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
+static int prepare_signal(int sig, struct task_struct *p, bool force)
{
struct signal_struct *signal = p->signal;
struct task_struct *t;
@@ -915,7 +915,7 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
}
}
- return !sig_ignored(p, sig, from_ancestor_ns);
+ return !sig_ignored(p, sig, force);
}
/*
@@ -1059,7 +1059,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
assert_spin_locked(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
- if (!prepare_signal(sig, t, from_ancestor_ns))
+ if (!prepare_signal(sig, t,
+ from_ancestor_ns || (info == SEND_SIG_FORCED)))
goto ret;
pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1601,7 +1602,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
ret = 1; /* the signal is ignored */
result = TRACE_SIGNAL_IGNORED;
- if (!prepare_signal(sig, t, 0))
+ if (!prepare_signal(sig, t, false))
goto out;
ret = 0;
diff --git a/kernel/smp.c b/kernel/smp.c
index db197d60489b..2f8b10ecf759 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -701,3 +701,93 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait)
return ret;
}
EXPORT_SYMBOL(on_each_cpu);
+
+/**
+ * on_each_cpu_mask(): Run a function on processors specified by
+ * cpumask, which may include the local processor.
+ * @mask: The set of cpus to run on (only runs on online subset).
+ * @func: The function to run. This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to the function.
+ * @wait: If true, wait (atomically) until function has completed
+ * on other CPUs.
+ *
+ * If @wait is true, then returns once @func has returned.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
+ void *info, bool wait)
+{
+ int cpu = get_cpu();
+
+ smp_call_function_many(mask, func, info, wait);
+ if (cpumask_test_cpu(cpu, mask)) {
+ local_irq_disable();
+ func(info);
+ local_irq_enable();
+ }
+ put_cpu();
+}
+EXPORT_SYMBOL(on_each_cpu_mask);
+
+/*
+ * on_each_cpu_cond(): Call a function on each processor for which
+ * the supplied function cond_func returns true, optionally waiting
+ * for all the required CPUs to finish. This may include the local
+ * processor.
+ * @cond_func: A callback function that is passed a cpu id and
+ * the the info parameter. The function is called
+ * with preemption disabled. The function should
+ * return a blooean value indicating whether to IPI
+ * the specified CPU.
+ * @func: The function to run on all applicable CPUs.
+ * This must be fast and non-blocking.
+ * @info: An arbitrary pointer to pass to both functions.
+ * @wait: If true, wait (atomically) until function has
+ * completed on other CPUs.
+ * @gfp_flags: GFP flags to use when allocating the cpumask
+ * used internally by the function.
+ *
+ * The function might sleep if the GFP flags indicates a non
+ * atomic allocation is allowed.
+ *
+ * Preemption is disabled to protect against CPUs going offline but not online.
+ * CPUs going online during the call will not be seen or sent an IPI.
+ *
+ * You must not call this function with disabled interrupts or
+ * from a hardware interrupt handler or from a bottom half handler.
+ */
+void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
+ smp_call_func_t func, void *info, bool wait,
+ gfp_t gfp_flags)
+{
+ cpumask_var_t cpus;
+ int cpu, ret;
+
+ might_sleep_if(gfp_flags & __GFP_WAIT);
+
+ if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) {
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info))
+ cpumask_set_cpu(cpu, cpus);
+ on_each_cpu_mask(cpus, func, info, wait);
+ preempt_enable();
+ free_cpumask_var(cpus);
+ } else {
+ /*
+ * No free cpumask, bother. No matter, we'll
+ * just have to IPI them one by one.
+ */
+ preempt_disable();
+ for_each_online_cpu(cpu)
+ if (cond_func(cpu, info)) {
+ ret = smp_call_function_single(cpu, func,
+ info, wait);
+ WARN_ON_ONCE(!ret);
+ }
+ preempt_enable();
+ }
+}
+EXPORT_SYMBOL(on_each_cpu_cond);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 84c7d96918bf..5cdd8065a3ce 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -163,7 +163,7 @@ void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
EXPORT_SYMBOL(_raw_spin_lock_bh);
#endif
-#ifndef CONFIG_INLINE_SPIN_UNLOCK
+#ifdef CONFIG_UNINLINE_SPIN_UNLOCK
void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
{
__raw_spin_unlock(lock);
diff --git a/kernel/sys.c b/kernel/sys.c
index 888d227fd195..e7006eb6c1e4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
magic2 != LINUX_REBOOT_MAGIC2C))
return -EINVAL;
+ /*
+ * If pid namespaces are enabled and the current task is in a child
+ * pid_namespace, the command is handled by reboot_pid_ns() which will
+ * call do_exit().
+ */
+ ret = reboot_pid_ns(task_active_pid_ns(current), cmd);
+ if (ret)
+ return ret;
+
/* Instead of trying to make the power_off code look like
* halt when pm_power_off is not set do it the easy way.
*/
@@ -1962,6 +1971,14 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_MM:
error = prctl_set_mm(arg2, arg3, arg4, arg5);
break;
+ case PR_SET_CHILD_SUBREAPER:
+ me->signal->is_child_subreaper = !!arg2;
+ error = 0;
+ break;
+ case PR_GET_CHILD_SUBREAPER:
+ error = put_user(me->signal->is_child_subreaper,
+ (int __user *) arg2);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d53046b905..52b3a06a02f8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -23,6 +23,7 @@
#include <linux/swap.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
+#include <linux/bitmap.h>
#include <linux/signal.h>
#include <linux/printk.h>
#include <linux/proc_fs.h>
@@ -68,6 +69,9 @@
#include <asm/stacktrace.h>
#include <asm/io.h>
#endif
+#ifdef CONFIG_SPARC
+#include <asm/setup.h>
+#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
#include <linux/acct.h>
#endif
@@ -142,7 +146,6 @@ static const int cap_last_cap = CAP_LAST_CAP;
#include <linux/inotify.h>
#endif
#ifdef CONFIG_SPARC
-#include <asm/system.h>
#endif
#ifdef CONFIG_SPARC64
@@ -193,20 +196,6 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
#endif
-static struct ctl_table root_table[];
-static struct ctl_table_root sysctl_table_root;
-static struct ctl_table_header root_table_header = {
- {{.count = 1,
- .ctl_table = root_table,
- .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
- .root = &sysctl_table_root,
- .set = &sysctl_table_root.default_set,
-};
-static struct ctl_table_root sysctl_table_root = {
- .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
- .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
-};
-
static struct ctl_table kern_table[];
static struct ctl_table vm_table[];
static struct ctl_table fs_table[];
@@ -223,7 +212,7 @@ int sysctl_legacy_va_layout;
/* The default sysctl tables: */
-static struct ctl_table root_table[] = {
+static struct ctl_table sysctl_base_table[] = {
{
.procname = "kernel",
.mode = 0555,
@@ -1560,490 +1549,12 @@ static struct ctl_table dev_table[] = {
{ }
};
-static DEFINE_SPINLOCK(sysctl_lock);
-
-/* called under sysctl_lock */
-static int use_table(struct ctl_table_header *p)
+int __init sysctl_init(void)
{
- if (unlikely(p->unregistering))
- return 0;
- p->used++;
- return 1;
-}
-
-/* called under sysctl_lock */
-static void unuse_table(struct ctl_table_header *p)
-{
- if (!--p->used)
- if (unlikely(p->unregistering))
- complete(p->unregistering);
-}
-
-/* called under sysctl_lock, will reacquire if has to wait */
-static void start_unregistering(struct ctl_table_header *p)
-{
- /*
- * if p->used is 0, nobody will ever touch that entry again;
- * we'll eliminate all paths to it before dropping sysctl_lock
- */
- if (unlikely(p->used)) {
- struct completion wait;
- init_completion(&wait);
- p->unregistering = &wait;
- spin_unlock(&sysctl_lock);
- wait_for_completion(&wait);
- spin_lock(&sysctl_lock);
- } else {
- /* anything non-NULL; we'll never dereference it */
- p->unregistering = ERR_PTR(-EINVAL);
- }
- /*
- * do not remove from the list until nobody holds it; walking the
- * list in do_sysctl() relies on that.
- */
- list_del_init(&p->ctl_entry);
-}
-
-void sysctl_head_get(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- head->count++;
- spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- if (!--head->count)
- kfree_rcu(head, rcu);
- spin_unlock(&sysctl_lock);
-}
-
-struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
-{
- if (!head)
- BUG();
- spin_lock(&sysctl_lock);
- if (!use_table(head))
- head = ERR_PTR(-ENOENT);
- spin_unlock(&sysctl_lock);
- return head;
-}
-
-void sysctl_head_finish(struct ctl_table_header *head)
-{
- if (!head)
- return;
- spin_lock(&sysctl_lock);
- unuse_table(head);
- spin_unlock(&sysctl_lock);
-}
-
-static struct ctl_table_set *
-lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
- struct ctl_table_set *set = &root->default_set;
- if (root->lookup)
- set = root->lookup(root, namespaces);
- return set;
-}
-
-static struct list_head *
-lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
-{
- struct ctl_table_set *set = lookup_header_set(root, namespaces);
- return &set->list;
-}
-
-struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
- struct ctl_table_header *prev)
-{
- struct ctl_table_root *root;
- struct list_head *header_list;
- struct ctl_table_header *head;
- struct list_head *tmp;
-
- spin_lock(&sysctl_lock);
- if (prev) {
- head = prev;
- tmp = &prev->ctl_entry;
- unuse_table(prev);
- goto next;
- }
- tmp = &root_table_header.ctl_entry;
- for (;;) {
- head = list_entry(tmp, struct ctl_table_header, ctl_entry);
-
- if (!use_table(head))
- goto next;
- spin_unlock(&sysctl_lock);
- return head;
- next:
- root = head->root;
- tmp = tmp->next;
- header_list = lookup_header_list(root, namespaces);
- if (tmp != header_list)
- continue;
-
- do {
- root = list_entry(root->root_list.next,
- struct ctl_table_root, root_list);
- if (root == &sysctl_table_root)
- goto out;
- header_list = lookup_header_list(root, namespaces);
- } while (list_empty(header_list));
- tmp = header_list->next;
- }
-out:
- spin_unlock(&sysctl_lock);
- return NULL;
-}
-
-struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
-{
- return __sysctl_head_next(current->nsproxy, prev);
-}
-
-void register_sysctl_root(struct ctl_table_root *root)
-{
- spin_lock(&sysctl_lock);
- list_add_tail(&root->root_list, &sysctl_table_root.root_list);
- spin_unlock(&sysctl_lock);
-}
-
-/*
- * sysctl_perm does NOT grant the superuser all rights automatically, because
- * some sysctl variables are readonly even to root.
- */
-
-static int test_perm(int mode, int op)
-{
- if (!current_euid())
- mode >>= 6;
- else if (in_egroup_p(0))
- mode >>= 3;
- if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
- return 0;
- return -EACCES;
-}
-
-int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
-{
- int mode;
-
- if (root->permissions)
- mode = root->permissions(root, current->nsproxy, table);
- else
- mode = table->mode;
-
- return test_perm(mode, op);
-}
-
-static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table)
-{
- for (; table->procname; table++) {
- table->parent = parent;
- if (table->child)
- sysctl_set_parent(table, table->child);
- }
-}
-
-static __init int sysctl_init(void)
-{
- sysctl_set_parent(NULL, root_table);
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
- sysctl_check_table(current->nsproxy, root_table);
-#endif
+ register_sysctl_table(sysctl_base_table);
return 0;
}
-core_initcall(sysctl_init);
-
-static struct ctl_table *is_branch_in(struct ctl_table *branch,
- struct ctl_table *table)
-{
- struct ctl_table *p;
- const char *s = branch->procname;
-
- /* branch should have named subdirectory as its first element */
- if (!s || !branch->child)
- return NULL;
-
- /* ... and nothing else */
- if (branch[1].procname)
- return NULL;
-
- /* table should contain subdirectory with the same name */
- for (p = table; p->procname; p++) {
- if (!p->child)
- continue;
- if (p->procname && strcmp(p->procname, s) == 0)
- return p;
- }
- return NULL;
-}
-
-/* see if attaching q to p would be an improvement */
-static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
-{
- struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
- struct ctl_table *next;
- int is_better = 0;
- int not_in_parent = !p->attached_by;
-
- while ((next = is_branch_in(by, to)) != NULL) {
- if (by == q->attached_by)
- is_better = 1;
- if (to == p->attached_by)
- not_in_parent = 1;
- by = by->child;
- to = next->child;
- }
-
- if (is_better && not_in_parent) {
- q->attached_by = by;
- q->attached_to = to;
- q->parent = p;
- }
-}
-
-/**
- * __register_sysctl_paths - register a sysctl hierarchy
- * @root: List of sysctl headers to register on
- * @namespaces: Data to compute which lists of sysctl entries are visible
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * The members of the &struct ctl_table structure are used as follows:
- *
- * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not
- * enter a sysctl file
- *
- * data - a pointer to data for use by proc_handler
- *
- * maxlen - the maximum size in bytes of the data
- *
- * mode - the file permissions for the /proc/sys file, and for sysctl(2)
- *
- * child - a pointer to the child sysctl table if this entry is a directory, or
- * %NULL.
- *
- * proc_handler - the text handler routine (described below)
- *
- * de - for internal use by the sysctl routines
- *
- * extra1, extra2 - extra pointers usable by the proc handler routines
- *
- * Leaf nodes in the sysctl tree will be represented by a single file
- * under /proc; non-leaf nodes will be represented by directories.
- *
- * sysctl(2) can automatically manage read and write requests through
- * the sysctl table. The data and maxlen fields of the ctl_table
- * struct enable minimal validation of the values being written to be
- * performed, and the mode field allows minimal authentication.
- *
- * There must be a proc_handler routine for any terminal nodes
- * mirrored under /proc/sys (non-terminals are handled by a built-in
- * directory handler). Several default handlers are available to
- * cover common cases -
- *
- * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(),
- * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(),
- * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax()
- *
- * It is the handler's job to read the input buffer from user memory
- * and process it. The handler should return 0 on success.
- *
- * This routine returns %NULL on a failure to register, and a pointer
- * to the table header on success.
- */
-struct ctl_table_header *__register_sysctl_paths(
- struct ctl_table_root *root,
- struct nsproxy *namespaces,
- const struct ctl_path *path, struct ctl_table *table)
-{
- struct ctl_table_header *header;
- struct ctl_table *new, **prevp;
- unsigned int n, npath;
- struct ctl_table_set *set;
-
- /* Count the path components */
- for (npath = 0; path[npath].procname; ++npath)
- ;
-
- /*
- * For each path component, allocate a 2-element ctl_table array.
- * The first array element will be filled with the sysctl entry
- * for this, the second will be the sentinel (procname == 0).
- *
- * We allocate everything in one go so that we don't have to
- * worry about freeing additional memory in unregister_sysctl_table.
- */
- header = kzalloc(sizeof(struct ctl_table_header) +
- (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
- if (!header)
- return NULL;
-
- new = (struct ctl_table *) (header + 1);
-
- /* Now connect the dots */
- prevp = &header->ctl_table;
- for (n = 0; n < npath; ++n, ++path) {
- /* Copy the procname */
- new->procname = path->procname;
- new->mode = 0555;
-
- *prevp = new;
- prevp = &new->child;
-
- new += 2;
- }
- *prevp = table;
- header->ctl_table_arg = table;
-
- INIT_LIST_HEAD(&header->ctl_entry);
- header->used = 0;
- header->unregistering = NULL;
- header->root = root;
- sysctl_set_parent(NULL, header->ctl_table);
- header->count = 1;
-#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
- if (sysctl_check_table(namespaces, header->ctl_table)) {
- kfree(header);
- return NULL;
- }
-#endif
- spin_lock(&sysctl_lock);
- header->set = lookup_header_set(root, namespaces);
- header->attached_by = header->ctl_table;
- header->attached_to = root_table;
- header->parent = &root_table_header;
- for (set = header->set; set; set = set->parent) {
- struct ctl_table_header *p;
- list_for_each_entry(p, &set->list, ctl_entry) {
- if (p->unregistering)
- continue;
- try_attach(p, header);
- }
- }
- header->parent->count++;
- list_add_tail(&header->ctl_entry, &header->set->list);
- spin_unlock(&sysctl_lock);
-
- return header;
-}
-
-/**
- * register_sysctl_table_path - register a sysctl table hierarchy
- * @path: The path to the directory the sysctl table is in.
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See __register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
- path, table);
-}
-
-/**
- * register_sysctl_table - register a sysctl table hierarchy
- * @table: the top-level table structure
- *
- * Register a sysctl table hierarchy. @table should be a filled in ctl_table
- * array. A completely 0 filled entry terminates the table.
- *
- * See register_sysctl_paths for more details.
- */
-struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
-{
- static const struct ctl_path null_path[] = { {} };
-
- return register_sysctl_paths(null_path, table);
-}
-
-/**
- * unregister_sysctl_table - unregister a sysctl table hierarchy
- * @header: the header returned from register_sysctl_table
- *
- * Unregisters the sysctl table and all children. proc entries may not
- * actually be removed until they are no longer used by anyone.
- */
-void unregister_sysctl_table(struct ctl_table_header * header)
-{
- might_sleep();
-
- if (header == NULL)
- return;
-
- spin_lock(&sysctl_lock);
- start_unregistering(header);
- if (!--header->parent->count) {
- WARN_ON(1);
- kfree_rcu(header->parent, rcu);
- }
- if (!--header->count)
- kfree_rcu(header, rcu);
- spin_unlock(&sysctl_lock);
-}
-
-int sysctl_is_seen(struct ctl_table_header *p)
-{
- struct ctl_table_set *set = p->set;
- int res;
- spin_lock(&sysctl_lock);
- if (p->unregistering)
- res = 0;
- else if (!set->is_seen)
- res = 1;
- else
- res = set->is_seen(set);
- spin_unlock(&sysctl_lock);
- return res;
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
- struct ctl_table_set *parent,
- int (*is_seen)(struct ctl_table_set *))
-{
- INIT_LIST_HEAD(&p->list);
- p->parent = parent ? parent : &sysctl_table_root.default_set;
- p->is_seen = is_seen;
-}
-
-#else /* !CONFIG_SYSCTL */
-struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
-{
- return NULL;
-}
-
-struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
- struct ctl_table *table)
-{
- return NULL;
-}
-
-void unregister_sysctl_table(struct ctl_table_header * table)
-{
-}
-
-void setup_sysctl_set(struct ctl_table_set *p,
- struct ctl_table_set *parent,
- int (*is_seen)(struct ctl_table_set *))
-{
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
-}
-
#endif /* CONFIG_SYSCTL */
/*
@@ -2885,9 +2396,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
}
}
- while (val_a <= val_b)
- set_bit(val_a++, tmp_bitmap);
-
+ bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1);
first = 0;
proc_skip_char(&kbuf, &left, '\n');
}
@@ -2930,8 +2439,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
if (*ppos)
bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
else
- memcpy(bitmap, tmp_bitmap,
- BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
+ bitmap_copy(bitmap, tmp_bitmap, bitmap_len);
}
kfree(tmp_bitmap);
*lenp -= left;
@@ -3009,6 +2517,3 @@ EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
EXPORT_SYMBOL(proc_doulongvec_minmax);
EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
-EXPORT_SYMBOL(register_sysctl_table);
-EXPORT_SYMBOL(register_sysctl_paths);
-EXPORT_SYMBOL(unregister_sysctl_table);
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
deleted file mode 100644
index 362da653813d..000000000000
--- a/kernel/sysctl_check.c
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <linux/stat.h>
-#include <linux/sysctl.h>
-#include "../fs/xfs/xfs_sysctl.h"
-#include <linux/sunrpc/debug.h>
-#include <linux/string.h>
-#include <net/ip_vs.h>
-
-
-static int sysctl_depth(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth;
-
- depth = 0;
- for (tmp = table; tmp->parent; tmp = tmp->parent)
- depth++;
-
- return depth;
-}
-
-static struct ctl_table *sysctl_parent(struct ctl_table *table, int n)
-{
- int i;
-
- for (i = 0; table && i < n; i++)
- table = table->parent;
-
- return table;
-}
-
-
-static void sysctl_print_path(struct ctl_table *table)
-{
- struct ctl_table *tmp;
- int depth, i;
- depth = sysctl_depth(table);
- if (table->procname) {
- for (i = depth; i >= 0; i--) {
- tmp = sysctl_parent(table, i);
- printk("/%s", tmp->procname?tmp->procname:"");
- }
- }
- printk(" ");
-}
-
-static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
- struct ctl_table *table)
-{
- struct ctl_table_header *head;
- struct ctl_table *ref, *test;
- int depth, cur_depth;
-
- depth = sysctl_depth(table);
-
- for (head = __sysctl_head_next(namespaces, NULL); head;
- head = __sysctl_head_next(namespaces, head)) {
- cur_depth = depth;
- ref = head->ctl_table;
-repeat:
- test = sysctl_parent(table, cur_depth);
- for (; ref->procname; ref++) {
- int match = 0;
- if (cur_depth && !ref->child)
- continue;
-
- if (test->procname && ref->procname &&
- (strcmp(test->procname, ref->procname) == 0))
- match++;
-
- if (match) {
- if (cur_depth != 0) {
- cur_depth--;
- ref = ref->child;
- goto repeat;
- }
- goto out;
- }
- }
- }
- ref = NULL;
-out:
- sysctl_head_finish(head);
- return ref;
-}
-
-static void set_fail(const char **fail, struct ctl_table *table, const char *str)
-{
- if (*fail) {
- printk(KERN_ERR "sysctl table check failed: ");
- sysctl_print_path(table);
- printk(" %s\n", *fail);
- dump_stack();
- }
- *fail = str;
-}
-
-static void sysctl_check_leaf(struct nsproxy *namespaces,
- struct ctl_table *table, const char **fail)
-{
- struct ctl_table *ref;
-
- ref = sysctl_check_lookup(namespaces, table);
- if (ref && (ref != table))
- set_fail(fail, table, "Sysctl already exists");
-}
-
-int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
-{
- int error = 0;
- for (; table->procname; table++) {
- const char *fail = NULL;
-
- if (table->parent) {
- if (!table->parent->procname)
- set_fail(&fail, table, "Parent without procname");
- }
- if (table->child) {
- if (table->data)
- set_fail(&fail, table, "Directory with data?");
- if (table->maxlen)
- set_fail(&fail, table, "Directory with maxlen?");
- if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode)
- set_fail(&fail, table, "Writable sysctl directory");
- if (table->proc_handler)
- set_fail(&fail, table, "Directory with proc_handler");
- if (table->extra1)
- set_fail(&fail, table, "Directory with extra1");
- if (table->extra2)
- set_fail(&fail, table, "Directory with extra2");
- } else {
- if ((table->proc_handler == proc_dostring) ||
- (table->proc_handler == proc_dointvec) ||
- (table->proc_handler == proc_dointvec_minmax) ||
- (table->proc_handler == proc_dointvec_jiffies) ||
- (table->proc_handler == proc_dointvec_userhz_jiffies) ||
- (table->proc_handler == proc_dointvec_ms_jiffies) ||
- (table->proc_handler == proc_doulongvec_minmax) ||
- (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) {
- if (!table->data)
- set_fail(&fail, table, "No data");
- if (!table->maxlen)
- set_fail(&fail, table, "No maxlen");
- }
-#ifdef CONFIG_PROC_SYSCTL
- if (!table->proc_handler)
- set_fail(&fail, table, "No proc_handler");
-#endif
- sysctl_check_leaf(namespaces, table, &fail);
- }
- if (table->mode > 0777)
- set_fail(&fail, table, "bogus .mode");
- if (fail) {
- set_fail(&fail, table, NULL);
- error = -EINVAL;
- }
- if (table->child)
- error |= sysctl_check_table(namespaces, table->child);
- }
- return error;
-}
diff --git a/kernel/time.c b/kernel/time.c
index 73e416db0a1e..ba744cf80696 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -163,7 +163,6 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
return error;
if (tz) {
- /* SMP safe, global irq locking makes it work. */
sys_tz = *tz;
update_vsyscall_tz();
if (firsttime) {
@@ -173,12 +172,7 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
}
}
if (tv)
- {
- /* SMP safe, again the code in arch/foo/time.c should
- * globally block out interrupts when it runs.
- */
return do_settimeofday(tv);
- }
return 0;
}
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a46f5d64504..8a538c55fc7b 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -96,6 +96,11 @@ static int alarmtimer_rtc_add_device(struct device *dev,
return 0;
}
+static inline void alarmtimer_rtc_timer_init(void)
+{
+ rtc_timer_init(&rtctimer, NULL, NULL);
+}
+
static struct class_interface alarmtimer_rtc_interface = {
.add_dev = &alarmtimer_rtc_add_device,
};
@@ -117,6 +122,7 @@ static inline struct rtc_device *alarmtimer_get_rtcdev(void)
#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
+static inline void alarmtimer_rtc_timer_init(void) { }
#endif
/**
@@ -783,6 +789,8 @@ static int __init alarmtimer_init(void)
.nsleep = alarm_timer_nsleep,
};
+ alarmtimer_rtc_timer_init();
+
posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index a45ca167ab24..c9583382141a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -500,7 +500,7 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
{
u64 ret;
/*
- * We won't try to correct for more then 11% adjustments (110,000 ppm),
+ * We won't try to correct for more than 11% adjustments (110,000 ppm),
*/
ret = (u64)cs->mult * 11;
do_div(ret,100);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 6e039b144daf..f03fd83b170b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -34,8 +34,6 @@ unsigned long tick_nsec;
static u64 tick_length;
static u64 tick_length_base;
-static struct hrtimer leap_timer;
-
#define MAX_TICKADJ 500LL /* usecs */
#define MAX_TICKADJ_SCALED \
(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -381,70 +379,63 @@ u64 ntp_tick_length(void)
/*
- * Leap second processing. If in leap-insert state at the end of the
- * day, the system clock is set back one second; if in leap-delete
- * state, the system clock is set ahead one second.
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ *
+ * Also handles leap second processing, and returns leap offset
*/
-static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+int second_overflow(unsigned long secs)
{
- enum hrtimer_restart res = HRTIMER_NORESTART;
- unsigned long flags;
+ s64 delta;
int leap = 0;
+ unsigned long flags;
spin_lock_irqsave(&ntp_lock, flags);
+
+ /*
+ * Leap second processing. If in leap-insert state at the end of the
+ * day, the system clock is set back one second; if in leap-delete
+ * state, the system clock is set ahead one second.
+ */
switch (time_state) {
case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
break;
case TIME_INS:
- leap = -1;
- time_state = TIME_OOP;
- printk(KERN_NOTICE
- "Clock: inserting leap second 23:59:60 UTC\n");
- hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
- res = HRTIMER_RESTART;
+ if (secs % 86400 == 0) {
+ leap = -1;
+ time_state = TIME_OOP;
+ printk(KERN_NOTICE
+ "Clock: inserting leap second 23:59:60 UTC\n");
+ }
break;
case TIME_DEL:
- leap = 1;
- time_tai--;
- time_state = TIME_WAIT;
- printk(KERN_NOTICE
- "Clock: deleting leap second 23:59:59 UTC\n");
+ if ((secs + 1) % 86400 == 0) {
+ leap = 1;
+ time_tai--;
+ time_state = TIME_WAIT;
+ printk(KERN_NOTICE
+ "Clock: deleting leap second 23:59:59 UTC\n");
+ }
break;
case TIME_OOP:
time_tai++;
time_state = TIME_WAIT;
- /* fall through */
+ break;
+
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
break;
}
- spin_unlock_irqrestore(&ntp_lock, flags);
- /*
- * We have to call this outside of the ntp_lock to keep
- * the proper locking hierarchy
- */
- if (leap)
- timekeeping_leap_insert(leap);
-
- return res;
-}
-
-/*
- * this routine handles the overflow of the microsecond field
- *
- * The tricky bits of code to handle the accurate clock support
- * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
- * They were originally developed for SUN and DEC kernels.
- * All the kudos should go to Dave for this stuff.
- */
-void second_overflow(void)
-{
- s64 delta;
- unsigned long flags;
-
- spin_lock_irqsave(&ntp_lock, flags);
/* Bump the maxerror field */
time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -481,15 +472,17 @@ void second_overflow(void)
tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
<< NTP_SCALE_SHIFT;
time_adjust = 0;
+
+
+
out:
spin_unlock_irqrestore(&ntp_lock, flags);
+
+ return leap;
}
#ifdef CONFIG_GENERIC_CMOS_UPDATE
-/* Disable the cmos update - used by virtualization and embedded */
-int no_sync_cmos_clock __read_mostly;
-
static void sync_cmos_clock(struct work_struct *work);
static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
@@ -536,35 +529,13 @@ static void sync_cmos_clock(struct work_struct *work)
static void notify_cmos_timer(void)
{
- if (!no_sync_cmos_clock)
- schedule_delayed_work(&sync_cmos_work, 0);
+ schedule_delayed_work(&sync_cmos_work, 0);
}
#else
static inline void notify_cmos_timer(void) { }
#endif
-/*
- * Start the leap seconds timer:
- */
-static inline void ntp_start_leap_timer(struct timespec *ts)
-{
- long now = ts->tv_sec;
-
- if (time_status & STA_INS) {
- time_state = TIME_INS;
- now += 86400 - now % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-
- return;
- }
-
- if (time_status & STA_DEL) {
- time_state = TIME_DEL;
- now += 86400 - (now + 1) % 86400;
- hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
- }
-}
/*
* Propagate a new txc->status value into the NTP state:
@@ -589,22 +560,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
time_status &= STA_RONLY;
time_status |= txc->status & ~STA_RONLY;
- switch (time_state) {
- case TIME_OK:
- ntp_start_leap_timer(ts);
- break;
- case TIME_INS:
- case TIME_DEL:
- time_state = TIME_OK;
- ntp_start_leap_timer(ts);
- case TIME_WAIT:
- if (!(time_status & (STA_INS | STA_DEL)))
- time_state = TIME_OK;
- break;
- case TIME_OOP:
- hrtimer_restart(&leap_timer);
- break;
- }
}
/*
* Called with the xtime lock held, so we can access and modify
@@ -686,9 +641,6 @@ int do_adjtimex(struct timex *txc)
(txc->tick < 900000/USER_HZ ||
txc->tick > 1100000/USER_HZ))
return -EINVAL;
-
- if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
- hrtimer_cancel(&leap_timer);
}
if (txc->modes & ADJ_SETOFFSET) {
@@ -1010,6 +962,4 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup);
void __init ntp_init(void)
{
ntp_clear();
- hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
- leap_timer.function = ntp_leap_second;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 403c2a092830..d66b21308f7c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -184,18 +184,6 @@ static void timekeeping_update(bool clearntp)
}
-void timekeeping_leap_insert(int leapsecond)
-{
- unsigned long flags;
-
- write_seqlock_irqsave(&timekeeper.lock, flags);
- timekeeper.xtime.tv_sec += leapsecond;
- timekeeper.wall_to_monotonic.tv_sec -= leapsecond;
- timekeeping_update(false);
- write_sequnlock_irqrestore(&timekeeper.lock, flags);
-
-}
-
/**
* timekeeping_forward_now - update clock to the current time
*
@@ -448,9 +436,12 @@ EXPORT_SYMBOL(timekeeping_inject_offset);
static int change_clocksource(void *data)
{
struct clocksource *new, *old;
+ unsigned long flags;
new = (struct clocksource *) data;
+ write_seqlock_irqsave(&timekeeper.lock, flags);
+
timekeeping_forward_now();
if (!new->enable || new->enable(new) == 0) {
old = timekeeper.clock;
@@ -458,6 +449,10 @@ static int change_clocksource(void *data)
if (old->disable)
old->disable(old);
}
+ timekeeping_update(true);
+
+ write_sequnlock_irqrestore(&timekeeper.lock, flags);
+
return 0;
}
@@ -827,7 +822,7 @@ static void timekeeping_adjust(s64 offset)
int adj;
/*
- * The point of this is to check if the error is greater then half
+ * The point of this is to check if the error is greater than half
* an interval.
*
* First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
@@ -835,7 +830,7 @@ static void timekeeping_adjust(s64 offset)
* Note we subtract one in the shift, so that error is really error*2.
* This "saves" dividing(shifting) interval twice, but keeps the
* (error > interval) comparison as still measuring if error is
- * larger then half an interval.
+ * larger than half an interval.
*
* Note: It does not "save" on aggravation when reading the code.
*/
@@ -843,7 +838,7 @@ static void timekeeping_adjust(s64 offset)
if (error > interval) {
/*
* We now divide error by 4(via shift), which checks if
- * the error is greater then twice the interval.
+ * the error is greater than twice the interval.
* If it is greater, we need a bigadjust, if its smaller,
* we can adjust by 1.
*/
@@ -874,13 +869,15 @@ static void timekeeping_adjust(s64 offset)
} else /* No adjustment needed */
return;
- WARN_ONCE(timekeeper.clock->maxadj &&
- (timekeeper.mult + adj > timekeeper.clock->mult +
- timekeeper.clock->maxadj),
- "Adjusting %s more then 11%% (%ld vs %ld)\n",
+ if (unlikely(timekeeper.clock->maxadj &&
+ (timekeeper.mult + adj >
+ timekeeper.clock->mult + timekeeper.clock->maxadj))) {
+ printk_once(KERN_WARNING
+ "Adjusting %s more than 11%% (%ld vs %ld)\n",
timekeeper.clock->name, (long)timekeeper.mult + adj,
(long)timekeeper.clock->mult +
timekeeper.clock->maxadj);
+ }
/*
* So the following can be confusing.
*
@@ -952,7 +949,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
u64 raw_nsecs;
- /* If the offset is smaller then a shifted interval, do nothing */
+ /* If the offset is smaller than a shifted interval, do nothing */
if (offset < timekeeper.cycle_interval<<shift)
return offset;
@@ -962,9 +959,11 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
while (timekeeper.xtime_nsec >= nsecps) {
+ int leap;
timekeeper.xtime_nsec -= nsecps;
timekeeper.xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(timekeeper.xtime.tv_sec);
+ timekeeper.xtime.tv_sec += leap;
}
/* Accumulate raw time */
@@ -1018,13 +1017,13 @@ static void update_wall_time(void)
* With NO_HZ we may have to accumulate many cycle_intervals
* (think "ticks") worth of time at once. To do this efficiently,
* we calculate the largest doubling multiple of cycle_intervals
- * that is smaller then the offset. We then accumulate that
+ * that is smaller than the offset. We then accumulate that
* chunk in one go, and then try to consume the next smaller
* doubled multiple.
*/
shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
shift = max(0, shift);
- /* Bound shift to one less then what overflows tick_length */
+ /* Bound shift to one less than what overflows tick_length */
maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1;
shift = min(shift, maxshift);
while (offset >= timekeeper.cycle_interval) {
@@ -1072,12 +1071,14 @@ static void update_wall_time(void)
/*
* Finally, make sure that after the rounding
- * xtime.tv_nsec isn't larger then NSEC_PER_SEC
+ * xtime.tv_nsec isn't larger than NSEC_PER_SEC
*/
if (unlikely(timekeeper.xtime.tv_nsec >= NSEC_PER_SEC)) {
+ int leap;
timekeeper.xtime.tv_nsec -= NSEC_PER_SEC;
timekeeper.xtime.tv_sec++;
- second_overflow();
+ leap = second_overflow(timekeeper.xtime.tv_sec);
+ timekeeper.xtime.tv_sec += leap;
}
timekeeping_update(false);
@@ -1260,6 +1261,8 @@ ktime_t ktime_get_monotonic_offset(void)
return timespec_to_ktime(wtom);
}
+EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
+
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cd3134510f3d..a1d2849f2473 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
+ select FRAME_POINTER if !ARM_UNWIND && !PPC && !S390 && !MICROBLAZE
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 867bd1dd2dd0..0fa92f677c92 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -249,7 +249,8 @@ static void update_ftrace_function(void)
#else
__ftrace_trace_function = func;
#endif
- ftrace_trace_function = ftrace_test_stop_func;
+ ftrace_trace_function =
+ (func == ftrace_stub) ? func : ftrace_test_stop_func;
#endif
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5b7b5c1195b..cf8d11e91efd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -154,33 +154,10 @@ enum {
static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
-#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
-
-/**
- * tracing_on - enable all tracing buffers
- *
- * This function enables all tracing buffers that may have been
- * disabled with tracing_off.
- */
-void tracing_on(void)
-{
- set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
-}
-EXPORT_SYMBOL_GPL(tracing_on);
+/* Used for individual buffers (after the counter) */
+#define RB_BUFFER_OFF (1 << 20)
-/**
- * tracing_off - turn off all tracing buffers
- *
- * This function stops all tracing buffers from recording data.
- * It does not disable any overhead the tracers themselves may
- * be causing. This function simply causes all recording to
- * the ring buffers to fail.
- */
-void tracing_off(void)
-{
- clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags);
-}
-EXPORT_SYMBOL_GPL(tracing_off);
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
/**
* tracing_off_permanent - permanently disable ring buffers
@@ -193,15 +170,6 @@ void tracing_off_permanent(void)
set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
}
-/**
- * tracing_is_on - show state of ring buffers enabled
- */
-int tracing_is_on(void)
-{
- return ring_buffer_flags == RB_BUFFERS_ON;
-}
-EXPORT_SYMBOL_GPL(tracing_is_on);
-
#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
#define RB_ALIGNMENT 4U
#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
@@ -2619,6 +2587,63 @@ void ring_buffer_record_enable(struct ring_buffer *buffer)
EXPORT_SYMBOL_GPL(ring_buffer_record_enable);
/**
+ * ring_buffer_record_off - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * This is different than ring_buffer_record_disable() as
+ * it works like an on/off switch, where as the disable() verison
+ * must be paired with a enable().
+ */
+void ring_buffer_record_off(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd | RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_off);
+
+/**
+ * ring_buffer_record_on - restart writes into the buffer
+ * @buffer: The ring buffer to start writes to.
+ *
+ * This enables all writes to the buffer that was disabled by
+ * ring_buffer_record_off().
+ *
+ * This is different than ring_buffer_record_enable() as
+ * it works like an on/off switch, where as the enable() verison
+ * must be paired with a disable().
+ */
+void ring_buffer_record_on(struct ring_buffer *buffer)
+{
+ unsigned int rd;
+ unsigned int new_rd;
+
+ do {
+ rd = atomic_read(&buffer->record_disabled);
+ new_rd = rd & ~RB_BUFFER_OFF;
+ } while (atomic_cmpxchg(&buffer->record_disabled, rd, new_rd) != rd);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_record_on);
+
+/**
+ * ring_buffer_record_is_on - return true if the ring buffer can write
+ * @buffer: The ring buffer to see if write is enabled
+ *
+ * Returns true if the ring buffer is in a state that it accepts writes.
+ */
+int ring_buffer_record_is_on(struct ring_buffer *buffer)
+{
+ return !atomic_read(&buffer->record_disabled);
+}
+
+/**
* ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
* @buffer: The ring buffer to stop writes to.
* @cpu: The CPU buffer to stop
@@ -4039,68 +4064,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
-#ifdef CONFIG_TRACING
-static ssize_t
-rb_simple_read(struct file *filp, char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = filp->private_data;
- char buf[64];
- int r;
-
- if (test_bit(RB_BUFFERS_DISABLED_BIT, p))
- r = sprintf(buf, "permanently disabled\n");
- else
- r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p));
-
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-rb_simple_write(struct file *filp, const char __user *ubuf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = filp->private_data;
- unsigned long val;
- int ret;
-
- ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
- if (ret)
- return ret;
-
- if (val)
- set_bit(RB_BUFFERS_ON_BIT, p);
- else
- clear_bit(RB_BUFFERS_ON_BIT, p);
-
- (*ppos)++;
-
- return cnt;
-}
-
-static const struct file_operations rb_simple_fops = {
- .open = tracing_open_generic,
- .read = rb_simple_read,
- .write = rb_simple_write,
- .llseek = default_llseek,
-};
-
-
-static __init int rb_init_debugfs(void)
-{
- struct dentry *d_tracer;
-
- d_tracer = tracing_init_dentry();
-
- trace_create_file("tracing_on", 0644, d_tracer,
- &ring_buffer_flags, &rb_simple_fops);
-
- return 0;
-}
-
-fs_initcall(rb_init_debugfs);
-#endif
-
#ifdef CONFIG_HOTPLUG_CPU
static int rb_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 10d5503f0d04..ed7b5d1e12f4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -36,6 +36,7 @@
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/poll.h>
+#include <linux/nmi.h>
#include <linux/fs.h>
#include "trace.h"
@@ -352,6 +353,59 @@ static void wakeup_work_handler(struct work_struct *work)
static DECLARE_DELAYED_WORK(wakeup_work, wakeup_work_handler);
/**
+ * tracing_on - enable tracing buffers
+ *
+ * This function enables tracing buffers that may have been
+ * disabled with tracing_off.
+ */
+void tracing_on(void)
+{
+ if (global_trace.buffer)
+ ring_buffer_record_on(global_trace.buffer);
+ /*
+ * This flag is only looked at when buffers haven't been
+ * allocated yet. We don't really care about the race
+ * between setting this flag and actually turning
+ * on the buffer.
+ */
+ global_trace.buffer_disabled = 0;
+}
+EXPORT_SYMBOL_GPL(tracing_on);
+
+/**
+ * tracing_off - turn off tracing buffers
+ *
+ * This function stops the tracing buffers from recording data.
+ * It does not disable any overhead the tracers themselves may
+ * be causing. This function simply causes all recording to
+ * the ring buffers to fail.
+ */
+void tracing_off(void)
+{
+ if (global_trace.buffer)
+ ring_buffer_record_on(global_trace.buffer);
+ /*
+ * This flag is only looked at when buffers haven't been
+ * allocated yet. We don't really care about the race
+ * between setting this flag and actually turning
+ * on the buffer.
+ */
+ global_trace.buffer_disabled = 1;
+}
+EXPORT_SYMBOL_GPL(tracing_off);
+
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+ if (global_trace.buffer)
+ return ring_buffer_record_is_on(global_trace.buffer);
+ return !global_trace.buffer_disabled;
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
+/**
* trace_wake_up - wake up tasks waiting for trace input
*
* Schedules a delayed work to wake up any task that is blocked on the
@@ -1644,6 +1698,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
int cpu_file = iter->cpu_file;
u64 next_ts = 0, ts;
int next_cpu = -1;
+ int next_size = 0;
int cpu;
/*
@@ -1675,9 +1730,12 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
next_cpu = cpu;
next_ts = ts;
next_lost = lost_events;
+ next_size = iter->ent_size;
}
}
+ iter->ent_size = next_size;
+
if (ent_cpu)
*ent_cpu = next_cpu;
@@ -4567,6 +4625,55 @@ static __init void create_trace_options_dir(void)
create_trace_option_core_file(trace_options[i], i);
}
+static ssize_t
+rb_simple_read(struct file *filp, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct ring_buffer *buffer = filp->private_data;
+ char buf[64];
+ int r;
+
+ if (buffer)
+ r = ring_buffer_record_is_on(buffer);
+ else
+ r = 0;
+
+ r = sprintf(buf, "%d\n", r);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+rb_simple_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct ring_buffer *buffer = filp->private_data;
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ if (buffer) {
+ if (val)
+ ring_buffer_record_on(buffer);
+ else
+ ring_buffer_record_off(buffer);
+ }
+
+ (*ppos)++;
+
+ return cnt;
+}
+
+static const struct file_operations rb_simple_fops = {
+ .open = tracing_open_generic,
+ .read = rb_simple_read,
+ .write = rb_simple_write,
+ .llseek = default_llseek,
+};
+
static __init int tracer_init_debugfs(void)
{
struct dentry *d_tracer;
@@ -4626,6 +4733,9 @@ static __init int tracer_init_debugfs(void)
trace_create_file("trace_clock", 0644, d_tracer, NULL,
&trace_clock_fops);
+ trace_create_file("tracing_on", 0644, d_tracer,
+ global_trace.buffer, &rb_simple_fops);
+
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
@@ -4798,6 +4908,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
if (ret != TRACE_TYPE_NO_CONSUME)
trace_consume(&iter);
}
+ touch_nmi_watchdog();
trace_printk_seq(&iter.seq);
}
@@ -4863,6 +4974,8 @@ __init static int tracer_alloc_buffers(void)
goto out_free_cpumask;
}
global_trace.entries = ring_buffer_size(global_trace.buffer);
+ if (global_trace.buffer_disabled)
+ tracing_off();
#ifdef CONFIG_TRACER_MAX_TRACE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54faec790bc1..95059f091a24 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -154,6 +154,7 @@ struct trace_array {
struct ring_buffer *buffer;
unsigned long entries;
int cpu;
+ int buffer_disabled;
cycle_t time_start;
struct task_struct *waiter;
struct trace_array_cpu *data[NR_CPUS];
@@ -835,13 +836,11 @@ extern const char *__stop___trace_bprintk_fmt[];
filter)
#include "trace_entries.h"
-#ifdef CONFIG_PERF_EVENTS
#ifdef CONFIG_FUNCTION_TRACER
int perf_ftrace_event_register(struct ftrace_event_call *call,
enum trace_reg type, void *data);
#else
#define perf_ftrace_event_register NULL
#endif /* CONFIG_FUNCTION_TRACER */
-#endif /* CONFIG_PERF_EVENTS */
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index d91eb0541b3a..4108e1250ca2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -166,6 +166,12 @@ FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry,
#define FTRACE_STACK_ENTRIES 8
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
FTRACE_ENTRY(kernel_stack, stack_entry,
TRACE_STACK,
@@ -175,8 +181,9 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
__dynamic_array(unsigned long, caller )
),
- F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
__entry->caller[0], __entry->caller[1], __entry->caller[2],
__entry->caller[3], __entry->caller[4], __entry->caller[5],
__entry->caller[6], __entry->caller[7]),
@@ -193,8 +200,9 @@ FTRACE_ENTRY(user_stack, userstack_entry,
__array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
- F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
- "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n",
+ F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
+ "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n",
__entry->caller[0], __entry->caller[1], __entry->caller[2],
__entry->caller[3], __entry->caller[4], __entry->caller[5],
__entry->caller[6], __entry->caller[7]),
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7b46c9bd22ae..3dd15e8bc856 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -162,7 +162,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define __dynamic_array(type, item)
#undef F_printk
-#define F_printk(fmt, args...) #fmt ", " __stringify(args)
+#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
#undef FTRACE_ENTRY_REG
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 14bc092fb12c..df30ee08bdd4 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,6 +9,8 @@
* to those contributors as well.
*/
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/nmi.h>
@@ -319,11 +321,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
*/
static int watchdog(void *unused)
{
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+ struct sched_param param = { .sched_priority = 0 };
struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- sched_setscheduler(current, SCHED_FIFO, &param);
-
/* initialize timestamp */
__touch_watchdog();
@@ -349,8 +349,11 @@ static int watchdog(void *unused)
set_current_state(TASK_INTERRUPTIBLE);
}
+ /*
+ * Drop the policy/priority elevation during thread exit to avoid a
+ * scheduling latency spike.
+ */
__set_current_state(TASK_RUNNING);
- param.sched_priority = 0;
sched_setscheduler(current, SCHED_NORMAL, &param);
return 0;
}
@@ -376,18 +379,20 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
if (!IS_ERR(event)) {
- printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
+ pr_info("enabled, takes one hw-pmu counter.\n");
goto out_save;
}
/* vary the KERN level based on the returned errno */
if (PTR_ERR(event) == -EOPNOTSUPP)
- printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
else if (PTR_ERR(event) == -ENOENT)
- printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu);
+ pr_warning("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
else
- printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event));
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
return PTR_ERR(event);
/* success path */
@@ -439,9 +444,10 @@ static int watchdog_enable(int cpu)
/* create the watchdog thread */
if (!p) {
+ struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
if (IS_ERR(p)) {
- printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
+ pr_err("softlockup watchdog for %i failed\n", cpu);
if (!err) {
/* if hardlockup hasn't already set this */
err = PTR_ERR(p);
@@ -450,6 +456,7 @@ static int watchdog_enable(int cpu)
}
goto out;
}
+ sched_setscheduler(p, SCHED_FIFO, &param);
kthread_bind(p, cpu);
per_cpu(watchdog_touch_ts, cpu) = 0;
per_cpu(softlockup_watchdog, cpu) = p;
@@ -496,7 +503,7 @@ static void watchdog_enable_all_cpus(void)
watchdog_enabled = 1;
if (!watchdog_enabled)
- printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n");
+ pr_err("failed to be enabled on some cpus\n");
}