Diffstat (limited to 'kernel')
-rw-r--r--  kernel/acct.c                    |   2
-rw-r--r--  kernel/audit.c                   |  10
-rw-r--r--  kernel/bpf/percpu_freelist.c     |   8
-rw-r--r--  kernel/bpf/verifier.c            | 111
-rw-r--r--  kernel/cpu.c                     |  14
-rw-r--r--  kernel/debug/kdb/kdb_io.c        |   2
-rw-r--r--  kernel/fork.c                    |   6
-rw-r--r--  kernel/groups.c                  |   5
-rw-r--r--  kernel/jump_label.c              |   2
-rw-r--r--  kernel/sched/deadline.c          |  63
-rw-r--r--  kernel/sched/fair.c              |   2
-rw-r--r--  kernel/sched/features.h          |   5
-rw-r--r--  kernel/sched/rt.c                |   8
-rw-r--r--  kernel/signal.c                  |  18
-rw-r--r--  kernel/time/tick-sched.c         |  19
-rw-r--r--  kernel/time/timer.c              |  35
-rw-r--r--  kernel/trace/ring_buffer.c       |   6
-rw-r--r--  kernel/trace/trace.c             |  42
-rw-r--r--  kernel/trace/trace_events_hist.c |   4
-rw-r--r--  kernel/uid16.c                   |   1
-rw-r--r--  kernel/workqueue.c               |   1
21 files changed, 272 insertions, 92 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 74963d192c5d..37f1dc696fbd 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -99,7 +99,7 @@ static int check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
- if (time_is_before_jiffies(acct->needcheck))
+ if (time_is_after_jiffies(acct->needcheck))
goto out;
/* May block */
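
For reference, outside the patch itself: the two jiffies helpers differ only in the direction of the comparison. As far as I recall from <linux/jiffies.h> they expand as below, so the old test skipped the statfs() whenever acct->needcheck was already in the past (i.e. essentially always after boot), while the corrected test only skips it while the next check time still lies in the future.

#define time_is_before_jiffies(a) time_after(jiffies, a)   /* 'a' has already passed */
#define time_is_after_jiffies(a)  time_before(jiffies, a)  /* 'a' is still to come   */
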
diff --git a/kernel/audit.c b/kernel/audit.c
index f1ca11613379..da4e7c0e36f7 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -79,13 +79,13 @@ static int audit_initialized;
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled;
-u32 audit_ever_enabled;
+u32 audit_enabled = AUDIT_OFF;
+u32 audit_ever_enabled = !!AUDIT_OFF;
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default;
+static u32 audit_default = AUDIT_OFF;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -1199,8 +1199,6 @@ static int __init audit_init(void)
skb_queue_head_init(&audit_skb_queue);
skb_queue_head_init(&audit_skb_hold_queue);
audit_initialized = AUDIT_INITIALIZED;
- audit_enabled = audit_default;
- audit_ever_enabled |= !!audit_default;
audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
@@ -1217,6 +1215,8 @@ static int __init audit_enable(char *str)
audit_default = !!simple_strtol(str, NULL, 0);
if (!audit_default)
audit_initialized = AUDIT_DISABLED;
+ audit_enabled = audit_default;
+ audit_ever_enabled = !!audit_enabled;
pr_info("%s\n", audit_default ?
"enabled (after initialization)" : "disabled (until reboot)");
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 5c51d1985b51..673fa6fe2d73 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -78,8 +78,10 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
+ unsigned long flags;
int orig_cpu, cpu;
+ local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -87,14 +89,16 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock(&head->lock);
+ raw_spin_unlock_irqrestore(&head->lock, flags);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu)
+ if (cpu == orig_cpu) {
+ local_irq_restore(flags);
return NULL;
+ }
}
}
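
A hedged sketch of the locking pattern the hunk above adopts (demo_lock and demo_list are illustrative names, not from the source): when a raw spinlock may also be taken from contexts that run with interrupts disabled, presumably the case for this freelist since BPF maps can be touched from tracing hooks, the pop path should use the irqsave/irqrestore variants so the lock is consistently held with interrupts off.

static DEFINE_RAW_SPINLOCK(demo_lock);
static LIST_HEAD(demo_list);

static struct list_head *demo_pop(void)
{
	struct list_head *node = NULL;
	unsigned long flags;

	raw_spin_lock_irqsave(&demo_lock, flags);
	if (!list_empty(&demo_list)) {
		node = demo_list.next;
		list_del(node);
	}
	raw_spin_unlock_irqrestore(&demo_lock, flags);
	return node;
}
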
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 372454aa7f37..d7eeebfafe8d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1790,10 +1790,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
/* case: R = imm
* remember the value we stored into this reg
*/
+ u64 imm;
+
+ if (BPF_CLASS(insn->code) == BPF_ALU64)
+ imm = insn->imm;
+ else
+ imm = (u32)insn->imm;
+
regs[insn->dst_reg].type = CONST_IMM;
- regs[insn->dst_reg].imm = insn->imm;
- regs[insn->dst_reg].max_value = insn->imm;
- regs[insn->dst_reg].min_value = insn->imm;
+ regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].max_value = imm;
+ regs[insn->dst_reg].min_value = imm;
}
} else if (opcode > BPF_END) {
@@ -1861,10 +1868,28 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
((BPF_SRC(insn->code) == BPF_X &&
regs[insn->src_reg].type == CONST_IMM) ||
BPF_SRC(insn->code) == BPF_K)) {
- if (BPF_SRC(insn->code) == BPF_X)
+ if (BPF_SRC(insn->code) == BPF_X) {
+ /* check in case the register contains a big
+ * 64-bit value
+ */
+ if (regs[insn->src_reg].imm < -MAX_BPF_STACK ||
+ regs[insn->src_reg].imm > MAX_BPF_STACK) {
+ verbose("R%d value too big in R%d pointer arithmetic\n",
+ insn->src_reg, insn->dst_reg);
+ return -EACCES;
+ }
dst_reg->imm += regs[insn->src_reg].imm;
- else
+ } else {
+ /* safe against overflow: addition of 32-bit
+ * numbers in 64-bit representation
+ */
dst_reg->imm += insn->imm;
+ }
+ if (dst_reg->imm > 0 || dst_reg->imm < -MAX_BPF_STACK) {
+ verbose("R%d out-of-bounds pointer arithmetic\n",
+ insn->dst_reg);
+ return -EACCES;
+ }
return 0;
} else if (opcode == BPF_ADD &&
BPF_CLASS(insn->code) == BPF_ALU64 &&
@@ -2697,11 +2722,12 @@ static bool states_equal(struct bpf_verifier_env *env,
/* If we didn't map access then again we don't care about the
* mismatched range values and it's ok if our old type was
- * UNKNOWN and we didn't go to a NOT_INIT'ed reg.
+ * UNKNOWN and we didn't go to a NOT_INIT'ed or pointer reg.
*/
if (rold->type == NOT_INIT ||
(!varlen_map_access && rold->type == UNKNOWN_VALUE &&
- rcur->type != NOT_INIT))
+ rcur->type != NOT_INIT &&
+ !__is_pointer_value(env->allow_ptr_leaks, rcur)))
continue;
/* Don't care about the reg->id in this case. */
@@ -2862,6 +2888,7 @@ static int do_check(struct bpf_verifier_env *env)
if (err)
return err;
+ env->insn_aux_data[insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3059,6 +3086,7 @@ process_bpf_exit:
return err;
insn_idx++;
+ env->insn_aux_data[insn_idx].seen = true;
} else {
verbose("invalid BPF_LD mode\n");
return -EINVAL;
@@ -3210,6 +3238,63 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env)
insn->src_reg = 0;
}
+/* single env->prog->insnsi[off] instruction was replaced with the range
+ * insnsi[off, off + cnt). Adjust corresponding insn_aux_data by copying
+ * [0, off) and [off, end) to new locations, so the patched range stays zero
+ */
+static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len,
+ u32 off, u32 cnt)
+{
+ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data;
+ int i;
+
+ if (cnt == 1)
+ return 0;
+ new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len);
+ if (!new_data)
+ return -ENOMEM;
+ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off);
+ memcpy(new_data + off + cnt - 1, old_data + off,
+ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1));
+ for (i = off; i < off + cnt - 1; i++)
+ new_data[i].seen = true;
+ env->insn_aux_data = new_data;
+ vfree(old_data);
+ return 0;
+}
+
+static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off,
+ const struct bpf_insn *patch, u32 len)
+{
+ struct bpf_prog *new_prog;
+
+ new_prog = bpf_patch_insn_single(env->prog, off, patch, len);
+ if (!new_prog)
+ return NULL;
+ if (adjust_insn_aux_data(env, new_prog->len, off, len))
+ return NULL;
+ return new_prog;
+}
+
+/* The verifier does more data flow analysis than llvm and will not explore
+ * branches that are dead at run time. Malicious programs can have dead code
+ * too. Therefore replace all dead at-run-time code with nops.
+ */
+static void sanitize_dead_code(struct bpf_verifier_env *env)
+{
+ struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
+ struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0);
+ struct bpf_insn *insn = env->prog->insnsi;
+ const int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++) {
+ if (aux_data[i].seen)
+ continue;
+ memcpy(insn + i, &nop, sizeof(nop));
+ }
+}
+
/* convert load instructions that access fields of 'struct __sk_buff'
* into sequence of instructions that access fields of 'struct sk_buff'
*/
@@ -3229,10 +3314,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
} else if (cnt) {
- new_prog = bpf_patch_insn_single(env->prog, 0,
- insn_buf, cnt);
+ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
+
env->prog = new_prog;
delta += cnt - 1;
}
@@ -3253,7 +3338,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
else
continue;
- if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX)
+ if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
cnt = ops->convert_ctx_access(type, insn->dst_reg, insn->src_reg,
@@ -3263,8 +3348,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
return -EINVAL;
}
- new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf,
- cnt);
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3373,6 +3457,9 @@ skip_full_check:
free_states(env);
if (ret == 0)
+ sanitize_dead_code(env);
+
+ if (ret == 0)
/* program is valid, convert *(u32*)(ctx + off) accesses */
ret = convert_ctx_accesses(env);
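
A stand-alone illustration (plain userspace C, not verifier code) of the sign- versus zero-extension distinction the first verifier hunk introduces for BPF_MOV immediates: a 32-bit ALU move of imm = -1 has to be tracked as 0x00000000ffffffff, while the 64-bit variant really is all ones.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int32_t imm = -1;                          /* insn->imm for BPF_MOV|BPF_K */
	uint64_t alu64 = (uint64_t)(int64_t)imm;   /* BPF_ALU64: sign-extended    */
	uint64_t alu32 = (uint64_t)(uint32_t)imm;  /* BPF_ALU:   zero-extended    */

	printf("ALU64: %#llx\nALU32: %#llx\n",
	       (unsigned long long)alu64, (unsigned long long)alu32);
	return 0;
}
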
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 010db3c943cd..c6a4cf8ba645 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1633,9 +1633,9 @@ static struct cpuhp_step cpuhp_bp_states[] = {
* before blk_mq_queue_reinit_notify() from notify_dead(),
* otherwise a RCU stall occurs.
*/
- [CPUHP_TIMERS_DEAD] = {
+ [CPUHP_TIMERS_PREPARE] = {
.name = "timers:dead",
- .startup.single = NULL,
+ .startup.single = timers_prepare_cpu,
.teardown.single = timers_dead_cpu,
},
/* Kicks the plugged cpu into life */
@@ -1645,11 +1645,6 @@ static struct cpuhp_step cpuhp_bp_states[] = {
.teardown.single = NULL,
.cant_stop = true,
},
- [CPUHP_AP_SMPCFD_DYING] = {
- .name = "smpcfd:dying",
- .startup.single = NULL,
- .teardown.single = smpcfd_dying_cpu,
- },
/*
* Handled on controll processor until the plugged processor manages
* this itself.
@@ -1691,6 +1686,11 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.startup.single = NULL,
.teardown.single = rcutree_dying_cpu,
},
+ [CPUHP_AP_SMPCFD_DYING] = {
+ .name = "smpcfd:dying",
+ .startup.single = NULL,
+ .teardown.single = smpcfd_dying_cpu,
+ },
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
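
For context: the entries above live in the static cpuhp state tables, but dynamically registered hotplug callbacks use the same startup/teardown pairing through cpuhp_setup_state(). A minimal sketch with made-up names, assuming the usual CPUHP_AP_ONLINE_DYN slot:

static int demo_cpu_online(unsigned int cpu)
{
	pr_info("demo: cpu %u coming up\n", cpu);
	return 0;
}

static int demo_cpu_offline(unsigned int cpu)
{
	pr_info("demo: cpu %u going down\n", cpu);
	return 0;
}

static int __init demo_hotplug_init(void)
{
	int ret;

	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				demo_cpu_online, demo_cpu_offline);
	return ret < 0 ? ret : 0;
}
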
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index 83c666537a7a..3203e9dee9f8 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -349,7 +349,7 @@ poll_again:
}
kdb_printf("\n");
for (i = 0; i < count; i++) {
- if (kallsyms_symbol_next(p_tmp, i) < 0)
+ if (WARN_ON(!kallsyms_symbol_next(p_tmp, i)))
break;
kdb_printf("%s ", p_tmp);
*(p_tmp + len) = '\0';
diff --git a/kernel/fork.c b/kernel/fork.c
index 276acd8acf0a..2529725eefa2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
#include <linux/tsacct_kern.h>
#include <linux/cn_proc.h>
#include <linux/freezer.h>
+#include <linux/kaiser.h>
#include <linux/delayacct.h>
#include <linux/taskstats_kern.h>
#include <linux/random.h>
@@ -214,6 +215,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
static inline void free_thread_stack(struct task_struct *tsk)
{
+ kaiser_unmap_thread_stack(tsk->stack);
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
unsigned long flags;
@@ -518,6 +520,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
* functions again.
*/
tsk->stack = stack;
+
+ err = kaiser_map_thread_stack(tsk->stack);
+ if (err)
+ goto free_stack;
#ifdef CONFIG_VMAP_STACK
tsk->stack_vm_area = stack_vm_area;
#endif
diff --git a/kernel/groups.c b/kernel/groups.c
index 2fcadd66a8fd..94bde5210e3d 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -77,7 +77,7 @@ static int groups_from_user(struct group_info *group_info,
}
/* a simple Shell sort */
-static void groups_sort(struct group_info *group_info)
+void groups_sort(struct group_info *group_info)
{
int base, max, stride;
int gidsetsize = group_info->ngroups;
@@ -103,6 +103,7 @@ static void groups_sort(struct group_info *group_info)
stride /= 3;
}
}
+EXPORT_SYMBOL(groups_sort);
/* a simple bsearch */
int groups_search(const struct group_info *group_info, kgid_t grp)
@@ -134,7 +135,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp)
void set_groups(struct cred *new, struct group_info *group_info)
{
put_group_info(new->group_info);
- groups_sort(group_info);
get_group_info(group_info);
new->group_info = group_info;
}
@@ -218,6 +218,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
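
Because set_groups() no longer sorts, every in-kernel path that builds a group_info by hand is now responsible for calling groups_sort() before installing it, which is why the symbol is exported above. A hedged sketch of such a caller, with arbitrary example GIDs:

static int demo_install_groups(void)
{
	struct group_info *gi;
	int i, ret;

	gi = groups_alloc(3);
	if (!gi)
		return -ENOMEM;
	for (i = 0; i < 3; i++)
		gi->gid[i] = KGIDT_INIT(1000 + (2 - i));	/* deliberately unsorted */

	groups_sort(gi);			/* required before set_*groups() */
	ret = set_current_groups(gi);
	put_group_info(gi);
	return ret;
}
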
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a9b8cf500591..def4548ea40c 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -612,7 +612,7 @@ static __init int jump_label_test(void)
return 0;
}
-late_initcall(jump_label_test);
+early_initcall(jump_label_test);
#endif /* STATIC_KEYS_SELFTEST */
#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index e00accf92a4b..c77fd444dc3c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -445,13 +445,13 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
*
* This function returns true if:
*
- * runtime / (deadline - t) > dl_runtime / dl_period ,
+ * runtime / (deadline - t) > dl_runtime / dl_deadline ,
*
* IOW we can't recycle current parameters.
*
- * Notice that the bandwidth check is done against the period. For
+ * Notice that the bandwidth check is done against the deadline. For
* task with deadline equal to period this is the same of using
- * dl_deadline instead of dl_period in the equation above.
+ * dl_period instead of dl_deadline in the equation above.
*/
static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, u64 t)
@@ -476,7 +476,7 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
* of anything below microseconds resolution is actually fiction
* (but still we want to give the user that illusion >;).
*/
- left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+ left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
right = ((dl_se->deadline - t) >> DL_SCALE) *
(pi_se->dl_runtime >> DL_SCALE);
@@ -505,10 +505,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
}
}
+static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
+{
+ return dl_se->deadline - dl_se->dl_deadline + dl_se->dl_period;
+}
+
/*
* If the entity depleted all its runtime, and if we want it to sleep
* while waiting for some new execution time to become available, we
- * set the bandwidth enforcement timer to the replenishment instant
+ * set the bandwidth replenishment timer to the replenishment instant
* and try to activate it.
*
* Notice that it is important for the caller to know if the timer
@@ -530,7 +535,7 @@ static int start_dl_timer(struct task_struct *p)
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
- act = ns_to_ktime(dl_se->deadline);
+ act = ns_to_ktime(dl_next_period(dl_se));
now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -638,6 +643,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
rf.cookie = lockdep_pin_lock(&rq->lock);
+ update_rq_clock(rq);
/*
* Now that the task has been migrated to the new RQ and we
@@ -690,6 +696,37 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
timer->irqsafe = 1;
}
+/*
+ * During the activation, CBS checks if it can reuse the current task's
+ * runtime and period. If the deadline of the task is in the past, CBS
+ * cannot use the runtime, and so it replenishes the task. This rule
+ * works fine for implicit deadline tasks (deadline == period), and the
+ * CBS was designed for implicit deadline tasks. However, a task with
+ * constrained deadline (deadline < period) might be awakened after the
+ * deadline, but before the next period. In this case, replenishing the
+ * task would allow it to run for runtime / deadline. As in this case
+ * deadline < period, CBS enables a task to run for more than the
+ * runtime / period. In a very loaded system, this can cause a domino
+ * effect, making other tasks miss their deadlines.
+ *
+ * To avoid this problem, in the activation of a constrained deadline
+ * task after the deadline but before the next period, throttle the
+ * task and set the replenishing timer to the beginning of the next period,
+ * unless it is boosted.
+ */
+static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)
+{
+ struct task_struct *p = dl_task_of(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
+ dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ return;
+ dl_se->dl_throttled = 1;
+ }
+}
+
static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
@@ -923,6 +960,11 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
__dequeue_dl_entity(dl_se);
}
+static inline bool dl_is_constrained(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_deadline < dl_se->dl_period;
+}
+
static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *pi_task = rt_mutex_get_top_task(p);
@@ -949,6 +991,15 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
}
/*
+ * Check if a constrained deadline task was activated
+ * after the deadline but before the next period.
+ * If that is the case, the task will be throttled and
+ * the replenishment timer will be set to the next period.
+ */
+ if (!p->dl.dl_throttled && dl_is_constrained(&p->dl))
+ dl_check_constrained_dl(&p->dl);
+
+ /*
* If p is throttled, we do nothing. In fact, if it exhausted
* its budget it needs a replenishment and, since it now is on
* its rq, the bandwidth timer callback (which clearly has not
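
A worked example of the situation the dl_check_constrained_dl() comment describes (the numbers are arbitrary): with dl_runtime = 5 ms, dl_deadline = 10 ms and dl_period = 100 ms, the task reserved 5/100 = 5% of a CPU. If it wakes up just after its deadline and is simply replenished, it gets 5 ms of runtime with a new deadline only 10 ms away, i.e. roughly runtime/deadline = 50% of the CPU for that window instead of runtime/period = 5%. Throttling it until dl_next_period(), that is deadline - dl_deadline + dl_period, keeps it at its reserved share.
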
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f9189909640b..c6db32c0c557 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5451,7 +5451,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
* Due to large variance we need a large fuzz factor; hackbench in
* particularly is sensitive here.
*/
- if ((avg_idle / 512) < avg_cost)
+ if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
return -1;
time = local_clock();
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 6d28fcd08872..36086f74e011 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -59,6 +59,11 @@ SCHED_FEAT(PREEMPT_LAZY, true)
SCHED_FEAT(TTWU_QUEUE, true)
#endif
+/*
+ * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
+ */
+SCHED_FEAT(SIS_AVG_CPU, false)
+
#ifdef HAVE_RT_PUSH_IPI
/*
* In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 340a0a5d435c..d361629c0f96 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2023,8 +2023,9 @@ static void pull_rt_task(struct rq *this_rq)
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
+ int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overloaded(this_rq)))
+ if (likely(!rt_overload_count))
return;
/*
@@ -2033,6 +2034,11 @@ static void pull_rt_task(struct rq *this_rq)
*/
smp_rmb();
+ /* If we are the only overloaded CPU do nothing */
+ if (rt_overload_count == 1 &&
+ cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
+ return;
+
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
diff --git a/kernel/signal.c b/kernel/signal.c
index 99918dcd836f..4d094ae3a625 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -73,7 +73,7 @@ static int sig_task_ignored(struct task_struct *t, int sig, bool force)
handler = sig_handler(t, sig);
if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
- handler == SIG_DFL && !force)
+ handler == SIG_DFL && !(force && sig_kernel_only(sig)))
return 1;
return sig_handler_ignored(handler, sig);
@@ -89,13 +89,15 @@ static int sig_ignored(struct task_struct *t, int sig, bool force)
if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
return 0;
- if (!sig_task_ignored(t, sig, force))
- return 0;
-
/*
- * Tracers may want to know about even ignored signals.
+ * Tracers may want to know about even ignored signals, unless the
+ * signal is SIGKILL, which can't be reported anyway but can be
+ * ignored by a SIGNAL_UNKILLABLE task.
*/
- return !t->ptrace;
+ if (t->ptrace && sig != SIGKILL)
+ return 0;
+
+ return sig_task_ignored(t, sig, force);
}
/*
@@ -977,9 +979,9 @@ static void complete_signal(int sig, struct task_struct *p, int group)
* then start taking the whole group down immediately.
*/
if (sig_fatal(p, sig) &&
- !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+ !(signal->flags & SIGNAL_GROUP_EXIT) &&
!sigismember(&t->real_blocked, sig) &&
- (sig == SIGKILL || !t->ptrace)) {
+ (sig == SIGKILL || !p->ptrace)) {
/*
* This signal will be fatal to the whole group.
*/
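
For reference, paraphrased from memory from helpers defined earlier in kernel/signal.c (treat the exact spelling as an assumption): sig_kernel_only() is true only for SIGKILL and SIGSTOP, which is what lets a SIGNAL_UNKILLABLE task such as a global or container init ignore any other forced fatal signal after this change.

#define SIG_KERNEL_ONLY_MASK	(rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP))
#define sig_kernel_only(sig)	siginmask(sig, SIG_KERNEL_ONLY_MASK)
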
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 66d85482a96e..c573b1a848b6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -669,6 +669,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
}
+static inline bool local_timer_softirq_pending(void)
+{
+ return local_softirq_pending() & TIMER_SOFTIRQ;
+}
+
static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ktime_t now, int cpu)
{
@@ -685,8 +690,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
} while (read_seqcount_retry(&jiffies_seq, seq));
ts->last_jiffies = basejiff;
- if (rcu_needs_cpu(basemono, &next_rcu) ||
- arch_needs_cpu() || irq_work_needs_cpu()) {
+ /*
+ * Keep the periodic tick, when RCU, architecture or irq_work
+ * requests it.
+ * Aside from that, check whether the local timer softirq is
+ * pending. If so, it's a bad idea to call get_next_timer_interrupt()
+ * because there is an already expired timer, so it will request
+ * immediate expiry, which rearms the hardware timer with a
+ * minimal delta which brings us back to this place
+ * immediately. Lather, rinse and repeat...
+ */
+ if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() ||
+ irq_work_needs_cpu() || local_timer_softirq_pending()) {
next_tick = basemono + TICK_NSEC;
} else {
/*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 31703677505f..8e75e7442aaa 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -852,11 +852,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
return base;
}
@@ -866,11 +865,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
/*
- * If the timer is deferrable and nohz is active then we need to use
- * the deferrable base.
+ * If the timer is deferrable and NO_HZ_COMMON is set then we need
+ * to use the deferrable base.
*/
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active &&
- (tflags & TIMER_DEFERRABLE))
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
base = this_cpu_ptr(&timer_bases[BASE_DEF]);
return base;
}
@@ -1024,8 +1022,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
if (!ret && pending_only)
goto out_unlock;
- debug_activate(timer, expires);
-
new_base = get_target_base(base, timer->flags);
if (base != new_base) {
@@ -1049,6 +1045,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
}
+ debug_activate(timer, expires);
+
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1719,7 +1717,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
irq_work_tick_soft();
__run_timers(base);
- if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
+ if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}
@@ -1888,6 +1886,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h
}
}
+int timers_prepare_cpu(unsigned int cpu)
+{
+ struct timer_base *base;
+ int b;
+
+ for (b = 0; b < NR_BASES; b++) {
+ base = per_cpu_ptr(&timer_bases[b], cpu);
+ base->clk = jiffies;
+ base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
+ base->is_idle = false;
+ base->must_forward_clk = true;
+ }
+ return 0;
+}
+
int timers_dead_cpu(unsigned int cpu)
{
struct timer_base *old_base;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f5c016e8fc88..3e1d11f4fe44 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
/* Missed count stored at end */
#define RB_MISSED_STORED (1 << 30)
+#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED)
+
struct buffer_data_page {
u64 time_stamp; /* page time stamp */
local_t commit; /* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
*/
size_t ring_buffer_page_len(void *page)
{
- return local_read(&((struct buffer_data_page *)page)->commit)
+ struct buffer_data_page *bpage = page;
+
+ return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
+ BUF_PAGE_HDR_SIZE;
}
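
A short worked example of why the masking matters (numbers are illustrative, and assume RB_MISSED_EVENTS is bit 31, defined next to the RB_MISSED_STORED definition shown above): the commit field of a buffer_data_page doubles as flag storage, so a page holding 64 bytes of data on which events were missed reads back as 64 | RB_MISSED_STORED | RB_MISSED_EVENTS = 0xc0000040. The old ring_buffer_page_len() returned that huge value, and a caller such as the splice path in trace.c (see the memset removal further down) could then write far past the page; masking with ~RB_MISSED_FLAGS recovers the real 64, to which BUF_PAGE_HDR_SIZE is added.
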
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 035e61604455..00d9ebcf42e2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3737,37 +3737,30 @@ static const struct file_operations show_traces_fops = {
.llseek = seq_lseek,
};
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
static ssize_t
tracing_cpumask_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct trace_array *tr = file_inode(filp)->i_private;
+ char *mask_str;
int len;
- mutex_lock(&tracing_cpumask_update_lock);
+ len = snprintf(NULL, 0, "%*pb\n",
+ cpumask_pr_args(tr->tracing_cpumask)) + 1;
+ mask_str = kmalloc(len, GFP_KERNEL);
+ if (!mask_str)
+ return -ENOMEM;
- len = snprintf(mask_str, count, "%*pb\n",
+ len = snprintf(mask_str, len, "%*pb\n",
cpumask_pr_args(tr->tracing_cpumask));
if (len >= count) {
count = -EINVAL;
goto out_err;
}
- count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+ count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
out_err:
- mutex_unlock(&tracing_cpumask_update_lock);
+ kfree(mask_str);
return count;
}
@@ -3787,8 +3780,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
if (err)
goto err_unlock;
- mutex_lock(&tracing_cpumask_update_lock);
-
local_irq_disable();
arch_spin_lock(&tr->max_lock);
for_each_tracing_cpu(cpu) {
@@ -3811,8 +3802,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
local_irq_enable();
cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
- mutex_unlock(&tracing_cpumask_update_lock);
free_cpumask_var(tracing_cpumask_new);
return count;
@@ -6202,7 +6191,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
- int entries, size, i;
+ int entries, i;
ssize_t ret = 0;
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -6253,14 +6242,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- /*
- * zero out any left over data, this is going to
- * user land.
- */
- size = ring_buffer_page_len(ref->page);
- if (size < PAGE_SIZE)
- memset(ref->page + size, 0, PAGE_SIZE - size);
-
page = virt_to_page(ref->page);
spd.pages[i] = page;
@@ -6984,6 +6965,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
buf->data = alloc_percpu(struct trace_array_cpu);
if (!buf->data) {
ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
return -ENOMEM;
}
@@ -7007,7 +6989,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
allocate_snapshot ? size : 1);
if (WARN_ON(ret)) {
ring_buffer_free(tr->trace_buffer.buffer);
+ tr->trace_buffer.buffer = NULL;
free_percpu(tr->trace_buffer.data);
+ tr->trace_buffer.data = NULL;
return -ENOMEM;
}
tr->allocated_snapshot = allocate_snapshot;
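
The cpumask read above uses the common "measure, then allocate" snprintf idiom; a stand-alone illustration in plain C (nothing tracing specific):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int value = 42;
	int len = snprintf(NULL, 0, "value=%d\n", value) + 1;	/* would-be length + NUL */
	char *buf = malloc(len);

	if (!buf)
		return 1;
	snprintf(buf, len, "value=%d\n", value);
	fputs(buf, stdout);
	free(buf);
	return 0;
}
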
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f3a960ed75a1..0664044ade06 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -449,7 +449,7 @@ static int create_val_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
@@ -547,7 +547,7 @@ static int create_key_field(struct hist_trigger_data *hist_data,
}
field = trace_find_event_field(file->event_call, field_name);
- if (!field) {
+ if (!field || !field->size) {
ret = -EINVAL;
goto out;
}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index cc40793464e3..dcffcce9d75e 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -190,6 +190,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
return retval;
}
+ groups_sort(group_info);
retval = set_current_groups(group_info);
put_group_info(group_info);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index beee8ad4b9ea..7eed129f114a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1538,6 +1538,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
struct timer_list *timer = &dwork->timer;
struct work_struct *work = &dwork->work;
+ WARN_ON_ONCE(!wq);
WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
timer->data != (unsigned long)dwork);
WARN_ON_ONCE(timer_pending(timer));