From 7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 9 Jun 2015 21:32:09 +1000
Subject: cgroup: allow a cgroup subsystem to reject a fork

Add a new cgroup subsystem callback can_fork that conditionally
states whether or not the fork is accepted or rejected by a cgroup
policy. In addition, add a cancel_fork callback so that if an error
occurs later in the forking process, any state modified by can_fork can
be reverted.

Allow for a private opaque pointer to be passed from cgroup_can_fork to
cgroup_post_fork, allowing for the fork state to be stored by each
subsystem separately.

Also add a tagging system for cgroup_subsys.h to allow for CGROUP_<TAG>
enumerations to be be defined and used. In addition, explicitly add a
CGROUP_CANFORK_COUNT macro to make arrays easier to define.

This is in preparation for implementing the pids cgroup subsystem.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h   | 10 +++++-
 include/linux/cgroup.h        | 15 +++++++--
 include/linux/cgroup_subsys.h | 23 ++++++++++++++
 kernel/cgroup.c               | 73 +++++++++++++++++++++++++++++++++++++++++--
 kernel/cgroup_freezer.c       |  2 +-
 kernel/fork.c                 | 17 ++++++++--
 kernel/sched/core.c           |  2 +-
 7 files changed, 133 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..83e37d8c4d80 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -34,12 +34,17 @@ struct seq_file;
 
 /* define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _cgrp_id,
+#define SUBSYS_TAG(_t) CGROUP_ ## _t, \
+	__unused_tag_ ## _t = CGROUP_ ## _t - 1,
 enum cgroup_subsys_id {
 #include <linux/cgroup_subsys.h>
 	CGROUP_SUBSYS_COUNT,
 };
+#undef SUBSYS_TAG
 #undef SUBSYS
 
+#define CGROUP_CANFORK_COUNT (CGROUP_CANFORK_END - CGROUP_CANFORK_START)
+
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_NO_REF	= (1 << 0), /* no reference counting for this css */
@@ -406,7 +411,9 @@ struct cgroup_subsys {
 			      struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_subsys_state *css,
 		       struct cgroup_taskset *tset);
-	void (*fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task, void **priv_p);
+	void (*cancel_fork)(struct task_struct *task, void *priv);
+	void (*fork)(struct task_struct *task, void *priv);
 	void (*exit)(struct cgroup_subsys_state *css,
 		     struct cgroup_subsys_state *old_css,
 		     struct task_struct *task);
@@ -491,6 +498,7 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
 
 #else	/* CONFIG_CGROUPS */
 
+#define CGROUP_CANFORK_COUNT 0
 #define CGROUP_SUBSYS_COUNT 0
 
 static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) {}
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..a71fe2a3984e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -62,7 +62,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       void *ss_priv[CGROUP_CANFORK_COUNT]);
+extern void cgroup_post_fork(struct task_struct *p,
+			     void *old_ss_priv[CGROUP_CANFORK_COUNT]);
 void cgroup_exit(struct task_struct *p);
 
 int cgroup_init_early(void);
@@ -524,7 +529,13 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  void *ss_priv[CGROUP_CANFORK_COUNT])
+{ return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      void *ss_priv[CGROUP_CANFORK_COUNT]) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index e4a96fb14403..ec43bce7e1ea 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -3,6 +3,17 @@
  *
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
+
+/*
+ * This file *must* be included with SUBSYS() defined.
+ * SUBSYS_TAG() is a noop if undefined.
+ */
+
+#ifndef SUBSYS_TAG
+#define __TMP_SUBSYS_TAG
+#define SUBSYS_TAG(_x)
+#endif
+
 #if IS_ENABLED(CONFIG_CPUSETS)
 SUBSYS(cpuset)
 #endif
@@ -47,12 +58,24 @@ SUBSYS(net_prio)
 SUBSYS(hugetlb)
 #endif
 
+/*
+ * Subsystems that implement the can_fork() family of callbacks.
+ */
+SUBSYS_TAG(CANFORK_START)
+SUBSYS_TAG(CANFORK_END)
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
 #if IS_ENABLED(CONFIG_CGROUP_DEBUG)
 SUBSYS(debug)
 #endif
+
+#ifdef __TMP_SUBSYS_TAG
+#undef __TMP_SUBSYS_TAG
+#undef SUBSYS_TAG
+#endif
+
 /*
  * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..a59dd1a6b74a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -186,6 +186,9 @@ static u64 css_serial_nr_next = 1;
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
 
+/* Ditto for the can_fork callback. */
+static unsigned long have_canfork_callback __read_mostly;
+
 static struct cftype cgroup_dfl_base_files[];
 static struct cftype cgroup_legacy_base_files[];
 
@@ -4955,6 +4958,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
 	 * registered, no tasks have been forked, so we don't
@@ -5197,6 +5201,19 @@ static const struct file_operations proc_cgroupstats_operations = {
 	.release = single_release,
 };
 
+static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
+		return &ss_priv[i - CGROUP_CANFORK_START];
+	return NULL;
+}
+
+static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
+{
+	void **private = subsys_canfork_priv_p(ss_priv, i);
+	return private ? *private : NULL;
+}
+
 /**
  * cgroup_fork - initialize cgroup related fields during copy_process()
  * @child: pointer to task_struct of forking parent process.
@@ -5211,6 +5228,57 @@ void cgroup_fork(struct task_struct *child)
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the task in question.
+ *
+ * This calls the subsystem can_fork() callbacks. If the can_fork() callback
+ * returns an error, the fork aborts with that error code. This allows for
+ * a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child,
+		    void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i, j, ret;
+
+	for_each_subsys_which(ss, i, &have_canfork_callback) {
+		ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
+		if (ret)
+			goto out_revert;
+	}
+
+	return 0;
+
+out_revert:
+	for_each_subsys(ss, j) {
+		if (j >= i)
+			break;
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
+	}
+
+	return ret;
+}
+
+/**
+ * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
+ * @child: the task in question
+ *
+ * This calls the cancel_fork() callbacks if a fork failed *after*
+ * cgroup_can_fork() succeded.
+ */
+void cgroup_cancel_fork(struct task_struct *child,
+			void *ss_priv[CGROUP_CANFORK_COUNT])
+{
+	struct cgroup_subsys *ss;
+	int i;
+
+	for_each_subsys(ss, i)
+		if (ss->cancel_fork)
+			ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
+}
+
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
@@ -5221,7 +5289,8 @@ void cgroup_fork(struct task_struct *child)
  * cgroup_task_iter_start() - to guarantee that the new task ends up on its
  * list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      void *old_ss_priv[CGROUP_CANFORK_COUNT])
 {
 	struct cgroup_subsys *ss;
 	int i;
@@ -5266,7 +5335,7 @@ void cgroup_post_fork(struct task_struct *child)
 	 * and addition to css_set.
 	 */
 	for_each_subsys_which(ss, i, &have_fork_callback)
-		ss->fork(child);
+		ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
 }
 
 /**
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
  * to do anything as freezer_attach() will put @task into the appropriate
  * state.
  */
-static void freezer_fork(struct task_struct *task)
+static void freezer_fork(struct task_struct *task, void *private)
 {
 	struct freezer *freezer;
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..40e3af12c55e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1239,6 +1239,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 {
 	int retval;
 	struct task_struct *p;
+	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
 
 	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
 		return ERR_PTR(-EINVAL);
@@ -1512,6 +1513,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
+	/*
+	 * Ensure that the cgroup subsystem policies allow the new process to be
+	 * forked. It should be noted the the new process's css_set can be changed
+	 * between here and cgroup_post_fork() if an organisation operation is in
+	 * progress.
+	 */
+	retval = cgroup_can_fork(p, cgrp_ss_priv);
+	if (retval)
+		goto bad_fork_free_pid;
+
 	/*
 	 * Make it visible to the rest of the system, but dont wake it up yet.
 	 * Need tasklist lock for parent etc handling!
@@ -1548,7 +1559,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_pid;
+		goto bad_fork_cancel_cgroup;
 	}
 
 	if (likely(p->pid)) {
@@ -1590,7 +1601,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, cgrp_ss_priv);
 	if (clone_flags & CLONE_THREAD)
 		threadgroup_change_end(current);
 	perf_event_fork(p);
@@ -1600,6 +1611,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	return p;
 
+bad_fork_cancel_cgroup:
+	cgroup_cancel_fork(p, cgrp_ss_priv);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..d811652fe6f5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8068,7 +8068,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
 	sched_offline_group(tg);
 }
 
-static void cpu_cgroup_fork(struct task_struct *task)
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
 {
 	sched_move_task(task);
 }
-- 
cgit v1.2.3


From 49b786ea146f69c371df18e81ce0a2d5839f865c Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 9 Jun 2015 21:32:10 +1000
Subject: cgroup: implement the PIDs subsystem

Adds a new single-purpose PIDs subsystem to limit the number of
tasks that can be forked inside a cgroup. Essentially this is an
implementation of RLIMIT_NPROC that applies to a cgroup rather than a
process tree.

However, it should be noted that organisational operations (adding and
removing tasks from a PIDs hierarchy) will *not* be prevented. Rather,
the number of tasks in the hierarchy cannot exceed the limit through
forking. This is due to the fact that, in the unified hierarchy, attach
cannot fail (and it is not possible for a task to overcome its PIDs
cgroup policy limit by attaching to a child cgroup -- even if migrating
mid-fork it must be able to fork in the parent first).

PIDs are fundamentally a global resource, and it is possible to reach
PID exhaustion inside a cgroup without hitting any reasonable kmemcg
policy. Once you've hit PID exhaustion, you're only in a marginally
better state than OOM. This subsystem allows PID exhaustion inside a
cgroup to be prevented.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 CREDITS                       |   5 +
 include/linux/cgroup_subsys.h |   5 +
 init/Kconfig                  |  16 ++
 kernel/Makefile               |   1 +
 kernel/cgroup_pids.c          | 366 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 393 insertions(+)
 create mode 100644 kernel/cgroup_pids.c

diff --git a/CREDITS b/CREDITS
index 1d616640bbf6..4fcf9cd8544c 100644
--- a/CREDITS
+++ b/CREDITS
@@ -3219,6 +3219,11 @@ S: 69 rue Dunois
 S: 75013 Paris
 S: France
 
+N: Aleksa Sarai
+E: cyphar@cyphar.com
+W: https://www.cyphar.com/
+D: `pids` cgroup subsystem
+
 N: Dipankar Sarma
 E: dipankar@in.ibm.com
 D: RCU
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index ec43bce7e1ea..1f36945fd23d 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -62,6 +62,11 @@ SUBSYS(hugetlb)
  * Subsystems that implement the can_fork() family of callbacks.
  */
 SUBSYS_TAG(CANFORK_START)
+
+#if IS_ENABLED(CONFIG_CGROUP_PIDS)
+SUBSYS(pids)
+#endif
+
 SUBSYS_TAG(CANFORK_END)
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index af09b4fb43d2..2184b34cbf73 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -955,6 +955,22 @@ config CGROUP_FREEZER
 	  Provides a way to freeze and unfreeze all tasks in a
 	  cgroup.
 
+config CGROUP_PIDS
+	bool "PIDs cgroup subsystem"
+	help
+	  Provides enforcement of process number limits in the scope of a
+	  cgroup. Any attempt to fork more processes than is allowed in the
+	  cgroup will fail. PIDs are fundamentally a global resource because it
+	  is fairly trivial to reach PID exhaustion before you reach even a
+	  conservative kmemcg limit. As a result, it is possible to grind a
+	  system to halt without being limited by other cgroup policies. The
+	  PIDs cgroup subsystem is designed to stop this from happening.
+
+	  It should be noted that organisational operations (such as attaching
+	  to a cgroup hierarchy will *not* be blocked by the PIDs subsystem),
+	  since the PIDs limit only affects a process's ability to fork, not to
+	  attach to a cgroup.
+
 config CGROUP_DEVICE
 	bool "Device controller for cgroups"
 	help
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..718fb8afab7a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CGROUPS) += cgroup.o
 obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
+obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_UTS_NS) += utsname.o
 obj-$(CONFIG_USER_NS) += user_namespace.o
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..d75488824ae2
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,366 @@
+/*
+ * Process number limiting controller for cgroups.
+ *
+ * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
+ * after a certain limit is reached.
+ *
+ * Since it is trivial to hit the task limit without hitting any kmemcg limits
+ * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
+ * preventable in the scope of a cgroup hierarchy by allowing resource limiting
+ * of the number of tasks in a cgroup.
+ *
+ * In order to use the `pids` controller, set the maximum number of tasks in
+ * pids.max (this is not available in the root cgroup for obvious reasons). The
+ * number of processes currently in the cgroup is given by pids.current.
+ * Organisational operations are not blocked by cgroup policies, so it is
+ * possible to have pids.current > pids.max. However, it is not possible to
+ * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
+ * would cause a cgroup policy to be violated.
+ *
+ * To set a cgroup to have no limit, set pids.max to "max". This is the default
+ * for all new cgroups (N.B. that PID limits are hierarchical, so the most
+ * stringent limit in the hierarchy is followed).
+ *
+ * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
+ * a superset of parent/child/pids.current.
+ *
+ * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/threads.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+
+#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
+#define PIDS_MAX_STR "max"
+
+struct pids_cgroup {
+	struct cgroup_subsys_state	css;
+
+	/*
+	 * Use 64-bit types so that we can safely represent "max" as
+	 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
+	 */
+	atomic64_t			counter;
+	int64_t				limit;
+};
+
+static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
+{
+	return container_of(css, struct pids_cgroup, css);
+}
+
+static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
+{
+	return css_pids(pids->css.parent);
+}
+
+static struct cgroup_subsys_state *
+pids_css_alloc(struct cgroup_subsys_state *parent)
+{
+	struct pids_cgroup *pids;
+
+	pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
+	if (!pids)
+		return ERR_PTR(-ENOMEM);
+
+	pids->limit = PIDS_MAX;
+	atomic64_set(&pids->counter, 0);
+	return &pids->css;
+}
+
+static void pids_css_free(struct cgroup_subsys_state *css)
+{
+	kfree(css_pids(css));
+}
+
+/**
+ * pids_cancel - uncharge the local pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to cancel
+ *
+ * This function will WARN if the pid count goes under 0, because such a case is
+ * a bug in the pids controller proper.
+ */
+static void pids_cancel(struct pids_cgroup *pids, int num)
+{
+	/*
+	 * A negative count (or overflow for that matter) is invalid,
+	 * and indicates a bug in the `pids` controller proper.
+	 */
+	WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
+}
+
+/**
+ * pids_uncharge - hierarchically uncharge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to uncharge
+ */
+static void pids_uncharge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		pids_cancel(p, num);
+}
+
+/**
+ * pids_charge - hierarchically charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function does *not* follow the pid limit set. It cannot fail and the new
+ * pid count may exceed the limit. This is only used for reverting failed
+ * attaches, where there is no other way out than violating the limit.
+ */
+static void pids_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p;
+
+	for (p = pids; p; p = parent_pids(p))
+		atomic64_add(num, &p->counter);
+}
+
+/**
+ * pids_try_charge - hierarchically try to charge the pid count
+ * @pids: the pid cgroup state
+ * @num: the number of pids to charge
+ *
+ * This function follows the set limit. It will fail if the charge would cause
+ * the new value to exceed the hierarchical limit. Returns 0 if the charge
+ * succeded, otherwise -EAGAIN.
+ */
+static int pids_try_charge(struct pids_cgroup *pids, int num)
+{
+	struct pids_cgroup *p, *q;
+
+	for (p = pids; p; p = parent_pids(p)) {
+		int64_t new = atomic64_add_return(num, &p->counter);
+
+		/*
+		 * Since new is capped to the maximum number of pid_t, if
+		 * p->limit is %PIDS_MAX then we know that this test will never
+		 * fail.
+		 */
+		if (new > p->limit)
+			goto revert;
+	}
+
+	return 0;
+
+revert:
+	for (q = pids; q != p; q = parent_pids(q))
+		pids_cancel(q, num);
+	pids_cancel(p, num);
+
+	return -EAGAIN;
+}
+
+static int pids_can_attach(struct cgroup_subsys_state *css,
+			   struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		/*
+		 * Grab a ref to each task's css. We don't drop the ref until
+		 * we either fail and hit ->cancel_attach() or succeed and hit
+		 * ->attach().
+		 */
+		old_css = task_get_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(pids, 1);
+		pids_uncharge(old_pids, 1);
+	}
+
+	return 0;
+}
+
+static void pids_cancel_attach(struct cgroup_subsys_state *css,
+			       struct cgroup_taskset *tset)
+{
+	struct pids_cgroup *pids = css_pids(css);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset) {
+		struct cgroup_subsys_state *old_css;
+		struct pids_cgroup *old_pids;
+
+		old_css = task_css(task, pids_cgrp_id);
+		old_pids = css_pids(old_css);
+
+		pids_charge(old_pids, 1);
+		pids_uncharge(pids, 1);
+		css_put(old_css);
+	}
+}
+
+static void pids_attach(struct cgroup_subsys_state *css,
+			struct cgroup_taskset *tset)
+{
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, tset)
+		css_put(task_css(task, pids_cgrp_id));
+}
+
+static int pids_can_fork(struct task_struct *task, void **priv_p)
+{
+	struct cgroup_subsys_state *css;
+	struct pids_cgroup *pids;
+	int err;
+
+	/*
+	 * Use the "current" task_css for the pids subsystem as the tentative
+	 * css. It is possible we will charge the wrong hierarchy, in which
+	 * case we will forcefully revert/reapply the charge on the right
+	 * hierarchy after it is committed to the task proper.
+	 */
+	css = task_get_css(current, pids_cgrp_id);
+	pids = css_pids(css);
+
+	err = pids_try_charge(pids, 1);
+	if (err)
+		goto err_css_put;
+
+	*priv_p = css;
+	return 0;
+
+err_css_put:
+	css_put(css);
+	return err;
+}
+
+static void pids_cancel_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css = priv;
+	struct pids_cgroup *pids = css_pids(css);
+
+	pids_uncharge(pids, 1);
+	css_put(css);
+}
+
+static void pids_fork(struct task_struct *task, void *priv)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup_subsys_state *old_css = priv;
+	struct pids_cgroup *pids;
+	struct pids_cgroup *old_pids = css_pids(old_css);
+
+	css = task_get_css(task, pids_cgrp_id);
+	pids = css_pids(css);
+
+	/*
+	 * If the association has changed, we have to revert and reapply the
+	 * charge/uncharge on the wrong hierarchy to the current one. Since
+	 * the association can only change due to an organisation event, its
+	 * okay for us to ignore the limit in this case.
+	 */
+	if (pids != old_pids) {
+		pids_uncharge(old_pids, 1);
+		pids_charge(pids, 1);
+	}
+
+	css_put(css);
+	css_put(old_css);
+}
+
+static void pids_exit(struct cgroup_subsys_state *css,
+		      struct cgroup_subsys_state *old_css,
+		      struct task_struct *task)
+{
+	struct pids_cgroup *pids = css_pids(old_css);
+
+	pids_uncharge(pids, 1);
+}
+
+static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct cgroup_subsys_state *css = of_css(of);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit;
+	int err;
+
+	buf = strstrip(buf);
+	if (!strcmp(buf, PIDS_MAX_STR)) {
+		limit = PIDS_MAX;
+		goto set_limit;
+	}
+
+	err = kstrtoll(buf, 0, &limit);
+	if (err)
+		return err;
+
+	if (limit < 0 || limit >= PIDS_MAX)
+		return -EINVAL;
+
+set_limit:
+	/*
+	 * Limit updates don't need to be mutex'd, since it isn't
+	 * critical that any racing fork()s follow the new limit.
+	 */
+	pids->limit = limit;
+	return nbytes;
+}
+
+static int pids_max_show(struct seq_file *sf, void *v)
+{
+	struct cgroup_subsys_state *css = seq_css(sf);
+	struct pids_cgroup *pids = css_pids(css);
+	int64_t limit = pids->limit;
+
+	if (limit >= PIDS_MAX)
+		seq_printf(sf, "%s\n", PIDS_MAX_STR);
+	else
+		seq_printf(sf, "%lld\n", limit);
+
+	return 0;
+}
+
+static s64 pids_current_read(struct cgroup_subsys_state *css,
+			     struct cftype *cft)
+{
+	struct pids_cgroup *pids = css_pids(css);
+
+	return atomic64_read(&pids->counter);
+}
+
+static struct cftype pids_files[] = {
+	{
+		.name = "max",
+		.write = pids_max_write,
+		.seq_show = pids_max_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "current",
+		.read_s64 = pids_current_read,
+	},
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys pids_cgrp_subsys = {
+	.css_alloc	= pids_css_alloc,
+	.css_free	= pids_css_free,
+	.attach		= pids_attach,
+	.can_attach 	= pids_can_attach,
+	.cancel_attach 	= pids_cancel_attach,
+	.can_fork	= pids_can_fork,
+	.cancel_fork	= pids_cancel_fork,
+	.fork		= pids_fork,
+	.exit		= pids_exit,
+	.legacy_cftypes	= pids_files,
+	.dfl_cftypes	= pids_files,
+};
-- 
cgit v1.2.3


From 917d8e2d10f40e28aa9e0d824b2e5b8197d79fc2 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 13 Jun 2015 03:21:58 +1000
Subject: cgroup: add documentation for the PIDs controller

Add documentation derived from kernel/cgroup_pids.c to the relevant
Documentation/ directory, along with a few examples of how to use the
PIDs controller as well an explanation of its peculiarities.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 Documentation/cgroups/00-INDEX |  2 +
 Documentation/cgroups/pids.txt | 85 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+)
 create mode 100644 Documentation/cgroups/pids.txt

diff --git a/Documentation/cgroups/00-INDEX b/Documentation/cgroups/00-INDEX
index 96ce071a3633..3f5a40f57d4a 100644
--- a/Documentation/cgroups/00-INDEX
+++ b/Documentation/cgroups/00-INDEX
@@ -22,6 +22,8 @@ net_cls.txt
 	- Network classifier cgroups details and usages.
 net_prio.txt
 	- Network priority cgroups details and usages.
+pids.txt
+	- Process number cgroups details and usages.
 resource_counter.txt
 	- Resource Counter API.
 unified-hierarchy.txt
diff --git a/Documentation/cgroups/pids.txt b/Documentation/cgroups/pids.txt
new file mode 100644
index 000000000000..1a078b5d281a
--- /dev/null
+++ b/Documentation/cgroups/pids.txt
@@ -0,0 +1,85 @@
+						   Process Number Controller
+						   =========================
+
+Abstract
+--------
+
+The process number controller is used to allow a cgroup hierarchy to stop any
+new tasks from being fork()'d or clone()'d after a certain limit is reached.
+
+Since it is trivial to hit the task limit without hitting any kmemcg limits in
+place, PIDs are a fundamental resource. As such, PID exhaustion must be
+preventable in the scope of a cgroup hierarchy by allowing resource limiting of
+the number of tasks in a cgroup.
+
+Usage
+-----
+
+In order to use the `pids` controller, set the maximum number of tasks in
+pids.max (this is not available in the root cgroup for obvious reasons). The
+number of processes currently in the cgroup is given by pids.current.
+
+Organisational operations are not blocked by cgroup policies, so it is possible
+to have pids.current > pids.max. This can be done by either setting the limit to
+be smaller than pids.current, or attaching enough processes to the cgroup such
+that pids.current > pids.max. However, it is not possible to violate a cgroup
+policy through fork() or clone(). fork() and clone() will return -EAGAIN if the
+creation of a new process would cause a cgroup policy to be violated.
+
+To set a cgroup to have no limit, set pids.max to "max". This is the default for
+all new cgroups (N.B. that PID limits are hierarchical, so the most stringent
+limit in the hierarchy is followed).
+
+pids.current tracks all child cgroup hierarchies, so parent/pids.current is a
+superset of parent/child/pids.current.
+
+Example
+-------
+
+First, we mount the pids controller:
+# mkdir -p /sys/fs/cgroup/pids
+# mount -t cgroup -o pids none /sys/fs/cgroup/pids
+
+Then we create a hierarchy, set limits and attach processes to it:
+# mkdir -p /sys/fs/cgroup/pids/parent/child
+# echo 2 > /sys/fs/cgroup/pids/parent/pids.max
+# echo $$ > /sys/fs/cgroup/pids/parent/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+#
+
+It should be noted that attempts to overcome the set limit (2 in this case) will
+fail:
+
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporary unavailable
+#
+
+Even if we migrate to a child cgroup (which doesn't have a set limit), we will
+not be able to overcome the most stringent limit in the hierarchy (in this case,
+parent's):
+
+# echo $$ > /sys/fs/cgroup/pids/parent/child/cgroup.procs
+# cat /sys/fs/cgroup/pids/parent/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.current
+2
+# cat /sys/fs/cgroup/pids/parent/child/pids.max
+max
+# ( /bin/echo "Here's some processes for you." | cat )
+sh: fork: Resource temporary unavailable
+#
+
+We can set a limit that is smaller than pids.current, which will stop any new
+processes from being forked at all (note that the shell itself counts towards
+pids.current):
+
+# echo 1 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporary unavailable
+# echo 0 > /sys/fs/cgroup/pids/parent/pids.max
+# /bin/echo "We can't even spawn a single process now."
+sh: fork: Resource temporary unavailable
+#
-- 
cgit v1.2.3


From cf780b7dc795ff495629b9879dc874eda49ada62 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@parallels.com>
Date: Mon, 3 Aug 2015 15:32:26 +0300
Subject: cgroup: fix idr_preload usage

It does not make much sense to call idr_preload with the same gfp mask
as the following idr_alloc, but this is what we do in cgroup_idr_alloc.
This patch fixes the idr_preload usage by making cgroup_idr_alloc call
idr_alloc w/o __GFP_WAIT. Since it is now safe to call cgroup_idr_alloc
with GFP_KERNEL, the patch also fixes all its callers appropriately.

Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a59dd1a6b74a..97e2da0d3b27 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -210,7 +210,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 
 	idr_preload(gfp_mask);
 	spin_lock_bh(&cgroup_idr_lock);
-	ret = idr_alloc(idr, ptr, start, end, gfp_mask);
+	ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
 	spin_unlock_bh(&cgroup_idr_lock);
 	idr_preload_end();
 	return ret;
@@ -1669,7 +1669,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT);
+	ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
 	root_cgrp->id = ret;
@@ -4582,7 +4582,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 	if (err)
 		goto err_free_css;
 
-	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
+	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
 	if (err < 0)
 		goto err_free_percpu_ref;
 	css->id = err;
@@ -4659,7 +4659,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	 * Temporarily set the pointer to NULL, so idr_find() won't return
 	 * a half-baked cgroup.
 	 */
-	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT);
+	cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
 	if (cgrp->id < 0) {
 		ret = -ENOMEM;
 		goto out_cancel_ref;
-- 
cgit v1.2.3


From 6abc8ca19df0078de17dc38340db3002ed489ce7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 4 Aug 2015 15:20:55 -0400
Subject: cgroup: define controller file conventions

Traditionally, each cgroup controller implemented whatever interface
it wanted leading to interfaces which are widely inconsistent.
Examining the requirements of the controllers readily yield that there
are only a few control schemes shared among all.

Two major controllers already had to implement new interface for the
unified hierarchy due to significant structural changes.  Let's take
the chance to establish common conventions throughout all controllers.

This patch defines CGROUP_WEIGHT_MIN/DFL/MAX to be used on all weight
based control knobs and documents the conventions that controllers
should follow on the unified hierarchy.  Except for io.weight knob,
all existing unified hierarchy knobs are already compliant.  A
follow-up patch will update io.weight.

v2: Added descriptions of min, low and high knobs.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 Documentation/cgroups/unified-hierarchy.txt | 80 ++++++++++++++++++++++++++---
 include/linux/cgroup.h                      |  9 ++++
 2 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7647ab..1ee9caf29e57 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -23,10 +23,13 @@ CONTENTS
 5. Other Changes
   5-1. [Un]populated Notification
   5-2. Other Core Changes
-  5-3. Per-Controller Changes
-    5-3-1. blkio
-    5-3-2. cpuset
-    5-3-3. memory
+  5-3. Controller File Conventions
+    5-3-1. Format
+    5-3-2. Control Knobs
+  5-4. Per-Controller Changes
+    5-4-1. blkio
+    5-4-2. cpuset
+    5-4-3. memory
 6. Planned Changes
   6-1. CAP for resource control
 
@@ -372,14 +375,75 @@ supported and the interface files "release_agent" and
 - The "cgroup.clone_children" file is removed.
 
 
-5-3. Per-Controller Changes
+5-3. Controller File Conventions
 
-5-3-1. blkio
+5-3-1. Format
+
+In general, all controller files should be in one of the following
+formats whenever possible.
+
+- Values only files
+
+  VAL0 VAL1...\n
+
+- Flat keyed files
+
+  KEY0 VAL0\n
+  KEY1 VAL1\n
+  ...
+
+- Nested keyed files
+
+  KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
+  KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
+  ...
+
+For a writeable file, the format for writing should generally match
+reading; however, controllers may allow omitting later fields or
+implement restricted shortcuts for most common use cases.
+
+For both flat and nested keyed files, only the values for a single key
+can be written at a time.  For nested keyed files, the sub key pairs
+may be specified in any order and not all pairs have to be specified.
+
+
+5-3-2. Control Knobs
+
+- Settings for a single feature should generally be implemented in a
+  single file.
+
+- In general, the root cgroup should be exempt from resource control
+  and thus shouldn't have resource control knobs.
+
+- If a controller implements ratio based resource distribution, the
+  control knob should be named "weight" and have the range [1, 10000]
+  and 100 should be the default value.  The values are chosen to allow
+  enough and symmetric bias in both directions while keeping it
+  intuitive (the default is 100%).
+
+- If a controller implements an absolute resource guarantee and/or
+  limit, the control knobs should be named "min" and "max"
+  respectively.  If a controller implements best effort resource
+  gurantee and/or limit, the control knobs should be named "low" and
+  "high" respectively.
+
+  In the above four control files, the special token "max" should be
+  used to represent upward infinity for both reading and writing.
+
+- If a setting has configurable default value and specific overrides,
+  the default settings should be keyed with "default" and appear as
+  the first entry in the file.  Specific entries can use "default" as
+  its value to indicate inheritance of the default value.
+
+
+5-4. Per-Controller Changes
+
+5-4-1. blkio
 
 - blk-throttle becomes properly hierarchical.
 
 
-5-3-2. cpuset
+5-4-2. cpuset
 
 - Tasks are kept in empty cpusets after hotplug and take on the masks
   of the nearest non-empty ancestor, instead of being moved to it.
@@ -388,7 +452,7 @@ supported and the interface files "release_agent" and
   masks of the nearest non-empty ancestor.
 
 
-5-3-3. memory
+5-4-3. memory
 
 - use_hierarchy is on by default and the cgroup file for the flag is
   not created.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a593e299162e..c6bf9d30c270 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -22,6 +22,15 @@
 
 #ifdef CONFIG_CGROUPS
 
+/*
+ * All weight knobs on the default hierarhcy should use the following min,
+ * default and max values.  The default value is the logarithmic center of
+ * MIN and MAX and allows 100x to be expressed in both directions.
+ */
+#define CGROUP_WEIGHT_MIN		1
+#define CGROUP_WEIGHT_DFL		100
+#define CGROUP_WEIGHT_MAX		10000
+
 /* a css_task_iter should be treated as an opaque object */
 struct css_task_iter {
 	struct cgroup_subsys		*ss;
-- 
cgit v1.2.3


From d0ec4230a077521a984ac604acbd8c15cc12a664 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 5 Aug 2015 16:03:19 -0400
Subject: cgroup: export cgrp_dfl_root

While cgroup subsystems can't be modules, blkcg supports dynamically
loadable policies which interact with cgroup core.  Export
cgrp_dfl_root so that cgroup_on_dfl() can be used in those modules.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..3f6641632f73 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
  * part of that cgroup.
  */
 struct cgroup_root cgrp_dfl_root;
+EXPORT_SYMBOL_GPL(cgrp_dfl_root);
 
 /*
  * The default hierarchy always exists but is hidden until mounted for the
-- 
cgit v1.2.3


From 731a981e1059dd68c97212ccc9c0e1f169c1a77b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 11 Aug 2015 13:35:42 -0400
Subject: cgroup: make cftype->private a unsigned long

It's pretty unusual to have an int as a private data field and it
makes it impossible to carray a pointer value through it.  Let's make
it an unsigned long.  AFAICS, this shouldn't break anything.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/cgroup-defs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 93755a629299..8f5770a7d85a 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -318,7 +318,7 @@ struct cftype {
 	 * end of cftype array.
 	 */
 	char name[MAX_CFTYPE_NAME];
-	int private;
+	unsigned long private;
 	/*
 	 * If not 0, file mode is set to this value, otherwise it will
 	 * be figured out automatically
-- 
cgit v1.2.3


From d98817d4961b9ef75062d1e129829d283b3dac57 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 13:58:16 -0700
Subject: cgroup: don't print subsystems for the default hierarchy

It doesn't make sense to print subsystems on mount option or
/proc/PID/cgroup for the default hierarchy.

* cgroup.controllers file at the root of the default hierarchy lists
  the currently attached controllers.

* The default hierarchy is catch-all for unmounted subsystems.

* The default hierarchy doesn't accept any mount options.

Suppress subsystem printing on mount options and /proc/PID/cgroup for
the default hierarchy.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: cgroups@vger.kernel.org
---
 kernel/cgroup.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f6641632f73..d9f854defa78 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1333,9 +1333,10 @@ static int cgroup_show_options(struct seq_file *seq,
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	for_each_subsys(ss, ssid)
-		if (root->subsys_mask & (1 << ssid))
-			seq_printf(seq, ",%s", ss->name);
+	if (root != &cgrp_dfl_root)
+		for_each_subsys(ss, ssid)
+			if (root->subsys_mask & (1 << ssid))
+				seq_printf(seq, ",%s", ss->name);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -5137,9 +5138,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			continue;
 
 		seq_printf(m, "%d:", root->hierarchy_id);
-		for_each_subsys(ss, ssid)
-			if (root->subsys_mask & (1 << ssid))
-				seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+		if (root != &cgrp_dfl_root)
+			for_each_subsys(ss, ssid)
+				if (root->subsys_mask & (1 << ssid))
+					seq_printf(m, "%s%s", count++ ? "," : "",
+						   ss->name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
-- 
cgit v1.2.3


From 3e1d2eed39d804e48282931835c7203fa47fe1d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 18 Aug 2015 13:58:16 -0700
Subject: cgroup: introduce cgroup_subsys->legacy_name

This allows cgroup subsystems to use a different name on the unified
hierarchy.  cgroup_subsys->name is used on the unified hierarchy,
->legacy_name elsewhere.  If ->legacy_name is not explicitly set, it's
automatically set to ->name and the userland visible behavior remains
unchanged.

v2: Make parse_cgroupfs_options() only consider ->legacy_name as mount
    options are used only on legacy hierarchies.  Suggested by Li
    Zefan.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: cgroups@vger.kernel.org
---
 include/linux/cgroup-defs.h |  3 +++
 kernel/cgroup.c             | 29 ++++++++++++++++++-----------
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8f5770a7d85a..7d0bb53e4553 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -434,6 +434,9 @@ struct cgroup_subsys {
 	int id;
 	const char *name;
 
+	/* optional, initialized automatically during boot if not set */
+	const char *legacy_name;
+
 	/* link to parent, protected by cgroup_lock() */
 	struct cgroup_root *root;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d9f854defa78..9c8eb17d593a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1028,10 +1028,13 @@ static const struct file_operations proc_cgroupstats_operations;
 static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
 			      char *buf)
 {
+	struct cgroup_subsys *ss = cft->ss;
+
 	if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
 	    !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
 		snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
-			 cft->ss->name, cft->name);
+			 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
+			 cft->name);
 	else
 		strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
 	return buf;
@@ -1336,7 +1339,7 @@ static int cgroup_show_options(struct seq_file *seq,
 	if (root != &cgrp_dfl_root)
 		for_each_subsys(ss, ssid)
 			if (root->subsys_mask & (1 << ssid))
-				seq_printf(seq, ",%s", ss->name);
+				seq_printf(seq, ",%s", ss->legacy_name);
 	if (root->flags & CGRP_ROOT_NOPREFIX)
 		seq_puts(seq, ",noprefix");
 	if (root->flags & CGRP_ROOT_XATTR)
@@ -1449,7 +1452,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		}
 
 		for_each_subsys(ss, i) {
-			if (strcmp(token, ss->name))
+			if (strcmp(token, ss->legacy_name))
 				continue;
 			if (ss->disabled)
 				continue;
@@ -4995,6 +4998,8 @@ int __init cgroup_init_early(void)
 
 		ss->id = i;
 		ss->name = cgroup_subsys_name[i];
+		if (!ss->legacy_name)
+			ss->legacy_name = cgroup_subsys_name[i];
 
 		if (ss->early_init)
 			cgroup_init_subsys(ss, true);
@@ -5142,7 +5147,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			for_each_subsys(ss, ssid)
 				if (root->subsys_mask & (1 << ssid))
 					seq_printf(m, "%s%s", count++ ? "," : "",
-						   ss->name);
+						   ss->legacy_name);
 		if (strlen(root->name))
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
@@ -5182,7 +5187,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
-			   ss->name, ss->root->hierarchy_id,
+			   ss->legacy_name, ss->root->hierarchy_id,
 			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
 
 	mutex_unlock(&cgroup_mutex);
@@ -5404,12 +5409,14 @@ static int __init cgroup_disable(char *str)
 			continue;
 
 		for_each_subsys(ss, i) {
-			if (!strcmp(token, ss->name)) {
-				ss->disabled = 1;
-				printk(KERN_INFO "Disabling %s control group"
-					" subsystem\n", ss->name);
-				break;
-			}
+			if (strcmp(token, ss->name) &&
+			    strcmp(token, ss->legacy_name))
+				continue;
+
+			ss->disabled = 1;
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
+			break;
 		}
 	}
 	return 1;
-- 
cgit v1.2.3


From ce52399520e4b97466165737e00c7b528ae8c8f5 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 25 Aug 2015 12:50:44 +1000
Subject: cgroup: pids: fix invalid get/put usage

Fix incorrect usage of css_get and css_put to put a different css in
pids_{cancel_,}attach() than the one grabbed in pids_can_attach(). This
could lead to quite serious memory leakage (and unsafe operations on the
putted css).

tj: minor comment update

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup_pids.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index d75488824ae2..806cd7693ac8 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -173,11 +173,11 @@ static int pids_can_attach(struct cgroup_subsys_state *css,
 		struct pids_cgroup *old_pids;
 
 		/*
-		 * Grab a ref to each task's css. We don't drop the ref until
-		 * we either fail and hit ->cancel_attach() or succeed and hit
-		 * ->attach().
+		 * No need to pin @old_css between here and cancel_attach()
+		 * because cgroup core protects it from being freed before
+		 * the migration completes or fails.
 		 */
-		old_css = task_get_css(task, pids_cgrp_id);
+		old_css = task_css(task, pids_cgrp_id);
 		old_pids = css_pids(old_css);
 
 		pids_charge(pids, 1);
@@ -202,19 +202,9 @@ static void pids_cancel_attach(struct cgroup_subsys_state *css,
 
 		pids_charge(old_pids, 1);
 		pids_uncharge(pids, 1);
-		css_put(old_css);
 	}
 }
 
-static void pids_attach(struct cgroup_subsys_state *css,
-			struct cgroup_taskset *tset)
-{
-	struct task_struct *task;
-
-	cgroup_taskset_for_each(task, tset)
-		css_put(task_css(task, pids_cgrp_id));
-}
-
 static int pids_can_fork(struct task_struct *task, void **priv_p)
 {
 	struct cgroup_subsys_state *css;
@@ -354,7 +344,6 @@ static struct cftype pids_files[] = {
 struct cgroup_subsys pids_cgrp_subsys = {
 	.css_alloc	= pids_css_alloc,
 	.css_free	= pids_css_free,
-	.attach		= pids_attach,
 	.can_attach 	= pids_can_attach,
 	.cancel_attach 	= pids_cancel_attach,
 	.can_fork	= pids_can_fork,
-- 
cgit v1.2.3