From 5edee61edeaaebafe584f8fb7074c1ef4658596b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup: cgroup_subsys->fork() should be called after the task is added to css_set cgroup core has a bug which violates a basic rule about event notifications - when a new entity needs to be added, you add that to the notification list first and then make the new entity conform to the current state. If done in the reverse order, an event happening inbetween will be lost. cgroup_subsys->fork() is invoked way before the new task is added to the css_set. Currently, cgroup_freezer is the only user of ->fork() and uses it to make new tasks conform to the current state of the freezer. If FROZEN state is requested while fork is in progress between cgroup_fork_callbacks() and cgroup_post_fork(), the child could escape freezing - the cgroup isn't frozen when ->fork() is called and the freezer couldn't see the new task on the css_set. This patch moves cgroup_subsys->fork() invocation to cgroup_post_fork() after the new task is added to the css_set. cgroup_fork_callbacks() is removed. Because now a task may be migrated during cgroup_subsys->fork(), freezer_fork() is updated so that it adheres to the usual RCU locking and the rather pointless comment on why locking can be different there is removed (if it doesn't make anything simpler, why even bother?). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: stable@vger.kernel.org --- kernel/cgroup.c | 62 ++++++++++++++++++++++++------------------------- kernel/cgroup_freezer.c | 13 ++++------- kernel/fork.c | 9 +------ 3 files changed, 35 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39a..b7a0171067e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4843,45 +4843,20 @@ void cgroup_fork(struct task_struct *child) INIT_LIST_HEAD(&child->cg_list); } -/** - * cgroup_fork_callbacks - run fork callbacks - * @child: the new task - * - * Called on a new task very soon before adding it to the - * tasklist. No need to take any locks since no-one can - * be operating on this task. - */ -void cgroup_fork_callbacks(struct task_struct *child) -{ - if (need_forkexit_callback) { - int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - struct cgroup_subsys *ss = subsys[i]; - - /* - * forkexit callbacks are only supported for - * builtin subsystems. - */ - if (!ss || ss->module) - continue; - - if (ss->fork) - ss->fork(child); - } - } -} - /** * cgroup_post_fork - called on a new task after adding it to the task list * @child: the task in question * - * Adds the task to the list running through its css_set if necessary. - * Has to be after the task is visible on the task list in case we race - * with the first call to cgroup_iter_start() - to guarantee that the - * new task ends up on its list. + * Adds the task to the list running through its css_set if necessary and + * call the subsystem fork() callbacks. Has to be after the task is + * visible on the task list in case we race with the first call to + * cgroup_iter_start() - to guarantee that the new task ends up on its + * list. */ void cgroup_post_fork(struct task_struct *child) { + int i; + /* * use_task_css_set_links is set to 1 before we walk the tasklist * under the tasklist_lock and we read it here after we added the child @@ -4910,7 +4885,30 @@ void cgroup_post_fork(struct task_struct *child) } write_unlock(&css_set_lock); } + + /* + * Call ss->fork(). This must happen after @child is linked on + * css_set; otherwise, @child might change state between ->fork() + * and addition to css_set. + */ + if (need_forkexit_callback) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + /* + * fork/exit callbacks are supported only for + * builtin subsystems and we don't need further + * synchronization as they never go away. + */ + if (!ss || ss->module) + continue; + + if (ss->fork) + ss->fork(child); + } + } } + /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b1724ce9898..12bfedb598c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -186,23 +186,15 @@ static void freezer_fork(struct task_struct *task) { struct freezer *freezer; - /* - * No lock is needed, since the task isn't on tasklist yet, - * so it can't be moved to another cgroup, which means the - * freezer won't be removed and will be valid during this - * function call. Nevertheless, apply RCU read-side critical - * section to suppress RCU lockdep false positives. - */ rcu_read_lock(); freezer = task_freezer(task); - rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the * following check. */ if (!freezer->css.cgroup->parent) - return; + goto out; spin_lock_irq(&freezer->lock); BUG_ON(freezer->state == CGROUP_FROZEN); @@ -210,7 +202,10 @@ static void freezer_fork(struct task_struct *task) /* Locking avoids race with FREEZING -> THAWED transitions. */ if (freezer->state == CGROUP_FREEZING) freeze_task(task); + spin_unlock_irq(&freezer->lock); +out: + rcu_read_unlock(); } /* diff --git a/kernel/fork.c b/kernel/fork.c index 8b20ab7d3aa..acc4cb62f32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1135,7 +1135,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - int cgroup_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1393,12 +1392,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; - /* Now that the task is set up, run cgroup callbacks if - * necessary. We need to run them before the task is visible - * on the tasklist. */ - cgroup_fork_callbacks(p); - cgroup_callbacks_done = 1; - /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1503,7 +1496,7 @@ bad_fork_cleanup_cgroup: #endif if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); - cgroup_exit(p, cgroup_callbacks_done); + cgroup_exit(p, 0); delayacct_tsk_free(p); module_put(task_thread_info(p)->exec_domain->module); bad_fork_cleanup_count: -- cgit v1.2.3 From 51f246ed95caed12898649d8170d2d352da6af76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: make it official that writes to freezer.state don't fail try_to_freeze_cgroup() has condition checks which are intended to fail the write operation to freezer.state if there are tasks which can't be frozen. The condition checks have been broken for quite some time now. freeze_task() returns %false if the target task can't be frozen, so num_cant_freeze_now is never incremented. In addition, strangely, cgroup freezing proceeds even after the write is failed, which is rather broken. This patch rips out the non-working code intended to fail the write to freezer.state when the cgroup contains non-freezable tasks and makes it official that writes to freezer.state succeed whether there are non-freezable tasks in the cgroup or not. This leaves is_task_frozen_enough() with only one user - upste_if_frozen(). Collapse it into the caller. Note that this removes an extra call to freezing(). This doesn't cause any userland behavior changes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki --- kernel/cgroup_freezer.c | 43 +++++++++++-------------------------------- 1 file changed, 11 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 12bfedb598c..05d52185139 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -150,13 +150,6 @@ static void freezer_destroy(struct cgroup *cgroup) kfree(freezer); } -/* task is frozen or will freeze immediately when next it gets woken */ -static bool is_task_frozen_enough(struct task_struct *task) -{ - return frozen(task) || - (task_is_stopped_or_traced(task) && freezing(task)); -} - /* * The call to cgroup_lock() in the freezer.state write method prevents * a write to that file racing against an attach, and hence the @@ -222,7 +215,8 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { ntotal++; - if (freezing(task) && is_task_frozen_enough(task)) + if (freezing(task) && (frozen(task) || + task_is_stopped_or_traced(task))) nfrozen++; } @@ -264,24 +258,15 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, return 0; } -static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int num_cant_freeze_now = 0; cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { - if (!freeze_task(task)) - continue; - if (is_task_frozen_enough(task)) - continue; - if (!freezing(task) && !freezer_should_skip(task)) - num_cant_freeze_now++; - } + while ((task = cgroup_iter_next(cgroup, &it))) + freeze_task(task); cgroup_iter_end(cgroup, &it); - - return num_cant_freeze_now ? -EBUSY : 0; } static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) @@ -295,13 +280,10 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static int freezer_change_state(struct cgroup *cgroup, - enum freezer_state goal_state) +static void freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) { - struct freezer *freezer; - int retval = 0; - - freezer = cgroup_freezer(cgroup); + struct freezer *freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); @@ -318,22 +300,19 @@ static int freezer_change_state(struct cgroup *cgroup, if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; - retval = try_to_freeze_cgroup(cgroup, freezer); + freeze_cgroup(cgroup, freezer); break; default: BUG(); } spin_unlock_irq(&freezer->lock); - - return retval; } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - int retval; enum freezer_state goal_state; if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) @@ -345,9 +324,9 @@ static int freezer_write(struct cgroup *cgroup, if (!cgroup_lock_live_group(cgroup)) return -ENODEV; - retval = freezer_change_state(cgroup, goal_state); + freezer_change_state(cgroup, goal_state); cgroup_unlock(); - return retval; + return 0; } static struct cftype files[] = { -- cgit v1.2.3 From 3c426d5e114035d00453bb5d82a92826db1ed71f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: don't stall transition to FROZEN for PF_NOFREEZE or PF_FREEZER_SKIP tasks cgroup_freezer doesn't transition from FREEZING to FROZEN if the cgroup contains PF_NOFREEZE tasks or tasks sleeping with PF_FREEZER_SKIP set. Only kernel tasks can be non-freezable (PF_NOFREEZE) and there's nothing cgroup_freezer or userland can do about or to it. It's pointless to stall the transition for PF_NOFREEZE tasks. PF_FREEZER_SKIP indicates that the task can be skipped when determining whether frozen state is reached. A task with PF_FREEZER_SKIP is guaranteed to perform try_to_freeze() after it wakes up and can be considered frozen much like stopped or traced tasks. Note that a vfork parent uses PF_FREEZER_SKIP while waiting for the child. This updates update_if_frozen() such that it only considers freezable tasks and treats %true freezer_should_skip() tasks as frozen. This allows cgroups w/ kthreads and vfork parents successfully reach FROZEN state. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki --- kernel/cgroup_freezer.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 05d52185139..557f3678c4e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -214,10 +214,18 @@ static void update_if_frozen(struct cgroup *cgroup, cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { - ntotal++; - if (freezing(task) && (frozen(task) || - task_is_stopped_or_traced(task))) - nfrozen++; + if (freezing(task)) { + ntotal++; + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (frozen(task) || task_is_stopped_or_traced(task) || + freezer_should_skip(task)) + nfrozen++; + } } if (old_state == CGROUP_THAWED) { -- cgit v1.2.3 From 8755ade683241e8c6b8fe8d22d0ae35041a3dc51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: allow moving tasks in and out of a frozen cgroup cgroup_freezer is one of the few users of cgroup_subsys->can_attach() and uses it to prevent tasks from being migrated into or out of a frozen cgroup. This makes cgroup_freezer cumbersome to use especially when co-mounted with other controllers. ->can_attach() is problematic in general as it can make co-mounting multiple cgroups difficult - migrating tasks may fail for reasons completely irrelevant for other controllers. freezer_can_attach() in particular is more problematic because it messes with cgroup internal locking to ensure that the state verification performed at freezer_can_attach() stays valid until migration is complete. This patch replaces freezer_can_attach() with freezer_attach() so that tasks are always allowed to migrate - they are nudged into the conforming state from freezer_attach(). This means that there can be tasks which are being migrated which don't conform to the current cgroup_freezer state until freezer_attach() is complete. Under the current locking scheme, the only such place is freezer_fork() which is updated to handle such window. While this patch doesn't remove the use of internal cgroup locking from freezer_read/write() paths, it removes the requirement to keep the freezer state constant while migrating and enables such change. Note that this creates a userland visible behavior change - FROZEN cgroup can no longer be used to lock migrations in and out of the cgroup. This behavior change is intended. I don't think the feature is necessary - userland should coordinate accesses to cgroup fs anyway - and even if the feature is needed cgroup_freezer is the completely wrong place to implement it. Signed-off-by: Tejun Heo LKML-Reference: <1350426526-14254-1-git-send-email-tj@kernel.org> Cc: Matt Helsley Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 51 ++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 557f3678c4e..0b0e10545ef 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -152,27 +152,38 @@ static void freezer_destroy(struct cgroup *cgroup) /* * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence the - * can_attach() result will remain valid until the attach completes. + * a write to that file racing against an attach, and hence we don't need + * to worry about racing against migration. */ -static int freezer_can_attach(struct cgroup *new_cgroup, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + spin_lock_irq(&freezer->lock); + /* - * Anything frozen can't move or be moved to/from. + * Make the new tasks conform to the current state of @new_cgrp. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_cgrp but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + * This means that, to determine whether to freeze, one should test + * whether the state equals THAWED. */ - cgroup_taskset_for_each(task, new_cgroup, tset) - if (cgroup_freezing(task)) - return -EBUSY; - - freezer = cgroup_freezer(new_cgroup); - if (freezer->state != CGROUP_THAWED) - return -EBUSY; + cgroup_taskset_for_each(task, new_cgrp, tset) { + if (freezer->state == CGROUP_THAWED) { + __thaw_task(task); + } else { + freeze_task(task); + freezer->state = CGROUP_FREEZING; + } + } - return 0; + spin_unlock_irq(&freezer->lock); } static void freezer_fork(struct task_struct *task) @@ -190,12 +201,12 @@ static void freezer_fork(struct task_struct *task) goto out; spin_lock_irq(&freezer->lock); - BUG_ON(freezer->state == CGROUP_FROZEN); - - /* Locking avoids race with FREEZING -> THAWED transitions. */ - if (freezer->state == CGROUP_FREEZING) + /* + * @task might have been just migrated into a FROZEN cgroup. Test + * equality with THAWED. Read the comment in freezer_attach(). + */ + if (freezer->state != CGROUP_THAWED) freeze_task(task); - spin_unlock_irq(&freezer->lock); out: rcu_read_unlock(); @@ -352,7 +363,7 @@ struct cgroup_subsys freezer_subsys = { .create = freezer_create, .destroy = freezer_destroy, .subsys_id = freezer_subsys_id, - .can_attach = freezer_can_attach, + .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, -- cgit v1.2.3 From b4d18311d37b0b1b370a1ef3e4de92b97930f0a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:14 -0700 Subject: cgroup_freezer: prepare update_if_frozen() for locking change Locking will change such that migration can happen while freezer_read/write() is in progress. This means that update_if_frozen() can no longer assume that all tasks in the cgroup coform to the current freezer state - newly migrated tasks which haven't finished freezer_attach() yet might be in any state. This patch updates update_if_frozen() such that it no longer verifies task states against freezer state. It now simply decides whether FREEZING stage is complete. This removal of verification makes it meaningless to call from freezer_change_state(). Drop it and move the fast exit test from freezer_read() - the only left caller - to update_if_frozen(). Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 43 +++++++++++++++++-------------------------- 1 file changed, 17 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 0b0e10545ef..3d45503a21a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -213,41 +213,39 @@ out: } /* - * caller must hold freezer->lock + * We change from FREEZING to FROZEN lazily if the cgroup was only + * partially frozen when we exitted write. Caller must hold freezer->lock. + * + * Task states and freezer state might disagree while tasks are being + * migrated into @cgroup, so we can't verify task states against @freezer + * state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, - struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { struct cgroup_iter it; struct task_struct *task; - unsigned int nfrozen = 0, ntotal = 0; - enum freezer_state old_state = freezer->state; + + if (freezer->state != CGROUP_FREEZING) + return; cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { if (freezing(task)) { - ntotal++; /* * freezer_should_skip() indicates that the task * should be skipped when determining freezing * completion. Consider it frozen in addition to * the usual frozen condition. */ - if (frozen(task) || task_is_stopped_or_traced(task) || - freezer_should_skip(task)) - nfrozen++; + if (!frozen(task) && !task_is_stopped_or_traced(task) && + !freezer_should_skip(task)) + goto notyet; } } - if (old_state == CGROUP_THAWED) { - BUG_ON(nfrozen > 0); - } else if (old_state == CGROUP_FREEZING) { - if (nfrozen == ntotal) - freezer->state = CGROUP_FROZEN; - } else { /* old_state == CGROUP_FROZEN */ - BUG_ON(nfrozen != ntotal); - } - + freezer->state = CGROUP_FROZEN; +notyet: cgroup_iter_end(cgroup, &it); } @@ -262,13 +260,8 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); + update_if_frozen(cgroup, freezer); state = freezer->state; - if (state == CGROUP_FREEZING) { - /* We change from FREEZING to FROZEN lazily if the cgroup was - * only partially frozen when we exitted write. */ - update_if_frozen(cgroup, freezer); - state = freezer->state; - } spin_unlock_irq(&freezer->lock); cgroup_unlock(); @@ -306,8 +299,6 @@ static void freezer_change_state(struct cgroup *cgroup, spin_lock_irq(&freezer->lock); - update_if_frozen(cgroup, freezer); - switch (goal_state) { case CGROUP_THAWED: if (freezer->state != CGROUP_THAWED) -- cgit v1.2.3 From ead5c473712eb26db792b18a4dc98fdb312883fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 16 Oct 2012 15:03:15 -0700 Subject: cgroup_freezer: don't use cgroup_lock_live_group() freezer_read/write() used cgroup_lock_live_group() to synchronize against task migration into and out of the target cgroup. cgroup_lock_live_group() grabs the internal cgroup lock and using it from outside cgroup core leads to complex and fragile locking dependency issues which are difficult to resolve. Now that freezer_can_attach() is replaced with freezer_attach() and update_if_frozen() updated, nothing requires excluding migration against freezer state reads and changes. This patch removes cgroup_lock_live_group() and the matching cgroup_unlock() usages. The prone-to-bitrot, already outdated and unnecessary global lock hierarchy documentation is replaced with documentation in local scope. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: Rafael J. Wysocki Cc: Li Zefan --- kernel/cgroup_freezer.c | 66 ++++++++----------------------------------------- 1 file changed, 10 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 3d45503a21a..8a92b0e5209 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -84,50 +84,6 @@ static const char *freezer_state_strs[] = { struct cgroup_subsys freezer_subsys; -/* Locks taken and their ordering - * ------------------------------ - * cgroup_mutex (AKA cgroup_lock) - * freezer->lock - * css_set_lock - * task->alloc_lock (AKA task_lock) - * task->sighand->siglock - * - * cgroup code forces css_set_lock to be taken before task->alloc_lock - * - * freezer_create(), freezer_destroy(): - * cgroup_mutex [ by cgroup core ] - * - * freezer_can_attach(): - * cgroup_mutex (held by caller of can_attach) - * - * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): - * freezer->lock - * sighand->siglock (if the cgroup is freezing) - * - * freezer_read(): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * - * freezer_write() (freeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * sighand->siglock (fake signal delivery inside freeze_task()) - * - * freezer_write() (unfreeze): - * cgroup_mutex - * freezer->lock - * write_lock css_set_lock (cgroup iterator start) - * task->alloc_lock - * read_lock css_set_lock (cgroup iterator start) - * task->alloc_lock (inside __thaw_task(), prevents race with refrigerator()) - * sighand->siglock - */ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) { struct freezer *freezer; @@ -151,9 +107,13 @@ static void freezer_destroy(struct cgroup *cgroup) } /* - * The call to cgroup_lock() in the freezer.state write method prevents - * a write to that file racing against an attach, and hence we don't need - * to worry about racing against migration. + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. */ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { @@ -217,8 +177,8 @@ out: * partially frozen when we exitted write. Caller must hold freezer->lock. * * Task states and freezer state might disagree while tasks are being - * migrated into @cgroup, so we can't verify task states against @freezer - * state here. See freezer_attach() for details. + * migrated into or out of @cgroup, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. */ static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) { @@ -255,15 +215,11 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct freezer *freezer; enum freezer_state state; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; - freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); update_if_frozen(cgroup, freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); - cgroup_unlock(); seq_puts(m, freezer_state_strs[state]); seq_putc(m, '\n'); @@ -297,6 +253,7 @@ static void freezer_change_state(struct cgroup *cgroup, { struct freezer *freezer = cgroup_freezer(cgroup); + /* also synchronizes against task migration, see freezer_attach() */ spin_lock_irq(&freezer->lock); switch (goal_state) { @@ -332,10 +289,7 @@ static int freezer_write(struct cgroup *cgroup, else return -EINVAL; - if (!cgroup_lock_live_group(cgroup)) - return -ENODEV; freezer_change_state(cgroup, goal_state); - cgroup_unlock(); return 0; } -- cgit v1.2.3 From 5d8f72b55c275677865de670fa147ed318191d81 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 26 Oct 2012 19:46:06 +0200 Subject: freezer: change ptrace_stop/do_signal_stop to use freezable_schedule() try_to_freeze_tasks() and cgroup_freezer rely on scheduler locks to ensure that a task doing STOPPED/TRACED -> RUNNING transition can't escape freezing. This mostly works, but ptrace_stop() does not necessarily call schedule(), it can change task->state back to RUNNING and check freezing() without any lock/barrier in between. We could add the necessary barrier, but this patch changes ptrace_stop() and do_signal_stop() to use freezable_schedule(). This fixes the race, freezer_count() and freezer_should_skip() carefully avoid the race. And this simplifies the code, try_to_freeze_tasks/update_if_frozen no longer need to use task_is_stopped_or_traced() checks with the non trivial assumptions. We can rely on the mechanism which was specially designed to mark the sleeping task as "frozen enough". v2: As Tejun pointed out, we can also change get_signal_to_deliver() and move try_to_freeze() up before 'relock' label. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/cgroup_freezer.c | 3 +-- kernel/freezer.c | 11 ++--------- kernel/power/process.c | 13 +------------ kernel/signal.c | 20 ++++++-------------- 4 files changed, 10 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 8a92b0e5209..bedefd9a22d 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -198,8 +198,7 @@ static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) * completion. Consider it frozen in addition to * the usual frozen condition. */ - if (!frozen(task) && !task_is_stopped_or_traced(task) && - !freezer_should_skip(task)) + if (!frozen(task) && !freezer_should_skip(task)) goto notyet; } } diff --git a/kernel/freezer.c b/kernel/freezer.c index 11f82a4d4ea..c38893b0efb 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -116,17 +116,10 @@ bool freeze_task(struct task_struct *p) return false; } - if (!(p->flags & PF_KTHREAD)) { + if (!(p->flags & PF_KTHREAD)) fake_signal_wake_up(p); - /* - * fake_signal_wake_up() goes through p's scheduler - * lock and guarantees that TASK_STOPPED/TRACED -> - * TASK_RUNNING transition can't race with task state - * testing in try_to_freeze_tasks(). - */ - } else { + else wake_up_state(p, TASK_INTERRUPTIBLE); - } spin_unlock_irqrestore(&freezer_lock, flags); return true; diff --git a/kernel/power/process.c b/kernel/power/process.c index 87da817f9e1..d5a258b60c6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -48,18 +48,7 @@ static int try_to_freeze_tasks(bool user_only) if (p == current || !freeze_task(p)) continue; - /* - * Now that we've done set_freeze_flag, don't - * perturb a task in TASK_STOPPED or TASK_TRACED. - * It is "frozen enough". If the task does wake - * up, it will immediately call try_to_freeze. - * - * Because freeze_task() goes through p's scheduler lock, it's - * guaranteed that TASK_STOPPED/TRACED -> TASK_RUNNING - * transition can't race with task state testing here. - */ - if (!task_is_stopped_or_traced(p) && - !freezer_should_skip(p)) + if (!freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 0af8868525d..5ffb5626e07 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1908,7 +1908,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) preempt_disable(); read_unlock(&tasklist_lock); preempt_enable_no_resched(); - schedule(); + freezable_schedule(); } else { /* * By the time we got the lock, our tracer went away. @@ -1929,13 +1929,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) read_unlock(&tasklist_lock); } - /* - * While in TASK_TRACED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. - */ - try_to_freeze(); - /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with @@ -2092,7 +2085,7 @@ static bool do_signal_stop(int signr) } /* Now we don't run again until woken by SIGCONT or SIGKILL */ - schedule(); + freezable_schedule(); return true; } else { /* @@ -2200,15 +2193,14 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, if (unlikely(uprobe_deny_signal())) return 0; -relock: /* - * We'll jump back here after any time we were stopped in TASK_STOPPED. - * While in TASK_STOPPED, we were considered "frozen enough". - * Now that we woke up, it's crucial if we're supposed to be - * frozen that we freeze now before running anything substantial. + * Do this once, we can't return to user-mode if freezing() == T. + * do_signal_stop() and ptrace_stop() do freezable_schedule() and + * thus do not need another check after return. */ try_to_freeze(); +relock: spin_lock_irq(&sighand->siglock); /* * Every stopped thread goes here after wakeup. Check to see if -- cgit v1.2.3 From ed95779340b50e362245c81b5dec0d11a1debfa8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill cgroup_subsys->__DEPRECATED_clear_css_refs 2ef37d3fe4 ("memcg: Simplify mem_cgroup_force_empty_list error handling") removed the last user of __DEPRECATED_clear_css_refs. This patch removes __DEPRECATED_clear_css_refs and mechanisms to support it. * Conditionals dependent on __DEPRECATED_clear_css_refs removed. * cgroup_clear_css_refs() can no longer fail. All that needs to be done are deactivating refcnts, setting CSS_REMOVED and putting the base reference on each css. Remove cgroup_clear_css_refs() and the failure path, and open-code the loops into cgroup_rmdir(). This patch keeps the two for_each_subsys() loops separate while open coding them. They can be merged now but there are scheduled changes which need them to be separate, so keep them separate to reduce the amount of churn. local_irq_save/restore() from cgroup_clear_css_refs() are replaced with local_irq_disable/enable() for simplicity. This is safe as cgroup_rmdir() is always called with IRQ enabled. Note that this IRQ switching is necessary to ensure that css_tryget() isn't called from IRQ context on the same CPU while lower context is between CSS deactivation and setting CSS_REMOVED as css_tryget() would hang forever in such cases waiting for CSS to be re-activated or CSS_REMOVED set. This will go away soon. v2: cgroup_call_pre_destroy() removal dropped per Michal. Commit message updated to explain local_irq_disable/enable() conversion. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 131 ++++++++++++++++---------------------------------------- 1 file changed, 36 insertions(+), 95 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 79818507e44..8c605e2bdd1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -865,11 +865,8 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp) continue; ret = ss->pre_destroy(cgrp); - if (ret) { - /* ->pre_destroy() failure is being deprecated */ - WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs); + if (WARN_ON_ONCE(ret)) break; - } } return ret; @@ -3901,14 +3898,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, cgrp->subsys[ss->subsys_id] = css; /* - * If !clear_css_refs, css holds an extra ref to @cgrp->dentry - * which is put on the last css_put(). dput() requires process - * context, which css_put() may be called without. @css->dput_work - * will be used to invoke dput() asynchronously from css_put(). + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->dput_work will be used to invoke + * dput() asynchronously from css_put(). */ INIT_WORK(&css->dput_work, css_dput_fn); - if (ss->__DEPRECATED_clear_css_refs) - set_bit(CSS_CLEAR_CSS_REFS, &css->flags); } /* @@ -3978,10 +3973,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* If !clear_css_refs, each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ for_each_subsys(root, ss) - if (!ss->__DEPRECATED_clear_css_refs) - dget(dentry); + dget(dentry); /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4066,71 +4060,6 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -/* - * Atomically mark all (or else none) of the cgroup's CSS objects as - * CSS_REMOVED. Return true on success, or false if the cgroup has - * busy subsystems. Call with cgroup_mutex held - * - * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or - * not, cgroup removal behaves differently. - * - * If clear is set, css refcnt for the subsystem should be zero before - * cgroup removal can be committed. This is implemented by - * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be - * called multiple times until all css refcnts reach zero and is allowed to - * veto removal on any invocation. This behavior is deprecated and will be - * removed as soon as the existing user (memcg) is updated. - * - * If clear is not set, each css holds an extra reference to the cgroup's - * dentry and cgroup removal proceeds regardless of css refs. - * ->pre_destroy() will be called at least once and is not allowed to fail. - * On the last put of each css, whenever that may be, the extra dentry ref - * is put so that dentry destruction happens only after all css's are - * released. - */ -static int cgroup_clear_css_refs(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - unsigned long flags; - bool failed = false; - - local_irq_save(flags); - - /* - * Block new css_tryget() by deactivating refcnt. If all refcnts - * for subsystems w/ clear_css_refs set were 1 at the moment of - * deactivation, we succeeded. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - WARN_ON(atomic_read(&css->refcnt) < 0); - atomic_add(CSS_DEACT_BIAS, &css->refcnt); - - if (ss->__DEPRECATED_clear_css_refs) - failed |= css_refcnt(css) != 1; - } - - /* - * If succeeded, set REMOVED and put all the base refs; otherwise, - * restore refcnts to positive values. Either way, all in-progress - * css_tryget() will be released. - */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - if (!failed) { - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } else { - atomic_sub(CSS_DEACT_BIAS, &css->refcnt); - } - } - - local_irq_restore(flags); - return !failed; -} - static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -4138,10 +4067,10 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup *parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; + struct cgroup_subsys *ss; int ret; /* the vfs holds both inode->i_mutex already */ -again: mutex_lock(&cgroup_mutex); if (atomic_read(&cgrp->count) != 0) { mutex_unlock(&cgroup_mutex); @@ -4182,21 +4111,34 @@ again: return -EBUSY; } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { - mutex_unlock(&cgroup_mutex); - /* - * Because someone may call cgroup_wakeup_rmdir_waiter() before - * prepare_to_wait(), we need to check this flag. - */ - if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) - schedule(); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - if (signal_pending(current)) - return -EINTR; - goto again; + + local_irq_disable(); + + /* block new css_tryget() by deactivating refcnt */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + WARN_ON(atomic_read(&css->refcnt) < 0); + atomic_add(CSS_DEACT_BIAS, &css->refcnt); + } + + /* + * Set REMOVED. All in-progress css_tryget() will be released. + * Put all the base refs. Each css holds an extra reference to the + * cgroup's dentry and cgroup removal proceeds regardless of css + * refs. On the last put of each css, whenever that may be, the + * extra dentry ref is put so that dentry destruction happens only + * after all css's are released. + */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + set_bit(CSS_REMOVED, &css->flags); + css_put(css); } - /* NO css_tryget() can success after here. */ + + local_irq_enable(); + finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); @@ -4949,8 +4891,7 @@ void __css_put(struct cgroup_subsys_state *css) cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: - if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags)) - schedule_work(&css->dput_work); + schedule_work(&css->dput_work); break; } rcu_read_unlock(); -- cgit v1.2.3 From e93160803ffda2e67d9ff9cacb63bb6868c8398f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:58 -0800 Subject: cgroup: kill CSS_REMOVED CSS_REMOVED is one of the several contortions which were necessary to support css reference draining on cgroup removal. All css->refcnts which need draining should be deactivated and verified to equal zero atomically w.r.t. css_tryget(). If any one isn't zero, all refcnts needed to be re-activated and css_tryget() shouldn't fail in the process. This was achieved by letting css_tryget() busy-loop until either the refcnt is reactivated (failed removal attempt) or CSS_REMOVED is set (committing to removal). Now that css refcnt draining is no longer used, there's no need for atomic rollback mechanism. css_tryget() simply can look at the reference count and fail if it's deactivated - it's never getting re-activated. This patch removes CSS_REMOVED and updates __css_tryget() to fail if the refcnt is deactivated. As deactivation and removal are a single step now, they no longer need to be protected against css_tryget() happening from irq context. Remove local_irq_disable/enable() from cgroup_rmdir(). Note that this removes css_is_removed() whose only user is VM_BUG_ON() in memcontrol.c. We can replace it with a check on the refcnt but given that the only use case is a debug assert, I think it's better to simply unexport it. v2: Comment updated and explanation on local_irq_disable/enable() added per Michal Hocko. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Johannes Weiner Cc: Balbir Singh --- kernel/cgroup.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c605e2bdd1..c194f9e4fc7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -170,8 +170,8 @@ struct css_id { * The css to which this ID points. This pointer is set to valid value * after cgroup is populated. If cgroup is removed, this will be NULL. * This pointer is expected to be RCU-safe because destroy() - * is called after synchronize_rcu(). But for safe use, css_is_removed() - * css_tryget() should be used for avoiding race. + * is called after synchronize_rcu(). But for safe use, css_tryget() + * should be used for avoiding race. */ struct cgroup_subsys_state __rcu *css; /* @@ -4112,8 +4112,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - local_irq_disable(); - /* block new css_tryget() by deactivating refcnt */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; @@ -4123,21 +4121,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } /* - * Set REMOVED. All in-progress css_tryget() will be released. * Put all the base refs. Each css holds an extra reference to the * cgroup's dentry and cgroup removal proceeds regardless of css * refs. On the last put of each css, whenever that may be, the * extra dentry ref is put so that dentry destruction happens only * after all css's are released. */ - for_each_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; - - set_bit(CSS_REMOVED, &css->flags); - css_put(css); - } - - local_irq_enable(); + for_each_subsys(cgrp->root, ss) + css_put(cgrp->subsys[ss->subsys_id]); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); @@ -4861,15 +4852,17 @@ static void check_for_release(struct cgroup *cgrp) /* Caller must verify that the css is not for root cgroup */ bool __css_tryget(struct cgroup_subsys_state *css) { - do { - int v = css_refcnt(css); + while (true) { + int t, v; - if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v) + v = css_refcnt(css); + t = atomic_cmpxchg(&css->refcnt, v, v + 1); + if (likely(t == v)) return true; + else if (t < 0) + return false; cpu_relax(); - } while (!test_bit(CSS_REMOVED, &css->flags)); - - return false; + } } EXPORT_SYMBOL_GPL(__css_tryget); -- cgit v1.2.3 From 976c06bcccc50573997609fa7ec842479bd96ffb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: use cgroup_lock_live_group(parent) in cgroup_create() This patch makes cgroup_create() fail if @parent is marked removed. This is to prepare for further updates to cgroup_rmdir() path. Note that this change isn't strictly necessary. cgroup can only be created via mkdir and the removed marking and dentry removal happen without releasing cgroup_mutex, so cgroup_create() can never race with cgroup_rmdir(). Even after the scheduled updates to cgroup_rmdir(), cgroup_mkdir() and cgroup_rmdir() are synchronized by i_mutex rendering the added liveliness check unnecessary. Do it anyway such that locking is contained inside cgroup proper and we don't get nasty surprises if we ever grow another caller of cgroup_create(). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c194f9e4fc7..f22e3cd1978 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3927,6 +3927,18 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + /* + * Only live parents can have children. Note that the liveliness + * check isn't strictly necessary because cgroup_mkdir() and + * cgroup_rmdir() are fully synchronized by i_mutex; however, do it + * anyway so that locking is contained inside cgroup proper and we + * don't get nasty surprises if we ever grow another caller. + */ + if (!cgroup_lock_live_group(parent)) { + err = -ENODEV; + goto err_free; + } + /* Grab a reference on the superblock so the hierarchy doesn't * get deleted on unmount if there are child cgroups. This * can be done outside cgroup_mutex, since the sb can't @@ -3934,8 +3946,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, * fs */ atomic_inc(&sb->s_active); - mutex_lock(&cgroup_mutex); - init_cgroup_housekeeping(cgrp); cgrp->parent = parent; @@ -4006,7 +4016,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* Release the reference count that we took on the superblock */ deactivate_super(sb); - +err_free: kfree(cgrp); return err; } -- cgit v1.2.3 From 1a90dd508b0b00e382fd61a46f55dc889ac21b39 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: deactivate CSS's and mark cgroup dead before invoking ->pre_destroy() Because ->pre_destroy() could fail and can't be called under cgroup_mutex, cgroup destruction did something very ugly. 1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise. 2. Release cgroup_mutex and call ->pre_destroy(). 3. Re-grab cgroup_mutex and verify it can still be destroyed; fail otherwise. 4. Continue destroying. In addition to being ugly, it has been always broken in various ways. For example, memcg ->pre_destroy() expects the cgroup to be inactive after it's done but tasks can be attached and detached between #2 and #3 and the conditions that memcg verified in ->pre_destroy() might no longer hold by the time control reaches #3. Now that ->pre_destroy() is no longer allowed to fail. We can switch to the following. 1. Grab cgroup_mutex and verify it can be destroyed; fail otherwise. 2. Deactivate CSS's and mark the cgroup removed thus preventing any further operations which can invalidate the verification from #1. 3. Release cgroup_mutex and call ->pre_destroy(). 4. Re-grab cgroup_mutex and continue destroying. After this change, controllers can safely assume that ->pre_destroy() will only be called only once for a given cgroup and, once ->pre_destroy() is called, the cgroup will stay dormant till it's destroyed. This removes the only reason ->pre_destroy() can fail - new task being attached or child cgroup being created inbetween. Error out path is removed and ->pre_destroy() invocation is open coded in cgroup_rmdir(). v2: cgroup_call_pre_destroy() removal moved to this patch per Michal. Commit message updated per Glauber. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 65 +++++++++++++++++---------------------------------------- 1 file changed, 19 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f22e3cd1978..66204a6f68f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -851,27 +851,6 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } -/* - * Call subsys's pre_destroy handler. - * This is called before css refcnt check. - */ -static int cgroup_call_pre_destroy(struct cgroup *cgrp) -{ - struct cgroup_subsys *ss; - int ret = 0; - - for_each_subsys(cgrp->root, ss) { - if (!ss->pre_destroy) - continue; - - ret = ss->pre_destroy(cgrp); - if (WARN_ON_ONCE(ret)) - break; - } - - return ret; -} - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -4078,19 +4057,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - int ret; - - /* the vfs holds both inode->i_mutex already */ - mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - if (!list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); - return -EBUSY; - } - mutex_unlock(&cgroup_mutex); /* * In general, subsystem has no css->refcnt after pre_destroy(). But @@ -4103,16 +4069,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) */ set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - /* - * Call pre_destroy handlers of subsys. Notify subsystems - * that rmdir() request comes. - */ - ret = cgroup_call_pre_destroy(cgrp); - if (ret) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - return ret; - } - + /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { @@ -4122,13 +4079,30 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - /* block new css_tryget() by deactivating refcnt */ + /* + * Block new css_tryget() by deactivating refcnt and mark @cgrp + * removed. This makes future css_tryget() and child creation + * attempts fail thus maintaining the removal conditions verified + * above. + */ for_each_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; WARN_ON(atomic_read(&css->refcnt) < 0); atomic_add(CSS_DEACT_BIAS, &css->refcnt); } + set_bit(CGRP_REMOVED, &cgrp->flags); + + /* + * Tell subsystems to initate destruction. pre_destroy() should be + * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix + * potential deadlock in pre_destroy") for details. + */ + mutex_unlock(&cgroup_mutex); + for_each_subsys(cgrp->root, ss) + if (ss->pre_destroy) + WARN_ON_ONCE(ss->pre_destroy(cgrp)); + mutex_lock(&cgroup_mutex); /* * Put all the base refs. Each css holds an extra reference to the @@ -4144,7 +4118,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); raw_spin_lock(&release_list_lock); - set_bit(CGRP_REMOVED, &cgrp->flags); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); -- cgit v1.2.3 From b25ed609d0eecf077db607e88ea70bae83b6adb2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: remove CGRP_WAIT_ON_RMDIR, cgroup_exclude_rmdir() and cgroup_release_and_wakeup_rmdir() CGRP_WAIT_ON_RMDIR is another kludge which was added to make cgroup destruction rollback somewhat working. cgroup_rmdir() used to drain CSS references and CGRP_WAIT_ON_RMDIR and the associated waitqueue and helpers were used to allow the task performing rmdir to wait for the next relevant event. Unfortunately, the wait is visible to controllers too and the mechanism got exposed to memcg by 887032670d ("cgroup avoid permanent sleep at rmdir"). Now that the draining and retries are gone, CGRP_WAIT_ON_RMDIR is unnecessary. Remove it and all the mechanisms supporting it. Note that memcontrol.c changes are essentially revert of 887032670d ("cgroup avoid permanent sleep at rmdir"). Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh --- kernel/cgroup.c | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66204a6f68f..c5f6fb28dd0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,33 +965,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) remove_dir(dentry); } -/* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - /* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function @@ -1963,12 +1936,6 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) } synchronize_rcu(); - - /* - * wake up rmdir() waiter. the rmdir should fail since the cgroup - * is no longer empty. - */ - cgroup_wakeup_rmdir_waiter(cgrp); out: if (retval) { for_each_subsys(root, ss) { @@ -2138,7 +2105,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * step 5: success! and cleanup */ synchronize_rcu(); - cgroup_wakeup_rmdir_waiter(cgrp); retval = 0; out_put_css_set_refs: if (retval) { @@ -4058,26 +4024,13 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - /* - * In general, subsystem has no css->refcnt after pre_destroy(). But - * in racy cases, subsystem may have to get css->refcnt after - * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes - * make rmdir return -EBUSY too often. To avoid that, we use waitqueue - * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir - * and subsystem's reference count handling. Please see css_get/put - * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - /* the vfs holds both inode->i_mutex already */ mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); /* * Block new css_tryget() by deactivating refcnt and mark @cgrp @@ -4114,9 +4067,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) for_each_subsys(cgrp->root, ss) css_put(cgrp->subsys[ss->subsys_id]); - finish_wait(&cgroup_rmdir_waitq, &wait); - clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); - raw_spin_lock(&release_list_lock); if (!list_empty(&cgrp->release_list)) list_del_init(&cgrp->release_list); @@ -4864,7 +4814,6 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiter(cgrp); break; case 0: schedule_work(&css->dput_work); -- cgit v1.2.3 From bcf6de1b9129531215d26dd9af8331e84973bc52 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Nov 2012 09:16:59 -0800 Subject: cgroup: make ->pre_destroy() return void All ->pre_destory() implementations return 0 now, which is the only allowed return value. Make it return void. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Balbir Singh Cc: Vivek Goyal --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c5f6fb28dd0..83cd7d041c6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4054,7 +4054,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&cgroup_mutex); for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) - WARN_ON_ONCE(ss->pre_destroy(cgrp)); + ss->pre_destroy(cgrp); mutex_lock(&cgroup_mutex); /* -- cgit v1.2.3 From 316eb661f125397d46f16f94e3c81ad3dc4c1233 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 8 Nov 2012 21:36:38 +0800 Subject: cgroup: set 'start' with the right value in cgroup_path. 'start' is set to buf + buflen and do the '--' immediately. Just set it to 'buf + buflen - 1' directly. Signed-off-by: Tao Ma Signed-off-by: Tejun Heo Cc: Li Zefan --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3070164e203..998ab5957c6 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1770,9 +1770,9 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) return 0; } - start = buf + buflen; + start = buf + buflen - 1; - *--start = '\0'; + *start = '\0'; for (;;) { int len = dentry->d_name.len; -- cgit v1.2.3 From a8638030f668884720b8f4456448d0ce33952b05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: add cgroup_subsys->post_create() Currently, there's no way for a controller to find out whether a new cgroup finished all ->create() allocatinos successfully and is considered "live" by cgroup. This becomes a problem later when we add generic descendants walking to cgroup which can be used by controllers as controllers don't have a synchronization point where it can synchronize against new cgroups appearing in such walks. This patch adds ->post_create(). It's called after all ->create() succeeded and the cgroup is linked into the generic cgroup hierarchy. This plays the counterpart of ->pre_destroy(). When used in combination with the to-be-added generic descendant iterators, ->post_create() can be used to implement reliable state inheritance. It will be explained with the descendant iterators. v2: Added a paragraph about its future use w/ descendant iterators per Michal. v3: Forgot to add ->post_create() invocation to cgroup_load_subsys(). Fixed. Signed-off-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 998ab5957c6..26f2edbaf4f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4059,10 +4059,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; - /* each css holds a ref to the cgroup's dentry */ - for_each_subsys(root, ss) + for_each_subsys(root, ss) { + /* each css holds a ref to the cgroup's dentry */ dget(dentry); + /* creation succeeded, notify subsystems */ + if (ss->post_create) + ss->post_create(cgrp); + } + /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -4280,6 +4285,9 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) ss->active = 1; + if (ss->post_create) + ss->post_create(&ss->root->top_cgroup); + /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ BUG_ON(ss->module); @@ -4389,6 +4397,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->active = 1; + if (ss->post_create) + ss->post_create(&ss->root->top_cgroup); + /* success! */ mutex_unlock(&cgroup_mutex); return 0; -- cgit v1.2.3 From eb6fd5040ee799009173daa49c3e7aa0362167c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: use rculist ops for cgroup->children Use RCU safe list operations for cgroup->children. This will be used to implement cgroup children / descendant walking which can be used by controllers. Note that cgroup_create() now puts a new cgroup at the end of the ->children list instead of head. This isn't strictly necessary but is done so that the iteration order is more conventional. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 26f2edbaf4f..2ed5968a04e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1650,7 +1650,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); - BUG_ON(!list_empty(&root_cgrp->sibling)); BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); @@ -1699,7 +1698,6 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); - BUG_ON(!list_empty(&cgrp->sibling)); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -4052,7 +4050,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - list_add(&cgrp->sibling, &cgrp->parent->children); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; err = cgroup_create_dir(cgrp, dentry, mode); @@ -4083,7 +4081,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_remove: - list_del(&cgrp->sibling); + list_del_rcu(&cgrp->sibling); root->number_of_cgroups--; err_destroy: @@ -4209,7 +4207,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) raw_spin_unlock(&release_list_lock); /* delete this cgroup from parent->children */ - list_del_init(&cgrp->sibling); + list_del_rcu(&cgrp->sibling); list_del_init(&cgrp->allcg_node); -- cgit v1.2.3 From 574bd9f7c7c1d32f52dea5488018a6ff79031e22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup: implement generic child / descendant walk macros Currently, cgroup doesn't provide any generic helper for walking a given cgroup's children or descendants. This patch adds the following three macros. * cgroup_for_each_child() - walk immediate children of a cgroup. * cgroup_for_each_descendant_pre() - visit all descendants of a cgroup in pre-order tree traversal. * cgroup_for_each_descendant_post() - visit all descendants of a cgroup in post-order tree traversal. All three only require the user to hold RCU read lock during traversal. Verifying that each iterated cgroup is online is the responsibility of the user. When used with proper synchronization, cgroup_for_each_descendant_pre() can be used to propagate state updates to descendants in reliable way. See comments for details. v2: s/config/state/ in commit message and comments per Michal. More documentation on synchronization rules. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Acked-by: Li Zefan --- kernel/cgroup.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2ed5968a04e..0f8fa6aa371 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2984,6 +2984,92 @@ static void cgroup_enable_task_cg_lists(void) write_unlock(&css_set_lock); } +/** + * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_pre(). Find the next + * descendant to visit for pre-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, pretend we just visited @cgroup */ + if (!pos) { + if (list_empty(&cgroup->children)) + return NULL; + pos = cgroup; + } + + /* visit the first child if exists */ + next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + if (next) + return next; + + /* no child, visit my or the closest ancestor's next sibling */ + do { + next = list_entry_rcu(pos->sibling.next, struct cgroup, + sibling); + if (&next->sibling != &pos->parent->children) + return next; + + pos = pos->parent; + } while (pos != cgroup); + + return NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); + +static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +{ + struct cgroup *last; + + do { + last = pos; + pos = list_first_or_null_rcu(&pos->children, struct cgroup, + sibling); + } while (pos); + + return last; +} + +/** + * cgroup_next_descendant_post - find the next descendant for post-order walk + * @pos: the current position (%NULL to initiate traversal) + * @cgroup: cgroup whose descendants to walk + * + * To be used by cgroup_for_each_descendant_post(). Find the next + * descendant to visit for post-order traversal of @cgroup's descendants. + */ +struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, + struct cgroup *cgroup) +{ + struct cgroup *next; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + /* if first iteration, visit the leftmost descendant */ + if (!pos) { + next = cgroup_leftmost_descendant(cgroup); + return next != cgroup ? next : NULL; + } + + /* if there's an unvisited sibling, visit its leftmost descendant */ + next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); + if (&next->sibling != &pos->parent->children) + return cgroup_leftmost_descendant(next); + + /* no sibling left, visit parent */ + next = pos->parent; + return next != cgroup ? next : NULL; +} +EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit v1.2.3 From bcd66c894ac994ef5d6fbbaa9a24506ffaf64fd4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:29 -0800 Subject: cgroup_freezer: trivial cleanups * Clean-up indentation and line-breaks. Drop the invalid comment about freezer->lock. * Make all internal functions take @freezer instead of both @cgroup and @freezer. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index bedefd9a22d..975b3d8732f 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -29,17 +29,15 @@ enum freezer_state { }; struct freezer { - struct cgroup_subsys_state css; - enum freezer_state state; - spinlock_t lock; /* protects _writes_ to state */ + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; }; -static inline struct freezer *cgroup_freezer( - struct cgroup *cgroup) +static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of( - cgroup_subsys_state(cgroup, freezer_subsys_id), - struct freezer, css); + return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) @@ -180,8 +178,9 @@ out: * migrated into or out of @cgroup, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup, struct freezer *freezer) +static void update_if_frozen(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -211,12 +210,11 @@ notyet: static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { - struct freezer *freezer; + struct freezer *freezer = cgroup_freezer(cgroup); enum freezer_state state; - freezer = cgroup_freezer(cgroup); spin_lock_irq(&freezer->lock); - update_if_frozen(cgroup, freezer); + update_if_frozen(freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); @@ -225,8 +223,9 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, return 0; } -static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void freeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -236,8 +235,9 @@ static void freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +static void unfreeze_cgroup(struct freezer *freezer) { + struct cgroup *cgroup = freezer->css.cgroup; struct cgroup_iter it; struct task_struct *task; @@ -247,11 +247,9 @@ static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void freezer_change_state(struct cgroup *cgroup, +static void freezer_change_state(struct freezer *freezer, enum freezer_state goal_state) { - struct freezer *freezer = cgroup_freezer(cgroup); - /* also synchronizes against task migration, see freezer_attach() */ spin_lock_irq(&freezer->lock); @@ -260,13 +258,13 @@ static void freezer_change_state(struct cgroup *cgroup, if (freezer->state != CGROUP_THAWED) atomic_dec(&system_freezing_cnt); freezer->state = CGROUP_THAWED; - unfreeze_cgroup(cgroup, freezer); + unfreeze_cgroup(freezer); break; case CGROUP_FROZEN: if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; - freeze_cgroup(cgroup, freezer); + freeze_cgroup(freezer); break; default: BUG(); @@ -275,8 +273,7 @@ static void freezer_change_state(struct cgroup *cgroup, spin_unlock_irq(&freezer->lock); } -static int freezer_write(struct cgroup *cgroup, - struct cftype *cft, +static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { enum freezer_state goal_state; @@ -288,7 +285,7 @@ static int freezer_write(struct cgroup *cgroup, else return -EINVAL; - freezer_change_state(cgroup, goal_state); + freezer_change_state(cgroup_freezer(cgroup), goal_state); return 0; } -- cgit v1.2.3 From 04a4ec32571e88c614b229824afec376d24cc035 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: prepare freezer_change_state() for full hierarchy support * Make freezer_change_state() take bool @freeze instead of enum freezer_state. * Separate out freezer_apply_state() out of freezer_change_state(). This makes freezer_change_state() a rather silly thin wrapper. It will be filled with hierarchy handling later on. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 975b3d8732f..2690830e742 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -247,45 +247,57 @@ static void unfreeze_cgroup(struct freezer *freezer) cgroup_iter_end(cgroup, &it); } -static void freezer_change_state(struct freezer *freezer, - enum freezer_state goal_state) +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze) { /* also synchronizes against task migration, see freezer_attach() */ - spin_lock_irq(&freezer->lock); + lockdep_assert_held(&freezer->lock); - switch (goal_state) { - case CGROUP_THAWED: - if (freezer->state != CGROUP_THAWED) - atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; - unfreeze_cgroup(freezer); - break; - case CGROUP_FROZEN: + if (freeze) { if (freezer->state == CGROUP_THAWED) atomic_inc(&system_freezing_cnt); freezer->state = CGROUP_FREEZING; freeze_cgroup(freezer); - break; - default: - BUG(); + } else { + if (freezer->state != CGROUP_THAWED) + atomic_dec(&system_freezing_cnt); + freezer->state = CGROUP_THAWED; + unfreeze_cgroup(freezer); } +} +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @cgroup according to @freeze. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + /* update @freezer */ + spin_lock_irq(&freezer->lock); + freezer_apply_state(freezer, freeze); spin_unlock_irq(&freezer->lock); } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, const char *buffer) { - enum freezer_state goal_state; + bool freeze; if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) - goal_state = CGROUP_THAWED; + freeze = false; else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) - goal_state = CGROUP_FROZEN; + freeze = true; else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), goal_state); + freezer_change_state(cgroup_freezer(cgroup), freeze); return 0; } -- cgit v1.2.3 From d6a2fe134219adf94e6bd0c8f6e2a3163ff68c41 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: make freezer->state mask of flags freezer->state was an enum value - one of THAWED, FREEZING and FROZEN. As the scheduled full hierarchy support requires more than one freezing condition, switch it to mask of flags. If FREEZING is not set, it's thawed. FREEZING is set if freezing or frozen. If frozen, both FREEZING and FROZEN are set. Now that tasks can be attached to an already frozen cgroup, this also makes freezing condition checks more natural. This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 60 ++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2690830e742..e76aa9fb3ef 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,15 +22,14 @@ #include #include -enum freezer_state { - CGROUP_THAWED = 0, - CGROUP_FREEZING, - CGROUP_FROZEN, +enum freezer_state_flags { + CGROUP_FREEZING = (1 << 1), /* this freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ }; struct freezer { struct cgroup_subsys_state css; - enum freezer_state state; + unsigned int state; spinlock_t lock; }; @@ -48,12 +47,10 @@ static inline struct freezer *task_freezer(struct task_struct *task) bool cgroup_freezing(struct task_struct *task) { - enum freezer_state state; bool ret; rcu_read_lock(); - state = task_freezer(task)->state; - ret = state == CGROUP_FREEZING || state == CGROUP_FROZEN; + ret = task_freezer(task)->state & CGROUP_FREEZING; rcu_read_unlock(); return ret; @@ -63,10 +60,13 @@ bool cgroup_freezing(struct task_struct *task) * cgroups_write_string() limits the size of freezer state strings to * CGROUP_LOCAL_BUFFER_SIZE */ -static const char *freezer_state_strs[] = { - "THAWED", - "FREEZING", - "FROZEN", +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; }; /* @@ -91,7 +91,6 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) return ERR_PTR(-ENOMEM); spin_lock_init(&freezer->lock); - freezer->state = CGROUP_THAWED; return &freezer->css; } @@ -99,7 +98,7 @@ static void freezer_destroy(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); kfree(freezer); } @@ -129,15 +128,13 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) * Tasks in @tset are on @new_cgrp but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - * This means that, to determine whether to freeze, one should test - * whether the state equals THAWED. */ cgroup_taskset_for_each(task, new_cgrp, tset) { - if (freezer->state == CGROUP_THAWED) { + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state = CGROUP_FREEZING; + freezer->state &= ~CGROUP_FROZEN; } } @@ -159,11 +156,7 @@ static void freezer_fork(struct task_struct *task) goto out; spin_lock_irq(&freezer->lock); - /* - * @task might have been just migrated into a FROZEN cgroup. Test - * equality with THAWED. Read the comment in freezer_attach(). - */ - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) freeze_task(task); spin_unlock_irq(&freezer->lock); out: @@ -184,7 +177,8 @@ static void update_if_frozen(struct freezer *freezer) struct cgroup_iter it; struct task_struct *task; - if (freezer->state != CGROUP_FREEZING) + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) return; cgroup_iter_start(cgroup, &it); @@ -202,7 +196,7 @@ static void update_if_frozen(struct freezer *freezer) } } - freezer->state = CGROUP_FROZEN; + freezer->state |= CGROUP_FROZEN; notyet: cgroup_iter_end(cgroup, &it); } @@ -211,14 +205,14 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { struct freezer *freezer = cgroup_freezer(cgroup); - enum freezer_state state; + unsigned int state; spin_lock_irq(&freezer->lock); update_if_frozen(freezer); state = freezer->state; spin_unlock_irq(&freezer->lock); - seq_puts(m, freezer_state_strs[state]); + seq_puts(m, freezer_state_strs(state)); seq_putc(m, '\n'); return 0; } @@ -258,14 +252,14 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze) lockdep_assert_held(&freezer->lock); if (freeze) { - if (freezer->state == CGROUP_THAWED) + if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); - freezer->state = CGROUP_FREEZING; + freezer->state |= CGROUP_FREEZING; freeze_cgroup(freezer); } else { - if (freezer->state != CGROUP_THAWED) + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); - freezer->state = CGROUP_THAWED; + freezer->state &= ~(CGROUP_FREEZING | CGROUP_FROZEN); unfreeze_cgroup(freezer); } } @@ -290,9 +284,9 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, { bool freeze; - if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + if (strcmp(buffer, freezer_state_strs(0)) == 0) freeze = false; - else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + else if (strcmp(buffer, freezer_state_strs(CGROUP_FROZEN)) == 0) freeze = true; else return -EINVAL; -- cgit v1.2.3 From a225218060fc8f10ed396c9c8187074697ad044d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: introduce CGROUP_FREEZING_[SELF|PARENT] Introduce FREEZING_SELF and FREEZING_PARENT and make FREEZING OR of the two flags. This is to prepare for full hierarchy support. freezer_apply_date() is updated such that it can handle setting and clearing of both flags. The two flags are also exposed to userland via read-only files self_freezing and parent_freezing. Other than the added cgroupfs files, this patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 55 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e76aa9fb3ef..b8ad93c6f5f 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -23,8 +23,12 @@ #include enum freezer_state_flags { - CGROUP_FREEZING = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, }; struct freezer { @@ -245,8 +249,13 @@ static void unfreeze_cgroup(struct freezer *freezer) * freezer_apply_state - apply state change to a single cgroup_freezer * @freezer: freezer to apply state change to * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. */ -static void freezer_apply_state(struct freezer *freezer, bool freeze) +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) { /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer->lock); @@ -254,13 +263,19 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze) if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); - freezer->state |= CGROUP_FREEZING; + freezer->state |= state; freeze_cgroup(freezer); } else { - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~(CGROUP_FREEZING | CGROUP_FROZEN); - unfreeze_cgroup(freezer); + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } } } @@ -275,7 +290,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { /* update @freezer */ spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze); + freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); spin_unlock_irq(&freezer->lock); } @@ -295,6 +310,20 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, return 0; } +static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + static struct cftype files[] = { { .name = "state", @@ -302,6 +331,16 @@ static struct cftype files[] = { .read_seq_string = freezer_read, .write_string = freezer_write, }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, { } /* terminate */ }; -- cgit v1.2.3 From 5300a9b3482b6d9c32de6d5f4eaeab0fbafa70a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: add ->post_create() and ->pre_destroy() and track online state A cgroup is online and visible to iteration between ->post_create() and ->pre_destroy(). This patch introduces CGROUP_FREEZER_ONLINE and toggles it from the newly added freezer_post_create() and freezer_pre_destroy() while holding freezer->lock such that a cgroup_freezer can be reilably distinguished to be online. This will be used by full hierarchy support. ONLINE test is added to freezer_apply_state() but it currently doesn't make any difference as freezer_write() can only be called for an online cgroup. Adjusting system_freezing_cnt on destruction is moved from freezer_destroy() to the new freezer_pre_destroy() for consistency. This patch doesn't introduce any noticeable behavior change. Signed-off-by: Tejun Heo Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index b8ad93c6f5f..4f12d317c4c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -23,6 +23,7 @@ #include enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ @@ -98,13 +99,45 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) return &freezer->css; } -static void freezer_destroy(struct cgroup *cgroup) +/** + * freezer_post_create - commit creation of a freezer cgroup + * @cgroup: cgroup being created + * + * We're committing to creation of @cgroup. Mark it online. + */ +static void freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + freezer->state |= CGROUP_FREEZER_ONLINE; + spin_unlock_irq(&freezer->lock); +} + +/** + * freezer_pre_destroy - initiate destruction of @cgroup + * @cgroup: cgroup being destroyed + * + * @cgroup is going away. Mark it dead and decrement system_freezing_count + * if it was holding one. + */ +static void freezer_pre_destroy(struct cgroup *cgroup) +{ + struct freezer *freezer = cgroup_freezer(cgroup); + + spin_lock_irq(&freezer->lock); + if (freezer->state & CGROUP_FREEZING) atomic_dec(&system_freezing_cnt); - kfree(freezer); + + freezer->state = 0; + + spin_unlock_irq(&freezer->lock); +} + +static void freezer_destroy(struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); } /* @@ -260,6 +293,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, /* also synchronizes against task migration, see freezer_attach() */ lockdep_assert_held(&freezer->lock); + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + if (freeze) { if (!(freezer->state & CGROUP_FREEZING)) atomic_inc(&system_freezing_cnt); @@ -347,6 +383,8 @@ static struct cftype files[] = { struct cgroup_subsys freezer_subsys = { .name = "freezer", .create = freezer_create, + .post_create = freezer_post_create, + .pre_destroy = freezer_pre_destroy, .destroy = freezer_destroy, .subsys_id = freezer_subsys_id, .attach = freezer_attach, -- cgit v1.2.3 From ef9fe980c6fcc1821ab955b74b242d2d6585fa75 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 9 Nov 2012 09:12:30 -0800 Subject: cgroup_freezer: implement proper hierarchy support Up until now, cgroup_freezer didn't implement hierarchy properly. cgroups could be arranged in hierarchy but it didn't make any difference in how each cgroup_freezer behaved. They all operated separately. This patch implements proper hierarchy support. If a cgroup is frozen, all its descendants are frozen. A cgroup is thawed iff it and all its ancestors are THAWED. freezer.self_freezing shows the current freezing state for the cgroup itself. freezer.parent_freezing shows whether the cgroup is freezing because any of its ancestors is freezing. freezer_post_create() locks the parent and new cgroup and inherits the parent's state and freezer_change_state() applies new state top-down using cgroup_for_each_descendant_pre() which guarantees that no child can escape its parent's state. update_if_frozen() uses cgroup_for_each_descendant_post() to propagate frozen states bottom-up. Synchronization could be coarser and easier by using a single mutex to protect all hierarchy operations. Finer grained approach was used because it wasn't too difficult for cgroup_freezer and I think it's beneficial to have an example implementation and cgroup_freezer is rather simple and can serve a good one. As this makes cgroup_freezer properly hierarchical, freezer_subsys.broken_hierarchy marking is removed. Note that this patch changes userland visible behavior - freezing a cgroup now freezes all its descendants too. This behavior change is intended and has been warned via .broken_hierarchy. v2: Michal spotted a bug in freezer_change_state() - descendants were inheriting from the wrong ancestor. Fixed. v3: Documentation/cgroups/freezer-subsystem.txt updated. Signed-off-by: Tejun Heo Reviewed-by: Michal Hocko --- kernel/cgroup_freezer.c | 161 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 123 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 4f12d317c4c..670a4af7dc9 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -22,6 +22,13 @@ #include #include +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ enum freezer_state_flags { CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ @@ -50,6 +57,15 @@ static inline struct freezer *task_freezer(struct task_struct *task) struct freezer, css); } +static struct freezer *parent_freezer(struct freezer *freezer) +{ + struct cgroup *pcg = freezer->css.cgroup->parent; + + if (pcg) + return cgroup_freezer(pcg); + return NULL; +} + bool cgroup_freezing(struct task_struct *task) { bool ret; @@ -74,17 +90,6 @@ static const char *freezer_state_strs(unsigned int state) return "THAWED"; }; -/* - * State diagram - * Transitions are caused by userspace writes to the freezer.state file. - * The values in parenthesis are state labels. The rest are edge labels. - * - * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) - * ^ ^ | | - * | \_______THAWED_______/ | - * \__________________________THAWED____________/ - */ - struct cgroup_subsys freezer_subsys; static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) @@ -103,15 +108,34 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) * freezer_post_create - commit creation of a freezer cgroup * @cgroup: cgroup being created * - * We're committing to creation of @cgroup. Mark it online. + * We're committing to creation of @cgroup. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. */ static void freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *parent = parent_freezer(freezer); + + /* + * The following double locking and freezing state inheritance + * guarantee that @cgroup can never escape ancestors' freezing + * states. See cgroup_for_each_descendant_pre() for details. + */ + if (parent) + spin_lock_irq(&parent->lock); + spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING); - spin_lock_irq(&freezer->lock); freezer->state |= CGROUP_FREEZER_ONLINE; - spin_unlock_irq(&freezer->lock); + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + spin_unlock(&freezer->lock); + if (parent) + spin_unlock_irq(&parent->lock); } /** @@ -153,6 +177,7 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) { struct freezer *freezer = cgroup_freezer(new_cgrp); struct task_struct *task; + bool clear_frozen = false; spin_lock_irq(&freezer->lock); @@ -172,10 +197,25 @@ static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) } else { freeze_task(task); freezer->state &= ~CGROUP_FROZEN; + clear_frozen = true; } } spin_unlock_irq(&freezer->lock); + + /* + * Propagate FROZEN clearing upwards. We may race with + * update_if_frozen(), but as long as both work bottom-up, either + * update_if_frozen() sees child's FROZEN cleared or we clear the + * parent's FROZEN later. No parent w/ !FROZEN children can be + * left FROZEN. + */ + while (clear_frozen && (freezer = parent_freezer(freezer))) { + spin_lock_irq(&freezer->lock); + freezer->state &= ~CGROUP_FROZEN; + clear_frozen = freezer->state & CGROUP_FREEZING; + spin_unlock_irq(&freezer->lock); + } } static void freezer_fork(struct task_struct *task) @@ -200,24 +240,47 @@ out: rcu_read_unlock(); } -/* - * We change from FREEZING to FROZEN lazily if the cgroup was only - * partially frozen when we exitted write. Caller must hold freezer->lock. +/** + * update_if_frozen - update whether a cgroup finished freezing + * @cgroup: cgroup of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being * migrated into or out of @cgroup, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct freezer *freezer) +static void update_if_frozen(struct cgroup *cgroup) { - struct cgroup *cgroup = freezer->css.cgroup; + struct freezer *freezer = cgroup_freezer(cgroup); + struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; + WARN_ON_ONCE(!rcu_read_lock_held()); + + spin_lock_irq(&freezer->lock); + if (!(freezer->state & CGROUP_FREEZING) || (freezer->state & CGROUP_FROZEN)) - return; + goto out_unlock; + + /* are all (live) children frozen? */ + cgroup_for_each_child(pos, cgroup) { + struct freezer *child = cgroup_freezer(pos); + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) + goto out_unlock; + } + + /* are all tasks frozen? */ cgroup_iter_start(cgroup, &it); while ((task = cgroup_iter_next(cgroup, &it))) { @@ -229,27 +292,32 @@ static void update_if_frozen(struct freezer *freezer) * the usual frozen condition. */ if (!frozen(task) && !freezer_should_skip(task)) - goto notyet; + goto out_iter_end; } } freezer->state |= CGROUP_FROZEN; -notyet: +out_iter_end: cgroup_iter_end(cgroup, &it); +out_unlock: + spin_unlock_irq(&freezer->lock); } static int freezer_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { - struct freezer *freezer = cgroup_freezer(cgroup); - unsigned int state; + struct cgroup *pos; - spin_lock_irq(&freezer->lock); - update_if_frozen(freezer); - state = freezer->state; - spin_unlock_irq(&freezer->lock); + rcu_read_lock(); - seq_puts(m, freezer_state_strs(state)); + /* update states bottom-up */ + cgroup_for_each_descendant_post(pos, cgroup) + update_if_frozen(pos); + update_if_frozen(cgroup); + + rcu_read_unlock(); + + seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); seq_putc(m, '\n'); return 0; } @@ -320,14 +388,39 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, * @freezer: freezer of interest * @freeze: whether to freeze or thaw * - * Freeze or thaw @cgroup according to @freeze. + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. */ static void freezer_change_state(struct freezer *freezer, bool freeze) { + struct cgroup *pos; + /* update @freezer */ spin_lock_irq(&freezer->lock); freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); spin_unlock_irq(&freezer->lock); + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + rcu_read_lock(); + cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { + struct freezer *pos_f = cgroup_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + /* + * Our update to @parent->state is already visible which is + * all we need. No need to lock @parent. For more info on + * synchronization, see freezer_post_create(). + */ + spin_lock_irq(&pos_f->lock); + freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + spin_unlock_irq(&pos_f->lock); + } + rcu_read_unlock(); } static int freezer_write(struct cgroup *cgroup, struct cftype *cft, @@ -390,12 +483,4 @@ struct cgroup_subsys freezer_subsys = { .attach = freezer_attach, .fork = freezer_fork, .base_cftypes = files, - - /* - * freezer subsys doesn't handle hierarchy at all. Frozen state - * should be inherited through the hierarchy - if a parent is - * frozen, all its children should be frozen. Fix it and remove - * the following. - */ - .broken_hierarchy = true, }; -- cgit v1.2.3 From 175431635ec09b1d1bba04979b006b99e8305a83 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:35 -0800 Subject: cgroup: remove incorrect dget/dput() pair in cgroup_create_dir() cgroup_create_dir() does weird dancing with dentry refcnt. On success, it gets and then puts it achieving nothing. On failure, it puts but there isn't no matching get anywhere leading to the following oops if cgroup_create_file() fails for whatever reason. ------------[ cut here ]------------ kernel BUG at /work/os/work/fs/dcache.c:552! invalid opcode: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU 2 Pid: 697, comm: mkdir Not tainted 3.7.0-rc4-work+ #3 Bochs Bochs RIP: 0010:[] [] dput+0x1dc/0x1e0 RSP: 0018:ffff88001a3ebef8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff88000e5b1ef8 RCX: 0000000000000403 RDX: 0000000000000303 RSI: 2000000000000000 RDI: ffff88000e5b1f58 RBP: ffff88001a3ebf18 R08: ffffffff82c76960 R09: 0000000000000001 R10: ffff880015022080 R11: ffd9bed70f48a041 R12: 00000000ffffffea R13: 0000000000000001 R14: ffff88000e5b1f58 R15: 00007fff57656d60 FS: 00007ff05fcb3800(0000) GS:ffff88001fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004046f0 CR3: 000000001315f000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process mkdir (pid: 697, threadinfo ffff88001a3ea000, task ffff880015022080) Stack: ffff88001a3ebf48 00000000ffffffea 0000000000000001 0000000000000000 ffff88001a3ebf38 ffffffff811cc889 0000000000000001 ffff88000e5b1ef8 ffff88001a3ebf68 ffffffff811d1fc9 ffff8800198d7f18 ffff880019106ef8 Call Trace: [] done_path_create+0x19/0x50 [] sys_mkdirat+0x59/0x80 [] sys_mkdir+0x19/0x20 [] system_call_fastpath+0x16/0x1b Code: 00 48 8d 90 18 01 00 00 48 89 93 c0 00 00 00 4c 89 a0 18 01 00 00 48 8b 83 a0 00 00 00 83 80 28 01 00 00 01 e8 e6 6f a0 00 eb 92 <0f> 0b 66 90 0f 1f 44 00 00 55 48 89 e5 41 57 41 56 49 89 fe 41 RIP [] dput+0x1dc/0x1e0 RSP ---[ end trace 1277bcfd9561ddb0 ]--- Fix it by dropping the unnecessary dget/dput() pair. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: stable@vger.kernel.org --- kernel/cgroup.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0f8fa6aa371..d0803f097f8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2684,9 +2684,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, dentry->d_fsdata = cgrp; inc_nlink(parent->d_inode); rcu_assign_pointer(cgrp->dentry, dentry); - dget(dentry); } - dput(dentry); return error; } -- cgit v1.2.3 From 2243076ad128d0cc196cf6597e6ddcf6bc907676 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:35 -0800 Subject: cgroup: initialize cgrp->allcg_node in init_cgroup_housekeeping() Not strictly necessary but it's annoying to have uninitialized list_head around. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d0803f097f8..ed0e177c465 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1381,6 +1381,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->files); INIT_LIST_HEAD(&cgrp->css_sets); + INIT_LIST_HEAD(&cgrp->allcg_node); INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); -- cgit v1.2.3 From 28fd6f30ac3efd9170ae1ac89f3521d53b5eb83a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: open-code cgroup_create_dir() The operation order of cgroup creation is about to change and cgroup_create_dir() is more of a hindrance than a proper abstraction. Open-code it by moving the parent nlink adjustment next to self nlink adjustment in cgroup_create_file() and the rest to cgroup_create(). This patch doesn't introduce any behavior change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 +++++------------------------- 1 file changed, 5 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed0e177c465..b042673171e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2652,6 +2652,7 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, /* start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); + inc_nlink(dentry->d_parent->d_inode); /* start with the directory inode held, so that we can * populate it without racing with another mkdir */ @@ -2666,30 +2667,6 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, return 0; } -/* - * cgroup_create_dir - create a directory for an object. - * @cgrp: the cgroup we create the directory for. It must have a valid - * ->parent field. And we are going to fill its ->dentry field. - * @dentry: dentry of the new cgroup - * @mode: mode to set on new directory. - */ -static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, - umode_t mode) -{ - struct dentry *parent; - int error = 0; - - parent = cgrp->parent->dentry; - error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb); - if (!error) { - dentry->d_fsdata = cgrp; - inc_nlink(parent->d_inode); - rcu_assign_pointer(cgrp->dentry, dentry); - } - - return error; -} - /** * cgroup_file_mode - deduce file mode of a control file * @cft: the control file in question @@ -4138,10 +4115,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - err = cgroup_create_dir(cgrp, dentry, mode); + err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) goto err_remove; + dentry->d_fsdata = cgrp; + rcu_assign_pointer(cgrp->dentry, dentry); + for_each_subsys(root, ss) { /* each css holds a ref to the cgroup's dentry */ dget(dentry); -- cgit v1.2.3 From 4e139afc22cb98d0d032ffce0285bfcc73ca5217 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: create directory before linking while creating a new cgroup While creating a new cgroup, cgroup_create() links the newly allocated cgroup into various places before trying to create its directory. Because cgroup life-cycle is tied to the vfs objects, this makes it impossible to use cgroup_rmdir() for rolling back creation - the removal logic depends on having full vfs objects. This patch moves directory creation above linking and collect linking operations to one place. This allows directory creation failure to share error exit path with css allocation failures and any failure sites afterwards (to be added later) can use cgroup_rmdir() logic to undo creation. Note that this also makes the memory barriers around cgroup->dentry, which currently is misleadingly using RCU operations, unnecessary. This will be handled in the next patch. While at it, locking BUG_ON() on i_mutex is converted to lockdep_assert_held(). v2: Patch originally removed %NULL dentry check in cgroup_path(); however, Li pointed out that this patch doesn't make it unnecessary as ->create() may call cgroup_path(). Drop the change for now. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b042673171e..d62a529db2f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4112,15 +4112,22 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); - root->number_of_cgroups++; - + /* + * Create directory. cgroup_create_file() returns with the new + * directory locked on success so that it can be populated without + * dropping cgroup_mutex. + */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_remove; + goto err_destroy; + lockdep_assert_held(&dentry->d_inode->i_mutex); + /* allocation complete, commit to creation */ dentry->d_fsdata = cgrp; rcu_assign_pointer(cgrp->dentry, dentry); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); + list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); + root->number_of_cgroups++; for_each_subsys(root, ss) { /* each css holds a ref to the cgroup's dentry */ @@ -4131,11 +4138,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, ss->post_create(cgrp); } - /* The cgroup directory was pre-locked for us */ - BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); - - list_add_tail(&cgrp->allcg_node, &root->allcg_list); - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); /* If err < 0, we have a half-filled directory - oh well ;) */ @@ -4144,20 +4146,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, return 0; - err_remove: - - list_del_rcu(&cgrp->sibling); - root->number_of_cgroups--; - - err_destroy: - +err_destroy: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) ss->destroy(cgrp); } - mutex_unlock(&cgroup_mutex); - /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free: -- cgit v1.2.3 From febfcef60d4f9457785b45aab548bc7ee5ea158f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: cgroup->dentry isn't a RCU pointer cgroup->dentry is marked and used as a RCU pointer; however, it isn't one - the final dentry put doesn't go through call_rcu(). cgroup and dentry share the same RCU freeing rule via synchronize_rcu() in cgroup_diput() (kfree_rcu() used on cgrp is unnecessary). If cgrp is accessible under RCU read lock, so is its dentry and dereferencing cgrp->dentry doesn't need any further RCU protection or annotation. While not being accurate, before the previous patch, the RCU accessors served a purpose as memory barriers - cgroup->dentry used to be assigned after the cgroup was made visible to cgroup_path(), so the assignment and dereferencing in cgroup_path() needed the memory barrier pair. Now that list_add_tail_rcu() happens after cgroup->dentry is assigned, this no longer is necessary. Remove the now unnecessary and misleading RCU annotations from cgroup->dentry. To make up for the removal of rcu_dereference_check() in cgroup_path(), add an explicit rcu_lockdep_assert(), which asserts the dereference rule of @cgrp, not cgrp->dentry. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d62a529db2f..affc76d7f73 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1756,9 +1756,11 @@ static struct kobject *cgroup_kobj; */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { + struct dentry *dentry = cgrp->dentry; char *start; - struct dentry *dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + + rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), + "cgroup_path() called without proper locking"); if (!dentry || cgrp == dummytop) { /* @@ -1782,8 +1784,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) if (!cgrp) break; - dentry = rcu_dereference_check(cgrp->dentry, - cgroup_lock_is_held()); + dentry = cgrp->dentry; if (!cgrp->parent) continue; if (--start < buf) @@ -4124,7 +4125,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* allocation complete, commit to creation */ dentry->d_fsdata = cgrp; - rcu_assign_pointer(cgrp->dentry, dentry); + cgrp->dentry = dentry; list_add_tail(&cgrp->allcg_node, &root->allcg_list); list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; -- cgit v1.2.3 From 38b53abaa3e0c7e750ef73eee919cf42eee6b134 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: make CSS_* flags bit masks instead of bit positions Currently, CSS_* flags are defined as bit positions and manipulated using atomic bitops. There's no reason to use atomic bitops for them and bit positions are clunkier to deal with than bit masks. Make CSS_* bit masks instead and use the usual C bitwise operators to access them. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index affc76d7f73..82ad8785faf 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4020,7 +4020,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags = 0; css->id = NULL; if (cgrp == dummytop) - set_bit(CSS_ROOT, &css->flags); + css->flags |= CSS_ROOT; BUG_ON(cgrp->subsys[ss->subsys_id]); cgrp->subsys[ss->subsys_id] = css; -- cgit v1.2.3 From b48c6a80a0f7584056f768268fc12b87745a393f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: trivial cleanup for cgroup_init/load_subsys() Consistently use @css and @dummytop in these two functions instead of referring to them indirectly. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 82ad8785faf..6cc693b91c4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4332,7 +4332,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * pointer to this state - since the subsystem is * newly registered, all tasks and hence the * init_css_set is in the subsystem's top cgroup. */ - init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + init_css_set.subsys[ss->subsys_id] = css; need_forkexit_callback |= ss->fork || ss->exit; @@ -4344,7 +4344,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) ss->active = 1; if (ss->post_create) - ss->post_create(&ss->root->top_cgroup); + ss->post_create(dummytop); /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ @@ -4456,7 +4456,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->active = 1; if (ss->post_create) - ss->post_create(&ss->root->top_cgroup); + ss->post_create(dummytop); /* success! */ mutex_unlock(&cgroup_mutex); -- cgit v1.2.3 From 648bb56d076bde31113f09a7d24d95bc8d4155ac Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:36 -0800 Subject: cgroup: lock cgroup_mutex in cgroup_init_subsys() Make cgroup_init_subsys() grab cgroup_mutex while initializing a subsystem so that all helpers and callbacks are called under the context they expect. This isn't strictly necessary as cgroup_init_subsys() doesn't race with anybody but will allow adding lockdep assertions. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6cc693b91c4..09751657abd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4317,6 +4317,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + mutex_lock(&cgroup_mutex); + /* init base cftset */ cgroup_init_cftsets(ss); @@ -4346,6 +4348,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) if (ss->post_create) ss->post_create(dummytop); + mutex_unlock(&cgroup_mutex); + /* this function shouldn't be used with modular subsystems, since they * need to register a subsys_id, among other things */ BUG_ON(ss->module); -- cgit v1.2.3 From 02ae7486d05ae6df8395409a4945b2420f1e35c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: fix harmless bugs in cgroup_load_subsys() fail path and cgroup_unload_subsys() * If idr init fails, cgroup_load_subsys() cleared dummytop->subsys[] before calilng ->destroy() making CSS inaccessible to the callback, and didn't unlink ss->sibling. As no modular controller uses ->use_id, this doesn't cause any actual problems. * cgroup_unload_subsys() was forgetting to free idr, call ->pre_destroy() and clear ->active. As there currently is no modular controller which uses ->use_id, ->pre_destroy() or ->active, this doesn't cause any actual problems. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 09751657abd..5679cb1ce43 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4420,9 +4420,10 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) if (ss->use_id) { int ret = cgroup_init_idr(ss, css); if (ret) { - dummytop->subsys[ss->subsys_id] = NULL; ss->destroy(dummytop); + dummytop->subsys[ss->subsys_id] = NULL; subsys[ss->subsys_id] = NULL; + list_del_init(&ss->sibling); mutex_unlock(&cgroup_mutex); return ret; } @@ -4490,7 +4491,19 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) */ BUG_ON(ss->root != &rootnode); + /* ->pre_destroy() should be called outside cgroup_mutex for now */ + if (ss->pre_destroy) + ss->pre_destroy(dummytop); + mutex_lock(&cgroup_mutex); + + ss->active = 0; + + if (ss->use_id) { + idr_remove_all(&ss->idr); + idr_destroy(&ss->idr); + } + /* deassign the subsys_id */ subsys[ss->subsys_id] = NULL; -- cgit v1.2.3 From 42809dd4225b2f3127a4804314a1b33608620d96 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: separate out cgroup_destroy_locked() Separate out cgroup_destroy_locked() from cgroup_destroy(). This will be later used in cgroup_create() failure path. While at it, add lockdep asserts on i_mutex and cgroup_mutex, and move @d and @parent assignments to their declarations. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5679cb1ce43..4412d9694f1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -242,6 +242,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +static int cgroup_destroy_locked(struct cgroup *cgrp); + #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) { @@ -4209,22 +4211,20 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) return 0; } -static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +static int cgroup_destroy_locked(struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { - struct cgroup *cgrp = dentry->d_fsdata; - struct dentry *d; - struct cgroup *parent; + struct dentry *d = cgrp->dentry; + struct cgroup *parent = cgrp->parent; DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; - /* the vfs holds both inode->i_mutex already */ - mutex_lock(&cgroup_mutex); - parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { - mutex_unlock(&cgroup_mutex); + lockdep_assert_held(&d->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); + + if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) return -EBUSY; - } /* * Block new css_tryget() by deactivating refcnt and mark @cgrp @@ -4243,7 +4243,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) /* * Tell subsystems to initate destruction. pre_destroy() should be * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix - * potential deadlock in pre_destroy") for details. + * potential deadlock in pre_destroy") for details. This temporary + * unlocking should go away once cgroup_mutex is unexported from + * controllers. */ mutex_unlock(&cgroup_mutex); for_each_subsys(cgrp->root, ss) @@ -4268,11 +4270,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); - list_del_init(&cgrp->allcg_node); - d = dget(cgrp->dentry); - + dget(d); cgroup_d_remove_dir(d); dput(d); @@ -4293,10 +4293,20 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) } spin_unlock(&cgrp->event_list_lock); - mutex_unlock(&cgroup_mutex); return 0; } +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + int ret; + + mutex_lock(&cgroup_mutex); + ret = cgroup_destroy_locked(dentry->d_fsdata); + mutex_unlock(&cgroup_mutex); + + return ret; +} + static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) { INIT_LIST_HEAD(&ss->cftsets); -- cgit v1.2.3 From a31f2d3ff7fe20cbe2a143515a7d7c408b29dd0d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: introduce CSS_ONLINE flag and on/offline_css() helpers New helpers on/offline_css() respectively wrap ->post_create() and ->pre_destroy() invocations. online_css() sets CSS_ONLINE after ->post_create() is complete and offline_css() invokes ->pre_destroy() iff CSS_ONLINE is set and clears it while also handling the temporary dropping of cgroup_mutex. This patch doesn't introduce any behavior change at the moment but will be used to improve cgroup_create() failure path and allow ->post_create() to fail. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 65 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4412d9694f1..78a3d5c0968 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4035,6 +4035,42 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } +/* invoke ->post_create() on a new CSS and mark it online */ +static void online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + lockdep_assert_held(&cgroup_mutex); + + if (ss->post_create) + ss->post_create(cgrp); + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; +} + +/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) + __releases(&cgroup_mutex) __acquires(&cgroup_mutex) +{ + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + lockdep_assert_held(&cgroup_mutex); + + if (!(css->flags & CSS_ONLINE)) + return; + + /* + * pre_destroy() should be called with cgroup_mutex unlocked. See + * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for + * details. This temporary unlocking should go away once + * cgroup_mutex is unexported from controllers. + */ + if (ss->pre_destroy) { + mutex_unlock(&cgroup_mutex); + ss->pre_destroy(cgrp); + mutex_lock(&cgroup_mutex); + } + + cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; +} + /* * cgroup_create - create a cgroup * @parent: cgroup that will be parent of the new cgroup @@ -4137,8 +4173,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, dget(dentry); /* creation succeeded, notify subsystems */ - if (ss->post_create) - ss->post_create(cgrp); + online_css(ss, cgrp); } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); @@ -4240,18 +4275,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) } set_bit(CGRP_REMOVED, &cgrp->flags); - /* - * Tell subsystems to initate destruction. pre_destroy() should be - * called with cgroup_mutex unlocked. See 3fa59dfbc3 ("cgroup: fix - * potential deadlock in pre_destroy") for details. This temporary - * unlocking should go away once cgroup_mutex is unexported from - * controllers. - */ - mutex_unlock(&cgroup_mutex); + /* tell subsystems to initate destruction */ for_each_subsys(cgrp->root, ss) - if (ss->pre_destroy) - ss->pre_destroy(cgrp); - mutex_lock(&cgroup_mutex); + offline_css(ss, cgrp); /* * Put all the base refs. Each css holds an extra reference to the @@ -4354,9 +4380,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; - - if (ss->post_create) - ss->post_create(dummytop); + online_css(ss, dummytop); mutex_unlock(&cgroup_mutex); @@ -4469,9 +4493,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; - - if (ss->post_create) - ss->post_create(dummytop); + online_css(ss, dummytop); /* success! */ mutex_unlock(&cgroup_mutex); @@ -4501,12 +4523,9 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) */ BUG_ON(ss->root != &rootnode); - /* ->pre_destroy() should be called outside cgroup_mutex for now */ - if (ss->pre_destroy) - ss->pre_destroy(dummytop); - mutex_lock(&cgroup_mutex); + offline_css(ss, dummytop); ss->active = 0; if (ss->use_id) { -- cgit v1.2.3 From d19e19de48aa0b90c56cd93c8a46ebac46273429 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: simplify cgroup_load_subsys() failure path Now that cgroup_unload_subsys() can tell whether the root css is online or not, we can safely call cgroup_unload_subsys() after idr init failure in cgroup_load_subsys(). Replace the manual unrolling and invoke cgroup_unload_subsys() on failure. This drops cgroup_mutex inbetween but should be safe as the subsystem will fail try_module_get() and thus can't be mounted inbetween. As this means that cgroup_unload_subsys() can be called before css_sets are rehashed, remove BUG_ON() on %NULL css_set->subsys[] from cgroup_unload_subsys(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 78a3d5c0968..166b5141f3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4400,8 +4400,8 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) */ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) { - int i; struct cgroup_subsys_state *css; + int i, ret; /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || @@ -4452,15 +4452,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) init_cgroup_css(css, ss, dummytop); /* init_idr must be after init_cgroup_css because it sets css->id. */ if (ss->use_id) { - int ret = cgroup_init_idr(ss, css); - if (ret) { - ss->destroy(dummytop); - dummytop->subsys[ss->subsys_id] = NULL; - subsys[ss->subsys_id] = NULL; - list_del_init(&ss->sibling); - mutex_unlock(&cgroup_mutex); - return ret; - } + ret = cgroup_init_idr(ss, css); + if (ret) + goto err_unload; } /* @@ -4498,6 +4492,12 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* success! */ mutex_unlock(&cgroup_mutex); return 0; + +err_unload: + mutex_unlock(&cgroup_mutex); + /* @ss can't be mounted here as try_module_get() would fail */ + cgroup_unload_subsys(ss); + return ret; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); @@ -4548,7 +4548,6 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) struct css_set *cg = link->cg; hlist_del(&cg->hlist); - BUG_ON(!cg->subsys[ss->subsys_id]); cg->subsys[ss->subsys_id] = NULL; hhead = css_set_hash(cg->subsys); hlist_add_head(&cg->hlist, hhead); -- cgit v1.2.3 From b8a2df6a5b576d04f78f54abf254c283527d8bbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:37 -0800 Subject: cgroup: use mutex_trylock() when grabbing i_mutex of a new cgroup directory All cgroup directory i_mutexes nest outside cgroup_mutex; however, new directory creation is a special case. A new cgroup directory is created while holding cgroup_mutex. Populating the new directory requires both the new directory's i_mutex and cgroup_mutex. Because all directory i_mutexes nest outside cgroup_mutex, grabbing both requires releasing cgroup_mutex first, which isn't a good idea as the new cgroup isn't yet ready to be manipulated by other cgroup opreations. This is worked around by grabbing the new directory's i_mutex while holding cgroup_mutex before making it visible. As there's no other user at that point, grabbing the i_mutex under cgroup_mutex can't lead to deadlock. cgroup_create_file() was using I_MUTEX_CHILD to tell lockdep not to worry about the reverse locking order; however, this creates pseudo locking dependency cgroup_mutex -> I_MUTEX_CHILD, which isn't true - all directory i_mutexes are still nested outside cgroup_mutex. This pseudo locking dependency can lead to spurious lockdep warnings. Use mutex_trylock() instead. This will always succeed and lockdep doesn't create any locking dependency for it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 166b5141f3d..c53f42e3170 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2657,9 +2657,15 @@ static int cgroup_create_file(struct dentry *dentry, umode_t mode, inc_nlink(inode); inc_nlink(dentry->d_parent->d_inode); - /* start with the directory inode held, so that we can - * populate it without racing with another mkdir */ - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + /* + * Control reaches here with cgroup_mutex held. + * @inode->i_mutex should nest outside cgroup_mutex but we + * want to populate it immediately without releasing + * cgroup_mutex. As @inode isn't visible to anyone else + * yet, trylock will always succeed without affecting + * lockdep checks. + */ + WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); } else if (S_ISREG(mode)) { inode->i_size = 0; inode->i_fop = &cgroup_file_operations; -- cgit v1.2.3 From 4b8b47eb0001a89f271d671d959b235faa8f03e2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: update cgroup_create() failure path cgroup_create() was ignoring failure of cgroupfs files. Update it such that, if file creation fails, it rolls back by calling cgroup_destroy_locked() and returns failure. Note that error out goto labels are renamed. The labels are a bit confusing but will become better w/ later cgroup operation renames. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c53f42e3170..027b66e6669 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4107,7 +4107,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free; + goto err_free_cgrp; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4135,13 +4135,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, css = ss->create(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); - goto err_destroy; + goto err_free_all; } init_cgroup_css(css, ss, cgrp); if (ss->use_id) { err = alloc_css_id(ss, parent, cgrp); if (err) - goto err_destroy; + goto err_free_all; } /* At error, ->destroy() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) @@ -4164,7 +4164,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ err = cgroup_create_file(dentry, S_IFDIR | mode, sb); if (err < 0) - goto err_destroy; + goto err_free_all; lockdep_assert_held(&dentry->d_inode->i_mutex); /* allocation complete, commit to creation */ @@ -4183,14 +4183,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); - /* If err < 0, we have a half-filled directory - oh well ;) */ + if (err) + goto err_destroy; mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); return 0; -err_destroy: +err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) ss->destroy(cgrp); @@ -4198,9 +4199,15 @@ err_destroy: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); -err_free: +err_free_cgrp: kfree(cgrp); return err; + +err_destroy: + cgroup_destroy_locked(cgrp); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + return err; } static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -- cgit v1.2.3 From b1929db42f8a649d9a9e397119f628c27fd4021f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: allow ->post_create() to fail There could be cases where controllers want to do initialization operations which may fail from ->post_create(). This patch makes ->post_create() return -errno to indicate failure and online_css() relay such failures. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 29 +++++++++++++++++++---------- kernel/cgroup_freezer.c | 4 +++- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 027b66e6669..c389f425868 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,14 +4041,18 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } -/* invoke ->post_create() on a new CSS and mark it online */ -static void online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +/* invoke ->post_create() on a new CSS and mark it online if successful */ +static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + int ret = 0; + lockdep_assert_held(&cgroup_mutex); if (ss->post_create) - ss->post_create(cgrp); - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + ret = ss->post_create(cgrp); + if (!ret) + cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + return ret; } /* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ @@ -4174,12 +4178,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - for_each_subsys(root, ss) { - /* each css holds a ref to the cgroup's dentry */ + /* each css holds a ref to the cgroup's dentry */ + for_each_subsys(root, ss) dget(dentry); - /* creation succeeded, notify subsystems */ - online_css(ss, cgrp); + /* creation succeeded, notify subsystems */ + for_each_subsys(root, ss) { + err = online_css(ss, cgrp); + if (err) + goto err_destroy; } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); @@ -4393,7 +4400,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) BUG_ON(!list_empty(&init_task.tasks)); ss->active = 1; - online_css(ss, dummytop); + BUG_ON(online_css(ss, dummytop)); mutex_unlock(&cgroup_mutex); @@ -4500,7 +4507,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); ss->active = 1; - online_css(ss, dummytop); + ret = online_css(ss, dummytop); + if (ret) + goto err_unload; /* success! */ mutex_unlock(&cgroup_mutex); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 670a4af7dc9..ee8bb671688 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -112,7 +112,7 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) * parent's freezing state while holding both parent's and our * freezer->lock. */ -static void freezer_post_create(struct cgroup *cgroup) +static int freezer_post_create(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); struct freezer *parent = parent_freezer(freezer); @@ -136,6 +136,8 @@ static void freezer_post_create(struct cgroup *cgroup) spin_unlock(&freezer->lock); if (parent) spin_unlock_irq(&parent->lock); + + return 0; } /** -- cgit v1.2.3 From 92fb97487a7e41b222c1417cabd1d1ab7cc3a48c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: rename ->create/post_create/pre_destroy/destroy() to ->css_alloc/online/offline/free() Rename cgroup_subsys css lifetime related callbacks to better describe what their roles are. Also, update documentation. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 51 +++++++++++++++++++++++++------------------------ kernel/cgroup_freezer.c | 20 +++++++++---------- kernel/cpuset.c | 10 +++++----- kernel/events/core.c | 8 ++++---- kernel/sched/core.c | 16 ++++++++-------- 5 files changed, 53 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c389f425868..d35463bab48 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -876,7 +876,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) * Release the subsystem state objects. */ for_each_subsys(cgrp->root, ss) - ss->destroy(cgrp); + ss->css_free(cgrp); cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4048,8 +4048,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) lockdep_assert_held(&cgroup_mutex); - if (ss->post_create) - ret = ss->post_create(cgrp); + if (ss->css_online) + ret = ss->css_online(cgrp); if (!ret) cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; return ret; @@ -4067,14 +4067,14 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; /* - * pre_destroy() should be called with cgroup_mutex unlocked. See + * css_offline() should be called with cgroup_mutex unlocked. See * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for * details. This temporary unlocking should go away once * cgroup_mutex is unexported from controllers. */ - if (ss->pre_destroy) { + if (ss->css_offline) { mutex_unlock(&cgroup_mutex); - ss->pre_destroy(cgrp); + ss->css_offline(cgrp); mutex_lock(&cgroup_mutex); } @@ -4136,7 +4136,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->create(cgrp); + css = ss->css_alloc(cgrp); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4147,7 +4147,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - /* At error, ->destroy() callback has to free assigned ID. */ + /* At error, ->css_free() callback has to free assigned ID. */ if (clone_children(parent) && ss->post_clone) ss->post_clone(cgrp); @@ -4201,7 +4201,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_subsys(root, ss) { if (cgrp->subsys[ss->subsys_id]) - ss->destroy(cgrp); + ss->css_free(cgrp); } mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ @@ -4381,7 +4381,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &rootnode.subsys_list); ss->root = &rootnode; - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, dummytop); @@ -4425,7 +4425,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) /* check name and function validity */ if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || - ss->create == NULL || ss->destroy == NULL) + ss->css_alloc == NULL || ss->css_free == NULL) return -EINVAL; /* @@ -4454,10 +4454,11 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) subsys[ss->subsys_id] = ss; /* - * no ss->create seems to need anything important in the ss struct, so - * this can happen first (i.e. before the rootnode attachment). + * no ss->css_alloc seems to need anything important in the ss + * struct, so this can happen first (i.e. before the rootnode + * attachment). */ - css = ss->create(dummytop); + css = ss->css_alloc(dummytop); if (IS_ERR(css)) { /* failure case - need to deassign the subsys[] slot. */ subsys[ss->subsys_id] = NULL; @@ -4577,12 +4578,12 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) write_unlock(&css_set_lock); /* - * remove subsystem's css from the dummytop and free it - need to free - * before marking as null because ss->destroy needs the cgrp->subsys - * pointer to find their state. note that this also takes care of - * freeing the css_id. + * remove subsystem's css from the dummytop and free it - need to + * free before marking as null because ss->css_free needs the + * cgrp->subsys pointer to find their state. note that this also + * takes care of freeing the css_id. */ - ss->destroy(dummytop); + ss->css_free(dummytop); dummytop->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -4626,8 +4627,8 @@ int __init cgroup_init_early(void) BUG_ON(!ss->name); BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); - BUG_ON(!ss->create); - BUG_ON(!ss->destroy); + BUG_ON(!ss->css_alloc); + BUG_ON(!ss->css_free); if (ss->subsys_id != i) { printk(KERN_ERR "cgroup: Subsys %s id == %d\n", ss->name, ss->subsys_id); @@ -5439,7 +5440,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_create(struct cgroup *cont) +static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5449,7 +5450,7 @@ static struct cgroup_subsys_state *debug_create(struct cgroup *cont) return css; } -static void debug_destroy(struct cgroup *cont) +static void debug_css_free(struct cgroup *cont) { kfree(cont->subsys[debug_subsys_id]); } @@ -5578,8 +5579,8 @@ static struct cftype debug_files[] = { struct cgroup_subsys debug_subsys = { .name = "debug", - .create = debug_create, - .destroy = debug_destroy, + .css_alloc = debug_css_alloc, + .css_free = debug_css_free, .subsys_id = debug_subsys_id, .base_cftypes = debug_files, }; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index ee8bb671688..75dda1ea502 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -92,7 +92,7 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) +static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) { struct freezer *freezer; @@ -105,14 +105,14 @@ static struct cgroup_subsys_state *freezer_create(struct cgroup *cgroup) } /** - * freezer_post_create - commit creation of a freezer cgroup + * freezer_css_online - commit creation of a freezer cgroup * @cgroup: cgroup being created * * We're committing to creation of @cgroup. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_post_create(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); struct freezer *parent = parent_freezer(freezer); @@ -141,13 +141,13 @@ static int freezer_post_create(struct cgroup *cgroup) } /** - * freezer_pre_destroy - initiate destruction of @cgroup + * freezer_css_offline - initiate destruction of @cgroup * @cgroup: cgroup being destroyed * * @cgroup is going away. Mark it dead and decrement system_freezing_count * if it was holding one. */ -static void freezer_pre_destroy(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup *cgroup) { struct freezer *freezer = cgroup_freezer(cgroup); @@ -161,7 +161,7 @@ static void freezer_pre_destroy(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_destroy(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup *cgroup) { kfree(cgroup_freezer(cgroup)); } @@ -477,10 +477,10 @@ static struct cftype files[] = { struct cgroup_subsys freezer_subsys = { .name = "freezer", - .create = freezer_create, - .post_create = freezer_post_create, - .pre_destroy = freezer_pre_destroy, - .destroy = freezer_destroy, + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, .subsys_id = freezer_subsys_id, .attach = freezer_attach, .fork = freezer_fork, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f33c7153b6d..06931337c4e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1821,11 +1821,11 @@ static void cpuset_post_clone(struct cgroup *cgroup) } /* - * cpuset_create - create a cpuset + * cpuset_css_alloc - allocate a cpuset css * cont: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) +static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { struct cpuset *cs; struct cpuset *parent; @@ -1864,7 +1864,7 @@ static struct cgroup_subsys_state *cpuset_create(struct cgroup *cont) * will call async_rebuild_sched_domains(). */ -static void cpuset_destroy(struct cgroup *cont) +static void cpuset_css_free(struct cgroup *cont) { struct cpuset *cs = cgroup_cs(cont); @@ -1878,8 +1878,8 @@ static void cpuset_destroy(struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", - .create = cpuset_create, - .destroy = cpuset_destroy, + .css_alloc = cpuset_css_alloc, + .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .post_clone = cpuset_post_clone, diff --git a/kernel/events/core.c b/kernel/events/core.c index dbccf83c134..f9ff5493171 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7434,7 +7434,7 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) +static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) { struct perf_cgroup *jc; @@ -7451,7 +7451,7 @@ static struct cgroup_subsys_state *perf_cgroup_create(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_destroy(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), @@ -7492,8 +7492,8 @@ static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, struct cgroup_subsys perf_subsys = { .name = "perf_event", .subsys_id = perf_subsys_id, - .create = perf_cgroup_create, - .destroy = perf_cgroup_destroy, + .css_alloc = perf_cgroup_css_alloc, + .css_free = perf_cgroup_css_free, .exit = perf_cgroup_exit, .attach = perf_cgroup_attach, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2d8927fda71..6f20c8fb232 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7468,7 +7468,7 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) struct task_group, css); } -static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) { struct task_group *tg, *parent; @@ -7485,7 +7485,7 @@ static struct cgroup_subsys_state *cpu_cgroup_create(struct cgroup *cgrp) return &tg->css; } -static void cpu_cgroup_destroy(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); @@ -7845,8 +7845,8 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgroup_subsys = { .name = "cpu", - .create = cpu_cgroup_create, - .destroy = cpu_cgroup_destroy, + .css_alloc = cpu_cgroup_css_alloc, + .css_free = cpu_cgroup_css_free, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .exit = cpu_cgroup_exit, @@ -7869,7 +7869,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { struct cpuacct root_cpuacct; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_create(struct cgroup *cgrp) +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) { struct cpuacct *ca; @@ -7899,7 +7899,7 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_destroy(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup *cgrp) { struct cpuacct *ca = cgroup_ca(cgrp); @@ -8070,8 +8070,8 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", - .create = cpuacct_create, - .destroy = cpuacct_destroy, + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, .subsys_id = cpuacct_subsys_id, .base_cftypes = files, }; -- cgit v1.2.3 From 2260e7fc1f18ad815324605c1ce7d5c6fd9b19a2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:38 -0800 Subject: cgroup: s/CGRP_CLONE_CHILDREN/CGRP_CPUSET_CLONE_CHILDREN/ clone_children is only meaningful for cpuset and will stay that way. Rename the flag to reflect that and update documentation. Also, drop clone_children() wrapper in cgroup.c. The thin wrapper is used only a few times and one of them will go away soon. Signed-off-by: Tejun Heo Acked-by: Serge E. Hallyn Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d35463bab48..2895880e680 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -296,11 +296,6 @@ static int notify_on_release(const struct cgroup *cgrp) return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); } -static int clone_children(const struct cgroup *cgrp) -{ - return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); -} - /* * for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy @@ -1101,7 +1096,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",xattr"); if (strlen(root->release_agent_path)) seq_printf(seq, ",release_agent=%s", root->release_agent_path); - if (clone_children(&root->top_cgroup)) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) seq_puts(seq, ",clone_children"); if (strlen(root->name)) seq_printf(seq, ",name=%s", root->name); @@ -1113,7 +1108,7 @@ struct cgroup_sb_opts { unsigned long subsys_mask; unsigned long flags; char *release_agent; - bool clone_children; + bool cpuset_clone_children; char *name; /* User explicitly requested empty subsystem */ bool none; @@ -1164,7 +1159,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; } if (!strcmp(token, "clone_children")) { - opts->clone_children = true; + opts->cpuset_clone_children = true; continue; } if (!strcmp(token, "xattr")) { @@ -1474,8 +1469,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) strcpy(root->name, opts->name); - if (opts->clone_children) - set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags); + if (opts->cpuset_clone_children) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); return root; } @@ -3905,7 +3900,7 @@ fail: static u64 cgroup_clone_children_read(struct cgroup *cgrp, struct cftype *cft) { - return clone_children(cgrp); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); } static int cgroup_clone_children_write(struct cgroup *cgrp, @@ -3913,9 +3908,9 @@ static int cgroup_clone_children_write(struct cgroup *cgrp, u64 val) { if (val) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); else - clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); return 0; } @@ -4130,8 +4125,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); - if (clone_children(parent)) - set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); for_each_subsys(root, ss) { struct cgroup_subsys_state *css; @@ -4148,7 +4143,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } /* At error, ->css_free() callback has to free assigned ID. */ - if (clone_children(parent) && ss->post_clone) + if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags) && + ss->post_clone) ss->post_clone(cgrp); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && -- cgit v1.2.3 From 033fa1c5f5f73833598a0beb022c0048fb769dad Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 08:13:39 -0800 Subject: cgroup, cpuset: remove cgroup_subsys->post_clone() Currently CGRP_CPUSET_CLONE_CHILDREN triggers ->post_clone(). Now that clone_children is cpuset specific, there's no reason to have this rather odd option activation mechanism in cgroup core. cpuset can check the flag from its ->css_allocate() and take the necessary action. Move cpuset_post_clone() logic to the end of cpuset_css_alloc() and remove cgroup_subsys->post_clone(). Loosely based on Glauber's "generalize post_clone into post_create" patch. Signed-off-by: Tejun Heo Original-patch-by: Glauber Costa Original-patch: <1351686554-22592-2-git-send-email-glommer@parallels.com> Acked-by: Serge E. Hallyn Acked-by: Li Zefan Cc: Glauber Costa --- kernel/cgroup.c | 4 --- kernel/cpuset.c | 80 ++++++++++++++++++++++++++------------------------------- 2 files changed, 36 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2895880e680..96719f73e95 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4142,10 +4142,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - /* At error, ->css_free() callback has to free assigned ID. */ - if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags) && - ss->post_clone) - ss->post_clone(cgrp); if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && parent->parent) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 06931337c4e..b017887d632 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1783,43 +1783,6 @@ static struct cftype files[] = { { } /* terminate */ }; -/* - * post_clone() is called during cgroup_create() when the - * clone_children mount argument was specified. The cgroup - * can not yet have any tasks. - * - * Currently we refuse to set up the cgroup - thereby - * refusing the task to be entered, and as a result refusing - * the sys_unshare() or clone() which initiated it - if any - * sibling cpusets have exclusive cpus or mem. - * - * If this becomes a problem for some users who wish to - * allow that scenario, then cpuset_post_clone() could be - * changed to grant parent->cpus_allowed-sibling_cpus_exclusive - * (and likewise for mems) to the new cgroup. Called with cgroup_mutex - * held. - */ -static void cpuset_post_clone(struct cgroup *cgroup) -{ - struct cgroup *parent, *child; - struct cpuset *cs, *parent_cs; - - parent = cgroup->parent; - list_for_each_entry(child, &parent->children, sibling) { - cs = cgroup_cs(child); - if (is_mem_exclusive(cs) || is_cpu_exclusive(cs)) - return; - } - cs = cgroup_cs(cgroup); - parent_cs = cgroup_cs(parent); - - mutex_lock(&callback_mutex); - cs->mems_allowed = parent_cs->mems_allowed; - cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed); - mutex_unlock(&callback_mutex); - return; -} - /* * cpuset_css_alloc - allocate a cpuset css * cont: control group that the new cpuset will be part of @@ -1827,13 +1790,14 @@ static void cpuset_post_clone(struct cgroup *cgroup) static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) { - struct cpuset *cs; - struct cpuset *parent; + struct cgroup *parent_cg = cont->parent; + struct cgroup *tmp_cg; + struct cpuset *parent, *cs; - if (!cont->parent) { + if (!parent_cg) return &top_cpuset.css; - } - parent = cgroup_cs(cont->parent); + parent = cgroup_cs(parent_cg); + cs = kmalloc(sizeof(*cs), GFP_KERNEL); if (!cs) return ERR_PTR(-ENOMEM); @@ -1855,7 +1819,36 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) cs->parent = parent; number_of_cpusets++; - return &cs->css ; + + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) + goto skip_clone; + + /* + * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is + * set. This flag handling is implemented in cgroup core for + * histrical reasons - the flag may be specified during mount. + * + * Currently, if any sibling cpusets have exclusive cpus or mem, we + * refuse to clone the configuration - thereby refusing the task to + * be entered, and as a result refusing the sys_unshare() or + * clone() which initiated it. If this becomes a problem for some + * users who wish to allow that scenario, then this could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. + */ + list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { + struct cpuset *tmp_cs = cgroup_cs(tmp_cg); + + if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) + goto skip_clone; + } + + mutex_lock(&callback_mutex); + cs->mems_allowed = parent->mems_allowed; + cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + mutex_unlock(&callback_mutex); +skip_clone: + return &cs->css; } /* @@ -1882,7 +1875,6 @@ struct cgroup_subsys cpuset_subsys = { .css_free = cpuset_css_free, .can_attach = cpuset_can_attach, .attach = cpuset_attach, - .post_clone = cpuset_post_clone, .subsys_id = cpuset_subsys_id, .base_cftypes = files, .early_init = 1, -- cgit v1.2.3 From 0a950f65e1e64f4e82b4b5507773848ea88bcb8e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 Nov 2012 09:02:12 -0800 Subject: cgroup: add cgroup->id With the introduction of generic cgroup hierarchy iterators, css_id is being phased out. It was unnecessarily complex, id'ing the wrong thing (cgroups need IDs, not CSSes) and has other oddities like not being available at ->css_alloc(). This patch adds cgroup->id, which is a simple per-hierarchy ida-allocated ID which is assigned before ->css_alloc() and released after ->css_free(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Neil Horman --- kernel/cgroup.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 96719f73e95..6db01417d39 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -138,6 +138,9 @@ struct cgroupfs_root { /* Hierarchy-specific flags */ unsigned long flags; + /* IDs for cgroups in this hierarchy */ + struct ida cgroup_ida; + /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; @@ -890,6 +893,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode) simple_xattrs_free(&cgrp->xattrs); + ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); kfree_rcu(cgrp, rcu_head); } else { struct cfent *cfe = __d_cfe(dentry); @@ -1465,6 +1469,7 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; + ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1483,6 +1488,7 @@ static void cgroup_drop_root(struct cgroupfs_root *root) spin_lock(&hierarchy_id_lock); ida_remove(&hierarchy_ida, root->hierarchy_id); spin_unlock(&hierarchy_id_lock); + ida_destroy(&root->cgroup_ida); kfree(root); } @@ -4093,10 +4099,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys *ss; struct super_block *sb = root->sb; + /* allocate the cgroup and its ID, 0 is reserved for the root */ cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); if (!cgrp) return -ENOMEM; + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + if (cgrp->id < 0) + goto err_free_cgrp; + /* * Only live parents can have children. Note that the liveliness * check isn't strictly necessary because cgroup_mkdir() and @@ -4106,7 +4117,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, */ if (!cgroup_lock_live_group(parent)) { err = -ENODEV; - goto err_free_cgrp; + goto err_free_id; } /* Grab a reference on the superblock so the hierarchy doesn't @@ -4198,6 +4209,8 @@ err_free_all: mutex_unlock(&cgroup_mutex); /* Release the reference count that we took on the superblock */ deactivate_super(sb); +err_free_id: + ida_simple_remove(&root->cgroup_ida, cgrp->id); err_free_cgrp: kfree(cgrp); return err; -- cgit v1.2.3 From d0b2fdd2a51203f04ea0a5d716e033c15e0231af Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 20 Nov 2012 22:06:18 +0800 Subject: cgroup: remove obsolete guarantee from cgroup_task_migrate. 'guarantee' is already removed from cgroup_task_migrate, so remove the corresponding comments. Some other typos in cgroup are also changed. Cc: Tejun Heo Cc: Li Zefan Signed-off-by: Tao Ma Signed-off-by: Tejun Heo --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6db01417d39..55a0a770a5a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -782,12 +782,12 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * The task_lock() exception * * The need for this exception arises from the action of - * cgroup_attach_task(), which overwrites one tasks cgroup pointer with + * cgroup_attach_task(), which overwrites one task's cgroup pointer with * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task's cgroup pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * @@ -1882,9 +1882,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_size); /* * cgroup_task_migrate - move a task from one cgroup to another. * - * 'guarantee' is set if the caller promises that a new css_set for the task - * will already exist. If not set, this function might sleep, and can fail with - * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. + * Must be called with cgroup_mutex and threadgroup locked. */ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, struct task_struct *tsk, struct css_set *newcg) -- cgit v1.2.3 From fddfb02ad0d0d3b479c2a26a8ae7e6411b34706b Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 28 Nov 2012 17:15:21 +0800 Subject: cgroup: move list add after list head initilization 2243076ad1 ("cgroup: initialize cgrp->allcg_node in init_cgroup_housekeeping()") initializes cgrp->allcg_node in init_cgroup_housekeeping(). Then in init_cgroup_root(), we should call init_cgroup_housekeeping() before adding it to &root->allcg_list; otherwise, we are initializing an entry already in a list. Signed-off-by: Li Zhong Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 55a0a770a5a..c02b05560d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1401,8 +1401,8 @@ static void init_cgroup_root(struct cgroupfs_root *root) root->number_of_cgroups = 1; cgrp->root = root; cgrp->top_cgroup = cgrp; - list_add_tail(&cgrp->allcg_node, &root->allcg_list); init_cgroup_housekeeping(cgrp); + list_add_tail(&cgrp->allcg_node, &root->allcg_list); } static bool init_root_id(struct cgroupfs_root *root) -- cgit v1.2.3 From 205a872bd6f9a9a09ef035ef1e90185a8245cc58 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 28 Nov 2012 13:50:44 -0800 Subject: cgroup: fix lockdep warning for event_control The cgroup_event_wake() function is called with the wait queue head locked and it takes cgrp->event_list_lock. However, in cgroup_rmdir() remove_wait_queue() was being called after taking cgrp->event_list_lock. Correct the lock ordering by using a temporary list to obtain the event list to remove from the wait queue. Signed-off-by: Greg Thelen Signed-off-by: Aaron Durbin Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02b05560d1..589433f7a74 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4277,6 +4277,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) DEFINE_WAIT(wait); struct cgroup_event *event, *tmp; struct cgroup_subsys *ss; + LIST_HEAD(tmp_list); lockdep_assert_held(&d->d_inode->i_mutex); lockdep_assert_held(&cgroup_mutex); @@ -4331,16 +4332,20 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup - * directory to avoid race between userspace and kernelspace + * directory to avoid race between userspace and kernelspace. Use + * a temporary list to avoid a deadlock with cgroup_event_wake(). Since + * cgroup_event_wake() is called with the wait queue head locked, + * remove_wait_queue() cannot be called while holding event_list_lock. */ spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_splice_init(&cgrp->event_list, &tmp_list); + spin_unlock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &tmp_list, list) { list_del(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); } - spin_unlock(&cgrp->event_list_lock); return 0; } -- cgit v1.2.3 From 9718ceb3431acdeee9bdec5c18e18266333970aa Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Wed, 28 Nov 2012 13:50:45 -0800 Subject: cgroup: list_del_init() on removed events Use list_del_init() rather than list_del() to remove events from cgrp->event_list. No functional change. This is just defensive coding. Signed-off-by: Greg Thelen Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 589433f7a74..46dcdbd7485 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3767,7 +3767,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, if (flags & POLLHUP) { __remove_wait_queue(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); - list_del(&event->list); + list_del_init(&event->list); spin_unlock(&cgrp->event_list_lock); /* * We are in atomic context, but cgroup_event_remove() may @@ -4341,7 +4341,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_splice_init(&cgrp->event_list, &tmp_list); spin_unlock(&cgrp->event_list_lock); list_for_each_entry_safe(event, tmp, &tmp_list, list) { - list_del(&event->list); + list_del_init(&event->list); remove_wait_queue(event->wqh, &event->wait); eventfd_signal(event->eventfd, 1); schedule_work(&event->remove); -- cgit v1.2.3 From 1f869e8711d18aaf6e2979922bc9377ad394b82f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 30 Nov 2012 17:31:23 +0400 Subject: cgroup: warn about broken hierarchies only after css_online If everything goes right, it shouldn't really matter if we are spitting this warning after css_alloc or css_online. If we fail between then, there are some ill cases where we would previously see the message and now we won't (like if the files fail to be created). I believe it really shouldn't matter: this message is intended in spirit to be shown when creation succeeds, but with insane settings. Signed-off-by: Glauber Costa Signed-off-by: Tejun Heo --- kernel/cgroup.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 46dcdbd7485..b186a7e25b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4151,15 +4151,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err) goto err_free_all; } - - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - parent->parent) { - pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); - ss->warned_broken_hierarchy = true; - } } /* @@ -4188,6 +4179,15 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = online_css(ss, cgrp); if (err) goto err_destroy; + + if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && + parent->parent) { + pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", + current->comm, current->pid, ss->name); + if (!strcmp(ss->name, "memory")) + pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); + ss->warned_broken_hierarchy = true; + } } err = cgroup_populate_dir(cgrp, true, root->subsys_mask); -- cgit v1.2.3 From 879a3d9dbbde823ac77d39131e7a287f31b8296f Mon Sep 17 00:00:00 2001 From: Gao feng Date: Sat, 1 Dec 2012 00:21:28 +0800 Subject: cgroup: use cgroup_addrm_files() in cgroup_clear_directory() cgroup_clear_directory() incorrectly invokes cgroup_rm_file() on each cftset of the target subsystems, which only removes the first file of each set. This leaves dangling files after subsystems are removed from a cgroup root via remount. Use cgroup_addrm_files() to remove all files of target subsystems. tj: Move cgroup_addrm_files() prototype decl upwards next to other global declarations. Commit message updated. Signed-off-by: Gao feng Signed-off-by: Tejun Heo --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b186a7e25b0..e1293a9189b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -246,6 +246,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); static int need_forkexit_callback __read_mostly; static int cgroup_destroy_locked(struct cgroup *cgrp); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, + struct cftype cfts[], bool is_add); #ifdef CONFIG_PROVE_LOCKING int cgroup_lock_is_held(void) @@ -964,7 +966,7 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, if (!test_bit(ss->subsys_id, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_rm_file(cgrp, set->cfts); + cgroup_addrm_files(cgrp, NULL, set->cfts, false); } if (base_files) { while (!list_empty(&cgrp->files)) -- cgit v1.2.3 From 7083d0378a1746f2b45729cae494c6b92e75d73f Mon Sep 17 00:00:00 2001 From: Gao feng Date: Mon, 3 Dec 2012 09:28:18 +0800 Subject: cgroup: remove subsystem files when remounting cgroup cgroup_clear_directroy is called by cgroup_d_remove_dir and cgroup_remount. when we call cgroup_remount to remount the cgroup,the subsystem may be unlinked from cgroupfs_root->subsys_list in rebind_subsystem,this subsystem's files will not be removed in cgroup_clear_directroy. And the system will panic when we try to access these files. this patch removes subsystems's files before rebind_subsystems, if rebind_subsystems failed, repopulate these removed files. With help from Tejun. Signed-off-by: Gao feng Signed-off-by: Tejun Heo --- kernel/cgroup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e1293a9189b..5cc37241a6f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1349,14 +1349,21 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* + * Clear out the files of subsystems that should be removed, do + * this before rebind_subsystems, since rebind_subsystems may + * change this hierarchy's subsys_list. + */ + cgroup_clear_directory(cgrp->dentry, false, removed_mask); + ret = rebind_subsystems(root, opts.subsys_mask); if (ret) { + /* rebind_subsystems failed, re-populate the removed files */ + cgroup_populate_dir(cgrp, false, removed_mask); drop_parsed_module_refcounts(opts.subsys_mask); goto out_unlock; } - /* clear out any existing files and repopulate subsystem files */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); /* re-populate subsystem files */ cgroup_populate_dir(cgrp, false, added_mask); -- cgit v1.2.3 From f33fddc2b9573d8359f1007d4bbe5cd587a0c093 Mon Sep 17 00:00:00 2001 From: Gao feng Date: Thu, 6 Dec 2012 14:38:57 +0800 Subject: cgroup_rm_file: don't delete the uncreated files in cgroup_add_file,when creating files for cgroup, some of creation may be skipped. So we need to avoid deleting these uncreated files in cgroup_rm_file, otherwise the warning msg will be triggered. "cgroup_addrm_files: failed to remove memory_pressure_enabled, err=-2" Signed-off-by: Gao feng Acked-by: Li Zefan Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cgroup.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5cc37241a6f..f34c41bfaa3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2724,12 +2724,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, simple_xattrs_init(&cft->xattrs); - /* does @cft->flags tell us to skip creation on @cgrp? */ - if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) - return 0; - if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) - return 0; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { strcpy(name, subsys->name); strcat(name, "."); @@ -2770,6 +2764,12 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, int err, ret = 0; for (cft = cfts; cft->name[0] != '\0'; cft++) { + /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) + continue; + if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) + continue; + if (is_add) err = cgroup_add_file(cgrp, subsys, cft); else -- cgit v1.2.3