aboutsummaryrefslogtreecommitdiff
path: root/kernel/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup.c')
-rw-r--r--kernel/cgroup.c68
1 files changed, 56 insertions, 12 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7c9e6ddb97..d0def7fc284 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -92,6 +92,14 @@ static DEFINE_MUTEX(cgroup_mutex);
static DEFINE_MUTEX(cgroup_root_mutex);
/*
+ * cgroup destruction makes heavy use of work items and there can be a lot
+ * of concurrent destructions. Use a separate workqueue so that cgroup
+ * destruction work items don't end up filling up max_active of system_wq
+ * which may lead to deadlock.
+ */
+static struct workqueue_struct *cgroup_destroy_wq;
+
+/*
* Generate an array of cgroup subsystem pointers. At boot time, this is
* populated with the built in subsystems, and modular subsystems are
* registered after that. The mutable section of this array is protected by
@@ -873,7 +881,7 @@ static void cgroup_free_rcu(struct rcu_head *head)
{
struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
- schedule_work(&cgrp->free_work);
+ queue_work(cgroup_destroy_wq, &cgrp->free_work);
}
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1995,7 +2003,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
- continue;
+ goto next;
/* as per above, nr_threads may decrease, but not increase. */
BUG_ON(i >= group_size);
@@ -2003,7 +2011,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
ent.cgrp = task_cgroup_from_root(tsk, root);
/* nothing to do if this task is already in the cgroup */
if (ent.cgrp == cgrp)
- continue;
+ goto next;
/*
* saying GFP_ATOMIC has no effect here because we did prealloc
* earlier, but it's good form to communicate our expectations.
@@ -2011,7 +2019,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk,
retval = flex_array_put(group, i, &ent, GFP_ATOMIC);
BUG_ON(retval != 0);
i++;
-
+ next:
if (!threadgroup)
break;
} while_each_thread(leader, tsk);
@@ -2769,13 +2777,17 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
{
LIST_HEAD(pending);
struct cgroup *cgrp, *n;
+ struct super_block *sb = ss->root->sb;
/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
- if (cfts && ss->root != &rootnode) {
+ if (cfts && ss->root != &rootnode &&
+ atomic_inc_not_zero(&sb->s_active)) {
list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
dget(cgrp->dentry);
list_add_tail(&cgrp->cft_q_node, &pending);
}
+ } else {
+ sb = NULL;
}
mutex_unlock(&cgroup_mutex);
@@ -2798,6 +2810,9 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
dput(cgrp->dentry);
}
+ if (sb)
+ deactivate_super(sb);
+
mutex_unlock(&cgroup_cft_mutex);
}
@@ -3727,6 +3742,23 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp,
}
/*
+ * When dput() is called asynchronously, if umount has been done and
+ * then deactivate_super() in cgroup_free_fn() kills the superblock,
+ * there's a small window that vfs will see the root dentry with non-zero
+ * refcnt and trigger BUG().
+ *
+ * That's why we hold a reference before dput() and drop it right after.
+ */
+static void cgroup_dput(struct cgroup *cgrp)
+{
+ struct super_block *sb = cgrp->root->sb;
+
+ atomic_inc(&sb->s_active);
+ dput(cgrp->dentry);
+ deactivate_super(sb);
+}
+
+/*
* Unregister event and free resources.
*
* Gets called from workqueue.
@@ -3746,7 +3778,7 @@ static void cgroup_event_remove(struct work_struct *work)
eventfd_ctx_put(event->eventfd);
kfree(event);
- dput(cgrp->dentry);
+ cgroup_dput(cgrp);
}
/*
@@ -4031,12 +4063,8 @@ static void css_dput_fn(struct work_struct *work)
{
struct cgroup_subsys_state *css =
container_of(work, struct cgroup_subsys_state, dput_work);
- struct dentry *dentry = css->cgroup->dentry;
- struct super_block *sb = dentry->d_sb;
- atomic_inc(&sb->s_active);
- dput(dentry);
- deactivate_super(sb);
+ cgroup_dput(css->cgroup);
}
static void init_cgroup_css(struct cgroup_subsys_state *css,
@@ -4666,6 +4694,22 @@ out:
return err;
}
+static int __init cgroup_wq_init(void)
+{
+ /*
+ * There isn't much point in executing destruction path in
+ * parallel. Good chunk is serialized with cgroup_mutex anyway.
+ * Use 1 for @max_active.
+ *
+ * We would prefer to do this in cgroup_init() above, but that
+ * is called before init_workqueues(): so leave this until after.
+ */
+ cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ BUG_ON(!cgroup_destroy_wq);
+ return 0;
+}
+core_initcall(cgroup_wq_init);
+
/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4976,7 +5020,7 @@ void __css_put(struct cgroup_subsys_state *css)
v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
if (v == 0)
- schedule_work(&css->dput_work);
+ queue_work(cgroup_destroy_wq, &css->dput_work);
}
EXPORT_SYMBOL_GPL(__css_put);