diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/bio.c | 37 | ||||
-rw-r--r-- | block/blk-cgroup.c | 754 | ||||
-rw-r--r-- | block/blk-cgroup.h | 603 | ||||
-rw-r--r-- | block/blk-core.c | 76 | ||||
-rw-r--r-- | block/blk-integrity.c | 1 | ||||
-rw-r--r-- | block/blk-sysfs.c | 4 | ||||
-rw-r--r-- | block/blk-throttle.c | 509 | ||||
-rw-r--r-- | block/blk.h | 5 | ||||
-rw-r--r-- | block/bounce.c | 8 | ||||
-rw-r--r-- | block/cfq-iosched.c | 734 | ||||
-rw-r--r-- | block/elevator.c | 4 | ||||
-rw-r--r-- | block/genhd.c | 1 |
12 files changed, 1227 insertions, 1509 deletions
diff --git a/block/bio.c b/block/bio.c index cbce3e2208f4..d2501a479256 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1983,6 +1983,29 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_ EXPORT_SYMBOL(bioset_create_nobvec); #ifdef CONFIG_BLK_CGROUP + +/** + * bio_associate_blkcg - associate a bio with the specified blkcg + * @bio: target bio + * @blkcg_css: css of the blkcg to associate + * + * Associate @bio with the blkcg specified by @blkcg_css. Block layer will + * treat @bio as if it were issued by a task which belongs to the blkcg. + * + * This function takes an extra reference of @blkcg_css which will be put + * when @bio is released. The caller must own @bio and is responsible for + * synchronizing calls to this function. + */ +int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +{ + if (unlikely(bio->bi_css)) + return -EBUSY; + css_get(blkcg_css); + bio->bi_css = blkcg_css; + return 0; +} +EXPORT_SYMBOL_GPL(bio_associate_blkcg); + /** * bio_associate_current - associate a bio with %current * @bio: target bio @@ -1999,28 +2022,20 @@ EXPORT_SYMBOL(bioset_create_nobvec); int bio_associate_current(struct bio *bio) { struct io_context *ioc; - struct cgroup_subsys_state *css; - if (bio->bi_ioc) + if (bio->bi_css) return -EBUSY; ioc = current->io_context; if (!ioc) return -ENOENT; - /* acquire active ref on @ioc and associate */ get_io_context_active(ioc); bio->bi_ioc = ioc; - - /* associate blkcg if exists */ - rcu_read_lock(); - css = task_css(current, blkio_cgrp_id); - if (css && css_tryget_online(css)) - bio->bi_css = css; - rcu_read_unlock(); - + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } +EXPORT_SYMBOL_GPL(bio_associate_current); /** * bio_disassociate_task - undo bio_associate_current() diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 6817e28960b7..66e6f1aae02e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -9,29 +9,46 @@ * * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> * Nauman Rafique <nauman@google.com> + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> + * Arianna Avanzini <avanzini.arianna@gmail.com> */ #include <linux/ioprio.h> #include <linux/kdev_t.h> #include <linux/module.h> #include <linux/err.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/slab.h> #include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> -#include "blk-cgroup.h" +#include <linux/ctype.h> +#include <linux/blk-cgroup.h> #include "blk.h" #define MAX_KEY_LEN 100 +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT, - .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, }; +struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; + static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { @@ -52,9 +69,14 @@ static void blkg_free(struct blkcg_gq *blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) - kfree(blkg->pd[i]); + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - blk_exit_rl(&blkg->rl); + if (blkg->blkcg != &blkcg_root) + blk_exit_rl(&blkg->rl); + + blkg_rwstat_exit(&blkg->stat_ios); + blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } @@ -77,6 +99,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, if (!blkg) return NULL; + if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) || + blkg_rwstat_init(&blkg->stat_ios, gfp_mask)) + goto err_free; + blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; @@ -97,7 +123,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, continue; /* alloc per-policy data and attach it to blkg */ - pd = kzalloc_node(pol->pd_size, gfp_mask, q->node); + pd = pol->pd_alloc_fn(gfp_mask, q->node); if (!pd) goto err_free; @@ -113,26 +139,11 @@ err_free: return NULL; } -/** - * __blkg_lookup - internal version of blkg_lookup() - * @blkcg: blkcg of interest - * @q: request_queue of interest - * @update_hint: whether to update lookup hint with the result or not - * - * This is internal version and shouldn't be used by policy - * implementations. Looks up blkgs for the @blkcg - @q pair regardless of - * @q's bypass state. If @update_hint is %true, the caller should be - * holding @q->queue_lock and lookup hint is updated on success. - */ -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint) +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; - blkg = rcu_dereference(blkcg->blkg_hint); - if (blkg && blkg->q == q) - return blkg; - /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg @@ -150,35 +161,18 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, return NULL; } - -/** - * blkg_lookup - lookup blkg for the specified blkcg - q pair - * @blkcg: blkcg of interest - * @q: request_queue of interest - * - * Lookup blkg for the @blkcg - @q pair. This function should be called - * under RCU read lock and is guaranteed to return %NULL if @q is bypassing - * - see blk_queue_bypass_start() for details. - */ -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - if (unlikely(blk_queue_bypass(q))) - return NULL; - return __blkg_lookup(blkcg, q, false); -} -EXPORT_SYMBOL_GPL(blkg_lookup); +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as - * necessary using %GFP_ATOMIC. @new_blkg is always consumed on return. + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; + struct bdi_writeback_congested *wb_congested; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -186,26 +180,34 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { - ret = -EINVAL; + ret = -ENODEV; goto err_free_blkg; } + wb_congested = wb_congested_get_create(&q->backing_dev_info, + blkcg->css.id, GFP_NOWAIT); + if (!wb_congested) { + ret = -ENOMEM; + goto err_put_css; + } + /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC); + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_css; + goto err_put_congested; } } blkg = new_blkg; + blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { - ret = -EINVAL; - goto err_put_css; + ret = -ENODEV; + goto err_put_congested; } blkg_get(blkg->parent); } @@ -215,7 +217,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) - pol->pd_init_fn(blkg); + pol->pd_init_fn(blkg->pd[i]); } /* insert */ @@ -229,24 +231,21 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg); + pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; spin_unlock(&blkcg->lock); - if (!ret) { - if (blkcg == &blkcg_root) { - q->root_blkg = blkg; - q->root_rl.blkg = blkg; - } + if (!ret) return blkg; - } /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); +err_put_congested: + wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: @@ -281,7 +280,7 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, * we shouldn't allow anything to go through for a bypassing queue. */ if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -EINVAL : -EBUSY); + return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -305,11 +304,11 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, return blkg; } } -EXPORT_SYMBOL_GPL(blkg_lookup_create); static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; + struct blkcg_gq *parent = blkg->parent; int i; lockdep_assert_held(blkg->q->queue_lock); @@ -323,8 +322,14 @@ static void blkg_destroy(struct blkcg_gq *blkg) struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg); + pol->pd_offline_fn(blkg->pd[i]); } + + if (parent) { + blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes); + blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios); + } + blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); @@ -340,15 +345,6 @@ static void blkg_destroy(struct blkcg_gq *blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* - * If root blkg is destroyed. Just clear the pointer since root_rl - * does not take reference on root blkg. - */ - if (blkcg == &blkcg_root) { - blkg->q->root_blkg = NULL; - blkg->q->root_rl.blkg = NULL; - } - - /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ @@ -374,6 +370,9 @@ static void blkg_destroy_all(struct request_queue *q) blkg_destroy(blkg); spin_unlock(&blkcg->lock); } + + q->root_blkg = NULL; + q->root_rl.blkg = NULL; } /* @@ -387,21 +386,14 @@ static void blkg_destroy_all(struct request_queue *q) void __blkg_release_rcu(struct rcu_head *rcu_head) { struct blkcg_gq *blkg = container_of(rcu_head, struct blkcg_gq, rcu_head); - int i; - - /* tell policies that this one is being freed */ - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - - if (blkg->pd[i] && pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - } /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); + wb_congested_put(blkg->wb_congested); + blkg_free(blkg); } EXPORT_SYMBOL_GPL(__blkg_release_rcu); @@ -448,20 +440,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkcg_gq *blkg; int i; - /* - * XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex - * which ends up putting cgroup's internal cgroup_tree_mutex under - * it; however, cgroup_tree_mutex is nested above cgroup file - * active protection and grabbing blkcg_pol_mutex from a cgroup - * file operation creates a possible circular dependency. cgroup - * internal locking is planned to go through further simplification - * and this issue should go away soon. For now, let's trylock - * blkcg_pol_mutex and restart the write on failure. - * - * http://lkml.kernel.org/g/5363C04B.4010400@oracle.com - */ - if (!mutex_trylock(&blkcg_pol_mutex)) - return restart_syscall(); + mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* @@ -470,12 +449,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, * anyway. If you get hit by a race, retry. */ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + blkg_rwstat_reset(&blkg->stat_bytes); + blkg_rwstat_reset(&blkg->stat_ios); + for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkcg_policy_enabled(blkg->q, pol) && - pol->pd_reset_stats_fn) - pol->pd_reset_stats_fn(blkg); + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); } } @@ -484,13 +465,14 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, return 0; } -static const char *blkg_dev_name(struct blkcg_gq *blkg) +const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ if (blkg->q->backing_dev_info.dev) return dev_name(blkg->q->backing_dev_info.dev); return NULL; } +EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -579,9 +561,10 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, for (i = 0; i < BLKG_RWSTAT_NR; i++) seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], - (unsigned long long)rwstat->cnt[i]); + (unsigned long long)atomic64_read(&rwstat->aux_cnt[i])); - v = rwstat->cnt[BLKG_RWSTAT_READ] + rwstat->cnt[BLKG_RWSTAT_WRITE]; + v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]); seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v); return v; } @@ -618,31 +601,122 @@ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, } EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); +static u64 blkg_prfill_rwstat_field(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off); + + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_bytes. + * cftype->private must be set to the blkcg_policy. + */ +int blkg_print_stat_bytes(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes); + +/** + * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios + * @sf: seq_file to print to + * @v: unused + * + * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private + * must be set to the blkcg_policy. + */ +int blkg_print_stat_ios(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios); + +static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg, + NULL, off); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} + +/** + * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_bytes), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive); + +/** + * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios + * @sf: seq_file to print to + * @v: unused + */ +int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat_field_recursive, + (void *)seq_cft(sf)->private, + offsetof(struct blkcg_gq, stat_ios), true); + return 0; +} +EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive); + /** * blkg_stat_recursive_sum - collect hierarchical blkg_stat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_stat + * @off: offset to the blkg_stat in blkg_policy_data or @blkg * - * Collect the blkg_stat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_stat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is + * at @off bytes into @blkg's blkg_policy_data of the policy. */ -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) +u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_stat *stat = (void *)pos_pd + off; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_stat *stat; + + if (!pos_blkg->online) + continue; + + if (pol) + stat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + stat = (void *)blkg + off; - if (pos_blkg->online) - sum += blkg_stat_read(stat); + sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt); } rcu_read_unlock(); @@ -652,37 +726,43 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum); /** * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat - * @pd: policy private data of interest - * @off: offset to the blkg_stat in @pd + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg * - * Collect the blkg_rwstat specified by @off from @pd and all its online - * descendants and return the sum. The caller must be holding the queue - * lock for online tests. + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. */ -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off) +struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, + struct blkcg_policy *pol, int off) { - struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(pd->blkg->q->queue_lock); + lockdep_assert_held(blkg->q->queue_lock); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { - struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); - struct blkg_rwstat *rwstat = (void *)pos_pd + off; - struct blkg_rwstat tmp; + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; if (!pos_blkg->online) continue; - tmp = blkg_rwstat_read(rwstat); + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum.cnt[i] += tmp.cnt[i]; + atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]), + &sum.aux_cnt[i]); } rcu_read_unlock(); @@ -698,29 +778,37 @@ EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->v the new - * value. This function returns with RCU read lock and queue lock held and - * must be paired with blkg_conf_finish(). + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx) + char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(disk->queue->queue_lock) { struct gendisk *disk; struct blkcg_gq *blkg; + struct module *owner; unsigned int major, minor; - unsigned long long v; - int part, ret; + int key_len, part, ret; + char *body; - if (sscanf(input, "%u:%u %llu", &major, &minor, &v) != 3) + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return -EINVAL; + body = input + key_len; + if (!isspace(*body)) + return -EINVAL; + body = skip_spaces(body); + disk = get_gendisk(MKDEV(major, minor), &part); if (!disk) - return -EINVAL; + return -ENODEV; if (part) { + owner = disk->fops->owner; put_disk(disk); - return -EINVAL; + module_put(owner); + return -ENODEV; } rcu_read_lock(); @@ -729,13 +817,15 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (blkcg_policy_enabled(disk->queue, pol)) blkg = blkg_lookup_create(blkcg, disk->queue); else - blkg = ERR_PTR(-EINVAL); + blkg = ERR_PTR(-EOPNOTSUPP); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); rcu_read_unlock(); spin_unlock_irq(disk->queue->queue_lock); + owner = disk->fops->owner; put_disk(disk); + module_put(owner); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -751,7 +841,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, ctx->disk = disk; ctx->blkg = blkg; - ctx->v = v; + ctx->body = body; return 0; } EXPORT_SYMBOL_GPL(blkg_conf_prep); @@ -766,14 +856,66 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(ctx->disk->queue->queue_lock) __releases(rcu) { + struct module *owner; + spin_unlock_irq(ctx->disk->queue->queue_lock); rcu_read_unlock(); + owner = ctx->disk->fops->owner; put_disk(ctx->disk); + module_put(owner); } EXPORT_SYMBOL_GPL(blkg_conf_finish); +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + const char *dname; + struct blkg_rwstat rwstat; + u64 rbytes, wbytes, rios, wios; + + dname = blkg_dev_name(blkg); + if (!dname) + continue; + + spin_lock_irq(blkg->q->queue_lock); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + rwstat = blkg_rwstat_recursive_sum(blkg, NULL, + offsetof(struct blkcg_gq, stat_ios)); + rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]); + wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); + + spin_unlock_irq(blkg->q->queue_lock); + + if (rbytes || wbytes || rios || wios) + seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n", + dname, rbytes, wbytes, rios, wios); + } + + rcu_read_unlock(); + return 0; +} + struct cftype blkcg_files[] = { { + .name = "stat", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +struct cftype blkcg_legacy_files[] = { + { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, @@ -813,38 +955,91 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) } spin_unlock_irq(&blkcg->lock); + + wb_blkcg_offline(blkcg); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); + int i; + + mutex_lock(&blkcg_pol_mutex); + + list_del(&blkcg->all_blkcgs_node); + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - if (blkcg != &blkcg_root) - kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; + + mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; - goto done; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto free_blkcg; + } } - blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) - return ERR_PTR(-ENOMEM); + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_alloc_fn) + continue; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; + cpd->plid = i; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } - blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT; - blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT; -done: spin_lock_init(&blkcg->lock); - INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT); INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); +free_blkcg: + kfree(blkcg); + mutex_unlock(&blkcg_pol_mutex); + return ret; } /** @@ -859,9 +1054,45 @@ done: */ int blkcg_init_queue(struct request_queue *q) { - might_sleep(); + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); - return blk_throtl_init(q); + /* + * Make sure the root blkg exists and count the existing blkgs. As + * @q is bypassing at this point, blkg_lookup_create() can't be + * used. Open code insertion. + */ + rcu_read_lock(); + spin_lock_irq(q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + spin_unlock_irq(q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); + + if (IS_ERR(blkg)) { + blkg_free(new_blkg); + return PTR_ERR(blkg); + } + + q->root_blkg = blkg; + q->root_rl.blkg = blkg; + + ret = blk_throtl_init(q); + if (ret) { + spin_lock_irq(q->queue_lock); + blkg_destroy_all(q); + spin_unlock_irq(q->queue_lock); + } + return ret; } /** @@ -905,15 +1136,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *dst_css; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) @@ -925,12 +1156,35 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, - .legacy_cftypes = blkcg_files, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -940,7 +1194,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -961,96 +1215,54 @@ EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { - LIST_HEAD(pds); - struct blkcg_gq *blkg, *new_blkg; - struct blkg_policy_data *pd, *n; - int cnt = 0, ret; - bool preloaded; + struct blkg_policy_data *pd_prealloc = NULL; + struct blkcg_gq *blkg; + int ret; if (blkcg_policy_enabled(q, pol)) return 0; - /* preallocations for root blkg */ - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); - if (!new_blkg) - return -ENOMEM; - blk_queue_bypass_start(q); - - preloaded = !radix_tree_preload(GFP_KERNEL); - - /* - * Make sure the root blkg exists and count the existing blkgs. As - * @q is bypassing at this point, blkg_lookup_create() can't be - * used. Open code it. - */ - spin_lock_irq(q->queue_lock); - - rcu_read_lock(); - blkg = __blkg_lookup(&blkcg_root, q, false); - if (blkg) - blkg_free(new_blkg); - else - blkg = blkg_create(&blkcg_root, q, new_blkg); - rcu_read_unlock(); - - if (preloaded) - radix_tree_preload_end(); - - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto out_unlock; - } - - list_for_each_entry(blkg, &q->blkg_list, q_node) - cnt++; - - spin_unlock_irq(q->queue_lock); - - /* allocate policy_data for all existing blkgs */ - while (cnt--) { - pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node); - if (!pd) { +pd_prealloc: + if (!pd_prealloc) { + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); + if (!pd_prealloc) { ret = -ENOMEM; - goto out_free; + goto out_bypass_end; } - list_add_tail(&pd->alloc_node, &pds); } - /* - * Install the allocated pds. With @q bypassing, no new blkg - * should have been created while the queue lock was dropped. - */ spin_lock_irq(q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (WARN_ON(list_empty(&pds))) { - /* umm... this shouldn't happen, just abort */ - ret = -ENOMEM; - goto out_unlock; - } - pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node); - list_del_init(&pd->alloc_node); + struct blkg_policy_data *pd; - /* grab blkcg lock too while installing @pd on @blkg */ - spin_lock(&blkg->blkcg->lock); + if (blkg->pd[pol->plid]) + continue; + + pd = pol->pd_alloc_fn(GFP_NOWAIT, q->node); + if (!pd) + swap(pd, pd_prealloc); + if (!pd) { + spin_unlock_irq(q->queue_lock); + goto pd_prealloc; + } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; - pol->pd_init_fn(blkg); - - spin_unlock(&blkg->blkcg->lock); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); } __set_bit(pol->plid, q->blkcg_pols); ret = 0; -out_unlock: + spin_unlock_irq(q->queue_lock); -out_free: +out_bypass_end: blk_queue_bypass_end(q); - list_for_each_entry_safe(pd, n, &pds, alloc_node) - kfree(pd); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); return ret; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); @@ -1076,21 +1288,16 @@ void blkcg_deactivate_policy(struct request_queue *q, __clear_bit(pol->plid, q->blkcg_pols); - /* if no policy is left, no need for blkgs - shoot them down */ - if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS)) - blkg_destroy_all(q); - list_for_each_entry(blkg, &q->blkg_list, q_node) { /* grab blkcg lock too while removing @pd from @blkg */ spin_lock(&blkg->blkcg->lock); - if (pol->pd_offline_fn) - pol->pd_offline_fn(blkg); - if (pol->pd_exit_fn) - pol->pd_exit_fn(blkg); - - kfree(blkg->pd[pol->plid]); - blkg->pd[pol->plid] = NULL; + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } spin_unlock(&blkg->blkcg->lock); } @@ -1109,11 +1316,10 @@ EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); */ int blkcg_policy_register(struct blkcg_policy *pol) { + struct blkcg *blkcg; int i, ret; - if (WARN_ON(pol->pd_size < sizeof(struct blkg_policy_data))) - return -EINVAL; - + mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ @@ -1122,19 +1328,55 @@ int blkcg_policy_register(struct blkcg_policy *pol) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) - goto out_unlock; + goto err_unlock; - /* register and update blkgs */ + /* register @pol */ pol->plid = i; - blkcg_policy[i] = pol; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + mutex_unlock(&blkcg_pol_mutex); + goto err_free_cpds; + } + + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; + cpd->plid = pol->plid; + pol->cpd_init_fn(cpd); + } + } + + mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ - if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, - pol->cftypes)); - ret = 0; -out_unlock: + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); @@ -1147,18 +1389,34 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - mutex_lock(&blkcg_pol_mutex); + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ - if (pol->cftypes) - cgroup_rm_cftypes(pol->cftypes); + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); + + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); - /* unregister and update blkgs */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } blkcg_policy[pol->plid] = NULL; -out_unlock: + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h deleted file mode 100644 index c567865b5f1d..000000000000 --- a/block/blk-cgroup.h +++ /dev/null @@ -1,603 +0,0 @@ -#ifndef _BLK_CGROUP_H -#define _BLK_CGROUP_H -/* - * Common Block IO controller cgroup interface - * - * Based on ideas and code from CFQ, CFS and BFQ: - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - * - * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> - * Paolo Valente <paolo.valente@unimore.it> - * - * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> - * Nauman Rafique <nauman@google.com> - */ - -#include <linux/cgroup.h> -#include <linux/u64_stats_sync.h> -#include <linux/seq_file.h> -#include <linux/radix-tree.h> -#include <linux/blkdev.h> -#include <linux/atomic.h> - -/* Max limits for throttle policy */ -#define THROTL_IOPS_MAX UINT_MAX - -/* CFQ specific, out here for blkcg->cfq_weight */ -#define CFQ_WEIGHT_MIN 10 -#define CFQ_WEIGHT_MAX 1000 -#define CFQ_WEIGHT_DEFAULT 500 - -#ifdef CONFIG_BLK_CGROUP - -enum blkg_rwstat_type { - BLKG_RWSTAT_READ, - BLKG_RWSTAT_WRITE, - BLKG_RWSTAT_SYNC, - BLKG_RWSTAT_ASYNC, - - BLKG_RWSTAT_NR, - BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, -}; - -struct blkcg_gq; - -struct blkcg { - struct cgroup_subsys_state css; - spinlock_t lock; - - struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; - struct hlist_head blkg_list; - - /* TODO: per-policy storage in blkcg */ - unsigned int cfq_weight; /* belongs to cfq */ - unsigned int cfq_leaf_weight; -}; - -struct blkg_stat { - struct u64_stats_sync syncp; - uint64_t cnt; -}; - -struct blkg_rwstat { - struct u64_stats_sync syncp; - uint64_t cnt[BLKG_RWSTAT_NR]; -}; - -/* - * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a - * request_queue (q). This is used by blkcg policies which need to track - * information per blkcg - q pair. - * - * There can be multiple active blkcg policies and each has its private - * data on each blkg, the size of which is determined by - * blkcg_policy->pd_size. blkcg core allocates and frees such areas - * together with blkg and invokes pd_init/exit_fn() methods. - * - * Such private data must embed struct blkg_policy_data (pd) at the - * beginning and pd_size can't be smaller than pd. - */ -struct blkg_policy_data { - /* the blkg and policy id this per-policy data belongs to */ - struct blkcg_gq *blkg; - int plid; - - /* used during policy activation */ - struct list_head alloc_node; -}; - -/* association between a blk cgroup and a request queue */ -struct blkcg_gq { - /* Pointer to the associated request_queue */ - struct request_queue *q; - struct list_head q_node; - struct hlist_node blkcg_node; - struct blkcg *blkcg; - - /* all non-root blkcg_gq's are guaranteed to have access to parent */ - struct blkcg_gq *parent; - - /* request allocation list for this blkcg-q pair */ - struct request_list rl; - - /* reference count */ - atomic_t refcnt; - - /* is this blkg online? protected by both blkcg and q locks */ - bool online; - - struct blkg_policy_data *pd[BLKCG_MAX_POLS]; - - struct rcu_head rcu_head; -}; - -typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg); -typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg); - -struct blkcg_policy { - int plid; - /* policy specific private data size */ - size_t pd_size; - /* cgroup files for the policy */ - struct cftype *cftypes; - - /* operations */ - blkcg_pol_init_pd_fn *pd_init_fn; - blkcg_pol_online_pd_fn *pd_online_fn; - blkcg_pol_offline_pd_fn *pd_offline_fn; - blkcg_pol_exit_pd_fn *pd_exit_fn; - blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; -}; - -extern struct blkcg blkcg_root; - -struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q); -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q); -int blkcg_init_queue(struct request_queue *q); -void blkcg_drain_queue(struct request_queue *q); -void blkcg_exit_queue(struct request_queue *q); - -/* Blkio controller policy registration */ -int blkcg_policy_register(struct blkcg_policy *pol); -void blkcg_policy_unregister(struct blkcg_policy *pol); -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol); -void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol); - -void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, - u64 (*prfill)(struct seq_file *, - struct blkg_policy_data *, int), - const struct blkcg_policy *pol, int data, - bool show_total); -u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); -u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - const struct blkg_rwstat *rwstat); -u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off); -u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, - int off); - -u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off); -struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, - int off); - -struct blkg_conf_ctx { - struct gendisk *disk; - struct blkcg_gq *blkg; - u64 v; -}; - -int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - const char *input, struct blkg_conf_ctx *ctx); -void blkg_conf_finish(struct blkg_conf_ctx *ctx); - - -static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct blkcg, css) : NULL; -} - -static inline struct blkcg *task_blkcg(struct task_struct *tsk) -{ - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); -} - -static inline struct blkcg *bio_blkcg(struct bio *bio) -{ - if (bio && bio->bi_css) - return css_to_blkcg(bio->bi_css); - return task_blkcg(current); -} - -/** - * blkcg_parent - get the parent of a blkcg - * @blkcg: blkcg of interest - * - * Return the parent blkcg of @blkcg. Can be called anytime. - */ -static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) -{ - return css_to_blkcg(blkcg->css.parent); -} - -/** - * blkg_to_pdata - get policy private data - * @blkg: blkg of interest - * @pol: policy of interest - * - * Return pointer to private data associated with the @blkg-@pol pair. - */ -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) -{ - return blkg ? blkg->pd[pol->plid] : NULL; -} - -/** - * pdata_to_blkg - get blkg associated with policy private data - * @pd: policy private data of interest - * - * @pd is policy private data. Determine the blkg it's associated with. - */ -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) -{ - return pd ? pd->blkg : NULL; -} - -/** - * blkg_path - format cgroup path of blkg - * @blkg: blkg of interest - * @buf: target buffer - * @buflen: target buffer length - * - * Format the path of the cgroup of @blkg into @buf. - */ -static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) -{ - char *p; - - p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - if (!p) { - strncpy(buf, "<unavailable>", buflen); - return -ENAMETOOLONG; - } - - memmove(buf, p, buf + buflen - p); - return 0; -} - -/** - * blkg_get - get a blkg reference - * @blkg: blkg to get - * - * The caller should be holding an existing reference. - */ -static inline void blkg_get(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - atomic_inc(&blkg->refcnt); -} - -void __blkg_release_rcu(struct rcu_head *rcu); - -/** - * blkg_put - put a blkg reference - * @blkg: blkg to put - */ -static inline void blkg_put(struct blkcg_gq *blkg) -{ - WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0); - if (atomic_dec_and_test(&blkg->refcnt)) - call_rcu(&blkg->rcu_head, __blkg_release_rcu); -} - -struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, - bool update_hint); - -/** - * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU - * read locked. If called under either blkcg or queue lock, the iteration - * is guaranteed to include all and only online blkgs. The caller may - * update @pos_css by calling css_rightmost_descendant() to skip subtree. - * @p_blkg is included in the iteration and the first node to be visited. - */ -#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blkg_for_each_descendant_post - post-order walk of a blkg's descendants - * @d_blkg: loop cursor pointing to the current descendant - * @pos_css: used for iteration - * @p_blkg: target blkg to walk descendants of - * - * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. @p_blkg is - * included in the iteration and the last node to be visited. - */ -#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ - css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ - if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ - (p_blkg)->q, false))) - -/** - * blk_get_rl - get request_list to use - * @q: request_queue of interest - * @bio: bio which will be attached to the allocated request (may be %NULL) - * - * The caller wants to allocate a request from @q to use for @bio. Find - * the request_list to use and obtain a reference on it. Should be called - * under queue_lock. This function is guaranteed to return non-%NULL - * request_list. - */ -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) -{ - struct blkcg *blkcg; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - - /* bypass blkg lookup and use @q->root_rl directly for root */ - if (blkcg == &blkcg_root) - goto root_rl; - - /* - * Try to use blkg->rl. blkg lookup may fail under memory pressure - * or if either the blkcg or queue is going away. Fall back to - * root_rl in such cases. - */ - blkg = blkg_lookup_create(blkcg, q); - if (unlikely(IS_ERR(blkg))) - goto root_rl; - - blkg_get(blkg); - rcu_read_unlock(); - return &blkg->rl; -root_rl: - rcu_read_unlock(); - return &q->root_rl; -} - -/** - * blk_put_rl - put request_list - * @rl: request_list to put - * - * Put the reference acquired by blk_get_rl(). Should be called under - * queue_lock. - */ -static inline void blk_put_rl(struct request_list *rl) -{ - /* root_rl may not have blkg set */ - if (rl->blkg && rl->blkg->blkcg != &blkcg_root) - blkg_put(rl->blkg); -} - -/** - * blk_rq_set_rl - associate a request with a request_list - * @rq: request of interest - * @rl: target request_list - * - * Associate @rq with @rl so that accounting and freeing can know the - * request_list @rq came from. - */ -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) -{ - rq->rl = rl; -} - -/** - * blk_rq_rl - return the request_list a request came from - * @rq: request of interest - * - * Return the request_list @rq is allocated from. - */ -static inline struct request_list *blk_rq_rl(struct request *rq) -{ - return rq->rl; -} - -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q); -/** - * blk_queue_for_each_rl - iterate through all request_lists of a request_queue - * - * Should be used under queue_lock. - */ -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q))) - -static inline void blkg_stat_init(struct blkg_stat *stat) -{ - u64_stats_init(&stat->syncp); -} - -/** - * blkg_stat_add - add a value to a blkg_stat - * @stat: target blkg_stat - * @val: value to add - * - * Add @val to @stat. The caller is responsible for synchronizing calls to - * this function. - */ -static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val) -{ - u64_stats_update_begin(&stat->syncp); - stat->cnt += val; - u64_stats_update_end(&stat->syncp); -} - -/** - * blkg_stat_read - read the current value of a blkg_stat - * @stat: blkg_stat to read - * - * Read the current value of @stat. This function can be called without - * synchroniztion and takes care of u64 atomicity. - */ -static inline uint64_t blkg_stat_read(struct blkg_stat *stat) -{ - unsigned int start; - uint64_t v; - - do { - start = u64_stats_fetch_begin_irq(&stat->syncp); - v = stat->cnt; - } while (u64_stats_fetch_retry_irq(&stat->syncp, start)); - - return v; -} - -/** - * blkg_stat_reset - reset a blkg_stat - * @stat: blkg_stat to reset - */ -static inline void blkg_stat_reset(struct blkg_stat *stat) -{ - stat->cnt = 0; -} - -/** - * blkg_stat_merge - merge a blkg_stat into another - * @to: the destination blkg_stat - * @from: the source - * - * Add @from's count to @to. - */ -static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from) -{ - blkg_stat_add(to, blkg_stat_read(from)); -} - -static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat) -{ - u64_stats_init(&rwstat->syncp); -} - -/** - * blkg_rwstat_add - add a value to a blkg_rwstat - * @rwstat: target blkg_rwstat - * @rw: mask of REQ_{WRITE|SYNC} - * @val: value to add - * - * Add @val to @rwstat. The counters are chosen according to @rw. The - * caller is responsible for synchronizing calls to this function. - */ -static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - int rw, uint64_t val) -{ - u64_stats_update_begin(&rwstat->syncp); - - if (rw & REQ_WRITE) - rwstat->cnt[BLKG_RWSTAT_WRITE] += val; - else - rwstat->cnt[BLKG_RWSTAT_READ] += val; - if (rw & REQ_SYNC) - rwstat->cnt[BLKG_RWSTAT_SYNC] += val; - else - rwstat->cnt[BLKG_RWSTAT_ASYNC] += val; - - u64_stats_update_end(&rwstat->syncp); -} - -/** - * blkg_rwstat_read - read the current values of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Read the current snapshot of @rwstat and return it as the return value. - * This function can be called without synchronization and takes care of - * u64 atomicity. - */ -static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat) -{ - unsigned int start; - struct blkg_rwstat tmp; - - do { - start = u64_stats_fetch_begin_irq(&rwstat->syncp); - tmp = *rwstat; - } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start)); - - return tmp; -} - -/** - * blkg_rwstat_total - read the total count of a blkg_rwstat - * @rwstat: blkg_rwstat to read - * - * Return the total count of @rwstat regardless of the IO direction. This - * function can be called without synchronization and takes care of u64 - * atomicity. - */ -static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) -{ - struct blkg_rwstat tmp = blkg_rwstat_read(rwstat); - - return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; -} - -/** - * blkg_rwstat_reset - reset a blkg_rwstat - * @rwstat: blkg_rwstat to reset - */ -static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) -{ - memset(rwstat->cnt, 0, sizeof(rwstat->cnt)); -} - -/** - * blkg_rwstat_merge - merge a blkg_rwstat into another - * @to: the destination blkg_rwstat - * @from: the source - * - * Add @from's counts to @to. - */ -static inline void blkg_rwstat_merge(struct blkg_rwstat *to, - struct blkg_rwstat *from) -{ - struct blkg_rwstat v = blkg_rwstat_read(from); - int i; - - u64_stats_update_begin(&to->syncp); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - to->cnt[i] += v.cnt[i]; - u64_stats_update_end(&to->syncp); -} - -#else /* CONFIG_BLK_CGROUP */ - -struct cgroup; -struct blkcg; - -struct blkg_policy_data { -}; - -struct blkcg_gq { -}; - -struct blkcg_policy { -}; - -static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } -static inline int blkcg_init_queue(struct request_queue *q) { return 0; } -static inline void blkcg_drain_queue(struct request_queue *q) { } -static inline void blkcg_exit_queue(struct request_queue *q) { } -static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } -static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } -static inline int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { return 0; } -static inline void blkcg_deactivate_policy(struct request_queue *q, - const struct blkcg_policy *pol) { } - -static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } - -static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, - struct blkcg_policy *pol) { return NULL; } -static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } -static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } -static inline void blkg_get(struct blkcg_gq *blkg) { } -static inline void blkg_put(struct blkcg_gq *blkg) { } - -static inline struct request_list *blk_get_rl(struct request_queue *q, - struct bio *bio) { return &q->root_rl; } -static inline void blk_put_rl(struct request_list *rl) { } -static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { } -static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; } - -#define blk_queue_for_each_rl(rl, q) \ - for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) - -#endif /* CONFIG_BLK_CGROUP */ -#endif /* _BLK_CGROUP_H */ diff --git a/block/blk-core.c b/block/blk-core.c index bbbf36e6066b..cd88cee201d1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -32,12 +32,12 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); @@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep; */ static struct workqueue_struct *kblockd_workqueue; +static void blk_clear_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + clear_wb_congested(rl->blkg->wb_congested, sync); +#else + /* + * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't + * flip its congestion state for events on other blkcgs. + */ + if (rl == &rl->q->root_rl) + clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + +static void blk_set_congested(struct request_list *rl, int sync) +{ +#ifdef CONFIG_CGROUP_WRITEBACK + set_wb_congested(rl->blkg->wb_congested, sync); +#else + /* see blk_clear_congested() */ + if (rl == &rl->q->root_rl) + set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); +#endif +} + void blk_queue_congestion_threshold(struct request_queue *q) { int nr; @@ -554,7 +579,7 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_destroy(&q->backing_dev_info); + bdi_unregister(&q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -623,8 +648,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.state = 0; - q->backing_dev_info.capabilities = 0; + q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info.name = "block"; q->node = node_id; @@ -847,13 +871,8 @@ static void __freed_request(struct request_list *rl, int sync) { struct request_queue *q = rl->q; - /* - * bdi isn't aware of blkcg yet. As all async IOs end up root - * blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl && - rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, sync); + if (rl->count[sync] < queue_congestion_off_threshold(q)) + blk_clear_congested(rl, sync); if (rl->count[sync] + 1 <= q->nr_requests) { if (waitqueue_active(&rl->wait[sync])) @@ -886,25 +905,25 @@ static void freed_request(struct request_list *rl, unsigned int flags) int blk_update_nr_requests(struct request_queue *q, unsigned int nr) { struct request_list *rl; + int on_thresh, off_thresh; spin_lock_irq(q->queue_lock); q->nr_requests = nr; blk_queue_congestion_threshold(q); + on_thresh = queue_congestion_on_threshold(q); + off_thresh = queue_congestion_off_threshold(q); - /* congestion isn't cgroup aware and follows root blkcg for now */ - rl = &q->root_rl; - - if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_SYNC); + blk_queue_for_each_rl(rl, q) { + if (rl->count[BLK_RW_SYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_SYNC); + else if (rl->count[BLK_RW_SYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_SYNC); - if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q)) - blk_set_queue_congested(q, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q)) - blk_clear_queue_congested(q, BLK_RW_ASYNC); + if (rl->count[BLK_RW_ASYNC] >= on_thresh) + blk_set_congested(rl, BLK_RW_ASYNC); + else if (rl->count[BLK_RW_ASYNC] < off_thresh) + blk_clear_congested(rl, BLK_RW_ASYNC); - blk_queue_for_each_rl(rl, q) { if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { blk_set_rl_full(rl, BLK_RW_SYNC); } else { @@ -1014,12 +1033,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags, } } } - /* - * bdi isn't aware of blkcg yet. As all async IOs end up - * root blkcg anyway, just use root blkcg state. - */ - if (rl == &q->root_rl) - blk_set_queue_congested(q, is_sync); + blk_set_congested(rl, is_sync); } /* @@ -1869,8 +1883,8 @@ generic_make_request_checks(struct bio *bio) */ create_io_context(GFP_ATOMIC, q->node); - if (blk_throtl_bio(q, bio)) - return false; /* throttled, will be resubmitted later */ + if (!blkcg_bio_issue_check(q, bio)) + return false; trace_block_bio_queue(q, bio); return true; diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 79ffb4855af0..f548b64be092 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -21,6 +21,7 @@ */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> #include <linux/scatterlist.h> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 2b8fd302f677..145ddb6c6d31 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -6,11 +6,12 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/blktrace_api.h> #include <linux/blk-mq.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" #include "blk-mq.h" struct queue_sysfs_entry { @@ -501,6 +502,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); + bdi_exit(&q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 5b9c6d5c3636..2149a1ddbacf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -9,7 +9,7 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include "blk-cgroup.h" +#include <linux/blk-cgroup.h> #include "blk.h" /* Max dispatch from a group in 1 round */ @@ -83,14 +83,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -/* Per-cpu group stats */ -struct tg_stats_cpu { - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -141,12 +133,6 @@ struct throtl_grp { /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - - /* Per cpu stats pointer */ - struct tg_stats_cpu __percpu *stats_cpu; - - /* List of tgs waiting for per cpu stats memory to be allocated */ - struct list_head stats_alloc_node; }; struct throtl_data @@ -168,13 +154,6 @@ struct throtl_data struct work_struct dispatch_work; }; -/* list and work item to allocate percpu group stats */ -static DEFINE_SPINLOCK(tg_stats_alloc_lock); -static LIST_HEAD(tg_stats_alloc_list); - -static void tg_stats_alloc_fn(struct work_struct *); -static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn); - static void throtl_pending_timer_fn(unsigned long arg); static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) @@ -192,11 +171,6 @@ static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) return pd_to_blkg(&tg->pd); } -static inline struct throtl_grp *td_root_tg(struct throtl_data *td) -{ - return blkg_to_tg(td->queue->root_blkg); -} - /** * sq_to_tg - return the throl_grp the specified service queue belongs to * @sq: the throtl_service_queue of interest @@ -256,53 +230,6 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) } \ } while (0) -static void tg_stats_init(struct tg_stats_cpu *tg_stats) -{ - blkg_rwstat_init(&tg_stats->service_bytes); - blkg_rwstat_init(&tg_stats->serviced); -} - -/* - * Worker for allocating per cpu stat for tgs. This is scheduled on the - * system_wq once there are some groups on the alloc_list waiting for - * allocation. - */ -static void tg_stats_alloc_fn(struct work_struct *work) -{ - static struct tg_stats_cpu *stats_cpu; /* this fn is non-reentrant */ - struct delayed_work *dwork = to_delayed_work(work); - bool empty = false; - -alloc_stats: - if (!stats_cpu) { - int cpu; - - stats_cpu = alloc_percpu(struct tg_stats_cpu); - if (!stats_cpu) { - /* allocation failed, try again after some time */ - schedule_delayed_work(dwork, msecs_to_jiffies(10)); - return; - } - for_each_possible_cpu(cpu) - tg_stats_init(per_cpu_ptr(stats_cpu, cpu)); - } - - spin_lock_irq(&tg_stats_alloc_lock); - - if (!list_empty(&tg_stats_alloc_list)) { - struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list, - struct throtl_grp, - stats_alloc_node); - swap(tg->stats_cpu, stats_cpu); - list_del_init(&tg->stats_alloc_node); - } - - empty = list_empty(&tg_stats_alloc_list); - spin_unlock_irq(&tg_stats_alloc_lock); - if (!empty) - goto alloc_stats; -} - static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) { INIT_LIST_HEAD(&qn->node); @@ -387,29 +314,46 @@ static struct bio *throtl_pop_queued(struct list_head *queued, } /* init a service_queue, assumes the caller zeroed it */ -static void throtl_service_queue_init(struct throtl_service_queue *sq, - struct throtl_service_queue *parent_sq) +static void throtl_service_queue_init(struct throtl_service_queue *sq) { INIT_LIST_HEAD(&sq->queued[0]); INIT_LIST_HEAD(&sq->queued[1]); sq->pending_tree = RB_ROOT; - sq->parent_sq = parent_sq; setup_timer(&sq->pending_timer, throtl_pending_timer_fn, (unsigned long)sq); } -static void throtl_service_queue_exit(struct throtl_service_queue *sq) +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) { - del_timer_sync(&sq->pending_timer); + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, node); + if (!tg) + return NULL; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ] = -1; + tg->bps[WRITE] = -1; + tg->iops[READ] = -1; + tg->iops[WRITE] = -1; + + return &tg->pd; } -static void throtl_pd_init(struct blkcg_gq *blkg) +static void throtl_pd_init(struct blkg_policy_data *pd) { - struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); struct throtl_data *td = blkg->q->td; - struct throtl_service_queue *parent_sq; - unsigned long flags; - int rw; + struct throtl_service_queue *sq = &tg->service_queue; /* * If on the default hierarchy, we switch to properly hierarchical @@ -424,35 +368,10 @@ static void throtl_pd_init(struct blkcg_gq *blkg) * Limits of a group don't interact with limits of other groups * regardless of the position of the group in the hierarchy. */ - parent_sq = &td->service_queue; - - if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent) - parent_sq = &blkg_to_tg(blkg->parent)->service_queue; - - throtl_service_queue_init(&tg->service_queue, parent_sq); - - for (rw = READ; rw <= WRITE; rw++) { - throtl_qnode_init(&tg->qnode_on_self[rw], tg); - throtl_qnode_init(&tg->qnode_on_parent[rw], tg); - } - - RB_CLEAR_NODE(&tg->rb_node); + sq->parent_sq = &td->service_queue; + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; tg->td = td; - - tg->bps[READ] = -1; - tg->bps[WRITE] = -1; - tg->iops[READ] = -1; - tg->iops[WRITE] = -1; - - /* - * Ugh... We need to perform per-cpu allocation for tg->stats_cpu - * but percpu allocator can't be called from IO path. Queue tg on - * tg_stats_alloc_list and allocate from work item. - */ - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_add(&tg->stats_alloc_node, &tg_stats_alloc_list); - schedule_delayed_work(&tg_stats_alloc_work, 0); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); } /* @@ -470,83 +389,21 @@ static void tg_update_has_rules(struct throtl_grp *tg) (tg->bps[rw] != -1 || tg->iops[rw] != -1); } -static void throtl_pd_online(struct blkcg_gq *blkg) +static void throtl_pd_online(struct blkg_policy_data *pd) { /* * We don't want new groups to escape the limits of its ancestors. * Update has_rules[] after a new group is brought online. */ - tg_update_has_rules(blkg_to_tg(blkg)); -} - -static void throtl_pd_exit(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - unsigned long flags; - - spin_lock_irqsave(&tg_stats_alloc_lock, flags); - list_del_init(&tg->stats_alloc_node); - spin_unlock_irqrestore(&tg_stats_alloc_lock, flags); - - free_percpu(tg->stats_cpu); - - throtl_service_queue_exit(&tg->service_queue); -} - -static void throtl_pd_reset_stats(struct blkcg_gq *blkg) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - int cpu; - - if (tg->stats_cpu == NULL) - return; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - blkg_rwstat_reset(&sc->service_bytes); - blkg_rwstat_reset(&sc->serviced); - } -} - -static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td, - struct blkcg *blkcg) -{ - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) - return td_root_tg(td); - - return blkg_to_tg(blkg_lookup(blkcg, td->queue)); + tg_update_has_rules(pd_to_tg(pd)); } -static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td, - struct blkcg *blkcg) +static void throtl_pd_free(struct blkg_policy_data *pd) { - struct request_queue *q = td->queue; - struct throtl_grp *tg = NULL; - - /* - * This is the common case when there are no blkcgs. Avoid lookup - * in this case - */ - if (blkcg == &blkcg_root) { - tg = td_root_tg(td); - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - - /* if %NULL and @q is alive, fall back to root_tg */ - if (!IS_ERR(blkg)) - tg = blkg_to_tg(blkg); - else if (!blk_queue_dying(q)) - tg = td_root_tg(td); - } + struct throtl_grp *tg = pd_to_tg(pd); - return tg; + del_timer_sync(&tg->service_queue.pending_timer); + kfree(tg); } static struct throtl_grp * @@ -956,32 +813,6 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, return 0; } -static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes, - int rw) -{ - struct throtl_grp *tg = blkg_to_tg(blkg); - struct tg_stats_cpu *stats_cpu; - unsigned long flags; - - /* If per cpu stats are not allocated yet, don't do any accounting. */ - if (tg->stats_cpu == NULL) - return; - - /* - * Disabling interrupts to provide mutual exclusion between two - * writes on same cpu. It probably is not needed for 64bit. Not - * optimizing that case yet. - */ - local_irq_save(flags); - - stats_cpu = this_cpu_ptr(tg->stats_cpu); - - blkg_rwstat_add(&stats_cpu->serviced, rw, 1); - blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes); - - local_irq_restore(flags); -} - static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) { bool rw = bio_data_dir(bio); @@ -995,17 +826,9 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) * more than once as a throttled bio will go through blk-throtl the * second time when it eventually gets issued. Set it when a bio * is being charged to a tg. - * - * Dispatch stats aren't recursive and each @bio should only be - * accounted by the @tg it was originally associated with. Let's - * update the stats when setting REQ_THROTTLED for the first time - * which is guaranteed to be for the @bio's original tg. */ - if (!(bio->bi_rw & REQ_THROTTLED)) { + if (!(bio->bi_rw & REQ_THROTTLED)) bio->bi_rw |= REQ_THROTTLED; - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - } } /** @@ -1285,34 +1108,6 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) } } -static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct throtl_grp *tg = pd_to_tg(pd); - struct blkg_rwstat rwstat = { }, tmp; - int i, cpu; - - if (tg->stats_cpu == NULL) - return 0; - - for_each_possible_cpu(cpu) { - struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu); - - tmp = blkg_rwstat_read((void *)sc + off); - for (i = 0; i < BLKG_RWSTAT_NR; i++) - rwstat.cnt[i] += tmp.cnt[i]; - } - - return __blkg_prfill_rwstat(sf, pd, &rwstat); -} - -static int tg_print_cpu_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_cpu_rwstat, - &blkcg_policy_throtl, seq_cft(sf)->private, true); - return 0; -} - static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -1349,31 +1144,11 @@ static int tg_print_conf_uint(struct seq_file *sf, void *v) return 0; } -static ssize_t tg_set_conf(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, bool is_u64) +static void tg_conf_updated(struct throtl_grp *tg) { - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct throtl_grp *tg; - struct throtl_service_queue *sq; - struct blkcg_gq *blkg; + struct throtl_service_queue *sq = &tg->service_queue; struct cgroup_subsys_state *pos_css; - int ret; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); - if (ret) - return ret; - - tg = blkg_to_tg(ctx.blkg); - sq = &tg->service_queue; - - if (!ctx.v) - ctx.v = -1; - - if (is_u64) - *(u64 *)((void *)tg + of_cft(of)->private) = ctx.v; - else - *(unsigned int *)((void *)tg + of_cft(of)->private) = ctx.v; + struct blkcg_gq *blkg; throtl_log(&tg->service_queue, "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", @@ -1387,7 +1162,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1405,9 +1180,39 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_update_disptime(tg); throtl_schedule_next_dispatch(sq->parent_sq, true); } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = -1; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg); + ret = 0; +out_finish: blkg_conf_finish(&ctx); - return nbytes; + return ret ?: nbytes; } static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, @@ -1422,7 +1227,7 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, return tg_set_conf(of, buf, nbytes, off, false); } -static struct cftype throtl_files[] = { +static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", .private = offsetof(struct throtl_grp, bps[READ]), @@ -1449,13 +1254,124 @@ static struct cftype throtl_files[] = { }, { .name = "throttle.io_service_bytes", - .private = offsetof(struct tg_stats_cpu, service_bytes), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_bytes, }, { .name = "throttle.io_serviced", - .private = offsetof(struct tg_stats_cpu, serviced), - .seq_show = tg_print_cpu_rwstat, + .private = (unsigned long)&blkcg_policy_throtl, + .seq_show = blkg_print_stat_ios, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + + if (!dname) + return 0; + if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && + tg->iops[READ] == -1 && tg->iops[WRITE] == -1) + return 0; + + if (tg->bps[READ] != -1) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); + if (tg->bps[WRITE] != -1) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); + if (tg->iops[READ] != -1) + snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); + if (tg->iops[WRITE] != -1) + snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3]); + return 0; +} + +static int tg_print_max(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_max(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = tg->iops[READ]; + v[3] = tg->iops[WRITE]; + + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = -1; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps")) + v[0] = val; + else if (!strcmp(tok, "wbps")) + v[1] = val; + else if (!strcmp(tok, "riops")) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops")) + v[3] = min_t(u64, val, UINT_MAX); + else + goto out_finish; + } + + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; + + tg_conf_updated(tg); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_max, + .write = tg_set_max, }, { } /* terminate */ }; @@ -1468,52 +1384,33 @@ static void throtl_shutdown_wq(struct request_queue *q) } static struct blkcg_policy blkcg_policy_throtl = { - .pd_size = sizeof(struct throtl_grp), - .cftypes = throtl_files, + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_exit_fn = throtl_pd_exit, - .pd_reset_stats_fn = throtl_pd_reset_stats, + .pd_free_fn = throtl_pd_free, }; -bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, + struct bio *bio) { - struct throtl_data *td = q->td; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg; + struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); - struct blkcg *blkcg; bool throttled = false; + WARN_ON_ONCE(!rcu_read_lock_held()); + /* see throtl_charge_bio() */ - if (bio->bi_rw & REQ_THROTTLED) + if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) goto out; - /* - * A throtl_grp pointer retrieved under rcu can be used to access - * basic fields like stats and io rates. If a group has no rules, - * just update the dispatch stats in lockless manner and return. - */ - rcu_read_lock(); - blkcg = bio_blkcg(bio); - tg = throtl_lookup_tg(td, blkcg); - if (tg) { - if (!tg->has_rules[rw]) { - throtl_update_dispatch_stats(tg_to_blkg(tg), - bio->bi_iter.bi_size, bio->bi_rw); - goto out_unlock_rcu; - } - } - - /* - * Either group has not been allocated yet or it is not an unlimited - * IO group - */ spin_lock_irq(q->queue_lock); - tg = throtl_lookup_create_tg(td, blkcg); - if (unlikely(!tg)) + + if (unlikely(blk_queue_bypass(q))) goto out_unlock; sq = &tg->service_queue; @@ -1580,8 +1477,6 @@ bool blk_throtl_bio(struct request_queue *q, struct bio *bio) out_unlock: spin_unlock_irq(q->queue_lock); -out_unlock_rcu: - rcu_read_unlock(); out: /* * As multiple blk-throtls may stack in the same issue path, we @@ -1667,7 +1562,7 @@ int blk_throtl_init(struct request_queue *q) return -ENOMEM; INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue, NULL); + throtl_service_queue_init(&td->service_queue); q->td = td; td->queue = q; diff --git a/block/blk.h b/block/blk.h index 43b036185712..923b34aaf6eb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -267,15 +267,10 @@ static inline struct io_context *create_io_context(gfp_t gfp_mask, int node) * Internal throttling interface */ #ifdef CONFIG_BLK_DEV_THROTTLING -extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); extern void blk_throtl_drain(struct request_queue *q); extern int blk_throtl_init(struct request_queue *q); extern void blk_throtl_exit(struct request_queue *q); #else /* CONFIG_BLK_DEV_THROTTLING */ -static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) -{ - return false; -} static inline void blk_throtl_drain(struct request_queue *q) { } static inline int blk_throtl_init(struct request_queue *q) { return 0; } static inline void blk_throtl_exit(struct request_queue *q) { } diff --git a/block/bounce.c b/block/bounce.c index ed9dd8067120..9ce1da95864c 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -13,6 +13,7 @@ #include <linux/pagemap.h> #include <linux/mempool.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> @@ -127,15 +128,14 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, *org_vec; int i; - - if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) - set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); + int start = bio_orig->bi_iter.bi_idx; /* * free up bounce indirect pages used */ bio_for_each_segment_all(bvec, bio, i) { - org_vec = bio_orig->bi_io_vec + i; + org_vec = bio_orig->bi_io_vec + i + start; + if (bvec->bv_page == org_vec->bv_page) continue; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 5da8e6e9ab4b..1f9093e901da 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -14,8 +14,8 @@ #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/blktrace_api.h> +#include <linux/blk-cgroup.h> #include "blk.h" -#include "blk-cgroup.h" /* * tunables @@ -67,6 +67,11 @@ static struct kmem_cache *cfq_pool; #define sample_valid(samples) ((samples) > 80) #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) +/* blkio-related constants */ +#define CFQ_WEIGHT_LEGACY_MIN 10 +#define CFQ_WEIGHT_LEGACY_DFL 500 +#define CFQ_WEIGHT_LEGACY_MAX 1000 + struct cfq_ttime { unsigned long last_end_request; @@ -172,10 +177,6 @@ enum wl_type_t { struct cfqg_stats { #ifdef CONFIG_CFQ_GROUP_IOSCHED - /* total bytes transferred */ - struct blkg_rwstat service_bytes; - /* total IOs serviced, post merge */ - struct blkg_rwstat serviced; /* number of ios merged */ struct blkg_rwstat merged; /* total time spent on device in ns, may not be accurate w/ queueing */ @@ -184,8 +185,6 @@ struct cfqg_stats { struct blkg_rwstat wait_time; /* number of IOs queued up */ struct blkg_rwstat queued; - /* total sectors transferred */ - struct blkg_stat sectors; /* total disk time and nr sectors dispatched by this group */ struct blkg_stat time; #ifdef CONFIG_DEBUG_BLK_CGROUP @@ -212,6 +211,15 @@ struct cfqg_stats { #endif /* CONFIG_CFQ_GROUP_IOSCHED */ }; +/* Per-cgroup data */ +struct cfq_group_data { + /* must be the first member */ + struct blkcg_policy_data cpd; + + unsigned int weight; + unsigned int leaf_weight; +}; + /* This is per cgroup per device grouping structure */ struct cfq_group { /* must be the first member */ @@ -290,7 +298,11 @@ struct cfq_group { int dispatched; struct cfq_ttime ttime; struct cfqg_stats stats; /* stats for this cfqg */ - struct cfqg_stats dead_stats; /* stats pushed from dead children */ + + /* async queue for each priority case */ + struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; + struct cfq_queue *async_idle_cfqq; + }; struct cfq_io_cq { @@ -356,12 +368,6 @@ struct cfq_data { struct cfq_queue *active_queue; struct cfq_io_cq *active_cic; - /* - * async queue for each priority case - */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - sector_t last_position; /* @@ -387,6 +393,7 @@ struct cfq_data { }; static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); +static void cfq_put_queue(struct cfq_queue *cfqq); static struct cfq_rb_root *st_for(struct cfq_group *cfqg, enum wl_class_t class, @@ -446,16 +453,6 @@ CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) /* cfqg stats flags */ @@ -600,6 +597,22 @@ static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } #ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct cfq_group, pd) : NULL; +} + +static struct cfq_group_data +*cpd_to_cfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; +} + +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) +{ + return pd_to_blkg(&cfqg->pd); +} + static struct blkcg_policy blkcg_policy_cfq; static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) @@ -607,6 +620,11 @@ static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); } +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) +{ + return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); +} + static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; @@ -668,14 +686,6 @@ static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) blkg_rwstat_add(&cfqg->stats.merged, rw, 1); } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) -{ - blkg_stat_add(&cfqg->stats.sectors, bytes >> 9); - blkg_rwstat_add(&cfqg->stats.serviced, rw, 1); - blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes); -} - static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { @@ -693,8 +703,6 @@ static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, static void cfqg_stats_reset(struct cfqg_stats *stats) { /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->service_bytes); - blkg_rwstat_reset(&stats->serviced); blkg_rwstat_reset(&stats->merged); blkg_rwstat_reset(&stats->service_time); blkg_rwstat_reset(&stats->wait_time); @@ -711,28 +719,26 @@ static void cfqg_stats_reset(struct cfqg_stats *stats) } /* @to += @from */ -static void cfqg_stats_merge(struct cfqg_stats *to, struct cfqg_stats *from) +static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) { /* queued stats shouldn't be cleared */ - blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); - blkg_rwstat_merge(&to->serviced, &from->serviced); - blkg_rwstat_merge(&to->merged, &from->merged); - blkg_rwstat_merge(&to->service_time, &from->service_time); - blkg_rwstat_merge(&to->wait_time, &from->wait_time); - blkg_stat_merge(&from->time, &from->time); + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + blkg_stat_add_aux(&from->time, &from->time); #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_merge(&to->dequeue, &from->dequeue); - blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); - blkg_stat_merge(&to->idle_time, &from->idle_time); - blkg_stat_merge(&to->empty_time, &from->empty_time); + blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_add_aux(&to->dequeue, &from->dequeue); + blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + blkg_stat_add_aux(&to->idle_time, &from->idle_time); + blkg_stat_add_aux(&to->empty_time, &from->empty_time); #endif } /* - * Transfer @cfqg's stats to its parent's dead_stats so that the ancestors' + * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' * recursive stats can still account for the amount used by this cfqg after * it's gone. */ @@ -745,10 +751,8 @@ static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) if (unlikely(!parent)) return; - cfqg_stats_merge(&parent->dead_stats, &cfqg->stats); - cfqg_stats_merge(&parent->dead_stats, &cfqg->dead_stats); + cfqg_stats_add_aux(&parent->stats, &cfqg->stats); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } #else /* CONFIG_CFQ_GROUP_IOSCHED */ @@ -770,8 +774,6 @@ static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, unsigned long time, unsigned long unaccounted_time) { } static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { } static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { } -static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg, - uint64_t bytes, int rw) { } static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, uint64_t start_time, uint64_t io_start_time, int rw) { } @@ -858,8 +860,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio, - gfp_t gfp_mask); + struct cfq_io_cq *cic, struct bio *bio); static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) { @@ -1521,115 +1522,171 @@ static void cfq_init_cfqg_base(struct cfq_group *cfqg) } #ifdef CONFIG_CFQ_GROUP_IOSCHED -static void cfqg_stats_init(struct cfqg_stats *stats) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight); + +static void cfqg_stats_exit(struct cfqg_stats *stats) { - blkg_rwstat_init(&stats->service_bytes); - blkg_rwstat_init(&stats->serviced); - blkg_rwstat_init(&stats->merged); - blkg_rwstat_init(&stats->service_time); - blkg_rwstat_init(&stats->wait_time); - blkg_rwstat_init(&stats->queued); + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + blkg_stat_exit(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_exit(&stats->unaccounted_time); + blkg_stat_exit(&stats->avg_queue_size_sum); + blkg_stat_exit(&stats->avg_queue_size_samples); + blkg_stat_exit(&stats->dequeue); + blkg_stat_exit(&stats->group_wait_time); + blkg_stat_exit(&stats->idle_time); + blkg_stat_exit(&stats->empty_time); +#endif +} - blkg_stat_init(&stats->sectors); - blkg_stat_init(&stats->time); +static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + blkg_stat_init(&stats->time, gfp)) + goto err; #ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_init(&stats->unaccounted_time); - blkg_stat_init(&stats->avg_queue_size_sum); - blkg_stat_init(&stats->avg_queue_size_samples); - blkg_stat_init(&stats->dequeue); - blkg_stat_init(&stats->group_wait_time); - blkg_stat_init(&stats->idle_time); - blkg_stat_init(&stats->empty_time); + if (blkg_stat_init(&stats->unaccounted_time, gfp) || + blkg_stat_init(&stats->avg_queue_size_sum, gfp) || + blkg_stat_init(&stats->avg_queue_size_samples, gfp) || + blkg_stat_init(&stats->dequeue, gfp) || + blkg_stat_init(&stats->group_wait_time, gfp) || + blkg_stat_init(&stats->idle_time, gfp) || + blkg_stat_init(&stats->empty_time, gfp)) + goto err; #endif + return 0; +err: + cfqg_stats_exit(stats); + return -ENOMEM; +} + +static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) +{ + struct cfq_group_data *cgd; + + cgd = kzalloc(sizeof(*cgd), GFP_KERNEL); + if (!cgd) + return NULL; + return &cgd->cpd; +} + +static void cfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); + unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (cpd_to_blkcg(cpd) == &blkcg_root) + weight *= 2; + + cgd->weight = weight; + cgd->leaf_weight = weight; +} + +static void cfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_cfqgd(cpd)); +} + +static void cfq_cpd_bind(struct blkcg_policy_data *cpd) +{ + struct blkcg *blkcg = cpd_to_blkcg(cpd); + bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); + unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; + + if (blkcg == &blkcg_root) + weight *= 2; + + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); + WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); } -static void cfq_pd_init(struct blkcg_gq *blkg) +static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg; + + cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); + if (!cfqg) + return NULL; cfq_init_cfqg_base(cfqg); - cfqg->weight = blkg->blkcg->cfq_weight; - cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight; - cfqg_stats_init(&cfqg->stats); - cfqg_stats_init(&cfqg->dead_stats); + if (cfqg_stats_init(&cfqg->stats, gfp)) { + kfree(cfqg); + return NULL; + } + + return &cfqg->pd; +} + +static void cfq_pd_init(struct blkg_policy_data *pd) +{ + struct cfq_group *cfqg = pd_to_cfqg(pd); + struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); + + cfqg->weight = cgd->weight; + cfqg->leaf_weight = cgd->leaf_weight; } -static void cfq_pd_offline(struct blkcg_gq *blkg) +static void cfq_pd_offline(struct blkg_policy_data *pd) { + struct cfq_group *cfqg = pd_to_cfqg(pd); + int i; + + for (i = 0; i < IOPRIO_BE_NR; i++) { + if (cfqg->async_cfqq[0][i]) + cfq_put_queue(cfqg->async_cfqq[0][i]); + if (cfqg->async_cfqq[1][i]) + cfq_put_queue(cfqg->async_cfqq[1][i]); + } + + if (cfqg->async_idle_cfqq) + cfq_put_queue(cfqg->async_idle_cfqq); + /* * @blkg is going offline and will be ignored by * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so * that they don't get lost. If IOs complete after this point, the * stats for them will be lost. Oh well... */ - cfqg_stats_xfer_dead(blkg_to_cfqg(blkg)); -} - -/* offset delta from cfqg->stats to cfqg->dead_stats */ -static const int dead_stats_off_delta = offsetof(struct cfq_group, dead_stats) - - offsetof(struct cfq_group, stats); - -/* to be used by recursive prfill, sums live and dead stats recursively */ -static u64 cfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -{ - u64 sum = 0; - - sum += blkg_stat_recursive_sum(pd, off); - sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); - return sum; + cfqg_stats_xfer_dead(cfqg); } -/* to be used by recursive prfill, sums live and dead rwstats recursively */ -static struct blkg_rwstat cfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, - int off) +static void cfq_pd_free(struct blkg_policy_data *pd) { - struct blkg_rwstat a, b; + struct cfq_group *cfqg = pd_to_cfqg(pd); - a = blkg_rwstat_recursive_sum(pd, off); - b = blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); - blkg_rwstat_merge(&a, &b); - return a; + cfqg_stats_exit(&cfqg->stats); + return kfree(cfqg); } -static void cfq_pd_reset_stats(struct blkcg_gq *blkg) +static void cfq_pd_reset_stats(struct blkg_policy_data *pd) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); + struct cfq_group *cfqg = pd_to_cfqg(pd); cfqg_stats_reset(&cfqg->stats); - cfqg_stats_reset(&cfqg->dead_stats); } -/* - * Search for the cfq group current task belongs to. request_queue lock must - * be held. - */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { - struct request_queue *q = cfqd->queue; - struct cfq_group *cfqg = NULL; - - /* avoid lookup for the common case where there's no blkcg */ - if (blkcg == &blkcg_root) { - cfqg = cfqd->root_group; - } else { - struct blkcg_gq *blkg; - - blkg = blkg_lookup_create(blkcg, q); - if (!IS_ERR(blkg)) - cfqg = blkg_to_cfqg(blkg); - } + struct blkcg_gq *blkg; - return cfqg; + blkg = blkg_lookup(blkcg, cfqd->queue); + if (likely(blkg)) + return blkg_to_cfqg(blkg); + return NULL; } static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - /* Currently, all async queues are mapped to root group */ - if (!cfq_cfqq_sync(cfqq)) - cfqg = cfqq->cfqd->root_group; - cfqq->cfqg = cfqg; /* cfqq reference on cfqg */ cfqg_get(cfqg); @@ -1673,42 +1730,74 @@ static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) static int cfq_print_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->weight; + + seq_printf(sf, "%u\n", val); return 0; } static int cfq_print_leaf_weight(struct seq_file *sf, void *v) { - seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight); + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + unsigned int val = 0; + + if (cgd) + val = cgd->leaf_weight; + + seq_printf(sf, "%u\n", val); return 0; } static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, - bool is_leaf_weight) + bool on_dfl, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; + struct cfq_group_data *cfqgd; int ret; + u64 v; ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); if (ret) return ret; - ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v && on_dfl) + goto out_finish; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out_finish; + } + cfqg = blkg_to_cfqg(ctx.blkg); - if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + cfqgd = blkcg_to_cfqgd(blkcg); + + ret = -ERANGE; + if (!v || (v >= min && v <= max)) { if (!is_leaf_weight) { - cfqg->dev_weight = ctx.v; - cfqg->new_weight = ctx.v ?: blkcg->cfq_weight; + cfqg->dev_weight = v; + cfqg->new_weight = v ?: cfqgd->weight; } else { - cfqg->dev_leaf_weight = ctx.v; - cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + cfqg->dev_leaf_weight = v; + cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; } ret = 0; } - +out_finish: blkg_conf_finish(&ctx); return ret ?: nbytes; } @@ -1716,30 +1805,39 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, false); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); } static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - return __cfqg_set_weight_device(of, buf, nbytes, off, true); + return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); } -static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val, bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, + bool on_dfl, bool reset_dev, bool is_leaf_weight) { + unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; + unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; + struct cfq_group_data *cfqgd; + int ret = 0; - if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) - return -EINVAL; + if (val < min || val > max) + return -ERANGE; spin_lock_irq(&blkcg->lock); + cfqgd = blkcg_to_cfqgd(blkcg); + if (!cfqgd) { + ret = -EINVAL; + goto out; + } if (!is_leaf_weight) - blkcg->cfq_weight = val; + cfqgd->weight = val; else - blkcg->cfq_leaf_weight = val; + cfqgd->leaf_weight = val; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct cfq_group *cfqg = blkg_to_cfqg(blkg); @@ -1748,28 +1846,33 @@ static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, continue; if (!is_leaf_weight) { + if (reset_dev) + cfqg->dev_weight = 0; if (!cfqg->dev_weight) - cfqg->new_weight = blkcg->cfq_weight; + cfqg->new_weight = cfqgd->weight; } else { + if (reset_dev) + cfqg->dev_leaf_weight = 0; if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = blkcg->cfq_leaf_weight; + cfqg->new_leaf_weight = cfqgd->leaf_weight; } } +out: spin_unlock_irq(&blkcg->lock); - return 0; + return ret; } static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, false); + return __cfq_set_weight(css, val, false, false, false); } static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - return __cfq_set_weight(css, cft, val, true); + return __cfq_set_weight(css, val, false, false, true); } static int cfqg_print_stat(struct seq_file *sf, void *v) @@ -1789,16 +1892,16 @@ static int cfqg_print_rwstat(struct seq_file *sf, void *v) static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - u64 sum = cfqg_stat_pd_recursive_sum(pd, off); - + u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_u64(sf, pd, sum); } static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, struct blkg_policy_data *pd, int off) { - struct blkg_rwstat sum = cfqg_rwstat_pd_recursive_sum(pd, off); - + struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), + &blkcg_policy_cfq, off); return __blkg_prfill_rwstat(sf, pd, &sum); } @@ -1818,6 +1921,40 @@ static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) return 0; } +static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); + return 0; +} + +static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, + offsetof(struct blkcg_gq, stat_bytes)); + u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + + atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, + false); + return 0; +} + #ifdef CONFIG_DEBUG_BLK_CGROUP static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, struct blkg_policy_data *pd, int off) @@ -1844,7 +1981,7 @@ static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) } #endif /* CONFIG_DEBUG_BLK_CGROUP */ -static struct cftype cfq_blkcg_files[] = { +static struct cftype cfq_blkcg_legacy_files[] = { /* on root, weight is mapped to leaf_weight */ { .name = "weight_device", @@ -1892,18 +2029,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat, + .seq_show = cfqg_print_stat_sectors, }, { .name = "io_service_bytes", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes, }, { .name = "io_serviced", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios, }, { .name = "io_service_time", @@ -1934,18 +2070,17 @@ static struct cftype cfq_blkcg_files[] = { }, { .name = "sectors_recursive", - .private = offsetof(struct cfq_group, stats.sectors), - .seq_show = cfqg_print_stat_recursive, + .seq_show = cfqg_print_stat_sectors_recursive, }, { .name = "io_service_bytes_recursive", - .private = offsetof(struct cfq_group, stats.service_bytes), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_bytes_recursive, }, { .name = "io_serviced_recursive", - .private = offsetof(struct cfq_group, stats.serviced), - .seq_show = cfqg_print_rwstat_recursive, + .private = (unsigned long)&blkcg_policy_cfq, + .seq_show = blkg_print_stat_ios_recursive, }, { .name = "io_service_time_recursive", @@ -2000,9 +2135,51 @@ static struct cftype cfq_blkcg_files[] = { #endif /* CONFIG_DEBUG_BLK_CGROUP */ { } /* terminate */ }; + +static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); + + seq_printf(sf, "default %u\n", cgd->weight); + blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); + return 0; +} + +static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = __cfq_set_weight(of_css(of), v, true, false, false); + return ret ?: nbytes; + } + + /* "MAJ:MIN WEIGHT" */ + return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); +} + +static struct cftype cfq_blkcg_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cfq_print_weight_on_dfl, + .write = cfq_set_weight_on_dfl, + }, + { } /* terminate */ +}; + #else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) +static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, + struct blkcg *blkcg) { return cfqd->root_group; } @@ -2805,7 +2982,6 @@ static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; cfqq->nr_sectors += blk_rq_sectors(rq); - cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags); } /* @@ -3438,14 +3614,14 @@ static void cfq_exit_icq(struct io_cq *icq) struct cfq_io_cq *cic = icq_to_cic(icq); struct cfq_data *cfqd = cic_to_cfqd(cic); - if (cic->cfqq[BLK_RW_ASYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); - cic->cfqq[BLK_RW_ASYNC] = NULL; + if (cic_to_cfqq(cic, false)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); + cic_set_cfqq(cic, NULL, false); } - if (cic->cfqq[BLK_RW_SYNC]) { - cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); - cic->cfqq[BLK_RW_SYNC] = NULL; + if (cic_to_cfqq(cic, true)) { + cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); + cic_set_cfqq(cic, NULL, true); } } @@ -3504,18 +3680,14 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) return; - cfqq = cic->cfqq[BLK_RW_ASYNC]; + cfqq = cic_to_cfqq(cic, false); if (cfqq) { - struct cfq_queue *new_cfqq; - new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio, - GFP_ATOMIC); - if (new_cfqq) { - cic->cfqq[BLK_RW_ASYNC] = new_cfqq; - cfq_put_queue(cfqq); - } + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); + cic_set_cfqq(cic, cfqq, false); } - cfqq = cic->cfqq[BLK_RW_SYNC]; + cfqq = cic_to_cfqq(cic, true); if (cfqq) cfq_mark_cfqq_prio_changed(cfqq); @@ -3546,7 +3718,7 @@ static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *sync_cfqq; + struct cfq_queue *cfqq; uint64_t serial_nr; rcu_read_lock(); @@ -3560,15 +3732,22 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) return; - sync_cfqq = cic_to_cfqq(cic, 1); - if (sync_cfqq) { - /* - * Drop reference to sync queue. A new sync queue will be - * assigned in new group upon arrival of a fresh request. - */ - cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, 1); - cfq_put_queue(sync_cfqq); + /* + * Drop reference to queues. New queues will be assigned in new + * group upon arrival of fresh requests. + */ + cfqq = cic_to_cfqq(cic, false); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, false); + cfq_put_queue(cfqq); + } + + cfqq = cic_to_cfqq(cic, true); + if (cfqq) { + cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, true); + cfq_put_queue(cfqq); } cic->blkcg_serial_nr = serial_nr; @@ -3577,81 +3756,19 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { } #endif /* CONFIG_CFQ_GROUP_IOSCHED */ -static struct cfq_queue * -cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) -{ - struct blkcg *blkcg; - struct cfq_queue *cfqq, *new_cfqq = NULL; - struct cfq_group *cfqg; - -retry: - rcu_read_lock(); - - blkcg = bio_blkcg(bio); - cfqg = cfq_lookup_create_cfqg(cfqd, blkcg); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - cfqq = cic_to_cfqq(cic, is_sync); - - /* - * Always try a new alloc if we fell back to the OOM cfqq - * originally, since it should just be a temporary situation. - */ - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = NULL; - if (new_cfqq) { - cfqq = new_cfqq; - new_cfqq = NULL; - } else if (gfp_mask & __GFP_WAIT) { - rcu_read_unlock(); - spin_unlock_irq(cfqd->queue->queue_lock); - new_cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - spin_lock_irq(cfqd->queue->queue_lock); - if (new_cfqq) - goto retry; - else - return &cfqd->oom_cfqq; - } else { - cfqq = kmem_cache_alloc_node(cfq_pool, - gfp_mask | __GFP_ZERO, - cfqd->queue->node); - } - - if (cfqq) { - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - } else - cfqq = &cfqd->oom_cfqq; - } -out: - if (new_cfqq) - kmem_cache_free(cfq_pool, new_cfqq); - - rcu_read_unlock(); - return cfqq; -} - static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) +cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &cfqd->async_cfqq[0][ioprio]; + return &cfqg->async_cfqq[0][ioprio]; case IOPRIO_CLASS_NONE: ioprio = IOPRIO_NORM; /* fall through */ case IOPRIO_CLASS_BE: - return &cfqd->async_cfqq[1][ioprio]; + return &cfqg->async_cfqq[1][ioprio]; case IOPRIO_CLASS_IDLE: - return &cfqd->async_idle_cfqq; + return &cfqg->async_idle_cfqq; default: BUG(); } @@ -3659,12 +3776,20 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) static struct cfq_queue * cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio, gfp_t gfp_mask) + struct bio *bio) { int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq = NULL; + struct cfq_queue *cfqq; + struct cfq_group *cfqg; + + rcu_read_lock(); + cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); + if (!cfqg) { + cfqq = &cfqd->oom_cfqq; + goto out; + } if (!is_sync) { if (!ioprio_valid(cic->ioprio)) { @@ -3672,22 +3797,32 @@ cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, ioprio = task_nice_ioprio(tsk); ioprio_class = task_nice_ioclass(tsk); } - async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); + async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); cfqq = *async_cfqq; + if (cfqq) + goto out; } - if (!cfqq) - cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask); + cfqq = kmem_cache_alloc_node(cfq_pool, GFP_NOWAIT | __GFP_ZERO, + cfqd->queue->node); + if (!cfqq) { + cfqq = &cfqd->oom_cfqq; + goto out; + } - /* - * pin the queue now that it's allocated, scheduler exit will prune it - */ - if (!is_sync && !(*async_cfqq)) { + cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); + cfq_init_prio_data(cfqq, cic); + cfq_link_cfqq_cfqg(cfqq, cfqg); + cfq_log_cfqq(cfqd, cfqq, "alloced"); + + if (async_cfqq) { + /* a new async queue is created, pin and remember */ cfqq->ref++; *async_cfqq = cfqq; } - +out: cfqq->ref++; + rcu_read_unlock(); return cfqq; } @@ -4221,8 +4356,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, const bool is_sync = rq_is_sync(rq); struct cfq_queue *cfqq; - might_sleep_if(gfp_mask & __GFP_WAIT); - spin_lock_irq(q->queue_lock); check_ioprio_changed(cic, bio); @@ -4230,7 +4363,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask); + if (cfqq) + cfq_put_queue(cfqq); + cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); cic_set_cfqq(cic, cfqq, is_sync); } else { /* @@ -4336,21 +4471,6 @@ static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) cancel_work_sync(&cfqd->unplug_work); } -static void cfq_put_async_queues(struct cfq_data *cfqd) -{ - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqd->async_cfqq[0][i]) - cfq_put_queue(cfqd->async_cfqq[0][i]); - if (cfqd->async_cfqq[1][i]) - cfq_put_queue(cfqd->async_cfqq[1][i]); - } - - if (cfqd->async_idle_cfqq) - cfq_put_queue(cfqd->async_idle_cfqq); -} - static void cfq_exit_queue(struct elevator_queue *e) { struct cfq_data *cfqd = e->elevator_data; @@ -4363,8 +4483,6 @@ static void cfq_exit_queue(struct elevator_queue *e) if (cfqd->active_queue) __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - cfq_put_async_queues(cfqd); - spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); @@ -4418,9 +4536,9 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) goto out_free; cfq_init_cfqg_base(cfqd->root_group); + cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; + cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; #endif - cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT; /* * Not strictly needed (since RB_ROOT just clears the node and we @@ -4431,7 +4549,7 @@ static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) cfqd->prio_trees[i] = RB_ROOT; /* - * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. + * Our fallback cfqq if cfq_get_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. oom_cfqq is linked to root_group * but shouldn't hold a reference as it'll never be unlinked. Lose @@ -4477,6 +4595,18 @@ out_free: return ret; } +static void cfq_registered_queue(struct request_queue *q) +{ + struct elevator_queue *e = q->elevator; + struct cfq_data *cfqd = e->elevator_data; + + /* + * Default to IOPS mode with no idling for SSDs + */ + if (blk_queue_nonrot(q)) + cfqd->cfq_slice_idle = 0; +} + /* * sysfs parts below --> */ @@ -4592,6 +4722,7 @@ static struct elevator_type iosched_cfq = { .elevator_may_queue_fn = cfq_may_queue, .elevator_init_fn = cfq_init_queue, .elevator_exit_fn = cfq_exit_queue, + .elevator_registered_fn = cfq_registered_queue, }, .icq_size = sizeof(struct cfq_io_cq), .icq_align = __alignof__(struct cfq_io_cq), @@ -4602,11 +4733,18 @@ static struct elevator_type iosched_cfq = { #ifdef CONFIG_CFQ_GROUP_IOSCHED static struct blkcg_policy blkcg_policy_cfq = { - .pd_size = sizeof(struct cfq_group), - .cftypes = cfq_blkcg_files, + .dfl_cftypes = cfq_blkcg_files, + .legacy_cftypes = cfq_blkcg_legacy_files, + + .cpd_alloc_fn = cfq_cpd_alloc, + .cpd_init_fn = cfq_cpd_init, + .cpd_free_fn = cfq_cpd_free, + .cpd_bind_fn = cfq_cpd_bind, + .pd_alloc_fn = cfq_pd_alloc, .pd_init_fn = cfq_pd_init, .pd_offline_fn = cfq_pd_offline, + .pd_free_fn = cfq_pd_free, .pd_reset_stats_fn = cfq_pd_reset_stats, }; #endif diff --git a/block/elevator.c b/block/elevator.c index 8985038f398c..84d63943f2de 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -35,11 +35,11 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" -#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -806,6 +806,8 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; + if (e->type->ops.elevator_registered_fn) + e->type->ops.elevator_registered_fn(q); } return error; } diff --git a/block/genhd.c b/block/genhd.c index f5d12185d631..28be651324f3 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -8,6 +8,7 @@ #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> |